xref: /openbmc/linux/mm/shmem.c (revision 6fd7353829cafc4067aad9eea0dc95da67e7df16)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Resizable virtual memory filesystem for Linux.
31da177e4SLinus Torvalds  *
41da177e4SLinus Torvalds  * Copyright (C) 2000 Linus Torvalds.
51da177e4SLinus Torvalds  *		 2000 Transmeta Corp.
61da177e4SLinus Torvalds  *		 2000-2001 Christoph Rohland
71da177e4SLinus Torvalds  *		 2000-2001 SAP AG
81da177e4SLinus Torvalds  *		 2002 Red Hat Inc.
96922c0c7SHugh Dickins  * Copyright (C) 2002-2011 Hugh Dickins.
106922c0c7SHugh Dickins  * Copyright (C) 2011 Google Inc.
110edd73b3SHugh Dickins  * Copyright (C) 2002-2005 VERITAS Software Corporation.
121da177e4SLinus Torvalds  * Copyright (C) 2004 Andi Kleen, SuSE Labs
131da177e4SLinus Torvalds  *
141da177e4SLinus Torvalds  * Extended attribute support for tmpfs:
151da177e4SLinus Torvalds  * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
161da177e4SLinus Torvalds  * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
171da177e4SLinus Torvalds  *
18853ac43aSMatt Mackall  * tiny-shmem:
19853ac43aSMatt Mackall  * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20853ac43aSMatt Mackall  *
211da177e4SLinus Torvalds  * This file is released under the GPL.
221da177e4SLinus Torvalds  */
231da177e4SLinus Torvalds 
24853ac43aSMatt Mackall #include <linux/fs.h>
25853ac43aSMatt Mackall #include <linux/init.h>
26853ac43aSMatt Mackall #include <linux/vfs.h>
27853ac43aSMatt Mackall #include <linux/mount.h>
28250297edSAndrew Morton #include <linux/ramfs.h>
29caefba17SHugh Dickins #include <linux/pagemap.h>
30853ac43aSMatt Mackall #include <linux/file.h>
31e408e695STheodore Ts'o #include <linux/fileattr.h>
32853ac43aSMatt Mackall #include <linux/mm.h>
3346c9a946SArnd Bergmann #include <linux/random.h>
34174cd4b1SIngo Molnar #include <linux/sched/signal.h>
35b95f1b31SPaul Gortmaker #include <linux/export.h>
36853ac43aSMatt Mackall #include <linux/swap.h>
37e2e40f2cSChristoph Hellwig #include <linux/uio.h>
38749df87bSMike Kravetz #include <linux/hugetlb.h>
39626c3920SAl Viro #include <linux/fs_parser.h>
4086a2f3f2SMiaohe Lin #include <linux/swapfile.h>
4136f05cabSJeff Layton #include <linux/iversion.h>
42014bb1deSNeilBrown #include "swap.h"
4395cc09d6SAndrea Arcangeli 
44853ac43aSMatt Mackall static struct vfsmount *shm_mnt;
45853ac43aSMatt Mackall 
46853ac43aSMatt Mackall #ifdef CONFIG_SHMEM
471da177e4SLinus Torvalds /*
481da177e4SLinus Torvalds  * This virtual memory filesystem is heavily based on the ramfs. It
491da177e4SLinus Torvalds  * extends ramfs by the ability to use swap and honor resource limits
501da177e4SLinus Torvalds  * which makes it a completely usable filesystem.
511da177e4SLinus Torvalds  */
521da177e4SLinus Torvalds 
5339f0247dSAndreas Gruenbacher #include <linux/xattr.h>
54a5694255SChristoph Hellwig #include <linux/exportfs.h>
551c7c474cSChristoph Hellwig #include <linux/posix_acl.h>
56feda821eSChristoph Hellwig #include <linux/posix_acl_xattr.h>
571da177e4SLinus Torvalds #include <linux/mman.h>
581da177e4SLinus Torvalds #include <linux/string.h>
591da177e4SLinus Torvalds #include <linux/slab.h>
601da177e4SLinus Torvalds #include <linux/backing-dev.h>
611da177e4SLinus Torvalds #include <linux/shmem_fs.h>
621da177e4SLinus Torvalds #include <linux/writeback.h>
63bda97eabSHugh Dickins #include <linux/pagevec.h>
6441ffe5d5SHugh Dickins #include <linux/percpu_counter.h>
6583e4fa9cSHugh Dickins #include <linux/falloc.h>
66708e3508SHugh Dickins #include <linux/splice.h>
671da177e4SLinus Torvalds #include <linux/security.h>
681da177e4SLinus Torvalds #include <linux/swapops.h>
691da177e4SLinus Torvalds #include <linux/mempolicy.h>
701da177e4SLinus Torvalds #include <linux/namei.h>
71b00dc3adSHugh Dickins #include <linux/ctype.h>
72304dbdb7SLee Schermerhorn #include <linux/migrate.h>
73c1f60a5aSChristoph Lameter #include <linux/highmem.h>
74680d794bSakpm@linux-foundation.org #include <linux/seq_file.h>
7592562927SMimi Zohar #include <linux/magic.h>
769183df25SDavid Herrmann #include <linux/syscalls.h>
7740e041a2SDavid Herrmann #include <linux/fcntl.h>
789183df25SDavid Herrmann #include <uapi/linux/memfd.h>
79cfda0526SMike Rapoport #include <linux/userfaultfd_k.h>
804c27fe4cSMike Rapoport #include <linux/rmap.h>
812b4db796SAmir Goldstein #include <linux/uuid.h>
82304dbdb7SLee Schermerhorn 
837c0f6ba6SLinus Torvalds #include <linux/uaccess.h>
841da177e4SLinus Torvalds 
85dd56b046SMel Gorman #include "internal.h"
86dd56b046SMel Gorman 
8709cbfeafSKirill A. Shutemov #define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
8809cbfeafSKirill A. Shutemov #define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
891da177e4SLinus Torvalds 
901da177e4SLinus Torvalds /* Pretend that each entry is of this size in directory's i_size */
911da177e4SLinus Torvalds #define BOGO_DIRENT_SIZE 20
921da177e4SLinus Torvalds 
9369f07ec9SHugh Dickins /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
9469f07ec9SHugh Dickins #define SHORT_SYMLINK_LEN 128
9569f07ec9SHugh Dickins 
961aac1400SHugh Dickins /*
97f00cdc6dSHugh Dickins  * shmem_fallocate communicates with shmem_fault or shmem_writepage via
989608703eSJan Kara  * inode->i_private (with i_rwsem making sure that it has only one user at
99f00cdc6dSHugh Dickins  * a time): we would prefer not to enlarge the shmem inode just for that.
1001aac1400SHugh Dickins  */
1011aac1400SHugh Dickins struct shmem_falloc {
1028e205f77SHugh Dickins 	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
1031aac1400SHugh Dickins 	pgoff_t start;		/* start of range currently being fallocated */
1041aac1400SHugh Dickins 	pgoff_t next;		/* the next page offset to be fallocated */
1051aac1400SHugh Dickins 	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
1061aac1400SHugh Dickins 	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
1071aac1400SHugh Dickins };
1081aac1400SHugh Dickins 
1090b5071ddSAl Viro struct shmem_options {
1100b5071ddSAl Viro 	unsigned long long blocks;
1110b5071ddSAl Viro 	unsigned long long inodes;
1120b5071ddSAl Viro 	struct mempolicy *mpol;
1130b5071ddSAl Viro 	kuid_t uid;
1140b5071ddSAl Viro 	kgid_t gid;
1150b5071ddSAl Viro 	umode_t mode;
116ea3271f7SChris Down 	bool full_inums;
1170b5071ddSAl Viro 	int huge;
1180b5071ddSAl Viro 	int seen;
1190b5071ddSAl Viro #define SHMEM_SEEN_BLOCKS 1
1200b5071ddSAl Viro #define SHMEM_SEEN_INODES 2
1210b5071ddSAl Viro #define SHMEM_SEEN_HUGE 4
122ea3271f7SChris Down #define SHMEM_SEEN_INUMS 8
1230b5071ddSAl Viro };
1240b5071ddSAl Viro 
125b76db735SAndrew Morton #ifdef CONFIG_TMPFS
126680d794bSakpm@linux-foundation.org static unsigned long shmem_default_max_blocks(void)
127680d794bSakpm@linux-foundation.org {
128ca79b0c2SArun KS 	return totalram_pages() / 2;
129680d794bSakpm@linux-foundation.org }
130680d794bSakpm@linux-foundation.org 
131680d794bSakpm@linux-foundation.org static unsigned long shmem_default_max_inodes(void)
132680d794bSakpm@linux-foundation.org {
133ca79b0c2SArun KS 	unsigned long nr_pages = totalram_pages();
134ca79b0c2SArun KS 
135ca79b0c2SArun KS 	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
136680d794bSakpm@linux-foundation.org }
137b76db735SAndrew Morton #endif
138680d794bSakpm@linux-foundation.org 
139da08e9b7SMatthew Wilcox (Oracle) static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
140da08e9b7SMatthew Wilcox (Oracle) 			     struct folio **foliop, enum sgp_type sgp,
141c5bf121eSVineeth Remanan Pillai 			     gfp_t gfp, struct vm_area_struct *vma,
142c5bf121eSVineeth Remanan Pillai 			     vm_fault_t *fault_type);
1431da177e4SLinus Torvalds 
1441da177e4SLinus Torvalds static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
1451da177e4SLinus Torvalds {
1461da177e4SLinus Torvalds 	return sb->s_fs_info;
1471da177e4SLinus Torvalds }
1481da177e4SLinus Torvalds 
1491da177e4SLinus Torvalds /*
1501da177e4SLinus Torvalds  * shmem_file_setup pre-accounts the whole fixed size of a VM object,
1511da177e4SLinus Torvalds  * for shared memory and for shared anonymous (/dev/zero) mappings
1521da177e4SLinus Torvalds  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
1531da177e4SLinus Torvalds  * consistent with the pre-accounting of private mappings ...
1541da177e4SLinus Torvalds  */
1551da177e4SLinus Torvalds static inline int shmem_acct_size(unsigned long flags, loff_t size)
1561da177e4SLinus Torvalds {
1570b0a0806SHugh Dickins 	return (flags & VM_NORESERVE) ?
158191c5424SAl Viro 		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
1591da177e4SLinus Torvalds }
1601da177e4SLinus Torvalds 
1611da177e4SLinus Torvalds static inline void shmem_unacct_size(unsigned long flags, loff_t size)
1621da177e4SLinus Torvalds {
1630b0a0806SHugh Dickins 	if (!(flags & VM_NORESERVE))
1641da177e4SLinus Torvalds 		vm_unacct_memory(VM_ACCT(size));
1651da177e4SLinus Torvalds }
1661da177e4SLinus Torvalds 
16777142517SKonstantin Khlebnikov static inline int shmem_reacct_size(unsigned long flags,
16877142517SKonstantin Khlebnikov 		loff_t oldsize, loff_t newsize)
16977142517SKonstantin Khlebnikov {
17077142517SKonstantin Khlebnikov 	if (!(flags & VM_NORESERVE)) {
17177142517SKonstantin Khlebnikov 		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
17277142517SKonstantin Khlebnikov 			return security_vm_enough_memory_mm(current->mm,
17377142517SKonstantin Khlebnikov 					VM_ACCT(newsize) - VM_ACCT(oldsize));
17477142517SKonstantin Khlebnikov 		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
17577142517SKonstantin Khlebnikov 			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
17677142517SKonstantin Khlebnikov 	}
17777142517SKonstantin Khlebnikov 	return 0;
17877142517SKonstantin Khlebnikov }
17977142517SKonstantin Khlebnikov 
1801da177e4SLinus Torvalds /*
1811da177e4SLinus Torvalds  * ... whereas tmpfs objects are accounted incrementally as
18275edd345SHugh Dickins  * pages are allocated, in order to allow large sparse files.
183923e2f0eSMatthew Wilcox (Oracle)  * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
1841da177e4SLinus Torvalds  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
1851da177e4SLinus Torvalds  */
186800d8c63SKirill A. Shutemov static inline int shmem_acct_block(unsigned long flags, long pages)
1871da177e4SLinus Torvalds {
188800d8c63SKirill A. Shutemov 	if (!(flags & VM_NORESERVE))
189800d8c63SKirill A. Shutemov 		return 0;
190800d8c63SKirill A. Shutemov 
191800d8c63SKirill A. Shutemov 	return security_vm_enough_memory_mm(current->mm,
192800d8c63SKirill A. Shutemov 			pages * VM_ACCT(PAGE_SIZE));
1931da177e4SLinus Torvalds }
1941da177e4SLinus Torvalds 
1951da177e4SLinus Torvalds static inline void shmem_unacct_blocks(unsigned long flags, long pages)
1961da177e4SLinus Torvalds {
1970b0a0806SHugh Dickins 	if (flags & VM_NORESERVE)
19809cbfeafSKirill A. Shutemov 		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
1991da177e4SLinus Torvalds }
2001da177e4SLinus Torvalds 
2010f079694SMike Rapoport static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
2020f079694SMike Rapoport {
2030f079694SMike Rapoport 	struct shmem_inode_info *info = SHMEM_I(inode);
2040f079694SMike Rapoport 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2050f079694SMike Rapoport 
2060f079694SMike Rapoport 	if (shmem_acct_block(info->flags, pages))
2070f079694SMike Rapoport 		return false;
2080f079694SMike Rapoport 
2090f079694SMike Rapoport 	if (sbinfo->max_blocks) {
2100f079694SMike Rapoport 		if (percpu_counter_compare(&sbinfo->used_blocks,
2110f079694SMike Rapoport 					   sbinfo->max_blocks - pages) > 0)
2120f079694SMike Rapoport 			goto unacct;
2130f079694SMike Rapoport 		percpu_counter_add(&sbinfo->used_blocks, pages);
2140f079694SMike Rapoport 	}
2150f079694SMike Rapoport 
2160f079694SMike Rapoport 	return true;
2170f079694SMike Rapoport 
2180f079694SMike Rapoport unacct:
2190f079694SMike Rapoport 	shmem_unacct_blocks(info->flags, pages);
2200f079694SMike Rapoport 	return false;
2210f079694SMike Rapoport }
2220f079694SMike Rapoport 
2230f079694SMike Rapoport static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
2240f079694SMike Rapoport {
2250f079694SMike Rapoport 	struct shmem_inode_info *info = SHMEM_I(inode);
2260f079694SMike Rapoport 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2270f079694SMike Rapoport 
2280f079694SMike Rapoport 	if (sbinfo->max_blocks)
2290f079694SMike Rapoport 		percpu_counter_sub(&sbinfo->used_blocks, pages);
2300f079694SMike Rapoport 	shmem_unacct_blocks(info->flags, pages);
2310f079694SMike Rapoport }
2320f079694SMike Rapoport 
233759b9775SHugh Dickins static const struct super_operations shmem_ops;
23430e6a51dSHui Su const struct address_space_operations shmem_aops;
23515ad7cdcSHelge Deller static const struct file_operations shmem_file_operations;
23692e1d5beSArjan van de Ven static const struct inode_operations shmem_inode_operations;
23792e1d5beSArjan van de Ven static const struct inode_operations shmem_dir_inode_operations;
23892e1d5beSArjan van de Ven static const struct inode_operations shmem_special_inode_operations;
239f0f37e2fSAlexey Dobriyan static const struct vm_operations_struct shmem_vm_ops;
240d09e8ca6SPasha Tatashin static const struct vm_operations_struct shmem_anon_vm_ops;
241779750d2SKirill A. Shutemov static struct file_system_type shmem_fs_type;
2421da177e4SLinus Torvalds 
243d09e8ca6SPasha Tatashin bool vma_is_anon_shmem(struct vm_area_struct *vma)
244d09e8ca6SPasha Tatashin {
245d09e8ca6SPasha Tatashin 	return vma->vm_ops == &shmem_anon_vm_ops;
246d09e8ca6SPasha Tatashin }
247d09e8ca6SPasha Tatashin 
248b0506e48SMike Rapoport bool vma_is_shmem(struct vm_area_struct *vma)
249b0506e48SMike Rapoport {
250d09e8ca6SPasha Tatashin 	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
251b0506e48SMike Rapoport }
252b0506e48SMike Rapoport 
2531da177e4SLinus Torvalds static LIST_HEAD(shmem_swaplist);
254cb5f7b9aSHugh Dickins static DEFINE_MUTEX(shmem_swaplist_mutex);
2551da177e4SLinus Torvalds 
256e809d5f0SChris Down /*
257e809d5f0SChris Down  * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
258e809d5f0SChris Down  * produces a novel ino for the newly allocated inode.
259e809d5f0SChris Down  *
260e809d5f0SChris Down  * It may also be called when making a hard link to permit the space needed by
261e809d5f0SChris Down  * each dentry. However, in that case, no new inode number is needed since that
262e809d5f0SChris Down  * internally draws from another pool of inode numbers (currently global
263e809d5f0SChris Down  * get_next_ino()). This case is indicated by passing NULL as inop.
264e809d5f0SChris Down  */
265e809d5f0SChris Down #define SHMEM_INO_BATCH 1024
266e809d5f0SChris Down static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
2675b04c689SPavel Emelyanov {
2685b04c689SPavel Emelyanov 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
269e809d5f0SChris Down 	ino_t ino;
270e809d5f0SChris Down 
271e809d5f0SChris Down 	if (!(sb->s_flags & SB_KERNMOUNT)) {
272bf11b9a8SSebastian Andrzej Siewior 		raw_spin_lock(&sbinfo->stat_lock);
273bb3e96d6SByron Stanoszek 		if (sbinfo->max_inodes) {
2745b04c689SPavel Emelyanov 			if (!sbinfo->free_inodes) {
275bf11b9a8SSebastian Andrzej Siewior 				raw_spin_unlock(&sbinfo->stat_lock);
2765b04c689SPavel Emelyanov 				return -ENOSPC;
2775b04c689SPavel Emelyanov 			}
2785b04c689SPavel Emelyanov 			sbinfo->free_inodes--;
279bb3e96d6SByron Stanoszek 		}
280e809d5f0SChris Down 		if (inop) {
281e809d5f0SChris Down 			ino = sbinfo->next_ino++;
282e809d5f0SChris Down 			if (unlikely(is_zero_ino(ino)))
283e809d5f0SChris Down 				ino = sbinfo->next_ino++;
284ea3271f7SChris Down 			if (unlikely(!sbinfo->full_inums &&
285ea3271f7SChris Down 				     ino > UINT_MAX)) {
286e809d5f0SChris Down 				/*
287e809d5f0SChris Down 				 * Emulate get_next_ino uint wraparound for
288e809d5f0SChris Down 				 * compatibility
289e809d5f0SChris Down 				 */
290ea3271f7SChris Down 				if (IS_ENABLED(CONFIG_64BIT))
291ea3271f7SChris Down 					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
292ea3271f7SChris Down 						__func__, MINOR(sb->s_dev));
293ea3271f7SChris Down 				sbinfo->next_ino = 1;
294ea3271f7SChris Down 				ino = sbinfo->next_ino++;
2955b04c689SPavel Emelyanov 			}
296e809d5f0SChris Down 			*inop = ino;
297e809d5f0SChris Down 		}
298bf11b9a8SSebastian Andrzej Siewior 		raw_spin_unlock(&sbinfo->stat_lock);
299e809d5f0SChris Down 	} else if (inop) {
300e809d5f0SChris Down 		/*
301e809d5f0SChris Down 		 * __shmem_file_setup, one of our callers, is lock-free: it
302e809d5f0SChris Down 		 * doesn't hold stat_lock in shmem_reserve_inode since
303e809d5f0SChris Down 		 * max_inodes is always 0, and is called from potentially
304e809d5f0SChris Down 		 * unknown contexts. As such, use a per-cpu batched allocator
305e809d5f0SChris Down 		 * which doesn't require the per-sb stat_lock unless we are at
306e809d5f0SChris Down 		 * the batch boundary.
307ea3271f7SChris Down 		 *
308ea3271f7SChris Down 		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
309ea3271f7SChris Down 		 * shmem mounts are not exposed to userspace, so we don't need
310ea3271f7SChris Down 		 * to worry about things like glibc compatibility.
311e809d5f0SChris Down 		 */
312e809d5f0SChris Down 		ino_t *next_ino;
313bf11b9a8SSebastian Andrzej Siewior 
314e809d5f0SChris Down 		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
315e809d5f0SChris Down 		ino = *next_ino;
316e809d5f0SChris Down 		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
317bf11b9a8SSebastian Andrzej Siewior 			raw_spin_lock(&sbinfo->stat_lock);
318e809d5f0SChris Down 			ino = sbinfo->next_ino;
319e809d5f0SChris Down 			sbinfo->next_ino += SHMEM_INO_BATCH;
320bf11b9a8SSebastian Andrzej Siewior 			raw_spin_unlock(&sbinfo->stat_lock);
321e809d5f0SChris Down 			if (unlikely(is_zero_ino(ino)))
322e809d5f0SChris Down 				ino++;
323e809d5f0SChris Down 		}
324e809d5f0SChris Down 		*inop = ino;
325e809d5f0SChris Down 		*next_ino = ++ino;
326e809d5f0SChris Down 		put_cpu();
327e809d5f0SChris Down 	}
328e809d5f0SChris Down 
3295b04c689SPavel Emelyanov 	return 0;
3305b04c689SPavel Emelyanov }
3315b04c689SPavel Emelyanov 
3325b04c689SPavel Emelyanov static void shmem_free_inode(struct super_block *sb)
3335b04c689SPavel Emelyanov {
3345b04c689SPavel Emelyanov 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3355b04c689SPavel Emelyanov 	if (sbinfo->max_inodes) {
336bf11b9a8SSebastian Andrzej Siewior 		raw_spin_lock(&sbinfo->stat_lock);
3375b04c689SPavel Emelyanov 		sbinfo->free_inodes++;
338bf11b9a8SSebastian Andrzej Siewior 		raw_spin_unlock(&sbinfo->stat_lock);
3395b04c689SPavel Emelyanov 	}
3405b04c689SPavel Emelyanov }
3415b04c689SPavel Emelyanov 
34246711810SRandy Dunlap /**
34341ffe5d5SHugh Dickins  * shmem_recalc_inode - recalculate the block usage of an inode
3441da177e4SLinus Torvalds  * @inode: inode to recalc
3451da177e4SLinus Torvalds  *
3461da177e4SLinus Torvalds  * We have to calculate the free blocks since the mm can drop
3471da177e4SLinus Torvalds  * undirtied hole pages behind our back.
3481da177e4SLinus Torvalds  *
3491da177e4SLinus Torvalds  * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
3501da177e4SLinus Torvalds  * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
3511da177e4SLinus Torvalds  *
3521da177e4SLinus Torvalds  * It has to be called with the spinlock held.
3531da177e4SLinus Torvalds  */
3541da177e4SLinus Torvalds static void shmem_recalc_inode(struct inode *inode)
3551da177e4SLinus Torvalds {
3561da177e4SLinus Torvalds 	struct shmem_inode_info *info = SHMEM_I(inode);
3571da177e4SLinus Torvalds 	long freed;
3581da177e4SLinus Torvalds 
3591da177e4SLinus Torvalds 	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
3601da177e4SLinus Torvalds 	if (freed > 0) {
3611da177e4SLinus Torvalds 		info->alloced -= freed;
36254af6042SHugh Dickins 		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
3630f079694SMike Rapoport 		shmem_inode_unacct_blocks(inode, freed);
3641da177e4SLinus Torvalds 	}
3651da177e4SLinus Torvalds }
3661da177e4SLinus Torvalds 
367800d8c63SKirill A. Shutemov bool shmem_charge(struct inode *inode, long pages)
368800d8c63SKirill A. Shutemov {
369800d8c63SKirill A. Shutemov 	struct shmem_inode_info *info = SHMEM_I(inode);
3704595ef88SKirill A. Shutemov 	unsigned long flags;
371800d8c63SKirill A. Shutemov 
3720f079694SMike Rapoport 	if (!shmem_inode_acct_block(inode, pages))
373800d8c63SKirill A. Shutemov 		return false;
374b1cc94abSMike Rapoport 
375aaa52e34SHugh Dickins 	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
376aaa52e34SHugh Dickins 	inode->i_mapping->nrpages += pages;
377aaa52e34SHugh Dickins 
3784595ef88SKirill A. Shutemov 	spin_lock_irqsave(&info->lock, flags);
379800d8c63SKirill A. Shutemov 	info->alloced += pages;
380800d8c63SKirill A. Shutemov 	inode->i_blocks += pages * BLOCKS_PER_PAGE;
381800d8c63SKirill A. Shutemov 	shmem_recalc_inode(inode);
3824595ef88SKirill A. Shutemov 	spin_unlock_irqrestore(&info->lock, flags);
383800d8c63SKirill A. Shutemov 
384800d8c63SKirill A. Shutemov 	return true;
385800d8c63SKirill A. Shutemov }
386800d8c63SKirill A. Shutemov 
387800d8c63SKirill A. Shutemov void shmem_uncharge(struct inode *inode, long pages)
388800d8c63SKirill A. Shutemov {
389800d8c63SKirill A. Shutemov 	struct shmem_inode_info *info = SHMEM_I(inode);
3904595ef88SKirill A. Shutemov 	unsigned long flags;
391800d8c63SKirill A. Shutemov 
3926ffcd825SMatthew Wilcox (Oracle) 	/* nrpages adjustment done by __filemap_remove_folio() or caller */
393aaa52e34SHugh Dickins 
3944595ef88SKirill A. Shutemov 	spin_lock_irqsave(&info->lock, flags);
395800d8c63SKirill A. Shutemov 	info->alloced -= pages;
396800d8c63SKirill A. Shutemov 	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
397800d8c63SKirill A. Shutemov 	shmem_recalc_inode(inode);
3984595ef88SKirill A. Shutemov 	spin_unlock_irqrestore(&info->lock, flags);
399800d8c63SKirill A. Shutemov 
4000f079694SMike Rapoport 	shmem_inode_unacct_blocks(inode, pages);
401800d8c63SKirill A. Shutemov }
402800d8c63SKirill A. Shutemov 
4037a5d0fbbSHugh Dickins /*
40462f945b6SMatthew Wilcox  * Replace item expected in xarray by a new item, while holding xa_lock.
4057a5d0fbbSHugh Dickins  */
40662f945b6SMatthew Wilcox static int shmem_replace_entry(struct address_space *mapping,
4077a5d0fbbSHugh Dickins 			pgoff_t index, void *expected, void *replacement)
4087a5d0fbbSHugh Dickins {
40962f945b6SMatthew Wilcox 	XA_STATE(xas, &mapping->i_pages, index);
4106dbaf22cSJohannes Weiner 	void *item;
4117a5d0fbbSHugh Dickins 
4127a5d0fbbSHugh Dickins 	VM_BUG_ON(!expected);
4136dbaf22cSJohannes Weiner 	VM_BUG_ON(!replacement);
41462f945b6SMatthew Wilcox 	item = xas_load(&xas);
4157a5d0fbbSHugh Dickins 	if (item != expected)
4167a5d0fbbSHugh Dickins 		return -ENOENT;
41762f945b6SMatthew Wilcox 	xas_store(&xas, replacement);
4187a5d0fbbSHugh Dickins 	return 0;
4197a5d0fbbSHugh Dickins }
4207a5d0fbbSHugh Dickins 
4217a5d0fbbSHugh Dickins /*
422d1899228SHugh Dickins  * Sometimes, before we decide whether to proceed or to fail, we must check
423d1899228SHugh Dickins  * that an entry was not already brought back from swap by a racing thread.
424d1899228SHugh Dickins  *
425d1899228SHugh Dickins  * Checking page is not enough: by the time a SwapCache page is locked, it
426d1899228SHugh Dickins  * might be reused, and again be SwapCache, using the same swap as before.
427d1899228SHugh Dickins  */
428d1899228SHugh Dickins static bool shmem_confirm_swap(struct address_space *mapping,
429d1899228SHugh Dickins 			       pgoff_t index, swp_entry_t swap)
430d1899228SHugh Dickins {
431a12831bfSMatthew Wilcox 	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
432d1899228SHugh Dickins }
433d1899228SHugh Dickins 
434d1899228SHugh Dickins /*
4355a6e75f8SKirill A. Shutemov  * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
4365a6e75f8SKirill A. Shutemov  *
4375a6e75f8SKirill A. Shutemov  * SHMEM_HUGE_NEVER:
4385a6e75f8SKirill A. Shutemov  *	disables huge pages for the mount;
4395a6e75f8SKirill A. Shutemov  * SHMEM_HUGE_ALWAYS:
4405a6e75f8SKirill A. Shutemov  *	enables huge pages for the mount;
4415a6e75f8SKirill A. Shutemov  * SHMEM_HUGE_WITHIN_SIZE:
4425a6e75f8SKirill A. Shutemov  *	only allocate huge pages if the page will be fully within i_size,
4435a6e75f8SKirill A. Shutemov  *	also respect fadvise()/madvise() hints;
4445a6e75f8SKirill A. Shutemov  * SHMEM_HUGE_ADVISE:
4455a6e75f8SKirill A. Shutemov  *	only allocate huge pages if requested with fadvise()/madvise();
4465a6e75f8SKirill A. Shutemov  */
4475a6e75f8SKirill A. Shutemov 
4485a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_NEVER	0
4495a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_ALWAYS	1
4505a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_WITHIN_SIZE	2
4515a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_ADVISE	3
4525a6e75f8SKirill A. Shutemov 
4535a6e75f8SKirill A. Shutemov /*
4545a6e75f8SKirill A. Shutemov  * Special values.
4555a6e75f8SKirill A. Shutemov  * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
4565a6e75f8SKirill A. Shutemov  *
4575a6e75f8SKirill A. Shutemov  * SHMEM_HUGE_DENY:
4585a6e75f8SKirill A. Shutemov  *	disables huge on shm_mnt and all mounts, for emergency use;
4595a6e75f8SKirill A. Shutemov  * SHMEM_HUGE_FORCE:
4605a6e75f8SKirill A. Shutemov  *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
4615a6e75f8SKirill A. Shutemov  *
4625a6e75f8SKirill A. Shutemov  */
4635a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_DENY		(-1)
4645a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_FORCE	(-2)
4655a6e75f8SKirill A. Shutemov 
466396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4675a6e75f8SKirill A. Shutemov /* ifdef here to avoid bloating shmem.o when not necessary */
4685a6e75f8SKirill A. Shutemov 
4695e6e5a12SHugh Dickins static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
4705a6e75f8SKirill A. Shutemov 
4717c6c6cc4SZach O'Keefe bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
4727c6c6cc4SZach O'Keefe 		   pgoff_t index, bool shmem_huge_force)
473c852023eSHugh Dickins {
474c852023eSHugh Dickins 	loff_t i_size;
475c852023eSHugh Dickins 
476f7cd16a5SXavier Roche 	if (!S_ISREG(inode->i_mode))
477f7cd16a5SXavier Roche 		return false;
4785e6e5a12SHugh Dickins 	if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
4795e6e5a12SHugh Dickins 	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
480c852023eSHugh Dickins 		return false;
4817c6c6cc4SZach O'Keefe 	if (shmem_huge == SHMEM_HUGE_DENY)
4827c6c6cc4SZach O'Keefe 		return false;
4833de0c269SZach O'Keefe 	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
4843de0c269SZach O'Keefe 		return true;
4855e6e5a12SHugh Dickins 
4865e6e5a12SHugh Dickins 	switch (SHMEM_SB(inode->i_sb)->huge) {
487c852023eSHugh Dickins 	case SHMEM_HUGE_ALWAYS:
488c852023eSHugh Dickins 		return true;
489c852023eSHugh Dickins 	case SHMEM_HUGE_WITHIN_SIZE:
490de6ee659SLiu Yuntao 		index = round_up(index + 1, HPAGE_PMD_NR);
491c852023eSHugh Dickins 		i_size = round_up(i_size_read(inode), PAGE_SIZE);
492de6ee659SLiu Yuntao 		if (i_size >> PAGE_SHIFT >= index)
493c852023eSHugh Dickins 			return true;
494c852023eSHugh Dickins 		fallthrough;
495c852023eSHugh Dickins 	case SHMEM_HUGE_ADVISE:
4965e6e5a12SHugh Dickins 		if (vma && (vma->vm_flags & VM_HUGEPAGE))
4975e6e5a12SHugh Dickins 			return true;
4985e6e5a12SHugh Dickins 		fallthrough;
499c852023eSHugh Dickins 	default:
500c852023eSHugh Dickins 		return false;
501c852023eSHugh Dickins 	}
502c852023eSHugh Dickins }
5035a6e75f8SKirill A. Shutemov 
504e5f2249aSArnd Bergmann #if defined(CONFIG_SYSFS)
5055a6e75f8SKirill A. Shutemov static int shmem_parse_huge(const char *str)
5065a6e75f8SKirill A. Shutemov {
5075a6e75f8SKirill A. Shutemov 	if (!strcmp(str, "never"))
5085a6e75f8SKirill A. Shutemov 		return SHMEM_HUGE_NEVER;
5095a6e75f8SKirill A. Shutemov 	if (!strcmp(str, "always"))
5105a6e75f8SKirill A. Shutemov 		return SHMEM_HUGE_ALWAYS;
5115a6e75f8SKirill A. Shutemov 	if (!strcmp(str, "within_size"))
5125a6e75f8SKirill A. Shutemov 		return SHMEM_HUGE_WITHIN_SIZE;
5135a6e75f8SKirill A. Shutemov 	if (!strcmp(str, "advise"))
5145a6e75f8SKirill A. Shutemov 		return SHMEM_HUGE_ADVISE;
5155a6e75f8SKirill A. Shutemov 	if (!strcmp(str, "deny"))
5165a6e75f8SKirill A. Shutemov 		return SHMEM_HUGE_DENY;
5175a6e75f8SKirill A. Shutemov 	if (!strcmp(str, "force"))
5185a6e75f8SKirill A. Shutemov 		return SHMEM_HUGE_FORCE;
5195a6e75f8SKirill A. Shutemov 	return -EINVAL;
5205a6e75f8SKirill A. Shutemov }
521e5f2249aSArnd Bergmann #endif
5225a6e75f8SKirill A. Shutemov 
523e5f2249aSArnd Bergmann #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
5245a6e75f8SKirill A. Shutemov static const char *shmem_format_huge(int huge)
5255a6e75f8SKirill A. Shutemov {
5265a6e75f8SKirill A. Shutemov 	switch (huge) {
5275a6e75f8SKirill A. Shutemov 	case SHMEM_HUGE_NEVER:
5285a6e75f8SKirill A. Shutemov 		return "never";
5295a6e75f8SKirill A. Shutemov 	case SHMEM_HUGE_ALWAYS:
5305a6e75f8SKirill A. Shutemov 		return "always";
5315a6e75f8SKirill A. Shutemov 	case SHMEM_HUGE_WITHIN_SIZE:
5325a6e75f8SKirill A. Shutemov 		return "within_size";
5335a6e75f8SKirill A. Shutemov 	case SHMEM_HUGE_ADVISE:
5345a6e75f8SKirill A. Shutemov 		return "advise";
5355a6e75f8SKirill A. Shutemov 	case SHMEM_HUGE_DENY:
5365a6e75f8SKirill A. Shutemov 		return "deny";
5375a6e75f8SKirill A. Shutemov 	case SHMEM_HUGE_FORCE:
5385a6e75f8SKirill A. Shutemov 		return "force";
5395a6e75f8SKirill A. Shutemov 	default:
5405a6e75f8SKirill A. Shutemov 		VM_BUG_ON(1);
5415a6e75f8SKirill A. Shutemov 		return "bad_val";
5425a6e75f8SKirill A. Shutemov 	}
5435a6e75f8SKirill A. Shutemov }
544f1f5929cSJérémy Lefaure #endif
5455a6e75f8SKirill A. Shutemov 
546779750d2SKirill A. Shutemov static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
547779750d2SKirill A. Shutemov 		struct shrink_control *sc, unsigned long nr_to_split)
548779750d2SKirill A. Shutemov {
549779750d2SKirill A. Shutemov 	LIST_HEAD(list), *pos, *next;
550253fd0f0SKirill A. Shutemov 	LIST_HEAD(to_remove);
551779750d2SKirill A. Shutemov 	struct inode *inode;
552779750d2SKirill A. Shutemov 	struct shmem_inode_info *info;
55305624571SMatthew Wilcox (Oracle) 	struct folio *folio;
554779750d2SKirill A. Shutemov 	unsigned long batch = sc ? sc->nr_to_scan : 128;
55562c9827cSGang Li 	int split = 0;
556779750d2SKirill A. Shutemov 
557779750d2SKirill A. Shutemov 	if (list_empty(&sbinfo->shrinklist))
558779750d2SKirill A. Shutemov 		return SHRINK_STOP;
559779750d2SKirill A. Shutemov 
560779750d2SKirill A. Shutemov 	spin_lock(&sbinfo->shrinklist_lock);
561779750d2SKirill A. Shutemov 	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
562779750d2SKirill A. Shutemov 		info = list_entry(pos, struct shmem_inode_info, shrinklist);
563779750d2SKirill A. Shutemov 
564779750d2SKirill A. Shutemov 		/* pin the inode */
565779750d2SKirill A. Shutemov 		inode = igrab(&info->vfs_inode);
566779750d2SKirill A. Shutemov 
567779750d2SKirill A. Shutemov 		/* inode is about to be evicted */
568779750d2SKirill A. Shutemov 		if (!inode) {
569779750d2SKirill A. Shutemov 			list_del_init(&info->shrinklist);
570779750d2SKirill A. Shutemov 			goto next;
571779750d2SKirill A. Shutemov 		}
572779750d2SKirill A. Shutemov 
573779750d2SKirill A. Shutemov 		/* Check if there's anything to gain */
574779750d2SKirill A. Shutemov 		if (round_up(inode->i_size, PAGE_SIZE) ==
575779750d2SKirill A. Shutemov 				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
576253fd0f0SKirill A. Shutemov 			list_move(&info->shrinklist, &to_remove);
577779750d2SKirill A. Shutemov 			goto next;
578779750d2SKirill A. Shutemov 		}
579779750d2SKirill A. Shutemov 
580779750d2SKirill A. Shutemov 		list_move(&info->shrinklist, &list);
581779750d2SKirill A. Shutemov next:
58262c9827cSGang Li 		sbinfo->shrinklist_len--;
583779750d2SKirill A. Shutemov 		if (!--batch)
584779750d2SKirill A. Shutemov 			break;
585779750d2SKirill A. Shutemov 	}
586779750d2SKirill A. Shutemov 	spin_unlock(&sbinfo->shrinklist_lock);
587779750d2SKirill A. Shutemov 
588253fd0f0SKirill A. Shutemov 	list_for_each_safe(pos, next, &to_remove) {
589253fd0f0SKirill A. Shutemov 		info = list_entry(pos, struct shmem_inode_info, shrinklist);
590253fd0f0SKirill A. Shutemov 		inode = &info->vfs_inode;
591253fd0f0SKirill A. Shutemov 		list_del_init(&info->shrinklist);
592253fd0f0SKirill A. Shutemov 		iput(inode);
593253fd0f0SKirill A. Shutemov 	}
594253fd0f0SKirill A. Shutemov 
595779750d2SKirill A. Shutemov 	list_for_each_safe(pos, next, &list) {
596779750d2SKirill A. Shutemov 		int ret;
59705624571SMatthew Wilcox (Oracle) 		pgoff_t index;
598779750d2SKirill A. Shutemov 
599779750d2SKirill A. Shutemov 		info = list_entry(pos, struct shmem_inode_info, shrinklist);
600779750d2SKirill A. Shutemov 		inode = &info->vfs_inode;
601779750d2SKirill A. Shutemov 
602b3cd54b2SKirill A. Shutemov 		if (nr_to_split && split >= nr_to_split)
60362c9827cSGang Li 			goto move_back;
604779750d2SKirill A. Shutemov 
60505624571SMatthew Wilcox (Oracle) 		index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
60605624571SMatthew Wilcox (Oracle) 		folio = filemap_get_folio(inode->i_mapping, index);
60705624571SMatthew Wilcox (Oracle) 		if (!folio)
608779750d2SKirill A. Shutemov 			goto drop;
609779750d2SKirill A. Shutemov 
610b3cd54b2SKirill A. Shutemov 		/* No huge page at the end of the file: nothing to split */
61105624571SMatthew Wilcox (Oracle) 		if (!folio_test_large(folio)) {
61205624571SMatthew Wilcox (Oracle) 			folio_put(folio);
613779750d2SKirill A. Shutemov 			goto drop;
614779750d2SKirill A. Shutemov 		}
615779750d2SKirill A. Shutemov 
616b3cd54b2SKirill A. Shutemov 		/*
61762c9827cSGang Li 		 * Move the inode on the list back to shrinklist if we failed
61862c9827cSGang Li 		 * to lock the page at this time.
619b3cd54b2SKirill A. Shutemov 		 *
620b3cd54b2SKirill A. Shutemov 		 * Waiting for the lock may lead to deadlock in the
621b3cd54b2SKirill A. Shutemov 		 * reclaim path.
622b3cd54b2SKirill A. Shutemov 		 */
62305624571SMatthew Wilcox (Oracle) 		if (!folio_trylock(folio)) {
62405624571SMatthew Wilcox (Oracle) 			folio_put(folio);
62562c9827cSGang Li 			goto move_back;
626b3cd54b2SKirill A. Shutemov 		}
627b3cd54b2SKirill A. Shutemov 
628d788f5b3SMatthew Wilcox (Oracle) 		ret = split_folio(folio);
62905624571SMatthew Wilcox (Oracle) 		folio_unlock(folio);
63005624571SMatthew Wilcox (Oracle) 		folio_put(folio);
631779750d2SKirill A. Shutemov 
63262c9827cSGang Li 		/* If split failed move the inode on the list back to shrinklist */
633b3cd54b2SKirill A. Shutemov 		if (ret)
63462c9827cSGang Li 			goto move_back;
635779750d2SKirill A. Shutemov 
636779750d2SKirill A. Shutemov 		split++;
637779750d2SKirill A. Shutemov drop:
638779750d2SKirill A. Shutemov 		list_del_init(&info->shrinklist);
63962c9827cSGang Li 		goto put;
64062c9827cSGang Li move_back:
64162c9827cSGang Li 		/*
64262c9827cSGang Li 		 * Make sure the inode is either on the global list or deleted
64362c9827cSGang Li 		 * from any local list before iput() since it could be deleted
64462c9827cSGang Li 		 * in another thread once we put the inode (then the local list
64562c9827cSGang Li 		 * is corrupted).
64662c9827cSGang Li 		 */
64762c9827cSGang Li 		spin_lock(&sbinfo->shrinklist_lock);
64862c9827cSGang Li 		list_move(&info->shrinklist, &sbinfo->shrinklist);
64962c9827cSGang Li 		sbinfo->shrinklist_len++;
65062c9827cSGang Li 		spin_unlock(&sbinfo->shrinklist_lock);
65162c9827cSGang Li put:
652779750d2SKirill A. Shutemov 		iput(inode);
653779750d2SKirill A. Shutemov 	}
654779750d2SKirill A. Shutemov 
655779750d2SKirill A. Shutemov 	return split;
656779750d2SKirill A. Shutemov }
657779750d2SKirill A. Shutemov 
658779750d2SKirill A. Shutemov static long shmem_unused_huge_scan(struct super_block *sb,
659779750d2SKirill A. Shutemov 		struct shrink_control *sc)
660779750d2SKirill A. Shutemov {
661779750d2SKirill A. Shutemov 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
662779750d2SKirill A. Shutemov 
663779750d2SKirill A. Shutemov 	if (!READ_ONCE(sbinfo->shrinklist_len))
664779750d2SKirill A. Shutemov 		return SHRINK_STOP;
665779750d2SKirill A. Shutemov 
666779750d2SKirill A. Shutemov 	return shmem_unused_huge_shrink(sbinfo, sc, 0);
667779750d2SKirill A. Shutemov }
668779750d2SKirill A. Shutemov 
669779750d2SKirill A. Shutemov static long shmem_unused_huge_count(struct super_block *sb,
670779750d2SKirill A. Shutemov 		struct shrink_control *sc)
671779750d2SKirill A. Shutemov {
672779750d2SKirill A. Shutemov 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
673779750d2SKirill A. Shutemov 	return READ_ONCE(sbinfo->shrinklist_len);
674779750d2SKirill A. Shutemov }
675396bcc52SMatthew Wilcox (Oracle) #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
6765a6e75f8SKirill A. Shutemov 
6775a6e75f8SKirill A. Shutemov #define shmem_huge SHMEM_HUGE_DENY
6785a6e75f8SKirill A. Shutemov 
6797c6c6cc4SZach O'Keefe bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
6807c6c6cc4SZach O'Keefe 		   pgoff_t index, bool shmem_huge_force)
6815e6e5a12SHugh Dickins {
6825e6e5a12SHugh Dickins 	return false;
6835e6e5a12SHugh Dickins }
6845e6e5a12SHugh Dickins 
685779750d2SKirill A. Shutemov static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
686779750d2SKirill A. Shutemov 		struct shrink_control *sc, unsigned long nr_to_split)
687779750d2SKirill A. Shutemov {
688779750d2SKirill A. Shutemov 	return 0;
689779750d2SKirill A. Shutemov }
690396bcc52SMatthew Wilcox (Oracle) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
6915a6e75f8SKirill A. Shutemov 
6925a6e75f8SKirill A. Shutemov /*
6932bb876b5SMatthew Wilcox (Oracle)  * Like filemap_add_folio, but error if expected item has gone.
69446f65ec1SHugh Dickins  */
695b7dd44a1SMatthew Wilcox (Oracle) static int shmem_add_to_page_cache(struct folio *folio,
69646f65ec1SHugh Dickins 				   struct address_space *mapping,
6973fea5a49SJohannes Weiner 				   pgoff_t index, void *expected, gfp_t gfp,
6983fea5a49SJohannes Weiner 				   struct mm_struct *charge_mm)
69946f65ec1SHugh Dickins {
700b7dd44a1SMatthew Wilcox (Oracle) 	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
701b7dd44a1SMatthew Wilcox (Oracle) 	long nr = folio_nr_pages(folio);
7023fea5a49SJohannes Weiner 	int error;
70346f65ec1SHugh Dickins 
704b7dd44a1SMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
705b7dd44a1SMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
706b7dd44a1SMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
707b7dd44a1SMatthew Wilcox (Oracle) 	VM_BUG_ON(expected && folio_test_large(folio));
70846f65ec1SHugh Dickins 
709b7dd44a1SMatthew Wilcox (Oracle) 	folio_ref_add(folio, nr);
710b7dd44a1SMatthew Wilcox (Oracle) 	folio->mapping = mapping;
711b7dd44a1SMatthew Wilcox (Oracle) 	folio->index = index;
71246f65ec1SHugh Dickins 
713b7dd44a1SMatthew Wilcox (Oracle) 	if (!folio_test_swapcache(folio)) {
714b7dd44a1SMatthew Wilcox (Oracle) 		error = mem_cgroup_charge(folio, charge_mm, gfp);
7153fea5a49SJohannes Weiner 		if (error) {
716b7dd44a1SMatthew Wilcox (Oracle) 			if (folio_test_pmd_mappable(folio)) {
7173fea5a49SJohannes Weiner 				count_vm_event(THP_FILE_FALLBACK);
7183fea5a49SJohannes Weiner 				count_vm_event(THP_FILE_FALLBACK_CHARGE);
7193fea5a49SJohannes Weiner 			}
7203fea5a49SJohannes Weiner 			goto error;
7213fea5a49SJohannes Weiner 		}
7224c6355b2SJohannes Weiner 	}
723b7dd44a1SMatthew Wilcox (Oracle) 	folio_throttle_swaprate(folio, gfp);
7243fea5a49SJohannes Weiner 
725552446a4SMatthew Wilcox 	do {
726552446a4SMatthew Wilcox 		xas_lock_irq(&xas);
7276b24ca4aSMatthew Wilcox (Oracle) 		if (expected != xas_find_conflict(&xas)) {
728552446a4SMatthew Wilcox 			xas_set_err(&xas, -EEXIST);
7296b24ca4aSMatthew Wilcox (Oracle) 			goto unlock;
7306b24ca4aSMatthew Wilcox (Oracle) 		}
7316b24ca4aSMatthew Wilcox (Oracle) 		if (expected && xas_find_conflict(&xas)) {
7326b24ca4aSMatthew Wilcox (Oracle) 			xas_set_err(&xas, -EEXIST);
7336b24ca4aSMatthew Wilcox (Oracle) 			goto unlock;
7346b24ca4aSMatthew Wilcox (Oracle) 		}
735b7dd44a1SMatthew Wilcox (Oracle) 		xas_store(&xas, folio);
736552446a4SMatthew Wilcox 		if (xas_error(&xas))
737552446a4SMatthew Wilcox 			goto unlock;
738b7dd44a1SMatthew Wilcox (Oracle) 		if (folio_test_pmd_mappable(folio)) {
739800d8c63SKirill A. Shutemov 			count_vm_event(THP_FILE_ALLOC);
740b7dd44a1SMatthew Wilcox (Oracle) 			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
741552446a4SMatthew Wilcox 		}
742552446a4SMatthew Wilcox 		mapping->nrpages += nr;
743b7dd44a1SMatthew Wilcox (Oracle) 		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
744b7dd44a1SMatthew Wilcox (Oracle) 		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
745552446a4SMatthew Wilcox unlock:
746552446a4SMatthew Wilcox 		xas_unlock_irq(&xas);
747552446a4SMatthew Wilcox 	} while (xas_nomem(&xas, gfp));
748552446a4SMatthew Wilcox 
749552446a4SMatthew Wilcox 	if (xas_error(&xas)) {
7503fea5a49SJohannes Weiner 		error = xas_error(&xas);
7513fea5a49SJohannes Weiner 		goto error;
75246f65ec1SHugh Dickins 	}
753552446a4SMatthew Wilcox 
754552446a4SMatthew Wilcox 	return 0;
7553fea5a49SJohannes Weiner error:
756b7dd44a1SMatthew Wilcox (Oracle) 	folio->mapping = NULL;
757b7dd44a1SMatthew Wilcox (Oracle) 	folio_ref_sub(folio, nr);
7583fea5a49SJohannes Weiner 	return error;
75946f65ec1SHugh Dickins }
76046f65ec1SHugh Dickins 
76146f65ec1SHugh Dickins /*
7624cd400fdSMatthew Wilcox (Oracle)  * Like delete_from_page_cache, but substitutes swap for @folio.
7636922c0c7SHugh Dickins  */
7644cd400fdSMatthew Wilcox (Oracle) static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
7656922c0c7SHugh Dickins {
7664cd400fdSMatthew Wilcox (Oracle) 	struct address_space *mapping = folio->mapping;
7674cd400fdSMatthew Wilcox (Oracle) 	long nr = folio_nr_pages(folio);
7686922c0c7SHugh Dickins 	int error;
7696922c0c7SHugh Dickins 
770b93b0163SMatthew Wilcox 	xa_lock_irq(&mapping->i_pages);
7714cd400fdSMatthew Wilcox (Oracle) 	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
7724cd400fdSMatthew Wilcox (Oracle) 	folio->mapping = NULL;
7734cd400fdSMatthew Wilcox (Oracle) 	mapping->nrpages -= nr;
7744cd400fdSMatthew Wilcox (Oracle) 	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
7754cd400fdSMatthew Wilcox (Oracle) 	__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
776b93b0163SMatthew Wilcox 	xa_unlock_irq(&mapping->i_pages);
7774cd400fdSMatthew Wilcox (Oracle) 	folio_put(folio);
7786922c0c7SHugh Dickins 	BUG_ON(error);
7796922c0c7SHugh Dickins }
7806922c0c7SHugh Dickins 
7816922c0c7SHugh Dickins /*
782c121d3bbSMatthew Wilcox  * Remove swap entry from page cache, free the swap and its page cache.
7837a5d0fbbSHugh Dickins  */
7847a5d0fbbSHugh Dickins static int shmem_free_swap(struct address_space *mapping,
7857a5d0fbbSHugh Dickins 			   pgoff_t index, void *radswap)
7867a5d0fbbSHugh Dickins {
7876dbaf22cSJohannes Weiner 	void *old;
7887a5d0fbbSHugh Dickins 
78955f3f7eaSMatthew Wilcox 	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
7906dbaf22cSJohannes Weiner 	if (old != radswap)
7916dbaf22cSJohannes Weiner 		return -ENOENT;
7927a5d0fbbSHugh Dickins 	free_swap_and_cache(radix_to_swp_entry(radswap));
7936dbaf22cSJohannes Weiner 	return 0;
7947a5d0fbbSHugh Dickins }
7957a5d0fbbSHugh Dickins 
7967a5d0fbbSHugh Dickins /*
7976a15a370SVlastimil Babka  * Determine (in bytes) how many of the shmem object's pages mapped by the
79848131e03SVlastimil Babka  * given offsets are swapped out.
7996a15a370SVlastimil Babka  *
8009608703eSJan Kara  * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
8016a15a370SVlastimil Babka  * as long as the inode doesn't go away and racy results are not a problem.
8026a15a370SVlastimil Babka  */
80348131e03SVlastimil Babka unsigned long shmem_partial_swap_usage(struct address_space *mapping,
80448131e03SVlastimil Babka 						pgoff_t start, pgoff_t end)
8056a15a370SVlastimil Babka {
8067ae3424fSMatthew Wilcox 	XA_STATE(xas, &mapping->i_pages, start);
8076a15a370SVlastimil Babka 	struct page *page;
80848131e03SVlastimil Babka 	unsigned long swapped = 0;
8096a15a370SVlastimil Babka 
8106a15a370SVlastimil Babka 	rcu_read_lock();
8117ae3424fSMatthew Wilcox 	xas_for_each(&xas, page, end - 1) {
8127ae3424fSMatthew Wilcox 		if (xas_retry(&xas, page))
8132cf938aaSMatthew Wilcox 			continue;
8143159f943SMatthew Wilcox 		if (xa_is_value(page))
8156a15a370SVlastimil Babka 			swapped++;
8166a15a370SVlastimil Babka 
8176a15a370SVlastimil Babka 		if (need_resched()) {
8187ae3424fSMatthew Wilcox 			xas_pause(&xas);
8196a15a370SVlastimil Babka 			cond_resched_rcu();
8206a15a370SVlastimil Babka 		}
8216a15a370SVlastimil Babka 	}
8226a15a370SVlastimil Babka 
8236a15a370SVlastimil Babka 	rcu_read_unlock();
8246a15a370SVlastimil Babka 
8256a15a370SVlastimil Babka 	return swapped << PAGE_SHIFT;
8266a15a370SVlastimil Babka }
8276a15a370SVlastimil Babka 
8286a15a370SVlastimil Babka /*
82948131e03SVlastimil Babka  * Determine (in bytes) how many of the shmem object's pages mapped by the
83048131e03SVlastimil Babka  * given vma is swapped out.
83148131e03SVlastimil Babka  *
8329608703eSJan Kara  * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
83348131e03SVlastimil Babka  * as long as the inode doesn't go away and racy results are not a problem.
83448131e03SVlastimil Babka  */
83548131e03SVlastimil Babka unsigned long shmem_swap_usage(struct vm_area_struct *vma)
83648131e03SVlastimil Babka {
83748131e03SVlastimil Babka 	struct inode *inode = file_inode(vma->vm_file);
83848131e03SVlastimil Babka 	struct shmem_inode_info *info = SHMEM_I(inode);
83948131e03SVlastimil Babka 	struct address_space *mapping = inode->i_mapping;
84048131e03SVlastimil Babka 	unsigned long swapped;
84148131e03SVlastimil Babka 
84248131e03SVlastimil Babka 	/* Be careful as we don't hold info->lock */
84348131e03SVlastimil Babka 	swapped = READ_ONCE(info->swapped);
84448131e03SVlastimil Babka 
84548131e03SVlastimil Babka 	/*
84648131e03SVlastimil Babka 	 * The easier cases are when the shmem object has nothing in swap, or
84748131e03SVlastimil Babka 	 * the vma maps it whole. Then we can simply use the stats that we
84848131e03SVlastimil Babka 	 * already track.
84948131e03SVlastimil Babka 	 */
85048131e03SVlastimil Babka 	if (!swapped)
85148131e03SVlastimil Babka 		return 0;
85248131e03SVlastimil Babka 
85348131e03SVlastimil Babka 	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
85448131e03SVlastimil Babka 		return swapped << PAGE_SHIFT;
85548131e03SVlastimil Babka 
85648131e03SVlastimil Babka 	/* Here comes the more involved part */
85702399c88SPeter Xu 	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
85802399c88SPeter Xu 					vma->vm_pgoff + vma_pages(vma));
85948131e03SVlastimil Babka }
86048131e03SVlastimil Babka 
86148131e03SVlastimil Babka /*
86224513264SHugh Dickins  * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
86324513264SHugh Dickins  */
86424513264SHugh Dickins void shmem_unlock_mapping(struct address_space *mapping)
86524513264SHugh Dickins {
866105c988fSMatthew Wilcox (Oracle) 	struct folio_batch fbatch;
86724513264SHugh Dickins 	pgoff_t index = 0;
86824513264SHugh Dickins 
869105c988fSMatthew Wilcox (Oracle) 	folio_batch_init(&fbatch);
87024513264SHugh Dickins 	/*
87124513264SHugh Dickins 	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
87224513264SHugh Dickins 	 */
873105c988fSMatthew Wilcox (Oracle) 	while (!mapping_unevictable(mapping) &&
874105c988fSMatthew Wilcox (Oracle) 	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
875105c988fSMatthew Wilcox (Oracle) 		check_move_unevictable_folios(&fbatch);
876105c988fSMatthew Wilcox (Oracle) 		folio_batch_release(&fbatch);
87724513264SHugh Dickins 		cond_resched();
87824513264SHugh Dickins 	}
8797a5d0fbbSHugh Dickins }
8807a5d0fbbSHugh Dickins 
881b9a8a419SMatthew Wilcox (Oracle) static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
88271725ed1SHugh Dickins {
883b9a8a419SMatthew Wilcox (Oracle) 	struct folio *folio;
88471725ed1SHugh Dickins 
885b9a8a419SMatthew Wilcox (Oracle) 	/*
886a7f5862cSMatthew Wilcox (Oracle) 	 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
887b9a8a419SMatthew Wilcox (Oracle) 	 * beyond i_size, and reports fallocated pages as holes.
888b9a8a419SMatthew Wilcox (Oracle) 	 */
889b9a8a419SMatthew Wilcox (Oracle) 	folio = __filemap_get_folio(inode->i_mapping, index,
890b9a8a419SMatthew Wilcox (Oracle) 					FGP_ENTRY | FGP_LOCK, 0);
891b9a8a419SMatthew Wilcox (Oracle) 	if (!xa_is_value(folio))
892b9a8a419SMatthew Wilcox (Oracle) 		return folio;
893b9a8a419SMatthew Wilcox (Oracle) 	/*
894b9a8a419SMatthew Wilcox (Oracle) 	 * But read a page back from swap if any of it is within i_size
895b9a8a419SMatthew Wilcox (Oracle) 	 * (although in some cases this is just a waste of time).
896b9a8a419SMatthew Wilcox (Oracle) 	 */
897a7f5862cSMatthew Wilcox (Oracle) 	folio = NULL;
898a7f5862cSMatthew Wilcox (Oracle) 	shmem_get_folio(inode, index, &folio, SGP_READ);
899a7f5862cSMatthew Wilcox (Oracle) 	return folio;
90071725ed1SHugh Dickins }
90171725ed1SHugh Dickins 
90271725ed1SHugh Dickins /*
9037f4446eeSMatthew Wilcox  * Remove range of pages and swap entries from page cache, and free them.
9041635f6a7SHugh Dickins  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
9057a5d0fbbSHugh Dickins  */
9061635f6a7SHugh Dickins static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
9071635f6a7SHugh Dickins 								 bool unfalloc)
9081da177e4SLinus Torvalds {
909285b2c4fSHugh Dickins 	struct address_space *mapping = inode->i_mapping;
9101da177e4SLinus Torvalds 	struct shmem_inode_info *info = SHMEM_I(inode);
91109cbfeafSKirill A. Shutemov 	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
91209cbfeafSKirill A. Shutemov 	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
9130e499ed3SMatthew Wilcox (Oracle) 	struct folio_batch fbatch;
9147a5d0fbbSHugh Dickins 	pgoff_t indices[PAGEVEC_SIZE];
915b9a8a419SMatthew Wilcox (Oracle) 	struct folio *folio;
916b9a8a419SMatthew Wilcox (Oracle) 	bool same_folio;
9177a5d0fbbSHugh Dickins 	long nr_swaps_freed = 0;
918285b2c4fSHugh Dickins 	pgoff_t index;
919bda97eabSHugh Dickins 	int i;
9201da177e4SLinus Torvalds 
92183e4fa9cSHugh Dickins 	if (lend == -1)
92283e4fa9cSHugh Dickins 		end = -1;	/* unsigned, so actually very big */
923bda97eabSHugh Dickins 
924d144bf62SHugh Dickins 	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
925d144bf62SHugh Dickins 		info->fallocend = start;
926d144bf62SHugh Dickins 
92751dcbdacSMatthew Wilcox (Oracle) 	folio_batch_init(&fbatch);
928bda97eabSHugh Dickins 	index = start;
9293392ca12SVishal Moola (Oracle) 	while (index < end && find_lock_entries(mapping, &index, end - 1,
93051dcbdacSMatthew Wilcox (Oracle) 			&fbatch, indices)) {
93151dcbdacSMatthew Wilcox (Oracle) 		for (i = 0; i < folio_batch_count(&fbatch); i++) {
932b9a8a419SMatthew Wilcox (Oracle) 			folio = fbatch.folios[i];
933bda97eabSHugh Dickins 
9347b774aabSMatthew Wilcox (Oracle) 			if (xa_is_value(folio)) {
9351635f6a7SHugh Dickins 				if (unfalloc)
9361635f6a7SHugh Dickins 					continue;
9377a5d0fbbSHugh Dickins 				nr_swaps_freed += !shmem_free_swap(mapping,
9383392ca12SVishal Moola (Oracle) 							indices[i], folio);
9397a5d0fbbSHugh Dickins 				continue;
9407a5d0fbbSHugh Dickins 			}
9417a5d0fbbSHugh Dickins 
9427b774aabSMatthew Wilcox (Oracle) 			if (!unfalloc || !folio_test_uptodate(folio))
9431e84a3d9SMatthew Wilcox (Oracle) 				truncate_inode_folio(mapping, folio);
9447b774aabSMatthew Wilcox (Oracle) 			folio_unlock(folio);
945bda97eabSHugh Dickins 		}
94651dcbdacSMatthew Wilcox (Oracle) 		folio_batch_remove_exceptionals(&fbatch);
94751dcbdacSMatthew Wilcox (Oracle) 		folio_batch_release(&fbatch);
948bda97eabSHugh Dickins 		cond_resched();
949bda97eabSHugh Dickins 	}
950bda97eabSHugh Dickins 
95144bcabd7SHugh Dickins 	/*
95244bcabd7SHugh Dickins 	 * When undoing a failed fallocate, we want none of the partial folio
95344bcabd7SHugh Dickins 	 * zeroing and splitting below, but shall want to truncate the whole
95444bcabd7SHugh Dickins 	 * folio when !uptodate indicates that it was added by this fallocate,
95544bcabd7SHugh Dickins 	 * even when [lstart, lend] covers only a part of the folio.
95644bcabd7SHugh Dickins 	 */
95744bcabd7SHugh Dickins 	if (unfalloc)
95844bcabd7SHugh Dickins 		goto whole_folios;
95944bcabd7SHugh Dickins 
960b9a8a419SMatthew Wilcox (Oracle) 	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
961b9a8a419SMatthew Wilcox (Oracle) 	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
962b9a8a419SMatthew Wilcox (Oracle) 	if (folio) {
963b9a8a419SMatthew Wilcox (Oracle) 		same_folio = lend < folio_pos(folio) + folio_size(folio);
964b9a8a419SMatthew Wilcox (Oracle) 		folio_mark_dirty(folio);
965b9a8a419SMatthew Wilcox (Oracle) 		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
966b9a8a419SMatthew Wilcox (Oracle) 			start = folio->index + folio_nr_pages(folio);
967b9a8a419SMatthew Wilcox (Oracle) 			if (same_folio)
968b9a8a419SMatthew Wilcox (Oracle) 				end = folio->index;
96983e4fa9cSHugh Dickins 		}
970b9a8a419SMatthew Wilcox (Oracle) 		folio_unlock(folio);
971b9a8a419SMatthew Wilcox (Oracle) 		folio_put(folio);
972b9a8a419SMatthew Wilcox (Oracle) 		folio = NULL;
973bda97eabSHugh Dickins 	}
974b9a8a419SMatthew Wilcox (Oracle) 
975b9a8a419SMatthew Wilcox (Oracle) 	if (!same_folio)
976b9a8a419SMatthew Wilcox (Oracle) 		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
977b9a8a419SMatthew Wilcox (Oracle) 	if (folio) {
978b9a8a419SMatthew Wilcox (Oracle) 		folio_mark_dirty(folio);
979b9a8a419SMatthew Wilcox (Oracle) 		if (!truncate_inode_partial_folio(folio, lstart, lend))
980b9a8a419SMatthew Wilcox (Oracle) 			end = folio->index;
981b9a8a419SMatthew Wilcox (Oracle) 		folio_unlock(folio);
982b9a8a419SMatthew Wilcox (Oracle) 		folio_put(folio);
983bda97eabSHugh Dickins 	}
984bda97eabSHugh Dickins 
98544bcabd7SHugh Dickins whole_folios:
98644bcabd7SHugh Dickins 
987bda97eabSHugh Dickins 	index = start;
988b1a36650SHugh Dickins 	while (index < end) {
989bda97eabSHugh Dickins 		cond_resched();
9900cd6144aSJohannes Weiner 
9919fb6beeaSVishal Moola (Oracle) 		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
992cf2039afSMatthew Wilcox (Oracle) 				indices)) {
993b1a36650SHugh Dickins 			/* If all gone or hole-punch or unfalloc, we're done */
994b1a36650SHugh Dickins 			if (index == start || end != -1)
995bda97eabSHugh Dickins 				break;
996b1a36650SHugh Dickins 			/* But if truncating, restart to make sure all gone */
997bda97eabSHugh Dickins 			index = start;
998bda97eabSHugh Dickins 			continue;
999bda97eabSHugh Dickins 		}
10000e499ed3SMatthew Wilcox (Oracle) 		for (i = 0; i < folio_batch_count(&fbatch); i++) {
1001b9a8a419SMatthew Wilcox (Oracle) 			folio = fbatch.folios[i];
1002bda97eabSHugh Dickins 
10030e499ed3SMatthew Wilcox (Oracle) 			if (xa_is_value(folio)) {
10041635f6a7SHugh Dickins 				if (unfalloc)
10051635f6a7SHugh Dickins 					continue;
10069fb6beeaSVishal Moola (Oracle) 				if (shmem_free_swap(mapping, indices[i], folio)) {
1007b1a36650SHugh Dickins 					/* Swap was replaced by page: retry */
10089fb6beeaSVishal Moola (Oracle) 					index = indices[i];
1009b1a36650SHugh Dickins 					break;
1010b1a36650SHugh Dickins 				}
1011b1a36650SHugh Dickins 				nr_swaps_freed++;
10127a5d0fbbSHugh Dickins 				continue;
10137a5d0fbbSHugh Dickins 			}
10147a5d0fbbSHugh Dickins 
10150e499ed3SMatthew Wilcox (Oracle) 			folio_lock(folio);
1016800d8c63SKirill A. Shutemov 
10170e499ed3SMatthew Wilcox (Oracle) 			if (!unfalloc || !folio_test_uptodate(folio)) {
10180e499ed3SMatthew Wilcox (Oracle) 				if (folio_mapping(folio) != mapping) {
1019b1a36650SHugh Dickins 					/* Page was replaced by swap: retry */
10200e499ed3SMatthew Wilcox (Oracle) 					folio_unlock(folio);
10219fb6beeaSVishal Moola (Oracle) 					index = indices[i];
1022b1a36650SHugh Dickins 					break;
10237a5d0fbbSHugh Dickins 				}
10240e499ed3SMatthew Wilcox (Oracle) 				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
10250e499ed3SMatthew Wilcox (Oracle) 						folio);
10260e499ed3SMatthew Wilcox (Oracle) 				truncate_inode_folio(mapping, folio);
102771725ed1SHugh Dickins 			}
10280e499ed3SMatthew Wilcox (Oracle) 			folio_unlock(folio);
1029bda97eabSHugh Dickins 		}
10300e499ed3SMatthew Wilcox (Oracle) 		folio_batch_remove_exceptionals(&fbatch);
10310e499ed3SMatthew Wilcox (Oracle) 		folio_batch_release(&fbatch);
1032bda97eabSHugh Dickins 	}
103394c1e62dSHugh Dickins 
10344595ef88SKirill A. Shutemov 	spin_lock_irq(&info->lock);
10357a5d0fbbSHugh Dickins 	info->swapped -= nr_swaps_freed;
10361da177e4SLinus Torvalds 	shmem_recalc_inode(inode);
10374595ef88SKirill A. Shutemov 	spin_unlock_irq(&info->lock);
10381635f6a7SHugh Dickins }
10391da177e4SLinus Torvalds 
10401635f6a7SHugh Dickins void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
10411635f6a7SHugh Dickins {
10421635f6a7SHugh Dickins 	shmem_undo_range(inode, lstart, lend, false);
1043078cd827SDeepa Dinamani 	inode->i_ctime = inode->i_mtime = current_time(inode);
104436f05cabSJeff Layton 	inode_inc_iversion(inode);
10451da177e4SLinus Torvalds }
104694c1e62dSHugh Dickins EXPORT_SYMBOL_GPL(shmem_truncate_range);
10471da177e4SLinus Torvalds 
1048549c7297SChristian Brauner static int shmem_getattr(struct user_namespace *mnt_userns,
1049549c7297SChristian Brauner 			 const struct path *path, struct kstat *stat,
1050a528d35eSDavid Howells 			 u32 request_mask, unsigned int query_flags)
105144a30220SYu Zhao {
1052a528d35eSDavid Howells 	struct inode *inode = path->dentry->d_inode;
105344a30220SYu Zhao 	struct shmem_inode_info *info = SHMEM_I(inode);
105444a30220SYu Zhao 
1055d0424c42SHugh Dickins 	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
10564595ef88SKirill A. Shutemov 		spin_lock_irq(&info->lock);
105744a30220SYu Zhao 		shmem_recalc_inode(inode);
10584595ef88SKirill A. Shutemov 		spin_unlock_irq(&info->lock);
1059d0424c42SHugh Dickins 	}
1060e408e695STheodore Ts'o 	if (info->fsflags & FS_APPEND_FL)
1061e408e695STheodore Ts'o 		stat->attributes |= STATX_ATTR_APPEND;
1062e408e695STheodore Ts'o 	if (info->fsflags & FS_IMMUTABLE_FL)
1063e408e695STheodore Ts'o 		stat->attributes |= STATX_ATTR_IMMUTABLE;
1064e408e695STheodore Ts'o 	if (info->fsflags & FS_NODUMP_FL)
1065e408e695STheodore Ts'o 		stat->attributes |= STATX_ATTR_NODUMP;
1066e408e695STheodore Ts'o 	stat->attributes_mask |= (STATX_ATTR_APPEND |
1067e408e695STheodore Ts'o 			STATX_ATTR_IMMUTABLE |
1068e408e695STheodore Ts'o 			STATX_ATTR_NODUMP);
10690d56a451SChristian Brauner 	generic_fillattr(&init_user_ns, inode, stat);
107089fdcd26SYang Shi 
10717c6c6cc4SZach O'Keefe 	if (shmem_is_huge(NULL, inode, 0, false))
107289fdcd26SYang Shi 		stat->blksize = HPAGE_PMD_SIZE;
107389fdcd26SYang Shi 
1074f7cd16a5SXavier Roche 	if (request_mask & STATX_BTIME) {
1075f7cd16a5SXavier Roche 		stat->result_mask |= STATX_BTIME;
1076f7cd16a5SXavier Roche 		stat->btime.tv_sec = info->i_crtime.tv_sec;
1077f7cd16a5SXavier Roche 		stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1078f7cd16a5SXavier Roche 	}
1079f7cd16a5SXavier Roche 
108044a30220SYu Zhao 	return 0;
108144a30220SYu Zhao }
108244a30220SYu Zhao 
1083549c7297SChristian Brauner static int shmem_setattr(struct user_namespace *mnt_userns,
1084549c7297SChristian Brauner 			 struct dentry *dentry, struct iattr *attr)
10851da177e4SLinus Torvalds {
108675c3cfa8SDavid Howells 	struct inode *inode = d_inode(dentry);
108740e041a2SDavid Herrmann 	struct shmem_inode_info *info = SHMEM_I(inode);
10881da177e4SLinus Torvalds 	int error;
108936f05cabSJeff Layton 	bool update_mtime = false;
109036f05cabSJeff Layton 	bool update_ctime = true;
10911da177e4SLinus Torvalds 
10922f221d6fSChristian Brauner 	error = setattr_prepare(&init_user_ns, dentry, attr);
1093db78b877SChristoph Hellwig 	if (error)
1094db78b877SChristoph Hellwig 		return error;
1095db78b877SChristoph Hellwig 
1096*6fd73538SDaniel Verkamp 	if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1097*6fd73538SDaniel Verkamp 		if ((inode->i_mode ^ attr->ia_mode) & 0111) {
1098*6fd73538SDaniel Verkamp 			return -EPERM;
1099*6fd73538SDaniel Verkamp 		}
1100*6fd73538SDaniel Verkamp 	}
1101*6fd73538SDaniel Verkamp 
110294c1e62dSHugh Dickins 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
110394c1e62dSHugh Dickins 		loff_t oldsize = inode->i_size;
110494c1e62dSHugh Dickins 		loff_t newsize = attr->ia_size;
11053889e6e7Snpiggin@suse.de 
11069608703eSJan Kara 		/* protected by i_rwsem */
110740e041a2SDavid Herrmann 		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
110840e041a2SDavid Herrmann 		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
110940e041a2SDavid Herrmann 			return -EPERM;
111040e041a2SDavid Herrmann 
111194c1e62dSHugh Dickins 		if (newsize != oldsize) {
111277142517SKonstantin Khlebnikov 			error = shmem_reacct_size(SHMEM_I(inode)->flags,
111377142517SKonstantin Khlebnikov 					oldsize, newsize);
111477142517SKonstantin Khlebnikov 			if (error)
111577142517SKonstantin Khlebnikov 				return error;
111694c1e62dSHugh Dickins 			i_size_write(inode, newsize);
111736f05cabSJeff Layton 			update_mtime = true;
111836f05cabSJeff Layton 		} else {
111936f05cabSJeff Layton 			update_ctime = false;
112094c1e62dSHugh Dickins 		}
1121afa2db2fSJosef Bacik 		if (newsize <= oldsize) {
112294c1e62dSHugh Dickins 			loff_t holebegin = round_up(newsize, PAGE_SIZE);
1123d0424c42SHugh Dickins 			if (oldsize > holebegin)
1124d0424c42SHugh Dickins 				unmap_mapping_range(inode->i_mapping,
1125d0424c42SHugh Dickins 							holebegin, 0, 1);
1126d0424c42SHugh Dickins 			if (info->alloced)
1127d0424c42SHugh Dickins 				shmem_truncate_range(inode,
1128d0424c42SHugh Dickins 							newsize, (loff_t)-1);
112994c1e62dSHugh Dickins 			/* unmap again to remove racily COWed private pages */
1130d0424c42SHugh Dickins 			if (oldsize > holebegin)
1131d0424c42SHugh Dickins 				unmap_mapping_range(inode->i_mapping,
1132d0424c42SHugh Dickins 							holebegin, 0, 1);
113394c1e62dSHugh Dickins 		}
11341da177e4SLinus Torvalds 	}
11351da177e4SLinus Torvalds 
11362f221d6fSChristian Brauner 	setattr_copy(&init_user_ns, inode, attr);
1137db78b877SChristoph Hellwig 	if (attr->ia_valid & ATTR_MODE)
1138138060baSChristian Brauner 		error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
113936f05cabSJeff Layton 	if (!error && update_ctime) {
114036f05cabSJeff Layton 		inode->i_ctime = current_time(inode);
114136f05cabSJeff Layton 		if (update_mtime)
114236f05cabSJeff Layton 			inode->i_mtime = inode->i_ctime;
114336f05cabSJeff Layton 		inode_inc_iversion(inode);
114436f05cabSJeff Layton 	}
11451da177e4SLinus Torvalds 	return error;
11461da177e4SLinus Torvalds }
11471da177e4SLinus Torvalds 
11481f895f75SAl Viro static void shmem_evict_inode(struct inode *inode)
11491da177e4SLinus Torvalds {
11501da177e4SLinus Torvalds 	struct shmem_inode_info *info = SHMEM_I(inode);
1151779750d2SKirill A. Shutemov 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
11521da177e4SLinus Torvalds 
115330e6a51dSHui Su 	if (shmem_mapping(inode->i_mapping)) {
11541da177e4SLinus Torvalds 		shmem_unacct_size(info->flags, inode->i_size);
11551da177e4SLinus Torvalds 		inode->i_size = 0;
1156bc786390SHugh Dickins 		mapping_set_exiting(inode->i_mapping);
11573889e6e7Snpiggin@suse.de 		shmem_truncate_range(inode, 0, (loff_t)-1);
1158779750d2SKirill A. Shutemov 		if (!list_empty(&info->shrinklist)) {
1159779750d2SKirill A. Shutemov 			spin_lock(&sbinfo->shrinklist_lock);
1160779750d2SKirill A. Shutemov 			if (!list_empty(&info->shrinklist)) {
1161779750d2SKirill A. Shutemov 				list_del_init(&info->shrinklist);
1162779750d2SKirill A. Shutemov 				sbinfo->shrinklist_len--;
1163779750d2SKirill A. Shutemov 			}
1164779750d2SKirill A. Shutemov 			spin_unlock(&sbinfo->shrinklist_lock);
1165779750d2SKirill A. Shutemov 		}
1166af53d3e9SHugh Dickins 		while (!list_empty(&info->swaplist)) {
1167af53d3e9SHugh Dickins 			/* Wait while shmem_unuse() is scanning this inode... */
1168af53d3e9SHugh Dickins 			wait_var_event(&info->stop_eviction,
1169af53d3e9SHugh Dickins 				       !atomic_read(&info->stop_eviction));
1170cb5f7b9aSHugh Dickins 			mutex_lock(&shmem_swaplist_mutex);
1171af53d3e9SHugh Dickins 			/* ...but beware of the race if we peeked too early */
1172af53d3e9SHugh Dickins 			if (!atomic_read(&info->stop_eviction))
11731da177e4SLinus Torvalds 				list_del_init(&info->swaplist);
1174cb5f7b9aSHugh Dickins 			mutex_unlock(&shmem_swaplist_mutex);
11751da177e4SLinus Torvalds 		}
11763ed47db3SAl Viro 	}
1177b09e0fa4SEric Paris 
117838f38657SAristeu Rozanski 	simple_xattrs_free(&info->xattrs);
11790f3c42f5SHugh Dickins 	WARN_ON(inode->i_blocks);
11805b04c689SPavel Emelyanov 	shmem_free_inode(inode->i_sb);
1181dbd5768fSJan Kara 	clear_inode(inode);
11821da177e4SLinus Torvalds }
11831da177e4SLinus Torvalds 
1184b56a2d8aSVineeth Remanan Pillai static int shmem_find_swap_entries(struct address_space *mapping,
1185da08e9b7SMatthew Wilcox (Oracle) 				   pgoff_t start, struct folio_batch *fbatch,
1186da08e9b7SMatthew Wilcox (Oracle) 				   pgoff_t *indices, unsigned int type)
1187478922e2SMatthew Wilcox {
1188b56a2d8aSVineeth Remanan Pillai 	XA_STATE(xas, &mapping->i_pages, start);
1189da08e9b7SMatthew Wilcox (Oracle) 	struct folio *folio;
119087039546SHugh Dickins 	swp_entry_t entry;
1191478922e2SMatthew Wilcox 
1192478922e2SMatthew Wilcox 	rcu_read_lock();
1193da08e9b7SMatthew Wilcox (Oracle) 	xas_for_each(&xas, folio, ULONG_MAX) {
1194da08e9b7SMatthew Wilcox (Oracle) 		if (xas_retry(&xas, folio))
11955b9c98f3SMike Kravetz 			continue;
1196b56a2d8aSVineeth Remanan Pillai 
1197da08e9b7SMatthew Wilcox (Oracle) 		if (!xa_is_value(folio))
1198478922e2SMatthew Wilcox 			continue;
1199b56a2d8aSVineeth Remanan Pillai 
1200da08e9b7SMatthew Wilcox (Oracle) 		entry = radix_to_swp_entry(folio);
12016cec2b95SMiaohe Lin 		/*
12026cec2b95SMiaohe Lin 		 * swapin error entries can be found in the mapping. But they're
12036cec2b95SMiaohe Lin 		 * deliberately ignored here as we've done everything we can do.
12046cec2b95SMiaohe Lin 		 */
120587039546SHugh Dickins 		if (swp_type(entry) != type)
1206b56a2d8aSVineeth Remanan Pillai 			continue;
1207b56a2d8aSVineeth Remanan Pillai 
1208e384200eSHugh Dickins 		indices[folio_batch_count(fbatch)] = xas.xa_index;
1209da08e9b7SMatthew Wilcox (Oracle) 		if (!folio_batch_add(fbatch, folio))
1210da08e9b7SMatthew Wilcox (Oracle) 			break;
1211b56a2d8aSVineeth Remanan Pillai 
1212b56a2d8aSVineeth Remanan Pillai 		if (need_resched()) {
1213e21a2955SMatthew Wilcox 			xas_pause(&xas);
1214478922e2SMatthew Wilcox 			cond_resched_rcu();
1215478922e2SMatthew Wilcox 		}
1216b56a2d8aSVineeth Remanan Pillai 	}
1217478922e2SMatthew Wilcox 	rcu_read_unlock();
1218e21a2955SMatthew Wilcox 
1219da08e9b7SMatthew Wilcox (Oracle) 	return xas.xa_index;
1220b56a2d8aSVineeth Remanan Pillai }
1221b56a2d8aSVineeth Remanan Pillai 
1222b56a2d8aSVineeth Remanan Pillai /*
1223b56a2d8aSVineeth Remanan Pillai  * Move the swapped pages for an inode to page cache. Returns the count
1224b56a2d8aSVineeth Remanan Pillai  * of pages swapped in, or the error in case of failure.
1225b56a2d8aSVineeth Remanan Pillai  */
1226da08e9b7SMatthew Wilcox (Oracle) static int shmem_unuse_swap_entries(struct inode *inode,
1227da08e9b7SMatthew Wilcox (Oracle) 		struct folio_batch *fbatch, pgoff_t *indices)
1228b56a2d8aSVineeth Remanan Pillai {
1229b56a2d8aSVineeth Remanan Pillai 	int i = 0;
1230b56a2d8aSVineeth Remanan Pillai 	int ret = 0;
1231b56a2d8aSVineeth Remanan Pillai 	int error = 0;
1232b56a2d8aSVineeth Remanan Pillai 	struct address_space *mapping = inode->i_mapping;
1233b56a2d8aSVineeth Remanan Pillai 
1234da08e9b7SMatthew Wilcox (Oracle) 	for (i = 0; i < folio_batch_count(fbatch); i++) {
1235da08e9b7SMatthew Wilcox (Oracle) 		struct folio *folio = fbatch->folios[i];
1236b56a2d8aSVineeth Remanan Pillai 
1237da08e9b7SMatthew Wilcox (Oracle) 		if (!xa_is_value(folio))
1238b56a2d8aSVineeth Remanan Pillai 			continue;
1239da08e9b7SMatthew Wilcox (Oracle) 		error = shmem_swapin_folio(inode, indices[i],
1240da08e9b7SMatthew Wilcox (Oracle) 					  &folio, SGP_CACHE,
1241b56a2d8aSVineeth Remanan Pillai 					  mapping_gfp_mask(mapping),
1242b56a2d8aSVineeth Remanan Pillai 					  NULL, NULL);
1243b56a2d8aSVineeth Remanan Pillai 		if (error == 0) {
1244da08e9b7SMatthew Wilcox (Oracle) 			folio_unlock(folio);
1245da08e9b7SMatthew Wilcox (Oracle) 			folio_put(folio);
1246b56a2d8aSVineeth Remanan Pillai 			ret++;
1247b56a2d8aSVineeth Remanan Pillai 		}
1248b56a2d8aSVineeth Remanan Pillai 		if (error == -ENOMEM)
1249b56a2d8aSVineeth Remanan Pillai 			break;
1250b56a2d8aSVineeth Remanan Pillai 		error = 0;
1251b56a2d8aSVineeth Remanan Pillai 	}
1252b56a2d8aSVineeth Remanan Pillai 	return error ? error : ret;
1253478922e2SMatthew Wilcox }
1254478922e2SMatthew Wilcox 
125546f65ec1SHugh Dickins /*
125646f65ec1SHugh Dickins  * If swap found in inode, free it and move page from swapcache to filecache.
125746f65ec1SHugh Dickins  */
125810a9c496SChristoph Hellwig static int shmem_unuse_inode(struct inode *inode, unsigned int type)
12591da177e4SLinus Torvalds {
1260b56a2d8aSVineeth Remanan Pillai 	struct address_space *mapping = inode->i_mapping;
1261b56a2d8aSVineeth Remanan Pillai 	pgoff_t start = 0;
1262da08e9b7SMatthew Wilcox (Oracle) 	struct folio_batch fbatch;
1263b56a2d8aSVineeth Remanan Pillai 	pgoff_t indices[PAGEVEC_SIZE];
1264b56a2d8aSVineeth Remanan Pillai 	int ret = 0;
12651da177e4SLinus Torvalds 
1266b56a2d8aSVineeth Remanan Pillai 	do {
1267da08e9b7SMatthew Wilcox (Oracle) 		folio_batch_init(&fbatch);
1268da08e9b7SMatthew Wilcox (Oracle) 		shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
1269da08e9b7SMatthew Wilcox (Oracle) 		if (folio_batch_count(&fbatch) == 0) {
1270b56a2d8aSVineeth Remanan Pillai 			ret = 0;
1271778dd893SHugh Dickins 			break;
1272b56a2d8aSVineeth Remanan Pillai 		}
1273b56a2d8aSVineeth Remanan Pillai 
1274da08e9b7SMatthew Wilcox (Oracle) 		ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1275b56a2d8aSVineeth Remanan Pillai 		if (ret < 0)
1276b56a2d8aSVineeth Remanan Pillai 			break;
1277b56a2d8aSVineeth Remanan Pillai 
1278da08e9b7SMatthew Wilcox (Oracle) 		start = indices[folio_batch_count(&fbatch) - 1];
1279b56a2d8aSVineeth Remanan Pillai 	} while (true);
1280b56a2d8aSVineeth Remanan Pillai 
1281b56a2d8aSVineeth Remanan Pillai 	return ret;
1282b56a2d8aSVineeth Remanan Pillai }
1283b56a2d8aSVineeth Remanan Pillai 
1284b56a2d8aSVineeth Remanan Pillai /*
1285b56a2d8aSVineeth Remanan Pillai  * Read all the shared memory data that resides in the swap
1286b56a2d8aSVineeth Remanan Pillai  * device 'type' back into memory, so the swap device can be
1287b56a2d8aSVineeth Remanan Pillai  * unused.
1288b56a2d8aSVineeth Remanan Pillai  */
128910a9c496SChristoph Hellwig int shmem_unuse(unsigned int type)
1290b56a2d8aSVineeth Remanan Pillai {
1291b56a2d8aSVineeth Remanan Pillai 	struct shmem_inode_info *info, *next;
1292b56a2d8aSVineeth Remanan Pillai 	int error = 0;
1293b56a2d8aSVineeth Remanan Pillai 
1294b56a2d8aSVineeth Remanan Pillai 	if (list_empty(&shmem_swaplist))
1295b56a2d8aSVineeth Remanan Pillai 		return 0;
1296b56a2d8aSVineeth Remanan Pillai 
1297b56a2d8aSVineeth Remanan Pillai 	mutex_lock(&shmem_swaplist_mutex);
1298b56a2d8aSVineeth Remanan Pillai 	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1299b56a2d8aSVineeth Remanan Pillai 		if (!info->swapped) {
1300b56a2d8aSVineeth Remanan Pillai 			list_del_init(&info->swaplist);
1301b56a2d8aSVineeth Remanan Pillai 			continue;
1302b56a2d8aSVineeth Remanan Pillai 		}
1303af53d3e9SHugh Dickins 		/*
1304af53d3e9SHugh Dickins 		 * Drop the swaplist mutex while searching the inode for swap;
1305af53d3e9SHugh Dickins 		 * but before doing so, make sure shmem_evict_inode() will not
1306af53d3e9SHugh Dickins 		 * remove placeholder inode from swaplist, nor let it be freed
1307af53d3e9SHugh Dickins 		 * (igrab() would protect from unlink, but not from unmount).
1308af53d3e9SHugh Dickins 		 */
1309af53d3e9SHugh Dickins 		atomic_inc(&info->stop_eviction);
1310b56a2d8aSVineeth Remanan Pillai 		mutex_unlock(&shmem_swaplist_mutex);
1311b56a2d8aSVineeth Remanan Pillai 
131210a9c496SChristoph Hellwig 		error = shmem_unuse_inode(&info->vfs_inode, type);
1313b56a2d8aSVineeth Remanan Pillai 		cond_resched();
1314b56a2d8aSVineeth Remanan Pillai 
1315b56a2d8aSVineeth Remanan Pillai 		mutex_lock(&shmem_swaplist_mutex);
1316b56a2d8aSVineeth Remanan Pillai 		next = list_next_entry(info, swaplist);
1317b56a2d8aSVineeth Remanan Pillai 		if (!info->swapped)
1318b56a2d8aSVineeth Remanan Pillai 			list_del_init(&info->swaplist);
1319af53d3e9SHugh Dickins 		if (atomic_dec_and_test(&info->stop_eviction))
1320af53d3e9SHugh Dickins 			wake_up_var(&info->stop_eviction);
1321b56a2d8aSVineeth Remanan Pillai 		if (error)
1322b56a2d8aSVineeth Remanan Pillai 			break;
13231da177e4SLinus Torvalds 	}
1324cb5f7b9aSHugh Dickins 	mutex_unlock(&shmem_swaplist_mutex);
1325778dd893SHugh Dickins 
1326778dd893SHugh Dickins 	return error;
13271da177e4SLinus Torvalds }
13281da177e4SLinus Torvalds 
13291da177e4SLinus Torvalds /*
13301da177e4SLinus Torvalds  * Move the page from the page cache to the swap cache.
13311da177e4SLinus Torvalds  */
13321da177e4SLinus Torvalds static int shmem_writepage(struct page *page, struct writeback_control *wbc)
13331da177e4SLinus Torvalds {
1334e2e3fdc7SMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(page);
13351da177e4SLinus Torvalds 	struct shmem_inode_info *info;
13361da177e4SLinus Torvalds 	struct address_space *mapping;
13371da177e4SLinus Torvalds 	struct inode *inode;
13386922c0c7SHugh Dickins 	swp_entry_t swap;
13396922c0c7SHugh Dickins 	pgoff_t index;
13401da177e4SLinus Torvalds 
13411e6decf3SHugh Dickins 	/*
13421e6decf3SHugh Dickins 	 * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
13431e6decf3SHugh Dickins 	 * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
13441e6decf3SHugh Dickins 	 * and its shmem_writeback() needs them to be split when swapping.
13451e6decf3SHugh Dickins 	 */
1346f530ed0eSMatthew Wilcox (Oracle) 	if (folio_test_large(folio)) {
13471e6decf3SHugh Dickins 		/* Ensure the subpages are still dirty */
1348f530ed0eSMatthew Wilcox (Oracle) 		folio_test_set_dirty(folio);
13491e6decf3SHugh Dickins 		if (split_huge_page(page) < 0)
13501e6decf3SHugh Dickins 			goto redirty;
1351f530ed0eSMatthew Wilcox (Oracle) 		folio = page_folio(page);
1352f530ed0eSMatthew Wilcox (Oracle) 		folio_clear_dirty(folio);
13531e6decf3SHugh Dickins 	}
13541e6decf3SHugh Dickins 
1355f530ed0eSMatthew Wilcox (Oracle) 	BUG_ON(!folio_test_locked(folio));
1356f530ed0eSMatthew Wilcox (Oracle) 	mapping = folio->mapping;
1357f530ed0eSMatthew Wilcox (Oracle) 	index = folio->index;
13581da177e4SLinus Torvalds 	inode = mapping->host;
13591da177e4SLinus Torvalds 	info = SHMEM_I(inode);
13601da177e4SLinus Torvalds 	if (info->flags & VM_LOCKED)
13611da177e4SLinus Torvalds 		goto redirty;
1362d9fe526aSHugh Dickins 	if (!total_swap_pages)
13631da177e4SLinus Torvalds 		goto redirty;
13641da177e4SLinus Torvalds 
1365d9fe526aSHugh Dickins 	/*
136697b713baSChristoph Hellwig 	 * Our capabilities prevent regular writeback or sync from ever calling
136797b713baSChristoph Hellwig 	 * shmem_writepage; but a stacking filesystem might use ->writepage of
136897b713baSChristoph Hellwig 	 * its underlying filesystem, in which case tmpfs should write out to
136997b713baSChristoph Hellwig 	 * swap only in response to memory pressure, and not for the writeback
137097b713baSChristoph Hellwig 	 * threads or sync.
1371d9fe526aSHugh Dickins 	 */
137248f170fbSHugh Dickins 	if (!wbc->for_reclaim) {
137348f170fbSHugh Dickins 		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
137448f170fbSHugh Dickins 		goto redirty;
137548f170fbSHugh Dickins 	}
13761635f6a7SHugh Dickins 
13771635f6a7SHugh Dickins 	/*
13781635f6a7SHugh Dickins 	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
13791635f6a7SHugh Dickins 	 * value into swapfile.c, the only way we can correctly account for a
1380f530ed0eSMatthew Wilcox (Oracle) 	 * fallocated folio arriving here is now to initialize it and write it.
13811aac1400SHugh Dickins 	 *
1382f530ed0eSMatthew Wilcox (Oracle) 	 * That's okay for a folio already fallocated earlier, but if we have
13831aac1400SHugh Dickins 	 * not yet completed the fallocation, then (a) we want to keep track
1384f530ed0eSMatthew Wilcox (Oracle) 	 * of this folio in case we have to undo it, and (b) it may not be a
13851aac1400SHugh Dickins 	 * good idea to continue anyway, once we're pushing into swap.  So
1386f530ed0eSMatthew Wilcox (Oracle) 	 * reactivate the folio, and let shmem_fallocate() quit when too many.
13871635f6a7SHugh Dickins 	 */
1388f530ed0eSMatthew Wilcox (Oracle) 	if (!folio_test_uptodate(folio)) {
13891aac1400SHugh Dickins 		if (inode->i_private) {
13901aac1400SHugh Dickins 			struct shmem_falloc *shmem_falloc;
13911aac1400SHugh Dickins 			spin_lock(&inode->i_lock);
13921aac1400SHugh Dickins 			shmem_falloc = inode->i_private;
13931aac1400SHugh Dickins 			if (shmem_falloc &&
13948e205f77SHugh Dickins 			    !shmem_falloc->waitq &&
13951aac1400SHugh Dickins 			    index >= shmem_falloc->start &&
13961aac1400SHugh Dickins 			    index < shmem_falloc->next)
13971aac1400SHugh Dickins 				shmem_falloc->nr_unswapped++;
13981aac1400SHugh Dickins 			else
13991aac1400SHugh Dickins 				shmem_falloc = NULL;
14001aac1400SHugh Dickins 			spin_unlock(&inode->i_lock);
14011aac1400SHugh Dickins 			if (shmem_falloc)
14021aac1400SHugh Dickins 				goto redirty;
14031aac1400SHugh Dickins 		}
1404f530ed0eSMatthew Wilcox (Oracle) 		folio_zero_range(folio, 0, folio_size(folio));
1405f530ed0eSMatthew Wilcox (Oracle) 		flush_dcache_folio(folio);
1406f530ed0eSMatthew Wilcox (Oracle) 		folio_mark_uptodate(folio);
14071635f6a7SHugh Dickins 	}
14081635f6a7SHugh Dickins 
1409e2e3fdc7SMatthew Wilcox (Oracle) 	swap = folio_alloc_swap(folio);
141048f170fbSHugh Dickins 	if (!swap.val)
141148f170fbSHugh Dickins 		goto redirty;
1412d9fe526aSHugh Dickins 
1413b1dea800SHugh Dickins 	/*
1414b1dea800SHugh Dickins 	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1415f530ed0eSMatthew Wilcox (Oracle) 	 * if it's not already there.  Do it now before the folio is
14166922c0c7SHugh Dickins 	 * moved to swap cache, when its pagelock no longer protects
1417b1dea800SHugh Dickins 	 * the inode from eviction.  But don't unlock the mutex until
14186922c0c7SHugh Dickins 	 * we've incremented swapped, because shmem_unuse_inode() will
14196922c0c7SHugh Dickins 	 * prune a !swapped inode from the swaplist under this mutex.
1420b1dea800SHugh Dickins 	 */
1421b1dea800SHugh Dickins 	mutex_lock(&shmem_swaplist_mutex);
142205bf86b4SHugh Dickins 	if (list_empty(&info->swaplist))
1423b56a2d8aSVineeth Remanan Pillai 		list_add(&info->swaplist, &shmem_swaplist);
1424b1dea800SHugh Dickins 
1425a4c366f0SMatthew Wilcox (Oracle) 	if (add_to_swap_cache(folio, swap,
14263852f676SJoonsoo Kim 			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
14273852f676SJoonsoo Kim 			NULL) == 0) {
14284595ef88SKirill A. Shutemov 		spin_lock_irq(&info->lock);
1429267a4c76SHugh Dickins 		shmem_recalc_inode(inode);
1430267a4c76SHugh Dickins 		info->swapped++;
14314595ef88SKirill A. Shutemov 		spin_unlock_irq(&info->lock);
1432267a4c76SHugh Dickins 
1433aaa46865SHugh Dickins 		swap_shmem_alloc(swap);
14344cd400fdSMatthew Wilcox (Oracle) 		shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
14356922c0c7SHugh Dickins 
14366922c0c7SHugh Dickins 		mutex_unlock(&shmem_swaplist_mutex);
1437f530ed0eSMatthew Wilcox (Oracle) 		BUG_ON(folio_mapped(folio));
1438f530ed0eSMatthew Wilcox (Oracle) 		swap_writepage(&folio->page, wbc);
14391da177e4SLinus Torvalds 		return 0;
14401da177e4SLinus Torvalds 	}
14411da177e4SLinus Torvalds 
14426922c0c7SHugh Dickins 	mutex_unlock(&shmem_swaplist_mutex);
14434081f744SMatthew Wilcox (Oracle) 	put_swap_folio(folio, swap);
14441da177e4SLinus Torvalds redirty:
1445f530ed0eSMatthew Wilcox (Oracle) 	folio_mark_dirty(folio);
1446d9fe526aSHugh Dickins 	if (wbc->for_reclaim)
1447f530ed0eSMatthew Wilcox (Oracle) 		return AOP_WRITEPAGE_ACTIVATE;	/* Return with folio locked */
1448f530ed0eSMatthew Wilcox (Oracle) 	folio_unlock(folio);
1449d9fe526aSHugh Dickins 	return 0;
14501da177e4SLinus Torvalds }
14511da177e4SLinus Torvalds 
145275edd345SHugh Dickins #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
145371fe804bSLee Schermerhorn static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1454680d794bSakpm@linux-foundation.org {
1455680d794bSakpm@linux-foundation.org 	char buffer[64];
1456680d794bSakpm@linux-foundation.org 
145771fe804bSLee Schermerhorn 	if (!mpol || mpol->mode == MPOL_DEFAULT)
1458095f1fc4SLee Schermerhorn 		return;		/* show nothing */
1459095f1fc4SLee Schermerhorn 
1460a7a88b23SHugh Dickins 	mpol_to_str(buffer, sizeof(buffer), mpol);
1461095f1fc4SLee Schermerhorn 
1462095f1fc4SLee Schermerhorn 	seq_printf(seq, ",mpol=%s", buffer);
1463680d794bSakpm@linux-foundation.org }
146471fe804bSLee Schermerhorn 
146571fe804bSLee Schermerhorn static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
146671fe804bSLee Schermerhorn {
146771fe804bSLee Schermerhorn 	struct mempolicy *mpol = NULL;
146871fe804bSLee Schermerhorn 	if (sbinfo->mpol) {
1469bf11b9a8SSebastian Andrzej Siewior 		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
147071fe804bSLee Schermerhorn 		mpol = sbinfo->mpol;
147171fe804bSLee Schermerhorn 		mpol_get(mpol);
1472bf11b9a8SSebastian Andrzej Siewior 		raw_spin_unlock(&sbinfo->stat_lock);
147371fe804bSLee Schermerhorn 	}
147471fe804bSLee Schermerhorn 	return mpol;
147571fe804bSLee Schermerhorn }
147675edd345SHugh Dickins #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
147775edd345SHugh Dickins static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
147875edd345SHugh Dickins {
147975edd345SHugh Dickins }
148075edd345SHugh Dickins static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
148175edd345SHugh Dickins {
148275edd345SHugh Dickins 	return NULL;
148375edd345SHugh Dickins }
148475edd345SHugh Dickins #endif /* CONFIG_NUMA && CONFIG_TMPFS */
148575edd345SHugh Dickins #ifndef CONFIG_NUMA
148675edd345SHugh Dickins #define vm_policy vm_private_data
148775edd345SHugh Dickins #endif
1488680d794bSakpm@linux-foundation.org 
1489800d8c63SKirill A. Shutemov static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1490800d8c63SKirill A. Shutemov 		struct shmem_inode_info *info, pgoff_t index)
1491800d8c63SKirill A. Shutemov {
1492800d8c63SKirill A. Shutemov 	/* Create a pseudo vma that just contains the policy */
14932c4541e2SKirill A. Shutemov 	vma_init(vma, NULL);
1494800d8c63SKirill A. Shutemov 	/* Bias interleave by inode number to distribute better across nodes */
1495800d8c63SKirill A. Shutemov 	vma->vm_pgoff = index + info->vfs_inode.i_ino;
1496800d8c63SKirill A. Shutemov 	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1497800d8c63SKirill A. Shutemov }
1498800d8c63SKirill A. Shutemov 
1499800d8c63SKirill A. Shutemov static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
1500800d8c63SKirill A. Shutemov {
1501800d8c63SKirill A. Shutemov 	/* Drop reference taken by mpol_shared_policy_lookup() */
1502800d8c63SKirill A. Shutemov 	mpol_cond_put(vma->vm_policy);
1503800d8c63SKirill A. Shutemov }
1504800d8c63SKirill A. Shutemov 
15055739a81cSMatthew Wilcox (Oracle) static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
150641ffe5d5SHugh Dickins 			struct shmem_inode_info *info, pgoff_t index)
15071da177e4SLinus Torvalds {
15081da177e4SLinus Torvalds 	struct vm_area_struct pvma;
150918a2f371SMel Gorman 	struct page *page;
15108c63ca5bSWill Deacon 	struct vm_fault vmf = {
15118c63ca5bSWill Deacon 		.vma = &pvma,
15128c63ca5bSWill Deacon 	};
15131da177e4SLinus Torvalds 
1514800d8c63SKirill A. Shutemov 	shmem_pseudo_vma_init(&pvma, info, index);
1515e9e9b7ecSMinchan Kim 	page = swap_cluster_readahead(swap, gfp, &vmf);
1516800d8c63SKirill A. Shutemov 	shmem_pseudo_vma_destroy(&pvma);
151718a2f371SMel Gorman 
15185739a81cSMatthew Wilcox (Oracle) 	if (!page)
15195739a81cSMatthew Wilcox (Oracle) 		return NULL;
15205739a81cSMatthew Wilcox (Oracle) 	return page_folio(page);
1521800d8c63SKirill A. Shutemov }
152218a2f371SMel Gorman 
152378cc8cdcSRik van Riel /*
152478cc8cdcSRik van Riel  * Make sure huge_gfp is always more limited than limit_gfp.
152578cc8cdcSRik van Riel  * Some of the flags set permissions, while others set limitations.
152678cc8cdcSRik van Riel  */
152778cc8cdcSRik van Riel static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
152878cc8cdcSRik van Riel {
152978cc8cdcSRik van Riel 	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
153078cc8cdcSRik van Riel 	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1531187df5ddSRik van Riel 	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1532187df5ddSRik van Riel 	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1533187df5ddSRik van Riel 
1534187df5ddSRik van Riel 	/* Allow allocations only from the originally specified zones. */
1535187df5ddSRik van Riel 	result |= zoneflags;
153678cc8cdcSRik van Riel 
153778cc8cdcSRik van Riel 	/*
153878cc8cdcSRik van Riel 	 * Minimize the result gfp by taking the union with the deny flags,
153978cc8cdcSRik van Riel 	 * and the intersection of the allow flags.
154078cc8cdcSRik van Riel 	 */
154178cc8cdcSRik van Riel 	result |= (limit_gfp & denyflags);
154278cc8cdcSRik van Riel 	result |= (huge_gfp & limit_gfp) & allowflags;
154378cc8cdcSRik van Riel 
154478cc8cdcSRik van Riel 	return result;
154578cc8cdcSRik van Riel }
154678cc8cdcSRik van Riel 
154772827e5cSMatthew Wilcox (Oracle) static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
1548800d8c63SKirill A. Shutemov 		struct shmem_inode_info *info, pgoff_t index)
1549800d8c63SKirill A. Shutemov {
1550800d8c63SKirill A. Shutemov 	struct vm_area_struct pvma;
15517b8d046fSMatthew Wilcox 	struct address_space *mapping = info->vfs_inode.i_mapping;
15527b8d046fSMatthew Wilcox 	pgoff_t hindex;
1553dfe98499SMatthew Wilcox (Oracle) 	struct folio *folio;
1554800d8c63SKirill A. Shutemov 
15554620a06eSGeert Uytterhoeven 	hindex = round_down(index, HPAGE_PMD_NR);
15567b8d046fSMatthew Wilcox 	if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
15577b8d046fSMatthew Wilcox 								XA_PRESENT))
1558800d8c63SKirill A. Shutemov 		return NULL;
1559800d8c63SKirill A. Shutemov 
1560800d8c63SKirill A. Shutemov 	shmem_pseudo_vma_init(&pvma, info, hindex);
1561dfe98499SMatthew Wilcox (Oracle) 	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
1562800d8c63SKirill A. Shutemov 	shmem_pseudo_vma_destroy(&pvma);
1563dfe98499SMatthew Wilcox (Oracle) 	if (!folio)
1564dcdf11eeSDavid Rientjes 		count_vm_event(THP_FILE_FALLBACK);
156572827e5cSMatthew Wilcox (Oracle) 	return folio;
156618a2f371SMel Gorman }
156718a2f371SMel Gorman 
15680c023ef5SMatthew Wilcox (Oracle) static struct folio *shmem_alloc_folio(gfp_t gfp,
156918a2f371SMel Gorman 			struct shmem_inode_info *info, pgoff_t index)
157018a2f371SMel Gorman {
157118a2f371SMel Gorman 	struct vm_area_struct pvma;
15720c023ef5SMatthew Wilcox (Oracle) 	struct folio *folio;
157318a2f371SMel Gorman 
1574800d8c63SKirill A. Shutemov 	shmem_pseudo_vma_init(&pvma, info, index);
15750c023ef5SMatthew Wilcox (Oracle) 	folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
1576800d8c63SKirill A. Shutemov 	shmem_pseudo_vma_destroy(&pvma);
157718a2f371SMel Gorman 
15780c023ef5SMatthew Wilcox (Oracle) 	return folio;
157918a2f371SMel Gorman }
158018a2f371SMel Gorman 
1581b1d0ec3aSMatthew Wilcox (Oracle) static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
1582800d8c63SKirill A. Shutemov 		pgoff_t index, bool huge)
1583800d8c63SKirill A. Shutemov {
15840f079694SMike Rapoport 	struct shmem_inode_info *info = SHMEM_I(inode);
158572827e5cSMatthew Wilcox (Oracle) 	struct folio *folio;
1586800d8c63SKirill A. Shutemov 	int nr;
1587800d8c63SKirill A. Shutemov 	int err = -ENOSPC;
1588800d8c63SKirill A. Shutemov 
1589396bcc52SMatthew Wilcox (Oracle) 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1590800d8c63SKirill A. Shutemov 		huge = false;
1591800d8c63SKirill A. Shutemov 	nr = huge ? HPAGE_PMD_NR : 1;
1592800d8c63SKirill A. Shutemov 
15930f079694SMike Rapoport 	if (!shmem_inode_acct_block(inode, nr))
1594800d8c63SKirill A. Shutemov 		goto failed;
1595800d8c63SKirill A. Shutemov 
1596800d8c63SKirill A. Shutemov 	if (huge)
159772827e5cSMatthew Wilcox (Oracle) 		folio = shmem_alloc_hugefolio(gfp, info, index);
1598800d8c63SKirill A. Shutemov 	else
159972827e5cSMatthew Wilcox (Oracle) 		folio = shmem_alloc_folio(gfp, info, index);
160072827e5cSMatthew Wilcox (Oracle) 	if (folio) {
160172827e5cSMatthew Wilcox (Oracle) 		__folio_set_locked(folio);
160272827e5cSMatthew Wilcox (Oracle) 		__folio_set_swapbacked(folio);
1603b1d0ec3aSMatthew Wilcox (Oracle) 		return folio;
160475edd345SHugh Dickins 	}
160518a2f371SMel Gorman 
1606800d8c63SKirill A. Shutemov 	err = -ENOMEM;
16070f079694SMike Rapoport 	shmem_inode_unacct_blocks(inode, nr);
1608800d8c63SKirill A. Shutemov failed:
1609800d8c63SKirill A. Shutemov 	return ERR_PTR(err);
16101da177e4SLinus Torvalds }
161171fe804bSLee Schermerhorn 
16121da177e4SLinus Torvalds /*
1613bde05d1cSHugh Dickins  * When a page is moved from swapcache to shmem filecache (either by the
1614fc26babbSMatthew Wilcox (Oracle)  * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
1615bde05d1cSHugh Dickins  * shmem_unuse_inode()), it may have been read in earlier from swap, in
1616bde05d1cSHugh Dickins  * ignorance of the mapping it belongs to.  If that mapping has special
1617bde05d1cSHugh Dickins  * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1618bde05d1cSHugh Dickins  * we may need to copy to a suitable page before moving to filecache.
1619bde05d1cSHugh Dickins  *
1620bde05d1cSHugh Dickins  * In a future release, this may well be extended to respect cpuset and
1621bde05d1cSHugh Dickins  * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1622bde05d1cSHugh Dickins  * but for now it is a simple matter of zone.
1623bde05d1cSHugh Dickins  */
1624069d849cSMatthew Wilcox (Oracle) static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
1625bde05d1cSHugh Dickins {
1626069d849cSMatthew Wilcox (Oracle) 	return folio_zonenum(folio) > gfp_zone(gfp);
1627bde05d1cSHugh Dickins }
1628bde05d1cSHugh Dickins 
16290d698e25SMatthew Wilcox (Oracle) static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
1630bde05d1cSHugh Dickins 				struct shmem_inode_info *info, pgoff_t index)
1631bde05d1cSHugh Dickins {
1632d21bba2bSMatthew Wilcox (Oracle) 	struct folio *old, *new;
1633bde05d1cSHugh Dickins 	struct address_space *swap_mapping;
1634c1cb20d4SYu Zhao 	swp_entry_t entry;
1635bde05d1cSHugh Dickins 	pgoff_t swap_index;
1636bde05d1cSHugh Dickins 	int error;
1637bde05d1cSHugh Dickins 
16380d698e25SMatthew Wilcox (Oracle) 	old = *foliop;
1639907ea17eSMatthew Wilcox (Oracle) 	entry = folio_swap_entry(old);
1640c1cb20d4SYu Zhao 	swap_index = swp_offset(entry);
1641907ea17eSMatthew Wilcox (Oracle) 	swap_mapping = swap_address_space(entry);
1642bde05d1cSHugh Dickins 
1643bde05d1cSHugh Dickins 	/*
1644bde05d1cSHugh Dickins 	 * We have arrived here because our zones are constrained, so don't
1645bde05d1cSHugh Dickins 	 * limit chance of success by further cpuset and node constraints.
1646bde05d1cSHugh Dickins 	 */
1647bde05d1cSHugh Dickins 	gfp &= ~GFP_CONSTRAINT_MASK;
1648907ea17eSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(folio_test_large(old), old);
1649907ea17eSMatthew Wilcox (Oracle) 	new = shmem_alloc_folio(gfp, info, index);
1650907ea17eSMatthew Wilcox (Oracle) 	if (!new)
1651bde05d1cSHugh Dickins 		return -ENOMEM;
1652bde05d1cSHugh Dickins 
1653907ea17eSMatthew Wilcox (Oracle) 	folio_get(new);
1654907ea17eSMatthew Wilcox (Oracle) 	folio_copy(new, old);
1655907ea17eSMatthew Wilcox (Oracle) 	flush_dcache_folio(new);
1656bde05d1cSHugh Dickins 
1657907ea17eSMatthew Wilcox (Oracle) 	__folio_set_locked(new);
1658907ea17eSMatthew Wilcox (Oracle) 	__folio_set_swapbacked(new);
1659907ea17eSMatthew Wilcox (Oracle) 	folio_mark_uptodate(new);
1660907ea17eSMatthew Wilcox (Oracle) 	folio_set_swap_entry(new, entry);
1661907ea17eSMatthew Wilcox (Oracle) 	folio_set_swapcache(new);
1662bde05d1cSHugh Dickins 
1663bde05d1cSHugh Dickins 	/*
1664bde05d1cSHugh Dickins 	 * Our caller will very soon move newpage out of swapcache, but it's
1665bde05d1cSHugh Dickins 	 * a nice clean interface for us to replace oldpage by newpage there.
1666bde05d1cSHugh Dickins 	 */
1667b93b0163SMatthew Wilcox 	xa_lock_irq(&swap_mapping->i_pages);
1668907ea17eSMatthew Wilcox (Oracle) 	error = shmem_replace_entry(swap_mapping, swap_index, old, new);
16690142ef6cSHugh Dickins 	if (!error) {
1670d21bba2bSMatthew Wilcox (Oracle) 		mem_cgroup_migrate(old, new);
1671907ea17eSMatthew Wilcox (Oracle) 		__lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
1672907ea17eSMatthew Wilcox (Oracle) 		__lruvec_stat_mod_folio(new, NR_SHMEM, 1);
1673907ea17eSMatthew Wilcox (Oracle) 		__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
1674907ea17eSMatthew Wilcox (Oracle) 		__lruvec_stat_mod_folio(old, NR_SHMEM, -1);
16750142ef6cSHugh Dickins 	}
1676b93b0163SMatthew Wilcox 	xa_unlock_irq(&swap_mapping->i_pages);
1677bde05d1cSHugh Dickins 
16780142ef6cSHugh Dickins 	if (unlikely(error)) {
16790142ef6cSHugh Dickins 		/*
16800142ef6cSHugh Dickins 		 * Is this possible?  I think not, now that our callers check
16810142ef6cSHugh Dickins 		 * both PageSwapCache and page_private after getting page lock;
16820142ef6cSHugh Dickins 		 * but be defensive.  Reverse old to newpage for clear and free.
16830142ef6cSHugh Dickins 		 */
1684907ea17eSMatthew Wilcox (Oracle) 		old = new;
16850142ef6cSHugh Dickins 	} else {
1686907ea17eSMatthew Wilcox (Oracle) 		folio_add_lru(new);
16870d698e25SMatthew Wilcox (Oracle) 		*foliop = new;
16880142ef6cSHugh Dickins 	}
1689bde05d1cSHugh Dickins 
1690907ea17eSMatthew Wilcox (Oracle) 	folio_clear_swapcache(old);
1691907ea17eSMatthew Wilcox (Oracle) 	old->private = NULL;
1692bde05d1cSHugh Dickins 
1693907ea17eSMatthew Wilcox (Oracle) 	folio_unlock(old);
1694907ea17eSMatthew Wilcox (Oracle) 	folio_put_refs(old, 2);
16950142ef6cSHugh Dickins 	return error;
1696bde05d1cSHugh Dickins }
1697bde05d1cSHugh Dickins 
16986cec2b95SMiaohe Lin static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
16996cec2b95SMiaohe Lin 					 struct folio *folio, swp_entry_t swap)
17006cec2b95SMiaohe Lin {
17016cec2b95SMiaohe Lin 	struct address_space *mapping = inode->i_mapping;
17026cec2b95SMiaohe Lin 	struct shmem_inode_info *info = SHMEM_I(inode);
17036cec2b95SMiaohe Lin 	swp_entry_t swapin_error;
17046cec2b95SMiaohe Lin 	void *old;
17056cec2b95SMiaohe Lin 
170615520a3fSPeter Xu 	swapin_error = make_swapin_error_entry();
17076cec2b95SMiaohe Lin 	old = xa_cmpxchg_irq(&mapping->i_pages, index,
17086cec2b95SMiaohe Lin 			     swp_to_radix_entry(swap),
17096cec2b95SMiaohe Lin 			     swp_to_radix_entry(swapin_error), 0);
17106cec2b95SMiaohe Lin 	if (old != swp_to_radix_entry(swap))
17116cec2b95SMiaohe Lin 		return;
17126cec2b95SMiaohe Lin 
17136cec2b95SMiaohe Lin 	folio_wait_writeback(folio);
171475fa68a5SMatthew Wilcox (Oracle) 	delete_from_swap_cache(folio);
17156cec2b95SMiaohe Lin 	spin_lock_irq(&info->lock);
17166cec2b95SMiaohe Lin 	/*
17176cec2b95SMiaohe Lin 	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
17186cec2b95SMiaohe Lin 	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
17196cec2b95SMiaohe Lin 	 * shmem_evict_inode.
17206cec2b95SMiaohe Lin 	 */
17216cec2b95SMiaohe Lin 	info->alloced--;
17226cec2b95SMiaohe Lin 	info->swapped--;
17236cec2b95SMiaohe Lin 	shmem_recalc_inode(inode);
17246cec2b95SMiaohe Lin 	spin_unlock_irq(&info->lock);
17256cec2b95SMiaohe Lin 	swap_free(swap);
17266cec2b95SMiaohe Lin }
17276cec2b95SMiaohe Lin 
1728bde05d1cSHugh Dickins /*
1729833de10fSMiaohe Lin  * Swap in the folio pointed to by *foliop.
1730833de10fSMiaohe Lin  * Caller has to make sure that *foliop contains a valid swapped folio.
1731833de10fSMiaohe Lin  * Returns 0 and the folio in foliop if success. On failure, returns the
1732833de10fSMiaohe Lin  * error code and NULL in *foliop.
17331da177e4SLinus Torvalds  */
1734da08e9b7SMatthew Wilcox (Oracle) static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
1735da08e9b7SMatthew Wilcox (Oracle) 			     struct folio **foliop, enum sgp_type sgp,
1736c5bf121eSVineeth Remanan Pillai 			     gfp_t gfp, struct vm_area_struct *vma,
17372b740303SSouptick Joarder 			     vm_fault_t *fault_type)
17381da177e4SLinus Torvalds {
17391da177e4SLinus Torvalds 	struct address_space *mapping = inode->i_mapping;
174023f919d4SArnd Bergmann 	struct shmem_inode_info *info = SHMEM_I(inode);
174104f94e3fSDan Schatzberg 	struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
1742da08e9b7SMatthew Wilcox (Oracle) 	struct folio *folio = NULL;
17431da177e4SLinus Torvalds 	swp_entry_t swap;
17441da177e4SLinus Torvalds 	int error;
17451da177e4SLinus Torvalds 
1746da08e9b7SMatthew Wilcox (Oracle) 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
1747da08e9b7SMatthew Wilcox (Oracle) 	swap = radix_to_swp_entry(*foliop);
1748da08e9b7SMatthew Wilcox (Oracle) 	*foliop = NULL;
174954af6042SHugh Dickins 
17506cec2b95SMiaohe Lin 	if (is_swapin_error_entry(swap))
17516cec2b95SMiaohe Lin 		return -EIO;
17526cec2b95SMiaohe Lin 
17531da177e4SLinus Torvalds 	/* Look it up and read it in.. */
17545739a81cSMatthew Wilcox (Oracle) 	folio = swap_cache_get_folio(swap, NULL, 0);
17555739a81cSMatthew Wilcox (Oracle) 	if (!folio) {
17569e18eb29SAndres Lagar-Cavilla 		/* Or update major stats only when swapin succeeds?? */
17579e18eb29SAndres Lagar-Cavilla 		if (fault_type) {
175868da9f05SHugh Dickins 			*fault_type |= VM_FAULT_MAJOR;
17599e18eb29SAndres Lagar-Cavilla 			count_vm_event(PGMAJFAULT);
17602262185cSRoman Gushchin 			count_memcg_event_mm(charge_mm, PGMAJFAULT);
17619e18eb29SAndres Lagar-Cavilla 		}
17629e18eb29SAndres Lagar-Cavilla 		/* Here we actually start the io */
17635739a81cSMatthew Wilcox (Oracle) 		folio = shmem_swapin(swap, gfp, info, index);
17645739a81cSMatthew Wilcox (Oracle) 		if (!folio) {
17651da177e4SLinus Torvalds 			error = -ENOMEM;
176654af6042SHugh Dickins 			goto failed;
1767285b2c4fSHugh Dickins 		}
17681da177e4SLinus Torvalds 	}
17691da177e4SLinus Torvalds 
1770833de10fSMiaohe Lin 	/* We have to do this with folio locked to prevent races */
1771da08e9b7SMatthew Wilcox (Oracle) 	folio_lock(folio);
1772da08e9b7SMatthew Wilcox (Oracle) 	if (!folio_test_swapcache(folio) ||
1773da08e9b7SMatthew Wilcox (Oracle) 	    folio_swap_entry(folio).val != swap.val ||
1774d1899228SHugh Dickins 	    !shmem_confirm_swap(mapping, index, swap)) {
1775c5bf121eSVineeth Remanan Pillai 		error = -EEXIST;
1776d1899228SHugh Dickins 		goto unlock;
1777bde05d1cSHugh Dickins 	}
1778da08e9b7SMatthew Wilcox (Oracle) 	if (!folio_test_uptodate(folio)) {
17791da177e4SLinus Torvalds 		error = -EIO;
178054af6042SHugh Dickins 		goto failed;
178154af6042SHugh Dickins 	}
1782da08e9b7SMatthew Wilcox (Oracle) 	folio_wait_writeback(folio);
178354af6042SHugh Dickins 
17848a84802eSSteven Price 	/*
17858a84802eSSteven Price 	 * Some architectures may have to restore extra metadata to the
1786da08e9b7SMatthew Wilcox (Oracle) 	 * folio after reading from swap.
17878a84802eSSteven Price 	 */
1788da08e9b7SMatthew Wilcox (Oracle) 	arch_swap_restore(swap, folio);
17898a84802eSSteven Price 
1790069d849cSMatthew Wilcox (Oracle) 	if (shmem_should_replace_folio(folio, gfp)) {
17910d698e25SMatthew Wilcox (Oracle) 		error = shmem_replace_folio(&folio, gfp, info, index);
1792bde05d1cSHugh Dickins 		if (error)
179354af6042SHugh Dickins 			goto failed;
17941da177e4SLinus Torvalds 	}
17951da177e4SLinus Torvalds 
1796b7dd44a1SMatthew Wilcox (Oracle) 	error = shmem_add_to_page_cache(folio, mapping, index,
17973fea5a49SJohannes Weiner 					swp_to_radix_entry(swap), gfp,
17983fea5a49SJohannes Weiner 					charge_mm);
179954af6042SHugh Dickins 	if (error)
180054af6042SHugh Dickins 		goto failed;
180154af6042SHugh Dickins 
18024595ef88SKirill A. Shutemov 	spin_lock_irq(&info->lock);
180354af6042SHugh Dickins 	info->swapped--;
180454af6042SHugh Dickins 	shmem_recalc_inode(inode);
18054595ef88SKirill A. Shutemov 	spin_unlock_irq(&info->lock);
180627ab7006SHugh Dickins 
180766d2f4d2SHugh Dickins 	if (sgp == SGP_WRITE)
1808da08e9b7SMatthew Wilcox (Oracle) 		folio_mark_accessed(folio);
180966d2f4d2SHugh Dickins 
181075fa68a5SMatthew Wilcox (Oracle) 	delete_from_swap_cache(folio);
1811da08e9b7SMatthew Wilcox (Oracle) 	folio_mark_dirty(folio);
181227ab7006SHugh Dickins 	swap_free(swap);
181327ab7006SHugh Dickins 
1814da08e9b7SMatthew Wilcox (Oracle) 	*foliop = folio;
1815c5bf121eSVineeth Remanan Pillai 	return 0;
1816c5bf121eSVineeth Remanan Pillai failed:
1817c5bf121eSVineeth Remanan Pillai 	if (!shmem_confirm_swap(mapping, index, swap))
1818c5bf121eSVineeth Remanan Pillai 		error = -EEXIST;
18196cec2b95SMiaohe Lin 	if (error == -EIO)
18206cec2b95SMiaohe Lin 		shmem_set_folio_swapin_error(inode, index, folio, swap);
1821c5bf121eSVineeth Remanan Pillai unlock:
1822da08e9b7SMatthew Wilcox (Oracle) 	if (folio) {
1823da08e9b7SMatthew Wilcox (Oracle) 		folio_unlock(folio);
1824da08e9b7SMatthew Wilcox (Oracle) 		folio_put(folio);
1825c5bf121eSVineeth Remanan Pillai 	}
1826c5bf121eSVineeth Remanan Pillai 
1827c5bf121eSVineeth Remanan Pillai 	return error;
1828c5bf121eSVineeth Remanan Pillai }
1829c5bf121eSVineeth Remanan Pillai 
1830c5bf121eSVineeth Remanan Pillai /*
1831fc26babbSMatthew Wilcox (Oracle)  * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
1832c5bf121eSVineeth Remanan Pillai  *
1833c5bf121eSVineeth Remanan Pillai  * If we allocate a new one we do not mark it dirty. That's up to the
1834c5bf121eSVineeth Remanan Pillai  * vm. If we swap it in we mark it dirty since we also free the swap
1835c5bf121eSVineeth Remanan Pillai  * entry since a page cannot live in both the swap and page cache.
1836c5bf121eSVineeth Remanan Pillai  *
1837c949b097SAxel Rasmussen  * vma, vmf, and fault_type are only supplied by shmem_fault:
1838c5bf121eSVineeth Remanan Pillai  * otherwise they are NULL.
1839c5bf121eSVineeth Remanan Pillai  */
1840fc26babbSMatthew Wilcox (Oracle) static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
1841fc26babbSMatthew Wilcox (Oracle) 		struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
1842c5bf121eSVineeth Remanan Pillai 		struct vm_area_struct *vma, struct vm_fault *vmf,
1843c5bf121eSVineeth Remanan Pillai 		vm_fault_t *fault_type)
1844c5bf121eSVineeth Remanan Pillai {
1845c5bf121eSVineeth Remanan Pillai 	struct address_space *mapping = inode->i_mapping;
1846c5bf121eSVineeth Remanan Pillai 	struct shmem_inode_info *info = SHMEM_I(inode);
1847c5bf121eSVineeth Remanan Pillai 	struct shmem_sb_info *sbinfo;
1848c5bf121eSVineeth Remanan Pillai 	struct mm_struct *charge_mm;
1849b7dd44a1SMatthew Wilcox (Oracle) 	struct folio *folio;
18506fe7d712SLukas Bulwahn 	pgoff_t hindex;
1851164cc4feSRik van Riel 	gfp_t huge_gfp;
1852c5bf121eSVineeth Remanan Pillai 	int error;
1853c5bf121eSVineeth Remanan Pillai 	int once = 0;
1854c5bf121eSVineeth Remanan Pillai 	int alloced = 0;
1855c5bf121eSVineeth Remanan Pillai 
1856c5bf121eSVineeth Remanan Pillai 	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
1857c5bf121eSVineeth Remanan Pillai 		return -EFBIG;
1858c5bf121eSVineeth Remanan Pillai repeat:
1859c5bf121eSVineeth Remanan Pillai 	if (sgp <= SGP_CACHE &&
1860c5bf121eSVineeth Remanan Pillai 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1861c5bf121eSVineeth Remanan Pillai 		return -EINVAL;
1862c5bf121eSVineeth Remanan Pillai 	}
1863c5bf121eSVineeth Remanan Pillai 
1864c5bf121eSVineeth Remanan Pillai 	sbinfo = SHMEM_SB(inode->i_sb);
186504f94e3fSDan Schatzberg 	charge_mm = vma ? vma->vm_mm : NULL;
1866c5bf121eSVineeth Remanan Pillai 
1867b1d0ec3aSMatthew Wilcox (Oracle) 	folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
1868b1d0ec3aSMatthew Wilcox (Oracle) 	if (folio && vma && userfaultfd_minor(vma)) {
1869b1d0ec3aSMatthew Wilcox (Oracle) 		if (!xa_is_value(folio)) {
1870b1d0ec3aSMatthew Wilcox (Oracle) 			folio_unlock(folio);
1871b1d0ec3aSMatthew Wilcox (Oracle) 			folio_put(folio);
1872c949b097SAxel Rasmussen 		}
1873c949b097SAxel Rasmussen 		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1874c949b097SAxel Rasmussen 		return 0;
1875c949b097SAxel Rasmussen 	}
1876c949b097SAxel Rasmussen 
1877b1d0ec3aSMatthew Wilcox (Oracle) 	if (xa_is_value(folio)) {
1878da08e9b7SMatthew Wilcox (Oracle) 		error = shmem_swapin_folio(inode, index, &folio,
1879c5bf121eSVineeth Remanan Pillai 					  sgp, gfp, vma, fault_type);
1880c5bf121eSVineeth Remanan Pillai 		if (error == -EEXIST)
1881c5bf121eSVineeth Remanan Pillai 			goto repeat;
1882c5bf121eSVineeth Remanan Pillai 
1883fc26babbSMatthew Wilcox (Oracle) 		*foliop = folio;
1884c5bf121eSVineeth Remanan Pillai 		return error;
1885c5bf121eSVineeth Remanan Pillai 	}
1886c5bf121eSVineeth Remanan Pillai 
1887b1d0ec3aSMatthew Wilcox (Oracle) 	if (folio) {
1888acdd9f8eSHugh Dickins 		if (sgp == SGP_WRITE)
1889b1d0ec3aSMatthew Wilcox (Oracle) 			folio_mark_accessed(folio);
1890b1d0ec3aSMatthew Wilcox (Oracle) 		if (folio_test_uptodate(folio))
1891acdd9f8eSHugh Dickins 			goto out;
1892fc26babbSMatthew Wilcox (Oracle) 		/* fallocated folio */
1893c5bf121eSVineeth Remanan Pillai 		if (sgp != SGP_READ)
1894c5bf121eSVineeth Remanan Pillai 			goto clear;
1895b1d0ec3aSMatthew Wilcox (Oracle) 		folio_unlock(folio);
1896b1d0ec3aSMatthew Wilcox (Oracle) 		folio_put(folio);
1897c5bf121eSVineeth Remanan Pillai 	}
1898c5bf121eSVineeth Remanan Pillai 
1899c5bf121eSVineeth Remanan Pillai 	/*
1900fc26babbSMatthew Wilcox (Oracle) 	 * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
1901fc26babbSMatthew Wilcox (Oracle) 	 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
1902acdd9f8eSHugh Dickins 	 */
1903fc26babbSMatthew Wilcox (Oracle) 	*foliop = NULL;
1904acdd9f8eSHugh Dickins 	if (sgp == SGP_READ)
1905acdd9f8eSHugh Dickins 		return 0;
1906acdd9f8eSHugh Dickins 	if (sgp == SGP_NOALLOC)
1907acdd9f8eSHugh Dickins 		return -ENOENT;
1908acdd9f8eSHugh Dickins 
1909acdd9f8eSHugh Dickins 	/*
1910acdd9f8eSHugh Dickins 	 * Fast cache lookup and swap lookup did not find it: allocate.
1911c5bf121eSVineeth Remanan Pillai 	 */
1912c5bf121eSVineeth Remanan Pillai 
1913cfda0526SMike Rapoport 	if (vma && userfaultfd_missing(vma)) {
1914cfda0526SMike Rapoport 		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1915cfda0526SMike Rapoport 		return 0;
1916cfda0526SMike Rapoport 	}
1917cfda0526SMike Rapoport 
19187c6c6cc4SZach O'Keefe 	if (!shmem_is_huge(vma, inode, index, false))
1919800d8c63SKirill A. Shutemov 		goto alloc_nohuge;
192027d80fa2SKees Cook 
1921164cc4feSRik van Riel 	huge_gfp = vma_thp_gfp_mask(vma);
192278cc8cdcSRik van Riel 	huge_gfp = limit_gfp_mask(huge_gfp, gfp);
1923b1d0ec3aSMatthew Wilcox (Oracle) 	folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
1924b1d0ec3aSMatthew Wilcox (Oracle) 	if (IS_ERR(folio)) {
1925c5bf121eSVineeth Remanan Pillai alloc_nohuge:
1926b1d0ec3aSMatthew Wilcox (Oracle) 		folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
192754af6042SHugh Dickins 	}
1928b1d0ec3aSMatthew Wilcox (Oracle) 	if (IS_ERR(folio)) {
1929779750d2SKirill A. Shutemov 		int retry = 5;
1930c5bf121eSVineeth Remanan Pillai 
1931b1d0ec3aSMatthew Wilcox (Oracle) 		error = PTR_ERR(folio);
1932b1d0ec3aSMatthew Wilcox (Oracle) 		folio = NULL;
1933779750d2SKirill A. Shutemov 		if (error != -ENOSPC)
1934c5bf121eSVineeth Remanan Pillai 			goto unlock;
1935779750d2SKirill A. Shutemov 		/*
1936fc26babbSMatthew Wilcox (Oracle) 		 * Try to reclaim some space by splitting a large folio
1937779750d2SKirill A. Shutemov 		 * beyond i_size on the filesystem.
1938779750d2SKirill A. Shutemov 		 */
1939779750d2SKirill A. Shutemov 		while (retry--) {
1940779750d2SKirill A. Shutemov 			int ret;
1941c5bf121eSVineeth Remanan Pillai 
1942779750d2SKirill A. Shutemov 			ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1943779750d2SKirill A. Shutemov 			if (ret == SHRINK_STOP)
1944779750d2SKirill A. Shutemov 				break;
1945779750d2SKirill A. Shutemov 			if (ret)
1946779750d2SKirill A. Shutemov 				goto alloc_nohuge;
1947779750d2SKirill A. Shutemov 		}
1948c5bf121eSVineeth Remanan Pillai 		goto unlock;
1949800d8c63SKirill A. Shutemov 	}
1950800d8c63SKirill A. Shutemov 
1951b1d0ec3aSMatthew Wilcox (Oracle) 	hindex = round_down(index, folio_nr_pages(folio));
1952800d8c63SKirill A. Shutemov 
195366d2f4d2SHugh Dickins 	if (sgp == SGP_WRITE)
1954b1d0ec3aSMatthew Wilcox (Oracle) 		__folio_set_referenced(folio);
195566d2f4d2SHugh Dickins 
1956b7dd44a1SMatthew Wilcox (Oracle) 	error = shmem_add_to_page_cache(folio, mapping, hindex,
19573fea5a49SJohannes Weiner 					NULL, gfp & GFP_RECLAIM_MASK,
19583fea5a49SJohannes Weiner 					charge_mm);
19593fea5a49SJohannes Weiner 	if (error)
1960800d8c63SKirill A. Shutemov 		goto unacct;
1961b1d0ec3aSMatthew Wilcox (Oracle) 	folio_add_lru(folio);
196254af6042SHugh Dickins 
19634595ef88SKirill A. Shutemov 	spin_lock_irq(&info->lock);
1964b1d0ec3aSMatthew Wilcox (Oracle) 	info->alloced += folio_nr_pages(folio);
1965fa020a2bSAndrew Morton 	inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
196654af6042SHugh Dickins 	shmem_recalc_inode(inode);
19674595ef88SKirill A. Shutemov 	spin_unlock_irq(&info->lock);
19681635f6a7SHugh Dickins 	alloced = true;
196954af6042SHugh Dickins 
1970b1d0ec3aSMatthew Wilcox (Oracle) 	if (folio_test_pmd_mappable(folio) &&
1971779750d2SKirill A. Shutemov 	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1972fc26babbSMatthew Wilcox (Oracle) 					folio_next_index(folio) - 1) {
1973779750d2SKirill A. Shutemov 		/*
1974fc26babbSMatthew Wilcox (Oracle) 		 * Part of the large folio is beyond i_size: subject
1975779750d2SKirill A. Shutemov 		 * to shrink under memory pressure.
1976779750d2SKirill A. Shutemov 		 */
1977779750d2SKirill A. Shutemov 		spin_lock(&sbinfo->shrinklist_lock);
1978d041353dSCong Wang 		/*
1979d041353dSCong Wang 		 * _careful to defend against unlocked access to
1980d041353dSCong Wang 		 * ->shrink_list in shmem_unused_huge_shrink()
1981d041353dSCong Wang 		 */
1982d041353dSCong Wang 		if (list_empty_careful(&info->shrinklist)) {
1983779750d2SKirill A. Shutemov 			list_add_tail(&info->shrinklist,
1984779750d2SKirill A. Shutemov 				      &sbinfo->shrinklist);
1985779750d2SKirill A. Shutemov 			sbinfo->shrinklist_len++;
1986779750d2SKirill A. Shutemov 		}
1987779750d2SKirill A. Shutemov 		spin_unlock(&sbinfo->shrinklist_lock);
1988779750d2SKirill A. Shutemov 	}
1989779750d2SKirill A. Shutemov 
1990ec9516fbSHugh Dickins 	/*
1991fc26babbSMatthew Wilcox (Oracle) 	 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
19921635f6a7SHugh Dickins 	 */
19931635f6a7SHugh Dickins 	if (sgp == SGP_FALLOC)
19941635f6a7SHugh Dickins 		sgp = SGP_WRITE;
19951635f6a7SHugh Dickins clear:
19961635f6a7SHugh Dickins 	/*
1997fc26babbSMatthew Wilcox (Oracle) 	 * Let SGP_WRITE caller clear ends if write does not fill folio;
1998fc26babbSMatthew Wilcox (Oracle) 	 * but SGP_FALLOC on a folio fallocated earlier must initialize
19991635f6a7SHugh Dickins 	 * it now, lest undo on failure cancel our earlier guarantee.
2000ec9516fbSHugh Dickins 	 */
2001b1d0ec3aSMatthew Wilcox (Oracle) 	if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
2002b1d0ec3aSMatthew Wilcox (Oracle) 		long i, n = folio_nr_pages(folio);
2003800d8c63SKirill A. Shutemov 
2004b1d0ec3aSMatthew Wilcox (Oracle) 		for (i = 0; i < n; i++)
2005b1d0ec3aSMatthew Wilcox (Oracle) 			clear_highpage(folio_page(folio, i));
2006b1d0ec3aSMatthew Wilcox (Oracle) 		flush_dcache_folio(folio);
2007b1d0ec3aSMatthew Wilcox (Oracle) 		folio_mark_uptodate(folio);
2008ec9516fbSHugh Dickins 	}
2009bde05d1cSHugh Dickins 
201054af6042SHugh Dickins 	/* Perhaps the file has been truncated since we checked */
201175edd345SHugh Dickins 	if (sgp <= SGP_CACHE &&
201209cbfeafSKirill A. Shutemov 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
2013267a4c76SHugh Dickins 		if (alloced) {
2014b1d0ec3aSMatthew Wilcox (Oracle) 			folio_clear_dirty(folio);
2015b1d0ec3aSMatthew Wilcox (Oracle) 			filemap_remove_folio(folio);
20164595ef88SKirill A. Shutemov 			spin_lock_irq(&info->lock);
2017267a4c76SHugh Dickins 			shmem_recalc_inode(inode);
20184595ef88SKirill A. Shutemov 			spin_unlock_irq(&info->lock);
2019267a4c76SHugh Dickins 		}
202054af6042SHugh Dickins 		error = -EINVAL;
2021267a4c76SHugh Dickins 		goto unlock;
2022ff36b801SShaohua Li 	}
202363ec1973SMatthew Wilcox (Oracle) out:
2024fc26babbSMatthew Wilcox (Oracle) 	*foliop = folio;
202554af6042SHugh Dickins 	return 0;
2026d00806b1SNick Piggin 
2027d0217ac0SNick Piggin 	/*
202854af6042SHugh Dickins 	 * Error recovery.
20291da177e4SLinus Torvalds 	 */
203054af6042SHugh Dickins unacct:
2031b1d0ec3aSMatthew Wilcox (Oracle) 	shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
2032800d8c63SKirill A. Shutemov 
2033b1d0ec3aSMatthew Wilcox (Oracle) 	if (folio_test_large(folio)) {
2034b1d0ec3aSMatthew Wilcox (Oracle) 		folio_unlock(folio);
2035b1d0ec3aSMatthew Wilcox (Oracle) 		folio_put(folio);
2036800d8c63SKirill A. Shutemov 		goto alloc_nohuge;
2037800d8c63SKirill A. Shutemov 	}
2038d1899228SHugh Dickins unlock:
2039b1d0ec3aSMatthew Wilcox (Oracle) 	if (folio) {
2040b1d0ec3aSMatthew Wilcox (Oracle) 		folio_unlock(folio);
2041b1d0ec3aSMatthew Wilcox (Oracle) 		folio_put(folio);
204254af6042SHugh Dickins 	}
204354af6042SHugh Dickins 	if (error == -ENOSPC && !once++) {
20444595ef88SKirill A. Shutemov 		spin_lock_irq(&info->lock);
204554af6042SHugh Dickins 		shmem_recalc_inode(inode);
20464595ef88SKirill A. Shutemov 		spin_unlock_irq(&info->lock);
20471da177e4SLinus Torvalds 		goto repeat;
2048d8dc74f2SAdrian Bunk 	}
20497f4446eeSMatthew Wilcox 	if (error == -EEXIST)
205054af6042SHugh Dickins 		goto repeat;
205154af6042SHugh Dickins 	return error;
20521da177e4SLinus Torvalds }
20531da177e4SLinus Torvalds 
20544e1fc793SMatthew Wilcox (Oracle) int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
20554e1fc793SMatthew Wilcox (Oracle) 		enum sgp_type sgp)
20564e1fc793SMatthew Wilcox (Oracle) {
20574e1fc793SMatthew Wilcox (Oracle) 	return shmem_get_folio_gfp(inode, index, foliop, sgp,
20584e1fc793SMatthew Wilcox (Oracle) 			mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
20594e1fc793SMatthew Wilcox (Oracle) }
20604e1fc793SMatthew Wilcox (Oracle) 
206110d20bd2SLinus Torvalds /*
206210d20bd2SLinus Torvalds  * This is like autoremove_wake_function, but it removes the wait queue
206310d20bd2SLinus Torvalds  * entry unconditionally - even if something else had already woken the
206410d20bd2SLinus Torvalds  * target.
206510d20bd2SLinus Torvalds  */
2066ac6424b9SIngo Molnar static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
206710d20bd2SLinus Torvalds {
206810d20bd2SLinus Torvalds 	int ret = default_wake_function(wait, mode, sync, key);
20692055da97SIngo Molnar 	list_del_init(&wait->entry);
207010d20bd2SLinus Torvalds 	return ret;
207110d20bd2SLinus Torvalds }
207210d20bd2SLinus Torvalds 
207320acce67SSouptick Joarder static vm_fault_t shmem_fault(struct vm_fault *vmf)
20741da177e4SLinus Torvalds {
207511bac800SDave Jiang 	struct vm_area_struct *vma = vmf->vma;
2076496ad9aaSAl Viro 	struct inode *inode = file_inode(vma->vm_file);
20779e18eb29SAndres Lagar-Cavilla 	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
207868a54100SMatthew Wilcox (Oracle) 	struct folio *folio = NULL;
207920acce67SSouptick Joarder 	int err;
208020acce67SSouptick Joarder 	vm_fault_t ret = VM_FAULT_LOCKED;
20811da177e4SLinus Torvalds 
2082f00cdc6dSHugh Dickins 	/*
2083f00cdc6dSHugh Dickins 	 * Trinity finds that probing a hole which tmpfs is punching can
2084f00cdc6dSHugh Dickins 	 * prevent the hole-punch from ever completing: which in turn
20859608703eSJan Kara 	 * locks writers out with its hold on i_rwsem.  So refrain from
20868e205f77SHugh Dickins 	 * faulting pages into the hole while it's being punched.  Although
20878e205f77SHugh Dickins 	 * shmem_undo_range() does remove the additions, it may be unable to
20888e205f77SHugh Dickins 	 * keep up, as each new page needs its own unmap_mapping_range() call,
20898e205f77SHugh Dickins 	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
20908e205f77SHugh Dickins 	 *
20918e205f77SHugh Dickins 	 * It does not matter if we sometimes reach this check just before the
20928e205f77SHugh Dickins 	 * hole-punch begins, so that one fault then races with the punch:
20938e205f77SHugh Dickins 	 * we just need to make racing faults a rare case.
20948e205f77SHugh Dickins 	 *
20958e205f77SHugh Dickins 	 * The implementation below would be much simpler if we just used a
20969608703eSJan Kara 	 * standard mutex or completion: but we cannot take i_rwsem in fault,
20978e205f77SHugh Dickins 	 * and bloating every shmem inode for this unlikely case would be sad.
2098f00cdc6dSHugh Dickins 	 */
2099f00cdc6dSHugh Dickins 	if (unlikely(inode->i_private)) {
2100f00cdc6dSHugh Dickins 		struct shmem_falloc *shmem_falloc;
2101f00cdc6dSHugh Dickins 
2102f00cdc6dSHugh Dickins 		spin_lock(&inode->i_lock);
2103f00cdc6dSHugh Dickins 		shmem_falloc = inode->i_private;
21048e205f77SHugh Dickins 		if (shmem_falloc &&
21058e205f77SHugh Dickins 		    shmem_falloc->waitq &&
21068e205f77SHugh Dickins 		    vmf->pgoff >= shmem_falloc->start &&
21078e205f77SHugh Dickins 		    vmf->pgoff < shmem_falloc->next) {
21088897c1b1SKirill A. Shutemov 			struct file *fpin;
21098e205f77SHugh Dickins 			wait_queue_head_t *shmem_falloc_waitq;
211010d20bd2SLinus Torvalds 			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
21118e205f77SHugh Dickins 
21128e205f77SHugh Dickins 			ret = VM_FAULT_NOPAGE;
21138897c1b1SKirill A. Shutemov 			fpin = maybe_unlock_mmap_for_io(vmf, NULL);
21148897c1b1SKirill A. Shutemov 			if (fpin)
21158e205f77SHugh Dickins 				ret = VM_FAULT_RETRY;
21168e205f77SHugh Dickins 
21178e205f77SHugh Dickins 			shmem_falloc_waitq = shmem_falloc->waitq;
21188e205f77SHugh Dickins 			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
21198e205f77SHugh Dickins 					TASK_UNINTERRUPTIBLE);
21208e205f77SHugh Dickins 			spin_unlock(&inode->i_lock);
21218e205f77SHugh Dickins 			schedule();
21228e205f77SHugh Dickins 
21238e205f77SHugh Dickins 			/*
21248e205f77SHugh Dickins 			 * shmem_falloc_waitq points into the shmem_fallocate()
21258e205f77SHugh Dickins 			 * stack of the hole-punching task: shmem_falloc_waitq
21268e205f77SHugh Dickins 			 * is usually invalid by the time we reach here, but
21278e205f77SHugh Dickins 			 * finish_wait() does not dereference it in that case;
21288e205f77SHugh Dickins 			 * though i_lock needed lest racing with wake_up_all().
21298e205f77SHugh Dickins 			 */
21308e205f77SHugh Dickins 			spin_lock(&inode->i_lock);
21318e205f77SHugh Dickins 			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
21328e205f77SHugh Dickins 			spin_unlock(&inode->i_lock);
21338897c1b1SKirill A. Shutemov 
21348897c1b1SKirill A. Shutemov 			if (fpin)
21358897c1b1SKirill A. Shutemov 				fput(fpin);
21368e205f77SHugh Dickins 			return ret;
2137f00cdc6dSHugh Dickins 		}
21388e205f77SHugh Dickins 		spin_unlock(&inode->i_lock);
2139f00cdc6dSHugh Dickins 	}
2140f00cdc6dSHugh Dickins 
214168a54100SMatthew Wilcox (Oracle) 	err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
2142cfda0526SMike Rapoport 				  gfp, vma, vmf, &ret);
214320acce67SSouptick Joarder 	if (err)
214420acce67SSouptick Joarder 		return vmf_error(err);
214568a54100SMatthew Wilcox (Oracle) 	if (folio)
214668a54100SMatthew Wilcox (Oracle) 		vmf->page = folio_file_page(folio, vmf->pgoff);
214768da9f05SHugh Dickins 	return ret;
21481da177e4SLinus Torvalds }
21491da177e4SLinus Torvalds 
2150c01d5b30SHugh Dickins unsigned long shmem_get_unmapped_area(struct file *file,
2151c01d5b30SHugh Dickins 				      unsigned long uaddr, unsigned long len,
2152c01d5b30SHugh Dickins 				      unsigned long pgoff, unsigned long flags)
2153c01d5b30SHugh Dickins {
2154c01d5b30SHugh Dickins 	unsigned long (*get_area)(struct file *,
2155c01d5b30SHugh Dickins 		unsigned long, unsigned long, unsigned long, unsigned long);
2156c01d5b30SHugh Dickins 	unsigned long addr;
2157c01d5b30SHugh Dickins 	unsigned long offset;
2158c01d5b30SHugh Dickins 	unsigned long inflated_len;
2159c01d5b30SHugh Dickins 	unsigned long inflated_addr;
2160c01d5b30SHugh Dickins 	unsigned long inflated_offset;
2161c01d5b30SHugh Dickins 
2162c01d5b30SHugh Dickins 	if (len > TASK_SIZE)
2163c01d5b30SHugh Dickins 		return -ENOMEM;
2164c01d5b30SHugh Dickins 
2165c01d5b30SHugh Dickins 	get_area = current->mm->get_unmapped_area;
2166c01d5b30SHugh Dickins 	addr = get_area(file, uaddr, len, pgoff, flags);
2167c01d5b30SHugh Dickins 
2168396bcc52SMatthew Wilcox (Oracle) 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2169c01d5b30SHugh Dickins 		return addr;
2170c01d5b30SHugh Dickins 	if (IS_ERR_VALUE(addr))
2171c01d5b30SHugh Dickins 		return addr;
2172c01d5b30SHugh Dickins 	if (addr & ~PAGE_MASK)
2173c01d5b30SHugh Dickins 		return addr;
2174c01d5b30SHugh Dickins 	if (addr > TASK_SIZE - len)
2175c01d5b30SHugh Dickins 		return addr;
2176c01d5b30SHugh Dickins 
2177c01d5b30SHugh Dickins 	if (shmem_huge == SHMEM_HUGE_DENY)
2178c01d5b30SHugh Dickins 		return addr;
2179c01d5b30SHugh Dickins 	if (len < HPAGE_PMD_SIZE)
2180c01d5b30SHugh Dickins 		return addr;
2181c01d5b30SHugh Dickins 	if (flags & MAP_FIXED)
2182c01d5b30SHugh Dickins 		return addr;
2183c01d5b30SHugh Dickins 	/*
2184c01d5b30SHugh Dickins 	 * Our priority is to support MAP_SHARED mapped hugely;
2185c01d5b30SHugh Dickins 	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
218699158997SKirill A. Shutemov 	 * But if caller specified an address hint and we allocated area there
218799158997SKirill A. Shutemov 	 * successfully, respect that as before.
2188c01d5b30SHugh Dickins 	 */
218999158997SKirill A. Shutemov 	if (uaddr == addr)
2190c01d5b30SHugh Dickins 		return addr;
2191c01d5b30SHugh Dickins 
2192c01d5b30SHugh Dickins 	if (shmem_huge != SHMEM_HUGE_FORCE) {
2193c01d5b30SHugh Dickins 		struct super_block *sb;
2194c01d5b30SHugh Dickins 
2195c01d5b30SHugh Dickins 		if (file) {
2196c01d5b30SHugh Dickins 			VM_BUG_ON(file->f_op != &shmem_file_operations);
2197c01d5b30SHugh Dickins 			sb = file_inode(file)->i_sb;
2198c01d5b30SHugh Dickins 		} else {
2199c01d5b30SHugh Dickins 			/*
2200c01d5b30SHugh Dickins 			 * Called directly from mm/mmap.c, or drivers/char/mem.c
2201c01d5b30SHugh Dickins 			 * for "/dev/zero", to create a shared anonymous object.
2202c01d5b30SHugh Dickins 			 */
2203c01d5b30SHugh Dickins 			if (IS_ERR(shm_mnt))
2204c01d5b30SHugh Dickins 				return addr;
2205c01d5b30SHugh Dickins 			sb = shm_mnt->mnt_sb;
2206c01d5b30SHugh Dickins 		}
22073089bf61SToshi Kani 		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
2208c01d5b30SHugh Dickins 			return addr;
2209c01d5b30SHugh Dickins 	}
2210c01d5b30SHugh Dickins 
2211c01d5b30SHugh Dickins 	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
2212c01d5b30SHugh Dickins 	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
2213c01d5b30SHugh Dickins 		return addr;
2214c01d5b30SHugh Dickins 	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
2215c01d5b30SHugh Dickins 		return addr;
2216c01d5b30SHugh Dickins 
2217c01d5b30SHugh Dickins 	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
2218c01d5b30SHugh Dickins 	if (inflated_len > TASK_SIZE)
2219c01d5b30SHugh Dickins 		return addr;
2220c01d5b30SHugh Dickins 	if (inflated_len < len)
2221c01d5b30SHugh Dickins 		return addr;
2222c01d5b30SHugh Dickins 
222399158997SKirill A. Shutemov 	inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
2224c01d5b30SHugh Dickins 	if (IS_ERR_VALUE(inflated_addr))
2225c01d5b30SHugh Dickins 		return addr;
2226c01d5b30SHugh Dickins 	if (inflated_addr & ~PAGE_MASK)
2227c01d5b30SHugh Dickins 		return addr;
2228c01d5b30SHugh Dickins 
2229c01d5b30SHugh Dickins 	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
2230c01d5b30SHugh Dickins 	inflated_addr += offset - inflated_offset;
2231c01d5b30SHugh Dickins 	if (inflated_offset > offset)
2232c01d5b30SHugh Dickins 		inflated_addr += HPAGE_PMD_SIZE;
2233c01d5b30SHugh Dickins 
2234c01d5b30SHugh Dickins 	if (inflated_addr > TASK_SIZE - len)
2235c01d5b30SHugh Dickins 		return addr;
2236c01d5b30SHugh Dickins 	return inflated_addr;
2237c01d5b30SHugh Dickins }
2238c01d5b30SHugh Dickins 
22391da177e4SLinus Torvalds #ifdef CONFIG_NUMA
224041ffe5d5SHugh Dickins static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
22411da177e4SLinus Torvalds {
2242496ad9aaSAl Viro 	struct inode *inode = file_inode(vma->vm_file);
224341ffe5d5SHugh Dickins 	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
22441da177e4SLinus Torvalds }
22451da177e4SLinus Torvalds 
2246d8dc74f2SAdrian Bunk static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2247d8dc74f2SAdrian Bunk 					  unsigned long addr)
22481da177e4SLinus Torvalds {
2249496ad9aaSAl Viro 	struct inode *inode = file_inode(vma->vm_file);
225041ffe5d5SHugh Dickins 	pgoff_t index;
22511da177e4SLinus Torvalds 
225241ffe5d5SHugh Dickins 	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
225341ffe5d5SHugh Dickins 	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
22541da177e4SLinus Torvalds }
22551da177e4SLinus Torvalds #endif
22561da177e4SLinus Torvalds 
2257d7c9e99aSAlexey Gladkov int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
22581da177e4SLinus Torvalds {
2259496ad9aaSAl Viro 	struct inode *inode = file_inode(file);
22601da177e4SLinus Torvalds 	struct shmem_inode_info *info = SHMEM_I(inode);
22611da177e4SLinus Torvalds 	int retval = -ENOMEM;
22621da177e4SLinus Torvalds 
2263ea0dfeb4SHugh Dickins 	/*
2264ea0dfeb4SHugh Dickins 	 * What serializes the accesses to info->flags?
2265ea0dfeb4SHugh Dickins 	 * ipc_lock_object() when called from shmctl_do_lock(),
2266ea0dfeb4SHugh Dickins 	 * no serialization needed when called from shm_destroy().
2267ea0dfeb4SHugh Dickins 	 */
22681da177e4SLinus Torvalds 	if (lock && !(info->flags & VM_LOCKED)) {
2269d7c9e99aSAlexey Gladkov 		if (!user_shm_lock(inode->i_size, ucounts))
22701da177e4SLinus Torvalds 			goto out_nomem;
22711da177e4SLinus Torvalds 		info->flags |= VM_LOCKED;
227289e004eaSLee Schermerhorn 		mapping_set_unevictable(file->f_mapping);
22731da177e4SLinus Torvalds 	}
2274d7c9e99aSAlexey Gladkov 	if (!lock && (info->flags & VM_LOCKED) && ucounts) {
2275d7c9e99aSAlexey Gladkov 		user_shm_unlock(inode->i_size, ucounts);
22761da177e4SLinus Torvalds 		info->flags &= ~VM_LOCKED;
227789e004eaSLee Schermerhorn 		mapping_clear_unevictable(file->f_mapping);
22781da177e4SLinus Torvalds 	}
22791da177e4SLinus Torvalds 	retval = 0;
228089e004eaSLee Schermerhorn 
22811da177e4SLinus Torvalds out_nomem:
22821da177e4SLinus Torvalds 	return retval;
22831da177e4SLinus Torvalds }
22841da177e4SLinus Torvalds 
22859b83a6a8SAdrian Bunk static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
22861da177e4SLinus Torvalds {
2287d09e8ca6SPasha Tatashin 	struct inode *inode = file_inode(file);
2288d09e8ca6SPasha Tatashin 	struct shmem_inode_info *info = SHMEM_I(inode);
228922247efdSPeter Xu 	int ret;
2290ab3948f5SJoel Fernandes (Google) 
229122247efdSPeter Xu 	ret = seal_check_future_write(info->seals, vma);
229222247efdSPeter Xu 	if (ret)
229322247efdSPeter Xu 		return ret;
2294ab3948f5SJoel Fernandes (Google) 
229551b0bff2SCatalin Marinas 	/* arm64 - allow memory tagging on RAM-based files */
229651b0bff2SCatalin Marinas 	vma->vm_flags |= VM_MTE_ALLOWED;
229751b0bff2SCatalin Marinas 
22981da177e4SLinus Torvalds 	file_accessed(file);
2299d09e8ca6SPasha Tatashin 	/* This is anonymous shared memory if it is unlinked at the time of mmap */
2300d09e8ca6SPasha Tatashin 	if (inode->i_nlink)
23011da177e4SLinus Torvalds 		vma->vm_ops = &shmem_vm_ops;
2302d09e8ca6SPasha Tatashin 	else
2303d09e8ca6SPasha Tatashin 		vma->vm_ops = &shmem_anon_vm_ops;
23041da177e4SLinus Torvalds 	return 0;
23051da177e4SLinus Torvalds }
23061da177e4SLinus Torvalds 
2307cb241339SHugh Dickins #ifdef CONFIG_TMPFS_XATTR
2308cb241339SHugh Dickins static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2309cb241339SHugh Dickins 
2310cb241339SHugh Dickins /*
2311cb241339SHugh Dickins  * chattr's fsflags are unrelated to extended attributes,
2312cb241339SHugh Dickins  * but tmpfs has chosen to enable them under the same config option.
2313cb241339SHugh Dickins  */
2314cb241339SHugh Dickins static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2315e408e695STheodore Ts'o {
2316cb241339SHugh Dickins 	unsigned int i_flags = 0;
2317cb241339SHugh Dickins 
2318cb241339SHugh Dickins 	if (fsflags & FS_NOATIME_FL)
2319cb241339SHugh Dickins 		i_flags |= S_NOATIME;
2320cb241339SHugh Dickins 	if (fsflags & FS_APPEND_FL)
2321cb241339SHugh Dickins 		i_flags |= S_APPEND;
2322cb241339SHugh Dickins 	if (fsflags & FS_IMMUTABLE_FL)
2323cb241339SHugh Dickins 		i_flags |= S_IMMUTABLE;
2324cb241339SHugh Dickins 	/*
2325cb241339SHugh Dickins 	 * But FS_NODUMP_FL does not require any action in i_flags.
2326cb241339SHugh Dickins 	 */
2327cb241339SHugh Dickins 	inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
2328e408e695STheodore Ts'o }
2329cb241339SHugh Dickins #else
2330cb241339SHugh Dickins static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2331cb241339SHugh Dickins {
2332cb241339SHugh Dickins }
2333cb241339SHugh Dickins #define shmem_initxattrs NULL
2334cb241339SHugh Dickins #endif
2335e408e695STheodore Ts'o 
2336e408e695STheodore Ts'o static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
233709208d15SAl Viro 				     umode_t mode, dev_t dev, unsigned long flags)
23381da177e4SLinus Torvalds {
23391da177e4SLinus Torvalds 	struct inode *inode;
23401da177e4SLinus Torvalds 	struct shmem_inode_info *info;
23411da177e4SLinus Torvalds 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2342e809d5f0SChris Down 	ino_t ino;
23431da177e4SLinus Torvalds 
2344e809d5f0SChris Down 	if (shmem_reserve_inode(sb, &ino))
23451da177e4SLinus Torvalds 		return NULL;
23461da177e4SLinus Torvalds 
23471da177e4SLinus Torvalds 	inode = new_inode(sb);
23481da177e4SLinus Torvalds 	if (inode) {
2349e809d5f0SChris Down 		inode->i_ino = ino;
235021cb47beSChristian Brauner 		inode_init_owner(&init_user_ns, inode, dir, mode);
23511da177e4SLinus Torvalds 		inode->i_blocks = 0;
2352078cd827SDeepa Dinamani 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
2353a251c17aSJason A. Donenfeld 		inode->i_generation = get_random_u32();
23541da177e4SLinus Torvalds 		info = SHMEM_I(inode);
23551da177e4SLinus Torvalds 		memset(info, 0, (char *)inode - (char *)info);
23561da177e4SLinus Torvalds 		spin_lock_init(&info->lock);
2357af53d3e9SHugh Dickins 		atomic_set(&info->stop_eviction, 0);
235840e041a2SDavid Herrmann 		info->seals = F_SEAL_SEAL;
23590b0a0806SHugh Dickins 		info->flags = flags & VM_NORESERVE;
2360f7cd16a5SXavier Roche 		info->i_crtime = inode->i_mtime;
2361e408e695STheodore Ts'o 		info->fsflags = (dir == NULL) ? 0 :
2362e408e695STheodore Ts'o 			SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
2363cb241339SHugh Dickins 		if (info->fsflags)
2364cb241339SHugh Dickins 			shmem_set_inode_flags(inode, info->fsflags);
2365779750d2SKirill A. Shutemov 		INIT_LIST_HEAD(&info->shrinklist);
23661da177e4SLinus Torvalds 		INIT_LIST_HEAD(&info->swaplist);
236738f38657SAristeu Rozanski 		simple_xattrs_init(&info->xattrs);
236872c04902SAl Viro 		cache_no_acl(inode);
2369ff36da69SMatthew Wilcox (Oracle) 		mapping_set_large_folios(inode->i_mapping);
23701da177e4SLinus Torvalds 
23711da177e4SLinus Torvalds 		switch (mode & S_IFMT) {
23721da177e4SLinus Torvalds 		default:
237339f0247dSAndreas Gruenbacher 			inode->i_op = &shmem_special_inode_operations;
23741da177e4SLinus Torvalds 			init_special_inode(inode, mode, dev);
23751da177e4SLinus Torvalds 			break;
23761da177e4SLinus Torvalds 		case S_IFREG:
237714fcc23fSHugh Dickins 			inode->i_mapping->a_ops = &shmem_aops;
23781da177e4SLinus Torvalds 			inode->i_op = &shmem_inode_operations;
23791da177e4SLinus Torvalds 			inode->i_fop = &shmem_file_operations;
238071fe804bSLee Schermerhorn 			mpol_shared_policy_init(&info->policy,
238171fe804bSLee Schermerhorn 						 shmem_get_sbmpol(sbinfo));
23821da177e4SLinus Torvalds 			break;
23831da177e4SLinus Torvalds 		case S_IFDIR:
2384d8c76e6fSDave Hansen 			inc_nlink(inode);
23851da177e4SLinus Torvalds 			/* Some things misbehave if size == 0 on a directory */
23861da177e4SLinus Torvalds 			inode->i_size = 2 * BOGO_DIRENT_SIZE;
23871da177e4SLinus Torvalds 			inode->i_op = &shmem_dir_inode_operations;
23881da177e4SLinus Torvalds 			inode->i_fop = &simple_dir_operations;
23891da177e4SLinus Torvalds 			break;
23901da177e4SLinus Torvalds 		case S_IFLNK:
23911da177e4SLinus Torvalds 			/*
23921da177e4SLinus Torvalds 			 * Must not load anything in the rbtree,
23931da177e4SLinus Torvalds 			 * mpol_free_shared_policy will not be called.
23941da177e4SLinus Torvalds 			 */
239571fe804bSLee Schermerhorn 			mpol_shared_policy_init(&info->policy, NULL);
23961da177e4SLinus Torvalds 			break;
23971da177e4SLinus Torvalds 		}
2398b45d71fbSJoel Fernandes (Google) 
2399b45d71fbSJoel Fernandes (Google) 		lockdep_annotate_inode_mutex_key(inode);
24005b04c689SPavel Emelyanov 	} else
24015b04c689SPavel Emelyanov 		shmem_free_inode(sb);
24021da177e4SLinus Torvalds 	return inode;
24031da177e4SLinus Torvalds }
24041da177e4SLinus Torvalds 
24053460f6e5SAxel Rasmussen #ifdef CONFIG_USERFAULTFD
24063460f6e5SAxel Rasmussen int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
24074c27fe4cSMike Rapoport 			   pmd_t *dst_pmd,
24084c27fe4cSMike Rapoport 			   struct vm_area_struct *dst_vma,
24094c27fe4cSMike Rapoport 			   unsigned long dst_addr,
24104c27fe4cSMike Rapoport 			   unsigned long src_addr,
24118ee79edfSPeter Xu 			   bool zeropage, bool wp_copy,
24124c27fe4cSMike Rapoport 			   struct page **pagep)
24134c27fe4cSMike Rapoport {
24144c27fe4cSMike Rapoport 	struct inode *inode = file_inode(dst_vma->vm_file);
24154c27fe4cSMike Rapoport 	struct shmem_inode_info *info = SHMEM_I(inode);
24164c27fe4cSMike Rapoport 	struct address_space *mapping = inode->i_mapping;
24174c27fe4cSMike Rapoport 	gfp_t gfp = mapping_gfp_mask(mapping);
24184c27fe4cSMike Rapoport 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
24194c27fe4cSMike Rapoport 	void *page_kaddr;
2420b7dd44a1SMatthew Wilcox (Oracle) 	struct folio *folio;
24214c27fe4cSMike Rapoport 	int ret;
24223460f6e5SAxel Rasmussen 	pgoff_t max_off;
24234c27fe4cSMike Rapoport 
24247ed9d238SAxel Rasmussen 	if (!shmem_inode_acct_block(inode, 1)) {
24257ed9d238SAxel Rasmussen 		/*
24267ed9d238SAxel Rasmussen 		 * We may have got a page, returned -ENOENT triggering a retry,
24277ed9d238SAxel Rasmussen 		 * and now we find ourselves with -ENOMEM. Release the page, to
24287ed9d238SAxel Rasmussen 		 * avoid a BUG_ON in our caller.
24297ed9d238SAxel Rasmussen 		 */
24307ed9d238SAxel Rasmussen 		if (unlikely(*pagep)) {
24317ed9d238SAxel Rasmussen 			put_page(*pagep);
24327ed9d238SAxel Rasmussen 			*pagep = NULL;
24337ed9d238SAxel Rasmussen 		}
24347d64ae3aSAxel Rasmussen 		return -ENOMEM;
24357ed9d238SAxel Rasmussen 	}
24364c27fe4cSMike Rapoport 
2437cb658a45SAndrea Arcangeli 	if (!*pagep) {
24387d64ae3aSAxel Rasmussen 		ret = -ENOMEM;
24397a7256d5SMatthew Wilcox (Oracle) 		folio = shmem_alloc_folio(gfp, info, pgoff);
24407a7256d5SMatthew Wilcox (Oracle) 		if (!folio)
24410f079694SMike Rapoport 			goto out_unacct_blocks;
24424c27fe4cSMike Rapoport 
24433460f6e5SAxel Rasmussen 		if (!zeropage) {	/* COPY */
24447a7256d5SMatthew Wilcox (Oracle) 			page_kaddr = kmap_local_folio(folio, 0);
24455dc21f0cSIra Weiny 			/*
24465dc21f0cSIra Weiny 			 * The read mmap_lock is held here.  Despite the
24475dc21f0cSIra Weiny 			 * mmap_lock being read recursive a deadlock is still
24485dc21f0cSIra Weiny 			 * possible if a writer has taken a lock.  For example:
24495dc21f0cSIra Weiny 			 *
24505dc21f0cSIra Weiny 			 * process A thread 1 takes read lock on own mmap_lock
24515dc21f0cSIra Weiny 			 * process A thread 2 calls mmap, blocks taking write lock
24525dc21f0cSIra Weiny 			 * process B thread 1 takes page fault, read lock on own mmap lock
24535dc21f0cSIra Weiny 			 * process B thread 2 calls mmap, blocks taking write lock
24545dc21f0cSIra Weiny 			 * process A thread 1 blocks taking read lock on process B
24555dc21f0cSIra Weiny 			 * process B thread 1 blocks taking read lock on process A
24565dc21f0cSIra Weiny 			 *
24575dc21f0cSIra Weiny 			 * Disable page faults to prevent potential deadlock
24585dc21f0cSIra Weiny 			 * and retry the copy outside the mmap_lock.
24595dc21f0cSIra Weiny 			 */
24605dc21f0cSIra Weiny 			pagefault_disable();
24618d103963SMike Rapoport 			ret = copy_from_user(page_kaddr,
24628d103963SMike Rapoport 					     (const void __user *)src_addr,
24634c27fe4cSMike Rapoport 					     PAGE_SIZE);
24645dc21f0cSIra Weiny 			pagefault_enable();
24657a7256d5SMatthew Wilcox (Oracle) 			kunmap_local(page_kaddr);
24664c27fe4cSMike Rapoport 
2467c1e8d7c6SMichel Lespinasse 			/* fallback to copy_from_user outside mmap_lock */
24684c27fe4cSMike Rapoport 			if (unlikely(ret)) {
24697a7256d5SMatthew Wilcox (Oracle) 				*pagep = &folio->page;
24707d64ae3aSAxel Rasmussen 				ret = -ENOENT;
24714c27fe4cSMike Rapoport 				/* don't free the page */
24727d64ae3aSAxel Rasmussen 				goto out_unacct_blocks;
24734c27fe4cSMike Rapoport 			}
247419b482c2SMuchun Song 
24757a7256d5SMatthew Wilcox (Oracle) 			flush_dcache_folio(folio);
24763460f6e5SAxel Rasmussen 		} else {		/* ZEROPAGE */
24777a7256d5SMatthew Wilcox (Oracle) 			clear_user_highpage(&folio->page, dst_addr);
24788d103963SMike Rapoport 		}
24794c27fe4cSMike Rapoport 	} else {
24807a7256d5SMatthew Wilcox (Oracle) 		folio = page_folio(*pagep);
24817a7256d5SMatthew Wilcox (Oracle) 		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
24824c27fe4cSMike Rapoport 		*pagep = NULL;
24834c27fe4cSMike Rapoport 	}
24844c27fe4cSMike Rapoport 
24857a7256d5SMatthew Wilcox (Oracle) 	VM_BUG_ON(folio_test_locked(folio));
24867a7256d5SMatthew Wilcox (Oracle) 	VM_BUG_ON(folio_test_swapbacked(folio));
24877a7256d5SMatthew Wilcox (Oracle) 	__folio_set_locked(folio);
24887a7256d5SMatthew Wilcox (Oracle) 	__folio_set_swapbacked(folio);
24897a7256d5SMatthew Wilcox (Oracle) 	__folio_mark_uptodate(folio);
24909cc90c66SAndrea Arcangeli 
2491e2a50c1fSAndrea Arcangeli 	ret = -EFAULT;
2492e2a50c1fSAndrea Arcangeli 	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
24933460f6e5SAxel Rasmussen 	if (unlikely(pgoff >= max_off))
2494e2a50c1fSAndrea Arcangeli 		goto out_release;
2495e2a50c1fSAndrea Arcangeli 
2496b7dd44a1SMatthew Wilcox (Oracle) 	ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
24973fea5a49SJohannes Weiner 				      gfp & GFP_RECLAIM_MASK, dst_mm);
24984c27fe4cSMike Rapoport 	if (ret)
24994c27fe4cSMike Rapoport 		goto out_release;
25004c27fe4cSMike Rapoport 
25017d64ae3aSAxel Rasmussen 	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
25027a7256d5SMatthew Wilcox (Oracle) 				       &folio->page, true, wp_copy);
25037d64ae3aSAxel Rasmussen 	if (ret)
25047d64ae3aSAxel Rasmussen 		goto out_delete_from_cache;
25054c27fe4cSMike Rapoport 
250694b7cc01SYang Shi 	spin_lock_irq(&info->lock);
25074c27fe4cSMike Rapoport 	info->alloced++;
25084c27fe4cSMike Rapoport 	inode->i_blocks += BLOCKS_PER_PAGE;
25094c27fe4cSMike Rapoport 	shmem_recalc_inode(inode);
251094b7cc01SYang Shi 	spin_unlock_irq(&info->lock);
25114c27fe4cSMike Rapoport 
25127a7256d5SMatthew Wilcox (Oracle) 	folio_unlock(folio);
25137d64ae3aSAxel Rasmussen 	return 0;
25147d64ae3aSAxel Rasmussen out_delete_from_cache:
25157a7256d5SMatthew Wilcox (Oracle) 	filemap_remove_folio(folio);
25164c27fe4cSMike Rapoport out_release:
25177a7256d5SMatthew Wilcox (Oracle) 	folio_unlock(folio);
25187a7256d5SMatthew Wilcox (Oracle) 	folio_put(folio);
25194c27fe4cSMike Rapoport out_unacct_blocks:
25200f079694SMike Rapoport 	shmem_inode_unacct_blocks(inode, 1);
25217d64ae3aSAxel Rasmussen 	return ret;
25224c27fe4cSMike Rapoport }
25233460f6e5SAxel Rasmussen #endif /* CONFIG_USERFAULTFD */
25248d103963SMike Rapoport 
25251da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
252692e1d5beSArjan van de Ven static const struct inode_operations shmem_symlink_inode_operations;
252769f07ec9SHugh Dickins static const struct inode_operations shmem_short_symlink_operations;
25281da177e4SLinus Torvalds 
25291da177e4SLinus Torvalds static int
2530800d15a5SNick Piggin shmem_write_begin(struct file *file, struct address_space *mapping,
25319d6b0cd7SMatthew Wilcox (Oracle) 			loff_t pos, unsigned len,
2532800d15a5SNick Piggin 			struct page **pagep, void **fsdata)
25331da177e4SLinus Torvalds {
2534800d15a5SNick Piggin 	struct inode *inode = mapping->host;
253540e041a2SDavid Herrmann 	struct shmem_inode_info *info = SHMEM_I(inode);
253609cbfeafSKirill A. Shutemov 	pgoff_t index = pos >> PAGE_SHIFT;
2537eff1f906SMatthew Wilcox (Oracle) 	struct folio *folio;
2538a7605426SYang Shi 	int ret = 0;
253940e041a2SDavid Herrmann 
25409608703eSJan Kara 	/* i_rwsem is held by caller */
2541ab3948f5SJoel Fernandes (Google) 	if (unlikely(info->seals & (F_SEAL_GROW |
2542ab3948f5SJoel Fernandes (Google) 				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2543ab3948f5SJoel Fernandes (Google) 		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
254440e041a2SDavid Herrmann 			return -EPERM;
254540e041a2SDavid Herrmann 		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
254640e041a2SDavid Herrmann 			return -EPERM;
254740e041a2SDavid Herrmann 	}
254840e041a2SDavid Herrmann 
2549eff1f906SMatthew Wilcox (Oracle) 	ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
2550a7605426SYang Shi 
2551a7605426SYang Shi 	if (ret)
2552a7605426SYang Shi 		return ret;
2553a7605426SYang Shi 
2554eff1f906SMatthew Wilcox (Oracle) 	*pagep = folio_file_page(folio, index);
2555a7605426SYang Shi 	if (PageHWPoison(*pagep)) {
2556eff1f906SMatthew Wilcox (Oracle) 		folio_unlock(folio);
2557eff1f906SMatthew Wilcox (Oracle) 		folio_put(folio);
2558a7605426SYang Shi 		*pagep = NULL;
2559a7605426SYang Shi 		return -EIO;
2560a7605426SYang Shi 	}
2561a7605426SYang Shi 
2562a7605426SYang Shi 	return 0;
2563800d15a5SNick Piggin }
2564800d15a5SNick Piggin 
2565800d15a5SNick Piggin static int
2566800d15a5SNick Piggin shmem_write_end(struct file *file, struct address_space *mapping,
2567800d15a5SNick Piggin 			loff_t pos, unsigned len, unsigned copied,
2568800d15a5SNick Piggin 			struct page *page, void *fsdata)
2569800d15a5SNick Piggin {
2570800d15a5SNick Piggin 	struct inode *inode = mapping->host;
2571800d15a5SNick Piggin 
2572800d15a5SNick Piggin 	if (pos + copied > inode->i_size)
2573800d15a5SNick Piggin 		i_size_write(inode, pos + copied);
2574800d15a5SNick Piggin 
2575ec9516fbSHugh Dickins 	if (!PageUptodate(page)) {
2576800d8c63SKirill A. Shutemov 		struct page *head = compound_head(page);
2577800d8c63SKirill A. Shutemov 		if (PageTransCompound(page)) {
2578800d8c63SKirill A. Shutemov 			int i;
2579800d8c63SKirill A. Shutemov 
2580800d8c63SKirill A. Shutemov 			for (i = 0; i < HPAGE_PMD_NR; i++) {
2581800d8c63SKirill A. Shutemov 				if (head + i == page)
2582800d8c63SKirill A. Shutemov 					continue;
2583800d8c63SKirill A. Shutemov 				clear_highpage(head + i);
2584800d8c63SKirill A. Shutemov 				flush_dcache_page(head + i);
2585800d8c63SKirill A. Shutemov 			}
2586800d8c63SKirill A. Shutemov 		}
258709cbfeafSKirill A. Shutemov 		if (copied < PAGE_SIZE) {
258809cbfeafSKirill A. Shutemov 			unsigned from = pos & (PAGE_SIZE - 1);
2589ec9516fbSHugh Dickins 			zero_user_segments(page, 0, from,
259009cbfeafSKirill A. Shutemov 					from + copied, PAGE_SIZE);
2591ec9516fbSHugh Dickins 		}
2592800d8c63SKirill A. Shutemov 		SetPageUptodate(head);
2593ec9516fbSHugh Dickins 	}
2594d3602444SHugh Dickins 	set_page_dirty(page);
25956746aff7SWu Fengguang 	unlock_page(page);
259609cbfeafSKirill A. Shutemov 	put_page(page);
2597d3602444SHugh Dickins 
2598800d15a5SNick Piggin 	return copied;
25991da177e4SLinus Torvalds }
26001da177e4SLinus Torvalds 
26012ba5bbedSAl Viro static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
26021da177e4SLinus Torvalds {
26036e58e79dSAl Viro 	struct file *file = iocb->ki_filp;
26046e58e79dSAl Viro 	struct inode *inode = file_inode(file);
26051da177e4SLinus Torvalds 	struct address_space *mapping = inode->i_mapping;
260641ffe5d5SHugh Dickins 	pgoff_t index;
260741ffe5d5SHugh Dickins 	unsigned long offset;
2608f7c1d074SGeert Uytterhoeven 	int error = 0;
2609cb66a7a1SAl Viro 	ssize_t retval = 0;
26106e58e79dSAl Viro 	loff_t *ppos = &iocb->ki_pos;
2611a0ee5ec5SHugh Dickins 
261209cbfeafSKirill A. Shutemov 	index = *ppos >> PAGE_SHIFT;
261309cbfeafSKirill A. Shutemov 	offset = *ppos & ~PAGE_MASK;
26141da177e4SLinus Torvalds 
26151da177e4SLinus Torvalds 	for (;;) {
26164601e2fcSMatthew Wilcox (Oracle) 		struct folio *folio = NULL;
26171da177e4SLinus Torvalds 		struct page *page = NULL;
261841ffe5d5SHugh Dickins 		pgoff_t end_index;
261941ffe5d5SHugh Dickins 		unsigned long nr, ret;
26201da177e4SLinus Torvalds 		loff_t i_size = i_size_read(inode);
26211da177e4SLinus Torvalds 
262209cbfeafSKirill A. Shutemov 		end_index = i_size >> PAGE_SHIFT;
26231da177e4SLinus Torvalds 		if (index > end_index)
26241da177e4SLinus Torvalds 			break;
26251da177e4SLinus Torvalds 		if (index == end_index) {
262609cbfeafSKirill A. Shutemov 			nr = i_size & ~PAGE_MASK;
26271da177e4SLinus Torvalds 			if (nr <= offset)
26281da177e4SLinus Torvalds 				break;
26291da177e4SLinus Torvalds 		}
26301da177e4SLinus Torvalds 
26314601e2fcSMatthew Wilcox (Oracle) 		error = shmem_get_folio(inode, index, &folio, SGP_READ);
26326e58e79dSAl Viro 		if (error) {
26336e58e79dSAl Viro 			if (error == -EINVAL)
26346e58e79dSAl Viro 				error = 0;
26351da177e4SLinus Torvalds 			break;
26361da177e4SLinus Torvalds 		}
26374601e2fcSMatthew Wilcox (Oracle) 		if (folio) {
26384601e2fcSMatthew Wilcox (Oracle) 			folio_unlock(folio);
2639a7605426SYang Shi 
26404601e2fcSMatthew Wilcox (Oracle) 			page = folio_file_page(folio, index);
2641a7605426SYang Shi 			if (PageHWPoison(page)) {
26424601e2fcSMatthew Wilcox (Oracle) 				folio_put(folio);
2643a7605426SYang Shi 				error = -EIO;
2644a7605426SYang Shi 				break;
2645a7605426SYang Shi 			}
264675edd345SHugh Dickins 		}
26471da177e4SLinus Torvalds 
26481da177e4SLinus Torvalds 		/*
26491da177e4SLinus Torvalds 		 * We must evaluate after, since reads (unlike writes)
26509608703eSJan Kara 		 * are called without i_rwsem protection against truncate
26511da177e4SLinus Torvalds 		 */
265209cbfeafSKirill A. Shutemov 		nr = PAGE_SIZE;
26531da177e4SLinus Torvalds 		i_size = i_size_read(inode);
265409cbfeafSKirill A. Shutemov 		end_index = i_size >> PAGE_SHIFT;
26551da177e4SLinus Torvalds 		if (index == end_index) {
265609cbfeafSKirill A. Shutemov 			nr = i_size & ~PAGE_MASK;
26571da177e4SLinus Torvalds 			if (nr <= offset) {
26584601e2fcSMatthew Wilcox (Oracle) 				if (folio)
26594601e2fcSMatthew Wilcox (Oracle) 					folio_put(folio);
26601da177e4SLinus Torvalds 				break;
26611da177e4SLinus Torvalds 			}
26621da177e4SLinus Torvalds 		}
26631da177e4SLinus Torvalds 		nr -= offset;
26641da177e4SLinus Torvalds 
26654601e2fcSMatthew Wilcox (Oracle) 		if (folio) {
26661da177e4SLinus Torvalds 			/*
26671da177e4SLinus Torvalds 			 * If users can be writing to this page using arbitrary
26681da177e4SLinus Torvalds 			 * virtual addresses, take care about potential aliasing
26691da177e4SLinus Torvalds 			 * before reading the page on the kernel side.
26701da177e4SLinus Torvalds 			 */
26711da177e4SLinus Torvalds 			if (mapping_writably_mapped(mapping))
26721da177e4SLinus Torvalds 				flush_dcache_page(page);
26731da177e4SLinus Torvalds 			/*
26741da177e4SLinus Torvalds 			 * Mark the page accessed if we read the beginning.
26751da177e4SLinus Torvalds 			 */
26761da177e4SLinus Torvalds 			if (!offset)
26774601e2fcSMatthew Wilcox (Oracle) 				folio_mark_accessed(folio);
26781da177e4SLinus Torvalds 			/*
26791da177e4SLinus Torvalds 			 * Ok, we have the page, and it's up-to-date, so
26801da177e4SLinus Torvalds 			 * now we can copy it to user space...
26811da177e4SLinus Torvalds 			 */
26822ba5bbedSAl Viro 			ret = copy_page_to_iter(page, offset, nr, to);
26834601e2fcSMatthew Wilcox (Oracle) 			folio_put(folio);
26841bdec44bSHugh Dickins 
2685fcb14cb1SAl Viro 		} else if (user_backed_iter(to)) {
26861bdec44bSHugh Dickins 			/*
26871bdec44bSHugh Dickins 			 * Copy to user tends to be so well optimized, but
26881bdec44bSHugh Dickins 			 * clear_user() not so much, that it is noticeably
26891bdec44bSHugh Dickins 			 * faster to copy the zero page instead of clearing.
26901bdec44bSHugh Dickins 			 */
26911bdec44bSHugh Dickins 			ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
26921bdec44bSHugh Dickins 		} else {
26931bdec44bSHugh Dickins 			/*
26941bdec44bSHugh Dickins 			 * But submitting the same page twice in a row to
26951bdec44bSHugh Dickins 			 * splice() - or others? - can result in confusion:
26961bdec44bSHugh Dickins 			 * so don't attempt that optimization on pipes etc.
26971bdec44bSHugh Dickins 			 */
26981bdec44bSHugh Dickins 			ret = iov_iter_zero(nr, to);
26991bdec44bSHugh Dickins 		}
27001bdec44bSHugh Dickins 
27016e58e79dSAl Viro 		retval += ret;
27021da177e4SLinus Torvalds 		offset += ret;
270309cbfeafSKirill A. Shutemov 		index += offset >> PAGE_SHIFT;
270409cbfeafSKirill A. Shutemov 		offset &= ~PAGE_MASK;
27051da177e4SLinus Torvalds 
27062ba5bbedSAl Viro 		if (!iov_iter_count(to))
27071da177e4SLinus Torvalds 			break;
27086e58e79dSAl Viro 		if (ret < nr) {
27096e58e79dSAl Viro 			error = -EFAULT;
27106e58e79dSAl Viro 			break;
27116e58e79dSAl Viro 		}
27121da177e4SLinus Torvalds 		cond_resched();
27131da177e4SLinus Torvalds 	}
27141da177e4SLinus Torvalds 
271509cbfeafSKirill A. Shutemov 	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
27166e58e79dSAl Viro 	file_accessed(file);
27176e58e79dSAl Viro 	return retval ? retval : error;
27181da177e4SLinus Torvalds }
27191da177e4SLinus Torvalds 
2720965c8e59SAndrew Morton static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
2721220f2ac9SHugh Dickins {
2722220f2ac9SHugh Dickins 	struct address_space *mapping = file->f_mapping;
2723220f2ac9SHugh Dickins 	struct inode *inode = mapping->host;
2724220f2ac9SHugh Dickins 
2725965c8e59SAndrew Morton 	if (whence != SEEK_DATA && whence != SEEK_HOLE)
2726965c8e59SAndrew Morton 		return generic_file_llseek_size(file, offset, whence,
2727220f2ac9SHugh Dickins 					MAX_LFS_FILESIZE, i_size_read(inode));
272841139aa4SMatthew Wilcox (Oracle) 	if (offset < 0)
272941139aa4SMatthew Wilcox (Oracle) 		return -ENXIO;
273041139aa4SMatthew Wilcox (Oracle) 
27315955102cSAl Viro 	inode_lock(inode);
27329608703eSJan Kara 	/* We're holding i_rwsem so we can access i_size directly */
273341139aa4SMatthew Wilcox (Oracle) 	offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
2734387aae6fSHugh Dickins 	if (offset >= 0)
273546a1c2c7SJie Liu 		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
27365955102cSAl Viro 	inode_unlock(inode);
2737220f2ac9SHugh Dickins 	return offset;
2738220f2ac9SHugh Dickins }
2739220f2ac9SHugh Dickins 
274083e4fa9cSHugh Dickins static long shmem_fallocate(struct file *file, int mode, loff_t offset,
274183e4fa9cSHugh Dickins 							 loff_t len)
274283e4fa9cSHugh Dickins {
2743496ad9aaSAl Viro 	struct inode *inode = file_inode(file);
2744e2d12e22SHugh Dickins 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
274540e041a2SDavid Herrmann 	struct shmem_inode_info *info = SHMEM_I(inode);
27461aac1400SHugh Dickins 	struct shmem_falloc shmem_falloc;
2747d144bf62SHugh Dickins 	pgoff_t start, index, end, undo_fallocend;
2748e2d12e22SHugh Dickins 	int error;
274983e4fa9cSHugh Dickins 
275013ace4d0SHugh Dickins 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
275113ace4d0SHugh Dickins 		return -EOPNOTSUPP;
275213ace4d0SHugh Dickins 
27535955102cSAl Viro 	inode_lock(inode);
275483e4fa9cSHugh Dickins 
275583e4fa9cSHugh Dickins 	if (mode & FALLOC_FL_PUNCH_HOLE) {
275683e4fa9cSHugh Dickins 		struct address_space *mapping = file->f_mapping;
275783e4fa9cSHugh Dickins 		loff_t unmap_start = round_up(offset, PAGE_SIZE);
275883e4fa9cSHugh Dickins 		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
27598e205f77SHugh Dickins 		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
276083e4fa9cSHugh Dickins 
27619608703eSJan Kara 		/* protected by i_rwsem */
2762ab3948f5SJoel Fernandes (Google) 		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
276340e041a2SDavid Herrmann 			error = -EPERM;
276440e041a2SDavid Herrmann 			goto out;
276540e041a2SDavid Herrmann 		}
276640e041a2SDavid Herrmann 
27678e205f77SHugh Dickins 		shmem_falloc.waitq = &shmem_falloc_waitq;
2768aa71ecd8SChen Jun 		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
2769f00cdc6dSHugh Dickins 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
2770f00cdc6dSHugh Dickins 		spin_lock(&inode->i_lock);
2771f00cdc6dSHugh Dickins 		inode->i_private = &shmem_falloc;
2772f00cdc6dSHugh Dickins 		spin_unlock(&inode->i_lock);
2773f00cdc6dSHugh Dickins 
277483e4fa9cSHugh Dickins 		if ((u64)unmap_end > (u64)unmap_start)
277583e4fa9cSHugh Dickins 			unmap_mapping_range(mapping, unmap_start,
277683e4fa9cSHugh Dickins 					    1 + unmap_end - unmap_start, 0);
277783e4fa9cSHugh Dickins 		shmem_truncate_range(inode, offset, offset + len - 1);
277883e4fa9cSHugh Dickins 		/* No need to unmap again: hole-punching leaves COWed pages */
27798e205f77SHugh Dickins 
27808e205f77SHugh Dickins 		spin_lock(&inode->i_lock);
27818e205f77SHugh Dickins 		inode->i_private = NULL;
27828e205f77SHugh Dickins 		wake_up_all(&shmem_falloc_waitq);
27832055da97SIngo Molnar 		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
27848e205f77SHugh Dickins 		spin_unlock(&inode->i_lock);
278583e4fa9cSHugh Dickins 		error = 0;
27868e205f77SHugh Dickins 		goto out;
278783e4fa9cSHugh Dickins 	}
278883e4fa9cSHugh Dickins 
2789e2d12e22SHugh Dickins 	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
2790e2d12e22SHugh Dickins 	error = inode_newsize_ok(inode, offset + len);
2791e2d12e22SHugh Dickins 	if (error)
2792e2d12e22SHugh Dickins 		goto out;
2793e2d12e22SHugh Dickins 
279440e041a2SDavid Herrmann 	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
279540e041a2SDavid Herrmann 		error = -EPERM;
279640e041a2SDavid Herrmann 		goto out;
279740e041a2SDavid Herrmann 	}
279840e041a2SDavid Herrmann 
279909cbfeafSKirill A. Shutemov 	start = offset >> PAGE_SHIFT;
280009cbfeafSKirill A. Shutemov 	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2801e2d12e22SHugh Dickins 	/* Try to avoid a swapstorm if len is impossible to satisfy */
2802e2d12e22SHugh Dickins 	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
2803e2d12e22SHugh Dickins 		error = -ENOSPC;
2804e2d12e22SHugh Dickins 		goto out;
2805e2d12e22SHugh Dickins 	}
2806e2d12e22SHugh Dickins 
28078e205f77SHugh Dickins 	shmem_falloc.waitq = NULL;
28081aac1400SHugh Dickins 	shmem_falloc.start = start;
28091aac1400SHugh Dickins 	shmem_falloc.next  = start;
28101aac1400SHugh Dickins 	shmem_falloc.nr_falloced = 0;
28111aac1400SHugh Dickins 	shmem_falloc.nr_unswapped = 0;
28121aac1400SHugh Dickins 	spin_lock(&inode->i_lock);
28131aac1400SHugh Dickins 	inode->i_private = &shmem_falloc;
28141aac1400SHugh Dickins 	spin_unlock(&inode->i_lock);
28151aac1400SHugh Dickins 
2816d144bf62SHugh Dickins 	/*
2817d144bf62SHugh Dickins 	 * info->fallocend is only relevant when huge pages might be
2818d144bf62SHugh Dickins 	 * involved: to prevent split_huge_page() freeing fallocated
2819d144bf62SHugh Dickins 	 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
2820d144bf62SHugh Dickins 	 */
2821d144bf62SHugh Dickins 	undo_fallocend = info->fallocend;
2822d144bf62SHugh Dickins 	if (info->fallocend < end)
2823d144bf62SHugh Dickins 		info->fallocend = end;
2824d144bf62SHugh Dickins 
2825050dcb5cSHugh Dickins 	for (index = start; index < end; ) {
2826b0802b22SMatthew Wilcox (Oracle) 		struct folio *folio;
2827e2d12e22SHugh Dickins 
2828e2d12e22SHugh Dickins 		/*
2829e2d12e22SHugh Dickins 		 * Good, the fallocate(2) manpage permits EINTR: we may have
2830e2d12e22SHugh Dickins 		 * been interrupted because we are using up too much memory.
2831e2d12e22SHugh Dickins 		 */
2832e2d12e22SHugh Dickins 		if (signal_pending(current))
2833e2d12e22SHugh Dickins 			error = -EINTR;
28341aac1400SHugh Dickins 		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
28351aac1400SHugh Dickins 			error = -ENOMEM;
2836e2d12e22SHugh Dickins 		else
2837b0802b22SMatthew Wilcox (Oracle) 			error = shmem_get_folio(inode, index, &folio,
2838b0802b22SMatthew Wilcox (Oracle) 						SGP_FALLOC);
2839e2d12e22SHugh Dickins 		if (error) {
2840d144bf62SHugh Dickins 			info->fallocend = undo_fallocend;
2841b0802b22SMatthew Wilcox (Oracle) 			/* Remove the !uptodate folios we added */
28427f556567SHugh Dickins 			if (index > start) {
28431635f6a7SHugh Dickins 				shmem_undo_range(inode,
284409cbfeafSKirill A. Shutemov 				    (loff_t)start << PAGE_SHIFT,
2845b9b4bb26SAnthony Romano 				    ((loff_t)index << PAGE_SHIFT) - 1, true);
28467f556567SHugh Dickins 			}
28471aac1400SHugh Dickins 			goto undone;
2848e2d12e22SHugh Dickins 		}
2849e2d12e22SHugh Dickins 
2850050dcb5cSHugh Dickins 		/*
2851050dcb5cSHugh Dickins 		 * Here is a more important optimization than it appears:
2852b0802b22SMatthew Wilcox (Oracle) 		 * a second SGP_FALLOC on the same large folio will clear it,
2853b0802b22SMatthew Wilcox (Oracle) 		 * making it uptodate and un-undoable if we fail later.
2854050dcb5cSHugh Dickins 		 */
2855b0802b22SMatthew Wilcox (Oracle) 		index = folio_next_index(folio);
2856050dcb5cSHugh Dickins 		/* Beware 32-bit wraparound */
2857050dcb5cSHugh Dickins 		if (!index)
2858050dcb5cSHugh Dickins 			index--;
2859050dcb5cSHugh Dickins 
2860e2d12e22SHugh Dickins 		/*
28611aac1400SHugh Dickins 		 * Inform shmem_writepage() how far we have reached.
28621aac1400SHugh Dickins 		 * No need for lock or barrier: we have the page lock.
28631aac1400SHugh Dickins 		 */
2864b0802b22SMatthew Wilcox (Oracle) 		if (!folio_test_uptodate(folio))
2865050dcb5cSHugh Dickins 			shmem_falloc.nr_falloced += index - shmem_falloc.next;
2866050dcb5cSHugh Dickins 		shmem_falloc.next = index;
28671aac1400SHugh Dickins 
28681aac1400SHugh Dickins 		/*
2869b0802b22SMatthew Wilcox (Oracle) 		 * If !uptodate, leave it that way so that freeable folios
28701635f6a7SHugh Dickins 		 * can be recognized if we need to rollback on error later.
2871b0802b22SMatthew Wilcox (Oracle) 		 * But mark it dirty so that memory pressure will swap rather
2872b0802b22SMatthew Wilcox (Oracle) 		 * than free the folios we are allocating (and SGP_CACHE folios
2873e2d12e22SHugh Dickins 		 * might still be clean: we now need to mark those dirty too).
2874e2d12e22SHugh Dickins 		 */
2875b0802b22SMatthew Wilcox (Oracle) 		folio_mark_dirty(folio);
2876b0802b22SMatthew Wilcox (Oracle) 		folio_unlock(folio);
2877b0802b22SMatthew Wilcox (Oracle) 		folio_put(folio);
2878e2d12e22SHugh Dickins 		cond_resched();
2879e2d12e22SHugh Dickins 	}
2880e2d12e22SHugh Dickins 
2881e2d12e22SHugh Dickins 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
2882e2d12e22SHugh Dickins 		i_size_write(inode, offset + len);
28831aac1400SHugh Dickins undone:
28841aac1400SHugh Dickins 	spin_lock(&inode->i_lock);
28851aac1400SHugh Dickins 	inode->i_private = NULL;
28861aac1400SHugh Dickins 	spin_unlock(&inode->i_lock);
2887e2d12e22SHugh Dickins out:
288815f242bbSHugh Dickins 	if (!error)
288915f242bbSHugh Dickins 		file_modified(file);
28905955102cSAl Viro 	inode_unlock(inode);
289183e4fa9cSHugh Dickins 	return error;
289283e4fa9cSHugh Dickins }
289383e4fa9cSHugh Dickins 
2894726c3342SDavid Howells static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
28951da177e4SLinus Torvalds {
2896726c3342SDavid Howells 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
28971da177e4SLinus Torvalds 
28981da177e4SLinus Torvalds 	buf->f_type = TMPFS_MAGIC;
289909cbfeafSKirill A. Shutemov 	buf->f_bsize = PAGE_SIZE;
29001da177e4SLinus Torvalds 	buf->f_namelen = NAME_MAX;
29010edd73b3SHugh Dickins 	if (sbinfo->max_blocks) {
29021da177e4SLinus Torvalds 		buf->f_blocks = sbinfo->max_blocks;
290341ffe5d5SHugh Dickins 		buf->f_bavail =
290441ffe5d5SHugh Dickins 		buf->f_bfree  = sbinfo->max_blocks -
290541ffe5d5SHugh Dickins 				percpu_counter_sum(&sbinfo->used_blocks);
29060edd73b3SHugh Dickins 	}
29070edd73b3SHugh Dickins 	if (sbinfo->max_inodes) {
29081da177e4SLinus Torvalds 		buf->f_files = sbinfo->max_inodes;
29091da177e4SLinus Torvalds 		buf->f_ffree = sbinfo->free_inodes;
29101da177e4SLinus Torvalds 	}
29111da177e4SLinus Torvalds 	/* else leave those fields 0 like simple_statfs */
291259cda49eSAmir Goldstein 
291359cda49eSAmir Goldstein 	buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
291459cda49eSAmir Goldstein 
29151da177e4SLinus Torvalds 	return 0;
29161da177e4SLinus Torvalds }
29171da177e4SLinus Torvalds 
29181da177e4SLinus Torvalds /*
29191da177e4SLinus Torvalds  * File creation. Allocate an inode, and we're done..
29201da177e4SLinus Torvalds  */
29211da177e4SLinus Torvalds static int
2922549c7297SChristian Brauner shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
2923549c7297SChristian Brauner 	    struct dentry *dentry, umode_t mode, dev_t dev)
29241da177e4SLinus Torvalds {
29250b0a0806SHugh Dickins 	struct inode *inode;
29261da177e4SLinus Torvalds 	int error = -ENOSPC;
29271da177e4SLinus Torvalds 
2928454abafeSDmitry Monakhov 	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
29291da177e4SLinus Torvalds 	if (inode) {
2930feda821eSChristoph Hellwig 		error = simple_acl_create(dir, inode);
2931feda821eSChristoph Hellwig 		if (error)
2932feda821eSChristoph Hellwig 			goto out_iput;
29332a7dba39SEric Paris 		error = security_inode_init_security(inode, dir,
29349d8f13baSMimi Zohar 						     &dentry->d_name,
29356d9d88d0SJarkko Sakkinen 						     shmem_initxattrs, NULL);
2936feda821eSChristoph Hellwig 		if (error && error != -EOPNOTSUPP)
2937feda821eSChristoph Hellwig 			goto out_iput;
293837ec43cdSMimi Zohar 
2939718deb6bSAl Viro 		error = 0;
29401da177e4SLinus Torvalds 		dir->i_size += BOGO_DIRENT_SIZE;
2941078cd827SDeepa Dinamani 		dir->i_ctime = dir->i_mtime = current_time(dir);
294236f05cabSJeff Layton 		inode_inc_iversion(dir);
29431da177e4SLinus Torvalds 		d_instantiate(dentry, inode);
29441da177e4SLinus Torvalds 		dget(dentry); /* Extra count - pin the dentry in core */
29451da177e4SLinus Torvalds 	}
29461da177e4SLinus Torvalds 	return error;
2947feda821eSChristoph Hellwig out_iput:
2948feda821eSChristoph Hellwig 	iput(inode);
2949feda821eSChristoph Hellwig 	return error;
29501da177e4SLinus Torvalds }
29511da177e4SLinus Torvalds 
295260545d0dSAl Viro static int
2953549c7297SChristian Brauner shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
2954863f144fSMiklos Szeredi 	      struct file *file, umode_t mode)
295560545d0dSAl Viro {
295660545d0dSAl Viro 	struct inode *inode;
295760545d0dSAl Viro 	int error = -ENOSPC;
295860545d0dSAl Viro 
295960545d0dSAl Viro 	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
296060545d0dSAl Viro 	if (inode) {
296160545d0dSAl Viro 		error = security_inode_init_security(inode, dir,
296260545d0dSAl Viro 						     NULL,
296360545d0dSAl Viro 						     shmem_initxattrs, NULL);
2964feda821eSChristoph Hellwig 		if (error && error != -EOPNOTSUPP)
2965feda821eSChristoph Hellwig 			goto out_iput;
2966feda821eSChristoph Hellwig 		error = simple_acl_create(dir, inode);
2967feda821eSChristoph Hellwig 		if (error)
2968feda821eSChristoph Hellwig 			goto out_iput;
2969863f144fSMiklos Szeredi 		d_tmpfile(file, inode);
297060545d0dSAl Viro 	}
2971863f144fSMiklos Szeredi 	return finish_open_simple(file, error);
2972feda821eSChristoph Hellwig out_iput:
2973feda821eSChristoph Hellwig 	iput(inode);
2974feda821eSChristoph Hellwig 	return error;
297560545d0dSAl Viro }
297660545d0dSAl Viro 
2977549c7297SChristian Brauner static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
2978549c7297SChristian Brauner 		       struct dentry *dentry, umode_t mode)
29791da177e4SLinus Torvalds {
29801da177e4SLinus Torvalds 	int error;
29811da177e4SLinus Torvalds 
2982549c7297SChristian Brauner 	if ((error = shmem_mknod(&init_user_ns, dir, dentry,
2983549c7297SChristian Brauner 				 mode | S_IFDIR, 0)))
29841da177e4SLinus Torvalds 		return error;
2985d8c76e6fSDave Hansen 	inc_nlink(dir);
29861da177e4SLinus Torvalds 	return 0;
29871da177e4SLinus Torvalds }
29881da177e4SLinus Torvalds 
2989549c7297SChristian Brauner static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
2990549c7297SChristian Brauner 			struct dentry *dentry, umode_t mode, bool excl)
29911da177e4SLinus Torvalds {
2992549c7297SChristian Brauner 	return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
29931da177e4SLinus Torvalds }
29941da177e4SLinus Torvalds 
29951da177e4SLinus Torvalds /*
29961da177e4SLinus Torvalds  * Link a file..
29971da177e4SLinus Torvalds  */
29981da177e4SLinus Torvalds static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
29991da177e4SLinus Torvalds {
300075c3cfa8SDavid Howells 	struct inode *inode = d_inode(old_dentry);
300129b00e60SDarrick J. Wong 	int ret = 0;
30021da177e4SLinus Torvalds 
30031da177e4SLinus Torvalds 	/*
30041da177e4SLinus Torvalds 	 * No ordinary (disk based) filesystem counts links as inodes;
30051da177e4SLinus Torvalds 	 * but each new link needs a new dentry, pinning lowmem, and
30061da177e4SLinus Torvalds 	 * tmpfs dentries cannot be pruned until they are unlinked.
30071062af92SDarrick J. Wong 	 * But if an O_TMPFILE file is linked into the tmpfs, the
30081062af92SDarrick J. Wong 	 * first link must skip that, to get the accounting right.
30091da177e4SLinus Torvalds 	 */
30101062af92SDarrick J. Wong 	if (inode->i_nlink) {
3011e809d5f0SChris Down 		ret = shmem_reserve_inode(inode->i_sb, NULL);
30125b04c689SPavel Emelyanov 		if (ret)
30135b04c689SPavel Emelyanov 			goto out;
30141062af92SDarrick J. Wong 	}
30151da177e4SLinus Torvalds 
30161da177e4SLinus Torvalds 	dir->i_size += BOGO_DIRENT_SIZE;
3017078cd827SDeepa Dinamani 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
301836f05cabSJeff Layton 	inode_inc_iversion(dir);
3019d8c76e6fSDave Hansen 	inc_nlink(inode);
30207de9c6eeSAl Viro 	ihold(inode);	/* New dentry reference */
30211da177e4SLinus Torvalds 	dget(dentry);		/* Extra pinning count for the created dentry */
30221da177e4SLinus Torvalds 	d_instantiate(dentry, inode);
30235b04c689SPavel Emelyanov out:
30245b04c689SPavel Emelyanov 	return ret;
30251da177e4SLinus Torvalds }
30261da177e4SLinus Torvalds 
30271da177e4SLinus Torvalds static int shmem_unlink(struct inode *dir, struct dentry *dentry)
30281da177e4SLinus Torvalds {
302975c3cfa8SDavid Howells 	struct inode *inode = d_inode(dentry);
30301da177e4SLinus Torvalds 
30315b04c689SPavel Emelyanov 	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
30325b04c689SPavel Emelyanov 		shmem_free_inode(inode->i_sb);
30331da177e4SLinus Torvalds 
30341da177e4SLinus Torvalds 	dir->i_size -= BOGO_DIRENT_SIZE;
3035078cd827SDeepa Dinamani 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
303636f05cabSJeff Layton 	inode_inc_iversion(dir);
30379a53c3a7SDave Hansen 	drop_nlink(inode);
30381da177e4SLinus Torvalds 	dput(dentry);	/* Undo the count from "create" - this does all the work */
30391da177e4SLinus Torvalds 	return 0;
30401da177e4SLinus Torvalds }
30411da177e4SLinus Torvalds 
30421da177e4SLinus Torvalds static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
30431da177e4SLinus Torvalds {
30441da177e4SLinus Torvalds 	if (!simple_empty(dentry))
30451da177e4SLinus Torvalds 		return -ENOTEMPTY;
30461da177e4SLinus Torvalds 
304775c3cfa8SDavid Howells 	drop_nlink(d_inode(dentry));
30489a53c3a7SDave Hansen 	drop_nlink(dir);
30491da177e4SLinus Torvalds 	return shmem_unlink(dir, dentry);
30501da177e4SLinus Torvalds }
30511da177e4SLinus Torvalds 
3052549c7297SChristian Brauner static int shmem_whiteout(struct user_namespace *mnt_userns,
3053549c7297SChristian Brauner 			  struct inode *old_dir, struct dentry *old_dentry)
305446fdb794SMiklos Szeredi {
305546fdb794SMiklos Szeredi 	struct dentry *whiteout;
305646fdb794SMiklos Szeredi 	int error;
305746fdb794SMiklos Szeredi 
305846fdb794SMiklos Szeredi 	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
305946fdb794SMiklos Szeredi 	if (!whiteout)
306046fdb794SMiklos Szeredi 		return -ENOMEM;
306146fdb794SMiklos Szeredi 
3062549c7297SChristian Brauner 	error = shmem_mknod(&init_user_ns, old_dir, whiteout,
306346fdb794SMiklos Szeredi 			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
306446fdb794SMiklos Szeredi 	dput(whiteout);
306546fdb794SMiklos Szeredi 	if (error)
306646fdb794SMiklos Szeredi 		return error;
306746fdb794SMiklos Szeredi 
306846fdb794SMiklos Szeredi 	/*
306946fdb794SMiklos Szeredi 	 * Cheat and hash the whiteout while the old dentry is still in
307046fdb794SMiklos Szeredi 	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
307146fdb794SMiklos Szeredi 	 *
307246fdb794SMiklos Szeredi 	 * d_lookup() will consistently find one of them at this point,
307346fdb794SMiklos Szeredi 	 * not sure which one, but that isn't even important.
307446fdb794SMiklos Szeredi 	 */
307546fdb794SMiklos Szeredi 	d_rehash(whiteout);
307646fdb794SMiklos Szeredi 	return 0;
307746fdb794SMiklos Szeredi }
307846fdb794SMiklos Szeredi 
30791da177e4SLinus Torvalds /*
30801da177e4SLinus Torvalds  * The VFS layer already does all the dentry stuff for rename,
30811da177e4SLinus Torvalds  * we just have to decrement the usage count for the target if
30821da177e4SLinus Torvalds  * it exists so that the VFS layer correctly free's it when it
30831da177e4SLinus Torvalds  * gets overwritten.
30841da177e4SLinus Torvalds  */
3085549c7297SChristian Brauner static int shmem_rename2(struct user_namespace *mnt_userns,
3086549c7297SChristian Brauner 			 struct inode *old_dir, struct dentry *old_dentry,
3087549c7297SChristian Brauner 			 struct inode *new_dir, struct dentry *new_dentry,
3088549c7297SChristian Brauner 			 unsigned int flags)
30891da177e4SLinus Torvalds {
309075c3cfa8SDavid Howells 	struct inode *inode = d_inode(old_dentry);
30911da177e4SLinus Torvalds 	int they_are_dirs = S_ISDIR(inode->i_mode);
30921da177e4SLinus Torvalds 
309346fdb794SMiklos Szeredi 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
30943b69ff51SMiklos Szeredi 		return -EINVAL;
30953b69ff51SMiklos Szeredi 
309637456771SMiklos Szeredi 	if (flags & RENAME_EXCHANGE)
30976429e463SLorenz Bauer 		return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
309837456771SMiklos Szeredi 
30991da177e4SLinus Torvalds 	if (!simple_empty(new_dentry))
31001da177e4SLinus Torvalds 		return -ENOTEMPTY;
31011da177e4SLinus Torvalds 
310246fdb794SMiklos Szeredi 	if (flags & RENAME_WHITEOUT) {
310346fdb794SMiklos Szeredi 		int error;
310446fdb794SMiklos Szeredi 
3105549c7297SChristian Brauner 		error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
310646fdb794SMiklos Szeredi 		if (error)
310746fdb794SMiklos Szeredi 			return error;
310846fdb794SMiklos Szeredi 	}
310946fdb794SMiklos Szeredi 
311075c3cfa8SDavid Howells 	if (d_really_is_positive(new_dentry)) {
31111da177e4SLinus Torvalds 		(void) shmem_unlink(new_dir, new_dentry);
3112b928095bSMiklos Szeredi 		if (they_are_dirs) {
311375c3cfa8SDavid Howells 			drop_nlink(d_inode(new_dentry));
31149a53c3a7SDave Hansen 			drop_nlink(old_dir);
3115b928095bSMiklos Szeredi 		}
31161da177e4SLinus Torvalds 	} else if (they_are_dirs) {
31179a53c3a7SDave Hansen 		drop_nlink(old_dir);
3118d8c76e6fSDave Hansen 		inc_nlink(new_dir);
31191da177e4SLinus Torvalds 	}
31201da177e4SLinus Torvalds 
31211da177e4SLinus Torvalds 	old_dir->i_size -= BOGO_DIRENT_SIZE;
31221da177e4SLinus Torvalds 	new_dir->i_size += BOGO_DIRENT_SIZE;
31231da177e4SLinus Torvalds 	old_dir->i_ctime = old_dir->i_mtime =
31241da177e4SLinus Torvalds 	new_dir->i_ctime = new_dir->i_mtime =
3125078cd827SDeepa Dinamani 	inode->i_ctime = current_time(old_dir);
312636f05cabSJeff Layton 	inode_inc_iversion(old_dir);
312736f05cabSJeff Layton 	inode_inc_iversion(new_dir);
31281da177e4SLinus Torvalds 	return 0;
31291da177e4SLinus Torvalds }
31301da177e4SLinus Torvalds 
3131549c7297SChristian Brauner static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
3132549c7297SChristian Brauner 			 struct dentry *dentry, const char *symname)
31331da177e4SLinus Torvalds {
31341da177e4SLinus Torvalds 	int error;
31351da177e4SLinus Torvalds 	int len;
31361da177e4SLinus Torvalds 	struct inode *inode;
31377ad0414bSMatthew Wilcox (Oracle) 	struct folio *folio;
31381da177e4SLinus Torvalds 
31391da177e4SLinus Torvalds 	len = strlen(symname) + 1;
314009cbfeafSKirill A. Shutemov 	if (len > PAGE_SIZE)
31411da177e4SLinus Torvalds 		return -ENAMETOOLONG;
31421da177e4SLinus Torvalds 
31430825a6f9SJoe Perches 	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
31440825a6f9SJoe Perches 				VM_NORESERVE);
31451da177e4SLinus Torvalds 	if (!inode)
31461da177e4SLinus Torvalds 		return -ENOSPC;
31471da177e4SLinus Torvalds 
31489d8f13baSMimi Zohar 	error = security_inode_init_security(inode, dir, &dentry->d_name,
31496d9d88d0SJarkko Sakkinen 					     shmem_initxattrs, NULL);
3150343c3d7fSMateusz Nosek 	if (error && error != -EOPNOTSUPP) {
3151570bc1c2SStephen Smalley 		iput(inode);
3152570bc1c2SStephen Smalley 		return error;
3153570bc1c2SStephen Smalley 	}
3154570bc1c2SStephen Smalley 
31551da177e4SLinus Torvalds 	inode->i_size = len-1;
315669f07ec9SHugh Dickins 	if (len <= SHORT_SYMLINK_LEN) {
31573ed47db3SAl Viro 		inode->i_link = kmemdup(symname, len, GFP_KERNEL);
31583ed47db3SAl Viro 		if (!inode->i_link) {
315969f07ec9SHugh Dickins 			iput(inode);
316069f07ec9SHugh Dickins 			return -ENOMEM;
316169f07ec9SHugh Dickins 		}
316269f07ec9SHugh Dickins 		inode->i_op = &shmem_short_symlink_operations;
31631da177e4SLinus Torvalds 	} else {
3164e8ecde25SAl Viro 		inode_nohighmem(inode);
31657ad0414bSMatthew Wilcox (Oracle) 		error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
31661da177e4SLinus Torvalds 		if (error) {
31671da177e4SLinus Torvalds 			iput(inode);
31681da177e4SLinus Torvalds 			return error;
31691da177e4SLinus Torvalds 		}
317014fcc23fSHugh Dickins 		inode->i_mapping->a_ops = &shmem_aops;
31711da177e4SLinus Torvalds 		inode->i_op = &shmem_symlink_inode_operations;
31727ad0414bSMatthew Wilcox (Oracle) 		memcpy(folio_address(folio), symname, len);
31737ad0414bSMatthew Wilcox (Oracle) 		folio_mark_uptodate(folio);
31747ad0414bSMatthew Wilcox (Oracle) 		folio_mark_dirty(folio);
31757ad0414bSMatthew Wilcox (Oracle) 		folio_unlock(folio);
31767ad0414bSMatthew Wilcox (Oracle) 		folio_put(folio);
31771da177e4SLinus Torvalds 	}
31781da177e4SLinus Torvalds 	dir->i_size += BOGO_DIRENT_SIZE;
3179078cd827SDeepa Dinamani 	dir->i_ctime = dir->i_mtime = current_time(dir);
318036f05cabSJeff Layton 	inode_inc_iversion(dir);
31811da177e4SLinus Torvalds 	d_instantiate(dentry, inode);
31821da177e4SLinus Torvalds 	dget(dentry);
31831da177e4SLinus Torvalds 	return 0;
31841da177e4SLinus Torvalds }
31851da177e4SLinus Torvalds 
3186fceef393SAl Viro static void shmem_put_link(void *arg)
3187fceef393SAl Viro {
3188e4b57722SMatthew Wilcox (Oracle) 	folio_mark_accessed(arg);
3189e4b57722SMatthew Wilcox (Oracle) 	folio_put(arg);
3190fceef393SAl Viro }
3191fceef393SAl Viro 
31926b255391SAl Viro static const char *shmem_get_link(struct dentry *dentry,
3193fceef393SAl Viro 				  struct inode *inode,
3194fceef393SAl Viro 				  struct delayed_call *done)
31951da177e4SLinus Torvalds {
3196e4b57722SMatthew Wilcox (Oracle) 	struct folio *folio = NULL;
31976b255391SAl Viro 	int error;
3198e4b57722SMatthew Wilcox (Oracle) 
31996a6c9904SAl Viro 	if (!dentry) {
3200e4b57722SMatthew Wilcox (Oracle) 		folio = filemap_get_folio(inode->i_mapping, 0);
3201e4b57722SMatthew Wilcox (Oracle) 		if (!folio)
32026b255391SAl Viro 			return ERR_PTR(-ECHILD);
32037459c149SMatthew Wilcox (Oracle) 		if (PageHWPoison(folio_page(folio, 0)) ||
3204e4b57722SMatthew Wilcox (Oracle) 		    !folio_test_uptodate(folio)) {
3205e4b57722SMatthew Wilcox (Oracle) 			folio_put(folio);
32066a6c9904SAl Viro 			return ERR_PTR(-ECHILD);
32076a6c9904SAl Viro 		}
32086a6c9904SAl Viro 	} else {
3209e4b57722SMatthew Wilcox (Oracle) 		error = shmem_get_folio(inode, 0, &folio, SGP_READ);
3210680baacbSAl Viro 		if (error)
3211680baacbSAl Viro 			return ERR_PTR(error);
3212e4b57722SMatthew Wilcox (Oracle) 		if (!folio)
3213a7605426SYang Shi 			return ERR_PTR(-ECHILD);
32147459c149SMatthew Wilcox (Oracle) 		if (PageHWPoison(folio_page(folio, 0))) {
3215e4b57722SMatthew Wilcox (Oracle) 			folio_unlock(folio);
3216e4b57722SMatthew Wilcox (Oracle) 			folio_put(folio);
3217a7605426SYang Shi 			return ERR_PTR(-ECHILD);
3218a7605426SYang Shi 		}
3219e4b57722SMatthew Wilcox (Oracle) 		folio_unlock(folio);
32201da177e4SLinus Torvalds 	}
3221e4b57722SMatthew Wilcox (Oracle) 	set_delayed_call(done, shmem_put_link, folio);
3222e4b57722SMatthew Wilcox (Oracle) 	return folio_address(folio);
32231da177e4SLinus Torvalds }
32241da177e4SLinus Torvalds 
3225b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
3226e408e695STheodore Ts'o 
3227e408e695STheodore Ts'o static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
3228e408e695STheodore Ts'o {
3229e408e695STheodore Ts'o 	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3230e408e695STheodore Ts'o 
3231e408e695STheodore Ts'o 	fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
3232e408e695STheodore Ts'o 
3233e408e695STheodore Ts'o 	return 0;
3234e408e695STheodore Ts'o }
3235e408e695STheodore Ts'o 
3236e408e695STheodore Ts'o static int shmem_fileattr_set(struct user_namespace *mnt_userns,
3237e408e695STheodore Ts'o 			      struct dentry *dentry, struct fileattr *fa)
3238e408e695STheodore Ts'o {
3239e408e695STheodore Ts'o 	struct inode *inode = d_inode(dentry);
3240e408e695STheodore Ts'o 	struct shmem_inode_info *info = SHMEM_I(inode);
3241e408e695STheodore Ts'o 
3242e408e695STheodore Ts'o 	if (fileattr_has_fsx(fa))
3243e408e695STheodore Ts'o 		return -EOPNOTSUPP;
3244cb241339SHugh Dickins 	if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
3245cb241339SHugh Dickins 		return -EOPNOTSUPP;
3246e408e695STheodore Ts'o 
3247e408e695STheodore Ts'o 	info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
3248e408e695STheodore Ts'o 		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
3249e408e695STheodore Ts'o 
3250cb241339SHugh Dickins 	shmem_set_inode_flags(inode, info->fsflags);
3251e408e695STheodore Ts'o 	inode->i_ctime = current_time(inode);
325236f05cabSJeff Layton 	inode_inc_iversion(inode);
3253e408e695STheodore Ts'o 	return 0;
3254e408e695STheodore Ts'o }
3255e408e695STheodore Ts'o 
3256b09e0fa4SEric Paris /*
3257b09e0fa4SEric Paris  * Superblocks without xattr inode operations may get some security.* xattr
3258b09e0fa4SEric Paris  * support from the LSM "for free". As soon as we have any other xattrs
3259b09e0fa4SEric Paris  * like ACLs, we also need to implement the security.* handlers at
3260b09e0fa4SEric Paris  * filesystem level, though.
3261b09e0fa4SEric Paris  */
3262b09e0fa4SEric Paris 
32636d9d88d0SJarkko Sakkinen /*
32646d9d88d0SJarkko Sakkinen  * Callback for security_inode_init_security() for acquiring xattrs.
32656d9d88d0SJarkko Sakkinen  */
32666d9d88d0SJarkko Sakkinen static int shmem_initxattrs(struct inode *inode,
32676d9d88d0SJarkko Sakkinen 			    const struct xattr *xattr_array,
32686d9d88d0SJarkko Sakkinen 			    void *fs_info)
32696d9d88d0SJarkko Sakkinen {
32706d9d88d0SJarkko Sakkinen 	struct shmem_inode_info *info = SHMEM_I(inode);
32716d9d88d0SJarkko Sakkinen 	const struct xattr *xattr;
327238f38657SAristeu Rozanski 	struct simple_xattr *new_xattr;
32736d9d88d0SJarkko Sakkinen 	size_t len;
32746d9d88d0SJarkko Sakkinen 
32756d9d88d0SJarkko Sakkinen 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
327638f38657SAristeu Rozanski 		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
32776d9d88d0SJarkko Sakkinen 		if (!new_xattr)
32786d9d88d0SJarkko Sakkinen 			return -ENOMEM;
32796d9d88d0SJarkko Sakkinen 
32806d9d88d0SJarkko Sakkinen 		len = strlen(xattr->name) + 1;
32816d9d88d0SJarkko Sakkinen 		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
32826d9d88d0SJarkko Sakkinen 					  GFP_KERNEL);
32836d9d88d0SJarkko Sakkinen 		if (!new_xattr->name) {
32843bef735aSChengguang Xu 			kvfree(new_xattr);
32856d9d88d0SJarkko Sakkinen 			return -ENOMEM;
32866d9d88d0SJarkko Sakkinen 		}
32876d9d88d0SJarkko Sakkinen 
32886d9d88d0SJarkko Sakkinen 		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
32896d9d88d0SJarkko Sakkinen 		       XATTR_SECURITY_PREFIX_LEN);
32906d9d88d0SJarkko Sakkinen 		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
32916d9d88d0SJarkko Sakkinen 		       xattr->name, len);
32926d9d88d0SJarkko Sakkinen 
32933b4c7bc0SChristian Brauner 		simple_xattr_add(&info->xattrs, new_xattr);
32946d9d88d0SJarkko Sakkinen 	}
32956d9d88d0SJarkko Sakkinen 
32966d9d88d0SJarkko Sakkinen 	return 0;
32976d9d88d0SJarkko Sakkinen }
32986d9d88d0SJarkko Sakkinen 
3299aa7c5241SAndreas Gruenbacher static int shmem_xattr_handler_get(const struct xattr_handler *handler,
3300b296821aSAl Viro 				   struct dentry *unused, struct inode *inode,
3301b296821aSAl Viro 				   const char *name, void *buffer, size_t size)
3302aa7c5241SAndreas Gruenbacher {
3303b296821aSAl Viro 	struct shmem_inode_info *info = SHMEM_I(inode);
3304aa7c5241SAndreas Gruenbacher 
3305aa7c5241SAndreas Gruenbacher 	name = xattr_full_name(handler, name);
3306aa7c5241SAndreas Gruenbacher 	return simple_xattr_get(&info->xattrs, name, buffer, size);
3307aa7c5241SAndreas Gruenbacher }
3308aa7c5241SAndreas Gruenbacher 
3309aa7c5241SAndreas Gruenbacher static int shmem_xattr_handler_set(const struct xattr_handler *handler,
3310e65ce2a5SChristian Brauner 				   struct user_namespace *mnt_userns,
331159301226SAl Viro 				   struct dentry *unused, struct inode *inode,
331259301226SAl Viro 				   const char *name, const void *value,
331359301226SAl Viro 				   size_t size, int flags)
3314aa7c5241SAndreas Gruenbacher {
331559301226SAl Viro 	struct shmem_inode_info *info = SHMEM_I(inode);
331636f05cabSJeff Layton 	int err;
3317aa7c5241SAndreas Gruenbacher 
3318aa7c5241SAndreas Gruenbacher 	name = xattr_full_name(handler, name);
331936f05cabSJeff Layton 	err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
332036f05cabSJeff Layton 	if (!err) {
332136f05cabSJeff Layton 		inode->i_ctime = current_time(inode);
332236f05cabSJeff Layton 		inode_inc_iversion(inode);
332336f05cabSJeff Layton 	}
332436f05cabSJeff Layton 	return err;
3325aa7c5241SAndreas Gruenbacher }
3326aa7c5241SAndreas Gruenbacher 
3327aa7c5241SAndreas Gruenbacher static const struct xattr_handler shmem_security_xattr_handler = {
3328aa7c5241SAndreas Gruenbacher 	.prefix = XATTR_SECURITY_PREFIX,
3329aa7c5241SAndreas Gruenbacher 	.get = shmem_xattr_handler_get,
3330aa7c5241SAndreas Gruenbacher 	.set = shmem_xattr_handler_set,
3331aa7c5241SAndreas Gruenbacher };
3332aa7c5241SAndreas Gruenbacher 
3333aa7c5241SAndreas Gruenbacher static const struct xattr_handler shmem_trusted_xattr_handler = {
3334aa7c5241SAndreas Gruenbacher 	.prefix = XATTR_TRUSTED_PREFIX,
3335aa7c5241SAndreas Gruenbacher 	.get = shmem_xattr_handler_get,
3336aa7c5241SAndreas Gruenbacher 	.set = shmem_xattr_handler_set,
3337aa7c5241SAndreas Gruenbacher };
3338aa7c5241SAndreas Gruenbacher 
3339b09e0fa4SEric Paris static const struct xattr_handler *shmem_xattr_handlers[] = {
3340b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_POSIX_ACL
3341feda821eSChristoph Hellwig 	&posix_acl_access_xattr_handler,
3342feda821eSChristoph Hellwig 	&posix_acl_default_xattr_handler,
3343b09e0fa4SEric Paris #endif
3344aa7c5241SAndreas Gruenbacher 	&shmem_security_xattr_handler,
3345aa7c5241SAndreas Gruenbacher 	&shmem_trusted_xattr_handler,
3346b09e0fa4SEric Paris 	NULL
3347b09e0fa4SEric Paris };
3348b09e0fa4SEric Paris 
3349b09e0fa4SEric Paris static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3350b09e0fa4SEric Paris {
335175c3cfa8SDavid Howells 	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3352786534b9SAndreas Gruenbacher 	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
3353b09e0fa4SEric Paris }
3354b09e0fa4SEric Paris #endif /* CONFIG_TMPFS_XATTR */
3355b09e0fa4SEric Paris 
335669f07ec9SHugh Dickins static const struct inode_operations shmem_short_symlink_operations = {
3357f7cd16a5SXavier Roche 	.getattr	= shmem_getattr,
33586b255391SAl Viro 	.get_link	= simple_get_link,
3359b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
3360b09e0fa4SEric Paris 	.listxattr	= shmem_listxattr,
3361b09e0fa4SEric Paris #endif
33621da177e4SLinus Torvalds };
33631da177e4SLinus Torvalds 
336492e1d5beSArjan van de Ven static const struct inode_operations shmem_symlink_inode_operations = {
3365f7cd16a5SXavier Roche 	.getattr	= shmem_getattr,
33666b255391SAl Viro 	.get_link	= shmem_get_link,
3367b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
3368b09e0fa4SEric Paris 	.listxattr	= shmem_listxattr,
336939f0247dSAndreas Gruenbacher #endif
3370b09e0fa4SEric Paris };
337139f0247dSAndreas Gruenbacher 
337291828a40SDavid M. Grimes static struct dentry *shmem_get_parent(struct dentry *child)
337391828a40SDavid M. Grimes {
337491828a40SDavid M. Grimes 	return ERR_PTR(-ESTALE);
337591828a40SDavid M. Grimes }
337691828a40SDavid M. Grimes 
337791828a40SDavid M. Grimes static int shmem_match(struct inode *ino, void *vfh)
337891828a40SDavid M. Grimes {
337991828a40SDavid M. Grimes 	__u32 *fh = vfh;
338091828a40SDavid M. Grimes 	__u64 inum = fh[2];
338191828a40SDavid M. Grimes 	inum = (inum << 32) | fh[1];
338291828a40SDavid M. Grimes 	return ino->i_ino == inum && fh[0] == ino->i_generation;
338391828a40SDavid M. Grimes }
338491828a40SDavid M. Grimes 
338512ba780dSAmir Goldstein /* Find any alias of inode, but prefer a hashed alias */
338612ba780dSAmir Goldstein static struct dentry *shmem_find_alias(struct inode *inode)
338712ba780dSAmir Goldstein {
338812ba780dSAmir Goldstein 	struct dentry *alias = d_find_alias(inode);
338912ba780dSAmir Goldstein 
339012ba780dSAmir Goldstein 	return alias ?: d_find_any_alias(inode);
339112ba780dSAmir Goldstein }
339212ba780dSAmir Goldstein 
339312ba780dSAmir Goldstein 
3394480b116cSChristoph Hellwig static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3395480b116cSChristoph Hellwig 		struct fid *fid, int fh_len, int fh_type)
339691828a40SDavid M. Grimes {
339791828a40SDavid M. Grimes 	struct inode *inode;
3398480b116cSChristoph Hellwig 	struct dentry *dentry = NULL;
339935c2a7f4SHugh Dickins 	u64 inum;
340091828a40SDavid M. Grimes 
3401480b116cSChristoph Hellwig 	if (fh_len < 3)
3402480b116cSChristoph Hellwig 		return NULL;
3403480b116cSChristoph Hellwig 
340435c2a7f4SHugh Dickins 	inum = fid->raw[2];
340535c2a7f4SHugh Dickins 	inum = (inum << 32) | fid->raw[1];
340635c2a7f4SHugh Dickins 
3407480b116cSChristoph Hellwig 	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3408480b116cSChristoph Hellwig 			shmem_match, fid->raw);
340991828a40SDavid M. Grimes 	if (inode) {
341012ba780dSAmir Goldstein 		dentry = shmem_find_alias(inode);
341191828a40SDavid M. Grimes 		iput(inode);
341291828a40SDavid M. Grimes 	}
341391828a40SDavid M. Grimes 
3414480b116cSChristoph Hellwig 	return dentry;
341591828a40SDavid M. Grimes }
341691828a40SDavid M. Grimes 
3417b0b0382bSAl Viro static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
3418b0b0382bSAl Viro 				struct inode *parent)
341991828a40SDavid M. Grimes {
34205fe0c237SAneesh Kumar K.V 	if (*len < 3) {
34215fe0c237SAneesh Kumar K.V 		*len = 3;
342294e07a75SNamjae Jeon 		return FILEID_INVALID;
34235fe0c237SAneesh Kumar K.V 	}
342491828a40SDavid M. Grimes 
34251d3382cbSAl Viro 	if (inode_unhashed(inode)) {
342691828a40SDavid M. Grimes 		/* Unfortunately insert_inode_hash is not idempotent,
342791828a40SDavid M. Grimes 		 * so as we hash inodes here rather than at creation
342891828a40SDavid M. Grimes 		 * time, we need a lock to ensure we only try
342991828a40SDavid M. Grimes 		 * to do it once
343091828a40SDavid M. Grimes 		 */
343191828a40SDavid M. Grimes 		static DEFINE_SPINLOCK(lock);
343291828a40SDavid M. Grimes 		spin_lock(&lock);
34331d3382cbSAl Viro 		if (inode_unhashed(inode))
343491828a40SDavid M. Grimes 			__insert_inode_hash(inode,
343591828a40SDavid M. Grimes 					    inode->i_ino + inode->i_generation);
343691828a40SDavid M. Grimes 		spin_unlock(&lock);
343791828a40SDavid M. Grimes 	}
343891828a40SDavid M. Grimes 
343991828a40SDavid M. Grimes 	fh[0] = inode->i_generation;
344091828a40SDavid M. Grimes 	fh[1] = inode->i_ino;
344191828a40SDavid M. Grimes 	fh[2] = ((__u64)inode->i_ino) >> 32;
344291828a40SDavid M. Grimes 
344391828a40SDavid M. Grimes 	*len = 3;
344491828a40SDavid M. Grimes 	return 1;
344591828a40SDavid M. Grimes }
344691828a40SDavid M. Grimes 
344739655164SChristoph Hellwig static const struct export_operations shmem_export_ops = {
344891828a40SDavid M. Grimes 	.get_parent     = shmem_get_parent,
344991828a40SDavid M. Grimes 	.encode_fh      = shmem_encode_fh,
3450480b116cSChristoph Hellwig 	.fh_to_dentry	= shmem_fh_to_dentry,
345191828a40SDavid M. Grimes };
345291828a40SDavid M. Grimes 
3453626c3920SAl Viro enum shmem_param {
3454626c3920SAl Viro 	Opt_gid,
3455626c3920SAl Viro 	Opt_huge,
3456626c3920SAl Viro 	Opt_mode,
3457626c3920SAl Viro 	Opt_mpol,
3458626c3920SAl Viro 	Opt_nr_blocks,
3459626c3920SAl Viro 	Opt_nr_inodes,
3460626c3920SAl Viro 	Opt_size,
3461626c3920SAl Viro 	Opt_uid,
3462ea3271f7SChris Down 	Opt_inode32,
3463ea3271f7SChris Down 	Opt_inode64,
3464626c3920SAl Viro };
34651da177e4SLinus Torvalds 
34665eede625SAl Viro static const struct constant_table shmem_param_enums_huge[] = {
34672710c957SAl Viro 	{"never",	SHMEM_HUGE_NEVER },
34682710c957SAl Viro 	{"always",	SHMEM_HUGE_ALWAYS },
34692710c957SAl Viro 	{"within_size",	SHMEM_HUGE_WITHIN_SIZE },
34702710c957SAl Viro 	{"advise",	SHMEM_HUGE_ADVISE },
34712710c957SAl Viro 	{}
34722710c957SAl Viro };
34732710c957SAl Viro 
3474d7167b14SAl Viro const struct fs_parameter_spec shmem_fs_parameters[] = {
3475626c3920SAl Viro 	fsparam_u32   ("gid",		Opt_gid),
34762710c957SAl Viro 	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
3477626c3920SAl Viro 	fsparam_u32oct("mode",		Opt_mode),
3478626c3920SAl Viro 	fsparam_string("mpol",		Opt_mpol),
3479626c3920SAl Viro 	fsparam_string("nr_blocks",	Opt_nr_blocks),
3480626c3920SAl Viro 	fsparam_string("nr_inodes",	Opt_nr_inodes),
3481626c3920SAl Viro 	fsparam_string("size",		Opt_size),
3482626c3920SAl Viro 	fsparam_u32   ("uid",		Opt_uid),
3483ea3271f7SChris Down 	fsparam_flag  ("inode32",	Opt_inode32),
3484ea3271f7SChris Down 	fsparam_flag  ("inode64",	Opt_inode64),
3485626c3920SAl Viro 	{}
3486626c3920SAl Viro };
3487626c3920SAl Viro 
3488f3235626SDavid Howells static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
3489626c3920SAl Viro {
3490f3235626SDavid Howells 	struct shmem_options *ctx = fc->fs_private;
3491626c3920SAl Viro 	struct fs_parse_result result;
3492e04dc423SAl Viro 	unsigned long long size;
3493626c3920SAl Viro 	char *rest;
3494626c3920SAl Viro 	int opt;
3495626c3920SAl Viro 
3496d7167b14SAl Viro 	opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3497f3235626SDavid Howells 	if (opt < 0)
3498626c3920SAl Viro 		return opt;
3499626c3920SAl Viro 
3500626c3920SAl Viro 	switch (opt) {
3501626c3920SAl Viro 	case Opt_size:
3502626c3920SAl Viro 		size = memparse(param->string, &rest);
3503e04dc423SAl Viro 		if (*rest == '%') {
3504e04dc423SAl Viro 			size <<= PAGE_SHIFT;
3505e04dc423SAl Viro 			size *= totalram_pages();
3506e04dc423SAl Viro 			do_div(size, 100);
3507e04dc423SAl Viro 			rest++;
3508e04dc423SAl Viro 		}
3509e04dc423SAl Viro 		if (*rest)
3510626c3920SAl Viro 			goto bad_value;
3511e04dc423SAl Viro 		ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3512e04dc423SAl Viro 		ctx->seen |= SHMEM_SEEN_BLOCKS;
3513626c3920SAl Viro 		break;
3514626c3920SAl Viro 	case Opt_nr_blocks:
3515626c3920SAl Viro 		ctx->blocks = memparse(param->string, &rest);
35160c98c8e1SZhaoLong Wang 		if (*rest || ctx->blocks > S64_MAX)
3517626c3920SAl Viro 			goto bad_value;
3518e04dc423SAl Viro 		ctx->seen |= SHMEM_SEEN_BLOCKS;
3519626c3920SAl Viro 		break;
3520626c3920SAl Viro 	case Opt_nr_inodes:
3521626c3920SAl Viro 		ctx->inodes = memparse(param->string, &rest);
3522e04dc423SAl Viro 		if (*rest)
3523626c3920SAl Viro 			goto bad_value;
3524e04dc423SAl Viro 		ctx->seen |= SHMEM_SEEN_INODES;
3525626c3920SAl Viro 		break;
3526626c3920SAl Viro 	case Opt_mode:
3527626c3920SAl Viro 		ctx->mode = result.uint_32 & 07777;
3528626c3920SAl Viro 		break;
3529626c3920SAl Viro 	case Opt_uid:
3530626c3920SAl Viro 		ctx->uid = make_kuid(current_user_ns(), result.uint_32);
3531e04dc423SAl Viro 		if (!uid_valid(ctx->uid))
3532626c3920SAl Viro 			goto bad_value;
3533626c3920SAl Viro 		break;
3534626c3920SAl Viro 	case Opt_gid:
3535626c3920SAl Viro 		ctx->gid = make_kgid(current_user_ns(), result.uint_32);
3536e04dc423SAl Viro 		if (!gid_valid(ctx->gid))
3537626c3920SAl Viro 			goto bad_value;
3538626c3920SAl Viro 		break;
3539626c3920SAl Viro 	case Opt_huge:
3540626c3920SAl Viro 		ctx->huge = result.uint_32;
3541626c3920SAl Viro 		if (ctx->huge != SHMEM_HUGE_NEVER &&
3542396bcc52SMatthew Wilcox (Oracle) 		    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3543626c3920SAl Viro 		      has_transparent_hugepage()))
3544626c3920SAl Viro 			goto unsupported_parameter;
3545e04dc423SAl Viro 		ctx->seen |= SHMEM_SEEN_HUGE;
3546626c3920SAl Viro 		break;
3547626c3920SAl Viro 	case Opt_mpol:
3548626c3920SAl Viro 		if (IS_ENABLED(CONFIG_NUMA)) {
3549e04dc423SAl Viro 			mpol_put(ctx->mpol);
3550e04dc423SAl Viro 			ctx->mpol = NULL;
3551626c3920SAl Viro 			if (mpol_parse_str(param->string, &ctx->mpol))
3552626c3920SAl Viro 				goto bad_value;
3553626c3920SAl Viro 			break;
3554626c3920SAl Viro 		}
3555626c3920SAl Viro 		goto unsupported_parameter;
3556ea3271f7SChris Down 	case Opt_inode32:
3557ea3271f7SChris Down 		ctx->full_inums = false;
3558ea3271f7SChris Down 		ctx->seen |= SHMEM_SEEN_INUMS;
3559ea3271f7SChris Down 		break;
3560ea3271f7SChris Down 	case Opt_inode64:
3561ea3271f7SChris Down 		if (sizeof(ino_t) < 8) {
3562ea3271f7SChris Down 			return invalfc(fc,
3563ea3271f7SChris Down 				       "Cannot use inode64 with <64bit inums in kernel\n");
3564ea3271f7SChris Down 		}
3565ea3271f7SChris Down 		ctx->full_inums = true;
3566ea3271f7SChris Down 		ctx->seen |= SHMEM_SEEN_INUMS;
3567ea3271f7SChris Down 		break;
3568e04dc423SAl Viro 	}
3569e04dc423SAl Viro 	return 0;
3570e04dc423SAl Viro 
3571626c3920SAl Viro unsupported_parameter:
3572f35aa2bcSAl Viro 	return invalfc(fc, "Unsupported parameter '%s'", param->key);
3573626c3920SAl Viro bad_value:
3574f35aa2bcSAl Viro 	return invalfc(fc, "Bad value for '%s'", param->key);
3575e04dc423SAl Viro }
3576e04dc423SAl Viro 
3577f3235626SDavid Howells static int shmem_parse_options(struct fs_context *fc, void *data)
3578e04dc423SAl Viro {
3579f3235626SDavid Howells 	char *options = data;
3580f3235626SDavid Howells 
358133f37c64SAl Viro 	if (options) {
358233f37c64SAl Viro 		int err = security_sb_eat_lsm_opts(options, &fc->security);
358333f37c64SAl Viro 		if (err)
358433f37c64SAl Viro 			return err;
358533f37c64SAl Viro 	}
358633f37c64SAl Viro 
3587b00dc3adSHugh Dickins 	while (options != NULL) {
3588626c3920SAl Viro 		char *this_char = options;
3589b00dc3adSHugh Dickins 		for (;;) {
3590b00dc3adSHugh Dickins 			/*
3591b00dc3adSHugh Dickins 			 * NUL-terminate this option: unfortunately,
3592b00dc3adSHugh Dickins 			 * mount options form a comma-separated list,
3593b00dc3adSHugh Dickins 			 * but mpol's nodelist may also contain commas.
3594b00dc3adSHugh Dickins 			 */
3595b00dc3adSHugh Dickins 			options = strchr(options, ',');
3596b00dc3adSHugh Dickins 			if (options == NULL)
3597b00dc3adSHugh Dickins 				break;
3598b00dc3adSHugh Dickins 			options++;
3599b00dc3adSHugh Dickins 			if (!isdigit(*options)) {
3600b00dc3adSHugh Dickins 				options[-1] = '\0';
3601b00dc3adSHugh Dickins 				break;
3602b00dc3adSHugh Dickins 			}
3603b00dc3adSHugh Dickins 		}
3604626c3920SAl Viro 		if (*this_char) {
3605626c3920SAl Viro 			char *value = strchr(this_char, '=');
3606f3235626SDavid Howells 			size_t len = 0;
3607626c3920SAl Viro 			int err;
3608626c3920SAl Viro 
3609626c3920SAl Viro 			if (value) {
3610626c3920SAl Viro 				*value++ = '\0';
3611f3235626SDavid Howells 				len = strlen(value);
36121da177e4SLinus Torvalds 			}
3613f3235626SDavid Howells 			err = vfs_parse_fs_string(fc, this_char, value, len);
3614f3235626SDavid Howells 			if (err < 0)
3615f3235626SDavid Howells 				return err;
36161da177e4SLinus Torvalds 		}
3617626c3920SAl Viro 	}
36181da177e4SLinus Torvalds 	return 0;
36191da177e4SLinus Torvalds }
36201da177e4SLinus Torvalds 
3621f3235626SDavid Howells /*
3622f3235626SDavid Howells  * Reconfigure a shmem filesystem.
3623f3235626SDavid Howells  *
3624f3235626SDavid Howells  * Note that we disallow change from limited->unlimited blocks/inodes while any
3625f3235626SDavid Howells  * are in use; but we must separately disallow unlimited->limited, because in
3626f3235626SDavid Howells  * that case we have no record of how much is already in use.
3627f3235626SDavid Howells  */
3628f3235626SDavid Howells static int shmem_reconfigure(struct fs_context *fc)
36291da177e4SLinus Torvalds {
3630f3235626SDavid Howells 	struct shmem_options *ctx = fc->fs_private;
3631f3235626SDavid Howells 	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
36320edd73b3SHugh Dickins 	unsigned long inodes;
3633bf11b9a8SSebastian Andrzej Siewior 	struct mempolicy *mpol = NULL;
3634f3235626SDavid Howells 	const char *err;
36350edd73b3SHugh Dickins 
3636bf11b9a8SSebastian Andrzej Siewior 	raw_spin_lock(&sbinfo->stat_lock);
36370edd73b3SHugh Dickins 	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
36380c98c8e1SZhaoLong Wang 
3639f3235626SDavid Howells 	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3640f3235626SDavid Howells 		if (!sbinfo->max_blocks) {
3641f3235626SDavid Howells 			err = "Cannot retroactively limit size";
36420edd73b3SHugh Dickins 			goto out;
36430b5071ddSAl Viro 		}
3644f3235626SDavid Howells 		if (percpu_counter_compare(&sbinfo->used_blocks,
3645f3235626SDavid Howells 					   ctx->blocks) > 0) {
3646f3235626SDavid Howells 			err = "Too small a size for current use";
36470b5071ddSAl Viro 			goto out;
3648f3235626SDavid Howells 		}
3649f3235626SDavid Howells 	}
3650f3235626SDavid Howells 	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3651f3235626SDavid Howells 		if (!sbinfo->max_inodes) {
3652f3235626SDavid Howells 			err = "Cannot retroactively limit inodes";
36530b5071ddSAl Viro 			goto out;
36540b5071ddSAl Viro 		}
3655f3235626SDavid Howells 		if (ctx->inodes < inodes) {
3656f3235626SDavid Howells 			err = "Too few inodes for current use";
3657f3235626SDavid Howells 			goto out;
3658f3235626SDavid Howells 		}
3659f3235626SDavid Howells 	}
36600edd73b3SHugh Dickins 
3661ea3271f7SChris Down 	if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3662ea3271f7SChris Down 	    sbinfo->next_ino > UINT_MAX) {
3663ea3271f7SChris Down 		err = "Current inum too high to switch to 32-bit inums";
3664ea3271f7SChris Down 		goto out;
3665ea3271f7SChris Down 	}
3666ea3271f7SChris Down 
3667f3235626SDavid Howells 	if (ctx->seen & SHMEM_SEEN_HUGE)
3668f3235626SDavid Howells 		sbinfo->huge = ctx->huge;
3669ea3271f7SChris Down 	if (ctx->seen & SHMEM_SEEN_INUMS)
3670ea3271f7SChris Down 		sbinfo->full_inums = ctx->full_inums;
3671f3235626SDavid Howells 	if (ctx->seen & SHMEM_SEEN_BLOCKS)
3672f3235626SDavid Howells 		sbinfo->max_blocks  = ctx->blocks;
3673f3235626SDavid Howells 	if (ctx->seen & SHMEM_SEEN_INODES) {
3674f3235626SDavid Howells 		sbinfo->max_inodes  = ctx->inodes;
3675f3235626SDavid Howells 		sbinfo->free_inodes = ctx->inodes - inodes;
36760b5071ddSAl Viro 	}
367771fe804bSLee Schermerhorn 
36785f00110fSGreg Thelen 	/*
36795f00110fSGreg Thelen 	 * Preserve previous mempolicy unless mpol remount option was specified.
36805f00110fSGreg Thelen 	 */
3681f3235626SDavid Howells 	if (ctx->mpol) {
3682bf11b9a8SSebastian Andrzej Siewior 		mpol = sbinfo->mpol;
3683f3235626SDavid Howells 		sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
3684f3235626SDavid Howells 		ctx->mpol = NULL;
36855f00110fSGreg Thelen 	}
3686bf11b9a8SSebastian Andrzej Siewior 	raw_spin_unlock(&sbinfo->stat_lock);
3687bf11b9a8SSebastian Andrzej Siewior 	mpol_put(mpol);
3688f3235626SDavid Howells 	return 0;
36890edd73b3SHugh Dickins out:
3690bf11b9a8SSebastian Andrzej Siewior 	raw_spin_unlock(&sbinfo->stat_lock);
3691f35aa2bcSAl Viro 	return invalfc(fc, "%s", err);
36921da177e4SLinus Torvalds }
3693680d794bSakpm@linux-foundation.org 
369434c80b1dSAl Viro static int shmem_show_options(struct seq_file *seq, struct dentry *root)
3695680d794bSakpm@linux-foundation.org {
369634c80b1dSAl Viro 	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
3697680d794bSakpm@linux-foundation.org 
3698680d794bSakpm@linux-foundation.org 	if (sbinfo->max_blocks != shmem_default_max_blocks())
3699680d794bSakpm@linux-foundation.org 		seq_printf(seq, ",size=%luk",
370009cbfeafSKirill A. Shutemov 			sbinfo->max_blocks << (PAGE_SHIFT - 10));
3701680d794bSakpm@linux-foundation.org 	if (sbinfo->max_inodes != shmem_default_max_inodes())
3702680d794bSakpm@linux-foundation.org 		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
37030825a6f9SJoe Perches 	if (sbinfo->mode != (0777 | S_ISVTX))
370409208d15SAl Viro 		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
37058751e039SEric W. Biederman 	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
37068751e039SEric W. Biederman 		seq_printf(seq, ",uid=%u",
37078751e039SEric W. Biederman 				from_kuid_munged(&init_user_ns, sbinfo->uid));
37088751e039SEric W. Biederman 	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
37098751e039SEric W. Biederman 		seq_printf(seq, ",gid=%u",
37108751e039SEric W. Biederman 				from_kgid_munged(&init_user_ns, sbinfo->gid));
3711ea3271f7SChris Down 
3712ea3271f7SChris Down 	/*
3713ea3271f7SChris Down 	 * Showing inode{64,32} might be useful even if it's the system default,
3714ea3271f7SChris Down 	 * since then people don't have to resort to checking both here and
3715ea3271f7SChris Down 	 * /proc/config.gz to confirm 64-bit inums were successfully applied
3716ea3271f7SChris Down 	 * (which may not even exist if IKCONFIG_PROC isn't enabled).
3717ea3271f7SChris Down 	 *
3718ea3271f7SChris Down 	 * We hide it when inode64 isn't the default and we are using 32-bit
3719ea3271f7SChris Down 	 * inodes, since that probably just means the feature isn't even under
3720ea3271f7SChris Down 	 * consideration.
3721ea3271f7SChris Down 	 *
3722ea3271f7SChris Down 	 * As such:
3723ea3271f7SChris Down 	 *
3724ea3271f7SChris Down 	 *                     +-----------------+-----------------+
3725ea3271f7SChris Down 	 *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
3726ea3271f7SChris Down 	 *  +------------------+-----------------+-----------------+
3727ea3271f7SChris Down 	 *  | full_inums=true  | show            | show            |
3728ea3271f7SChris Down 	 *  | full_inums=false | show            | hide            |
3729ea3271f7SChris Down 	 *  +------------------+-----------------+-----------------+
3730ea3271f7SChris Down 	 *
3731ea3271f7SChris Down 	 */
3732ea3271f7SChris Down 	if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3733ea3271f7SChris Down 		seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
3734396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
37355a6e75f8SKirill A. Shutemov 	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
37365a6e75f8SKirill A. Shutemov 	if (sbinfo->huge)
37375a6e75f8SKirill A. Shutemov 		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
37385a6e75f8SKirill A. Shutemov #endif
373971fe804bSLee Schermerhorn 	shmem_show_mpol(seq, sbinfo->mpol);
3740680d794bSakpm@linux-foundation.org 	return 0;
3741680d794bSakpm@linux-foundation.org }
37429183df25SDavid Herrmann 
3743680d794bSakpm@linux-foundation.org #endif /* CONFIG_TMPFS */
37441da177e4SLinus Torvalds 
37451da177e4SLinus Torvalds static void shmem_put_super(struct super_block *sb)
37461da177e4SLinus Torvalds {
3747602586a8SHugh Dickins 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3748602586a8SHugh Dickins 
3749e809d5f0SChris Down 	free_percpu(sbinfo->ino_batch);
3750602586a8SHugh Dickins 	percpu_counter_destroy(&sbinfo->used_blocks);
375149cd0a5cSGreg Thelen 	mpol_put(sbinfo->mpol);
3752602586a8SHugh Dickins 	kfree(sbinfo);
37531da177e4SLinus Torvalds 	sb->s_fs_info = NULL;
37541da177e4SLinus Torvalds }
37551da177e4SLinus Torvalds 
3756f3235626SDavid Howells static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
37571da177e4SLinus Torvalds {
3758f3235626SDavid Howells 	struct shmem_options *ctx = fc->fs_private;
37591da177e4SLinus Torvalds 	struct inode *inode;
37600edd73b3SHugh Dickins 	struct shmem_sb_info *sbinfo;
3761680d794bSakpm@linux-foundation.org 
3762680d794bSakpm@linux-foundation.org 	/* Round up to L1_CACHE_BYTES to resist false sharing */
3763425fbf04SPekka Enberg 	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
3764680d794bSakpm@linux-foundation.org 				L1_CACHE_BYTES), GFP_KERNEL);
3765680d794bSakpm@linux-foundation.org 	if (!sbinfo)
3766680d794bSakpm@linux-foundation.org 		return -ENOMEM;
3767680d794bSakpm@linux-foundation.org 
3768680d794bSakpm@linux-foundation.org 	sb->s_fs_info = sbinfo;
37691da177e4SLinus Torvalds 
37700edd73b3SHugh Dickins #ifdef CONFIG_TMPFS
37711da177e4SLinus Torvalds 	/*
37721da177e4SLinus Torvalds 	 * Per default we only allow half of the physical ram per
37731da177e4SLinus Torvalds 	 * tmpfs instance, limiting inodes to one per page of lowmem;
37741da177e4SLinus Torvalds 	 * but the internal instance is left unlimited.
37751da177e4SLinus Torvalds 	 */
37761751e8a6SLinus Torvalds 	if (!(sb->s_flags & SB_KERNMOUNT)) {
3777f3235626SDavid Howells 		if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3778f3235626SDavid Howells 			ctx->blocks = shmem_default_max_blocks();
3779f3235626SDavid Howells 		if (!(ctx->seen & SHMEM_SEEN_INODES))
3780f3235626SDavid Howells 			ctx->inodes = shmem_default_max_inodes();
3781ea3271f7SChris Down 		if (!(ctx->seen & SHMEM_SEEN_INUMS))
3782ea3271f7SChris Down 			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
3783ca4e0519SAl Viro 	} else {
37841751e8a6SLinus Torvalds 		sb->s_flags |= SB_NOUSER;
37851da177e4SLinus Torvalds 	}
378691828a40SDavid M. Grimes 	sb->s_export_op = &shmem_export_ops;
378736f05cabSJeff Layton 	sb->s_flags |= SB_NOSEC | SB_I_VERSION;
37880edd73b3SHugh Dickins #else
37891751e8a6SLinus Torvalds 	sb->s_flags |= SB_NOUSER;
37900edd73b3SHugh Dickins #endif
3791f3235626SDavid Howells 	sbinfo->max_blocks = ctx->blocks;
3792f3235626SDavid Howells 	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3793e809d5f0SChris Down 	if (sb->s_flags & SB_KERNMOUNT) {
3794e809d5f0SChris Down 		sbinfo->ino_batch = alloc_percpu(ino_t);
3795e809d5f0SChris Down 		if (!sbinfo->ino_batch)
3796e809d5f0SChris Down 			goto failed;
3797e809d5f0SChris Down 	}
3798f3235626SDavid Howells 	sbinfo->uid = ctx->uid;
3799f3235626SDavid Howells 	sbinfo->gid = ctx->gid;
3800ea3271f7SChris Down 	sbinfo->full_inums = ctx->full_inums;
3801f3235626SDavid Howells 	sbinfo->mode = ctx->mode;
3802f3235626SDavid Howells 	sbinfo->huge = ctx->huge;
3803f3235626SDavid Howells 	sbinfo->mpol = ctx->mpol;
3804f3235626SDavid Howells 	ctx->mpol = NULL;
38051da177e4SLinus Torvalds 
3806bf11b9a8SSebastian Andrzej Siewior 	raw_spin_lock_init(&sbinfo->stat_lock);
3807908c7f19STejun Heo 	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
3808602586a8SHugh Dickins 		goto failed;
3809779750d2SKirill A. Shutemov 	spin_lock_init(&sbinfo->shrinklist_lock);
3810779750d2SKirill A. Shutemov 	INIT_LIST_HEAD(&sbinfo->shrinklist);
38111da177e4SLinus Torvalds 
3812285b2c4fSHugh Dickins 	sb->s_maxbytes = MAX_LFS_FILESIZE;
381309cbfeafSKirill A. Shutemov 	sb->s_blocksize = PAGE_SIZE;
381409cbfeafSKirill A. Shutemov 	sb->s_blocksize_bits = PAGE_SHIFT;
38151da177e4SLinus Torvalds 	sb->s_magic = TMPFS_MAGIC;
38161da177e4SLinus Torvalds 	sb->s_op = &shmem_ops;
3817cfd95a9cSRobin H. Johnson 	sb->s_time_gran = 1;
3818b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
381939f0247dSAndreas Gruenbacher 	sb->s_xattr = shmem_xattr_handlers;
3820b09e0fa4SEric Paris #endif
3821b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_POSIX_ACL
38221751e8a6SLinus Torvalds 	sb->s_flags |= SB_POSIXACL;
382339f0247dSAndreas Gruenbacher #endif
38242b4db796SAmir Goldstein 	uuid_gen(&sb->s_uuid);
38250edd73b3SHugh Dickins 
3826454abafeSDmitry Monakhov 	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
38271da177e4SLinus Torvalds 	if (!inode)
38281da177e4SLinus Torvalds 		goto failed;
3829680d794bSakpm@linux-foundation.org 	inode->i_uid = sbinfo->uid;
3830680d794bSakpm@linux-foundation.org 	inode->i_gid = sbinfo->gid;
3831318ceed0SAl Viro 	sb->s_root = d_make_root(inode);
3832318ceed0SAl Viro 	if (!sb->s_root)
383348fde701SAl Viro 		goto failed;
38341da177e4SLinus Torvalds 	return 0;
38351da177e4SLinus Torvalds 
38361da177e4SLinus Torvalds failed:
38371da177e4SLinus Torvalds 	shmem_put_super(sb);
3838f2b346e4SMiaohe Lin 	return -ENOMEM;
38391da177e4SLinus Torvalds }
38401da177e4SLinus Torvalds 
3841f3235626SDavid Howells static int shmem_get_tree(struct fs_context *fc)
3842f3235626SDavid Howells {
3843f3235626SDavid Howells 	return get_tree_nodev(fc, shmem_fill_super);
3844f3235626SDavid Howells }
3845f3235626SDavid Howells 
3846f3235626SDavid Howells static void shmem_free_fc(struct fs_context *fc)
3847f3235626SDavid Howells {
3848f3235626SDavid Howells 	struct shmem_options *ctx = fc->fs_private;
3849f3235626SDavid Howells 
3850f3235626SDavid Howells 	if (ctx) {
3851f3235626SDavid Howells 		mpol_put(ctx->mpol);
3852f3235626SDavid Howells 		kfree(ctx);
3853f3235626SDavid Howells 	}
3854f3235626SDavid Howells }
3855f3235626SDavid Howells 
3856f3235626SDavid Howells static const struct fs_context_operations shmem_fs_context_ops = {
3857f3235626SDavid Howells 	.free			= shmem_free_fc,
3858f3235626SDavid Howells 	.get_tree		= shmem_get_tree,
3859f3235626SDavid Howells #ifdef CONFIG_TMPFS
3860f3235626SDavid Howells 	.parse_monolithic	= shmem_parse_options,
3861f3235626SDavid Howells 	.parse_param		= shmem_parse_one,
3862f3235626SDavid Howells 	.reconfigure		= shmem_reconfigure,
3863f3235626SDavid Howells #endif
3864f3235626SDavid Howells };
3865f3235626SDavid Howells 
3866fcc234f8SPekka Enberg static struct kmem_cache *shmem_inode_cachep;
38671da177e4SLinus Torvalds 
38681da177e4SLinus Torvalds static struct inode *shmem_alloc_inode(struct super_block *sb)
38691da177e4SLinus Torvalds {
387041ffe5d5SHugh Dickins 	struct shmem_inode_info *info;
3871fd60b288SMuchun Song 	info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
387241ffe5d5SHugh Dickins 	if (!info)
38731da177e4SLinus Torvalds 		return NULL;
387441ffe5d5SHugh Dickins 	return &info->vfs_inode;
38751da177e4SLinus Torvalds }
38761da177e4SLinus Torvalds 
387774b1da56SAl Viro static void shmem_free_in_core_inode(struct inode *inode)
3878fa0d7e3dSNick Piggin {
387984e710daSAl Viro 	if (S_ISLNK(inode->i_mode))
38803ed47db3SAl Viro 		kfree(inode->i_link);
3881fa0d7e3dSNick Piggin 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
3882fa0d7e3dSNick Piggin }
3883fa0d7e3dSNick Piggin 
38841da177e4SLinus Torvalds static void shmem_destroy_inode(struct inode *inode)
38851da177e4SLinus Torvalds {
388609208d15SAl Viro 	if (S_ISREG(inode->i_mode))
38871da177e4SLinus Torvalds 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
38881da177e4SLinus Torvalds }
38891da177e4SLinus Torvalds 
389041ffe5d5SHugh Dickins static void shmem_init_inode(void *foo)
38911da177e4SLinus Torvalds {
389241ffe5d5SHugh Dickins 	struct shmem_inode_info *info = foo;
389341ffe5d5SHugh Dickins 	inode_init_once(&info->vfs_inode);
38941da177e4SLinus Torvalds }
38951da177e4SLinus Torvalds 
38969a8ec03eSweiping zhang static void shmem_init_inodecache(void)
38971da177e4SLinus Torvalds {
38981da177e4SLinus Torvalds 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
38991da177e4SLinus Torvalds 				sizeof(struct shmem_inode_info),
39005d097056SVladimir Davydov 				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
39011da177e4SLinus Torvalds }
39021da177e4SLinus Torvalds 
390341ffe5d5SHugh Dickins static void shmem_destroy_inodecache(void)
39041da177e4SLinus Torvalds {
39051a1d92c1SAlexey Dobriyan 	kmem_cache_destroy(shmem_inode_cachep);
39061da177e4SLinus Torvalds }
39071da177e4SLinus Torvalds 
3908a7605426SYang Shi /* Keep the page in page cache instead of truncating it */
3909a7605426SYang Shi static int shmem_error_remove_page(struct address_space *mapping,
3910a7605426SYang Shi 				   struct page *page)
3911a7605426SYang Shi {
3912a7605426SYang Shi 	return 0;
3913a7605426SYang Shi }
3914a7605426SYang Shi 
391530e6a51dSHui Su const struct address_space_operations shmem_aops = {
39161da177e4SLinus Torvalds 	.writepage	= shmem_writepage,
391746de8b97SMatthew Wilcox (Oracle) 	.dirty_folio	= noop_dirty_folio,
39181da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
3919800d15a5SNick Piggin 	.write_begin	= shmem_write_begin,
3920800d15a5SNick Piggin 	.write_end	= shmem_write_end,
39211da177e4SLinus Torvalds #endif
39221c93923cSAndrew Morton #ifdef CONFIG_MIGRATION
392354184650SMatthew Wilcox (Oracle) 	.migrate_folio	= migrate_folio,
39241c93923cSAndrew Morton #endif
3925a7605426SYang Shi 	.error_remove_page = shmem_error_remove_page,
39261da177e4SLinus Torvalds };
392730e6a51dSHui Su EXPORT_SYMBOL(shmem_aops);
39281da177e4SLinus Torvalds 
392915ad7cdcSHelge Deller static const struct file_operations shmem_file_operations = {
39301da177e4SLinus Torvalds 	.mmap		= shmem_mmap,
3931a5454f95SThomas Weißschuh 	.open		= generic_file_open,
3932c01d5b30SHugh Dickins 	.get_unmapped_area = shmem_get_unmapped_area,
39331da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
3934220f2ac9SHugh Dickins 	.llseek		= shmem_file_llseek,
39352ba5bbedSAl Viro 	.read_iter	= shmem_file_read_iter,
39368174202bSAl Viro 	.write_iter	= generic_file_write_iter,
39371b061d92SChristoph Hellwig 	.fsync		= noop_fsync,
393882c156f8SAl Viro 	.splice_read	= generic_file_splice_read,
3939f6cb85d0SAl Viro 	.splice_write	= iter_file_splice_write,
394083e4fa9cSHugh Dickins 	.fallocate	= shmem_fallocate,
39411da177e4SLinus Torvalds #endif
39421da177e4SLinus Torvalds };
39431da177e4SLinus Torvalds 
394492e1d5beSArjan van de Ven static const struct inode_operations shmem_inode_operations = {
394544a30220SYu Zhao 	.getattr	= shmem_getattr,
394694c1e62dSHugh Dickins 	.setattr	= shmem_setattr,
3947b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
3948b09e0fa4SEric Paris 	.listxattr	= shmem_listxattr,
3949feda821eSChristoph Hellwig 	.set_acl	= simple_set_acl,
3950e408e695STheodore Ts'o 	.fileattr_get	= shmem_fileattr_get,
3951e408e695STheodore Ts'o 	.fileattr_set	= shmem_fileattr_set,
3952b09e0fa4SEric Paris #endif
39531da177e4SLinus Torvalds };
39541da177e4SLinus Torvalds 
395592e1d5beSArjan van de Ven static const struct inode_operations shmem_dir_inode_operations = {
39561da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
3957f7cd16a5SXavier Roche 	.getattr	= shmem_getattr,
39581da177e4SLinus Torvalds 	.create		= shmem_create,
39591da177e4SLinus Torvalds 	.lookup		= simple_lookup,
39601da177e4SLinus Torvalds 	.link		= shmem_link,
39611da177e4SLinus Torvalds 	.unlink		= shmem_unlink,
39621da177e4SLinus Torvalds 	.symlink	= shmem_symlink,
39631da177e4SLinus Torvalds 	.mkdir		= shmem_mkdir,
39641da177e4SLinus Torvalds 	.rmdir		= shmem_rmdir,
39651da177e4SLinus Torvalds 	.mknod		= shmem_mknod,
39662773bf00SMiklos Szeredi 	.rename		= shmem_rename2,
396760545d0dSAl Viro 	.tmpfile	= shmem_tmpfile,
39681da177e4SLinus Torvalds #endif
3969b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
3970b09e0fa4SEric Paris 	.listxattr	= shmem_listxattr,
3971e408e695STheodore Ts'o 	.fileattr_get	= shmem_fileattr_get,
3972e408e695STheodore Ts'o 	.fileattr_set	= shmem_fileattr_set,
3973b09e0fa4SEric Paris #endif
397439f0247dSAndreas Gruenbacher #ifdef CONFIG_TMPFS_POSIX_ACL
397594c1e62dSHugh Dickins 	.setattr	= shmem_setattr,
3976feda821eSChristoph Hellwig 	.set_acl	= simple_set_acl,
397739f0247dSAndreas Gruenbacher #endif
397839f0247dSAndreas Gruenbacher };
397939f0247dSAndreas Gruenbacher 
398092e1d5beSArjan van de Ven static const struct inode_operations shmem_special_inode_operations = {
3981f7cd16a5SXavier Roche 	.getattr	= shmem_getattr,
3982b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
3983b09e0fa4SEric Paris 	.listxattr	= shmem_listxattr,
3984b09e0fa4SEric Paris #endif
398539f0247dSAndreas Gruenbacher #ifdef CONFIG_TMPFS_POSIX_ACL
398694c1e62dSHugh Dickins 	.setattr	= shmem_setattr,
3987feda821eSChristoph Hellwig 	.set_acl	= simple_set_acl,
398839f0247dSAndreas Gruenbacher #endif
39891da177e4SLinus Torvalds };
39901da177e4SLinus Torvalds 
3991759b9775SHugh Dickins static const struct super_operations shmem_ops = {
39921da177e4SLinus Torvalds 	.alloc_inode	= shmem_alloc_inode,
399374b1da56SAl Viro 	.free_inode	= shmem_free_in_core_inode,
39941da177e4SLinus Torvalds 	.destroy_inode	= shmem_destroy_inode,
39951da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
39961da177e4SLinus Torvalds 	.statfs		= shmem_statfs,
3997680d794bSakpm@linux-foundation.org 	.show_options	= shmem_show_options,
39981da177e4SLinus Torvalds #endif
39991f895f75SAl Viro 	.evict_inode	= shmem_evict_inode,
40001da177e4SLinus Torvalds 	.drop_inode	= generic_delete_inode,
40011da177e4SLinus Torvalds 	.put_super	= shmem_put_super,
4002396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4003779750d2SKirill A. Shutemov 	.nr_cached_objects	= shmem_unused_huge_count,
4004779750d2SKirill A. Shutemov 	.free_cached_objects	= shmem_unused_huge_scan,
4005779750d2SKirill A. Shutemov #endif
40061da177e4SLinus Torvalds };
40071da177e4SLinus Torvalds 
4008f0f37e2fSAlexey Dobriyan static const struct vm_operations_struct shmem_vm_ops = {
400954cb8821SNick Piggin 	.fault		= shmem_fault,
4010d7c17551SNing Qu 	.map_pages	= filemap_map_pages,
40111da177e4SLinus Torvalds #ifdef CONFIG_NUMA
40121da177e4SLinus Torvalds 	.set_policy     = shmem_set_policy,
40131da177e4SLinus Torvalds 	.get_policy     = shmem_get_policy,
40141da177e4SLinus Torvalds #endif
40151da177e4SLinus Torvalds };
40161da177e4SLinus Torvalds 
4017d09e8ca6SPasha Tatashin static const struct vm_operations_struct shmem_anon_vm_ops = {
4018d09e8ca6SPasha Tatashin 	.fault		= shmem_fault,
4019d09e8ca6SPasha Tatashin 	.map_pages	= filemap_map_pages,
4020d09e8ca6SPasha Tatashin #ifdef CONFIG_NUMA
4021d09e8ca6SPasha Tatashin 	.set_policy     = shmem_set_policy,
4022d09e8ca6SPasha Tatashin 	.get_policy     = shmem_get_policy,
4023d09e8ca6SPasha Tatashin #endif
4024d09e8ca6SPasha Tatashin };
4025d09e8ca6SPasha Tatashin 
4026f3235626SDavid Howells int shmem_init_fs_context(struct fs_context *fc)
40271da177e4SLinus Torvalds {
4028f3235626SDavid Howells 	struct shmem_options *ctx;
4029f3235626SDavid Howells 
4030f3235626SDavid Howells 	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
4031f3235626SDavid Howells 	if (!ctx)
4032f3235626SDavid Howells 		return -ENOMEM;
4033f3235626SDavid Howells 
4034f3235626SDavid Howells 	ctx->mode = 0777 | S_ISVTX;
4035f3235626SDavid Howells 	ctx->uid = current_fsuid();
4036f3235626SDavid Howells 	ctx->gid = current_fsgid();
4037f3235626SDavid Howells 
4038f3235626SDavid Howells 	fc->fs_private = ctx;
4039f3235626SDavid Howells 	fc->ops = &shmem_fs_context_ops;
4040f3235626SDavid Howells 	return 0;
40411da177e4SLinus Torvalds }
40421da177e4SLinus Torvalds 
404341ffe5d5SHugh Dickins static struct file_system_type shmem_fs_type = {
40441da177e4SLinus Torvalds 	.owner		= THIS_MODULE,
40451da177e4SLinus Torvalds 	.name		= "tmpfs",
4046f3235626SDavid Howells 	.init_fs_context = shmem_init_fs_context,
4047f3235626SDavid Howells #ifdef CONFIG_TMPFS
4048d7167b14SAl Viro 	.parameters	= shmem_fs_parameters,
4049f3235626SDavid Howells #endif
40501da177e4SLinus Torvalds 	.kill_sb	= kill_litter_super,
4051ff36da69SMatthew Wilcox (Oracle) 	.fs_flags	= FS_USERNS_MOUNT,
40521da177e4SLinus Torvalds };
40531da177e4SLinus Torvalds 
40549096bbe9SMiaohe Lin void __init shmem_init(void)
40551da177e4SLinus Torvalds {
40561da177e4SLinus Torvalds 	int error;
40571da177e4SLinus Torvalds 
40589a8ec03eSweiping zhang 	shmem_init_inodecache();
40591da177e4SLinus Torvalds 
406041ffe5d5SHugh Dickins 	error = register_filesystem(&shmem_fs_type);
40611da177e4SLinus Torvalds 	if (error) {
40621170532bSJoe Perches 		pr_err("Could not register tmpfs\n");
40631da177e4SLinus Torvalds 		goto out2;
40641da177e4SLinus Torvalds 	}
406595dc112aSGreg Kroah-Hartman 
4066ca4e0519SAl Viro 	shm_mnt = kern_mount(&shmem_fs_type);
40671da177e4SLinus Torvalds 	if (IS_ERR(shm_mnt)) {
40681da177e4SLinus Torvalds 		error = PTR_ERR(shm_mnt);
40691170532bSJoe Perches 		pr_err("Could not kern_mount tmpfs\n");
40701da177e4SLinus Torvalds 		goto out1;
40711da177e4SLinus Torvalds 	}
40725a6e75f8SKirill A. Shutemov 
4073396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4074435c0b87SKirill A. Shutemov 	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
40755a6e75f8SKirill A. Shutemov 		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
40765a6e75f8SKirill A. Shutemov 	else
40775e6e5a12SHugh Dickins 		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
40785a6e75f8SKirill A. Shutemov #endif
40799096bbe9SMiaohe Lin 	return;
40801da177e4SLinus Torvalds 
40811da177e4SLinus Torvalds out1:
408241ffe5d5SHugh Dickins 	unregister_filesystem(&shmem_fs_type);
40831da177e4SLinus Torvalds out2:
408441ffe5d5SHugh Dickins 	shmem_destroy_inodecache();
40851da177e4SLinus Torvalds 	shm_mnt = ERR_PTR(error);
40861da177e4SLinus Torvalds }
4087853ac43aSMatt Mackall 
4088396bcc52SMatthew Wilcox (Oracle) #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
40895a6e75f8SKirill A. Shutemov static ssize_t shmem_enabled_show(struct kobject *kobj,
40905a6e75f8SKirill A. Shutemov 				  struct kobj_attribute *attr, char *buf)
40915a6e75f8SKirill A. Shutemov {
409226083eb6SColin Ian King 	static const int values[] = {
40935a6e75f8SKirill A. Shutemov 		SHMEM_HUGE_ALWAYS,
40945a6e75f8SKirill A. Shutemov 		SHMEM_HUGE_WITHIN_SIZE,
40955a6e75f8SKirill A. Shutemov 		SHMEM_HUGE_ADVISE,
40965a6e75f8SKirill A. Shutemov 		SHMEM_HUGE_NEVER,
40975a6e75f8SKirill A. Shutemov 		SHMEM_HUGE_DENY,
40985a6e75f8SKirill A. Shutemov 		SHMEM_HUGE_FORCE,
40995a6e75f8SKirill A. Shutemov 	};
410079d4d38aSJoe Perches 	int len = 0;
410179d4d38aSJoe Perches 	int i;
41025a6e75f8SKirill A. Shutemov 
410379d4d38aSJoe Perches 	for (i = 0; i < ARRAY_SIZE(values); i++) {
410479d4d38aSJoe Perches 		len += sysfs_emit_at(buf, len,
410579d4d38aSJoe Perches 				     shmem_huge == values[i] ? "%s[%s]" : "%s%s",
410679d4d38aSJoe Perches 				     i ? " " : "",
41075a6e75f8SKirill A. Shutemov 				     shmem_format_huge(values[i]));
41085a6e75f8SKirill A. Shutemov 	}
410979d4d38aSJoe Perches 
411079d4d38aSJoe Perches 	len += sysfs_emit_at(buf, len, "\n");
411179d4d38aSJoe Perches 
411279d4d38aSJoe Perches 	return len;
41135a6e75f8SKirill A. Shutemov }
41145a6e75f8SKirill A. Shutemov 
41155a6e75f8SKirill A. Shutemov static ssize_t shmem_enabled_store(struct kobject *kobj,
41165a6e75f8SKirill A. Shutemov 		struct kobj_attribute *attr, const char *buf, size_t count)
41175a6e75f8SKirill A. Shutemov {
41185a6e75f8SKirill A. Shutemov 	char tmp[16];
41195a6e75f8SKirill A. Shutemov 	int huge;
41205a6e75f8SKirill A. Shutemov 
41215a6e75f8SKirill A. Shutemov 	if (count + 1 > sizeof(tmp))
41225a6e75f8SKirill A. Shutemov 		return -EINVAL;
41235a6e75f8SKirill A. Shutemov 	memcpy(tmp, buf, count);
41245a6e75f8SKirill A. Shutemov 	tmp[count] = '\0';
41255a6e75f8SKirill A. Shutemov 	if (count && tmp[count - 1] == '\n')
41265a6e75f8SKirill A. Shutemov 		tmp[count - 1] = '\0';
41275a6e75f8SKirill A. Shutemov 
41285a6e75f8SKirill A. Shutemov 	huge = shmem_parse_huge(tmp);
41295a6e75f8SKirill A. Shutemov 	if (huge == -EINVAL)
41305a6e75f8SKirill A. Shutemov 		return -EINVAL;
41315a6e75f8SKirill A. Shutemov 	if (!has_transparent_hugepage() &&
41325a6e75f8SKirill A. Shutemov 			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
41335a6e75f8SKirill A. Shutemov 		return -EINVAL;
41345a6e75f8SKirill A. Shutemov 
41355a6e75f8SKirill A. Shutemov 	shmem_huge = huge;
4136435c0b87SKirill A. Shutemov 	if (shmem_huge > SHMEM_HUGE_DENY)
41375a6e75f8SKirill A. Shutemov 		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
41385a6e75f8SKirill A. Shutemov 	return count;
41395a6e75f8SKirill A. Shutemov }
41405a6e75f8SKirill A. Shutemov 
41414bfa8adaSMiaohe Lin struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
4142396bcc52SMatthew Wilcox (Oracle) #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
4143f3f0e1d2SKirill A. Shutemov 
4144853ac43aSMatt Mackall #else /* !CONFIG_SHMEM */
4145853ac43aSMatt Mackall 
4146853ac43aSMatt Mackall /*
4147853ac43aSMatt Mackall  * tiny-shmem: simple shmemfs and tmpfs using ramfs code
4148853ac43aSMatt Mackall  *
4149853ac43aSMatt Mackall  * This is intended for small system where the benefits of the full
4150853ac43aSMatt Mackall  * shmem code (swap-backed and resource-limited) are outweighed by
4151853ac43aSMatt Mackall  * their complexity. On systems without swap this code should be
4152853ac43aSMatt Mackall  * effectively equivalent, but much lighter weight.
4153853ac43aSMatt Mackall  */
4154853ac43aSMatt Mackall 
415541ffe5d5SHugh Dickins static struct file_system_type shmem_fs_type = {
4156853ac43aSMatt Mackall 	.name		= "tmpfs",
4157f3235626SDavid Howells 	.init_fs_context = ramfs_init_fs_context,
4158d7167b14SAl Viro 	.parameters	= ramfs_fs_parameters,
4159853ac43aSMatt Mackall 	.kill_sb	= kill_litter_super,
41602b8576cbSEric W. Biederman 	.fs_flags	= FS_USERNS_MOUNT,
4161853ac43aSMatt Mackall };
4162853ac43aSMatt Mackall 
41639096bbe9SMiaohe Lin void __init shmem_init(void)
4164853ac43aSMatt Mackall {
416541ffe5d5SHugh Dickins 	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
4166853ac43aSMatt Mackall 
416741ffe5d5SHugh Dickins 	shm_mnt = kern_mount(&shmem_fs_type);
4168853ac43aSMatt Mackall 	BUG_ON(IS_ERR(shm_mnt));
4169853ac43aSMatt Mackall }
4170853ac43aSMatt Mackall 
417110a9c496SChristoph Hellwig int shmem_unuse(unsigned int type)
4172853ac43aSMatt Mackall {
4173853ac43aSMatt Mackall 	return 0;
4174853ac43aSMatt Mackall }
4175853ac43aSMatt Mackall 
4176d7c9e99aSAlexey Gladkov int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
41773f96b79aSHugh Dickins {
41783f96b79aSHugh Dickins 	return 0;
41793f96b79aSHugh Dickins }
41803f96b79aSHugh Dickins 
418124513264SHugh Dickins void shmem_unlock_mapping(struct address_space *mapping)
418224513264SHugh Dickins {
418324513264SHugh Dickins }
418424513264SHugh Dickins 
4185c01d5b30SHugh Dickins #ifdef CONFIG_MMU
4186c01d5b30SHugh Dickins unsigned long shmem_get_unmapped_area(struct file *file,
4187c01d5b30SHugh Dickins 				      unsigned long addr, unsigned long len,
4188c01d5b30SHugh Dickins 				      unsigned long pgoff, unsigned long flags)
4189c01d5b30SHugh Dickins {
4190c01d5b30SHugh Dickins 	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
4191c01d5b30SHugh Dickins }
4192c01d5b30SHugh Dickins #endif
4193c01d5b30SHugh Dickins 
419441ffe5d5SHugh Dickins void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
419594c1e62dSHugh Dickins {
419641ffe5d5SHugh Dickins 	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
419794c1e62dSHugh Dickins }
419894c1e62dSHugh Dickins EXPORT_SYMBOL_GPL(shmem_truncate_range);
419994c1e62dSHugh Dickins 
4200853ac43aSMatt Mackall #define shmem_vm_ops				generic_file_vm_ops
4201d09e8ca6SPasha Tatashin #define shmem_anon_vm_ops			generic_file_vm_ops
42020b0a0806SHugh Dickins #define shmem_file_operations			ramfs_file_operations
4203454abafeSDmitry Monakhov #define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
42040b0a0806SHugh Dickins #define shmem_acct_size(flags, size)		0
42050b0a0806SHugh Dickins #define shmem_unacct_size(flags, size)		do {} while (0)
4206853ac43aSMatt Mackall 
4207853ac43aSMatt Mackall #endif /* CONFIG_SHMEM */
4208853ac43aSMatt Mackall 
4209853ac43aSMatt Mackall /* common code */
42101da177e4SLinus Torvalds 
4211703321b6SMatthew Auld static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
4212c7277090SEric Paris 				       unsigned long flags, unsigned int i_flags)
42131da177e4SLinus Torvalds {
42141da177e4SLinus Torvalds 	struct inode *inode;
421593dec2daSAl Viro 	struct file *res;
42161da177e4SLinus Torvalds 
4217703321b6SMatthew Auld 	if (IS_ERR(mnt))
4218703321b6SMatthew Auld 		return ERR_CAST(mnt);
42191da177e4SLinus Torvalds 
4220285b2c4fSHugh Dickins 	if (size < 0 || size > MAX_LFS_FILESIZE)
42211da177e4SLinus Torvalds 		return ERR_PTR(-EINVAL);
42221da177e4SLinus Torvalds 
42231da177e4SLinus Torvalds 	if (shmem_acct_size(flags, size))
42241da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
42251da177e4SLinus Torvalds 
422693dec2daSAl Viro 	inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
422793dec2daSAl Viro 				flags);
4228dac2d1f6SAl Viro 	if (unlikely(!inode)) {
4229dac2d1f6SAl Viro 		shmem_unacct_size(flags, size);
4230dac2d1f6SAl Viro 		return ERR_PTR(-ENOSPC);
4231dac2d1f6SAl Viro 	}
4232c7277090SEric Paris 	inode->i_flags |= i_flags;
42331da177e4SLinus Torvalds 	inode->i_size = size;
42346d6b77f1SMiklos Szeredi 	clear_nlink(inode);	/* It is unlinked */
423526567cdbSAl Viro 	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
423693dec2daSAl Viro 	if (!IS_ERR(res))
423793dec2daSAl Viro 		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
42384b42af81SAl Viro 				&shmem_file_operations);
42396b4d0b27SAl Viro 	if (IS_ERR(res))
424093dec2daSAl Viro 		iput(inode);
42416b4d0b27SAl Viro 	return res;
42421da177e4SLinus Torvalds }
4243c7277090SEric Paris 
4244c7277090SEric Paris /**
4245c7277090SEric Paris  * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
4246c7277090SEric Paris  * 	kernel internal.  There will be NO LSM permission checks against the
4247c7277090SEric Paris  * 	underlying inode.  So users of this interface must do LSM checks at a
4248e1832f29SStephen Smalley  *	higher layer.  The users are the big_key and shm implementations.  LSM
4249e1832f29SStephen Smalley  *	checks are provided at the key or shm level rather than the inode.
4250c7277090SEric Paris  * @name: name for dentry (to be seen in /proc/<pid>/maps
4251c7277090SEric Paris  * @size: size to be set for the file
4252c7277090SEric Paris  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4253c7277090SEric Paris  */
4254c7277090SEric Paris struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
4255c7277090SEric Paris {
4256703321b6SMatthew Auld 	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
4257c7277090SEric Paris }
4258c7277090SEric Paris 
4259c7277090SEric Paris /**
4260c7277090SEric Paris  * shmem_file_setup - get an unlinked file living in tmpfs
4261c7277090SEric Paris  * @name: name for dentry (to be seen in /proc/<pid>/maps
4262c7277090SEric Paris  * @size: size to be set for the file
4263c7277090SEric Paris  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4264c7277090SEric Paris  */
4265c7277090SEric Paris struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
4266c7277090SEric Paris {
4267703321b6SMatthew Auld 	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
4268c7277090SEric Paris }
4269395e0ddcSKeith Packard EXPORT_SYMBOL_GPL(shmem_file_setup);
42701da177e4SLinus Torvalds 
427146711810SRandy Dunlap /**
4272703321b6SMatthew Auld  * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
4273703321b6SMatthew Auld  * @mnt: the tmpfs mount where the file will be created
4274703321b6SMatthew Auld  * @name: name for dentry (to be seen in /proc/<pid>/maps
4275703321b6SMatthew Auld  * @size: size to be set for the file
4276703321b6SMatthew Auld  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4277703321b6SMatthew Auld  */
4278703321b6SMatthew Auld struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
4279703321b6SMatthew Auld 				       loff_t size, unsigned long flags)
4280703321b6SMatthew Auld {
4281703321b6SMatthew Auld 	return __shmem_file_setup(mnt, name, size, flags, 0);
4282703321b6SMatthew Auld }
4283703321b6SMatthew Auld EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4284703321b6SMatthew Auld 
4285703321b6SMatthew Auld /**
42861da177e4SLinus Torvalds  * shmem_zero_setup - setup a shared anonymous mapping
428745e55300SPeter Collingbourne  * @vma: the vma to be mmapped is prepared by do_mmap
42881da177e4SLinus Torvalds  */
42891da177e4SLinus Torvalds int shmem_zero_setup(struct vm_area_struct *vma)
42901da177e4SLinus Torvalds {
42911da177e4SLinus Torvalds 	struct file *file;
42921da177e4SLinus Torvalds 	loff_t size = vma->vm_end - vma->vm_start;
42931da177e4SLinus Torvalds 
429466fc1303SHugh Dickins 	/*
4295c1e8d7c6SMichel Lespinasse 	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
429666fc1303SHugh Dickins 	 * between XFS directory reading and selinux: since this file is only
429766fc1303SHugh Dickins 	 * accessible to the user through its mapping, use S_PRIVATE flag to
429866fc1303SHugh Dickins 	 * bypass file security, in the same way as shmem_kernel_file_setup().
429966fc1303SHugh Dickins 	 */
4300703321b6SMatthew Auld 	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
43011da177e4SLinus Torvalds 	if (IS_ERR(file))
43021da177e4SLinus Torvalds 		return PTR_ERR(file);
43031da177e4SLinus Torvalds 
43041da177e4SLinus Torvalds 	if (vma->vm_file)
43051da177e4SLinus Torvalds 		fput(vma->vm_file);
43061da177e4SLinus Torvalds 	vma->vm_file = file;
4307d09e8ca6SPasha Tatashin 	vma->vm_ops = &shmem_anon_vm_ops;
4308f3f0e1d2SKirill A. Shutemov 
43091da177e4SLinus Torvalds 	return 0;
43101da177e4SLinus Torvalds }
4311d9d90e5eSHugh Dickins 
4312d9d90e5eSHugh Dickins /**
4313d9d90e5eSHugh Dickins  * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
4314d9d90e5eSHugh Dickins  * @mapping:	the page's address_space
4315d9d90e5eSHugh Dickins  * @index:	the page index
4316d9d90e5eSHugh Dickins  * @gfp:	the page allocator flags to use if allocating
4317d9d90e5eSHugh Dickins  *
4318d9d90e5eSHugh Dickins  * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4319d9d90e5eSHugh Dickins  * with any new page allocations done using the specified allocation flags.
43207e0a1265SMatthew Wilcox (Oracle)  * But read_cache_page_gfp() uses the ->read_folio() method: which does not
4321d9d90e5eSHugh Dickins  * suit tmpfs, since it may have pages in swapcache, and needs to find those
4322d9d90e5eSHugh Dickins  * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
4323d9d90e5eSHugh Dickins  *
432468da9f05SHugh Dickins  * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
432568da9f05SHugh Dickins  * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4326d9d90e5eSHugh Dickins  */
4327d9d90e5eSHugh Dickins struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4328d9d90e5eSHugh Dickins 					 pgoff_t index, gfp_t gfp)
4329d9d90e5eSHugh Dickins {
433068da9f05SHugh Dickins #ifdef CONFIG_SHMEM
433168da9f05SHugh Dickins 	struct inode *inode = mapping->host;
4332a3a9c397SMatthew Wilcox (Oracle) 	struct folio *folio;
43339276aad6SHugh Dickins 	struct page *page;
433468da9f05SHugh Dickins 	int error;
433568da9f05SHugh Dickins 
433630e6a51dSHui Su 	BUG_ON(!shmem_mapping(mapping));
4337a3a9c397SMatthew Wilcox (Oracle) 	error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
4338cfda0526SMike Rapoport 				  gfp, NULL, NULL, NULL);
433968da9f05SHugh Dickins 	if (error)
4340a7605426SYang Shi 		return ERR_PTR(error);
4341a7605426SYang Shi 
4342a3a9c397SMatthew Wilcox (Oracle) 	folio_unlock(folio);
4343a3a9c397SMatthew Wilcox (Oracle) 	page = folio_file_page(folio, index);
4344a7605426SYang Shi 	if (PageHWPoison(page)) {
4345a3a9c397SMatthew Wilcox (Oracle) 		folio_put(folio);
4346a7605426SYang Shi 		return ERR_PTR(-EIO);
4347a7605426SYang Shi 	}
4348a7605426SYang Shi 
434968da9f05SHugh Dickins 	return page;
435068da9f05SHugh Dickins #else
435168da9f05SHugh Dickins 	/*
435268da9f05SHugh Dickins 	 * The tiny !SHMEM case uses ramfs without swap
435368da9f05SHugh Dickins 	 */
4354d9d90e5eSHugh Dickins 	return read_cache_page_gfp(mapping, index, gfp);
435568da9f05SHugh Dickins #endif
4356d9d90e5eSHugh Dickins }
4357d9d90e5eSHugh Dickins EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
4358