/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
#include "swap.h"

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>

#include <linux/uaccess.h>

#include "internal.h"
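
/*
 * BLOCKS_PER_PAGE is the number of 512-byte blocks in one page (the unit
 * st_blocks is expressed in); VM_ACCT(size) rounds a byte count up to whole
 * pages, the unit used by the security_vm_enough_memory_mm() accounting
 * helpers below.
 */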
#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	struct mempolicy *mpol;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;
	int seen;
	bool noswap;
	unsigned short quota_types;
	struct shmem_quota_limits qlimits;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
#define SHMEM_SEEN_QUOTA 32
};

#ifdef CONFIG_TMPFS
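/*
 * Default limits when no size=/nr_blocks=/nr_inodes= mount option is given:
 * cap the mount at half of physical RAM, and allow at most one inode per
 * lowmem page but never more than half of all pages.
 */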
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages() / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	unsigned long nr_pages = totalram_pages();

	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
}
#endif

static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			     struct folio **foliop, enum sgp_type sgp,
			     gfp_t gfp, struct vm_area_struct *vma,
			     vm_fault_t *fault_type);

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static inline int shmem_inode_acct_block(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int err = -ENOSPC;

	if (shmem_acct_block(info->flags, pages))
		return err;

	if (sbinfo->max_blocks) {
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   sbinfo->max_blocks - pages) > 0)
			goto unacct;

		err = dquot_alloc_block_nodirty(inode, pages);
		if (err)
			goto unacct;

		percpu_counter_add(&sbinfo->used_blocks, pages);
	} else {
		err = dquot_alloc_block_nodirty(inode, pages);
		if (err)
			goto unacct;
	}

	return 0;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return err;
}

static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	dquot_free_block_nodirty(inode, pages);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}

static const struct super_operations shmem_ops;
const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;

bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_anon_vm_ops;
}

bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

#ifdef CONFIG_TMPFS_QUOTA

static int shmem_enable_quotas(struct super_block *sb,
			       unsigned short quota_types)
{
	int type, err = 0;

	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
	for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
		if (!(quota_types & (1 << type)))
			continue;
		err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
					  DQUOT_USAGE_ENABLED |
					  DQUOT_LIMITS_ENABLED);
		if (err)
			goto out_err;
	}
	return 0;

out_err:
	pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
		type, err);
	for (type--; type >= 0; type--)
		dquot_quota_off(sb, type);
	return err;
}

static void shmem_disable_quotas(struct super_block *sb)
{
	int type;

	for (type = 0; type < SHMEM_MAXQUOTAS; type++)
		dquot_quota_off(sb, type);
}

static struct dquot **shmem_get_dquots(struct inode *inode)
{
	return SHMEM_I(inode)->i_dquot;
}
#endif /* CONFIG_TMPFS_QUOTA */

/*
 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 * produces a novel ino for the newly allocated inode.
 *
 * It may also be called when making a hard link to permit the space needed by
 * each dentry. However, in that case, no new inode number is needed since that
 * internally draws from another pool of inode numbers (currently global
 * get_next_ino()). This case is indicated by passing NULL as inop.
 */
#define SHMEM_INO_BATCH 1024
static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	ino_t ino;

	if (!(sb->s_flags & SB_KERNMOUNT)) {
		raw_spin_lock(&sbinfo->stat_lock);
		if (sbinfo->max_inodes) {
			if (!sbinfo->free_inodes) {
				raw_spin_unlock(&sbinfo->stat_lock);
				return -ENOSPC;
			}
			sbinfo->free_inodes--;
		}
		if (inop) {
			ino = sbinfo->next_ino++;
			if (unlikely(is_zero_ino(ino)))
				ino = sbinfo->next_ino++;
			if (unlikely(!sbinfo->full_inums &&
				     ino > UINT_MAX)) {
				/*
				 * Emulate get_next_ino uint wraparound for
				 * compatibility
				 */
				if (IS_ENABLED(CONFIG_64BIT))
					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
						__func__, MINOR(sb->s_dev));
				sbinfo->next_ino = 1;
				ino = sbinfo->next_ino++;
			}
			*inop = ino;
		}
		raw_spin_unlock(&sbinfo->stat_lock);
	} else if (inop) {
		/*
		 * __shmem_file_setup, one of our callers, is lock-free: it
		 * doesn't hold stat_lock in shmem_reserve_inode since
		 * max_inodes is always 0, and is called from potentially
		 * unknown contexts. As such, use a per-cpu batched allocator
		 * which doesn't require the per-sb stat_lock unless we are at
		 * the batch boundary.
		 *
		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
		 * shmem mounts are not exposed to userspace, so we don't need
		 * to worry about things like glibc compatibility.
		 */
		ino_t *next_ino;

		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
		ino = *next_ino;
		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
			raw_spin_lock(&sbinfo->stat_lock);
			ino = sbinfo->next_ino;
			sbinfo->next_ino += SHMEM_INO_BATCH;
			raw_spin_unlock(&sbinfo->stat_lock);
			if (unlikely(is_zero_ino(ino)))
				ino++;
		}
		*inop = ino;
		*next_ino = ++ino;
		put_cpu();
	}

	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		raw_spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		raw_spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		shmem_inode_unacct_blocks(inode, freed);
	}
}

bool shmem_charge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;

	if (shmem_inode_acct_block(inode, pages))
		return false;

	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
	xa_lock_irq(&mapping->i_pages);
	mapping->nrpages += pages;
	xa_unlock_irq(&mapping->i_pages);

	spin_lock_irq(&info->lock);
	info->alloced += pages;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);

	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);

	/* nrpages adjustment done by __filemap_remove_folio() or caller */

	spin_lock_irq(&info->lock);
	shmem_recalc_inode(inode);
	/* which has called shmem_inode_unacct_blocks() if necessary */
	spin_unlock_irq(&info->lock);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	xas_store(&xas, replacement);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)
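
/*
 * For example, a mount which should use huge pages only when a file is
 * big enough to fill them would be set up with
 *	mount -t tmpfs -o huge=within_size tmpfs /mnt
 * whereas writing "deny" or "force" to
 * /sys/kernel/mm/transparent_hugepage/shmem_enabled overrides the per-mount
 * setting on all mounts at once.
 */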

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;

bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
		   struct mm_struct *mm, unsigned long vm_flags)
{
	loff_t i_size;

	if (!S_ISREG(inode->i_mode))
		return false;
	if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
		return false;
	if (shmem_huge == SHMEM_HUGE_DENY)
		return false;
	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
		return true;

	switch (SHMEM_SB(inode->i_sb)->huge) {
	case SHMEM_HUGE_ALWAYS:
		return true;
	case SHMEM_HUGE_WITHIN_SIZE:
		index = round_up(index + 1, HPAGE_PMD_NR);
		i_size = round_up(i_size_read(inode), PAGE_SIZE);
		if (i_size >> PAGE_SHIFT >= index)
			return true;
		fallthrough;
	case SHMEM_HUGE_ADVISE:
		if (mm && (vm_flags & VM_HUGEPAGE))
			return true;
		fallthrough;
	default:
		return false;
	}
}

#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}
#endif

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif
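
/*
 * Split huge folios which extend beyond i_size on the inodes queued on
 * sbinfo->shrinklist, so that the memory past EOF can be reclaimed;
 * shmem_unused_huge_scan() and shmem_unused_huge_count() below drive this
 * as the per-superblock shrinker callbacks.
 */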
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct folio *folio;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		sbinfo->shrinklist_len--;
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;
		pgoff_t index;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_split && split >= nr_to_split)
			goto move_back;

		index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
		folio = filemap_get_folio(inode->i_mapping, index);
		if (IS_ERR(folio))
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!folio_test_large(folio)) {
			folio_put(folio);
			goto drop;
		}

		/*
		 * Move the inode on the list back to shrinklist if we failed
		 * to lock the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!folio_trylock(folio)) {
			folio_put(folio);
			goto move_back;
		}

		ret = split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);

		/* If split failed move the inode on the list back to shrinklist */
		if (ret)
			goto move_back;

		split++;
drop:
		list_del_init(&info->shrinklist);
		goto put;
move_back:
		/*
		 * Make sure the inode is either on the global list or deleted
		 * from any local list before iput() since it could be deleted
		 * in another thread once we put the inode (then the local list
		 * is corrupted).
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		list_move(&info->shrinklist, &sbinfo->shrinklist);
		sbinfo->shrinklist_len++;
		spin_unlock(&sbinfo->shrinklist_lock);
put:
		iput(inode);
	}

	return split;
}

static long shmem_unused_huge_scan(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (!READ_ONCE(sbinfo->shrinklist_len))
		return SHRINK_STOP;

	return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

static long shmem_unused_huge_count(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */

#define shmem_huge SHMEM_HUGE_DENY

bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
		   struct mm_struct *mm, unsigned long vm_flags)
{
	return false;
}

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Like filemap_add_folio, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct folio *folio,
				   struct address_space *mapping,
				   pgoff_t index, void *expected, gfp_t gfp,
				   struct mm_struct *charge_mm)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	long nr = folio_nr_pages(folio);
	int error;

	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
	VM_BUG_ON(expected && folio_test_large(folio));

	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = index;

	if (!folio_test_swapcache(folio)) {
		error = mem_cgroup_charge(folio, charge_mm, gfp);
		if (error) {
			if (folio_test_pmd_mappable(folio)) {
				count_vm_event(THP_FILE_FALLBACK);
				count_vm_event(THP_FILE_FALLBACK_CHARGE);
			}
			goto error;
		}
	}
	folio_throttle_swaprate(folio, gfp);

	do {
		xas_lock_irq(&xas);
		if (expected != xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		if (expected && xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;
		if (folio_test_pmd_mappable(folio)) {
			count_vm_event(THP_FILE_ALLOC);
			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
		}
		mapping->nrpages += nr;
		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		error = xas_error(&xas);
		goto error;
	}

	return 0;
error:
	folio->mapping = NULL;
	folio_ref_sub(folio, nr);
	return error;
}

/*
 * Like delete_from_page_cache, but substitutes swap for @folio.
 */
static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
	struct address_space *mapping = folio->mapping;
	long nr = folio_nr_pages(folio);
	int error;

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
	folio->mapping = NULL;
	mapping->nrpages -= nr;
	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
	xa_unlock_irq(&mapping->i_pages);
	folio_put(folio);
	BUG_ON(error);
}

/*
 * Remove swap entry from page cache, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned long swapped = 0;

	rcu_read_lock();
	xas_for_each(&xas, page, end - 1) {
		if (xas_retry(&xas, page))
			continue;
		if (xa_is_value(page))
			swapped++;

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}

	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma are swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	unsigned long swapped;

	/* Be careful as we don't hold info->lock */
	swapped = READ_ONCE(info->swapped);

	/*
	 * The easier cases are when the shmem object has nothing in swap, or
	 * the vma maps it whole. Then we can simply use the stats that we
	 * already track.
	 */
	if (!swapped)
		return 0;

	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
		return swapped << PAGE_SHIFT;

	/* Here comes the more involved part */
	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
					vma->vm_pgoff + vma_pages(vma));
}

/*
 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct folio_batch fbatch;
	pgoff_t index = 0;

	folio_batch_init(&fbatch);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping) &&
	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
		check_move_unevictable_folios(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
	struct folio *folio;

	/*
	 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
	 * beyond i_size, and reports fallocated folios as holes.
	 */
	folio = filemap_get_entry(inode->i_mapping, index);
	if (!folio)
		return folio;
	if (!xa_is_value(folio)) {
		folio_lock(folio);
		if (folio->mapping == inode->i_mapping)
			return folio;
		/* The folio has been swapped out */
		folio_unlock(folio);
		folio_put(folio);
	}
	/*
	 * But read a folio back from swap if any of it is within i_size
	 * (although in some cases this is just a waste of time).
	 */
	folio = NULL;
	shmem_get_folio(inode, index, &folio, SGP_READ);
	return folio;
}

/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
							 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio *folio;
	bool same_folio;
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
		info->fallocend = start;

	folio_batch_init(&fbatch);
	index = start;
	while (index < end && find_lock_entries(mapping, &index, end - 1,
			&fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
							indices[i], folio);
				continue;
			}

			if (!unfalloc || !folio_test_uptodate(folio))
				truncate_inode_folio(mapping, folio);
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}

	/*
	 * When undoing a failed fallocate, we want none of the partial folio
	 * zeroing and splitting below, but shall want to truncate the whole
	 * folio when !uptodate indicates that it was added by this fallocate,
	 * even when [lstart, lend] covers only a part of the folio.
	 */
	if (unfalloc)
		goto whole_folios;

	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
	if (folio) {
		same_folio = lend < folio_pos(folio) + folio_size(folio);
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
			start = folio->index + folio_nr_pages(folio);
			if (same_folio)
				end = folio->index;
		}
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;
	}

	if (!same_folio)
		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
	if (folio) {
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend))
			end = folio->index;
		folio_unlock(folio);
		folio_put(folio);
	}

whole_folios:

	index = start;
	while (index < end) {
		cond_resched();

		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
				indices)) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, indices[i], folio)) {
					/* Swap was replaced by page: retry */
					index = indices[i];
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			folio_lock(folio);

			if (!unfalloc || !folio_test_uptodate(folio)) {
				if (folio_mapping(folio) != mapping) {
					/* Page was replaced by swap: retry */
					folio_unlock(folio);
					index = indices[i];
					break;
				}
				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
						folio);
				truncate_inode_folio(mapping, folio);
			}
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
	}

	spin_lock_irq(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
}

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode->i_ctime = inode->i_mtime = current_time(inode);
	inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_getattr(struct mnt_idmap *idmap,
			 const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);
Shutemov spin_unlock_irq(&info->lock); 1119d0424c42SHugh Dickins } 1120e408e695STheodore Ts'o if (info->fsflags & FS_APPEND_FL) 1121e408e695STheodore Ts'o stat->attributes |= STATX_ATTR_APPEND; 1122e408e695STheodore Ts'o if (info->fsflags & FS_IMMUTABLE_FL) 1123e408e695STheodore Ts'o stat->attributes |= STATX_ATTR_IMMUTABLE; 1124e408e695STheodore Ts'o if (info->fsflags & FS_NODUMP_FL) 1125e408e695STheodore Ts'o stat->attributes |= STATX_ATTR_NODUMP; 1126e408e695STheodore Ts'o stat->attributes_mask |= (STATX_ATTR_APPEND | 1127e408e695STheodore Ts'o STATX_ATTR_IMMUTABLE | 1128e408e695STheodore Ts'o STATX_ATTR_NODUMP); 11297a80e5b8SGiuseppe Scrivano generic_fillattr(idmap, inode, stat); 113089fdcd26SYang Shi 11312cf13384SDavid Stevens if (shmem_is_huge(inode, 0, false, NULL, 0)) 113289fdcd26SYang Shi stat->blksize = HPAGE_PMD_SIZE; 113389fdcd26SYang Shi 1134f7cd16a5SXavier Roche if (request_mask & STATX_BTIME) { 1135f7cd16a5SXavier Roche stat->result_mask |= STATX_BTIME; 1136f7cd16a5SXavier Roche stat->btime.tv_sec = info->i_crtime.tv_sec; 1137f7cd16a5SXavier Roche stat->btime.tv_nsec = info->i_crtime.tv_nsec; 1138f7cd16a5SXavier Roche } 1139f7cd16a5SXavier Roche 114044a30220SYu Zhao return 0; 114144a30220SYu Zhao } 114244a30220SYu Zhao 1143c1632a0fSChristian Brauner static int shmem_setattr(struct mnt_idmap *idmap, 1144549c7297SChristian Brauner struct dentry *dentry, struct iattr *attr) 11451da177e4SLinus Torvalds { 114675c3cfa8SDavid Howells struct inode *inode = d_inode(dentry); 114740e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 11481da177e4SLinus Torvalds int error; 114936f05cabSJeff Layton bool update_mtime = false; 115036f05cabSJeff Layton bool update_ctime = true; 11511da177e4SLinus Torvalds 11527a80e5b8SGiuseppe Scrivano error = setattr_prepare(idmap, dentry, attr); 1153db78b877SChristoph Hellwig if (error) 1154db78b877SChristoph Hellwig return error; 1155db78b877SChristoph Hellwig 11566fd73538SDaniel Verkamp if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { 11576fd73538SDaniel Verkamp if ((inode->i_mode ^ attr->ia_mode) & 0111) { 11586fd73538SDaniel Verkamp return -EPERM; 11596fd73538SDaniel Verkamp } 11606fd73538SDaniel Verkamp } 11616fd73538SDaniel Verkamp 116294c1e62dSHugh Dickins if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 116394c1e62dSHugh Dickins loff_t oldsize = inode->i_size; 116494c1e62dSHugh Dickins loff_t newsize = attr->ia_size; 11653889e6e7Snpiggin@suse.de 11669608703eSJan Kara /* protected by i_rwsem */ 116740e041a2SDavid Herrmann if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || 116840e041a2SDavid Herrmann (newsize > oldsize && (info->seals & F_SEAL_GROW))) 116940e041a2SDavid Herrmann return -EPERM; 117040e041a2SDavid Herrmann 117194c1e62dSHugh Dickins if (newsize != oldsize) { 117277142517SKonstantin Khlebnikov error = shmem_reacct_size(SHMEM_I(inode)->flags, 117377142517SKonstantin Khlebnikov oldsize, newsize); 117477142517SKonstantin Khlebnikov if (error) 117577142517SKonstantin Khlebnikov return error; 117694c1e62dSHugh Dickins i_size_write(inode, newsize); 117736f05cabSJeff Layton update_mtime = true; 117836f05cabSJeff Layton } else { 117936f05cabSJeff Layton update_ctime = false; 118094c1e62dSHugh Dickins } 1181afa2db2fSJosef Bacik if (newsize <= oldsize) { 118294c1e62dSHugh Dickins loff_t holebegin = round_up(newsize, PAGE_SIZE); 1183d0424c42SHugh Dickins if (oldsize > holebegin) 1184d0424c42SHugh Dickins unmap_mapping_range(inode->i_mapping, 1185d0424c42SHugh Dickins holebegin, 0, 1); 
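/* Free the folios and swap entries now lying beyond the new size */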
1186d0424c42SHugh Dickins if (info->alloced) 1187d0424c42SHugh Dickins shmem_truncate_range(inode, 1188d0424c42SHugh Dickins newsize, (loff_t)-1); 118994c1e62dSHugh Dickins /* unmap again to remove racily COWed private pages */ 1190d0424c42SHugh Dickins if (oldsize > holebegin) 1191d0424c42SHugh Dickins unmap_mapping_range(inode->i_mapping, 1192d0424c42SHugh Dickins holebegin, 0, 1); 119394c1e62dSHugh Dickins } 11941da177e4SLinus Torvalds } 11951da177e4SLinus Torvalds 1196e09764cfSCarlos Maiolino if (is_quota_modification(idmap, inode, attr)) { 1197e09764cfSCarlos Maiolino error = dquot_initialize(inode); 1198e09764cfSCarlos Maiolino if (error) 1199e09764cfSCarlos Maiolino return error; 1200e09764cfSCarlos Maiolino } 1201e09764cfSCarlos Maiolino 1202e09764cfSCarlos Maiolino /* Transfer quota accounting */ 1203e09764cfSCarlos Maiolino if (i_uid_needs_update(idmap, attr, inode) || 1204e09764cfSCarlos Maiolino i_gid_needs_update(idmap, attr, inode)) { 1205e09764cfSCarlos Maiolino error = dquot_transfer(idmap, inode, attr); 1206e09764cfSCarlos Maiolino 1207e09764cfSCarlos Maiolino if (error) 1208e09764cfSCarlos Maiolino return error; 1209e09764cfSCarlos Maiolino } 1210e09764cfSCarlos Maiolino 12117a80e5b8SGiuseppe Scrivano setattr_copy(idmap, inode, attr); 1212db78b877SChristoph Hellwig if (attr->ia_valid & ATTR_MODE) 12137a80e5b8SGiuseppe Scrivano error = posix_acl_chmod(idmap, dentry, inode->i_mode); 121436f05cabSJeff Layton if (!error && update_ctime) { 121536f05cabSJeff Layton inode->i_ctime = current_time(inode); 121636f05cabSJeff Layton if (update_mtime) 121736f05cabSJeff Layton inode->i_mtime = inode->i_ctime; 121836f05cabSJeff Layton inode_inc_iversion(inode); 121936f05cabSJeff Layton } 12201da177e4SLinus Torvalds return error; 12211da177e4SLinus Torvalds } 12221da177e4SLinus Torvalds 12231f895f75SAl Viro static void shmem_evict_inode(struct inode *inode) 12241da177e4SLinus Torvalds { 12251da177e4SLinus Torvalds struct shmem_inode_info *info = SHMEM_I(inode); 1226779750d2SKirill A. Shutemov struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 12271da177e4SLinus Torvalds 122830e6a51dSHui Su if (shmem_mapping(inode->i_mapping)) { 12291da177e4SLinus Torvalds shmem_unacct_size(info->flags, inode->i_size); 12301da177e4SLinus Torvalds inode->i_size = 0; 1231bc786390SHugh Dickins mapping_set_exiting(inode->i_mapping); 12323889e6e7Snpiggin@suse.de shmem_truncate_range(inode, 0, (loff_t)-1); 1233779750d2SKirill A. Shutemov if (!list_empty(&info->shrinklist)) { 1234779750d2SKirill A. Shutemov spin_lock(&sbinfo->shrinklist_lock); 1235779750d2SKirill A. Shutemov if (!list_empty(&info->shrinklist)) { 1236779750d2SKirill A. Shutemov list_del_init(&info->shrinklist); 1237779750d2SKirill A. Shutemov sbinfo->shrinklist_len--; 1238779750d2SKirill A. Shutemov } 1239779750d2SKirill A. Shutemov spin_unlock(&sbinfo->shrinklist_lock); 1240779750d2SKirill A. Shutemov } 1241af53d3e9SHugh Dickins while (!list_empty(&info->swaplist)) { 1242af53d3e9SHugh Dickins /* Wait while shmem_unuse() is scanning this inode... 
*/ 1243af53d3e9SHugh Dickins wait_var_event(&info->stop_eviction, 1244af53d3e9SHugh Dickins !atomic_read(&info->stop_eviction)); 1245cb5f7b9aSHugh Dickins mutex_lock(&shmem_swaplist_mutex); 1246af53d3e9SHugh Dickins /* ...but beware of the race if we peeked too early */ 1247af53d3e9SHugh Dickins if (!atomic_read(&info->stop_eviction)) 12481da177e4SLinus Torvalds list_del_init(&info->swaplist); 1249cb5f7b9aSHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 12501da177e4SLinus Torvalds } 12513ed47db3SAl Viro } 1252b09e0fa4SEric Paris 125338f38657SAristeu Rozanski simple_xattrs_free(&info->xattrs); 12540f3c42f5SHugh Dickins WARN_ON(inode->i_blocks); 12555b04c689SPavel Emelyanov shmem_free_inode(inode->i_sb); 1256dbd5768fSJan Kara clear_inode(inode); 1257e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 1258e09764cfSCarlos Maiolino dquot_free_inode(inode); 1259e09764cfSCarlos Maiolino dquot_drop(inode); 1260e09764cfSCarlos Maiolino #endif 12611da177e4SLinus Torvalds } 12621da177e4SLinus Torvalds 1263b56a2d8aSVineeth Remanan Pillai static int shmem_find_swap_entries(struct address_space *mapping, 1264da08e9b7SMatthew Wilcox (Oracle) pgoff_t start, struct folio_batch *fbatch, 1265da08e9b7SMatthew Wilcox (Oracle) pgoff_t *indices, unsigned int type) 1266478922e2SMatthew Wilcox { 1267b56a2d8aSVineeth Remanan Pillai XA_STATE(xas, &mapping->i_pages, start); 1268da08e9b7SMatthew Wilcox (Oracle) struct folio *folio; 126987039546SHugh Dickins swp_entry_t entry; 1270478922e2SMatthew Wilcox 1271478922e2SMatthew Wilcox rcu_read_lock(); 1272da08e9b7SMatthew Wilcox (Oracle) xas_for_each(&xas, folio, ULONG_MAX) { 1273da08e9b7SMatthew Wilcox (Oracle) if (xas_retry(&xas, folio)) 12745b9c98f3SMike Kravetz continue; 1275b56a2d8aSVineeth Remanan Pillai 1276da08e9b7SMatthew Wilcox (Oracle) if (!xa_is_value(folio)) 1277478922e2SMatthew Wilcox continue; 1278b56a2d8aSVineeth Remanan Pillai 1279da08e9b7SMatthew Wilcox (Oracle) entry = radix_to_swp_entry(folio); 12806cec2b95SMiaohe Lin /* 12816cec2b95SMiaohe Lin * swapin error entries can be found in the mapping. But they're 12826cec2b95SMiaohe Lin * deliberately ignored here as we've done everything we can do. 12836cec2b95SMiaohe Lin */ 128487039546SHugh Dickins if (swp_type(entry) != type) 1285b56a2d8aSVineeth Remanan Pillai continue; 1286b56a2d8aSVineeth Remanan Pillai 1287e384200eSHugh Dickins indices[folio_batch_count(fbatch)] = xas.xa_index; 1288da08e9b7SMatthew Wilcox (Oracle) if (!folio_batch_add(fbatch, folio)) 1289da08e9b7SMatthew Wilcox (Oracle) break; 1290b56a2d8aSVineeth Remanan Pillai 1291b56a2d8aSVineeth Remanan Pillai if (need_resched()) { 1292e21a2955SMatthew Wilcox xas_pause(&xas); 1293478922e2SMatthew Wilcox cond_resched_rcu(); 1294478922e2SMatthew Wilcox } 1295b56a2d8aSVineeth Remanan Pillai } 1296478922e2SMatthew Wilcox rcu_read_unlock(); 1297e21a2955SMatthew Wilcox 1298da08e9b7SMatthew Wilcox (Oracle) return xas.xa_index; 1299b56a2d8aSVineeth Remanan Pillai } 1300b56a2d8aSVineeth Remanan Pillai 1301b56a2d8aSVineeth Remanan Pillai /* 1302b56a2d8aSVineeth Remanan Pillai * Move the swapped pages for an inode to page cache. Returns the count 1303b56a2d8aSVineeth Remanan Pillai * of pages swapped in, or the error in case of failure. 
1304b56a2d8aSVineeth Remanan Pillai */ 1305da08e9b7SMatthew Wilcox (Oracle) static int shmem_unuse_swap_entries(struct inode *inode, 1306da08e9b7SMatthew Wilcox (Oracle) struct folio_batch *fbatch, pgoff_t *indices) 1307b56a2d8aSVineeth Remanan Pillai { 1308b56a2d8aSVineeth Remanan Pillai int i = 0; 1309b56a2d8aSVineeth Remanan Pillai int ret = 0; 1310b56a2d8aSVineeth Remanan Pillai int error = 0; 1311b56a2d8aSVineeth Remanan Pillai struct address_space *mapping = inode->i_mapping; 1312b56a2d8aSVineeth Remanan Pillai 1313da08e9b7SMatthew Wilcox (Oracle) for (i = 0; i < folio_batch_count(fbatch); i++) { 1314da08e9b7SMatthew Wilcox (Oracle) struct folio *folio = fbatch->folios[i]; 1315b56a2d8aSVineeth Remanan Pillai 1316da08e9b7SMatthew Wilcox (Oracle) if (!xa_is_value(folio)) 1317b56a2d8aSVineeth Remanan Pillai continue; 1318da08e9b7SMatthew Wilcox (Oracle) error = shmem_swapin_folio(inode, indices[i], 1319da08e9b7SMatthew Wilcox (Oracle) &folio, SGP_CACHE, 1320b56a2d8aSVineeth Remanan Pillai mapping_gfp_mask(mapping), 1321b56a2d8aSVineeth Remanan Pillai NULL, NULL); 1322b56a2d8aSVineeth Remanan Pillai if (error == 0) { 1323da08e9b7SMatthew Wilcox (Oracle) folio_unlock(folio); 1324da08e9b7SMatthew Wilcox (Oracle) folio_put(folio); 1325b56a2d8aSVineeth Remanan Pillai ret++; 1326b56a2d8aSVineeth Remanan Pillai } 1327b56a2d8aSVineeth Remanan Pillai if (error == -ENOMEM) 1328b56a2d8aSVineeth Remanan Pillai break; 1329b56a2d8aSVineeth Remanan Pillai error = 0; 1330b56a2d8aSVineeth Remanan Pillai } 1331b56a2d8aSVineeth Remanan Pillai return error ? error : ret; 1332478922e2SMatthew Wilcox } 1333478922e2SMatthew Wilcox 133446f65ec1SHugh Dickins /* 133546f65ec1SHugh Dickins * If swap found in inode, free it and move page from swapcache to filecache. 133646f65ec1SHugh Dickins */ 133710a9c496SChristoph Hellwig static int shmem_unuse_inode(struct inode *inode, unsigned int type) 13381da177e4SLinus Torvalds { 1339b56a2d8aSVineeth Remanan Pillai struct address_space *mapping = inode->i_mapping; 1340b56a2d8aSVineeth Remanan Pillai pgoff_t start = 0; 1341da08e9b7SMatthew Wilcox (Oracle) struct folio_batch fbatch; 1342b56a2d8aSVineeth Remanan Pillai pgoff_t indices[PAGEVEC_SIZE]; 1343b56a2d8aSVineeth Remanan Pillai int ret = 0; 13441da177e4SLinus Torvalds 1345b56a2d8aSVineeth Remanan Pillai do { 1346da08e9b7SMatthew Wilcox (Oracle) folio_batch_init(&fbatch); 1347da08e9b7SMatthew Wilcox (Oracle) shmem_find_swap_entries(mapping, start, &fbatch, indices, type); 1348da08e9b7SMatthew Wilcox (Oracle) if (folio_batch_count(&fbatch) == 0) { 1349b56a2d8aSVineeth Remanan Pillai ret = 0; 1350778dd893SHugh Dickins break; 1351b56a2d8aSVineeth Remanan Pillai } 1352b56a2d8aSVineeth Remanan Pillai 1353da08e9b7SMatthew Wilcox (Oracle) ret = shmem_unuse_swap_entries(inode, &fbatch, indices); 1354b56a2d8aSVineeth Remanan Pillai if (ret < 0) 1355b56a2d8aSVineeth Remanan Pillai break; 1356b56a2d8aSVineeth Remanan Pillai 1357da08e9b7SMatthew Wilcox (Oracle) start = indices[folio_batch_count(&fbatch) - 1]; 1358b56a2d8aSVineeth Remanan Pillai } while (true); 1359b56a2d8aSVineeth Remanan Pillai 1360b56a2d8aSVineeth Remanan Pillai return ret; 1361b56a2d8aSVineeth Remanan Pillai } 1362b56a2d8aSVineeth Remanan Pillai 1363b56a2d8aSVineeth Remanan Pillai /* 1364b56a2d8aSVineeth Remanan Pillai * Read all the shared memory data that resides in the swap 1365b56a2d8aSVineeth Remanan Pillai * device 'type' back into memory, so the swap device can be 1366b56a2d8aSVineeth Remanan Pillai * unused. 
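 *
 * Usage sketch (illustrative only; the real caller is the swapoff path
 * in mm/swapfile.c, not this file). Swapoff brings every tmpfs page of
 * the dying swap device back into the page cache, and fails if that is
 * not possible:
 *
 *	error = shmem_unuse(type);
 *	if (error)
 *		return error;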
1367b56a2d8aSVineeth Remanan Pillai */ 136810a9c496SChristoph Hellwig int shmem_unuse(unsigned int type) 1369b56a2d8aSVineeth Remanan Pillai { 1370b56a2d8aSVineeth Remanan Pillai struct shmem_inode_info *info, *next; 1371b56a2d8aSVineeth Remanan Pillai int error = 0; 1372b56a2d8aSVineeth Remanan Pillai 1373b56a2d8aSVineeth Remanan Pillai if (list_empty(&shmem_swaplist)) 1374b56a2d8aSVineeth Remanan Pillai return 0; 1375b56a2d8aSVineeth Remanan Pillai 1376b56a2d8aSVineeth Remanan Pillai mutex_lock(&shmem_swaplist_mutex); 1377b56a2d8aSVineeth Remanan Pillai list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { 1378b56a2d8aSVineeth Remanan Pillai if (!info->swapped) { 1379b56a2d8aSVineeth Remanan Pillai list_del_init(&info->swaplist); 1380b56a2d8aSVineeth Remanan Pillai continue; 1381b56a2d8aSVineeth Remanan Pillai } 1382af53d3e9SHugh Dickins /* 1383af53d3e9SHugh Dickins * Drop the swaplist mutex while searching the inode for swap; 1384af53d3e9SHugh Dickins * but before doing so, make sure shmem_evict_inode() will not 1385af53d3e9SHugh Dickins * remove placeholder inode from swaplist, nor let it be freed 1386af53d3e9SHugh Dickins * (igrab() would protect from unlink, but not from unmount). 1387af53d3e9SHugh Dickins */ 1388af53d3e9SHugh Dickins atomic_inc(&info->stop_eviction); 1389b56a2d8aSVineeth Remanan Pillai mutex_unlock(&shmem_swaplist_mutex); 1390b56a2d8aSVineeth Remanan Pillai 139110a9c496SChristoph Hellwig error = shmem_unuse_inode(&info->vfs_inode, type); 1392b56a2d8aSVineeth Remanan Pillai cond_resched(); 1393b56a2d8aSVineeth Remanan Pillai 1394b56a2d8aSVineeth Remanan Pillai mutex_lock(&shmem_swaplist_mutex); 1395b56a2d8aSVineeth Remanan Pillai next = list_next_entry(info, swaplist); 1396b56a2d8aSVineeth Remanan Pillai if (!info->swapped) 1397b56a2d8aSVineeth Remanan Pillai list_del_init(&info->swaplist); 1398af53d3e9SHugh Dickins if (atomic_dec_and_test(&info->stop_eviction)) 1399af53d3e9SHugh Dickins wake_up_var(&info->stop_eviction); 1400b56a2d8aSVineeth Remanan Pillai if (error) 1401b56a2d8aSVineeth Remanan Pillai break; 14021da177e4SLinus Torvalds } 1403cb5f7b9aSHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 1404778dd893SHugh Dickins 1405778dd893SHugh Dickins return error; 14061da177e4SLinus Torvalds } 14071da177e4SLinus Torvalds 14081da177e4SLinus Torvalds /* 14091da177e4SLinus Torvalds * Move the page from the page cache to the swap cache. 14101da177e4SLinus Torvalds */ 14111da177e4SLinus Torvalds static int shmem_writepage(struct page *page, struct writeback_control *wbc) 14121da177e4SLinus Torvalds { 1413e2e3fdc7SMatthew Wilcox (Oracle) struct folio *folio = page_folio(page); 14148ccee8c1SLuis Chamberlain struct address_space *mapping = folio->mapping; 14158ccee8c1SLuis Chamberlain struct inode *inode = mapping->host; 14168ccee8c1SLuis Chamberlain struct shmem_inode_info *info = SHMEM_I(inode); 14172c6efe9cSLuis Chamberlain struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 14186922c0c7SHugh Dickins swp_entry_t swap; 14196922c0c7SHugh Dickins pgoff_t index; 14201da177e4SLinus Torvalds 14211e6decf3SHugh Dickins /* 1422cf7992bfSLuis Chamberlain * Our capabilities prevent regular writeback or sync from ever calling 1423cf7992bfSLuis Chamberlain * shmem_writepage; but a stacking filesystem might use ->writepage of 1424cf7992bfSLuis Chamberlain * its underlying filesystem, in which case tmpfs should write out to 1425cf7992bfSLuis Chamberlain * swap only in response to memory pressure, and not for the writeback 1426cf7992bfSLuis Chamberlain * threads or sync. 
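 *
 * In practice the only path that legitimately pushes tmpfs pages to swap
 * is memory reclaim (vmscan's pageout()), which sets wbc->for_reclaim;
 * anything else is simply redirtied by the WARN_ON_ONCE() check below.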
1427cf7992bfSLuis Chamberlain */ 1428cf7992bfSLuis Chamberlain if (WARN_ON_ONCE(!wbc->for_reclaim)) 1429cf7992bfSLuis Chamberlain goto redirty; 1430cf7992bfSLuis Chamberlain 14312c6efe9cSLuis Chamberlain if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap)) 14329a976f0cSLuis Chamberlain goto redirty; 14339a976f0cSLuis Chamberlain 14349a976f0cSLuis Chamberlain if (!total_swap_pages) 14359a976f0cSLuis Chamberlain goto redirty; 14369a976f0cSLuis Chamberlain 1437cf7992bfSLuis Chamberlain /* 14381e6decf3SHugh Dickins * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or 14391e6decf3SHugh Dickins * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, 14401e6decf3SHugh Dickins * and its shmem_writeback() needs them to be split when swapping. 14411e6decf3SHugh Dickins */ 1442f530ed0eSMatthew Wilcox (Oracle) if (folio_test_large(folio)) { 14431e6decf3SHugh Dickins /* Ensure the subpages are still dirty */ 1444f530ed0eSMatthew Wilcox (Oracle) folio_test_set_dirty(folio); 14451e6decf3SHugh Dickins if (split_huge_page(page) < 0) 14461e6decf3SHugh Dickins goto redirty; 1447f530ed0eSMatthew Wilcox (Oracle) folio = page_folio(page); 1448f530ed0eSMatthew Wilcox (Oracle) folio_clear_dirty(folio); 14491e6decf3SHugh Dickins } 14501e6decf3SHugh Dickins 1451f530ed0eSMatthew Wilcox (Oracle) index = folio->index; 14521635f6a7SHugh Dickins 14531635f6a7SHugh Dickins /* 14541635f6a7SHugh Dickins * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC 14551635f6a7SHugh Dickins * value into swapfile.c, the only way we can correctly account for a 1456f530ed0eSMatthew Wilcox (Oracle) * fallocated folio arriving here is now to initialize it and write it. 14571aac1400SHugh Dickins * 1458f530ed0eSMatthew Wilcox (Oracle) * That's okay for a folio already fallocated earlier, but if we have 14591aac1400SHugh Dickins * not yet completed the fallocation, then (a) we want to keep track 1460f530ed0eSMatthew Wilcox (Oracle) * of this folio in case we have to undo it, and (b) it may not be a 14611aac1400SHugh Dickins * good idea to continue anyway, once we're pushing into swap. So 1462f530ed0eSMatthew Wilcox (Oracle) * reactivate the folio, and let shmem_fallocate() quit when too many. 
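 *
 * ("Too many" means shmem_fallocate(), further down this file, sees
 * nr_unswapped grow past the number of folios it has allocated so far
 * and gives up with -ENOMEM.)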
14631635f6a7SHugh Dickins */ 1464f530ed0eSMatthew Wilcox (Oracle) if (!folio_test_uptodate(folio)) { 14651aac1400SHugh Dickins if (inode->i_private) { 14661aac1400SHugh Dickins struct shmem_falloc *shmem_falloc; 14671aac1400SHugh Dickins spin_lock(&inode->i_lock); 14681aac1400SHugh Dickins shmem_falloc = inode->i_private; 14691aac1400SHugh Dickins if (shmem_falloc && 14708e205f77SHugh Dickins !shmem_falloc->waitq && 14711aac1400SHugh Dickins index >= shmem_falloc->start && 14721aac1400SHugh Dickins index < shmem_falloc->next) 14731aac1400SHugh Dickins shmem_falloc->nr_unswapped++; 14741aac1400SHugh Dickins else 14751aac1400SHugh Dickins shmem_falloc = NULL; 14761aac1400SHugh Dickins spin_unlock(&inode->i_lock); 14771aac1400SHugh Dickins if (shmem_falloc) 14781aac1400SHugh Dickins goto redirty; 14791aac1400SHugh Dickins } 1480f530ed0eSMatthew Wilcox (Oracle) folio_zero_range(folio, 0, folio_size(folio)); 1481f530ed0eSMatthew Wilcox (Oracle) flush_dcache_folio(folio); 1482f530ed0eSMatthew Wilcox (Oracle) folio_mark_uptodate(folio); 14831635f6a7SHugh Dickins } 14841635f6a7SHugh Dickins 1485e2e3fdc7SMatthew Wilcox (Oracle) swap = folio_alloc_swap(folio); 148648f170fbSHugh Dickins if (!swap.val) 148748f170fbSHugh Dickins goto redirty; 1488d9fe526aSHugh Dickins 1489b1dea800SHugh Dickins /* 1490b1dea800SHugh Dickins * Add inode to shmem_unuse()'s list of swapped-out inodes, 1491f530ed0eSMatthew Wilcox (Oracle) * if it's not already there. Do it now before the folio is 14926922c0c7SHugh Dickins * moved to swap cache, when its pagelock no longer protects 1493b1dea800SHugh Dickins * the inode from eviction. But don't unlock the mutex until 14946922c0c7SHugh Dickins * we've incremented swapped, because shmem_unuse_inode() will 14956922c0c7SHugh Dickins * prune a !swapped inode from the swaplist under this mutex. 1496b1dea800SHugh Dickins */ 1497b1dea800SHugh Dickins mutex_lock(&shmem_swaplist_mutex); 149805bf86b4SHugh Dickins if (list_empty(&info->swaplist)) 1499b56a2d8aSVineeth Remanan Pillai list_add(&info->swaplist, &shmem_swaplist); 1500b1dea800SHugh Dickins 1501a4c366f0SMatthew Wilcox (Oracle) if (add_to_swap_cache(folio, swap, 15023852f676SJoonsoo Kim __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, 15033852f676SJoonsoo Kim NULL) == 0) { 15044595ef88SKirill A. Shutemov spin_lock_irq(&info->lock); 1505267a4c76SHugh Dickins shmem_recalc_inode(inode); 1506267a4c76SHugh Dickins info->swapped++; 15074595ef88SKirill A. 
Shutemov spin_unlock_irq(&info->lock); 1508267a4c76SHugh Dickins 1509aaa46865SHugh Dickins swap_shmem_alloc(swap); 15104cd400fdSMatthew Wilcox (Oracle) shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); 15116922c0c7SHugh Dickins 15126922c0c7SHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 1513f530ed0eSMatthew Wilcox (Oracle) BUG_ON(folio_mapped(folio)); 1514f530ed0eSMatthew Wilcox (Oracle) swap_writepage(&folio->page, wbc); 15151da177e4SLinus Torvalds return 0; 15161da177e4SLinus Torvalds } 15171da177e4SLinus Torvalds 15186922c0c7SHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 15194081f744SMatthew Wilcox (Oracle) put_swap_folio(folio, swap); 15201da177e4SLinus Torvalds redirty: 1521f530ed0eSMatthew Wilcox (Oracle) folio_mark_dirty(folio); 1522d9fe526aSHugh Dickins if (wbc->for_reclaim) 1523f530ed0eSMatthew Wilcox (Oracle) return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ 1524f530ed0eSMatthew Wilcox (Oracle) folio_unlock(folio); 1525d9fe526aSHugh Dickins return 0; 15261da177e4SLinus Torvalds } 15271da177e4SLinus Torvalds 152875edd345SHugh Dickins #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) 152971fe804bSLee Schermerhorn static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 1530680d794bSakpm@linux-foundation.org { 1531680d794bSakpm@linux-foundation.org char buffer[64]; 1532680d794bSakpm@linux-foundation.org 153371fe804bSLee Schermerhorn if (!mpol || mpol->mode == MPOL_DEFAULT) 1534095f1fc4SLee Schermerhorn return; /* show nothing */ 1535095f1fc4SLee Schermerhorn 1536a7a88b23SHugh Dickins mpol_to_str(buffer, sizeof(buffer), mpol); 1537095f1fc4SLee Schermerhorn 1538095f1fc4SLee Schermerhorn seq_printf(seq, ",mpol=%s", buffer); 1539680d794bSakpm@linux-foundation.org } 154071fe804bSLee Schermerhorn 154171fe804bSLee Schermerhorn static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 154271fe804bSLee Schermerhorn { 154371fe804bSLee Schermerhorn struct mempolicy *mpol = NULL; 154471fe804bSLee Schermerhorn if (sbinfo->mpol) { 1545bf11b9a8SSebastian Andrzej Siewior raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 154671fe804bSLee Schermerhorn mpol = sbinfo->mpol; 154771fe804bSLee Schermerhorn mpol_get(mpol); 1548bf11b9a8SSebastian Andrzej Siewior raw_spin_unlock(&sbinfo->stat_lock); 154971fe804bSLee Schermerhorn } 155071fe804bSLee Schermerhorn return mpol; 155171fe804bSLee Schermerhorn } 155275edd345SHugh Dickins #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ 155375edd345SHugh Dickins static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 155475edd345SHugh Dickins { 155575edd345SHugh Dickins } 155675edd345SHugh Dickins static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 155775edd345SHugh Dickins { 155875edd345SHugh Dickins return NULL; 155975edd345SHugh Dickins } 156075edd345SHugh Dickins #endif /* CONFIG_NUMA && CONFIG_TMPFS */ 156175edd345SHugh Dickins #ifndef CONFIG_NUMA 156275edd345SHugh Dickins #define vm_policy vm_private_data 156375edd345SHugh Dickins #endif 1564680d794bSakpm@linux-foundation.org 1565800d8c63SKirill A. Shutemov static void shmem_pseudo_vma_init(struct vm_area_struct *vma, 1566800d8c63SKirill A. Shutemov struct shmem_inode_info *info, pgoff_t index) 1567800d8c63SKirill A. Shutemov { 1568800d8c63SKirill A. Shutemov /* Create a pseudo vma that just contains the policy */ 15692c4541e2SKirill A. Shutemov vma_init(vma, NULL); 1570800d8c63SKirill A. 
Shutemov /* Bias interleave by inode number to distribute better across nodes */ 1571800d8c63SKirill A. Shutemov vma->vm_pgoff = index + info->vfs_inode.i_ino; 1572800d8c63SKirill A. Shutemov vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); 1573800d8c63SKirill A. Shutemov } 1574800d8c63SKirill A. Shutemov 1575800d8c63SKirill A. Shutemov static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) 1576800d8c63SKirill A. Shutemov { 1577800d8c63SKirill A. Shutemov /* Drop reference taken by mpol_shared_policy_lookup() */ 1578800d8c63SKirill A. Shutemov mpol_cond_put(vma->vm_policy); 1579800d8c63SKirill A. Shutemov } 1580800d8c63SKirill A. Shutemov 15815739a81cSMatthew Wilcox (Oracle) static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, 158241ffe5d5SHugh Dickins struct shmem_inode_info *info, pgoff_t index) 15831da177e4SLinus Torvalds { 15841da177e4SLinus Torvalds struct vm_area_struct pvma; 158518a2f371SMel Gorman struct page *page; 15868c63ca5bSWill Deacon struct vm_fault vmf = { 15878c63ca5bSWill Deacon .vma = &pvma, 15888c63ca5bSWill Deacon }; 15891da177e4SLinus Torvalds 1590800d8c63SKirill A. Shutemov shmem_pseudo_vma_init(&pvma, info, index); 1591e9e9b7ecSMinchan Kim page = swap_cluster_readahead(swap, gfp, &vmf); 1592800d8c63SKirill A. Shutemov shmem_pseudo_vma_destroy(&pvma); 159318a2f371SMel Gorman 15945739a81cSMatthew Wilcox (Oracle) if (!page) 15955739a81cSMatthew Wilcox (Oracle) return NULL; 15965739a81cSMatthew Wilcox (Oracle) return page_folio(page); 1597800d8c63SKirill A. Shutemov } 159818a2f371SMel Gorman 159978cc8cdcSRik van Riel /* 160078cc8cdcSRik van Riel * Make sure huge_gfp is always more limited than limit_gfp. 160178cc8cdcSRik van Riel * Some of the flags set permissions, while others set limitations. 160278cc8cdcSRik van Riel */ 160378cc8cdcSRik van Riel static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) 160478cc8cdcSRik van Riel { 160578cc8cdcSRik van Riel gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; 160678cc8cdcSRik van Riel gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; 1607187df5ddSRik van Riel gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; 1608187df5ddSRik van Riel gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); 1609187df5ddSRik van Riel 1610187df5ddSRik van Riel /* Allow allocations only from the originally specified zones. */ 1611187df5ddSRik van Riel result |= zoneflags; 161278cc8cdcSRik van Riel 161378cc8cdcSRik van Riel /* 161478cc8cdcSRik van Riel * Minimize the result gfp by taking the union with the deny flags, 161578cc8cdcSRik van Riel * and the intersection of the allow flags. 161678cc8cdcSRik van Riel */ 161778cc8cdcSRik van Riel result |= (limit_gfp & denyflags); 161878cc8cdcSRik van Riel result |= (huge_gfp & limit_gfp) & allowflags; 161978cc8cdcSRik van Riel 162078cc8cdcSRik van Riel return result; 162178cc8cdcSRik van Riel } 162278cc8cdcSRik van Riel 162372827e5cSMatthew Wilcox (Oracle) static struct folio *shmem_alloc_hugefolio(gfp_t gfp, 1624800d8c63SKirill A. Shutemov struct shmem_inode_info *info, pgoff_t index) 1625800d8c63SKirill A. Shutemov { 1626800d8c63SKirill A. Shutemov struct vm_area_struct pvma; 16277b8d046fSMatthew Wilcox struct address_space *mapping = info->vfs_inode.i_mapping; 16287b8d046fSMatthew Wilcox pgoff_t hindex; 1629dfe98499SMatthew Wilcox (Oracle) struct folio *folio; 1630800d8c63SKirill A. 
Shutemov 16314620a06eSGeert Uytterhoeven hindex = round_down(index, HPAGE_PMD_NR); 16327b8d046fSMatthew Wilcox if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, 16337b8d046fSMatthew Wilcox XA_PRESENT)) 1634800d8c63SKirill A. Shutemov return NULL; 1635800d8c63SKirill A. Shutemov 1636800d8c63SKirill A. Shutemov shmem_pseudo_vma_init(&pvma, info, hindex); 1637dfe98499SMatthew Wilcox (Oracle) folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); 1638800d8c63SKirill A. Shutemov shmem_pseudo_vma_destroy(&pvma); 1639dfe98499SMatthew Wilcox (Oracle) if (!folio) 1640dcdf11eeSDavid Rientjes count_vm_event(THP_FILE_FALLBACK); 164172827e5cSMatthew Wilcox (Oracle) return folio; 164218a2f371SMel Gorman } 164318a2f371SMel Gorman 16440c023ef5SMatthew Wilcox (Oracle) static struct folio *shmem_alloc_folio(gfp_t gfp, 164518a2f371SMel Gorman struct shmem_inode_info *info, pgoff_t index) 164618a2f371SMel Gorman { 164718a2f371SMel Gorman struct vm_area_struct pvma; 16480c023ef5SMatthew Wilcox (Oracle) struct folio *folio; 164918a2f371SMel Gorman 1650800d8c63SKirill A. Shutemov shmem_pseudo_vma_init(&pvma, info, index); 16510c023ef5SMatthew Wilcox (Oracle) folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); 1652800d8c63SKirill A. Shutemov shmem_pseudo_vma_destroy(&pvma); 165318a2f371SMel Gorman 16540c023ef5SMatthew Wilcox (Oracle) return folio; 165518a2f371SMel Gorman } 165618a2f371SMel Gorman 1657b1d0ec3aSMatthew Wilcox (Oracle) static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, 1658800d8c63SKirill A. Shutemov pgoff_t index, bool huge) 1659800d8c63SKirill A. Shutemov { 16600f079694SMike Rapoport struct shmem_inode_info *info = SHMEM_I(inode); 166172827e5cSMatthew Wilcox (Oracle) struct folio *folio; 1662800d8c63SKirill A. Shutemov int nr; 1663c7e263abSLukas Czerner int err; 1664800d8c63SKirill A. Shutemov 1665396bcc52SMatthew Wilcox (Oracle) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 1666800d8c63SKirill A. Shutemov huge = false; 1667800d8c63SKirill A. Shutemov nr = huge ? HPAGE_PMD_NR : 1; 1668800d8c63SKirill A. Shutemov 1669c7e263abSLukas Czerner err = shmem_inode_acct_block(inode, nr); 1670c7e263abSLukas Czerner if (err) 1671800d8c63SKirill A. Shutemov goto failed; 1672800d8c63SKirill A. Shutemov 1673800d8c63SKirill A. Shutemov if (huge) 167472827e5cSMatthew Wilcox (Oracle) folio = shmem_alloc_hugefolio(gfp, info, index); 1675800d8c63SKirill A. Shutemov else 167672827e5cSMatthew Wilcox (Oracle) folio = shmem_alloc_folio(gfp, info, index); 167772827e5cSMatthew Wilcox (Oracle) if (folio) { 167872827e5cSMatthew Wilcox (Oracle) __folio_set_locked(folio); 167972827e5cSMatthew Wilcox (Oracle) __folio_set_swapbacked(folio); 1680b1d0ec3aSMatthew Wilcox (Oracle) return folio; 168175edd345SHugh Dickins } 168218a2f371SMel Gorman 1683800d8c63SKirill A. Shutemov err = -ENOMEM; 16840f079694SMike Rapoport shmem_inode_unacct_blocks(inode, nr); 1685800d8c63SKirill A. Shutemov failed: 1686800d8c63SKirill A. Shutemov return ERR_PTR(err); 16871da177e4SLinus Torvalds } 168871fe804bSLee Schermerhorn 16891da177e4SLinus Torvalds /* 1690bde05d1cSHugh Dickins * When a page is moved from swapcache to shmem filecache (either by the 1691fc26babbSMatthew Wilcox (Oracle) * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of 1692bde05d1cSHugh Dickins * shmem_unuse_inode()), it may have been read in earlier from swap, in 1693bde05d1cSHugh Dickins * ignorance of the mapping it belongs to. 
If that mapping has special 1694bde05d1cSHugh Dickins * constraints (like the gma500 GEM driver, which requires RAM below 4GB), 1695bde05d1cSHugh Dickins * we may need to copy to a suitable page before moving to filecache. 1696bde05d1cSHugh Dickins * 1697bde05d1cSHugh Dickins * In a future release, this may well be extended to respect cpuset and 1698bde05d1cSHugh Dickins * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); 1699bde05d1cSHugh Dickins * but for now it is a simple matter of zone. 1700bde05d1cSHugh Dickins */ 1701069d849cSMatthew Wilcox (Oracle) static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) 1702bde05d1cSHugh Dickins { 1703069d849cSMatthew Wilcox (Oracle) return folio_zonenum(folio) > gfp_zone(gfp); 1704bde05d1cSHugh Dickins } 1705bde05d1cSHugh Dickins 17060d698e25SMatthew Wilcox (Oracle) static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, 1707bde05d1cSHugh Dickins struct shmem_inode_info *info, pgoff_t index) 1708bde05d1cSHugh Dickins { 1709d21bba2bSMatthew Wilcox (Oracle) struct folio *old, *new; 1710bde05d1cSHugh Dickins struct address_space *swap_mapping; 1711c1cb20d4SYu Zhao swp_entry_t entry; 1712bde05d1cSHugh Dickins pgoff_t swap_index; 1713bde05d1cSHugh Dickins int error; 1714bde05d1cSHugh Dickins 17150d698e25SMatthew Wilcox (Oracle) old = *foliop; 1716907ea17eSMatthew Wilcox (Oracle) entry = folio_swap_entry(old); 1717c1cb20d4SYu Zhao swap_index = swp_offset(entry); 1718907ea17eSMatthew Wilcox (Oracle) swap_mapping = swap_address_space(entry); 1719bde05d1cSHugh Dickins 1720bde05d1cSHugh Dickins /* 1721bde05d1cSHugh Dickins * We have arrived here because our zones are constrained, so don't 1722bde05d1cSHugh Dickins * limit chance of success by further cpuset and node constraints. 1723bde05d1cSHugh Dickins */ 1724bde05d1cSHugh Dickins gfp &= ~GFP_CONSTRAINT_MASK; 1725907ea17eSMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(folio_test_large(old), old); 1726907ea17eSMatthew Wilcox (Oracle) new = shmem_alloc_folio(gfp, info, index); 1727907ea17eSMatthew Wilcox (Oracle) if (!new) 1728bde05d1cSHugh Dickins return -ENOMEM; 1729bde05d1cSHugh Dickins 1730907ea17eSMatthew Wilcox (Oracle) folio_get(new); 1731907ea17eSMatthew Wilcox (Oracle) folio_copy(new, old); 1732907ea17eSMatthew Wilcox (Oracle) flush_dcache_folio(new); 1733bde05d1cSHugh Dickins 1734907ea17eSMatthew Wilcox (Oracle) __folio_set_locked(new); 1735907ea17eSMatthew Wilcox (Oracle) __folio_set_swapbacked(new); 1736907ea17eSMatthew Wilcox (Oracle) folio_mark_uptodate(new); 1737907ea17eSMatthew Wilcox (Oracle) folio_set_swap_entry(new, entry); 1738907ea17eSMatthew Wilcox (Oracle) folio_set_swapcache(new); 1739bde05d1cSHugh Dickins 1740bde05d1cSHugh Dickins /* 1741bde05d1cSHugh Dickins * Our caller will very soon move newpage out of swapcache, but it's 1742bde05d1cSHugh Dickins * a nice clean interface for us to replace oldpage by newpage there. 
1743bde05d1cSHugh Dickins */ 1744b93b0163SMatthew Wilcox xa_lock_irq(&swap_mapping->i_pages); 1745907ea17eSMatthew Wilcox (Oracle) error = shmem_replace_entry(swap_mapping, swap_index, old, new); 17460142ef6cSHugh Dickins if (!error) { 1747d21bba2bSMatthew Wilcox (Oracle) mem_cgroup_migrate(old, new); 1748907ea17eSMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); 1749907ea17eSMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(new, NR_SHMEM, 1); 1750907ea17eSMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); 1751907ea17eSMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(old, NR_SHMEM, -1); 17520142ef6cSHugh Dickins } 1753b93b0163SMatthew Wilcox xa_unlock_irq(&swap_mapping->i_pages); 1754bde05d1cSHugh Dickins 17550142ef6cSHugh Dickins if (unlikely(error)) { 17560142ef6cSHugh Dickins /* 17570142ef6cSHugh Dickins * Is this possible? I think not, now that our callers check 17580142ef6cSHugh Dickins * both PageSwapCache and page_private after getting page lock; 17590142ef6cSHugh Dickins * but be defensive. Reverse old to newpage for clear and free. 17600142ef6cSHugh Dickins */ 1761907ea17eSMatthew Wilcox (Oracle) old = new; 17620142ef6cSHugh Dickins } else { 1763907ea17eSMatthew Wilcox (Oracle) folio_add_lru(new); 17640d698e25SMatthew Wilcox (Oracle) *foliop = new; 17650142ef6cSHugh Dickins } 1766bde05d1cSHugh Dickins 1767907ea17eSMatthew Wilcox (Oracle) folio_clear_swapcache(old); 1768907ea17eSMatthew Wilcox (Oracle) old->private = NULL; 1769bde05d1cSHugh Dickins 1770907ea17eSMatthew Wilcox (Oracle) folio_unlock(old); 1771907ea17eSMatthew Wilcox (Oracle) folio_put_refs(old, 2); 17720142ef6cSHugh Dickins return error; 1773bde05d1cSHugh Dickins } 1774bde05d1cSHugh Dickins 17756cec2b95SMiaohe Lin static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, 17766cec2b95SMiaohe Lin struct folio *folio, swp_entry_t swap) 17776cec2b95SMiaohe Lin { 17786cec2b95SMiaohe Lin struct address_space *mapping = inode->i_mapping; 17796cec2b95SMiaohe Lin struct shmem_inode_info *info = SHMEM_I(inode); 17806cec2b95SMiaohe Lin swp_entry_t swapin_error; 17816cec2b95SMiaohe Lin void *old; 17826cec2b95SMiaohe Lin 178315520a3fSPeter Xu swapin_error = make_swapin_error_entry(); 17846cec2b95SMiaohe Lin old = xa_cmpxchg_irq(&mapping->i_pages, index, 17856cec2b95SMiaohe Lin swp_to_radix_entry(swap), 17866cec2b95SMiaohe Lin swp_to_radix_entry(swapin_error), 0); 17876cec2b95SMiaohe Lin if (old != swp_to_radix_entry(swap)) 17886cec2b95SMiaohe Lin return; 17896cec2b95SMiaohe Lin 17906cec2b95SMiaohe Lin folio_wait_writeback(folio); 179175fa68a5SMatthew Wilcox (Oracle) delete_from_swap_cache(folio); 17926cec2b95SMiaohe Lin spin_lock_irq(&info->lock); 17936cec2b95SMiaohe Lin /* 17946cec2b95SMiaohe Lin * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't 17956cec2b95SMiaohe Lin * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in 17966cec2b95SMiaohe Lin * shmem_evict_inode. 17976cec2b95SMiaohe Lin */ 17986cec2b95SMiaohe Lin info->alloced--; 17996cec2b95SMiaohe Lin info->swapped--; 18006cec2b95SMiaohe Lin shmem_recalc_inode(inode); 18016cec2b95SMiaohe Lin spin_unlock_irq(&info->lock); 18026cec2b95SMiaohe Lin swap_free(swap); 18036cec2b95SMiaohe Lin } 18046cec2b95SMiaohe Lin 1805bde05d1cSHugh Dickins /* 1806833de10fSMiaohe Lin * Swap in the folio pointed to by *foliop. 1807833de10fSMiaohe Lin * Caller has to make sure that *foliop contains a valid swapped folio. 1808833de10fSMiaohe Lin * Returns 0 and the folio in foliop if success. 
On failure, returns the 1809833de10fSMiaohe Lin * error code and NULL in *foliop. 18101da177e4SLinus Torvalds */ 1811da08e9b7SMatthew Wilcox (Oracle) static int shmem_swapin_folio(struct inode *inode, pgoff_t index, 1812da08e9b7SMatthew Wilcox (Oracle) struct folio **foliop, enum sgp_type sgp, 1813c5bf121eSVineeth Remanan Pillai gfp_t gfp, struct vm_area_struct *vma, 18142b740303SSouptick Joarder vm_fault_t *fault_type) 18151da177e4SLinus Torvalds { 18161da177e4SLinus Torvalds struct address_space *mapping = inode->i_mapping; 181723f919d4SArnd Bergmann struct shmem_inode_info *info = SHMEM_I(inode); 181804f94e3fSDan Schatzberg struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; 1819cbc2bd98SKairui Song struct swap_info_struct *si; 1820da08e9b7SMatthew Wilcox (Oracle) struct folio *folio = NULL; 18211da177e4SLinus Torvalds swp_entry_t swap; 18221da177e4SLinus Torvalds int error; 18231da177e4SLinus Torvalds 1824da08e9b7SMatthew Wilcox (Oracle) VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); 1825da08e9b7SMatthew Wilcox (Oracle) swap = radix_to_swp_entry(*foliop); 1826da08e9b7SMatthew Wilcox (Oracle) *foliop = NULL; 182754af6042SHugh Dickins 18286cec2b95SMiaohe Lin if (is_swapin_error_entry(swap)) 18296cec2b95SMiaohe Lin return -EIO; 18306cec2b95SMiaohe Lin 1831cbc2bd98SKairui Song si = get_swap_device(swap); 1832cbc2bd98SKairui Song if (!si) { 1833cbc2bd98SKairui Song if (!shmem_confirm_swap(mapping, index, swap)) 1834cbc2bd98SKairui Song return -EEXIST; 1835cbc2bd98SKairui Song else 1836cbc2bd98SKairui Song return -EINVAL; 1837cbc2bd98SKairui Song } 1838cbc2bd98SKairui Song 18391da177e4SLinus Torvalds /* Look it up and read it in.. */ 18405739a81cSMatthew Wilcox (Oracle) folio = swap_cache_get_folio(swap, NULL, 0); 18415739a81cSMatthew Wilcox (Oracle) if (!folio) { 18429e18eb29SAndres Lagar-Cavilla /* Or update major stats only when swapin succeeds?? */ 18439e18eb29SAndres Lagar-Cavilla if (fault_type) { 184468da9f05SHugh Dickins *fault_type |= VM_FAULT_MAJOR; 18459e18eb29SAndres Lagar-Cavilla count_vm_event(PGMAJFAULT); 18462262185cSRoman Gushchin count_memcg_event_mm(charge_mm, PGMAJFAULT); 18479e18eb29SAndres Lagar-Cavilla } 18489e18eb29SAndres Lagar-Cavilla /* Here we actually start the io */ 18495739a81cSMatthew Wilcox (Oracle) folio = shmem_swapin(swap, gfp, info, index); 18505739a81cSMatthew Wilcox (Oracle) if (!folio) { 18511da177e4SLinus Torvalds error = -ENOMEM; 185254af6042SHugh Dickins goto failed; 1853285b2c4fSHugh Dickins } 18541da177e4SLinus Torvalds } 18551da177e4SLinus Torvalds 1856833de10fSMiaohe Lin /* We have to do this with folio locked to prevent races */ 1857da08e9b7SMatthew Wilcox (Oracle) folio_lock(folio); 1858da08e9b7SMatthew Wilcox (Oracle) if (!folio_test_swapcache(folio) || 1859da08e9b7SMatthew Wilcox (Oracle) folio_swap_entry(folio).val != swap.val || 1860d1899228SHugh Dickins !shmem_confirm_swap(mapping, index, swap)) { 1861c5bf121eSVineeth Remanan Pillai error = -EEXIST; 1862d1899228SHugh Dickins goto unlock; 1863bde05d1cSHugh Dickins } 1864da08e9b7SMatthew Wilcox (Oracle) if (!folio_test_uptodate(folio)) { 18651da177e4SLinus Torvalds error = -EIO; 186654af6042SHugh Dickins goto failed; 186754af6042SHugh Dickins } 1868da08e9b7SMatthew Wilcox (Oracle) folio_wait_writeback(folio); 186954af6042SHugh Dickins 18708a84802eSSteven Price /* 18718a84802eSSteven Price * Some architectures may have to restore extra metadata to the 1872da08e9b7SMatthew Wilcox (Oracle) * folio after reading from swap. 
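 * (For example, arm64 uses this hook to restore MTE memory tags that
 * were saved when the folio went out to swap.)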
18738a84802eSSteven Price */ 1874da08e9b7SMatthew Wilcox (Oracle) arch_swap_restore(swap, folio); 18758a84802eSSteven Price 1876069d849cSMatthew Wilcox (Oracle) if (shmem_should_replace_folio(folio, gfp)) { 18770d698e25SMatthew Wilcox (Oracle) error = shmem_replace_folio(&folio, gfp, info, index); 1878bde05d1cSHugh Dickins if (error) 187954af6042SHugh Dickins goto failed; 18801da177e4SLinus Torvalds } 18811da177e4SLinus Torvalds 1882b7dd44a1SMatthew Wilcox (Oracle) error = shmem_add_to_page_cache(folio, mapping, index, 18833fea5a49SJohannes Weiner swp_to_radix_entry(swap), gfp, 18843fea5a49SJohannes Weiner charge_mm); 188554af6042SHugh Dickins if (error) 188654af6042SHugh Dickins goto failed; 188754af6042SHugh Dickins 18884595ef88SKirill A. Shutemov spin_lock_irq(&info->lock); 188954af6042SHugh Dickins info->swapped--; 189054af6042SHugh Dickins shmem_recalc_inode(inode); 18914595ef88SKirill A. Shutemov spin_unlock_irq(&info->lock); 189227ab7006SHugh Dickins 189366d2f4d2SHugh Dickins if (sgp == SGP_WRITE) 1894da08e9b7SMatthew Wilcox (Oracle) folio_mark_accessed(folio); 189566d2f4d2SHugh Dickins 189675fa68a5SMatthew Wilcox (Oracle) delete_from_swap_cache(folio); 1897da08e9b7SMatthew Wilcox (Oracle) folio_mark_dirty(folio); 189827ab7006SHugh Dickins swap_free(swap); 1899cbc2bd98SKairui Song put_swap_device(si); 190027ab7006SHugh Dickins 1901da08e9b7SMatthew Wilcox (Oracle) *foliop = folio; 1902c5bf121eSVineeth Remanan Pillai return 0; 1903c5bf121eSVineeth Remanan Pillai failed: 1904c5bf121eSVineeth Remanan Pillai if (!shmem_confirm_swap(mapping, index, swap)) 1905c5bf121eSVineeth Remanan Pillai error = -EEXIST; 19066cec2b95SMiaohe Lin if (error == -EIO) 19076cec2b95SMiaohe Lin shmem_set_folio_swapin_error(inode, index, folio, swap); 1908c5bf121eSVineeth Remanan Pillai unlock: 1909da08e9b7SMatthew Wilcox (Oracle) if (folio) { 1910da08e9b7SMatthew Wilcox (Oracle) folio_unlock(folio); 1911da08e9b7SMatthew Wilcox (Oracle) folio_put(folio); 1912c5bf121eSVineeth Remanan Pillai } 1913cbc2bd98SKairui Song put_swap_device(si); 1914c5bf121eSVineeth Remanan Pillai 1915c5bf121eSVineeth Remanan Pillai return error; 1916c5bf121eSVineeth Remanan Pillai } 1917c5bf121eSVineeth Remanan Pillai 1918c5bf121eSVineeth Remanan Pillai /* 1919fc26babbSMatthew Wilcox (Oracle) * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate 1920c5bf121eSVineeth Remanan Pillai * 1921c5bf121eSVineeth Remanan Pillai * If we allocate a new one we do not mark it dirty. That's up to the 1922c5bf121eSVineeth Remanan Pillai * vm. If we swap it in we mark it dirty since we also free the swap 1923c5bf121eSVineeth Remanan Pillai * entry since a page cannot live in both the swap and page cache. 1924c5bf121eSVineeth Remanan Pillai * 1925c949b097SAxel Rasmussen * vma, vmf, and fault_type are only supplied by shmem_fault: 1926c5bf121eSVineeth Remanan Pillai * otherwise they are NULL. 
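 *
 * A minimal usage sketch of the shmem_get_folio() wrapper defined a
 * little further below (caller-side variable names assumed, error
 * handling trimmed):
 *
 *	struct folio *folio;
 *	int err = shmem_get_folio(inode, index, &folio, SGP_WRITE);
 *	if (err)
 *		return err;
 *	// ... fill the locked, referenced folio ...
 *	folio_mark_dirty(folio);
 *	folio_unlock(folio);
 *	folio_put(folio);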
1927c5bf121eSVineeth Remanan Pillai */ 1928fc26babbSMatthew Wilcox (Oracle) static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, 1929fc26babbSMatthew Wilcox (Oracle) struct folio **foliop, enum sgp_type sgp, gfp_t gfp, 1930c5bf121eSVineeth Remanan Pillai struct vm_area_struct *vma, struct vm_fault *vmf, 1931c5bf121eSVineeth Remanan Pillai vm_fault_t *fault_type) 1932c5bf121eSVineeth Remanan Pillai { 1933c5bf121eSVineeth Remanan Pillai struct address_space *mapping = inode->i_mapping; 1934c5bf121eSVineeth Remanan Pillai struct shmem_inode_info *info = SHMEM_I(inode); 1935c5bf121eSVineeth Remanan Pillai struct shmem_sb_info *sbinfo; 1936c5bf121eSVineeth Remanan Pillai struct mm_struct *charge_mm; 1937b7dd44a1SMatthew Wilcox (Oracle) struct folio *folio; 19386fe7d712SLukas Bulwahn pgoff_t hindex; 1939164cc4feSRik van Riel gfp_t huge_gfp; 1940c5bf121eSVineeth Remanan Pillai int error; 1941c5bf121eSVineeth Remanan Pillai int once = 0; 1942c5bf121eSVineeth Remanan Pillai int alloced = 0; 1943c5bf121eSVineeth Remanan Pillai 1944c5bf121eSVineeth Remanan Pillai if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) 1945c5bf121eSVineeth Remanan Pillai return -EFBIG; 1946c5bf121eSVineeth Remanan Pillai repeat: 1947c5bf121eSVineeth Remanan Pillai if (sgp <= SGP_CACHE && 1948c5bf121eSVineeth Remanan Pillai ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 1949c5bf121eSVineeth Remanan Pillai return -EINVAL; 1950c5bf121eSVineeth Remanan Pillai } 1951c5bf121eSVineeth Remanan Pillai 1952c5bf121eSVineeth Remanan Pillai sbinfo = SHMEM_SB(inode->i_sb); 195304f94e3fSDan Schatzberg charge_mm = vma ? vma->vm_mm : NULL; 1954c5bf121eSVineeth Remanan Pillai 1955aaeb94ebSChristoph Hellwig folio = filemap_get_entry(mapping, index); 1956b1d0ec3aSMatthew Wilcox (Oracle) if (folio && vma && userfaultfd_minor(vma)) { 1957aaeb94ebSChristoph Hellwig if (!xa_is_value(folio)) 1958b1d0ec3aSMatthew Wilcox (Oracle) folio_put(folio); 1959c949b097SAxel Rasmussen *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); 1960c949b097SAxel Rasmussen return 0; 1961c949b097SAxel Rasmussen } 1962c949b097SAxel Rasmussen 1963b1d0ec3aSMatthew Wilcox (Oracle) if (xa_is_value(folio)) { 1964da08e9b7SMatthew Wilcox (Oracle) error = shmem_swapin_folio(inode, index, &folio, 1965c5bf121eSVineeth Remanan Pillai sgp, gfp, vma, fault_type); 1966c5bf121eSVineeth Remanan Pillai if (error == -EEXIST) 1967c5bf121eSVineeth Remanan Pillai goto repeat; 1968c5bf121eSVineeth Remanan Pillai 1969fc26babbSMatthew Wilcox (Oracle) *foliop = folio; 1970c5bf121eSVineeth Remanan Pillai return error; 1971c5bf121eSVineeth Remanan Pillai } 1972c5bf121eSVineeth Remanan Pillai 1973b1d0ec3aSMatthew Wilcox (Oracle) if (folio) { 1974aaeb94ebSChristoph Hellwig folio_lock(folio); 1975aaeb94ebSChristoph Hellwig 1976aaeb94ebSChristoph Hellwig /* Has the folio been truncated or swapped out? 
*/
*/ 1977aaeb94ebSChristoph Hellwig if (unlikely(folio->mapping != mapping)) { 1978aaeb94ebSChristoph Hellwig folio_unlock(folio); 1979aaeb94ebSChristoph Hellwig folio_put(folio); 1980aaeb94ebSChristoph Hellwig goto repeat; 1981aaeb94ebSChristoph Hellwig } 1982acdd9f8eSHugh Dickins if (sgp == SGP_WRITE) 1983b1d0ec3aSMatthew Wilcox (Oracle) folio_mark_accessed(folio); 1984b1d0ec3aSMatthew Wilcox (Oracle) if (folio_test_uptodate(folio)) 1985acdd9f8eSHugh Dickins goto out; 1986fc26babbSMatthew Wilcox (Oracle) /* fallocated folio */ 1987c5bf121eSVineeth Remanan Pillai if (sgp != SGP_READ) 1988c5bf121eSVineeth Remanan Pillai goto clear; 1989b1d0ec3aSMatthew Wilcox (Oracle) folio_unlock(folio); 1990b1d0ec3aSMatthew Wilcox (Oracle) folio_put(folio); 1991c5bf121eSVineeth Remanan Pillai } 1992c5bf121eSVineeth Remanan Pillai 1993c5bf121eSVineeth Remanan Pillai /* 1994fc26babbSMatthew Wilcox (Oracle) * SGP_READ: succeed on hole, with NULL folio, letting caller zero. 1995fc26babbSMatthew Wilcox (Oracle) * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. 1996acdd9f8eSHugh Dickins */ 1997fc26babbSMatthew Wilcox (Oracle) *foliop = NULL; 1998acdd9f8eSHugh Dickins if (sgp == SGP_READ) 1999acdd9f8eSHugh Dickins return 0; 2000acdd9f8eSHugh Dickins if (sgp == SGP_NOALLOC) 2001acdd9f8eSHugh Dickins return -ENOENT; 2002acdd9f8eSHugh Dickins 2003acdd9f8eSHugh Dickins /* 2004acdd9f8eSHugh Dickins * Fast cache lookup and swap lookup did not find it: allocate. 2005c5bf121eSVineeth Remanan Pillai */ 2006c5bf121eSVineeth Remanan Pillai 2007cfda0526SMike Rapoport if (vma && userfaultfd_missing(vma)) { 2008cfda0526SMike Rapoport *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); 2009cfda0526SMike Rapoport return 0; 2010cfda0526SMike Rapoport } 2011cfda0526SMike Rapoport 20122cf13384SDavid Stevens if (!shmem_is_huge(inode, index, false, 20132cf13384SDavid Stevens vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) 2014800d8c63SKirill A. Shutemov goto alloc_nohuge; 201527d80fa2SKees Cook 2016164cc4feSRik van Riel huge_gfp = vma_thp_gfp_mask(vma); 201778cc8cdcSRik van Riel huge_gfp = limit_gfp_mask(huge_gfp, gfp); 2018b1d0ec3aSMatthew Wilcox (Oracle) folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); 2019b1d0ec3aSMatthew Wilcox (Oracle) if (IS_ERR(folio)) { 2020c5bf121eSVineeth Remanan Pillai alloc_nohuge: 2021b1d0ec3aSMatthew Wilcox (Oracle) folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); 202254af6042SHugh Dickins } 2023b1d0ec3aSMatthew Wilcox (Oracle) if (IS_ERR(folio)) { 2024779750d2SKirill A. Shutemov int retry = 5; 2025c5bf121eSVineeth Remanan Pillai 2026b1d0ec3aSMatthew Wilcox (Oracle) error = PTR_ERR(folio); 2027b1d0ec3aSMatthew Wilcox (Oracle) folio = NULL; 2028779750d2SKirill A. Shutemov if (error != -ENOSPC) 2029c5bf121eSVineeth Remanan Pillai goto unlock; 2030779750d2SKirill A. Shutemov /* 2031fc26babbSMatthew Wilcox (Oracle) * Try to reclaim some space by splitting a large folio 2032779750d2SKirill A. Shutemov * beyond i_size on the filesystem. 2033779750d2SKirill A. Shutemov */ 2034779750d2SKirill A. Shutemov while (retry--) { 2035779750d2SKirill A. Shutemov int ret; 2036c5bf121eSVineeth Remanan Pillai 2037779750d2SKirill A. Shutemov ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); 2038779750d2SKirill A. Shutemov if (ret == SHRINK_STOP) 2039779750d2SKirill A. Shutemov break; 2040779750d2SKirill A. Shutemov if (ret) 2041779750d2SKirill A. Shutemov goto alloc_nohuge; 2042779750d2SKirill A. 
Shutemov } 2043c5bf121eSVineeth Remanan Pillai goto unlock; 2044800d8c63SKirill A. Shutemov } 2045800d8c63SKirill A. Shutemov 2046b1d0ec3aSMatthew Wilcox (Oracle) hindex = round_down(index, folio_nr_pages(folio)); 2047800d8c63SKirill A. Shutemov 204866d2f4d2SHugh Dickins if (sgp == SGP_WRITE) 2049b1d0ec3aSMatthew Wilcox (Oracle) __folio_set_referenced(folio); 205066d2f4d2SHugh Dickins 2051b7dd44a1SMatthew Wilcox (Oracle) error = shmem_add_to_page_cache(folio, mapping, hindex, 20523fea5a49SJohannes Weiner NULL, gfp & GFP_RECLAIM_MASK, 20533fea5a49SJohannes Weiner charge_mm); 20543fea5a49SJohannes Weiner if (error) 2055800d8c63SKirill A. Shutemov goto unacct; 2056b1d0ec3aSMatthew Wilcox (Oracle) folio_add_lru(folio); 205754af6042SHugh Dickins 20584595ef88SKirill A. Shutemov spin_lock_irq(&info->lock); 2059b1d0ec3aSMatthew Wilcox (Oracle) info->alloced += folio_nr_pages(folio); 206054af6042SHugh Dickins shmem_recalc_inode(inode); 20614595ef88SKirill A. Shutemov spin_unlock_irq(&info->lock); 20621635f6a7SHugh Dickins alloced = true; 206354af6042SHugh Dickins 2064b1d0ec3aSMatthew Wilcox (Oracle) if (folio_test_pmd_mappable(folio) && 2065779750d2SKirill A. Shutemov DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 2066fc26babbSMatthew Wilcox (Oracle) folio_next_index(folio) - 1) { 2067779750d2SKirill A. Shutemov /* 2068fc26babbSMatthew Wilcox (Oracle) * Part of the large folio is beyond i_size: subject 2069779750d2SKirill A. Shutemov * to shrink under memory pressure. 2070779750d2SKirill A. Shutemov */ 2071779750d2SKirill A. Shutemov spin_lock(&sbinfo->shrinklist_lock); 2072d041353dSCong Wang /* 2073d041353dSCong Wang * _careful to defend against unlocked access to 2074d041353dSCong Wang * ->shrink_list in shmem_unused_huge_shrink() 2075d041353dSCong Wang */ 2076d041353dSCong Wang if (list_empty_careful(&info->shrinklist)) { 2077779750d2SKirill A. Shutemov list_add_tail(&info->shrinklist, 2078779750d2SKirill A. Shutemov &sbinfo->shrinklist); 2079779750d2SKirill A. Shutemov sbinfo->shrinklist_len++; 2080779750d2SKirill A. Shutemov } 2081779750d2SKirill A. Shutemov spin_unlock(&sbinfo->shrinklist_lock); 2082779750d2SKirill A. Shutemov } 2083779750d2SKirill A. Shutemov 2084ec9516fbSHugh Dickins /* 2085fc26babbSMatthew Wilcox (Oracle) * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. 20861635f6a7SHugh Dickins */ 20871635f6a7SHugh Dickins if (sgp == SGP_FALLOC) 20881635f6a7SHugh Dickins sgp = SGP_WRITE; 20891635f6a7SHugh Dickins clear: 20901635f6a7SHugh Dickins /* 2091fc26babbSMatthew Wilcox (Oracle) * Let SGP_WRITE caller clear ends if write does not fill folio; 2092fc26babbSMatthew Wilcox (Oracle) * but SGP_FALLOC on a folio fallocated earlier must initialize 20931635f6a7SHugh Dickins * it now, lest undo on failure cancel our earlier guarantee. 2094ec9516fbSHugh Dickins */ 2095b1d0ec3aSMatthew Wilcox (Oracle) if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { 2096b1d0ec3aSMatthew Wilcox (Oracle) long i, n = folio_nr_pages(folio); 2097800d8c63SKirill A. Shutemov 2098b1d0ec3aSMatthew Wilcox (Oracle) for (i = 0; i < n; i++) 2099b1d0ec3aSMatthew Wilcox (Oracle) clear_highpage(folio_page(folio, i)); 2100b1d0ec3aSMatthew Wilcox (Oracle) flush_dcache_folio(folio); 2101b1d0ec3aSMatthew Wilcox (Oracle) folio_mark_uptodate(folio); 2102ec9516fbSHugh Dickins } 2103bde05d1cSHugh Dickins 210454af6042SHugh Dickins /* Perhaps the file has been truncated since we checked */ 210575edd345SHugh Dickins if (sgp <= SGP_CACHE && 210609cbfeafSKirill A. 
Shutemov ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 2107267a4c76SHugh Dickins if (alloced) { 2108b1d0ec3aSMatthew Wilcox (Oracle) folio_clear_dirty(folio); 2109b1d0ec3aSMatthew Wilcox (Oracle) filemap_remove_folio(folio); 21104595ef88SKirill A. Shutemov spin_lock_irq(&info->lock); 2111267a4c76SHugh Dickins shmem_recalc_inode(inode); 21124595ef88SKirill A. Shutemov spin_unlock_irq(&info->lock); 2113267a4c76SHugh Dickins } 211454af6042SHugh Dickins error = -EINVAL; 2115267a4c76SHugh Dickins goto unlock; 2116ff36b801SShaohua Li } 211763ec1973SMatthew Wilcox (Oracle) out: 2118fc26babbSMatthew Wilcox (Oracle) *foliop = folio; 211954af6042SHugh Dickins return 0; 2120d00806b1SNick Piggin 2121d0217ac0SNick Piggin /* 212254af6042SHugh Dickins * Error recovery. 21231da177e4SLinus Torvalds */ 212454af6042SHugh Dickins unacct: 2125b1d0ec3aSMatthew Wilcox (Oracle) shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); 2126800d8c63SKirill A. Shutemov 2127b1d0ec3aSMatthew Wilcox (Oracle) if (folio_test_large(folio)) { 2128b1d0ec3aSMatthew Wilcox (Oracle) folio_unlock(folio); 2129b1d0ec3aSMatthew Wilcox (Oracle) folio_put(folio); 2130800d8c63SKirill A. Shutemov goto alloc_nohuge; 2131800d8c63SKirill A. Shutemov } 2132d1899228SHugh Dickins unlock: 2133b1d0ec3aSMatthew Wilcox (Oracle) if (folio) { 2134b1d0ec3aSMatthew Wilcox (Oracle) folio_unlock(folio); 2135b1d0ec3aSMatthew Wilcox (Oracle) folio_put(folio); 213654af6042SHugh Dickins } 213754af6042SHugh Dickins if (error == -ENOSPC && !once++) { 21384595ef88SKirill A. Shutemov spin_lock_irq(&info->lock); 213954af6042SHugh Dickins shmem_recalc_inode(inode); 21404595ef88SKirill A. Shutemov spin_unlock_irq(&info->lock); 21411da177e4SLinus Torvalds goto repeat; 2142d8dc74f2SAdrian Bunk } 21437f4446eeSMatthew Wilcox if (error == -EEXIST) 214454af6042SHugh Dickins goto repeat; 214554af6042SHugh Dickins return error; 21461da177e4SLinus Torvalds } 21471da177e4SLinus Torvalds 21484e1fc793SMatthew Wilcox (Oracle) int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, 21494e1fc793SMatthew Wilcox (Oracle) enum sgp_type sgp) 21504e1fc793SMatthew Wilcox (Oracle) { 21514e1fc793SMatthew Wilcox (Oracle) return shmem_get_folio_gfp(inode, index, foliop, sgp, 21524e1fc793SMatthew Wilcox (Oracle) mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); 21534e1fc793SMatthew Wilcox (Oracle) } 21544e1fc793SMatthew Wilcox (Oracle) 215510d20bd2SLinus Torvalds /* 215610d20bd2SLinus Torvalds * This is like autoremove_wake_function, but it removes the wait queue 215710d20bd2SLinus Torvalds * entry unconditionally - even if something else had already woken the 215810d20bd2SLinus Torvalds * target. 
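 *
 * [Editor's aside -- an illustrative userspace sketch, not part of shmem.c,
 *  of the behaviour that shmem_fault() and shmem_fallocate() below have to
 *  coordinate: punching a hole in a tmpfs file zaps any mapped copies, and a
 *  later fault maps a fresh zero-filled page.  Assumes a memfd (tmpfs-backed)
 *  file; error handling is omitted for brevity.]
 *
 *      #define _GNU_SOURCE
 *      #include <assert.h>
 *      #include <fcntl.h>
 *      #include <string.h>
 *      #include <sys/mman.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              int fd = memfd_create("hole-demo", 0);  // tmpfs-backed file
 *              char *p;
 *
 *              ftruncate(fd, 4096);
 *              p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *              memset(p, 0xaa, 4096);          // fault in and dirty the page
 *              fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *                        0, 4096);             // hole punch via shmem_fallocate()
 *              assert(p[0] == 0);              // next fault sees a new zeroed page
 *              return 0;
 *      }
 *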
215910d20bd2SLinus Torvalds */ 2160ac6424b9SIngo Molnar static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 216110d20bd2SLinus Torvalds { 216210d20bd2SLinus Torvalds int ret = default_wake_function(wait, mode, sync, key); 21632055da97SIngo Molnar list_del_init(&wait->entry); 216410d20bd2SLinus Torvalds return ret; 216510d20bd2SLinus Torvalds } 216610d20bd2SLinus Torvalds 216720acce67SSouptick Joarder static vm_fault_t shmem_fault(struct vm_fault *vmf) 21681da177e4SLinus Torvalds { 216911bac800SDave Jiang struct vm_area_struct *vma = vmf->vma; 2170496ad9aaSAl Viro struct inode *inode = file_inode(vma->vm_file); 21719e18eb29SAndres Lagar-Cavilla gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 217268a54100SMatthew Wilcox (Oracle) struct folio *folio = NULL; 217320acce67SSouptick Joarder int err; 217420acce67SSouptick Joarder vm_fault_t ret = VM_FAULT_LOCKED; 21751da177e4SLinus Torvalds 2176f00cdc6dSHugh Dickins /* 2177f00cdc6dSHugh Dickins * Trinity finds that probing a hole which tmpfs is punching can 2178f00cdc6dSHugh Dickins * prevent the hole-punch from ever completing: which in turn 21799608703eSJan Kara * locks writers out with its hold on i_rwsem. So refrain from 21808e205f77SHugh Dickins * faulting pages into the hole while it's being punched. Although 21818e205f77SHugh Dickins * shmem_undo_range() does remove the additions, it may be unable to 21828e205f77SHugh Dickins * keep up, as each new page needs its own unmap_mapping_range() call, 21838e205f77SHugh Dickins * and the i_mmap tree grows ever slower to scan if new vmas are added. 21848e205f77SHugh Dickins * 21858e205f77SHugh Dickins * It does not matter if we sometimes reach this check just before the 21868e205f77SHugh Dickins * hole-punch begins, so that one fault then races with the punch: 21878e205f77SHugh Dickins * we just need to make racing faults a rare case. 21888e205f77SHugh Dickins * 21898e205f77SHugh Dickins * The implementation below would be much simpler if we just used a 21909608703eSJan Kara * standard mutex or completion: but we cannot take i_rwsem in fault, 21918e205f77SHugh Dickins * and bloating every shmem inode for this unlikely case would be sad. 2192f00cdc6dSHugh Dickins */ 2193f00cdc6dSHugh Dickins if (unlikely(inode->i_private)) { 2194f00cdc6dSHugh Dickins struct shmem_falloc *shmem_falloc; 2195f00cdc6dSHugh Dickins 2196f00cdc6dSHugh Dickins spin_lock(&inode->i_lock); 2197f00cdc6dSHugh Dickins shmem_falloc = inode->i_private; 21988e205f77SHugh Dickins if (shmem_falloc && 21998e205f77SHugh Dickins shmem_falloc->waitq && 22008e205f77SHugh Dickins vmf->pgoff >= shmem_falloc->start && 22018e205f77SHugh Dickins vmf->pgoff < shmem_falloc->next) { 22028897c1b1SKirill A. Shutemov struct file *fpin; 22038e205f77SHugh Dickins wait_queue_head_t *shmem_falloc_waitq; 220410d20bd2SLinus Torvalds DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); 22058e205f77SHugh Dickins 22068e205f77SHugh Dickins ret = VM_FAULT_NOPAGE; 22078897c1b1SKirill A. Shutemov fpin = maybe_unlock_mmap_for_io(vmf, NULL); 22088897c1b1SKirill A. 
Shutemov if (fpin) 22098e205f77SHugh Dickins ret = VM_FAULT_RETRY; 22108e205f77SHugh Dickins 22118e205f77SHugh Dickins shmem_falloc_waitq = shmem_falloc->waitq; 22128e205f77SHugh Dickins prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 22138e205f77SHugh Dickins TASK_UNINTERRUPTIBLE); 22148e205f77SHugh Dickins spin_unlock(&inode->i_lock); 22158e205f77SHugh Dickins schedule(); 22168e205f77SHugh Dickins 22178e205f77SHugh Dickins /* 22188e205f77SHugh Dickins * shmem_falloc_waitq points into the shmem_fallocate() 22198e205f77SHugh Dickins * stack of the hole-punching task: shmem_falloc_waitq 22208e205f77SHugh Dickins * is usually invalid by the time we reach here, but 22218e205f77SHugh Dickins * finish_wait() does not dereference it in that case; 22228e205f77SHugh Dickins * though i_lock needed lest racing with wake_up_all(). 22238e205f77SHugh Dickins */ 22248e205f77SHugh Dickins spin_lock(&inode->i_lock); 22258e205f77SHugh Dickins finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 22268e205f77SHugh Dickins spin_unlock(&inode->i_lock); 22278897c1b1SKirill A. Shutemov 22288897c1b1SKirill A. Shutemov if (fpin) 22298897c1b1SKirill A. Shutemov fput(fpin); 22308e205f77SHugh Dickins return ret; 2231f00cdc6dSHugh Dickins } 22328e205f77SHugh Dickins spin_unlock(&inode->i_lock); 2233f00cdc6dSHugh Dickins } 2234f00cdc6dSHugh Dickins 223568a54100SMatthew Wilcox (Oracle) err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, 2236cfda0526SMike Rapoport gfp, vma, vmf, &ret); 223720acce67SSouptick Joarder if (err) 223820acce67SSouptick Joarder return vmf_error(err); 223968a54100SMatthew Wilcox (Oracle) if (folio) 224068a54100SMatthew Wilcox (Oracle) vmf->page = folio_file_page(folio, vmf->pgoff); 224168da9f05SHugh Dickins return ret; 22421da177e4SLinus Torvalds } 22431da177e4SLinus Torvalds 2244c01d5b30SHugh Dickins unsigned long shmem_get_unmapped_area(struct file *file, 2245c01d5b30SHugh Dickins unsigned long uaddr, unsigned long len, 2246c01d5b30SHugh Dickins unsigned long pgoff, unsigned long flags) 2247c01d5b30SHugh Dickins { 2248c01d5b30SHugh Dickins unsigned long (*get_area)(struct file *, 2249c01d5b30SHugh Dickins unsigned long, unsigned long, unsigned long, unsigned long); 2250c01d5b30SHugh Dickins unsigned long addr; 2251c01d5b30SHugh Dickins unsigned long offset; 2252c01d5b30SHugh Dickins unsigned long inflated_len; 2253c01d5b30SHugh Dickins unsigned long inflated_addr; 2254c01d5b30SHugh Dickins unsigned long inflated_offset; 2255c01d5b30SHugh Dickins 2256c01d5b30SHugh Dickins if (len > TASK_SIZE) 2257c01d5b30SHugh Dickins return -ENOMEM; 2258c01d5b30SHugh Dickins 2259c01d5b30SHugh Dickins get_area = current->mm->get_unmapped_area; 2260c01d5b30SHugh Dickins addr = get_area(file, uaddr, len, pgoff, flags); 2261c01d5b30SHugh Dickins 2262396bcc52SMatthew Wilcox (Oracle) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 2263c01d5b30SHugh Dickins return addr; 2264c01d5b30SHugh Dickins if (IS_ERR_VALUE(addr)) 2265c01d5b30SHugh Dickins return addr; 2266c01d5b30SHugh Dickins if (addr & ~PAGE_MASK) 2267c01d5b30SHugh Dickins return addr; 2268c01d5b30SHugh Dickins if (addr > TASK_SIZE - len) 2269c01d5b30SHugh Dickins return addr; 2270c01d5b30SHugh Dickins 2271c01d5b30SHugh Dickins if (shmem_huge == SHMEM_HUGE_DENY) 2272c01d5b30SHugh Dickins return addr; 2273c01d5b30SHugh Dickins if (len < HPAGE_PMD_SIZE) 2274c01d5b30SHugh Dickins return addr; 2275c01d5b30SHugh Dickins if (flags & MAP_FIXED) 2276c01d5b30SHugh Dickins return addr; 2277c01d5b30SHugh Dickins /* 2278c01d5b30SHugh Dickins * Our priority 
is to support MAP_SHARED mapped hugely; 2279c01d5b30SHugh Dickins * and support MAP_PRIVATE mapped hugely too, until it is COWed. 228099158997SKirill A. Shutemov * But if caller specified an address hint and we allocated area there 228199158997SKirill A. Shutemov * successfully, respect that as before. 2282c01d5b30SHugh Dickins */ 228399158997SKirill A. Shutemov if (uaddr == addr) 2284c01d5b30SHugh Dickins return addr; 2285c01d5b30SHugh Dickins 2286c01d5b30SHugh Dickins if (shmem_huge != SHMEM_HUGE_FORCE) { 2287c01d5b30SHugh Dickins struct super_block *sb; 2288c01d5b30SHugh Dickins 2289c01d5b30SHugh Dickins if (file) { 2290c01d5b30SHugh Dickins VM_BUG_ON(file->f_op != &shmem_file_operations); 2291c01d5b30SHugh Dickins sb = file_inode(file)->i_sb; 2292c01d5b30SHugh Dickins } else { 2293c01d5b30SHugh Dickins /* 2294c01d5b30SHugh Dickins * Called directly from mm/mmap.c, or drivers/char/mem.c 2295c01d5b30SHugh Dickins * for "/dev/zero", to create a shared anonymous object. 2296c01d5b30SHugh Dickins */ 2297c01d5b30SHugh Dickins if (IS_ERR(shm_mnt)) 2298c01d5b30SHugh Dickins return addr; 2299c01d5b30SHugh Dickins sb = shm_mnt->mnt_sb; 2300c01d5b30SHugh Dickins } 23013089bf61SToshi Kani if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) 2302c01d5b30SHugh Dickins return addr; 2303c01d5b30SHugh Dickins } 2304c01d5b30SHugh Dickins 2305c01d5b30SHugh Dickins offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); 2306c01d5b30SHugh Dickins if (offset && offset + len < 2 * HPAGE_PMD_SIZE) 2307c01d5b30SHugh Dickins return addr; 2308c01d5b30SHugh Dickins if ((addr & (HPAGE_PMD_SIZE-1)) == offset) 2309c01d5b30SHugh Dickins return addr; 2310c01d5b30SHugh Dickins 2311c01d5b30SHugh Dickins inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; 2312c01d5b30SHugh Dickins if (inflated_len > TASK_SIZE) 2313c01d5b30SHugh Dickins return addr; 2314c01d5b30SHugh Dickins if (inflated_len < len) 2315c01d5b30SHugh Dickins return addr; 2316c01d5b30SHugh Dickins 231799158997SKirill A. 
Shutemov inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); 2318c01d5b30SHugh Dickins if (IS_ERR_VALUE(inflated_addr)) 2319c01d5b30SHugh Dickins return addr; 2320c01d5b30SHugh Dickins if (inflated_addr & ~PAGE_MASK) 2321c01d5b30SHugh Dickins return addr; 2322c01d5b30SHugh Dickins 2323c01d5b30SHugh Dickins inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); 2324c01d5b30SHugh Dickins inflated_addr += offset - inflated_offset; 2325c01d5b30SHugh Dickins if (inflated_offset > offset) 2326c01d5b30SHugh Dickins inflated_addr += HPAGE_PMD_SIZE; 2327c01d5b30SHugh Dickins 2328c01d5b30SHugh Dickins if (inflated_addr > TASK_SIZE - len) 2329c01d5b30SHugh Dickins return addr; 2330c01d5b30SHugh Dickins return inflated_addr; 2331c01d5b30SHugh Dickins } 2332c01d5b30SHugh Dickins 23331da177e4SLinus Torvalds #ifdef CONFIG_NUMA 233441ffe5d5SHugh Dickins static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 23351da177e4SLinus Torvalds { 2336496ad9aaSAl Viro struct inode *inode = file_inode(vma->vm_file); 233741ffe5d5SHugh Dickins return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 23381da177e4SLinus Torvalds } 23391da177e4SLinus Torvalds 2340d8dc74f2SAdrian Bunk static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 2341d8dc74f2SAdrian Bunk unsigned long addr) 23421da177e4SLinus Torvalds { 2343496ad9aaSAl Viro struct inode *inode = file_inode(vma->vm_file); 234441ffe5d5SHugh Dickins pgoff_t index; 23451da177e4SLinus Torvalds 234641ffe5d5SHugh Dickins index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 234741ffe5d5SHugh Dickins return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 23481da177e4SLinus Torvalds } 23491da177e4SLinus Torvalds #endif 23501da177e4SLinus Torvalds 2351d7c9e99aSAlexey Gladkov int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 23521da177e4SLinus Torvalds { 2353496ad9aaSAl Viro struct inode *inode = file_inode(file); 23541da177e4SLinus Torvalds struct shmem_inode_info *info = SHMEM_I(inode); 23551da177e4SLinus Torvalds int retval = -ENOMEM; 23561da177e4SLinus Torvalds 2357ea0dfeb4SHugh Dickins /* 2358ea0dfeb4SHugh Dickins * What serializes the accesses to info->flags? 2359ea0dfeb4SHugh Dickins * ipc_lock_object() when called from shmctl_do_lock(), 2360ea0dfeb4SHugh Dickins * no serialization needed when called from shm_destroy(). 
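 *
 * [Editor's aside -- a minimal userspace sketch, not part of shmem.c, of the
 *  SysV path that ends up here: shmctl(SHM_LOCK) reaches shmem_lock() below
 *  and marks the segment's pages unevictable.  Assumes the caller has
 *  CAP_IPC_LOCK or enough RLIMIT_MEMLOCK headroom; error handling omitted.]
 *
 *      #include <string.h>
 *      #include <sys/ipc.h>
 *      #include <sys/shm.h>
 *
 *      int main(void)
 *      {
 *              int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
 *              char *p = shmat(id, NULL, 0);
 *
 *              shmctl(id, SHM_LOCK, NULL);     // shmem_lock(file, 1, ucounts)
 *              memset(p, 0, 4096);             // these pages will not be swapped
 *              shmctl(id, SHM_UNLOCK, NULL);   // shmem_lock(file, 0, ucounts)
 *              shmdt(p);
 *              shmctl(id, IPC_RMID, NULL);
 *              return 0;
 *      }
 *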
2361ea0dfeb4SHugh Dickins */ 23621da177e4SLinus Torvalds if (lock && !(info->flags & VM_LOCKED)) { 2363d7c9e99aSAlexey Gladkov if (!user_shm_lock(inode->i_size, ucounts)) 23641da177e4SLinus Torvalds goto out_nomem; 23651da177e4SLinus Torvalds info->flags |= VM_LOCKED; 236689e004eaSLee Schermerhorn mapping_set_unevictable(file->f_mapping); 23671da177e4SLinus Torvalds } 2368d7c9e99aSAlexey Gladkov if (!lock && (info->flags & VM_LOCKED) && ucounts) { 2369d7c9e99aSAlexey Gladkov user_shm_unlock(inode->i_size, ucounts); 23701da177e4SLinus Torvalds info->flags &= ~VM_LOCKED; 237189e004eaSLee Schermerhorn mapping_clear_unevictable(file->f_mapping); 23721da177e4SLinus Torvalds } 23731da177e4SLinus Torvalds retval = 0; 237489e004eaSLee Schermerhorn 23751da177e4SLinus Torvalds out_nomem: 23761da177e4SLinus Torvalds return retval; 23771da177e4SLinus Torvalds } 23781da177e4SLinus Torvalds 23799b83a6a8SAdrian Bunk static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 23801da177e4SLinus Torvalds { 2381d09e8ca6SPasha Tatashin struct inode *inode = file_inode(file); 2382d09e8ca6SPasha Tatashin struct shmem_inode_info *info = SHMEM_I(inode); 238322247efdSPeter Xu int ret; 2384ab3948f5SJoel Fernandes (Google) 238522247efdSPeter Xu ret = seal_check_future_write(info->seals, vma); 238622247efdSPeter Xu if (ret) 238722247efdSPeter Xu return ret; 2388ab3948f5SJoel Fernandes (Google) 238951b0bff2SCatalin Marinas /* arm64 - allow memory tagging on RAM-based files */ 23901c71222eSSuren Baghdasaryan vm_flags_set(vma, VM_MTE_ALLOWED); 239151b0bff2SCatalin Marinas 23921da177e4SLinus Torvalds file_accessed(file); 2393d09e8ca6SPasha Tatashin /* This is anonymous shared memory if it is unlinked at the time of mmap */ 2394d09e8ca6SPasha Tatashin if (inode->i_nlink) 23951da177e4SLinus Torvalds vma->vm_ops = &shmem_vm_ops; 2396d09e8ca6SPasha Tatashin else 2397d09e8ca6SPasha Tatashin vma->vm_ops = &shmem_anon_vm_ops; 23981da177e4SLinus Torvalds return 0; 23991da177e4SLinus Torvalds } 24001da177e4SLinus Torvalds 2401cb241339SHugh Dickins #ifdef CONFIG_TMPFS_XATTR 2402cb241339SHugh Dickins static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 2403cb241339SHugh Dickins 2404cb241339SHugh Dickins /* 2405cb241339SHugh Dickins * chattr's fsflags are unrelated to extended attributes, 2406cb241339SHugh Dickins * but tmpfs has chosen to enable them under the same config option. 2407cb241339SHugh Dickins */ 2408cb241339SHugh Dickins static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2409e408e695STheodore Ts'o { 2410cb241339SHugh Dickins unsigned int i_flags = 0; 2411cb241339SHugh Dickins 2412cb241339SHugh Dickins if (fsflags & FS_NOATIME_FL) 2413cb241339SHugh Dickins i_flags |= S_NOATIME; 2414cb241339SHugh Dickins if (fsflags & FS_APPEND_FL) 2415cb241339SHugh Dickins i_flags |= S_APPEND; 2416cb241339SHugh Dickins if (fsflags & FS_IMMUTABLE_FL) 2417cb241339SHugh Dickins i_flags |= S_IMMUTABLE; 2418cb241339SHugh Dickins /* 2419cb241339SHugh Dickins * But FS_NODUMP_FL does not require any action in i_flags. 
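 *
 * [Editor's aside -- an illustrative userspace sketch, not part of shmem.c,
 *  of the chattr-style interface handled here: FS_IOC_SETFLAGS on a tmpfs
 *  file flows through the tmpfs fileattr handlers into
 *  shmem_set_inode_flags().  Setting FS_APPEND_FL or FS_IMMUTABLE_FL needs
 *  CAP_LINUX_IMMUTABLE; error handling omitted.]
 *
 *      #include <fcntl.h>
 *      #include <linux/fs.h>           // FS_IOC_GETFLAGS, FS_APPEND_FL, ...
 *      #include <sys/ioctl.h>
 *      #include <unistd.h>
 *
 *      int make_append_only(const char *path)  // e.g. a file on a tmpfs mount
 *      {
 *              int fd = open(path, O_RDONLY);
 *              int attr = 0;
 *
 *              ioctl(fd, FS_IOC_GETFLAGS, &attr);
 *              attr |= FS_APPEND_FL;           // becomes S_APPEND on the inode
 *              ioctl(fd, FS_IOC_SETFLAGS, &attr);
 *              return close(fd);
 *      }
 *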
2420cb241339SHugh Dickins */ 2421cb241339SHugh Dickins inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); 2422e408e695STheodore Ts'o } 2423cb241339SHugh Dickins #else 2424cb241339SHugh Dickins static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2425cb241339SHugh Dickins { 2426cb241339SHugh Dickins } 2427cb241339SHugh Dickins #define shmem_initxattrs NULL 2428cb241339SHugh Dickins #endif 2429e408e695STheodore Ts'o 2430e09764cfSCarlos Maiolino static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, 2431e09764cfSCarlos Maiolino struct super_block *sb, 2432e09764cfSCarlos Maiolino struct inode *dir, umode_t mode, 2433e09764cfSCarlos Maiolino dev_t dev, unsigned long flags) 24341da177e4SLinus Torvalds { 24351da177e4SLinus Torvalds struct inode *inode; 24361da177e4SLinus Torvalds struct shmem_inode_info *info; 24371da177e4SLinus Torvalds struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2438e809d5f0SChris Down ino_t ino; 243971480663SCarlos Maiolino int err; 24401da177e4SLinus Torvalds 244171480663SCarlos Maiolino err = shmem_reserve_inode(sb, &ino); 244271480663SCarlos Maiolino if (err) 244371480663SCarlos Maiolino return ERR_PTR(err); 244471480663SCarlos Maiolino 24451da177e4SLinus Torvalds 24461da177e4SLinus Torvalds inode = new_inode(sb); 244771480663SCarlos Maiolino 244871480663SCarlos Maiolino if (!inode) { 244971480663SCarlos Maiolino shmem_free_inode(sb); 245071480663SCarlos Maiolino return ERR_PTR(-ENOSPC); 245171480663SCarlos Maiolino } 245271480663SCarlos Maiolino 2453e809d5f0SChris Down inode->i_ino = ino; 24547a80e5b8SGiuseppe Scrivano inode_init_owner(idmap, inode, dir, mode); 24551da177e4SLinus Torvalds inode->i_blocks = 0; 2456078cd827SDeepa Dinamani inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); 2457a251c17aSJason A. Donenfeld inode->i_generation = get_random_u32(); 24581da177e4SLinus Torvalds info = SHMEM_I(inode); 24591da177e4SLinus Torvalds memset(info, 0, (char *)inode - (char *)info); 24601da177e4SLinus Torvalds spin_lock_init(&info->lock); 2461af53d3e9SHugh Dickins atomic_set(&info->stop_eviction, 0); 246240e041a2SDavid Herrmann info->seals = F_SEAL_SEAL; 24630b0a0806SHugh Dickins info->flags = flags & VM_NORESERVE; 2464f7cd16a5SXavier Roche info->i_crtime = inode->i_mtime; 2465e408e695STheodore Ts'o info->fsflags = (dir == NULL) ? 0 : 2466e408e695STheodore Ts'o SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; 2467cb241339SHugh Dickins if (info->fsflags) 2468cb241339SHugh Dickins shmem_set_inode_flags(inode, info->fsflags); 2469779750d2SKirill A. 
Shutemov INIT_LIST_HEAD(&info->shrinklist); 24701da177e4SLinus Torvalds INIT_LIST_HEAD(&info->swaplist); 24722c6efe9cSLuis Chamberlain if (sbinfo->noswap) 24732c6efe9cSLuis Chamberlain mapping_set_unevictable(inode->i_mapping); 247438f38657SAristeu Rozanski simple_xattrs_init(&info->xattrs); 247572c04902SAl Viro cache_no_acl(inode); 2476ff36da69SMatthew Wilcox (Oracle) mapping_set_large_folios(inode->i_mapping); 24771da177e4SLinus Torvalds 24781da177e4SLinus Torvalds switch (mode & S_IFMT) { 24791da177e4SLinus Torvalds default: 248039f0247dSAndreas Gruenbacher inode->i_op = &shmem_special_inode_operations; 24811da177e4SLinus Torvalds init_special_inode(inode, mode, dev); 24821da177e4SLinus Torvalds break; 24831da177e4SLinus Torvalds case S_IFREG: 248414fcc23fSHugh Dickins inode->i_mapping->a_ops = &shmem_aops; 24851da177e4SLinus Torvalds inode->i_op = &shmem_inode_operations; 24861da177e4SLinus Torvalds inode->i_fop = &shmem_file_operations; 248771fe804bSLee Schermerhorn mpol_shared_policy_init(&info->policy, 248871fe804bSLee Schermerhorn shmem_get_sbmpol(sbinfo)); 24891da177e4SLinus Torvalds break; 24901da177e4SLinus Torvalds case S_IFDIR: 2491d8c76e6fSDave Hansen inc_nlink(inode); 24921da177e4SLinus Torvalds /* Some things misbehave if size == 0 on a directory */ 24931da177e4SLinus Torvalds inode->i_size = 2 * BOGO_DIRENT_SIZE; 24941da177e4SLinus Torvalds inode->i_op = &shmem_dir_inode_operations; 24951da177e4SLinus Torvalds inode->i_fop = &simple_dir_operations; 24961da177e4SLinus Torvalds break; 24971da177e4SLinus Torvalds case S_IFLNK: 24981da177e4SLinus Torvalds /* 24991da177e4SLinus Torvalds * Must not load anything in the rbtree, 25001da177e4SLinus Torvalds * mpol_free_shared_policy will not be called.
25011da177e4SLinus Torvalds */ 250271fe804bSLee Schermerhorn mpol_shared_policy_init(&info->policy, NULL); 25031da177e4SLinus Torvalds break; 25041da177e4SLinus Torvalds } 2505b45d71fbSJoel Fernandes (Google) 2506b45d71fbSJoel Fernandes (Google) lockdep_annotate_inode_mutex_key(inode); 25071da177e4SLinus Torvalds return inode; 25081da177e4SLinus Torvalds } 25091da177e4SLinus Torvalds 2510e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 2511e09764cfSCarlos Maiolino static struct inode *shmem_get_inode(struct mnt_idmap *idmap, 2512e09764cfSCarlos Maiolino struct super_block *sb, struct inode *dir, 2513e09764cfSCarlos Maiolino umode_t mode, dev_t dev, unsigned long flags) 2514e09764cfSCarlos Maiolino { 2515e09764cfSCarlos Maiolino int err; 2516e09764cfSCarlos Maiolino struct inode *inode; 2517e09764cfSCarlos Maiolino 2518e09764cfSCarlos Maiolino inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); 2519e09764cfSCarlos Maiolino if (IS_ERR(inode)) 2520e09764cfSCarlos Maiolino return inode; 2521e09764cfSCarlos Maiolino 2522e09764cfSCarlos Maiolino err = dquot_initialize(inode); 2523e09764cfSCarlos Maiolino if (err) 2524e09764cfSCarlos Maiolino goto errout; 2525e09764cfSCarlos Maiolino 2526e09764cfSCarlos Maiolino err = dquot_alloc_inode(inode); 2527e09764cfSCarlos Maiolino if (err) { 2528e09764cfSCarlos Maiolino dquot_drop(inode); 2529e09764cfSCarlos Maiolino goto errout; 2530e09764cfSCarlos Maiolino } 2531e09764cfSCarlos Maiolino return inode; 2532e09764cfSCarlos Maiolino 2533e09764cfSCarlos Maiolino errout: 2534e09764cfSCarlos Maiolino inode->i_flags |= S_NOQUOTA; 2535e09764cfSCarlos Maiolino iput(inode); 2536e09764cfSCarlos Maiolino return ERR_PTR(err); 2537e09764cfSCarlos Maiolino } 2538e09764cfSCarlos Maiolino #else 2539e09764cfSCarlos Maiolino static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, 2540e09764cfSCarlos Maiolino struct super_block *sb, struct inode *dir, 2541e09764cfSCarlos Maiolino umode_t mode, dev_t dev, unsigned long flags) 2542e09764cfSCarlos Maiolino { 2543e09764cfSCarlos Maiolino return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); 2544e09764cfSCarlos Maiolino } 2545e09764cfSCarlos Maiolino #endif /* CONFIG_TMPFS_QUOTA */ 2546e09764cfSCarlos Maiolino 25473460f6e5SAxel Rasmussen #ifdef CONFIG_USERFAULTFD 254861c50040SAxel Rasmussen int shmem_mfill_atomic_pte(pmd_t *dst_pmd, 25494c27fe4cSMike Rapoport struct vm_area_struct *dst_vma, 25504c27fe4cSMike Rapoport unsigned long dst_addr, 25514c27fe4cSMike Rapoport unsigned long src_addr, 2552d9712937SAxel Rasmussen uffd_flags_t flags, 2553d7be6d7eSZhangPeng struct folio **foliop) 25544c27fe4cSMike Rapoport { 25554c27fe4cSMike Rapoport struct inode *inode = file_inode(dst_vma->vm_file); 25564c27fe4cSMike Rapoport struct shmem_inode_info *info = SHMEM_I(inode); 25574c27fe4cSMike Rapoport struct address_space *mapping = inode->i_mapping; 25584c27fe4cSMike Rapoport gfp_t gfp = mapping_gfp_mask(mapping); 25594c27fe4cSMike Rapoport pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 25604c27fe4cSMike Rapoport void *page_kaddr; 2561b7dd44a1SMatthew Wilcox (Oracle) struct folio *folio; 25624c27fe4cSMike Rapoport int ret; 25633460f6e5SAxel Rasmussen pgoff_t max_off; 25644c27fe4cSMike Rapoport 2565c7e263abSLukas Czerner if (shmem_inode_acct_block(inode, 1)) { 25667ed9d238SAxel Rasmussen /* 25677ed9d238SAxel Rasmussen * We may have got a page, returned -ENOENT triggering a retry, 25687ed9d238SAxel Rasmussen * and now we find ourselves with -ENOMEM. 
Release the page, to 25697ed9d238SAxel Rasmussen * avoid a BUG_ON in our caller. 25707ed9d238SAxel Rasmussen */ 2571d7be6d7eSZhangPeng if (unlikely(*foliop)) { 2572d7be6d7eSZhangPeng folio_put(*foliop); 2573d7be6d7eSZhangPeng *foliop = NULL; 25747ed9d238SAxel Rasmussen } 25757d64ae3aSAxel Rasmussen return -ENOMEM; 25767ed9d238SAxel Rasmussen } 25774c27fe4cSMike Rapoport 2578d7be6d7eSZhangPeng if (!*foliop) { 25797d64ae3aSAxel Rasmussen ret = -ENOMEM; 25807a7256d5SMatthew Wilcox (Oracle) folio = shmem_alloc_folio(gfp, info, pgoff); 25817a7256d5SMatthew Wilcox (Oracle) if (!folio) 25820f079694SMike Rapoport goto out_unacct_blocks; 25834c27fe4cSMike Rapoport 2584d9712937SAxel Rasmussen if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { 25857a7256d5SMatthew Wilcox (Oracle) page_kaddr = kmap_local_folio(folio, 0); 25865dc21f0cSIra Weiny /* 25875dc21f0cSIra Weiny * The read mmap_lock is held here. Despite the 25885dc21f0cSIra Weiny * mmap_lock being read recursive a deadlock is still 25895dc21f0cSIra Weiny * possible if a writer has taken a lock. For example: 25905dc21f0cSIra Weiny * 25915dc21f0cSIra Weiny * process A thread 1 takes read lock on own mmap_lock 25925dc21f0cSIra Weiny * process A thread 2 calls mmap, blocks taking write lock 25935dc21f0cSIra Weiny * process B thread 1 takes page fault, read lock on own mmap lock 25945dc21f0cSIra Weiny * process B thread 2 calls mmap, blocks taking write lock 25955dc21f0cSIra Weiny * process A thread 1 blocks taking read lock on process B 25965dc21f0cSIra Weiny * process B thread 1 blocks taking read lock on process A 25975dc21f0cSIra Weiny * 25985dc21f0cSIra Weiny * Disable page faults to prevent potential deadlock 25995dc21f0cSIra Weiny * and retry the copy outside the mmap_lock. 26005dc21f0cSIra Weiny */ 26015dc21f0cSIra Weiny pagefault_disable(); 26028d103963SMike Rapoport ret = copy_from_user(page_kaddr, 26038d103963SMike Rapoport (const void __user *)src_addr, 26044c27fe4cSMike Rapoport PAGE_SIZE); 26055dc21f0cSIra Weiny pagefault_enable(); 26067a7256d5SMatthew Wilcox (Oracle) kunmap_local(page_kaddr); 26074c27fe4cSMike Rapoport 2608c1e8d7c6SMichel Lespinasse /* fallback to copy_from_user outside mmap_lock */ 26094c27fe4cSMike Rapoport if (unlikely(ret)) { 2610d7be6d7eSZhangPeng *foliop = folio; 26117d64ae3aSAxel Rasmussen ret = -ENOENT; 26124c27fe4cSMike Rapoport /* don't free the page */ 26137d64ae3aSAxel Rasmussen goto out_unacct_blocks; 26144c27fe4cSMike Rapoport } 261519b482c2SMuchun Song 26167a7256d5SMatthew Wilcox (Oracle) flush_dcache_folio(folio); 26173460f6e5SAxel Rasmussen } else { /* ZEROPAGE */ 26187a7256d5SMatthew Wilcox (Oracle) clear_user_highpage(&folio->page, dst_addr); 26198d103963SMike Rapoport } 26204c27fe4cSMike Rapoport } else { 2621d7be6d7eSZhangPeng folio = *foliop; 26227a7256d5SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 2623d7be6d7eSZhangPeng *foliop = NULL; 26244c27fe4cSMike Rapoport } 26254c27fe4cSMike Rapoport 26267a7256d5SMatthew Wilcox (Oracle) VM_BUG_ON(folio_test_locked(folio)); 26277a7256d5SMatthew Wilcox (Oracle) VM_BUG_ON(folio_test_swapbacked(folio)); 26287a7256d5SMatthew Wilcox (Oracle) __folio_set_locked(folio); 26297a7256d5SMatthew Wilcox (Oracle) __folio_set_swapbacked(folio); 26307a7256d5SMatthew Wilcox (Oracle) __folio_mark_uptodate(folio); 26319cc90c66SAndrea Arcangeli 2632e2a50c1fSAndrea Arcangeli ret = -EFAULT; 2633e2a50c1fSAndrea Arcangeli max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 26343460f6e5SAxel Rasmussen if (unlikely(pgoff >= max_off)) 
2635e2a50c1fSAndrea Arcangeli goto out_release; 2636e2a50c1fSAndrea Arcangeli 2637b7dd44a1SMatthew Wilcox (Oracle) ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, 263861c50040SAxel Rasmussen gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm); 26394c27fe4cSMike Rapoport if (ret) 26404c27fe4cSMike Rapoport goto out_release; 26414c27fe4cSMike Rapoport 264261c50040SAxel Rasmussen ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 2643d9712937SAxel Rasmussen &folio->page, true, flags); 26447d64ae3aSAxel Rasmussen if (ret) 26457d64ae3aSAxel Rasmussen goto out_delete_from_cache; 26464c27fe4cSMike Rapoport 264794b7cc01SYang Shi spin_lock_irq(&info->lock); 26484c27fe4cSMike Rapoport info->alloced++; 26494c27fe4cSMike Rapoport shmem_recalc_inode(inode); 265094b7cc01SYang Shi spin_unlock_irq(&info->lock); 26514c27fe4cSMike Rapoport 26527a7256d5SMatthew Wilcox (Oracle) folio_unlock(folio); 26537d64ae3aSAxel Rasmussen return 0; 26547d64ae3aSAxel Rasmussen out_delete_from_cache: 26557a7256d5SMatthew Wilcox (Oracle) filemap_remove_folio(folio); 26564c27fe4cSMike Rapoport out_release: 26577a7256d5SMatthew Wilcox (Oracle) folio_unlock(folio); 26587a7256d5SMatthew Wilcox (Oracle) folio_put(folio); 26594c27fe4cSMike Rapoport out_unacct_blocks: 26600f079694SMike Rapoport shmem_inode_unacct_blocks(inode, 1); 26617d64ae3aSAxel Rasmussen return ret; 26624c27fe4cSMike Rapoport } 26633460f6e5SAxel Rasmussen #endif /* CONFIG_USERFAULTFD */ 26648d103963SMike Rapoport 26651da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 266692e1d5beSArjan van de Ven static const struct inode_operations shmem_symlink_inode_operations; 266769f07ec9SHugh Dickins static const struct inode_operations shmem_short_symlink_operations; 26681da177e4SLinus Torvalds 26691da177e4SLinus Torvalds static int 2670800d15a5SNick Piggin shmem_write_begin(struct file *file, struct address_space *mapping, 26719d6b0cd7SMatthew Wilcox (Oracle) loff_t pos, unsigned len, 2672800d15a5SNick Piggin struct page **pagep, void **fsdata) 26731da177e4SLinus Torvalds { 2674800d15a5SNick Piggin struct inode *inode = mapping->host; 267540e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 267609cbfeafSKirill A. 
Shutemov pgoff_t index = pos >> PAGE_SHIFT; 2677eff1f906SMatthew Wilcox (Oracle) struct folio *folio; 2678a7605426SYang Shi int ret = 0; 267940e041a2SDavid Herrmann 26809608703eSJan Kara /* i_rwsem is held by caller */ 2681ab3948f5SJoel Fernandes (Google) if (unlikely(info->seals & (F_SEAL_GROW | 2682ab3948f5SJoel Fernandes (Google) F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { 2683ab3948f5SJoel Fernandes (Google) if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) 268440e041a2SDavid Herrmann return -EPERM; 268540e041a2SDavid Herrmann if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 268640e041a2SDavid Herrmann return -EPERM; 268740e041a2SDavid Herrmann } 268840e041a2SDavid Herrmann 2689eff1f906SMatthew Wilcox (Oracle) ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); 2690a7605426SYang Shi 2691a7605426SYang Shi if (ret) 2692a7605426SYang Shi return ret; 2693a7605426SYang Shi 2694eff1f906SMatthew Wilcox (Oracle) *pagep = folio_file_page(folio, index); 2695a7605426SYang Shi if (PageHWPoison(*pagep)) { 2696eff1f906SMatthew Wilcox (Oracle) folio_unlock(folio); 2697eff1f906SMatthew Wilcox (Oracle) folio_put(folio); 2698a7605426SYang Shi *pagep = NULL; 2699a7605426SYang Shi return -EIO; 2700a7605426SYang Shi } 2701a7605426SYang Shi 2702a7605426SYang Shi return 0; 2703800d15a5SNick Piggin } 2704800d15a5SNick Piggin 2705800d15a5SNick Piggin static int 2706800d15a5SNick Piggin shmem_write_end(struct file *file, struct address_space *mapping, 2707800d15a5SNick Piggin loff_t pos, unsigned len, unsigned copied, 2708800d15a5SNick Piggin struct page *page, void *fsdata) 2709800d15a5SNick Piggin { 271069bbb87bSMatthew Wilcox (Oracle) struct folio *folio = page_folio(page); 2711800d15a5SNick Piggin struct inode *inode = mapping->host; 2712800d15a5SNick Piggin 2713800d15a5SNick Piggin if (pos + copied > inode->i_size) 2714800d15a5SNick Piggin i_size_write(inode, pos + copied); 2715800d15a5SNick Piggin 271669bbb87bSMatthew Wilcox (Oracle) if (!folio_test_uptodate(folio)) { 271769bbb87bSMatthew Wilcox (Oracle) if (copied < folio_size(folio)) { 271869bbb87bSMatthew Wilcox (Oracle) size_t from = offset_in_folio(folio, pos); 271969bbb87bSMatthew Wilcox (Oracle) folio_zero_segments(folio, 0, from, 272069bbb87bSMatthew Wilcox (Oracle) from + copied, folio_size(folio)); 2721800d8c63SKirill A. Shutemov } 272269bbb87bSMatthew Wilcox (Oracle) folio_mark_uptodate(folio); 2723800d8c63SKirill A. Shutemov } 272469bbb87bSMatthew Wilcox (Oracle) folio_mark_dirty(folio); 272569bbb87bSMatthew Wilcox (Oracle) folio_unlock(folio); 272669bbb87bSMatthew Wilcox (Oracle) folio_put(folio); 2727d3602444SHugh Dickins 2728800d15a5SNick Piggin return copied; 27291da177e4SLinus Torvalds } 27301da177e4SLinus Torvalds 27312ba5bbedSAl Viro static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 27321da177e4SLinus Torvalds { 27336e58e79dSAl Viro struct file *file = iocb->ki_filp; 27346e58e79dSAl Viro struct inode *inode = file_inode(file); 27351da177e4SLinus Torvalds struct address_space *mapping = inode->i_mapping; 273641ffe5d5SHugh Dickins pgoff_t index; 273741ffe5d5SHugh Dickins unsigned long offset; 2738f7c1d074SGeert Uytterhoeven int error = 0; 2739cb66a7a1SAl Viro ssize_t retval = 0; 27406e58e79dSAl Viro loff_t *ppos = &iocb->ki_pos; 2741a0ee5ec5SHugh Dickins 274209cbfeafSKirill A. Shutemov index = *ppos >> PAGE_SHIFT; 274309cbfeafSKirill A. 
Shutemov offset = *ppos & ~PAGE_MASK; 27441da177e4SLinus Torvalds 27451da177e4SLinus Torvalds for (;;) { 27464601e2fcSMatthew Wilcox (Oracle) struct folio *folio = NULL; 27471da177e4SLinus Torvalds struct page *page = NULL; 274841ffe5d5SHugh Dickins pgoff_t end_index; 274941ffe5d5SHugh Dickins unsigned long nr, ret; 27501da177e4SLinus Torvalds loff_t i_size = i_size_read(inode); 27511da177e4SLinus Torvalds 275209cbfeafSKirill A. Shutemov end_index = i_size >> PAGE_SHIFT; 27531da177e4SLinus Torvalds if (index > end_index) 27541da177e4SLinus Torvalds break; 27551da177e4SLinus Torvalds if (index == end_index) { 275609cbfeafSKirill A. Shutemov nr = i_size & ~PAGE_MASK; 27571da177e4SLinus Torvalds if (nr <= offset) 27581da177e4SLinus Torvalds break; 27591da177e4SLinus Torvalds } 27601da177e4SLinus Torvalds 27614601e2fcSMatthew Wilcox (Oracle) error = shmem_get_folio(inode, index, &folio, SGP_READ); 27626e58e79dSAl Viro if (error) { 27636e58e79dSAl Viro if (error == -EINVAL) 27646e58e79dSAl Viro error = 0; 27651da177e4SLinus Torvalds break; 27661da177e4SLinus Torvalds } 27674601e2fcSMatthew Wilcox (Oracle) if (folio) { 27684601e2fcSMatthew Wilcox (Oracle) folio_unlock(folio); 2769a7605426SYang Shi 27704601e2fcSMatthew Wilcox (Oracle) page = folio_file_page(folio, index); 2771a7605426SYang Shi if (PageHWPoison(page)) { 27724601e2fcSMatthew Wilcox (Oracle) folio_put(folio); 2773a7605426SYang Shi error = -EIO; 2774a7605426SYang Shi break; 2775a7605426SYang Shi } 277675edd345SHugh Dickins } 27771da177e4SLinus Torvalds 27781da177e4SLinus Torvalds /* 27791da177e4SLinus Torvalds * We must evaluate after, since reads (unlike writes) 27809608703eSJan Kara * are called without i_rwsem protection against truncate 27811da177e4SLinus Torvalds */ 278209cbfeafSKirill A. Shutemov nr = PAGE_SIZE; 27831da177e4SLinus Torvalds i_size = i_size_read(inode); 278409cbfeafSKirill A. Shutemov end_index = i_size >> PAGE_SHIFT; 27851da177e4SLinus Torvalds if (index == end_index) { 278609cbfeafSKirill A. Shutemov nr = i_size & ~PAGE_MASK; 27871da177e4SLinus Torvalds if (nr <= offset) { 27884601e2fcSMatthew Wilcox (Oracle) if (folio) 27894601e2fcSMatthew Wilcox (Oracle) folio_put(folio); 27901da177e4SLinus Torvalds break; 27911da177e4SLinus Torvalds } 27921da177e4SLinus Torvalds } 27931da177e4SLinus Torvalds nr -= offset; 27941da177e4SLinus Torvalds 27954601e2fcSMatthew Wilcox (Oracle) if (folio) { 27961da177e4SLinus Torvalds /* 27971da177e4SLinus Torvalds * If users can be writing to this page using arbitrary 27981da177e4SLinus Torvalds * virtual addresses, take care about potential aliasing 27991da177e4SLinus Torvalds * before reading the page on the kernel side. 28001da177e4SLinus Torvalds */ 28011da177e4SLinus Torvalds if (mapping_writably_mapped(mapping)) 28021da177e4SLinus Torvalds flush_dcache_page(page); 28031da177e4SLinus Torvalds /* 28041da177e4SLinus Torvalds * Mark the page accessed if we read the beginning. 28051da177e4SLinus Torvalds */ 28061da177e4SLinus Torvalds if (!offset) 28074601e2fcSMatthew Wilcox (Oracle) folio_mark_accessed(folio); 28081da177e4SLinus Torvalds /* 28091da177e4SLinus Torvalds * Ok, we have the page, and it's up-to-date, so 28101da177e4SLinus Torvalds * now we can copy it to user space... 
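 *
 * [Editor's aside -- a small userspace sketch, not part of shmem.c, of what
 *  the folio == NULL case a few lines below means in practice: reading a
 *  hole in a sparse tmpfs file copies out zeroes (the ZERO_PAGE path)
 *  without allocating any pages.  Assumes a memfd; error handling omitted.]
 *
 *      #define _GNU_SOURCE
 *      #include <assert.h>
 *      #include <sys/mman.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              char buf[4096];
 *              int fd = memfd_create("sparse", 0);
 *
 *              ftruncate(fd, 1 << 20);         // 1MiB file, all hole
 *              assert(pread(fd, buf, sizeof(buf), 0) == sizeof(buf));
 *              assert(buf[0] == 0);            // holes read back as zeroes
 *              return 0;
 *      }
 *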
28111da177e4SLinus Torvalds */ 28122ba5bbedSAl Viro ret = copy_page_to_iter(page, offset, nr, to); 28134601e2fcSMatthew Wilcox (Oracle) folio_put(folio); 28141bdec44bSHugh Dickins 2815fcb14cb1SAl Viro } else if (user_backed_iter(to)) { 28161bdec44bSHugh Dickins /* 28171bdec44bSHugh Dickins * Copy to user tends to be so well optimized, but 28181bdec44bSHugh Dickins * clear_user() not so much, that it is noticeably 28191bdec44bSHugh Dickins * faster to copy the zero page instead of clearing. 28201bdec44bSHugh Dickins */ 28211bdec44bSHugh Dickins ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); 28221bdec44bSHugh Dickins } else { 28231bdec44bSHugh Dickins /* 28241bdec44bSHugh Dickins * But submitting the same page twice in a row to 28251bdec44bSHugh Dickins * splice() - or others? - can result in confusion: 28261bdec44bSHugh Dickins * so don't attempt that optimization on pipes etc. 28271bdec44bSHugh Dickins */ 28281bdec44bSHugh Dickins ret = iov_iter_zero(nr, to); 28291bdec44bSHugh Dickins } 28301bdec44bSHugh Dickins 28316e58e79dSAl Viro retval += ret; 28321da177e4SLinus Torvalds offset += ret; 283309cbfeafSKirill A. Shutemov index += offset >> PAGE_SHIFT; 283409cbfeafSKirill A. Shutemov offset &= ~PAGE_MASK; 28351da177e4SLinus Torvalds 28362ba5bbedSAl Viro if (!iov_iter_count(to)) 28371da177e4SLinus Torvalds break; 28386e58e79dSAl Viro if (ret < nr) { 28396e58e79dSAl Viro error = -EFAULT; 28406e58e79dSAl Viro break; 28416e58e79dSAl Viro } 28421da177e4SLinus Torvalds cond_resched(); 28431da177e4SLinus Torvalds } 28441da177e4SLinus Torvalds 284509cbfeafSKirill A. Shutemov *ppos = ((loff_t) index << PAGE_SHIFT) + offset; 28466e58e79dSAl Viro file_accessed(file); 28476e58e79dSAl Viro return retval ? retval : error; 28481da177e4SLinus Torvalds } 28491da177e4SLinus Torvalds 2850bd194b18SDavid Howells static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, 2851bd194b18SDavid Howells struct pipe_buffer *buf) 2852bd194b18SDavid Howells { 2853bd194b18SDavid Howells return true; 2854bd194b18SDavid Howells } 2855bd194b18SDavid Howells 2856bd194b18SDavid Howells static void zero_pipe_buf_release(struct pipe_inode_info *pipe, 2857bd194b18SDavid Howells struct pipe_buffer *buf) 2858bd194b18SDavid Howells { 2859bd194b18SDavid Howells } 2860bd194b18SDavid Howells 2861bd194b18SDavid Howells static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, 2862bd194b18SDavid Howells struct pipe_buffer *buf) 2863bd194b18SDavid Howells { 2864bd194b18SDavid Howells return false; 2865bd194b18SDavid Howells } 2866bd194b18SDavid Howells 2867bd194b18SDavid Howells static const struct pipe_buf_operations zero_pipe_buf_ops = { 2868bd194b18SDavid Howells .release = zero_pipe_buf_release, 2869bd194b18SDavid Howells .try_steal = zero_pipe_buf_try_steal, 2870bd194b18SDavid Howells .get = zero_pipe_buf_get, 2871bd194b18SDavid Howells }; 2872bd194b18SDavid Howells 2873bd194b18SDavid Howells static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, 2874bd194b18SDavid Howells loff_t fpos, size_t size) 2875bd194b18SDavid Howells { 2876bd194b18SDavid Howells size_t offset = fpos & ~PAGE_MASK; 2877bd194b18SDavid Howells 2878bd194b18SDavid Howells size = min_t(size_t, size, PAGE_SIZE - offset); 2879bd194b18SDavid Howells 2880bd194b18SDavid Howells if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 2881bd194b18SDavid Howells struct pipe_buffer *buf = pipe_head_buf(pipe); 2882bd194b18SDavid Howells 2883bd194b18SDavid Howells *buf = (struct pipe_buffer) { 2884bd194b18SDavid Howells .ops = 
&zero_pipe_buf_ops, 2885bd194b18SDavid Howells .page = ZERO_PAGE(0), 2886bd194b18SDavid Howells .offset = offset, 2887bd194b18SDavid Howells .len = size, 2888bd194b18SDavid Howells }; 2889bd194b18SDavid Howells pipe->head++; 2890bd194b18SDavid Howells } 2891bd194b18SDavid Howells 2892bd194b18SDavid Howells return size; 2893bd194b18SDavid Howells } 2894bd194b18SDavid Howells 2895bd194b18SDavid Howells static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, 2896bd194b18SDavid Howells struct pipe_inode_info *pipe, 2897bd194b18SDavid Howells size_t len, unsigned int flags) 2898bd194b18SDavid Howells { 2899bd194b18SDavid Howells struct inode *inode = file_inode(in); 2900bd194b18SDavid Howells struct address_space *mapping = inode->i_mapping; 2901bd194b18SDavid Howells struct folio *folio = NULL; 2902bd194b18SDavid Howells size_t total_spliced = 0, used, npages, n, part; 2903bd194b18SDavid Howells loff_t isize; 2904bd194b18SDavid Howells int error = 0; 2905bd194b18SDavid Howells 2906bd194b18SDavid Howells /* Work out how much data we can actually add into the pipe */ 2907bd194b18SDavid Howells used = pipe_occupancy(pipe->head, pipe->tail); 2908bd194b18SDavid Howells npages = max_t(ssize_t, pipe->max_usage - used, 0); 2909bd194b18SDavid Howells len = min_t(size_t, len, npages * PAGE_SIZE); 2910bd194b18SDavid Howells 2911bd194b18SDavid Howells do { 2912bd194b18SDavid Howells if (*ppos >= i_size_read(inode)) 2913bd194b18SDavid Howells break; 2914bd194b18SDavid Howells 2915fa598952SHugh Dickins error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, 2916fa598952SHugh Dickins SGP_READ); 2917bd194b18SDavid Howells if (error) { 2918bd194b18SDavid Howells if (error == -EINVAL) 2919bd194b18SDavid Howells error = 0; 2920bd194b18SDavid Howells break; 2921bd194b18SDavid Howells } 2922bd194b18SDavid Howells if (folio) { 2923bd194b18SDavid Howells folio_unlock(folio); 2924bd194b18SDavid Howells 2925fa598952SHugh Dickins if (folio_test_hwpoison(folio) || 2926fa598952SHugh Dickins (folio_test_large(folio) && 2927fa598952SHugh Dickins folio_test_has_hwpoisoned(folio))) { 2928bd194b18SDavid Howells error = -EIO; 2929bd194b18SDavid Howells break; 2930bd194b18SDavid Howells } 2931bd194b18SDavid Howells } 2932bd194b18SDavid Howells 2933bd194b18SDavid Howells /* 2934bd194b18SDavid Howells * i_size must be checked after we know the pages are Uptodate. 2935bd194b18SDavid Howells * 2936bd194b18SDavid Howells * Checking i_size after the check allows us to calculate 2937bd194b18SDavid Howells * the correct value for "nr", which means the zero-filled 2938bd194b18SDavid Howells * part of the page is not copied back to userspace (unless 2939bd194b18SDavid Howells * another truncate extends the file - this is desired though). 2940bd194b18SDavid Howells */ 2941bd194b18SDavid Howells isize = i_size_read(inode); 2942bd194b18SDavid Howells if (unlikely(*ppos >= isize)) 2943bd194b18SDavid Howells break; 2944bd194b18SDavid Howells part = min_t(loff_t, isize - *ppos, len); 2945bd194b18SDavid Howells 2946bd194b18SDavid Howells if (folio) { 2947bd194b18SDavid Howells /* 2948bd194b18SDavid Howells * If users can be writing to this page using arbitrary 2949bd194b18SDavid Howells * virtual addresses, take care about potential aliasing 2950bd194b18SDavid Howells * before reading the page on the kernel side. 
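 *
 * [Editor's aside -- a minimal userspace sketch, not part of shmem.c, of the
 *  path implemented by shmem_file_splice_read(): splicing a tmpfs file into
 *  a pipe, with holes fed by splice_zeropage_into_pipe().  Assumes a memfd;
 *  error handling omitted.]
 *
 *      #define _GNU_SOURCE
 *      #include <fcntl.h>
 *      #include <sys/mman.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              int pfd[2];
 *              int fd = memfd_create("src", 0);
 *
 *              write(fd, "hello tmpfs", 11);
 *              lseek(fd, 0, SEEK_SET);
 *              pipe(pfd);
 *              splice(fd, NULL, pfd[1], NULL, 11, 0);  // shmem_file_splice_read()
 *              return 0;
 *      }
 *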
2951bd194b18SDavid Howells */ 2952bd194b18SDavid Howells if (mapping_writably_mapped(mapping)) 2953bd194b18SDavid Howells flush_dcache_folio(folio); 2954bd194b18SDavid Howells folio_mark_accessed(folio); 2955bd194b18SDavid Howells /* 2956bd194b18SDavid Howells * Ok, we have the page, and it's up-to-date, so we can 2957bd194b18SDavid Howells * now splice it into the pipe. 2958bd194b18SDavid Howells */ 2959bd194b18SDavid Howells n = splice_folio_into_pipe(pipe, folio, *ppos, part); 2960bd194b18SDavid Howells folio_put(folio); 2961bd194b18SDavid Howells folio = NULL; 2962bd194b18SDavid Howells } else { 2963fa598952SHugh Dickins n = splice_zeropage_into_pipe(pipe, *ppos, part); 2964bd194b18SDavid Howells } 2965bd194b18SDavid Howells 2966bd194b18SDavid Howells if (!n) 2967bd194b18SDavid Howells break; 2968bd194b18SDavid Howells len -= n; 2969bd194b18SDavid Howells total_spliced += n; 2970bd194b18SDavid Howells *ppos += n; 2971bd194b18SDavid Howells in->f_ra.prev_pos = *ppos; 2972bd194b18SDavid Howells if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 2973bd194b18SDavid Howells break; 2974bd194b18SDavid Howells 2975bd194b18SDavid Howells cond_resched(); 2976bd194b18SDavid Howells } while (len); 2977bd194b18SDavid Howells 2978bd194b18SDavid Howells if (folio) 2979bd194b18SDavid Howells folio_put(folio); 2980bd194b18SDavid Howells 2981bd194b18SDavid Howells file_accessed(in); 2982bd194b18SDavid Howells return total_spliced ? total_spliced : error; 2983bd194b18SDavid Howells } 2984bd194b18SDavid Howells 2985965c8e59SAndrew Morton static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 2986220f2ac9SHugh Dickins { 2987220f2ac9SHugh Dickins struct address_space *mapping = file->f_mapping; 2988220f2ac9SHugh Dickins struct inode *inode = mapping->host; 2989220f2ac9SHugh Dickins 2990965c8e59SAndrew Morton if (whence != SEEK_DATA && whence != SEEK_HOLE) 2991965c8e59SAndrew Morton return generic_file_llseek_size(file, offset, whence, 2992220f2ac9SHugh Dickins MAX_LFS_FILESIZE, i_size_read(inode)); 299341139aa4SMatthew Wilcox (Oracle) if (offset < 0) 299441139aa4SMatthew Wilcox (Oracle) return -ENXIO; 299541139aa4SMatthew Wilcox (Oracle) 29965955102cSAl Viro inode_lock(inode); 29979608703eSJan Kara /* We're holding i_rwsem so we can access i_size directly */ 299841139aa4SMatthew Wilcox (Oracle) offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); 2999387aae6fSHugh Dickins if (offset >= 0) 300046a1c2c7SJie Liu offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 30015955102cSAl Viro inode_unlock(inode); 3002220f2ac9SHugh Dickins return offset; 3003220f2ac9SHugh Dickins } 3004220f2ac9SHugh Dickins 300583e4fa9cSHugh Dickins static long shmem_fallocate(struct file *file, int mode, loff_t offset, 300683e4fa9cSHugh Dickins loff_t len) 300783e4fa9cSHugh Dickins { 3008496ad9aaSAl Viro struct inode *inode = file_inode(file); 3009e2d12e22SHugh Dickins struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 301040e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 30111aac1400SHugh Dickins struct shmem_falloc shmem_falloc; 3012d144bf62SHugh Dickins pgoff_t start, index, end, undo_fallocend; 3013e2d12e22SHugh Dickins int error; 301483e4fa9cSHugh Dickins 301513ace4d0SHugh Dickins if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 301613ace4d0SHugh Dickins return -EOPNOTSUPP; 301713ace4d0SHugh Dickins 30185955102cSAl Viro inode_lock(inode); 301983e4fa9cSHugh Dickins 302083e4fa9cSHugh Dickins if (mode & FALLOC_FL_PUNCH_HOLE) { 302183e4fa9cSHugh Dickins 
struct address_space *mapping = file->f_mapping; 302283e4fa9cSHugh Dickins loff_t unmap_start = round_up(offset, PAGE_SIZE); 302383e4fa9cSHugh Dickins loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 30248e205f77SHugh Dickins DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 302583e4fa9cSHugh Dickins 30269608703eSJan Kara /* protected by i_rwsem */ 3027ab3948f5SJoel Fernandes (Google) if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { 302840e041a2SDavid Herrmann error = -EPERM; 302940e041a2SDavid Herrmann goto out; 303040e041a2SDavid Herrmann } 303140e041a2SDavid Herrmann 30328e205f77SHugh Dickins shmem_falloc.waitq = &shmem_falloc_waitq; 3033aa71ecd8SChen Jun shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; 3034f00cdc6dSHugh Dickins shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 3035f00cdc6dSHugh Dickins spin_lock(&inode->i_lock); 3036f00cdc6dSHugh Dickins inode->i_private = &shmem_falloc; 3037f00cdc6dSHugh Dickins spin_unlock(&inode->i_lock); 3038f00cdc6dSHugh Dickins 303983e4fa9cSHugh Dickins if ((u64)unmap_end > (u64)unmap_start) 304083e4fa9cSHugh Dickins unmap_mapping_range(mapping, unmap_start, 304183e4fa9cSHugh Dickins 1 + unmap_end - unmap_start, 0); 304283e4fa9cSHugh Dickins shmem_truncate_range(inode, offset, offset + len - 1); 304383e4fa9cSHugh Dickins /* No need to unmap again: hole-punching leaves COWed pages */ 30448e205f77SHugh Dickins 30458e205f77SHugh Dickins spin_lock(&inode->i_lock); 30468e205f77SHugh Dickins inode->i_private = NULL; 30478e205f77SHugh Dickins wake_up_all(&shmem_falloc_waitq); 30482055da97SIngo Molnar WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); 30498e205f77SHugh Dickins spin_unlock(&inode->i_lock); 305083e4fa9cSHugh Dickins error = 0; 30518e205f77SHugh Dickins goto out; 305283e4fa9cSHugh Dickins } 305383e4fa9cSHugh Dickins 3054e2d12e22SHugh Dickins /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 3055e2d12e22SHugh Dickins error = inode_newsize_ok(inode, offset + len); 3056e2d12e22SHugh Dickins if (error) 3057e2d12e22SHugh Dickins goto out; 3058e2d12e22SHugh Dickins 305940e041a2SDavid Herrmann if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 306040e041a2SDavid Herrmann error = -EPERM; 306140e041a2SDavid Herrmann goto out; 306240e041a2SDavid Herrmann } 306340e041a2SDavid Herrmann 306409cbfeafSKirill A. Shutemov start = offset >> PAGE_SHIFT; 306509cbfeafSKirill A. Shutemov end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 3066e2d12e22SHugh Dickins /* Try to avoid a swapstorm if len is impossible to satisfy */ 3067e2d12e22SHugh Dickins if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 3068e2d12e22SHugh Dickins error = -ENOSPC; 3069e2d12e22SHugh Dickins goto out; 3070e2d12e22SHugh Dickins } 3071e2d12e22SHugh Dickins 30728e205f77SHugh Dickins shmem_falloc.waitq = NULL; 30731aac1400SHugh Dickins shmem_falloc.start = start; 30741aac1400SHugh Dickins shmem_falloc.next = start; 30751aac1400SHugh Dickins shmem_falloc.nr_falloced = 0; 30761aac1400SHugh Dickins shmem_falloc.nr_unswapped = 0; 30771aac1400SHugh Dickins spin_lock(&inode->i_lock); 30781aac1400SHugh Dickins inode->i_private = &shmem_falloc; 30791aac1400SHugh Dickins spin_unlock(&inode->i_lock); 30801aac1400SHugh Dickins 3081d144bf62SHugh Dickins /* 3082d144bf62SHugh Dickins * info->fallocend is only relevant when huge pages might be 3083d144bf62SHugh Dickins * involved: to prevent split_huge_page() freeing fallocated 3084d144bf62SHugh Dickins * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. 
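 *
 * [Editor's aside -- a small userspace sketch, not part of shmem.c, of the
 *  case this comment describes: FALLOC_FL_KEEP_SIZE preallocation beyond
 *  i_size, which the loop below fills with fallocated folios.  Assumes a
 *  memfd; error handling omitted.]
 *
 *      #define _GNU_SOURCE
 *      #include <assert.h>
 *      #include <fcntl.h>
 *      #include <sys/mman.h>
 *      #include <sys/stat.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              struct stat st;
 *              int fd = memfd_create("prealloc", 0);
 *
 *              fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
 *              fstat(fd, &st);
 *              assert(st.st_size == 0);        // i_size unchanged (KEEP_SIZE)
 *              assert(st.st_blocks > 0);       // but the blocks are committed
 *              return 0;
 *      }
 *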
3085d144bf62SHugh Dickins */ 3086d144bf62SHugh Dickins undo_fallocend = info->fallocend; 3087d144bf62SHugh Dickins if (info->fallocend < end) 3088d144bf62SHugh Dickins info->fallocend = end; 3089d144bf62SHugh Dickins 3090050dcb5cSHugh Dickins for (index = start; index < end; ) { 3091b0802b22SMatthew Wilcox (Oracle) struct folio *folio; 3092e2d12e22SHugh Dickins 3093e2d12e22SHugh Dickins /* 3094e2d12e22SHugh Dickins * Good, the fallocate(2) manpage permits EINTR: we may have 3095e2d12e22SHugh Dickins * been interrupted because we are using up too much memory. 3096e2d12e22SHugh Dickins */ 3097e2d12e22SHugh Dickins if (signal_pending(current)) 3098e2d12e22SHugh Dickins error = -EINTR; 30991aac1400SHugh Dickins else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 31001aac1400SHugh Dickins error = -ENOMEM; 3101e2d12e22SHugh Dickins else 3102b0802b22SMatthew Wilcox (Oracle) error = shmem_get_folio(inode, index, &folio, 3103b0802b22SMatthew Wilcox (Oracle) SGP_FALLOC); 3104e2d12e22SHugh Dickins if (error) { 3105d144bf62SHugh Dickins info->fallocend = undo_fallocend; 3106b0802b22SMatthew Wilcox (Oracle) /* Remove the !uptodate folios we added */ 31077f556567SHugh Dickins if (index > start) { 31081635f6a7SHugh Dickins shmem_undo_range(inode, 310909cbfeafSKirill A. Shutemov (loff_t)start << PAGE_SHIFT, 3110b9b4bb26SAnthony Romano ((loff_t)index << PAGE_SHIFT) - 1, true); 31117f556567SHugh Dickins } 31121aac1400SHugh Dickins goto undone; 3113e2d12e22SHugh Dickins } 3114e2d12e22SHugh Dickins 3115050dcb5cSHugh Dickins /* 3116050dcb5cSHugh Dickins * Here is a more important optimization than it appears: 3117b0802b22SMatthew Wilcox (Oracle) * a second SGP_FALLOC on the same large folio will clear it, 3118b0802b22SMatthew Wilcox (Oracle) * making it uptodate and un-undoable if we fail later. 3119050dcb5cSHugh Dickins */ 3120b0802b22SMatthew Wilcox (Oracle) index = folio_next_index(folio); 3121050dcb5cSHugh Dickins /* Beware 32-bit wraparound */ 3122050dcb5cSHugh Dickins if (!index) 3123050dcb5cSHugh Dickins index--; 3124050dcb5cSHugh Dickins 3125e2d12e22SHugh Dickins /* 31261aac1400SHugh Dickins * Inform shmem_writepage() how far we have reached. 31271aac1400SHugh Dickins * No need for lock or barrier: we have the page lock. 31281aac1400SHugh Dickins */ 3129b0802b22SMatthew Wilcox (Oracle) if (!folio_test_uptodate(folio)) 3130050dcb5cSHugh Dickins shmem_falloc.nr_falloced += index - shmem_falloc.next; 3131050dcb5cSHugh Dickins shmem_falloc.next = index; 31321aac1400SHugh Dickins 31331aac1400SHugh Dickins /* 3134b0802b22SMatthew Wilcox (Oracle) * If !uptodate, leave it that way so that freeable folios 31351635f6a7SHugh Dickins * can be recognized if we need to rollback on error later. 3136b0802b22SMatthew Wilcox (Oracle) * But mark it dirty so that memory pressure will swap rather 3137b0802b22SMatthew Wilcox (Oracle) * than free the folios we are allocating (and SGP_CACHE folios 3138e2d12e22SHugh Dickins * might still be clean: we now need to mark those dirty too). 
3139e2d12e22SHugh Dickins */ 3140b0802b22SMatthew Wilcox (Oracle) folio_mark_dirty(folio); 3141b0802b22SMatthew Wilcox (Oracle) folio_unlock(folio); 3142b0802b22SMatthew Wilcox (Oracle) folio_put(folio); 3143e2d12e22SHugh Dickins cond_resched(); 3144e2d12e22SHugh Dickins } 3145e2d12e22SHugh Dickins 3146e2d12e22SHugh Dickins if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 3147e2d12e22SHugh Dickins i_size_write(inode, offset + len); 31481aac1400SHugh Dickins undone: 31491aac1400SHugh Dickins spin_lock(&inode->i_lock); 31501aac1400SHugh Dickins inode->i_private = NULL; 31511aac1400SHugh Dickins spin_unlock(&inode->i_lock); 3152e2d12e22SHugh Dickins out: 315315f242bbSHugh Dickins if (!error) 315415f242bbSHugh Dickins file_modified(file); 31555955102cSAl Viro inode_unlock(inode); 315683e4fa9cSHugh Dickins return error; 315783e4fa9cSHugh Dickins } 315883e4fa9cSHugh Dickins 3159726c3342SDavid Howells static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 31601da177e4SLinus Torvalds { 3161726c3342SDavid Howells struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 31621da177e4SLinus Torvalds 31631da177e4SLinus Torvalds buf->f_type = TMPFS_MAGIC; 316409cbfeafSKirill A. Shutemov buf->f_bsize = PAGE_SIZE; 31651da177e4SLinus Torvalds buf->f_namelen = NAME_MAX; 31660edd73b3SHugh Dickins if (sbinfo->max_blocks) { 31671da177e4SLinus Torvalds buf->f_blocks = sbinfo->max_blocks; 316841ffe5d5SHugh Dickins buf->f_bavail = 316941ffe5d5SHugh Dickins buf->f_bfree = sbinfo->max_blocks - 317041ffe5d5SHugh Dickins percpu_counter_sum(&sbinfo->used_blocks); 31710edd73b3SHugh Dickins } 31720edd73b3SHugh Dickins if (sbinfo->max_inodes) { 31731da177e4SLinus Torvalds buf->f_files = sbinfo->max_inodes; 31741da177e4SLinus Torvalds buf->f_ffree = sbinfo->free_inodes; 31751da177e4SLinus Torvalds } 31761da177e4SLinus Torvalds /* else leave those fields 0 like simple_statfs */ 317759cda49eSAmir Goldstein 317859cda49eSAmir Goldstein buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); 317959cda49eSAmir Goldstein 31801da177e4SLinus Torvalds return 0; 31811da177e4SLinus Torvalds } 31821da177e4SLinus Torvalds 31831da177e4SLinus Torvalds /* 31841da177e4SLinus Torvalds * File creation. Allocate an inode, and we're done.. 
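 *
 * [Editor's aside -- an illustrative userspace sketch, not part of shmem.c,
 *  of the creation paths that follow: open(O_TMPFILE) creates an unnamed
 *  tmpfs inode via shmem_tmpfile(), and linkat() through /proc can give it a
 *  name later (see the O_TMPFILE note in shmem_link() further down).
 *  Assumes /dev/shm is a tmpfs mount and /proc is available; error handling
 *  omitted.]
 *
 *      #define _GNU_SOURCE
 *      #include <fcntl.h>
 *      #include <stdio.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              char path[64];
 *              int fd = open("/dev/shm", O_TMPFILE | O_RDWR, 0600);
 *
 *              write(fd, "data", 4);
 *              snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
 *              linkat(AT_FDCWD, path, AT_FDCWD, "/dev/shm/now-visible",
 *                     AT_SYMLINK_FOLLOW);
 *              return 0;
 *      }
 *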
31851da177e4SLinus Torvalds */ 31861da177e4SLinus Torvalds static int 31875ebb29beSChristian Brauner shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, 3188549c7297SChristian Brauner struct dentry *dentry, umode_t mode, dev_t dev) 31891da177e4SLinus Torvalds { 31900b0a0806SHugh Dickins struct inode *inode; 319171480663SCarlos Maiolino int error; 31921da177e4SLinus Torvalds 31937a80e5b8SGiuseppe Scrivano inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); 319471480663SCarlos Maiolino 319571480663SCarlos Maiolino if (IS_ERR(inode)) 319671480663SCarlos Maiolino return PTR_ERR(inode); 319771480663SCarlos Maiolino 3198feda821eSChristoph Hellwig error = simple_acl_create(dir, inode); 3199feda821eSChristoph Hellwig if (error) 3200feda821eSChristoph Hellwig goto out_iput; 32012a7dba39SEric Paris error = security_inode_init_security(inode, dir, 32029d8f13baSMimi Zohar &dentry->d_name, 32036d9d88d0SJarkko Sakkinen shmem_initxattrs, NULL); 3204feda821eSChristoph Hellwig if (error && error != -EOPNOTSUPP) 3205feda821eSChristoph Hellwig goto out_iput; 320637ec43cdSMimi Zohar 3207718deb6bSAl Viro error = 0; 32081da177e4SLinus Torvalds dir->i_size += BOGO_DIRENT_SIZE; 3209078cd827SDeepa Dinamani dir->i_ctime = dir->i_mtime = current_time(dir); 321036f05cabSJeff Layton inode_inc_iversion(dir); 32111da177e4SLinus Torvalds d_instantiate(dentry, inode); 32121da177e4SLinus Torvalds dget(dentry); /* Extra count - pin the dentry in core */ 32131da177e4SLinus Torvalds return error; 321471480663SCarlos Maiolino 3215feda821eSChristoph Hellwig out_iput: 3216feda821eSChristoph Hellwig iput(inode); 3217feda821eSChristoph Hellwig return error; 32181da177e4SLinus Torvalds } 32191da177e4SLinus Torvalds 322060545d0dSAl Viro static int 3221011e2b71SChristian Brauner shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, 3222863f144fSMiklos Szeredi struct file *file, umode_t mode) 322360545d0dSAl Viro { 322460545d0dSAl Viro struct inode *inode; 322571480663SCarlos Maiolino int error; 322660545d0dSAl Viro 32277a80e5b8SGiuseppe Scrivano inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); 322871480663SCarlos Maiolino 322971480663SCarlos Maiolino if (IS_ERR(inode)) { 323071480663SCarlos Maiolino error = PTR_ERR(inode); 323171480663SCarlos Maiolino goto err_out; 323271480663SCarlos Maiolino } 323371480663SCarlos Maiolino 323460545d0dSAl Viro error = security_inode_init_security(inode, dir, 323560545d0dSAl Viro NULL, 323660545d0dSAl Viro shmem_initxattrs, NULL); 3237feda821eSChristoph Hellwig if (error && error != -EOPNOTSUPP) 3238feda821eSChristoph Hellwig goto out_iput; 3239feda821eSChristoph Hellwig error = simple_acl_create(dir, inode); 3240feda821eSChristoph Hellwig if (error) 3241feda821eSChristoph Hellwig goto out_iput; 3242863f144fSMiklos Szeredi d_tmpfile(file, inode); 324371480663SCarlos Maiolino 324471480663SCarlos Maiolino err_out: 3245863f144fSMiklos Szeredi return finish_open_simple(file, error); 3246feda821eSChristoph Hellwig out_iput: 3247feda821eSChristoph Hellwig iput(inode); 3248feda821eSChristoph Hellwig return error; 324960545d0dSAl Viro } 325060545d0dSAl Viro 3251c54bd91eSChristian Brauner static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, 3252549c7297SChristian Brauner struct dentry *dentry, umode_t mode) 32531da177e4SLinus Torvalds { 32541da177e4SLinus Torvalds int error; 32551da177e4SLinus Torvalds 32567a80e5b8SGiuseppe Scrivano error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); 32577a80e5b8SGiuseppe Scrivano if (error) 
32581da177e4SLinus Torvalds return error; 3259d8c76e6fSDave Hansen inc_nlink(dir); 32601da177e4SLinus Torvalds return 0; 32611da177e4SLinus Torvalds } 32621da177e4SLinus Torvalds 32636c960e68SChristian Brauner static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, 3264549c7297SChristian Brauner struct dentry *dentry, umode_t mode, bool excl) 32651da177e4SLinus Torvalds { 32667a80e5b8SGiuseppe Scrivano return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); 32671da177e4SLinus Torvalds } 32681da177e4SLinus Torvalds 32691da177e4SLinus Torvalds /* 32701da177e4SLinus Torvalds * Link a file.. 32711da177e4SLinus Torvalds */ 32721da177e4SLinus Torvalds static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 32731da177e4SLinus Torvalds { 327475c3cfa8SDavid Howells struct inode *inode = d_inode(old_dentry); 327529b00e60SDarrick J. Wong int ret = 0; 32761da177e4SLinus Torvalds 32771da177e4SLinus Torvalds /* 32781da177e4SLinus Torvalds * No ordinary (disk based) filesystem counts links as inodes; 32791da177e4SLinus Torvalds * but each new link needs a new dentry, pinning lowmem, and 32801da177e4SLinus Torvalds * tmpfs dentries cannot be pruned until they are unlinked. 32811062af92SDarrick J. Wong * But if an O_TMPFILE file is linked into the tmpfs, the 32821062af92SDarrick J. Wong * first link must skip that, to get the accounting right. 32831da177e4SLinus Torvalds */ 32841062af92SDarrick J. Wong if (inode->i_nlink) { 3285e809d5f0SChris Down ret = shmem_reserve_inode(inode->i_sb, NULL); 32865b04c689SPavel Emelyanov if (ret) 32875b04c689SPavel Emelyanov goto out; 32881062af92SDarrick J. Wong } 32891da177e4SLinus Torvalds 32901da177e4SLinus Torvalds dir->i_size += BOGO_DIRENT_SIZE; 3291078cd827SDeepa Dinamani inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 329236f05cabSJeff Layton inode_inc_iversion(dir); 3293d8c76e6fSDave Hansen inc_nlink(inode); 32947de9c6eeSAl Viro ihold(inode); /* New dentry reference */ 32951da177e4SLinus Torvalds dget(dentry); /* Extra pinning count for the created dentry */ 32961da177e4SLinus Torvalds d_instantiate(dentry, inode); 32975b04c689SPavel Emelyanov out: 32985b04c689SPavel Emelyanov return ret; 32991da177e4SLinus Torvalds } 33001da177e4SLinus Torvalds 33011da177e4SLinus Torvalds static int shmem_unlink(struct inode *dir, struct dentry *dentry) 33021da177e4SLinus Torvalds { 330375c3cfa8SDavid Howells struct inode *inode = d_inode(dentry); 33041da177e4SLinus Torvalds 33055b04c689SPavel Emelyanov if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 33065b04c689SPavel Emelyanov shmem_free_inode(inode->i_sb); 33071da177e4SLinus Torvalds 33081da177e4SLinus Torvalds dir->i_size -= BOGO_DIRENT_SIZE; 3309078cd827SDeepa Dinamani inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 331036f05cabSJeff Layton inode_inc_iversion(dir); 33119a53c3a7SDave Hansen drop_nlink(inode); 33121da177e4SLinus Torvalds dput(dentry); /* Undo the count from "create" - this does all the work */ 33131da177e4SLinus Torvalds return 0; 33141da177e4SLinus Torvalds } 33151da177e4SLinus Torvalds 33161da177e4SLinus Torvalds static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 33171da177e4SLinus Torvalds { 33181da177e4SLinus Torvalds if (!simple_empty(dentry)) 33191da177e4SLinus Torvalds return -ENOTEMPTY; 33201da177e4SLinus Torvalds 332175c3cfa8SDavid Howells drop_nlink(d_inode(dentry)); 33229a53c3a7SDave Hansen drop_nlink(dir); 33231da177e4SLinus Torvalds return shmem_unlink(dir, dentry); 33241da177e4SLinus 
Torvalds } 33251da177e4SLinus Torvalds 3326e18275aeSChristian Brauner static int shmem_whiteout(struct mnt_idmap *idmap, 3327549c7297SChristian Brauner struct inode *old_dir, struct dentry *old_dentry) 332846fdb794SMiklos Szeredi { 332946fdb794SMiklos Szeredi struct dentry *whiteout; 333046fdb794SMiklos Szeredi int error; 333146fdb794SMiklos Szeredi 333246fdb794SMiklos Szeredi whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); 333346fdb794SMiklos Szeredi if (!whiteout) 333446fdb794SMiklos Szeredi return -ENOMEM; 333546fdb794SMiklos Szeredi 33367a80e5b8SGiuseppe Scrivano error = shmem_mknod(idmap, old_dir, whiteout, 333746fdb794SMiklos Szeredi S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); 333846fdb794SMiklos Szeredi dput(whiteout); 333946fdb794SMiklos Szeredi if (error) 334046fdb794SMiklos Szeredi return error; 334146fdb794SMiklos Szeredi 334246fdb794SMiklos Szeredi /* 334346fdb794SMiklos Szeredi * Cheat and hash the whiteout while the old dentry is still in 334446fdb794SMiklos Szeredi * place, instead of playing games with FS_RENAME_DOES_D_MOVE. 334546fdb794SMiklos Szeredi * 334646fdb794SMiklos Szeredi * d_lookup() will consistently find one of them at this point, 334746fdb794SMiklos Szeredi * not sure which one, but that isn't even important. 334846fdb794SMiklos Szeredi */ 334946fdb794SMiklos Szeredi d_rehash(whiteout); 335046fdb794SMiklos Szeredi return 0; 335146fdb794SMiklos Szeredi } 335246fdb794SMiklos Szeredi 33531da177e4SLinus Torvalds /* 33541da177e4SLinus Torvalds * The VFS layer already does all the dentry stuff for rename; 33551da177e4SLinus Torvalds * we just have to decrement the usage count for the target if 33561da177e4SLinus Torvalds * it exists so that the VFS layer correctly frees it when it 33571da177e4SLinus Torvalds * gets overwritten.
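 * RENAME_EXCHANGE is delegated to simple_rename_exchange(), and
 * RENAME_WHITEOUT is handled by first creating a whiteout entry via
 * shmem_whiteout() above.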
33581da177e4SLinus Torvalds */ 3359e18275aeSChristian Brauner static int shmem_rename2(struct mnt_idmap *idmap, 3360549c7297SChristian Brauner struct inode *old_dir, struct dentry *old_dentry, 3361549c7297SChristian Brauner struct inode *new_dir, struct dentry *new_dentry, 3362549c7297SChristian Brauner unsigned int flags) 33631da177e4SLinus Torvalds { 336475c3cfa8SDavid Howells struct inode *inode = d_inode(old_dentry); 33651da177e4SLinus Torvalds int they_are_dirs = S_ISDIR(inode->i_mode); 33661da177e4SLinus Torvalds 336746fdb794SMiklos Szeredi if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 33683b69ff51SMiklos Szeredi return -EINVAL; 33693b69ff51SMiklos Szeredi 337037456771SMiklos Szeredi if (flags & RENAME_EXCHANGE) 33716429e463SLorenz Bauer return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); 337237456771SMiklos Szeredi 33731da177e4SLinus Torvalds if (!simple_empty(new_dentry)) 33741da177e4SLinus Torvalds return -ENOTEMPTY; 33751da177e4SLinus Torvalds 337646fdb794SMiklos Szeredi if (flags & RENAME_WHITEOUT) { 337746fdb794SMiklos Szeredi int error; 337846fdb794SMiklos Szeredi 33797a80e5b8SGiuseppe Scrivano error = shmem_whiteout(idmap, old_dir, old_dentry); 338046fdb794SMiklos Szeredi if (error) 338146fdb794SMiklos Szeredi return error; 338246fdb794SMiklos Szeredi } 338346fdb794SMiklos Szeredi 338475c3cfa8SDavid Howells if (d_really_is_positive(new_dentry)) { 33851da177e4SLinus Torvalds (void) shmem_unlink(new_dir, new_dentry); 3386b928095bSMiklos Szeredi if (they_are_dirs) { 338775c3cfa8SDavid Howells drop_nlink(d_inode(new_dentry)); 33889a53c3a7SDave Hansen drop_nlink(old_dir); 3389b928095bSMiklos Szeredi } 33901da177e4SLinus Torvalds } else if (they_are_dirs) { 33919a53c3a7SDave Hansen drop_nlink(old_dir); 3392d8c76e6fSDave Hansen inc_nlink(new_dir); 33931da177e4SLinus Torvalds } 33941da177e4SLinus Torvalds 33951da177e4SLinus Torvalds old_dir->i_size -= BOGO_DIRENT_SIZE; 33961da177e4SLinus Torvalds new_dir->i_size += BOGO_DIRENT_SIZE; 33971da177e4SLinus Torvalds old_dir->i_ctime = old_dir->i_mtime = 33981da177e4SLinus Torvalds new_dir->i_ctime = new_dir->i_mtime = 3399078cd827SDeepa Dinamani inode->i_ctime = current_time(old_dir); 340036f05cabSJeff Layton inode_inc_iversion(old_dir); 340136f05cabSJeff Layton inode_inc_iversion(new_dir); 34021da177e4SLinus Torvalds return 0; 34031da177e4SLinus Torvalds } 34041da177e4SLinus Torvalds 34057a77db95SChristian Brauner static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, 3406549c7297SChristian Brauner struct dentry *dentry, const char *symname) 34071da177e4SLinus Torvalds { 34081da177e4SLinus Torvalds int error; 34091da177e4SLinus Torvalds int len; 34101da177e4SLinus Torvalds struct inode *inode; 34117ad0414bSMatthew Wilcox (Oracle) struct folio *folio; 34121da177e4SLinus Torvalds 34131da177e4SLinus Torvalds len = strlen(symname) + 1; 341409cbfeafSKirill A. 
Shutemov if (len > PAGE_SIZE) 34151da177e4SLinus Torvalds return -ENAMETOOLONG; 34161da177e4SLinus Torvalds 34177a80e5b8SGiuseppe Scrivano inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, 34180825a6f9SJoe Perches VM_NORESERVE); 341971480663SCarlos Maiolino 342071480663SCarlos Maiolino if (IS_ERR(inode)) 342171480663SCarlos Maiolino return PTR_ERR(inode); 34221da177e4SLinus Torvalds 34239d8f13baSMimi Zohar error = security_inode_init_security(inode, dir, &dentry->d_name, 34246d9d88d0SJarkko Sakkinen shmem_initxattrs, NULL); 3425*23a31d87SChuck Lever if (error && error != -EOPNOTSUPP) 3426*23a31d87SChuck Lever goto out_iput; 3427570bc1c2SStephen Smalley 34281da177e4SLinus Torvalds inode->i_size = len-1; 342969f07ec9SHugh Dickins if (len <= SHORT_SYMLINK_LEN) { 34303ed47db3SAl Viro inode->i_link = kmemdup(symname, len, GFP_KERNEL); 34313ed47db3SAl Viro if (!inode->i_link) { 3432*23a31d87SChuck Lever error = -ENOMEM; 3433*23a31d87SChuck Lever goto out_iput; 343469f07ec9SHugh Dickins } 343569f07ec9SHugh Dickins inode->i_op = &shmem_short_symlink_operations; 34361da177e4SLinus Torvalds } else { 3437e8ecde25SAl Viro inode_nohighmem(inode); 34387ad0414bSMatthew Wilcox (Oracle) error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); 3439*23a31d87SChuck Lever if (error) 3440*23a31d87SChuck Lever goto out_iput; 344114fcc23fSHugh Dickins inode->i_mapping->a_ops = &shmem_aops; 34421da177e4SLinus Torvalds inode->i_op = &shmem_symlink_inode_operations; 34437ad0414bSMatthew Wilcox (Oracle) memcpy(folio_address(folio), symname, len); 34447ad0414bSMatthew Wilcox (Oracle) folio_mark_uptodate(folio); 34457ad0414bSMatthew Wilcox (Oracle) folio_mark_dirty(folio); 34467ad0414bSMatthew Wilcox (Oracle) folio_unlock(folio); 34477ad0414bSMatthew Wilcox (Oracle) folio_put(folio); 34481da177e4SLinus Torvalds } 34491da177e4SLinus Torvalds dir->i_size += BOGO_DIRENT_SIZE; 3450078cd827SDeepa Dinamani dir->i_ctime = dir->i_mtime = current_time(dir); 345136f05cabSJeff Layton inode_inc_iversion(dir); 34521da177e4SLinus Torvalds d_instantiate(dentry, inode); 34531da177e4SLinus Torvalds dget(dentry); 34541da177e4SLinus Torvalds return 0; 3455*23a31d87SChuck Lever out_iput: 3456*23a31d87SChuck Lever iput(inode); 3457*23a31d87SChuck Lever return error; 34581da177e4SLinus Torvalds } 34591da177e4SLinus Torvalds 3460fceef393SAl Viro static void shmem_put_link(void *arg) 3461fceef393SAl Viro { 3462e4b57722SMatthew Wilcox (Oracle) folio_mark_accessed(arg); 3463e4b57722SMatthew Wilcox (Oracle) folio_put(arg); 3464fceef393SAl Viro } 3465fceef393SAl Viro 34666b255391SAl Viro static const char *shmem_get_link(struct dentry *dentry, 3467fceef393SAl Viro struct inode *inode, 3468fceef393SAl Viro struct delayed_call *done) 34691da177e4SLinus Torvalds { 3470e4b57722SMatthew Wilcox (Oracle) struct folio *folio = NULL; 34716b255391SAl Viro int error; 3472e4b57722SMatthew Wilcox (Oracle) 34736a6c9904SAl Viro if (!dentry) { 3474e4b57722SMatthew Wilcox (Oracle) folio = filemap_get_folio(inode->i_mapping, 0); 347566dabbb6SChristoph Hellwig if (IS_ERR(folio)) 34766b255391SAl Viro return ERR_PTR(-ECHILD); 34777459c149SMatthew Wilcox (Oracle) if (PageHWPoison(folio_page(folio, 0)) || 3478e4b57722SMatthew Wilcox (Oracle) !folio_test_uptodate(folio)) { 3479e4b57722SMatthew Wilcox (Oracle) folio_put(folio); 34806a6c9904SAl Viro return ERR_PTR(-ECHILD); 34816a6c9904SAl Viro } 34826a6c9904SAl Viro } else { 3483e4b57722SMatthew Wilcox (Oracle) error = shmem_get_folio(inode, 0, &folio, SGP_READ); 3484680baacbSAl Viro if (error) 
3485680baacbSAl Viro return ERR_PTR(error); 3486e4b57722SMatthew Wilcox (Oracle) if (!folio) 3487a7605426SYang Shi return ERR_PTR(-ECHILD); 34887459c149SMatthew Wilcox (Oracle) if (PageHWPoison(folio_page(folio, 0))) { 3489e4b57722SMatthew Wilcox (Oracle) folio_unlock(folio); 3490e4b57722SMatthew Wilcox (Oracle) folio_put(folio); 3491a7605426SYang Shi return ERR_PTR(-ECHILD); 3492a7605426SYang Shi } 3493e4b57722SMatthew Wilcox (Oracle) folio_unlock(folio); 34941da177e4SLinus Torvalds } 3495e4b57722SMatthew Wilcox (Oracle) set_delayed_call(done, shmem_put_link, folio); 3496e4b57722SMatthew Wilcox (Oracle) return folio_address(folio); 34971da177e4SLinus Torvalds } 34981da177e4SLinus Torvalds 3499b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 3500e408e695STheodore Ts'o 3501e408e695STheodore Ts'o static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) 3502e408e695STheodore Ts'o { 3503e408e695STheodore Ts'o struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3504e408e695STheodore Ts'o 3505e408e695STheodore Ts'o fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); 3506e408e695STheodore Ts'o 3507e408e695STheodore Ts'o return 0; 3508e408e695STheodore Ts'o } 3509e408e695STheodore Ts'o 35108782a9aeSChristian Brauner static int shmem_fileattr_set(struct mnt_idmap *idmap, 3511e408e695STheodore Ts'o struct dentry *dentry, struct fileattr *fa) 3512e408e695STheodore Ts'o { 3513e408e695STheodore Ts'o struct inode *inode = d_inode(dentry); 3514e408e695STheodore Ts'o struct shmem_inode_info *info = SHMEM_I(inode); 3515e408e695STheodore Ts'o 3516e408e695STheodore Ts'o if (fileattr_has_fsx(fa)) 3517e408e695STheodore Ts'o return -EOPNOTSUPP; 3518cb241339SHugh Dickins if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) 3519cb241339SHugh Dickins return -EOPNOTSUPP; 3520e408e695STheodore Ts'o 3521e408e695STheodore Ts'o info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | 3522e408e695STheodore Ts'o (fa->flags & SHMEM_FL_USER_MODIFIABLE); 3523e408e695STheodore Ts'o 3524cb241339SHugh Dickins shmem_set_inode_flags(inode, info->fsflags); 3525e408e695STheodore Ts'o inode->i_ctime = current_time(inode); 352636f05cabSJeff Layton inode_inc_iversion(inode); 3527e408e695STheodore Ts'o return 0; 3528e408e695STheodore Ts'o } 3529e408e695STheodore Ts'o 3530b09e0fa4SEric Paris /* 3531b09e0fa4SEric Paris * Superblocks without xattr inode operations may get some security.* xattr 3532b09e0fa4SEric Paris * support from the LSM "for free". As soon as we have any other xattrs 3533b09e0fa4SEric Paris * like ACLs, we also need to implement the security.* handlers at 3534b09e0fa4SEric Paris * filesystem level, though. 3535b09e0fa4SEric Paris */ 3536b09e0fa4SEric Paris 35376d9d88d0SJarkko Sakkinen /* 35386d9d88d0SJarkko Sakkinen * Callback for security_inode_init_security() for acquiring xattrs. 
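 * Each xattr supplied by the LSM is duplicated into a simple_xattr whose
 * name gets the "security." prefix prepended, then added to the inode's
 * in-memory xattr list.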
35396d9d88d0SJarkko Sakkinen */ 35406d9d88d0SJarkko Sakkinen static int shmem_initxattrs(struct inode *inode, 35416d9d88d0SJarkko Sakkinen const struct xattr *xattr_array, 35426d9d88d0SJarkko Sakkinen void *fs_info) 35436d9d88d0SJarkko Sakkinen { 35446d9d88d0SJarkko Sakkinen struct shmem_inode_info *info = SHMEM_I(inode); 35456d9d88d0SJarkko Sakkinen const struct xattr *xattr; 354638f38657SAristeu Rozanski struct simple_xattr *new_xattr; 35476d9d88d0SJarkko Sakkinen size_t len; 35486d9d88d0SJarkko Sakkinen 35496d9d88d0SJarkko Sakkinen for (xattr = xattr_array; xattr->name != NULL; xattr++) { 355038f38657SAristeu Rozanski new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 35516d9d88d0SJarkko Sakkinen if (!new_xattr) 35526d9d88d0SJarkko Sakkinen return -ENOMEM; 35536d9d88d0SJarkko Sakkinen 35546d9d88d0SJarkko Sakkinen len = strlen(xattr->name) + 1; 35556d9d88d0SJarkko Sakkinen new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 35566d9d88d0SJarkko Sakkinen GFP_KERNEL); 35576d9d88d0SJarkko Sakkinen if (!new_xattr->name) { 35583bef735aSChengguang Xu kvfree(new_xattr); 35596d9d88d0SJarkko Sakkinen return -ENOMEM; 35606d9d88d0SJarkko Sakkinen } 35616d9d88d0SJarkko Sakkinen 35626d9d88d0SJarkko Sakkinen memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 35636d9d88d0SJarkko Sakkinen XATTR_SECURITY_PREFIX_LEN); 35646d9d88d0SJarkko Sakkinen memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 35656d9d88d0SJarkko Sakkinen xattr->name, len); 35666d9d88d0SJarkko Sakkinen 35673b4c7bc0SChristian Brauner simple_xattr_add(&info->xattrs, new_xattr); 35686d9d88d0SJarkko Sakkinen } 35696d9d88d0SJarkko Sakkinen 35706d9d88d0SJarkko Sakkinen return 0; 35716d9d88d0SJarkko Sakkinen } 35726d9d88d0SJarkko Sakkinen 3573aa7c5241SAndreas Gruenbacher static int shmem_xattr_handler_get(const struct xattr_handler *handler, 3574b296821aSAl Viro struct dentry *unused, struct inode *inode, 3575b296821aSAl Viro const char *name, void *buffer, size_t size) 3576aa7c5241SAndreas Gruenbacher { 3577b296821aSAl Viro struct shmem_inode_info *info = SHMEM_I(inode); 3578aa7c5241SAndreas Gruenbacher 3579aa7c5241SAndreas Gruenbacher name = xattr_full_name(handler, name); 3580aa7c5241SAndreas Gruenbacher return simple_xattr_get(&info->xattrs, name, buffer, size); 3581aa7c5241SAndreas Gruenbacher } 3582aa7c5241SAndreas Gruenbacher 3583aa7c5241SAndreas Gruenbacher static int shmem_xattr_handler_set(const struct xattr_handler *handler, 358439f60c1cSChristian Brauner struct mnt_idmap *idmap, 358559301226SAl Viro struct dentry *unused, struct inode *inode, 358659301226SAl Viro const char *name, const void *value, 358759301226SAl Viro size_t size, int flags) 3588aa7c5241SAndreas Gruenbacher { 358959301226SAl Viro struct shmem_inode_info *info = SHMEM_I(inode); 359036f05cabSJeff Layton int err; 3591aa7c5241SAndreas Gruenbacher 3592aa7c5241SAndreas Gruenbacher name = xattr_full_name(handler, name); 359336f05cabSJeff Layton err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); 359436f05cabSJeff Layton if (!err) { 359536f05cabSJeff Layton inode->i_ctime = current_time(inode); 359636f05cabSJeff Layton inode_inc_iversion(inode); 359736f05cabSJeff Layton } 359836f05cabSJeff Layton return err; 3599aa7c5241SAndreas Gruenbacher } 3600aa7c5241SAndreas Gruenbacher 3601aa7c5241SAndreas Gruenbacher static const struct xattr_handler shmem_security_xattr_handler = { 3602aa7c5241SAndreas Gruenbacher .prefix = XATTR_SECURITY_PREFIX, 3603aa7c5241SAndreas Gruenbacher .get = shmem_xattr_handler_get, 3604aa7c5241SAndreas Gruenbacher 
.set = shmem_xattr_handler_set, 3605aa7c5241SAndreas Gruenbacher }; 3606aa7c5241SAndreas Gruenbacher 3607aa7c5241SAndreas Gruenbacher static const struct xattr_handler shmem_trusted_xattr_handler = { 3608aa7c5241SAndreas Gruenbacher .prefix = XATTR_TRUSTED_PREFIX, 3609aa7c5241SAndreas Gruenbacher .get = shmem_xattr_handler_get, 3610aa7c5241SAndreas Gruenbacher .set = shmem_xattr_handler_set, 3611aa7c5241SAndreas Gruenbacher }; 3612aa7c5241SAndreas Gruenbacher 3613b09e0fa4SEric Paris static const struct xattr_handler *shmem_xattr_handlers[] = { 3614aa7c5241SAndreas Gruenbacher &shmem_security_xattr_handler, 3615aa7c5241SAndreas Gruenbacher &shmem_trusted_xattr_handler, 3616b09e0fa4SEric Paris NULL 3617b09e0fa4SEric Paris }; 3618b09e0fa4SEric Paris 3619b09e0fa4SEric Paris static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 3620b09e0fa4SEric Paris { 362175c3cfa8SDavid Howells struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3622786534b9SAndreas Gruenbacher return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); 3623b09e0fa4SEric Paris } 3624b09e0fa4SEric Paris #endif /* CONFIG_TMPFS_XATTR */ 3625b09e0fa4SEric Paris 362669f07ec9SHugh Dickins static const struct inode_operations shmem_short_symlink_operations = { 3627f7cd16a5SXavier Roche .getattr = shmem_getattr, 3628e09764cfSCarlos Maiolino .setattr = shmem_setattr, 36296b255391SAl Viro .get_link = simple_get_link, 3630b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 3631b09e0fa4SEric Paris .listxattr = shmem_listxattr, 3632b09e0fa4SEric Paris #endif 36331da177e4SLinus Torvalds }; 36341da177e4SLinus Torvalds 363592e1d5beSArjan van de Ven static const struct inode_operations shmem_symlink_inode_operations = { 3636f7cd16a5SXavier Roche .getattr = shmem_getattr, 3637e09764cfSCarlos Maiolino .setattr = shmem_setattr, 36386b255391SAl Viro .get_link = shmem_get_link, 3639b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 3640b09e0fa4SEric Paris .listxattr = shmem_listxattr, 364139f0247dSAndreas Gruenbacher #endif 3642b09e0fa4SEric Paris }; 364339f0247dSAndreas Gruenbacher 364491828a40SDavid M. Grimes static struct dentry *shmem_get_parent(struct dentry *child) 364591828a40SDavid M. Grimes { 364691828a40SDavid M. Grimes return ERR_PTR(-ESTALE); 364791828a40SDavid M. Grimes } 364891828a40SDavid M. Grimes 364991828a40SDavid M. Grimes static int shmem_match(struct inode *ino, void *vfh) 365091828a40SDavid M. Grimes { 365191828a40SDavid M. Grimes __u32 *fh = vfh; 365291828a40SDavid M. Grimes __u64 inum = fh[2]; 365391828a40SDavid M. Grimes inum = (inum << 32) | fh[1]; 365491828a40SDavid M. Grimes return ino->i_ino == inum && fh[0] == ino->i_generation; 365591828a40SDavid M. Grimes } 365691828a40SDavid M. Grimes 365712ba780dSAmir Goldstein /* Find any alias of inode, but prefer a hashed alias */ 365812ba780dSAmir Goldstein static struct dentry *shmem_find_alias(struct inode *inode) 365912ba780dSAmir Goldstein { 366012ba780dSAmir Goldstein struct dentry *alias = d_find_alias(inode); 366112ba780dSAmir Goldstein 366212ba780dSAmir Goldstein return alias ?: d_find_any_alias(inode); 366312ba780dSAmir Goldstein } 366412ba780dSAmir Goldstein 366512ba780dSAmir Goldstein 3666480b116cSChristoph Hellwig static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 3667480b116cSChristoph Hellwig struct fid *fid, int fh_len, int fh_type) 366891828a40SDavid M. Grimes { 366991828a40SDavid M. 
Grimes struct inode *inode; 3670480b116cSChristoph Hellwig struct dentry *dentry = NULL; 367135c2a7f4SHugh Dickins u64 inum; 367291828a40SDavid M. Grimes 3673480b116cSChristoph Hellwig if (fh_len < 3) 3674480b116cSChristoph Hellwig return NULL; 3675480b116cSChristoph Hellwig 367635c2a7f4SHugh Dickins inum = fid->raw[2]; 367735c2a7f4SHugh Dickins inum = (inum << 32) | fid->raw[1]; 367835c2a7f4SHugh Dickins 3679480b116cSChristoph Hellwig inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 3680480b116cSChristoph Hellwig shmem_match, fid->raw); 368191828a40SDavid M. Grimes if (inode) { 368212ba780dSAmir Goldstein dentry = shmem_find_alias(inode); 368391828a40SDavid M. Grimes iput(inode); 368491828a40SDavid M. Grimes } 368591828a40SDavid M. Grimes 3686480b116cSChristoph Hellwig return dentry; 368791828a40SDavid M. Grimes } 368891828a40SDavid M. Grimes 3689b0b0382bSAl Viro static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 3690b0b0382bSAl Viro struct inode *parent) 369191828a40SDavid M. Grimes { 36925fe0c237SAneesh Kumar K.V if (*len < 3) { 36935fe0c237SAneesh Kumar K.V *len = 3; 369494e07a75SNamjae Jeon return FILEID_INVALID; 36955fe0c237SAneesh Kumar K.V } 369691828a40SDavid M. Grimes 36971d3382cbSAl Viro if (inode_unhashed(inode)) { 369891828a40SDavid M. Grimes /* Unfortunately insert_inode_hash is not idempotent, 369991828a40SDavid M. Grimes * so as we hash inodes here rather than at creation 370091828a40SDavid M. Grimes * time, we need a lock to ensure we only try 370191828a40SDavid M. Grimes * to do it once 370291828a40SDavid M. Grimes */ 370391828a40SDavid M. Grimes static DEFINE_SPINLOCK(lock); 370491828a40SDavid M. Grimes spin_lock(&lock); 37051d3382cbSAl Viro if (inode_unhashed(inode)) 370691828a40SDavid M. Grimes __insert_inode_hash(inode, 370791828a40SDavid M. Grimes inode->i_ino + inode->i_generation); 370891828a40SDavid M. Grimes spin_unlock(&lock); 370991828a40SDavid M. Grimes } 371091828a40SDavid M. Grimes 371191828a40SDavid M. Grimes fh[0] = inode->i_generation; 371291828a40SDavid M. Grimes fh[1] = inode->i_ino; 371391828a40SDavid M. Grimes fh[2] = ((__u64)inode->i_ino) >> 32; 371491828a40SDavid M. Grimes 371591828a40SDavid M. Grimes *len = 3; 371691828a40SDavid M. Grimes return 1; 371791828a40SDavid M. Grimes } 371891828a40SDavid M. Grimes 371939655164SChristoph Hellwig static const struct export_operations shmem_export_ops = { 372091828a40SDavid M. Grimes .get_parent = shmem_get_parent, 372191828a40SDavid M. Grimes .encode_fh = shmem_encode_fh, 3722480b116cSChristoph Hellwig .fh_to_dentry = shmem_fh_to_dentry, 372391828a40SDavid M. Grimes }; 372491828a40SDavid M. 
Grimes 3725626c3920SAl Viro enum shmem_param { 3726626c3920SAl Viro Opt_gid, 3727626c3920SAl Viro Opt_huge, 3728626c3920SAl Viro Opt_mode, 3729626c3920SAl Viro Opt_mpol, 3730626c3920SAl Viro Opt_nr_blocks, 3731626c3920SAl Viro Opt_nr_inodes, 3732626c3920SAl Viro Opt_size, 3733626c3920SAl Viro Opt_uid, 3734ea3271f7SChris Down Opt_inode32, 3735ea3271f7SChris Down Opt_inode64, 37362c6efe9cSLuis Chamberlain Opt_noswap, 3737e09764cfSCarlos Maiolino Opt_quota, 3738e09764cfSCarlos Maiolino Opt_usrquota, 3739e09764cfSCarlos Maiolino Opt_grpquota, 3740de4c0e7cSLukas Czerner Opt_usrquota_block_hardlimit, 3741de4c0e7cSLukas Czerner Opt_usrquota_inode_hardlimit, 3742de4c0e7cSLukas Czerner Opt_grpquota_block_hardlimit, 3743de4c0e7cSLukas Czerner Opt_grpquota_inode_hardlimit, 3744626c3920SAl Viro }; 37451da177e4SLinus Torvalds 37465eede625SAl Viro static const struct constant_table shmem_param_enums_huge[] = { 37472710c957SAl Viro {"never", SHMEM_HUGE_NEVER }, 37482710c957SAl Viro {"always", SHMEM_HUGE_ALWAYS }, 37492710c957SAl Viro {"within_size", SHMEM_HUGE_WITHIN_SIZE }, 37502710c957SAl Viro {"advise", SHMEM_HUGE_ADVISE }, 37512710c957SAl Viro {} 37522710c957SAl Viro }; 37532710c957SAl Viro 3754d7167b14SAl Viro const struct fs_parameter_spec shmem_fs_parameters[] = { 3755626c3920SAl Viro fsparam_u32 ("gid", Opt_gid), 37562710c957SAl Viro fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), 3757626c3920SAl Viro fsparam_u32oct("mode", Opt_mode), 3758626c3920SAl Viro fsparam_string("mpol", Opt_mpol), 3759626c3920SAl Viro fsparam_string("nr_blocks", Opt_nr_blocks), 3760626c3920SAl Viro fsparam_string("nr_inodes", Opt_nr_inodes), 3761626c3920SAl Viro fsparam_string("size", Opt_size), 3762626c3920SAl Viro fsparam_u32 ("uid", Opt_uid), 3763ea3271f7SChris Down fsparam_flag ("inode32", Opt_inode32), 3764ea3271f7SChris Down fsparam_flag ("inode64", Opt_inode64), 37652c6efe9cSLuis Chamberlain fsparam_flag ("noswap", Opt_noswap), 3766e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 3767e09764cfSCarlos Maiolino fsparam_flag ("quota", Opt_quota), 3768e09764cfSCarlos Maiolino fsparam_flag ("usrquota", Opt_usrquota), 3769e09764cfSCarlos Maiolino fsparam_flag ("grpquota", Opt_grpquota), 3770de4c0e7cSLukas Czerner fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), 3771de4c0e7cSLukas Czerner fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), 3772de4c0e7cSLukas Czerner fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), 3773de4c0e7cSLukas Czerner fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), 3774e09764cfSCarlos Maiolino #endif 3775626c3920SAl Viro {} 3776626c3920SAl Viro }; 3777626c3920SAl Viro 3778f3235626SDavid Howells static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) 3779626c3920SAl Viro { 3780f3235626SDavid Howells struct shmem_options *ctx = fc->fs_private; 3781626c3920SAl Viro struct fs_parse_result result; 3782e04dc423SAl Viro unsigned long long size; 3783626c3920SAl Viro char *rest; 3784626c3920SAl Viro int opt; 3785626c3920SAl Viro 3786d7167b14SAl Viro opt = fs_parse(fc, shmem_fs_parameters, param, &result); 3787f3235626SDavid Howells if (opt < 0) 3788626c3920SAl Viro return opt; 3789626c3920SAl Viro 3790626c3920SAl Viro switch (opt) { 3791626c3920SAl Viro case Opt_size: 3792626c3920SAl Viro size = memparse(param->string, &rest); 3793e04dc423SAl Viro if (*rest == '%') { 3794e04dc423SAl Viro size <<= PAGE_SHIFT; 3795e04dc423SAl Viro size *= totalram_pages(); 3796e04dc423SAl Viro 
do_div(size, 100); 3797e04dc423SAl Viro rest++; 3798e04dc423SAl Viro } 3799e04dc423SAl Viro if (*rest) 3800626c3920SAl Viro goto bad_value; 3801e04dc423SAl Viro ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); 3802e04dc423SAl Viro ctx->seen |= SHMEM_SEEN_BLOCKS; 3803626c3920SAl Viro break; 3804626c3920SAl Viro case Opt_nr_blocks: 3805626c3920SAl Viro ctx->blocks = memparse(param->string, &rest); 38060c98c8e1SZhaoLong Wang if (*rest || ctx->blocks > S64_MAX) 3807626c3920SAl Viro goto bad_value; 3808e04dc423SAl Viro ctx->seen |= SHMEM_SEEN_BLOCKS; 3809626c3920SAl Viro break; 3810626c3920SAl Viro case Opt_nr_inodes: 3811626c3920SAl Viro ctx->inodes = memparse(param->string, &rest); 3812e04dc423SAl Viro if (*rest) 3813626c3920SAl Viro goto bad_value; 3814e04dc423SAl Viro ctx->seen |= SHMEM_SEEN_INODES; 3815626c3920SAl Viro break; 3816626c3920SAl Viro case Opt_mode: 3817626c3920SAl Viro ctx->mode = result.uint_32 & 07777; 3818626c3920SAl Viro break; 3819626c3920SAl Viro case Opt_uid: 3820626c3920SAl Viro ctx->uid = make_kuid(current_user_ns(), result.uint_32); 3821e04dc423SAl Viro if (!uid_valid(ctx->uid)) 3822626c3920SAl Viro goto bad_value; 3823626c3920SAl Viro break; 3824626c3920SAl Viro case Opt_gid: 3825626c3920SAl Viro ctx->gid = make_kgid(current_user_ns(), result.uint_32); 3826e04dc423SAl Viro if (!gid_valid(ctx->gid)) 3827626c3920SAl Viro goto bad_value; 3828626c3920SAl Viro break; 3829626c3920SAl Viro case Opt_huge: 3830626c3920SAl Viro ctx->huge = result.uint_32; 3831626c3920SAl Viro if (ctx->huge != SHMEM_HUGE_NEVER && 3832396bcc52SMatthew Wilcox (Oracle) !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 3833626c3920SAl Viro has_transparent_hugepage())) 3834626c3920SAl Viro goto unsupported_parameter; 3835e04dc423SAl Viro ctx->seen |= SHMEM_SEEN_HUGE; 3836626c3920SAl Viro break; 3837626c3920SAl Viro case Opt_mpol: 3838626c3920SAl Viro if (IS_ENABLED(CONFIG_NUMA)) { 3839e04dc423SAl Viro mpol_put(ctx->mpol); 3840e04dc423SAl Viro ctx->mpol = NULL; 3841626c3920SAl Viro if (mpol_parse_str(param->string, &ctx->mpol)) 3842626c3920SAl Viro goto bad_value; 3843626c3920SAl Viro break; 3844626c3920SAl Viro } 3845626c3920SAl Viro goto unsupported_parameter; 3846ea3271f7SChris Down case Opt_inode32: 3847ea3271f7SChris Down ctx->full_inums = false; 3848ea3271f7SChris Down ctx->seen |= SHMEM_SEEN_INUMS; 3849ea3271f7SChris Down break; 3850ea3271f7SChris Down case Opt_inode64: 3851ea3271f7SChris Down if (sizeof(ino_t) < 8) { 3852ea3271f7SChris Down return invalfc(fc, 3853ea3271f7SChris Down "Cannot use inode64 with <64bit inums in kernel\n"); 3854ea3271f7SChris Down } 3855ea3271f7SChris Down ctx->full_inums = true; 3856ea3271f7SChris Down ctx->seen |= SHMEM_SEEN_INUMS; 3857ea3271f7SChris Down break; 38582c6efe9cSLuis Chamberlain case Opt_noswap: 385901106e14SChristian Brauner if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { 386001106e14SChristian Brauner return invalfc(fc, 386101106e14SChristian Brauner "Turning off swap in unprivileged tmpfs mounts unsupported"); 386201106e14SChristian Brauner } 38632c6efe9cSLuis Chamberlain ctx->noswap = true; 38642c6efe9cSLuis Chamberlain ctx->seen |= SHMEM_SEEN_NOSWAP; 38652c6efe9cSLuis Chamberlain break; 3866e09764cfSCarlos Maiolino case Opt_quota: 3867e09764cfSCarlos Maiolino if (fc->user_ns != &init_user_ns) 3868e09764cfSCarlos Maiolino return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 3869e09764cfSCarlos Maiolino ctx->seen |= SHMEM_SEEN_QUOTA; 3870e09764cfSCarlos Maiolino ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); 
3871e09764cfSCarlos Maiolino break; 3872e09764cfSCarlos Maiolino case Opt_usrquota: 3873e09764cfSCarlos Maiolino if (fc->user_ns != &init_user_ns) 3874e09764cfSCarlos Maiolino return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 3875e09764cfSCarlos Maiolino ctx->seen |= SHMEM_SEEN_QUOTA; 3876e09764cfSCarlos Maiolino ctx->quota_types |= QTYPE_MASK_USR; 3877e09764cfSCarlos Maiolino break; 3878e09764cfSCarlos Maiolino case Opt_grpquota: 3879e09764cfSCarlos Maiolino if (fc->user_ns != &init_user_ns) 3880e09764cfSCarlos Maiolino return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 3881e09764cfSCarlos Maiolino ctx->seen |= SHMEM_SEEN_QUOTA; 3882e09764cfSCarlos Maiolino ctx->quota_types |= QTYPE_MASK_GRP; 3883e09764cfSCarlos Maiolino break; 3884de4c0e7cSLukas Czerner case Opt_usrquota_block_hardlimit: 3885de4c0e7cSLukas Czerner size = memparse(param->string, &rest); 3886de4c0e7cSLukas Czerner if (*rest || !size) 3887de4c0e7cSLukas Czerner goto bad_value; 3888de4c0e7cSLukas Czerner if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) 3889de4c0e7cSLukas Czerner return invalfc(fc, 3890de4c0e7cSLukas Czerner "User quota block hardlimit too large."); 3891de4c0e7cSLukas Czerner ctx->qlimits.usrquota_bhardlimit = size; 3892de4c0e7cSLukas Czerner break; 3893de4c0e7cSLukas Czerner case Opt_grpquota_block_hardlimit: 3894de4c0e7cSLukas Czerner size = memparse(param->string, &rest); 3895de4c0e7cSLukas Czerner if (*rest || !size) 3896de4c0e7cSLukas Czerner goto bad_value; 3897de4c0e7cSLukas Czerner if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) 3898de4c0e7cSLukas Czerner return invalfc(fc, 3899de4c0e7cSLukas Czerner "Group quota block hardlimit too large."); 3900de4c0e7cSLukas Czerner ctx->qlimits.grpquota_bhardlimit = size; 3901de4c0e7cSLukas Czerner break; 3902de4c0e7cSLukas Czerner case Opt_usrquota_inode_hardlimit: 3903de4c0e7cSLukas Czerner size = memparse(param->string, &rest); 3904de4c0e7cSLukas Czerner if (*rest || !size) 3905de4c0e7cSLukas Czerner goto bad_value; 3906de4c0e7cSLukas Czerner if (size > SHMEM_QUOTA_MAX_INO_LIMIT) 3907de4c0e7cSLukas Czerner return invalfc(fc, 3908de4c0e7cSLukas Czerner "User quota inode hardlimit too large."); 3909de4c0e7cSLukas Czerner ctx->qlimits.usrquota_ihardlimit = size; 3910de4c0e7cSLukas Czerner break; 3911de4c0e7cSLukas Czerner case Opt_grpquota_inode_hardlimit: 3912de4c0e7cSLukas Czerner size = memparse(param->string, &rest); 3913de4c0e7cSLukas Czerner if (*rest || !size) 3914de4c0e7cSLukas Czerner goto bad_value; 3915de4c0e7cSLukas Czerner if (size > SHMEM_QUOTA_MAX_INO_LIMIT) 3916de4c0e7cSLukas Czerner return invalfc(fc, 3917de4c0e7cSLukas Czerner "Group quota inode hardlimit too large."); 3918de4c0e7cSLukas Czerner ctx->qlimits.grpquota_ihardlimit = size; 3919de4c0e7cSLukas Czerner break; 3920e04dc423SAl Viro } 3921e04dc423SAl Viro return 0; 3922e04dc423SAl Viro 3923626c3920SAl Viro unsupported_parameter: 3924f35aa2bcSAl Viro return invalfc(fc, "Unsupported parameter '%s'", param->key); 3925626c3920SAl Viro bad_value: 3926f35aa2bcSAl Viro return invalfc(fc, "Bad value for '%s'", param->key); 3927e04dc423SAl Viro } 3928e04dc423SAl Viro 3929f3235626SDavid Howells static int shmem_parse_options(struct fs_context *fc, void *data) 3930e04dc423SAl Viro { 3931f3235626SDavid Howells char *options = data; 3932f3235626SDavid Howells 393333f37c64SAl Viro if (options) { 393433f37c64SAl Viro int err = security_sb_eat_lsm_opts(options, &fc->security); 393533f37c64SAl Viro if (err) 393633f37c64SAl Viro return err; 393733f37c64SAl Viro } 393833f37c64SAl 
Viro 3939b00dc3adSHugh Dickins while (options != NULL) { 3940626c3920SAl Viro char *this_char = options; 3941b00dc3adSHugh Dickins for (;;) { 3942b00dc3adSHugh Dickins /* 3943b00dc3adSHugh Dickins * NUL-terminate this option: unfortunately, 3944b00dc3adSHugh Dickins * mount options form a comma-separated list, 3945b00dc3adSHugh Dickins * but mpol's nodelist may also contain commas. 3946b00dc3adSHugh Dickins */ 3947b00dc3adSHugh Dickins options = strchr(options, ','); 3948b00dc3adSHugh Dickins if (options == NULL) 3949b00dc3adSHugh Dickins break; 3950b00dc3adSHugh Dickins options++; 3951b00dc3adSHugh Dickins if (!isdigit(*options)) { 3952b00dc3adSHugh Dickins options[-1] = '\0'; 3953b00dc3adSHugh Dickins break; 3954b00dc3adSHugh Dickins } 3955b00dc3adSHugh Dickins } 3956626c3920SAl Viro if (*this_char) { 3957626c3920SAl Viro char *value = strchr(this_char, '='); 3958f3235626SDavid Howells size_t len = 0; 3959626c3920SAl Viro int err; 3960626c3920SAl Viro 3961626c3920SAl Viro if (value) { 3962626c3920SAl Viro *value++ = '\0'; 3963f3235626SDavid Howells len = strlen(value); 39641da177e4SLinus Torvalds } 3965f3235626SDavid Howells err = vfs_parse_fs_string(fc, this_char, value, len); 3966f3235626SDavid Howells if (err < 0) 3967f3235626SDavid Howells return err; 39681da177e4SLinus Torvalds } 3969626c3920SAl Viro } 39701da177e4SLinus Torvalds return 0; 39711da177e4SLinus Torvalds } 39721da177e4SLinus Torvalds 3973f3235626SDavid Howells /* 3974f3235626SDavid Howells * Reconfigure a shmem filesystem. 3975f3235626SDavid Howells * 3976f3235626SDavid Howells * Note that we disallow change from limited->unlimited blocks/inodes while any 3977f3235626SDavid Howells * are in use; but we must separately disallow unlimited->limited, because in 3978f3235626SDavid Howells * that case we have no record of how much is already in use. 
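 * A remount is likewise refused if it would shrink limits below current
 * usage, switch back to 32-bit inode numbers after an inum above UINT_MAX
 * has been issued, toggle noswap, enable quotas, or change the global
 * quota limits.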
3979f3235626SDavid Howells */ 3980f3235626SDavid Howells static int shmem_reconfigure(struct fs_context *fc) 39811da177e4SLinus Torvalds { 3982f3235626SDavid Howells struct shmem_options *ctx = fc->fs_private; 3983f3235626SDavid Howells struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); 39840edd73b3SHugh Dickins unsigned long inodes; 3985bf11b9a8SSebastian Andrzej Siewior struct mempolicy *mpol = NULL; 3986f3235626SDavid Howells const char *err; 39870edd73b3SHugh Dickins 3988bf11b9a8SSebastian Andrzej Siewior raw_spin_lock(&sbinfo->stat_lock); 39890edd73b3SHugh Dickins inodes = sbinfo->max_inodes - sbinfo->free_inodes; 39900c98c8e1SZhaoLong Wang 3991f3235626SDavid Howells if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { 3992f3235626SDavid Howells if (!sbinfo->max_blocks) { 3993f3235626SDavid Howells err = "Cannot retroactively limit size"; 39940edd73b3SHugh Dickins goto out; 39950b5071ddSAl Viro } 3996f3235626SDavid Howells if (percpu_counter_compare(&sbinfo->used_blocks, 3997f3235626SDavid Howells ctx->blocks) > 0) { 3998f3235626SDavid Howells err = "Too small a size for current use"; 39990b5071ddSAl Viro goto out; 4000f3235626SDavid Howells } 4001f3235626SDavid Howells } 4002f3235626SDavid Howells if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { 4003f3235626SDavid Howells if (!sbinfo->max_inodes) { 4004f3235626SDavid Howells err = "Cannot retroactively limit inodes"; 40050b5071ddSAl Viro goto out; 40060b5071ddSAl Viro } 4007f3235626SDavid Howells if (ctx->inodes < inodes) { 4008f3235626SDavid Howells err = "Too few inodes for current use"; 4009f3235626SDavid Howells goto out; 4010f3235626SDavid Howells } 4011f3235626SDavid Howells } 40120edd73b3SHugh Dickins 4013ea3271f7SChris Down if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && 4014ea3271f7SChris Down sbinfo->next_ino > UINT_MAX) { 4015ea3271f7SChris Down err = "Current inum too high to switch to 32-bit inums"; 4016ea3271f7SChris Down goto out; 4017ea3271f7SChris Down } 40182c6efe9cSLuis Chamberlain if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { 40192c6efe9cSLuis Chamberlain err = "Cannot disable swap on remount"; 40202c6efe9cSLuis Chamberlain goto out; 40212c6efe9cSLuis Chamberlain } 40222c6efe9cSLuis Chamberlain if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { 40232c6efe9cSLuis Chamberlain err = "Cannot enable swap on remount if it was disabled on first mount"; 40242c6efe9cSLuis Chamberlain goto out; 40252c6efe9cSLuis Chamberlain } 4026ea3271f7SChris Down 4027e09764cfSCarlos Maiolino if (ctx->seen & SHMEM_SEEN_QUOTA && 4028e09764cfSCarlos Maiolino !sb_any_quota_loaded(fc->root->d_sb)) { 4029e09764cfSCarlos Maiolino err = "Cannot enable quota on remount"; 4030e09764cfSCarlos Maiolino goto out; 4031e09764cfSCarlos Maiolino } 4032e09764cfSCarlos Maiolino 4033de4c0e7cSLukas Czerner #ifdef CONFIG_TMPFS_QUOTA 4034de4c0e7cSLukas Czerner #define CHANGED_LIMIT(name) \ 4035de4c0e7cSLukas Czerner (ctx->qlimits.name## hardlimit && \ 4036de4c0e7cSLukas Czerner (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) 4037de4c0e7cSLukas Czerner 4038de4c0e7cSLukas Czerner if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || 4039de4c0e7cSLukas Czerner CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { 4040de4c0e7cSLukas Czerner err = "Cannot change global quota limit on remount"; 4041de4c0e7cSLukas Czerner goto out; 4042de4c0e7cSLukas Czerner } 4043de4c0e7cSLukas Czerner #endif /* CONFIG_TMPFS_QUOTA */ 4044de4c0e7cSLukas Czerner 4045f3235626SDavid 
Howells if (ctx->seen & SHMEM_SEEN_HUGE) 4046f3235626SDavid Howells sbinfo->huge = ctx->huge; 4047ea3271f7SChris Down if (ctx->seen & SHMEM_SEEN_INUMS) 4048ea3271f7SChris Down sbinfo->full_inums = ctx->full_inums; 4049f3235626SDavid Howells if (ctx->seen & SHMEM_SEEN_BLOCKS) 4050f3235626SDavid Howells sbinfo->max_blocks = ctx->blocks; 4051f3235626SDavid Howells if (ctx->seen & SHMEM_SEEN_INODES) { 4052f3235626SDavid Howells sbinfo->max_inodes = ctx->inodes; 4053f3235626SDavid Howells sbinfo->free_inodes = ctx->inodes - inodes; 40540b5071ddSAl Viro } 405571fe804bSLee Schermerhorn 40565f00110fSGreg Thelen /* 40575f00110fSGreg Thelen * Preserve previous mempolicy unless mpol remount option was specified. 40585f00110fSGreg Thelen */ 4059f3235626SDavid Howells if (ctx->mpol) { 4060bf11b9a8SSebastian Andrzej Siewior mpol = sbinfo->mpol; 4061f3235626SDavid Howells sbinfo->mpol = ctx->mpol; /* transfers initial ref */ 4062f3235626SDavid Howells ctx->mpol = NULL; 40635f00110fSGreg Thelen } 40642c6efe9cSLuis Chamberlain 40652c6efe9cSLuis Chamberlain if (ctx->noswap) 40662c6efe9cSLuis Chamberlain sbinfo->noswap = true; 40672c6efe9cSLuis Chamberlain 4068bf11b9a8SSebastian Andrzej Siewior raw_spin_unlock(&sbinfo->stat_lock); 4069bf11b9a8SSebastian Andrzej Siewior mpol_put(mpol); 4070f3235626SDavid Howells return 0; 40710edd73b3SHugh Dickins out: 4072bf11b9a8SSebastian Andrzej Siewior raw_spin_unlock(&sbinfo->stat_lock); 4073f35aa2bcSAl Viro return invalfc(fc, "%s", err); 40741da177e4SLinus Torvalds } 4075680d794bSakpm@linux-foundation.org 407634c80b1dSAl Viro static int shmem_show_options(struct seq_file *seq, struct dentry *root) 4077680d794bSakpm@linux-foundation.org { 407834c80b1dSAl Viro struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 4079283ebdeeSTu Jinjiang struct mempolicy *mpol; 4080680d794bSakpm@linux-foundation.org 4081680d794bSakpm@linux-foundation.org if (sbinfo->max_blocks != shmem_default_max_blocks()) 4082680d794bSakpm@linux-foundation.org seq_printf(seq, ",size=%luk", 408309cbfeafSKirill A. Shutemov sbinfo->max_blocks << (PAGE_SHIFT - 10)); 4084680d794bSakpm@linux-foundation.org if (sbinfo->max_inodes != shmem_default_max_inodes()) 4085680d794bSakpm@linux-foundation.org seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 40860825a6f9SJoe Perches if (sbinfo->mode != (0777 | S_ISVTX)) 408709208d15SAl Viro seq_printf(seq, ",mode=%03ho", sbinfo->mode); 40888751e039SEric W. Biederman if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 40898751e039SEric W. Biederman seq_printf(seq, ",uid=%u", 40908751e039SEric W. Biederman from_kuid_munged(&init_user_ns, sbinfo->uid)); 40918751e039SEric W. Biederman if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 40928751e039SEric W. Biederman seq_printf(seq, ",gid=%u", 40938751e039SEric W. Biederman from_kgid_munged(&init_user_ns, sbinfo->gid)); 4094ea3271f7SChris Down 4095ea3271f7SChris Down /* 4096ea3271f7SChris Down * Showing inode{64,32} might be useful even if it's the system default, 4097ea3271f7SChris Down * since then people don't have to resort to checking both here and 4098ea3271f7SChris Down * /proc/config.gz to confirm 64-bit inums were successfully applied 4099ea3271f7SChris Down * (which may not even exist if IKCONFIG_PROC isn't enabled). 4100ea3271f7SChris Down * 4101ea3271f7SChris Down * We hide it when inode64 isn't the default and we are using 32-bit 4102ea3271f7SChris Down * inodes, since that probably just means the feature isn't even under 4103ea3271f7SChris Down * consideration. 
4104ea3271f7SChris Down * 4105ea3271f7SChris Down * As such: 4106ea3271f7SChris Down * 4107ea3271f7SChris Down * +-----------------+-----------------+ 4108ea3271f7SChris Down * | TMPFS_INODE64=y | TMPFS_INODE64=n | 4109ea3271f7SChris Down * +------------------+-----------------+-----------------+ 4110ea3271f7SChris Down * | full_inums=true | show | show | 4111ea3271f7SChris Down * | full_inums=false | show | hide | 4112ea3271f7SChris Down * +------------------+-----------------+-----------------+ 4113ea3271f7SChris Down * 4114ea3271f7SChris Down */ 4115ea3271f7SChris Down if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) 4116ea3271f7SChris Down seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); 4117396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_TRANSPARENT_HUGEPAGE 41185a6e75f8SKirill A. Shutemov /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ 41195a6e75f8SKirill A. Shutemov if (sbinfo->huge) 41205a6e75f8SKirill A. Shutemov seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); 41215a6e75f8SKirill A. Shutemov #endif 4122283ebdeeSTu Jinjiang mpol = shmem_get_sbmpol(sbinfo); 4123283ebdeeSTu Jinjiang shmem_show_mpol(seq, mpol); 4124283ebdeeSTu Jinjiang mpol_put(mpol); 41252c6efe9cSLuis Chamberlain if (sbinfo->noswap) 41262c6efe9cSLuis Chamberlain seq_printf(seq, ",noswap"); 4127680d794bSakpm@linux-foundation.org return 0; 4128680d794bSakpm@linux-foundation.org } 41299183df25SDavid Herrmann 4130680d794bSakpm@linux-foundation.org #endif /* CONFIG_TMPFS */ 41311da177e4SLinus Torvalds 41321da177e4SLinus Torvalds static void shmem_put_super(struct super_block *sb) 41331da177e4SLinus Torvalds { 4134602586a8SHugh Dickins struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 4135602586a8SHugh Dickins 4136e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 4137e09764cfSCarlos Maiolino shmem_disable_quotas(sb); 4138e09764cfSCarlos Maiolino #endif 4139e809d5f0SChris Down free_percpu(sbinfo->ino_batch); 4140602586a8SHugh Dickins percpu_counter_destroy(&sbinfo->used_blocks); 414149cd0a5cSGreg Thelen mpol_put(sbinfo->mpol); 4142602586a8SHugh Dickins kfree(sbinfo); 41431da177e4SLinus Torvalds sb->s_fs_info = NULL; 41441da177e4SLinus Torvalds } 41451da177e4SLinus Torvalds 4146f3235626SDavid Howells static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) 41471da177e4SLinus Torvalds { 4148f3235626SDavid Howells struct shmem_options *ctx = fc->fs_private; 41491da177e4SLinus Torvalds struct inode *inode; 41500edd73b3SHugh Dickins struct shmem_sb_info *sbinfo; 415171480663SCarlos Maiolino int error = -ENOMEM; 4152680d794bSakpm@linux-foundation.org 4153680d794bSakpm@linux-foundation.org /* Round up to L1_CACHE_BYTES to resist false sharing */ 4154425fbf04SPekka Enberg sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 4155680d794bSakpm@linux-foundation.org L1_CACHE_BYTES), GFP_KERNEL); 4156680d794bSakpm@linux-foundation.org if (!sbinfo) 415771480663SCarlos Maiolino return error; 4158680d794bSakpm@linux-foundation.org 4159680d794bSakpm@linux-foundation.org sb->s_fs_info = sbinfo; 41601da177e4SLinus Torvalds 41610edd73b3SHugh Dickins #ifdef CONFIG_TMPFS 41621da177e4SLinus Torvalds /* 41631da177e4SLinus Torvalds * Per default we only allow half of the physical ram per 41641da177e4SLinus Torvalds * tmpfs instance, limiting inodes to one per page of lowmem; 41651da177e4SLinus Torvalds * but the internal instance is left unlimited. 
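 * The SHMEM_SEEN_* bits recorded by shmem_parse_one() tell us which of
 * these defaults were overridden by explicit mount options.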
41661da177e4SLinus Torvalds */ 41671751e8a6SLinus Torvalds if (!(sb->s_flags & SB_KERNMOUNT)) { 4168f3235626SDavid Howells if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) 4169f3235626SDavid Howells ctx->blocks = shmem_default_max_blocks(); 4170f3235626SDavid Howells if (!(ctx->seen & SHMEM_SEEN_INODES)) 4171f3235626SDavid Howells ctx->inodes = shmem_default_max_inodes(); 4172ea3271f7SChris Down if (!(ctx->seen & SHMEM_SEEN_INUMS)) 4173ea3271f7SChris Down ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); 41742c6efe9cSLuis Chamberlain sbinfo->noswap = ctx->noswap; 4175ca4e0519SAl Viro } else { 41761751e8a6SLinus Torvalds sb->s_flags |= SB_NOUSER; 41771da177e4SLinus Torvalds } 417891828a40SDavid M. Grimes sb->s_export_op = &shmem_export_ops; 417936f05cabSJeff Layton sb->s_flags |= SB_NOSEC | SB_I_VERSION; 41800edd73b3SHugh Dickins #else 41811751e8a6SLinus Torvalds sb->s_flags |= SB_NOUSER; 41820edd73b3SHugh Dickins #endif 4183f3235626SDavid Howells sbinfo->max_blocks = ctx->blocks; 4184f3235626SDavid Howells sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; 4185e809d5f0SChris Down if (sb->s_flags & SB_KERNMOUNT) { 4186e809d5f0SChris Down sbinfo->ino_batch = alloc_percpu(ino_t); 4187e809d5f0SChris Down if (!sbinfo->ino_batch) 4188e809d5f0SChris Down goto failed; 4189e809d5f0SChris Down } 4190f3235626SDavid Howells sbinfo->uid = ctx->uid; 4191f3235626SDavid Howells sbinfo->gid = ctx->gid; 4192ea3271f7SChris Down sbinfo->full_inums = ctx->full_inums; 4193f3235626SDavid Howells sbinfo->mode = ctx->mode; 4194f3235626SDavid Howells sbinfo->huge = ctx->huge; 4195f3235626SDavid Howells sbinfo->mpol = ctx->mpol; 4196f3235626SDavid Howells ctx->mpol = NULL; 41971da177e4SLinus Torvalds 4198bf11b9a8SSebastian Andrzej Siewior raw_spin_lock_init(&sbinfo->stat_lock); 4199908c7f19STejun Heo if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 4200602586a8SHugh Dickins goto failed; 4201779750d2SKirill A. Shutemov spin_lock_init(&sbinfo->shrinklist_lock); 4202779750d2SKirill A. Shutemov INIT_LIST_HEAD(&sbinfo->shrinklist); 42031da177e4SLinus Torvalds 4204285b2c4fSHugh Dickins sb->s_maxbytes = MAX_LFS_FILESIZE; 420509cbfeafSKirill A. Shutemov sb->s_blocksize = PAGE_SIZE; 420609cbfeafSKirill A. Shutemov sb->s_blocksize_bits = PAGE_SHIFT; 42071da177e4SLinus Torvalds sb->s_magic = TMPFS_MAGIC; 42081da177e4SLinus Torvalds sb->s_op = &shmem_ops; 4209cfd95a9cSRobin H. 
Johnson sb->s_time_gran = 1; 4210b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 421139f0247dSAndreas Gruenbacher sb->s_xattr = shmem_xattr_handlers; 4212b09e0fa4SEric Paris #endif 4213b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_POSIX_ACL 42141751e8a6SLinus Torvalds sb->s_flags |= SB_POSIXACL; 421539f0247dSAndreas Gruenbacher #endif 42162b4db796SAmir Goldstein uuid_gen(&sb->s_uuid); 42170edd73b3SHugh Dickins 4218e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 4219e09764cfSCarlos Maiolino if (ctx->seen & SHMEM_SEEN_QUOTA) { 4220e09764cfSCarlos Maiolino sb->dq_op = &shmem_quota_operations; 4221e09764cfSCarlos Maiolino sb->s_qcop = &dquot_quotactl_sysfile_ops; 4222e09764cfSCarlos Maiolino sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 4223e09764cfSCarlos Maiolino 4224de4c0e7cSLukas Czerner /* Copy the default limits from ctx into sbinfo */ 4225de4c0e7cSLukas Czerner memcpy(&sbinfo->qlimits, &ctx->qlimits, 4226de4c0e7cSLukas Czerner sizeof(struct shmem_quota_limits)); 4227de4c0e7cSLukas Czerner 4228e09764cfSCarlos Maiolino if (shmem_enable_quotas(sb, ctx->quota_types)) 4229e09764cfSCarlos Maiolino goto failed; 4230e09764cfSCarlos Maiolino } 4231e09764cfSCarlos Maiolino #endif /* CONFIG_TMPFS_QUOTA */ 4232e09764cfSCarlos Maiolino 42337a80e5b8SGiuseppe Scrivano inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, 42347a80e5b8SGiuseppe Scrivano VM_NORESERVE); 423571480663SCarlos Maiolino if (IS_ERR(inode)) { 423671480663SCarlos Maiolino error = PTR_ERR(inode); 42371da177e4SLinus Torvalds goto failed; 423871480663SCarlos Maiolino } 4239680d794bSakpm@linux-foundation.org inode->i_uid = sbinfo->uid; 4240680d794bSakpm@linux-foundation.org inode->i_gid = sbinfo->gid; 4241318ceed0SAl Viro sb->s_root = d_make_root(inode); 4242318ceed0SAl Viro if (!sb->s_root) 424348fde701SAl Viro goto failed; 42441da177e4SLinus Torvalds return 0; 42451da177e4SLinus Torvalds 42461da177e4SLinus Torvalds failed: 42471da177e4SLinus Torvalds shmem_put_super(sb); 424871480663SCarlos Maiolino return error; 42491da177e4SLinus Torvalds } 42501da177e4SLinus Torvalds 4251f3235626SDavid Howells static int shmem_get_tree(struct fs_context *fc) 4252f3235626SDavid Howells { 4253f3235626SDavid Howells return get_tree_nodev(fc, shmem_fill_super); 4254f3235626SDavid Howells } 4255f3235626SDavid Howells 4256f3235626SDavid Howells static void shmem_free_fc(struct fs_context *fc) 4257f3235626SDavid Howells { 4258f3235626SDavid Howells struct shmem_options *ctx = fc->fs_private; 4259f3235626SDavid Howells 4260f3235626SDavid Howells if (ctx) { 4261f3235626SDavid Howells mpol_put(ctx->mpol); 4262f3235626SDavid Howells kfree(ctx); 4263f3235626SDavid Howells } 4264f3235626SDavid Howells } 4265f3235626SDavid Howells 4266f3235626SDavid Howells static const struct fs_context_operations shmem_fs_context_ops = { 4267f3235626SDavid Howells .free = shmem_free_fc, 4268f3235626SDavid Howells .get_tree = shmem_get_tree, 4269f3235626SDavid Howells #ifdef CONFIG_TMPFS 4270f3235626SDavid Howells .parse_monolithic = shmem_parse_options, 4271f3235626SDavid Howells .parse_param = shmem_parse_one, 4272f3235626SDavid Howells .reconfigure = shmem_reconfigure, 4273f3235626SDavid Howells #endif 4274f3235626SDavid Howells }; 4275f3235626SDavid Howells 4276fcc234f8SPekka Enberg static struct kmem_cache *shmem_inode_cachep; 42771da177e4SLinus Torvalds 42781da177e4SLinus Torvalds static struct inode *shmem_alloc_inode(struct super_block *sb) 42791da177e4SLinus Torvalds { 428041ffe5d5SHugh Dickins struct shmem_inode_info *info; 
4281fd60b288SMuchun Song info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); 428241ffe5d5SHugh Dickins if (!info) 42831da177e4SLinus Torvalds return NULL; 428441ffe5d5SHugh Dickins return &info->vfs_inode; 42851da177e4SLinus Torvalds } 42861da177e4SLinus Torvalds 428774b1da56SAl Viro static void shmem_free_in_core_inode(struct inode *inode) 4288fa0d7e3dSNick Piggin { 428984e710daSAl Viro if (S_ISLNK(inode->i_mode)) 42903ed47db3SAl Viro kfree(inode->i_link); 4291fa0d7e3dSNick Piggin kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 4292fa0d7e3dSNick Piggin } 4293fa0d7e3dSNick Piggin 42941da177e4SLinus Torvalds static void shmem_destroy_inode(struct inode *inode) 42951da177e4SLinus Torvalds { 429609208d15SAl Viro if (S_ISREG(inode->i_mode)) 42971da177e4SLinus Torvalds mpol_free_shared_policy(&SHMEM_I(inode)->policy); 42981da177e4SLinus Torvalds } 42991da177e4SLinus Torvalds 430041ffe5d5SHugh Dickins static void shmem_init_inode(void *foo) 43011da177e4SLinus Torvalds { 430241ffe5d5SHugh Dickins struct shmem_inode_info *info = foo; 430341ffe5d5SHugh Dickins inode_init_once(&info->vfs_inode); 43041da177e4SLinus Torvalds } 43051da177e4SLinus Torvalds 43069a8ec03eSweiping zhang static void shmem_init_inodecache(void) 43071da177e4SLinus Torvalds { 43081da177e4SLinus Torvalds shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 43091da177e4SLinus Torvalds sizeof(struct shmem_inode_info), 43105d097056SVladimir Davydov 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 43111da177e4SLinus Torvalds } 43121da177e4SLinus Torvalds 431341ffe5d5SHugh Dickins static void shmem_destroy_inodecache(void) 43141da177e4SLinus Torvalds { 43151a1d92c1SAlexey Dobriyan kmem_cache_destroy(shmem_inode_cachep); 43161da177e4SLinus Torvalds } 43171da177e4SLinus Torvalds 4318a7605426SYang Shi /* Keep the page in page cache instead of truncating it */ 4319a7605426SYang Shi static int shmem_error_remove_page(struct address_space *mapping, 4320a7605426SYang Shi struct page *page) 4321a7605426SYang Shi { 4322a7605426SYang Shi return 0; 4323a7605426SYang Shi } 4324a7605426SYang Shi 432530e6a51dSHui Su const struct address_space_operations shmem_aops = { 43261da177e4SLinus Torvalds .writepage = shmem_writepage, 432746de8b97SMatthew Wilcox (Oracle) .dirty_folio = noop_dirty_folio, 43281da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 4329800d15a5SNick Piggin .write_begin = shmem_write_begin, 4330800d15a5SNick Piggin .write_end = shmem_write_end, 43311da177e4SLinus Torvalds #endif 43321c93923cSAndrew Morton #ifdef CONFIG_MIGRATION 433354184650SMatthew Wilcox (Oracle) .migrate_folio = migrate_folio, 43341c93923cSAndrew Morton #endif 4335a7605426SYang Shi .error_remove_page = shmem_error_remove_page, 43361da177e4SLinus Torvalds }; 433730e6a51dSHui Su EXPORT_SYMBOL(shmem_aops); 43381da177e4SLinus Torvalds 433915ad7cdcSHelge Deller static const struct file_operations shmem_file_operations = { 43401da177e4SLinus Torvalds .mmap = shmem_mmap, 4341a5454f95SThomas Weißschuh .open = generic_file_open, 4342c01d5b30SHugh Dickins .get_unmapped_area = shmem_get_unmapped_area, 43431da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 4344220f2ac9SHugh Dickins .llseek = shmem_file_llseek, 43452ba5bbedSAl Viro .read_iter = shmem_file_read_iter, 43468174202bSAl Viro .write_iter = generic_file_write_iter, 43471b061d92SChristoph Hellwig .fsync = noop_fsync, 4348bd194b18SDavid Howells .splice_read = shmem_file_splice_read, 4349f6cb85d0SAl Viro .splice_write = iter_file_splice_write, 435083e4fa9cSHugh Dickins .fallocate = shmem_fallocate, 
43511da177e4SLinus Torvalds #endif 43521da177e4SLinus Torvalds }; 43531da177e4SLinus Torvalds 435492e1d5beSArjan van de Ven static const struct inode_operations shmem_inode_operations = { 435544a30220SYu Zhao .getattr = shmem_getattr, 435694c1e62dSHugh Dickins .setattr = shmem_setattr, 4357b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 4358b09e0fa4SEric Paris .listxattr = shmem_listxattr, 4359feda821eSChristoph Hellwig .set_acl = simple_set_acl, 4360e408e695STheodore Ts'o .fileattr_get = shmem_fileattr_get, 4361e408e695STheodore Ts'o .fileattr_set = shmem_fileattr_set, 4362b09e0fa4SEric Paris #endif 43631da177e4SLinus Torvalds }; 43641da177e4SLinus Torvalds 436592e1d5beSArjan van de Ven static const struct inode_operations shmem_dir_inode_operations = { 43661da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 4367f7cd16a5SXavier Roche .getattr = shmem_getattr, 43681da177e4SLinus Torvalds .create = shmem_create, 43691da177e4SLinus Torvalds .lookup = simple_lookup, 43701da177e4SLinus Torvalds .link = shmem_link, 43711da177e4SLinus Torvalds .unlink = shmem_unlink, 43721da177e4SLinus Torvalds .symlink = shmem_symlink, 43731da177e4SLinus Torvalds .mkdir = shmem_mkdir, 43741da177e4SLinus Torvalds .rmdir = shmem_rmdir, 43751da177e4SLinus Torvalds .mknod = shmem_mknod, 43762773bf00SMiklos Szeredi .rename = shmem_rename2, 437760545d0dSAl Viro .tmpfile = shmem_tmpfile, 43781da177e4SLinus Torvalds #endif 4379b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 4380b09e0fa4SEric Paris .listxattr = shmem_listxattr, 4381e408e695STheodore Ts'o .fileattr_get = shmem_fileattr_get, 4382e408e695STheodore Ts'o .fileattr_set = shmem_fileattr_set, 4383b09e0fa4SEric Paris #endif 438439f0247dSAndreas Gruenbacher #ifdef CONFIG_TMPFS_POSIX_ACL 438594c1e62dSHugh Dickins .setattr = shmem_setattr, 4386feda821eSChristoph Hellwig .set_acl = simple_set_acl, 438739f0247dSAndreas Gruenbacher #endif 438839f0247dSAndreas Gruenbacher }; 438939f0247dSAndreas Gruenbacher 439092e1d5beSArjan van de Ven static const struct inode_operations shmem_special_inode_operations = { 4391f7cd16a5SXavier Roche .getattr = shmem_getattr, 4392b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 4393b09e0fa4SEric Paris .listxattr = shmem_listxattr, 4394b09e0fa4SEric Paris #endif 439539f0247dSAndreas Gruenbacher #ifdef CONFIG_TMPFS_POSIX_ACL 439694c1e62dSHugh Dickins .setattr = shmem_setattr, 4397feda821eSChristoph Hellwig .set_acl = simple_set_acl, 439839f0247dSAndreas Gruenbacher #endif 43991da177e4SLinus Torvalds }; 44001da177e4SLinus Torvalds 4401759b9775SHugh Dickins static const struct super_operations shmem_ops = { 44021da177e4SLinus Torvalds .alloc_inode = shmem_alloc_inode, 440374b1da56SAl Viro .free_inode = shmem_free_in_core_inode, 44041da177e4SLinus Torvalds .destroy_inode = shmem_destroy_inode, 44051da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 44061da177e4SLinus Torvalds .statfs = shmem_statfs, 4407680d794bSakpm@linux-foundation.org .show_options = shmem_show_options, 44081da177e4SLinus Torvalds #endif 4409e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 4410e09764cfSCarlos Maiolino .get_dquots = shmem_get_dquots, 4411e09764cfSCarlos Maiolino #endif 44121f895f75SAl Viro .evict_inode = shmem_evict_inode, 44131da177e4SLinus Torvalds .drop_inode = generic_delete_inode, 44141da177e4SLinus Torvalds .put_super = shmem_put_super, 4415396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4416779750d2SKirill A. Shutemov .nr_cached_objects = shmem_unused_huge_count, 4417779750d2SKirill A. 
	.free_cached_objects = shmem_unused_huge_scan,
#endif
};

static const struct vm_operations_struct shmem_vm_ops = {
	.fault = shmem_fault,
	.map_pages = filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy = shmem_set_policy,
	.get_policy = shmem_get_policy,
#endif
};

static const struct vm_operations_struct shmem_anon_vm_ops = {
	.fault = shmem_fault,
	.map_pages = filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy = shmem_set_policy,
	.get_policy = shmem_get_policy,
#endif
};

int shmem_init_fs_context(struct fs_context *fc)
{
	struct shmem_options *ctx;

	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->mode = 0777 | S_ISVTX;
	ctx->uid = current_fsuid();
	ctx->gid = current_fsgid();

	fc->fs_private = ctx;
	fc->ops = &shmem_fs_context_ops;
	return 0;
}

static struct file_system_type shmem_fs_type = {
	.owner = THIS_MODULE,
	.name = "tmpfs",
	.init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
	.parameters = shmem_fs_parameters,
#endif
	.kill_sb = kill_litter_super,
#ifdef CONFIG_SHMEM
	.fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags = FS_USERNS_MOUNT,
#endif
};

void __init shmem_init(void)
{
	int error;

	shmem_init_inodecache();

#ifdef CONFIG_TMPFS_QUOTA
	error = register_quota_format(&shmem_quota_format);
	if (error < 0) {
		pr_err("Could not register quota format\n");
		goto out3;
	}
#endif

	error = register_filesystem(&shmem_fs_type);
	if (error) {
		pr_err("Could not register tmpfs\n");
		goto out2;
	}

	shm_mnt = kern_mount(&shmem_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		pr_err("Could not kern_mount tmpfs\n");
		goto out1;
	}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
#endif
	return;

out1:
	unregister_filesystem(&shmem_fs_type);
out2:
#ifdef CONFIG_TMPFS_QUOTA
	unregister_quota_format(&shmem_quota_format);
out3:
#endif
	shmem_destroy_inodecache();
	shm_mnt = ERR_PTR(error);
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	static const int values[] = {
		SHMEM_HUGE_ALWAYS,
		SHMEM_HUGE_WITHIN_SIZE,
		SHMEM_HUGE_ADVISE,
		SHMEM_HUGE_NEVER,
		SHMEM_HUGE_DENY,
		SHMEM_HUGE_FORCE,
	};
	int len = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(values); i++) {
		len += sysfs_emit_at(buf, len,
				     shmem_huge == values[i] ? "%s[%s]" : "%s%s",
				     i ? " " : "",
				     shmem_format_huge(values[i]));
	}

	len += sysfs_emit_at(buf, len, "\n");

	return len;
}

static ssize_t shmem_enabled_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	char tmp[16];
	int huge;

	if (count + 1 > sizeof(tmp))
		return -EINVAL;
	memcpy(tmp, buf, count);
	tmp[count] = '\0';
	if (count && tmp[count - 1] == '\n')
		tmp[count - 1] = '\0';

	huge = shmem_parse_huge(tmp);
	if (huge == -EINVAL)
		return -EINVAL;
	if (!has_transparent_hugepage() &&
			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
		return -EINVAL;

	shmem_huge = huge;
	if (shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	return count;
}

struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
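
/*
 * Usage sketch for the knob defined above (the shell lines are an
 * illustration, not taken from this file): shmem_enabled_attr is exposed
 * as /sys/kernel/mm/transparent_hugepage/shmem_enabled, so an
 * administrator can inspect and change the default huge page policy for
 * tmpfs mounts at runtime, e.g.:
 *
 *	cat /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *	always within_size advise [never] deny force
 *
 *	echo within_size > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *
 * Writes go through shmem_enabled_store(): the keyword is parsed by
 * shmem_parse_huge(), and anything other than "never" or "deny" is
 * rejected when the kernel has no transparent hugepage support.
 */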

#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * their complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

static struct file_system_type shmem_fs_type = {
	.name = "tmpfs",
	.init_fs_context = ramfs_init_fs_context,
	.parameters = ramfs_fs_parameters,
	.kill_sb = ramfs_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

void __init shmem_init(void)
{
	BUG_ON(register_filesystem(&shmem_fs_type) != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));
}

int shmem_unuse(unsigned int type)
{
	return 0;
}

int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}

void shmem_unlock_mapping(struct address_space *mapping)
{
}

#ifdef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

#define shmem_vm_ops			generic_file_vm_ops
#define shmem_anon_vm_ops		generic_file_vm_ops
#define shmem_file_operations		ramfs_file_operations
#define shmem_acct_size(flags, size)	0
#define shmem_unacct_size(flags, size)	do {} while (0)

static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
				struct super_block *sb, struct inode *dir,
				umode_t mode, dev_t dev, unsigned long flags)
{
	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
	return inode ? inode : ERR_PTR(-ENOSPC);
}

#endif /* CONFIG_SHMEM */

/* common code */

static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
				       unsigned long flags, unsigned int i_flags)
{
	struct inode *inode;
	struct file *res;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	if (is_idmapped_mnt(mnt))
		return ERR_PTR(-EINVAL);

	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
				S_IFREG | S_IRWXUGO, 0, flags);

	if (IS_ERR(inode)) {
		shmem_unacct_size(flags, size);
		return ERR_CAST(inode);
	}
	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (!IS_ERR(res))
		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				&shmem_file_operations);
	if (IS_ERR(res))
		iput(inode);
	return res;
}

/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 * kernel internal. There will be NO LSM permission checks against the
 * underlying inode. So users of this interface must do LSM checks at a
 * higher layer. The users are the big_key and shm implementations. LSM
 * checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}

/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);

/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags)
{
	return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
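
/*
 * A minimal usage sketch (hypothetical caller and sizes, not taken from
 * this file): a kernel-internal user, in the spirit of big_key or sysv
 * shm, can back a buffer with an unlinked tmpfs file and release it again
 * with fput().
 */
#if 0	/* illustrative only */
static int shmem_backing_example(void)
{
	struct file *file;

	/* One unlinked, kernel-private (S_PRIVATE) tmpfs file of 1 MiB. */
	file = shmem_kernel_file_setup("example", 1024 * 1024, 0);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/*
	 * The file can now be used with kernel_read()/kernel_write() or
	 * mapped; since the inode was created unlinked, dropping the last
	 * reference frees its pages.
	 */
	fput(file);
	return 0;
}
#endif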

/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
	struct file *file;
	loff_t size = vma->vm_end - vma->vm_start;

	/*
	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
	 * between XFS directory reading and selinux: since this file is only
	 * accessible to the user through its mapping, use S_PRIVATE flag to
	 * bypass file security, in the same way as shmem_kernel_file_setup().
	 */
	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = file;
	vma->vm_ops = &shmem_anon_vm_ops;

	return 0;
}
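
/*
 * Usage sketch (illustrative): shmem_zero_setup() is what gives a shared
 * anonymous mapping its tmpfs object.  From userspace the whole path is
 * simply:
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 *
 * do_mmap() prepares the vma and, because it is shared and has no file,
 * calls shmem_zero_setup() to attach the unlinked "dev/zero" tmpfs file
 * and the shmem_anon_vm_ops shown above.
 */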

/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the folio's address_space
 * @index: the folio index
 * @gfp: the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->read_folio() method: which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
				   pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
	struct inode *inode = mapping->host;
	struct folio *folio;
	int error;

	BUG_ON(!shmem_mapping(mapping));
	error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
				    gfp, NULL, NULL, NULL);
	if (error)
		return ERR_PTR(error);

	folio_unlock(folio);
	return folio;
#else
	/*
	 * The tiny !SHMEM case uses ramfs without swap
	 */
	return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);

struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
					 pgoff_t index, gfp_t gfp)
{
	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
	struct page *page;

	if (IS_ERR(folio))
		return &folio->page;

	page = folio_file_page(folio, index);
	if (PageHWPoison(page)) {
		folio_put(folio);
		return ERR_PTR(-EIO);
	}

	return page;
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
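
/*
 * Usage sketch (hypothetical caller): a driver holding a shmem-backed
 * mapping, in the spirit of the i915/ttm users mentioned above, can pull
 * in pages one at a time while tolerating allocation failure gracefully.
 */
#if 0	/* illustrative only */
static struct page *shmem_pin_page_example(struct address_space *mapping,
					   pgoff_t index)
{
	/* Avoid OOM-killing the machine just to populate one object page. */
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	/* Returns the uptodate page, restoring it from swap if necessary. */
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}
#endif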