/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
#include "swap.h"

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>

#include <linux/uaccess.h>

#include "internal.h"

#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	struct mempolicy *mpol;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;
	int seen;
	bool noswap;
	unsigned short quota_types;
	struct shmem_quota_limits qlimits;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
#define SHMEM_SEEN_QUOTA 32
};

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages() / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	unsigned long nr_pages = totalram_pages();

	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
}
#endif

static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			      struct folio **foliop, enum sgp_type sgp,
			      gfp_t gfp, struct vm_area_struct *vma,
			      vm_fault_t *fault_type);

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static int shmem_inode_acct_block(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int err = -ENOSPC;

	if (shmem_acct_block(info->flags, pages))
		return err;

	might_sleep();	/* when quotas */
	if (sbinfo->max_blocks) {
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   sbinfo->max_blocks - pages) > 0)
			goto unacct;

		err = dquot_alloc_block_nodirty(inode, pages);
		if (err)
			goto unacct;

		percpu_counter_add(&sbinfo->used_blocks, pages);
	} else {
		err = dquot_alloc_block_nodirty(inode, pages);
		if (err)
			goto unacct;
	}

	return 0;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return err;
}

static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	might_sleep();	/* when quotas */
	dquot_free_block_nodirty(inode, pages);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}

static const struct super_operations shmem_ops;
const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;

bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_anon_vm_ops;
}

bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

#ifdef CONFIG_TMPFS_QUOTA

static int shmem_enable_quotas(struct super_block *sb,
			       unsigned short quota_types)
{
	int type, err = 0;

	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
	for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
		if (!(quota_types & (1 << type)))
			continue;
		err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
					  DQUOT_USAGE_ENABLED |
					  DQUOT_LIMITS_ENABLED);
		if (err)
			goto out_err;
	}
	return 0;

out_err:
	pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
		type, err);
	for (type--; type >= 0; type--)
		dquot_quota_off(sb, type);
	return err;
}

static void shmem_disable_quotas(struct super_block *sb)
{
	int type;

	for (type = 0; type < SHMEM_MAXQUOTAS; type++)
		dquot_quota_off(sb, type);
}

static struct dquot **shmem_get_dquots(struct inode *inode)
{
	return SHMEM_I(inode)->i_dquot;
}
#endif /* CONFIG_TMPFS_QUOTA */

/*
 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 * produces a novel ino for the newly allocated inode.
 *
 * It may also be called when making a hard link to permit the space needed by
 * each dentry. However, in that case, no new inode number is needed since that
 * internally draws from another pool of inode numbers (currently global
 * get_next_ino()). This case is indicated by passing NULL as inop.
 */
#define SHMEM_INO_BATCH 1024
static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	ino_t ino;

	if (!(sb->s_flags & SB_KERNMOUNT)) {
		raw_spin_lock(&sbinfo->stat_lock);
		if (sbinfo->max_inodes) {
			if (!sbinfo->free_inodes) {
				raw_spin_unlock(&sbinfo->stat_lock);
				return -ENOSPC;
			}
			sbinfo->free_inodes--;
		}
		if (inop) {
			ino = sbinfo->next_ino++;
			if (unlikely(is_zero_ino(ino)))
				ino = sbinfo->next_ino++;
			if (unlikely(!sbinfo->full_inums &&
				     ino > UINT_MAX)) {
				/*
				 * Emulate get_next_ino uint wraparound for
				 * compatibility
				 */
				if (IS_ENABLED(CONFIG_64BIT))
					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
						__func__, MINOR(sb->s_dev));
				sbinfo->next_ino = 1;
				ino = sbinfo->next_ino++;
			}
			*inop = ino;
		}
		raw_spin_unlock(&sbinfo->stat_lock);
	} else if (inop) {
		/*
		 * __shmem_file_setup, one of our callers, is lock-free: it
		 * doesn't hold stat_lock in shmem_reserve_inode since
		 * max_inodes is always 0, and is called from potentially
		 * unknown contexts. As such, use a per-cpu batched allocator
		 * which doesn't require the per-sb stat_lock unless we are at
		 * the batch boundary.
		 *
		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
		 * shmem mounts are not exposed to userspace, so we don't need
		 * to worry about things like glibc compatibility.
		 */
		ino_t *next_ino;

		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
		ino = *next_ino;
		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
			raw_spin_lock(&sbinfo->stat_lock);
			ino = sbinfo->next_ino;
			sbinfo->next_ino += SHMEM_INO_BATCH;
			raw_spin_unlock(&sbinfo->stat_lock);
			if (unlikely(is_zero_ino(ino)))
				ino++;
		}
		*inop = ino;
		*next_ino = ++ino;
		put_cpu();
	}

	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		raw_spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		raw_spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 * @alloced: the change in number of pages allocated to inode
 * @swapped: the change in number of pages swapped from inode
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 */
static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	spin_lock(&info->lock);
	info->alloced += alloced;
	info->swapped += swapped;
	freed = info->alloced - info->swapped -
		READ_ONCE(inode->i_mapping->nrpages);
	/*
	 * Special case: whereas normally shmem_recalc_inode() is called
	 * after i_mapping->nrpages has already been adjusted (up or down),
	 * shmem_writepage() has to raise swapped before nrpages is lowered -
	 * to stop a racing shmem_recalc_inode() from thinking that a page has
	 * been freed.  Compensate here, to avoid the need for a followup call.
	 */
	if (swapped > 0)
		freed += swapped;
	if (freed > 0)
		info->alloced -= freed;
	spin_unlock(&info->lock);

	/* The quota case may block */
	if (freed > 0)
		shmem_inode_unacct_blocks(inode, freed);
}

bool shmem_charge(struct inode *inode, long pages)
{
	struct address_space *mapping = inode->i_mapping;

	if (shmem_inode_acct_block(inode, pages))
		return false;

	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
	xa_lock_irq(&mapping->i_pages);
	mapping->nrpages += pages;
	xa_unlock_irq(&mapping->i_pages);

	shmem_recalc_inode(inode, pages, 0);
	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	/* pages argument is currently unused: keep it to help debugging */
	/* nrpages adjustment done by __filemap_remove_folio() or caller */

	shmem_recalc_inode(inode, 0, 0);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	xas_store(&xas, replacement);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;

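/*
 * A minimal usage sketch for the settings above (the mount point /mnt/tmp
 * is only an example): the per-mount policy consulted by shmem_is_huge()
 * below is chosen with the tmpfs "huge=" mount option, while the two
 * special values can only be set through the sysfs knob.
 *
 *	mount -t tmpfs -o size=1G,huge=within_size tmpfs /mnt/tmp
 *	mount -o remount,huge=advise /mnt/tmp
 *
 *	# emergency / testing overrides for shm_mnt and all mounts
 *	echo deny  > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *	echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 */
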
bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
		   struct mm_struct *mm, unsigned long vm_flags)
{
	loff_t i_size;

	if (!S_ISREG(inode->i_mode))
		return false;
	if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
		return false;
	if (shmem_huge == SHMEM_HUGE_DENY)
		return false;
	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
		return true;

	switch (SHMEM_SB(inode->i_sb)->huge) {
	case SHMEM_HUGE_ALWAYS:
		return true;
	case SHMEM_HUGE_WITHIN_SIZE:
		index = round_up(index + 1, HPAGE_PMD_NR);
		i_size = round_up(i_size_read(inode), PAGE_SIZE);
		if (i_size >> PAGE_SHIFT >= index)
			return true;
		fallthrough;
	case SHMEM_HUGE_ADVISE:
		if (mm && (vm_flags & VM_HUGEPAGE))
			return true;
		fallthrough;
	default:
		return false;
	}
}

#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}
#endif

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct folio *folio;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		sbinfo->shrinklist_len--;
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;
		pgoff_t index;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_split && split >= nr_to_split)
			goto move_back;

		index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
		folio = filemap_get_folio(inode->i_mapping, index);
		if (IS_ERR(folio))
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!folio_test_large(folio)) {
			folio_put(folio);
			goto drop;
		}

		/*
		 * Move the inode on the list back to shrinklist if we failed
		 * to lock the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!folio_trylock(folio)) {
			folio_put(folio);
			goto move_back;
		}

		ret = split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);

		/* If split failed move the inode on the list back to shrinklist */
		if (ret)
			goto move_back;

		split++;
drop:
		list_del_init(&info->shrinklist);
		goto put;
move_back:
		/*
		 * Make sure the inode is either on the global list or deleted
		 * from any local list before iput() since it could be deleted
		 * in another thread once we put the inode (then the local list
		 * is corrupted).
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		list_move(&info->shrinklist, &sbinfo->shrinklist);
		sbinfo->shrinklist_len++;
		spin_unlock(&sbinfo->shrinklist_lock);
put:
		iput(inode);
	}

	return split;
}

static long shmem_unused_huge_scan(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (!READ_ONCE(sbinfo->shrinklist_len))
		return SHRINK_STOP;

	return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

static long shmem_unused_huge_count(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */

#define shmem_huge SHMEM_HUGE_DENY

bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
		   struct mm_struct *mm, unsigned long vm_flags)
{
	return false;
}

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Like filemap_add_folio, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct folio *folio,
				   struct address_space *mapping,
				   pgoff_t index, void *expected, gfp_t gfp,
				   struct mm_struct *charge_mm)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	long nr = folio_nr_pages(folio);
	int error;

	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
	VM_BUG_ON(expected && folio_test_large(folio));

	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = index;

	if (!folio_test_swapcache(folio)) {
		error = mem_cgroup_charge(folio, charge_mm, gfp);
		if (error) {
			if (folio_test_pmd_mappable(folio)) {
				count_vm_event(THP_FILE_FALLBACK);
				count_vm_event(THP_FILE_FALLBACK_CHARGE);
			}
			goto error;
		}
	}
	folio_throttle_swaprate(folio, gfp);

	do {
		xas_lock_irq(&xas);
		if (expected != xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		if (expected && xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;
		if (folio_test_pmd_mappable(folio)) {
			count_vm_event(THP_FILE_ALLOC);
			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
		}
		mapping->nrpages += nr;
		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		error = xas_error(&xas);
		goto error;
	}

	return 0;
error:
	folio->mapping = NULL;
	folio_ref_sub(folio, nr);
	return error;
}

/*
 * Like delete_from_page_cache, but substitutes swap for @folio.
 */
static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
	struct address_space *mapping = folio->mapping;
	long nr = folio_nr_pages(folio);
	int error;

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
	folio->mapping = NULL;
	mapping->nrpages -= nr;
	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
	xa_unlock_irq(&mapping->i_pages);
	folio_put(folio);
	BUG_ON(error);
}

/*
 * Remove swap entry from page cache, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned long swapped = 0;

	rcu_read_lock();
	xas_for_each(&xas, page, end - 1) {
		if (xas_retry(&xas, page))
			continue;
		if (xa_is_value(page))
			swapped++;

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}

	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma are swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	unsigned long swapped;

	/* Be careful as we don't hold info->lock */
	swapped = READ_ONCE(info->swapped);

	/*
	 * The easier cases are when the shmem object has nothing in swap, or
	 * the vma maps it whole. Then we can simply use the stats that we
	 * already track.
	 */
	if (!swapped)
		return 0;

	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
		return swapped << PAGE_SHIFT;

	/* Here comes the more involved part */
	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
					vma->vm_pgoff + vma_pages(vma));
}

/*
 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct folio_batch fbatch;
	pgoff_t index = 0;

	folio_batch_init(&fbatch);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping) &&
	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
		check_move_unevictable_folios(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
	struct folio *folio;

	/*
	 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
	 * beyond i_size, and reports fallocated folios as holes.
	 */
	folio = filemap_get_entry(inode->i_mapping, index);
	if (!folio)
		return folio;
	if (!xa_is_value(folio)) {
		folio_lock(folio);
		if (folio->mapping == inode->i_mapping)
			return folio;
		/* The folio has been swapped out */
		folio_unlock(folio);
		folio_put(folio);
	}
	/*
	 * But read a folio back from swap if any of it is within i_size
	 * (although in some cases this is just a waste of time).
	 */
	folio = NULL;
	shmem_get_folio(inode, index, &folio, SGP_READ);
	return folio;
}

/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio *folio;
	bool same_folio;
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
		info->fallocend = start;

	folio_batch_init(&fbatch);
	index = start;
	while (index < end && find_lock_entries(mapping, &index, end - 1,
			&fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
							indices[i], folio);
				continue;
			}

			if (!unfalloc || !folio_test_uptodate(folio))
				truncate_inode_folio(mapping, folio);
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}

	/*
	 * When undoing a failed fallocate, we want none of the partial folio
	 * zeroing and splitting below, but shall want to truncate the whole
	 * folio when !uptodate indicates that it was added by this fallocate,
	 * even when [lstart, lend] covers only a part of the folio.
	 */
	if (unfalloc)
		goto whole_folios;

	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
	if (folio) {
		same_folio = lend < folio_pos(folio) + folio_size(folio);
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
			start = folio->index + folio_nr_pages(folio);
			if (same_folio)
				end = folio->index;
		}
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;
	}

	if (!same_folio)
		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
	if (folio) {
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend))
			end = folio->index;
		folio_unlock(folio);
		folio_put(folio);
	}

whole_folios:

	index = start;
	while (index < end) {
		cond_resched();

		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
				indices)) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, indices[i], folio)) {
					/* Swap was replaced by page: retry */
					index = indices[i];
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			folio_lock(folio);

Shutemov 10860e499ed3SMatthew Wilcox (Oracle) if (!unfalloc || !folio_test_uptodate(folio)) { 10870e499ed3SMatthew Wilcox (Oracle) if (folio_mapping(folio) != mapping) { 1088b1a36650SHugh Dickins /* Page was replaced by swap: retry */ 10890e499ed3SMatthew Wilcox (Oracle) folio_unlock(folio); 10909fb6beeaSVishal Moola (Oracle) index = indices[i]; 1091b1a36650SHugh Dickins break; 10927a5d0fbbSHugh Dickins } 10930e499ed3SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(folio_test_writeback(folio), 10940e499ed3SMatthew Wilcox (Oracle) folio); 10950e499ed3SMatthew Wilcox (Oracle) truncate_inode_folio(mapping, folio); 109671725ed1SHugh Dickins } 10970e499ed3SMatthew Wilcox (Oracle) folio_unlock(folio); 1098bda97eabSHugh Dickins } 10990e499ed3SMatthew Wilcox (Oracle) folio_batch_remove_exceptionals(&fbatch); 11000e499ed3SMatthew Wilcox (Oracle) folio_batch_release(&fbatch); 1101bda97eabSHugh Dickins } 110294c1e62dSHugh Dickins 1103*3c1b7528SHugh Dickins shmem_recalc_inode(inode, 0, -nr_swaps_freed); 11041635f6a7SHugh Dickins } 11051da177e4SLinus Torvalds 11061635f6a7SHugh Dickins void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 11071635f6a7SHugh Dickins { 11081635f6a7SHugh Dickins shmem_undo_range(inode, lstart, lend, false); 1109078cd827SDeepa Dinamani inode->i_ctime = inode->i_mtime = current_time(inode); 111036f05cabSJeff Layton inode_inc_iversion(inode); 11111da177e4SLinus Torvalds } 111294c1e62dSHugh Dickins EXPORT_SYMBOL_GPL(shmem_truncate_range); 11131da177e4SLinus Torvalds 1114b74d24f7SChristian Brauner static int shmem_getattr(struct mnt_idmap *idmap, 1115549c7297SChristian Brauner const struct path *path, struct kstat *stat, 1116a528d35eSDavid Howells u32 request_mask, unsigned int query_flags) 111744a30220SYu Zhao { 1118a528d35eSDavid Howells struct inode *inode = path->dentry->d_inode; 111944a30220SYu Zhao struct shmem_inode_info *info = SHMEM_I(inode); 112044a30220SYu Zhao 1121*3c1b7528SHugh Dickins if (info->alloced - info->swapped != inode->i_mapping->nrpages) 1122*3c1b7528SHugh Dickins shmem_recalc_inode(inode, 0, 0); 1123*3c1b7528SHugh Dickins 1124e408e695STheodore Ts'o if (info->fsflags & FS_APPEND_FL) 1125e408e695STheodore Ts'o stat->attributes |= STATX_ATTR_APPEND; 1126e408e695STheodore Ts'o if (info->fsflags & FS_IMMUTABLE_FL) 1127e408e695STheodore Ts'o stat->attributes |= STATX_ATTR_IMMUTABLE; 1128e408e695STheodore Ts'o if (info->fsflags & FS_NODUMP_FL) 1129e408e695STheodore Ts'o stat->attributes |= STATX_ATTR_NODUMP; 1130e408e695STheodore Ts'o stat->attributes_mask |= (STATX_ATTR_APPEND | 1131e408e695STheodore Ts'o STATX_ATTR_IMMUTABLE | 1132e408e695STheodore Ts'o STATX_ATTR_NODUMP); 11337a80e5b8SGiuseppe Scrivano generic_fillattr(idmap, inode, stat); 113489fdcd26SYang Shi 11352cf13384SDavid Stevens if (shmem_is_huge(inode, 0, false, NULL, 0)) 113689fdcd26SYang Shi stat->blksize = HPAGE_PMD_SIZE; 113789fdcd26SYang Shi 1138f7cd16a5SXavier Roche if (request_mask & STATX_BTIME) { 1139f7cd16a5SXavier Roche stat->result_mask |= STATX_BTIME; 1140f7cd16a5SXavier Roche stat->btime.tv_sec = info->i_crtime.tv_sec; 1141f7cd16a5SXavier Roche stat->btime.tv_nsec = info->i_crtime.tv_nsec; 1142f7cd16a5SXavier Roche } 1143f7cd16a5SXavier Roche 114444a30220SYu Zhao return 0; 114544a30220SYu Zhao } 114644a30220SYu Zhao 1147c1632a0fSChristian Brauner static int shmem_setattr(struct mnt_idmap *idmap, 1148549c7297SChristian Brauner struct dentry *dentry, struct iattr *attr) 11491da177e4SLinus Torvalds { 115075c3cfa8SDavid Howells struct inode *inode = d_inode(dentry); 
115140e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 11521da177e4SLinus Torvalds int error; 115336f05cabSJeff Layton bool update_mtime = false; 115436f05cabSJeff Layton bool update_ctime = true; 11551da177e4SLinus Torvalds 11567a80e5b8SGiuseppe Scrivano error = setattr_prepare(idmap, dentry, attr); 1157db78b877SChristoph Hellwig if (error) 1158db78b877SChristoph Hellwig return error; 1159db78b877SChristoph Hellwig 11606fd73538SDaniel Verkamp if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { 11616fd73538SDaniel Verkamp if ((inode->i_mode ^ attr->ia_mode) & 0111) { 11626fd73538SDaniel Verkamp return -EPERM; 11636fd73538SDaniel Verkamp } 11646fd73538SDaniel Verkamp } 11656fd73538SDaniel Verkamp 116694c1e62dSHugh Dickins if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 116794c1e62dSHugh Dickins loff_t oldsize = inode->i_size; 116894c1e62dSHugh Dickins loff_t newsize = attr->ia_size; 11693889e6e7Snpiggin@suse.de 11709608703eSJan Kara /* protected by i_rwsem */ 117140e041a2SDavid Herrmann if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || 117240e041a2SDavid Herrmann (newsize > oldsize && (info->seals & F_SEAL_GROW))) 117340e041a2SDavid Herrmann return -EPERM; 117440e041a2SDavid Herrmann 117594c1e62dSHugh Dickins if (newsize != oldsize) { 117677142517SKonstantin Khlebnikov error = shmem_reacct_size(SHMEM_I(inode)->flags, 117777142517SKonstantin Khlebnikov oldsize, newsize); 117877142517SKonstantin Khlebnikov if (error) 117977142517SKonstantin Khlebnikov return error; 118094c1e62dSHugh Dickins i_size_write(inode, newsize); 118136f05cabSJeff Layton update_mtime = true; 118236f05cabSJeff Layton } else { 118336f05cabSJeff Layton update_ctime = false; 118494c1e62dSHugh Dickins } 1185afa2db2fSJosef Bacik if (newsize <= oldsize) { 118694c1e62dSHugh Dickins loff_t holebegin = round_up(newsize, PAGE_SIZE); 1187d0424c42SHugh Dickins if (oldsize > holebegin) 1188d0424c42SHugh Dickins unmap_mapping_range(inode->i_mapping, 1189d0424c42SHugh Dickins holebegin, 0, 1); 1190d0424c42SHugh Dickins if (info->alloced) 1191d0424c42SHugh Dickins shmem_truncate_range(inode, 1192d0424c42SHugh Dickins newsize, (loff_t)-1); 119394c1e62dSHugh Dickins /* unmap again to remove racily COWed private pages */ 1194d0424c42SHugh Dickins if (oldsize > holebegin) 1195d0424c42SHugh Dickins unmap_mapping_range(inode->i_mapping, 1196d0424c42SHugh Dickins holebegin, 0, 1); 119794c1e62dSHugh Dickins } 11981da177e4SLinus Torvalds } 11991da177e4SLinus Torvalds 1200e09764cfSCarlos Maiolino if (is_quota_modification(idmap, inode, attr)) { 1201e09764cfSCarlos Maiolino error = dquot_initialize(inode); 1202e09764cfSCarlos Maiolino if (error) 1203e09764cfSCarlos Maiolino return error; 1204e09764cfSCarlos Maiolino } 1205e09764cfSCarlos Maiolino 1206e09764cfSCarlos Maiolino /* Transfer quota accounting */ 1207e09764cfSCarlos Maiolino if (i_uid_needs_update(idmap, attr, inode) || 1208e09764cfSCarlos Maiolino i_gid_needs_update(idmap, attr, inode)) { 1209e09764cfSCarlos Maiolino error = dquot_transfer(idmap, inode, attr); 1210e09764cfSCarlos Maiolino 1211e09764cfSCarlos Maiolino if (error) 1212e09764cfSCarlos Maiolino return error; 1213e09764cfSCarlos Maiolino } 1214e09764cfSCarlos Maiolino 12157a80e5b8SGiuseppe Scrivano setattr_copy(idmap, inode, attr); 1216db78b877SChristoph Hellwig if (attr->ia_valid & ATTR_MODE) 12177a80e5b8SGiuseppe Scrivano error = posix_acl_chmod(idmap, dentry, inode->i_mode); 121836f05cabSJeff Layton if (!error && update_ctime) { 121936f05cabSJeff Layton 
inode->i_ctime = current_time(inode); 122036f05cabSJeff Layton if (update_mtime) 122136f05cabSJeff Layton inode->i_mtime = inode->i_ctime; 122236f05cabSJeff Layton inode_inc_iversion(inode); 122336f05cabSJeff Layton } 12241da177e4SLinus Torvalds return error; 12251da177e4SLinus Torvalds } 12261da177e4SLinus Torvalds 12271f895f75SAl Viro static void shmem_evict_inode(struct inode *inode) 12281da177e4SLinus Torvalds { 12291da177e4SLinus Torvalds struct shmem_inode_info *info = SHMEM_I(inode); 1230779750d2SKirill A. Shutemov struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 12311da177e4SLinus Torvalds 123230e6a51dSHui Su if (shmem_mapping(inode->i_mapping)) { 12331da177e4SLinus Torvalds shmem_unacct_size(info->flags, inode->i_size); 12341da177e4SLinus Torvalds inode->i_size = 0; 1235bc786390SHugh Dickins mapping_set_exiting(inode->i_mapping); 12363889e6e7Snpiggin@suse.de shmem_truncate_range(inode, 0, (loff_t)-1); 1237779750d2SKirill A. Shutemov if (!list_empty(&info->shrinklist)) { 1238779750d2SKirill A. Shutemov spin_lock(&sbinfo->shrinklist_lock); 1239779750d2SKirill A. Shutemov if (!list_empty(&info->shrinklist)) { 1240779750d2SKirill A. Shutemov list_del_init(&info->shrinklist); 1241779750d2SKirill A. Shutemov sbinfo->shrinklist_len--; 1242779750d2SKirill A. Shutemov } 1243779750d2SKirill A. Shutemov spin_unlock(&sbinfo->shrinklist_lock); 1244779750d2SKirill A. Shutemov } 1245af53d3e9SHugh Dickins while (!list_empty(&info->swaplist)) { 1246af53d3e9SHugh Dickins /* Wait while shmem_unuse() is scanning this inode... */ 1247af53d3e9SHugh Dickins wait_var_event(&info->stop_eviction, 1248af53d3e9SHugh Dickins !atomic_read(&info->stop_eviction)); 1249cb5f7b9aSHugh Dickins mutex_lock(&shmem_swaplist_mutex); 1250af53d3e9SHugh Dickins /* ...but beware of the race if we peeked too early */ 1251af53d3e9SHugh Dickins if (!atomic_read(&info->stop_eviction)) 12521da177e4SLinus Torvalds list_del_init(&info->swaplist); 1253cb5f7b9aSHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 12541da177e4SLinus Torvalds } 12553ed47db3SAl Viro } 1256b09e0fa4SEric Paris 125738f38657SAristeu Rozanski simple_xattrs_free(&info->xattrs); 12580f3c42f5SHugh Dickins WARN_ON(inode->i_blocks); 12595b04c689SPavel Emelyanov shmem_free_inode(inode->i_sb); 1260dbd5768fSJan Kara clear_inode(inode); 1261e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 1262e09764cfSCarlos Maiolino dquot_free_inode(inode); 1263e09764cfSCarlos Maiolino dquot_drop(inode); 1264e09764cfSCarlos Maiolino #endif 12651da177e4SLinus Torvalds } 12661da177e4SLinus Torvalds 1267b56a2d8aSVineeth Remanan Pillai static int shmem_find_swap_entries(struct address_space *mapping, 1268da08e9b7SMatthew Wilcox (Oracle) pgoff_t start, struct folio_batch *fbatch, 1269da08e9b7SMatthew Wilcox (Oracle) pgoff_t *indices, unsigned int type) 1270478922e2SMatthew Wilcox { 1271b56a2d8aSVineeth Remanan Pillai XA_STATE(xas, &mapping->i_pages, start); 1272da08e9b7SMatthew Wilcox (Oracle) struct folio *folio; 127387039546SHugh Dickins swp_entry_t entry; 1274478922e2SMatthew Wilcox 1275478922e2SMatthew Wilcox rcu_read_lock(); 1276da08e9b7SMatthew Wilcox (Oracle) xas_for_each(&xas, folio, ULONG_MAX) { 1277da08e9b7SMatthew Wilcox (Oracle) if (xas_retry(&xas, folio)) 12785b9c98f3SMike Kravetz continue; 1279b56a2d8aSVineeth Remanan Pillai 1280da08e9b7SMatthew Wilcox (Oracle) if (!xa_is_value(folio)) 1281478922e2SMatthew Wilcox continue; 1282b56a2d8aSVineeth Remanan Pillai 1283da08e9b7SMatthew Wilcox (Oracle) entry = radix_to_swp_entry(folio); 12846cec2b95SMiaohe Lin /* 
12856cec2b95SMiaohe Lin * swapin error entries can be found in the mapping. But they're 12866cec2b95SMiaohe Lin * deliberately ignored here as we've done everything we can do. 12876cec2b95SMiaohe Lin */ 128887039546SHugh Dickins if (swp_type(entry) != type) 1289b56a2d8aSVineeth Remanan Pillai continue; 1290b56a2d8aSVineeth Remanan Pillai 1291e384200eSHugh Dickins indices[folio_batch_count(fbatch)] = xas.xa_index; 1292da08e9b7SMatthew Wilcox (Oracle) if (!folio_batch_add(fbatch, folio)) 1293da08e9b7SMatthew Wilcox (Oracle) break; 1294b56a2d8aSVineeth Remanan Pillai 1295b56a2d8aSVineeth Remanan Pillai if (need_resched()) { 1296e21a2955SMatthew Wilcox xas_pause(&xas); 1297478922e2SMatthew Wilcox cond_resched_rcu(); 1298478922e2SMatthew Wilcox } 1299b56a2d8aSVineeth Remanan Pillai } 1300478922e2SMatthew Wilcox rcu_read_unlock(); 1301e21a2955SMatthew Wilcox 1302da08e9b7SMatthew Wilcox (Oracle) return xas.xa_index; 1303b56a2d8aSVineeth Remanan Pillai } 1304b56a2d8aSVineeth Remanan Pillai 1305b56a2d8aSVineeth Remanan Pillai /* 1306b56a2d8aSVineeth Remanan Pillai * Move the swapped pages for an inode to page cache. Returns the count 1307b56a2d8aSVineeth Remanan Pillai * of pages swapped in, or the error in case of failure. 1308b56a2d8aSVineeth Remanan Pillai */ 1309da08e9b7SMatthew Wilcox (Oracle) static int shmem_unuse_swap_entries(struct inode *inode, 1310da08e9b7SMatthew Wilcox (Oracle) struct folio_batch *fbatch, pgoff_t *indices) 1311b56a2d8aSVineeth Remanan Pillai { 1312b56a2d8aSVineeth Remanan Pillai int i = 0; 1313b56a2d8aSVineeth Remanan Pillai int ret = 0; 1314b56a2d8aSVineeth Remanan Pillai int error = 0; 1315b56a2d8aSVineeth Remanan Pillai struct address_space *mapping = inode->i_mapping; 1316b56a2d8aSVineeth Remanan Pillai 1317da08e9b7SMatthew Wilcox (Oracle) for (i = 0; i < folio_batch_count(fbatch); i++) { 1318da08e9b7SMatthew Wilcox (Oracle) struct folio *folio = fbatch->folios[i]; 1319b56a2d8aSVineeth Remanan Pillai 1320da08e9b7SMatthew Wilcox (Oracle) if (!xa_is_value(folio)) 1321b56a2d8aSVineeth Remanan Pillai continue; 1322da08e9b7SMatthew Wilcox (Oracle) error = shmem_swapin_folio(inode, indices[i], 1323da08e9b7SMatthew Wilcox (Oracle) &folio, SGP_CACHE, 1324b56a2d8aSVineeth Remanan Pillai mapping_gfp_mask(mapping), 1325b56a2d8aSVineeth Remanan Pillai NULL, NULL); 1326b56a2d8aSVineeth Remanan Pillai if (error == 0) { 1327da08e9b7SMatthew Wilcox (Oracle) folio_unlock(folio); 1328da08e9b7SMatthew Wilcox (Oracle) folio_put(folio); 1329b56a2d8aSVineeth Remanan Pillai ret++; 1330b56a2d8aSVineeth Remanan Pillai } 1331b56a2d8aSVineeth Remanan Pillai if (error == -ENOMEM) 1332b56a2d8aSVineeth Remanan Pillai break; 1333b56a2d8aSVineeth Remanan Pillai error = 0; 1334b56a2d8aSVineeth Remanan Pillai } 1335b56a2d8aSVineeth Remanan Pillai return error ? error : ret; 1336478922e2SMatthew Wilcox } 1337478922e2SMatthew Wilcox 133846f65ec1SHugh Dickins /* 133946f65ec1SHugh Dickins * If swap found in inode, free it and move page from swapcache to filecache. 
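 * This runs on behalf of swapoff(2); roughly (illustrative call chain,
 * device name made up):
 *	swapoff("/dev/vdb2");
 * leads to try_to_unuse() calling shmem_unuse(type), which walks
 * shmem_swaplist and hands each swapped shmem inode to this function.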
134046f65ec1SHugh Dickins */ 134110a9c496SChristoph Hellwig static int shmem_unuse_inode(struct inode *inode, unsigned int type) 13421da177e4SLinus Torvalds { 1343b56a2d8aSVineeth Remanan Pillai struct address_space *mapping = inode->i_mapping; 1344b56a2d8aSVineeth Remanan Pillai pgoff_t start = 0; 1345da08e9b7SMatthew Wilcox (Oracle) struct folio_batch fbatch; 1346b56a2d8aSVineeth Remanan Pillai pgoff_t indices[PAGEVEC_SIZE]; 1347b56a2d8aSVineeth Remanan Pillai int ret = 0; 13481da177e4SLinus Torvalds 1349b56a2d8aSVineeth Remanan Pillai do { 1350da08e9b7SMatthew Wilcox (Oracle) folio_batch_init(&fbatch); 1351da08e9b7SMatthew Wilcox (Oracle) shmem_find_swap_entries(mapping, start, &fbatch, indices, type); 1352da08e9b7SMatthew Wilcox (Oracle) if (folio_batch_count(&fbatch) == 0) { 1353b56a2d8aSVineeth Remanan Pillai ret = 0; 1354778dd893SHugh Dickins break; 1355b56a2d8aSVineeth Remanan Pillai } 1356b56a2d8aSVineeth Remanan Pillai 1357da08e9b7SMatthew Wilcox (Oracle) ret = shmem_unuse_swap_entries(inode, &fbatch, indices); 1358b56a2d8aSVineeth Remanan Pillai if (ret < 0) 1359b56a2d8aSVineeth Remanan Pillai break; 1360b56a2d8aSVineeth Remanan Pillai 1361da08e9b7SMatthew Wilcox (Oracle) start = indices[folio_batch_count(&fbatch) - 1]; 1362b56a2d8aSVineeth Remanan Pillai } while (true); 1363b56a2d8aSVineeth Remanan Pillai 1364b56a2d8aSVineeth Remanan Pillai return ret; 1365b56a2d8aSVineeth Remanan Pillai } 1366b56a2d8aSVineeth Remanan Pillai 1367b56a2d8aSVineeth Remanan Pillai /* 1368b56a2d8aSVineeth Remanan Pillai * Read all the shared memory data that resides in the swap 1369b56a2d8aSVineeth Remanan Pillai * device 'type' back into memory, so the swap device can be 1370b56a2d8aSVineeth Remanan Pillai * unused. 1371b56a2d8aSVineeth Remanan Pillai */ 137210a9c496SChristoph Hellwig int shmem_unuse(unsigned int type) 1373b56a2d8aSVineeth Remanan Pillai { 1374b56a2d8aSVineeth Remanan Pillai struct shmem_inode_info *info, *next; 1375b56a2d8aSVineeth Remanan Pillai int error = 0; 1376b56a2d8aSVineeth Remanan Pillai 1377b56a2d8aSVineeth Remanan Pillai if (list_empty(&shmem_swaplist)) 1378b56a2d8aSVineeth Remanan Pillai return 0; 1379b56a2d8aSVineeth Remanan Pillai 1380b56a2d8aSVineeth Remanan Pillai mutex_lock(&shmem_swaplist_mutex); 1381b56a2d8aSVineeth Remanan Pillai list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { 1382b56a2d8aSVineeth Remanan Pillai if (!info->swapped) { 1383b56a2d8aSVineeth Remanan Pillai list_del_init(&info->swaplist); 1384b56a2d8aSVineeth Remanan Pillai continue; 1385b56a2d8aSVineeth Remanan Pillai } 1386af53d3e9SHugh Dickins /* 1387af53d3e9SHugh Dickins * Drop the swaplist mutex while searching the inode for swap; 1388af53d3e9SHugh Dickins * but before doing so, make sure shmem_evict_inode() will not 1389af53d3e9SHugh Dickins * remove placeholder inode from swaplist, nor let it be freed 1390af53d3e9SHugh Dickins * (igrab() would protect from unlink, but not from unmount). 
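 * The guard used for that is info->stop_eviction: it is raised here, and
 * shmem_evict_inode() waits for it to drop back to zero before tearing
 * the inode down.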
1391af53d3e9SHugh Dickins */ 1392af53d3e9SHugh Dickins atomic_inc(&info->stop_eviction); 1393b56a2d8aSVineeth Remanan Pillai mutex_unlock(&shmem_swaplist_mutex); 1394b56a2d8aSVineeth Remanan Pillai 139510a9c496SChristoph Hellwig error = shmem_unuse_inode(&info->vfs_inode, type); 1396b56a2d8aSVineeth Remanan Pillai cond_resched(); 1397b56a2d8aSVineeth Remanan Pillai 1398b56a2d8aSVineeth Remanan Pillai mutex_lock(&shmem_swaplist_mutex); 1399b56a2d8aSVineeth Remanan Pillai next = list_next_entry(info, swaplist); 1400b56a2d8aSVineeth Remanan Pillai if (!info->swapped) 1401b56a2d8aSVineeth Remanan Pillai list_del_init(&info->swaplist); 1402af53d3e9SHugh Dickins if (atomic_dec_and_test(&info->stop_eviction)) 1403af53d3e9SHugh Dickins wake_up_var(&info->stop_eviction); 1404b56a2d8aSVineeth Remanan Pillai if (error) 1405b56a2d8aSVineeth Remanan Pillai break; 14061da177e4SLinus Torvalds } 1407cb5f7b9aSHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 1408778dd893SHugh Dickins 1409778dd893SHugh Dickins return error; 14101da177e4SLinus Torvalds } 14111da177e4SLinus Torvalds 14121da177e4SLinus Torvalds /* 14131da177e4SLinus Torvalds * Move the page from the page cache to the swap cache. 14141da177e4SLinus Torvalds */ 14151da177e4SLinus Torvalds static int shmem_writepage(struct page *page, struct writeback_control *wbc) 14161da177e4SLinus Torvalds { 1417e2e3fdc7SMatthew Wilcox (Oracle) struct folio *folio = page_folio(page); 14188ccee8c1SLuis Chamberlain struct address_space *mapping = folio->mapping; 14198ccee8c1SLuis Chamberlain struct inode *inode = mapping->host; 14208ccee8c1SLuis Chamberlain struct shmem_inode_info *info = SHMEM_I(inode); 14212c6efe9cSLuis Chamberlain struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 14226922c0c7SHugh Dickins swp_entry_t swap; 14236922c0c7SHugh Dickins pgoff_t index; 14241da177e4SLinus Torvalds 14251e6decf3SHugh Dickins /* 1426cf7992bfSLuis Chamberlain * Our capabilities prevent regular writeback or sync from ever calling 1427cf7992bfSLuis Chamberlain * shmem_writepage; but a stacking filesystem might use ->writepage of 1428cf7992bfSLuis Chamberlain * its underlying filesystem, in which case tmpfs should write out to 1429cf7992bfSLuis Chamberlain * swap only in response to memory pressure, and not for the writeback 1430cf7992bfSLuis Chamberlain * threads or sync. 1431cf7992bfSLuis Chamberlain */ 1432cf7992bfSLuis Chamberlain if (WARN_ON_ONCE(!wbc->for_reclaim)) 1433cf7992bfSLuis Chamberlain goto redirty; 1434cf7992bfSLuis Chamberlain 14352c6efe9cSLuis Chamberlain if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap)) 14369a976f0cSLuis Chamberlain goto redirty; 14379a976f0cSLuis Chamberlain 14389a976f0cSLuis Chamberlain if (!total_swap_pages) 14399a976f0cSLuis Chamberlain goto redirty; 14409a976f0cSLuis Chamberlain 1441cf7992bfSLuis Chamberlain /* 14421e6decf3SHugh Dickins * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or 14431e6decf3SHugh Dickins * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, 14441e6decf3SHugh Dickins * and its shmem_writeback() needs them to be split when swapping. 
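 * e.g. after
 *	echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 * large folios can reach this writeout path, hence the split below.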
14451e6decf3SHugh Dickins */ 1446f530ed0eSMatthew Wilcox (Oracle) if (folio_test_large(folio)) { 14471e6decf3SHugh Dickins /* Ensure the subpages are still dirty */ 1448f530ed0eSMatthew Wilcox (Oracle) folio_test_set_dirty(folio); 14491e6decf3SHugh Dickins if (split_huge_page(page) < 0) 14501e6decf3SHugh Dickins goto redirty; 1451f530ed0eSMatthew Wilcox (Oracle) folio = page_folio(page); 1452f530ed0eSMatthew Wilcox (Oracle) folio_clear_dirty(folio); 14531e6decf3SHugh Dickins } 14541e6decf3SHugh Dickins 1455f530ed0eSMatthew Wilcox (Oracle) index = folio->index; 14561635f6a7SHugh Dickins 14571635f6a7SHugh Dickins /* 14581635f6a7SHugh Dickins * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC 14591635f6a7SHugh Dickins * value into swapfile.c, the only way we can correctly account for a 1460f530ed0eSMatthew Wilcox (Oracle) * fallocated folio arriving here is now to initialize it and write it. 14611aac1400SHugh Dickins * 1462f530ed0eSMatthew Wilcox (Oracle) * That's okay for a folio already fallocated earlier, but if we have 14631aac1400SHugh Dickins * not yet completed the fallocation, then (a) we want to keep track 1464f530ed0eSMatthew Wilcox (Oracle) * of this folio in case we have to undo it, and (b) it may not be a 14651aac1400SHugh Dickins * good idea to continue anyway, once we're pushing into swap. So 1466f530ed0eSMatthew Wilcox (Oracle) * reactivate the folio, and let shmem_fallocate() quit when too many. 14671635f6a7SHugh Dickins */ 1468f530ed0eSMatthew Wilcox (Oracle) if (!folio_test_uptodate(folio)) { 14691aac1400SHugh Dickins if (inode->i_private) { 14701aac1400SHugh Dickins struct shmem_falloc *shmem_falloc; 14711aac1400SHugh Dickins spin_lock(&inode->i_lock); 14721aac1400SHugh Dickins shmem_falloc = inode->i_private; 14731aac1400SHugh Dickins if (shmem_falloc && 14748e205f77SHugh Dickins !shmem_falloc->waitq && 14751aac1400SHugh Dickins index >= shmem_falloc->start && 14761aac1400SHugh Dickins index < shmem_falloc->next) 14771aac1400SHugh Dickins shmem_falloc->nr_unswapped++; 14781aac1400SHugh Dickins else 14791aac1400SHugh Dickins shmem_falloc = NULL; 14801aac1400SHugh Dickins spin_unlock(&inode->i_lock); 14811aac1400SHugh Dickins if (shmem_falloc) 14821aac1400SHugh Dickins goto redirty; 14831aac1400SHugh Dickins } 1484f530ed0eSMatthew Wilcox (Oracle) folio_zero_range(folio, 0, folio_size(folio)); 1485f530ed0eSMatthew Wilcox (Oracle) flush_dcache_folio(folio); 1486f530ed0eSMatthew Wilcox (Oracle) folio_mark_uptodate(folio); 14871635f6a7SHugh Dickins } 14881635f6a7SHugh Dickins 1489e2e3fdc7SMatthew Wilcox (Oracle) swap = folio_alloc_swap(folio); 149048f170fbSHugh Dickins if (!swap.val) 149148f170fbSHugh Dickins goto redirty; 1492d9fe526aSHugh Dickins 1493b1dea800SHugh Dickins /* 1494b1dea800SHugh Dickins * Add inode to shmem_unuse()'s list of swapped-out inodes, 1495f530ed0eSMatthew Wilcox (Oracle) * if it's not already there. Do it now before the folio is 14966922c0c7SHugh Dickins * moved to swap cache, when its pagelock no longer protects 1497b1dea800SHugh Dickins * the inode from eviction. But don't unlock the mutex until 14986922c0c7SHugh Dickins * we've incremented swapped, because shmem_unuse_inode() will 14996922c0c7SHugh Dickins * prune a !swapped inode from the swaplist under this mutex. 
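 * (info->swapped is only bumped, via shmem_recalc_inode(inode, 0, 1),
 * once add_to_swap_cache() has succeeded below.)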
1500b1dea800SHugh Dickins */ 1501b1dea800SHugh Dickins mutex_lock(&shmem_swaplist_mutex); 150205bf86b4SHugh Dickins if (list_empty(&info->swaplist)) 1503b56a2d8aSVineeth Remanan Pillai list_add(&info->swaplist, &shmem_swaplist); 1504b1dea800SHugh Dickins 1505a4c366f0SMatthew Wilcox (Oracle) if (add_to_swap_cache(folio, swap, 15063852f676SJoonsoo Kim __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, 15073852f676SJoonsoo Kim NULL) == 0) { 1508*3c1b7528SHugh Dickins shmem_recalc_inode(inode, 0, 1); 1509aaa46865SHugh Dickins swap_shmem_alloc(swap); 15104cd400fdSMatthew Wilcox (Oracle) shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); 15116922c0c7SHugh Dickins 15126922c0c7SHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 1513f530ed0eSMatthew Wilcox (Oracle) BUG_ON(folio_mapped(folio)); 1514f530ed0eSMatthew Wilcox (Oracle) swap_writepage(&folio->page, wbc); 15151da177e4SLinus Torvalds return 0; 15161da177e4SLinus Torvalds } 15171da177e4SLinus Torvalds 15186922c0c7SHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 15194081f744SMatthew Wilcox (Oracle) put_swap_folio(folio, swap); 15201da177e4SLinus Torvalds redirty: 1521f530ed0eSMatthew Wilcox (Oracle) folio_mark_dirty(folio); 1522d9fe526aSHugh Dickins if (wbc->for_reclaim) 1523f530ed0eSMatthew Wilcox (Oracle) return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ 1524f530ed0eSMatthew Wilcox (Oracle) folio_unlock(folio); 1525d9fe526aSHugh Dickins return 0; 15261da177e4SLinus Torvalds } 15271da177e4SLinus Torvalds 152875edd345SHugh Dickins #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) 152971fe804bSLee Schermerhorn static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 1530680d794bSakpm@linux-foundation.org { 1531680d794bSakpm@linux-foundation.org char buffer[64]; 1532680d794bSakpm@linux-foundation.org 153371fe804bSLee Schermerhorn if (!mpol || mpol->mode == MPOL_DEFAULT) 1534095f1fc4SLee Schermerhorn return; /* show nothing */ 1535095f1fc4SLee Schermerhorn 1536a7a88b23SHugh Dickins mpol_to_str(buffer, sizeof(buffer), mpol); 1537095f1fc4SLee Schermerhorn 1538095f1fc4SLee Schermerhorn seq_printf(seq, ",mpol=%s", buffer); 1539680d794bSakpm@linux-foundation.org } 154071fe804bSLee Schermerhorn 154171fe804bSLee Schermerhorn static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 154271fe804bSLee Schermerhorn { 154371fe804bSLee Schermerhorn struct mempolicy *mpol = NULL; 154471fe804bSLee Schermerhorn if (sbinfo->mpol) { 1545bf11b9a8SSebastian Andrzej Siewior raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 154671fe804bSLee Schermerhorn mpol = sbinfo->mpol; 154771fe804bSLee Schermerhorn mpol_get(mpol); 1548bf11b9a8SSebastian Andrzej Siewior raw_spin_unlock(&sbinfo->stat_lock); 154971fe804bSLee Schermerhorn } 155071fe804bSLee Schermerhorn return mpol; 155171fe804bSLee Schermerhorn } 155275edd345SHugh Dickins #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ 155375edd345SHugh Dickins static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 155475edd345SHugh Dickins { 155575edd345SHugh Dickins } 155675edd345SHugh Dickins static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 155775edd345SHugh Dickins { 155875edd345SHugh Dickins return NULL; 155975edd345SHugh Dickins } 156075edd345SHugh Dickins #endif /* CONFIG_NUMA && CONFIG_TMPFS */ 156175edd345SHugh Dickins #ifndef CONFIG_NUMA 156275edd345SHugh Dickins #define vm_policy vm_private_data 156375edd345SHugh Dickins #endif 1564680d794bSakpm@linux-foundation.org 1565800d8c63SKirill A. 
Shutemov static void shmem_pseudo_vma_init(struct vm_area_struct *vma, 1566800d8c63SKirill A. Shutemov struct shmem_inode_info *info, pgoff_t index) 1567800d8c63SKirill A. Shutemov { 1568800d8c63SKirill A. Shutemov /* Create a pseudo vma that just contains the policy */ 15692c4541e2SKirill A. Shutemov vma_init(vma, NULL); 1570800d8c63SKirill A. Shutemov /* Bias interleave by inode number to distribute better across nodes */ 1571800d8c63SKirill A. Shutemov vma->vm_pgoff = index + info->vfs_inode.i_ino; 1572800d8c63SKirill A. Shutemov vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); 1573800d8c63SKirill A. Shutemov } 1574800d8c63SKirill A. Shutemov 1575800d8c63SKirill A. Shutemov static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) 1576800d8c63SKirill A. Shutemov { 1577800d8c63SKirill A. Shutemov /* Drop reference taken by mpol_shared_policy_lookup() */ 1578800d8c63SKirill A. Shutemov mpol_cond_put(vma->vm_policy); 1579800d8c63SKirill A. Shutemov } 1580800d8c63SKirill A. Shutemov 15815739a81cSMatthew Wilcox (Oracle) static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, 158241ffe5d5SHugh Dickins struct shmem_inode_info *info, pgoff_t index) 15831da177e4SLinus Torvalds { 15841da177e4SLinus Torvalds struct vm_area_struct pvma; 158518a2f371SMel Gorman struct page *page; 15868c63ca5bSWill Deacon struct vm_fault vmf = { 15878c63ca5bSWill Deacon .vma = &pvma, 15888c63ca5bSWill Deacon }; 15891da177e4SLinus Torvalds 1590800d8c63SKirill A. Shutemov shmem_pseudo_vma_init(&pvma, info, index); 1591e9e9b7ecSMinchan Kim page = swap_cluster_readahead(swap, gfp, &vmf); 1592800d8c63SKirill A. Shutemov shmem_pseudo_vma_destroy(&pvma); 159318a2f371SMel Gorman 15945739a81cSMatthew Wilcox (Oracle) if (!page) 15955739a81cSMatthew Wilcox (Oracle) return NULL; 15965739a81cSMatthew Wilcox (Oracle) return page_folio(page); 1597800d8c63SKirill A. Shutemov } 159818a2f371SMel Gorman 159978cc8cdcSRik van Riel /* 160078cc8cdcSRik van Riel * Make sure huge_gfp is always more limited than limit_gfp. 160178cc8cdcSRik van Riel * Some of the flags set permissions, while others set limitations. 160278cc8cdcSRik van Riel */ 160378cc8cdcSRik van Riel static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) 160478cc8cdcSRik van Riel { 160578cc8cdcSRik van Riel gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; 160678cc8cdcSRik van Riel gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; 1607187df5ddSRik van Riel gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; 1608187df5ddSRik van Riel gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); 1609187df5ddSRik van Riel 1610187df5ddSRik van Riel /* Allow allocations only from the originally specified zones. */ 1611187df5ddSRik van Riel result |= zoneflags; 161278cc8cdcSRik van Riel 161378cc8cdcSRik van Riel /* 161478cc8cdcSRik van Riel * Minimize the result gfp by taking the union with the deny flags, 161578cc8cdcSRik van Riel * and the intersection of the allow flags. 161678cc8cdcSRik van Riel */ 161778cc8cdcSRik van Riel result |= (limit_gfp & denyflags); 161878cc8cdcSRik van Riel result |= (huge_gfp & limit_gfp) & allowflags; 161978cc8cdcSRik van Riel 162078cc8cdcSRik van Riel return result; 162178cc8cdcSRik van Riel } 162278cc8cdcSRik van Riel 162372827e5cSMatthew Wilcox (Oracle) static struct folio *shmem_alloc_hugefolio(gfp_t gfp, 1624800d8c63SKirill A. Shutemov struct shmem_inode_info *info, pgoff_t index) 1625800d8c63SKirill A. Shutemov { 1626800d8c63SKirill A. 
Shutemov struct vm_area_struct pvma; 16277b8d046fSMatthew Wilcox struct address_space *mapping = info->vfs_inode.i_mapping; 16287b8d046fSMatthew Wilcox pgoff_t hindex; 1629dfe98499SMatthew Wilcox (Oracle) struct folio *folio; 1630800d8c63SKirill A. Shutemov 16314620a06eSGeert Uytterhoeven hindex = round_down(index, HPAGE_PMD_NR); 16327b8d046fSMatthew Wilcox if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, 16337b8d046fSMatthew Wilcox XA_PRESENT)) 1634800d8c63SKirill A. Shutemov return NULL; 1635800d8c63SKirill A. Shutemov 1636800d8c63SKirill A. Shutemov shmem_pseudo_vma_init(&pvma, info, hindex); 1637dfe98499SMatthew Wilcox (Oracle) folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); 1638800d8c63SKirill A. Shutemov shmem_pseudo_vma_destroy(&pvma); 1639dfe98499SMatthew Wilcox (Oracle) if (!folio) 1640dcdf11eeSDavid Rientjes count_vm_event(THP_FILE_FALLBACK); 164172827e5cSMatthew Wilcox (Oracle) return folio; 164218a2f371SMel Gorman } 164318a2f371SMel Gorman 16440c023ef5SMatthew Wilcox (Oracle) static struct folio *shmem_alloc_folio(gfp_t gfp, 164518a2f371SMel Gorman struct shmem_inode_info *info, pgoff_t index) 164618a2f371SMel Gorman { 164718a2f371SMel Gorman struct vm_area_struct pvma; 16480c023ef5SMatthew Wilcox (Oracle) struct folio *folio; 164918a2f371SMel Gorman 1650800d8c63SKirill A. Shutemov shmem_pseudo_vma_init(&pvma, info, index); 16510c023ef5SMatthew Wilcox (Oracle) folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); 1652800d8c63SKirill A. Shutemov shmem_pseudo_vma_destroy(&pvma); 165318a2f371SMel Gorman 16540c023ef5SMatthew Wilcox (Oracle) return folio; 165518a2f371SMel Gorman } 165618a2f371SMel Gorman 1657b1d0ec3aSMatthew Wilcox (Oracle) static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, 1658800d8c63SKirill A. Shutemov pgoff_t index, bool huge) 1659800d8c63SKirill A. Shutemov { 16600f079694SMike Rapoport struct shmem_inode_info *info = SHMEM_I(inode); 166172827e5cSMatthew Wilcox (Oracle) struct folio *folio; 1662800d8c63SKirill A. Shutemov int nr; 1663c7e263abSLukas Czerner int err; 1664800d8c63SKirill A. Shutemov 1665396bcc52SMatthew Wilcox (Oracle) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 1666800d8c63SKirill A. Shutemov huge = false; 1667800d8c63SKirill A. Shutemov nr = huge ? HPAGE_PMD_NR : 1; 1668800d8c63SKirill A. Shutemov 1669c7e263abSLukas Czerner err = shmem_inode_acct_block(inode, nr); 1670c7e263abSLukas Czerner if (err) 1671800d8c63SKirill A. Shutemov goto failed; 1672800d8c63SKirill A. Shutemov 1673800d8c63SKirill A. Shutemov if (huge) 167472827e5cSMatthew Wilcox (Oracle) folio = shmem_alloc_hugefolio(gfp, info, index); 1675800d8c63SKirill A. Shutemov else 167672827e5cSMatthew Wilcox (Oracle) folio = shmem_alloc_folio(gfp, info, index); 167772827e5cSMatthew Wilcox (Oracle) if (folio) { 167872827e5cSMatthew Wilcox (Oracle) __folio_set_locked(folio); 167972827e5cSMatthew Wilcox (Oracle) __folio_set_swapbacked(folio); 1680b1d0ec3aSMatthew Wilcox (Oracle) return folio; 168175edd345SHugh Dickins } 168218a2f371SMel Gorman 1683800d8c63SKirill A. Shutemov err = -ENOMEM; 16840f079694SMike Rapoport shmem_inode_unacct_blocks(inode, nr); 1685800d8c63SKirill A. Shutemov failed: 1686800d8c63SKirill A. 
Shutemov return ERR_PTR(err); 16871da177e4SLinus Torvalds } 168871fe804bSLee Schermerhorn 16891da177e4SLinus Torvalds /* 1690bde05d1cSHugh Dickins * When a page is moved from swapcache to shmem filecache (either by the 1691fc26babbSMatthew Wilcox (Oracle) * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of 1692bde05d1cSHugh Dickins * shmem_unuse_inode()), it may have been read in earlier from swap, in 1693bde05d1cSHugh Dickins * ignorance of the mapping it belongs to. If that mapping has special 1694bde05d1cSHugh Dickins * constraints (like the gma500 GEM driver, which requires RAM below 4GB), 1695bde05d1cSHugh Dickins * we may need to copy to a suitable page before moving to filecache. 1696bde05d1cSHugh Dickins * 1697bde05d1cSHugh Dickins * In a future release, this may well be extended to respect cpuset and 1698bde05d1cSHugh Dickins * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); 1699bde05d1cSHugh Dickins * but for now it is a simple matter of zone. 1700bde05d1cSHugh Dickins */ 1701069d849cSMatthew Wilcox (Oracle) static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) 1702bde05d1cSHugh Dickins { 1703069d849cSMatthew Wilcox (Oracle) return folio_zonenum(folio) > gfp_zone(gfp); 1704bde05d1cSHugh Dickins } 1705bde05d1cSHugh Dickins 17060d698e25SMatthew Wilcox (Oracle) static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, 1707bde05d1cSHugh Dickins struct shmem_inode_info *info, pgoff_t index) 1708bde05d1cSHugh Dickins { 1709d21bba2bSMatthew Wilcox (Oracle) struct folio *old, *new; 1710bde05d1cSHugh Dickins struct address_space *swap_mapping; 1711c1cb20d4SYu Zhao swp_entry_t entry; 1712bde05d1cSHugh Dickins pgoff_t swap_index; 1713bde05d1cSHugh Dickins int error; 1714bde05d1cSHugh Dickins 17150d698e25SMatthew Wilcox (Oracle) old = *foliop; 1716907ea17eSMatthew Wilcox (Oracle) entry = folio_swap_entry(old); 1717c1cb20d4SYu Zhao swap_index = swp_offset(entry); 1718907ea17eSMatthew Wilcox (Oracle) swap_mapping = swap_address_space(entry); 1719bde05d1cSHugh Dickins 1720bde05d1cSHugh Dickins /* 1721bde05d1cSHugh Dickins * We have arrived here because our zones are constrained, so don't 1722bde05d1cSHugh Dickins * limit chance of success by further cpuset and node constraints. 1723bde05d1cSHugh Dickins */ 1724bde05d1cSHugh Dickins gfp &= ~GFP_CONSTRAINT_MASK; 1725907ea17eSMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(folio_test_large(old), old); 1726907ea17eSMatthew Wilcox (Oracle) new = shmem_alloc_folio(gfp, info, index); 1727907ea17eSMatthew Wilcox (Oracle) if (!new) 1728bde05d1cSHugh Dickins return -ENOMEM; 1729bde05d1cSHugh Dickins 1730907ea17eSMatthew Wilcox (Oracle) folio_get(new); 1731907ea17eSMatthew Wilcox (Oracle) folio_copy(new, old); 1732907ea17eSMatthew Wilcox (Oracle) flush_dcache_folio(new); 1733bde05d1cSHugh Dickins 1734907ea17eSMatthew Wilcox (Oracle) __folio_set_locked(new); 1735907ea17eSMatthew Wilcox (Oracle) __folio_set_swapbacked(new); 1736907ea17eSMatthew Wilcox (Oracle) folio_mark_uptodate(new); 1737907ea17eSMatthew Wilcox (Oracle) folio_set_swap_entry(new, entry); 1738907ea17eSMatthew Wilcox (Oracle) folio_set_swapcache(new); 1739bde05d1cSHugh Dickins 1740bde05d1cSHugh Dickins /* 1741bde05d1cSHugh Dickins * Our caller will very soon move newpage out of swapcache, but it's 1742bde05d1cSHugh Dickins * a nice clean interface for us to replace oldpage by newpage there. 
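 * (That swap cache slot is swapped over below, under the swap mapping's
 * xa_lock, by shmem_replace_entry().)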
1743bde05d1cSHugh Dickins */ 1744b93b0163SMatthew Wilcox xa_lock_irq(&swap_mapping->i_pages); 1745907ea17eSMatthew Wilcox (Oracle) error = shmem_replace_entry(swap_mapping, swap_index, old, new); 17460142ef6cSHugh Dickins if (!error) { 1747d21bba2bSMatthew Wilcox (Oracle) mem_cgroup_migrate(old, new); 1748907ea17eSMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); 1749907ea17eSMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(new, NR_SHMEM, 1); 1750907ea17eSMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); 1751907ea17eSMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(old, NR_SHMEM, -1); 17520142ef6cSHugh Dickins } 1753b93b0163SMatthew Wilcox xa_unlock_irq(&swap_mapping->i_pages); 1754bde05d1cSHugh Dickins 17550142ef6cSHugh Dickins if (unlikely(error)) { 17560142ef6cSHugh Dickins /* 17570142ef6cSHugh Dickins * Is this possible? I think not, now that our callers check 17580142ef6cSHugh Dickins * both PageSwapCache and page_private after getting page lock; 17590142ef6cSHugh Dickins * but be defensive. Reverse old to newpage for clear and free. 17600142ef6cSHugh Dickins */ 1761907ea17eSMatthew Wilcox (Oracle) old = new; 17620142ef6cSHugh Dickins } else { 1763907ea17eSMatthew Wilcox (Oracle) folio_add_lru(new); 17640d698e25SMatthew Wilcox (Oracle) *foliop = new; 17650142ef6cSHugh Dickins } 1766bde05d1cSHugh Dickins 1767907ea17eSMatthew Wilcox (Oracle) folio_clear_swapcache(old); 1768907ea17eSMatthew Wilcox (Oracle) old->private = NULL; 1769bde05d1cSHugh Dickins 1770907ea17eSMatthew Wilcox (Oracle) folio_unlock(old); 1771907ea17eSMatthew Wilcox (Oracle) folio_put_refs(old, 2); 17720142ef6cSHugh Dickins return error; 1773bde05d1cSHugh Dickins } 1774bde05d1cSHugh Dickins 17756cec2b95SMiaohe Lin static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, 17766cec2b95SMiaohe Lin struct folio *folio, swp_entry_t swap) 17776cec2b95SMiaohe Lin { 17786cec2b95SMiaohe Lin struct address_space *mapping = inode->i_mapping; 17796cec2b95SMiaohe Lin swp_entry_t swapin_error; 17806cec2b95SMiaohe Lin void *old; 17816cec2b95SMiaohe Lin 178215520a3fSPeter Xu swapin_error = make_swapin_error_entry(); 17836cec2b95SMiaohe Lin old = xa_cmpxchg_irq(&mapping->i_pages, index, 17846cec2b95SMiaohe Lin swp_to_radix_entry(swap), 17856cec2b95SMiaohe Lin swp_to_radix_entry(swapin_error), 0); 17866cec2b95SMiaohe Lin if (old != swp_to_radix_entry(swap)) 17876cec2b95SMiaohe Lin return; 17886cec2b95SMiaohe Lin 17896cec2b95SMiaohe Lin folio_wait_writeback(folio); 179075fa68a5SMatthew Wilcox (Oracle) delete_from_swap_cache(folio); 17916cec2b95SMiaohe Lin /* 1792*3c1b7528SHugh Dickins * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks 1793*3c1b7528SHugh Dickins * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) 1794*3c1b7528SHugh Dickins * in shmem_evict_inode(). 17956cec2b95SMiaohe Lin */ 1796*3c1b7528SHugh Dickins shmem_recalc_inode(inode, -1, -1); 17976cec2b95SMiaohe Lin swap_free(swap); 17986cec2b95SMiaohe Lin } 17996cec2b95SMiaohe Lin 1800bde05d1cSHugh Dickins /* 1801833de10fSMiaohe Lin * Swap in the folio pointed to by *foliop. 1802833de10fSMiaohe Lin * Caller has to make sure that *foliop contains a valid swapped folio. 1803833de10fSMiaohe Lin * Returns 0 and the folio in foliop if success. On failure, returns the 1804833de10fSMiaohe Lin * error code and NULL in *foliop. 
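 * A return of -EEXIST means the swap entry seen by the caller is no
 * longer current (it raced with another swapin or a truncation), so
 * callers such as shmem_get_folio_gfp() retry their lookup.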
18051da177e4SLinus Torvalds */ 1806da08e9b7SMatthew Wilcox (Oracle) static int shmem_swapin_folio(struct inode *inode, pgoff_t index, 1807da08e9b7SMatthew Wilcox (Oracle) struct folio **foliop, enum sgp_type sgp, 1808c5bf121eSVineeth Remanan Pillai gfp_t gfp, struct vm_area_struct *vma, 18092b740303SSouptick Joarder vm_fault_t *fault_type) 18101da177e4SLinus Torvalds { 18111da177e4SLinus Torvalds struct address_space *mapping = inode->i_mapping; 181223f919d4SArnd Bergmann struct shmem_inode_info *info = SHMEM_I(inode); 181304f94e3fSDan Schatzberg struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; 1814cbc2bd98SKairui Song struct swap_info_struct *si; 1815da08e9b7SMatthew Wilcox (Oracle) struct folio *folio = NULL; 18161da177e4SLinus Torvalds swp_entry_t swap; 18171da177e4SLinus Torvalds int error; 18181da177e4SLinus Torvalds 1819da08e9b7SMatthew Wilcox (Oracle) VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); 1820da08e9b7SMatthew Wilcox (Oracle) swap = radix_to_swp_entry(*foliop); 1821da08e9b7SMatthew Wilcox (Oracle) *foliop = NULL; 182254af6042SHugh Dickins 18236cec2b95SMiaohe Lin if (is_swapin_error_entry(swap)) 18246cec2b95SMiaohe Lin return -EIO; 18256cec2b95SMiaohe Lin 1826cbc2bd98SKairui Song si = get_swap_device(swap); 1827cbc2bd98SKairui Song if (!si) { 1828cbc2bd98SKairui Song if (!shmem_confirm_swap(mapping, index, swap)) 1829cbc2bd98SKairui Song return -EEXIST; 1830cbc2bd98SKairui Song else 1831cbc2bd98SKairui Song return -EINVAL; 1832cbc2bd98SKairui Song } 1833cbc2bd98SKairui Song 18341da177e4SLinus Torvalds /* Look it up and read it in.. */ 18355739a81cSMatthew Wilcox (Oracle) folio = swap_cache_get_folio(swap, NULL, 0); 18365739a81cSMatthew Wilcox (Oracle) if (!folio) { 18379e18eb29SAndres Lagar-Cavilla /* Or update major stats only when swapin succeeds?? */ 18389e18eb29SAndres Lagar-Cavilla if (fault_type) { 183968da9f05SHugh Dickins *fault_type |= VM_FAULT_MAJOR; 18409e18eb29SAndres Lagar-Cavilla count_vm_event(PGMAJFAULT); 18412262185cSRoman Gushchin count_memcg_event_mm(charge_mm, PGMAJFAULT); 18429e18eb29SAndres Lagar-Cavilla } 18439e18eb29SAndres Lagar-Cavilla /* Here we actually start the io */ 18445739a81cSMatthew Wilcox (Oracle) folio = shmem_swapin(swap, gfp, info, index); 18455739a81cSMatthew Wilcox (Oracle) if (!folio) { 18461da177e4SLinus Torvalds error = -ENOMEM; 184754af6042SHugh Dickins goto failed; 1848285b2c4fSHugh Dickins } 18491da177e4SLinus Torvalds } 18501da177e4SLinus Torvalds 1851833de10fSMiaohe Lin /* We have to do this with folio locked to prevent races */ 1852da08e9b7SMatthew Wilcox (Oracle) folio_lock(folio); 1853da08e9b7SMatthew Wilcox (Oracle) if (!folio_test_swapcache(folio) || 1854da08e9b7SMatthew Wilcox (Oracle) folio_swap_entry(folio).val != swap.val || 1855d1899228SHugh Dickins !shmem_confirm_swap(mapping, index, swap)) { 1856c5bf121eSVineeth Remanan Pillai error = -EEXIST; 1857d1899228SHugh Dickins goto unlock; 1858bde05d1cSHugh Dickins } 1859da08e9b7SMatthew Wilcox (Oracle) if (!folio_test_uptodate(folio)) { 18601da177e4SLinus Torvalds error = -EIO; 186154af6042SHugh Dickins goto failed; 186254af6042SHugh Dickins } 1863da08e9b7SMatthew Wilcox (Oracle) folio_wait_writeback(folio); 186454af6042SHugh Dickins 18658a84802eSSteven Price /* 18668a84802eSSteven Price * Some architectures may have to restore extra metadata to the 1867da08e9b7SMatthew Wilcox (Oracle) * folio after reading from swap. 
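 * (On arm64, for example, arch_swap_restore() brings back MTE tags.)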
18688a84802eSSteven Price */ 1869da08e9b7SMatthew Wilcox (Oracle) arch_swap_restore(swap, folio); 18708a84802eSSteven Price 1871069d849cSMatthew Wilcox (Oracle) if (shmem_should_replace_folio(folio, gfp)) { 18720d698e25SMatthew Wilcox (Oracle) error = shmem_replace_folio(&folio, gfp, info, index); 1873bde05d1cSHugh Dickins if (error) 187454af6042SHugh Dickins goto failed; 18751da177e4SLinus Torvalds } 18761da177e4SLinus Torvalds 1877b7dd44a1SMatthew Wilcox (Oracle) error = shmem_add_to_page_cache(folio, mapping, index, 18783fea5a49SJohannes Weiner swp_to_radix_entry(swap), gfp, 18793fea5a49SJohannes Weiner charge_mm); 188054af6042SHugh Dickins if (error) 188154af6042SHugh Dickins goto failed; 188254af6042SHugh Dickins 1883*3c1b7528SHugh Dickins shmem_recalc_inode(inode, 0, -1); 188427ab7006SHugh Dickins 188566d2f4d2SHugh Dickins if (sgp == SGP_WRITE) 1886da08e9b7SMatthew Wilcox (Oracle) folio_mark_accessed(folio); 188766d2f4d2SHugh Dickins 188875fa68a5SMatthew Wilcox (Oracle) delete_from_swap_cache(folio); 1889da08e9b7SMatthew Wilcox (Oracle) folio_mark_dirty(folio); 189027ab7006SHugh Dickins swap_free(swap); 1891cbc2bd98SKairui Song put_swap_device(si); 189227ab7006SHugh Dickins 1893da08e9b7SMatthew Wilcox (Oracle) *foliop = folio; 1894c5bf121eSVineeth Remanan Pillai return 0; 1895c5bf121eSVineeth Remanan Pillai failed: 1896c5bf121eSVineeth Remanan Pillai if (!shmem_confirm_swap(mapping, index, swap)) 1897c5bf121eSVineeth Remanan Pillai error = -EEXIST; 18986cec2b95SMiaohe Lin if (error == -EIO) 18996cec2b95SMiaohe Lin shmem_set_folio_swapin_error(inode, index, folio, swap); 1900c5bf121eSVineeth Remanan Pillai unlock: 1901da08e9b7SMatthew Wilcox (Oracle) if (folio) { 1902da08e9b7SMatthew Wilcox (Oracle) folio_unlock(folio); 1903da08e9b7SMatthew Wilcox (Oracle) folio_put(folio); 1904c5bf121eSVineeth Remanan Pillai } 1905cbc2bd98SKairui Song put_swap_device(si); 1906c5bf121eSVineeth Remanan Pillai 1907c5bf121eSVineeth Remanan Pillai return error; 1908c5bf121eSVineeth Remanan Pillai } 1909c5bf121eSVineeth Remanan Pillai 1910c5bf121eSVineeth Remanan Pillai /* 1911fc26babbSMatthew Wilcox (Oracle) * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate 1912c5bf121eSVineeth Remanan Pillai * 1913c5bf121eSVineeth Remanan Pillai * If we allocate a new one we do not mark it dirty. That's up to the 1914c5bf121eSVineeth Remanan Pillai * vm. If we swap it in we mark it dirty since we also free the swap 1915c5bf121eSVineeth Remanan Pillai * entry since a page cannot live in both the swap and page cache. 1916c5bf121eSVineeth Remanan Pillai * 1917c949b097SAxel Rasmussen * vma, vmf, and fault_type are only supplied by shmem_fault: 1918c5bf121eSVineeth Remanan Pillai * otherwise they are NULL. 
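 * Kernel-internal callers normally come in through the shmem_get_folio()
 * wrapper; a minimal sketch (not an actual caller in this file):
 *	struct folio *folio;
 *	int err = shmem_get_folio(inode, index, &folio, SGP_CACHE);
 *	if (!err && folio) {
 *		... use the locked folio ...
 *		folio_unlock(folio);
 *		folio_put(folio);
 *	}
 * which supplies NULL for vma, vmf and fault_type.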
1919c5bf121eSVineeth Remanan Pillai */ 1920fc26babbSMatthew Wilcox (Oracle) static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, 1921fc26babbSMatthew Wilcox (Oracle) struct folio **foliop, enum sgp_type sgp, gfp_t gfp, 1922c5bf121eSVineeth Remanan Pillai struct vm_area_struct *vma, struct vm_fault *vmf, 1923c5bf121eSVineeth Remanan Pillai vm_fault_t *fault_type) 1924c5bf121eSVineeth Remanan Pillai { 1925c5bf121eSVineeth Remanan Pillai struct address_space *mapping = inode->i_mapping; 1926c5bf121eSVineeth Remanan Pillai struct shmem_inode_info *info = SHMEM_I(inode); 1927c5bf121eSVineeth Remanan Pillai struct shmem_sb_info *sbinfo; 1928c5bf121eSVineeth Remanan Pillai struct mm_struct *charge_mm; 1929b7dd44a1SMatthew Wilcox (Oracle) struct folio *folio; 19306fe7d712SLukas Bulwahn pgoff_t hindex; 1931164cc4feSRik van Riel gfp_t huge_gfp; 1932c5bf121eSVineeth Remanan Pillai int error; 1933c5bf121eSVineeth Remanan Pillai int once = 0; 1934c5bf121eSVineeth Remanan Pillai int alloced = 0; 1935c5bf121eSVineeth Remanan Pillai 1936c5bf121eSVineeth Remanan Pillai if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) 1937c5bf121eSVineeth Remanan Pillai return -EFBIG; 1938c5bf121eSVineeth Remanan Pillai repeat: 1939c5bf121eSVineeth Remanan Pillai if (sgp <= SGP_CACHE && 1940c5bf121eSVineeth Remanan Pillai ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 1941c5bf121eSVineeth Remanan Pillai return -EINVAL; 1942c5bf121eSVineeth Remanan Pillai } 1943c5bf121eSVineeth Remanan Pillai 1944c5bf121eSVineeth Remanan Pillai sbinfo = SHMEM_SB(inode->i_sb); 194504f94e3fSDan Schatzberg charge_mm = vma ? vma->vm_mm : NULL; 1946c5bf121eSVineeth Remanan Pillai 1947aaeb94ebSChristoph Hellwig folio = filemap_get_entry(mapping, index); 1948b1d0ec3aSMatthew Wilcox (Oracle) if (folio && vma && userfaultfd_minor(vma)) { 1949aaeb94ebSChristoph Hellwig if (!xa_is_value(folio)) 1950b1d0ec3aSMatthew Wilcox (Oracle) folio_put(folio); 1951c949b097SAxel Rasmussen *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); 1952c949b097SAxel Rasmussen return 0; 1953c949b097SAxel Rasmussen } 1954c949b097SAxel Rasmussen 1955b1d0ec3aSMatthew Wilcox (Oracle) if (xa_is_value(folio)) { 1956da08e9b7SMatthew Wilcox (Oracle) error = shmem_swapin_folio(inode, index, &folio, 1957c5bf121eSVineeth Remanan Pillai sgp, gfp, vma, fault_type); 1958c5bf121eSVineeth Remanan Pillai if (error == -EEXIST) 1959c5bf121eSVineeth Remanan Pillai goto repeat; 1960c5bf121eSVineeth Remanan Pillai 1961fc26babbSMatthew Wilcox (Oracle) *foliop = folio; 1962c5bf121eSVineeth Remanan Pillai return error; 1963c5bf121eSVineeth Remanan Pillai } 1964c5bf121eSVineeth Remanan Pillai 1965b1d0ec3aSMatthew Wilcox (Oracle) if (folio) { 1966aaeb94ebSChristoph Hellwig folio_lock(folio); 1967aaeb94ebSChristoph Hellwig 1968aaeb94ebSChristoph Hellwig /* Has the folio been truncated or swapped out? 
*/ 1969aaeb94ebSChristoph Hellwig if (unlikely(folio->mapping != mapping)) { 1970aaeb94ebSChristoph Hellwig folio_unlock(folio); 1971aaeb94ebSChristoph Hellwig folio_put(folio); 1972aaeb94ebSChristoph Hellwig goto repeat; 1973aaeb94ebSChristoph Hellwig } 1974acdd9f8eSHugh Dickins if (sgp == SGP_WRITE) 1975b1d0ec3aSMatthew Wilcox (Oracle) folio_mark_accessed(folio); 1976b1d0ec3aSMatthew Wilcox (Oracle) if (folio_test_uptodate(folio)) 1977acdd9f8eSHugh Dickins goto out; 1978fc26babbSMatthew Wilcox (Oracle) /* fallocated folio */ 1979c5bf121eSVineeth Remanan Pillai if (sgp != SGP_READ) 1980c5bf121eSVineeth Remanan Pillai goto clear; 1981b1d0ec3aSMatthew Wilcox (Oracle) folio_unlock(folio); 1982b1d0ec3aSMatthew Wilcox (Oracle) folio_put(folio); 1983c5bf121eSVineeth Remanan Pillai } 1984c5bf121eSVineeth Remanan Pillai 1985c5bf121eSVineeth Remanan Pillai /* 1986fc26babbSMatthew Wilcox (Oracle) * SGP_READ: succeed on hole, with NULL folio, letting caller zero. 1987fc26babbSMatthew Wilcox (Oracle) * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. 1988acdd9f8eSHugh Dickins */ 1989fc26babbSMatthew Wilcox (Oracle) *foliop = NULL; 1990acdd9f8eSHugh Dickins if (sgp == SGP_READ) 1991acdd9f8eSHugh Dickins return 0; 1992acdd9f8eSHugh Dickins if (sgp == SGP_NOALLOC) 1993acdd9f8eSHugh Dickins return -ENOENT; 1994acdd9f8eSHugh Dickins 1995acdd9f8eSHugh Dickins /* 1996acdd9f8eSHugh Dickins * Fast cache lookup and swap lookup did not find it: allocate. 1997c5bf121eSVineeth Remanan Pillai */ 1998c5bf121eSVineeth Remanan Pillai 1999cfda0526SMike Rapoport if (vma && userfaultfd_missing(vma)) { 2000cfda0526SMike Rapoport *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); 2001cfda0526SMike Rapoport return 0; 2002cfda0526SMike Rapoport } 2003cfda0526SMike Rapoport 20042cf13384SDavid Stevens if (!shmem_is_huge(inode, index, false, 20052cf13384SDavid Stevens vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) 2006800d8c63SKirill A. Shutemov goto alloc_nohuge; 200727d80fa2SKees Cook 2008164cc4feSRik van Riel huge_gfp = vma_thp_gfp_mask(vma); 200978cc8cdcSRik van Riel huge_gfp = limit_gfp_mask(huge_gfp, gfp); 2010b1d0ec3aSMatthew Wilcox (Oracle) folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); 2011b1d0ec3aSMatthew Wilcox (Oracle) if (IS_ERR(folio)) { 2012c5bf121eSVineeth Remanan Pillai alloc_nohuge: 2013b1d0ec3aSMatthew Wilcox (Oracle) folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); 201454af6042SHugh Dickins } 2015b1d0ec3aSMatthew Wilcox (Oracle) if (IS_ERR(folio)) { 2016779750d2SKirill A. Shutemov int retry = 5; 2017c5bf121eSVineeth Remanan Pillai 2018b1d0ec3aSMatthew Wilcox (Oracle) error = PTR_ERR(folio); 2019b1d0ec3aSMatthew Wilcox (Oracle) folio = NULL; 2020779750d2SKirill A. Shutemov if (error != -ENOSPC) 2021c5bf121eSVineeth Remanan Pillai goto unlock; 2022779750d2SKirill A. Shutemov /* 2023fc26babbSMatthew Wilcox (Oracle) * Try to reclaim some space by splitting a large folio 2024779750d2SKirill A. Shutemov * beyond i_size on the filesystem. 2025779750d2SKirill A. Shutemov */ 2026779750d2SKirill A. Shutemov while (retry--) { 2027779750d2SKirill A. Shutemov int ret; 2028c5bf121eSVineeth Remanan Pillai 2029779750d2SKirill A. Shutemov ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); 2030779750d2SKirill A. Shutemov if (ret == SHRINK_STOP) 2031779750d2SKirill A. Shutemov break; 2032779750d2SKirill A. Shutemov if (ret) 2033779750d2SKirill A. Shutemov goto alloc_nohuge; 2034779750d2SKirill A. 
Shutemov } 2035c5bf121eSVineeth Remanan Pillai goto unlock; 2036800d8c63SKirill A. Shutemov } 2037800d8c63SKirill A. Shutemov 2038b1d0ec3aSMatthew Wilcox (Oracle) hindex = round_down(index, folio_nr_pages(folio)); 2039800d8c63SKirill A. Shutemov 204066d2f4d2SHugh Dickins if (sgp == SGP_WRITE) 2041b1d0ec3aSMatthew Wilcox (Oracle) __folio_set_referenced(folio); 204266d2f4d2SHugh Dickins 2043b7dd44a1SMatthew Wilcox (Oracle) error = shmem_add_to_page_cache(folio, mapping, hindex, 20443fea5a49SJohannes Weiner NULL, gfp & GFP_RECLAIM_MASK, 20453fea5a49SJohannes Weiner charge_mm); 20463fea5a49SJohannes Weiner if (error) 2047800d8c63SKirill A. Shutemov goto unacct; 204854af6042SHugh Dickins 2049*3c1b7528SHugh Dickins folio_add_lru(folio); 2050*3c1b7528SHugh Dickins shmem_recalc_inode(inode, folio_nr_pages(folio), 0); 20511635f6a7SHugh Dickins alloced = true; 205254af6042SHugh Dickins 2053b1d0ec3aSMatthew Wilcox (Oracle) if (folio_test_pmd_mappable(folio) && 2054779750d2SKirill A. Shutemov DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 2055fc26babbSMatthew Wilcox (Oracle) folio_next_index(folio) - 1) { 2056779750d2SKirill A. Shutemov /* 2057fc26babbSMatthew Wilcox (Oracle) * Part of the large folio is beyond i_size: subject 2058779750d2SKirill A. Shutemov * to shrink under memory pressure. 2059779750d2SKirill A. Shutemov */ 2060779750d2SKirill A. Shutemov spin_lock(&sbinfo->shrinklist_lock); 2061d041353dSCong Wang /* 2062d041353dSCong Wang * _careful to defend against unlocked access to 2063d041353dSCong Wang * ->shrink_list in shmem_unused_huge_shrink() 2064d041353dSCong Wang */ 2065d041353dSCong Wang if (list_empty_careful(&info->shrinklist)) { 2066779750d2SKirill A. Shutemov list_add_tail(&info->shrinklist, 2067779750d2SKirill A. Shutemov &sbinfo->shrinklist); 2068779750d2SKirill A. Shutemov sbinfo->shrinklist_len++; 2069779750d2SKirill A. Shutemov } 2070779750d2SKirill A. Shutemov spin_unlock(&sbinfo->shrinklist_lock); 2071779750d2SKirill A. Shutemov } 2072779750d2SKirill A. Shutemov 2073ec9516fbSHugh Dickins /* 2074fc26babbSMatthew Wilcox (Oracle) * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. 20751635f6a7SHugh Dickins */ 20761635f6a7SHugh Dickins if (sgp == SGP_FALLOC) 20771635f6a7SHugh Dickins sgp = SGP_WRITE; 20781635f6a7SHugh Dickins clear: 20791635f6a7SHugh Dickins /* 2080fc26babbSMatthew Wilcox (Oracle) * Let SGP_WRITE caller clear ends if write does not fill folio; 2081fc26babbSMatthew Wilcox (Oracle) * but SGP_FALLOC on a folio fallocated earlier must initialize 20821635f6a7SHugh Dickins * it now, lest undo on failure cancel our earlier guarantee. 2083ec9516fbSHugh Dickins */ 2084b1d0ec3aSMatthew Wilcox (Oracle) if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { 2085b1d0ec3aSMatthew Wilcox (Oracle) long i, n = folio_nr_pages(folio); 2086800d8c63SKirill A. Shutemov 2087b1d0ec3aSMatthew Wilcox (Oracle) for (i = 0; i < n; i++) 2088b1d0ec3aSMatthew Wilcox (Oracle) clear_highpage(folio_page(folio, i)); 2089b1d0ec3aSMatthew Wilcox (Oracle) flush_dcache_folio(folio); 2090b1d0ec3aSMatthew Wilcox (Oracle) folio_mark_uptodate(folio); 2091ec9516fbSHugh Dickins } 2092bde05d1cSHugh Dickins 209354af6042SHugh Dickins /* Perhaps the file has been truncated since we checked */ 209475edd345SHugh Dickins if (sgp <= SGP_CACHE && 209509cbfeafSKirill A. 
Shutemov ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 2096267a4c76SHugh Dickins if (alloced) { 2097b1d0ec3aSMatthew Wilcox (Oracle) folio_clear_dirty(folio); 2098b1d0ec3aSMatthew Wilcox (Oracle) filemap_remove_folio(folio); 2099*3c1b7528SHugh Dickins shmem_recalc_inode(inode, 0, 0); 2100267a4c76SHugh Dickins } 210154af6042SHugh Dickins error = -EINVAL; 2102267a4c76SHugh Dickins goto unlock; 2103ff36b801SShaohua Li } 210463ec1973SMatthew Wilcox (Oracle) out: 2105fc26babbSMatthew Wilcox (Oracle) *foliop = folio; 210654af6042SHugh Dickins return 0; 2107d00806b1SNick Piggin 2108d0217ac0SNick Piggin /* 210954af6042SHugh Dickins * Error recovery. 21101da177e4SLinus Torvalds */ 211154af6042SHugh Dickins unacct: 2112b1d0ec3aSMatthew Wilcox (Oracle) shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); 2113800d8c63SKirill A. Shutemov 2114b1d0ec3aSMatthew Wilcox (Oracle) if (folio_test_large(folio)) { 2115b1d0ec3aSMatthew Wilcox (Oracle) folio_unlock(folio); 2116b1d0ec3aSMatthew Wilcox (Oracle) folio_put(folio); 2117800d8c63SKirill A. Shutemov goto alloc_nohuge; 2118800d8c63SKirill A. Shutemov } 2119d1899228SHugh Dickins unlock: 2120b1d0ec3aSMatthew Wilcox (Oracle) if (folio) { 2121b1d0ec3aSMatthew Wilcox (Oracle) folio_unlock(folio); 2122b1d0ec3aSMatthew Wilcox (Oracle) folio_put(folio); 212354af6042SHugh Dickins } 212454af6042SHugh Dickins if (error == -ENOSPC && !once++) { 2125*3c1b7528SHugh Dickins shmem_recalc_inode(inode, 0, 0); 21261da177e4SLinus Torvalds goto repeat; 2127d8dc74f2SAdrian Bunk } 21287f4446eeSMatthew Wilcox if (error == -EEXIST) 212954af6042SHugh Dickins goto repeat; 213054af6042SHugh Dickins return error; 21311da177e4SLinus Torvalds } 21321da177e4SLinus Torvalds 21334e1fc793SMatthew Wilcox (Oracle) int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, 21344e1fc793SMatthew Wilcox (Oracle) enum sgp_type sgp) 21354e1fc793SMatthew Wilcox (Oracle) { 21364e1fc793SMatthew Wilcox (Oracle) return shmem_get_folio_gfp(inode, index, foliop, sgp, 21374e1fc793SMatthew Wilcox (Oracle) mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); 21384e1fc793SMatthew Wilcox (Oracle) } 21394e1fc793SMatthew Wilcox (Oracle) 214010d20bd2SLinus Torvalds /* 214110d20bd2SLinus Torvalds * This is like autoremove_wake_function, but it removes the wait queue 214210d20bd2SLinus Torvalds * entry unconditionally - even if something else had already woken the 214310d20bd2SLinus Torvalds * target. 
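 * That matters for the hole-punch wait in shmem_fault() below: the wait
 * queue head lives on the stack of the task in shmem_fallocate(), so the
 * entry must never be left linked once the waiter has been woken.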
214410d20bd2SLinus Torvalds */ 2145ac6424b9SIngo Molnar static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 214610d20bd2SLinus Torvalds { 214710d20bd2SLinus Torvalds int ret = default_wake_function(wait, mode, sync, key); 21482055da97SIngo Molnar list_del_init(&wait->entry); 214910d20bd2SLinus Torvalds return ret; 215010d20bd2SLinus Torvalds } 215110d20bd2SLinus Torvalds 215220acce67SSouptick Joarder static vm_fault_t shmem_fault(struct vm_fault *vmf) 21531da177e4SLinus Torvalds { 215411bac800SDave Jiang struct vm_area_struct *vma = vmf->vma; 2155496ad9aaSAl Viro struct inode *inode = file_inode(vma->vm_file); 21569e18eb29SAndres Lagar-Cavilla gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 215768a54100SMatthew Wilcox (Oracle) struct folio *folio = NULL; 215820acce67SSouptick Joarder int err; 215920acce67SSouptick Joarder vm_fault_t ret = VM_FAULT_LOCKED; 21601da177e4SLinus Torvalds 2161f00cdc6dSHugh Dickins /* 2162f00cdc6dSHugh Dickins * Trinity finds that probing a hole which tmpfs is punching can 2163f00cdc6dSHugh Dickins * prevent the hole-punch from ever completing: which in turn 21649608703eSJan Kara * locks writers out with its hold on i_rwsem. So refrain from 21658e205f77SHugh Dickins * faulting pages into the hole while it's being punched. Although 21668e205f77SHugh Dickins * shmem_undo_range() does remove the additions, it may be unable to 21678e205f77SHugh Dickins * keep up, as each new page needs its own unmap_mapping_range() call, 21688e205f77SHugh Dickins * and the i_mmap tree grows ever slower to scan if new vmas are added. 21698e205f77SHugh Dickins * 21708e205f77SHugh Dickins * It does not matter if we sometimes reach this check just before the 21718e205f77SHugh Dickins * hole-punch begins, so that one fault then races with the punch: 21728e205f77SHugh Dickins * we just need to make racing faults a rare case. 21738e205f77SHugh Dickins * 21748e205f77SHugh Dickins * The implementation below would be much simpler if we just used a 21759608703eSJan Kara * standard mutex or completion: but we cannot take i_rwsem in fault, 21768e205f77SHugh Dickins * and bloating every shmem inode for this unlikely case would be sad. 2177f00cdc6dSHugh Dickins */ 2178f00cdc6dSHugh Dickins if (unlikely(inode->i_private)) { 2179f00cdc6dSHugh Dickins struct shmem_falloc *shmem_falloc; 2180f00cdc6dSHugh Dickins 2181f00cdc6dSHugh Dickins spin_lock(&inode->i_lock); 2182f00cdc6dSHugh Dickins shmem_falloc = inode->i_private; 21838e205f77SHugh Dickins if (shmem_falloc && 21848e205f77SHugh Dickins shmem_falloc->waitq && 21858e205f77SHugh Dickins vmf->pgoff >= shmem_falloc->start && 21868e205f77SHugh Dickins vmf->pgoff < shmem_falloc->next) { 21878897c1b1SKirill A. Shutemov struct file *fpin; 21888e205f77SHugh Dickins wait_queue_head_t *shmem_falloc_waitq; 218910d20bd2SLinus Torvalds DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); 21908e205f77SHugh Dickins 21918e205f77SHugh Dickins ret = VM_FAULT_NOPAGE; 21928897c1b1SKirill A. Shutemov fpin = maybe_unlock_mmap_for_io(vmf, NULL); 21938897c1b1SKirill A. 
Shutemov if (fpin) 21948e205f77SHugh Dickins ret = VM_FAULT_RETRY; 21958e205f77SHugh Dickins 21968e205f77SHugh Dickins shmem_falloc_waitq = shmem_falloc->waitq; 21978e205f77SHugh Dickins prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 21988e205f77SHugh Dickins TASK_UNINTERRUPTIBLE); 21998e205f77SHugh Dickins spin_unlock(&inode->i_lock); 22008e205f77SHugh Dickins schedule(); 22018e205f77SHugh Dickins 22028e205f77SHugh Dickins /* 22038e205f77SHugh Dickins * shmem_falloc_waitq points into the shmem_fallocate() 22048e205f77SHugh Dickins * stack of the hole-punching task: shmem_falloc_waitq 22058e205f77SHugh Dickins * is usually invalid by the time we reach here, but 22068e205f77SHugh Dickins * finish_wait() does not dereference it in that case; 22078e205f77SHugh Dickins * though i_lock needed lest racing with wake_up_all(). 22088e205f77SHugh Dickins */ 22098e205f77SHugh Dickins spin_lock(&inode->i_lock); 22108e205f77SHugh Dickins finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 22118e205f77SHugh Dickins spin_unlock(&inode->i_lock); 22128897c1b1SKirill A. Shutemov 22138897c1b1SKirill A. Shutemov if (fpin) 22148897c1b1SKirill A. Shutemov fput(fpin); 22158e205f77SHugh Dickins return ret; 2216f00cdc6dSHugh Dickins } 22178e205f77SHugh Dickins spin_unlock(&inode->i_lock); 2218f00cdc6dSHugh Dickins } 2219f00cdc6dSHugh Dickins 222068a54100SMatthew Wilcox (Oracle) err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, 2221cfda0526SMike Rapoport gfp, vma, vmf, &ret); 222220acce67SSouptick Joarder if (err) 222320acce67SSouptick Joarder return vmf_error(err); 222468a54100SMatthew Wilcox (Oracle) if (folio) 222568a54100SMatthew Wilcox (Oracle) vmf->page = folio_file_page(folio, vmf->pgoff); 222668da9f05SHugh Dickins return ret; 22271da177e4SLinus Torvalds } 22281da177e4SLinus Torvalds 2229c01d5b30SHugh Dickins unsigned long shmem_get_unmapped_area(struct file *file, 2230c01d5b30SHugh Dickins unsigned long uaddr, unsigned long len, 2231c01d5b30SHugh Dickins unsigned long pgoff, unsigned long flags) 2232c01d5b30SHugh Dickins { 2233c01d5b30SHugh Dickins unsigned long (*get_area)(struct file *, 2234c01d5b30SHugh Dickins unsigned long, unsigned long, unsigned long, unsigned long); 2235c01d5b30SHugh Dickins unsigned long addr; 2236c01d5b30SHugh Dickins unsigned long offset; 2237c01d5b30SHugh Dickins unsigned long inflated_len; 2238c01d5b30SHugh Dickins unsigned long inflated_addr; 2239c01d5b30SHugh Dickins unsigned long inflated_offset; 2240c01d5b30SHugh Dickins 2241c01d5b30SHugh Dickins if (len > TASK_SIZE) 2242c01d5b30SHugh Dickins return -ENOMEM; 2243c01d5b30SHugh Dickins 2244c01d5b30SHugh Dickins get_area = current->mm->get_unmapped_area; 2245c01d5b30SHugh Dickins addr = get_area(file, uaddr, len, pgoff, flags); 2246c01d5b30SHugh Dickins 2247396bcc52SMatthew Wilcox (Oracle) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 2248c01d5b30SHugh Dickins return addr; 2249c01d5b30SHugh Dickins if (IS_ERR_VALUE(addr)) 2250c01d5b30SHugh Dickins return addr; 2251c01d5b30SHugh Dickins if (addr & ~PAGE_MASK) 2252c01d5b30SHugh Dickins return addr; 2253c01d5b30SHugh Dickins if (addr > TASK_SIZE - len) 2254c01d5b30SHugh Dickins return addr; 2255c01d5b30SHugh Dickins 2256c01d5b30SHugh Dickins if (shmem_huge == SHMEM_HUGE_DENY) 2257c01d5b30SHugh Dickins return addr; 2258c01d5b30SHugh Dickins if (len < HPAGE_PMD_SIZE) 2259c01d5b30SHugh Dickins return addr; 2260c01d5b30SHugh Dickins if (flags & MAP_FIXED) 2261c01d5b30SHugh Dickins return addr; 2262c01d5b30SHugh Dickins /* 2263c01d5b30SHugh Dickins * Our priority 
is to support MAP_SHARED mapped hugely; 2264c01d5b30SHugh Dickins * and support MAP_PRIVATE mapped hugely too, until it is COWed. 226599158997SKirill A. Shutemov * But if caller specified an address hint and we allocated area there 226699158997SKirill A. Shutemov * successfully, respect that as before. 2267c01d5b30SHugh Dickins */ 226899158997SKirill A. Shutemov if (uaddr == addr) 2269c01d5b30SHugh Dickins return addr; 2270c01d5b30SHugh Dickins 2271c01d5b30SHugh Dickins if (shmem_huge != SHMEM_HUGE_FORCE) { 2272c01d5b30SHugh Dickins struct super_block *sb; 2273c01d5b30SHugh Dickins 2274c01d5b30SHugh Dickins if (file) { 2275c01d5b30SHugh Dickins VM_BUG_ON(file->f_op != &shmem_file_operations); 2276c01d5b30SHugh Dickins sb = file_inode(file)->i_sb; 2277c01d5b30SHugh Dickins } else { 2278c01d5b30SHugh Dickins /* 2279c01d5b30SHugh Dickins * Called directly from mm/mmap.c, or drivers/char/mem.c 2280c01d5b30SHugh Dickins * for "/dev/zero", to create a shared anonymous object. 2281c01d5b30SHugh Dickins */ 2282c01d5b30SHugh Dickins if (IS_ERR(shm_mnt)) 2283c01d5b30SHugh Dickins return addr; 2284c01d5b30SHugh Dickins sb = shm_mnt->mnt_sb; 2285c01d5b30SHugh Dickins } 22863089bf61SToshi Kani if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) 2287c01d5b30SHugh Dickins return addr; 2288c01d5b30SHugh Dickins } 2289c01d5b30SHugh Dickins 2290c01d5b30SHugh Dickins offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); 2291c01d5b30SHugh Dickins if (offset && offset + len < 2 * HPAGE_PMD_SIZE) 2292c01d5b30SHugh Dickins return addr; 2293c01d5b30SHugh Dickins if ((addr & (HPAGE_PMD_SIZE-1)) == offset) 2294c01d5b30SHugh Dickins return addr; 2295c01d5b30SHugh Dickins 2296c01d5b30SHugh Dickins inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; 2297c01d5b30SHugh Dickins if (inflated_len > TASK_SIZE) 2298c01d5b30SHugh Dickins return addr; 2299c01d5b30SHugh Dickins if (inflated_len < len) 2300c01d5b30SHugh Dickins return addr; 2301c01d5b30SHugh Dickins 230299158997SKirill A. 
Shutemov inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); 2303c01d5b30SHugh Dickins if (IS_ERR_VALUE(inflated_addr)) 2304c01d5b30SHugh Dickins return addr; 2305c01d5b30SHugh Dickins if (inflated_addr & ~PAGE_MASK) 2306c01d5b30SHugh Dickins return addr; 2307c01d5b30SHugh Dickins 2308c01d5b30SHugh Dickins inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); 2309c01d5b30SHugh Dickins inflated_addr += offset - inflated_offset; 2310c01d5b30SHugh Dickins if (inflated_offset > offset) 2311c01d5b30SHugh Dickins inflated_addr += HPAGE_PMD_SIZE; 2312c01d5b30SHugh Dickins 2313c01d5b30SHugh Dickins if (inflated_addr > TASK_SIZE - len) 2314c01d5b30SHugh Dickins return addr; 2315c01d5b30SHugh Dickins return inflated_addr; 2316c01d5b30SHugh Dickins } 2317c01d5b30SHugh Dickins 23181da177e4SLinus Torvalds #ifdef CONFIG_NUMA 231941ffe5d5SHugh Dickins static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 23201da177e4SLinus Torvalds { 2321496ad9aaSAl Viro struct inode *inode = file_inode(vma->vm_file); 232241ffe5d5SHugh Dickins return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 23231da177e4SLinus Torvalds } 23241da177e4SLinus Torvalds 2325d8dc74f2SAdrian Bunk static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 2326d8dc74f2SAdrian Bunk unsigned long addr) 23271da177e4SLinus Torvalds { 2328496ad9aaSAl Viro struct inode *inode = file_inode(vma->vm_file); 232941ffe5d5SHugh Dickins pgoff_t index; 23301da177e4SLinus Torvalds 233141ffe5d5SHugh Dickins index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 233241ffe5d5SHugh Dickins return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 23331da177e4SLinus Torvalds } 23341da177e4SLinus Torvalds #endif 23351da177e4SLinus Torvalds 2336d7c9e99aSAlexey Gladkov int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 23371da177e4SLinus Torvalds { 2338496ad9aaSAl Viro struct inode *inode = file_inode(file); 23391da177e4SLinus Torvalds struct shmem_inode_info *info = SHMEM_I(inode); 23401da177e4SLinus Torvalds int retval = -ENOMEM; 23411da177e4SLinus Torvalds 2342ea0dfeb4SHugh Dickins /* 2343ea0dfeb4SHugh Dickins * What serializes the accesses to info->flags? 2344ea0dfeb4SHugh Dickins * ipc_lock_object() when called from shmctl_do_lock(), 2345ea0dfeb4SHugh Dickins * no serialization needed when called from shm_destroy(). 
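 * For example, shmctl(id, SHM_LOCK, NULL) reaches here with lock=1 and the
 * ipc object lock held, marking the mapping unevictable; SHM_UNLOCK undoes it.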
2346ea0dfeb4SHugh Dickins */ 23471da177e4SLinus Torvalds if (lock && !(info->flags & VM_LOCKED)) { 2348d7c9e99aSAlexey Gladkov if (!user_shm_lock(inode->i_size, ucounts)) 23491da177e4SLinus Torvalds goto out_nomem; 23501da177e4SLinus Torvalds info->flags |= VM_LOCKED; 235189e004eaSLee Schermerhorn mapping_set_unevictable(file->f_mapping); 23521da177e4SLinus Torvalds } 2353d7c9e99aSAlexey Gladkov if (!lock && (info->flags & VM_LOCKED) && ucounts) { 2354d7c9e99aSAlexey Gladkov user_shm_unlock(inode->i_size, ucounts); 23551da177e4SLinus Torvalds info->flags &= ~VM_LOCKED; 235689e004eaSLee Schermerhorn mapping_clear_unevictable(file->f_mapping); 23571da177e4SLinus Torvalds } 23581da177e4SLinus Torvalds retval = 0; 235989e004eaSLee Schermerhorn 23601da177e4SLinus Torvalds out_nomem: 23611da177e4SLinus Torvalds return retval; 23621da177e4SLinus Torvalds } 23631da177e4SLinus Torvalds 23649b83a6a8SAdrian Bunk static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 23651da177e4SLinus Torvalds { 2366d09e8ca6SPasha Tatashin struct inode *inode = file_inode(file); 2367d09e8ca6SPasha Tatashin struct shmem_inode_info *info = SHMEM_I(inode); 236822247efdSPeter Xu int ret; 2369ab3948f5SJoel Fernandes (Google) 237022247efdSPeter Xu ret = seal_check_future_write(info->seals, vma); 237122247efdSPeter Xu if (ret) 237222247efdSPeter Xu return ret; 2373ab3948f5SJoel Fernandes (Google) 237451b0bff2SCatalin Marinas /* arm64 - allow memory tagging on RAM-based files */ 23751c71222eSSuren Baghdasaryan vm_flags_set(vma, VM_MTE_ALLOWED); 237651b0bff2SCatalin Marinas 23771da177e4SLinus Torvalds file_accessed(file); 2378d09e8ca6SPasha Tatashin /* This is anonymous shared memory if it is unlinked at the time of mmap */ 2379d09e8ca6SPasha Tatashin if (inode->i_nlink) 23801da177e4SLinus Torvalds vma->vm_ops = &shmem_vm_ops; 2381d09e8ca6SPasha Tatashin else 2382d09e8ca6SPasha Tatashin vma->vm_ops = &shmem_anon_vm_ops; 23831da177e4SLinus Torvalds return 0; 23841da177e4SLinus Torvalds } 23851da177e4SLinus Torvalds 2386cb241339SHugh Dickins #ifdef CONFIG_TMPFS_XATTR 2387cb241339SHugh Dickins static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 2388cb241339SHugh Dickins 2389cb241339SHugh Dickins /* 2390cb241339SHugh Dickins * chattr's fsflags are unrelated to extended attributes, 2391cb241339SHugh Dickins * but tmpfs has chosen to enable them under the same config option. 2392cb241339SHugh Dickins */ 2393cb241339SHugh Dickins static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2394e408e695STheodore Ts'o { 2395cb241339SHugh Dickins unsigned int i_flags = 0; 2396cb241339SHugh Dickins 2397cb241339SHugh Dickins if (fsflags & FS_NOATIME_FL) 2398cb241339SHugh Dickins i_flags |= S_NOATIME; 2399cb241339SHugh Dickins if (fsflags & FS_APPEND_FL) 2400cb241339SHugh Dickins i_flags |= S_APPEND; 2401cb241339SHugh Dickins if (fsflags & FS_IMMUTABLE_FL) 2402cb241339SHugh Dickins i_flags |= S_IMMUTABLE; 2403cb241339SHugh Dickins /* 2404cb241339SHugh Dickins * But FS_NODUMP_FL does not require any action in i_flags. 
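 * For example, chattr +i issues FS_IOC_SETFLAGS with FS_IMMUTABLE_FL, which
 * arrives here and is reflected as S_IMMUTABLE on the in-core inode.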
2405cb241339SHugh Dickins */ 2406cb241339SHugh Dickins inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); 2407e408e695STheodore Ts'o } 2408cb241339SHugh Dickins #else 2409cb241339SHugh Dickins static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2410cb241339SHugh Dickins { 2411cb241339SHugh Dickins } 2412cb241339SHugh Dickins #define shmem_initxattrs NULL 2413cb241339SHugh Dickins #endif 2414e408e695STheodore Ts'o 2415a2e45955SChuck Lever static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) 2416a2e45955SChuck Lever { 2417a2e45955SChuck Lever return &SHMEM_I(inode)->dir_offsets; 2418a2e45955SChuck Lever } 2419a2e45955SChuck Lever 2420e09764cfSCarlos Maiolino static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, 2421e09764cfSCarlos Maiolino struct super_block *sb, 2422e09764cfSCarlos Maiolino struct inode *dir, umode_t mode, 2423e09764cfSCarlos Maiolino dev_t dev, unsigned long flags) 24241da177e4SLinus Torvalds { 24251da177e4SLinus Torvalds struct inode *inode; 24261da177e4SLinus Torvalds struct shmem_inode_info *info; 24271da177e4SLinus Torvalds struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2428e809d5f0SChris Down ino_t ino; 242971480663SCarlos Maiolino int err; 24301da177e4SLinus Torvalds 243171480663SCarlos Maiolino err = shmem_reserve_inode(sb, &ino); 243271480663SCarlos Maiolino if (err) 243371480663SCarlos Maiolino return ERR_PTR(err); 243471480663SCarlos Maiolino 24351da177e4SLinus Torvalds 24361da177e4SLinus Torvalds inode = new_inode(sb); 243771480663SCarlos Maiolino 243871480663SCarlos Maiolino if (!inode) { 243971480663SCarlos Maiolino shmem_free_inode(sb); 244071480663SCarlos Maiolino return ERR_PTR(-ENOSPC); 244171480663SCarlos Maiolino } 244271480663SCarlos Maiolino 2443e809d5f0SChris Down inode->i_ino = ino; 24447a80e5b8SGiuseppe Scrivano inode_init_owner(idmap, inode, dir, mode); 24451da177e4SLinus Torvalds inode->i_blocks = 0; 2446078cd827SDeepa Dinamani inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); 2447a251c17aSJason A. Donenfeld inode->i_generation = get_random_u32(); 24481da177e4SLinus Torvalds info = SHMEM_I(inode); 24491da177e4SLinus Torvalds memset(info, 0, (char *)inode - (char *)info); 24501da177e4SLinus Torvalds spin_lock_init(&info->lock); 2451af53d3e9SHugh Dickins atomic_set(&info->stop_eviction, 0); 245240e041a2SDavid Herrmann info->seals = F_SEAL_SEAL; 24530b0a0806SHugh Dickins info->flags = flags & VM_NORESERVE; 2454f7cd16a5SXavier Roche info->i_crtime = inode->i_mtime; 2455e408e695STheodore Ts'o info->fsflags = (dir == NULL) ? 0 : 2456e408e695STheodore Ts'o SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; 2457cb241339SHugh Dickins if (info->fsflags) 2458cb241339SHugh Dickins shmem_set_inode_flags(inode, info->fsflags); 2459779750d2SKirill A. 
Shutemov INIT_LIST_HEAD(&info->shrinklist); 24601da177e4SLinus Torvalds INIT_LIST_HEAD(&info->swaplist); 24622c6efe9cSLuis Chamberlain if (sbinfo->noswap) 24632c6efe9cSLuis Chamberlain mapping_set_unevictable(inode->i_mapping); 246438f38657SAristeu Rozanski simple_xattrs_init(&info->xattrs); 246572c04902SAl Viro cache_no_acl(inode); 2466ff36da69SMatthew Wilcox (Oracle) mapping_set_large_folios(inode->i_mapping); 24671da177e4SLinus Torvalds 24681da177e4SLinus Torvalds switch (mode & S_IFMT) { 24691da177e4SLinus Torvalds default: 247039f0247dSAndreas Gruenbacher inode->i_op = &shmem_special_inode_operations; 24711da177e4SLinus Torvalds init_special_inode(inode, mode, dev); 24721da177e4SLinus Torvalds break; 24731da177e4SLinus Torvalds case S_IFREG: 247414fcc23fSHugh Dickins inode->i_mapping->a_ops = &shmem_aops; 24751da177e4SLinus Torvalds inode->i_op = &shmem_inode_operations; 24761da177e4SLinus Torvalds inode->i_fop = &shmem_file_operations; 247771fe804bSLee Schermerhorn mpol_shared_policy_init(&info->policy, 247871fe804bSLee Schermerhorn shmem_get_sbmpol(sbinfo)); 24791da177e4SLinus Torvalds break; 24801da177e4SLinus Torvalds case S_IFDIR: 2481d8c76e6fSDave Hansen inc_nlink(inode); 24821da177e4SLinus Torvalds /* Some things misbehave if size == 0 on a directory */ 24831da177e4SLinus Torvalds inode->i_size = 2 * BOGO_DIRENT_SIZE; 24841da177e4SLinus Torvalds inode->i_op = &shmem_dir_inode_operations; 2485a2e45955SChuck Lever inode->i_fop = &simple_offset_dir_operations; 2486a2e45955SChuck Lever simple_offset_init(shmem_get_offset_ctx(inode)); 24871da177e4SLinus Torvalds break; 24881da177e4SLinus Torvalds case S_IFLNK: 24891da177e4SLinus Torvalds /* 24901da177e4SLinus Torvalds * Must not load anything in the rbtree, 24911da177e4SLinus Torvalds * mpol_free_shared_policy will not be called.
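 * (the NULL mpol below just initialises an empty shared policy tree)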
24921da177e4SLinus Torvalds */ 249371fe804bSLee Schermerhorn mpol_shared_policy_init(&info->policy, NULL); 24941da177e4SLinus Torvalds break; 24951da177e4SLinus Torvalds } 2496b45d71fbSJoel Fernandes (Google) 2497b45d71fbSJoel Fernandes (Google) lockdep_annotate_inode_mutex_key(inode); 24981da177e4SLinus Torvalds return inode; 24991da177e4SLinus Torvalds } 25001da177e4SLinus Torvalds 2501e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 2502e09764cfSCarlos Maiolino static struct inode *shmem_get_inode(struct mnt_idmap *idmap, 2503e09764cfSCarlos Maiolino struct super_block *sb, struct inode *dir, 2504e09764cfSCarlos Maiolino umode_t mode, dev_t dev, unsigned long flags) 2505e09764cfSCarlos Maiolino { 2506e09764cfSCarlos Maiolino int err; 2507e09764cfSCarlos Maiolino struct inode *inode; 2508e09764cfSCarlos Maiolino 2509e09764cfSCarlos Maiolino inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); 2510e09764cfSCarlos Maiolino if (IS_ERR(inode)) 2511e09764cfSCarlos Maiolino return inode; 2512e09764cfSCarlos Maiolino 2513e09764cfSCarlos Maiolino err = dquot_initialize(inode); 2514e09764cfSCarlos Maiolino if (err) 2515e09764cfSCarlos Maiolino goto errout; 2516e09764cfSCarlos Maiolino 2517e09764cfSCarlos Maiolino err = dquot_alloc_inode(inode); 2518e09764cfSCarlos Maiolino if (err) { 2519e09764cfSCarlos Maiolino dquot_drop(inode); 2520e09764cfSCarlos Maiolino goto errout; 2521e09764cfSCarlos Maiolino } 2522e09764cfSCarlos Maiolino return inode; 2523e09764cfSCarlos Maiolino 2524e09764cfSCarlos Maiolino errout: 2525e09764cfSCarlos Maiolino inode->i_flags |= S_NOQUOTA; 2526e09764cfSCarlos Maiolino iput(inode); 2527e09764cfSCarlos Maiolino return ERR_PTR(err); 2528e09764cfSCarlos Maiolino } 2529e09764cfSCarlos Maiolino #else 2530e09764cfSCarlos Maiolino static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, 2531e09764cfSCarlos Maiolino struct super_block *sb, struct inode *dir, 2532e09764cfSCarlos Maiolino umode_t mode, dev_t dev, unsigned long flags) 2533e09764cfSCarlos Maiolino { 2534e09764cfSCarlos Maiolino return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); 2535e09764cfSCarlos Maiolino } 2536e09764cfSCarlos Maiolino #endif /* CONFIG_TMPFS_QUOTA */ 2537e09764cfSCarlos Maiolino 25383460f6e5SAxel Rasmussen #ifdef CONFIG_USERFAULTFD 253961c50040SAxel Rasmussen int shmem_mfill_atomic_pte(pmd_t *dst_pmd, 25404c27fe4cSMike Rapoport struct vm_area_struct *dst_vma, 25414c27fe4cSMike Rapoport unsigned long dst_addr, 25424c27fe4cSMike Rapoport unsigned long src_addr, 2543d9712937SAxel Rasmussen uffd_flags_t flags, 2544d7be6d7eSZhangPeng struct folio **foliop) 25454c27fe4cSMike Rapoport { 25464c27fe4cSMike Rapoport struct inode *inode = file_inode(dst_vma->vm_file); 25474c27fe4cSMike Rapoport struct shmem_inode_info *info = SHMEM_I(inode); 25484c27fe4cSMike Rapoport struct address_space *mapping = inode->i_mapping; 25494c27fe4cSMike Rapoport gfp_t gfp = mapping_gfp_mask(mapping); 25504c27fe4cSMike Rapoport pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 25514c27fe4cSMike Rapoport void *page_kaddr; 2552b7dd44a1SMatthew Wilcox (Oracle) struct folio *folio; 25534c27fe4cSMike Rapoport int ret; 25543460f6e5SAxel Rasmussen pgoff_t max_off; 25554c27fe4cSMike Rapoport 2556c7e263abSLukas Czerner if (shmem_inode_acct_block(inode, 1)) { 25577ed9d238SAxel Rasmussen /* 25587ed9d238SAxel Rasmussen * We may have got a page, returned -ENOENT triggering a retry, 25597ed9d238SAxel Rasmussen * and now we find ourselves with -ENOMEM. 
Release the page, to 25607ed9d238SAxel Rasmussen * avoid a BUG_ON in our caller. 25617ed9d238SAxel Rasmussen */ 2562d7be6d7eSZhangPeng if (unlikely(*foliop)) { 2563d7be6d7eSZhangPeng folio_put(*foliop); 2564d7be6d7eSZhangPeng *foliop = NULL; 25657ed9d238SAxel Rasmussen } 25667d64ae3aSAxel Rasmussen return -ENOMEM; 25677ed9d238SAxel Rasmussen } 25684c27fe4cSMike Rapoport 2569d7be6d7eSZhangPeng if (!*foliop) { 25707d64ae3aSAxel Rasmussen ret = -ENOMEM; 25717a7256d5SMatthew Wilcox (Oracle) folio = shmem_alloc_folio(gfp, info, pgoff); 25727a7256d5SMatthew Wilcox (Oracle) if (!folio) 25730f079694SMike Rapoport goto out_unacct_blocks; 25744c27fe4cSMike Rapoport 2575d9712937SAxel Rasmussen if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { 25767a7256d5SMatthew Wilcox (Oracle) page_kaddr = kmap_local_folio(folio, 0); 25775dc21f0cSIra Weiny /* 25785dc21f0cSIra Weiny * The read mmap_lock is held here. Despite the 25795dc21f0cSIra Weiny * mmap_lock being read recursive a deadlock is still 25805dc21f0cSIra Weiny * possible if a writer has taken a lock. For example: 25815dc21f0cSIra Weiny * 25825dc21f0cSIra Weiny * process A thread 1 takes read lock on own mmap_lock 25835dc21f0cSIra Weiny * process A thread 2 calls mmap, blocks taking write lock 25845dc21f0cSIra Weiny * process B thread 1 takes page fault, read lock on own mmap lock 25855dc21f0cSIra Weiny * process B thread 2 calls mmap, blocks taking write lock 25865dc21f0cSIra Weiny * process A thread 1 blocks taking read lock on process B 25875dc21f0cSIra Weiny * process B thread 1 blocks taking read lock on process A 25885dc21f0cSIra Weiny * 25895dc21f0cSIra Weiny * Disable page faults to prevent potential deadlock 25905dc21f0cSIra Weiny * and retry the copy outside the mmap_lock. 25915dc21f0cSIra Weiny */ 25925dc21f0cSIra Weiny pagefault_disable(); 25938d103963SMike Rapoport ret = copy_from_user(page_kaddr, 25948d103963SMike Rapoport (const void __user *)src_addr, 25954c27fe4cSMike Rapoport PAGE_SIZE); 25965dc21f0cSIra Weiny pagefault_enable(); 25977a7256d5SMatthew Wilcox (Oracle) kunmap_local(page_kaddr); 25984c27fe4cSMike Rapoport 2599c1e8d7c6SMichel Lespinasse /* fallback to copy_from_user outside mmap_lock */ 26004c27fe4cSMike Rapoport if (unlikely(ret)) { 2601d7be6d7eSZhangPeng *foliop = folio; 26027d64ae3aSAxel Rasmussen ret = -ENOENT; 26034c27fe4cSMike Rapoport /* don't free the page */ 26047d64ae3aSAxel Rasmussen goto out_unacct_blocks; 26054c27fe4cSMike Rapoport } 260619b482c2SMuchun Song 26077a7256d5SMatthew Wilcox (Oracle) flush_dcache_folio(folio); 26083460f6e5SAxel Rasmussen } else { /* ZEROPAGE */ 26097a7256d5SMatthew Wilcox (Oracle) clear_user_highpage(&folio->page, dst_addr); 26108d103963SMike Rapoport } 26114c27fe4cSMike Rapoport } else { 2612d7be6d7eSZhangPeng folio = *foliop; 26137a7256d5SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 2614d7be6d7eSZhangPeng *foliop = NULL; 26154c27fe4cSMike Rapoport } 26164c27fe4cSMike Rapoport 26177a7256d5SMatthew Wilcox (Oracle) VM_BUG_ON(folio_test_locked(folio)); 26187a7256d5SMatthew Wilcox (Oracle) VM_BUG_ON(folio_test_swapbacked(folio)); 26197a7256d5SMatthew Wilcox (Oracle) __folio_set_locked(folio); 26207a7256d5SMatthew Wilcox (Oracle) __folio_set_swapbacked(folio); 26217a7256d5SMatthew Wilcox (Oracle) __folio_mark_uptodate(folio); 26229cc90c66SAndrea Arcangeli 2623e2a50c1fSAndrea Arcangeli ret = -EFAULT; 2624e2a50c1fSAndrea Arcangeli max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 26253460f6e5SAxel Rasmussen if (unlikely(pgoff >= max_off)) 
2626e2a50c1fSAndrea Arcangeli goto out_release; 2627e2a50c1fSAndrea Arcangeli 2628b7dd44a1SMatthew Wilcox (Oracle) ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, 262961c50040SAxel Rasmussen gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm); 26304c27fe4cSMike Rapoport if (ret) 26314c27fe4cSMike Rapoport goto out_release; 26324c27fe4cSMike Rapoport 263361c50040SAxel Rasmussen ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 2634d9712937SAxel Rasmussen &folio->page, true, flags); 26357d64ae3aSAxel Rasmussen if (ret) 26367d64ae3aSAxel Rasmussen goto out_delete_from_cache; 26374c27fe4cSMike Rapoport 2638*3c1b7528SHugh Dickins shmem_recalc_inode(inode, 1, 0); 26397a7256d5SMatthew Wilcox (Oracle) folio_unlock(folio); 26407d64ae3aSAxel Rasmussen return 0; 26417d64ae3aSAxel Rasmussen out_delete_from_cache: 26427a7256d5SMatthew Wilcox (Oracle) filemap_remove_folio(folio); 26434c27fe4cSMike Rapoport out_release: 26447a7256d5SMatthew Wilcox (Oracle) folio_unlock(folio); 26457a7256d5SMatthew Wilcox (Oracle) folio_put(folio); 26464c27fe4cSMike Rapoport out_unacct_blocks: 26470f079694SMike Rapoport shmem_inode_unacct_blocks(inode, 1); 26487d64ae3aSAxel Rasmussen return ret; 26494c27fe4cSMike Rapoport } 26503460f6e5SAxel Rasmussen #endif /* CONFIG_USERFAULTFD */ 26518d103963SMike Rapoport 26521da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 265392e1d5beSArjan van de Ven static const struct inode_operations shmem_symlink_inode_operations; 265469f07ec9SHugh Dickins static const struct inode_operations shmem_short_symlink_operations; 26551da177e4SLinus Torvalds 26561da177e4SLinus Torvalds static int 2657800d15a5SNick Piggin shmem_write_begin(struct file *file, struct address_space *mapping, 26589d6b0cd7SMatthew Wilcox (Oracle) loff_t pos, unsigned len, 2659800d15a5SNick Piggin struct page **pagep, void **fsdata) 26601da177e4SLinus Torvalds { 2661800d15a5SNick Piggin struct inode *inode = mapping->host; 266240e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 266309cbfeafSKirill A. 
Shutemov pgoff_t index = pos >> PAGE_SHIFT; 2664eff1f906SMatthew Wilcox (Oracle) struct folio *folio; 2665a7605426SYang Shi int ret = 0; 266640e041a2SDavid Herrmann 26679608703eSJan Kara /* i_rwsem is held by caller */ 2668ab3948f5SJoel Fernandes (Google) if (unlikely(info->seals & (F_SEAL_GROW | 2669ab3948f5SJoel Fernandes (Google) F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { 2670ab3948f5SJoel Fernandes (Google) if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) 267140e041a2SDavid Herrmann return -EPERM; 267240e041a2SDavid Herrmann if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 267340e041a2SDavid Herrmann return -EPERM; 267440e041a2SDavid Herrmann } 267540e041a2SDavid Herrmann 2676eff1f906SMatthew Wilcox (Oracle) ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); 2677a7605426SYang Shi 2678a7605426SYang Shi if (ret) 2679a7605426SYang Shi return ret; 2680a7605426SYang Shi 2681eff1f906SMatthew Wilcox (Oracle) *pagep = folio_file_page(folio, index); 2682a7605426SYang Shi if (PageHWPoison(*pagep)) { 2683eff1f906SMatthew Wilcox (Oracle) folio_unlock(folio); 2684eff1f906SMatthew Wilcox (Oracle) folio_put(folio); 2685a7605426SYang Shi *pagep = NULL; 2686a7605426SYang Shi return -EIO; 2687a7605426SYang Shi } 2688a7605426SYang Shi 2689a7605426SYang Shi return 0; 2690800d15a5SNick Piggin } 2691800d15a5SNick Piggin 2692800d15a5SNick Piggin static int 2693800d15a5SNick Piggin shmem_write_end(struct file *file, struct address_space *mapping, 2694800d15a5SNick Piggin loff_t pos, unsigned len, unsigned copied, 2695800d15a5SNick Piggin struct page *page, void *fsdata) 2696800d15a5SNick Piggin { 269769bbb87bSMatthew Wilcox (Oracle) struct folio *folio = page_folio(page); 2698800d15a5SNick Piggin struct inode *inode = mapping->host; 2699800d15a5SNick Piggin 2700800d15a5SNick Piggin if (pos + copied > inode->i_size) 2701800d15a5SNick Piggin i_size_write(inode, pos + copied); 2702800d15a5SNick Piggin 270369bbb87bSMatthew Wilcox (Oracle) if (!folio_test_uptodate(folio)) { 270469bbb87bSMatthew Wilcox (Oracle) if (copied < folio_size(folio)) { 270569bbb87bSMatthew Wilcox (Oracle) size_t from = offset_in_folio(folio, pos); 270669bbb87bSMatthew Wilcox (Oracle) folio_zero_segments(folio, 0, from, 270769bbb87bSMatthew Wilcox (Oracle) from + copied, folio_size(folio)); 2708800d8c63SKirill A. Shutemov } 270969bbb87bSMatthew Wilcox (Oracle) folio_mark_uptodate(folio); 2710800d8c63SKirill A. Shutemov } 271169bbb87bSMatthew Wilcox (Oracle) folio_mark_dirty(folio); 271269bbb87bSMatthew Wilcox (Oracle) folio_unlock(folio); 271369bbb87bSMatthew Wilcox (Oracle) folio_put(folio); 2714d3602444SHugh Dickins 2715800d15a5SNick Piggin return copied; 27161da177e4SLinus Torvalds } 27171da177e4SLinus Torvalds 27182ba5bbedSAl Viro static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 27191da177e4SLinus Torvalds { 27206e58e79dSAl Viro struct file *file = iocb->ki_filp; 27216e58e79dSAl Viro struct inode *inode = file_inode(file); 27221da177e4SLinus Torvalds struct address_space *mapping = inode->i_mapping; 272341ffe5d5SHugh Dickins pgoff_t index; 272441ffe5d5SHugh Dickins unsigned long offset; 2725f7c1d074SGeert Uytterhoeven int error = 0; 2726cb66a7a1SAl Viro ssize_t retval = 0; 27276e58e79dSAl Viro loff_t *ppos = &iocb->ki_pos; 2728a0ee5ec5SHugh Dickins 272909cbfeafSKirill A. Shutemov index = *ppos >> PAGE_SHIFT; 273009cbfeafSKirill A. 
Shutemov offset = *ppos & ~PAGE_MASK; 27311da177e4SLinus Torvalds 27321da177e4SLinus Torvalds for (;;) { 27334601e2fcSMatthew Wilcox (Oracle) struct folio *folio = NULL; 27341da177e4SLinus Torvalds struct page *page = NULL; 273541ffe5d5SHugh Dickins pgoff_t end_index; 273641ffe5d5SHugh Dickins unsigned long nr, ret; 27371da177e4SLinus Torvalds loff_t i_size = i_size_read(inode); 27381da177e4SLinus Torvalds 273909cbfeafSKirill A. Shutemov end_index = i_size >> PAGE_SHIFT; 27401da177e4SLinus Torvalds if (index > end_index) 27411da177e4SLinus Torvalds break; 27421da177e4SLinus Torvalds if (index == end_index) { 274309cbfeafSKirill A. Shutemov nr = i_size & ~PAGE_MASK; 27441da177e4SLinus Torvalds if (nr <= offset) 27451da177e4SLinus Torvalds break; 27461da177e4SLinus Torvalds } 27471da177e4SLinus Torvalds 27484601e2fcSMatthew Wilcox (Oracle) error = shmem_get_folio(inode, index, &folio, SGP_READ); 27496e58e79dSAl Viro if (error) { 27506e58e79dSAl Viro if (error == -EINVAL) 27516e58e79dSAl Viro error = 0; 27521da177e4SLinus Torvalds break; 27531da177e4SLinus Torvalds } 27544601e2fcSMatthew Wilcox (Oracle) if (folio) { 27554601e2fcSMatthew Wilcox (Oracle) folio_unlock(folio); 2756a7605426SYang Shi 27574601e2fcSMatthew Wilcox (Oracle) page = folio_file_page(folio, index); 2758a7605426SYang Shi if (PageHWPoison(page)) { 27594601e2fcSMatthew Wilcox (Oracle) folio_put(folio); 2760a7605426SYang Shi error = -EIO; 2761a7605426SYang Shi break; 2762a7605426SYang Shi } 276375edd345SHugh Dickins } 27641da177e4SLinus Torvalds 27651da177e4SLinus Torvalds /* 27661da177e4SLinus Torvalds * We must evaluate after, since reads (unlike writes) 27679608703eSJan Kara * are called without i_rwsem protection against truncate 27681da177e4SLinus Torvalds */ 276909cbfeafSKirill A. Shutemov nr = PAGE_SIZE; 27701da177e4SLinus Torvalds i_size = i_size_read(inode); 277109cbfeafSKirill A. Shutemov end_index = i_size >> PAGE_SHIFT; 27721da177e4SLinus Torvalds if (index == end_index) { 277309cbfeafSKirill A. Shutemov nr = i_size & ~PAGE_MASK; 27741da177e4SLinus Torvalds if (nr <= offset) { 27754601e2fcSMatthew Wilcox (Oracle) if (folio) 27764601e2fcSMatthew Wilcox (Oracle) folio_put(folio); 27771da177e4SLinus Torvalds break; 27781da177e4SLinus Torvalds } 27791da177e4SLinus Torvalds } 27801da177e4SLinus Torvalds nr -= offset; 27811da177e4SLinus Torvalds 27824601e2fcSMatthew Wilcox (Oracle) if (folio) { 27831da177e4SLinus Torvalds /* 27841da177e4SLinus Torvalds * If users can be writing to this page using arbitrary 27851da177e4SLinus Torvalds * virtual addresses, take care about potential aliasing 27861da177e4SLinus Torvalds * before reading the page on the kernel side. 27871da177e4SLinus Torvalds */ 27881da177e4SLinus Torvalds if (mapping_writably_mapped(mapping)) 27891da177e4SLinus Torvalds flush_dcache_page(page); 27901da177e4SLinus Torvalds /* 27911da177e4SLinus Torvalds * Mark the page accessed if we read the beginning. 27921da177e4SLinus Torvalds */ 27931da177e4SLinus Torvalds if (!offset) 27944601e2fcSMatthew Wilcox (Oracle) folio_mark_accessed(folio); 27951da177e4SLinus Torvalds /* 27961da177e4SLinus Torvalds * Ok, we have the page, and it's up-to-date, so 27971da177e4SLinus Torvalds * now we can copy it to user space... 
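 * (when there is no folio the hole reads as zeros: see the ZERO_PAGE
 * copy and iov_iter_zero() fallbacks below)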
27981da177e4SLinus Torvalds */ 27992ba5bbedSAl Viro ret = copy_page_to_iter(page, offset, nr, to); 28004601e2fcSMatthew Wilcox (Oracle) folio_put(folio); 28011bdec44bSHugh Dickins 2802fcb14cb1SAl Viro } else if (user_backed_iter(to)) { 28031bdec44bSHugh Dickins /* 28041bdec44bSHugh Dickins * Copy to user tends to be so well optimized, but 28051bdec44bSHugh Dickins * clear_user() not so much, that it is noticeably 28061bdec44bSHugh Dickins * faster to copy the zero page instead of clearing. 28071bdec44bSHugh Dickins */ 28081bdec44bSHugh Dickins ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); 28091bdec44bSHugh Dickins } else { 28101bdec44bSHugh Dickins /* 28111bdec44bSHugh Dickins * But submitting the same page twice in a row to 28121bdec44bSHugh Dickins * splice() - or others? - can result in confusion: 28131bdec44bSHugh Dickins * so don't attempt that optimization on pipes etc. 28141bdec44bSHugh Dickins */ 28151bdec44bSHugh Dickins ret = iov_iter_zero(nr, to); 28161bdec44bSHugh Dickins } 28171bdec44bSHugh Dickins 28186e58e79dSAl Viro retval += ret; 28191da177e4SLinus Torvalds offset += ret; 282009cbfeafSKirill A. Shutemov index += offset >> PAGE_SHIFT; 282109cbfeafSKirill A. Shutemov offset &= ~PAGE_MASK; 28221da177e4SLinus Torvalds 28232ba5bbedSAl Viro if (!iov_iter_count(to)) 28241da177e4SLinus Torvalds break; 28256e58e79dSAl Viro if (ret < nr) { 28266e58e79dSAl Viro error = -EFAULT; 28276e58e79dSAl Viro break; 28286e58e79dSAl Viro } 28291da177e4SLinus Torvalds cond_resched(); 28301da177e4SLinus Torvalds } 28311da177e4SLinus Torvalds 283209cbfeafSKirill A. Shutemov *ppos = ((loff_t) index << PAGE_SHIFT) + offset; 28336e58e79dSAl Viro file_accessed(file); 28346e58e79dSAl Viro return retval ? retval : error; 28351da177e4SLinus Torvalds } 28361da177e4SLinus Torvalds 2837bd194b18SDavid Howells static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, 2838bd194b18SDavid Howells struct pipe_buffer *buf) 2839bd194b18SDavid Howells { 2840bd194b18SDavid Howells return true; 2841bd194b18SDavid Howells } 2842bd194b18SDavid Howells 2843bd194b18SDavid Howells static void zero_pipe_buf_release(struct pipe_inode_info *pipe, 2844bd194b18SDavid Howells struct pipe_buffer *buf) 2845bd194b18SDavid Howells { 2846bd194b18SDavid Howells } 2847bd194b18SDavid Howells 2848bd194b18SDavid Howells static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, 2849bd194b18SDavid Howells struct pipe_buffer *buf) 2850bd194b18SDavid Howells { 2851bd194b18SDavid Howells return false; 2852bd194b18SDavid Howells } 2853bd194b18SDavid Howells 2854bd194b18SDavid Howells static const struct pipe_buf_operations zero_pipe_buf_ops = { 2855bd194b18SDavid Howells .release = zero_pipe_buf_release, 2856bd194b18SDavid Howells .try_steal = zero_pipe_buf_try_steal, 2857bd194b18SDavid Howells .get = zero_pipe_buf_get, 2858bd194b18SDavid Howells }; 2859bd194b18SDavid Howells 2860bd194b18SDavid Howells static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, 2861bd194b18SDavid Howells loff_t fpos, size_t size) 2862bd194b18SDavid Howells { 2863bd194b18SDavid Howells size_t offset = fpos & ~PAGE_MASK; 2864bd194b18SDavid Howells 2865bd194b18SDavid Howells size = min_t(size_t, size, PAGE_SIZE - offset); 2866bd194b18SDavid Howells 2867bd194b18SDavid Howells if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 2868bd194b18SDavid Howells struct pipe_buffer *buf = pipe_head_buf(pipe); 2869bd194b18SDavid Howells 2870bd194b18SDavid Howells *buf = (struct pipe_buffer) { 2871bd194b18SDavid Howells .ops = 
&zero_pipe_buf_ops, 2872bd194b18SDavid Howells .page = ZERO_PAGE(0), 2873bd194b18SDavid Howells .offset = offset, 2874bd194b18SDavid Howells .len = size, 2875bd194b18SDavid Howells }; 2876bd194b18SDavid Howells pipe->head++; 2877bd194b18SDavid Howells } 2878bd194b18SDavid Howells 2879bd194b18SDavid Howells return size; 2880bd194b18SDavid Howells } 2881bd194b18SDavid Howells 2882bd194b18SDavid Howells static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, 2883bd194b18SDavid Howells struct pipe_inode_info *pipe, 2884bd194b18SDavid Howells size_t len, unsigned int flags) 2885bd194b18SDavid Howells { 2886bd194b18SDavid Howells struct inode *inode = file_inode(in); 2887bd194b18SDavid Howells struct address_space *mapping = inode->i_mapping; 2888bd194b18SDavid Howells struct folio *folio = NULL; 2889bd194b18SDavid Howells size_t total_spliced = 0, used, npages, n, part; 2890bd194b18SDavid Howells loff_t isize; 2891bd194b18SDavid Howells int error = 0; 2892bd194b18SDavid Howells 2893bd194b18SDavid Howells /* Work out how much data we can actually add into the pipe */ 2894bd194b18SDavid Howells used = pipe_occupancy(pipe->head, pipe->tail); 2895bd194b18SDavid Howells npages = max_t(ssize_t, pipe->max_usage - used, 0); 2896bd194b18SDavid Howells len = min_t(size_t, len, npages * PAGE_SIZE); 2897bd194b18SDavid Howells 2898bd194b18SDavid Howells do { 2899bd194b18SDavid Howells if (*ppos >= i_size_read(inode)) 2900bd194b18SDavid Howells break; 2901bd194b18SDavid Howells 2902fa598952SHugh Dickins error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, 2903fa598952SHugh Dickins SGP_READ); 2904bd194b18SDavid Howells if (error) { 2905bd194b18SDavid Howells if (error == -EINVAL) 2906bd194b18SDavid Howells error = 0; 2907bd194b18SDavid Howells break; 2908bd194b18SDavid Howells } 2909bd194b18SDavid Howells if (folio) { 2910bd194b18SDavid Howells folio_unlock(folio); 2911bd194b18SDavid Howells 2912fa598952SHugh Dickins if (folio_test_hwpoison(folio) || 2913fa598952SHugh Dickins (folio_test_large(folio) && 2914fa598952SHugh Dickins folio_test_has_hwpoisoned(folio))) { 2915bd194b18SDavid Howells error = -EIO; 2916bd194b18SDavid Howells break; 2917bd194b18SDavid Howells } 2918bd194b18SDavid Howells } 2919bd194b18SDavid Howells 2920bd194b18SDavid Howells /* 2921bd194b18SDavid Howells * i_size must be checked after we know the pages are Uptodate. 2922bd194b18SDavid Howells * 2923bd194b18SDavid Howells * Checking i_size after the check allows us to calculate 2924bd194b18SDavid Howells * the correct value for "nr", which means the zero-filled 2925bd194b18SDavid Howells * part of the page is not copied back to userspace (unless 2926bd194b18SDavid Howells * another truncate extends the file - this is desired though). 2927bd194b18SDavid Howells */ 2928bd194b18SDavid Howells isize = i_size_read(inode); 2929bd194b18SDavid Howells if (unlikely(*ppos >= isize)) 2930bd194b18SDavid Howells break; 2931bd194b18SDavid Howells part = min_t(loff_t, isize - *ppos, len); 2932bd194b18SDavid Howells 2933bd194b18SDavid Howells if (folio) { 2934bd194b18SDavid Howells /* 2935bd194b18SDavid Howells * If users can be writing to this page using arbitrary 2936bd194b18SDavid Howells * virtual addresses, take care about potential aliasing 2937bd194b18SDavid Howells * before reading the page on the kernel side. 
2938bd194b18SDavid Howells */ 2939bd194b18SDavid Howells if (mapping_writably_mapped(mapping)) 2940bd194b18SDavid Howells flush_dcache_folio(folio); 2941bd194b18SDavid Howells folio_mark_accessed(folio); 2942bd194b18SDavid Howells /* 2943bd194b18SDavid Howells * Ok, we have the page, and it's up-to-date, so we can 2944bd194b18SDavid Howells * now splice it into the pipe. 2945bd194b18SDavid Howells */ 2946bd194b18SDavid Howells n = splice_folio_into_pipe(pipe, folio, *ppos, part); 2947bd194b18SDavid Howells folio_put(folio); 2948bd194b18SDavid Howells folio = NULL; 2949bd194b18SDavid Howells } else { 2950fa598952SHugh Dickins n = splice_zeropage_into_pipe(pipe, *ppos, part); 2951bd194b18SDavid Howells } 2952bd194b18SDavid Howells 2953bd194b18SDavid Howells if (!n) 2954bd194b18SDavid Howells break; 2955bd194b18SDavid Howells len -= n; 2956bd194b18SDavid Howells total_spliced += n; 2957bd194b18SDavid Howells *ppos += n; 2958bd194b18SDavid Howells in->f_ra.prev_pos = *ppos; 2959bd194b18SDavid Howells if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 2960bd194b18SDavid Howells break; 2961bd194b18SDavid Howells 2962bd194b18SDavid Howells cond_resched(); 2963bd194b18SDavid Howells } while (len); 2964bd194b18SDavid Howells 2965bd194b18SDavid Howells if (folio) 2966bd194b18SDavid Howells folio_put(folio); 2967bd194b18SDavid Howells 2968bd194b18SDavid Howells file_accessed(in); 2969bd194b18SDavid Howells return total_spliced ? total_spliced : error; 2970bd194b18SDavid Howells } 2971bd194b18SDavid Howells 2972965c8e59SAndrew Morton static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 2973220f2ac9SHugh Dickins { 2974220f2ac9SHugh Dickins struct address_space *mapping = file->f_mapping; 2975220f2ac9SHugh Dickins struct inode *inode = mapping->host; 2976220f2ac9SHugh Dickins 2977965c8e59SAndrew Morton if (whence != SEEK_DATA && whence != SEEK_HOLE) 2978965c8e59SAndrew Morton return generic_file_llseek_size(file, offset, whence, 2979220f2ac9SHugh Dickins MAX_LFS_FILESIZE, i_size_read(inode)); 298041139aa4SMatthew Wilcox (Oracle) if (offset < 0) 298141139aa4SMatthew Wilcox (Oracle) return -ENXIO; 298241139aa4SMatthew Wilcox (Oracle) 29835955102cSAl Viro inode_lock(inode); 29849608703eSJan Kara /* We're holding i_rwsem so we can access i_size directly */ 298541139aa4SMatthew Wilcox (Oracle) offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); 2986387aae6fSHugh Dickins if (offset >= 0) 298746a1c2c7SJie Liu offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 29885955102cSAl Viro inode_unlock(inode); 2989220f2ac9SHugh Dickins return offset; 2990220f2ac9SHugh Dickins } 2991220f2ac9SHugh Dickins 299283e4fa9cSHugh Dickins static long shmem_fallocate(struct file *file, int mode, loff_t offset, 299383e4fa9cSHugh Dickins loff_t len) 299483e4fa9cSHugh Dickins { 2995496ad9aaSAl Viro struct inode *inode = file_inode(file); 2996e2d12e22SHugh Dickins struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 299740e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 29981aac1400SHugh Dickins struct shmem_falloc shmem_falloc; 2999d144bf62SHugh Dickins pgoff_t start, index, end, undo_fallocend; 3000e2d12e22SHugh Dickins int error; 300183e4fa9cSHugh Dickins 300213ace4d0SHugh Dickins if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 300313ace4d0SHugh Dickins return -EOPNOTSUPP; 300413ace4d0SHugh Dickins 30055955102cSAl Viro inode_lock(inode); 300683e4fa9cSHugh Dickins 300783e4fa9cSHugh Dickins if (mode & FALLOC_FL_PUNCH_HOLE) { 300883e4fa9cSHugh Dickins 
struct address_space *mapping = file->f_mapping; 300983e4fa9cSHugh Dickins loff_t unmap_start = round_up(offset, PAGE_SIZE); 301083e4fa9cSHugh Dickins loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 30118e205f77SHugh Dickins DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 301283e4fa9cSHugh Dickins 30139608703eSJan Kara /* protected by i_rwsem */ 3014ab3948f5SJoel Fernandes (Google) if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { 301540e041a2SDavid Herrmann error = -EPERM; 301640e041a2SDavid Herrmann goto out; 301740e041a2SDavid Herrmann } 301840e041a2SDavid Herrmann 30198e205f77SHugh Dickins shmem_falloc.waitq = &shmem_falloc_waitq; 3020aa71ecd8SChen Jun shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; 3021f00cdc6dSHugh Dickins shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 3022f00cdc6dSHugh Dickins spin_lock(&inode->i_lock); 3023f00cdc6dSHugh Dickins inode->i_private = &shmem_falloc; 3024f00cdc6dSHugh Dickins spin_unlock(&inode->i_lock); 3025f00cdc6dSHugh Dickins 302683e4fa9cSHugh Dickins if ((u64)unmap_end > (u64)unmap_start) 302783e4fa9cSHugh Dickins unmap_mapping_range(mapping, unmap_start, 302883e4fa9cSHugh Dickins 1 + unmap_end - unmap_start, 0); 302983e4fa9cSHugh Dickins shmem_truncate_range(inode, offset, offset + len - 1); 303083e4fa9cSHugh Dickins /* No need to unmap again: hole-punching leaves COWed pages */ 30318e205f77SHugh Dickins 30328e205f77SHugh Dickins spin_lock(&inode->i_lock); 30338e205f77SHugh Dickins inode->i_private = NULL; 30348e205f77SHugh Dickins wake_up_all(&shmem_falloc_waitq); 30352055da97SIngo Molnar WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); 30368e205f77SHugh Dickins spin_unlock(&inode->i_lock); 303783e4fa9cSHugh Dickins error = 0; 30388e205f77SHugh Dickins goto out; 303983e4fa9cSHugh Dickins } 304083e4fa9cSHugh Dickins 3041e2d12e22SHugh Dickins /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 3042e2d12e22SHugh Dickins error = inode_newsize_ok(inode, offset + len); 3043e2d12e22SHugh Dickins if (error) 3044e2d12e22SHugh Dickins goto out; 3045e2d12e22SHugh Dickins 304640e041a2SDavid Herrmann if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 304740e041a2SDavid Herrmann error = -EPERM; 304840e041a2SDavid Herrmann goto out; 304940e041a2SDavid Herrmann } 305040e041a2SDavid Herrmann 305109cbfeafSKirill A. Shutemov start = offset >> PAGE_SHIFT; 305209cbfeafSKirill A. Shutemov end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 3053e2d12e22SHugh Dickins /* Try to avoid a swapstorm if len is impossible to satisfy */ 3054e2d12e22SHugh Dickins if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 3055e2d12e22SHugh Dickins error = -ENOSPC; 3056e2d12e22SHugh Dickins goto out; 3057e2d12e22SHugh Dickins } 3058e2d12e22SHugh Dickins 30598e205f77SHugh Dickins shmem_falloc.waitq = NULL; 30601aac1400SHugh Dickins shmem_falloc.start = start; 30611aac1400SHugh Dickins shmem_falloc.next = start; 30621aac1400SHugh Dickins shmem_falloc.nr_falloced = 0; 30631aac1400SHugh Dickins shmem_falloc.nr_unswapped = 0; 30641aac1400SHugh Dickins spin_lock(&inode->i_lock); 30651aac1400SHugh Dickins inode->i_private = &shmem_falloc; 30661aac1400SHugh Dickins spin_unlock(&inode->i_lock); 30671aac1400SHugh Dickins 3068d144bf62SHugh Dickins /* 3069d144bf62SHugh Dickins * info->fallocend is only relevant when huge pages might be 3070d144bf62SHugh Dickins * involved: to prevent split_huge_page() freeing fallocated 3071d144bf62SHugh Dickins * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. 
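 * (undo_fallocend keeps the old value so it can be restored if the
 * allocation loop below fails part way through)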
3072d144bf62SHugh Dickins */ 3073d144bf62SHugh Dickins undo_fallocend = info->fallocend; 3074d144bf62SHugh Dickins if (info->fallocend < end) 3075d144bf62SHugh Dickins info->fallocend = end; 3076d144bf62SHugh Dickins 3077050dcb5cSHugh Dickins for (index = start; index < end; ) { 3078b0802b22SMatthew Wilcox (Oracle) struct folio *folio; 3079e2d12e22SHugh Dickins 3080e2d12e22SHugh Dickins /* 3081e2d12e22SHugh Dickins * Good, the fallocate(2) manpage permits EINTR: we may have 3082e2d12e22SHugh Dickins * been interrupted because we are using up too much memory. 3083e2d12e22SHugh Dickins */ 3084e2d12e22SHugh Dickins if (signal_pending(current)) 3085e2d12e22SHugh Dickins error = -EINTR; 30861aac1400SHugh Dickins else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 30871aac1400SHugh Dickins error = -ENOMEM; 3088e2d12e22SHugh Dickins else 3089b0802b22SMatthew Wilcox (Oracle) error = shmem_get_folio(inode, index, &folio, 3090b0802b22SMatthew Wilcox (Oracle) SGP_FALLOC); 3091e2d12e22SHugh Dickins if (error) { 3092d144bf62SHugh Dickins info->fallocend = undo_fallocend; 3093b0802b22SMatthew Wilcox (Oracle) /* Remove the !uptodate folios we added */ 30947f556567SHugh Dickins if (index > start) { 30951635f6a7SHugh Dickins shmem_undo_range(inode, 309609cbfeafSKirill A. Shutemov (loff_t)start << PAGE_SHIFT, 3097b9b4bb26SAnthony Romano ((loff_t)index << PAGE_SHIFT) - 1, true); 30987f556567SHugh Dickins } 30991aac1400SHugh Dickins goto undone; 3100e2d12e22SHugh Dickins } 3101e2d12e22SHugh Dickins 3102050dcb5cSHugh Dickins /* 3103050dcb5cSHugh Dickins * Here is a more important optimization than it appears: 3104b0802b22SMatthew Wilcox (Oracle) * a second SGP_FALLOC on the same large folio will clear it, 3105b0802b22SMatthew Wilcox (Oracle) * making it uptodate and un-undoable if we fail later. 3106050dcb5cSHugh Dickins */ 3107b0802b22SMatthew Wilcox (Oracle) index = folio_next_index(folio); 3108050dcb5cSHugh Dickins /* Beware 32-bit wraparound */ 3109050dcb5cSHugh Dickins if (!index) 3110050dcb5cSHugh Dickins index--; 3111050dcb5cSHugh Dickins 3112e2d12e22SHugh Dickins /* 31131aac1400SHugh Dickins * Inform shmem_writepage() how far we have reached. 31141aac1400SHugh Dickins * No need for lock or barrier: we have the page lock. 31151aac1400SHugh Dickins */ 3116b0802b22SMatthew Wilcox (Oracle) if (!folio_test_uptodate(folio)) 3117050dcb5cSHugh Dickins shmem_falloc.nr_falloced += index - shmem_falloc.next; 3118050dcb5cSHugh Dickins shmem_falloc.next = index; 31191aac1400SHugh Dickins 31201aac1400SHugh Dickins /* 3121b0802b22SMatthew Wilcox (Oracle) * If !uptodate, leave it that way so that freeable folios 31221635f6a7SHugh Dickins * can be recognized if we need to rollback on error later. 3123b0802b22SMatthew Wilcox (Oracle) * But mark it dirty so that memory pressure will swap rather 3124b0802b22SMatthew Wilcox (Oracle) * than free the folios we are allocating (and SGP_CACHE folios 3125e2d12e22SHugh Dickins * might still be clean: we now need to mark those dirty too). 
3126e2d12e22SHugh Dickins */ 3127b0802b22SMatthew Wilcox (Oracle) folio_mark_dirty(folio); 3128b0802b22SMatthew Wilcox (Oracle) folio_unlock(folio); 3129b0802b22SMatthew Wilcox (Oracle) folio_put(folio); 3130e2d12e22SHugh Dickins cond_resched(); 3131e2d12e22SHugh Dickins } 3132e2d12e22SHugh Dickins 3133e2d12e22SHugh Dickins if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 3134e2d12e22SHugh Dickins i_size_write(inode, offset + len); 31351aac1400SHugh Dickins undone: 31361aac1400SHugh Dickins spin_lock(&inode->i_lock); 31371aac1400SHugh Dickins inode->i_private = NULL; 31381aac1400SHugh Dickins spin_unlock(&inode->i_lock); 3139e2d12e22SHugh Dickins out: 314015f242bbSHugh Dickins if (!error) 314115f242bbSHugh Dickins file_modified(file); 31425955102cSAl Viro inode_unlock(inode); 314383e4fa9cSHugh Dickins return error; 314483e4fa9cSHugh Dickins } 314583e4fa9cSHugh Dickins 3146726c3342SDavid Howells static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 31471da177e4SLinus Torvalds { 3148726c3342SDavid Howells struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 31491da177e4SLinus Torvalds 31501da177e4SLinus Torvalds buf->f_type = TMPFS_MAGIC; 315109cbfeafSKirill A. Shutemov buf->f_bsize = PAGE_SIZE; 31521da177e4SLinus Torvalds buf->f_namelen = NAME_MAX; 31530edd73b3SHugh Dickins if (sbinfo->max_blocks) { 31541da177e4SLinus Torvalds buf->f_blocks = sbinfo->max_blocks; 315541ffe5d5SHugh Dickins buf->f_bavail = 315641ffe5d5SHugh Dickins buf->f_bfree = sbinfo->max_blocks - 315741ffe5d5SHugh Dickins percpu_counter_sum(&sbinfo->used_blocks); 31580edd73b3SHugh Dickins } 31590edd73b3SHugh Dickins if (sbinfo->max_inodes) { 31601da177e4SLinus Torvalds buf->f_files = sbinfo->max_inodes; 31611da177e4SLinus Torvalds buf->f_ffree = sbinfo->free_inodes; 31621da177e4SLinus Torvalds } 31631da177e4SLinus Torvalds /* else leave those fields 0 like simple_statfs */ 316459cda49eSAmir Goldstein 316559cda49eSAmir Goldstein buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); 316659cda49eSAmir Goldstein 31671da177e4SLinus Torvalds return 0; 31681da177e4SLinus Torvalds } 31691da177e4SLinus Torvalds 31701da177e4SLinus Torvalds /* 31711da177e4SLinus Torvalds * File creation. Allocate an inode, and we're done.. 
31721da177e4SLinus Torvalds */ 31731da177e4SLinus Torvalds static int 31745ebb29beSChristian Brauner shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, 3175549c7297SChristian Brauner struct dentry *dentry, umode_t mode, dev_t dev) 31761da177e4SLinus Torvalds { 31770b0a0806SHugh Dickins struct inode *inode; 317871480663SCarlos Maiolino int error; 31791da177e4SLinus Torvalds 31807a80e5b8SGiuseppe Scrivano inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); 318171480663SCarlos Maiolino 318271480663SCarlos Maiolino if (IS_ERR(inode)) 318371480663SCarlos Maiolino return PTR_ERR(inode); 318471480663SCarlos Maiolino 3185feda821eSChristoph Hellwig error = simple_acl_create(dir, inode); 3186feda821eSChristoph Hellwig if (error) 3187feda821eSChristoph Hellwig goto out_iput; 31882a7dba39SEric Paris error = security_inode_init_security(inode, dir, 31899d8f13baSMimi Zohar &dentry->d_name, 31906d9d88d0SJarkko Sakkinen shmem_initxattrs, NULL); 3191feda821eSChristoph Hellwig if (error && error != -EOPNOTSUPP) 3192feda821eSChristoph Hellwig goto out_iput; 319337ec43cdSMimi Zohar 3194a2e45955SChuck Lever error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); 3195a2e45955SChuck Lever if (error) 3196a2e45955SChuck Lever goto out_iput; 3197a2e45955SChuck Lever 31981da177e4SLinus Torvalds dir->i_size += BOGO_DIRENT_SIZE; 3199078cd827SDeepa Dinamani dir->i_ctime = dir->i_mtime = current_time(dir); 320036f05cabSJeff Layton inode_inc_iversion(dir); 32011da177e4SLinus Torvalds d_instantiate(dentry, inode); 32021da177e4SLinus Torvalds dget(dentry); /* Extra count - pin the dentry in core */ 32031da177e4SLinus Torvalds return error; 320471480663SCarlos Maiolino 3205feda821eSChristoph Hellwig out_iput: 3206feda821eSChristoph Hellwig iput(inode); 3207feda821eSChristoph Hellwig return error; 32081da177e4SLinus Torvalds } 32091da177e4SLinus Torvalds 321060545d0dSAl Viro static int 3211011e2b71SChristian Brauner shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, 3212863f144fSMiklos Szeredi struct file *file, umode_t mode) 321360545d0dSAl Viro { 321460545d0dSAl Viro struct inode *inode; 321571480663SCarlos Maiolino int error; 321660545d0dSAl Viro 32177a80e5b8SGiuseppe Scrivano inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); 321871480663SCarlos Maiolino 321971480663SCarlos Maiolino if (IS_ERR(inode)) { 322071480663SCarlos Maiolino error = PTR_ERR(inode); 322171480663SCarlos Maiolino goto err_out; 322271480663SCarlos Maiolino } 322371480663SCarlos Maiolino 322460545d0dSAl Viro error = security_inode_init_security(inode, dir, 322560545d0dSAl Viro NULL, 322660545d0dSAl Viro shmem_initxattrs, NULL); 3227feda821eSChristoph Hellwig if (error && error != -EOPNOTSUPP) 3228feda821eSChristoph Hellwig goto out_iput; 3229feda821eSChristoph Hellwig error = simple_acl_create(dir, inode); 3230feda821eSChristoph Hellwig if (error) 3231feda821eSChristoph Hellwig goto out_iput; 3232863f144fSMiklos Szeredi d_tmpfile(file, inode); 323371480663SCarlos Maiolino 323471480663SCarlos Maiolino err_out: 3235863f144fSMiklos Szeredi return finish_open_simple(file, error); 3236feda821eSChristoph Hellwig out_iput: 3237feda821eSChristoph Hellwig iput(inode); 3238feda821eSChristoph Hellwig return error; 323960545d0dSAl Viro } 324060545d0dSAl Viro 3241c54bd91eSChristian Brauner static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, 3242549c7297SChristian Brauner struct dentry *dentry, umode_t mode) 32431da177e4SLinus Torvalds { 32441da177e4SLinus Torvalds int error; 
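/* the new directory's ".." entry will pin the parent: hence inc_nlink(dir) below */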
32451da177e4SLinus Torvalds 32467a80e5b8SGiuseppe Scrivano error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); 32477a80e5b8SGiuseppe Scrivano if (error) 32481da177e4SLinus Torvalds return error; 3249d8c76e6fSDave Hansen inc_nlink(dir); 32501da177e4SLinus Torvalds return 0; 32511da177e4SLinus Torvalds } 32521da177e4SLinus Torvalds 32536c960e68SChristian Brauner static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, 3254549c7297SChristian Brauner struct dentry *dentry, umode_t mode, bool excl) 32551da177e4SLinus Torvalds { 32567a80e5b8SGiuseppe Scrivano return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); 32571da177e4SLinus Torvalds } 32581da177e4SLinus Torvalds 32591da177e4SLinus Torvalds /* 32601da177e4SLinus Torvalds * Link a file.. 32611da177e4SLinus Torvalds */ 32621da177e4SLinus Torvalds static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 32631da177e4SLinus Torvalds { 326475c3cfa8SDavid Howells struct inode *inode = d_inode(old_dentry); 326529b00e60SDarrick J. Wong int ret = 0; 32661da177e4SLinus Torvalds 32671da177e4SLinus Torvalds /* 32681da177e4SLinus Torvalds * No ordinary (disk based) filesystem counts links as inodes; 32691da177e4SLinus Torvalds * but each new link needs a new dentry, pinning lowmem, and 32701da177e4SLinus Torvalds * tmpfs dentries cannot be pruned until they are unlinked. 32711062af92SDarrick J. Wong * But if an O_TMPFILE file is linked into the tmpfs, the 32721062af92SDarrick J. Wong * first link must skip that, to get the accounting right. 32731da177e4SLinus Torvalds */ 32741062af92SDarrick J. Wong if (inode->i_nlink) { 3275e809d5f0SChris Down ret = shmem_reserve_inode(inode->i_sb, NULL); 32765b04c689SPavel Emelyanov if (ret) 32775b04c689SPavel Emelyanov goto out; 32781062af92SDarrick J. 
Wong } 32791da177e4SLinus Torvalds 3280a2e45955SChuck Lever ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); 3281a2e45955SChuck Lever if (ret) { 3282a2e45955SChuck Lever if (inode->i_nlink) 3283a2e45955SChuck Lever shmem_free_inode(inode->i_sb); 3284a2e45955SChuck Lever goto out; 3285a2e45955SChuck Lever } 3286a2e45955SChuck Lever 32871da177e4SLinus Torvalds dir->i_size += BOGO_DIRENT_SIZE; 3288078cd827SDeepa Dinamani inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 328936f05cabSJeff Layton inode_inc_iversion(dir); 3290d8c76e6fSDave Hansen inc_nlink(inode); 32917de9c6eeSAl Viro ihold(inode); /* New dentry reference */ 32921da177e4SLinus Torvalds dget(dentry); /* Extra pinning count for the created dentry */ 32931da177e4SLinus Torvalds d_instantiate(dentry, inode); 32945b04c689SPavel Emelyanov out: 32955b04c689SPavel Emelyanov return ret; 32961da177e4SLinus Torvalds } 32971da177e4SLinus Torvalds 32981da177e4SLinus Torvalds static int shmem_unlink(struct inode *dir, struct dentry *dentry) 32991da177e4SLinus Torvalds { 330075c3cfa8SDavid Howells struct inode *inode = d_inode(dentry); 33011da177e4SLinus Torvalds 33025b04c689SPavel Emelyanov if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 33035b04c689SPavel Emelyanov shmem_free_inode(inode->i_sb); 33041da177e4SLinus Torvalds 3305a2e45955SChuck Lever simple_offset_remove(shmem_get_offset_ctx(dir), dentry); 3306a2e45955SChuck Lever 33071da177e4SLinus Torvalds dir->i_size -= BOGO_DIRENT_SIZE; 3308078cd827SDeepa Dinamani inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 330936f05cabSJeff Layton inode_inc_iversion(dir); 33109a53c3a7SDave Hansen drop_nlink(inode); 33111da177e4SLinus Torvalds dput(dentry); /* Undo the count from "create" - this does all the work */ 33121da177e4SLinus Torvalds return 0; 33131da177e4SLinus Torvalds } 33141da177e4SLinus Torvalds 33151da177e4SLinus Torvalds static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 33161da177e4SLinus Torvalds { 33171da177e4SLinus Torvalds if (!simple_empty(dentry)) 33181da177e4SLinus Torvalds return -ENOTEMPTY; 33191da177e4SLinus Torvalds 332075c3cfa8SDavid Howells drop_nlink(d_inode(dentry)); 33219a53c3a7SDave Hansen drop_nlink(dir); 33221da177e4SLinus Torvalds return shmem_unlink(dir, dentry); 33231da177e4SLinus Torvalds } 33241da177e4SLinus Torvalds 3325e18275aeSChristian Brauner static int shmem_whiteout(struct mnt_idmap *idmap, 3326549c7297SChristian Brauner struct inode *old_dir, struct dentry *old_dentry) 332746fdb794SMiklos Szeredi { 332846fdb794SMiklos Szeredi struct dentry *whiteout; 332946fdb794SMiklos Szeredi int error; 333046fdb794SMiklos Szeredi 333146fdb794SMiklos Szeredi whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); 333246fdb794SMiklos Szeredi if (!whiteout) 333346fdb794SMiklos Szeredi return -ENOMEM; 333446fdb794SMiklos Szeredi 33357a80e5b8SGiuseppe Scrivano error = shmem_mknod(idmap, old_dir, whiteout, 333646fdb794SMiklos Szeredi S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); 333746fdb794SMiklos Szeredi dput(whiteout); 333846fdb794SMiklos Szeredi if (error) 333946fdb794SMiklos Szeredi return error; 334046fdb794SMiklos Szeredi 334146fdb794SMiklos Szeredi /* 334246fdb794SMiklos Szeredi * Cheat and hash the whiteout while the old dentry is still in 334346fdb794SMiklos Szeredi * place, instead of playing games with FS_RENAME_DOES_D_MOVE. 
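 * (Editorial aside: the whiteout itself is the S_IFCHR/WHITEOUT_DEV node
 * created by shmem_mknod() just above; RENAME_WHITEOUT callers such as
 * overlayfs expect to find it once the rename completes.)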
334446fdb794SMiklos Szeredi * 334546fdb794SMiklos Szeredi * d_lookup() will consistently find one of them at this point, 334646fdb794SMiklos Szeredi * not sure which one, but that isn't even important. 334746fdb794SMiklos Szeredi */ 334846fdb794SMiklos Szeredi d_rehash(whiteout); 334946fdb794SMiklos Szeredi return 0; 335046fdb794SMiklos Szeredi } 335146fdb794SMiklos Szeredi 33521da177e4SLinus Torvalds /* 33531da177e4SLinus Torvalds * The VFS layer already does all the dentry stuff for rename, 33541da177e4SLinus Torvalds * we just have to decrement the usage count for the target if 33551da177e4SLinus Torvalds * it exists so that the VFS layer correctly free's it when it 33561da177e4SLinus Torvalds * gets overwritten. 33571da177e4SLinus Torvalds */ 3358e18275aeSChristian Brauner static int shmem_rename2(struct mnt_idmap *idmap, 3359549c7297SChristian Brauner struct inode *old_dir, struct dentry *old_dentry, 3360549c7297SChristian Brauner struct inode *new_dir, struct dentry *new_dentry, 3361549c7297SChristian Brauner unsigned int flags) 33621da177e4SLinus Torvalds { 336375c3cfa8SDavid Howells struct inode *inode = d_inode(old_dentry); 33641da177e4SLinus Torvalds int they_are_dirs = S_ISDIR(inode->i_mode); 3365a2e45955SChuck Lever int error; 33661da177e4SLinus Torvalds 336746fdb794SMiklos Szeredi if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 33683b69ff51SMiklos Szeredi return -EINVAL; 33693b69ff51SMiklos Szeredi 337037456771SMiklos Szeredi if (flags & RENAME_EXCHANGE) 3371a2e45955SChuck Lever return simple_offset_rename_exchange(old_dir, old_dentry, 3372a2e45955SChuck Lever new_dir, new_dentry); 337337456771SMiklos Szeredi 33741da177e4SLinus Torvalds if (!simple_empty(new_dentry)) 33751da177e4SLinus Torvalds return -ENOTEMPTY; 33761da177e4SLinus Torvalds 337746fdb794SMiklos Szeredi if (flags & RENAME_WHITEOUT) { 33787a80e5b8SGiuseppe Scrivano error = shmem_whiteout(idmap, old_dir, old_dentry); 337946fdb794SMiklos Szeredi if (error) 338046fdb794SMiklos Szeredi return error; 338146fdb794SMiklos Szeredi } 338246fdb794SMiklos Szeredi 3383a2e45955SChuck Lever simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry); 3384a2e45955SChuck Lever error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry); 3385a2e45955SChuck Lever if (error) 3386a2e45955SChuck Lever return error; 3387a2e45955SChuck Lever 338875c3cfa8SDavid Howells if (d_really_is_positive(new_dentry)) { 33891da177e4SLinus Torvalds (void) shmem_unlink(new_dir, new_dentry); 3390b928095bSMiklos Szeredi if (they_are_dirs) { 339175c3cfa8SDavid Howells drop_nlink(d_inode(new_dentry)); 33929a53c3a7SDave Hansen drop_nlink(old_dir); 3393b928095bSMiklos Szeredi } 33941da177e4SLinus Torvalds } else if (they_are_dirs) { 33959a53c3a7SDave Hansen drop_nlink(old_dir); 3396d8c76e6fSDave Hansen inc_nlink(new_dir); 33971da177e4SLinus Torvalds } 33981da177e4SLinus Torvalds 33991da177e4SLinus Torvalds old_dir->i_size -= BOGO_DIRENT_SIZE; 34001da177e4SLinus Torvalds new_dir->i_size += BOGO_DIRENT_SIZE; 34011da177e4SLinus Torvalds old_dir->i_ctime = old_dir->i_mtime = 34021da177e4SLinus Torvalds new_dir->i_ctime = new_dir->i_mtime = 3403078cd827SDeepa Dinamani inode->i_ctime = current_time(old_dir); 340436f05cabSJeff Layton inode_inc_iversion(old_dir); 340536f05cabSJeff Layton inode_inc_iversion(new_dir); 34061da177e4SLinus Torvalds return 0; 34071da177e4SLinus Torvalds } 34081da177e4SLinus Torvalds 34097a77db95SChristian Brauner static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, 3410549c7297SChristian 
Brauner struct dentry *dentry, const char *symname) 34111da177e4SLinus Torvalds { 34121da177e4SLinus Torvalds int error; 34131da177e4SLinus Torvalds int len; 34141da177e4SLinus Torvalds struct inode *inode; 34157ad0414bSMatthew Wilcox (Oracle) struct folio *folio; 34161da177e4SLinus Torvalds 34171da177e4SLinus Torvalds len = strlen(symname) + 1; 341809cbfeafSKirill A. Shutemov if (len > PAGE_SIZE) 34191da177e4SLinus Torvalds return -ENAMETOOLONG; 34201da177e4SLinus Torvalds 34217a80e5b8SGiuseppe Scrivano inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, 34220825a6f9SJoe Perches VM_NORESERVE); 342371480663SCarlos Maiolino 342471480663SCarlos Maiolino if (IS_ERR(inode)) 342571480663SCarlos Maiolino return PTR_ERR(inode); 34261da177e4SLinus Torvalds 34279d8f13baSMimi Zohar error = security_inode_init_security(inode, dir, &dentry->d_name, 34286d9d88d0SJarkko Sakkinen shmem_initxattrs, NULL); 342923a31d87SChuck Lever if (error && error != -EOPNOTSUPP) 343023a31d87SChuck Lever goto out_iput; 3431570bc1c2SStephen Smalley 3432a2e45955SChuck Lever error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); 3433a2e45955SChuck Lever if (error) 3434a2e45955SChuck Lever goto out_iput; 3435a2e45955SChuck Lever 34361da177e4SLinus Torvalds inode->i_size = len-1; 343769f07ec9SHugh Dickins if (len <= SHORT_SYMLINK_LEN) { 34383ed47db3SAl Viro inode->i_link = kmemdup(symname, len, GFP_KERNEL); 34393ed47db3SAl Viro if (!inode->i_link) { 344023a31d87SChuck Lever error = -ENOMEM; 3441a2e45955SChuck Lever goto out_remove_offset; 344269f07ec9SHugh Dickins } 344369f07ec9SHugh Dickins inode->i_op = &shmem_short_symlink_operations; 34441da177e4SLinus Torvalds } else { 3445e8ecde25SAl Viro inode_nohighmem(inode); 34467ad0414bSMatthew Wilcox (Oracle) error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); 344723a31d87SChuck Lever if (error) 3448a2e45955SChuck Lever goto out_remove_offset; 344914fcc23fSHugh Dickins inode->i_mapping->a_ops = &shmem_aops; 34501da177e4SLinus Torvalds inode->i_op = &shmem_symlink_inode_operations; 34517ad0414bSMatthew Wilcox (Oracle) memcpy(folio_address(folio), symname, len); 34527ad0414bSMatthew Wilcox (Oracle) folio_mark_uptodate(folio); 34537ad0414bSMatthew Wilcox (Oracle) folio_mark_dirty(folio); 34547ad0414bSMatthew Wilcox (Oracle) folio_unlock(folio); 34557ad0414bSMatthew Wilcox (Oracle) folio_put(folio); 34561da177e4SLinus Torvalds } 34571da177e4SLinus Torvalds dir->i_size += BOGO_DIRENT_SIZE; 3458078cd827SDeepa Dinamani dir->i_ctime = dir->i_mtime = current_time(dir); 345936f05cabSJeff Layton inode_inc_iversion(dir); 34601da177e4SLinus Torvalds d_instantiate(dentry, inode); 34611da177e4SLinus Torvalds dget(dentry); 34621da177e4SLinus Torvalds return 0; 3463a2e45955SChuck Lever 3464a2e45955SChuck Lever out_remove_offset: 3465a2e45955SChuck Lever simple_offset_remove(shmem_get_offset_ctx(dir), dentry); 346623a31d87SChuck Lever out_iput: 346723a31d87SChuck Lever iput(inode); 346823a31d87SChuck Lever return error; 34691da177e4SLinus Torvalds } 34701da177e4SLinus Torvalds 3471fceef393SAl Viro static void shmem_put_link(void *arg) 3472fceef393SAl Viro { 3473e4b57722SMatthew Wilcox (Oracle) folio_mark_accessed(arg); 3474e4b57722SMatthew Wilcox (Oracle) folio_put(arg); 3475fceef393SAl Viro } 3476fceef393SAl Viro 34776b255391SAl Viro static const char *shmem_get_link(struct dentry *dentry, 3478fceef393SAl Viro struct inode *inode, 3479fceef393SAl Viro struct delayed_call *done) 34801da177e4SLinus Torvalds { 3481e4b57722SMatthew Wilcox (Oracle) struct folio *folio = NULL; 
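	/*
	 * (Descriptive note.) A NULL dentry means we are in RCU pathwalk and
	 * must not sleep: only the filemap_get_folio() fast path below may be
	 * used, and anything not already uptodate returns -ECHILD so the walk
	 * can be retried in ref-walk mode.
	 */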
34826b255391SAl Viro int error; 3483e4b57722SMatthew Wilcox (Oracle) 34846a6c9904SAl Viro if (!dentry) { 3485e4b57722SMatthew Wilcox (Oracle) folio = filemap_get_folio(inode->i_mapping, 0); 348666dabbb6SChristoph Hellwig if (IS_ERR(folio)) 34876b255391SAl Viro return ERR_PTR(-ECHILD); 34887459c149SMatthew Wilcox (Oracle) if (PageHWPoison(folio_page(folio, 0)) || 3489e4b57722SMatthew Wilcox (Oracle) !folio_test_uptodate(folio)) { 3490e4b57722SMatthew Wilcox (Oracle) folio_put(folio); 34916a6c9904SAl Viro return ERR_PTR(-ECHILD); 34926a6c9904SAl Viro } 34936a6c9904SAl Viro } else { 3494e4b57722SMatthew Wilcox (Oracle) error = shmem_get_folio(inode, 0, &folio, SGP_READ); 3495680baacbSAl Viro if (error) 3496680baacbSAl Viro return ERR_PTR(error); 3497e4b57722SMatthew Wilcox (Oracle) if (!folio) 3498a7605426SYang Shi return ERR_PTR(-ECHILD); 34997459c149SMatthew Wilcox (Oracle) if (PageHWPoison(folio_page(folio, 0))) { 3500e4b57722SMatthew Wilcox (Oracle) folio_unlock(folio); 3501e4b57722SMatthew Wilcox (Oracle) folio_put(folio); 3502a7605426SYang Shi return ERR_PTR(-ECHILD); 3503a7605426SYang Shi } 3504e4b57722SMatthew Wilcox (Oracle) folio_unlock(folio); 35051da177e4SLinus Torvalds } 3506e4b57722SMatthew Wilcox (Oracle) set_delayed_call(done, shmem_put_link, folio); 3507e4b57722SMatthew Wilcox (Oracle) return folio_address(folio); 35081da177e4SLinus Torvalds } 35091da177e4SLinus Torvalds 3510b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 3511e408e695STheodore Ts'o 3512e408e695STheodore Ts'o static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) 3513e408e695STheodore Ts'o { 3514e408e695STheodore Ts'o struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3515e408e695STheodore Ts'o 3516e408e695STheodore Ts'o fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); 3517e408e695STheodore Ts'o 3518e408e695STheodore Ts'o return 0; 3519e408e695STheodore Ts'o } 3520e408e695STheodore Ts'o 35218782a9aeSChristian Brauner static int shmem_fileattr_set(struct mnt_idmap *idmap, 3522e408e695STheodore Ts'o struct dentry *dentry, struct fileattr *fa) 3523e408e695STheodore Ts'o { 3524e408e695STheodore Ts'o struct inode *inode = d_inode(dentry); 3525e408e695STheodore Ts'o struct shmem_inode_info *info = SHMEM_I(inode); 3526e408e695STheodore Ts'o 3527e408e695STheodore Ts'o if (fileattr_has_fsx(fa)) 3528e408e695STheodore Ts'o return -EOPNOTSUPP; 3529cb241339SHugh Dickins if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) 3530cb241339SHugh Dickins return -EOPNOTSUPP; 3531e408e695STheodore Ts'o 3532e408e695STheodore Ts'o info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | 3533e408e695STheodore Ts'o (fa->flags & SHMEM_FL_USER_MODIFIABLE); 3534e408e695STheodore Ts'o 3535cb241339SHugh Dickins shmem_set_inode_flags(inode, info->fsflags); 3536e408e695STheodore Ts'o inode->i_ctime = current_time(inode); 353736f05cabSJeff Layton inode_inc_iversion(inode); 3538e408e695STheodore Ts'o return 0; 3539e408e695STheodore Ts'o } 3540e408e695STheodore Ts'o 3541b09e0fa4SEric Paris /* 3542b09e0fa4SEric Paris * Superblocks without xattr inode operations may get some security.* xattr 3543b09e0fa4SEric Paris * support from the LSM "for free". As soon as we have any other xattrs 3544b09e0fa4SEric Paris * like ACLs, we also need to implement the security.* handlers at 3545b09e0fa4SEric Paris * filesystem level, though. 3546b09e0fa4SEric Paris */ 3547b09e0fa4SEric Paris 35486d9d88d0SJarkko Sakkinen /* 35496d9d88d0SJarkko Sakkinen * Callback for security_inode_init_security() for acquiring xattrs. 
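 * (Editorial note: each name supplied by the LSM arrives without the
 * "security." prefix; it is prepended below before the xattr is queued
 * on the inode.)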
35506d9d88d0SJarkko Sakkinen */ 35516d9d88d0SJarkko Sakkinen static int shmem_initxattrs(struct inode *inode, 35526d9d88d0SJarkko Sakkinen const struct xattr *xattr_array, 35536d9d88d0SJarkko Sakkinen void *fs_info) 35546d9d88d0SJarkko Sakkinen { 35556d9d88d0SJarkko Sakkinen struct shmem_inode_info *info = SHMEM_I(inode); 35566d9d88d0SJarkko Sakkinen const struct xattr *xattr; 355738f38657SAristeu Rozanski struct simple_xattr *new_xattr; 35586d9d88d0SJarkko Sakkinen size_t len; 35596d9d88d0SJarkko Sakkinen 35606d9d88d0SJarkko Sakkinen for (xattr = xattr_array; xattr->name != NULL; xattr++) { 356138f38657SAristeu Rozanski new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 35626d9d88d0SJarkko Sakkinen if (!new_xattr) 35636d9d88d0SJarkko Sakkinen return -ENOMEM; 35646d9d88d0SJarkko Sakkinen 35656d9d88d0SJarkko Sakkinen len = strlen(xattr->name) + 1; 35666d9d88d0SJarkko Sakkinen new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 35676d9d88d0SJarkko Sakkinen GFP_KERNEL); 35686d9d88d0SJarkko Sakkinen if (!new_xattr->name) { 35693bef735aSChengguang Xu kvfree(new_xattr); 35706d9d88d0SJarkko Sakkinen return -ENOMEM; 35716d9d88d0SJarkko Sakkinen } 35726d9d88d0SJarkko Sakkinen 35736d9d88d0SJarkko Sakkinen memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 35746d9d88d0SJarkko Sakkinen XATTR_SECURITY_PREFIX_LEN); 35756d9d88d0SJarkko Sakkinen memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 35766d9d88d0SJarkko Sakkinen xattr->name, len); 35776d9d88d0SJarkko Sakkinen 35783b4c7bc0SChristian Brauner simple_xattr_add(&info->xattrs, new_xattr); 35796d9d88d0SJarkko Sakkinen } 35806d9d88d0SJarkko Sakkinen 35816d9d88d0SJarkko Sakkinen return 0; 35826d9d88d0SJarkko Sakkinen } 35836d9d88d0SJarkko Sakkinen 3584aa7c5241SAndreas Gruenbacher static int shmem_xattr_handler_get(const struct xattr_handler *handler, 3585b296821aSAl Viro struct dentry *unused, struct inode *inode, 3586b296821aSAl Viro const char *name, void *buffer, size_t size) 3587aa7c5241SAndreas Gruenbacher { 3588b296821aSAl Viro struct shmem_inode_info *info = SHMEM_I(inode); 3589aa7c5241SAndreas Gruenbacher 3590aa7c5241SAndreas Gruenbacher name = xattr_full_name(handler, name); 3591aa7c5241SAndreas Gruenbacher return simple_xattr_get(&info->xattrs, name, buffer, size); 3592aa7c5241SAndreas Gruenbacher } 3593aa7c5241SAndreas Gruenbacher 3594aa7c5241SAndreas Gruenbacher static int shmem_xattr_handler_set(const struct xattr_handler *handler, 359539f60c1cSChristian Brauner struct mnt_idmap *idmap, 359659301226SAl Viro struct dentry *unused, struct inode *inode, 359759301226SAl Viro const char *name, const void *value, 359859301226SAl Viro size_t size, int flags) 3599aa7c5241SAndreas Gruenbacher { 360059301226SAl Viro struct shmem_inode_info *info = SHMEM_I(inode); 360136f05cabSJeff Layton int err; 3602aa7c5241SAndreas Gruenbacher 3603aa7c5241SAndreas Gruenbacher name = xattr_full_name(handler, name); 360436f05cabSJeff Layton err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); 360536f05cabSJeff Layton if (!err) { 360636f05cabSJeff Layton inode->i_ctime = current_time(inode); 360736f05cabSJeff Layton inode_inc_iversion(inode); 360836f05cabSJeff Layton } 360936f05cabSJeff Layton return err; 3610aa7c5241SAndreas Gruenbacher } 3611aa7c5241SAndreas Gruenbacher 3612aa7c5241SAndreas Gruenbacher static const struct xattr_handler shmem_security_xattr_handler = { 3613aa7c5241SAndreas Gruenbacher .prefix = XATTR_SECURITY_PREFIX, 3614aa7c5241SAndreas Gruenbacher .get = shmem_xattr_handler_get, 3615aa7c5241SAndreas Gruenbacher 
.set = shmem_xattr_handler_set, 3616aa7c5241SAndreas Gruenbacher }; 3617aa7c5241SAndreas Gruenbacher 3618aa7c5241SAndreas Gruenbacher static const struct xattr_handler shmem_trusted_xattr_handler = { 3619aa7c5241SAndreas Gruenbacher .prefix = XATTR_TRUSTED_PREFIX, 3620aa7c5241SAndreas Gruenbacher .get = shmem_xattr_handler_get, 3621aa7c5241SAndreas Gruenbacher .set = shmem_xattr_handler_set, 3622aa7c5241SAndreas Gruenbacher }; 3623aa7c5241SAndreas Gruenbacher 3624b09e0fa4SEric Paris static const struct xattr_handler *shmem_xattr_handlers[] = { 3625aa7c5241SAndreas Gruenbacher &shmem_security_xattr_handler, 3626aa7c5241SAndreas Gruenbacher &shmem_trusted_xattr_handler, 3627b09e0fa4SEric Paris NULL 3628b09e0fa4SEric Paris }; 3629b09e0fa4SEric Paris 3630b09e0fa4SEric Paris static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 3631b09e0fa4SEric Paris { 363275c3cfa8SDavid Howells struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3633786534b9SAndreas Gruenbacher return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); 3634b09e0fa4SEric Paris } 3635b09e0fa4SEric Paris #endif /* CONFIG_TMPFS_XATTR */ 3636b09e0fa4SEric Paris 363769f07ec9SHugh Dickins static const struct inode_operations shmem_short_symlink_operations = { 3638f7cd16a5SXavier Roche .getattr = shmem_getattr, 3639e09764cfSCarlos Maiolino .setattr = shmem_setattr, 36406b255391SAl Viro .get_link = simple_get_link, 3641b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 3642b09e0fa4SEric Paris .listxattr = shmem_listxattr, 3643b09e0fa4SEric Paris #endif 36441da177e4SLinus Torvalds }; 36451da177e4SLinus Torvalds 364692e1d5beSArjan van de Ven static const struct inode_operations shmem_symlink_inode_operations = { 3647f7cd16a5SXavier Roche .getattr = shmem_getattr, 3648e09764cfSCarlos Maiolino .setattr = shmem_setattr, 36496b255391SAl Viro .get_link = shmem_get_link, 3650b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 3651b09e0fa4SEric Paris .listxattr = shmem_listxattr, 365239f0247dSAndreas Gruenbacher #endif 3653b09e0fa4SEric Paris }; 365439f0247dSAndreas Gruenbacher 365591828a40SDavid M. Grimes static struct dentry *shmem_get_parent(struct dentry *child) 365691828a40SDavid M. Grimes { 365791828a40SDavid M. Grimes return ERR_PTR(-ESTALE); 365891828a40SDavid M. Grimes } 365991828a40SDavid M. Grimes 366091828a40SDavid M. Grimes static int shmem_match(struct inode *ino, void *vfh) 366191828a40SDavid M. Grimes { 366291828a40SDavid M. Grimes __u32 *fh = vfh; 366391828a40SDavid M. Grimes __u64 inum = fh[2]; 366491828a40SDavid M. Grimes inum = (inum << 32) | fh[1]; 366591828a40SDavid M. Grimes return ino->i_ino == inum && fh[0] == ino->i_generation; 366691828a40SDavid M. Grimes } 366791828a40SDavid M. Grimes 366812ba780dSAmir Goldstein /* Find any alias of inode, but prefer a hashed alias */ 366912ba780dSAmir Goldstein static struct dentry *shmem_find_alias(struct inode *inode) 367012ba780dSAmir Goldstein { 367112ba780dSAmir Goldstein struct dentry *alias = d_find_alias(inode); 367212ba780dSAmir Goldstein 367312ba780dSAmir Goldstein return alias ?: d_find_any_alias(inode); 367412ba780dSAmir Goldstein } 367512ba780dSAmir Goldstein 367612ba780dSAmir Goldstein 3677480b116cSChristoph Hellwig static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 3678480b116cSChristoph Hellwig struct fid *fid, int fh_len, int fh_type) 367991828a40SDavid M. Grimes { 368091828a40SDavid M. 
Grimes struct inode *inode; 3681480b116cSChristoph Hellwig struct dentry *dentry = NULL; 368235c2a7f4SHugh Dickins u64 inum; 368391828a40SDavid M. Grimes 3684480b116cSChristoph Hellwig if (fh_len < 3) 3685480b116cSChristoph Hellwig return NULL; 3686480b116cSChristoph Hellwig 368735c2a7f4SHugh Dickins inum = fid->raw[2]; 368835c2a7f4SHugh Dickins inum = (inum << 32) | fid->raw[1]; 368935c2a7f4SHugh Dickins 3690480b116cSChristoph Hellwig inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 3691480b116cSChristoph Hellwig shmem_match, fid->raw); 369291828a40SDavid M. Grimes if (inode) { 369312ba780dSAmir Goldstein dentry = shmem_find_alias(inode); 369491828a40SDavid M. Grimes iput(inode); 369591828a40SDavid M. Grimes } 369691828a40SDavid M. Grimes 3697480b116cSChristoph Hellwig return dentry; 369891828a40SDavid M. Grimes } 369991828a40SDavid M. Grimes 3700b0b0382bSAl Viro static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 3701b0b0382bSAl Viro struct inode *parent) 370291828a40SDavid M. Grimes { 37035fe0c237SAneesh Kumar K.V if (*len < 3) { 37045fe0c237SAneesh Kumar K.V *len = 3; 370594e07a75SNamjae Jeon return FILEID_INVALID; 37065fe0c237SAneesh Kumar K.V } 370791828a40SDavid M. Grimes 37081d3382cbSAl Viro if (inode_unhashed(inode)) { 370991828a40SDavid M. Grimes /* Unfortunately insert_inode_hash is not idempotent, 371091828a40SDavid M. Grimes * so as we hash inodes here rather than at creation 371191828a40SDavid M. Grimes * time, we need a lock to ensure we only try 371291828a40SDavid M. Grimes * to do it once 371391828a40SDavid M. Grimes */ 371491828a40SDavid M. Grimes static DEFINE_SPINLOCK(lock); 371591828a40SDavid M. Grimes spin_lock(&lock); 37161d3382cbSAl Viro if (inode_unhashed(inode)) 371791828a40SDavid M. Grimes __insert_inode_hash(inode, 371891828a40SDavid M. Grimes inode->i_ino + inode->i_generation); 371991828a40SDavid M. Grimes spin_unlock(&lock); 372091828a40SDavid M. Grimes } 372191828a40SDavid M. Grimes 372291828a40SDavid M. Grimes fh[0] = inode->i_generation; 372391828a40SDavid M. Grimes fh[1] = inode->i_ino; 372491828a40SDavid M. Grimes fh[2] = ((__u64)inode->i_ino) >> 32; 372591828a40SDavid M. Grimes 372691828a40SDavid M. Grimes *len = 3; 372791828a40SDavid M. Grimes return 1; 372891828a40SDavid M. Grimes } 372991828a40SDavid M. Grimes 373039655164SChristoph Hellwig static const struct export_operations shmem_export_ops = { 373191828a40SDavid M. Grimes .get_parent = shmem_get_parent, 373291828a40SDavid M. Grimes .encode_fh = shmem_encode_fh, 3733480b116cSChristoph Hellwig .fh_to_dentry = shmem_fh_to_dentry, 373491828a40SDavid M. Grimes }; 373591828a40SDavid M. 
Grimes 3736626c3920SAl Viro enum shmem_param { 3737626c3920SAl Viro Opt_gid, 3738626c3920SAl Viro Opt_huge, 3739626c3920SAl Viro Opt_mode, 3740626c3920SAl Viro Opt_mpol, 3741626c3920SAl Viro Opt_nr_blocks, 3742626c3920SAl Viro Opt_nr_inodes, 3743626c3920SAl Viro Opt_size, 3744626c3920SAl Viro Opt_uid, 3745ea3271f7SChris Down Opt_inode32, 3746ea3271f7SChris Down Opt_inode64, 37472c6efe9cSLuis Chamberlain Opt_noswap, 3748e09764cfSCarlos Maiolino Opt_quota, 3749e09764cfSCarlos Maiolino Opt_usrquota, 3750e09764cfSCarlos Maiolino Opt_grpquota, 3751de4c0e7cSLukas Czerner Opt_usrquota_block_hardlimit, 3752de4c0e7cSLukas Czerner Opt_usrquota_inode_hardlimit, 3753de4c0e7cSLukas Czerner Opt_grpquota_block_hardlimit, 3754de4c0e7cSLukas Czerner Opt_grpquota_inode_hardlimit, 3755626c3920SAl Viro }; 37561da177e4SLinus Torvalds 37575eede625SAl Viro static const struct constant_table shmem_param_enums_huge[] = { 37582710c957SAl Viro {"never", SHMEM_HUGE_NEVER }, 37592710c957SAl Viro {"always", SHMEM_HUGE_ALWAYS }, 37602710c957SAl Viro {"within_size", SHMEM_HUGE_WITHIN_SIZE }, 37612710c957SAl Viro {"advise", SHMEM_HUGE_ADVISE }, 37622710c957SAl Viro {} 37632710c957SAl Viro }; 37642710c957SAl Viro 3765d7167b14SAl Viro const struct fs_parameter_spec shmem_fs_parameters[] = { 3766626c3920SAl Viro fsparam_u32 ("gid", Opt_gid), 37672710c957SAl Viro fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), 3768626c3920SAl Viro fsparam_u32oct("mode", Opt_mode), 3769626c3920SAl Viro fsparam_string("mpol", Opt_mpol), 3770626c3920SAl Viro fsparam_string("nr_blocks", Opt_nr_blocks), 3771626c3920SAl Viro fsparam_string("nr_inodes", Opt_nr_inodes), 3772626c3920SAl Viro fsparam_string("size", Opt_size), 3773626c3920SAl Viro fsparam_u32 ("uid", Opt_uid), 3774ea3271f7SChris Down fsparam_flag ("inode32", Opt_inode32), 3775ea3271f7SChris Down fsparam_flag ("inode64", Opt_inode64), 37762c6efe9cSLuis Chamberlain fsparam_flag ("noswap", Opt_noswap), 3777e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 3778e09764cfSCarlos Maiolino fsparam_flag ("quota", Opt_quota), 3779e09764cfSCarlos Maiolino fsparam_flag ("usrquota", Opt_usrquota), 3780e09764cfSCarlos Maiolino fsparam_flag ("grpquota", Opt_grpquota), 3781de4c0e7cSLukas Czerner fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), 3782de4c0e7cSLukas Czerner fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), 3783de4c0e7cSLukas Czerner fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), 3784de4c0e7cSLukas Czerner fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), 3785e09764cfSCarlos Maiolino #endif 3786626c3920SAl Viro {} 3787626c3920SAl Viro }; 3788626c3920SAl Viro 3789f3235626SDavid Howells static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) 3790626c3920SAl Viro { 3791f3235626SDavid Howells struct shmem_options *ctx = fc->fs_private; 3792626c3920SAl Viro struct fs_parse_result result; 3793e04dc423SAl Viro unsigned long long size; 3794626c3920SAl Viro char *rest; 3795626c3920SAl Viro int opt; 3796626c3920SAl Viro 3797d7167b14SAl Viro opt = fs_parse(fc, shmem_fs_parameters, param, &result); 3798f3235626SDavid Howells if (opt < 0) 3799626c3920SAl Viro return opt; 3800626c3920SAl Viro 3801626c3920SAl Viro switch (opt) { 3802626c3920SAl Viro case Opt_size: 3803626c3920SAl Viro size = memparse(param->string, &rest); 3804e04dc423SAl Viro if (*rest == '%') { 3805e04dc423SAl Viro size <<= PAGE_SHIFT; 3806e04dc423SAl Viro size *= totalram_pages(); 3807e04dc423SAl Viro 
do_div(size, 100); 3808e04dc423SAl Viro rest++; 3809e04dc423SAl Viro } 3810e04dc423SAl Viro if (*rest) 3811626c3920SAl Viro goto bad_value; 3812e04dc423SAl Viro ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); 3813e04dc423SAl Viro ctx->seen |= SHMEM_SEEN_BLOCKS; 3814626c3920SAl Viro break; 3815626c3920SAl Viro case Opt_nr_blocks: 3816626c3920SAl Viro ctx->blocks = memparse(param->string, &rest); 38170c98c8e1SZhaoLong Wang if (*rest || ctx->blocks > S64_MAX) 3818626c3920SAl Viro goto bad_value; 3819e04dc423SAl Viro ctx->seen |= SHMEM_SEEN_BLOCKS; 3820626c3920SAl Viro break; 3821626c3920SAl Viro case Opt_nr_inodes: 3822626c3920SAl Viro ctx->inodes = memparse(param->string, &rest); 3823e04dc423SAl Viro if (*rest) 3824626c3920SAl Viro goto bad_value; 3825e04dc423SAl Viro ctx->seen |= SHMEM_SEEN_INODES; 3826626c3920SAl Viro break; 3827626c3920SAl Viro case Opt_mode: 3828626c3920SAl Viro ctx->mode = result.uint_32 & 07777; 3829626c3920SAl Viro break; 3830626c3920SAl Viro case Opt_uid: 3831626c3920SAl Viro ctx->uid = make_kuid(current_user_ns(), result.uint_32); 3832e04dc423SAl Viro if (!uid_valid(ctx->uid)) 3833626c3920SAl Viro goto bad_value; 3834626c3920SAl Viro break; 3835626c3920SAl Viro case Opt_gid: 3836626c3920SAl Viro ctx->gid = make_kgid(current_user_ns(), result.uint_32); 3837e04dc423SAl Viro if (!gid_valid(ctx->gid)) 3838626c3920SAl Viro goto bad_value; 3839626c3920SAl Viro break; 3840626c3920SAl Viro case Opt_huge: 3841626c3920SAl Viro ctx->huge = result.uint_32; 3842626c3920SAl Viro if (ctx->huge != SHMEM_HUGE_NEVER && 3843396bcc52SMatthew Wilcox (Oracle) !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 3844626c3920SAl Viro has_transparent_hugepage())) 3845626c3920SAl Viro goto unsupported_parameter; 3846e04dc423SAl Viro ctx->seen |= SHMEM_SEEN_HUGE; 3847626c3920SAl Viro break; 3848626c3920SAl Viro case Opt_mpol: 3849626c3920SAl Viro if (IS_ENABLED(CONFIG_NUMA)) { 3850e04dc423SAl Viro mpol_put(ctx->mpol); 3851e04dc423SAl Viro ctx->mpol = NULL; 3852626c3920SAl Viro if (mpol_parse_str(param->string, &ctx->mpol)) 3853626c3920SAl Viro goto bad_value; 3854626c3920SAl Viro break; 3855626c3920SAl Viro } 3856626c3920SAl Viro goto unsupported_parameter; 3857ea3271f7SChris Down case Opt_inode32: 3858ea3271f7SChris Down ctx->full_inums = false; 3859ea3271f7SChris Down ctx->seen |= SHMEM_SEEN_INUMS; 3860ea3271f7SChris Down break; 3861ea3271f7SChris Down case Opt_inode64: 3862ea3271f7SChris Down if (sizeof(ino_t) < 8) { 3863ea3271f7SChris Down return invalfc(fc, 3864ea3271f7SChris Down "Cannot use inode64 with <64bit inums in kernel\n"); 3865ea3271f7SChris Down } 3866ea3271f7SChris Down ctx->full_inums = true; 3867ea3271f7SChris Down ctx->seen |= SHMEM_SEEN_INUMS; 3868ea3271f7SChris Down break; 38692c6efe9cSLuis Chamberlain case Opt_noswap: 387001106e14SChristian Brauner if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { 387101106e14SChristian Brauner return invalfc(fc, 387201106e14SChristian Brauner "Turning off swap in unprivileged tmpfs mounts unsupported"); 387301106e14SChristian Brauner } 38742c6efe9cSLuis Chamberlain ctx->noswap = true; 38752c6efe9cSLuis Chamberlain ctx->seen |= SHMEM_SEEN_NOSWAP; 38762c6efe9cSLuis Chamberlain break; 3877e09764cfSCarlos Maiolino case Opt_quota: 3878e09764cfSCarlos Maiolino if (fc->user_ns != &init_user_ns) 3879e09764cfSCarlos Maiolino return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 3880e09764cfSCarlos Maiolino ctx->seen |= SHMEM_SEEN_QUOTA; 3881e09764cfSCarlos Maiolino ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); 
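		/*
		 * (Illustrative, not from the original source.) "-o quota" is
		 * equivalent to "-o usrquota,grpquota": e.g.
		 *   mount -t tmpfs -o quota tmpfs /mnt
		 * enables both user and group quota tracking.
		 */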
3882e09764cfSCarlos Maiolino break; 3883e09764cfSCarlos Maiolino case Opt_usrquota: 3884e09764cfSCarlos Maiolino if (fc->user_ns != &init_user_ns) 3885e09764cfSCarlos Maiolino return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 3886e09764cfSCarlos Maiolino ctx->seen |= SHMEM_SEEN_QUOTA; 3887e09764cfSCarlos Maiolino ctx->quota_types |= QTYPE_MASK_USR; 3888e09764cfSCarlos Maiolino break; 3889e09764cfSCarlos Maiolino case Opt_grpquota: 3890e09764cfSCarlos Maiolino if (fc->user_ns != &init_user_ns) 3891e09764cfSCarlos Maiolino return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 3892e09764cfSCarlos Maiolino ctx->seen |= SHMEM_SEEN_QUOTA; 3893e09764cfSCarlos Maiolino ctx->quota_types |= QTYPE_MASK_GRP; 3894e09764cfSCarlos Maiolino break; 3895de4c0e7cSLukas Czerner case Opt_usrquota_block_hardlimit: 3896de4c0e7cSLukas Czerner size = memparse(param->string, &rest); 3897de4c0e7cSLukas Czerner if (*rest || !size) 3898de4c0e7cSLukas Czerner goto bad_value; 3899de4c0e7cSLukas Czerner if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) 3900de4c0e7cSLukas Czerner return invalfc(fc, 3901de4c0e7cSLukas Czerner "User quota block hardlimit too large."); 3902de4c0e7cSLukas Czerner ctx->qlimits.usrquota_bhardlimit = size; 3903de4c0e7cSLukas Czerner break; 3904de4c0e7cSLukas Czerner case Opt_grpquota_block_hardlimit: 3905de4c0e7cSLukas Czerner size = memparse(param->string, &rest); 3906de4c0e7cSLukas Czerner if (*rest || !size) 3907de4c0e7cSLukas Czerner goto bad_value; 3908de4c0e7cSLukas Czerner if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) 3909de4c0e7cSLukas Czerner return invalfc(fc, 3910de4c0e7cSLukas Czerner "Group quota block hardlimit too large."); 3911de4c0e7cSLukas Czerner ctx->qlimits.grpquota_bhardlimit = size; 3912de4c0e7cSLukas Czerner break; 3913de4c0e7cSLukas Czerner case Opt_usrquota_inode_hardlimit: 3914de4c0e7cSLukas Czerner size = memparse(param->string, &rest); 3915de4c0e7cSLukas Czerner if (*rest || !size) 3916de4c0e7cSLukas Czerner goto bad_value; 3917de4c0e7cSLukas Czerner if (size > SHMEM_QUOTA_MAX_INO_LIMIT) 3918de4c0e7cSLukas Czerner return invalfc(fc, 3919de4c0e7cSLukas Czerner "User quota inode hardlimit too large."); 3920de4c0e7cSLukas Czerner ctx->qlimits.usrquota_ihardlimit = size; 3921de4c0e7cSLukas Czerner break; 3922de4c0e7cSLukas Czerner case Opt_grpquota_inode_hardlimit: 3923de4c0e7cSLukas Czerner size = memparse(param->string, &rest); 3924de4c0e7cSLukas Czerner if (*rest || !size) 3925de4c0e7cSLukas Czerner goto bad_value; 3926de4c0e7cSLukas Czerner if (size > SHMEM_QUOTA_MAX_INO_LIMIT) 3927de4c0e7cSLukas Czerner return invalfc(fc, 3928de4c0e7cSLukas Czerner "Group quota inode hardlimit too large."); 3929de4c0e7cSLukas Czerner ctx->qlimits.grpquota_ihardlimit = size; 3930de4c0e7cSLukas Czerner break; 3931e04dc423SAl Viro } 3932e04dc423SAl Viro return 0; 3933e04dc423SAl Viro 3934626c3920SAl Viro unsupported_parameter: 3935f35aa2bcSAl Viro return invalfc(fc, "Unsupported parameter '%s'", param->key); 3936626c3920SAl Viro bad_value: 3937f35aa2bcSAl Viro return invalfc(fc, "Bad value for '%s'", param->key); 3938e04dc423SAl Viro } 3939e04dc423SAl Viro 3940f3235626SDavid Howells static int shmem_parse_options(struct fs_context *fc, void *data) 3941e04dc423SAl Viro { 3942f3235626SDavid Howells char *options = data; 3943f3235626SDavid Howells 394433f37c64SAl Viro if (options) { 394533f37c64SAl Viro int err = security_sb_eat_lsm_opts(options, &fc->security); 394633f37c64SAl Viro if (err) 394733f37c64SAl Viro return err; 394833f37c64SAl Viro } 394933f37c64SAl 
Viro 3950b00dc3adSHugh Dickins while (options != NULL) { 3951626c3920SAl Viro char *this_char = options; 3952b00dc3adSHugh Dickins for (;;) { 3953b00dc3adSHugh Dickins /* 3954b00dc3adSHugh Dickins * NUL-terminate this option: unfortunately, 3955b00dc3adSHugh Dickins * mount options form a comma-separated list, 3956b00dc3adSHugh Dickins * but mpol's nodelist may also contain commas. 3957b00dc3adSHugh Dickins */ 3958b00dc3adSHugh Dickins options = strchr(options, ','); 3959b00dc3adSHugh Dickins if (options == NULL) 3960b00dc3adSHugh Dickins break; 3961b00dc3adSHugh Dickins options++; 3962b00dc3adSHugh Dickins if (!isdigit(*options)) { 3963b00dc3adSHugh Dickins options[-1] = '\0'; 3964b00dc3adSHugh Dickins break; 3965b00dc3adSHugh Dickins } 3966b00dc3adSHugh Dickins } 3967626c3920SAl Viro if (*this_char) { 3968626c3920SAl Viro char *value = strchr(this_char, '='); 3969f3235626SDavid Howells size_t len = 0; 3970626c3920SAl Viro int err; 3971626c3920SAl Viro 3972626c3920SAl Viro if (value) { 3973626c3920SAl Viro *value++ = '\0'; 3974f3235626SDavid Howells len = strlen(value); 39751da177e4SLinus Torvalds } 3976f3235626SDavid Howells err = vfs_parse_fs_string(fc, this_char, value, len); 3977f3235626SDavid Howells if (err < 0) 3978f3235626SDavid Howells return err; 39791da177e4SLinus Torvalds } 3980626c3920SAl Viro } 39811da177e4SLinus Torvalds return 0; 39821da177e4SLinus Torvalds } 39831da177e4SLinus Torvalds 3984f3235626SDavid Howells /* 3985f3235626SDavid Howells * Reconfigure a shmem filesystem. 3986f3235626SDavid Howells * 3987f3235626SDavid Howells * Note that we disallow change from limited->unlimited blocks/inodes while any 3988f3235626SDavid Howells * are in use; but we must separately disallow unlimited->limited, because in 3989f3235626SDavid Howells * that case we have no record of how much is already in use. 
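 * For example (illustrative): an instance mounted without a size limit
 * cannot later be remounted with "size=2g"; that is the retroactive-limit
 * case rejected just below with "Cannot retroactively limit size".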
3990f3235626SDavid Howells */ 3991f3235626SDavid Howells static int shmem_reconfigure(struct fs_context *fc) 39921da177e4SLinus Torvalds { 3993f3235626SDavid Howells struct shmem_options *ctx = fc->fs_private; 3994f3235626SDavid Howells struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); 39950edd73b3SHugh Dickins unsigned long inodes; 3996bf11b9a8SSebastian Andrzej Siewior struct mempolicy *mpol = NULL; 3997f3235626SDavid Howells const char *err; 39980edd73b3SHugh Dickins 3999bf11b9a8SSebastian Andrzej Siewior raw_spin_lock(&sbinfo->stat_lock); 40000edd73b3SHugh Dickins inodes = sbinfo->max_inodes - sbinfo->free_inodes; 40010c98c8e1SZhaoLong Wang 4002f3235626SDavid Howells if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { 4003f3235626SDavid Howells if (!sbinfo->max_blocks) { 4004f3235626SDavid Howells err = "Cannot retroactively limit size"; 40050edd73b3SHugh Dickins goto out; 40060b5071ddSAl Viro } 4007f3235626SDavid Howells if (percpu_counter_compare(&sbinfo->used_blocks, 4008f3235626SDavid Howells ctx->blocks) > 0) { 4009f3235626SDavid Howells err = "Too small a size for current use"; 40100b5071ddSAl Viro goto out; 4011f3235626SDavid Howells } 4012f3235626SDavid Howells } 4013f3235626SDavid Howells if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { 4014f3235626SDavid Howells if (!sbinfo->max_inodes) { 4015f3235626SDavid Howells err = "Cannot retroactively limit inodes"; 40160b5071ddSAl Viro goto out; 40170b5071ddSAl Viro } 4018f3235626SDavid Howells if (ctx->inodes < inodes) { 4019f3235626SDavid Howells err = "Too few inodes for current use"; 4020f3235626SDavid Howells goto out; 4021f3235626SDavid Howells } 4022f3235626SDavid Howells } 40230edd73b3SHugh Dickins 4024ea3271f7SChris Down if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && 4025ea3271f7SChris Down sbinfo->next_ino > UINT_MAX) { 4026ea3271f7SChris Down err = "Current inum too high to switch to 32-bit inums"; 4027ea3271f7SChris Down goto out; 4028ea3271f7SChris Down } 40292c6efe9cSLuis Chamberlain if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { 40302c6efe9cSLuis Chamberlain err = "Cannot disable swap on remount"; 40312c6efe9cSLuis Chamberlain goto out; 40322c6efe9cSLuis Chamberlain } 40332c6efe9cSLuis Chamberlain if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { 40342c6efe9cSLuis Chamberlain err = "Cannot enable swap on remount if it was disabled on first mount"; 40352c6efe9cSLuis Chamberlain goto out; 40362c6efe9cSLuis Chamberlain } 4037ea3271f7SChris Down 4038e09764cfSCarlos Maiolino if (ctx->seen & SHMEM_SEEN_QUOTA && 4039e09764cfSCarlos Maiolino !sb_any_quota_loaded(fc->root->d_sb)) { 4040e09764cfSCarlos Maiolino err = "Cannot enable quota on remount"; 4041e09764cfSCarlos Maiolino goto out; 4042e09764cfSCarlos Maiolino } 4043e09764cfSCarlos Maiolino 4044de4c0e7cSLukas Czerner #ifdef CONFIG_TMPFS_QUOTA 4045de4c0e7cSLukas Czerner #define CHANGED_LIMIT(name) \ 4046de4c0e7cSLukas Czerner (ctx->qlimits.name## hardlimit && \ 4047de4c0e7cSLukas Czerner (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) 4048de4c0e7cSLukas Czerner 4049de4c0e7cSLukas Czerner if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || 4050de4c0e7cSLukas Czerner CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { 4051de4c0e7cSLukas Czerner err = "Cannot change global quota limit on remount"; 4052de4c0e7cSLukas Czerner goto out; 4053de4c0e7cSLukas Czerner } 4054de4c0e7cSLukas Czerner #endif /* CONFIG_TMPFS_QUOTA */ 4055de4c0e7cSLukas Czerner 4056f3235626SDavid 
Howells if (ctx->seen & SHMEM_SEEN_HUGE) 4057f3235626SDavid Howells sbinfo->huge = ctx->huge; 4058ea3271f7SChris Down if (ctx->seen & SHMEM_SEEN_INUMS) 4059ea3271f7SChris Down sbinfo->full_inums = ctx->full_inums; 4060f3235626SDavid Howells if (ctx->seen & SHMEM_SEEN_BLOCKS) 4061f3235626SDavid Howells sbinfo->max_blocks = ctx->blocks; 4062f3235626SDavid Howells if (ctx->seen & SHMEM_SEEN_INODES) { 4063f3235626SDavid Howells sbinfo->max_inodes = ctx->inodes; 4064f3235626SDavid Howells sbinfo->free_inodes = ctx->inodes - inodes; 40650b5071ddSAl Viro } 406671fe804bSLee Schermerhorn 40675f00110fSGreg Thelen /* 40685f00110fSGreg Thelen * Preserve previous mempolicy unless mpol remount option was specified. 40695f00110fSGreg Thelen */ 4070f3235626SDavid Howells if (ctx->mpol) { 4071bf11b9a8SSebastian Andrzej Siewior mpol = sbinfo->mpol; 4072f3235626SDavid Howells sbinfo->mpol = ctx->mpol; /* transfers initial ref */ 4073f3235626SDavid Howells ctx->mpol = NULL; 40745f00110fSGreg Thelen } 40752c6efe9cSLuis Chamberlain 40762c6efe9cSLuis Chamberlain if (ctx->noswap) 40772c6efe9cSLuis Chamberlain sbinfo->noswap = true; 40782c6efe9cSLuis Chamberlain 4079bf11b9a8SSebastian Andrzej Siewior raw_spin_unlock(&sbinfo->stat_lock); 4080bf11b9a8SSebastian Andrzej Siewior mpol_put(mpol); 4081f3235626SDavid Howells return 0; 40820edd73b3SHugh Dickins out: 4083bf11b9a8SSebastian Andrzej Siewior raw_spin_unlock(&sbinfo->stat_lock); 4084f35aa2bcSAl Viro return invalfc(fc, "%s", err); 40851da177e4SLinus Torvalds } 4086680d794bSakpm@linux-foundation.org 408734c80b1dSAl Viro static int shmem_show_options(struct seq_file *seq, struct dentry *root) 4088680d794bSakpm@linux-foundation.org { 408934c80b1dSAl Viro struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 4090283ebdeeSTu Jinjiang struct mempolicy *mpol; 4091680d794bSakpm@linux-foundation.org 4092680d794bSakpm@linux-foundation.org if (sbinfo->max_blocks != shmem_default_max_blocks()) 4093680d794bSakpm@linux-foundation.org seq_printf(seq, ",size=%luk", 409409cbfeafSKirill A. Shutemov sbinfo->max_blocks << (PAGE_SHIFT - 10)); 4095680d794bSakpm@linux-foundation.org if (sbinfo->max_inodes != shmem_default_max_inodes()) 4096680d794bSakpm@linux-foundation.org seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 40970825a6f9SJoe Perches if (sbinfo->mode != (0777 | S_ISVTX)) 409809208d15SAl Viro seq_printf(seq, ",mode=%03ho", sbinfo->mode); 40998751e039SEric W. Biederman if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 41008751e039SEric W. Biederman seq_printf(seq, ",uid=%u", 41018751e039SEric W. Biederman from_kuid_munged(&init_user_ns, sbinfo->uid)); 41028751e039SEric W. Biederman if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 41038751e039SEric W. Biederman seq_printf(seq, ",gid=%u", 41048751e039SEric W. Biederman from_kgid_munged(&init_user_ns, sbinfo->gid)); 4105ea3271f7SChris Down 4106ea3271f7SChris Down /* 4107ea3271f7SChris Down * Showing inode{64,32} might be useful even if it's the system default, 4108ea3271f7SChris Down * since then people don't have to resort to checking both here and 4109ea3271f7SChris Down * /proc/config.gz to confirm 64-bit inums were successfully applied 4110ea3271f7SChris Down * (which may not even exist if IKCONFIG_PROC isn't enabled). 4111ea3271f7SChris Down * 4112ea3271f7SChris Down * We hide it when inode64 isn't the default and we are using 32-bit 4113ea3271f7SChris Down * inodes, since that probably just means the feature isn't even under 4114ea3271f7SChris Down * consideration. 
4115ea3271f7SChris Down * 4116ea3271f7SChris Down * As such: 4117ea3271f7SChris Down * 4118ea3271f7SChris Down * +-----------------+-----------------+ 4119ea3271f7SChris Down * | TMPFS_INODE64=y | TMPFS_INODE64=n | 4120ea3271f7SChris Down * +------------------+-----------------+-----------------+ 4121ea3271f7SChris Down * | full_inums=true | show | show | 4122ea3271f7SChris Down * | full_inums=false | show | hide | 4123ea3271f7SChris Down * +------------------+-----------------+-----------------+ 4124ea3271f7SChris Down * 4125ea3271f7SChris Down */ 4126ea3271f7SChris Down if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) 4127ea3271f7SChris Down seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); 4128396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_TRANSPARENT_HUGEPAGE 41295a6e75f8SKirill A. Shutemov /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ 41305a6e75f8SKirill A. Shutemov if (sbinfo->huge) 41315a6e75f8SKirill A. Shutemov seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); 41325a6e75f8SKirill A. Shutemov #endif 4133283ebdeeSTu Jinjiang mpol = shmem_get_sbmpol(sbinfo); 4134283ebdeeSTu Jinjiang shmem_show_mpol(seq, mpol); 4135283ebdeeSTu Jinjiang mpol_put(mpol); 41362c6efe9cSLuis Chamberlain if (sbinfo->noswap) 41372c6efe9cSLuis Chamberlain seq_printf(seq, ",noswap"); 4138680d794bSakpm@linux-foundation.org return 0; 4139680d794bSakpm@linux-foundation.org } 41409183df25SDavid Herrmann 4141680d794bSakpm@linux-foundation.org #endif /* CONFIG_TMPFS */ 41421da177e4SLinus Torvalds 41431da177e4SLinus Torvalds static void shmem_put_super(struct super_block *sb) 41441da177e4SLinus Torvalds { 4145602586a8SHugh Dickins struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 4146602586a8SHugh Dickins 4147e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 4148e09764cfSCarlos Maiolino shmem_disable_quotas(sb); 4149e09764cfSCarlos Maiolino #endif 4150e809d5f0SChris Down free_percpu(sbinfo->ino_batch); 4151602586a8SHugh Dickins percpu_counter_destroy(&sbinfo->used_blocks); 415249cd0a5cSGreg Thelen mpol_put(sbinfo->mpol); 4153602586a8SHugh Dickins kfree(sbinfo); 41541da177e4SLinus Torvalds sb->s_fs_info = NULL; 41551da177e4SLinus Torvalds } 41561da177e4SLinus Torvalds 4157f3235626SDavid Howells static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) 41581da177e4SLinus Torvalds { 4159f3235626SDavid Howells struct shmem_options *ctx = fc->fs_private; 41601da177e4SLinus Torvalds struct inode *inode; 41610edd73b3SHugh Dickins struct shmem_sb_info *sbinfo; 416271480663SCarlos Maiolino int error = -ENOMEM; 4163680d794bSakpm@linux-foundation.org 4164680d794bSakpm@linux-foundation.org /* Round up to L1_CACHE_BYTES to resist false sharing */ 4165425fbf04SPekka Enberg sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 4166680d794bSakpm@linux-foundation.org L1_CACHE_BYTES), GFP_KERNEL); 4167680d794bSakpm@linux-foundation.org if (!sbinfo) 416871480663SCarlos Maiolino return error; 4169680d794bSakpm@linux-foundation.org 4170680d794bSakpm@linux-foundation.org sb->s_fs_info = sbinfo; 41711da177e4SLinus Torvalds 41720edd73b3SHugh Dickins #ifdef CONFIG_TMPFS 41731da177e4SLinus Torvalds /* 41741da177e4SLinus Torvalds * Per default we only allow half of the physical ram per 41751da177e4SLinus Torvalds * tmpfs instance, limiting inodes to one per page of lowmem; 41761da177e4SLinus Torvalds * but the internal instance is left unlimited. 
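 * (Illustrative: on a machine with 8G of RAM the implicit default is
 * therefore size=4g; both limits can be overridden with mount options.)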
41771da177e4SLinus Torvalds */ 41781751e8a6SLinus Torvalds if (!(sb->s_flags & SB_KERNMOUNT)) { 4179f3235626SDavid Howells if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) 4180f3235626SDavid Howells ctx->blocks = shmem_default_max_blocks(); 4181f3235626SDavid Howells if (!(ctx->seen & SHMEM_SEEN_INODES)) 4182f3235626SDavid Howells ctx->inodes = shmem_default_max_inodes(); 4183ea3271f7SChris Down if (!(ctx->seen & SHMEM_SEEN_INUMS)) 4184ea3271f7SChris Down ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); 41852c6efe9cSLuis Chamberlain sbinfo->noswap = ctx->noswap; 4186ca4e0519SAl Viro } else { 41871751e8a6SLinus Torvalds sb->s_flags |= SB_NOUSER; 41881da177e4SLinus Torvalds } 418991828a40SDavid M. Grimes sb->s_export_op = &shmem_export_ops; 419036f05cabSJeff Layton sb->s_flags |= SB_NOSEC | SB_I_VERSION; 41910edd73b3SHugh Dickins #else 41921751e8a6SLinus Torvalds sb->s_flags |= SB_NOUSER; 41930edd73b3SHugh Dickins #endif 4194f3235626SDavid Howells sbinfo->max_blocks = ctx->blocks; 4195f3235626SDavid Howells sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; 4196e809d5f0SChris Down if (sb->s_flags & SB_KERNMOUNT) { 4197e809d5f0SChris Down sbinfo->ino_batch = alloc_percpu(ino_t); 4198e809d5f0SChris Down if (!sbinfo->ino_batch) 4199e809d5f0SChris Down goto failed; 4200e809d5f0SChris Down } 4201f3235626SDavid Howells sbinfo->uid = ctx->uid; 4202f3235626SDavid Howells sbinfo->gid = ctx->gid; 4203ea3271f7SChris Down sbinfo->full_inums = ctx->full_inums; 4204f3235626SDavid Howells sbinfo->mode = ctx->mode; 4205f3235626SDavid Howells sbinfo->huge = ctx->huge; 4206f3235626SDavid Howells sbinfo->mpol = ctx->mpol; 4207f3235626SDavid Howells ctx->mpol = NULL; 42081da177e4SLinus Torvalds 4209bf11b9a8SSebastian Andrzej Siewior raw_spin_lock_init(&sbinfo->stat_lock); 4210908c7f19STejun Heo if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 4211602586a8SHugh Dickins goto failed; 4212779750d2SKirill A. Shutemov spin_lock_init(&sbinfo->shrinklist_lock); 4213779750d2SKirill A. Shutemov INIT_LIST_HEAD(&sbinfo->shrinklist); 42141da177e4SLinus Torvalds 4215285b2c4fSHugh Dickins sb->s_maxbytes = MAX_LFS_FILESIZE; 421609cbfeafSKirill A. Shutemov sb->s_blocksize = PAGE_SIZE; 421709cbfeafSKirill A. Shutemov sb->s_blocksize_bits = PAGE_SHIFT; 42181da177e4SLinus Torvalds sb->s_magic = TMPFS_MAGIC; 42191da177e4SLinus Torvalds sb->s_op = &shmem_ops; 4220cfd95a9cSRobin H. 
Johnson sb->s_time_gran = 1; 4221b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 422239f0247dSAndreas Gruenbacher sb->s_xattr = shmem_xattr_handlers; 4223b09e0fa4SEric Paris #endif 4224b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_POSIX_ACL 42251751e8a6SLinus Torvalds sb->s_flags |= SB_POSIXACL; 422639f0247dSAndreas Gruenbacher #endif 42272b4db796SAmir Goldstein uuid_gen(&sb->s_uuid); 42280edd73b3SHugh Dickins 4229e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA 4230e09764cfSCarlos Maiolino if (ctx->seen & SHMEM_SEEN_QUOTA) { 4231e09764cfSCarlos Maiolino sb->dq_op = &shmem_quota_operations; 4232e09764cfSCarlos Maiolino sb->s_qcop = &dquot_quotactl_sysfile_ops; 4233e09764cfSCarlos Maiolino sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 4234e09764cfSCarlos Maiolino 4235de4c0e7cSLukas Czerner /* Copy the default limits from ctx into sbinfo */ 4236de4c0e7cSLukas Czerner memcpy(&sbinfo->qlimits, &ctx->qlimits, 4237de4c0e7cSLukas Czerner sizeof(struct shmem_quota_limits)); 4238de4c0e7cSLukas Czerner 4239e09764cfSCarlos Maiolino if (shmem_enable_quotas(sb, ctx->quota_types)) 4240e09764cfSCarlos Maiolino goto failed; 4241e09764cfSCarlos Maiolino } 4242e09764cfSCarlos Maiolino #endif /* CONFIG_TMPFS_QUOTA */ 4243e09764cfSCarlos Maiolino 42447a80e5b8SGiuseppe Scrivano inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, 42457a80e5b8SGiuseppe Scrivano VM_NORESERVE); 424671480663SCarlos Maiolino if (IS_ERR(inode)) { 424771480663SCarlos Maiolino error = PTR_ERR(inode); 42481da177e4SLinus Torvalds goto failed; 424971480663SCarlos Maiolino } 4250680d794bSakpm@linux-foundation.org inode->i_uid = sbinfo->uid; 4251680d794bSakpm@linux-foundation.org inode->i_gid = sbinfo->gid; 4252318ceed0SAl Viro sb->s_root = d_make_root(inode); 4253318ceed0SAl Viro if (!sb->s_root) 425448fde701SAl Viro goto failed; 42551da177e4SLinus Torvalds return 0; 42561da177e4SLinus Torvalds 42571da177e4SLinus Torvalds failed: 42581da177e4SLinus Torvalds shmem_put_super(sb); 425971480663SCarlos Maiolino return error; 42601da177e4SLinus Torvalds } 42611da177e4SLinus Torvalds 4262f3235626SDavid Howells static int shmem_get_tree(struct fs_context *fc) 4263f3235626SDavid Howells { 4264f3235626SDavid Howells return get_tree_nodev(fc, shmem_fill_super); 4265f3235626SDavid Howells } 4266f3235626SDavid Howells 4267f3235626SDavid Howells static void shmem_free_fc(struct fs_context *fc) 4268f3235626SDavid Howells { 4269f3235626SDavid Howells struct shmem_options *ctx = fc->fs_private; 4270f3235626SDavid Howells 4271f3235626SDavid Howells if (ctx) { 4272f3235626SDavid Howells mpol_put(ctx->mpol); 4273f3235626SDavid Howells kfree(ctx); 4274f3235626SDavid Howells } 4275f3235626SDavid Howells } 4276f3235626SDavid Howells 4277f3235626SDavid Howells static const struct fs_context_operations shmem_fs_context_ops = { 4278f3235626SDavid Howells .free = shmem_free_fc, 4279f3235626SDavid Howells .get_tree = shmem_get_tree, 4280f3235626SDavid Howells #ifdef CONFIG_TMPFS 4281f3235626SDavid Howells .parse_monolithic = shmem_parse_options, 4282f3235626SDavid Howells .parse_param = shmem_parse_one, 4283f3235626SDavid Howells .reconfigure = shmem_reconfigure, 4284f3235626SDavid Howells #endif 4285f3235626SDavid Howells }; 4286f3235626SDavid Howells 4287fcc234f8SPekka Enberg static struct kmem_cache *shmem_inode_cachep; 42881da177e4SLinus Torvalds 42891da177e4SLinus Torvalds static struct inode *shmem_alloc_inode(struct super_block *sb) 42901da177e4SLinus Torvalds { 429141ffe5d5SHugh Dickins struct shmem_inode_info *info; 
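	/*
	 * (Descriptive note.) The VFS inode is embedded in shmem_inode_info,
	 * so one slab allocation from shmem_inode_cachep covers both; callers
	 * get the struct inode back and SHMEM_I() recovers the container.
	 */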
4292fd60b288SMuchun Song info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); 429341ffe5d5SHugh Dickins if (!info) 42941da177e4SLinus Torvalds return NULL; 429541ffe5d5SHugh Dickins return &info->vfs_inode; 42961da177e4SLinus Torvalds } 42971da177e4SLinus Torvalds 429874b1da56SAl Viro static void shmem_free_in_core_inode(struct inode *inode) 4299fa0d7e3dSNick Piggin { 430084e710daSAl Viro if (S_ISLNK(inode->i_mode)) 43013ed47db3SAl Viro kfree(inode->i_link); 4302fa0d7e3dSNick Piggin kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 4303fa0d7e3dSNick Piggin } 4304fa0d7e3dSNick Piggin 43051da177e4SLinus Torvalds static void shmem_destroy_inode(struct inode *inode) 43061da177e4SLinus Torvalds { 430709208d15SAl Viro if (S_ISREG(inode->i_mode)) 43081da177e4SLinus Torvalds mpol_free_shared_policy(&SHMEM_I(inode)->policy); 4309a2e45955SChuck Lever if (S_ISDIR(inode->i_mode)) 4310a2e45955SChuck Lever simple_offset_destroy(shmem_get_offset_ctx(inode)); 43111da177e4SLinus Torvalds } 43121da177e4SLinus Torvalds 431341ffe5d5SHugh Dickins static void shmem_init_inode(void *foo) 43141da177e4SLinus Torvalds { 431541ffe5d5SHugh Dickins struct shmem_inode_info *info = foo; 431641ffe5d5SHugh Dickins inode_init_once(&info->vfs_inode); 43171da177e4SLinus Torvalds } 43181da177e4SLinus Torvalds 43199a8ec03eSweiping zhang static void shmem_init_inodecache(void) 43201da177e4SLinus Torvalds { 43211da177e4SLinus Torvalds shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 43221da177e4SLinus Torvalds sizeof(struct shmem_inode_info), 43235d097056SVladimir Davydov 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 43241da177e4SLinus Torvalds } 43251da177e4SLinus Torvalds 432641ffe5d5SHugh Dickins static void shmem_destroy_inodecache(void) 43271da177e4SLinus Torvalds { 43281a1d92c1SAlexey Dobriyan kmem_cache_destroy(shmem_inode_cachep); 43291da177e4SLinus Torvalds } 43301da177e4SLinus Torvalds 4331a7605426SYang Shi /* Keep the page in page cache instead of truncating it */ 4332a7605426SYang Shi static int shmem_error_remove_page(struct address_space *mapping, 4333a7605426SYang Shi struct page *page) 4334a7605426SYang Shi { 4335a7605426SYang Shi return 0; 4336a7605426SYang Shi } 4337a7605426SYang Shi 433830e6a51dSHui Su const struct address_space_operations shmem_aops = { 43391da177e4SLinus Torvalds .writepage = shmem_writepage, 434046de8b97SMatthew Wilcox (Oracle) .dirty_folio = noop_dirty_folio, 43411da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 4342800d15a5SNick Piggin .write_begin = shmem_write_begin, 4343800d15a5SNick Piggin .write_end = shmem_write_end, 43441da177e4SLinus Torvalds #endif 43451c93923cSAndrew Morton #ifdef CONFIG_MIGRATION 434654184650SMatthew Wilcox (Oracle) .migrate_folio = migrate_folio, 43471c93923cSAndrew Morton #endif 4348a7605426SYang Shi .error_remove_page = shmem_error_remove_page, 43491da177e4SLinus Torvalds }; 435030e6a51dSHui Su EXPORT_SYMBOL(shmem_aops); 43511da177e4SLinus Torvalds 435215ad7cdcSHelge Deller static const struct file_operations shmem_file_operations = { 43531da177e4SLinus Torvalds .mmap = shmem_mmap, 4354a5454f95SThomas Weißschuh .open = generic_file_open, 4355c01d5b30SHugh Dickins .get_unmapped_area = shmem_get_unmapped_area, 43561da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 4357220f2ac9SHugh Dickins .llseek = shmem_file_llseek, 43582ba5bbedSAl Viro .read_iter = shmem_file_read_iter, 43598174202bSAl Viro .write_iter = generic_file_write_iter, 43601b061d92SChristoph Hellwig .fsync = noop_fsync, 4361bd194b18SDavid Howells .splice_read = shmem_file_splice_read, 
4362f6cb85d0SAl Viro .splice_write = iter_file_splice_write,
436383e4fa9cSHugh Dickins .fallocate = shmem_fallocate,
43641da177e4SLinus Torvalds #endif
43651da177e4SLinus Torvalds };
43661da177e4SLinus Torvalds
436792e1d5beSArjan van de Ven static const struct inode_operations shmem_inode_operations = {
436844a30220SYu Zhao .getattr = shmem_getattr,
436994c1e62dSHugh Dickins .setattr = shmem_setattr,
4370b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
4371b09e0fa4SEric Paris .listxattr = shmem_listxattr,
4372feda821eSChristoph Hellwig .set_acl = simple_set_acl,
4373e408e695STheodore Ts'o .fileattr_get = shmem_fileattr_get,
4374e408e695STheodore Ts'o .fileattr_set = shmem_fileattr_set,
4375b09e0fa4SEric Paris #endif
43761da177e4SLinus Torvalds };
43771da177e4SLinus Torvalds
437892e1d5beSArjan van de Ven static const struct inode_operations shmem_dir_inode_operations = {
43791da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
4380f7cd16a5SXavier Roche .getattr = shmem_getattr,
43811da177e4SLinus Torvalds .create = shmem_create,
43821da177e4SLinus Torvalds .lookup = simple_lookup,
43831da177e4SLinus Torvalds .link = shmem_link,
43841da177e4SLinus Torvalds .unlink = shmem_unlink,
43851da177e4SLinus Torvalds .symlink = shmem_symlink,
43861da177e4SLinus Torvalds .mkdir = shmem_mkdir,
43871da177e4SLinus Torvalds .rmdir = shmem_rmdir,
43881da177e4SLinus Torvalds .mknod = shmem_mknod,
43892773bf00SMiklos Szeredi .rename = shmem_rename2,
439060545d0dSAl Viro .tmpfile = shmem_tmpfile,
4391a2e45955SChuck Lever .get_offset_ctx = shmem_get_offset_ctx,
43921da177e4SLinus Torvalds #endif
4393b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
4394b09e0fa4SEric Paris .listxattr = shmem_listxattr,
4395e408e695STheodore Ts'o .fileattr_get = shmem_fileattr_get,
4396e408e695STheodore Ts'o .fileattr_set = shmem_fileattr_set,
4397b09e0fa4SEric Paris #endif
439839f0247dSAndreas Gruenbacher #ifdef CONFIG_TMPFS_POSIX_ACL
439994c1e62dSHugh Dickins .setattr = shmem_setattr,
4400feda821eSChristoph Hellwig .set_acl = simple_set_acl,
440139f0247dSAndreas Gruenbacher #endif
440239f0247dSAndreas Gruenbacher };
440339f0247dSAndreas Gruenbacher
440492e1d5beSArjan van de Ven static const struct inode_operations shmem_special_inode_operations = {
4405f7cd16a5SXavier Roche .getattr = shmem_getattr,
4406b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
4407b09e0fa4SEric Paris .listxattr = shmem_listxattr,
4408b09e0fa4SEric Paris #endif
440939f0247dSAndreas Gruenbacher #ifdef CONFIG_TMPFS_POSIX_ACL
441094c1e62dSHugh Dickins .setattr = shmem_setattr,
4411feda821eSChristoph Hellwig .set_acl = simple_set_acl,
441239f0247dSAndreas Gruenbacher #endif
44131da177e4SLinus Torvalds };
44141da177e4SLinus Torvalds
4415759b9775SHugh Dickins static const struct super_operations shmem_ops = {
44161da177e4SLinus Torvalds .alloc_inode = shmem_alloc_inode,
441774b1da56SAl Viro .free_inode = shmem_free_in_core_inode,
44181da177e4SLinus Torvalds .destroy_inode = shmem_destroy_inode,
44191da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
44201da177e4SLinus Torvalds .statfs = shmem_statfs,
4421680d794bSakpm@linux-foundation.org .show_options = shmem_show_options,
44221da177e4SLinus Torvalds #endif
4423e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA
4424e09764cfSCarlos Maiolino .get_dquots = shmem_get_dquots,
4425e09764cfSCarlos Maiolino #endif
44261f895f75SAl Viro .evict_inode = shmem_evict_inode,
44271da177e4SLinus Torvalds .drop_inode = generic_delete_inode,
44281da177e4SLinus Torvalds .put_super = shmem_put_super,
4429396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4430779750d2SKirill A. Shutemov .nr_cached_objects = shmem_unused_huge_count,
4431779750d2SKirill A. Shutemov .free_cached_objects = shmem_unused_huge_scan,
4432779750d2SKirill A. Shutemov #endif
44331da177e4SLinus Torvalds };
44341da177e4SLinus Torvalds
4435f0f37e2fSAlexey Dobriyan static const struct vm_operations_struct shmem_vm_ops = {
443654cb8821SNick Piggin .fault = shmem_fault,
4437d7c17551SNing Qu .map_pages = filemap_map_pages,
44381da177e4SLinus Torvalds #ifdef CONFIG_NUMA
44391da177e4SLinus Torvalds .set_policy = shmem_set_policy,
44401da177e4SLinus Torvalds .get_policy = shmem_get_policy,
44411da177e4SLinus Torvalds #endif
44421da177e4SLinus Torvalds };
44431da177e4SLinus Torvalds
4444d09e8ca6SPasha Tatashin static const struct vm_operations_struct shmem_anon_vm_ops = {
4445d09e8ca6SPasha Tatashin .fault = shmem_fault,
4446d09e8ca6SPasha Tatashin .map_pages = filemap_map_pages,
4447d09e8ca6SPasha Tatashin #ifdef CONFIG_NUMA
4448d09e8ca6SPasha Tatashin .set_policy = shmem_set_policy,
4449d09e8ca6SPasha Tatashin .get_policy = shmem_get_policy,
4450d09e8ca6SPasha Tatashin #endif
4451d09e8ca6SPasha Tatashin };
4452d09e8ca6SPasha Tatashin
4453f3235626SDavid Howells int shmem_init_fs_context(struct fs_context *fc)
44541da177e4SLinus Torvalds {
4455f3235626SDavid Howells struct shmem_options *ctx;
4456f3235626SDavid Howells
4457f3235626SDavid Howells ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
4458f3235626SDavid Howells if (!ctx)
4459f3235626SDavid Howells return -ENOMEM;
4460f3235626SDavid Howells
4461f3235626SDavid Howells ctx->mode = 0777 | S_ISVTX;
4462f3235626SDavid Howells ctx->uid = current_fsuid();
4463f3235626SDavid Howells ctx->gid = current_fsgid();
4464f3235626SDavid Howells
4465f3235626SDavid Howells fc->fs_private = ctx;
4466f3235626SDavid Howells fc->ops = &shmem_fs_context_ops;
4467f3235626SDavid Howells return 0;
44681da177e4SLinus Torvalds }
44691da177e4SLinus Torvalds
447041ffe5d5SHugh Dickins static struct file_system_type shmem_fs_type = {
44711da177e4SLinus Torvalds .owner = THIS_MODULE,
44721da177e4SLinus Torvalds .name = "tmpfs",
4473f3235626SDavid Howells .init_fs_context = shmem_init_fs_context,
4474f3235626SDavid Howells #ifdef CONFIG_TMPFS
4475d7167b14SAl Viro .parameters = shmem_fs_parameters,
4476f3235626SDavid Howells #endif
44771da177e4SLinus Torvalds .kill_sb = kill_litter_super,
44787a80e5b8SGiuseppe Scrivano #ifdef CONFIG_SHMEM
44797a80e5b8SGiuseppe Scrivano .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
44807a80e5b8SGiuseppe Scrivano #else
4481ff36da69SMatthew Wilcox (Oracle) .fs_flags = FS_USERNS_MOUNT,
44827a80e5b8SGiuseppe Scrivano #endif
44831da177e4SLinus Torvalds };
44841da177e4SLinus Torvalds
44859096bbe9SMiaohe Lin void __init shmem_init(void)
44861da177e4SLinus Torvalds {
44871da177e4SLinus Torvalds int error;
44881da177e4SLinus Torvalds
44899a8ec03eSweiping zhang shmem_init_inodecache();
44901da177e4SLinus Torvalds
4491e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA
4492e09764cfSCarlos Maiolino error = register_quota_format(&shmem_quota_format);
4493e09764cfSCarlos Maiolino if (error < 0) {
4494e09764cfSCarlos Maiolino pr_err("Could not register quota format\n");
4495e09764cfSCarlos Maiolino goto out3;
4496e09764cfSCarlos Maiolino }
4497e09764cfSCarlos Maiolino #endif
4498e09764cfSCarlos Maiolino
449941ffe5d5SHugh Dickins error = register_filesystem(&shmem_fs_type);
45001da177e4SLinus Torvalds if (error) {
45011170532bSJoe Perches pr_err("Could not register tmpfs\n");
45021da177e4SLinus Torvalds goto out2;
45031da177e4SLinus Torvalds }
450495dc112aSGreg Kroah-Hartman
4505ca4e0519SAl Viro shm_mnt = kern_mount(&shmem_fs_type);
45061da177e4SLinus Torvalds if (IS_ERR(shm_mnt)) {
45071da177e4SLinus Torvalds error = PTR_ERR(shm_mnt);
45081170532bSJoe Perches pr_err("Could not kern_mount tmpfs\n");
45091da177e4SLinus Torvalds goto out1;
45101da177e4SLinus Torvalds }
45115a6e75f8SKirill A. Shutemov
4512396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4513435c0b87SKirill A. Shutemov if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
45145a6e75f8SKirill A. Shutemov SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
45155a6e75f8SKirill A. Shutemov else
45165e6e5a12SHugh Dickins shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
45175a6e75f8SKirill A. Shutemov #endif
45189096bbe9SMiaohe Lin return;
45191da177e4SLinus Torvalds
45201da177e4SLinus Torvalds out1:
452141ffe5d5SHugh Dickins unregister_filesystem(&shmem_fs_type);
45221da177e4SLinus Torvalds out2:
4523e09764cfSCarlos Maiolino #ifdef CONFIG_TMPFS_QUOTA
4524e09764cfSCarlos Maiolino unregister_quota_format(&shmem_quota_format);
4525e09764cfSCarlos Maiolino out3:
4526e09764cfSCarlos Maiolino #endif
452741ffe5d5SHugh Dickins shmem_destroy_inodecache();
45281da177e4SLinus Torvalds shm_mnt = ERR_PTR(error);
45291da177e4SLinus Torvalds }
4530853ac43aSMatt Mackall
4531396bcc52SMatthew Wilcox (Oracle) #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
45325a6e75f8SKirill A. Shutemov static ssize_t shmem_enabled_show(struct kobject *kobj,
45335a6e75f8SKirill A. Shutemov struct kobj_attribute *attr, char *buf)
45345a6e75f8SKirill A. Shutemov {
453526083eb6SColin Ian King static const int values[] = {
45365a6e75f8SKirill A. Shutemov SHMEM_HUGE_ALWAYS,
45375a6e75f8SKirill A. Shutemov SHMEM_HUGE_WITHIN_SIZE,
45385a6e75f8SKirill A. Shutemov SHMEM_HUGE_ADVISE,
45395a6e75f8SKirill A. Shutemov SHMEM_HUGE_NEVER,
45405a6e75f8SKirill A. Shutemov SHMEM_HUGE_DENY,
45415a6e75f8SKirill A. Shutemov SHMEM_HUGE_FORCE,
45425a6e75f8SKirill A. Shutemov };
454379d4d38aSJoe Perches int len = 0;
454479d4d38aSJoe Perches int i;
45455a6e75f8SKirill A. Shutemov
454679d4d38aSJoe Perches for (i = 0; i < ARRAY_SIZE(values); i++) {
454779d4d38aSJoe Perches len += sysfs_emit_at(buf, len,
454879d4d38aSJoe Perches shmem_huge == values[i] ? "%s[%s]" : "%s%s",
454979d4d38aSJoe Perches i ? " " : "",
45505a6e75f8SKirill A. Shutemov shmem_format_huge(values[i]));
45515a6e75f8SKirill A. Shutemov }
455279d4d38aSJoe Perches
455379d4d38aSJoe Perches len += sysfs_emit_at(buf, len, "\n");
455479d4d38aSJoe Perches
455579d4d38aSJoe Perches return len;
45565a6e75f8SKirill A. Shutemov }
45575a6e75f8SKirill A. Shutemov
45585a6e75f8SKirill A. Shutemov static ssize_t shmem_enabled_store(struct kobject *kobj,
45595a6e75f8SKirill A. Shutemov struct kobj_attribute *attr, const char *buf, size_t count)
45605a6e75f8SKirill A. Shutemov {
45615a6e75f8SKirill A. Shutemov char tmp[16];
45625a6e75f8SKirill A. Shutemov int huge;
45635a6e75f8SKirill A. Shutemov
45645a6e75f8SKirill A. Shutemov if (count + 1 > sizeof(tmp))
45655a6e75f8SKirill A. Shutemov return -EINVAL;
45665a6e75f8SKirill A. Shutemov memcpy(tmp, buf, count);
45675a6e75f8SKirill A. Shutemov tmp[count] = '\0';
45685a6e75f8SKirill A. Shutemov if (count && tmp[count - 1] == '\n')
45695a6e75f8SKirill A. Shutemov tmp[count - 1] = '\0';
45705a6e75f8SKirill A. Shutemov
45715a6e75f8SKirill A. Shutemov huge = shmem_parse_huge(tmp);
45725a6e75f8SKirill A. Shutemov if (huge == -EINVAL)
45735a6e75f8SKirill A. Shutemov return -EINVAL;
45745a6e75f8SKirill A. Shutemov if (!has_transparent_hugepage() &&
45755a6e75f8SKirill A. Shutemov huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
45765a6e75f8SKirill A. Shutemov return -EINVAL;
45775a6e75f8SKirill A. Shutemov
45785a6e75f8SKirill A. Shutemov shmem_huge = huge;
4579435c0b87SKirill A. Shutemov if (shmem_huge > SHMEM_HUGE_DENY)
45805a6e75f8SKirill A. Shutemov SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
45815a6e75f8SKirill A. Shutemov return count;
45825a6e75f8SKirill A. Shutemov }
45835a6e75f8SKirill A. Shutemov
45844bfa8adaSMiaohe Lin struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
4585396bcc52SMatthew Wilcox (Oracle) #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
4586f3f0e1d2SKirill A. Shutemov
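/*
 * Illustrative note (a sketch, not taken from this file): shmem_enabled_attr
 * is the attribute that surfaces as
 * /sys/kernel/mm/transparent_hugepage/shmem_enabled.  Assuming the keyword
 * names handled by shmem_parse_huge()/shmem_format_huge() ("always",
 * "within_size", "advise", "never", "deny", "force"), a write such as
 *
 *	echo within_size > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *
 * lands in shmem_enabled_store() above, which strips the trailing newline,
 * parses the keyword into shmem_huge, and mirrors the regular policies into
 * the internal shm_mnt superblock, just as shmem_init() does at boot.
 */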
4587853ac43aSMatt Mackall #else /* !CONFIG_SHMEM */
4588853ac43aSMatt Mackall
4589853ac43aSMatt Mackall /*
4590853ac43aSMatt Mackall * tiny-shmem: simple shmemfs and tmpfs using ramfs code
4591853ac43aSMatt Mackall *
4592853ac43aSMatt Mackall * This is intended for small systems where the benefits of the full
4593853ac43aSMatt Mackall * shmem code (swap-backed and resource-limited) are outweighed by
4594853ac43aSMatt Mackall * its complexity. On systems without swap this code should be
4595853ac43aSMatt Mackall * effectively equivalent, but much lighter weight.
4596853ac43aSMatt Mackall */
4597853ac43aSMatt Mackall
459841ffe5d5SHugh Dickins static struct file_system_type shmem_fs_type = {
4599853ac43aSMatt Mackall .name = "tmpfs",
4600f3235626SDavid Howells .init_fs_context = ramfs_init_fs_context,
4601d7167b14SAl Viro .parameters = ramfs_fs_parameters,
460236ce9d76SRoberto Sassu .kill_sb = ramfs_kill_sb,
46032b8576cbSEric W. Biederman .fs_flags = FS_USERNS_MOUNT,
4604853ac43aSMatt Mackall };
4605853ac43aSMatt Mackall
46069096bbe9SMiaohe Lin void __init shmem_init(void)
4607853ac43aSMatt Mackall {
460841ffe5d5SHugh Dickins BUG_ON(register_filesystem(&shmem_fs_type) != 0);
4609853ac43aSMatt Mackall
461041ffe5d5SHugh Dickins shm_mnt = kern_mount(&shmem_fs_type);
4611853ac43aSMatt Mackall BUG_ON(IS_ERR(shm_mnt));
4612853ac43aSMatt Mackall }
4613853ac43aSMatt Mackall
461410a9c496SChristoph Hellwig int shmem_unuse(unsigned int type)
4615853ac43aSMatt Mackall {
4616853ac43aSMatt Mackall return 0;
4617853ac43aSMatt Mackall }
4618853ac43aSMatt Mackall
4619d7c9e99aSAlexey Gladkov int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
46203f96b79aSHugh Dickins {
46213f96b79aSHugh Dickins return 0;
46223f96b79aSHugh Dickins }
46233f96b79aSHugh Dickins
462424513264SHugh Dickins void shmem_unlock_mapping(struct address_space *mapping)
462524513264SHugh Dickins {
462624513264SHugh Dickins }
462724513264SHugh Dickins
4628c01d5b30SHugh Dickins #ifdef CONFIG_MMU
4629c01d5b30SHugh Dickins unsigned long shmem_get_unmapped_area(struct file *file,
4630c01d5b30SHugh Dickins unsigned long addr, unsigned long len,
4631c01d5b30SHugh Dickins unsigned long pgoff, unsigned long flags)
4632c01d5b30SHugh Dickins {
4633c01d5b30SHugh Dickins return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
4634c01d5b30SHugh Dickins }
4635c01d5b30SHugh Dickins #endif
4636c01d5b30SHugh Dickins
463741ffe5d5SHugh Dickins void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
463894c1e62dSHugh Dickins {
463941ffe5d5SHugh Dickins truncate_inode_pages_range(inode->i_mapping, lstart, lend);
464094c1e62dSHugh Dickins }
464194c1e62dSHugh Dickins EXPORT_SYMBOL_GPL(shmem_truncate_range);
464294c1e62dSHugh Dickins
4643853ac43aSMatt Mackall #define shmem_vm_ops generic_file_vm_ops
4644d09e8ca6SPasha Tatashin #define shmem_anon_vm_ops generic_file_vm_ops
46450b0a0806SHugh Dickins #define shmem_file_operations ramfs_file_operations
46460b0a0806SHugh Dickins #define shmem_acct_size(flags, size) 0
46470b0a0806SHugh Dickins #define shmem_unacct_size(flags, size) do {} while (0)
4648853ac43aSMatt Mackall
464971480663SCarlos Maiolino static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir,
465071480663SCarlos Maiolino umode_t mode, dev_t dev, unsigned long flags)
465171480663SCarlos Maiolino {
465271480663SCarlos Maiolino struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
465371480663SCarlos Maiolino return inode ? inode : ERR_PTR(-ENOSPC);
465471480663SCarlos Maiolino }
465571480663SCarlos Maiolino
4656853ac43aSMatt Mackall #endif /* CONFIG_SHMEM */
4657853ac43aSMatt Mackall
4658853ac43aSMatt Mackall /* common code */
46591da177e4SLinus Torvalds
4660703321b6SMatthew Auld static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
4661c7277090SEric Paris unsigned long flags, unsigned int i_flags)
46621da177e4SLinus Torvalds {
46631da177e4SLinus Torvalds struct inode *inode;
466493dec2daSAl Viro struct file *res;
46651da177e4SLinus Torvalds
4666703321b6SMatthew Auld if (IS_ERR(mnt))
4667703321b6SMatthew Auld return ERR_CAST(mnt);
46681da177e4SLinus Torvalds
4669285b2c4fSHugh Dickins if (size < 0 || size > MAX_LFS_FILESIZE)
46701da177e4SLinus Torvalds return ERR_PTR(-EINVAL);
46711da177e4SLinus Torvalds
46721da177e4SLinus Torvalds if (shmem_acct_size(flags, size))
46731da177e4SLinus Torvalds return ERR_PTR(-ENOMEM);
46741da177e4SLinus Torvalds
46757a80e5b8SGiuseppe Scrivano if (is_idmapped_mnt(mnt))
46767a80e5b8SGiuseppe Scrivano return ERR_PTR(-EINVAL);
46777a80e5b8SGiuseppe Scrivano
46787a80e5b8SGiuseppe Scrivano inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
46797a80e5b8SGiuseppe Scrivano S_IFREG | S_IRWXUGO, 0, flags);
468071480663SCarlos Maiolino
468171480663SCarlos Maiolino if (IS_ERR(inode)) {
4682dac2d1f6SAl Viro shmem_unacct_size(flags, size);
468371480663SCarlos Maiolino return ERR_CAST(inode);
4684dac2d1f6SAl Viro }
4685c7277090SEric Paris inode->i_flags |= i_flags;
46861da177e4SLinus Torvalds inode->i_size = size;
46876d6b77f1SMiklos Szeredi clear_nlink(inode); /* It is unlinked */
468826567cdbSAl Viro res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
468993dec2daSAl Viro if (!IS_ERR(res))
469093dec2daSAl Viro res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
46914b42af81SAl Viro &shmem_file_operations);
46926b4d0b27SAl Viro if (IS_ERR(res))
469393dec2daSAl Viro iput(inode);
46946b4d0b27SAl Viro return res;
46951da177e4SLinus Torvalds }
4696c7277090SEric Paris
4697c7277090SEric Paris /**
4698c7277090SEric Paris * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
4699c7277090SEric Paris * kernel internal. There will be NO LSM permission checks against the
4700c7277090SEric Paris * underlying inode. So users of this interface must do LSM checks at a
4701e1832f29SStephen Smalley * higher layer. The users are the big_key and shm implementations. LSM
4702e1832f29SStephen Smalley * checks are provided at the key or shm level rather than the inode.
4703c7277090SEric Paris * @name: name for dentry (to be seen in /proc/<pid>/maps)
4704c7277090SEric Paris * @size: size to be set for the file
4705c7277090SEric Paris * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4706c7277090SEric Paris */
4707c7277090SEric Paris struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
4708c7277090SEric Paris {
4709703321b6SMatthew Auld return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
4710c7277090SEric Paris }
4711c7277090SEric Paris
4712c7277090SEric Paris /**
4713c7277090SEric Paris * shmem_file_setup - get an unlinked file living in tmpfs
4714c7277090SEric Paris * @name: name for dentry (to be seen in /proc/<pid>/maps)
4715c7277090SEric Paris * @size: size to be set for the file
4716c7277090SEric Paris * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4717c7277090SEric Paris */
4718c7277090SEric Paris struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
4719c7277090SEric Paris {
4720703321b6SMatthew Auld return __shmem_file_setup(shm_mnt, name, size, flags, 0);
4721c7277090SEric Paris }
4722395e0ddcSKeith Packard EXPORT_SYMBOL_GPL(shmem_file_setup);
47231da177e4SLinus Torvalds
472446711810SRandy Dunlap /**
4725703321b6SMatthew Auld * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
4726703321b6SMatthew Auld * @mnt: the tmpfs mount where the file will be created
4727703321b6SMatthew Auld * @name: name for dentry (to be seen in /proc/<pid>/maps)
4728703321b6SMatthew Auld * @size: size to be set for the file
4729703321b6SMatthew Auld * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4730703321b6SMatthew Auld */
4731703321b6SMatthew Auld struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
4732703321b6SMatthew Auld loff_t size, unsigned long flags)
4733703321b6SMatthew Auld {
4734703321b6SMatthew Auld return __shmem_file_setup(mnt, name, size, flags, 0);
4735703321b6SMatthew Auld }
4736703321b6SMatthew Auld EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4737703321b6SMatthew Auld
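/*
 * Illustrative sketch (not part of the original source): a typical in-kernel
 * user creates an unlinked tmpfs file, works through its mapping or read/write
 * paths, and drops it with fput() when done.  The buffer name "my-buffer" is
 * a placeholder; shmem_file_setup() and VM_NORESERVE are the real interfaces
 * declared in <linux/shmem_fs.h> and defined above:
 *
 *	struct file *filp;
 *
 *	filp = shmem_file_setup("my-buffer", size, VM_NORESERVE);
 *	if (IS_ERR(filp))
 *		return PTR_ERR(filp);
 *	...use filp->f_mapping, mmap it, or vfs_read()/vfs_write() it...
 *	fput(filp);
 *
 * Because the file is already unlinked, its pages are released as soon as the
 * last reference goes away; shmem_kernel_file_setup() is the same but with
 * S_PRIVATE set so LSMs skip the inode checks, as documented above.
 */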
4738703321b6SMatthew Auld /**
47391da177e4SLinus Torvalds * shmem_zero_setup - setup a shared anonymous mapping
474045e55300SPeter Collingbourne * @vma: the vma to be mmapped is prepared by do_mmap
47411da177e4SLinus Torvalds */
47421da177e4SLinus Torvalds int shmem_zero_setup(struct vm_area_struct *vma)
47431da177e4SLinus Torvalds {
47441da177e4SLinus Torvalds struct file *file;
47451da177e4SLinus Torvalds loff_t size = vma->vm_end - vma->vm_start;
47461da177e4SLinus Torvalds
474766fc1303SHugh Dickins /*
4748c1e8d7c6SMichel Lespinasse * Cloning a new file under mmap_lock leads to a lock ordering conflict
474966fc1303SHugh Dickins * between XFS directory reading and selinux: since this file is only
475066fc1303SHugh Dickins * accessible to the user through its mapping, use S_PRIVATE flag to
475166fc1303SHugh Dickins * bypass file security, in the same way as shmem_kernel_file_setup().
475266fc1303SHugh Dickins */
4753703321b6SMatthew Auld file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
47541da177e4SLinus Torvalds if (IS_ERR(file))
47551da177e4SLinus Torvalds return PTR_ERR(file);
47561da177e4SLinus Torvalds
47571da177e4SLinus Torvalds if (vma->vm_file)
47581da177e4SLinus Torvalds fput(vma->vm_file);
47591da177e4SLinus Torvalds vma->vm_file = file;
4760d09e8ca6SPasha Tatashin vma->vm_ops = &shmem_anon_vm_ops;
4761f3f0e1d2SKirill A. Shutemov
47621da177e4SLinus Torvalds return 0;
47631da177e4SLinus Torvalds }
4764d9d90e5eSHugh Dickins
4765d9d90e5eSHugh Dickins /**
4766f01b2b3eSMatthew Wilcox (Oracle) * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
4767f01b2b3eSMatthew Wilcox (Oracle) * @mapping: the folio's address_space
4768f01b2b3eSMatthew Wilcox (Oracle) * @index: the folio index
4769d9d90e5eSHugh Dickins * @gfp: the page allocator flags to use if allocating
4770d9d90e5eSHugh Dickins *
4771d9d90e5eSHugh Dickins * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4772d9d90e5eSHugh Dickins * with any new page allocations done using the specified allocation flags.
47737e0a1265SMatthew Wilcox (Oracle) * But read_cache_page_gfp() uses the ->read_folio() method: which does not
4774d9d90e5eSHugh Dickins * suit tmpfs, since it may have pages in swapcache, and needs to find those
4775d9d90e5eSHugh Dickins * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
4776d9d90e5eSHugh Dickins *
477768da9f05SHugh Dickins * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
477868da9f05SHugh Dickins * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4779d9d90e5eSHugh Dickins */
4780f01b2b3eSMatthew Wilcox (Oracle) struct folio *shmem_read_folio_gfp(struct address_space *mapping,
4781d9d90e5eSHugh Dickins pgoff_t index, gfp_t gfp)
4782d9d90e5eSHugh Dickins {
478368da9f05SHugh Dickins #ifdef CONFIG_SHMEM
478468da9f05SHugh Dickins struct inode *inode = mapping->host;
4785a3a9c397SMatthew Wilcox (Oracle) struct folio *folio;
478668da9f05SHugh Dickins int error;
478768da9f05SHugh Dickins
478830e6a51dSHui Su BUG_ON(!shmem_mapping(mapping));
4789a3a9c397SMatthew Wilcox (Oracle) error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
4790cfda0526SMike Rapoport gfp, NULL, NULL, NULL);
479168da9f05SHugh Dickins if (error)
4792a7605426SYang Shi return ERR_PTR(error);
4793a7605426SYang Shi
4794a3a9c397SMatthew Wilcox (Oracle) folio_unlock(folio);
4795f01b2b3eSMatthew Wilcox (Oracle) return folio;
4796f01b2b3eSMatthew Wilcox (Oracle) #else
4797f01b2b3eSMatthew Wilcox (Oracle) /*
4798f01b2b3eSMatthew Wilcox (Oracle) * The tiny !SHMEM case uses ramfs without swap
4799f01b2b3eSMatthew Wilcox (Oracle) */
4800f01b2b3eSMatthew Wilcox (Oracle) return mapping_read_folio_gfp(mapping, index, gfp);
4801f01b2b3eSMatthew Wilcox (Oracle) #endif
4802f01b2b3eSMatthew Wilcox (Oracle) }
4803f01b2b3eSMatthew Wilcox (Oracle) EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
4804f01b2b3eSMatthew Wilcox (Oracle)
4805f01b2b3eSMatthew Wilcox (Oracle) struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4806f01b2b3eSMatthew Wilcox (Oracle) pgoff_t index, gfp_t gfp)
4807f01b2b3eSMatthew Wilcox (Oracle) {
4808f01b2b3eSMatthew Wilcox (Oracle) struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
4809f01b2b3eSMatthew Wilcox (Oracle) struct page *page;
4810f01b2b3eSMatthew Wilcox (Oracle)
4811f01b2b3eSMatthew Wilcox (Oracle) if (IS_ERR(folio))
4812f01b2b3eSMatthew Wilcox (Oracle) return &folio->page;
4813f01b2b3eSMatthew Wilcox (Oracle)
4814a3a9c397SMatthew Wilcox (Oracle) page = folio_file_page(folio, index);
4815a7605426SYang Shi if (PageHWPoison(page)) {
4816a3a9c397SMatthew Wilcox (Oracle) folio_put(folio);
4817a7605426SYang Shi return ERR_PTR(-EIO);
4818a7605426SYang Shi }
4819a7605426SYang Shi
482068da9f05SHugh Dickins return page;
4821d9d90e5eSHugh Dickins }
4822d9d90e5eSHugh Dickins EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
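/*
 * Illustrative sketch (not part of the original source): a caller in the
 * style described above reads one page of a shmem-backed object, relaxing
 * the GFP mask so allocation failure is reported rather than triggering the
 * OOM killer.  "obj_mapping" and "i" are placeholders for the object's
 * address_space and page index:
 *
 *	gfp_t gfp = mapping_gfp_mask(obj_mapping) |
 *		    __GFP_NORETRY | __GFP_NOWARN;
 *	struct page *page = shmem_read_mapping_page_gfp(obj_mapping, i, gfp);
 *
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	...access the page contents...
 *	put_page(page);
 *
 * The page is returned with a reference held (and unlocked), so the caller
 * drops it with put_page() when finished.
 */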
4823