11da177e4SLinus Torvalds /* 21da177e4SLinus Torvalds * Resizable virtual memory filesystem for Linux. 31da177e4SLinus Torvalds * 41da177e4SLinus Torvalds * Copyright (C) 2000 Linus Torvalds. 51da177e4SLinus Torvalds * 2000 Transmeta Corp. 61da177e4SLinus Torvalds * 2000-2001 Christoph Rohland 71da177e4SLinus Torvalds * 2000-2001 SAP AG 81da177e4SLinus Torvalds * 2002 Red Hat Inc. 96922c0c7SHugh Dickins * Copyright (C) 2002-2011 Hugh Dickins. 106922c0c7SHugh Dickins * Copyright (C) 2011 Google Inc. 110edd73b3SHugh Dickins * Copyright (C) 2002-2005 VERITAS Software Corporation. 121da177e4SLinus Torvalds * Copyright (C) 2004 Andi Kleen, SuSE Labs 131da177e4SLinus Torvalds * 141da177e4SLinus Torvalds * Extended attribute support for tmpfs: 151da177e4SLinus Torvalds * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 161da177e4SLinus Torvalds * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 171da177e4SLinus Torvalds * 18853ac43aSMatt Mackall * tiny-shmem: 19853ac43aSMatt Mackall * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> 20853ac43aSMatt Mackall * 211da177e4SLinus Torvalds * This file is released under the GPL. 221da177e4SLinus Torvalds */ 231da177e4SLinus Torvalds 24853ac43aSMatt Mackall #include <linux/fs.h> 25853ac43aSMatt Mackall #include <linux/init.h> 26853ac43aSMatt Mackall #include <linux/vfs.h> 27853ac43aSMatt Mackall #include <linux/mount.h> 28250297edSAndrew Morton #include <linux/ramfs.h> 29caefba17SHugh Dickins #include <linux/pagemap.h> 30853ac43aSMatt Mackall #include <linux/file.h> 31853ac43aSMatt Mackall #include <linux/mm.h> 32b95f1b31SPaul Gortmaker #include <linux/export.h> 33853ac43aSMatt Mackall #include <linux/swap.h> 34e2e40f2cSChristoph Hellwig #include <linux/uio.h> 35853ac43aSMatt Mackall 36853ac43aSMatt Mackall static struct vfsmount *shm_mnt; 37853ac43aSMatt Mackall 38853ac43aSMatt Mackall #ifdef CONFIG_SHMEM 391da177e4SLinus Torvalds /* 401da177e4SLinus Torvalds * This virtual memory filesystem is heavily based on the ramfs. It 411da177e4SLinus Torvalds * extends ramfs by the ability to use swap and honor resource limits 421da177e4SLinus Torvalds * which makes it a completely usable filesystem. 
431da177e4SLinus Torvalds */ 441da177e4SLinus Torvalds 4539f0247dSAndreas Gruenbacher #include <linux/xattr.h> 46a5694255SChristoph Hellwig #include <linux/exportfs.h> 471c7c474cSChristoph Hellwig #include <linux/posix_acl.h> 48feda821eSChristoph Hellwig #include <linux/posix_acl_xattr.h> 491da177e4SLinus Torvalds #include <linux/mman.h> 501da177e4SLinus Torvalds #include <linux/string.h> 511da177e4SLinus Torvalds #include <linux/slab.h> 521da177e4SLinus Torvalds #include <linux/backing-dev.h> 531da177e4SLinus Torvalds #include <linux/shmem_fs.h> 541da177e4SLinus Torvalds #include <linux/writeback.h> 551da177e4SLinus Torvalds #include <linux/blkdev.h> 56bda97eabSHugh Dickins #include <linux/pagevec.h> 5741ffe5d5SHugh Dickins #include <linux/percpu_counter.h> 5883e4fa9cSHugh Dickins #include <linux/falloc.h> 59708e3508SHugh Dickins #include <linux/splice.h> 601da177e4SLinus Torvalds #include <linux/security.h> 611da177e4SLinus Torvalds #include <linux/swapops.h> 621da177e4SLinus Torvalds #include <linux/mempolicy.h> 631da177e4SLinus Torvalds #include <linux/namei.h> 64b00dc3adSHugh Dickins #include <linux/ctype.h> 65304dbdb7SLee Schermerhorn #include <linux/migrate.h> 66c1f60a5aSChristoph Lameter #include <linux/highmem.h> 67680d794bSakpm@linux-foundation.org #include <linux/seq_file.h> 6892562927SMimi Zohar #include <linux/magic.h> 699183df25SDavid Herrmann #include <linux/syscalls.h> 7040e041a2SDavid Herrmann #include <linux/fcntl.h> 719183df25SDavid Herrmann #include <uapi/linux/memfd.h> 72304dbdb7SLee Schermerhorn 731da177e4SLinus Torvalds #include <asm/uaccess.h> 741da177e4SLinus Torvalds #include <asm/pgtable.h> 751da177e4SLinus Torvalds 76dd56b046SMel Gorman #include "internal.h" 77dd56b046SMel Gorman 7809cbfeafSKirill A. Shutemov #define BLOCKS_PER_PAGE (PAGE_SIZE/512) 7909cbfeafSKirill A. Shutemov #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) 801da177e4SLinus Torvalds 811da177e4SLinus Torvalds /* Pretend that each entry is of this size in directory's i_size */ 821da177e4SLinus Torvalds #define BOGO_DIRENT_SIZE 20 831da177e4SLinus Torvalds 8469f07ec9SHugh Dickins /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ 8569f07ec9SHugh Dickins #define SHORT_SYMLINK_LEN 128 8669f07ec9SHugh Dickins 871aac1400SHugh Dickins /* 88f00cdc6dSHugh Dickins * shmem_fallocate communicates with shmem_fault or shmem_writepage via 89f00cdc6dSHugh Dickins * inode->i_private (with i_mutex making sure that it has only one user at 90f00cdc6dSHugh Dickins * a time): we would prefer not to enlarge the shmem inode just for that. 
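 *
 * As a rough sketch (reader side, mirroring shmem_writepage() below;
 * the hole-punch waitq handling is omitted here for brevity):
 *
 *	spin_lock(&inode->i_lock);
 *	shmem_falloc = inode->i_private;
 *	if (shmem_falloc &&
 *	    index >= shmem_falloc->start && index < shmem_falloc->next)
 *		... this index lies in the range being fallocated ...
 *	spin_unlock(&inode->i_lock);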
911aac1400SHugh Dickins */ 921aac1400SHugh Dickins struct shmem_falloc { 938e205f77SHugh Dickins wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ 941aac1400SHugh Dickins pgoff_t start; /* start of range currently being fallocated */ 951aac1400SHugh Dickins pgoff_t next; /* the next page offset to be fallocated */ 961aac1400SHugh Dickins pgoff_t nr_falloced; /* how many new pages have been fallocated */ 971aac1400SHugh Dickins pgoff_t nr_unswapped; /* how often writepage refused to swap out */ 981aac1400SHugh Dickins }; 991aac1400SHugh Dickins 100285b2c4fSHugh Dickins /* Flag allocation requirements to shmem_getpage */ 1011da177e4SLinus Torvalds enum sgp_type { 1021da177e4SLinus Torvalds SGP_READ, /* don't exceed i_size, don't allocate page */ 1031da177e4SLinus Torvalds SGP_CACHE, /* don't exceed i_size, may allocate page */ 1041635f6a7SHugh Dickins SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ 1051635f6a7SHugh Dickins SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ 1061da177e4SLinus Torvalds }; 1071da177e4SLinus Torvalds 108b76db735SAndrew Morton #ifdef CONFIG_TMPFS 109680d794bSakpm@linux-foundation.org static unsigned long shmem_default_max_blocks(void) 110680d794bSakpm@linux-foundation.org { 111680d794bSakpm@linux-foundation.org return totalram_pages / 2; 112680d794bSakpm@linux-foundation.org } 113680d794bSakpm@linux-foundation.org 114680d794bSakpm@linux-foundation.org static unsigned long shmem_default_max_inodes(void) 115680d794bSakpm@linux-foundation.org { 116680d794bSakpm@linux-foundation.org return min(totalram_pages - totalhigh_pages, totalram_pages / 2); 117680d794bSakpm@linux-foundation.org } 118b76db735SAndrew Morton #endif 119680d794bSakpm@linux-foundation.org 120bde05d1cSHugh Dickins static bool shmem_should_replace_page(struct page *page, gfp_t gfp); 121bde05d1cSHugh Dickins static int shmem_replace_page(struct page **pagep, gfp_t gfp, 122bde05d1cSHugh Dickins struct shmem_inode_info *info, pgoff_t index); 12368da9f05SHugh Dickins static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 1249e18eb29SAndres Lagar-Cavilla struct page **pagep, enum sgp_type sgp, 1259e18eb29SAndres Lagar-Cavilla gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); 12668da9f05SHugh Dickins 12768da9f05SHugh Dickins static inline int shmem_getpage(struct inode *inode, pgoff_t index, 1289e18eb29SAndres Lagar-Cavilla struct page **pagep, enum sgp_type sgp) 12968da9f05SHugh Dickins { 13068da9f05SHugh Dickins return shmem_getpage_gfp(inode, index, pagep, sgp, 1319e18eb29SAndres Lagar-Cavilla mapping_gfp_mask(inode->i_mapping), NULL, NULL); 13268da9f05SHugh Dickins } 1331da177e4SLinus Torvalds 1341da177e4SLinus Torvalds static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 1351da177e4SLinus Torvalds { 1361da177e4SLinus Torvalds return sb->s_fs_info; 1371da177e4SLinus Torvalds } 1381da177e4SLinus Torvalds 1391da177e4SLinus Torvalds /* 1401da177e4SLinus Torvalds * shmem_file_setup pre-accounts the whole fixed size of a VM object, 1411da177e4SLinus Torvalds * for shared memory and for shared anonymous (/dev/zero) mappings 1421da177e4SLinus Torvalds * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), 1431da177e4SLinus Torvalds * consistent with the pre-accounting of private mappings ... 1441da177e4SLinus Torvalds */ 1451da177e4SLinus Torvalds static inline int shmem_acct_size(unsigned long flags, loff_t size) 1461da177e4SLinus Torvalds { 1470b0a0806SHugh Dickins return (flags & VM_NORESERVE) ? 
148191c5424SAl Viro 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); 1491da177e4SLinus Torvalds } 1501da177e4SLinus Torvalds 1511da177e4SLinus Torvalds static inline void shmem_unacct_size(unsigned long flags, loff_t size) 1521da177e4SLinus Torvalds { 1530b0a0806SHugh Dickins if (!(flags & VM_NORESERVE)) 1541da177e4SLinus Torvalds vm_unacct_memory(VM_ACCT(size)); 1551da177e4SLinus Torvalds } 1561da177e4SLinus Torvalds 15777142517SKonstantin Khlebnikov static inline int shmem_reacct_size(unsigned long flags, 15877142517SKonstantin Khlebnikov loff_t oldsize, loff_t newsize) 15977142517SKonstantin Khlebnikov { 16077142517SKonstantin Khlebnikov if (!(flags & VM_NORESERVE)) { 16177142517SKonstantin Khlebnikov if (VM_ACCT(newsize) > VM_ACCT(oldsize)) 16277142517SKonstantin Khlebnikov return security_vm_enough_memory_mm(current->mm, 16377142517SKonstantin Khlebnikov VM_ACCT(newsize) - VM_ACCT(oldsize)); 16477142517SKonstantin Khlebnikov else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) 16577142517SKonstantin Khlebnikov vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); 16677142517SKonstantin Khlebnikov } 16777142517SKonstantin Khlebnikov return 0; 16877142517SKonstantin Khlebnikov } 16977142517SKonstantin Khlebnikov 1701da177e4SLinus Torvalds /* 1711da177e4SLinus Torvalds * ... whereas tmpfs objects are accounted incrementally as 17275edd345SHugh Dickins * pages are allocated, in order to allow large sparse files. 1731da177e4SLinus Torvalds * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, 1741da177e4SLinus Torvalds * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 1751da177e4SLinus Torvalds */ 1761da177e4SLinus Torvalds static inline int shmem_acct_block(unsigned long flags) 1771da177e4SLinus Torvalds { 1780b0a0806SHugh Dickins return (flags & VM_NORESERVE) ? 17909cbfeafSKirill A. Shutemov security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0; 1801da177e4SLinus Torvalds } 1811da177e4SLinus Torvalds 1821da177e4SLinus Torvalds static inline void shmem_unacct_blocks(unsigned long flags, long pages) 1831da177e4SLinus Torvalds { 1840b0a0806SHugh Dickins if (flags & VM_NORESERVE) 18509cbfeafSKirill A. 
Shutemov vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); 1861da177e4SLinus Torvalds } 1871da177e4SLinus Torvalds 188759b9775SHugh Dickins static const struct super_operations shmem_ops; 189f5e54d6eSChristoph Hellwig static const struct address_space_operations shmem_aops; 19015ad7cdcSHelge Deller static const struct file_operations shmem_file_operations; 19192e1d5beSArjan van de Ven static const struct inode_operations shmem_inode_operations; 19292e1d5beSArjan van de Ven static const struct inode_operations shmem_dir_inode_operations; 19392e1d5beSArjan van de Ven static const struct inode_operations shmem_special_inode_operations; 194f0f37e2fSAlexey Dobriyan static const struct vm_operations_struct shmem_vm_ops; 1951da177e4SLinus Torvalds 1961da177e4SLinus Torvalds static LIST_HEAD(shmem_swaplist); 197cb5f7b9aSHugh Dickins static DEFINE_MUTEX(shmem_swaplist_mutex); 1981da177e4SLinus Torvalds 1995b04c689SPavel Emelyanov static int shmem_reserve_inode(struct super_block *sb) 2005b04c689SPavel Emelyanov { 2015b04c689SPavel Emelyanov struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2025b04c689SPavel Emelyanov if (sbinfo->max_inodes) { 2035b04c689SPavel Emelyanov spin_lock(&sbinfo->stat_lock); 2045b04c689SPavel Emelyanov if (!sbinfo->free_inodes) { 2055b04c689SPavel Emelyanov spin_unlock(&sbinfo->stat_lock); 2065b04c689SPavel Emelyanov return -ENOSPC; 2075b04c689SPavel Emelyanov } 2085b04c689SPavel Emelyanov sbinfo->free_inodes--; 2095b04c689SPavel Emelyanov spin_unlock(&sbinfo->stat_lock); 2105b04c689SPavel Emelyanov } 2115b04c689SPavel Emelyanov return 0; 2125b04c689SPavel Emelyanov } 2135b04c689SPavel Emelyanov 2145b04c689SPavel Emelyanov static void shmem_free_inode(struct super_block *sb) 2155b04c689SPavel Emelyanov { 2165b04c689SPavel Emelyanov struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2175b04c689SPavel Emelyanov if (sbinfo->max_inodes) { 2185b04c689SPavel Emelyanov spin_lock(&sbinfo->stat_lock); 2195b04c689SPavel Emelyanov sbinfo->free_inodes++; 2205b04c689SPavel Emelyanov spin_unlock(&sbinfo->stat_lock); 2215b04c689SPavel Emelyanov } 2225b04c689SPavel Emelyanov } 2235b04c689SPavel Emelyanov 22446711810SRandy Dunlap /** 22541ffe5d5SHugh Dickins * shmem_recalc_inode - recalculate the block usage of an inode 2261da177e4SLinus Torvalds * @inode: inode to recalc 2271da177e4SLinus Torvalds * 2281da177e4SLinus Torvalds * We have to calculate the free blocks since the mm can drop 2291da177e4SLinus Torvalds * undirtied hole pages behind our back. 2301da177e4SLinus Torvalds * 2311da177e4SLinus Torvalds * But normally info->alloced == inode->i_mapping->nrpages + info->swapped 2321da177e4SLinus Torvalds * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) 2331da177e4SLinus Torvalds * 2341da177e4SLinus Torvalds * It has to be called with the spinlock held. 
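 *
 * For example (figures purely illustrative): with info->alloced == 10,
 * nrpages == 6 and info->swapped == 2, freed works out to
 * 10 - (6 + 2) = 2, and alloced, i_blocks and the block accounting
 * below are all reduced by those 2 reclaimed pages.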
2351da177e4SLinus Torvalds */ 2361da177e4SLinus Torvalds static void shmem_recalc_inode(struct inode *inode) 2371da177e4SLinus Torvalds { 2381da177e4SLinus Torvalds struct shmem_inode_info *info = SHMEM_I(inode); 2391da177e4SLinus Torvalds long freed; 2401da177e4SLinus Torvalds 2411da177e4SLinus Torvalds freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 2421da177e4SLinus Torvalds if (freed > 0) { 24354af6042SHugh Dickins struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 24454af6042SHugh Dickins if (sbinfo->max_blocks) 24554af6042SHugh Dickins percpu_counter_add(&sbinfo->used_blocks, -freed); 2461da177e4SLinus Torvalds info->alloced -= freed; 24754af6042SHugh Dickins inode->i_blocks -= freed * BLOCKS_PER_PAGE; 2481da177e4SLinus Torvalds shmem_unacct_blocks(info->flags, freed); 2491da177e4SLinus Torvalds } 2501da177e4SLinus Torvalds } 2511da177e4SLinus Torvalds 2527a5d0fbbSHugh Dickins /* 2537a5d0fbbSHugh Dickins * Replace item expected in radix tree by a new item, while holding tree lock. 2547a5d0fbbSHugh Dickins */ 2557a5d0fbbSHugh Dickins static int shmem_radix_tree_replace(struct address_space *mapping, 2567a5d0fbbSHugh Dickins pgoff_t index, void *expected, void *replacement) 2577a5d0fbbSHugh Dickins { 2587a5d0fbbSHugh Dickins void **pslot; 2596dbaf22cSJohannes Weiner void *item; 2607a5d0fbbSHugh Dickins 2617a5d0fbbSHugh Dickins VM_BUG_ON(!expected); 2626dbaf22cSJohannes Weiner VM_BUG_ON(!replacement); 2637a5d0fbbSHugh Dickins pslot = radix_tree_lookup_slot(&mapping->page_tree, index); 2646dbaf22cSJohannes Weiner if (!pslot) 2656dbaf22cSJohannes Weiner return -ENOENT; 2666dbaf22cSJohannes Weiner item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); 2677a5d0fbbSHugh Dickins if (item != expected) 2687a5d0fbbSHugh Dickins return -ENOENT; 2697a5d0fbbSHugh Dickins radix_tree_replace_slot(pslot, replacement); 2707a5d0fbbSHugh Dickins return 0; 2717a5d0fbbSHugh Dickins } 2727a5d0fbbSHugh Dickins 2737a5d0fbbSHugh Dickins /* 274d1899228SHugh Dickins * Sometimes, before we decide whether to proceed or to fail, we must check 275d1899228SHugh Dickins * that an entry was not already brought back from swap by a racing thread. 276d1899228SHugh Dickins * 277d1899228SHugh Dickins * Checking page is not enough: by the time a SwapCache page is locked, it 278d1899228SHugh Dickins * might be reused, and again be SwapCache, using the same swap as before. 279d1899228SHugh Dickins */ 280d1899228SHugh Dickins static bool shmem_confirm_swap(struct address_space *mapping, 281d1899228SHugh Dickins pgoff_t index, swp_entry_t swap) 282d1899228SHugh Dickins { 283d1899228SHugh Dickins void *item; 284d1899228SHugh Dickins 285d1899228SHugh Dickins rcu_read_lock(); 286d1899228SHugh Dickins item = radix_tree_lookup(&mapping->page_tree, index); 287d1899228SHugh Dickins rcu_read_unlock(); 288d1899228SHugh Dickins return item == swp_to_radix_entry(swap); 289d1899228SHugh Dickins } 290d1899228SHugh Dickins 291d1899228SHugh Dickins /* 2925a6e75f8SKirill A. Shutemov * Definitions for "huge tmpfs": tmpfs mounted with the huge= option 2935a6e75f8SKirill A. Shutemov * 2945a6e75f8SKirill A. Shutemov * SHMEM_HUGE_NEVER: 2955a6e75f8SKirill A. Shutemov * disables huge pages for the mount; 2965a6e75f8SKirill A. Shutemov * SHMEM_HUGE_ALWAYS: 2975a6e75f8SKirill A. Shutemov * enables huge pages for the mount; 2985a6e75f8SKirill A. Shutemov * SHMEM_HUGE_WITHIN_SIZE: 2995a6e75f8SKirill A. Shutemov * only allocate huge pages if the page will be fully within i_size, 3005a6e75f8SKirill A. 
Shutemov * also respect fadvise()/madvise() hints; 3015a6e75f8SKirill A. Shutemov * SHMEM_HUGE_ADVISE: 3025a6e75f8SKirill A. Shutemov * only allocate huge pages if requested with fadvise()/madvise(); 3035a6e75f8SKirill A. Shutemov */ 3045a6e75f8SKirill A. Shutemov 3055a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_NEVER 0 3065a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_ALWAYS 1 3075a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_WITHIN_SIZE 2 3085a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_ADVISE 3 3095a6e75f8SKirill A. Shutemov 3105a6e75f8SKirill A. Shutemov /* 3115a6e75f8SKirill A. Shutemov * Special values. 3125a6e75f8SKirill A. Shutemov * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: 3135a6e75f8SKirill A. Shutemov * 3145a6e75f8SKirill A. Shutemov * SHMEM_HUGE_DENY: 3155a6e75f8SKirill A. Shutemov * disables huge on shm_mnt and all mounts, for emergency use; 3165a6e75f8SKirill A. Shutemov * SHMEM_HUGE_FORCE: 3175a6e75f8SKirill A. Shutemov * enables huge on shm_mnt and all mounts, w/o needing option, for testing; 3185a6e75f8SKirill A. Shutemov * 3195a6e75f8SKirill A. Shutemov */ 3205a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_DENY (-1) 3215a6e75f8SKirill A. Shutemov #define SHMEM_HUGE_FORCE (-2) 3225a6e75f8SKirill A. Shutemov 3235a6e75f8SKirill A. Shutemov #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3245a6e75f8SKirill A. Shutemov /* ifdef here to avoid bloating shmem.o when not necessary */ 3255a6e75f8SKirill A. Shutemov 3265a6e75f8SKirill A. Shutemov int shmem_huge __read_mostly; 3275a6e75f8SKirill A. Shutemov 3285a6e75f8SKirill A. Shutemov static int shmem_parse_huge(const char *str) 3295a6e75f8SKirill A. Shutemov { 3305a6e75f8SKirill A. Shutemov if (!strcmp(str, "never")) 3315a6e75f8SKirill A. Shutemov return SHMEM_HUGE_NEVER; 3325a6e75f8SKirill A. Shutemov if (!strcmp(str, "always")) 3335a6e75f8SKirill A. Shutemov return SHMEM_HUGE_ALWAYS; 3345a6e75f8SKirill A. Shutemov if (!strcmp(str, "within_size")) 3355a6e75f8SKirill A. Shutemov return SHMEM_HUGE_WITHIN_SIZE; 3365a6e75f8SKirill A. Shutemov if (!strcmp(str, "advise")) 3375a6e75f8SKirill A. Shutemov return SHMEM_HUGE_ADVISE; 3385a6e75f8SKirill A. Shutemov if (!strcmp(str, "deny")) 3395a6e75f8SKirill A. Shutemov return SHMEM_HUGE_DENY; 3405a6e75f8SKirill A. Shutemov if (!strcmp(str, "force")) 3415a6e75f8SKirill A. Shutemov return SHMEM_HUGE_FORCE; 3425a6e75f8SKirill A. Shutemov return -EINVAL; 3435a6e75f8SKirill A. Shutemov } 3445a6e75f8SKirill A. Shutemov 3455a6e75f8SKirill A. Shutemov static const char *shmem_format_huge(int huge) 3465a6e75f8SKirill A. Shutemov { 3475a6e75f8SKirill A. Shutemov switch (huge) { 3485a6e75f8SKirill A. Shutemov case SHMEM_HUGE_NEVER: 3495a6e75f8SKirill A. Shutemov return "never"; 3505a6e75f8SKirill A. Shutemov case SHMEM_HUGE_ALWAYS: 3515a6e75f8SKirill A. Shutemov return "always"; 3525a6e75f8SKirill A. Shutemov case SHMEM_HUGE_WITHIN_SIZE: 3535a6e75f8SKirill A. Shutemov return "within_size"; 3545a6e75f8SKirill A. Shutemov case SHMEM_HUGE_ADVISE: 3555a6e75f8SKirill A. Shutemov return "advise"; 3565a6e75f8SKirill A. Shutemov case SHMEM_HUGE_DENY: 3575a6e75f8SKirill A. Shutemov return "deny"; 3585a6e75f8SKirill A. Shutemov case SHMEM_HUGE_FORCE: 3595a6e75f8SKirill A. Shutemov return "force"; 3605a6e75f8SKirill A. Shutemov default: 3615a6e75f8SKirill A. Shutemov VM_BUG_ON(1); 3625a6e75f8SKirill A. Shutemov return "bad_val"; 3635a6e75f8SKirill A. Shutemov } 3645a6e75f8SKirill A. Shutemov } 3655a6e75f8SKirill A. Shutemov 3665a6e75f8SKirill A. 
Shutemov #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ 3675a6e75f8SKirill A. Shutemov 3685a6e75f8SKirill A. Shutemov #define shmem_huge SHMEM_HUGE_DENY 3695a6e75f8SKirill A. Shutemov 3705a6e75f8SKirill A. Shutemov #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3715a6e75f8SKirill A. Shutemov 3725a6e75f8SKirill A. Shutemov /* 37346f65ec1SHugh Dickins * Like add_to_page_cache_locked, but error if expected item has gone. 37446f65ec1SHugh Dickins */ 37546f65ec1SHugh Dickins static int shmem_add_to_page_cache(struct page *page, 37646f65ec1SHugh Dickins struct address_space *mapping, 377fed400a1SWang Sheng-Hui pgoff_t index, void *expected) 37846f65ec1SHugh Dickins { 379b065b432SHugh Dickins int error; 38046f65ec1SHugh Dickins 381309381feSSasha Levin VM_BUG_ON_PAGE(!PageLocked(page), page); 382309381feSSasha Levin VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 38346f65ec1SHugh Dickins 38409cbfeafSKirill A. Shutemov get_page(page); 38546f65ec1SHugh Dickins page->mapping = mapping; 38646f65ec1SHugh Dickins page->index = index; 38746f65ec1SHugh Dickins 38846f65ec1SHugh Dickins spin_lock_irq(&mapping->tree_lock); 38946f65ec1SHugh Dickins if (!expected) 390b065b432SHugh Dickins error = radix_tree_insert(&mapping->page_tree, index, page); 39146f65ec1SHugh Dickins else 392b065b432SHugh Dickins error = shmem_radix_tree_replace(mapping, index, expected, 393b065b432SHugh Dickins page); 39446f65ec1SHugh Dickins if (!error) { 39546f65ec1SHugh Dickins mapping->nrpages++; 39646f65ec1SHugh Dickins __inc_zone_page_state(page, NR_FILE_PAGES); 39746f65ec1SHugh Dickins __inc_zone_page_state(page, NR_SHMEM); 39846f65ec1SHugh Dickins spin_unlock_irq(&mapping->tree_lock); 39946f65ec1SHugh Dickins } else { 40046f65ec1SHugh Dickins page->mapping = NULL; 40146f65ec1SHugh Dickins spin_unlock_irq(&mapping->tree_lock); 40209cbfeafSKirill A. Shutemov put_page(page); 40346f65ec1SHugh Dickins } 40446f65ec1SHugh Dickins return error; 40546f65ec1SHugh Dickins } 40646f65ec1SHugh Dickins 40746f65ec1SHugh Dickins /* 4086922c0c7SHugh Dickins * Like delete_from_page_cache, but substitutes swap for page. 4096922c0c7SHugh Dickins */ 4106922c0c7SHugh Dickins static void shmem_delete_from_page_cache(struct page *page, void *radswap) 4116922c0c7SHugh Dickins { 4126922c0c7SHugh Dickins struct address_space *mapping = page->mapping; 4136922c0c7SHugh Dickins int error; 4146922c0c7SHugh Dickins 4156922c0c7SHugh Dickins spin_lock_irq(&mapping->tree_lock); 4166922c0c7SHugh Dickins error = shmem_radix_tree_replace(mapping, page->index, page, radswap); 4176922c0c7SHugh Dickins page->mapping = NULL; 4186922c0c7SHugh Dickins mapping->nrpages--; 4196922c0c7SHugh Dickins __dec_zone_page_state(page, NR_FILE_PAGES); 4206922c0c7SHugh Dickins __dec_zone_page_state(page, NR_SHMEM); 4216922c0c7SHugh Dickins spin_unlock_irq(&mapping->tree_lock); 42209cbfeafSKirill A. Shutemov put_page(page); 4236922c0c7SHugh Dickins BUG_ON(error); 4246922c0c7SHugh Dickins } 4256922c0c7SHugh Dickins 4266922c0c7SHugh Dickins /* 4277a5d0fbbSHugh Dickins * Remove swap entry from radix tree, free the swap and its page cache. 
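 *
 * Returns 0 on success, or -ENOENT if the entry had already been
 * replaced or removed by the time we looked.  Callers in this file
 * typically only need the success/failure bit, e.g. shmem_undo_range()
 * below counts freed entries with
 *
 *	nr_swaps_freed += !shmem_free_swap(mapping, index, page);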
4287a5d0fbbSHugh Dickins */ 4297a5d0fbbSHugh Dickins static int shmem_free_swap(struct address_space *mapping, 4307a5d0fbbSHugh Dickins pgoff_t index, void *radswap) 4317a5d0fbbSHugh Dickins { 4326dbaf22cSJohannes Weiner void *old; 4337a5d0fbbSHugh Dickins 4347a5d0fbbSHugh Dickins spin_lock_irq(&mapping->tree_lock); 4356dbaf22cSJohannes Weiner old = radix_tree_delete_item(&mapping->page_tree, index, radswap); 4367a5d0fbbSHugh Dickins spin_unlock_irq(&mapping->tree_lock); 4376dbaf22cSJohannes Weiner if (old != radswap) 4386dbaf22cSJohannes Weiner return -ENOENT; 4397a5d0fbbSHugh Dickins free_swap_and_cache(radix_to_swp_entry(radswap)); 4406dbaf22cSJohannes Weiner return 0; 4417a5d0fbbSHugh Dickins } 4427a5d0fbbSHugh Dickins 4437a5d0fbbSHugh Dickins /* 4446a15a370SVlastimil Babka * Determine (in bytes) how many of the shmem object's pages mapped by the 44548131e03SVlastimil Babka * given offsets are swapped out. 4466a15a370SVlastimil Babka * 4476a15a370SVlastimil Babka * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, 4486a15a370SVlastimil Babka * as long as the inode doesn't go away and racy results are not a problem. 4496a15a370SVlastimil Babka */ 45048131e03SVlastimil Babka unsigned long shmem_partial_swap_usage(struct address_space *mapping, 45148131e03SVlastimil Babka pgoff_t start, pgoff_t end) 4526a15a370SVlastimil Babka { 4536a15a370SVlastimil Babka struct radix_tree_iter iter; 4546a15a370SVlastimil Babka void **slot; 4556a15a370SVlastimil Babka struct page *page; 45648131e03SVlastimil Babka unsigned long swapped = 0; 4576a15a370SVlastimil Babka 4586a15a370SVlastimil Babka rcu_read_lock(); 4596a15a370SVlastimil Babka 4606a15a370SVlastimil Babka radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 4616a15a370SVlastimil Babka if (iter.index >= end) 4626a15a370SVlastimil Babka break; 4636a15a370SVlastimil Babka 4646a15a370SVlastimil Babka page = radix_tree_deref_slot(slot); 4656a15a370SVlastimil Babka 4662cf938aaSMatthew Wilcox if (radix_tree_deref_retry(page)) { 4672cf938aaSMatthew Wilcox slot = radix_tree_iter_retry(&iter); 4682cf938aaSMatthew Wilcox continue; 4692cf938aaSMatthew Wilcox } 4706a15a370SVlastimil Babka 4716a15a370SVlastimil Babka if (radix_tree_exceptional_entry(page)) 4726a15a370SVlastimil Babka swapped++; 4736a15a370SVlastimil Babka 4746a15a370SVlastimil Babka if (need_resched()) { 4756a15a370SVlastimil Babka cond_resched_rcu(); 4767165092fSMatthew Wilcox slot = radix_tree_iter_next(&iter); 4776a15a370SVlastimil Babka } 4786a15a370SVlastimil Babka } 4796a15a370SVlastimil Babka 4806a15a370SVlastimil Babka rcu_read_unlock(); 4816a15a370SVlastimil Babka 4826a15a370SVlastimil Babka return swapped << PAGE_SHIFT; 4836a15a370SVlastimil Babka } 4846a15a370SVlastimil Babka 4856a15a370SVlastimil Babka /* 48648131e03SVlastimil Babka * Determine (in bytes) how many of the shmem object's pages mapped by the 48748131e03SVlastimil Babka * given vma is swapped out. 48848131e03SVlastimil Babka * 48948131e03SVlastimil Babka * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, 49048131e03SVlastimil Babka * as long as the inode doesn't go away and racy results are not a problem. 
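 *
 * The result is returned in bytes, i.e. pages << PAGE_SHIFT: with 4K
 * pages (illustrative), 3 swapped-out pages report as 12288.  When the
 * vma covers only part of the object, shmem_partial_swap_usage() above
 * walks just the offsets that the vma actually maps.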
49148131e03SVlastimil Babka */ 49248131e03SVlastimil Babka unsigned long shmem_swap_usage(struct vm_area_struct *vma) 49348131e03SVlastimil Babka { 49448131e03SVlastimil Babka struct inode *inode = file_inode(vma->vm_file); 49548131e03SVlastimil Babka struct shmem_inode_info *info = SHMEM_I(inode); 49648131e03SVlastimil Babka struct address_space *mapping = inode->i_mapping; 49748131e03SVlastimil Babka unsigned long swapped; 49848131e03SVlastimil Babka 49948131e03SVlastimil Babka /* Be careful as we don't hold info->lock */ 50048131e03SVlastimil Babka swapped = READ_ONCE(info->swapped); 50148131e03SVlastimil Babka 50248131e03SVlastimil Babka /* 50348131e03SVlastimil Babka * The easier cases are when the shmem object has nothing in swap, or 50448131e03SVlastimil Babka * the vma maps it whole. Then we can simply use the stats that we 50548131e03SVlastimil Babka * already track. 50648131e03SVlastimil Babka */ 50748131e03SVlastimil Babka if (!swapped) 50848131e03SVlastimil Babka return 0; 50948131e03SVlastimil Babka 51048131e03SVlastimil Babka if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) 51148131e03SVlastimil Babka return swapped << PAGE_SHIFT; 51248131e03SVlastimil Babka 51348131e03SVlastimil Babka /* Here comes the more involved part */ 51448131e03SVlastimil Babka return shmem_partial_swap_usage(mapping, 51548131e03SVlastimil Babka linear_page_index(vma, vma->vm_start), 51648131e03SVlastimil Babka linear_page_index(vma, vma->vm_end)); 51748131e03SVlastimil Babka } 51848131e03SVlastimil Babka 51948131e03SVlastimil Babka /* 52024513264SHugh Dickins * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. 52124513264SHugh Dickins */ 52224513264SHugh Dickins void shmem_unlock_mapping(struct address_space *mapping) 52324513264SHugh Dickins { 52424513264SHugh Dickins struct pagevec pvec; 52524513264SHugh Dickins pgoff_t indices[PAGEVEC_SIZE]; 52624513264SHugh Dickins pgoff_t index = 0; 52724513264SHugh Dickins 52824513264SHugh Dickins pagevec_init(&pvec, 0); 52924513264SHugh Dickins /* 53024513264SHugh Dickins * Minor point, but we might as well stop if someone else SHM_LOCKs it. 53124513264SHugh Dickins */ 53224513264SHugh Dickins while (!mapping_unevictable(mapping)) { 53324513264SHugh Dickins /* 53424513264SHugh Dickins * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it 53524513264SHugh Dickins * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 53624513264SHugh Dickins */ 5370cd6144aSJohannes Weiner pvec.nr = find_get_entries(mapping, index, 53824513264SHugh Dickins PAGEVEC_SIZE, pvec.pages, indices); 53924513264SHugh Dickins if (!pvec.nr) 54024513264SHugh Dickins break; 54124513264SHugh Dickins index = indices[pvec.nr - 1] + 1; 5420cd6144aSJohannes Weiner pagevec_remove_exceptionals(&pvec); 54324513264SHugh Dickins check_move_unevictable_pages(pvec.pages, pvec.nr); 54424513264SHugh Dickins pagevec_release(&pvec); 54524513264SHugh Dickins cond_resched(); 54624513264SHugh Dickins } 5477a5d0fbbSHugh Dickins } 5487a5d0fbbSHugh Dickins 5497a5d0fbbSHugh Dickins /* 5507a5d0fbbSHugh Dickins * Remove range of pages and swap entries from radix tree, and free them. 5511635f6a7SHugh Dickins * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 
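 *
 * For example (illustrative, assuming 4K pages): punching the byte range
 * lstart=4096, lend=12287 gives start=1, end=3 with no partial pages,
 * whereas lstart=1000 leaves partial_start=1000, so the tail of page 0
 * is zeroed rather than freed; lend=-1 means "truncate right to the end".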
5527a5d0fbbSHugh Dickins */ 5531635f6a7SHugh Dickins static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, 5541635f6a7SHugh Dickins bool unfalloc) 5551da177e4SLinus Torvalds { 556285b2c4fSHugh Dickins struct address_space *mapping = inode->i_mapping; 5571da177e4SLinus Torvalds struct shmem_inode_info *info = SHMEM_I(inode); 55809cbfeafSKirill A. Shutemov pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; 55909cbfeafSKirill A. Shutemov pgoff_t end = (lend + 1) >> PAGE_SHIFT; 56009cbfeafSKirill A. Shutemov unsigned int partial_start = lstart & (PAGE_SIZE - 1); 56109cbfeafSKirill A. Shutemov unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); 562bda97eabSHugh Dickins struct pagevec pvec; 5637a5d0fbbSHugh Dickins pgoff_t indices[PAGEVEC_SIZE]; 5647a5d0fbbSHugh Dickins long nr_swaps_freed = 0; 565285b2c4fSHugh Dickins pgoff_t index; 566bda97eabSHugh Dickins int i; 5671da177e4SLinus Torvalds 56883e4fa9cSHugh Dickins if (lend == -1) 56983e4fa9cSHugh Dickins end = -1; /* unsigned, so actually very big */ 570bda97eabSHugh Dickins 571bda97eabSHugh Dickins pagevec_init(&pvec, 0); 572bda97eabSHugh Dickins index = start; 57383e4fa9cSHugh Dickins while (index < end) { 5740cd6144aSJohannes Weiner pvec.nr = find_get_entries(mapping, index, 57583e4fa9cSHugh Dickins min(end - index, (pgoff_t)PAGEVEC_SIZE), 5767a5d0fbbSHugh Dickins pvec.pages, indices); 5777a5d0fbbSHugh Dickins if (!pvec.nr) 5787a5d0fbbSHugh Dickins break; 579bda97eabSHugh Dickins for (i = 0; i < pagevec_count(&pvec); i++) { 580bda97eabSHugh Dickins struct page *page = pvec.pages[i]; 581bda97eabSHugh Dickins 5827a5d0fbbSHugh Dickins index = indices[i]; 58383e4fa9cSHugh Dickins if (index >= end) 584bda97eabSHugh Dickins break; 585bda97eabSHugh Dickins 5867a5d0fbbSHugh Dickins if (radix_tree_exceptional_entry(page)) { 5871635f6a7SHugh Dickins if (unfalloc) 5881635f6a7SHugh Dickins continue; 5897a5d0fbbSHugh Dickins nr_swaps_freed += !shmem_free_swap(mapping, 5907a5d0fbbSHugh Dickins index, page); 5917a5d0fbbSHugh Dickins continue; 5927a5d0fbbSHugh Dickins } 5937a5d0fbbSHugh Dickins 594bda97eabSHugh Dickins if (!trylock_page(page)) 595bda97eabSHugh Dickins continue; 5961635f6a7SHugh Dickins if (!unfalloc || !PageUptodate(page)) { 5977a5d0fbbSHugh Dickins if (page->mapping == mapping) { 598309381feSSasha Levin VM_BUG_ON_PAGE(PageWriteback(page), page); 599bda97eabSHugh Dickins truncate_inode_page(mapping, page); 6007a5d0fbbSHugh Dickins } 6011635f6a7SHugh Dickins } 602bda97eabSHugh Dickins unlock_page(page); 603bda97eabSHugh Dickins } 6040cd6144aSJohannes Weiner pagevec_remove_exceptionals(&pvec); 60524513264SHugh Dickins pagevec_release(&pvec); 606bda97eabSHugh Dickins cond_resched(); 607bda97eabSHugh Dickins index++; 608bda97eabSHugh Dickins } 609bda97eabSHugh Dickins 61083e4fa9cSHugh Dickins if (partial_start) { 611bda97eabSHugh Dickins struct page *page = NULL; 6129e18eb29SAndres Lagar-Cavilla shmem_getpage(inode, start - 1, &page, SGP_READ); 613bda97eabSHugh Dickins if (page) { 61409cbfeafSKirill A. Shutemov unsigned int top = PAGE_SIZE; 61583e4fa9cSHugh Dickins if (start > end) { 61683e4fa9cSHugh Dickins top = partial_end; 61783e4fa9cSHugh Dickins partial_end = 0; 61883e4fa9cSHugh Dickins } 61983e4fa9cSHugh Dickins zero_user_segment(page, partial_start, top); 620bda97eabSHugh Dickins set_page_dirty(page); 621bda97eabSHugh Dickins unlock_page(page); 62209cbfeafSKirill A. 
Shutemov put_page(page); 623bda97eabSHugh Dickins } 624bda97eabSHugh Dickins } 62583e4fa9cSHugh Dickins if (partial_end) { 62683e4fa9cSHugh Dickins struct page *page = NULL; 6279e18eb29SAndres Lagar-Cavilla shmem_getpage(inode, end, &page, SGP_READ); 62883e4fa9cSHugh Dickins if (page) { 62983e4fa9cSHugh Dickins zero_user_segment(page, 0, partial_end); 63083e4fa9cSHugh Dickins set_page_dirty(page); 63183e4fa9cSHugh Dickins unlock_page(page); 63209cbfeafSKirill A. Shutemov put_page(page); 63383e4fa9cSHugh Dickins } 63483e4fa9cSHugh Dickins } 63583e4fa9cSHugh Dickins if (start >= end) 63683e4fa9cSHugh Dickins return; 637bda97eabSHugh Dickins 638bda97eabSHugh Dickins index = start; 639b1a36650SHugh Dickins while (index < end) { 640bda97eabSHugh Dickins cond_resched(); 6410cd6144aSJohannes Weiner 6420cd6144aSJohannes Weiner pvec.nr = find_get_entries(mapping, index, 64383e4fa9cSHugh Dickins min(end - index, (pgoff_t)PAGEVEC_SIZE), 6447a5d0fbbSHugh Dickins pvec.pages, indices); 6457a5d0fbbSHugh Dickins if (!pvec.nr) { 646b1a36650SHugh Dickins /* If all gone or hole-punch or unfalloc, we're done */ 647b1a36650SHugh Dickins if (index == start || end != -1) 648bda97eabSHugh Dickins break; 649b1a36650SHugh Dickins /* But if truncating, restart to make sure all gone */ 650bda97eabSHugh Dickins index = start; 651bda97eabSHugh Dickins continue; 652bda97eabSHugh Dickins } 653bda97eabSHugh Dickins for (i = 0; i < pagevec_count(&pvec); i++) { 654bda97eabSHugh Dickins struct page *page = pvec.pages[i]; 655bda97eabSHugh Dickins 6567a5d0fbbSHugh Dickins index = indices[i]; 65783e4fa9cSHugh Dickins if (index >= end) 658bda97eabSHugh Dickins break; 659bda97eabSHugh Dickins 6607a5d0fbbSHugh Dickins if (radix_tree_exceptional_entry(page)) { 6611635f6a7SHugh Dickins if (unfalloc) 6621635f6a7SHugh Dickins continue; 663b1a36650SHugh Dickins if (shmem_free_swap(mapping, index, page)) { 664b1a36650SHugh Dickins /* Swap was replaced by page: retry */ 665b1a36650SHugh Dickins index--; 666b1a36650SHugh Dickins break; 667b1a36650SHugh Dickins } 668b1a36650SHugh Dickins nr_swaps_freed++; 6697a5d0fbbSHugh Dickins continue; 6707a5d0fbbSHugh Dickins } 6717a5d0fbbSHugh Dickins 672bda97eabSHugh Dickins lock_page(page); 6731635f6a7SHugh Dickins if (!unfalloc || !PageUptodate(page)) { 6747a5d0fbbSHugh Dickins if (page->mapping == mapping) { 675309381feSSasha Levin VM_BUG_ON_PAGE(PageWriteback(page), page); 676bda97eabSHugh Dickins truncate_inode_page(mapping, page); 677b1a36650SHugh Dickins } else { 678b1a36650SHugh Dickins /* Page was replaced by swap: retry */ 679b1a36650SHugh Dickins unlock_page(page); 680b1a36650SHugh Dickins index--; 681b1a36650SHugh Dickins break; 6827a5d0fbbSHugh Dickins } 6831635f6a7SHugh Dickins } 684bda97eabSHugh Dickins unlock_page(page); 685bda97eabSHugh Dickins } 6860cd6144aSJohannes Weiner pagevec_remove_exceptionals(&pvec); 68724513264SHugh Dickins pagevec_release(&pvec); 688bda97eabSHugh Dickins index++; 689bda97eabSHugh Dickins } 69094c1e62dSHugh Dickins 6911da177e4SLinus Torvalds spin_lock(&info->lock); 6927a5d0fbbSHugh Dickins info->swapped -= nr_swaps_freed; 6931da177e4SLinus Torvalds shmem_recalc_inode(inode); 6941da177e4SLinus Torvalds spin_unlock(&info->lock); 6951635f6a7SHugh Dickins } 6961da177e4SLinus Torvalds 6971635f6a7SHugh Dickins void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 6981635f6a7SHugh Dickins { 6991635f6a7SHugh Dickins shmem_undo_range(inode, lstart, lend, false); 700285b2c4fSHugh Dickins inode->i_ctime = inode->i_mtime = CURRENT_TIME; 
7011da177e4SLinus Torvalds } 70294c1e62dSHugh Dickins EXPORT_SYMBOL_GPL(shmem_truncate_range); 7031da177e4SLinus Torvalds 70444a30220SYu Zhao static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, 70544a30220SYu Zhao struct kstat *stat) 70644a30220SYu Zhao { 70744a30220SYu Zhao struct inode *inode = dentry->d_inode; 70844a30220SYu Zhao struct shmem_inode_info *info = SHMEM_I(inode); 70944a30220SYu Zhao 710d0424c42SHugh Dickins if (info->alloced - info->swapped != inode->i_mapping->nrpages) { 71144a30220SYu Zhao spin_lock(&info->lock); 71244a30220SYu Zhao shmem_recalc_inode(inode); 71344a30220SYu Zhao spin_unlock(&info->lock); 714d0424c42SHugh Dickins } 71544a30220SYu Zhao generic_fillattr(inode, stat); 71644a30220SYu Zhao return 0; 71744a30220SYu Zhao } 71844a30220SYu Zhao 71994c1e62dSHugh Dickins static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 7201da177e4SLinus Torvalds { 72175c3cfa8SDavid Howells struct inode *inode = d_inode(dentry); 72240e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 7231da177e4SLinus Torvalds int error; 7241da177e4SLinus Torvalds 725db78b877SChristoph Hellwig error = inode_change_ok(inode, attr); 726db78b877SChristoph Hellwig if (error) 727db78b877SChristoph Hellwig return error; 728db78b877SChristoph Hellwig 72994c1e62dSHugh Dickins if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 73094c1e62dSHugh Dickins loff_t oldsize = inode->i_size; 73194c1e62dSHugh Dickins loff_t newsize = attr->ia_size; 7323889e6e7Snpiggin@suse.de 73340e041a2SDavid Herrmann /* protected by i_mutex */ 73440e041a2SDavid Herrmann if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || 73540e041a2SDavid Herrmann (newsize > oldsize && (info->seals & F_SEAL_GROW))) 73640e041a2SDavid Herrmann return -EPERM; 73740e041a2SDavid Herrmann 73894c1e62dSHugh Dickins if (newsize != oldsize) { 73977142517SKonstantin Khlebnikov error = shmem_reacct_size(SHMEM_I(inode)->flags, 74077142517SKonstantin Khlebnikov oldsize, newsize); 74177142517SKonstantin Khlebnikov if (error) 74277142517SKonstantin Khlebnikov return error; 74394c1e62dSHugh Dickins i_size_write(inode, newsize); 74494c1e62dSHugh Dickins inode->i_ctime = inode->i_mtime = CURRENT_TIME; 74594c1e62dSHugh Dickins } 746afa2db2fSJosef Bacik if (newsize <= oldsize) { 74794c1e62dSHugh Dickins loff_t holebegin = round_up(newsize, PAGE_SIZE); 748d0424c42SHugh Dickins if (oldsize > holebegin) 749d0424c42SHugh Dickins unmap_mapping_range(inode->i_mapping, 750d0424c42SHugh Dickins holebegin, 0, 1); 751d0424c42SHugh Dickins if (info->alloced) 752d0424c42SHugh Dickins shmem_truncate_range(inode, 753d0424c42SHugh Dickins newsize, (loff_t)-1); 75494c1e62dSHugh Dickins /* unmap again to remove racily COWed private pages */ 755d0424c42SHugh Dickins if (oldsize > holebegin) 756d0424c42SHugh Dickins unmap_mapping_range(inode->i_mapping, 757d0424c42SHugh Dickins holebegin, 0, 1); 75894c1e62dSHugh Dickins } 7591da177e4SLinus Torvalds } 7601da177e4SLinus Torvalds 7616a1a90adSChristoph Hellwig setattr_copy(inode, attr); 762db78b877SChristoph Hellwig if (attr->ia_valid & ATTR_MODE) 763feda821eSChristoph Hellwig error = posix_acl_chmod(inode, inode->i_mode); 7641da177e4SLinus Torvalds return error; 7651da177e4SLinus Torvalds } 7661da177e4SLinus Torvalds 7671f895f75SAl Viro static void shmem_evict_inode(struct inode *inode) 7681da177e4SLinus Torvalds { 7691da177e4SLinus Torvalds struct shmem_inode_info *info = SHMEM_I(inode); 7701da177e4SLinus Torvalds 7713889e6e7Snpiggin@suse.de if 
(inode->i_mapping->a_ops == &shmem_aops) { 7721da177e4SLinus Torvalds shmem_unacct_size(info->flags, inode->i_size); 7731da177e4SLinus Torvalds inode->i_size = 0; 7743889e6e7Snpiggin@suse.de shmem_truncate_range(inode, 0, (loff_t)-1); 7751da177e4SLinus Torvalds if (!list_empty(&info->swaplist)) { 776cb5f7b9aSHugh Dickins mutex_lock(&shmem_swaplist_mutex); 7771da177e4SLinus Torvalds list_del_init(&info->swaplist); 778cb5f7b9aSHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 7791da177e4SLinus Torvalds } 7803ed47db3SAl Viro } 781b09e0fa4SEric Paris 78238f38657SAristeu Rozanski simple_xattrs_free(&info->xattrs); 7830f3c42f5SHugh Dickins WARN_ON(inode->i_blocks); 7845b04c689SPavel Emelyanov shmem_free_inode(inode->i_sb); 785dbd5768fSJan Kara clear_inode(inode); 7861da177e4SLinus Torvalds } 7871da177e4SLinus Torvalds 78846f65ec1SHugh Dickins /* 78946f65ec1SHugh Dickins * If swap found in inode, free it and move page from swapcache to filecache. 79046f65ec1SHugh Dickins */ 79141ffe5d5SHugh Dickins static int shmem_unuse_inode(struct shmem_inode_info *info, 792bde05d1cSHugh Dickins swp_entry_t swap, struct page **pagep) 7931da177e4SLinus Torvalds { 794285b2c4fSHugh Dickins struct address_space *mapping = info->vfs_inode.i_mapping; 79546f65ec1SHugh Dickins void *radswap; 79641ffe5d5SHugh Dickins pgoff_t index; 797bde05d1cSHugh Dickins gfp_t gfp; 798bde05d1cSHugh Dickins int error = 0; 7991da177e4SLinus Torvalds 80046f65ec1SHugh Dickins radswap = swp_to_radix_entry(swap); 801e504f3fdSHugh Dickins index = radix_tree_locate_item(&mapping->page_tree, radswap); 80246f65ec1SHugh Dickins if (index == -1) 80300501b53SJohannes Weiner return -EAGAIN; /* tell shmem_unuse we found nothing */ 8042e0e26c7SHugh Dickins 8051b1b32f2SHugh Dickins /* 8061b1b32f2SHugh Dickins * Move _head_ to start search for next from here. 8071f895f75SAl Viro * But be careful: shmem_evict_inode checks list_empty without taking 8081b1b32f2SHugh Dickins * mutex, and there's an instant in list_move_tail when info->swaplist 809285b2c4fSHugh Dickins * would appear empty, if it were the only one on shmem_swaplist. 8101b1b32f2SHugh Dickins */ 8111b1b32f2SHugh Dickins if (shmem_swaplist.next != &info->swaplist) 8122e0e26c7SHugh Dickins list_move_tail(&shmem_swaplist, &info->swaplist); 8132e0e26c7SHugh Dickins 814bde05d1cSHugh Dickins gfp = mapping_gfp_mask(mapping); 815bde05d1cSHugh Dickins if (shmem_should_replace_page(*pagep, gfp)) { 816bde05d1cSHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 817bde05d1cSHugh Dickins error = shmem_replace_page(pagep, gfp, info, index); 818bde05d1cSHugh Dickins mutex_lock(&shmem_swaplist_mutex); 819bde05d1cSHugh Dickins /* 820bde05d1cSHugh Dickins * We needed to drop mutex to make that restrictive page 8210142ef6cSHugh Dickins * allocation, but the inode might have been freed while we 8220142ef6cSHugh Dickins * dropped it: although a racing shmem_evict_inode() cannot 8230142ef6cSHugh Dickins * complete without emptying the radix_tree, our page lock 8240142ef6cSHugh Dickins * on this swapcache page is not enough to prevent that - 8250142ef6cSHugh Dickins * free_swap_and_cache() of our swap entry will only 8260142ef6cSHugh Dickins * trylock_page(), removing swap from radix_tree whatever. 8270142ef6cSHugh Dickins * 8280142ef6cSHugh Dickins * We must not proceed to shmem_add_to_page_cache() if the 8290142ef6cSHugh Dickins * inode has been freed, but of course we cannot rely on 8300142ef6cSHugh Dickins * inode or mapping or info to check that. 
However, we can 8310142ef6cSHugh Dickins * safely check if our swap entry is still in use (and here 8320142ef6cSHugh Dickins * it can't have got reused for another page): if it's still 8330142ef6cSHugh Dickins * in use, then the inode cannot have been freed yet, and we 8340142ef6cSHugh Dickins * can safely proceed (if it's no longer in use, that tells 8350142ef6cSHugh Dickins * nothing about the inode, but we don't need to unuse swap). 836bde05d1cSHugh Dickins */ 837bde05d1cSHugh Dickins if (!page_swapcount(*pagep)) 838bde05d1cSHugh Dickins error = -ENOENT; 839bde05d1cSHugh Dickins } 840bde05d1cSHugh Dickins 841d13d1443SKAMEZAWA Hiroyuki /* 842778dd893SHugh Dickins * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 843778dd893SHugh Dickins * but also to hold up shmem_evict_inode(): so inode cannot be freed 844778dd893SHugh Dickins * beneath us (pagelock doesn't help until the page is in pagecache). 845d13d1443SKAMEZAWA Hiroyuki */ 846bde05d1cSHugh Dickins if (!error) 847bde05d1cSHugh Dickins error = shmem_add_to_page_cache(*pagep, mapping, index, 848fed400a1SWang Sheng-Hui radswap); 84948f170fbSHugh Dickins if (error != -ENOMEM) { 85046f65ec1SHugh Dickins /* 85146f65ec1SHugh Dickins * Truncation and eviction use free_swap_and_cache(), which 85246f65ec1SHugh Dickins * only does trylock page: if we raced, best clean up here. 85346f65ec1SHugh Dickins */ 854bde05d1cSHugh Dickins delete_from_swap_cache(*pagep); 855bde05d1cSHugh Dickins set_page_dirty(*pagep); 85646f65ec1SHugh Dickins if (!error) { 85746f65ec1SHugh Dickins spin_lock(&info->lock); 858285b2c4fSHugh Dickins info->swapped--; 85946f65ec1SHugh Dickins spin_unlock(&info->lock); 86041ffe5d5SHugh Dickins swap_free(swap); 86146f65ec1SHugh Dickins } 8621da177e4SLinus Torvalds } 8632e0e26c7SHugh Dickins return error; 8641da177e4SLinus Torvalds } 8651da177e4SLinus Torvalds 8661da177e4SLinus Torvalds /* 86746f65ec1SHugh Dickins * Search through swapped inodes to find and replace swap by page. 8681da177e4SLinus Torvalds */ 86941ffe5d5SHugh Dickins int shmem_unuse(swp_entry_t swap, struct page *page) 8701da177e4SLinus Torvalds { 87141ffe5d5SHugh Dickins struct list_head *this, *next; 8721da177e4SLinus Torvalds struct shmem_inode_info *info; 87300501b53SJohannes Weiner struct mem_cgroup *memcg; 874bde05d1cSHugh Dickins int error = 0; 875bde05d1cSHugh Dickins 876bde05d1cSHugh Dickins /* 877bde05d1cSHugh Dickins * There's a faint possibility that swap page was replaced before 8780142ef6cSHugh Dickins * caller locked it: caller will come back later with the right page. 879bde05d1cSHugh Dickins */ 8800142ef6cSHugh Dickins if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) 881bde05d1cSHugh Dickins goto out; 882778dd893SHugh Dickins 883778dd893SHugh Dickins /* 884778dd893SHugh Dickins * Charge page using GFP_KERNEL while we can wait, before taking 885778dd893SHugh Dickins * the shmem_swaplist_mutex which might hold up shmem_writepage(). 886778dd893SHugh Dickins * Charged back to the user (not to caller) when swap account is used. 887778dd893SHugh Dickins */ 888f627c2f5SKirill A. Shutemov error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, 889f627c2f5SKirill A. 
Shutemov false); 890778dd893SHugh Dickins if (error) 891778dd893SHugh Dickins goto out; 89246f65ec1SHugh Dickins /* No radix_tree_preload: swap entry keeps a place for page in tree */ 89300501b53SJohannes Weiner error = -EAGAIN; 8941da177e4SLinus Torvalds 895cb5f7b9aSHugh Dickins mutex_lock(&shmem_swaplist_mutex); 89641ffe5d5SHugh Dickins list_for_each_safe(this, next, &shmem_swaplist) { 89741ffe5d5SHugh Dickins info = list_entry(this, struct shmem_inode_info, swaplist); 898285b2c4fSHugh Dickins if (info->swapped) 89900501b53SJohannes Weiner error = shmem_unuse_inode(info, swap, &page); 9006922c0c7SHugh Dickins else 9016922c0c7SHugh Dickins list_del_init(&info->swaplist); 902cb5f7b9aSHugh Dickins cond_resched(); 90300501b53SJohannes Weiner if (error != -EAGAIN) 904778dd893SHugh Dickins break; 90500501b53SJohannes Weiner /* found nothing in this: move on to search the next */ 9061da177e4SLinus Torvalds } 907cb5f7b9aSHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 908778dd893SHugh Dickins 90900501b53SJohannes Weiner if (error) { 91000501b53SJohannes Weiner if (error != -ENOMEM) 91100501b53SJohannes Weiner error = 0; 912f627c2f5SKirill A. Shutemov mem_cgroup_cancel_charge(page, memcg, false); 91300501b53SJohannes Weiner } else 914f627c2f5SKirill A. Shutemov mem_cgroup_commit_charge(page, memcg, true, false); 915778dd893SHugh Dickins out: 916aaa46865SHugh Dickins unlock_page(page); 91709cbfeafSKirill A. Shutemov put_page(page); 918778dd893SHugh Dickins return error; 9191da177e4SLinus Torvalds } 9201da177e4SLinus Torvalds 9211da177e4SLinus Torvalds /* 9221da177e4SLinus Torvalds * Move the page from the page cache to the swap cache. 9231da177e4SLinus Torvalds */ 9241da177e4SLinus Torvalds static int shmem_writepage(struct page *page, struct writeback_control *wbc) 9251da177e4SLinus Torvalds { 9261da177e4SLinus Torvalds struct shmem_inode_info *info; 9271da177e4SLinus Torvalds struct address_space *mapping; 9281da177e4SLinus Torvalds struct inode *inode; 9296922c0c7SHugh Dickins swp_entry_t swap; 9306922c0c7SHugh Dickins pgoff_t index; 9311da177e4SLinus Torvalds 9321da177e4SLinus Torvalds BUG_ON(!PageLocked(page)); 9331da177e4SLinus Torvalds mapping = page->mapping; 9341da177e4SLinus Torvalds index = page->index; 9351da177e4SLinus Torvalds inode = mapping->host; 9361da177e4SLinus Torvalds info = SHMEM_I(inode); 9371da177e4SLinus Torvalds if (info->flags & VM_LOCKED) 9381da177e4SLinus Torvalds goto redirty; 939d9fe526aSHugh Dickins if (!total_swap_pages) 9401da177e4SLinus Torvalds goto redirty; 9411da177e4SLinus Torvalds 942d9fe526aSHugh Dickins /* 94397b713baSChristoph Hellwig * Our capabilities prevent regular writeback or sync from ever calling 94497b713baSChristoph Hellwig * shmem_writepage; but a stacking filesystem might use ->writepage of 94597b713baSChristoph Hellwig * its underlying filesystem, in which case tmpfs should write out to 94697b713baSChristoph Hellwig * swap only in response to memory pressure, and not for the writeback 94797b713baSChristoph Hellwig * threads or sync. 948d9fe526aSHugh Dickins */ 94948f170fbSHugh Dickins if (!wbc->for_reclaim) { 95048f170fbSHugh Dickins WARN_ON_ONCE(1); /* Still happens? Tell us about it! 
*/ 95148f170fbSHugh Dickins goto redirty; 95248f170fbSHugh Dickins } 9531635f6a7SHugh Dickins 9541635f6a7SHugh Dickins /* 9551635f6a7SHugh Dickins * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC 9561635f6a7SHugh Dickins * value into swapfile.c, the only way we can correctly account for a 9571635f6a7SHugh Dickins * fallocated page arriving here is now to initialize it and write it. 9581aac1400SHugh Dickins * 9591aac1400SHugh Dickins * That's okay for a page already fallocated earlier, but if we have 9601aac1400SHugh Dickins * not yet completed the fallocation, then (a) we want to keep track 9611aac1400SHugh Dickins * of this page in case we have to undo it, and (b) it may not be a 9621aac1400SHugh Dickins * good idea to continue anyway, once we're pushing into swap. So 9631aac1400SHugh Dickins * reactivate the page, and let shmem_fallocate() quit when too many. 9641635f6a7SHugh Dickins */ 9651635f6a7SHugh Dickins if (!PageUptodate(page)) { 9661aac1400SHugh Dickins if (inode->i_private) { 9671aac1400SHugh Dickins struct shmem_falloc *shmem_falloc; 9681aac1400SHugh Dickins spin_lock(&inode->i_lock); 9691aac1400SHugh Dickins shmem_falloc = inode->i_private; 9701aac1400SHugh Dickins if (shmem_falloc && 9718e205f77SHugh Dickins !shmem_falloc->waitq && 9721aac1400SHugh Dickins index >= shmem_falloc->start && 9731aac1400SHugh Dickins index < shmem_falloc->next) 9741aac1400SHugh Dickins shmem_falloc->nr_unswapped++; 9751aac1400SHugh Dickins else 9761aac1400SHugh Dickins shmem_falloc = NULL; 9771aac1400SHugh Dickins spin_unlock(&inode->i_lock); 9781aac1400SHugh Dickins if (shmem_falloc) 9791aac1400SHugh Dickins goto redirty; 9801aac1400SHugh Dickins } 9811635f6a7SHugh Dickins clear_highpage(page); 9821635f6a7SHugh Dickins flush_dcache_page(page); 9831635f6a7SHugh Dickins SetPageUptodate(page); 9841635f6a7SHugh Dickins } 9851635f6a7SHugh Dickins 986d9fe526aSHugh Dickins swap = get_swap_page(); 98748f170fbSHugh Dickins if (!swap.val) 98848f170fbSHugh Dickins goto redirty; 989d9fe526aSHugh Dickins 99037e84351SVladimir Davydov if (mem_cgroup_try_charge_swap(page, swap)) 99137e84351SVladimir Davydov goto free_swap; 99237e84351SVladimir Davydov 993b1dea800SHugh Dickins /* 994b1dea800SHugh Dickins * Add inode to shmem_unuse()'s list of swapped-out inodes, 9956922c0c7SHugh Dickins * if it's not already there. Do it now before the page is 9966922c0c7SHugh Dickins * moved to swap cache, when its pagelock no longer protects 997b1dea800SHugh Dickins * the inode from eviction. But don't unlock the mutex until 9986922c0c7SHugh Dickins * we've incremented swapped, because shmem_unuse_inode() will 9996922c0c7SHugh Dickins * prune a !swapped inode from the swaplist under this mutex. 
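 *
 * For illustration (simplified): if info->swapped were only incremented
 * after this mutex was dropped, a concurrent swapoff walking the list,
 * as in shmem_unuse() above,
 *
 *	if (info->swapped)
 *		error = shmem_unuse_inode(info, swap, &page);
 *	else
 *		list_del_init(&info->swaplist);
 *
 * could observe swapped == 0 and delete the entry we have just added,
 * leaving this inode invisible to swapoff even though it now has a
 * page heading out to swap.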
1000b1dea800SHugh Dickins */ 1001b1dea800SHugh Dickins mutex_lock(&shmem_swaplist_mutex); 100205bf86b4SHugh Dickins if (list_empty(&info->swaplist)) 100305bf86b4SHugh Dickins list_add_tail(&info->swaplist, &shmem_swaplist); 1004b1dea800SHugh Dickins 100548f170fbSHugh Dickins if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1006267a4c76SHugh Dickins spin_lock(&info->lock); 1007267a4c76SHugh Dickins shmem_recalc_inode(inode); 1008267a4c76SHugh Dickins info->swapped++; 1009267a4c76SHugh Dickins spin_unlock(&info->lock); 1010267a4c76SHugh Dickins 1011aaa46865SHugh Dickins swap_shmem_alloc(swap); 10126922c0c7SHugh Dickins shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); 10136922c0c7SHugh Dickins 10146922c0c7SHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 1015d9fe526aSHugh Dickins BUG_ON(page_mapped(page)); 10169fab5619SHugh Dickins swap_writepage(page, wbc); 10171da177e4SLinus Torvalds return 0; 10181da177e4SLinus Torvalds } 10191da177e4SLinus Torvalds 10206922c0c7SHugh Dickins mutex_unlock(&shmem_swaplist_mutex); 102137e84351SVladimir Davydov free_swap: 10220a31bc97SJohannes Weiner swapcache_free(swap); 10231da177e4SLinus Torvalds redirty: 10241da177e4SLinus Torvalds set_page_dirty(page); 1025d9fe526aSHugh Dickins if (wbc->for_reclaim) 1026d9fe526aSHugh Dickins return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ 1027d9fe526aSHugh Dickins unlock_page(page); 1028d9fe526aSHugh Dickins return 0; 10291da177e4SLinus Torvalds } 10301da177e4SLinus Torvalds 103175edd345SHugh Dickins #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) 103271fe804bSLee Schermerhorn static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 1033680d794bSakpm@linux-foundation.org { 1034680d794bSakpm@linux-foundation.org char buffer[64]; 1035680d794bSakpm@linux-foundation.org 103671fe804bSLee Schermerhorn if (!mpol || mpol->mode == MPOL_DEFAULT) 1037095f1fc4SLee Schermerhorn return; /* show nothing */ 1038095f1fc4SLee Schermerhorn 1039a7a88b23SHugh Dickins mpol_to_str(buffer, sizeof(buffer), mpol); 1040095f1fc4SLee Schermerhorn 1041095f1fc4SLee Schermerhorn seq_printf(seq, ",mpol=%s", buffer); 1042680d794bSakpm@linux-foundation.org } 104371fe804bSLee Schermerhorn 104471fe804bSLee Schermerhorn static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 104571fe804bSLee Schermerhorn { 104671fe804bSLee Schermerhorn struct mempolicy *mpol = NULL; 104771fe804bSLee Schermerhorn if (sbinfo->mpol) { 104871fe804bSLee Schermerhorn spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 104971fe804bSLee Schermerhorn mpol = sbinfo->mpol; 105071fe804bSLee Schermerhorn mpol_get(mpol); 105171fe804bSLee Schermerhorn spin_unlock(&sbinfo->stat_lock); 105271fe804bSLee Schermerhorn } 105371fe804bSLee Schermerhorn return mpol; 105471fe804bSLee Schermerhorn } 105575edd345SHugh Dickins #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ 105675edd345SHugh Dickins static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 105775edd345SHugh Dickins { 105875edd345SHugh Dickins } 105975edd345SHugh Dickins static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 106075edd345SHugh Dickins { 106175edd345SHugh Dickins return NULL; 106275edd345SHugh Dickins } 106375edd345SHugh Dickins #endif /* CONFIG_NUMA && CONFIG_TMPFS */ 106475edd345SHugh Dickins #ifndef CONFIG_NUMA 106575edd345SHugh Dickins #define vm_policy vm_private_data 106675edd345SHugh Dickins #endif 1067680d794bSakpm@linux-foundation.org 106841ffe5d5SHugh Dickins static struct page 
*shmem_swapin(swp_entry_t swap, gfp_t gfp, 106941ffe5d5SHugh Dickins struct shmem_inode_info *info, pgoff_t index) 10701da177e4SLinus Torvalds { 10711da177e4SLinus Torvalds struct vm_area_struct pvma; 107218a2f371SMel Gorman struct page *page; 10731da177e4SLinus Torvalds 1074c4cc6d07SHugh Dickins /* Create a pseudo vma that just contains the policy */ 1075c4cc6d07SHugh Dickins pvma.vm_start = 0; 107609c231cbSNathan Zimmer /* Bias interleave by inode number to distribute better across nodes */ 107709c231cbSNathan Zimmer pvma.vm_pgoff = index + info->vfs_inode.i_ino; 1078c4cc6d07SHugh Dickins pvma.vm_ops = NULL; 107941ffe5d5SHugh Dickins pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 108052cd3b07SLee Schermerhorn 108118a2f371SMel Gorman page = swapin_readahead(swap, gfp, &pvma, 0); 108218a2f371SMel Gorman 108318a2f371SMel Gorman /* Drop reference taken by mpol_shared_policy_lookup() */ 108418a2f371SMel Gorman mpol_cond_put(pvma.vm_policy); 108518a2f371SMel Gorman 108618a2f371SMel Gorman return page; 108718a2f371SMel Gorman } 108818a2f371SMel Gorman 108918a2f371SMel Gorman static struct page *shmem_alloc_page(gfp_t gfp, 109018a2f371SMel Gorman struct shmem_inode_info *info, pgoff_t index) 109118a2f371SMel Gorman { 109218a2f371SMel Gorman struct vm_area_struct pvma; 109318a2f371SMel Gorman struct page *page; 109418a2f371SMel Gorman 109518a2f371SMel Gorman /* Create a pseudo vma that just contains the policy */ 109618a2f371SMel Gorman pvma.vm_start = 0; 109718a2f371SMel Gorman /* Bias interleave by inode number to distribute better across nodes */ 109818a2f371SMel Gorman pvma.vm_pgoff = index + info->vfs_inode.i_ino; 109918a2f371SMel Gorman pvma.vm_ops = NULL; 110018a2f371SMel Gorman pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 110118a2f371SMel Gorman 110275edd345SHugh Dickins page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); 110375edd345SHugh Dickins if (page) { 110475edd345SHugh Dickins __SetPageLocked(page); 110575edd345SHugh Dickins __SetPageSwapBacked(page); 110675edd345SHugh Dickins } 110718a2f371SMel Gorman 110818a2f371SMel Gorman /* Drop reference taken by mpol_shared_policy_lookup() */ 110918a2f371SMel Gorman mpol_cond_put(pvma.vm_policy); 111018a2f371SMel Gorman 111118a2f371SMel Gorman return page; 11121da177e4SLinus Torvalds } 111371fe804bSLee Schermerhorn 11141da177e4SLinus Torvalds /* 1115bde05d1cSHugh Dickins * When a page is moved from swapcache to shmem filecache (either by the 1116bde05d1cSHugh Dickins * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of 1117bde05d1cSHugh Dickins * shmem_unuse_inode()), it may have been read in earlier from swap, in 1118bde05d1cSHugh Dickins * ignorance of the mapping it belongs to. If that mapping has special 1119bde05d1cSHugh Dickins * constraints (like the gma500 GEM driver, which requires RAM below 4GB), 1120bde05d1cSHugh Dickins * we may need to copy to a suitable page before moving to filecache. 1121bde05d1cSHugh Dickins * 1122bde05d1cSHugh Dickins * In a future release, this may well be extended to respect cpuset and 1123bde05d1cSHugh Dickins * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); 1124bde05d1cSHugh Dickins * but for now it is a simple matter of zone. 
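 *
 * For example (illustrative): if the mapping's gfp mask allows only
 * ZONE_DMA32 but the page read in from swap sits in ZONE_NORMAL, then
 * page_zonenum(page) > gfp_zone(gfp) below is true, and
 * shmem_replace_page() copies the data into a newly allocated page
 * which does satisfy the mapping's constraints.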
1125bde05d1cSHugh Dickins */ 1126bde05d1cSHugh Dickins static bool shmem_should_replace_page(struct page *page, gfp_t gfp) 1127bde05d1cSHugh Dickins { 1128bde05d1cSHugh Dickins return page_zonenum(page) > gfp_zone(gfp); 1129bde05d1cSHugh Dickins } 1130bde05d1cSHugh Dickins 1131bde05d1cSHugh Dickins static int shmem_replace_page(struct page **pagep, gfp_t gfp, 1132bde05d1cSHugh Dickins struct shmem_inode_info *info, pgoff_t index) 1133bde05d1cSHugh Dickins { 1134bde05d1cSHugh Dickins struct page *oldpage, *newpage; 1135bde05d1cSHugh Dickins struct address_space *swap_mapping; 1136bde05d1cSHugh Dickins pgoff_t swap_index; 1137bde05d1cSHugh Dickins int error; 1138bde05d1cSHugh Dickins 1139bde05d1cSHugh Dickins oldpage = *pagep; 1140bde05d1cSHugh Dickins swap_index = page_private(oldpage); 1141bde05d1cSHugh Dickins swap_mapping = page_mapping(oldpage); 1142bde05d1cSHugh Dickins 1143bde05d1cSHugh Dickins /* 1144bde05d1cSHugh Dickins * We have arrived here because our zones are constrained, so don't 1145bde05d1cSHugh Dickins * limit chance of success by further cpuset and node constraints. 1146bde05d1cSHugh Dickins */ 1147bde05d1cSHugh Dickins gfp &= ~GFP_CONSTRAINT_MASK; 1148bde05d1cSHugh Dickins newpage = shmem_alloc_page(gfp, info, index); 1149bde05d1cSHugh Dickins if (!newpage) 1150bde05d1cSHugh Dickins return -ENOMEM; 1151bde05d1cSHugh Dickins 115209cbfeafSKirill A. Shutemov get_page(newpage); 1153bde05d1cSHugh Dickins copy_highpage(newpage, oldpage); 11540142ef6cSHugh Dickins flush_dcache_page(newpage); 1155bde05d1cSHugh Dickins 1156bde05d1cSHugh Dickins SetPageUptodate(newpage); 1157bde05d1cSHugh Dickins set_page_private(newpage, swap_index); 1158bde05d1cSHugh Dickins SetPageSwapCache(newpage); 1159bde05d1cSHugh Dickins 1160bde05d1cSHugh Dickins /* 1161bde05d1cSHugh Dickins * Our caller will very soon move newpage out of swapcache, but it's 1162bde05d1cSHugh Dickins * a nice clean interface for us to replace oldpage by newpage there. 1163bde05d1cSHugh Dickins */ 1164bde05d1cSHugh Dickins spin_lock_irq(&swap_mapping->tree_lock); 1165bde05d1cSHugh Dickins error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, 1166bde05d1cSHugh Dickins newpage); 11670142ef6cSHugh Dickins if (!error) { 1168bde05d1cSHugh Dickins __inc_zone_page_state(newpage, NR_FILE_PAGES); 1169bde05d1cSHugh Dickins __dec_zone_page_state(oldpage, NR_FILE_PAGES); 11700142ef6cSHugh Dickins } 1171bde05d1cSHugh Dickins spin_unlock_irq(&swap_mapping->tree_lock); 1172bde05d1cSHugh Dickins 11730142ef6cSHugh Dickins if (unlikely(error)) { 11740142ef6cSHugh Dickins /* 11750142ef6cSHugh Dickins * Is this possible? I think not, now that our callers check 11760142ef6cSHugh Dickins * both PageSwapCache and page_private after getting page lock; 11770142ef6cSHugh Dickins * but be defensive. Reverse old to newpage for clear and free. 11780142ef6cSHugh Dickins */ 11790142ef6cSHugh Dickins oldpage = newpage; 11800142ef6cSHugh Dickins } else { 11816a93ca8fSJohannes Weiner mem_cgroup_migrate(oldpage, newpage); 1182bde05d1cSHugh Dickins lru_cache_add_anon(newpage); 11830142ef6cSHugh Dickins *pagep = newpage; 11840142ef6cSHugh Dickins } 1185bde05d1cSHugh Dickins 1186bde05d1cSHugh Dickins ClearPageSwapCache(oldpage); 1187bde05d1cSHugh Dickins set_page_private(oldpage, 0); 1188bde05d1cSHugh Dickins 1189bde05d1cSHugh Dickins unlock_page(oldpage); 119009cbfeafSKirill A. Shutemov put_page(oldpage); 119109cbfeafSKirill A. 
Shutemov put_page(oldpage); 11920142ef6cSHugh Dickins return error; 1193bde05d1cSHugh Dickins } 1194bde05d1cSHugh Dickins 1195bde05d1cSHugh Dickins /* 119668da9f05SHugh Dickins * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 11971da177e4SLinus Torvalds * 11981da177e4SLinus Torvalds * If we allocate a new one we do not mark it dirty. That's up to the 11991da177e4SLinus Torvalds * vm. If we swap it in we mark it dirty since we also free the swap 12009e18eb29SAndres Lagar-Cavilla * entry since a page cannot live in both the swap and page cache. 12019e18eb29SAndres Lagar-Cavilla * 12029e18eb29SAndres Lagar-Cavilla * fault_mm and fault_type are only supplied by shmem_fault: 12039e18eb29SAndres Lagar-Cavilla * otherwise they are NULL. 12041da177e4SLinus Torvalds */ 120541ffe5d5SHugh Dickins static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 12069e18eb29SAndres Lagar-Cavilla struct page **pagep, enum sgp_type sgp, gfp_t gfp, 12079e18eb29SAndres Lagar-Cavilla struct mm_struct *fault_mm, int *fault_type) 12081da177e4SLinus Torvalds { 12091da177e4SLinus Torvalds struct address_space *mapping = inode->i_mapping; 121054af6042SHugh Dickins struct shmem_inode_info *info; 12111da177e4SLinus Torvalds struct shmem_sb_info *sbinfo; 12129e18eb29SAndres Lagar-Cavilla struct mm_struct *charge_mm; 121300501b53SJohannes Weiner struct mem_cgroup *memcg; 121427ab7006SHugh Dickins struct page *page; 12151da177e4SLinus Torvalds swp_entry_t swap; 12161da177e4SLinus Torvalds int error; 121754af6042SHugh Dickins int once = 0; 12181635f6a7SHugh Dickins int alloced = 0; 12191da177e4SLinus Torvalds 122009cbfeafSKirill A. Shutemov if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) 12211da177e4SLinus Torvalds return -EFBIG; 12221da177e4SLinus Torvalds repeat: 122354af6042SHugh Dickins swap.val = 0; 12240cd6144aSJohannes Weiner page = find_lock_entry(mapping, index); 122554af6042SHugh Dickins if (radix_tree_exceptional_entry(page)) { 122654af6042SHugh Dickins swap = radix_to_swp_entry(page); 122754af6042SHugh Dickins page = NULL; 122854af6042SHugh Dickins } 122954af6042SHugh Dickins 123075edd345SHugh Dickins if (sgp <= SGP_CACHE && 123109cbfeafSKirill A. Shutemov ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 123254af6042SHugh Dickins error = -EINVAL; 1233267a4c76SHugh Dickins goto unlock; 123454af6042SHugh Dickins } 123554af6042SHugh Dickins 123666d2f4d2SHugh Dickins if (page && sgp == SGP_WRITE) 123766d2f4d2SHugh Dickins mark_page_accessed(page); 123866d2f4d2SHugh Dickins 12391635f6a7SHugh Dickins /* fallocated page? */ 12401635f6a7SHugh Dickins if (page && !PageUptodate(page)) { 12411635f6a7SHugh Dickins if (sgp != SGP_READ) 12421635f6a7SHugh Dickins goto clear; 12431635f6a7SHugh Dickins unlock_page(page); 124409cbfeafSKirill A. Shutemov put_page(page); 12451635f6a7SHugh Dickins page = NULL; 12461635f6a7SHugh Dickins } 124754af6042SHugh Dickins if (page || (sgp == SGP_READ && !swap.val)) { 124854af6042SHugh Dickins *pagep = page; 124954af6042SHugh Dickins return 0; 125027ab7006SHugh Dickins } 125127ab7006SHugh Dickins 1252b409f9fcSHugh Dickins /* 125354af6042SHugh Dickins * Fast cache lookup did not find it: 125454af6042SHugh Dickins * bring it back from swap or allocate. 1255b409f9fcSHugh Dickins */ 125654af6042SHugh Dickins info = SHMEM_I(inode); 125754af6042SHugh Dickins sbinfo = SHMEM_SB(inode->i_sb); 12589e18eb29SAndres Lagar-Cavilla charge_mm = fault_mm ? 
: current->mm; 125927ab7006SHugh Dickins 12601da177e4SLinus Torvalds if (swap.val) { 12611da177e4SLinus Torvalds /* Look it up and read it in.. */ 126227ab7006SHugh Dickins page = lookup_swap_cache(swap); 126327ab7006SHugh Dickins if (!page) { 12649e18eb29SAndres Lagar-Cavilla /* Or update major stats only when swapin succeeds?? */ 12659e18eb29SAndres Lagar-Cavilla if (fault_type) { 126668da9f05SHugh Dickins *fault_type |= VM_FAULT_MAJOR; 12679e18eb29SAndres Lagar-Cavilla count_vm_event(PGMAJFAULT); 12689e18eb29SAndres Lagar-Cavilla mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT); 12699e18eb29SAndres Lagar-Cavilla } 12709e18eb29SAndres Lagar-Cavilla /* Here we actually start the io */ 127141ffe5d5SHugh Dickins page = shmem_swapin(swap, gfp, info, index); 127227ab7006SHugh Dickins if (!page) { 12731da177e4SLinus Torvalds error = -ENOMEM; 127454af6042SHugh Dickins goto failed; 1275285b2c4fSHugh Dickins } 12761da177e4SLinus Torvalds } 12771da177e4SLinus Torvalds 12781da177e4SLinus Torvalds /* We have to do this with page locked to prevent races */ 127954af6042SHugh Dickins lock_page(page); 12800142ef6cSHugh Dickins if (!PageSwapCache(page) || page_private(page) != swap.val || 1281d1899228SHugh Dickins !shmem_confirm_swap(mapping, index, swap)) { 1282bde05d1cSHugh Dickins error = -EEXIST; /* try again */ 1283d1899228SHugh Dickins goto unlock; 1284bde05d1cSHugh Dickins } 128527ab7006SHugh Dickins if (!PageUptodate(page)) { 12861da177e4SLinus Torvalds error = -EIO; 128754af6042SHugh Dickins goto failed; 128854af6042SHugh Dickins } 128954af6042SHugh Dickins wait_on_page_writeback(page); 129054af6042SHugh Dickins 1291bde05d1cSHugh Dickins if (shmem_should_replace_page(page, gfp)) { 1292bde05d1cSHugh Dickins error = shmem_replace_page(&page, gfp, info, index); 1293bde05d1cSHugh Dickins if (error) 129454af6042SHugh Dickins goto failed; 12951da177e4SLinus Torvalds } 12961da177e4SLinus Torvalds 12979e18eb29SAndres Lagar-Cavilla error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, 1298f627c2f5SKirill A. Shutemov false); 1299d1899228SHugh Dickins if (!error) { 130054af6042SHugh Dickins error = shmem_add_to_page_cache(page, mapping, index, 1301fed400a1SWang Sheng-Hui swp_to_radix_entry(swap)); 1302215c02bcSHugh Dickins /* 1303215c02bcSHugh Dickins * We already confirmed swap under page lock, and make 1304215c02bcSHugh Dickins * no memory allocation here, so usually no possibility 1305215c02bcSHugh Dickins * of error; but free_swap_and_cache() only trylocks a 1306215c02bcSHugh Dickins * page, so it is just possible that the entry has been 1307215c02bcSHugh Dickins * truncated or holepunched since swap was confirmed. 1308215c02bcSHugh Dickins * shmem_undo_range() will have done some of the 1309215c02bcSHugh Dickins * unaccounting, now delete_from_swap_cache() will do 131093aa7d95SVladimir Davydov * the rest. 1311215c02bcSHugh Dickins * Reset swap.val? No, leave it so "failed" goes back to 1312215c02bcSHugh Dickins * "repeat": reading a hole and writing should succeed. 1313215c02bcSHugh Dickins */ 131400501b53SJohannes Weiner if (error) { 1315f627c2f5SKirill A. Shutemov mem_cgroup_cancel_charge(page, memcg, false); 1316215c02bcSHugh Dickins delete_from_swap_cache(page); 1317d1899228SHugh Dickins } 131800501b53SJohannes Weiner } 131954af6042SHugh Dickins if (error) 132054af6042SHugh Dickins goto failed; 132154af6042SHugh Dickins 1322f627c2f5SKirill A. 
Shutemov mem_cgroup_commit_charge(page, memcg, true, false); 132300501b53SJohannes Weiner 132454af6042SHugh Dickins spin_lock(&info->lock); 132554af6042SHugh Dickins info->swapped--; 132654af6042SHugh Dickins shmem_recalc_inode(inode); 13271da177e4SLinus Torvalds spin_unlock(&info->lock); 132827ab7006SHugh Dickins 132966d2f4d2SHugh Dickins if (sgp == SGP_WRITE) 133066d2f4d2SHugh Dickins mark_page_accessed(page); 133166d2f4d2SHugh Dickins 133227ab7006SHugh Dickins delete_from_swap_cache(page); 133327ab7006SHugh Dickins set_page_dirty(page); 133427ab7006SHugh Dickins swap_free(swap); 133527ab7006SHugh Dickins 133654af6042SHugh Dickins } else { 133754af6042SHugh Dickins if (shmem_acct_block(info->flags)) { 133854af6042SHugh Dickins error = -ENOSPC; 133954af6042SHugh Dickins goto failed; 13401da177e4SLinus Torvalds } 13410edd73b3SHugh Dickins if (sbinfo->max_blocks) { 1342fc5da22aSHugh Dickins if (percpu_counter_compare(&sbinfo->used_blocks, 134354af6042SHugh Dickins sbinfo->max_blocks) >= 0) { 134454af6042SHugh Dickins error = -ENOSPC; 134554af6042SHugh Dickins goto unacct; 134654af6042SHugh Dickins } 13477e496299STim Chen percpu_counter_inc(&sbinfo->used_blocks); 134859a16eadSHugh Dickins } 13491da177e4SLinus Torvalds 135054af6042SHugh Dickins page = shmem_alloc_page(gfp, info, index); 135154af6042SHugh Dickins if (!page) { 135254af6042SHugh Dickins error = -ENOMEM; 135354af6042SHugh Dickins goto decused; 135454af6042SHugh Dickins } 135566d2f4d2SHugh Dickins if (sgp == SGP_WRITE) 1356eb39d618SHugh Dickins __SetPageReferenced(page); 135766d2f4d2SHugh Dickins 13589e18eb29SAndres Lagar-Cavilla error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, 1359f627c2f5SKirill A. Shutemov false); 136054af6042SHugh Dickins if (error) 136154af6042SHugh Dickins goto decused; 13625e4c0d97SJan Kara error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 1363b065b432SHugh Dickins if (!error) { 1364b065b432SHugh Dickins error = shmem_add_to_page_cache(page, mapping, index, 1365fed400a1SWang Sheng-Hui NULL); 1366b065b432SHugh Dickins radix_tree_preload_end(); 1367b065b432SHugh Dickins } 1368b065b432SHugh Dickins if (error) { 1369f627c2f5SKirill A. Shutemov mem_cgroup_cancel_charge(page, memcg, false); 1370b065b432SHugh Dickins goto decused; 1371b065b432SHugh Dickins } 1372f627c2f5SKirill A. Shutemov mem_cgroup_commit_charge(page, memcg, false, false); 137354af6042SHugh Dickins lru_cache_add_anon(page); 137454af6042SHugh Dickins 137554af6042SHugh Dickins spin_lock(&info->lock); 13761da177e4SLinus Torvalds info->alloced++; 137754af6042SHugh Dickins inode->i_blocks += BLOCKS_PER_PAGE; 137854af6042SHugh Dickins shmem_recalc_inode(inode); 137959a16eadSHugh Dickins spin_unlock(&info->lock); 13801635f6a7SHugh Dickins alloced = true; 138154af6042SHugh Dickins 1382ec9516fbSHugh Dickins /* 13831635f6a7SHugh Dickins * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 13841635f6a7SHugh Dickins */ 13851635f6a7SHugh Dickins if (sgp == SGP_FALLOC) 13861635f6a7SHugh Dickins sgp = SGP_WRITE; 13871635f6a7SHugh Dickins clear: 13881635f6a7SHugh Dickins /* 13891635f6a7SHugh Dickins * Let SGP_WRITE caller clear ends if write does not fill page; 13901635f6a7SHugh Dickins * but SGP_FALLOC on a page fallocated earlier must initialize 13911635f6a7SHugh Dickins * it now, lest undo on failure cancel our earlier guarantee. 
1392ec9516fbSHugh Dickins */ 1393ec9516fbSHugh Dickins if (sgp != SGP_WRITE) { 139427ab7006SHugh Dickins clear_highpage(page); 139527ab7006SHugh Dickins flush_dcache_page(page); 139627ab7006SHugh Dickins SetPageUptodate(page); 1397ec9516fbSHugh Dickins } 13981da177e4SLinus Torvalds } 1399bde05d1cSHugh Dickins 140054af6042SHugh Dickins /* Perhaps the file has been truncated since we checked */ 140175edd345SHugh Dickins if (sgp <= SGP_CACHE && 140209cbfeafSKirill A. Shutemov ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 1403267a4c76SHugh Dickins if (alloced) { 1404267a4c76SHugh Dickins ClearPageDirty(page); 1405267a4c76SHugh Dickins delete_from_page_cache(page); 1406267a4c76SHugh Dickins spin_lock(&info->lock); 1407267a4c76SHugh Dickins shmem_recalc_inode(inode); 1408267a4c76SHugh Dickins spin_unlock(&info->lock); 1409267a4c76SHugh Dickins } 141054af6042SHugh Dickins error = -EINVAL; 1411267a4c76SHugh Dickins goto unlock; 1412ff36b801SShaohua Li } 141354af6042SHugh Dickins *pagep = page; 141454af6042SHugh Dickins return 0; 1415d00806b1SNick Piggin 1416d0217ac0SNick Piggin /* 141754af6042SHugh Dickins * Error recovery. 14181da177e4SLinus Torvalds */ 141954af6042SHugh Dickins decused: 142054af6042SHugh Dickins if (sbinfo->max_blocks) 142154af6042SHugh Dickins percpu_counter_add(&sbinfo->used_blocks, -1); 142254af6042SHugh Dickins unacct: 142354af6042SHugh Dickins shmem_unacct_blocks(info->flags, 1); 142454af6042SHugh Dickins failed: 1425267a4c76SHugh Dickins if (swap.val && !shmem_confirm_swap(mapping, index, swap)) 142654af6042SHugh Dickins error = -EEXIST; 1427d1899228SHugh Dickins unlock: 142827ab7006SHugh Dickins if (page) { 142954af6042SHugh Dickins unlock_page(page); 143009cbfeafSKirill A. Shutemov put_page(page); 143154af6042SHugh Dickins } 143254af6042SHugh Dickins if (error == -ENOSPC && !once++) { 143354af6042SHugh Dickins info = SHMEM_I(inode); 143454af6042SHugh Dickins spin_lock(&info->lock); 143554af6042SHugh Dickins shmem_recalc_inode(inode); 143654af6042SHugh Dickins spin_unlock(&info->lock); 14371da177e4SLinus Torvalds goto repeat; 1438d8dc74f2SAdrian Bunk } 1439d1899228SHugh Dickins if (error == -EEXIST) /* from above or from radix_tree_insert */ 144054af6042SHugh Dickins goto repeat; 144154af6042SHugh Dickins return error; 14421da177e4SLinus Torvalds } 14431da177e4SLinus Torvalds 14441da177e4SLinus Torvalds static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 14451da177e4SLinus Torvalds { 1446496ad9aaSAl Viro struct inode *inode = file_inode(vma->vm_file); 14479e18eb29SAndres Lagar-Cavilla gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 14481da177e4SLinus Torvalds int error; 144968da9f05SHugh Dickins int ret = VM_FAULT_LOCKED; 14501da177e4SLinus Torvalds 1451f00cdc6dSHugh Dickins /* 1452f00cdc6dSHugh Dickins * Trinity finds that probing a hole which tmpfs is punching can 1453f00cdc6dSHugh Dickins * prevent the hole-punch from ever completing: which in turn 1454f00cdc6dSHugh Dickins * locks writers out with its hold on i_mutex. So refrain from 14558e205f77SHugh Dickins * faulting pages into the hole while it's being punched. Although 14568e205f77SHugh Dickins * shmem_undo_range() does remove the additions, it may be unable to 14578e205f77SHugh Dickins * keep up, as each new page needs its own unmap_mapping_range() call, 14588e205f77SHugh Dickins * and the i_mmap tree grows ever slower to scan if new vmas are added. 
14598e205f77SHugh Dickins * 14608e205f77SHugh Dickins * It does not matter if we sometimes reach this check just before the 14618e205f77SHugh Dickins * hole-punch begins, so that one fault then races with the punch: 14628e205f77SHugh Dickins * we just need to make racing faults a rare case. 14638e205f77SHugh Dickins * 14648e205f77SHugh Dickins * The implementation below would be much simpler if we just used a 14658e205f77SHugh Dickins * standard mutex or completion: but we cannot take i_mutex in fault, 14668e205f77SHugh Dickins * and bloating every shmem inode for this unlikely case would be sad. 1467f00cdc6dSHugh Dickins */ 1468f00cdc6dSHugh Dickins if (unlikely(inode->i_private)) { 1469f00cdc6dSHugh Dickins struct shmem_falloc *shmem_falloc; 1470f00cdc6dSHugh Dickins 1471f00cdc6dSHugh Dickins spin_lock(&inode->i_lock); 1472f00cdc6dSHugh Dickins shmem_falloc = inode->i_private; 14738e205f77SHugh Dickins if (shmem_falloc && 14748e205f77SHugh Dickins shmem_falloc->waitq && 14758e205f77SHugh Dickins vmf->pgoff >= shmem_falloc->start && 14768e205f77SHugh Dickins vmf->pgoff < shmem_falloc->next) { 14778e205f77SHugh Dickins wait_queue_head_t *shmem_falloc_waitq; 14788e205f77SHugh Dickins DEFINE_WAIT(shmem_fault_wait); 14798e205f77SHugh Dickins 14808e205f77SHugh Dickins ret = VM_FAULT_NOPAGE; 1481f00cdc6dSHugh Dickins if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && 1482f00cdc6dSHugh Dickins !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { 14838e205f77SHugh Dickins /* It's polite to up mmap_sem if we can */ 1484f00cdc6dSHugh Dickins up_read(&vma->vm_mm->mmap_sem); 14858e205f77SHugh Dickins ret = VM_FAULT_RETRY; 1486f00cdc6dSHugh Dickins } 14878e205f77SHugh Dickins 14888e205f77SHugh Dickins shmem_falloc_waitq = shmem_falloc->waitq; 14898e205f77SHugh Dickins prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 14908e205f77SHugh Dickins TASK_UNINTERRUPTIBLE); 14918e205f77SHugh Dickins spin_unlock(&inode->i_lock); 14928e205f77SHugh Dickins schedule(); 14938e205f77SHugh Dickins 14948e205f77SHugh Dickins /* 14958e205f77SHugh Dickins * shmem_falloc_waitq points into the shmem_fallocate() 14968e205f77SHugh Dickins * stack of the hole-punching task: shmem_falloc_waitq 14978e205f77SHugh Dickins * is usually invalid by the time we reach here, but 14988e205f77SHugh Dickins * finish_wait() does not dereference it in that case; 14998e205f77SHugh Dickins * though i_lock needed lest racing with wake_up_all(). 15008e205f77SHugh Dickins */ 15018e205f77SHugh Dickins spin_lock(&inode->i_lock); 15028e205f77SHugh Dickins finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 15038e205f77SHugh Dickins spin_unlock(&inode->i_lock); 15048e205f77SHugh Dickins return ret; 1505f00cdc6dSHugh Dickins } 15068e205f77SHugh Dickins spin_unlock(&inode->i_lock); 1507f00cdc6dSHugh Dickins } 1508f00cdc6dSHugh Dickins 15099e18eb29SAndres Lagar-Cavilla error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, 15109e18eb29SAndres Lagar-Cavilla gfp, vma->vm_mm, &ret); 15111da177e4SLinus Torvalds if (error) 15121da177e4SLinus Torvalds return ((error == -ENOMEM) ? 
VM_FAULT_OOM : VM_FAULT_SIGBUS); 151368da9f05SHugh Dickins return ret; 15141da177e4SLinus Torvalds } 15151da177e4SLinus Torvalds 1516*c01d5b30SHugh Dickins unsigned long shmem_get_unmapped_area(struct file *file, 1517*c01d5b30SHugh Dickins unsigned long uaddr, unsigned long len, 1518*c01d5b30SHugh Dickins unsigned long pgoff, unsigned long flags) 1519*c01d5b30SHugh Dickins { 1520*c01d5b30SHugh Dickins unsigned long (*get_area)(struct file *, 1521*c01d5b30SHugh Dickins unsigned long, unsigned long, unsigned long, unsigned long); 1522*c01d5b30SHugh Dickins unsigned long addr; 1523*c01d5b30SHugh Dickins unsigned long offset; 1524*c01d5b30SHugh Dickins unsigned long inflated_len; 1525*c01d5b30SHugh Dickins unsigned long inflated_addr; 1526*c01d5b30SHugh Dickins unsigned long inflated_offset; 1527*c01d5b30SHugh Dickins 1528*c01d5b30SHugh Dickins if (len > TASK_SIZE) 1529*c01d5b30SHugh Dickins return -ENOMEM; 1530*c01d5b30SHugh Dickins 1531*c01d5b30SHugh Dickins get_area = current->mm->get_unmapped_area; 1532*c01d5b30SHugh Dickins addr = get_area(file, uaddr, len, pgoff, flags); 1533*c01d5b30SHugh Dickins 1534*c01d5b30SHugh Dickins if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 1535*c01d5b30SHugh Dickins return addr; 1536*c01d5b30SHugh Dickins if (IS_ERR_VALUE(addr)) 1537*c01d5b30SHugh Dickins return addr; 1538*c01d5b30SHugh Dickins if (addr & ~PAGE_MASK) 1539*c01d5b30SHugh Dickins return addr; 1540*c01d5b30SHugh Dickins if (addr > TASK_SIZE - len) 1541*c01d5b30SHugh Dickins return addr; 1542*c01d5b30SHugh Dickins 1543*c01d5b30SHugh Dickins if (shmem_huge == SHMEM_HUGE_DENY) 1544*c01d5b30SHugh Dickins return addr; 1545*c01d5b30SHugh Dickins if (len < HPAGE_PMD_SIZE) 1546*c01d5b30SHugh Dickins return addr; 1547*c01d5b30SHugh Dickins if (flags & MAP_FIXED) 1548*c01d5b30SHugh Dickins return addr; 1549*c01d5b30SHugh Dickins /* 1550*c01d5b30SHugh Dickins * Our priority is to support MAP_SHARED mapped hugely; 1551*c01d5b30SHugh Dickins * and support MAP_PRIVATE mapped hugely too, until it is COWed. 1552*c01d5b30SHugh Dickins * But if caller specified an address hint, respect that as before. 1553*c01d5b30SHugh Dickins */ 1554*c01d5b30SHugh Dickins if (uaddr) 1555*c01d5b30SHugh Dickins return addr; 1556*c01d5b30SHugh Dickins 1557*c01d5b30SHugh Dickins if (shmem_huge != SHMEM_HUGE_FORCE) { 1558*c01d5b30SHugh Dickins struct super_block *sb; 1559*c01d5b30SHugh Dickins 1560*c01d5b30SHugh Dickins if (file) { 1561*c01d5b30SHugh Dickins VM_BUG_ON(file->f_op != &shmem_file_operations); 1562*c01d5b30SHugh Dickins sb = file_inode(file)->i_sb; 1563*c01d5b30SHugh Dickins } else { 1564*c01d5b30SHugh Dickins /* 1565*c01d5b30SHugh Dickins * Called directly from mm/mmap.c, or drivers/char/mem.c 1566*c01d5b30SHugh Dickins * for "/dev/zero", to create a shared anonymous object. 
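 *
 * For orientation, a minimal userspace sketch of such a shared anonymous
 * object (illustrative only; MAP_ANONYMOUS|MAP_SHARED is typically backed
 * by an inode on this internal mount via shmem_zero_setup()):
 *
 *	#include <sys/mman.h>
 *	#include <sys/wait.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			      MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 *		if (p == MAP_FAILED)
 *			return 1;
 *		if (fork() == 0) {
 *			*p = 42;			/* child writes...           */
 *			_exit(0);
 *		}
 *		wait(NULL);
 *		return *p == 42 ? 0 : 1;	/* ...parent sees the store  */
 *	}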
1567*c01d5b30SHugh Dickins */ 1568*c01d5b30SHugh Dickins if (IS_ERR(shm_mnt)) 1569*c01d5b30SHugh Dickins return addr; 1570*c01d5b30SHugh Dickins sb = shm_mnt->mnt_sb; 1571*c01d5b30SHugh Dickins } 1572*c01d5b30SHugh Dickins if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) 1573*c01d5b30SHugh Dickins return addr; 1574*c01d5b30SHugh Dickins } 1575*c01d5b30SHugh Dickins 1576*c01d5b30SHugh Dickins offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); 1577*c01d5b30SHugh Dickins if (offset && offset + len < 2 * HPAGE_PMD_SIZE) 1578*c01d5b30SHugh Dickins return addr; 1579*c01d5b30SHugh Dickins if ((addr & (HPAGE_PMD_SIZE-1)) == offset) 1580*c01d5b30SHugh Dickins return addr; 1581*c01d5b30SHugh Dickins 1582*c01d5b30SHugh Dickins inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; 1583*c01d5b30SHugh Dickins if (inflated_len > TASK_SIZE) 1584*c01d5b30SHugh Dickins return addr; 1585*c01d5b30SHugh Dickins if (inflated_len < len) 1586*c01d5b30SHugh Dickins return addr; 1587*c01d5b30SHugh Dickins 1588*c01d5b30SHugh Dickins inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); 1589*c01d5b30SHugh Dickins if (IS_ERR_VALUE(inflated_addr)) 1590*c01d5b30SHugh Dickins return addr; 1591*c01d5b30SHugh Dickins if (inflated_addr & ~PAGE_MASK) 1592*c01d5b30SHugh Dickins return addr; 1593*c01d5b30SHugh Dickins 1594*c01d5b30SHugh Dickins inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); 1595*c01d5b30SHugh Dickins inflated_addr += offset - inflated_offset; 1596*c01d5b30SHugh Dickins if (inflated_offset > offset) 1597*c01d5b30SHugh Dickins inflated_addr += HPAGE_PMD_SIZE; 1598*c01d5b30SHugh Dickins 1599*c01d5b30SHugh Dickins if (inflated_addr > TASK_SIZE - len) 1600*c01d5b30SHugh Dickins return addr; 1601*c01d5b30SHugh Dickins return inflated_addr; 1602*c01d5b30SHugh Dickins } 1603*c01d5b30SHugh Dickins 16041da177e4SLinus Torvalds #ifdef CONFIG_NUMA 160541ffe5d5SHugh Dickins static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 16061da177e4SLinus Torvalds { 1607496ad9aaSAl Viro struct inode *inode = file_inode(vma->vm_file); 160841ffe5d5SHugh Dickins return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 16091da177e4SLinus Torvalds } 16101da177e4SLinus Torvalds 1611d8dc74f2SAdrian Bunk static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1612d8dc74f2SAdrian Bunk unsigned long addr) 16131da177e4SLinus Torvalds { 1614496ad9aaSAl Viro struct inode *inode = file_inode(vma->vm_file); 161541ffe5d5SHugh Dickins pgoff_t index; 16161da177e4SLinus Torvalds 161741ffe5d5SHugh Dickins index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 161841ffe5d5SHugh Dickins return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 16191da177e4SLinus Torvalds } 16201da177e4SLinus Torvalds #endif 16211da177e4SLinus Torvalds 16221da177e4SLinus Torvalds int shmem_lock(struct file *file, int lock, struct user_struct *user) 16231da177e4SLinus Torvalds { 1624496ad9aaSAl Viro struct inode *inode = file_inode(file); 16251da177e4SLinus Torvalds struct shmem_inode_info *info = SHMEM_I(inode); 16261da177e4SLinus Torvalds int retval = -ENOMEM; 16271da177e4SLinus Torvalds 16281da177e4SLinus Torvalds spin_lock(&info->lock); 16291da177e4SLinus Torvalds if (lock && !(info->flags & VM_LOCKED)) { 16301da177e4SLinus Torvalds if (!user_shm_lock(inode->i_size, user)) 16311da177e4SLinus Torvalds goto out_nomem; 16321da177e4SLinus Torvalds info->flags |= VM_LOCKED; 163389e004eaSLee Schermerhorn mapping_set_unevictable(file->f_mapping); 16341da177e4SLinus Torvalds } 16351da177e4SLinus Torvalds if 
(!lock && (info->flags & VM_LOCKED) && user) { 16361da177e4SLinus Torvalds user_shm_unlock(inode->i_size, user); 16371da177e4SLinus Torvalds info->flags &= ~VM_LOCKED; 163889e004eaSLee Schermerhorn mapping_clear_unevictable(file->f_mapping); 16391da177e4SLinus Torvalds } 16401da177e4SLinus Torvalds retval = 0; 164189e004eaSLee Schermerhorn 16421da177e4SLinus Torvalds out_nomem: 16431da177e4SLinus Torvalds spin_unlock(&info->lock); 16441da177e4SLinus Torvalds return retval; 16451da177e4SLinus Torvalds } 16461da177e4SLinus Torvalds 16479b83a6a8SAdrian Bunk static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 16481da177e4SLinus Torvalds { 16491da177e4SLinus Torvalds file_accessed(file); 16501da177e4SLinus Torvalds vma->vm_ops = &shmem_vm_ops; 16511da177e4SLinus Torvalds return 0; 16521da177e4SLinus Torvalds } 16531da177e4SLinus Torvalds 1654454abafeSDmitry Monakhov static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, 165509208d15SAl Viro umode_t mode, dev_t dev, unsigned long flags) 16561da177e4SLinus Torvalds { 16571da177e4SLinus Torvalds struct inode *inode; 16581da177e4SLinus Torvalds struct shmem_inode_info *info; 16591da177e4SLinus Torvalds struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 16601da177e4SLinus Torvalds 16615b04c689SPavel Emelyanov if (shmem_reserve_inode(sb)) 16621da177e4SLinus Torvalds return NULL; 16631da177e4SLinus Torvalds 16641da177e4SLinus Torvalds inode = new_inode(sb); 16651da177e4SLinus Torvalds if (inode) { 166685fe4025SChristoph Hellwig inode->i_ino = get_next_ino(); 1667454abafeSDmitry Monakhov inode_init_owner(inode, dir, mode); 16681da177e4SLinus Torvalds inode->i_blocks = 0; 16691da177e4SLinus Torvalds inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 167091828a40SDavid M. 
Grimes inode->i_generation = get_seconds(); 16711da177e4SLinus Torvalds info = SHMEM_I(inode); 16721da177e4SLinus Torvalds memset(info, 0, (char *)inode - (char *)info); 16731da177e4SLinus Torvalds spin_lock_init(&info->lock); 167440e041a2SDavid Herrmann info->seals = F_SEAL_SEAL; 16750b0a0806SHugh Dickins info->flags = flags & VM_NORESERVE; 16761da177e4SLinus Torvalds INIT_LIST_HEAD(&info->swaplist); 167738f38657SAristeu Rozanski simple_xattrs_init(&info->xattrs); 167872c04902SAl Viro cache_no_acl(inode); 16791da177e4SLinus Torvalds 16801da177e4SLinus Torvalds switch (mode & S_IFMT) { 16811da177e4SLinus Torvalds default: 168239f0247dSAndreas Gruenbacher inode->i_op = &shmem_special_inode_operations; 16831da177e4SLinus Torvalds init_special_inode(inode, mode, dev); 16841da177e4SLinus Torvalds break; 16851da177e4SLinus Torvalds case S_IFREG: 168614fcc23fSHugh Dickins inode->i_mapping->a_ops = &shmem_aops; 16871da177e4SLinus Torvalds inode->i_op = &shmem_inode_operations; 16881da177e4SLinus Torvalds inode->i_fop = &shmem_file_operations; 168971fe804bSLee Schermerhorn mpol_shared_policy_init(&info->policy, 169071fe804bSLee Schermerhorn shmem_get_sbmpol(sbinfo)); 16911da177e4SLinus Torvalds break; 16921da177e4SLinus Torvalds case S_IFDIR: 1693d8c76e6fSDave Hansen inc_nlink(inode); 16941da177e4SLinus Torvalds /* Some things misbehave if size == 0 on a directory */ 16951da177e4SLinus Torvalds inode->i_size = 2 * BOGO_DIRENT_SIZE; 16961da177e4SLinus Torvalds inode->i_op = &shmem_dir_inode_operations; 16971da177e4SLinus Torvalds inode->i_fop = &simple_dir_operations; 16981da177e4SLinus Torvalds break; 16991da177e4SLinus Torvalds case S_IFLNK: 17001da177e4SLinus Torvalds /* 17011da177e4SLinus Torvalds * Must not load anything in the rbtree, 17021da177e4SLinus Torvalds * mpol_free_shared_policy will not be called. 
17031da177e4SLinus Torvalds */ 170471fe804bSLee Schermerhorn mpol_shared_policy_init(&info->policy, NULL); 17051da177e4SLinus Torvalds break; 17061da177e4SLinus Torvalds } 17075b04c689SPavel Emelyanov } else 17085b04c689SPavel Emelyanov shmem_free_inode(sb); 17091da177e4SLinus Torvalds return inode; 17101da177e4SLinus Torvalds } 17111da177e4SLinus Torvalds 17120cd6144aSJohannes Weiner bool shmem_mapping(struct address_space *mapping) 17130cd6144aSJohannes Weiner { 1714f0774d88SSasha Levin if (!mapping->host) 1715f0774d88SSasha Levin return false; 1716f0774d88SSasha Levin 171797b713baSChristoph Hellwig return mapping->host->i_sb->s_op == &shmem_ops; 17180cd6144aSJohannes Weiner } 17190cd6144aSJohannes Weiner 17201da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 172192e1d5beSArjan van de Ven static const struct inode_operations shmem_symlink_inode_operations; 172269f07ec9SHugh Dickins static const struct inode_operations shmem_short_symlink_operations; 17231da177e4SLinus Torvalds 17246d9d88d0SJarkko Sakkinen #ifdef CONFIG_TMPFS_XATTR 17256d9d88d0SJarkko Sakkinen static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 17266d9d88d0SJarkko Sakkinen #else 17276d9d88d0SJarkko Sakkinen #define shmem_initxattrs NULL 17286d9d88d0SJarkko Sakkinen #endif 17296d9d88d0SJarkko Sakkinen 17301da177e4SLinus Torvalds static int 1731800d15a5SNick Piggin shmem_write_begin(struct file *file, struct address_space *mapping, 1732800d15a5SNick Piggin loff_t pos, unsigned len, unsigned flags, 1733800d15a5SNick Piggin struct page **pagep, void **fsdata) 17341da177e4SLinus Torvalds { 1735800d15a5SNick Piggin struct inode *inode = mapping->host; 173640e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 173709cbfeafSKirill A. Shutemov pgoff_t index = pos >> PAGE_SHIFT; 173840e041a2SDavid Herrmann 173940e041a2SDavid Herrmann /* i_mutex is held by caller */ 174040e041a2SDavid Herrmann if (unlikely(info->seals)) { 174140e041a2SDavid Herrmann if (info->seals & F_SEAL_WRITE) 174240e041a2SDavid Herrmann return -EPERM; 174340e041a2SDavid Herrmann if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 174440e041a2SDavid Herrmann return -EPERM; 174540e041a2SDavid Herrmann } 174640e041a2SDavid Herrmann 17479e18eb29SAndres Lagar-Cavilla return shmem_getpage(inode, index, pagep, SGP_WRITE); 1748800d15a5SNick Piggin } 1749800d15a5SNick Piggin 1750800d15a5SNick Piggin static int 1751800d15a5SNick Piggin shmem_write_end(struct file *file, struct address_space *mapping, 1752800d15a5SNick Piggin loff_t pos, unsigned len, unsigned copied, 1753800d15a5SNick Piggin struct page *page, void *fsdata) 1754800d15a5SNick Piggin { 1755800d15a5SNick Piggin struct inode *inode = mapping->host; 1756800d15a5SNick Piggin 1757800d15a5SNick Piggin if (pos + copied > inode->i_size) 1758800d15a5SNick Piggin i_size_write(inode, pos + copied); 1759800d15a5SNick Piggin 1760ec9516fbSHugh Dickins if (!PageUptodate(page)) { 176109cbfeafSKirill A. Shutemov if (copied < PAGE_SIZE) { 176209cbfeafSKirill A. Shutemov unsigned from = pos & (PAGE_SIZE - 1); 1763ec9516fbSHugh Dickins zero_user_segments(page, 0, from, 176409cbfeafSKirill A. Shutemov from + copied, PAGE_SIZE); 1765ec9516fbSHugh Dickins } 1766ec9516fbSHugh Dickins SetPageUptodate(page); 1767ec9516fbSHugh Dickins } 1768d3602444SHugh Dickins set_page_dirty(page); 17696746aff7SWu Fengguang unlock_page(page); 177009cbfeafSKirill A. 
Shutemov put_page(page); 1771d3602444SHugh Dickins 1772800d15a5SNick Piggin return copied; 17731da177e4SLinus Torvalds } 17741da177e4SLinus Torvalds 17752ba5bbedSAl Viro static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 17761da177e4SLinus Torvalds { 17776e58e79dSAl Viro struct file *file = iocb->ki_filp; 17786e58e79dSAl Viro struct inode *inode = file_inode(file); 17791da177e4SLinus Torvalds struct address_space *mapping = inode->i_mapping; 178041ffe5d5SHugh Dickins pgoff_t index; 178141ffe5d5SHugh Dickins unsigned long offset; 1782a0ee5ec5SHugh Dickins enum sgp_type sgp = SGP_READ; 1783f7c1d074SGeert Uytterhoeven int error = 0; 1784cb66a7a1SAl Viro ssize_t retval = 0; 17856e58e79dSAl Viro loff_t *ppos = &iocb->ki_pos; 1786a0ee5ec5SHugh Dickins 1787a0ee5ec5SHugh Dickins /* 1788a0ee5ec5SHugh Dickins * Might this read be for a stacking filesystem? Then when reading 1789a0ee5ec5SHugh Dickins * holes of a sparse file, we actually need to allocate those pages, 1790a0ee5ec5SHugh Dickins * and even mark them dirty, so it cannot exceed the max_blocks limit. 1791a0ee5ec5SHugh Dickins */ 1792777eda2cSAl Viro if (!iter_is_iovec(to)) 179375edd345SHugh Dickins sgp = SGP_CACHE; 17941da177e4SLinus Torvalds 179509cbfeafSKirill A. Shutemov index = *ppos >> PAGE_SHIFT; 179609cbfeafSKirill A. Shutemov offset = *ppos & ~PAGE_MASK; 17971da177e4SLinus Torvalds 17981da177e4SLinus Torvalds for (;;) { 17991da177e4SLinus Torvalds struct page *page = NULL; 180041ffe5d5SHugh Dickins pgoff_t end_index; 180141ffe5d5SHugh Dickins unsigned long nr, ret; 18021da177e4SLinus Torvalds loff_t i_size = i_size_read(inode); 18031da177e4SLinus Torvalds 180409cbfeafSKirill A. Shutemov end_index = i_size >> PAGE_SHIFT; 18051da177e4SLinus Torvalds if (index > end_index) 18061da177e4SLinus Torvalds break; 18071da177e4SLinus Torvalds if (index == end_index) { 180809cbfeafSKirill A. Shutemov nr = i_size & ~PAGE_MASK; 18091da177e4SLinus Torvalds if (nr <= offset) 18101da177e4SLinus Torvalds break; 18111da177e4SLinus Torvalds } 18121da177e4SLinus Torvalds 18139e18eb29SAndres Lagar-Cavilla error = shmem_getpage(inode, index, &page, sgp); 18146e58e79dSAl Viro if (error) { 18156e58e79dSAl Viro if (error == -EINVAL) 18166e58e79dSAl Viro error = 0; 18171da177e4SLinus Torvalds break; 18181da177e4SLinus Torvalds } 181975edd345SHugh Dickins if (page) { 182075edd345SHugh Dickins if (sgp == SGP_CACHE) 182175edd345SHugh Dickins set_page_dirty(page); 1822d3602444SHugh Dickins unlock_page(page); 182375edd345SHugh Dickins } 18241da177e4SLinus Torvalds 18251da177e4SLinus Torvalds /* 18261da177e4SLinus Torvalds * We must evaluate after, since reads (unlike writes) 18271b1dcc1bSJes Sorensen * are called without i_mutex protection against truncate 18281da177e4SLinus Torvalds */ 182909cbfeafSKirill A. Shutemov nr = PAGE_SIZE; 18301da177e4SLinus Torvalds i_size = i_size_read(inode); 183109cbfeafSKirill A. Shutemov end_index = i_size >> PAGE_SHIFT; 18321da177e4SLinus Torvalds if (index == end_index) { 183309cbfeafSKirill A. Shutemov nr = i_size & ~PAGE_MASK; 18341da177e4SLinus Torvalds if (nr <= offset) { 18351da177e4SLinus Torvalds if (page) 183609cbfeafSKirill A. 
Shutemov put_page(page); 18371da177e4SLinus Torvalds break; 18381da177e4SLinus Torvalds } 18391da177e4SLinus Torvalds } 18401da177e4SLinus Torvalds nr -= offset; 18411da177e4SLinus Torvalds 18421da177e4SLinus Torvalds if (page) { 18431da177e4SLinus Torvalds /* 18441da177e4SLinus Torvalds * If users can be writing to this page using arbitrary 18451da177e4SLinus Torvalds * virtual addresses, take care about potential aliasing 18461da177e4SLinus Torvalds * before reading the page on the kernel side. 18471da177e4SLinus Torvalds */ 18481da177e4SLinus Torvalds if (mapping_writably_mapped(mapping)) 18491da177e4SLinus Torvalds flush_dcache_page(page); 18501da177e4SLinus Torvalds /* 18511da177e4SLinus Torvalds * Mark the page accessed if we read the beginning. 18521da177e4SLinus Torvalds */ 18531da177e4SLinus Torvalds if (!offset) 18541da177e4SLinus Torvalds mark_page_accessed(page); 1855b5810039SNick Piggin } else { 18561da177e4SLinus Torvalds page = ZERO_PAGE(0); 185709cbfeafSKirill A. Shutemov get_page(page); 1858b5810039SNick Piggin } 18591da177e4SLinus Torvalds 18601da177e4SLinus Torvalds /* 18611da177e4SLinus Torvalds * Ok, we have the page, and it's up-to-date, so 18621da177e4SLinus Torvalds * now we can copy it to user space... 18631da177e4SLinus Torvalds */ 18642ba5bbedSAl Viro ret = copy_page_to_iter(page, offset, nr, to); 18656e58e79dSAl Viro retval += ret; 18661da177e4SLinus Torvalds offset += ret; 186709cbfeafSKirill A. Shutemov index += offset >> PAGE_SHIFT; 186809cbfeafSKirill A. Shutemov offset &= ~PAGE_MASK; 18691da177e4SLinus Torvalds 187009cbfeafSKirill A. Shutemov put_page(page); 18712ba5bbedSAl Viro if (!iov_iter_count(to)) 18721da177e4SLinus Torvalds break; 18736e58e79dSAl Viro if (ret < nr) { 18746e58e79dSAl Viro error = -EFAULT; 18756e58e79dSAl Viro break; 18766e58e79dSAl Viro } 18771da177e4SLinus Torvalds cond_resched(); 18781da177e4SLinus Torvalds } 18791da177e4SLinus Torvalds 188009cbfeafSKirill A. Shutemov *ppos = ((loff_t) index << PAGE_SHIFT) + offset; 18816e58e79dSAl Viro file_accessed(file); 18826e58e79dSAl Viro return retval ? 
retval : error; 18831da177e4SLinus Torvalds } 18841da177e4SLinus Torvalds 1885708e3508SHugh Dickins static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, 1886708e3508SHugh Dickins struct pipe_inode_info *pipe, size_t len, 1887708e3508SHugh Dickins unsigned int flags) 1888708e3508SHugh Dickins { 1889708e3508SHugh Dickins struct address_space *mapping = in->f_mapping; 189071f0e07aSHugh Dickins struct inode *inode = mapping->host; 1891708e3508SHugh Dickins unsigned int loff, nr_pages, req_pages; 1892708e3508SHugh Dickins struct page *pages[PIPE_DEF_BUFFERS]; 1893708e3508SHugh Dickins struct partial_page partial[PIPE_DEF_BUFFERS]; 1894708e3508SHugh Dickins struct page *page; 1895708e3508SHugh Dickins pgoff_t index, end_index; 1896708e3508SHugh Dickins loff_t isize, left; 1897708e3508SHugh Dickins int error, page_nr; 1898708e3508SHugh Dickins struct splice_pipe_desc spd = { 1899708e3508SHugh Dickins .pages = pages, 1900708e3508SHugh Dickins .partial = partial, 1901047fe360SEric Dumazet .nr_pages_max = PIPE_DEF_BUFFERS, 1902708e3508SHugh Dickins .flags = flags, 1903708e3508SHugh Dickins .ops = &page_cache_pipe_buf_ops, 1904708e3508SHugh Dickins .spd_release = spd_release_page, 1905708e3508SHugh Dickins }; 1906708e3508SHugh Dickins 190771f0e07aSHugh Dickins isize = i_size_read(inode); 1908708e3508SHugh Dickins if (unlikely(*ppos >= isize)) 1909708e3508SHugh Dickins return 0; 1910708e3508SHugh Dickins 1911708e3508SHugh Dickins left = isize - *ppos; 1912708e3508SHugh Dickins if (unlikely(left < len)) 1913708e3508SHugh Dickins len = left; 1914708e3508SHugh Dickins 1915708e3508SHugh Dickins if (splice_grow_spd(pipe, &spd)) 1916708e3508SHugh Dickins return -ENOMEM; 1917708e3508SHugh Dickins 191809cbfeafSKirill A. Shutemov index = *ppos >> PAGE_SHIFT; 191909cbfeafSKirill A. Shutemov loff = *ppos & ~PAGE_MASK; 192009cbfeafSKirill A. Shutemov req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT; 1921a786c06dSAl Viro nr_pages = min(req_pages, spd.nr_pages_max); 1922708e3508SHugh Dickins 1923708e3508SHugh Dickins spd.nr_pages = find_get_pages_contig(mapping, index, 1924708e3508SHugh Dickins nr_pages, spd.pages); 1925708e3508SHugh Dickins index += spd.nr_pages; 1926708e3508SHugh Dickins error = 0; 192771f0e07aSHugh Dickins 1928708e3508SHugh Dickins while (spd.nr_pages < nr_pages) { 19299e18eb29SAndres Lagar-Cavilla error = shmem_getpage(inode, index, &page, SGP_CACHE); 193071f0e07aSHugh Dickins if (error) 1931708e3508SHugh Dickins break; 1932708e3508SHugh Dickins unlock_page(page); 1933708e3508SHugh Dickins spd.pages[spd.nr_pages++] = page; 1934708e3508SHugh Dickins index++; 1935708e3508SHugh Dickins } 1936708e3508SHugh Dickins 193709cbfeafSKirill A. Shutemov index = *ppos >> PAGE_SHIFT; 1938708e3508SHugh Dickins nr_pages = spd.nr_pages; 1939708e3508SHugh Dickins spd.nr_pages = 0; 194071f0e07aSHugh Dickins 1941708e3508SHugh Dickins for (page_nr = 0; page_nr < nr_pages; page_nr++) { 1942708e3508SHugh Dickins unsigned int this_len; 1943708e3508SHugh Dickins 1944708e3508SHugh Dickins if (!len) 1945708e3508SHugh Dickins break; 1946708e3508SHugh Dickins 194709cbfeafSKirill A. 
Shutemov this_len = min_t(unsigned long, len, PAGE_SIZE - loff); 1948708e3508SHugh Dickins page = spd.pages[page_nr]; 1949708e3508SHugh Dickins 195071f0e07aSHugh Dickins if (!PageUptodate(page) || page->mapping != mapping) { 19519e18eb29SAndres Lagar-Cavilla error = shmem_getpage(inode, index, &page, SGP_CACHE); 195271f0e07aSHugh Dickins if (error) 1953708e3508SHugh Dickins break; 195471f0e07aSHugh Dickins unlock_page(page); 195509cbfeafSKirill A. Shutemov put_page(spd.pages[page_nr]); 1956708e3508SHugh Dickins spd.pages[page_nr] = page; 1957708e3508SHugh Dickins } 1958708e3508SHugh Dickins 195971f0e07aSHugh Dickins isize = i_size_read(inode); 196009cbfeafSKirill A. Shutemov end_index = (isize - 1) >> PAGE_SHIFT; 1961708e3508SHugh Dickins if (unlikely(!isize || index > end_index)) 1962708e3508SHugh Dickins break; 1963708e3508SHugh Dickins 1964708e3508SHugh Dickins if (end_index == index) { 1965708e3508SHugh Dickins unsigned int plen; 1966708e3508SHugh Dickins 196709cbfeafSKirill A. Shutemov plen = ((isize - 1) & ~PAGE_MASK) + 1; 1968708e3508SHugh Dickins if (plen <= loff) 1969708e3508SHugh Dickins break; 1970708e3508SHugh Dickins 1971708e3508SHugh Dickins this_len = min(this_len, plen - loff); 1972708e3508SHugh Dickins len = this_len; 1973708e3508SHugh Dickins } 1974708e3508SHugh Dickins 1975708e3508SHugh Dickins spd.partial[page_nr].offset = loff; 1976708e3508SHugh Dickins spd.partial[page_nr].len = this_len; 1977708e3508SHugh Dickins len -= this_len; 1978708e3508SHugh Dickins loff = 0; 1979708e3508SHugh Dickins spd.nr_pages++; 1980708e3508SHugh Dickins index++; 1981708e3508SHugh Dickins } 1982708e3508SHugh Dickins 1983708e3508SHugh Dickins while (page_nr < nr_pages) 198409cbfeafSKirill A. Shutemov put_page(spd.pages[page_nr++]); 1985708e3508SHugh Dickins 1986708e3508SHugh Dickins if (spd.nr_pages) 1987708e3508SHugh Dickins error = splice_to_pipe(pipe, &spd); 1988708e3508SHugh Dickins 1989047fe360SEric Dumazet splice_shrink_spd(&spd); 1990708e3508SHugh Dickins 1991708e3508SHugh Dickins if (error > 0) { 1992708e3508SHugh Dickins *ppos += error; 1993708e3508SHugh Dickins file_accessed(in); 1994708e3508SHugh Dickins } 1995708e3508SHugh Dickins return error; 1996708e3508SHugh Dickins } 1997708e3508SHugh Dickins 1998220f2ac9SHugh Dickins /* 1999220f2ac9SHugh Dickins * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 
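 *
 * Userspace reaches this through lseek(2); a small sketch (assumes
 * _GNU_SOURCE for SEEK_HOLE/SEEK_DATA and a file on tmpfs, e.g. under
 * /dev/shm):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/dev/shm/sparse", O_RDWR|O_CREAT|O_TRUNC, 0600);
 *		ftruncate(fd, 1 << 20);			/* 1MB, all hole     */
 *		pwrite(fd, "x", 1, 512 << 10);		/* one page of data  */
 *		printf("data at %lld\n", (long long)lseek(fd, 0, SEEK_DATA));
 *		printf("hole at %lld\n",
 *		       (long long)lseek(fd, 512 << 10, SEEK_HOLE));
 *		close(fd);
 *		return 0;
 *	}
 *
 * SEEK_DATA from 0 lands on the page holding offset 512k; SEEK_HOLE from
 * 512k lands on the next (unwritten) page; an offset at or beyond i_size
 * fails with ENXIO, as shmem_file_llseek() below arranges.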
2000220f2ac9SHugh Dickins */ 2001220f2ac9SHugh Dickins static pgoff_t shmem_seek_hole_data(struct address_space *mapping, 2002965c8e59SAndrew Morton pgoff_t index, pgoff_t end, int whence) 2003220f2ac9SHugh Dickins { 2004220f2ac9SHugh Dickins struct page *page; 2005220f2ac9SHugh Dickins struct pagevec pvec; 2006220f2ac9SHugh Dickins pgoff_t indices[PAGEVEC_SIZE]; 2007220f2ac9SHugh Dickins bool done = false; 2008220f2ac9SHugh Dickins int i; 2009220f2ac9SHugh Dickins 2010220f2ac9SHugh Dickins pagevec_init(&pvec, 0); 2011220f2ac9SHugh Dickins pvec.nr = 1; /* start small: we may be there already */ 2012220f2ac9SHugh Dickins while (!done) { 20130cd6144aSJohannes Weiner pvec.nr = find_get_entries(mapping, index, 2014220f2ac9SHugh Dickins pvec.nr, pvec.pages, indices); 2015220f2ac9SHugh Dickins if (!pvec.nr) { 2016965c8e59SAndrew Morton if (whence == SEEK_DATA) 2017220f2ac9SHugh Dickins index = end; 2018220f2ac9SHugh Dickins break; 2019220f2ac9SHugh Dickins } 2020220f2ac9SHugh Dickins for (i = 0; i < pvec.nr; i++, index++) { 2021220f2ac9SHugh Dickins if (index < indices[i]) { 2022965c8e59SAndrew Morton if (whence == SEEK_HOLE) { 2023220f2ac9SHugh Dickins done = true; 2024220f2ac9SHugh Dickins break; 2025220f2ac9SHugh Dickins } 2026220f2ac9SHugh Dickins index = indices[i]; 2027220f2ac9SHugh Dickins } 2028220f2ac9SHugh Dickins page = pvec.pages[i]; 2029220f2ac9SHugh Dickins if (page && !radix_tree_exceptional_entry(page)) { 2030220f2ac9SHugh Dickins if (!PageUptodate(page)) 2031220f2ac9SHugh Dickins page = NULL; 2032220f2ac9SHugh Dickins } 2033220f2ac9SHugh Dickins if (index >= end || 2034965c8e59SAndrew Morton (page && whence == SEEK_DATA) || 2035965c8e59SAndrew Morton (!page && whence == SEEK_HOLE)) { 2036220f2ac9SHugh Dickins done = true; 2037220f2ac9SHugh Dickins break; 2038220f2ac9SHugh Dickins } 2039220f2ac9SHugh Dickins } 20400cd6144aSJohannes Weiner pagevec_remove_exceptionals(&pvec); 2041220f2ac9SHugh Dickins pagevec_release(&pvec); 2042220f2ac9SHugh Dickins pvec.nr = PAGEVEC_SIZE; 2043220f2ac9SHugh Dickins cond_resched(); 2044220f2ac9SHugh Dickins } 2045220f2ac9SHugh Dickins return index; 2046220f2ac9SHugh Dickins } 2047220f2ac9SHugh Dickins 2048965c8e59SAndrew Morton static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 2049220f2ac9SHugh Dickins { 2050220f2ac9SHugh Dickins struct address_space *mapping = file->f_mapping; 2051220f2ac9SHugh Dickins struct inode *inode = mapping->host; 2052220f2ac9SHugh Dickins pgoff_t start, end; 2053220f2ac9SHugh Dickins loff_t new_offset; 2054220f2ac9SHugh Dickins 2055965c8e59SAndrew Morton if (whence != SEEK_DATA && whence != SEEK_HOLE) 2056965c8e59SAndrew Morton return generic_file_llseek_size(file, offset, whence, 2057220f2ac9SHugh Dickins MAX_LFS_FILESIZE, i_size_read(inode)); 20585955102cSAl Viro inode_lock(inode); 2059220f2ac9SHugh Dickins /* We're holding i_mutex so we can access i_size directly */ 2060220f2ac9SHugh Dickins 2061220f2ac9SHugh Dickins if (offset < 0) 2062220f2ac9SHugh Dickins offset = -EINVAL; 2063220f2ac9SHugh Dickins else if (offset >= inode->i_size) 2064220f2ac9SHugh Dickins offset = -ENXIO; 2065220f2ac9SHugh Dickins else { 206609cbfeafSKirill A. Shutemov start = offset >> PAGE_SHIFT; 206709cbfeafSKirill A. Shutemov end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2068965c8e59SAndrew Morton new_offset = shmem_seek_hole_data(mapping, start, end, whence); 206909cbfeafSKirill A. 
Shutemov new_offset <<= PAGE_SHIFT; 2070220f2ac9SHugh Dickins if (new_offset > offset) { 2071220f2ac9SHugh Dickins if (new_offset < inode->i_size) 2072220f2ac9SHugh Dickins offset = new_offset; 2073965c8e59SAndrew Morton else if (whence == SEEK_DATA) 2074220f2ac9SHugh Dickins offset = -ENXIO; 2075220f2ac9SHugh Dickins else 2076220f2ac9SHugh Dickins offset = inode->i_size; 2077220f2ac9SHugh Dickins } 2078220f2ac9SHugh Dickins } 2079220f2ac9SHugh Dickins 2080387aae6fSHugh Dickins if (offset >= 0) 208146a1c2c7SJie Liu offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 20825955102cSAl Viro inode_unlock(inode); 2083220f2ac9SHugh Dickins return offset; 2084220f2ac9SHugh Dickins } 2085220f2ac9SHugh Dickins 208605f65b5cSDavid Herrmann /* 208705f65b5cSDavid Herrmann * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, 208805f65b5cSDavid Herrmann * so reuse a tag which we firmly believe is never set or cleared on shmem. 208905f65b5cSDavid Herrmann */ 209005f65b5cSDavid Herrmann #define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE 209105f65b5cSDavid Herrmann #define LAST_SCAN 4 /* about 150ms max */ 209205f65b5cSDavid Herrmann 209305f65b5cSDavid Herrmann static void shmem_tag_pins(struct address_space *mapping) 209405f65b5cSDavid Herrmann { 209505f65b5cSDavid Herrmann struct radix_tree_iter iter; 209605f65b5cSDavid Herrmann void **slot; 209705f65b5cSDavid Herrmann pgoff_t start; 209805f65b5cSDavid Herrmann struct page *page; 209905f65b5cSDavid Herrmann 210005f65b5cSDavid Herrmann lru_add_drain(); 210105f65b5cSDavid Herrmann start = 0; 210205f65b5cSDavid Herrmann rcu_read_lock(); 210305f65b5cSDavid Herrmann 210405f65b5cSDavid Herrmann radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 210505f65b5cSDavid Herrmann page = radix_tree_deref_slot(slot); 210605f65b5cSDavid Herrmann if (!page || radix_tree_exception(page)) { 21072cf938aaSMatthew Wilcox if (radix_tree_deref_retry(page)) { 21082cf938aaSMatthew Wilcox slot = radix_tree_iter_retry(&iter); 21092cf938aaSMatthew Wilcox continue; 21102cf938aaSMatthew Wilcox } 211105f65b5cSDavid Herrmann } else if (page_count(page) - page_mapcount(page) > 1) { 211205f65b5cSDavid Herrmann spin_lock_irq(&mapping->tree_lock); 211305f65b5cSDavid Herrmann radix_tree_tag_set(&mapping->page_tree, iter.index, 211405f65b5cSDavid Herrmann SHMEM_TAG_PINNED); 211505f65b5cSDavid Herrmann spin_unlock_irq(&mapping->tree_lock); 211605f65b5cSDavid Herrmann } 211705f65b5cSDavid Herrmann 211805f65b5cSDavid Herrmann if (need_resched()) { 211905f65b5cSDavid Herrmann cond_resched_rcu(); 21207165092fSMatthew Wilcox slot = radix_tree_iter_next(&iter); 212105f65b5cSDavid Herrmann } 212205f65b5cSDavid Herrmann } 212305f65b5cSDavid Herrmann rcu_read_unlock(); 212405f65b5cSDavid Herrmann } 212505f65b5cSDavid Herrmann 212605f65b5cSDavid Herrmann /* 212705f65b5cSDavid Herrmann * Setting SEAL_WRITE requires us to verify there's no pending writer. However, 212805f65b5cSDavid Herrmann * via get_user_pages(), drivers might have some pending I/O without any active 212905f65b5cSDavid Herrmann * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages 213005f65b5cSDavid Herrmann * and see whether it has an elevated ref-count. If so, we tag them and wait for 213105f65b5cSDavid Herrmann * them to be dropped. 213205f65b5cSDavid Herrmann * The caller must guarantee that no new user will acquire writable references 213305f65b5cSDavid Herrmann * to those pages to avoid races. 
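 *
 * Concretely: if another thread had handed these pages to an O_DIRECT or
 * AIO read via get_user_pages(), their refcounts stay elevated until that
 * I/O retires, even after every mapping is gone.  shmem_wait_for_pins()
 * below backs off in roughly doubling steps (about 10ms, 20ms, 40ms, 80ms,
 * i.e. the "about 150ms max" noted at LAST_SCAN) and, if a tagged page is
 * still pinned on the final pass, F_ADD_SEALS fails with -EBUSY rather
 * than sealing a file whose contents could still change underneath us.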
213405f65b5cSDavid Herrmann */ 213540e041a2SDavid Herrmann static int shmem_wait_for_pins(struct address_space *mapping) 213640e041a2SDavid Herrmann { 213705f65b5cSDavid Herrmann struct radix_tree_iter iter; 213805f65b5cSDavid Herrmann void **slot; 213905f65b5cSDavid Herrmann pgoff_t start; 214005f65b5cSDavid Herrmann struct page *page; 214105f65b5cSDavid Herrmann int error, scan; 214205f65b5cSDavid Herrmann 214305f65b5cSDavid Herrmann shmem_tag_pins(mapping); 214405f65b5cSDavid Herrmann 214505f65b5cSDavid Herrmann error = 0; 214605f65b5cSDavid Herrmann for (scan = 0; scan <= LAST_SCAN; scan++) { 214705f65b5cSDavid Herrmann if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) 214805f65b5cSDavid Herrmann break; 214905f65b5cSDavid Herrmann 215005f65b5cSDavid Herrmann if (!scan) 215105f65b5cSDavid Herrmann lru_add_drain_all(); 215205f65b5cSDavid Herrmann else if (schedule_timeout_killable((HZ << scan) / 200)) 215305f65b5cSDavid Herrmann scan = LAST_SCAN; 215405f65b5cSDavid Herrmann 215505f65b5cSDavid Herrmann start = 0; 215605f65b5cSDavid Herrmann rcu_read_lock(); 215705f65b5cSDavid Herrmann radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 215805f65b5cSDavid Herrmann start, SHMEM_TAG_PINNED) { 215905f65b5cSDavid Herrmann 216005f65b5cSDavid Herrmann page = radix_tree_deref_slot(slot); 216105f65b5cSDavid Herrmann if (radix_tree_exception(page)) { 21622cf938aaSMatthew Wilcox if (radix_tree_deref_retry(page)) { 21632cf938aaSMatthew Wilcox slot = radix_tree_iter_retry(&iter); 21642cf938aaSMatthew Wilcox continue; 21652cf938aaSMatthew Wilcox } 216605f65b5cSDavid Herrmann 216705f65b5cSDavid Herrmann page = NULL; 216805f65b5cSDavid Herrmann } 216905f65b5cSDavid Herrmann 217005f65b5cSDavid Herrmann if (page && 217105f65b5cSDavid Herrmann page_count(page) - page_mapcount(page) != 1) { 217205f65b5cSDavid Herrmann if (scan < LAST_SCAN) 217305f65b5cSDavid Herrmann goto continue_resched; 217405f65b5cSDavid Herrmann 217505f65b5cSDavid Herrmann /* 217605f65b5cSDavid Herrmann * On the last scan, we clean up all those tags 217705f65b5cSDavid Herrmann * we inserted; but make a note that we still 217805f65b5cSDavid Herrmann * found pages pinned. 
217905f65b5cSDavid Herrmann */ 218005f65b5cSDavid Herrmann error = -EBUSY; 218105f65b5cSDavid Herrmann } 218205f65b5cSDavid Herrmann 218305f65b5cSDavid Herrmann spin_lock_irq(&mapping->tree_lock); 218405f65b5cSDavid Herrmann radix_tree_tag_clear(&mapping->page_tree, 218505f65b5cSDavid Herrmann iter.index, SHMEM_TAG_PINNED); 218605f65b5cSDavid Herrmann spin_unlock_irq(&mapping->tree_lock); 218705f65b5cSDavid Herrmann continue_resched: 218805f65b5cSDavid Herrmann if (need_resched()) { 218905f65b5cSDavid Herrmann cond_resched_rcu(); 21907165092fSMatthew Wilcox slot = radix_tree_iter_next(&iter); 219105f65b5cSDavid Herrmann } 219205f65b5cSDavid Herrmann } 219305f65b5cSDavid Herrmann rcu_read_unlock(); 219405f65b5cSDavid Herrmann } 219505f65b5cSDavid Herrmann 219605f65b5cSDavid Herrmann return error; 219740e041a2SDavid Herrmann } 219840e041a2SDavid Herrmann 219940e041a2SDavid Herrmann #define F_ALL_SEALS (F_SEAL_SEAL | \ 220040e041a2SDavid Herrmann F_SEAL_SHRINK | \ 220140e041a2SDavid Herrmann F_SEAL_GROW | \ 220240e041a2SDavid Herrmann F_SEAL_WRITE) 220340e041a2SDavid Herrmann 220440e041a2SDavid Herrmann int shmem_add_seals(struct file *file, unsigned int seals) 220540e041a2SDavid Herrmann { 220640e041a2SDavid Herrmann struct inode *inode = file_inode(file); 220740e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 220840e041a2SDavid Herrmann int error; 220940e041a2SDavid Herrmann 221040e041a2SDavid Herrmann /* 221140e041a2SDavid Herrmann * SEALING 221240e041a2SDavid Herrmann * Sealing allows multiple parties to share a shmem-file but restrict 221340e041a2SDavid Herrmann * access to a specific subset of file operations. Seals can only be 221440e041a2SDavid Herrmann * added, but never removed. This way, mutually untrusted parties can 221540e041a2SDavid Herrmann * share common memory regions with a well-defined policy. A malicious 221640e041a2SDavid Herrmann * peer can thus never perform unwanted operations on a shared object. 221740e041a2SDavid Herrmann * 221840e041a2SDavid Herrmann * Seals are only supported on special shmem-files and always affect 221940e041a2SDavid Herrmann * the whole underlying inode. Once a seal is set, it may prevent some 222040e041a2SDavid Herrmann * kinds of access to the file. Currently, the following seals are 222140e041a2SDavid Herrmann * defined: 222240e041a2SDavid Herrmann * SEAL_SEAL: Prevent further seals from being set on this file 222340e041a2SDavid Herrmann * SEAL_SHRINK: Prevent the file from shrinking 222440e041a2SDavid Herrmann * SEAL_GROW: Prevent the file from growing 222540e041a2SDavid Herrmann * SEAL_WRITE: Prevent write access to the file 222640e041a2SDavid Herrmann * 222740e041a2SDavid Herrmann * As we don't require any trust relationship between two parties, we 222840e041a2SDavid Herrmann * must prevent seals from being removed. Therefore, sealing a file 222940e041a2SDavid Herrmann * only adds a given set of seals to the file, it never touches 223040e041a2SDavid Herrmann * existing seals. Furthermore, the "setting seals"-operation can be 223140e041a2SDavid Herrmann * sealed itself, which basically prevents any further seal from being 223240e041a2SDavid Herrmann * added. 223340e041a2SDavid Herrmann * 223440e041a2SDavid Herrmann * Semantics of sealing are only defined on volatile files. Only 223540e041a2SDavid Herrmann * anonymous shmem files support sealing. More importantly, seals are 223640e041a2SDavid Herrmann * never written to disk. Therefore, there's no plan to support it on 223740e041a2SDavid Herrmann * other file types. 
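 *
 * A minimal userspace sketch of the intended pattern (assumes a libc
 * recent enough to declare memfd_create(); otherwise go through syscall(2)):
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int make_sealed_buf(const void *data, size_t len)
 *	{
 *		int fd = memfd_create("sealed-buf", MFD_ALLOW_SEALING);
 *		if (fd < 0)
 *			return -1;
 *		if (write(fd, data, len) != (ssize_t)len ||
 *		    fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW |
 *					   F_SEAL_WRITE | F_SEAL_SEAL) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;	/* now safe to hand to an untrusted reader */
 *	}
 *
 * Sealing F_SEAL_WRITE succeeds here because the writer holds no writable
 * mappings of its own; shmem_add_seals() below insists on that via
 * mapping_deny_writable() and shmem_wait_for_pins().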
223840e041a2SDavid Herrmann */ 223940e041a2SDavid Herrmann 224040e041a2SDavid Herrmann if (file->f_op != &shmem_file_operations) 224140e041a2SDavid Herrmann return -EINVAL; 224240e041a2SDavid Herrmann if (!(file->f_mode & FMODE_WRITE)) 224340e041a2SDavid Herrmann return -EPERM; 224440e041a2SDavid Herrmann if (seals & ~(unsigned int)F_ALL_SEALS) 224540e041a2SDavid Herrmann return -EINVAL; 224640e041a2SDavid Herrmann 22475955102cSAl Viro inode_lock(inode); 224840e041a2SDavid Herrmann 224940e041a2SDavid Herrmann if (info->seals & F_SEAL_SEAL) { 225040e041a2SDavid Herrmann error = -EPERM; 225140e041a2SDavid Herrmann goto unlock; 225240e041a2SDavid Herrmann } 225340e041a2SDavid Herrmann 225440e041a2SDavid Herrmann if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { 225540e041a2SDavid Herrmann error = mapping_deny_writable(file->f_mapping); 225640e041a2SDavid Herrmann if (error) 225740e041a2SDavid Herrmann goto unlock; 225840e041a2SDavid Herrmann 225940e041a2SDavid Herrmann error = shmem_wait_for_pins(file->f_mapping); 226040e041a2SDavid Herrmann if (error) { 226140e041a2SDavid Herrmann mapping_allow_writable(file->f_mapping); 226240e041a2SDavid Herrmann goto unlock; 226340e041a2SDavid Herrmann } 226440e041a2SDavid Herrmann } 226540e041a2SDavid Herrmann 226640e041a2SDavid Herrmann info->seals |= seals; 226740e041a2SDavid Herrmann error = 0; 226840e041a2SDavid Herrmann 226940e041a2SDavid Herrmann unlock: 22705955102cSAl Viro inode_unlock(inode); 227140e041a2SDavid Herrmann return error; 227240e041a2SDavid Herrmann } 227340e041a2SDavid Herrmann EXPORT_SYMBOL_GPL(shmem_add_seals); 227440e041a2SDavid Herrmann 227540e041a2SDavid Herrmann int shmem_get_seals(struct file *file) 227640e041a2SDavid Herrmann { 227740e041a2SDavid Herrmann if (file->f_op != &shmem_file_operations) 227840e041a2SDavid Herrmann return -EINVAL; 227940e041a2SDavid Herrmann 228040e041a2SDavid Herrmann return SHMEM_I(file_inode(file))->seals; 228140e041a2SDavid Herrmann } 228240e041a2SDavid Herrmann EXPORT_SYMBOL_GPL(shmem_get_seals); 228340e041a2SDavid Herrmann 228440e041a2SDavid Herrmann long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 228540e041a2SDavid Herrmann { 228640e041a2SDavid Herrmann long error; 228740e041a2SDavid Herrmann 228840e041a2SDavid Herrmann switch (cmd) { 228940e041a2SDavid Herrmann case F_ADD_SEALS: 229040e041a2SDavid Herrmann /* disallow upper 32bit */ 229140e041a2SDavid Herrmann if (arg > UINT_MAX) 229240e041a2SDavid Herrmann return -EINVAL; 229340e041a2SDavid Herrmann 229440e041a2SDavid Herrmann error = shmem_add_seals(file, arg); 229540e041a2SDavid Herrmann break; 229640e041a2SDavid Herrmann case F_GET_SEALS: 229740e041a2SDavid Herrmann error = shmem_get_seals(file); 229840e041a2SDavid Herrmann break; 229940e041a2SDavid Herrmann default: 230040e041a2SDavid Herrmann error = -EINVAL; 230140e041a2SDavid Herrmann break; 230240e041a2SDavid Herrmann } 230340e041a2SDavid Herrmann 230440e041a2SDavid Herrmann return error; 230540e041a2SDavid Herrmann } 230640e041a2SDavid Herrmann 230783e4fa9cSHugh Dickins static long shmem_fallocate(struct file *file, int mode, loff_t offset, 230883e4fa9cSHugh Dickins loff_t len) 230983e4fa9cSHugh Dickins { 2310496ad9aaSAl Viro struct inode *inode = file_inode(file); 2311e2d12e22SHugh Dickins struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 231240e041a2SDavid Herrmann struct shmem_inode_info *info = SHMEM_I(inode); 23131aac1400SHugh Dickins struct shmem_falloc shmem_falloc; 2314e2d12e22SHugh Dickins pgoff_t start, index, end; 
2315e2d12e22SHugh Dickins int error; 231683e4fa9cSHugh Dickins 231713ace4d0SHugh Dickins if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 231813ace4d0SHugh Dickins return -EOPNOTSUPP; 231913ace4d0SHugh Dickins 23205955102cSAl Viro inode_lock(inode); 232183e4fa9cSHugh Dickins 232283e4fa9cSHugh Dickins if (mode & FALLOC_FL_PUNCH_HOLE) { 232383e4fa9cSHugh Dickins struct address_space *mapping = file->f_mapping; 232483e4fa9cSHugh Dickins loff_t unmap_start = round_up(offset, PAGE_SIZE); 232583e4fa9cSHugh Dickins loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 23268e205f77SHugh Dickins DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 232783e4fa9cSHugh Dickins 232840e041a2SDavid Herrmann /* protected by i_mutex */ 232940e041a2SDavid Herrmann if (info->seals & F_SEAL_WRITE) { 233040e041a2SDavid Herrmann error = -EPERM; 233140e041a2SDavid Herrmann goto out; 233240e041a2SDavid Herrmann } 233340e041a2SDavid Herrmann 23348e205f77SHugh Dickins shmem_falloc.waitq = &shmem_falloc_waitq; 2335f00cdc6dSHugh Dickins shmem_falloc.start = unmap_start >> PAGE_SHIFT; 2336f00cdc6dSHugh Dickins shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 2337f00cdc6dSHugh Dickins spin_lock(&inode->i_lock); 2338f00cdc6dSHugh Dickins inode->i_private = &shmem_falloc; 2339f00cdc6dSHugh Dickins spin_unlock(&inode->i_lock); 2340f00cdc6dSHugh Dickins 234183e4fa9cSHugh Dickins if ((u64)unmap_end > (u64)unmap_start) 234283e4fa9cSHugh Dickins unmap_mapping_range(mapping, unmap_start, 234383e4fa9cSHugh Dickins 1 + unmap_end - unmap_start, 0); 234483e4fa9cSHugh Dickins shmem_truncate_range(inode, offset, offset + len - 1); 234583e4fa9cSHugh Dickins /* No need to unmap again: hole-punching leaves COWed pages */ 23468e205f77SHugh Dickins 23478e205f77SHugh Dickins spin_lock(&inode->i_lock); 23488e205f77SHugh Dickins inode->i_private = NULL; 23498e205f77SHugh Dickins wake_up_all(&shmem_falloc_waitq); 23508e205f77SHugh Dickins spin_unlock(&inode->i_lock); 235183e4fa9cSHugh Dickins error = 0; 23528e205f77SHugh Dickins goto out; 235383e4fa9cSHugh Dickins } 235483e4fa9cSHugh Dickins 2355e2d12e22SHugh Dickins /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 2356e2d12e22SHugh Dickins error = inode_newsize_ok(inode, offset + len); 2357e2d12e22SHugh Dickins if (error) 2358e2d12e22SHugh Dickins goto out; 2359e2d12e22SHugh Dickins 236040e041a2SDavid Herrmann if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 236140e041a2SDavid Herrmann error = -EPERM; 236240e041a2SDavid Herrmann goto out; 236340e041a2SDavid Herrmann } 236440e041a2SDavid Herrmann 236509cbfeafSKirill A. Shutemov start = offset >> PAGE_SHIFT; 236609cbfeafSKirill A. 
Shutemov end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 2367e2d12e22SHugh Dickins /* Try to avoid a swapstorm if len is impossible to satisfy */ 2368e2d12e22SHugh Dickins if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 2369e2d12e22SHugh Dickins error = -ENOSPC; 2370e2d12e22SHugh Dickins goto out; 2371e2d12e22SHugh Dickins } 2372e2d12e22SHugh Dickins 23738e205f77SHugh Dickins shmem_falloc.waitq = NULL; 23741aac1400SHugh Dickins shmem_falloc.start = start; 23751aac1400SHugh Dickins shmem_falloc.next = start; 23761aac1400SHugh Dickins shmem_falloc.nr_falloced = 0; 23771aac1400SHugh Dickins shmem_falloc.nr_unswapped = 0; 23781aac1400SHugh Dickins spin_lock(&inode->i_lock); 23791aac1400SHugh Dickins inode->i_private = &shmem_falloc; 23801aac1400SHugh Dickins spin_unlock(&inode->i_lock); 23811aac1400SHugh Dickins 2382e2d12e22SHugh Dickins for (index = start; index < end; index++) { 2383e2d12e22SHugh Dickins struct page *page; 2384e2d12e22SHugh Dickins 2385e2d12e22SHugh Dickins /* 2386e2d12e22SHugh Dickins * Good, the fallocate(2) manpage permits EINTR: we may have 2387e2d12e22SHugh Dickins * been interrupted because we are using up too much memory. 2388e2d12e22SHugh Dickins */ 2389e2d12e22SHugh Dickins if (signal_pending(current)) 2390e2d12e22SHugh Dickins error = -EINTR; 23911aac1400SHugh Dickins else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 23921aac1400SHugh Dickins error = -ENOMEM; 2393e2d12e22SHugh Dickins else 23949e18eb29SAndres Lagar-Cavilla error = shmem_getpage(inode, index, &page, SGP_FALLOC); 2395e2d12e22SHugh Dickins if (error) { 23961635f6a7SHugh Dickins /* Remove the !PageUptodate pages we added */ 23977f556567SHugh Dickins if (index > start) { 23981635f6a7SHugh Dickins shmem_undo_range(inode, 239909cbfeafSKirill A. Shutemov (loff_t)start << PAGE_SHIFT, 2400b9b4bb26SAnthony Romano ((loff_t)index << PAGE_SHIFT) - 1, true); 24017f556567SHugh Dickins } 24021aac1400SHugh Dickins goto undone; 2403e2d12e22SHugh Dickins } 2404e2d12e22SHugh Dickins 2405e2d12e22SHugh Dickins /* 24061aac1400SHugh Dickins * Inform shmem_writepage() how far we have reached. 24071aac1400SHugh Dickins * No need for lock or barrier: we have the page lock. 24081aac1400SHugh Dickins */ 24091aac1400SHugh Dickins shmem_falloc.next++; 24101aac1400SHugh Dickins if (!PageUptodate(page)) 24111aac1400SHugh Dickins shmem_falloc.nr_falloced++; 24121aac1400SHugh Dickins 24131aac1400SHugh Dickins /* 24141635f6a7SHugh Dickins * If !PageUptodate, leave it that way so that freeable pages 24151635f6a7SHugh Dickins * can be recognized if we need to rollback on error later. 24161635f6a7SHugh Dickins * But set_page_dirty so that memory pressure will swap rather 2417e2d12e22SHugh Dickins * than free the pages we are allocating (and SGP_CACHE pages 2418e2d12e22SHugh Dickins * might still be clean: we now need to mark those dirty too). 2419e2d12e22SHugh Dickins */ 2420e2d12e22SHugh Dickins set_page_dirty(page); 2421e2d12e22SHugh Dickins unlock_page(page); 242209cbfeafSKirill A. 
Shutemov put_page(page); 2423e2d12e22SHugh Dickins cond_resched(); 2424e2d12e22SHugh Dickins } 2425e2d12e22SHugh Dickins 2426e2d12e22SHugh Dickins if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 2427e2d12e22SHugh Dickins i_size_write(inode, offset + len); 2428e2d12e22SHugh Dickins inode->i_ctime = CURRENT_TIME; 24291aac1400SHugh Dickins undone: 24301aac1400SHugh Dickins spin_lock(&inode->i_lock); 24311aac1400SHugh Dickins inode->i_private = NULL; 24321aac1400SHugh Dickins spin_unlock(&inode->i_lock); 2433e2d12e22SHugh Dickins out: 24345955102cSAl Viro inode_unlock(inode); 243583e4fa9cSHugh Dickins return error; 243683e4fa9cSHugh Dickins } 243783e4fa9cSHugh Dickins 2438726c3342SDavid Howells static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 24391da177e4SLinus Torvalds { 2440726c3342SDavid Howells struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 24411da177e4SLinus Torvalds 24421da177e4SLinus Torvalds buf->f_type = TMPFS_MAGIC; 244309cbfeafSKirill A. Shutemov buf->f_bsize = PAGE_SIZE; 24441da177e4SLinus Torvalds buf->f_namelen = NAME_MAX; 24450edd73b3SHugh Dickins if (sbinfo->max_blocks) { 24461da177e4SLinus Torvalds buf->f_blocks = sbinfo->max_blocks; 244741ffe5d5SHugh Dickins buf->f_bavail = 244841ffe5d5SHugh Dickins buf->f_bfree = sbinfo->max_blocks - 244941ffe5d5SHugh Dickins percpu_counter_sum(&sbinfo->used_blocks); 24500edd73b3SHugh Dickins } 24510edd73b3SHugh Dickins if (sbinfo->max_inodes) { 24521da177e4SLinus Torvalds buf->f_files = sbinfo->max_inodes; 24531da177e4SLinus Torvalds buf->f_ffree = sbinfo->free_inodes; 24541da177e4SLinus Torvalds } 24551da177e4SLinus Torvalds /* else leave those fields 0 like simple_statfs */ 24561da177e4SLinus Torvalds return 0; 24571da177e4SLinus Torvalds } 24581da177e4SLinus Torvalds 24591da177e4SLinus Torvalds /* 24601da177e4SLinus Torvalds * File creation. Allocate an inode, and we're done.. 
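 *
 * In outline, shmem_mknod() below gets the in-core inode from
 * shmem_get_inode(), initializes POSIX ACLs and LSM security xattrs,
 * charges the parent directory one BOGO_DIRENT_SIZE entry, and takes
 * an extra dentry reference so the tmpfs dentry stays in core until
 * it is unlinked.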
24611da177e4SLinus Torvalds */ 24621da177e4SLinus Torvalds static int 24631a67aafbSAl Viro shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 24641da177e4SLinus Torvalds { 24650b0a0806SHugh Dickins struct inode *inode; 24661da177e4SLinus Torvalds int error = -ENOSPC; 24671da177e4SLinus Torvalds 2468454abafeSDmitry Monakhov inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 24691da177e4SLinus Torvalds if (inode) { 2470feda821eSChristoph Hellwig error = simple_acl_create(dir, inode); 2471feda821eSChristoph Hellwig if (error) 2472feda821eSChristoph Hellwig goto out_iput; 24732a7dba39SEric Paris error = security_inode_init_security(inode, dir, 24749d8f13baSMimi Zohar &dentry->d_name, 24756d9d88d0SJarkko Sakkinen shmem_initxattrs, NULL); 2476feda821eSChristoph Hellwig if (error && error != -EOPNOTSUPP) 2477feda821eSChristoph Hellwig goto out_iput; 247837ec43cdSMimi Zohar 2479718deb6bSAl Viro error = 0; 24801da177e4SLinus Torvalds dir->i_size += BOGO_DIRENT_SIZE; 24811da177e4SLinus Torvalds dir->i_ctime = dir->i_mtime = CURRENT_TIME; 24821da177e4SLinus Torvalds d_instantiate(dentry, inode); 24831da177e4SLinus Torvalds dget(dentry); /* Extra count - pin the dentry in core */ 24841da177e4SLinus Torvalds } 24851da177e4SLinus Torvalds return error; 2486feda821eSChristoph Hellwig out_iput: 2487feda821eSChristoph Hellwig iput(inode); 2488feda821eSChristoph Hellwig return error; 24891da177e4SLinus Torvalds } 24901da177e4SLinus Torvalds 249160545d0dSAl Viro static int 249260545d0dSAl Viro shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 249360545d0dSAl Viro { 249460545d0dSAl Viro struct inode *inode; 249560545d0dSAl Viro int error = -ENOSPC; 249660545d0dSAl Viro 249760545d0dSAl Viro inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); 249860545d0dSAl Viro if (inode) { 249960545d0dSAl Viro error = security_inode_init_security(inode, dir, 250060545d0dSAl Viro NULL, 250160545d0dSAl Viro shmem_initxattrs, NULL); 2502feda821eSChristoph Hellwig if (error && error != -EOPNOTSUPP) 2503feda821eSChristoph Hellwig goto out_iput; 2504feda821eSChristoph Hellwig error = simple_acl_create(dir, inode); 2505feda821eSChristoph Hellwig if (error) 2506feda821eSChristoph Hellwig goto out_iput; 250760545d0dSAl Viro d_tmpfile(dentry, inode); 250860545d0dSAl Viro } 250960545d0dSAl Viro return error; 2510feda821eSChristoph Hellwig out_iput: 2511feda821eSChristoph Hellwig iput(inode); 2512feda821eSChristoph Hellwig return error; 251360545d0dSAl Viro } 251460545d0dSAl Viro 251518bb1db3SAl Viro static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 25161da177e4SLinus Torvalds { 25171da177e4SLinus Torvalds int error; 25181da177e4SLinus Torvalds 25191da177e4SLinus Torvalds if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 25201da177e4SLinus Torvalds return error; 2521d8c76e6fSDave Hansen inc_nlink(dir); 25221da177e4SLinus Torvalds return 0; 25231da177e4SLinus Torvalds } 25241da177e4SLinus Torvalds 25254acdaf27SAl Viro static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 2526ebfc3b49SAl Viro bool excl) 25271da177e4SLinus Torvalds { 25281da177e4SLinus Torvalds return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 25291da177e4SLinus Torvalds } 25301da177e4SLinus Torvalds 25311da177e4SLinus Torvalds /* 25321da177e4SLinus Torvalds * Link a file.. 
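 *
 * Illustrative user-space view (hypothetical paths on a tmpfs mount):
 *	link("/dev/shm/data", "/dev/shm/data.bak");
 * ends up here; as the comment in shmem_link() below explains, each
 * such link pins another dentry in lowmem, so it is charged against
 * the inode limit via shmem_reserve_inode().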
25331da177e4SLinus Torvalds */ 25341da177e4SLinus Torvalds static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 25351da177e4SLinus Torvalds { 253675c3cfa8SDavid Howells struct inode *inode = d_inode(old_dentry); 25375b04c689SPavel Emelyanov int ret; 25381da177e4SLinus Torvalds 25391da177e4SLinus Torvalds /* 25401da177e4SLinus Torvalds * No ordinary (disk based) filesystem counts links as inodes; 25411da177e4SLinus Torvalds * but each new link needs a new dentry, pinning lowmem, and 25421da177e4SLinus Torvalds * tmpfs dentries cannot be pruned until they are unlinked. 25431da177e4SLinus Torvalds */ 25445b04c689SPavel Emelyanov ret = shmem_reserve_inode(inode->i_sb); 25455b04c689SPavel Emelyanov if (ret) 25465b04c689SPavel Emelyanov goto out; 25471da177e4SLinus Torvalds 25481da177e4SLinus Torvalds dir->i_size += BOGO_DIRENT_SIZE; 25491da177e4SLinus Torvalds inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2550d8c76e6fSDave Hansen inc_nlink(inode); 25517de9c6eeSAl Viro ihold(inode); /* New dentry reference */ 25521da177e4SLinus Torvalds dget(dentry); /* Extra pinning count for the created dentry */ 25531da177e4SLinus Torvalds d_instantiate(dentry, inode); 25545b04c689SPavel Emelyanov out: 25555b04c689SPavel Emelyanov return ret; 25561da177e4SLinus Torvalds } 25571da177e4SLinus Torvalds 25581da177e4SLinus Torvalds static int shmem_unlink(struct inode *dir, struct dentry *dentry) 25591da177e4SLinus Torvalds { 256075c3cfa8SDavid Howells struct inode *inode = d_inode(dentry); 25611da177e4SLinus Torvalds 25625b04c689SPavel Emelyanov if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 25635b04c689SPavel Emelyanov shmem_free_inode(inode->i_sb); 25641da177e4SLinus Torvalds 25651da177e4SLinus Torvalds dir->i_size -= BOGO_DIRENT_SIZE; 25661da177e4SLinus Torvalds inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 25679a53c3a7SDave Hansen drop_nlink(inode); 25681da177e4SLinus Torvalds dput(dentry); /* Undo the count from "create" - this does all the work */ 25691da177e4SLinus Torvalds return 0; 25701da177e4SLinus Torvalds } 25711da177e4SLinus Torvalds 25721da177e4SLinus Torvalds static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 25731da177e4SLinus Torvalds { 25741da177e4SLinus Torvalds if (!simple_empty(dentry)) 25751da177e4SLinus Torvalds return -ENOTEMPTY; 25761da177e4SLinus Torvalds 257775c3cfa8SDavid Howells drop_nlink(d_inode(dentry)); 25789a53c3a7SDave Hansen drop_nlink(dir); 25791da177e4SLinus Torvalds return shmem_unlink(dir, dentry); 25801da177e4SLinus Torvalds } 25811da177e4SLinus Torvalds 258237456771SMiklos Szeredi static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 258337456771SMiklos Szeredi { 2584e36cb0b8SDavid Howells bool old_is_dir = d_is_dir(old_dentry); 2585e36cb0b8SDavid Howells bool new_is_dir = d_is_dir(new_dentry); 258637456771SMiklos Szeredi 258737456771SMiklos Szeredi if (old_dir != new_dir && old_is_dir != new_is_dir) { 258837456771SMiklos Szeredi if (old_is_dir) { 258937456771SMiklos Szeredi drop_nlink(old_dir); 259037456771SMiklos Szeredi inc_nlink(new_dir); 259137456771SMiklos Szeredi } else { 259237456771SMiklos Szeredi drop_nlink(new_dir); 259337456771SMiklos Szeredi inc_nlink(old_dir); 259437456771SMiklos Szeredi } 259537456771SMiklos Szeredi } 259637456771SMiklos Szeredi old_dir->i_ctime = old_dir->i_mtime = 259737456771SMiklos Szeredi new_dir->i_ctime = new_dir->i_mtime = 259875c3cfa8SDavid Howells 
d_inode(old_dentry)->i_ctime = 259975c3cfa8SDavid Howells d_inode(new_dentry)->i_ctime = CURRENT_TIME; 260037456771SMiklos Szeredi 260137456771SMiklos Szeredi return 0; 260237456771SMiklos Szeredi } 260337456771SMiklos Szeredi 260446fdb794SMiklos Szeredi static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) 260546fdb794SMiklos Szeredi { 260646fdb794SMiklos Szeredi struct dentry *whiteout; 260746fdb794SMiklos Szeredi int error; 260846fdb794SMiklos Szeredi 260946fdb794SMiklos Szeredi whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); 261046fdb794SMiklos Szeredi if (!whiteout) 261146fdb794SMiklos Szeredi return -ENOMEM; 261246fdb794SMiklos Szeredi 261346fdb794SMiklos Szeredi error = shmem_mknod(old_dir, whiteout, 261446fdb794SMiklos Szeredi S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); 261546fdb794SMiklos Szeredi dput(whiteout); 261646fdb794SMiklos Szeredi if (error) 261746fdb794SMiklos Szeredi return error; 261846fdb794SMiklos Szeredi 261946fdb794SMiklos Szeredi /* 262046fdb794SMiklos Szeredi * Cheat and hash the whiteout while the old dentry is still in 262146fdb794SMiklos Szeredi * place, instead of playing games with FS_RENAME_DOES_D_MOVE. 262246fdb794SMiklos Szeredi * 262346fdb794SMiklos Szeredi * d_lookup() will consistently find one of them at this point, 262446fdb794SMiklos Szeredi * not sure which one, but that isn't even important. 262546fdb794SMiklos Szeredi */ 262646fdb794SMiklos Szeredi d_rehash(whiteout); 262746fdb794SMiklos Szeredi return 0; 262846fdb794SMiklos Szeredi } 262946fdb794SMiklos Szeredi 26301da177e4SLinus Torvalds /* 26311da177e4SLinus Torvalds * The VFS layer already does all the dentry stuff for rename, 26321da177e4SLinus Torvalds * we just have to decrement the usage count for the target if 26331da177e4SLinus Torvalds * it exists so that the VFS layer correctly free's it when it 26341da177e4SLinus Torvalds * gets overwritten. 
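 *
 * Illustrative user-space view (hypothetical paths; renameat2(2) may
 * need syscall(2) where libc lacks a wrapper): the flags accepted by
 * shmem_rename2() below map directly onto renameat2():
 *
 *	renameat2(AT_FDCWD, "/tmp/a", AT_FDCWD, "/tmp/b", RENAME_NOREPLACE);
 *	renameat2(AT_FDCWD, "/tmp/a", AT_FDCWD, "/tmp/b", RENAME_EXCHANGE);
 *	renameat2(AT_FDCWD, "/tmp/a", AT_FDCWD, "/tmp/b", RENAME_WHITEOUT);
 *
 * RENAME_NOREPLACE fails with EEXIST if the target already exists,
 * RENAME_EXCHANGE atomically swaps the two names, and RENAME_WHITEOUT
 * additionally leaves a whiteout entry at the old name (used by
 * overlayfs).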
26351da177e4SLinus Torvalds */ 26363b69ff51SMiklos Szeredi static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) 26371da177e4SLinus Torvalds { 263875c3cfa8SDavid Howells struct inode *inode = d_inode(old_dentry); 26391da177e4SLinus Torvalds int they_are_dirs = S_ISDIR(inode->i_mode); 26401da177e4SLinus Torvalds 264146fdb794SMiklos Szeredi if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 26423b69ff51SMiklos Szeredi return -EINVAL; 26433b69ff51SMiklos Szeredi 264437456771SMiklos Szeredi if (flags & RENAME_EXCHANGE) 264537456771SMiklos Szeredi return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); 264637456771SMiklos Szeredi 26471da177e4SLinus Torvalds if (!simple_empty(new_dentry)) 26481da177e4SLinus Torvalds return -ENOTEMPTY; 26491da177e4SLinus Torvalds 265046fdb794SMiklos Szeredi if (flags & RENAME_WHITEOUT) { 265146fdb794SMiklos Szeredi int error; 265246fdb794SMiklos Szeredi 265346fdb794SMiklos Szeredi error = shmem_whiteout(old_dir, old_dentry); 265446fdb794SMiklos Szeredi if (error) 265546fdb794SMiklos Szeredi return error; 265646fdb794SMiklos Szeredi } 265746fdb794SMiklos Szeredi 265875c3cfa8SDavid Howells if (d_really_is_positive(new_dentry)) { 26591da177e4SLinus Torvalds (void) shmem_unlink(new_dir, new_dentry); 2660b928095bSMiklos Szeredi if (they_are_dirs) { 266175c3cfa8SDavid Howells drop_nlink(d_inode(new_dentry)); 26629a53c3a7SDave Hansen drop_nlink(old_dir); 2663b928095bSMiklos Szeredi } 26641da177e4SLinus Torvalds } else if (they_are_dirs) { 26659a53c3a7SDave Hansen drop_nlink(old_dir); 2666d8c76e6fSDave Hansen inc_nlink(new_dir); 26671da177e4SLinus Torvalds } 26681da177e4SLinus Torvalds 26691da177e4SLinus Torvalds old_dir->i_size -= BOGO_DIRENT_SIZE; 26701da177e4SLinus Torvalds new_dir->i_size += BOGO_DIRENT_SIZE; 26711da177e4SLinus Torvalds old_dir->i_ctime = old_dir->i_mtime = 26721da177e4SLinus Torvalds new_dir->i_ctime = new_dir->i_mtime = 26731da177e4SLinus Torvalds inode->i_ctime = CURRENT_TIME; 26741da177e4SLinus Torvalds return 0; 26751da177e4SLinus Torvalds } 26761da177e4SLinus Torvalds 26771da177e4SLinus Torvalds static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 26781da177e4SLinus Torvalds { 26791da177e4SLinus Torvalds int error; 26801da177e4SLinus Torvalds int len; 26811da177e4SLinus Torvalds struct inode *inode; 26829276aad6SHugh Dickins struct page *page; 26831da177e4SLinus Torvalds struct shmem_inode_info *info; 26841da177e4SLinus Torvalds 26851da177e4SLinus Torvalds len = strlen(symname) + 1; 268609cbfeafSKirill A. 
Shutemov if (len > PAGE_SIZE) 26871da177e4SLinus Torvalds return -ENAMETOOLONG; 26881da177e4SLinus Torvalds 2689454abafeSDmitry Monakhov inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); 26901da177e4SLinus Torvalds if (!inode) 26911da177e4SLinus Torvalds return -ENOSPC; 26921da177e4SLinus Torvalds 26939d8f13baSMimi Zohar error = security_inode_init_security(inode, dir, &dentry->d_name, 26946d9d88d0SJarkko Sakkinen shmem_initxattrs, NULL); 2695570bc1c2SStephen Smalley if (error) { 2696570bc1c2SStephen Smalley if (error != -EOPNOTSUPP) { 2697570bc1c2SStephen Smalley iput(inode); 2698570bc1c2SStephen Smalley return error; 2699570bc1c2SStephen Smalley } 2700570bc1c2SStephen Smalley error = 0; 2701570bc1c2SStephen Smalley } 2702570bc1c2SStephen Smalley 27031da177e4SLinus Torvalds info = SHMEM_I(inode); 27041da177e4SLinus Torvalds inode->i_size = len-1; 270569f07ec9SHugh Dickins if (len <= SHORT_SYMLINK_LEN) { 27063ed47db3SAl Viro inode->i_link = kmemdup(symname, len, GFP_KERNEL); 27073ed47db3SAl Viro if (!inode->i_link) { 270869f07ec9SHugh Dickins iput(inode); 270969f07ec9SHugh Dickins return -ENOMEM; 271069f07ec9SHugh Dickins } 271169f07ec9SHugh Dickins inode->i_op = &shmem_short_symlink_operations; 27121da177e4SLinus Torvalds } else { 2713e8ecde25SAl Viro inode_nohighmem(inode); 27149e18eb29SAndres Lagar-Cavilla error = shmem_getpage(inode, 0, &page, SGP_WRITE); 27151da177e4SLinus Torvalds if (error) { 27161da177e4SLinus Torvalds iput(inode); 27171da177e4SLinus Torvalds return error; 27181da177e4SLinus Torvalds } 271914fcc23fSHugh Dickins inode->i_mapping->a_ops = &shmem_aops; 27201da177e4SLinus Torvalds inode->i_op = &shmem_symlink_inode_operations; 272121fc61c7SAl Viro memcpy(page_address(page), symname, len); 2722ec9516fbSHugh Dickins SetPageUptodate(page); 27231da177e4SLinus Torvalds set_page_dirty(page); 27246746aff7SWu Fengguang unlock_page(page); 272509cbfeafSKirill A. 
Shutemov put_page(page); 27261da177e4SLinus Torvalds } 27271da177e4SLinus Torvalds dir->i_size += BOGO_DIRENT_SIZE; 27281da177e4SLinus Torvalds dir->i_ctime = dir->i_mtime = CURRENT_TIME; 27291da177e4SLinus Torvalds d_instantiate(dentry, inode); 27301da177e4SLinus Torvalds dget(dentry); 27311da177e4SLinus Torvalds return 0; 27321da177e4SLinus Torvalds } 27331da177e4SLinus Torvalds 2734fceef393SAl Viro static void shmem_put_link(void *arg) 2735fceef393SAl Viro { 2736fceef393SAl Viro mark_page_accessed(arg); 2737fceef393SAl Viro put_page(arg); 2738fceef393SAl Viro } 2739fceef393SAl Viro 27406b255391SAl Viro static const char *shmem_get_link(struct dentry *dentry, 2741fceef393SAl Viro struct inode *inode, 2742fceef393SAl Viro struct delayed_call *done) 27431da177e4SLinus Torvalds { 27441da177e4SLinus Torvalds struct page *page = NULL; 27456b255391SAl Viro int error; 27466a6c9904SAl Viro if (!dentry) { 27476a6c9904SAl Viro page = find_get_page(inode->i_mapping, 0); 27486a6c9904SAl Viro if (!page) 27496b255391SAl Viro return ERR_PTR(-ECHILD); 27506a6c9904SAl Viro if (!PageUptodate(page)) { 27516a6c9904SAl Viro put_page(page); 27526a6c9904SAl Viro return ERR_PTR(-ECHILD); 27536a6c9904SAl Viro } 27546a6c9904SAl Viro } else { 27559e18eb29SAndres Lagar-Cavilla error = shmem_getpage(inode, 0, &page, SGP_READ); 2756680baacbSAl Viro if (error) 2757680baacbSAl Viro return ERR_PTR(error); 2758d3602444SHugh Dickins unlock_page(page); 27591da177e4SLinus Torvalds } 2760fceef393SAl Viro set_delayed_call(done, shmem_put_link, page); 276121fc61c7SAl Viro return page_address(page); 27621da177e4SLinus Torvalds } 27631da177e4SLinus Torvalds 2764b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 2765b09e0fa4SEric Paris /* 2766b09e0fa4SEric Paris * Superblocks without xattr inode operations may get some security.* xattr 2767b09e0fa4SEric Paris * support from the LSM "for free". As soon as we have any other xattrs 2768b09e0fa4SEric Paris * like ACLs, we also need to implement the security.* handlers at 2769b09e0fa4SEric Paris * filesystem level, though. 2770b09e0fa4SEric Paris */ 2771b09e0fa4SEric Paris 27726d9d88d0SJarkko Sakkinen /* 27736d9d88d0SJarkko Sakkinen * Callback for security_inode_init_security() for acquiring xattrs. 
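 *
 * In outline: for each xattr the LSM hands in (for SELinux this is
 * typically the name "selinux" with the computed context as value),
 * the callback below prepends the "security." namespace prefix and
 * queues a simple_xattr on the inode's in-memory xattr list; there is
 * no backing store, so these attributes live only as long as the
 * inode does.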
27746d9d88d0SJarkko Sakkinen */ 27756d9d88d0SJarkko Sakkinen static int shmem_initxattrs(struct inode *inode, 27766d9d88d0SJarkko Sakkinen const struct xattr *xattr_array, 27776d9d88d0SJarkko Sakkinen void *fs_info) 27786d9d88d0SJarkko Sakkinen { 27796d9d88d0SJarkko Sakkinen struct shmem_inode_info *info = SHMEM_I(inode); 27806d9d88d0SJarkko Sakkinen const struct xattr *xattr; 278138f38657SAristeu Rozanski struct simple_xattr *new_xattr; 27826d9d88d0SJarkko Sakkinen size_t len; 27836d9d88d0SJarkko Sakkinen 27846d9d88d0SJarkko Sakkinen for (xattr = xattr_array; xattr->name != NULL; xattr++) { 278538f38657SAristeu Rozanski new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 27866d9d88d0SJarkko Sakkinen if (!new_xattr) 27876d9d88d0SJarkko Sakkinen return -ENOMEM; 27886d9d88d0SJarkko Sakkinen 27896d9d88d0SJarkko Sakkinen len = strlen(xattr->name) + 1; 27906d9d88d0SJarkko Sakkinen new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 27916d9d88d0SJarkko Sakkinen GFP_KERNEL); 27926d9d88d0SJarkko Sakkinen if (!new_xattr->name) { 27936d9d88d0SJarkko Sakkinen kfree(new_xattr); 27946d9d88d0SJarkko Sakkinen return -ENOMEM; 27956d9d88d0SJarkko Sakkinen } 27966d9d88d0SJarkko Sakkinen 27976d9d88d0SJarkko Sakkinen memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 27986d9d88d0SJarkko Sakkinen XATTR_SECURITY_PREFIX_LEN); 27996d9d88d0SJarkko Sakkinen memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 28006d9d88d0SJarkko Sakkinen xattr->name, len); 28016d9d88d0SJarkko Sakkinen 280238f38657SAristeu Rozanski simple_xattr_list_add(&info->xattrs, new_xattr); 28036d9d88d0SJarkko Sakkinen } 28046d9d88d0SJarkko Sakkinen 28056d9d88d0SJarkko Sakkinen return 0; 28066d9d88d0SJarkko Sakkinen } 28076d9d88d0SJarkko Sakkinen 2808aa7c5241SAndreas Gruenbacher static int shmem_xattr_handler_get(const struct xattr_handler *handler, 2809b296821aSAl Viro struct dentry *unused, struct inode *inode, 2810b296821aSAl Viro const char *name, void *buffer, size_t size) 2811aa7c5241SAndreas Gruenbacher { 2812b296821aSAl Viro struct shmem_inode_info *info = SHMEM_I(inode); 2813aa7c5241SAndreas Gruenbacher 2814aa7c5241SAndreas Gruenbacher name = xattr_full_name(handler, name); 2815aa7c5241SAndreas Gruenbacher return simple_xattr_get(&info->xattrs, name, buffer, size); 2816aa7c5241SAndreas Gruenbacher } 2817aa7c5241SAndreas Gruenbacher 2818aa7c5241SAndreas Gruenbacher static int shmem_xattr_handler_set(const struct xattr_handler *handler, 281959301226SAl Viro struct dentry *unused, struct inode *inode, 282059301226SAl Viro const char *name, const void *value, 282159301226SAl Viro size_t size, int flags) 2822aa7c5241SAndreas Gruenbacher { 282359301226SAl Viro struct shmem_inode_info *info = SHMEM_I(inode); 2824aa7c5241SAndreas Gruenbacher 2825aa7c5241SAndreas Gruenbacher name = xattr_full_name(handler, name); 2826aa7c5241SAndreas Gruenbacher return simple_xattr_set(&info->xattrs, name, value, size, flags); 2827aa7c5241SAndreas Gruenbacher } 2828aa7c5241SAndreas Gruenbacher 2829aa7c5241SAndreas Gruenbacher static const struct xattr_handler shmem_security_xattr_handler = { 2830aa7c5241SAndreas Gruenbacher .prefix = XATTR_SECURITY_PREFIX, 2831aa7c5241SAndreas Gruenbacher .get = shmem_xattr_handler_get, 2832aa7c5241SAndreas Gruenbacher .set = shmem_xattr_handler_set, 2833aa7c5241SAndreas Gruenbacher }; 2834aa7c5241SAndreas Gruenbacher 2835aa7c5241SAndreas Gruenbacher static const struct xattr_handler shmem_trusted_xattr_handler = { 2836aa7c5241SAndreas Gruenbacher .prefix = XATTR_TRUSTED_PREFIX, 2837aa7c5241SAndreas Gruenbacher 
.get = shmem_xattr_handler_get, 2838aa7c5241SAndreas Gruenbacher .set = shmem_xattr_handler_set, 2839aa7c5241SAndreas Gruenbacher }; 2840aa7c5241SAndreas Gruenbacher 2841b09e0fa4SEric Paris static const struct xattr_handler *shmem_xattr_handlers[] = { 2842b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_POSIX_ACL 2843feda821eSChristoph Hellwig &posix_acl_access_xattr_handler, 2844feda821eSChristoph Hellwig &posix_acl_default_xattr_handler, 2845b09e0fa4SEric Paris #endif 2846aa7c5241SAndreas Gruenbacher &shmem_security_xattr_handler, 2847aa7c5241SAndreas Gruenbacher &shmem_trusted_xattr_handler, 2848b09e0fa4SEric Paris NULL 2849b09e0fa4SEric Paris }; 2850b09e0fa4SEric Paris 2851b09e0fa4SEric Paris static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2852b09e0fa4SEric Paris { 285375c3cfa8SDavid Howells struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 2854786534b9SAndreas Gruenbacher return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); 2855b09e0fa4SEric Paris } 2856b09e0fa4SEric Paris #endif /* CONFIG_TMPFS_XATTR */ 2857b09e0fa4SEric Paris 285869f07ec9SHugh Dickins static const struct inode_operations shmem_short_symlink_operations = { 28591da177e4SLinus Torvalds .readlink = generic_readlink, 28606b255391SAl Viro .get_link = simple_get_link, 2861b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 2862aa7c5241SAndreas Gruenbacher .setxattr = generic_setxattr, 2863aa7c5241SAndreas Gruenbacher .getxattr = generic_getxattr, 2864b09e0fa4SEric Paris .listxattr = shmem_listxattr, 2865aa7c5241SAndreas Gruenbacher .removexattr = generic_removexattr, 2866b09e0fa4SEric Paris #endif 28671da177e4SLinus Torvalds }; 28681da177e4SLinus Torvalds 286992e1d5beSArjan van de Ven static const struct inode_operations shmem_symlink_inode_operations = { 28701da177e4SLinus Torvalds .readlink = generic_readlink, 28716b255391SAl Viro .get_link = shmem_get_link, 2872b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 2873aa7c5241SAndreas Gruenbacher .setxattr = generic_setxattr, 2874aa7c5241SAndreas Gruenbacher .getxattr = generic_getxattr, 2875b09e0fa4SEric Paris .listxattr = shmem_listxattr, 2876aa7c5241SAndreas Gruenbacher .removexattr = generic_removexattr, 287739f0247dSAndreas Gruenbacher #endif 2878b09e0fa4SEric Paris }; 287939f0247dSAndreas Gruenbacher 288091828a40SDavid M. Grimes static struct dentry *shmem_get_parent(struct dentry *child) 288191828a40SDavid M. Grimes { 288291828a40SDavid M. Grimes return ERR_PTR(-ESTALE); 288391828a40SDavid M. Grimes } 288491828a40SDavid M. Grimes 288591828a40SDavid M. Grimes static int shmem_match(struct inode *ino, void *vfh) 288691828a40SDavid M. Grimes { 288791828a40SDavid M. Grimes __u32 *fh = vfh; 288891828a40SDavid M. Grimes __u64 inum = fh[2]; 288991828a40SDavid M. Grimes inum = (inum << 32) | fh[1]; 289091828a40SDavid M. Grimes return ino->i_ino == inum && fh[0] == ino->i_generation; 289191828a40SDavid M. Grimes } 289291828a40SDavid M. Grimes 2893480b116cSChristoph Hellwig static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 2894480b116cSChristoph Hellwig struct fid *fid, int fh_len, int fh_type) 289591828a40SDavid M. Grimes { 289691828a40SDavid M. Grimes struct inode *inode; 2897480b116cSChristoph Hellwig struct dentry *dentry = NULL; 289835c2a7f4SHugh Dickins u64 inum; 289991828a40SDavid M. 
Grimes 2900480b116cSChristoph Hellwig if (fh_len < 3) 2901480b116cSChristoph Hellwig return NULL; 2902480b116cSChristoph Hellwig 290335c2a7f4SHugh Dickins inum = fid->raw[2]; 290435c2a7f4SHugh Dickins inum = (inum << 32) | fid->raw[1]; 290535c2a7f4SHugh Dickins 2906480b116cSChristoph Hellwig inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2907480b116cSChristoph Hellwig shmem_match, fid->raw); 290891828a40SDavid M. Grimes if (inode) { 2909480b116cSChristoph Hellwig dentry = d_find_alias(inode); 291091828a40SDavid M. Grimes iput(inode); 291191828a40SDavid M. Grimes } 291291828a40SDavid M. Grimes 2913480b116cSChristoph Hellwig return dentry; 291491828a40SDavid M. Grimes } 291591828a40SDavid M. Grimes 2916b0b0382bSAl Viro static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 2917b0b0382bSAl Viro struct inode *parent) 291891828a40SDavid M. Grimes { 29195fe0c237SAneesh Kumar K.V if (*len < 3) { 29205fe0c237SAneesh Kumar K.V *len = 3; 292194e07a75SNamjae Jeon return FILEID_INVALID; 29225fe0c237SAneesh Kumar K.V } 292391828a40SDavid M. Grimes 29241d3382cbSAl Viro if (inode_unhashed(inode)) { 292591828a40SDavid M. Grimes /* Unfortunately insert_inode_hash is not idempotent, 292691828a40SDavid M. Grimes * so as we hash inodes here rather than at creation 292791828a40SDavid M. Grimes * time, we need a lock to ensure we only try 292891828a40SDavid M. Grimes * to do it once 292991828a40SDavid M. Grimes */ 293091828a40SDavid M. Grimes static DEFINE_SPINLOCK(lock); 293191828a40SDavid M. Grimes spin_lock(&lock); 29321d3382cbSAl Viro if (inode_unhashed(inode)) 293391828a40SDavid M. Grimes __insert_inode_hash(inode, 293491828a40SDavid M. Grimes inode->i_ino + inode->i_generation); 293591828a40SDavid M. Grimes spin_unlock(&lock); 293691828a40SDavid M. Grimes } 293791828a40SDavid M. Grimes 293891828a40SDavid M. Grimes fh[0] = inode->i_generation; 293991828a40SDavid M. Grimes fh[1] = inode->i_ino; 294091828a40SDavid M. Grimes fh[2] = ((__u64)inode->i_ino) >> 32; 294191828a40SDavid M. Grimes 294291828a40SDavid M. Grimes *len = 3; 294391828a40SDavid M. Grimes return 1; 294491828a40SDavid M. Grimes } 294591828a40SDavid M. Grimes 294639655164SChristoph Hellwig static const struct export_operations shmem_export_ops = { 294791828a40SDavid M. Grimes .get_parent = shmem_get_parent, 294891828a40SDavid M. Grimes .encode_fh = shmem_encode_fh, 2949480b116cSChristoph Hellwig .fh_to_dentry = shmem_fh_to_dentry, 295091828a40SDavid M. Grimes }; 295191828a40SDavid M. Grimes 2952680d794bSakpm@linux-foundation.org static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 2953680d794bSakpm@linux-foundation.org bool remount) 29541da177e4SLinus Torvalds { 29551da177e4SLinus Torvalds char *this_char, *value, *rest; 295649cd0a5cSGreg Thelen struct mempolicy *mpol = NULL; 29578751e039SEric W. Biederman uid_t uid; 29588751e039SEric W. Biederman gid_t gid; 29591da177e4SLinus Torvalds 2960b00dc3adSHugh Dickins while (options != NULL) { 2961b00dc3adSHugh Dickins this_char = options; 2962b00dc3adSHugh Dickins for (;;) { 2963b00dc3adSHugh Dickins /* 2964b00dc3adSHugh Dickins * NUL-terminate this option: unfortunately, 2965b00dc3adSHugh Dickins * mount options form a comma-separated list, 2966b00dc3adSHugh Dickins * but mpol's nodelist may also contain commas. 
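 *
 * For example (an illustrative mount line), with
 *
 *	mount -t tmpfs -o size=512m,uid=1000,mpol=bind:0,2 tmpfs /mnt/scratch
 *
 * the comma inside "bind:0,2" belongs to the nodelist, not to the
 * option list, which is why a comma is only treated as a separator
 * here when the character following it is not a digit.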
2967b00dc3adSHugh Dickins */ 2968b00dc3adSHugh Dickins options = strchr(options, ','); 2969b00dc3adSHugh Dickins if (options == NULL) 2970b00dc3adSHugh Dickins break; 2971b00dc3adSHugh Dickins options++; 2972b00dc3adSHugh Dickins if (!isdigit(*options)) { 2973b00dc3adSHugh Dickins options[-1] = '\0'; 2974b00dc3adSHugh Dickins break; 2975b00dc3adSHugh Dickins } 2976b00dc3adSHugh Dickins } 29771da177e4SLinus Torvalds if (!*this_char) 29781da177e4SLinus Torvalds continue; 29791da177e4SLinus Torvalds if ((value = strchr(this_char,'=')) != NULL) { 29801da177e4SLinus Torvalds *value++ = 0; 29811da177e4SLinus Torvalds } else { 29821170532bSJoe Perches pr_err("tmpfs: No value for mount option '%s'\n", 29831da177e4SLinus Torvalds this_char); 298449cd0a5cSGreg Thelen goto error; 29851da177e4SLinus Torvalds } 29861da177e4SLinus Torvalds 29871da177e4SLinus Torvalds if (!strcmp(this_char,"size")) { 29881da177e4SLinus Torvalds unsigned long long size; 29891da177e4SLinus Torvalds size = memparse(value,&rest); 29901da177e4SLinus Torvalds if (*rest == '%') { 29911da177e4SLinus Torvalds size <<= PAGE_SHIFT; 29921da177e4SLinus Torvalds size *= totalram_pages; 29931da177e4SLinus Torvalds do_div(size, 100); 29941da177e4SLinus Torvalds rest++; 29951da177e4SLinus Torvalds } 29961da177e4SLinus Torvalds if (*rest) 29971da177e4SLinus Torvalds goto bad_val; 2998680d794bSakpm@linux-foundation.org sbinfo->max_blocks = 299909cbfeafSKirill A. Shutemov DIV_ROUND_UP(size, PAGE_SIZE); 30001da177e4SLinus Torvalds } else if (!strcmp(this_char,"nr_blocks")) { 3001680d794bSakpm@linux-foundation.org sbinfo->max_blocks = memparse(value, &rest); 30021da177e4SLinus Torvalds if (*rest) 30031da177e4SLinus Torvalds goto bad_val; 30041da177e4SLinus Torvalds } else if (!strcmp(this_char,"nr_inodes")) { 3005680d794bSakpm@linux-foundation.org sbinfo->max_inodes = memparse(value, &rest); 30061da177e4SLinus Torvalds if (*rest) 30071da177e4SLinus Torvalds goto bad_val; 30081da177e4SLinus Torvalds } else if (!strcmp(this_char,"mode")) { 3009680d794bSakpm@linux-foundation.org if (remount) 30101da177e4SLinus Torvalds continue; 3011680d794bSakpm@linux-foundation.org sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 30121da177e4SLinus Torvalds if (*rest) 30131da177e4SLinus Torvalds goto bad_val; 30141da177e4SLinus Torvalds } else if (!strcmp(this_char,"uid")) { 3015680d794bSakpm@linux-foundation.org if (remount) 30161da177e4SLinus Torvalds continue; 30178751e039SEric W. Biederman uid = simple_strtoul(value, &rest, 0); 30181da177e4SLinus Torvalds if (*rest) 30191da177e4SLinus Torvalds goto bad_val; 30208751e039SEric W. Biederman sbinfo->uid = make_kuid(current_user_ns(), uid); 30218751e039SEric W. Biederman if (!uid_valid(sbinfo->uid)) 30228751e039SEric W. Biederman goto bad_val; 30231da177e4SLinus Torvalds } else if (!strcmp(this_char,"gid")) { 3024680d794bSakpm@linux-foundation.org if (remount) 30251da177e4SLinus Torvalds continue; 30268751e039SEric W. Biederman gid = simple_strtoul(value, &rest, 0); 30271da177e4SLinus Torvalds if (*rest) 30281da177e4SLinus Torvalds goto bad_val; 30298751e039SEric W. Biederman sbinfo->gid = make_kgid(current_user_ns(), gid); 30308751e039SEric W. Biederman if (!gid_valid(sbinfo->gid)) 30318751e039SEric W. Biederman goto bad_val; 30325a6e75f8SKirill A. Shutemov #ifdef CONFIG_TRANSPARENT_HUGEPAGE 30335a6e75f8SKirill A. Shutemov } else if (!strcmp(this_char, "huge")) { 30345a6e75f8SKirill A. Shutemov int huge; 30355a6e75f8SKirill A. Shutemov huge = shmem_parse_huge(value); 30365a6e75f8SKirill A. 
Shutemov if (huge < 0) 30375a6e75f8SKirill A. Shutemov goto bad_val; 30385a6e75f8SKirill A. Shutemov if (!has_transparent_hugepage() && 30395a6e75f8SKirill A. Shutemov huge != SHMEM_HUGE_NEVER) 30405a6e75f8SKirill A. Shutemov goto bad_val; 30415a6e75f8SKirill A. Shutemov sbinfo->huge = huge; 30425a6e75f8SKirill A. Shutemov #endif 30435a6e75f8SKirill A. Shutemov #ifdef CONFIG_NUMA 30447339ff83SRobin Holt } else if (!strcmp(this_char,"mpol")) { 304549cd0a5cSGreg Thelen mpol_put(mpol); 304649cd0a5cSGreg Thelen mpol = NULL; 304749cd0a5cSGreg Thelen if (mpol_parse_str(value, &mpol)) 30487339ff83SRobin Holt goto bad_val; 30495a6e75f8SKirill A. Shutemov #endif 30501da177e4SLinus Torvalds } else { 30511170532bSJoe Perches pr_err("tmpfs: Bad mount option %s\n", this_char); 305249cd0a5cSGreg Thelen goto error; 30531da177e4SLinus Torvalds } 30541da177e4SLinus Torvalds } 305549cd0a5cSGreg Thelen sbinfo->mpol = mpol; 30561da177e4SLinus Torvalds return 0; 30571da177e4SLinus Torvalds 30581da177e4SLinus Torvalds bad_val: 30591170532bSJoe Perches pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", 30601da177e4SLinus Torvalds value, this_char); 306149cd0a5cSGreg Thelen error: 306249cd0a5cSGreg Thelen mpol_put(mpol); 30631da177e4SLinus Torvalds return 1; 30641da177e4SLinus Torvalds 30651da177e4SLinus Torvalds } 30661da177e4SLinus Torvalds 30671da177e4SLinus Torvalds static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 30681da177e4SLinus Torvalds { 30691da177e4SLinus Torvalds struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 3070680d794bSakpm@linux-foundation.org struct shmem_sb_info config = *sbinfo; 30710edd73b3SHugh Dickins unsigned long inodes; 30720edd73b3SHugh Dickins int error = -EINVAL; 30731da177e4SLinus Torvalds 30745f00110fSGreg Thelen config.mpol = NULL; 3075680d794bSakpm@linux-foundation.org if (shmem_parse_options(data, &config, true)) 30760edd73b3SHugh Dickins return error; 30770edd73b3SHugh Dickins 30780edd73b3SHugh Dickins spin_lock(&sbinfo->stat_lock); 30790edd73b3SHugh Dickins inodes = sbinfo->max_inodes - sbinfo->free_inodes; 30807e496299STim Chen if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 30810edd73b3SHugh Dickins goto out; 3082680d794bSakpm@linux-foundation.org if (config.max_inodes < inodes) 30830edd73b3SHugh Dickins goto out; 30840edd73b3SHugh Dickins /* 308554af6042SHugh Dickins * Those tests disallow limited->unlimited while any are in use; 30860edd73b3SHugh Dickins * but we must separately disallow unlimited->limited, because 30870edd73b3SHugh Dickins * in that case we have no record of how much is already in use. 30880edd73b3SHugh Dickins */ 3089680d794bSakpm@linux-foundation.org if (config.max_blocks && !sbinfo->max_blocks) 30900edd73b3SHugh Dickins goto out; 3091680d794bSakpm@linux-foundation.org if (config.max_inodes && !sbinfo->max_inodes) 30920edd73b3SHugh Dickins goto out; 30930edd73b3SHugh Dickins 30940edd73b3SHugh Dickins error = 0; 30955a6e75f8SKirill A. Shutemov sbinfo->huge = config.huge; 3096680d794bSakpm@linux-foundation.org sbinfo->max_blocks = config.max_blocks; 3097680d794bSakpm@linux-foundation.org sbinfo->max_inodes = config.max_inodes; 3098680d794bSakpm@linux-foundation.org sbinfo->free_inodes = config.max_inodes - inodes; 309971fe804bSLee Schermerhorn 31005f00110fSGreg Thelen /* 31015f00110fSGreg Thelen * Preserve previous mempolicy unless mpol remount option was specified. 
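 *
 * Illustrative commands: "mount -o remount,size=2g /mnt/scratch"
 * keeps whatever mpol= was given at mount time, while
 * "mount -o remount,mpol=interleave /mnt/scratch" drops the old
 * policy and transfers in the new one.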
31025f00110fSGreg Thelen */ 31035f00110fSGreg Thelen if (config.mpol) { 310471fe804bSLee Schermerhorn mpol_put(sbinfo->mpol); 310571fe804bSLee Schermerhorn sbinfo->mpol = config.mpol; /* transfers initial ref */ 31065f00110fSGreg Thelen } 31070edd73b3SHugh Dickins out: 31080edd73b3SHugh Dickins spin_unlock(&sbinfo->stat_lock); 31090edd73b3SHugh Dickins return error; 31101da177e4SLinus Torvalds } 3111680d794bSakpm@linux-foundation.org 311234c80b1dSAl Viro static int shmem_show_options(struct seq_file *seq, struct dentry *root) 3113680d794bSakpm@linux-foundation.org { 311434c80b1dSAl Viro struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 3115680d794bSakpm@linux-foundation.org 3116680d794bSakpm@linux-foundation.org if (sbinfo->max_blocks != shmem_default_max_blocks()) 3117680d794bSakpm@linux-foundation.org seq_printf(seq, ",size=%luk", 311809cbfeafSKirill A. Shutemov sbinfo->max_blocks << (PAGE_SHIFT - 10)); 3119680d794bSakpm@linux-foundation.org if (sbinfo->max_inodes != shmem_default_max_inodes()) 3120680d794bSakpm@linux-foundation.org seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 3121680d794bSakpm@linux-foundation.org if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 312209208d15SAl Viro seq_printf(seq, ",mode=%03ho", sbinfo->mode); 31238751e039SEric W. Biederman if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 31248751e039SEric W. Biederman seq_printf(seq, ",uid=%u", 31258751e039SEric W. Biederman from_kuid_munged(&init_user_ns, sbinfo->uid)); 31268751e039SEric W. Biederman if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 31278751e039SEric W. Biederman seq_printf(seq, ",gid=%u", 31288751e039SEric W. Biederman from_kgid_munged(&init_user_ns, sbinfo->gid)); 31295a6e75f8SKirill A. Shutemov #ifdef CONFIG_TRANSPARENT_HUGEPAGE 31305a6e75f8SKirill A. Shutemov /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ 31315a6e75f8SKirill A. Shutemov if (sbinfo->huge) 31325a6e75f8SKirill A. Shutemov seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); 31335a6e75f8SKirill A. 
Shutemov #endif 313471fe804bSLee Schermerhorn shmem_show_mpol(seq, sbinfo->mpol); 3135680d794bSakpm@linux-foundation.org return 0; 3136680d794bSakpm@linux-foundation.org } 31379183df25SDavid Herrmann 31389183df25SDavid Herrmann #define MFD_NAME_PREFIX "memfd:" 31399183df25SDavid Herrmann #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) 31409183df25SDavid Herrmann #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) 31419183df25SDavid Herrmann 31429183df25SDavid Herrmann #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) 31439183df25SDavid Herrmann 31449183df25SDavid Herrmann SYSCALL_DEFINE2(memfd_create, 31459183df25SDavid Herrmann const char __user *, uname, 31469183df25SDavid Herrmann unsigned int, flags) 31479183df25SDavid Herrmann { 31489183df25SDavid Herrmann struct shmem_inode_info *info; 31499183df25SDavid Herrmann struct file *file; 31509183df25SDavid Herrmann int fd, error; 31519183df25SDavid Herrmann char *name; 31529183df25SDavid Herrmann long len; 31539183df25SDavid Herrmann 31549183df25SDavid Herrmann if (flags & ~(unsigned int)MFD_ALL_FLAGS) 31559183df25SDavid Herrmann return -EINVAL; 31569183df25SDavid Herrmann 31579183df25SDavid Herrmann /* length includes terminating zero */ 31589183df25SDavid Herrmann len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); 31599183df25SDavid Herrmann if (len <= 0) 31609183df25SDavid Herrmann return -EFAULT; 31619183df25SDavid Herrmann if (len > MFD_NAME_MAX_LEN + 1) 31629183df25SDavid Herrmann return -EINVAL; 31639183df25SDavid Herrmann 31649183df25SDavid Herrmann name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); 31659183df25SDavid Herrmann if (!name) 31669183df25SDavid Herrmann return -ENOMEM; 31679183df25SDavid Herrmann 31689183df25SDavid Herrmann strcpy(name, MFD_NAME_PREFIX); 31699183df25SDavid Herrmann if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { 31709183df25SDavid Herrmann error = -EFAULT; 31719183df25SDavid Herrmann goto err_name; 31729183df25SDavid Herrmann } 31739183df25SDavid Herrmann 31749183df25SDavid Herrmann /* terminating-zero may have changed after strnlen_user() returned */ 31759183df25SDavid Herrmann if (name[len + MFD_NAME_PREFIX_LEN - 1]) { 31769183df25SDavid Herrmann error = -EFAULT; 31779183df25SDavid Herrmann goto err_name; 31789183df25SDavid Herrmann } 31799183df25SDavid Herrmann 31809183df25SDavid Herrmann fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); 31819183df25SDavid Herrmann if (fd < 0) { 31829183df25SDavid Herrmann error = fd; 31839183df25SDavid Herrmann goto err_name; 31849183df25SDavid Herrmann } 31859183df25SDavid Herrmann 31869183df25SDavid Herrmann file = shmem_file_setup(name, 0, VM_NORESERVE); 31879183df25SDavid Herrmann if (IS_ERR(file)) { 31889183df25SDavid Herrmann error = PTR_ERR(file); 31899183df25SDavid Herrmann goto err_fd; 31909183df25SDavid Herrmann } 31919183df25SDavid Herrmann info = SHMEM_I(file_inode(file)); 31929183df25SDavid Herrmann file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; 31939183df25SDavid Herrmann file->f_flags |= O_RDWR | O_LARGEFILE; 31949183df25SDavid Herrmann if (flags & MFD_ALLOW_SEALING) 31959183df25SDavid Herrmann info->seals &= ~F_SEAL_SEAL; 31969183df25SDavid Herrmann 31979183df25SDavid Herrmann fd_install(fd, file); 31989183df25SDavid Herrmann kfree(name); 31999183df25SDavid Herrmann return fd; 32009183df25SDavid Herrmann 32019183df25SDavid Herrmann err_fd: 32029183df25SDavid Herrmann put_unused_fd(fd); 32039183df25SDavid Herrmann err_name: 32049183df25SDavid Herrmann kfree(name); 32059183df25SDavid Herrmann return error; 32069183df25SDavid Herrmann } 32079183df25SDavid Herrmann 3208680d794bSakpm@linux-foundation.org #endif /* CONFIG_TMPFS */ 32091da177e4SLinus Torvalds 32101da177e4SLinus Torvalds static void shmem_put_super(struct super_block *sb) 32111da177e4SLinus Torvalds { 3212602586a8SHugh Dickins struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 3213602586a8SHugh Dickins 3214602586a8SHugh Dickins percpu_counter_destroy(&sbinfo->used_blocks); 321549cd0a5cSGreg Thelen mpol_put(sbinfo->mpol); 3216602586a8SHugh Dickins kfree(sbinfo); 32171da177e4SLinus Torvalds sb->s_fs_info = NULL; 32181da177e4SLinus Torvalds } 32191da177e4SLinus Torvalds 32202b2af54aSKay Sievers int shmem_fill_super(struct super_block *sb, void *data, int silent) 32211da177e4SLinus Torvalds { 32221da177e4SLinus Torvalds struct inode *inode; 32230edd73b3SHugh Dickins struct shmem_sb_info *sbinfo; 3224680d794bSakpm@linux-foundation.org int err = -ENOMEM; 3225680d794bSakpm@linux-foundation.org 3226680d794bSakpm@linux-foundation.org /* Round up to L1_CACHE_BYTES to resist false sharing */ 3227425fbf04SPekka Enberg sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 3228680d794bSakpm@linux-foundation.org L1_CACHE_BYTES), GFP_KERNEL); 3229680d794bSakpm@linux-foundation.org if (!sbinfo) 3230680d794bSakpm@linux-foundation.org return -ENOMEM; 3231680d794bSakpm@linux-foundation.org 3232680d794bSakpm@linux-foundation.org sbinfo->mode = S_IRWXUGO | S_ISVTX; 323376aac0e9SDavid Howells sbinfo->uid = current_fsuid(); 323476aac0e9SDavid Howells sbinfo->gid = current_fsgid(); 3235680d794bSakpm@linux-foundation.org sb->s_fs_info = sbinfo; 32361da177e4SLinus Torvalds 32370edd73b3SHugh Dickins #ifdef CONFIG_TMPFS 32381da177e4SLinus Torvalds /* 32391da177e4SLinus Torvalds * Per default we only allow half of the physical ram per 32401da177e4SLinus Torvalds * tmpfs instance, limiting inodes to one per page of lowmem; 32411da177e4SLinus Torvalds * but the internal instance is left unlimited. 
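 *
 * Rough worked example (numbers illustrative): with 8 GiB of RAM and
 * 4 KiB pages, shmem_default_max_blocks() is totalram / 2, i.e. about
 * 1M blocks (a 4 GiB size limit), and nr_inodes defaults to roughly
 * one inode per page of lowmem, on the order of a million here.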
32421da177e4SLinus Torvalds */ 3243ca4e0519SAl Viro if (!(sb->s_flags & MS_KERNMOUNT)) { 3244680d794bSakpm@linux-foundation.org sbinfo->max_blocks = shmem_default_max_blocks(); 3245680d794bSakpm@linux-foundation.org sbinfo->max_inodes = shmem_default_max_inodes(); 3246680d794bSakpm@linux-foundation.org if (shmem_parse_options(data, sbinfo, false)) { 3247680d794bSakpm@linux-foundation.org err = -EINVAL; 3248680d794bSakpm@linux-foundation.org goto failed; 3249680d794bSakpm@linux-foundation.org } 3250ca4e0519SAl Viro } else { 3251ca4e0519SAl Viro sb->s_flags |= MS_NOUSER; 32521da177e4SLinus Torvalds } 325391828a40SDavid M. Grimes sb->s_export_op = &shmem_export_ops; 32542f6e38f3SHugh Dickins sb->s_flags |= MS_NOSEC; 32550edd73b3SHugh Dickins #else 32560edd73b3SHugh Dickins sb->s_flags |= MS_NOUSER; 32570edd73b3SHugh Dickins #endif 32581da177e4SLinus Torvalds 32591da177e4SLinus Torvalds spin_lock_init(&sbinfo->stat_lock); 3260908c7f19STejun Heo if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 3261602586a8SHugh Dickins goto failed; 3262680d794bSakpm@linux-foundation.org sbinfo->free_inodes = sbinfo->max_inodes; 32631da177e4SLinus Torvalds 3264285b2c4fSHugh Dickins sb->s_maxbytes = MAX_LFS_FILESIZE; 326509cbfeafSKirill A. Shutemov sb->s_blocksize = PAGE_SIZE; 326609cbfeafSKirill A. Shutemov sb->s_blocksize_bits = PAGE_SHIFT; 32671da177e4SLinus Torvalds sb->s_magic = TMPFS_MAGIC; 32681da177e4SLinus Torvalds sb->s_op = &shmem_ops; 3269cfd95a9cSRobin H. Johnson sb->s_time_gran = 1; 3270b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 327139f0247dSAndreas Gruenbacher sb->s_xattr = shmem_xattr_handlers; 3272b09e0fa4SEric Paris #endif 3273b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_POSIX_ACL 327439f0247dSAndreas Gruenbacher sb->s_flags |= MS_POSIXACL; 327539f0247dSAndreas Gruenbacher #endif 32760edd73b3SHugh Dickins 3277454abafeSDmitry Monakhov inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 32781da177e4SLinus Torvalds if (!inode) 32791da177e4SLinus Torvalds goto failed; 3280680d794bSakpm@linux-foundation.org inode->i_uid = sbinfo->uid; 3281680d794bSakpm@linux-foundation.org inode->i_gid = sbinfo->gid; 3282318ceed0SAl Viro sb->s_root = d_make_root(inode); 3283318ceed0SAl Viro if (!sb->s_root) 328448fde701SAl Viro goto failed; 32851da177e4SLinus Torvalds return 0; 32861da177e4SLinus Torvalds 32871da177e4SLinus Torvalds failed: 32881da177e4SLinus Torvalds shmem_put_super(sb); 32891da177e4SLinus Torvalds return err; 32901da177e4SLinus Torvalds } 32911da177e4SLinus Torvalds 3292fcc234f8SPekka Enberg static struct kmem_cache *shmem_inode_cachep; 32931da177e4SLinus Torvalds 32941da177e4SLinus Torvalds static struct inode *shmem_alloc_inode(struct super_block *sb) 32951da177e4SLinus Torvalds { 329641ffe5d5SHugh Dickins struct shmem_inode_info *info; 329741ffe5d5SHugh Dickins info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 329841ffe5d5SHugh Dickins if (!info) 32991da177e4SLinus Torvalds return NULL; 330041ffe5d5SHugh Dickins return &info->vfs_inode; 33011da177e4SLinus Torvalds } 33021da177e4SLinus Torvalds 330341ffe5d5SHugh Dickins static void shmem_destroy_callback(struct rcu_head *head) 3304fa0d7e3dSNick Piggin { 3305fa0d7e3dSNick Piggin struct inode *inode = container_of(head, struct inode, i_rcu); 330684e710daSAl Viro if (S_ISLNK(inode->i_mode)) 33073ed47db3SAl Viro kfree(inode->i_link); 3308fa0d7e3dSNick Piggin kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 3309fa0d7e3dSNick Piggin } 3310fa0d7e3dSNick Piggin 33111da177e4SLinus Torvalds static void 
shmem_destroy_inode(struct inode *inode) 33121da177e4SLinus Torvalds { 331309208d15SAl Viro if (S_ISREG(inode->i_mode)) 33141da177e4SLinus Torvalds mpol_free_shared_policy(&SHMEM_I(inode)->policy); 331541ffe5d5SHugh Dickins call_rcu(&inode->i_rcu, shmem_destroy_callback); 33161da177e4SLinus Torvalds } 33171da177e4SLinus Torvalds 331841ffe5d5SHugh Dickins static void shmem_init_inode(void *foo) 33191da177e4SLinus Torvalds { 332041ffe5d5SHugh Dickins struct shmem_inode_info *info = foo; 332141ffe5d5SHugh Dickins inode_init_once(&info->vfs_inode); 33221da177e4SLinus Torvalds } 33231da177e4SLinus Torvalds 332441ffe5d5SHugh Dickins static int shmem_init_inodecache(void) 33251da177e4SLinus Torvalds { 33261da177e4SLinus Torvalds shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 33271da177e4SLinus Torvalds sizeof(struct shmem_inode_info), 33285d097056SVladimir Davydov 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 33291da177e4SLinus Torvalds return 0; 33301da177e4SLinus Torvalds } 33311da177e4SLinus Torvalds 333241ffe5d5SHugh Dickins static void shmem_destroy_inodecache(void) 33331da177e4SLinus Torvalds { 33341a1d92c1SAlexey Dobriyan kmem_cache_destroy(shmem_inode_cachep); 33351da177e4SLinus Torvalds } 33361da177e4SLinus Torvalds 3337f5e54d6eSChristoph Hellwig static const struct address_space_operations shmem_aops = { 33381da177e4SLinus Torvalds .writepage = shmem_writepage, 333976719325SKen Chen .set_page_dirty = __set_page_dirty_no_writeback, 33401da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 3341800d15a5SNick Piggin .write_begin = shmem_write_begin, 3342800d15a5SNick Piggin .write_end = shmem_write_end, 33431da177e4SLinus Torvalds #endif 33441c93923cSAndrew Morton #ifdef CONFIG_MIGRATION 3345304dbdb7SLee Schermerhorn .migratepage = migrate_page, 33461c93923cSAndrew Morton #endif 3347aa261f54SAndi Kleen .error_remove_page = generic_error_remove_page, 33481da177e4SLinus Torvalds }; 33491da177e4SLinus Torvalds 335015ad7cdcSHelge Deller static const struct file_operations shmem_file_operations = { 33511da177e4SLinus Torvalds .mmap = shmem_mmap, 3352*c01d5b30SHugh Dickins .get_unmapped_area = shmem_get_unmapped_area, 33531da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 3354220f2ac9SHugh Dickins .llseek = shmem_file_llseek, 33552ba5bbedSAl Viro .read_iter = shmem_file_read_iter, 33568174202bSAl Viro .write_iter = generic_file_write_iter, 33571b061d92SChristoph Hellwig .fsync = noop_fsync, 3358708e3508SHugh Dickins .splice_read = shmem_file_splice_read, 3359f6cb85d0SAl Viro .splice_write = iter_file_splice_write, 336083e4fa9cSHugh Dickins .fallocate = shmem_fallocate, 33611da177e4SLinus Torvalds #endif 33621da177e4SLinus Torvalds }; 33631da177e4SLinus Torvalds 336492e1d5beSArjan van de Ven static const struct inode_operations shmem_inode_operations = { 336544a30220SYu Zhao .getattr = shmem_getattr, 336694c1e62dSHugh Dickins .setattr = shmem_setattr, 3367b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR 3368aa7c5241SAndreas Gruenbacher .setxattr = generic_setxattr, 3369aa7c5241SAndreas Gruenbacher .getxattr = generic_getxattr, 3370b09e0fa4SEric Paris .listxattr = shmem_listxattr, 3371aa7c5241SAndreas Gruenbacher .removexattr = generic_removexattr, 3372feda821eSChristoph Hellwig .set_acl = simple_set_acl, 3373b09e0fa4SEric Paris #endif 33741da177e4SLinus Torvalds }; 33751da177e4SLinus Torvalds 337692e1d5beSArjan van de Ven static const struct inode_operations shmem_dir_inode_operations = { 33771da177e4SLinus Torvalds #ifdef CONFIG_TMPFS 33781da177e4SLinus Torvalds .create = shmem_create, 
33791da177e4SLinus Torvalds .lookup = simple_lookup,
33801da177e4SLinus Torvalds .link = shmem_link,
33811da177e4SLinus Torvalds .unlink = shmem_unlink,
33821da177e4SLinus Torvalds .symlink = shmem_symlink,
33831da177e4SLinus Torvalds .mkdir = shmem_mkdir,
33841da177e4SLinus Torvalds .rmdir = shmem_rmdir,
33851da177e4SLinus Torvalds .mknod = shmem_mknod,
33863b69ff51SMiklos Szeredi .rename2 = shmem_rename2,
338760545d0dSAl Viro .tmpfile = shmem_tmpfile,
33881da177e4SLinus Torvalds #endif
3389b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
3390aa7c5241SAndreas Gruenbacher .setxattr = generic_setxattr,
3391aa7c5241SAndreas Gruenbacher .getxattr = generic_getxattr,
3392b09e0fa4SEric Paris .listxattr = shmem_listxattr,
3393aa7c5241SAndreas Gruenbacher .removexattr = generic_removexattr,
3394b09e0fa4SEric Paris #endif
339539f0247dSAndreas Gruenbacher #ifdef CONFIG_TMPFS_POSIX_ACL
339694c1e62dSHugh Dickins .setattr = shmem_setattr,
3397feda821eSChristoph Hellwig .set_acl = simple_set_acl,
339839f0247dSAndreas Gruenbacher #endif
339939f0247dSAndreas Gruenbacher };
340039f0247dSAndreas Gruenbacher
340192e1d5beSArjan van de Ven static const struct inode_operations shmem_special_inode_operations = {
3402b09e0fa4SEric Paris #ifdef CONFIG_TMPFS_XATTR
3403aa7c5241SAndreas Gruenbacher .setxattr = generic_setxattr,
3404aa7c5241SAndreas Gruenbacher .getxattr = generic_getxattr,
3405b09e0fa4SEric Paris .listxattr = shmem_listxattr,
3406aa7c5241SAndreas Gruenbacher .removexattr = generic_removexattr,
3407b09e0fa4SEric Paris #endif
340839f0247dSAndreas Gruenbacher #ifdef CONFIG_TMPFS_POSIX_ACL
340994c1e62dSHugh Dickins .setattr = shmem_setattr,
3410feda821eSChristoph Hellwig .set_acl = simple_set_acl,
341139f0247dSAndreas Gruenbacher #endif
34121da177e4SLinus Torvalds };
34131da177e4SLinus Torvalds
3414759b9775SHugh Dickins static const struct super_operations shmem_ops = {
34151da177e4SLinus Torvalds .alloc_inode = shmem_alloc_inode,
34161da177e4SLinus Torvalds .destroy_inode = shmem_destroy_inode,
34171da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
34181da177e4SLinus Torvalds .statfs = shmem_statfs,
34191da177e4SLinus Torvalds .remount_fs = shmem_remount_fs,
3420680d794bSakpm@linux-foundation.org .show_options = shmem_show_options,
34211da177e4SLinus Torvalds #endif
34221f895f75SAl Viro .evict_inode = shmem_evict_inode,
34231da177e4SLinus Torvalds .drop_inode = generic_delete_inode,
34241da177e4SLinus Torvalds .put_super = shmem_put_super,
34251da177e4SLinus Torvalds };
34261da177e4SLinus Torvalds
3427f0f37e2fSAlexey Dobriyan static const struct vm_operations_struct shmem_vm_ops = {
342854cb8821SNick Piggin .fault = shmem_fault,
3429d7c17551SNing Qu .map_pages = filemap_map_pages,
34301da177e4SLinus Torvalds #ifdef CONFIG_NUMA
34311da177e4SLinus Torvalds .set_policy = shmem_set_policy,
34321da177e4SLinus Torvalds .get_policy = shmem_get_policy,
34331da177e4SLinus Torvalds #endif
34341da177e4SLinus Torvalds };
34351da177e4SLinus Torvalds
34363c26ff6eSAl Viro static struct dentry *shmem_mount(struct file_system_type *fs_type,
34373c26ff6eSAl Viro int flags, const char *dev_name, void *data)
34381da177e4SLinus Torvalds {
34393c26ff6eSAl Viro return mount_nodev(fs_type, flags, data, shmem_fill_super);
34401da177e4SLinus Torvalds }
34411da177e4SLinus Torvalds
344241ffe5d5SHugh Dickins static struct file_system_type shmem_fs_type = {
34431da177e4SLinus Torvalds .owner = THIS_MODULE,
34441da177e4SLinus Torvalds .name = "tmpfs",
34453c26ff6eSAl Viro .mount = shmem_mount,
34461da177e4SLinus Torvalds .kill_sb = kill_litter_super,
34472b8576cbSEric W. Biederman .fs_flags = FS_USERNS_MOUNT,
34481da177e4SLinus Torvalds };
34491da177e4SLinus Torvalds
345041ffe5d5SHugh Dickins int __init shmem_init(void)
34511da177e4SLinus Torvalds {
34521da177e4SLinus Torvalds int error;
34531da177e4SLinus Torvalds
345416203a7aSRob Landley /* If rootfs called this, don't re-init */
345516203a7aSRob Landley if (shmem_inode_cachep)
345616203a7aSRob Landley return 0;
345716203a7aSRob Landley
345841ffe5d5SHugh Dickins error = shmem_init_inodecache();
34591da177e4SLinus Torvalds if (error)
34601da177e4SLinus Torvalds goto out3;
34611da177e4SLinus Torvalds
346241ffe5d5SHugh Dickins error = register_filesystem(&shmem_fs_type);
34631da177e4SLinus Torvalds if (error) {
34641170532bSJoe Perches pr_err("Could not register tmpfs\n");
34651da177e4SLinus Torvalds goto out2;
34661da177e4SLinus Torvalds }
346795dc112aSGreg Kroah-Hartman
3468ca4e0519SAl Viro shm_mnt = kern_mount(&shmem_fs_type);
34691da177e4SLinus Torvalds if (IS_ERR(shm_mnt)) {
34701da177e4SLinus Torvalds error = PTR_ERR(shm_mnt);
34711170532bSJoe Perches pr_err("Could not kern_mount tmpfs\n");
34721da177e4SLinus Torvalds goto out1;
34731da177e4SLinus Torvalds }
34745a6e75f8SKirill A. Shutemov
34755a6e75f8SKirill A. Shutemov #ifdef CONFIG_TRANSPARENT_HUGEPAGE
34765a6e75f8SKirill A. Shutemov if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY)
34775a6e75f8SKirill A. Shutemov SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
34785a6e75f8SKirill A. Shutemov else
34795a6e75f8SKirill A. Shutemov shmem_huge = 0; /* just in case it was patched */
34805a6e75f8SKirill A. Shutemov #endif
34811da177e4SLinus Torvalds return 0;
34821da177e4SLinus Torvalds
34831da177e4SLinus Torvalds out1:
348441ffe5d5SHugh Dickins unregister_filesystem(&shmem_fs_type);
34851da177e4SLinus Torvalds out2:
348641ffe5d5SHugh Dickins shmem_destroy_inodecache();
34871da177e4SLinus Torvalds out3:
34881da177e4SLinus Torvalds shm_mnt = ERR_PTR(error);
34891da177e4SLinus Torvalds return error;
34901da177e4SLinus Torvalds }
3491853ac43aSMatt Mackall
34925a6e75f8SKirill A. Shutemov #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
34935a6e75f8SKirill A. Shutemov static ssize_t shmem_enabled_show(struct kobject *kobj,
34945a6e75f8SKirill A. Shutemov struct kobj_attribute *attr, char *buf)
34955a6e75f8SKirill A. Shutemov {
34965a6e75f8SKirill A. Shutemov int values[] = {
34975a6e75f8SKirill A. Shutemov SHMEM_HUGE_ALWAYS,
34985a6e75f8SKirill A. Shutemov SHMEM_HUGE_WITHIN_SIZE,
34995a6e75f8SKirill A. Shutemov SHMEM_HUGE_ADVISE,
35005a6e75f8SKirill A. Shutemov SHMEM_HUGE_NEVER,
35015a6e75f8SKirill A. Shutemov SHMEM_HUGE_DENY,
35025a6e75f8SKirill A. Shutemov SHMEM_HUGE_FORCE,
35035a6e75f8SKirill A. Shutemov };
35045a6e75f8SKirill A. Shutemov int i, count;
35055a6e75f8SKirill A. Shutemov
35065a6e75f8SKirill A. Shutemov for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
35075a6e75f8SKirill A. Shutemov const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
35085a6e75f8SKirill A. Shutemov
35095a6e75f8SKirill A. Shutemov count += sprintf(buf + count, fmt,
35105a6e75f8SKirill A. Shutemov shmem_format_huge(values[i]));
35115a6e75f8SKirill A. Shutemov }
35125a6e75f8SKirill A. Shutemov buf[count - 1] = '\n';
35135a6e75f8SKirill A. Shutemov return count;
35145a6e75f8SKirill A. Shutemov }
35155a6e75f8SKirill A. Shutemov
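/*
 * Editor's note (not part of the original source): shmem_enabled_show()
 * above and shmem_enabled_store() below back a sysfs attribute, normally
 * exposed as /sys/kernel/mm/transparent_hugepage/shmem_enabled.  Reading
 * it lists every accepted keyword with the current choice bracketed, e.g.
 * "always within_size advise [never] deny force"; writing one of those
 * keywords selects the default huge-page policy for the internal tmpfs
 * mount, subject to the has_transparent_hugepage() check in the store
 * handler.
 */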
35165a6e75f8SKirill A. Shutemov static ssize_t shmem_enabled_store(struct kobject *kobj,
35175a6e75f8SKirill A. Shutemov struct kobj_attribute *attr, const char *buf, size_t count)
35185a6e75f8SKirill A. Shutemov {
35195a6e75f8SKirill A. Shutemov char tmp[16];
35205a6e75f8SKirill A. Shutemov int huge;
35215a6e75f8SKirill A. Shutemov
35225a6e75f8SKirill A. Shutemov if (count + 1 > sizeof(tmp))
35235a6e75f8SKirill A. Shutemov return -EINVAL;
35245a6e75f8SKirill A. Shutemov memcpy(tmp, buf, count);
35255a6e75f8SKirill A. Shutemov tmp[count] = '\0';
35265a6e75f8SKirill A. Shutemov if (count && tmp[count - 1] == '\n')
35275a6e75f8SKirill A. Shutemov tmp[count - 1] = '\0';
35285a6e75f8SKirill A. Shutemov
35295a6e75f8SKirill A. Shutemov huge = shmem_parse_huge(tmp);
35305a6e75f8SKirill A. Shutemov if (huge == -EINVAL)
35315a6e75f8SKirill A. Shutemov return -EINVAL;
35325a6e75f8SKirill A. Shutemov if (!has_transparent_hugepage() &&
35335a6e75f8SKirill A. Shutemov huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
35345a6e75f8SKirill A. Shutemov return -EINVAL;
35355a6e75f8SKirill A. Shutemov
35365a6e75f8SKirill A. Shutemov shmem_huge = huge;
35375a6e75f8SKirill A. Shutemov if (shmem_huge < SHMEM_HUGE_DENY)
35385a6e75f8SKirill A. Shutemov SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
35395a6e75f8SKirill A. Shutemov return count;
35405a6e75f8SKirill A. Shutemov }
35415a6e75f8SKirill A. Shutemov
35425a6e75f8SKirill A. Shutemov struct kobj_attribute shmem_enabled_attr =
35435a6e75f8SKirill A. Shutemov __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
35445a6e75f8SKirill A. Shutemov #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
35455a6e75f8SKirill A. Shutemov
3546853ac43aSMatt Mackall #else /* !CONFIG_SHMEM */
3547853ac43aSMatt Mackall
3548853ac43aSMatt Mackall /*
3549853ac43aSMatt Mackall * tiny-shmem: simple shmemfs and tmpfs using ramfs code
3550853ac43aSMatt Mackall *
3551853ac43aSMatt Mackall * This is intended for small systems where the benefits of the full
3552853ac43aSMatt Mackall * shmem code (swap-backed and resource-limited) are outweighed by
3553853ac43aSMatt Mackall * its complexity. On systems without swap this code should be
3554853ac43aSMatt Mackall * effectively equivalent, but much lighter weight.
3555853ac43aSMatt Mackall */
3556853ac43aSMatt Mackall
355741ffe5d5SHugh Dickins static struct file_system_type shmem_fs_type = {
3558853ac43aSMatt Mackall .name = "tmpfs",
35593c26ff6eSAl Viro .mount = ramfs_mount,
3560853ac43aSMatt Mackall .kill_sb = kill_litter_super,
35612b8576cbSEric W. Biederman .fs_flags = FS_USERNS_MOUNT,
3562853ac43aSMatt Mackall };
3563853ac43aSMatt Mackall
356441ffe5d5SHugh Dickins int __init shmem_init(void)
3565853ac43aSMatt Mackall {
356641ffe5d5SHugh Dickins BUG_ON(register_filesystem(&shmem_fs_type) != 0);
3567853ac43aSMatt Mackall
356841ffe5d5SHugh Dickins shm_mnt = kern_mount(&shmem_fs_type);
3569853ac43aSMatt Mackall BUG_ON(IS_ERR(shm_mnt));
3570853ac43aSMatt Mackall
3571853ac43aSMatt Mackall return 0;
3572853ac43aSMatt Mackall }
3573853ac43aSMatt Mackall
357441ffe5d5SHugh Dickins int shmem_unuse(swp_entry_t swap, struct page *page)
3575853ac43aSMatt Mackall {
3576853ac43aSMatt Mackall return 0;
3577853ac43aSMatt Mackall }
3578853ac43aSMatt Mackall
35793f96b79aSHugh Dickins int shmem_lock(struct file *file, int lock, struct user_struct *user)
35803f96b79aSHugh Dickins {
35813f96b79aSHugh Dickins return 0;
35823f96b79aSHugh Dickins }
35833f96b79aSHugh Dickins
358424513264SHugh Dickins void shmem_unlock_mapping(struct address_space *mapping)
358524513264SHugh Dickins {
358624513264SHugh Dickins }
358724513264SHugh Dickins
3588*c01d5b30SHugh Dickins #ifdef CONFIG_MMU
3589*c01d5b30SHugh Dickins unsigned long shmem_get_unmapped_area(struct file *file,
3590*c01d5b30SHugh Dickins unsigned long addr, unsigned long len,
3591*c01d5b30SHugh Dickins unsigned long pgoff, unsigned long flags)
3592*c01d5b30SHugh Dickins {
3593*c01d5b30SHugh Dickins return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
3594*c01d5b30SHugh Dickins }
3595*c01d5b30SHugh Dickins #endif
3596*c01d5b30SHugh Dickins
359741ffe5d5SHugh Dickins void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
359894c1e62dSHugh Dickins {
359941ffe5d5SHugh Dickins truncate_inode_pages_range(inode->i_mapping, lstart, lend);
360094c1e62dSHugh Dickins }
360194c1e62dSHugh Dickins EXPORT_SYMBOL_GPL(shmem_truncate_range);
360294c1e62dSHugh Dickins
3603853ac43aSMatt Mackall #define shmem_vm_ops generic_file_vm_ops
36040b0a0806SHugh Dickins #define shmem_file_operations ramfs_file_operations
3605454abafeSDmitry Monakhov #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
36060b0a0806SHugh Dickins #define shmem_acct_size(flags, size) 0
36070b0a0806SHugh Dickins #define shmem_unacct_size(flags, size) do {} while (0)
3608853ac43aSMatt Mackall
3609853ac43aSMatt Mackall #endif /* CONFIG_SHMEM */
3610853ac43aSMatt Mackall
3611853ac43aSMatt Mackall /* common code */
36121da177e4SLinus Torvalds
36133451538aSAl Viro static struct dentry_operations anon_ops = {
3614118b2302SAl Viro .d_dname = simple_dname
36153451538aSAl Viro };
36163451538aSAl Viro
3617c7277090SEric Paris static struct file *__shmem_file_setup(const char *name, loff_t size,
3618c7277090SEric Paris unsigned long flags, unsigned int i_flags)
36191da177e4SLinus Torvalds {
36206b4d0b27SAl Viro struct file *res;
36211da177e4SLinus Torvalds struct inode *inode;
36222c48b9c4SAl Viro struct path path;
36233451538aSAl Viro struct super_block *sb;
36241da177e4SLinus Torvalds struct qstr this;
36251da177e4SLinus Torvalds
36261da177e4SLinus Torvalds if (IS_ERR(shm_mnt))
36276b4d0b27SAl Viro return ERR_CAST(shm_mnt);
36281da177e4SLinus Torvalds
3629285b2c4fSHugh Dickins if (size < 0 || size > MAX_LFS_FILESIZE)
36301da177e4SLinus Torvalds return ERR_PTR(-EINVAL);
36311da177e4SLinus Torvalds
36321da177e4SLinus Torvalds if (shmem_acct_size(flags, size))
36331da177e4SLinus Torvalds return ERR_PTR(-ENOMEM);
36341da177e4SLinus Torvalds
36356b4d0b27SAl Viro res = ERR_PTR(-ENOMEM);
36361da177e4SLinus Torvalds this.name = name;
36371da177e4SLinus Torvalds this.len = strlen(name);
36381da177e4SLinus Torvalds this.hash = 0; /* will go */
36393451538aSAl Viro sb = shm_mnt->mnt_sb;
364066ee4b88SKonstantin Khlebnikov path.mnt = mntget(shm_mnt);
36413451538aSAl Viro path.dentry = d_alloc_pseudo(sb, &this);
36422c48b9c4SAl Viro if (!path.dentry)
36431da177e4SLinus Torvalds goto put_memory;
36443451538aSAl Viro d_set_d_op(path.dentry, &anon_ops);
36451da177e4SLinus Torvalds
36466b4d0b27SAl Viro res = ERR_PTR(-ENOSPC);
36473451538aSAl Viro inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
36481da177e4SLinus Torvalds if (!inode)
364966ee4b88SKonstantin Khlebnikov goto put_memory;
36501da177e4SLinus Torvalds
3651c7277090SEric Paris inode->i_flags |= i_flags;
36522c48b9c4SAl Viro d_instantiate(path.dentry, inode);
36531da177e4SLinus Torvalds inode->i_size = size;
36546d6b77f1SMiklos Szeredi clear_nlink(inode); /* It is unlinked */
365526567cdbSAl Viro res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
365626567cdbSAl Viro if (IS_ERR(res))
365766ee4b88SKonstantin Khlebnikov goto put_path;
36584b42af81SAl Viro
36596b4d0b27SAl Viro res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
36604b42af81SAl Viro &shmem_file_operations);
36616b4d0b27SAl Viro if (IS_ERR(res))
366266ee4b88SKonstantin Khlebnikov goto put_path;
36634b42af81SAl Viro
36646b4d0b27SAl Viro return res;
36651da177e4SLinus Torvalds
36661da177e4SLinus Torvalds put_memory:
36671da177e4SLinus Torvalds shmem_unacct_size(flags, size);
366866ee4b88SKonstantin Khlebnikov put_path:
366966ee4b88SKonstantin Khlebnikov path_put(&path);
36706b4d0b27SAl Viro return res;
36711da177e4SLinus Torvalds }
3672c7277090SEric Paris
3673c7277090SEric Paris /**
3674c7277090SEric Paris * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
3675c7277090SEric Paris * kernel internal. There will be NO LSM permission checks against the
3676c7277090SEric Paris * underlying inode. So users of this interface must do LSM checks at a
3677e1832f29SStephen Smalley * higher layer. The users are the big_key and shm implementations. LSM
3678e1832f29SStephen Smalley * checks are provided at the key or shm level rather than the inode.
3679c7277090SEric Paris * @name: name for dentry (to be seen in /proc/<pid>/maps)
3680c7277090SEric Paris * @size: size to be set for the file
3681c7277090SEric Paris * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
3682c7277090SEric Paris */
3683c7277090SEric Paris struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
3684c7277090SEric Paris {
3685c7277090SEric Paris return __shmem_file_setup(name, size, flags, S_PRIVATE);
3686c7277090SEric Paris }
3687c7277090SEric Paris
3688c7277090SEric Paris /**
3689c7277090SEric Paris * shmem_file_setup - get an unlinked file living in tmpfs
3690c7277090SEric Paris * @name: name for dentry (to be seen in /proc/<pid>/maps)
3691c7277090SEric Paris * @size: size to be set for the file
3692c7277090SEric Paris * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
3693c7277090SEric Paris */
3694c7277090SEric Paris struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
3695c7277090SEric Paris {
3696c7277090SEric Paris return __shmem_file_setup(name, size, flags, 0);
3697c7277090SEric Paris }
3698395e0ddcSKeith Packard EXPORT_SYMBOL_GPL(shmem_file_setup);
36991da177e4SLinus Torvalds
370046711810SRandy Dunlap /**
37011da177e4SLinus Torvalds * shmem_zero_setup - setup a shared anonymous mapping
37021da177e4SLinus Torvalds * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
37031da177e4SLinus Torvalds */
37041da177e4SLinus Torvalds int shmem_zero_setup(struct vm_area_struct *vma)
37051da177e4SLinus Torvalds {
37061da177e4SLinus Torvalds struct file *file;
37071da177e4SLinus Torvalds loff_t size = vma->vm_end - vma->vm_start;
37081da177e4SLinus Torvalds
370966fc1303SHugh Dickins /*
371066fc1303SHugh Dickins * Cloning a new file under mmap_sem leads to a lock ordering conflict
371166fc1303SHugh Dickins * between XFS directory reading and selinux: since this file is only
371266fc1303SHugh Dickins * accessible to the user through its mapping, use S_PRIVATE flag to
371366fc1303SHugh Dickins * bypass file security, in the same way as shmem_kernel_file_setup().
371466fc1303SHugh Dickins */
371566fc1303SHugh Dickins file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE);
37161da177e4SLinus Torvalds if (IS_ERR(file))
37171da177e4SLinus Torvalds return PTR_ERR(file);
37181da177e4SLinus Torvalds
37191da177e4SLinus Torvalds if (vma->vm_file)
37201da177e4SLinus Torvalds fput(vma->vm_file);
37211da177e4SLinus Torvalds vma->vm_file = file;
37221da177e4SLinus Torvalds vma->vm_ops = &shmem_vm_ops;
37231da177e4SLinus Torvalds return 0;
37241da177e4SLinus Torvalds }
3725d9d90e5eSHugh Dickins
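/*
 * Editor's illustrative sketch (not part of the original source): a typical
 * in-kernel caller of shmem_file_setup() above.  The function name and the
 * one-megabyte size are hypothetical; error handling follows the ERR_PTR
 * convention used by the helpers in this file.  Marked __maybe_unused
 * because it exists only as an example.
 */
static __maybe_unused struct file *example_shmem_buffer(void)
{
	struct file *file;

	/* flags 0: account the full size now; VM_NORESERVE would defer that */
	file = shmem_file_setup("example_buf", 1024 * 1024, 0);
	if (IS_ERR(file))
		return file;	/* e.g. ERR_PTR(-ENOMEM) or ERR_PTR(-ENOSPC) */

	/*
	 * The file is unlinked and lives on the internal tmpfs mount:
	 * read/write it through file->f_mapping or map it into userspace,
	 * then release it with fput(file) when finished.
	 */
	return file;
}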
3726d9d90e5eSHugh Dickins /**
3727d9d90e5eSHugh Dickins * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
3728d9d90e5eSHugh Dickins * @mapping: the page's address_space
3729d9d90e5eSHugh Dickins * @index: the page index
3730d9d90e5eSHugh Dickins * @gfp: the page allocator flags to use if allocating
3731d9d90e5eSHugh Dickins *
3732d9d90e5eSHugh Dickins * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
3733d9d90e5eSHugh Dickins * with any new page allocations done using the specified allocation flags.
3734d9d90e5eSHugh Dickins * But read_cache_page_gfp() uses the ->readpage() method, which does not
3735d9d90e5eSHugh Dickins * suit tmpfs, since it may have pages in swapcache, and needs to find those
3736d9d90e5eSHugh Dickins * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3737d9d90e5eSHugh Dickins *
373868da9f05SHugh Dickins * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
373968da9f05SHugh Dickins * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3740d9d90e5eSHugh Dickins */
3741d9d90e5eSHugh Dickins struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3742d9d90e5eSHugh Dickins pgoff_t index, gfp_t gfp)
3743d9d90e5eSHugh Dickins {
374468da9f05SHugh Dickins #ifdef CONFIG_SHMEM
374568da9f05SHugh Dickins struct inode *inode = mapping->host;
37469276aad6SHugh Dickins struct page *page;
374768da9f05SHugh Dickins int error;
374868da9f05SHugh Dickins
374968da9f05SHugh Dickins BUG_ON(mapping->a_ops != &shmem_aops);
37509e18eb29SAndres Lagar-Cavilla error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
37519e18eb29SAndres Lagar-Cavilla gfp, NULL, NULL);
375268da9f05SHugh Dickins if (error)
375368da9f05SHugh Dickins page = ERR_PTR(error);
375468da9f05SHugh Dickins else
375568da9f05SHugh Dickins unlock_page(page);
375668da9f05SHugh Dickins return page;
375768da9f05SHugh Dickins #else
375868da9f05SHugh Dickins /*
375968da9f05SHugh Dickins * The tiny !SHMEM case uses ramfs without swap
376068da9f05SHugh Dickins */
3761d9d90e5eSHugh Dickins return read_cache_page_gfp(mapping, index, gfp);
376268da9f05SHugh Dickins #endif
3763d9d90e5eSHugh Dickins }
3764d9d90e5eSHugh Dickins EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
3765
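/*
 * Editor's illustrative sketch (not part of the original source): pulling
 * one page of a tmpfs-backed file into the page cache, in the style of the
 * i915/ttm callers mentioned in the comment above.  The helper name is
 * hypothetical; the page comes back uptodate, unlocked and with an extra
 * reference that the caller must drop with put_page().
 */
static __maybe_unused struct page *example_read_shmem_page(struct file *file,
							    pgoff_t index)
{
	struct page *page;

	/* GFP_KERNEL mirrors what a typical sleeping caller might pass */
	page = shmem_read_mapping_page_gfp(file->f_mapping, index, GFP_KERNEL);
	if (IS_ERR(page))
		return page;	/* e.g. ERR_PTR(-ENOMEM) */

	/* Page is uptodate and unlocked; caller finishes with put_page(page). */
	return page;
}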