xref: /openbmc/linux/mm/shmem.c (revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2)
1*1da177e4SLinus Torvalds /*
2*1da177e4SLinus Torvalds  * Resizable virtual memory filesystem for Linux.
3*1da177e4SLinus Torvalds  *
4*1da177e4SLinus Torvalds  * Copyright (C) 2000 Linus Torvalds.
5*1da177e4SLinus Torvalds  *		 2000 Transmeta Corp.
6*1da177e4SLinus Torvalds  *		 2000-2001 Christoph Rohland
7*1da177e4SLinus Torvalds  *		 2000-2001 SAP AG
8*1da177e4SLinus Torvalds  *		 2002 Red Hat Inc.
9*1da177e4SLinus Torvalds  * Copyright (C) 2002-2004 Hugh Dickins.
10*1da177e4SLinus Torvalds  * Copyright (C) 2002-2004 VERITAS Software Corporation.
11*1da177e4SLinus Torvalds  * Copyright (C) 2004 Andi Kleen, SuSE Labs
12*1da177e4SLinus Torvalds  *
13*1da177e4SLinus Torvalds  * Extended attribute support for tmpfs:
14*1da177e4SLinus Torvalds  * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15*1da177e4SLinus Torvalds  * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16*1da177e4SLinus Torvalds  *
17*1da177e4SLinus Torvalds  * This file is released under the GPL.
18*1da177e4SLinus Torvalds  */
19*1da177e4SLinus Torvalds 
20*1da177e4SLinus Torvalds /*
21*1da177e4SLinus Torvalds  * This virtual memory filesystem is heavily based on the ramfs. It
22*1da177e4SLinus Torvalds  * extends ramfs by the ability to use swap and honor resource limits
23*1da177e4SLinus Torvalds  * which makes it a completely usable filesystem.
24*1da177e4SLinus Torvalds  */
25*1da177e4SLinus Torvalds 
26*1da177e4SLinus Torvalds #include <linux/config.h>
27*1da177e4SLinus Torvalds #include <linux/module.h>
28*1da177e4SLinus Torvalds #include <linux/init.h>
29*1da177e4SLinus Torvalds #include <linux/devfs_fs_kernel.h>
30*1da177e4SLinus Torvalds #include <linux/fs.h>
31*1da177e4SLinus Torvalds #include <linux/mm.h>
32*1da177e4SLinus Torvalds #include <linux/mman.h>
33*1da177e4SLinus Torvalds #include <linux/file.h>
34*1da177e4SLinus Torvalds #include <linux/swap.h>
35*1da177e4SLinus Torvalds #include <linux/pagemap.h>
36*1da177e4SLinus Torvalds #include <linux/string.h>
37*1da177e4SLinus Torvalds #include <linux/slab.h>
38*1da177e4SLinus Torvalds #include <linux/backing-dev.h>
39*1da177e4SLinus Torvalds #include <linux/shmem_fs.h>
40*1da177e4SLinus Torvalds #include <linux/mount.h>
41*1da177e4SLinus Torvalds #include <linux/writeback.h>
42*1da177e4SLinus Torvalds #include <linux/vfs.h>
43*1da177e4SLinus Torvalds #include <linux/blkdev.h>
44*1da177e4SLinus Torvalds #include <linux/security.h>
45*1da177e4SLinus Torvalds #include <linux/swapops.h>
46*1da177e4SLinus Torvalds #include <linux/mempolicy.h>
47*1da177e4SLinus Torvalds #include <linux/namei.h>
48*1da177e4SLinus Torvalds #include <linux/xattr.h>
49*1da177e4SLinus Torvalds #include <asm/uaccess.h>
50*1da177e4SLinus Torvalds #include <asm/div64.h>
51*1da177e4SLinus Torvalds #include <asm/pgtable.h>
52*1da177e4SLinus Torvalds 
/* This magic number is used in glibc for posix shared memory */
#define TMPFS_MAGIC	0x01021994

/* Number of swp_entry_t slots that fit in one index (directory) page */
#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
/* inode->i_blocks is counted in 512-byte sectors */
#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)

/* Largest page index reachable via direct + doubly/triply indirect blocks */
#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
#define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)

/* Pages charged against VM accounting for an object of this byte size */
#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)

/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
#define SHMEM_PAGEIN	 VM_READ
#define SHMEM_TRUNCATE	 VM_WRITE

/* Definition to limit shmem_truncate's steps between cond_rescheds */
#define LATENCY_LIMIT	 64

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Keep swapped page count in private field of indirect struct page */
#define nr_swapped		private

/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
	SGP_QUICK,	/* don't try more than file page cache lookup */
	SGP_READ,	/* don't exceed i_size, don't allocate page */
	SGP_CACHE,	/* don't exceed i_size, may allocate page */
	SGP_WRITE,	/* may exceed i_size, may allocate page */
};

static int shmem_getpage(struct inode *inode, unsigned long idx,
			 struct page **pagep, enum sgp_type sgp, int *type);
88*1da177e4SLinus Torvalds 
89*1da177e4SLinus Torvalds static inline struct page *shmem_dir_alloc(unsigned int gfp_mask)
90*1da177e4SLinus Torvalds {
91*1da177e4SLinus Torvalds 	/*
92*1da177e4SLinus Torvalds 	 * The above definition of ENTRIES_PER_PAGE, and the use of
93*1da177e4SLinus Torvalds 	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
94*1da177e4SLinus Torvalds 	 * might be reconsidered if it ever diverges from PAGE_SIZE.
95*1da177e4SLinus Torvalds 	 */
96*1da177e4SLinus Torvalds 	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
97*1da177e4SLinus Torvalds }
98*1da177e4SLinus Torvalds 
99*1da177e4SLinus Torvalds static inline void shmem_dir_free(struct page *page)
100*1da177e4SLinus Torvalds {
101*1da177e4SLinus Torvalds 	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
102*1da177e4SLinus Torvalds }
103*1da177e4SLinus Torvalds 
104*1da177e4SLinus Torvalds static struct page **shmem_dir_map(struct page *page)
105*1da177e4SLinus Torvalds {
106*1da177e4SLinus Torvalds 	return (struct page **)kmap_atomic(page, KM_USER0);
107*1da177e4SLinus Torvalds }
108*1da177e4SLinus Torvalds 
/* Undo shmem_dir_map(): drop the KM_USER0 atomic kmap of a directory page */
static inline void shmem_dir_unmap(struct page **dir)
{
	kunmap_atomic(dir, KM_USER0);
}
113*1da177e4SLinus Torvalds 
114*1da177e4SLinus Torvalds static swp_entry_t *shmem_swp_map(struct page *page)
115*1da177e4SLinus Torvalds {
116*1da177e4SLinus Torvalds 	return (swp_entry_t *)kmap_atomic(page, KM_USER1);
117*1da177e4SLinus Torvalds }
118*1da177e4SLinus Torvalds 
static inline void shmem_swp_balance_unmap(void)
{
	/*
	 * When passing a pointer to an i_direct entry, to code which
	 * also handles indirect entries and so will shmem_swp_unmap,
	 * we must arrange for the preempt count to remain in balance.
	 * What kmap_atomic of a lowmem page does depends on config
	 * and architecture, so pretend to kmap_atomic some lowmem page.
	 */
	(void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
}
130*1da177e4SLinus Torvalds 
/* Undo shmem_swp_map() (or shmem_swp_balance_unmap's pretend mapping) */
static inline void shmem_swp_unmap(swp_entry_t *entry)
{
	kunmap_atomic(entry, KM_USER1);
}
135*1da177e4SLinus Torvalds 
136*1da177e4SLinus Torvalds static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
137*1da177e4SLinus Torvalds {
138*1da177e4SLinus Torvalds 	return sb->s_fs_info;
139*1da177e4SLinus Torvalds }
140*1da177e4SLinus Torvalds 
141*1da177e4SLinus Torvalds /*
142*1da177e4SLinus Torvalds  * shmem_file_setup pre-accounts the whole fixed size of a VM object,
143*1da177e4SLinus Torvalds  * for shared memory and for shared anonymous (/dev/zero) mappings
144*1da177e4SLinus Torvalds  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
145*1da177e4SLinus Torvalds  * consistent with the pre-accounting of private mappings ...
146*1da177e4SLinus Torvalds  */
147*1da177e4SLinus Torvalds static inline int shmem_acct_size(unsigned long flags, loff_t size)
148*1da177e4SLinus Torvalds {
149*1da177e4SLinus Torvalds 	return (flags & VM_ACCOUNT)?
150*1da177e4SLinus Torvalds 		security_vm_enough_memory(VM_ACCT(size)): 0;
151*1da177e4SLinus Torvalds }
152*1da177e4SLinus Torvalds 
153*1da177e4SLinus Torvalds static inline void shmem_unacct_size(unsigned long flags, loff_t size)
154*1da177e4SLinus Torvalds {
155*1da177e4SLinus Torvalds 	if (flags & VM_ACCOUNT)
156*1da177e4SLinus Torvalds 		vm_unacct_memory(VM_ACCT(size));
157*1da177e4SLinus Torvalds }
158*1da177e4SLinus Torvalds 
159*1da177e4SLinus Torvalds /*
160*1da177e4SLinus Torvalds  * ... whereas tmpfs objects are accounted incrementally as
161*1da177e4SLinus Torvalds  * pages are allocated, in order to allow huge sparse files.
162*1da177e4SLinus Torvalds  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
163*1da177e4SLinus Torvalds  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
164*1da177e4SLinus Torvalds  */
165*1da177e4SLinus Torvalds static inline int shmem_acct_block(unsigned long flags)
166*1da177e4SLinus Torvalds {
167*1da177e4SLinus Torvalds 	return (flags & VM_ACCOUNT)?
168*1da177e4SLinus Torvalds 		0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
169*1da177e4SLinus Torvalds }
170*1da177e4SLinus Torvalds 
171*1da177e4SLinus Torvalds static inline void shmem_unacct_blocks(unsigned long flags, long pages)
172*1da177e4SLinus Torvalds {
173*1da177e4SLinus Torvalds 	if (!(flags & VM_ACCOUNT))
174*1da177e4SLinus Torvalds 		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
175*1da177e4SLinus Torvalds }
176*1da177e4SLinus Torvalds 
/* Forward declarations of the operations tables defined later in the file */
static struct super_operations shmem_ops;
static struct address_space_operations shmem_aops;
static struct file_operations shmem_file_operations;
static struct inode_operations shmem_inode_operations;
static struct inode_operations shmem_dir_inode_operations;
static struct inode_operations shmem_special_inode_operations;
static struct vm_operations_struct shmem_vm_ops;

/* tmpfs pages are never dirtied back to disk: no writeback, no dirty accounting */
static struct backing_dev_info shmem_backing_dev_info = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
	.unplug_io_fn	= default_unplug_io_fn,
};

/* Swap list and its lock (presumably links inodes that have swapped
 * pages -- its users are outside this chunk; verify against them) */
static LIST_HEAD(shmem_swaplist);
static DEFINE_SPINLOCK(shmem_swaplist_lock);
193*1da177e4SLinus Torvalds 
194*1da177e4SLinus Torvalds static void shmem_free_blocks(struct inode *inode, long pages)
195*1da177e4SLinus Torvalds {
196*1da177e4SLinus Torvalds 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
197*1da177e4SLinus Torvalds 	if (sbinfo) {
198*1da177e4SLinus Torvalds 		spin_lock(&sbinfo->stat_lock);
199*1da177e4SLinus Torvalds 		sbinfo->free_blocks += pages;
200*1da177e4SLinus Torvalds 		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
201*1da177e4SLinus Torvalds 		spin_unlock(&sbinfo->stat_lock);
202*1da177e4SLinus Torvalds 	}
203*1da177e4SLinus Torvalds }
204*1da177e4SLinus Torvalds 
205*1da177e4SLinus Torvalds /*
206*1da177e4SLinus Torvalds  * shmem_recalc_inode - recalculate the size of an inode
207*1da177e4SLinus Torvalds  *
208*1da177e4SLinus Torvalds  * @inode: inode to recalc
209*1da177e4SLinus Torvalds  *
210*1da177e4SLinus Torvalds  * We have to calculate the free blocks since the mm can drop
211*1da177e4SLinus Torvalds  * undirtied hole pages behind our back.
212*1da177e4SLinus Torvalds  *
213*1da177e4SLinus Torvalds  * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
214*1da177e4SLinus Torvalds  * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
215*1da177e4SLinus Torvalds  *
216*1da177e4SLinus Torvalds  * It has to be called with the spinlock held.
217*1da177e4SLinus Torvalds  */
218*1da177e4SLinus Torvalds static void shmem_recalc_inode(struct inode *inode)
219*1da177e4SLinus Torvalds {
220*1da177e4SLinus Torvalds 	struct shmem_inode_info *info = SHMEM_I(inode);
221*1da177e4SLinus Torvalds 	long freed;
222*1da177e4SLinus Torvalds 
223*1da177e4SLinus Torvalds 	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
224*1da177e4SLinus Torvalds 	if (freed > 0) {
225*1da177e4SLinus Torvalds 		info->alloced -= freed;
226*1da177e4SLinus Torvalds 		shmem_unacct_blocks(info->flags, freed);
227*1da177e4SLinus Torvalds 		shmem_free_blocks(inode, freed);
228*1da177e4SLinus Torvalds 	}
229*1da177e4SLinus Torvalds }
230*1da177e4SLinus Torvalds 
/*
 * shmem_swp_entry - find the swap vector position in the info structure
 *
 * @info:  info structure for the inode
 * @index: index of the page to find
 * @page:  optional page to add to the structure. Has to be preset to
 *         all zeros
 *
 * If there is no space allocated yet it will return NULL when
 * page is NULL, else it will use the page for the needed block,
 * setting it to NULL on return to indicate that it has been used.
 *
 * On success, returns a KM_USER1-mapped pointer into the entry's page:
 * the caller must shmem_swp_unmap() it when done.
 *
 * The swap vector is organized the following way:
 *
 * There are SHMEM_NR_DIRECT entries directly stored in the
 * shmem_inode_info structure. So small files do not need an additional
 * allocation.
 *
 * For pages with index > SHMEM_NR_DIRECT there is the pointer
 * i_indirect which points to a page which holds in the first half
 * doubly indirect blocks, in the second half triple indirect blocks:
 *
 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
 * following layout (for SHMEM_NR_DIRECT == 16):
 *
 * i_indirect -> dir --> 16-19
 * 	      |	     +-> 20-23
 * 	      |
 * 	      +-->dir2 --> 24-27
 * 	      |	       +-> 28-31
 * 	      |	       +-> 32-35
 * 	      |	       +-> 36-39
 * 	      |
 * 	      +-->dir3 --> 40-43
 * 	       	       +-> 44-47
 * 	      	       +-> 48-51
 * 	      	       +-> 52-55
 */
static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
{
	unsigned long offset;
	struct page **dir;
	struct page *subdir;

	if (index < SHMEM_NR_DIRECT) {
		/* i_direct entries are not kmapped, but the caller will
		 * shmem_swp_unmap(): balance the preempt count first */
		shmem_swp_balance_unmap();
		return info->i_direct+index;
	}
	if (!info->i_indirect) {
		if (page) {
			info->i_indirect = *page;
			*page = NULL;		/* consumed: caller sees it was used */
		}
		return NULL;			/* need another page */
	}

	index -= SHMEM_NR_DIRECT;
	offset = index % ENTRIES_PER_PAGE;
	index /= ENTRIES_PER_PAGE;
	dir = shmem_dir_map(info->i_indirect);

	if (index >= ENTRIES_PER_PAGE/2) {
		/* second half of i_indirect: triple indirect blocks */
		index -= ENTRIES_PER_PAGE/2;
		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
		index %= ENTRIES_PER_PAGE;
		subdir = *dir;
		if (!subdir) {
			if (page) {
				*dir = *page;
				*page = NULL;
			}
			shmem_dir_unmap(dir);
			return NULL;		/* need another page */
		}
		/* descend one level: remap dir onto the middle directory */
		shmem_dir_unmap(dir);
		dir = shmem_dir_map(subdir);
	}

	dir += index;
	subdir = *dir;
	if (!subdir) {
		if (!page || !(subdir = *page)) {
			shmem_dir_unmap(dir);
			return NULL;		/* need a page */
		}
		*dir = subdir;
		*page = NULL;
	}
	shmem_dir_unmap(dir);
	return shmem_swp_map(subdir) + offset;
}
322*1da177e4SLinus Torvalds 
/*
 * Store @value in @entry, adjusting info->swapped and, for an entry that
 * lives in an indirect page (not in i_direct), that page's nr_swapped
 * count.  @value == 0 clears the entry and decrements both counts.
 */
static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
{
	long incdec = value? 1: -1;

	entry->val = value;
	info->swapped += incdec;
	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
		/* entry is a kmap_atomic'ed pointer: recover its struct page */
		kmap_atomic_to_page(entry)->nr_swapped += incdec;
}
332*1da177e4SLinus Torvalds 
/*
 * shmem_swp_alloc - get the position of the swap entry for the page.
 *                   If it does not exist allocate the entry.
 *
 * @info:	info structure for the inode
 * @index:	index of the page to find
 * @sgp:	check and recheck i_size? skip allocation?
 *
 * Called with info->lock held; note that the lock is dropped and
 * retaken around the shmem_dir_alloc() below, so state is revalidated
 * after each allocation.  Returns a mapped entry pointer (caller must
 * shmem_swp_unmap()), or an ERR_PTR.
 */
static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
{
	struct inode *inode = &info->vfs_inode;
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct page *page = NULL;
	swp_entry_t *entry;

	if (sgp != SGP_WRITE &&
	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
		return ERR_PTR(-EINVAL);

	/* Loop: each pass feeds one freshly allocated index page in */
	while (!(entry = shmem_swp_entry(info, index, &page))) {
		if (sgp == SGP_READ)
			/* nothing allocated here: hand back a mapped zero
			 * page so the caller's shmem_swp_unmap balances */
			return shmem_swp_map(ZERO_PAGE(0));
		/*
		 * Test free_blocks against 1 not 0, since we have 1 data
		 * page (and perhaps indirect index pages) yet to allocate:
		 * a waste to allocate index if we cannot allocate data.
		 */
		if (sbinfo) {
			spin_lock(&sbinfo->stat_lock);
			if (sbinfo->free_blocks <= 1) {
				spin_unlock(&sbinfo->stat_lock);
				return ERR_PTR(-ENOSPC);
			}
			sbinfo->free_blocks--;
			inode->i_blocks += BLOCKS_PER_PAGE;
			spin_unlock(&sbinfo->stat_lock);
		}

		/* must drop info->lock to allocate; others may race in */
		spin_unlock(&info->lock);
		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
		if (page) {
			page->nr_swapped = 0;
		}
		spin_lock(&info->lock);

		if (!page) {
			shmem_free_blocks(inode, 1);
			return ERR_PTR(-ENOMEM);
		}
		if (sgp != SGP_WRITE &&
		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
			/* file was truncated while the lock was dropped */
			entry = ERR_PTR(-EINVAL);
			break;
		}
		if (info->next_index <= index)
			info->next_index = index + 1;
	}
	if (page) {
		/* another task gave its page, or truncated the file */
		shmem_free_blocks(inode, 1);
		shmem_dir_free(page);
	}
	if (info->next_index <= index && !IS_ERR(entry))
		info->next_index = index + 1;
	return entry;
}
399*1da177e4SLinus Torvalds 
400*1da177e4SLinus Torvalds /*
401*1da177e4SLinus Torvalds  * shmem_free_swp - free some swap entries in a directory
402*1da177e4SLinus Torvalds  *
403*1da177e4SLinus Torvalds  * @dir:   pointer to the directory
404*1da177e4SLinus Torvalds  * @edir:  pointer after last entry of the directory
405*1da177e4SLinus Torvalds  */
406*1da177e4SLinus Torvalds static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
407*1da177e4SLinus Torvalds {
408*1da177e4SLinus Torvalds 	swp_entry_t *ptr;
409*1da177e4SLinus Torvalds 	int freed = 0;
410*1da177e4SLinus Torvalds 
411*1da177e4SLinus Torvalds 	for (ptr = dir; ptr < edir; ptr++) {
412*1da177e4SLinus Torvalds 		if (ptr->val) {
413*1da177e4SLinus Torvalds 			free_swap_and_cache(*ptr);
414*1da177e4SLinus Torvalds 			*ptr = (swp_entry_t){0};
415*1da177e4SLinus Torvalds 			freed++;
416*1da177e4SLinus Torvalds 		}
417*1da177e4SLinus Torvalds 	}
418*1da177e4SLinus Torvalds 	return freed;
419*1da177e4SLinus Torvalds }
420*1da177e4SLinus Torvalds 
/*
 * Map the swap-entry page @subdir and free its entries in the range
 * [offset, limit), working in LATENCY_LIMIT-sized batches.  When a
 * reschedule is due, both the KM_USER1 map of @subdir and the caller's
 * KM_USER0 directory map (*dir) are dropped before cond_resched();
 * *dir is set to NULL so the caller knows to re-map it.
 * Returns the number of swap entries freed.
 */
static int shmem_map_and_free_swp(struct page *subdir,
		int offset, int limit, struct page ***dir)
{
	swp_entry_t *ptr;
	int freed = 0;

	ptr = shmem_swp_map(subdir);
	for (; offset < limit; offset += LATENCY_LIMIT) {
		int size = limit - offset;
		if (size > LATENCY_LIMIT)
			size = LATENCY_LIMIT;
		freed += shmem_free_swp(ptr+offset, ptr+offset+size);
		if (need_resched()) {
			/* atomic kmaps must not be held across a resched */
			shmem_swp_unmap(ptr);
			if (*dir) {
				shmem_dir_unmap(*dir);
				*dir = NULL;
			}
			cond_resched();
			ptr = shmem_swp_map(subdir);
		}
	}
	shmem_swp_unmap(ptr);
	return freed;
}
446*1da177e4SLinus Torvalds 
447*1da177e4SLinus Torvalds static void shmem_free_pages(struct list_head *next)
448*1da177e4SLinus Torvalds {
449*1da177e4SLinus Torvalds 	struct page *page;
450*1da177e4SLinus Torvalds 	int freed = 0;
451*1da177e4SLinus Torvalds 
452*1da177e4SLinus Torvalds 	do {
453*1da177e4SLinus Torvalds 		page = container_of(next, struct page, lru);
454*1da177e4SLinus Torvalds 		next = next->next;
455*1da177e4SLinus Torvalds 		shmem_dir_free(page);
456*1da177e4SLinus Torvalds 		freed++;
457*1da177e4SLinus Torvalds 		if (freed >= LATENCY_LIMIT) {
458*1da177e4SLinus Torvalds 			cond_resched();
459*1da177e4SLinus Torvalds 			freed = 0;
460*1da177e4SLinus Torvalds 		}
461*1da177e4SLinus Torvalds 	} while (next);
462*1da177e4SLinus Torvalds }
463*1da177e4SLinus Torvalds 
464*1da177e4SLinus Torvalds static void shmem_truncate(struct inode *inode)
465*1da177e4SLinus Torvalds {
466*1da177e4SLinus Torvalds 	struct shmem_inode_info *info = SHMEM_I(inode);
467*1da177e4SLinus Torvalds 	unsigned long idx;
468*1da177e4SLinus Torvalds 	unsigned long size;
469*1da177e4SLinus Torvalds 	unsigned long limit;
470*1da177e4SLinus Torvalds 	unsigned long stage;
471*1da177e4SLinus Torvalds 	unsigned long diroff;
472*1da177e4SLinus Torvalds 	struct page **dir;
473*1da177e4SLinus Torvalds 	struct page *topdir;
474*1da177e4SLinus Torvalds 	struct page *middir;
475*1da177e4SLinus Torvalds 	struct page *subdir;
476*1da177e4SLinus Torvalds 	swp_entry_t *ptr;
477*1da177e4SLinus Torvalds 	LIST_HEAD(pages_to_free);
478*1da177e4SLinus Torvalds 	long nr_pages_to_free = 0;
479*1da177e4SLinus Torvalds 	long nr_swaps_freed = 0;
480*1da177e4SLinus Torvalds 	int offset;
481*1da177e4SLinus Torvalds 	int freed;
482*1da177e4SLinus Torvalds 
483*1da177e4SLinus Torvalds 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
484*1da177e4SLinus Torvalds 	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
485*1da177e4SLinus Torvalds 	if (idx >= info->next_index)
486*1da177e4SLinus Torvalds 		return;
487*1da177e4SLinus Torvalds 
488*1da177e4SLinus Torvalds 	spin_lock(&info->lock);
489*1da177e4SLinus Torvalds 	info->flags |= SHMEM_TRUNCATE;
490*1da177e4SLinus Torvalds 	limit = info->next_index;
491*1da177e4SLinus Torvalds 	info->next_index = idx;
492*1da177e4SLinus Torvalds 	topdir = info->i_indirect;
493*1da177e4SLinus Torvalds 	if (topdir && idx <= SHMEM_NR_DIRECT) {
494*1da177e4SLinus Torvalds 		info->i_indirect = NULL;
495*1da177e4SLinus Torvalds 		nr_pages_to_free++;
496*1da177e4SLinus Torvalds 		list_add(&topdir->lru, &pages_to_free);
497*1da177e4SLinus Torvalds 	}
498*1da177e4SLinus Torvalds 	spin_unlock(&info->lock);
499*1da177e4SLinus Torvalds 
500*1da177e4SLinus Torvalds 	if (info->swapped && idx < SHMEM_NR_DIRECT) {
501*1da177e4SLinus Torvalds 		ptr = info->i_direct;
502*1da177e4SLinus Torvalds 		size = limit;
503*1da177e4SLinus Torvalds 		if (size > SHMEM_NR_DIRECT)
504*1da177e4SLinus Torvalds 			size = SHMEM_NR_DIRECT;
505*1da177e4SLinus Torvalds 		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
506*1da177e4SLinus Torvalds 	}
507*1da177e4SLinus Torvalds 	if (!topdir)
508*1da177e4SLinus Torvalds 		goto done2;
509*1da177e4SLinus Torvalds 
510*1da177e4SLinus Torvalds 	BUG_ON(limit <= SHMEM_NR_DIRECT);
511*1da177e4SLinus Torvalds 	limit -= SHMEM_NR_DIRECT;
512*1da177e4SLinus Torvalds 	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
513*1da177e4SLinus Torvalds 	offset = idx % ENTRIES_PER_PAGE;
514*1da177e4SLinus Torvalds 	idx -= offset;
515*1da177e4SLinus Torvalds 
516*1da177e4SLinus Torvalds 	dir = shmem_dir_map(topdir);
517*1da177e4SLinus Torvalds 	stage = ENTRIES_PER_PAGEPAGE/2;
518*1da177e4SLinus Torvalds 	if (idx < ENTRIES_PER_PAGEPAGE/2) {
519*1da177e4SLinus Torvalds 		middir = topdir;
520*1da177e4SLinus Torvalds 		diroff = idx/ENTRIES_PER_PAGE;
521*1da177e4SLinus Torvalds 	} else {
522*1da177e4SLinus Torvalds 		dir += ENTRIES_PER_PAGE/2;
523*1da177e4SLinus Torvalds 		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
524*1da177e4SLinus Torvalds 		while (stage <= idx)
525*1da177e4SLinus Torvalds 			stage += ENTRIES_PER_PAGEPAGE;
526*1da177e4SLinus Torvalds 		middir = *dir;
527*1da177e4SLinus Torvalds 		if (*dir) {
528*1da177e4SLinus Torvalds 			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
529*1da177e4SLinus Torvalds 				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
530*1da177e4SLinus Torvalds 			if (!diroff && !offset) {
531*1da177e4SLinus Torvalds 				*dir = NULL;
532*1da177e4SLinus Torvalds 				nr_pages_to_free++;
533*1da177e4SLinus Torvalds 				list_add(&middir->lru, &pages_to_free);
534*1da177e4SLinus Torvalds 			}
535*1da177e4SLinus Torvalds 			shmem_dir_unmap(dir);
536*1da177e4SLinus Torvalds 			dir = shmem_dir_map(middir);
537*1da177e4SLinus Torvalds 		} else {
538*1da177e4SLinus Torvalds 			diroff = 0;
539*1da177e4SLinus Torvalds 			offset = 0;
540*1da177e4SLinus Torvalds 			idx = stage;
541*1da177e4SLinus Torvalds 		}
542*1da177e4SLinus Torvalds 	}
543*1da177e4SLinus Torvalds 
544*1da177e4SLinus Torvalds 	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
545*1da177e4SLinus Torvalds 		if (unlikely(idx == stage)) {
546*1da177e4SLinus Torvalds 			shmem_dir_unmap(dir);
547*1da177e4SLinus Torvalds 			dir = shmem_dir_map(topdir) +
548*1da177e4SLinus Torvalds 			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
549*1da177e4SLinus Torvalds 			while (!*dir) {
550*1da177e4SLinus Torvalds 				dir++;
551*1da177e4SLinus Torvalds 				idx += ENTRIES_PER_PAGEPAGE;
552*1da177e4SLinus Torvalds 				if (idx >= limit)
553*1da177e4SLinus Torvalds 					goto done1;
554*1da177e4SLinus Torvalds 			}
555*1da177e4SLinus Torvalds 			stage = idx + ENTRIES_PER_PAGEPAGE;
556*1da177e4SLinus Torvalds 			middir = *dir;
557*1da177e4SLinus Torvalds 			*dir = NULL;
558*1da177e4SLinus Torvalds 			nr_pages_to_free++;
559*1da177e4SLinus Torvalds 			list_add(&middir->lru, &pages_to_free);
560*1da177e4SLinus Torvalds 			shmem_dir_unmap(dir);
561*1da177e4SLinus Torvalds 			cond_resched();
562*1da177e4SLinus Torvalds 			dir = shmem_dir_map(middir);
563*1da177e4SLinus Torvalds 			diroff = 0;
564*1da177e4SLinus Torvalds 		}
565*1da177e4SLinus Torvalds 		subdir = dir[diroff];
566*1da177e4SLinus Torvalds 		if (subdir && subdir->nr_swapped) {
567*1da177e4SLinus Torvalds 			size = limit - idx;
568*1da177e4SLinus Torvalds 			if (size > ENTRIES_PER_PAGE)
569*1da177e4SLinus Torvalds 				size = ENTRIES_PER_PAGE;
570*1da177e4SLinus Torvalds 			freed = shmem_map_and_free_swp(subdir,
571*1da177e4SLinus Torvalds 						offset, size, &dir);
572*1da177e4SLinus Torvalds 			if (!dir)
573*1da177e4SLinus Torvalds 				dir = shmem_dir_map(middir);
574*1da177e4SLinus Torvalds 			nr_swaps_freed += freed;
575*1da177e4SLinus Torvalds 			if (offset)
576*1da177e4SLinus Torvalds 				spin_lock(&info->lock);
577*1da177e4SLinus Torvalds 			subdir->nr_swapped -= freed;
578*1da177e4SLinus Torvalds 			if (offset)
579*1da177e4SLinus Torvalds 				spin_unlock(&info->lock);
580*1da177e4SLinus Torvalds 			BUG_ON(subdir->nr_swapped > offset);
581*1da177e4SLinus Torvalds 		}
582*1da177e4SLinus Torvalds 		if (offset)
583*1da177e4SLinus Torvalds 			offset = 0;
584*1da177e4SLinus Torvalds 		else if (subdir) {
585*1da177e4SLinus Torvalds 			dir[diroff] = NULL;
586*1da177e4SLinus Torvalds 			nr_pages_to_free++;
587*1da177e4SLinus Torvalds 			list_add(&subdir->lru, &pages_to_free);
588*1da177e4SLinus Torvalds 		}
589*1da177e4SLinus Torvalds 	}
590*1da177e4SLinus Torvalds done1:
591*1da177e4SLinus Torvalds 	shmem_dir_unmap(dir);
592*1da177e4SLinus Torvalds done2:
593*1da177e4SLinus Torvalds 	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
594*1da177e4SLinus Torvalds 		/*
595*1da177e4SLinus Torvalds 		 * Call truncate_inode_pages again: racing shmem_unuse_inode
596*1da177e4SLinus Torvalds 		 * may have swizzled a page in from swap since vmtruncate or
597*1da177e4SLinus Torvalds 		 * generic_delete_inode did it, before we lowered next_index.
598*1da177e4SLinus Torvalds 		 * Also, though shmem_getpage checks i_size before adding to
599*1da177e4SLinus Torvalds 		 * cache, no recheck after: so fix the narrow window there too.
600*1da177e4SLinus Torvalds 		 */
601*1da177e4SLinus Torvalds 		truncate_inode_pages(inode->i_mapping, inode->i_size);
602*1da177e4SLinus Torvalds 	}
603*1da177e4SLinus Torvalds 
604*1da177e4SLinus Torvalds 	spin_lock(&info->lock);
605*1da177e4SLinus Torvalds 	info->flags &= ~SHMEM_TRUNCATE;
606*1da177e4SLinus Torvalds 	info->swapped -= nr_swaps_freed;
607*1da177e4SLinus Torvalds 	if (nr_pages_to_free)
608*1da177e4SLinus Torvalds 		shmem_free_blocks(inode, nr_pages_to_free);
609*1da177e4SLinus Torvalds 	shmem_recalc_inode(inode);
610*1da177e4SLinus Torvalds 	spin_unlock(&info->lock);
611*1da177e4SLinus Torvalds 
612*1da177e4SLinus Torvalds 	/*
613*1da177e4SLinus Torvalds 	 * Empty swap vector directory pages to be freed?
614*1da177e4SLinus Torvalds 	 */
615*1da177e4SLinus Torvalds 	if (!list_empty(&pages_to_free)) {
616*1da177e4SLinus Torvalds 		pages_to_free.prev->next = NULL;
617*1da177e4SLinus Torvalds 		shmem_free_pages(pages_to_free.next);
618*1da177e4SLinus Torvalds 	}
619*1da177e4SLinus Torvalds }
620*1da177e4SLinus Torvalds 
/*
 * setattr for tmpfs inodes.
 *
 * For a size-reducing truncate, pin the final partial page (if already
 * in memory) across the truncation, and clear SHMEM_PAGEIN so that
 * shmem_truncate can tell whether pages were faulted back in meanwhile.
 * Then perform the generic attribute checks and update.
 */
static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	struct page *page = NULL;
	int error;

	if (attr->ia_valid & ATTR_SIZE) {
		if (attr->ia_size < inode->i_size) {
			/*
			 * If truncating down to a partial page, then
			 * if that page is already allocated, hold it
			 * in memory until the truncation is over, so
			 * truncate_partial_page cannot miss it were
			 * it assigned to swap.
			 */
			if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
				(void) shmem_getpage(inode,
					attr->ia_size>>PAGE_CACHE_SHIFT,
						&page, SGP_READ, NULL);
			}
			/*
			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
			 * detect if any pages might have been added to cache
			 * after truncate_inode_pages.  But we needn't bother
			 * if it's being fully truncated to zero-length: the
			 * nrpages check is efficient enough in that case.
			 */
			if (attr->ia_size) {
				struct shmem_inode_info *info = SHMEM_I(inode);
				spin_lock(&info->lock);
				info->flags &= ~SHMEM_PAGEIN;
				spin_unlock(&info->lock);
			}
		}
	}

	error = inode_change_ok(inode, attr);
	if (!error)
		error = inode_setattr(inode, attr);
	/* Drop the reference on the pinned partial page, if we took one */
	if (page)
		page_cache_release(page);
	return error;
}
664*1da177e4SLinus Torvalds 
/*
 * Final teardown of a tmpfs inode: release size accounting, truncate
 * away all pages and swap entries, drop the inode from the shmem
 * swaplist, return its slot to the superblock's inode count, and
 * clear the inode.
 */
static void shmem_delete_inode(struct inode *inode)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct shmem_inode_info *info = SHMEM_I(inode);

	/* Only inodes whose i_op uses shmem_truncate carry shmem pages */
	if (inode->i_op->truncate == shmem_truncate) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate(inode);
		if (!list_empty(&info->swaplist)) {
			spin_lock(&shmem_swaplist_lock);
			list_del_init(&info->swaplist);
			spin_unlock(&shmem_swaplist_lock);
		}
	}
	/* sbinfo may be NULL — then there is no per-sb accounting to restore */
	if (sbinfo) {
		BUG_ON(inode->i_blocks);
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
	clear_inode(inode);
}
688*1da177e4SLinus Torvalds 
689*1da177e4SLinus Torvalds static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
690*1da177e4SLinus Torvalds {
691*1da177e4SLinus Torvalds 	swp_entry_t *ptr;
692*1da177e4SLinus Torvalds 
693*1da177e4SLinus Torvalds 	for (ptr = dir; ptr < edir; ptr++) {
694*1da177e4SLinus Torvalds 		if (ptr->val == entry.val)
695*1da177e4SLinus Torvalds 			return ptr - dir;
696*1da177e4SLinus Torvalds 	}
697*1da177e4SLinus Torvalds 	return -1;
698*1da177e4SLinus Torvalds }
699*1da177e4SLinus Torvalds 
/*
 * Search one shmem inode's swap vector for the given swap entry; if
 * found, move @page from the swap cache back into this inode's page
 * cache at the matching index.  Returns 1 if the entry was found
 * (whether or not the move succeeded — see comment at swap_free),
 * 0 if not.  Called under shmem_swaplist_lock; takes info->lock for
 * the scan itself.
 */
static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
{
	struct inode *inode;
	unsigned long idx;
	unsigned long size;
	unsigned long limit;
	unsigned long stage;
	struct page **dir;
	struct page *subdir;
	swp_entry_t *ptr;
	int offset;

	idx = 0;
	ptr = info->i_direct;
	spin_lock(&info->lock);
	limit = info->next_index;
	/* First scan the direct entries (at most SHMEM_NR_DIRECT of them) */
	size = limit;
	if (size > SHMEM_NR_DIRECT)
		size = SHMEM_NR_DIRECT;
	offset = shmem_find_swp(entry, ptr, ptr+size);
	if (offset >= 0) {
		shmem_swp_balance_unmap();
		goto found;
	}
	if (!info->i_indirect)
		goto lost2;

	/* Then walk the indirect tree, one swap-vector page at a time */
	dir = shmem_dir_map(info->i_indirect);
	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;

	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
		if (unlikely(idx == stage)) {
			/* Crossed into the doubly-indirect range: remap to
			 * the next populated subdirectory page. */
			shmem_dir_unmap(dir-1);
			dir = shmem_dir_map(info->i_indirect) +
			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
			while (!*dir) {
				dir++;
				idx += ENTRIES_PER_PAGEPAGE;
				if (idx >= limit)
					goto lost1;
			}
			stage = idx + ENTRIES_PER_PAGEPAGE;
			subdir = *dir;
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(subdir);
		}
		subdir = *dir;
		/* Only scan vector pages that actually hold swap entries */
		if (subdir && subdir->nr_swapped) {
			ptr = shmem_swp_map(subdir);
			size = limit - idx;
			if (size > ENTRIES_PER_PAGE)
				size = ENTRIES_PER_PAGE;
			offset = shmem_find_swp(entry, ptr, ptr+size);
			if (offset >= 0) {
				shmem_dir_unmap(dir);
				goto found;
			}
			shmem_swp_unmap(ptr);
		}
	}
lost1:
	shmem_dir_unmap(dir-1);
lost2:
	spin_unlock(&info->lock);
	return 0;
found:
	/* ptr is still mapped here; idx+offset is the file index of the page */
	idx += offset;
	inode = &info->vfs_inode;
	if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
		info->flags |= SHMEM_PAGEIN;
		shmem_swp_set(info, ptr + offset, 0);
	}
	shmem_swp_unmap(ptr);
	spin_unlock(&info->lock);
	/*
	 * Decrement swap count even when the entry is left behind:
	 * try_to_unuse will skip over mms, then reincrement count.
	 */
	swap_free(entry);
	return 1;
}
781*1da177e4SLinus Torvalds 
/*
 * shmem_unuse() - search every swap-using shmem inode for the one
 * holding this swap entry, and if found move @page back into its
 * page cache.  Returns 1 if the entry was found, 0 otherwise.
 */
int shmem_unuse(swp_entry_t entry, struct page *page)
{
	struct list_head *p, *next;
	struct shmem_inode_info *info;
	int found = 0;

	spin_lock(&shmem_swaplist_lock);
	list_for_each_safe(p, next, &shmem_swaplist) {
		info = list_entry(p, struct shmem_inode_info, swaplist);
		/* Inodes with nothing swapped can be dropped from the list */
		if (!info->swapped)
			list_del_init(&info->swaplist);
		else if (shmem_unuse_inode(info, entry, page)) {
			/* move head to start search for next from here */
			list_move_tail(&shmem_swaplist, &info->swaplist);
			found = 1;
			break;
		}
	}
	spin_unlock(&shmem_swaplist_lock);
	return found;
}
806*1da177e4SLinus Torvalds 
/*
 * Move the page from the page cache to the swap cache.
 *
 * Called for a locked, unmapped page.  Allocates a swap entry, records
 * it in the inode's swap vector, and moves the page across.  On any
 * failure the page is redirtied and returned still locked
 * (WRITEPAGE_ACTIVATE) for the caller to reactivate.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	swp_entry_t *entry, swap;
	struct address_space *mapping;
	unsigned long index;
	struct inode *inode;

	BUG_ON(!PageLocked(page));
	BUG_ON(page_mapped(page));

	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	/* Locked shmem segments must not be pushed out to swap */
	if (info->flags & VM_LOCKED)
		goto redirty;
	swap = get_swap_page();
	if (!swap.val)
		goto redirty;

	spin_lock(&info->lock);
	shmem_recalc_inode(inode);
	if (index >= info->next_index) {
		/* Beyond next_index only legal while a truncate is in flight */
		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
		goto unlock;
	}
	entry = shmem_swp_entry(info, index, NULL);
	BUG_ON(!entry);
	BUG_ON(entry->val);

	if (move_to_swap_cache(page, swap) == 0) {
		/* Record the swap entry, then make sure shmem_unuse can
		 * find this inode on the global swaplist. */
		shmem_swp_set(info, entry, swap.val);
		shmem_swp_unmap(entry);
		spin_unlock(&info->lock);
		if (list_empty(&info->swaplist)) {
			spin_lock(&shmem_swaplist_lock);
			/* move instead of add in case we're racing */
			list_move_tail(&info->swaplist, &shmem_swaplist);
			spin_unlock(&shmem_swaplist_lock);
		}
		unlock_page(page);
		return 0;
	}

	shmem_swp_unmap(entry);
unlock:
	spin_unlock(&info->lock);
	/* Give back the unused swap entry before redirtying */
	swap_free(swap);
redirty:
	set_page_dirty(page);
	return WRITEPAGE_ACTIVATE;	/* Return with the page locked */
}
863*1da177e4SLinus Torvalds 
864*1da177e4SLinus Torvalds #ifdef CONFIG_NUMA
865*1da177e4SLinus Torvalds static struct page *shmem_swapin_async(struct shared_policy *p,
866*1da177e4SLinus Torvalds 				       swp_entry_t entry, unsigned long idx)
867*1da177e4SLinus Torvalds {
868*1da177e4SLinus Torvalds 	struct page *page;
869*1da177e4SLinus Torvalds 	struct vm_area_struct pvma;
870*1da177e4SLinus Torvalds 
871*1da177e4SLinus Torvalds 	/* Create a pseudo vma that just contains the policy */
872*1da177e4SLinus Torvalds 	memset(&pvma, 0, sizeof(struct vm_area_struct));
873*1da177e4SLinus Torvalds 	pvma.vm_end = PAGE_SIZE;
874*1da177e4SLinus Torvalds 	pvma.vm_pgoff = idx;
875*1da177e4SLinus Torvalds 	pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
876*1da177e4SLinus Torvalds 	page = read_swap_cache_async(entry, &pvma, 0);
877*1da177e4SLinus Torvalds 	mpol_free(pvma.vm_policy);
878*1da177e4SLinus Torvalds 	return page;
879*1da177e4SLinus Torvalds }
880*1da177e4SLinus Torvalds 
881*1da177e4SLinus Torvalds struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
882*1da177e4SLinus Torvalds 			  unsigned long idx)
883*1da177e4SLinus Torvalds {
884*1da177e4SLinus Torvalds 	struct shared_policy *p = &info->policy;
885*1da177e4SLinus Torvalds 	int i, num;
886*1da177e4SLinus Torvalds 	struct page *page;
887*1da177e4SLinus Torvalds 	unsigned long offset;
888*1da177e4SLinus Torvalds 
889*1da177e4SLinus Torvalds 	num = valid_swaphandles(entry, &offset);
890*1da177e4SLinus Torvalds 	for (i = 0; i < num; offset++, i++) {
891*1da177e4SLinus Torvalds 		page = shmem_swapin_async(p,
892*1da177e4SLinus Torvalds 				swp_entry(swp_type(entry), offset), idx);
893*1da177e4SLinus Torvalds 		if (!page)
894*1da177e4SLinus Torvalds 			break;
895*1da177e4SLinus Torvalds 		page_cache_release(page);
896*1da177e4SLinus Torvalds 	}
897*1da177e4SLinus Torvalds 	lru_add_drain();	/* Push any new pages onto the LRU now */
898*1da177e4SLinus Torvalds 	return shmem_swapin_async(p, entry, idx);
899*1da177e4SLinus Torvalds }
900*1da177e4SLinus Torvalds 
901*1da177e4SLinus Torvalds static struct page *
902*1da177e4SLinus Torvalds shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
903*1da177e4SLinus Torvalds 		 unsigned long idx)
904*1da177e4SLinus Torvalds {
905*1da177e4SLinus Torvalds 	struct vm_area_struct pvma;
906*1da177e4SLinus Torvalds 	struct page *page;
907*1da177e4SLinus Torvalds 
908*1da177e4SLinus Torvalds 	memset(&pvma, 0, sizeof(struct vm_area_struct));
909*1da177e4SLinus Torvalds 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
910*1da177e4SLinus Torvalds 	pvma.vm_pgoff = idx;
911*1da177e4SLinus Torvalds 	pvma.vm_end = PAGE_SIZE;
912*1da177e4SLinus Torvalds 	page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
913*1da177e4SLinus Torvalds 	mpol_free(pvma.vm_policy);
914*1da177e4SLinus Torvalds 	return page;
915*1da177e4SLinus Torvalds }
916*1da177e4SLinus Torvalds #else
/* Non-NUMA variant: plain readahead around the entry, then read it in.
 * info and idx are accepted only to match the NUMA signature. */
static inline struct page *
shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
{
	swapin_readahead(entry, 0, NULL);
	return read_swap_cache_async(entry, NULL, 0);
}
923*1da177e4SLinus Torvalds 
/* Non-NUMA variant: no policy to consult, just get a zeroed page.
 * info and idx are accepted only to match the NUMA signature. */
static inline struct page *
shmem_alloc_page(unsigned int __nocast gfp,struct shmem_inode_info *info,
				 unsigned long idx)
{
	return alloc_page(gfp | __GFP_ZERO);
}
930*1da177e4SLinus Torvalds #endif
931*1da177e4SLinus Torvalds 
/*
 * shmem_getpage - either get the page from swap or allocate a new one
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * On success returns 0 with *pagep set to the page; on failure returns
 * a negative errno.  Retries internally (goto repeat) after races.
 */
static int shmem_getpage(struct inode *inode, unsigned long idx,
			struct page **pagep, enum sgp_type sgp, int *type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo;
	struct page *filepage = *pagep;
	struct page *swappage;
	swp_entry_t *entry;
	swp_entry_t swap;
	int error;

	if (idx >= SHMEM_MAX_INDEX)
		return -EFBIG;
	/*
	 * Normally, filepage is NULL on entry, and either found
	 * uptodate immediately, or allocated and zeroed, or read
	 * in under swappage, which is then assigned to filepage.
	 * But shmem_prepare_write passes in a locked filepage,
	 * which may be found not uptodate by other callers too,
	 * and may need to be copied from the swappage read in.
	 */
repeat:
	if (!filepage)
		filepage = find_lock_page(mapping, idx);
	if (filepage && PageUptodate(filepage))
		goto done;
	error = 0;
	/* SGP_QUICK callers won't wait for IO or allocation */
	if (sgp == SGP_QUICK)
		goto failed;

	spin_lock(&info->lock);
	shmem_recalc_inode(inode);
	entry = shmem_swp_alloc(info, idx, sgp);
	if (IS_ERR(entry)) {
		spin_unlock(&info->lock);
		error = PTR_ERR(entry);
		goto failed;
	}
	swap = *entry;

	if (swap.val) {
		/* Look it up and read it in.. */
		swappage = lookup_swap_cache(swap);
		if (!swappage) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			/* here we actually do the io */
			if (type && *type == VM_FAULT_MINOR) {
				inc_page_state(pgmajfault);
				*type = VM_FAULT_MAJOR;
			}
			swappage = shmem_swapin(info, swap, idx);
			if (!swappage) {
				/*
				 * Read-in failed: retake the lock and recheck
				 * the entry.  If it still holds this swap
				 * value report -ENOMEM, otherwise it was
				 * raced away from under us, so just retry.
				 */
				spin_lock(&info->lock);
				entry = shmem_swp_alloc(info, idx, sgp);
				if (IS_ERR(entry))
					error = PTR_ERR(entry);
				else {
					if (entry->val == swap.val)
						error = -ENOMEM;
					shmem_swp_unmap(entry);
				}
				spin_unlock(&info->lock);
				if (error)
					goto failed;
				goto repeat;
			}
			/* Wait for the async read, then retry the lookup */
			wait_on_page_locked(swappage);
			page_cache_release(swappage);
			goto repeat;
		}

		/* We have to do this with page locked to prevent races */
		if (TestSetPageLocked(swappage)) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			wait_on_page_locked(swappage);
			page_cache_release(swappage);
			goto repeat;
		}
		if (PageWriteback(swappage)) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			wait_on_page_writeback(swappage);
			unlock_page(swappage);
			page_cache_release(swappage);
			goto repeat;
		}
		if (!PageUptodate(swappage)) {
			/* The swap read completed with an IO error */
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			unlock_page(swappage);
			page_cache_release(swappage);
			error = -EIO;
			goto failed;
		}

		if (filepage) {
			/*
			 * Caller supplied a page: copy the swap page's
			 * contents into it and free the swap entry.
			 */
			shmem_swp_set(info, entry, 0);
			shmem_swp_unmap(entry);
			delete_from_swap_cache(swappage);
			spin_unlock(&info->lock);
			copy_highpage(filepage, swappage);
			unlock_page(swappage);
			page_cache_release(swappage);
			flush_dcache_page(filepage);
			SetPageUptodate(filepage);
			set_page_dirty(filepage);
			swap_free(swap);
		} else if (!(error = move_from_swap_cache(
				swappage, idx, mapping))) {
			/* The swap page itself becomes the file page */
			info->flags |= SHMEM_PAGEIN;
			shmem_swp_set(info, entry, 0);
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			filepage = swappage;
			swap_free(swap);
		} else {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			unlock_page(swappage);
			page_cache_release(swappage);
			if (error == -ENOMEM) {
				/* let kswapd refresh zone for GFP_ATOMICs */
				blk_congestion_wait(WRITE, HZ/50);
			}
			goto repeat;
		}
	} else if (sgp == SGP_READ && !filepage) {
		/* Reading a hole: use any cached page, but never allocate */
		shmem_swp_unmap(entry);
		filepage = find_get_page(mapping, idx);
		if (filepage &&
		    (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
			spin_unlock(&info->lock);
			wait_on_page_locked(filepage);
			page_cache_release(filepage);
			filepage = NULL;
			goto repeat;
		}
		spin_unlock(&info->lock);
	} else {
		/* No swap entry: allocate and account a fresh zeroed page */
		shmem_swp_unmap(entry);
		sbinfo = SHMEM_SB(inode->i_sb);
		if (sbinfo) {
			/* Size-limited mount: charge a block against the sb */
			spin_lock(&sbinfo->stat_lock);
			if (sbinfo->free_blocks == 0 ||
			    shmem_acct_block(info->flags)) {
				spin_unlock(&sbinfo->stat_lock);
				spin_unlock(&info->lock);
				error = -ENOSPC;
				goto failed;
			}
			sbinfo->free_blocks--;
			inode->i_blocks += BLOCKS_PER_PAGE;
			spin_unlock(&sbinfo->stat_lock);
		} else if (shmem_acct_block(info->flags)) {
			spin_unlock(&info->lock);
			error = -ENOSPC;
			goto failed;
		}

		if (!filepage) {
			spin_unlock(&info->lock);
			filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
						    info,
						    idx);
			if (!filepage) {
				/* Allocation failed: undo the block charge */
				shmem_unacct_blocks(info->flags, 1);
				shmem_free_blocks(inode, 1);
				error = -ENOMEM;
				goto failed;
			}

			/*
			 * Relock and recheck the entry: a page or swap
			 * entry may have been raced in while we allocated.
			 */
			spin_lock(&info->lock);
			entry = shmem_swp_alloc(info, idx, sgp);
			if (IS_ERR(entry))
				error = PTR_ERR(entry);
			else {
				swap = *entry;
				shmem_swp_unmap(entry);
			}
			if (error || swap.val || 0 != add_to_page_cache_lru(
					filepage, mapping, idx, GFP_ATOMIC)) {
				spin_unlock(&info->lock);
				page_cache_release(filepage);
				shmem_unacct_blocks(info->flags, 1);
				shmem_free_blocks(inode, 1);
				filepage = NULL;
				if (error)
					goto failed;
				goto repeat;
			}
			info->flags |= SHMEM_PAGEIN;
		}

		info->alloced++;
		spin_unlock(&info->lock);
		flush_dcache_page(filepage);
		SetPageUptodate(filepage);
	}
done:
	/* Hand back the page; a caller-supplied page stays as it came */
	if (*pagep != filepage) {
		unlock_page(filepage);
		*pagep = filepage;
	}
	return 0;

failed:
	/* Release any page we found or allocated here ourselves */
	if (*pagep != filepage) {
		unlock_page(filepage);
		page_cache_release(filepage);
	}
	return error;
}
1154*1da177e4SLinus Torvalds 
1155*1da177e4SLinus Torvalds struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1156*1da177e4SLinus Torvalds {
1157*1da177e4SLinus Torvalds 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
1158*1da177e4SLinus Torvalds 	struct page *page = NULL;
1159*1da177e4SLinus Torvalds 	unsigned long idx;
1160*1da177e4SLinus Torvalds 	int error;
1161*1da177e4SLinus Torvalds 
1162*1da177e4SLinus Torvalds 	idx = (address - vma->vm_start) >> PAGE_SHIFT;
1163*1da177e4SLinus Torvalds 	idx += vma->vm_pgoff;
1164*1da177e4SLinus Torvalds 	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
1165*1da177e4SLinus Torvalds 	if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1166*1da177e4SLinus Torvalds 		return NOPAGE_SIGBUS;
1167*1da177e4SLinus Torvalds 
1168*1da177e4SLinus Torvalds 	error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
1169*1da177e4SLinus Torvalds 	if (error)
1170*1da177e4SLinus Torvalds 		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
1171*1da177e4SLinus Torvalds 
1172*1da177e4SLinus Torvalds 	mark_page_accessed(page);
1173*1da177e4SLinus Torvalds 	return page;
1174*1da177e4SLinus Torvalds }
1175*1da177e4SLinus Torvalds 
1176*1da177e4SLinus Torvalds static int shmem_populate(struct vm_area_struct *vma,
1177*1da177e4SLinus Torvalds 	unsigned long addr, unsigned long len,
1178*1da177e4SLinus Torvalds 	pgprot_t prot, unsigned long pgoff, int nonblock)
1179*1da177e4SLinus Torvalds {
1180*1da177e4SLinus Torvalds 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
1181*1da177e4SLinus Torvalds 	struct mm_struct *mm = vma->vm_mm;
1182*1da177e4SLinus Torvalds 	enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1183*1da177e4SLinus Torvalds 	unsigned long size;
1184*1da177e4SLinus Torvalds 
1185*1da177e4SLinus Torvalds 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1186*1da177e4SLinus Torvalds 	if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
1187*1da177e4SLinus Torvalds 		return -EINVAL;
1188*1da177e4SLinus Torvalds 
1189*1da177e4SLinus Torvalds 	while ((long) len > 0) {
1190*1da177e4SLinus Torvalds 		struct page *page = NULL;
1191*1da177e4SLinus Torvalds 		int err;
1192*1da177e4SLinus Torvalds 		/*
1193*1da177e4SLinus Torvalds 		 * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
1194*1da177e4SLinus Torvalds 		 */
1195*1da177e4SLinus Torvalds 		err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
1196*1da177e4SLinus Torvalds 		if (err)
1197*1da177e4SLinus Torvalds 			return err;
1198*1da177e4SLinus Torvalds 		if (page) {
1199*1da177e4SLinus Torvalds 			mark_page_accessed(page);
1200*1da177e4SLinus Torvalds 			err = install_page(mm, vma, addr, page, prot);
1201*1da177e4SLinus Torvalds 			if (err) {
1202*1da177e4SLinus Torvalds 				page_cache_release(page);
1203*1da177e4SLinus Torvalds 				return err;
1204*1da177e4SLinus Torvalds 			}
1205*1da177e4SLinus Torvalds 		} else if (nonblock) {
1206*1da177e4SLinus Torvalds     			err = install_file_pte(mm, vma, addr, pgoff, prot);
1207*1da177e4SLinus Torvalds 			if (err)
1208*1da177e4SLinus Torvalds 	    			return err;
1209*1da177e4SLinus Torvalds 		}
1210*1da177e4SLinus Torvalds 
1211*1da177e4SLinus Torvalds 		len -= PAGE_SIZE;
1212*1da177e4SLinus Torvalds 		addr += PAGE_SIZE;
1213*1da177e4SLinus Torvalds 		pgoff++;
1214*1da177e4SLinus Torvalds 	}
1215*1da177e4SLinus Torvalds 	return 0;
1216*1da177e4SLinus Torvalds }
1217*1da177e4SLinus Torvalds 
1218*1da177e4SLinus Torvalds #ifdef CONFIG_NUMA
1219*1da177e4SLinus Torvalds int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1220*1da177e4SLinus Torvalds {
1221*1da177e4SLinus Torvalds 	struct inode *i = vma->vm_file->f_dentry->d_inode;
1222*1da177e4SLinus Torvalds 	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1223*1da177e4SLinus Torvalds }
1224*1da177e4SLinus Torvalds 
1225*1da177e4SLinus Torvalds struct mempolicy *
1226*1da177e4SLinus Torvalds shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1227*1da177e4SLinus Torvalds {
1228*1da177e4SLinus Torvalds 	struct inode *i = vma->vm_file->f_dentry->d_inode;
1229*1da177e4SLinus Torvalds 	unsigned long idx;
1230*1da177e4SLinus Torvalds 
1231*1da177e4SLinus Torvalds 	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1232*1da177e4SLinus Torvalds 	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
1233*1da177e4SLinus Torvalds }
1234*1da177e4SLinus Torvalds #endif
1235*1da177e4SLinus Torvalds 
/*
 * Apply or remove SHM_LOCK on a tmpfs file.  When locking, the file's
 * size is charged to @user via user_shm_lock() (which can refuse, hence
 * -ENOMEM); unlocking returns that charge.  The VM_LOCKED flag in the
 * shmem inode info records the locked state.
 */
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);
	int retval = -ENOMEM;

	spin_lock(&info->lock);
	if (lock && !(info->flags & VM_LOCKED)) {
		/* Charge the locked size to the user; may refuse */
		if (!user_shm_lock(inode->i_size, user))
			goto out_nomem;
		info->flags |= VM_LOCKED;
	}
	/* NOTE: unlock is honoured only when a user to uncharge is given */
	if (!lock && (info->flags & VM_LOCKED) && user) {
		user_shm_unlock(inode->i_size, user);
		info->flags &= ~VM_LOCKED;
	}
	retval = 0;
out_nomem:
	spin_unlock(&info->lock);
	return retval;
}
1257*1da177e4SLinus Torvalds 
1258*1da177e4SLinus Torvalds static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1259*1da177e4SLinus Torvalds {
1260*1da177e4SLinus Torvalds 	file_accessed(file);
1261*1da177e4SLinus Torvalds 	vma->vm_ops = &shmem_vm_ops;
1262*1da177e4SLinus Torvalds 	return 0;
1263*1da177e4SLinus Torvalds }
1264*1da177e4SLinus Torvalds 
/*
 * Allocate and initialize a new shmem/tmpfs inode of the given mode.
 * On a size-limited mount (sbinfo != NULL) one inode is consumed from
 * the superblock's quota first, and given back if allocation fails.
 * Returns the new inode, or NULL when over quota / out of memory.
 */
static struct inode *
shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
{
	struct inode *inode;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (sbinfo) {
		/* Reserve an inode from the mount's limited pool */
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return NULL;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}

	inode = new_inode(sb);
	if (inode) {
		inode->i_mode = mode;
		inode->i_uid = current->fsuid;
		inode->i_gid = current->fsgid;
		inode->i_blksize = PAGE_CACHE_SIZE;
		inode->i_blocks = 0;
		inode->i_mapping->a_ops = &shmem_aops;
		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		info = SHMEM_I(inode);
		/*
		 * Zero the shmem-private head of the structure: the pointer
		 * arithmetic gives the distance from the info struct to the
		 * vfs inode embedded after it, i.e. the private area's size.
		 */
		memset(info, 0, (char *)inode - (char *)info);
		spin_lock_init(&info->lock);
		INIT_LIST_HEAD(&info->swaplist);

		switch (mode & S_IFMT) {
		default:
			/* Device nodes, FIFOs, sockets */
			inode->i_op = &shmem_special_inode_operations;
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &shmem_inode_operations;
			inode->i_fop = &shmem_file_operations;
			mpol_shared_policy_init(&info->policy);
			break;
		case S_IFDIR:
			/* New directories start with two links */
			inode->i_nlink++;
			/* Some things misbehave if size == 0 on a directory */
			inode->i_size = 2 * BOGO_DIRENT_SIZE;
			inode->i_op = &shmem_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;
			break;
		case S_IFLNK:
			/*
			 * Must not load anything in the rbtree,
			 * mpol_free_shared_policy will not be called.
			 */
			mpol_shared_policy_init(&info->policy);
			break;
		}
	} else if (sbinfo) {
		/* new_inode failed: return the reserved inode to the pool */
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
	return inode;
}
1329*1da177e4SLinus Torvalds 
1330*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
1331*1da177e4SLinus Torvalds 
1332*1da177e4SLinus Torvalds static int shmem_set_size(struct shmem_sb_info *sbinfo,
1333*1da177e4SLinus Torvalds 			  unsigned long max_blocks, unsigned long max_inodes)
1334*1da177e4SLinus Torvalds {
1335*1da177e4SLinus Torvalds 	int error;
1336*1da177e4SLinus Torvalds 	unsigned long blocks, inodes;
1337*1da177e4SLinus Torvalds 
1338*1da177e4SLinus Torvalds 	spin_lock(&sbinfo->stat_lock);
1339*1da177e4SLinus Torvalds 	blocks = sbinfo->max_blocks - sbinfo->free_blocks;
1340*1da177e4SLinus Torvalds 	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
1341*1da177e4SLinus Torvalds 	error = -EINVAL;
1342*1da177e4SLinus Torvalds 	if (max_blocks < blocks)
1343*1da177e4SLinus Torvalds 		goto out;
1344*1da177e4SLinus Torvalds 	if (max_inodes < inodes)
1345*1da177e4SLinus Torvalds 		goto out;
1346*1da177e4SLinus Torvalds 	error = 0;
1347*1da177e4SLinus Torvalds 	sbinfo->max_blocks  = max_blocks;
1348*1da177e4SLinus Torvalds 	sbinfo->free_blocks = max_blocks - blocks;
1349*1da177e4SLinus Torvalds 	sbinfo->max_inodes  = max_inodes;
1350*1da177e4SLinus Torvalds 	sbinfo->free_inodes = max_inodes - inodes;
1351*1da177e4SLinus Torvalds out:
1352*1da177e4SLinus Torvalds 	spin_unlock(&sbinfo->stat_lock);
1353*1da177e4SLinus Torvalds 	return error;
1354*1da177e4SLinus Torvalds }
1355*1da177e4SLinus Torvalds 
1356*1da177e4SLinus Torvalds static struct inode_operations shmem_symlink_inode_operations;
1357*1da177e4SLinus Torvalds static struct inode_operations shmem_symlink_inline_operations;
1358*1da177e4SLinus Torvalds 
1359*1da177e4SLinus Torvalds /*
1360*1da177e4SLinus Torvalds  * Normally tmpfs makes no use of shmem_prepare_write, but it
1361*1da177e4SLinus Torvalds  * lets a tmpfs file be used read-write below the loop driver.
1362*1da177e4SLinus Torvalds  */
1363*1da177e4SLinus Torvalds static int
1364*1da177e4SLinus Torvalds shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
1365*1da177e4SLinus Torvalds {
1366*1da177e4SLinus Torvalds 	struct inode *inode = page->mapping->host;
1367*1da177e4SLinus Torvalds 	return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
1368*1da177e4SLinus Torvalds }
1369*1da177e4SLinus Torvalds 
/*
 * write() for tmpfs files: copy userspace data page by page into the
 * page cache, allocating pages (or pulling them back from swap) via
 * shmem_getpage.  Serializes against other writers with i_sem.
 * Returns bytes written, or a negative errno if nothing was written.
 */
static ssize_t
shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
	struct inode	*inode = file->f_dentry->d_inode;
	loff_t		pos;
	unsigned long	written;
	ssize_t		err;

	if ((ssize_t) count < 0)
		return -EINVAL;

	if (!access_ok(VERIFY_READ, buf, count))
		return -EFAULT;

	down(&inode->i_sem);

	pos = *ppos;
	written = 0;

	err = generic_write_checks(file, &pos, &count, 0);
	if (err || !count)
		goto out;

	/* A write by an unprivileged user clears setuid/setgid bits */
	err = remove_suid(file->f_dentry);
	if (err)
		goto out;

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	do {
		struct page *page = NULL;
		unsigned long bytes, index, offset;
		char *kaddr;
		int left;

		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		/*
		 * We don't hold page lock across copy from user -
		 * what would it guard against? - so no deadlock here.
		 * But it still may be a good idea to prefault below.
		 */

		err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
		if (err)
			break;

		left = bytes;
		if (PageHighMem(page)) {
			/*
			 * Highmem fast path: touch first and last byte of
			 * the user buffer to fault it in, then copy under
			 * kmap_atomic.  If the atomic copy is still short
			 * (left != 0), fall through to the sleeping kmap
			 * path below and retry.
			 */
			volatile unsigned char dummy;
			__get_user(dummy, buf);
			__get_user(dummy, buf + bytes - 1);

			kaddr = kmap_atomic(page, KM_USER0);
			left = __copy_from_user_inatomic(kaddr + offset,
							buf, bytes);
			kunmap_atomic(kaddr, KM_USER0);
		}
		if (left) {
			/* Lowmem page, or the atomic copy came up short */
			kaddr = kmap(page);
			left = __copy_from_user(kaddr + offset, buf, bytes);
			kunmap(page);
		}

		written += bytes;
		count -= bytes;
		pos += bytes;
		buf += bytes;
		if (pos > inode->i_size)
			i_size_write(inode, pos);

		flush_dcache_page(page);
		set_page_dirty(page);
		mark_page_accessed(page);
		page_cache_release(page);

		if (left) {
			/* Partial copy: back out the uncopied tail, -EFAULT */
			pos -= left;
			written -= left;
			err = -EFAULT;
			break;
		}

		/*
		 * Our dirty pages are not counted in nr_dirty,
		 * and we do not attempt to balance dirty pages.
		 */

		cond_resched();
	} while (count);

	*ppos = pos;
	if (written)
		err = written;
out:
	up(&inode->i_sem);
	return err;
}
1472*1da177e4SLinus Torvalds 
/*
 * Core of tmpfs read and sendfile: walk the file page by page, handing
 * each in-range chunk to @actor (which copies it to a user buffer, a
 * socket, etc.).  Holes read as zeroes via ZERO_PAGE.  Progress and
 * errors are reported through *desc; *ppos is advanced by the amount
 * consumed.
 */
static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index, offset;

	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	for (;;) {
		struct page *page = NULL;
		unsigned long end_index, nr, ret;
		loff_t i_size = i_size_read(inode);

		/* Preliminary EOF check, before looking up the page */
		end_index = i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)
				break;
		}

		desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
		if (desc->error) {
			/* -EINVAL means read past EOF: not an error, just stop */
			if (desc->error == -EINVAL)
				desc->error = 0;
			break;
		}

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_sem protection against truncate
		 */
		nr = PAGE_CACHE_SIZE;
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_CACHE_SHIFT;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset) {
				/* File shrunk under us: drop page and stop */
				if (page)
					page_cache_release(page);
				break;
			}
		}
		nr -= offset;

		if (page) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			/*
			 * Mark the page accessed if we read the beginning.
			 */
			if (!offset)
				mark_page_accessed(page);
		} else
			/* Hole: no page allocated, read zeroes */
			page = ZERO_PAGE(0);

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		/* Short take (buffer full / fault) or nothing left: done */
		if (ret != nr || !desc->count)
			break;

		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(filp);
}
1561*1da177e4SLinus Torvalds 
1562*1da177e4SLinus Torvalds static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1563*1da177e4SLinus Torvalds {
1564*1da177e4SLinus Torvalds 	read_descriptor_t desc;
1565*1da177e4SLinus Torvalds 
1566*1da177e4SLinus Torvalds 	if ((ssize_t) count < 0)
1567*1da177e4SLinus Torvalds 		return -EINVAL;
1568*1da177e4SLinus Torvalds 	if (!access_ok(VERIFY_WRITE, buf, count))
1569*1da177e4SLinus Torvalds 		return -EFAULT;
1570*1da177e4SLinus Torvalds 	if (!count)
1571*1da177e4SLinus Torvalds 		return 0;
1572*1da177e4SLinus Torvalds 
1573*1da177e4SLinus Torvalds 	desc.written = 0;
1574*1da177e4SLinus Torvalds 	desc.count = count;
1575*1da177e4SLinus Torvalds 	desc.arg.buf = buf;
1576*1da177e4SLinus Torvalds 	desc.error = 0;
1577*1da177e4SLinus Torvalds 
1578*1da177e4SLinus Torvalds 	do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1579*1da177e4SLinus Torvalds 	if (desc.written)
1580*1da177e4SLinus Torvalds 		return desc.written;
1581*1da177e4SLinus Torvalds 	return desc.error;
1582*1da177e4SLinus Torvalds }
1583*1da177e4SLinus Torvalds 
1584*1da177e4SLinus Torvalds static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1585*1da177e4SLinus Torvalds 			 size_t count, read_actor_t actor, void *target)
1586*1da177e4SLinus Torvalds {
1587*1da177e4SLinus Torvalds 	read_descriptor_t desc;
1588*1da177e4SLinus Torvalds 
1589*1da177e4SLinus Torvalds 	if (!count)
1590*1da177e4SLinus Torvalds 		return 0;
1591*1da177e4SLinus Torvalds 
1592*1da177e4SLinus Torvalds 	desc.written = 0;
1593*1da177e4SLinus Torvalds 	desc.count = count;
1594*1da177e4SLinus Torvalds 	desc.arg.data = target;
1595*1da177e4SLinus Torvalds 	desc.error = 0;
1596*1da177e4SLinus Torvalds 
1597*1da177e4SLinus Torvalds 	do_shmem_file_read(in_file, ppos, &desc, actor);
1598*1da177e4SLinus Torvalds 	if (desc.written)
1599*1da177e4SLinus Torvalds 		return desc.written;
1600*1da177e4SLinus Torvalds 	return desc.error;
1601*1da177e4SLinus Torvalds }
1602*1da177e4SLinus Torvalds 
1603*1da177e4SLinus Torvalds static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
1604*1da177e4SLinus Torvalds {
1605*1da177e4SLinus Torvalds 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1606*1da177e4SLinus Torvalds 
1607*1da177e4SLinus Torvalds 	buf->f_type = TMPFS_MAGIC;
1608*1da177e4SLinus Torvalds 	buf->f_bsize = PAGE_CACHE_SIZE;
1609*1da177e4SLinus Torvalds 	buf->f_namelen = NAME_MAX;
1610*1da177e4SLinus Torvalds 	if (sbinfo) {
1611*1da177e4SLinus Torvalds 		spin_lock(&sbinfo->stat_lock);
1612*1da177e4SLinus Torvalds 		buf->f_blocks = sbinfo->max_blocks;
1613*1da177e4SLinus Torvalds 		buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1614*1da177e4SLinus Torvalds 		buf->f_files = sbinfo->max_inodes;
1615*1da177e4SLinus Torvalds 		buf->f_ffree = sbinfo->free_inodes;
1616*1da177e4SLinus Torvalds 		spin_unlock(&sbinfo->stat_lock);
1617*1da177e4SLinus Torvalds 	}
1618*1da177e4SLinus Torvalds 	/* else leave those fields 0 like simple_statfs */
1619*1da177e4SLinus Torvalds 	return 0;
1620*1da177e4SLinus Torvalds }
1621*1da177e4SLinus Torvalds 
1622*1da177e4SLinus Torvalds /*
1623*1da177e4SLinus Torvalds  * File creation. Allocate an inode, and we're done..
1624*1da177e4SLinus Torvalds  */
1625*1da177e4SLinus Torvalds static int
1626*1da177e4SLinus Torvalds shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1627*1da177e4SLinus Torvalds {
1628*1da177e4SLinus Torvalds 	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1629*1da177e4SLinus Torvalds 	int error = -ENOSPC;
1630*1da177e4SLinus Torvalds 
1631*1da177e4SLinus Torvalds 	if (inode) {
1632*1da177e4SLinus Torvalds 		if (dir->i_mode & S_ISGID) {
1633*1da177e4SLinus Torvalds 			inode->i_gid = dir->i_gid;
1634*1da177e4SLinus Torvalds 			if (S_ISDIR(mode))
1635*1da177e4SLinus Torvalds 				inode->i_mode |= S_ISGID;
1636*1da177e4SLinus Torvalds 		}
1637*1da177e4SLinus Torvalds 		dir->i_size += BOGO_DIRENT_SIZE;
1638*1da177e4SLinus Torvalds 		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1639*1da177e4SLinus Torvalds 		d_instantiate(dentry, inode);
1640*1da177e4SLinus Torvalds 		dget(dentry); /* Extra count - pin the dentry in core */
1641*1da177e4SLinus Torvalds 		error = 0;
1642*1da177e4SLinus Torvalds 	}
1643*1da177e4SLinus Torvalds 	return error;
1644*1da177e4SLinus Torvalds }
1645*1da177e4SLinus Torvalds 
1646*1da177e4SLinus Torvalds static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1647*1da177e4SLinus Torvalds {
1648*1da177e4SLinus Torvalds 	int error;
1649*1da177e4SLinus Torvalds 
1650*1da177e4SLinus Torvalds 	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1651*1da177e4SLinus Torvalds 		return error;
1652*1da177e4SLinus Torvalds 	dir->i_nlink++;
1653*1da177e4SLinus Torvalds 	return 0;
1654*1da177e4SLinus Torvalds }
1655*1da177e4SLinus Torvalds 
1656*1da177e4SLinus Torvalds static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1657*1da177e4SLinus Torvalds 		struct nameidata *nd)
1658*1da177e4SLinus Torvalds {
1659*1da177e4SLinus Torvalds 	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1660*1da177e4SLinus Torvalds }
1661*1da177e4SLinus Torvalds 
1662*1da177e4SLinus Torvalds /*
1663*1da177e4SLinus Torvalds  * Link a file..
1664*1da177e4SLinus Torvalds  */
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = old_dentry->d_inode;
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	/*
	 * No ordinary (disk based) filesystem counts links as inodes;
	 * but each new link needs a new dentry, pinning lowmem, and
	 * tmpfs dentries cannot be pruned until they are unlinked.
	 */
	if (sbinfo) {
		/* Charge the extra link against the mount's inode quota;
		 * shmem_unlink gives this charge back. */
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}

	dir->i_size += BOGO_DIRENT_SIZE;
	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	inode->i_nlink++;
	atomic_inc(&inode->i_count);	/* New dentry reference */
	dget(dentry);		/* Extra pinning count for the created dentry */
	d_instantiate(dentry, inode);
	return 0;
}
1693*1da177e4SLinus Torvalds 
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	/*
	 * If this is one of several hard links to a regular file, give
	 * back the per-link inode charge taken in shmem_link.  The last
	 * link (and directories) take no refund here.
	 */
	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		if (sbinfo) {
			spin_lock(&sbinfo->stat_lock);
			sbinfo->free_inodes++;
			spin_unlock(&sbinfo->stat_lock);
		}
	}

	dir->i_size -= BOGO_DIRENT_SIZE;
	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	inode->i_nlink--;
	dput(dentry);	/* Undo the count from "create" - this does all the work */
	return 0;
}
1713*1da177e4SLinus Torvalds 
1714*1da177e4SLinus Torvalds static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1715*1da177e4SLinus Torvalds {
1716*1da177e4SLinus Torvalds 	if (!simple_empty(dentry))
1717*1da177e4SLinus Torvalds 		return -ENOTEMPTY;
1718*1da177e4SLinus Torvalds 
1719*1da177e4SLinus Torvalds 	dir->i_nlink--;
1720*1da177e4SLinus Torvalds 	return shmem_unlink(dir, dentry);
1721*1da177e4SLinus Torvalds }
1722*1da177e4SLinus Torvalds 
1723*1da177e4SLinus Torvalds /*
1724*1da177e4SLinus Torvalds  * The VFS layer already does all the dentry stuff for rename,
1725*1da177e4SLinus Torvalds  * we just have to decrement the usage count for the target if
1726*1da177e4SLinus Torvalds  * it exists so that the VFS layer correctly free's it when it
1727*1da177e4SLinus Torvalds  * gets overwritten.
1728*1da177e4SLinus Torvalds  */
static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
{
	struct inode *inode = old_dentry->d_inode;
	int they_are_dirs = S_ISDIR(inode->i_mode);

	if (!simple_empty(new_dentry))
		return -ENOTEMPTY;

	if (new_dentry->d_inode) {
		/* Target exists: remove it; a replaced directory also
		 * takes a ".." link away from old_dir's side of things */
		(void) shmem_unlink(new_dir, new_dentry);
		if (they_are_dirs)
			old_dir->i_nlink--;
		/*
		 * NOTE(review): when replacing a directory, the target's
		 * own i_nlink is decremented only once (inside
		 * shmem_unlink), apparently leaving it at 1 instead of 0
		 * — compare simple_rmdir()/simple_rename() in fs/libfs.c.
		 * Verify the replaced directory inode is ever freed.
		 */
	} else if (they_are_dirs) {
		/* Directory moved between parents: ".." link moves too */
		old_dir->i_nlink--;
		new_dir->i_nlink++;
	}

	old_dir->i_size -= BOGO_DIRENT_SIZE;
	new_dir->i_size += BOGO_DIRENT_SIZE;
	old_dir->i_ctime = old_dir->i_mtime =
	new_dir->i_ctime = new_dir->i_mtime =
	inode->i_ctime = CURRENT_TIME;
	return 0;
}
1753*1da177e4SLinus Torvalds 
/*
 * Create a symlink.  Short targets are stored inline in the shmem
 * inode's private info area; longer ones go into page 0 of the link
 * inode's own page cache (so they can be swapped like file data).
 */
static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
	int error;
	int len;
	struct inode *inode;
	struct page *page = NULL;
	char *kaddr;
	struct shmem_inode_info *info;

	len = strlen(symname) + 1;	/* include the terminating NUL */
	if (len > PAGE_CACHE_SIZE)
		return -ENAMETOOLONG;

	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
	if (!inode)
		return -ENOSPC;

	info = SHMEM_I(inode);
	inode->i_size = len-1;
	/* Does the target fit in the private area before the vfs inode? */
	if (len <= (char *)inode - (char *)info) {
		/* do it inline */
		memcpy(info, symname, len);
		inode->i_op = &shmem_symlink_inline_operations;
	} else {
		/* Long target: store it in the link file's first page */
		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
		if (error) {
			iput(inode);
			return error;
		}
		inode->i_op = &shmem_symlink_inode_operations;
		kaddr = kmap_atomic(page, KM_USER0);
		memcpy(kaddr, symname, len);
		kunmap_atomic(kaddr, KM_USER0);
		set_page_dirty(page);
		page_cache_release(page);
	}
	if (dir->i_mode & S_ISGID)
		inode->i_gid = dir->i_gid;
	dir->i_size += BOGO_DIRENT_SIZE;
	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	d_instantiate(dentry, inode);
	dget(dentry);
	return 0;
}
1798*1da177e4SLinus Torvalds 
1799*1da177e4SLinus Torvalds static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1800*1da177e4SLinus Torvalds {
1801*1da177e4SLinus Torvalds 	nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1802*1da177e4SLinus Torvalds 	return 0;
1803*1da177e4SLinus Torvalds }
1804*1da177e4SLinus Torvalds 
1805*1da177e4SLinus Torvalds static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1806*1da177e4SLinus Torvalds {
1807*1da177e4SLinus Torvalds 	struct page *page = NULL;
1808*1da177e4SLinus Torvalds 	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1809*1da177e4SLinus Torvalds 	nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1810*1da177e4SLinus Torvalds 	return 0;
1811*1da177e4SLinus Torvalds }
1812*1da177e4SLinus Torvalds 
1813*1da177e4SLinus Torvalds static void shmem_put_link(struct dentry *dentry, struct nameidata *nd)
1814*1da177e4SLinus Torvalds {
1815*1da177e4SLinus Torvalds 	if (!IS_ERR(nd_get_link(nd))) {
1816*1da177e4SLinus Torvalds 		struct page *page;
1817*1da177e4SLinus Torvalds 
1818*1da177e4SLinus Torvalds 		page = find_get_page(dentry->d_inode->i_mapping, 0);
1819*1da177e4SLinus Torvalds 		if (!page)
1820*1da177e4SLinus Torvalds 			BUG();
1821*1da177e4SLinus Torvalds 		kunmap(page);
1822*1da177e4SLinus Torvalds 		mark_page_accessed(page);
1823*1da177e4SLinus Torvalds 		page_cache_release(page);
1824*1da177e4SLinus Torvalds 		page_cache_release(page);
1825*1da177e4SLinus Torvalds 	}
1826*1da177e4SLinus Torvalds }
1827*1da177e4SLinus Torvalds 
1828*1da177e4SLinus Torvalds static struct inode_operations shmem_symlink_inline_operations = {
1829*1da177e4SLinus Torvalds 	.readlink	= generic_readlink,
1830*1da177e4SLinus Torvalds 	.follow_link	= shmem_follow_link_inline,
1831*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS_XATTR
1832*1da177e4SLinus Torvalds 	.setxattr       = generic_setxattr,
1833*1da177e4SLinus Torvalds 	.getxattr       = generic_getxattr,
1834*1da177e4SLinus Torvalds 	.listxattr      = generic_listxattr,
1835*1da177e4SLinus Torvalds 	.removexattr    = generic_removexattr,
1836*1da177e4SLinus Torvalds #endif
1837*1da177e4SLinus Torvalds };
1838*1da177e4SLinus Torvalds 
1839*1da177e4SLinus Torvalds static struct inode_operations shmem_symlink_inode_operations = {
1840*1da177e4SLinus Torvalds 	.truncate	= shmem_truncate,
1841*1da177e4SLinus Torvalds 	.readlink	= generic_readlink,
1842*1da177e4SLinus Torvalds 	.follow_link	= shmem_follow_link,
1843*1da177e4SLinus Torvalds 	.put_link	= shmem_put_link,
1844*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS_XATTR
1845*1da177e4SLinus Torvalds 	.setxattr       = generic_setxattr,
1846*1da177e4SLinus Torvalds 	.getxattr       = generic_getxattr,
1847*1da177e4SLinus Torvalds 	.listxattr      = generic_listxattr,
1848*1da177e4SLinus Torvalds 	.removexattr    = generic_removexattr,
1849*1da177e4SLinus Torvalds #endif
1850*1da177e4SLinus Torvalds };
1851*1da177e4SLinus Torvalds 
1852*1da177e4SLinus Torvalds static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
1853*1da177e4SLinus Torvalds {
1854*1da177e4SLinus Torvalds 	char *this_char, *value, *rest;
1855*1da177e4SLinus Torvalds 
1856*1da177e4SLinus Torvalds 	while ((this_char = strsep(&options, ",")) != NULL) {
1857*1da177e4SLinus Torvalds 		if (!*this_char)
1858*1da177e4SLinus Torvalds 			continue;
1859*1da177e4SLinus Torvalds 		if ((value = strchr(this_char,'=')) != NULL) {
1860*1da177e4SLinus Torvalds 			*value++ = 0;
1861*1da177e4SLinus Torvalds 		} else {
1862*1da177e4SLinus Torvalds 			printk(KERN_ERR
1863*1da177e4SLinus Torvalds 			    "tmpfs: No value for mount option '%s'\n",
1864*1da177e4SLinus Torvalds 			    this_char);
1865*1da177e4SLinus Torvalds 			return 1;
1866*1da177e4SLinus Torvalds 		}
1867*1da177e4SLinus Torvalds 
1868*1da177e4SLinus Torvalds 		if (!strcmp(this_char,"size")) {
1869*1da177e4SLinus Torvalds 			unsigned long long size;
1870*1da177e4SLinus Torvalds 			size = memparse(value,&rest);
1871*1da177e4SLinus Torvalds 			if (*rest == '%') {
1872*1da177e4SLinus Torvalds 				size <<= PAGE_SHIFT;
1873*1da177e4SLinus Torvalds 				size *= totalram_pages;
1874*1da177e4SLinus Torvalds 				do_div(size, 100);
1875*1da177e4SLinus Torvalds 				rest++;
1876*1da177e4SLinus Torvalds 			}
1877*1da177e4SLinus Torvalds 			if (*rest)
1878*1da177e4SLinus Torvalds 				goto bad_val;
1879*1da177e4SLinus Torvalds 			*blocks = size >> PAGE_CACHE_SHIFT;
1880*1da177e4SLinus Torvalds 		} else if (!strcmp(this_char,"nr_blocks")) {
1881*1da177e4SLinus Torvalds 			*blocks = memparse(value,&rest);
1882*1da177e4SLinus Torvalds 			if (*rest)
1883*1da177e4SLinus Torvalds 				goto bad_val;
1884*1da177e4SLinus Torvalds 		} else if (!strcmp(this_char,"nr_inodes")) {
1885*1da177e4SLinus Torvalds 			*inodes = memparse(value,&rest);
1886*1da177e4SLinus Torvalds 			if (*rest)
1887*1da177e4SLinus Torvalds 				goto bad_val;
1888*1da177e4SLinus Torvalds 		} else if (!strcmp(this_char,"mode")) {
1889*1da177e4SLinus Torvalds 			if (!mode)
1890*1da177e4SLinus Torvalds 				continue;
1891*1da177e4SLinus Torvalds 			*mode = simple_strtoul(value,&rest,8);
1892*1da177e4SLinus Torvalds 			if (*rest)
1893*1da177e4SLinus Torvalds 				goto bad_val;
1894*1da177e4SLinus Torvalds 		} else if (!strcmp(this_char,"uid")) {
1895*1da177e4SLinus Torvalds 			if (!uid)
1896*1da177e4SLinus Torvalds 				continue;
1897*1da177e4SLinus Torvalds 			*uid = simple_strtoul(value,&rest,0);
1898*1da177e4SLinus Torvalds 			if (*rest)
1899*1da177e4SLinus Torvalds 				goto bad_val;
1900*1da177e4SLinus Torvalds 		} else if (!strcmp(this_char,"gid")) {
1901*1da177e4SLinus Torvalds 			if (!gid)
1902*1da177e4SLinus Torvalds 				continue;
1903*1da177e4SLinus Torvalds 			*gid = simple_strtoul(value,&rest,0);
1904*1da177e4SLinus Torvalds 			if (*rest)
1905*1da177e4SLinus Torvalds 				goto bad_val;
1906*1da177e4SLinus Torvalds 		} else {
1907*1da177e4SLinus Torvalds 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1908*1da177e4SLinus Torvalds 			       this_char);
1909*1da177e4SLinus Torvalds 			return 1;
1910*1da177e4SLinus Torvalds 		}
1911*1da177e4SLinus Torvalds 	}
1912*1da177e4SLinus Torvalds 	return 0;
1913*1da177e4SLinus Torvalds 
1914*1da177e4SLinus Torvalds bad_val:
1915*1da177e4SLinus Torvalds 	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
1916*1da177e4SLinus Torvalds 	       value, this_char);
1917*1da177e4SLinus Torvalds 	return 1;
1918*1da177e4SLinus Torvalds 
1919*1da177e4SLinus Torvalds }
1920*1da177e4SLinus Torvalds 
1921*1da177e4SLinus Torvalds static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1922*1da177e4SLinus Torvalds {
1923*1da177e4SLinus Torvalds 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1924*1da177e4SLinus Torvalds 	unsigned long max_blocks = 0;
1925*1da177e4SLinus Torvalds 	unsigned long max_inodes = 0;
1926*1da177e4SLinus Torvalds 
1927*1da177e4SLinus Torvalds 	if (sbinfo) {
1928*1da177e4SLinus Torvalds 		max_blocks = sbinfo->max_blocks;
1929*1da177e4SLinus Torvalds 		max_inodes = sbinfo->max_inodes;
1930*1da177e4SLinus Torvalds 	}
1931*1da177e4SLinus Torvalds 	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
1932*1da177e4SLinus Torvalds 		return -EINVAL;
1933*1da177e4SLinus Torvalds 	/* Keep it simple: disallow limited <-> unlimited remount */
1934*1da177e4SLinus Torvalds 	if ((max_blocks || max_inodes) == !sbinfo)
1935*1da177e4SLinus Torvalds 		return -EINVAL;
1936*1da177e4SLinus Torvalds 	/* But allow the pointless unlimited -> unlimited remount */
1937*1da177e4SLinus Torvalds 	if (!sbinfo)
1938*1da177e4SLinus Torvalds 		return 0;
1939*1da177e4SLinus Torvalds 	return shmem_set_size(sbinfo, max_blocks, max_inodes);
1940*1da177e4SLinus Torvalds }
1941*1da177e4SLinus Torvalds #endif
1942*1da177e4SLinus Torvalds 
1943*1da177e4SLinus Torvalds static void shmem_put_super(struct super_block *sb)
1944*1da177e4SLinus Torvalds {
1945*1da177e4SLinus Torvalds 	kfree(sb->s_fs_info);
1946*1da177e4SLinus Torvalds 	sb->s_fs_info = NULL;
1947*1da177e4SLinus Torvalds }
1948*1da177e4SLinus Torvalds 
1949*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS_XATTR
1950*1da177e4SLinus Torvalds static struct xattr_handler *shmem_xattr_handlers[];
1951*1da177e4SLinus Torvalds #else
1952*1da177e4SLinus Torvalds #define shmem_xattr_handlers NULL
1953*1da177e4SLinus Torvalds #endif
1954*1da177e4SLinus Torvalds 
1955*1da177e4SLinus Torvalds static int shmem_fill_super(struct super_block *sb,
1956*1da177e4SLinus Torvalds 			    void *data, int silent)
1957*1da177e4SLinus Torvalds {
1958*1da177e4SLinus Torvalds 	struct inode *inode;
1959*1da177e4SLinus Torvalds 	struct dentry *root;
1960*1da177e4SLinus Torvalds 	int mode   = S_IRWXUGO | S_ISVTX;
1961*1da177e4SLinus Torvalds 	uid_t uid = current->fsuid;
1962*1da177e4SLinus Torvalds 	gid_t gid = current->fsgid;
1963*1da177e4SLinus Torvalds 	int err = -ENOMEM;
1964*1da177e4SLinus Torvalds 
1965*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
1966*1da177e4SLinus Torvalds 	unsigned long blocks = 0;
1967*1da177e4SLinus Torvalds 	unsigned long inodes = 0;
1968*1da177e4SLinus Torvalds 
1969*1da177e4SLinus Torvalds 	/*
1970*1da177e4SLinus Torvalds 	 * Per default we only allow half of the physical ram per
1971*1da177e4SLinus Torvalds 	 * tmpfs instance, limiting inodes to one per page of lowmem;
1972*1da177e4SLinus Torvalds 	 * but the internal instance is left unlimited.
1973*1da177e4SLinus Torvalds 	 */
1974*1da177e4SLinus Torvalds 	if (!(sb->s_flags & MS_NOUSER)) {
1975*1da177e4SLinus Torvalds 		blocks = totalram_pages / 2;
1976*1da177e4SLinus Torvalds 		inodes = totalram_pages - totalhigh_pages;
1977*1da177e4SLinus Torvalds 		if (inodes > blocks)
1978*1da177e4SLinus Torvalds 			inodes = blocks;
1979*1da177e4SLinus Torvalds 
1980*1da177e4SLinus Torvalds 		if (shmem_parse_options(data, &mode,
1981*1da177e4SLinus Torvalds 					&uid, &gid, &blocks, &inodes))
1982*1da177e4SLinus Torvalds 			return -EINVAL;
1983*1da177e4SLinus Torvalds 	}
1984*1da177e4SLinus Torvalds 
1985*1da177e4SLinus Torvalds 	if (blocks || inodes) {
1986*1da177e4SLinus Torvalds 		struct shmem_sb_info *sbinfo;
1987*1da177e4SLinus Torvalds 		sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
1988*1da177e4SLinus Torvalds 		if (!sbinfo)
1989*1da177e4SLinus Torvalds 			return -ENOMEM;
1990*1da177e4SLinus Torvalds 		sb->s_fs_info = sbinfo;
1991*1da177e4SLinus Torvalds 		spin_lock_init(&sbinfo->stat_lock);
1992*1da177e4SLinus Torvalds 		sbinfo->max_blocks = blocks;
1993*1da177e4SLinus Torvalds 		sbinfo->free_blocks = blocks;
1994*1da177e4SLinus Torvalds 		sbinfo->max_inodes = inodes;
1995*1da177e4SLinus Torvalds 		sbinfo->free_inodes = inodes;
1996*1da177e4SLinus Torvalds 	}
1997*1da177e4SLinus Torvalds 	sb->s_xattr = shmem_xattr_handlers;
1998*1da177e4SLinus Torvalds #else
1999*1da177e4SLinus Torvalds 	sb->s_flags |= MS_NOUSER;
2000*1da177e4SLinus Torvalds #endif
2001*1da177e4SLinus Torvalds 
2002*1da177e4SLinus Torvalds 	sb->s_maxbytes = SHMEM_MAX_BYTES;
2003*1da177e4SLinus Torvalds 	sb->s_blocksize = PAGE_CACHE_SIZE;
2004*1da177e4SLinus Torvalds 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2005*1da177e4SLinus Torvalds 	sb->s_magic = TMPFS_MAGIC;
2006*1da177e4SLinus Torvalds 	sb->s_op = &shmem_ops;
2007*1da177e4SLinus Torvalds 	inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2008*1da177e4SLinus Torvalds 	if (!inode)
2009*1da177e4SLinus Torvalds 		goto failed;
2010*1da177e4SLinus Torvalds 	inode->i_uid = uid;
2011*1da177e4SLinus Torvalds 	inode->i_gid = gid;
2012*1da177e4SLinus Torvalds 	root = d_alloc_root(inode);
2013*1da177e4SLinus Torvalds 	if (!root)
2014*1da177e4SLinus Torvalds 		goto failed_iput;
2015*1da177e4SLinus Torvalds 	sb->s_root = root;
2016*1da177e4SLinus Torvalds 	return 0;
2017*1da177e4SLinus Torvalds 
2018*1da177e4SLinus Torvalds failed_iput:
2019*1da177e4SLinus Torvalds 	iput(inode);
2020*1da177e4SLinus Torvalds failed:
2021*1da177e4SLinus Torvalds 	shmem_put_super(sb);
2022*1da177e4SLinus Torvalds 	return err;
2023*1da177e4SLinus Torvalds }
2024*1da177e4SLinus Torvalds 
2025*1da177e4SLinus Torvalds static kmem_cache_t *shmem_inode_cachep;
2026*1da177e4SLinus Torvalds 
2027*1da177e4SLinus Torvalds static struct inode *shmem_alloc_inode(struct super_block *sb)
2028*1da177e4SLinus Torvalds {
2029*1da177e4SLinus Torvalds 	struct shmem_inode_info *p;
2030*1da177e4SLinus Torvalds 	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
2031*1da177e4SLinus Torvalds 	if (!p)
2032*1da177e4SLinus Torvalds 		return NULL;
2033*1da177e4SLinus Torvalds 	return &p->vfs_inode;
2034*1da177e4SLinus Torvalds }
2035*1da177e4SLinus Torvalds 
2036*1da177e4SLinus Torvalds static void shmem_destroy_inode(struct inode *inode)
2037*1da177e4SLinus Torvalds {
2038*1da177e4SLinus Torvalds 	if ((inode->i_mode & S_IFMT) == S_IFREG) {
2039*1da177e4SLinus Torvalds 		/* only struct inode is valid if it's an inline symlink */
2040*1da177e4SLinus Torvalds 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2041*1da177e4SLinus Torvalds 	}
2042*1da177e4SLinus Torvalds 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2043*1da177e4SLinus Torvalds }
2044*1da177e4SLinus Torvalds 
2045*1da177e4SLinus Torvalds static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
2046*1da177e4SLinus Torvalds {
2047*1da177e4SLinus Torvalds 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2048*1da177e4SLinus Torvalds 
2049*1da177e4SLinus Torvalds 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2050*1da177e4SLinus Torvalds 	    SLAB_CTOR_CONSTRUCTOR) {
2051*1da177e4SLinus Torvalds 		inode_init_once(&p->vfs_inode);
2052*1da177e4SLinus Torvalds 	}
2053*1da177e4SLinus Torvalds }
2054*1da177e4SLinus Torvalds 
2055*1da177e4SLinus Torvalds static int init_inodecache(void)
2056*1da177e4SLinus Torvalds {
2057*1da177e4SLinus Torvalds 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2058*1da177e4SLinus Torvalds 				sizeof(struct shmem_inode_info),
2059*1da177e4SLinus Torvalds 				0, 0, init_once, NULL);
2060*1da177e4SLinus Torvalds 	if (shmem_inode_cachep == NULL)
2061*1da177e4SLinus Torvalds 		return -ENOMEM;
2062*1da177e4SLinus Torvalds 	return 0;
2063*1da177e4SLinus Torvalds }
2064*1da177e4SLinus Torvalds 
2065*1da177e4SLinus Torvalds static void destroy_inodecache(void)
2066*1da177e4SLinus Torvalds {
2067*1da177e4SLinus Torvalds 	if (kmem_cache_destroy(shmem_inode_cachep))
2068*1da177e4SLinus Torvalds 		printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
2069*1da177e4SLinus Torvalds }
2070*1da177e4SLinus Torvalds 
2071*1da177e4SLinus Torvalds static struct address_space_operations shmem_aops = {
2072*1da177e4SLinus Torvalds 	.writepage	= shmem_writepage,
2073*1da177e4SLinus Torvalds 	.set_page_dirty	= __set_page_dirty_nobuffers,
2074*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
2075*1da177e4SLinus Torvalds 	.prepare_write	= shmem_prepare_write,
2076*1da177e4SLinus Torvalds 	.commit_write	= simple_commit_write,
2077*1da177e4SLinus Torvalds #endif
2078*1da177e4SLinus Torvalds };
2079*1da177e4SLinus Torvalds 
2080*1da177e4SLinus Torvalds static struct file_operations shmem_file_operations = {
2081*1da177e4SLinus Torvalds 	.mmap		= shmem_mmap,
2082*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
2083*1da177e4SLinus Torvalds 	.llseek		= generic_file_llseek,
2084*1da177e4SLinus Torvalds 	.read		= shmem_file_read,
2085*1da177e4SLinus Torvalds 	.write		= shmem_file_write,
2086*1da177e4SLinus Torvalds 	.fsync		= simple_sync_file,
2087*1da177e4SLinus Torvalds 	.sendfile	= shmem_file_sendfile,
2088*1da177e4SLinus Torvalds #endif
2089*1da177e4SLinus Torvalds };
2090*1da177e4SLinus Torvalds 
2091*1da177e4SLinus Torvalds static struct inode_operations shmem_inode_operations = {
2092*1da177e4SLinus Torvalds 	.truncate	= shmem_truncate,
2093*1da177e4SLinus Torvalds 	.setattr	= shmem_notify_change,
2094*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS_XATTR
2095*1da177e4SLinus Torvalds 	.setxattr       = generic_setxattr,
2096*1da177e4SLinus Torvalds 	.getxattr       = generic_getxattr,
2097*1da177e4SLinus Torvalds 	.listxattr      = generic_listxattr,
2098*1da177e4SLinus Torvalds 	.removexattr    = generic_removexattr,
2099*1da177e4SLinus Torvalds #endif
2100*1da177e4SLinus Torvalds };
2101*1da177e4SLinus Torvalds 
2102*1da177e4SLinus Torvalds static struct inode_operations shmem_dir_inode_operations = {
2103*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
2104*1da177e4SLinus Torvalds 	.create		= shmem_create,
2105*1da177e4SLinus Torvalds 	.lookup		= simple_lookup,
2106*1da177e4SLinus Torvalds 	.link		= shmem_link,
2107*1da177e4SLinus Torvalds 	.unlink		= shmem_unlink,
2108*1da177e4SLinus Torvalds 	.symlink	= shmem_symlink,
2109*1da177e4SLinus Torvalds 	.mkdir		= shmem_mkdir,
2110*1da177e4SLinus Torvalds 	.rmdir		= shmem_rmdir,
2111*1da177e4SLinus Torvalds 	.mknod		= shmem_mknod,
2112*1da177e4SLinus Torvalds 	.rename		= shmem_rename,
2113*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS_XATTR
2114*1da177e4SLinus Torvalds 	.setxattr       = generic_setxattr,
2115*1da177e4SLinus Torvalds 	.getxattr       = generic_getxattr,
2116*1da177e4SLinus Torvalds 	.listxattr      = generic_listxattr,
2117*1da177e4SLinus Torvalds 	.removexattr    = generic_removexattr,
2118*1da177e4SLinus Torvalds #endif
2119*1da177e4SLinus Torvalds #endif
2120*1da177e4SLinus Torvalds };
2121*1da177e4SLinus Torvalds 
2122*1da177e4SLinus Torvalds static struct inode_operations shmem_special_inode_operations = {
2123*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS_XATTR
2124*1da177e4SLinus Torvalds 	.setxattr	= generic_setxattr,
2125*1da177e4SLinus Torvalds 	.getxattr	= generic_getxattr,
2126*1da177e4SLinus Torvalds 	.listxattr	= generic_listxattr,
2127*1da177e4SLinus Torvalds 	.removexattr	= generic_removexattr,
2128*1da177e4SLinus Torvalds #endif
2129*1da177e4SLinus Torvalds };
2130*1da177e4SLinus Torvalds 
2131*1da177e4SLinus Torvalds static struct super_operations shmem_ops = {
2132*1da177e4SLinus Torvalds 	.alloc_inode	= shmem_alloc_inode,
2133*1da177e4SLinus Torvalds 	.destroy_inode	= shmem_destroy_inode,
2134*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
2135*1da177e4SLinus Torvalds 	.statfs		= shmem_statfs,
2136*1da177e4SLinus Torvalds 	.remount_fs	= shmem_remount_fs,
2137*1da177e4SLinus Torvalds #endif
2138*1da177e4SLinus Torvalds 	.delete_inode	= shmem_delete_inode,
2139*1da177e4SLinus Torvalds 	.drop_inode	= generic_delete_inode,
2140*1da177e4SLinus Torvalds 	.put_super	= shmem_put_super,
2141*1da177e4SLinus Torvalds };
2142*1da177e4SLinus Torvalds 
2143*1da177e4SLinus Torvalds static struct vm_operations_struct shmem_vm_ops = {
2144*1da177e4SLinus Torvalds 	.nopage		= shmem_nopage,
2145*1da177e4SLinus Torvalds 	.populate	= shmem_populate,
2146*1da177e4SLinus Torvalds #ifdef CONFIG_NUMA
2147*1da177e4SLinus Torvalds 	.set_policy     = shmem_set_policy,
2148*1da177e4SLinus Torvalds 	.get_policy     = shmem_get_policy,
2149*1da177e4SLinus Torvalds #endif
2150*1da177e4SLinus Torvalds };
2151*1da177e4SLinus Torvalds 
2152*1da177e4SLinus Torvalds 
2153*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS_SECURITY
2154*1da177e4SLinus Torvalds 
2155*1da177e4SLinus Torvalds static size_t shmem_xattr_security_list(struct inode *inode, char *list, size_t list_len,
2156*1da177e4SLinus Torvalds 					const char *name, size_t name_len)
2157*1da177e4SLinus Torvalds {
2158*1da177e4SLinus Torvalds 	return security_inode_listsecurity(inode, list, list_len);
2159*1da177e4SLinus Torvalds }
2160*1da177e4SLinus Torvalds 
2161*1da177e4SLinus Torvalds static int shmem_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size)
2162*1da177e4SLinus Torvalds {
2163*1da177e4SLinus Torvalds 	if (strcmp(name, "") == 0)
2164*1da177e4SLinus Torvalds 		return -EINVAL;
2165*1da177e4SLinus Torvalds 	return security_inode_getsecurity(inode, name, buffer, size);
2166*1da177e4SLinus Torvalds }
2167*1da177e4SLinus Torvalds 
2168*1da177e4SLinus Torvalds static int shmem_xattr_security_set(struct inode *inode, const char *name, const void *value, size_t size, int flags)
2169*1da177e4SLinus Torvalds {
2170*1da177e4SLinus Torvalds 	if (strcmp(name, "") == 0)
2171*1da177e4SLinus Torvalds 		return -EINVAL;
2172*1da177e4SLinus Torvalds 	return security_inode_setsecurity(inode, name, value, size, flags);
2173*1da177e4SLinus Torvalds }
2174*1da177e4SLinus Torvalds 
2175*1da177e4SLinus Torvalds static struct xattr_handler shmem_xattr_security_handler = {
2176*1da177e4SLinus Torvalds 	.prefix	= XATTR_SECURITY_PREFIX,
2177*1da177e4SLinus Torvalds 	.list	= shmem_xattr_security_list,
2178*1da177e4SLinus Torvalds 	.get	= shmem_xattr_security_get,
2179*1da177e4SLinus Torvalds 	.set	= shmem_xattr_security_set,
2180*1da177e4SLinus Torvalds };
2181*1da177e4SLinus Torvalds 
2182*1da177e4SLinus Torvalds #endif	/* CONFIG_TMPFS_SECURITY */
2183*1da177e4SLinus Torvalds 
2184*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS_XATTR
2185*1da177e4SLinus Torvalds 
2186*1da177e4SLinus Torvalds static struct xattr_handler *shmem_xattr_handlers[] = {
2187*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS_SECURITY
2188*1da177e4SLinus Torvalds 	&shmem_xattr_security_handler,
2189*1da177e4SLinus Torvalds #endif
2190*1da177e4SLinus Torvalds 	NULL
2191*1da177e4SLinus Torvalds };
2192*1da177e4SLinus Torvalds 
2193*1da177e4SLinus Torvalds #endif	/* CONFIG_TMPFS_XATTR */
2194*1da177e4SLinus Torvalds 
2195*1da177e4SLinus Torvalds static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
2196*1da177e4SLinus Torvalds 	int flags, const char *dev_name, void *data)
2197*1da177e4SLinus Torvalds {
2198*1da177e4SLinus Torvalds 	return get_sb_nodev(fs_type, flags, data, shmem_fill_super);
2199*1da177e4SLinus Torvalds }
2200*1da177e4SLinus Torvalds 
2201*1da177e4SLinus Torvalds static struct file_system_type tmpfs_fs_type = {
2202*1da177e4SLinus Torvalds 	.owner		= THIS_MODULE,
2203*1da177e4SLinus Torvalds 	.name		= "tmpfs",
2204*1da177e4SLinus Torvalds 	.get_sb		= shmem_get_sb,
2205*1da177e4SLinus Torvalds 	.kill_sb	= kill_litter_super,
2206*1da177e4SLinus Torvalds };
2207*1da177e4SLinus Torvalds static struct vfsmount *shm_mnt;
2208*1da177e4SLinus Torvalds 
2209*1da177e4SLinus Torvalds static int __init init_tmpfs(void)
2210*1da177e4SLinus Torvalds {
2211*1da177e4SLinus Torvalds 	int error;
2212*1da177e4SLinus Torvalds 
2213*1da177e4SLinus Torvalds 	error = init_inodecache();
2214*1da177e4SLinus Torvalds 	if (error)
2215*1da177e4SLinus Torvalds 		goto out3;
2216*1da177e4SLinus Torvalds 
2217*1da177e4SLinus Torvalds 	error = register_filesystem(&tmpfs_fs_type);
2218*1da177e4SLinus Torvalds 	if (error) {
2219*1da177e4SLinus Torvalds 		printk(KERN_ERR "Could not register tmpfs\n");
2220*1da177e4SLinus Torvalds 		goto out2;
2221*1da177e4SLinus Torvalds 	}
2222*1da177e4SLinus Torvalds #ifdef CONFIG_TMPFS
2223*1da177e4SLinus Torvalds 	devfs_mk_dir("shm");
2224*1da177e4SLinus Torvalds #endif
2225*1da177e4SLinus Torvalds 	shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
2226*1da177e4SLinus Torvalds 				tmpfs_fs_type.name, NULL);
2227*1da177e4SLinus Torvalds 	if (IS_ERR(shm_mnt)) {
2228*1da177e4SLinus Torvalds 		error = PTR_ERR(shm_mnt);
2229*1da177e4SLinus Torvalds 		printk(KERN_ERR "Could not kern_mount tmpfs\n");
2230*1da177e4SLinus Torvalds 		goto out1;
2231*1da177e4SLinus Torvalds 	}
2232*1da177e4SLinus Torvalds 	return 0;
2233*1da177e4SLinus Torvalds 
2234*1da177e4SLinus Torvalds out1:
2235*1da177e4SLinus Torvalds 	unregister_filesystem(&tmpfs_fs_type);
2236*1da177e4SLinus Torvalds out2:
2237*1da177e4SLinus Torvalds 	destroy_inodecache();
2238*1da177e4SLinus Torvalds out3:
2239*1da177e4SLinus Torvalds 	shm_mnt = ERR_PTR(error);
2240*1da177e4SLinus Torvalds 	return error;
2241*1da177e4SLinus Torvalds }
2242*1da177e4SLinus Torvalds module_init(init_tmpfs)
2243*1da177e4SLinus Torvalds 
2244*1da177e4SLinus Torvalds /*
2245*1da177e4SLinus Torvalds  * shmem_file_setup - get an unlinked file living in tmpfs
2246*1da177e4SLinus Torvalds  *
2247*1da177e4SLinus Torvalds  * @name: name for dentry (to be seen in /proc/<pid>/maps
2248*1da177e4SLinus Torvalds  * @size: size to be set for the file
2249*1da177e4SLinus Torvalds  *
2250*1da177e4SLinus Torvalds  */
2251*1da177e4SLinus Torvalds struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2252*1da177e4SLinus Torvalds {
2253*1da177e4SLinus Torvalds 	int error;
2254*1da177e4SLinus Torvalds 	struct file *file;
2255*1da177e4SLinus Torvalds 	struct inode *inode;
2256*1da177e4SLinus Torvalds 	struct dentry *dentry, *root;
2257*1da177e4SLinus Torvalds 	struct qstr this;
2258*1da177e4SLinus Torvalds 
2259*1da177e4SLinus Torvalds 	if (IS_ERR(shm_mnt))
2260*1da177e4SLinus Torvalds 		return (void *)shm_mnt;
2261*1da177e4SLinus Torvalds 
2262*1da177e4SLinus Torvalds 	if (size < 0 || size > SHMEM_MAX_BYTES)
2263*1da177e4SLinus Torvalds 		return ERR_PTR(-EINVAL);
2264*1da177e4SLinus Torvalds 
2265*1da177e4SLinus Torvalds 	if (shmem_acct_size(flags, size))
2266*1da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
2267*1da177e4SLinus Torvalds 
2268*1da177e4SLinus Torvalds 	error = -ENOMEM;
2269*1da177e4SLinus Torvalds 	this.name = name;
2270*1da177e4SLinus Torvalds 	this.len = strlen(name);
2271*1da177e4SLinus Torvalds 	this.hash = 0; /* will go */
2272*1da177e4SLinus Torvalds 	root = shm_mnt->mnt_root;
2273*1da177e4SLinus Torvalds 	dentry = d_alloc(root, &this);
2274*1da177e4SLinus Torvalds 	if (!dentry)
2275*1da177e4SLinus Torvalds 		goto put_memory;
2276*1da177e4SLinus Torvalds 
2277*1da177e4SLinus Torvalds 	error = -ENFILE;
2278*1da177e4SLinus Torvalds 	file = get_empty_filp();
2279*1da177e4SLinus Torvalds 	if (!file)
2280*1da177e4SLinus Torvalds 		goto put_dentry;
2281*1da177e4SLinus Torvalds 
2282*1da177e4SLinus Torvalds 	error = -ENOSPC;
2283*1da177e4SLinus Torvalds 	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
2284*1da177e4SLinus Torvalds 	if (!inode)
2285*1da177e4SLinus Torvalds 		goto close_file;
2286*1da177e4SLinus Torvalds 
2287*1da177e4SLinus Torvalds 	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2288*1da177e4SLinus Torvalds 	d_instantiate(dentry, inode);
2289*1da177e4SLinus Torvalds 	inode->i_size = size;
2290*1da177e4SLinus Torvalds 	inode->i_nlink = 0;	/* It is unlinked */
2291*1da177e4SLinus Torvalds 	file->f_vfsmnt = mntget(shm_mnt);
2292*1da177e4SLinus Torvalds 	file->f_dentry = dentry;
2293*1da177e4SLinus Torvalds 	file->f_mapping = inode->i_mapping;
2294*1da177e4SLinus Torvalds 	file->f_op = &shmem_file_operations;
2295*1da177e4SLinus Torvalds 	file->f_mode = FMODE_WRITE | FMODE_READ;
2296*1da177e4SLinus Torvalds 	return file;
2297*1da177e4SLinus Torvalds 
2298*1da177e4SLinus Torvalds close_file:
2299*1da177e4SLinus Torvalds 	put_filp(file);
2300*1da177e4SLinus Torvalds put_dentry:
2301*1da177e4SLinus Torvalds 	dput(dentry);
2302*1da177e4SLinus Torvalds put_memory:
2303*1da177e4SLinus Torvalds 	shmem_unacct_size(flags, size);
2304*1da177e4SLinus Torvalds 	return ERR_PTR(error);
2305*1da177e4SLinus Torvalds }
2306*1da177e4SLinus Torvalds 
2307*1da177e4SLinus Torvalds /*
2308*1da177e4SLinus Torvalds  * shmem_zero_setup - setup a shared anonymous mapping
2309*1da177e4SLinus Torvalds  *
2310*1da177e4SLinus Torvalds  * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
2311*1da177e4SLinus Torvalds  */
2312*1da177e4SLinus Torvalds int shmem_zero_setup(struct vm_area_struct *vma)
2313*1da177e4SLinus Torvalds {
2314*1da177e4SLinus Torvalds 	struct file *file;
2315*1da177e4SLinus Torvalds 	loff_t size = vma->vm_end - vma->vm_start;
2316*1da177e4SLinus Torvalds 
2317*1da177e4SLinus Torvalds 	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2318*1da177e4SLinus Torvalds 	if (IS_ERR(file))
2319*1da177e4SLinus Torvalds 		return PTR_ERR(file);
2320*1da177e4SLinus Torvalds 
2321*1da177e4SLinus Torvalds 	if (vma->vm_file)
2322*1da177e4SLinus Torvalds 		fput(vma->vm_file);
2323*1da177e4SLinus Torvalds 	vma->vm_file = file;
2324*1da177e4SLinus Torvalds 	vma->vm_ops = &shmem_vm_ops;
2325*1da177e4SLinus Torvalds 	return 0;
2326*1da177e4SLinus Torvalds }
2327