xref: /openbmc/linux/mm/shmem.c (revision 1b1b32f2c6f6bb32535d2da62075b51c980880eb)
1 /*
2  * Resizable virtual memory filesystem for Linux.
3  *
4  * Copyright (C) 2000 Linus Torvalds.
5  *		 2000 Transmeta Corp.
6  *		 2000-2001 Christoph Rohland
7  *		 2000-2001 SAP AG
8  *		 2002 Red Hat Inc.
9  * Copyright (C) 2002-2005 Hugh Dickins.
10  * Copyright (C) 2002-2005 VERITAS Software Corporation.
11  * Copyright (C) 2004 Andi Kleen, SuSE Labs
12  *
13  * Extended attribute support for tmpfs:
14  * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15  * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16  *
17  * This file is released under the GPL.
18  */
19 
20 /*
21  * This virtual memory filesystem is heavily based on the ramfs. It
22  * extends ramfs by the ability to use swap and honor resource limits
23  * which makes it a completely usable filesystem.
24  */
25 
26 #include <linux/module.h>
27 #include <linux/init.h>
28 #include <linux/fs.h>
29 #include <linux/xattr.h>
30 #include <linux/exportfs.h>
31 #include <linux/generic_acl.h>
32 #include <linux/mm.h>
33 #include <linux/mman.h>
34 #include <linux/file.h>
35 #include <linux/swap.h>
36 #include <linux/pagemap.h>
37 #include <linux/string.h>
38 #include <linux/slab.h>
39 #include <linux/backing-dev.h>
40 #include <linux/shmem_fs.h>
41 #include <linux/mount.h>
42 #include <linux/writeback.h>
43 #include <linux/vfs.h>
44 #include <linux/blkdev.h>
45 #include <linux/security.h>
46 #include <linux/swapops.h>
47 #include <linux/mempolicy.h>
48 #include <linux/namei.h>
49 #include <linux/ctype.h>
50 #include <linux/migrate.h>
51 #include <linux/highmem.h>
52 
53 #include <asm/uaccess.h>
54 #include <asm/div64.h>
55 #include <asm/pgtable.h>
56 
57 /* This magic number is used in glibc for posix shared memory */
58 #define TMPFS_MAGIC	0x01021994
59 
60 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
61 #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
62 #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
63 
64 #define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
65 #define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
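/*
 * A rough worked example of the limits above, assuming 4K PAGE_CACHE_SIZE,
 * 8-byte unsigned long and SHMEM_NR_DIRECT == 16 (all config/arch
 * dependent): ENTRIES_PER_PAGE = 4096/8 = 512, ENTRIES_PER_PAGEPAGE =
 * 512*512 = 262144, so SHMEM_MAX_INDEX = 16 + 131072*513 = 67239952
 * pages, and SHMEM_MAX_BYTES comes to a little over 256GB per file.
 */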
66 
67 #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
68 
69 /* info->flags needs VM_flags to handle pagein/truncate races efficiently */
70 #define SHMEM_PAGEIN	 VM_READ
71 #define SHMEM_TRUNCATE	 VM_WRITE
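/*
 * info->flags otherwise carries VM_ flags such as VM_ACCOUNT (see
 * shmem_acct_block) and VM_LOCKED (see shmem_lock); SHMEM_PAGEIN and
 * SHMEM_TRUNCATE reuse the VM_READ/VM_WRITE bit values within it.
 */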
72 
73 /* Definition to limit shmem_truncate's steps between cond_rescheds */
74 #define LATENCY_LIMIT	 64
75 
76 /* Pretend that each entry is of this size in directory's i_size */
77 #define BOGO_DIRENT_SIZE 20
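/*
 * shmem_mknod, shmem_link and shmem_symlink below add BOGO_DIRENT_SIZE to
 * the parent directory's i_size for each new entry, shmem_unlink and
 * shmem_rename subtract it, and shmem_get_inode starts directories at
 * 2 * BOGO_DIRENT_SIZE.
 */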
78 
79 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
80 enum sgp_type {
81 	SGP_READ,	/* don't exceed i_size, don't allocate page */
82 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
83 	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
84 	SGP_WRITE,	/* may exceed i_size, may allocate page */
85 };
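/*
 * Typical callers in this file: shmem_fault and shmem_readpage ask for
 * SGP_CACHE, shmem_write_begin and shmem_symlink for SGP_WRITE,
 * shmem_follow_link and shmem_notify_change for SGP_READ, and
 * do_shmem_file_read for SGP_READ, or SGP_DIRTY when reading on behalf
 * of a stacking filesystem.
 */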
86 
87 static int shmem_getpage(struct inode *inode, unsigned long idx,
88 			 struct page **pagep, enum sgp_type sgp, int *type);
89 
90 static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
91 {
92 	/*
93 	 * The above definition of ENTRIES_PER_PAGE, and the use of
94 	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
95 	 * might be reconsidered if it ever diverges from PAGE_SIZE.
96 	 *
97 	 * Mobility flags are masked out as swap vectors cannot move
98 	 */
99 	return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
100 				PAGE_CACHE_SHIFT-PAGE_SHIFT);
101 }
102 
103 static inline void shmem_dir_free(struct page *page)
104 {
105 	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
106 }
107 
108 static struct page **shmem_dir_map(struct page *page)
109 {
110 	return (struct page **)kmap_atomic(page, KM_USER0);
111 }
112 
113 static inline void shmem_dir_unmap(struct page **dir)
114 {
115 	kunmap_atomic(dir, KM_USER0);
116 }
117 
118 static swp_entry_t *shmem_swp_map(struct page *page)
119 {
120 	return (swp_entry_t *)kmap_atomic(page, KM_USER1);
121 }
122 
123 static inline void shmem_swp_balance_unmap(void)
124 {
125 	/*
126 	 * When passing a pointer to an i_direct entry, to code which
127 	 * also handles indirect entries and so will shmem_swp_unmap,
128 	 * we must arrange for the preempt count to remain in balance.
129 	 * What kmap_atomic of a lowmem page does depends on config
130 	 * and architecture, so pretend to kmap_atomic some lowmem page.
131 	 */
132 	(void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
133 }
134 
135 static inline void shmem_swp_unmap(swp_entry_t *entry)
136 {
137 	kunmap_atomic(entry, KM_USER1);
138 }
139 
140 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
141 {
142 	return sb->s_fs_info;
143 }
144 
145 /*
146  * shmem_file_setup pre-accounts the whole fixed size of a VM object,
147  * for shared memory and for shared anonymous (/dev/zero) mappings
148  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
149  * consistent with the pre-accounting of private mappings ...
150  */
151 static inline int shmem_acct_size(unsigned long flags, loff_t size)
152 {
153 	return (flags & VM_ACCOUNT)?
154 		security_vm_enough_memory(VM_ACCT(size)): 0;
155 }
156 
157 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
158 {
159 	if (flags & VM_ACCOUNT)
160 		vm_unacct_memory(VM_ACCT(size));
161 }
162 
163 /*
164  * ... whereas tmpfs objects are accounted incrementally as
165  * pages are allocated, in order to allow huge sparse files.
166  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
167  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
168  */
169 static inline int shmem_acct_block(unsigned long flags)
170 {
171 	return (flags & VM_ACCOUNT)?
172 		0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
173 }
174 
175 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
176 {
177 	if (!(flags & VM_ACCOUNT))
178 		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
179 }
180 
181 static const struct super_operations shmem_ops;
182 static const struct address_space_operations shmem_aops;
183 static const struct file_operations shmem_file_operations;
184 static const struct inode_operations shmem_inode_operations;
185 static const struct inode_operations shmem_dir_inode_operations;
186 static const struct inode_operations shmem_special_inode_operations;
187 static struct vm_operations_struct shmem_vm_ops;
188 
189 static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
190 	.ra_pages	= 0,	/* No readahead */
191 	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
192 	.unplug_io_fn	= default_unplug_io_fn,
193 };
194 
195 static LIST_HEAD(shmem_swaplist);
196 static DEFINE_MUTEX(shmem_swaplist_mutex);
197 
198 static void shmem_free_blocks(struct inode *inode, long pages)
199 {
200 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
201 	if (sbinfo->max_blocks) {
202 		spin_lock(&sbinfo->stat_lock);
203 		sbinfo->free_blocks += pages;
204 		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
205 		spin_unlock(&sbinfo->stat_lock);
206 	}
207 }
208 
209 static int shmem_reserve_inode(struct super_block *sb)
210 {
211 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
212 	if (sbinfo->max_inodes) {
213 		spin_lock(&sbinfo->stat_lock);
214 		if (!sbinfo->free_inodes) {
215 			spin_unlock(&sbinfo->stat_lock);
216 			return -ENOSPC;
217 		}
218 		sbinfo->free_inodes--;
219 		spin_unlock(&sbinfo->stat_lock);
220 	}
221 	return 0;
222 }
223 
224 static void shmem_free_inode(struct super_block *sb)
225 {
226 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
227 	if (sbinfo->max_inodes) {
228 		spin_lock(&sbinfo->stat_lock);
229 		sbinfo->free_inodes++;
230 		spin_unlock(&sbinfo->stat_lock);
231 	}
232 }
233 
234 /*
235  * shmem_recalc_inode - recalculate the size of an inode
236  *
237  * @inode: inode to recalc
238  *
239  * We have to calculate the free blocks since the mm can drop
240  * undirtied hole pages behind our back.
241  *
242  * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
243  * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
244  *
245  * It has to be called with the spinlock held.
246  */
247 static void shmem_recalc_inode(struct inode *inode)
248 {
249 	struct shmem_inode_info *info = SHMEM_I(inode);
250 	long freed;
251 
252 	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
253 	if (freed > 0) {
254 		info->alloced -= freed;
255 		shmem_unacct_blocks(info->flags, freed);
256 		shmem_free_blocks(inode, freed);
257 	}
258 }
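/*
 * For example: if info->alloced is 10 while 3 pages remain in the page
 * cache and 2 are out on swap, then freed = 10 - 2 - 3 = 5 undirtied
 * hole pages which the mm reclaimed behind our back, and which we now
 * unaccount.
 */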
259 
260 /*
261  * shmem_swp_entry - find the swap vector position in the info structure
262  *
263  * @info:  info structure for the inode
264  * @index: index of the page to find
265  * @page:  optional page to add to the structure. Has to be preset to
266  *         all zeros
267  *
268  * If there is no space allocated yet it will return NULL when
269  * page is NULL, else it will use the page for the needed block,
270  * setting it to NULL on return to indicate that it has been used.
271  *
272  * The swap vector is organized the following way:
273  *
274  * There are SHMEM_NR_DIRECT entries directly stored in the
275  * shmem_inode_info structure. So small files do not need an additional
276  * allocation.
277  *
278  * For pages with index > SHMEM_NR_DIRECT there is the pointer
279  * i_indirect which points to a page which holds in the first half
280  * doubly indirect blocks, in the second half triple indirect blocks:
281  *
282  * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
283  * following layout (for SHMEM_NR_DIRECT == 16):
284  *
285  * i_indirect -> dir --> 16-19
286  * 	      |	     +-> 20-23
287  * 	      |
288  * 	      +-->dir2 --> 24-27
289  * 	      |	       +-> 28-31
290  * 	      |	       +-> 32-35
291  * 	      |	       +-> 36-39
292  * 	      |
293  * 	      +-->dir3 --> 40-43
294  * 	       	       +-> 44-47
295  * 	      	       +-> 48-51
296  * 	      	       +-> 52-55
297  */
298 static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
299 {
300 	unsigned long offset;
301 	struct page **dir;
302 	struct page *subdir;
303 
304 	if (index < SHMEM_NR_DIRECT) {
305 		shmem_swp_balance_unmap();
306 		return info->i_direct+index;
307 	}
308 	if (!info->i_indirect) {
309 		if (page) {
310 			info->i_indirect = *page;
311 			*page = NULL;
312 		}
313 		return NULL;			/* need another page */
314 	}
315 
316 	index -= SHMEM_NR_DIRECT;
317 	offset = index % ENTRIES_PER_PAGE;
318 	index /= ENTRIES_PER_PAGE;
319 	dir = shmem_dir_map(info->i_indirect);
320 
321 	if (index >= ENTRIES_PER_PAGE/2) {
322 		index -= ENTRIES_PER_PAGE/2;
323 		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
324 		index %= ENTRIES_PER_PAGE;
325 		subdir = *dir;
326 		if (!subdir) {
327 			if (page) {
328 				*dir = *page;
329 				*page = NULL;
330 			}
331 			shmem_dir_unmap(dir);
332 			return NULL;		/* need another page */
333 		}
334 		shmem_dir_unmap(dir);
335 		dir = shmem_dir_map(subdir);
336 	}
337 
338 	dir += index;
339 	subdir = *dir;
340 	if (!subdir) {
341 		if (!page || !(subdir = *page)) {
342 			shmem_dir_unmap(dir);
343 			return NULL;		/* need a page */
344 		}
345 		*dir = subdir;
346 		*page = NULL;
347 	}
348 	shmem_dir_unmap(dir);
349 	return shmem_swp_map(subdir) + offset;
350 }
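/*
 * Tracing the artificial ENTRIES_PER_PAGE == 4, SHMEM_NR_DIRECT == 16
 * layout sketched above through this function, for index 30: index
 * becomes 30 - 16 = 14, offset = 14 % 4 = 2, index = 14 / 4 = 3; since
 * 3 >= ENTRIES_PER_PAGE/2, dir advances to the dir2 slot of i_indirect
 * and index becomes 1; dir2's entry 1 is the swap-vector page covering
 * 28-31, and offset 2 within it is page 30, matching the diagram.
 */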
351 
352 static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
353 {
354 	long incdec = value? 1: -1;
355 
356 	entry->val = value;
357 	info->swapped += incdec;
358 	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
359 		struct page *page = kmap_atomic_to_page(entry);
360 		set_page_private(page, page_private(page) + incdec);
361 	}
362 }
363 
364 /*
365  * shmem_swp_alloc - get the position of the swap entry for the page.
366  *                   If it does not exist allocate the entry.
367  *
368  * @info:	info structure for the inode
369  * @index:	index of the page to find
370  * @sgp:	check and recheck i_size? skip allocation?
371  */
372 static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
373 {
374 	struct inode *inode = &info->vfs_inode;
375 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
376 	struct page *page = NULL;
377 	swp_entry_t *entry;
378 
379 	if (sgp != SGP_WRITE &&
380 	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
381 		return ERR_PTR(-EINVAL);
382 
383 	while (!(entry = shmem_swp_entry(info, index, &page))) {
384 		if (sgp == SGP_READ)
385 			return shmem_swp_map(ZERO_PAGE(0));
386 		/*
387 		 * Test free_blocks against 1 not 0, since we have 1 data
388 		 * page (and perhaps indirect index pages) yet to allocate:
389 		 * a waste to allocate index if we cannot allocate data.
390 		 */
391 		if (sbinfo->max_blocks) {
392 			spin_lock(&sbinfo->stat_lock);
393 			if (sbinfo->free_blocks <= 1) {
394 				spin_unlock(&sbinfo->stat_lock);
395 				return ERR_PTR(-ENOSPC);
396 			}
397 			sbinfo->free_blocks--;
398 			inode->i_blocks += BLOCKS_PER_PAGE;
399 			spin_unlock(&sbinfo->stat_lock);
400 		}
401 
402 		spin_unlock(&info->lock);
403 		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
404 		if (page)
405 			set_page_private(page, 0);
406 		spin_lock(&info->lock);
407 
408 		if (!page) {
409 			shmem_free_blocks(inode, 1);
410 			return ERR_PTR(-ENOMEM);
411 		}
412 		if (sgp != SGP_WRITE &&
413 		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
414 			entry = ERR_PTR(-EINVAL);
415 			break;
416 		}
417 		if (info->next_index <= index)
418 			info->next_index = index + 1;
419 	}
420 	if (page) {
421 		/* another task gave its page, or truncated the file */
422 		shmem_free_blocks(inode, 1);
423 		shmem_dir_free(page);
424 	}
425 	if (info->next_index <= index && !IS_ERR(entry))
426 		info->next_index = index + 1;
427 	return entry;
428 }
429 
430 /*
431  * shmem_free_swp - free some swap entries in a directory
432  *
433  * @dir:        pointer to the directory
434  * @edir:       pointer after last entry of the directory
435  * @punch_lock: pointer to spinlock when needed for the holepunch case
436  */
437 static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
438 						spinlock_t *punch_lock)
439 {
440 	spinlock_t *punch_unlock = NULL;
441 	swp_entry_t *ptr;
442 	int freed = 0;
443 
444 	for (ptr = dir; ptr < edir; ptr++) {
445 		if (ptr->val) {
446 			if (unlikely(punch_lock)) {
447 				punch_unlock = punch_lock;
448 				punch_lock = NULL;
449 				spin_lock(punch_unlock);
450 				if (!ptr->val)
451 					continue;
452 			}
453 			free_swap_and_cache(*ptr);
454 			*ptr = (swp_entry_t){0};
455 			freed++;
456 		}
457 	}
458 	if (punch_unlock)
459 		spin_unlock(punch_unlock);
460 	return freed;
461 }
462 
463 static int shmem_map_and_free_swp(struct page *subdir, int offset,
464 		int limit, struct page ***dir, spinlock_t *punch_lock)
465 {
466 	swp_entry_t *ptr;
467 	int freed = 0;
468 
469 	ptr = shmem_swp_map(subdir);
470 	for (; offset < limit; offset += LATENCY_LIMIT) {
471 		int size = limit - offset;
472 		if (size > LATENCY_LIMIT)
473 			size = LATENCY_LIMIT;
474 		freed += shmem_free_swp(ptr+offset, ptr+offset+size,
475 							punch_lock);
476 		if (need_resched()) {
477 			shmem_swp_unmap(ptr);
478 			if (*dir) {
479 				shmem_dir_unmap(*dir);
480 				*dir = NULL;
481 			}
482 			cond_resched();
483 			ptr = shmem_swp_map(subdir);
484 		}
485 	}
486 	shmem_swp_unmap(ptr);
487 	return freed;
488 }
489 
490 static void shmem_free_pages(struct list_head *next)
491 {
492 	struct page *page;
493 	int freed = 0;
494 
495 	do {
496 		page = container_of(next, struct page, lru);
497 		next = next->next;
498 		shmem_dir_free(page);
499 		freed++;
500 		if (freed >= LATENCY_LIMIT) {
501 			cond_resched();
502 			freed = 0;
503 		}
504 	} while (next);
505 }
506 
507 static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
508 {
509 	struct shmem_inode_info *info = SHMEM_I(inode);
510 	unsigned long idx;
511 	unsigned long size;
512 	unsigned long limit;
513 	unsigned long stage;
514 	unsigned long diroff;
515 	struct page **dir;
516 	struct page *topdir;
517 	struct page *middir;
518 	struct page *subdir;
519 	swp_entry_t *ptr;
520 	LIST_HEAD(pages_to_free);
521 	long nr_pages_to_free = 0;
522 	long nr_swaps_freed = 0;
523 	int offset;
524 	int freed;
525 	int punch_hole;
526 	spinlock_t *needs_lock;
527 	spinlock_t *punch_lock;
528 	unsigned long upper_limit;
529 
530 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
531 	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
532 	if (idx >= info->next_index)
533 		return;
534 
535 	spin_lock(&info->lock);
536 	info->flags |= SHMEM_TRUNCATE;
537 	if (likely(end == (loff_t) -1)) {
538 		limit = info->next_index;
539 		upper_limit = SHMEM_MAX_INDEX;
540 		info->next_index = idx;
541 		needs_lock = NULL;
542 		punch_hole = 0;
543 	} else {
544 		if (end + 1 >= inode->i_size) {	/* we may free a little more */
545 			limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
546 							PAGE_CACHE_SHIFT;
547 			upper_limit = SHMEM_MAX_INDEX;
548 		} else {
549 			limit = (end + 1) >> PAGE_CACHE_SHIFT;
550 			upper_limit = limit;
551 		}
552 		needs_lock = &info->lock;
553 		punch_hole = 1;
554 	}
555 
556 	topdir = info->i_indirect;
557 	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
558 		info->i_indirect = NULL;
559 		nr_pages_to_free++;
560 		list_add(&topdir->lru, &pages_to_free);
561 	}
562 	spin_unlock(&info->lock);
563 
564 	if (info->swapped && idx < SHMEM_NR_DIRECT) {
565 		ptr = info->i_direct;
566 		size = limit;
567 		if (size > SHMEM_NR_DIRECT)
568 			size = SHMEM_NR_DIRECT;
569 		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
570 	}
571 
572 	/*
573 	 * If there are no indirect blocks or we are punching a hole
574 	 * below indirect blocks, nothing to be done.
575 	 */
576 	if (!topdir || limit <= SHMEM_NR_DIRECT)
577 		goto done2;
578 
579 	/*
580 	 * The truncation case has already dropped info->lock, and we're safe
581 	 * because i_size and next_index have already been lowered, preventing
582 	 * access beyond.  But in the punch_hole case, we still need to take
583 	 * the lock when updating the swap directory, because there might be
584 	 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
585 	 * shmem_writepage.  However, whenever we find we can remove a whole
586 	 * directory page (not at the misaligned start or end of the range),
587 	 * we first NULLify its pointer in the level above, and then have no
588 	 * need to take the lock when updating its contents: needs_lock and
589 	 * punch_lock (either pointing to info->lock or NULL) manage this.
590 	 */
591 
592 	upper_limit -= SHMEM_NR_DIRECT;
593 	limit -= SHMEM_NR_DIRECT;
594 	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
595 	offset = idx % ENTRIES_PER_PAGE;
596 	idx -= offset;
597 
598 	dir = shmem_dir_map(topdir);
599 	stage = ENTRIES_PER_PAGEPAGE/2;
600 	if (idx < ENTRIES_PER_PAGEPAGE/2) {
601 		middir = topdir;
602 		diroff = idx/ENTRIES_PER_PAGE;
603 	} else {
604 		dir += ENTRIES_PER_PAGE/2;
605 		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
606 		while (stage <= idx)
607 			stage += ENTRIES_PER_PAGEPAGE;
608 		middir = *dir;
609 		if (*dir) {
610 			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
611 				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
612 			if (!diroff && !offset && upper_limit >= stage) {
613 				if (needs_lock) {
614 					spin_lock(needs_lock);
615 					*dir = NULL;
616 					spin_unlock(needs_lock);
617 					needs_lock = NULL;
618 				} else
619 					*dir = NULL;
620 				nr_pages_to_free++;
621 				list_add(&middir->lru, &pages_to_free);
622 			}
623 			shmem_dir_unmap(dir);
624 			dir = shmem_dir_map(middir);
625 		} else {
626 			diroff = 0;
627 			offset = 0;
628 			idx = stage;
629 		}
630 	}
631 
632 	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
633 		if (unlikely(idx == stage)) {
634 			shmem_dir_unmap(dir);
635 			dir = shmem_dir_map(topdir) +
636 			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
637 			while (!*dir) {
638 				dir++;
639 				idx += ENTRIES_PER_PAGEPAGE;
640 				if (idx >= limit)
641 					goto done1;
642 			}
643 			stage = idx + ENTRIES_PER_PAGEPAGE;
644 			middir = *dir;
645 			if (punch_hole)
646 				needs_lock = &info->lock;
647 			if (upper_limit >= stage) {
648 				if (needs_lock) {
649 					spin_lock(needs_lock);
650 					*dir = NULL;
651 					spin_unlock(needs_lock);
652 					needs_lock = NULL;
653 				} else
654 					*dir = NULL;
655 				nr_pages_to_free++;
656 				list_add(&middir->lru, &pages_to_free);
657 			}
658 			shmem_dir_unmap(dir);
659 			cond_resched();
660 			dir = shmem_dir_map(middir);
661 			diroff = 0;
662 		}
663 		punch_lock = needs_lock;
664 		subdir = dir[diroff];
665 		if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
666 			if (needs_lock) {
667 				spin_lock(needs_lock);
668 				dir[diroff] = NULL;
669 				spin_unlock(needs_lock);
670 				punch_lock = NULL;
671 			} else
672 				dir[diroff] = NULL;
673 			nr_pages_to_free++;
674 			list_add(&subdir->lru, &pages_to_free);
675 		}
676 		if (subdir && page_private(subdir) /* has swap entries */) {
677 			size = limit - idx;
678 			if (size > ENTRIES_PER_PAGE)
679 				size = ENTRIES_PER_PAGE;
680 			freed = shmem_map_and_free_swp(subdir,
681 					offset, size, &dir, punch_lock);
682 			if (!dir)
683 				dir = shmem_dir_map(middir);
684 			nr_swaps_freed += freed;
685 			if (offset || punch_lock) {
686 				spin_lock(&info->lock);
687 				set_page_private(subdir,
688 					page_private(subdir) - freed);
689 				spin_unlock(&info->lock);
690 			} else
691 				BUG_ON(page_private(subdir) != freed);
692 		}
693 		offset = 0;
694 	}
695 done1:
696 	shmem_dir_unmap(dir);
697 done2:
698 	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
699 		/*
700 		 * Call truncate_inode_pages again: racing shmem_unuse_inode
701 		 * may have swizzled a page in from swap since vmtruncate or
702 		 * generic_delete_inode did it, before we lowered next_index.
703 		 * Also, though shmem_getpage checks i_size before adding to
704 		 * cache, no recheck after: so fix the narrow window there too.
705 		 *
706 		 * Recalling truncate_inode_pages_range and unmap_mapping_range
707 		 * every time for punch_hole (which never got a chance to clear
708 		 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
709 		 * yet hardly ever necessary: try to optimize them out later.
710 		 */
711 		truncate_inode_pages_range(inode->i_mapping, start, end);
712 		if (punch_hole)
713 			unmap_mapping_range(inode->i_mapping, start,
714 							end - start, 1);
715 	}
716 
717 	spin_lock(&info->lock);
718 	info->flags &= ~SHMEM_TRUNCATE;
719 	info->swapped -= nr_swaps_freed;
720 	if (nr_pages_to_free)
721 		shmem_free_blocks(inode, nr_pages_to_free);
722 	shmem_recalc_inode(inode);
723 	spin_unlock(&info->lock);
724 
725 	/*
726 	 * Empty swap vector directory pages to be freed?
727 	 */
728 	if (!list_empty(&pages_to_free)) {
729 		pages_to_free.prev->next = NULL;
730 		shmem_free_pages(pages_to_free.next);
731 	}
732 }
733 
734 static void shmem_truncate(struct inode *inode)
735 {
736 	shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
737 }
738 
739 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
740 {
741 	struct inode *inode = dentry->d_inode;
742 	struct page *page = NULL;
743 	int error;
744 
745 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
746 		if (attr->ia_size < inode->i_size) {
747 			/*
748 			 * If truncating down to a partial page, then
749 			 * if that page is already allocated, hold it
750 			 * in memory until the truncation is over, so
751  * truncate_partial_page cannot miss it were
752 			 * it assigned to swap.
753 			 */
754 			if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
755 				(void) shmem_getpage(inode,
756 					attr->ia_size>>PAGE_CACHE_SHIFT,
757 						&page, SGP_READ, NULL);
758 				if (page)
759 					unlock_page(page);
760 			}
761 			/*
762 			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
763 			 * detect if any pages might have been added to cache
764 			 * after truncate_inode_pages.  But we needn't bother
765 			 * if it's being fully truncated to zero-length: the
766 			 * nrpages check is efficient enough in that case.
767 			 */
768 			if (attr->ia_size) {
769 				struct shmem_inode_info *info = SHMEM_I(inode);
770 				spin_lock(&info->lock);
771 				info->flags &= ~SHMEM_PAGEIN;
772 				spin_unlock(&info->lock);
773 			}
774 		}
775 	}
776 
777 	error = inode_change_ok(inode, attr);
778 	if (!error)
779 		error = inode_setattr(inode, attr);
780 #ifdef CONFIG_TMPFS_POSIX_ACL
781 	if (!error && (attr->ia_valid & ATTR_MODE))
782 		error = generic_acl_chmod(inode, &shmem_acl_ops);
783 #endif
784 	if (page)
785 		page_cache_release(page);
786 	return error;
787 }
788 
789 static void shmem_delete_inode(struct inode *inode)
790 {
791 	struct shmem_inode_info *info = SHMEM_I(inode);
792 
793 	if (inode->i_op->truncate == shmem_truncate) {
794 		truncate_inode_pages(inode->i_mapping, 0);
795 		shmem_unacct_size(info->flags, inode->i_size);
796 		inode->i_size = 0;
797 		shmem_truncate(inode);
798 		if (!list_empty(&info->swaplist)) {
799 			mutex_lock(&shmem_swaplist_mutex);
800 			list_del_init(&info->swaplist);
801 			mutex_unlock(&shmem_swaplist_mutex);
802 		}
803 	}
804 	BUG_ON(inode->i_blocks);
805 	shmem_free_inode(inode->i_sb);
806 	clear_inode(inode);
807 }
808 
809 static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
810 {
811 	swp_entry_t *ptr;
812 
813 	for (ptr = dir; ptr < edir; ptr++) {
814 		if (ptr->val == entry.val)
815 			return ptr - dir;
816 	}
817 	return -1;
818 }
819 
820 static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
821 {
822 	struct inode *inode;
823 	unsigned long idx;
824 	unsigned long size;
825 	unsigned long limit;
826 	unsigned long stage;
827 	struct page **dir;
828 	struct page *subdir;
829 	swp_entry_t *ptr;
830 	int offset;
831 	int error;
832 
833 	idx = 0;
834 	ptr = info->i_direct;
835 	spin_lock(&info->lock);
836 	if (!info->swapped) {
837 		list_del_init(&info->swaplist);
838 		goto lost2;
839 	}
840 	limit = info->next_index;
841 	size = limit;
842 	if (size > SHMEM_NR_DIRECT)
843 		size = SHMEM_NR_DIRECT;
844 	offset = shmem_find_swp(entry, ptr, ptr+size);
845 	if (offset >= 0)
846 		goto found;
847 	if (!info->i_indirect)
848 		goto lost2;
849 
850 	dir = shmem_dir_map(info->i_indirect);
851 	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
852 
853 	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
854 		if (unlikely(idx == stage)) {
855 			shmem_dir_unmap(dir-1);
856 			if (cond_resched_lock(&info->lock)) {
857 				/* check it has not been truncated */
858 				if (limit > info->next_index) {
859 					limit = info->next_index;
860 					if (idx >= limit)
861 						goto lost2;
862 				}
863 			}
864 			dir = shmem_dir_map(info->i_indirect) +
865 			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
866 			while (!*dir) {
867 				dir++;
868 				idx += ENTRIES_PER_PAGEPAGE;
869 				if (idx >= limit)
870 					goto lost1;
871 			}
872 			stage = idx + ENTRIES_PER_PAGEPAGE;
873 			subdir = *dir;
874 			shmem_dir_unmap(dir);
875 			dir = shmem_dir_map(subdir);
876 		}
877 		subdir = *dir;
878 		if (subdir && page_private(subdir)) {
879 			ptr = shmem_swp_map(subdir);
880 			size = limit - idx;
881 			if (size > ENTRIES_PER_PAGE)
882 				size = ENTRIES_PER_PAGE;
883 			offset = shmem_find_swp(entry, ptr, ptr+size);
884 			shmem_swp_unmap(ptr);
885 			if (offset >= 0) {
886 				shmem_dir_unmap(dir);
887 				goto found;
888 			}
889 		}
890 	}
891 lost1:
892 	shmem_dir_unmap(dir-1);
893 lost2:
894 	spin_unlock(&info->lock);
895 	return 0;
896 found:
897 	idx += offset;
898 	inode = igrab(&info->vfs_inode);
899 	spin_unlock(&info->lock);
900 
901 	/*
902 	 * Move _head_ to start search for next from here.
903 	 * But be careful: shmem_delete_inode checks list_empty without taking
904 	 * mutex, and there's an instant in list_move_tail when info->swaplist
905 	 * would appear empty, if it were the only one on shmem_swaplist.  We
906 	 * could avoid doing it if inode NULL; or use this minor optimization.
907 	 */
908 	if (shmem_swaplist.next != &info->swaplist)
909 		list_move_tail(&shmem_swaplist, &info->swaplist);
910 	mutex_unlock(&shmem_swaplist_mutex);
911 
912 	error = 1;
913 	if (!inode)
914 		goto out;
915 	error = radix_tree_preload(GFP_KERNEL);
916 	if (error)
917 		goto out;
918 	error = 1;
919 
920 	spin_lock(&info->lock);
921 	ptr = shmem_swp_entry(info, idx, NULL);
922 	if (ptr && ptr->val == entry.val)
923 		error = add_to_page_cache(page, inode->i_mapping,
924 						idx, GFP_NOWAIT);
925 	if (error == -EEXIST) {
926 		struct page *filepage = find_get_page(inode->i_mapping, idx);
927 		error = 1;
928 		if (filepage) {
929 			/*
930 			 * There might be a more uptodate page coming down
931 			 * from a stacked writepage: forget our swappage if so.
932 			 */
933 			if (PageUptodate(filepage))
934 				error = 0;
935 			page_cache_release(filepage);
936 		}
937 	}
938 	if (!error) {
939 		delete_from_swap_cache(page);
940 		set_page_dirty(page);
941 		info->flags |= SHMEM_PAGEIN;
942 		shmem_swp_set(info, ptr, 0);
943 		swap_free(entry);
944 		error = 1;	/* not an error, but entry was found */
945 	}
946 	if (ptr)
947 		shmem_swp_unmap(ptr);
948 	spin_unlock(&info->lock);
949 	radix_tree_preload_end();
950 out:
951 	unlock_page(page);
952 	page_cache_release(page);
953 	iput(inode);		/* allows for NULL */
954 	return error;
955 }
956 
957 /*
958  * shmem_unuse() searches for a possibly swapped-out shmem page.
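 * Called from the swapoff path (try_to_unuse), for each swap page that
 * might belong to tmpfs.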
959  */
960 int shmem_unuse(swp_entry_t entry, struct page *page)
961 {
962 	struct list_head *p, *next;
963 	struct shmem_inode_info *info;
964 	int found = 0;
965 
966 	mutex_lock(&shmem_swaplist_mutex);
967 	list_for_each_safe(p, next, &shmem_swaplist) {
968 		info = list_entry(p, struct shmem_inode_info, swaplist);
969 		found = shmem_unuse_inode(info, entry, page);
970 		cond_resched();
971 		if (found)
972 			goto out;
973 	}
974 	mutex_unlock(&shmem_swaplist_mutex);
975 out:	return found;	/* 0 or 1 or -ENOMEM */
976 }
977 
978 /*
979  * Move the page from the page cache to the swap cache.
980  */
981 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
982 {
983 	struct shmem_inode_info *info;
984 	swp_entry_t *entry, swap;
985 	struct address_space *mapping;
986 	unsigned long index;
987 	struct inode *inode;
988 
989 	BUG_ON(!PageLocked(page));
990 	mapping = page->mapping;
991 	index = page->index;
992 	inode = mapping->host;
993 	info = SHMEM_I(inode);
994 	if (info->flags & VM_LOCKED)
995 		goto redirty;
996 	if (!total_swap_pages)
997 		goto redirty;
998 
999 	/*
1000 	 * shmem_backing_dev_info's capabilities prevent regular writeback or
1001 	 * sync from ever calling shmem_writepage; but a stacking filesystem
1002 	 * may use the ->writepage of its underlying filesystem, in which case
1003 	 * tmpfs should write out to swap only in response to memory pressure,
1004 	 * and not for pdflush or sync.  However, in those cases, we do still
1005 	 * want to check if there's a redundant swappage to be discarded.
1006 	 */
1007 	if (wbc->for_reclaim)
1008 		swap = get_swap_page();
1009 	else
1010 		swap.val = 0;
1011 
1012 	spin_lock(&info->lock);
1013 	if (index >= info->next_index) {
1014 		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1015 		goto unlock;
1016 	}
1017 	entry = shmem_swp_entry(info, index, NULL);
1018 	if (entry->val) {
1019 		/*
1020 		 * The more uptodate page coming down from a stacked
1021 		 * writepage should replace our old swappage.
1022 		 */
1023 		free_swap_and_cache(*entry);
1024 		shmem_swp_set(info, entry, 0);
1025 	}
1026 	shmem_recalc_inode(inode);
1027 
1028 	if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1029 		remove_from_page_cache(page);
1030 		shmem_swp_set(info, entry, swap.val);
1031 		shmem_swp_unmap(entry);
1032 		if (list_empty(&info->swaplist))
1033 			inode = igrab(inode);
1034 		else
1035 			inode = NULL;
1036 		spin_unlock(&info->lock);
1037 		swap_duplicate(swap);
1038 		BUG_ON(page_mapped(page));
1039 		page_cache_release(page);	/* pagecache ref */
1040 		set_page_dirty(page);
1041 		unlock_page(page);
1042 		if (inode) {
1043 			mutex_lock(&shmem_swaplist_mutex);
1044 			/* move instead of add in case we're racing */
1045 			list_move_tail(&info->swaplist, &shmem_swaplist);
1046 			mutex_unlock(&shmem_swaplist_mutex);
1047 			iput(inode);
1048 		}
1049 		return 0;
1050 	}
1051 
1052 	shmem_swp_unmap(entry);
1053 unlock:
1054 	spin_unlock(&info->lock);
1055 	swap_free(swap);
1056 redirty:
1057 	set_page_dirty(page);
1058 	if (wbc->for_reclaim)
1059 		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
1060 	unlock_page(page);
1061 	return 0;
1062 }
1063 
1064 #ifdef CONFIG_NUMA
1065 static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
1066 {
1067 	char *nodelist = strchr(value, ':');
1068 	int err = 1;
1069 
1070 	if (nodelist) {
1071 		/* NUL-terminate policy string */
1072 		*nodelist++ = '\0';
1073 		if (nodelist_parse(nodelist, *policy_nodes))
1074 			goto out;
1075 		if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
1076 			goto out;
1077 	}
1078 	if (!strcmp(value, "default")) {
1079 		*policy = MPOL_DEFAULT;
1080 		/* Don't allow a nodelist */
1081 		if (!nodelist)
1082 			err = 0;
1083 	} else if (!strcmp(value, "prefer")) {
1084 		*policy = MPOL_PREFERRED;
1085 		/* Insist on a nodelist of one node only */
1086 		if (nodelist) {
1087 			char *rest = nodelist;
1088 			while (isdigit(*rest))
1089 				rest++;
1090 			if (!*rest)
1091 				err = 0;
1092 		}
1093 	} else if (!strcmp(value, "bind")) {
1094 		*policy = MPOL_BIND;
1095 		/* Insist on a nodelist */
1096 		if (nodelist)
1097 			err = 0;
1098 	} else if (!strcmp(value, "interleave")) {
1099 		*policy = MPOL_INTERLEAVE;
1100 		/*
1101 		 * Default to online nodes with memory if no nodelist
1102 		 */
1103 		if (!nodelist)
1104 			*policy_nodes = node_states[N_HIGH_MEMORY];
1105 		err = 0;
1106 	}
1107 out:
1108 	/* Restore string for error message */
1109 	if (nodelist)
1110 		*--nodelist = ':';
1111 	return err;
1112 }
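/*
 * The value strings parsed above normally come from the tmpfs "mpol="
 * mount option, e.g. "default", "prefer:1", "bind:0-2" or
 * "interleave:0-3", with the nodelist in the syntax accepted by
 * nodelist_parse().
 */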
1113 
1114 static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1115 			struct shmem_inode_info *info, unsigned long idx)
1116 {
1117 	struct vm_area_struct pvma;
1118 	struct page *page;
1119 
1120 	/* Create a pseudo vma that just contains the policy */
1121 	pvma.vm_start = 0;
1122 	pvma.vm_pgoff = idx;
1123 	pvma.vm_ops = NULL;
1124 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1125 	page = swapin_readahead(entry, gfp, &pvma, 0);
1126 	mpol_free(pvma.vm_policy);
1127 	return page;
1128 }
1129 
1130 static struct page *shmem_alloc_page(gfp_t gfp,
1131 			struct shmem_inode_info *info, unsigned long idx)
1132 {
1133 	struct vm_area_struct pvma;
1134 	struct page *page;
1135 
1136 	/* Create a pseudo vma that just contains the policy */
1137 	pvma.vm_start = 0;
1138 	pvma.vm_pgoff = idx;
1139 	pvma.vm_ops = NULL;
1140 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1141 	page = alloc_page_vma(gfp, &pvma, 0);
1142 	mpol_free(pvma.vm_policy);
1143 	return page;
1144 }
1145 #else
1146 static inline int shmem_parse_mpol(char *value, int *policy,
1147 						nodemask_t *policy_nodes)
1148 {
1149 	return 1;
1150 }
1151 
1152 static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1153 			struct shmem_inode_info *info, unsigned long idx)
1154 {
1155 	return swapin_readahead(entry, gfp, NULL, 0);
1156 }
1157 
1158 static inline struct page *shmem_alloc_page(gfp_t gfp,
1159 			struct shmem_inode_info *info, unsigned long idx)
1160 {
1161 	return alloc_page(gfp);
1162 }
1163 #endif
1164 
1165 /*
1166  * shmem_getpage - either get the page from swap or allocate a new one
1167  *
1168  * If we allocate a new one we do not mark it dirty. That's up to the
1169  * vm. If we swap it in we mark it dirty, and also free the swap
1170  * entry, since a page cannot live in both the swap and page cache
1171  */
1172 static int shmem_getpage(struct inode *inode, unsigned long idx,
1173 			struct page **pagep, enum sgp_type sgp, int *type)
1174 {
1175 	struct address_space *mapping = inode->i_mapping;
1176 	struct shmem_inode_info *info = SHMEM_I(inode);
1177 	struct shmem_sb_info *sbinfo;
1178 	struct page *filepage = *pagep;
1179 	struct page *swappage;
1180 	swp_entry_t *entry;
1181 	swp_entry_t swap;
1182 	gfp_t gfp;
1183 	int error;
1184 
1185 	if (idx >= SHMEM_MAX_INDEX)
1186 		return -EFBIG;
1187 
1188 	if (type)
1189 		*type = 0;
1190 
1191 	/*
1192 	 * Normally, filepage is NULL on entry, and either found
1193 	 * uptodate immediately, or allocated and zeroed, or read
1194 	 * in under swappage, which is then assigned to filepage.
1195 	 * But shmem_readpage (required for splice) passes in a locked
1196 	 * filepage, which may be found not uptodate by other callers
1197 	 * too, and may need to be copied from the swappage read in.
1198 	 */
1199 repeat:
1200 	if (!filepage)
1201 		filepage = find_lock_page(mapping, idx);
1202 	if (filepage && PageUptodate(filepage))
1203 		goto done;
1204 	error = 0;
1205 	gfp = mapping_gfp_mask(mapping);
1206 	if (!filepage) {
1207 		/*
1208 		 * Try to preload while we can wait, to not make a habit of
1209 		 * draining atomic reserves; but don't latch on to this cpu.
1210 		 */
1211 		error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
1212 		if (error)
1213 			goto failed;
1214 		radix_tree_preload_end();
1215 	}
1216 
1217 	spin_lock(&info->lock);
1218 	shmem_recalc_inode(inode);
1219 	entry = shmem_swp_alloc(info, idx, sgp);
1220 	if (IS_ERR(entry)) {
1221 		spin_unlock(&info->lock);
1222 		error = PTR_ERR(entry);
1223 		goto failed;
1224 	}
1225 	swap = *entry;
1226 
1227 	if (swap.val) {
1228 		/* Look it up and read it in.. */
1229 		swappage = lookup_swap_cache(swap);
1230 		if (!swappage) {
1231 			shmem_swp_unmap(entry);
1232 			/* here we actually do the io */
1233 			if (type && !(*type & VM_FAULT_MAJOR)) {
1234 				__count_vm_event(PGMAJFAULT);
1235 				*type |= VM_FAULT_MAJOR;
1236 			}
1237 			spin_unlock(&info->lock);
1238 			swappage = shmem_swapin(swap, gfp, info, idx);
1239 			if (!swappage) {
1240 				spin_lock(&info->lock);
1241 				entry = shmem_swp_alloc(info, idx, sgp);
1242 				if (IS_ERR(entry))
1243 					error = PTR_ERR(entry);
1244 				else {
1245 					if (entry->val == swap.val)
1246 						error = -ENOMEM;
1247 					shmem_swp_unmap(entry);
1248 				}
1249 				spin_unlock(&info->lock);
1250 				if (error)
1251 					goto failed;
1252 				goto repeat;
1253 			}
1254 			wait_on_page_locked(swappage);
1255 			page_cache_release(swappage);
1256 			goto repeat;
1257 		}
1258 
1259 		/* We have to do this with page locked to prevent races */
1260 		if (TestSetPageLocked(swappage)) {
1261 			shmem_swp_unmap(entry);
1262 			spin_unlock(&info->lock);
1263 			wait_on_page_locked(swappage);
1264 			page_cache_release(swappage);
1265 			goto repeat;
1266 		}
1267 		if (PageWriteback(swappage)) {
1268 			shmem_swp_unmap(entry);
1269 			spin_unlock(&info->lock);
1270 			wait_on_page_writeback(swappage);
1271 			unlock_page(swappage);
1272 			page_cache_release(swappage);
1273 			goto repeat;
1274 		}
1275 		if (!PageUptodate(swappage)) {
1276 			shmem_swp_unmap(entry);
1277 			spin_unlock(&info->lock);
1278 			unlock_page(swappage);
1279 			page_cache_release(swappage);
1280 			error = -EIO;
1281 			goto failed;
1282 		}
1283 
1284 		if (filepage) {
1285 			shmem_swp_set(info, entry, 0);
1286 			shmem_swp_unmap(entry);
1287 			delete_from_swap_cache(swappage);
1288 			spin_unlock(&info->lock);
1289 			copy_highpage(filepage, swappage);
1290 			unlock_page(swappage);
1291 			page_cache_release(swappage);
1292 			flush_dcache_page(filepage);
1293 			SetPageUptodate(filepage);
1294 			set_page_dirty(filepage);
1295 			swap_free(swap);
1296 		} else if (!(error = add_to_page_cache(
1297 				swappage, mapping, idx, GFP_NOWAIT))) {
1298 			info->flags |= SHMEM_PAGEIN;
1299 			shmem_swp_set(info, entry, 0);
1300 			shmem_swp_unmap(entry);
1301 			delete_from_swap_cache(swappage);
1302 			spin_unlock(&info->lock);
1303 			filepage = swappage;
1304 			set_page_dirty(filepage);
1305 			swap_free(swap);
1306 		} else {
1307 			shmem_swp_unmap(entry);
1308 			spin_unlock(&info->lock);
1309 			unlock_page(swappage);
1310 			page_cache_release(swappage);
1311 			goto repeat;
1312 		}
1313 	} else if (sgp == SGP_READ && !filepage) {
1314 		shmem_swp_unmap(entry);
1315 		filepage = find_get_page(mapping, idx);
1316 		if (filepage &&
1317 		    (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
1318 			spin_unlock(&info->lock);
1319 			wait_on_page_locked(filepage);
1320 			page_cache_release(filepage);
1321 			filepage = NULL;
1322 			goto repeat;
1323 		}
1324 		spin_unlock(&info->lock);
1325 	} else {
1326 		shmem_swp_unmap(entry);
1327 		sbinfo = SHMEM_SB(inode->i_sb);
1328 		if (sbinfo->max_blocks) {
1329 			spin_lock(&sbinfo->stat_lock);
1330 			if (sbinfo->free_blocks == 0 ||
1331 			    shmem_acct_block(info->flags)) {
1332 				spin_unlock(&sbinfo->stat_lock);
1333 				spin_unlock(&info->lock);
1334 				error = -ENOSPC;
1335 				goto failed;
1336 			}
1337 			sbinfo->free_blocks--;
1338 			inode->i_blocks += BLOCKS_PER_PAGE;
1339 			spin_unlock(&sbinfo->stat_lock);
1340 		} else if (shmem_acct_block(info->flags)) {
1341 			spin_unlock(&info->lock);
1342 			error = -ENOSPC;
1343 			goto failed;
1344 		}
1345 
1346 		if (!filepage) {
1347 			spin_unlock(&info->lock);
1348 			filepage = shmem_alloc_page(gfp, info, idx);
1349 			if (!filepage) {
1350 				shmem_unacct_blocks(info->flags, 1);
1351 				shmem_free_blocks(inode, 1);
1352 				error = -ENOMEM;
1353 				goto failed;
1354 			}
1355 
1356 			spin_lock(&info->lock);
1357 			entry = shmem_swp_alloc(info, idx, sgp);
1358 			if (IS_ERR(entry))
1359 				error = PTR_ERR(entry);
1360 			else {
1361 				swap = *entry;
1362 				shmem_swp_unmap(entry);
1363 			}
1364 			if (error || swap.val || 0 != add_to_page_cache_lru(
1365 					filepage, mapping, idx, GFP_NOWAIT)) {
1366 				spin_unlock(&info->lock);
1367 				page_cache_release(filepage);
1368 				shmem_unacct_blocks(info->flags, 1);
1369 				shmem_free_blocks(inode, 1);
1370 				filepage = NULL;
1371 				if (error)
1372 					goto failed;
1373 				goto repeat;
1374 			}
1375 			info->flags |= SHMEM_PAGEIN;
1376 		}
1377 
1378 		info->alloced++;
1379 		spin_unlock(&info->lock);
1380 		clear_highpage(filepage);
1381 		flush_dcache_page(filepage);
1382 		SetPageUptodate(filepage);
1383 		if (sgp == SGP_DIRTY)
1384 			set_page_dirty(filepage);
1385 	}
1386 done:
1387 	*pagep = filepage;
1388 	return 0;
1389 
1390 failed:
1391 	if (*pagep != filepage) {
1392 		unlock_page(filepage);
1393 		page_cache_release(filepage);
1394 	}
1395 	return error;
1396 }
1397 
1398 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1399 {
1400 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1401 	int error;
1402 	int ret;
1403 
1404 	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1405 		return VM_FAULT_SIGBUS;
1406 
1407 	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1408 	if (error)
1409 		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1410 
1411 	mark_page_accessed(vmf->page);
1412 	return ret | VM_FAULT_LOCKED;
1413 }
1414 
1415 #ifdef CONFIG_NUMA
1416 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1417 {
1418 	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1419 	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1420 }
1421 
1422 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1423 					  unsigned long addr)
1424 {
1425 	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1426 	unsigned long idx;
1427 
1428 	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1429 	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
1430 }
1431 #endif
1432 
1433 int shmem_lock(struct file *file, int lock, struct user_struct *user)
1434 {
1435 	struct inode *inode = file->f_path.dentry->d_inode;
1436 	struct shmem_inode_info *info = SHMEM_I(inode);
1437 	int retval = -ENOMEM;
1438 
1439 	spin_lock(&info->lock);
1440 	if (lock && !(info->flags & VM_LOCKED)) {
1441 		if (!user_shm_lock(inode->i_size, user))
1442 			goto out_nomem;
1443 		info->flags |= VM_LOCKED;
1444 	}
1445 	if (!lock && (info->flags & VM_LOCKED) && user) {
1446 		user_shm_unlock(inode->i_size, user);
1447 		info->flags &= ~VM_LOCKED;
1448 	}
1449 	retval = 0;
1450 out_nomem:
1451 	spin_unlock(&info->lock);
1452 	return retval;
1453 }
1454 
1455 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1456 {
1457 	file_accessed(file);
1458 	vma->vm_ops = &shmem_vm_ops;
1459 	vma->vm_flags |= VM_CAN_NONLINEAR;
1460 	return 0;
1461 }
1462 
1463 static struct inode *
1464 shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1465 {
1466 	struct inode *inode;
1467 	struct shmem_inode_info *info;
1468 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1469 
1470 	if (shmem_reserve_inode(sb))
1471 		return NULL;
1472 
1473 	inode = new_inode(sb);
1474 	if (inode) {
1475 		inode->i_mode = mode;
1476 		inode->i_uid = current->fsuid;
1477 		inode->i_gid = current->fsgid;
1478 		inode->i_blocks = 0;
1479 		inode->i_mapping->a_ops = &shmem_aops;
1480 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1481 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1482 		inode->i_generation = get_seconds();
1483 		info = SHMEM_I(inode);
1484 		memset(info, 0, (char *)inode - (char *)info);
1485 		spin_lock_init(&info->lock);
1486 		INIT_LIST_HEAD(&info->swaplist);
1487 
1488 		switch (mode & S_IFMT) {
1489 		default:
1490 			inode->i_op = &shmem_special_inode_operations;
1491 			init_special_inode(inode, mode, dev);
1492 			break;
1493 		case S_IFREG:
1494 			inode->i_op = &shmem_inode_operations;
1495 			inode->i_fop = &shmem_file_operations;
1496 			mpol_shared_policy_init(&info->policy, sbinfo->policy,
1497 							&sbinfo->policy_nodes);
1498 			break;
1499 		case S_IFDIR:
1500 			inc_nlink(inode);
1501 			/* Some things misbehave if size == 0 on a directory */
1502 			inode->i_size = 2 * BOGO_DIRENT_SIZE;
1503 			inode->i_op = &shmem_dir_inode_operations;
1504 			inode->i_fop = &simple_dir_operations;
1505 			break;
1506 		case S_IFLNK:
1507 			/*
1508 			 * Must not load anything in the rbtree,
1509 			 * mpol_free_shared_policy will not be called.
1510 			 */
1511 			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
1512 						NULL);
1513 			break;
1514 		}
1515 	} else
1516 		shmem_free_inode(sb);
1517 	return inode;
1518 }
1519 
1520 #ifdef CONFIG_TMPFS
1521 static const struct inode_operations shmem_symlink_inode_operations;
1522 static const struct inode_operations shmem_symlink_inline_operations;
1523 
1524 /*
1525  * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1526  * but providing them allows a tmpfs file to be used for splice, sendfile, and
1527  * below the loop driver, in the generic fashion that many filesystems support.
1528  */
1529 static int shmem_readpage(struct file *file, struct page *page)
1530 {
1531 	struct inode *inode = page->mapping->host;
1532 	int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1533 	unlock_page(page);
1534 	return error;
1535 }
1536 
1537 static int
1538 shmem_write_begin(struct file *file, struct address_space *mapping,
1539 			loff_t pos, unsigned len, unsigned flags,
1540 			struct page **pagep, void **fsdata)
1541 {
1542 	struct inode *inode = mapping->host;
1543 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1544 	*pagep = NULL;
1545 	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1546 }
1547 
1548 static int
1549 shmem_write_end(struct file *file, struct address_space *mapping,
1550 			loff_t pos, unsigned len, unsigned copied,
1551 			struct page *page, void *fsdata)
1552 {
1553 	struct inode *inode = mapping->host;
1554 
1555 	if (pos + copied > inode->i_size)
1556 		i_size_write(inode, pos + copied);
1557 
1558 	unlock_page(page);
1559 	set_page_dirty(page);
1560 	page_cache_release(page);
1561 
1562 	return copied;
1563 }
1564 
1565 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1566 {
1567 	struct inode *inode = filp->f_path.dentry->d_inode;
1568 	struct address_space *mapping = inode->i_mapping;
1569 	unsigned long index, offset;
1570 	enum sgp_type sgp = SGP_READ;
1571 
1572 	/*
1573 	 * Might this read be for a stacking filesystem?  Then when reading
1574 	 * holes of a sparse file, we actually need to allocate those pages,
1575 	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
1576 	 */
1577 	if (segment_eq(get_fs(), KERNEL_DS))
1578 		sgp = SGP_DIRTY;
1579 
1580 	index = *ppos >> PAGE_CACHE_SHIFT;
1581 	offset = *ppos & ~PAGE_CACHE_MASK;
1582 
1583 	for (;;) {
1584 		struct page *page = NULL;
1585 		unsigned long end_index, nr, ret;
1586 		loff_t i_size = i_size_read(inode);
1587 
1588 		end_index = i_size >> PAGE_CACHE_SHIFT;
1589 		if (index > end_index)
1590 			break;
1591 		if (index == end_index) {
1592 			nr = i_size & ~PAGE_CACHE_MASK;
1593 			if (nr <= offset)
1594 				break;
1595 		}
1596 
1597 		desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
1598 		if (desc->error) {
1599 			if (desc->error == -EINVAL)
1600 				desc->error = 0;
1601 			break;
1602 		}
1603 		if (page)
1604 			unlock_page(page);
1605 
1606 		/*
1607 		 * We must evaluate after, since reads (unlike writes)
1608 		 * are called without i_mutex protection against truncate
1609 		 */
1610 		nr = PAGE_CACHE_SIZE;
1611 		i_size = i_size_read(inode);
1612 		end_index = i_size >> PAGE_CACHE_SHIFT;
1613 		if (index == end_index) {
1614 			nr = i_size & ~PAGE_CACHE_MASK;
1615 			if (nr <= offset) {
1616 				if (page)
1617 					page_cache_release(page);
1618 				break;
1619 			}
1620 		}
1621 		nr -= offset;
1622 
1623 		if (page) {
1624 			/*
1625 			 * If users can be writing to this page using arbitrary
1626 			 * virtual addresses, take care about potential aliasing
1627 			 * before reading the page on the kernel side.
1628 			 */
1629 			if (mapping_writably_mapped(mapping))
1630 				flush_dcache_page(page);
1631 			/*
1632 			 * Mark the page accessed if we read the beginning.
1633 			 */
1634 			if (!offset)
1635 				mark_page_accessed(page);
1636 		} else {
1637 			page = ZERO_PAGE(0);
1638 			page_cache_get(page);
1639 		}
1640 
1641 		/*
1642 		 * Ok, we have the page, and it's up-to-date, so
1643 		 * now we can copy it to user space...
1644 		 *
1645  * The actor routine returns how many bytes were actually used.
1646 		 * NOTE! This may not be the same as how much of a user buffer
1647 		 * we filled up (we may be padding etc), so we can only update
1648 		 * "pos" here (the actor routine has to update the user buffer
1649 		 * pointers and the remaining count).
1650 		 */
1651 		ret = actor(desc, page, offset, nr);
1652 		offset += ret;
1653 		index += offset >> PAGE_CACHE_SHIFT;
1654 		offset &= ~PAGE_CACHE_MASK;
1655 
1656 		page_cache_release(page);
1657 		if (ret != nr || !desc->count)
1658 			break;
1659 
1660 		cond_resched();
1661 	}
1662 
1663 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1664 	file_accessed(filp);
1665 }
1666 
1667 static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1668 {
1669 	read_descriptor_t desc;
1670 
1671 	if ((ssize_t) count < 0)
1672 		return -EINVAL;
1673 	if (!access_ok(VERIFY_WRITE, buf, count))
1674 		return -EFAULT;
1675 	if (!count)
1676 		return 0;
1677 
1678 	desc.written = 0;
1679 	desc.count = count;
1680 	desc.arg.buf = buf;
1681 	desc.error = 0;
1682 
1683 	do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1684 	if (desc.written)
1685 		return desc.written;
1686 	return desc.error;
1687 }
1688 
1689 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1690 {
1691 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1692 
1693 	buf->f_type = TMPFS_MAGIC;
1694 	buf->f_bsize = PAGE_CACHE_SIZE;
1695 	buf->f_namelen = NAME_MAX;
1696 	spin_lock(&sbinfo->stat_lock);
1697 	if (sbinfo->max_blocks) {
1698 		buf->f_blocks = sbinfo->max_blocks;
1699 		buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1700 	}
1701 	if (sbinfo->max_inodes) {
1702 		buf->f_files = sbinfo->max_inodes;
1703 		buf->f_ffree = sbinfo->free_inodes;
1704 	}
1705 	/* else leave those fields 0 like simple_statfs */
1706 	spin_unlock(&sbinfo->stat_lock);
1707 	return 0;
1708 }
1709 
1710 /*
1711  * File creation. Allocate an inode, and we're done..
1712  */
1713 static int
1714 shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1715 {
1716 	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1717 	int error = -ENOSPC;
1718 
1719 	if (inode) {
1720 		error = security_inode_init_security(inode, dir, NULL, NULL,
1721 						     NULL);
1722 		if (error) {
1723 			if (error != -EOPNOTSUPP) {
1724 				iput(inode);
1725 				return error;
1726 			}
1727 		}
1728 		error = shmem_acl_init(inode, dir);
1729 		if (error) {
1730 			iput(inode);
1731 			return error;
1732 		}
1733 		if (dir->i_mode & S_ISGID) {
1734 			inode->i_gid = dir->i_gid;
1735 			if (S_ISDIR(mode))
1736 				inode->i_mode |= S_ISGID;
1737 		}
1738 		dir->i_size += BOGO_DIRENT_SIZE;
1739 		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1740 		d_instantiate(dentry, inode);
1741 		dget(dentry); /* Extra count - pin the dentry in core */
1742 	}
1743 	return error;
1744 }
1745 
1746 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1747 {
1748 	int error;
1749 
1750 	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1751 		return error;
1752 	inc_nlink(dir);
1753 	return 0;
1754 }
1755 
1756 static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1757 		struct nameidata *nd)
1758 {
1759 	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1760 }
1761 
1762 /*
1763  * Link a file..
1764  */
1765 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1766 {
1767 	struct inode *inode = old_dentry->d_inode;
1768 	int ret;
1769 
1770 	/*
1771 	 * No ordinary (disk based) filesystem counts links as inodes;
1772 	 * but each new link needs a new dentry, pinning lowmem, and
1773 	 * tmpfs dentries cannot be pruned until they are unlinked.
1774 	 */
1775 	ret = shmem_reserve_inode(inode->i_sb);
1776 	if (ret)
1777 		goto out;
1778 
1779 	dir->i_size += BOGO_DIRENT_SIZE;
1780 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1781 	inc_nlink(inode);
1782 	atomic_inc(&inode->i_count);	/* New dentry reference */
1783 	dget(dentry);		/* Extra pinning count for the created dentry */
1784 	d_instantiate(dentry, inode);
1785 out:
1786 	return ret;
1787 }
1788 
1789 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1790 {
1791 	struct inode *inode = dentry->d_inode;
1792 
1793 	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
1794 		shmem_free_inode(inode->i_sb);
1795 
1796 	dir->i_size -= BOGO_DIRENT_SIZE;
1797 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1798 	drop_nlink(inode);
1799 	dput(dentry);	/* Undo the count from "create" - this does all the work */
1800 	return 0;
1801 }
1802 
1803 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1804 {
1805 	if (!simple_empty(dentry))
1806 		return -ENOTEMPTY;
1807 
1808 	drop_nlink(dentry->d_inode);
1809 	drop_nlink(dir);
1810 	return shmem_unlink(dir, dentry);
1811 }
1812 
1813 /*
1814  * The VFS layer already does all the dentry stuff for rename,
1815  * we just have to decrement the usage count for the target if
1816  * it exists so that the VFS layer correctly frees it when it
1817  * gets overwritten.
1818  */
1819 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1820 {
1821 	struct inode *inode = old_dentry->d_inode;
1822 	int they_are_dirs = S_ISDIR(inode->i_mode);
1823 
1824 	if (!simple_empty(new_dentry))
1825 		return -ENOTEMPTY;
1826 
1827 	if (new_dentry->d_inode) {
1828 		(void) shmem_unlink(new_dir, new_dentry);
1829 		if (they_are_dirs)
1830 			drop_nlink(old_dir);
1831 	} else if (they_are_dirs) {
1832 		drop_nlink(old_dir);
1833 		inc_nlink(new_dir);
1834 	}
1835 
1836 	old_dir->i_size -= BOGO_DIRENT_SIZE;
1837 	new_dir->i_size += BOGO_DIRENT_SIZE;
1838 	old_dir->i_ctime = old_dir->i_mtime =
1839 	new_dir->i_ctime = new_dir->i_mtime =
1840 	inode->i_ctime = CURRENT_TIME;
1841 	return 0;
1842 }
1843 
1844 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1845 {
1846 	int error;
1847 	int len;
1848 	struct inode *inode;
1849 	struct page *page = NULL;
1850 	char *kaddr;
1851 	struct shmem_inode_info *info;
1852 
1853 	len = strlen(symname) + 1;
1854 	if (len > PAGE_CACHE_SIZE)
1855 		return -ENAMETOOLONG;
1856 
1857 	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1858 	if (!inode)
1859 		return -ENOSPC;
1860 
1861 	error = security_inode_init_security(inode, dir, NULL, NULL,
1862 					     NULL);
1863 	if (error) {
1864 		if (error != -EOPNOTSUPP) {
1865 			iput(inode);
1866 			return error;
1867 		}
1868 		error = 0;
1869 	}
1870 
1871 	info = SHMEM_I(inode);
1872 	inode->i_size = len-1;
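	/*
	 * (char *)inode - (char *)info is the space available in the
	 * shmem_inode_info before its embedded vfs_inode: short symlink
	 * targets are stored there inline rather than in a page.
	 */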
1873 	if (len <= (char *)inode - (char *)info) {
1874 		/* do it inline */
1875 		memcpy(info, symname, len);
1876 		inode->i_op = &shmem_symlink_inline_operations;
1877 	} else {
1878 		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
1879 		if (error) {
1880 			iput(inode);
1881 			return error;
1882 		}
1883 		unlock_page(page);
1884 		inode->i_op = &shmem_symlink_inode_operations;
1885 		kaddr = kmap_atomic(page, KM_USER0);
1886 		memcpy(kaddr, symname, len);
1887 		kunmap_atomic(kaddr, KM_USER0);
1888 		set_page_dirty(page);
1889 		page_cache_release(page);
1890 	}
1891 	if (dir->i_mode & S_ISGID)
1892 		inode->i_gid = dir->i_gid;
1893 	dir->i_size += BOGO_DIRENT_SIZE;
1894 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1895 	d_instantiate(dentry, inode);
1896 	dget(dentry);
1897 	return 0;
1898 }
1899 
1900 static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1901 {
1902 	nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1903 	return NULL;
1904 }
1905 
1906 static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1907 {
1908 	struct page *page = NULL;
1909 	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1910 	nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1911 	if (page)
1912 		unlock_page(page);
1913 	return page;
1914 }
1915 
1916 static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
1917 {
1918 	if (!IS_ERR(nd_get_link(nd))) {
1919 		struct page *page = cookie;
1920 		kunmap(page);
1921 		mark_page_accessed(page);
1922 		page_cache_release(page);
1923 	}
1924 }
1925 
1926 static const struct inode_operations shmem_symlink_inline_operations = {
1927 	.readlink	= generic_readlink,
1928 	.follow_link	= shmem_follow_link_inline,
1929 };
1930 
1931 static const struct inode_operations shmem_symlink_inode_operations = {
1932 	.truncate	= shmem_truncate,
1933 	.readlink	= generic_readlink,
1934 	.follow_link	= shmem_follow_link,
1935 	.put_link	= shmem_put_link,
1936 };
1937 
1938 #ifdef CONFIG_TMPFS_POSIX_ACL
1939 /*
1940  * Superblocks without xattr inode operations will get security.* xattr
1941  * support from the VFS "for free". As soon as we have any other xattrs
1942  * like ACLs, we also need to implement the security.* handlers at
1943  * filesystem level, though.
1944  */
1945 
1946 static size_t shmem_xattr_security_list(struct inode *inode, char *list,
1947 					size_t list_len, const char *name,
1948 					size_t name_len)
1949 {
1950 	return security_inode_listsecurity(inode, list, list_len);
1951 }
1952 
1953 static int shmem_xattr_security_get(struct inode *inode, const char *name,
1954 				    void *buffer, size_t size)
1955 {
1956 	if (strcmp(name, "") == 0)
1957 		return -EINVAL;
1958 	return security_inode_getsecurity(inode, name, buffer, size,
1959 					  -EOPNOTSUPP);
1960 }
1961 
1962 static int shmem_xattr_security_set(struct inode *inode, const char *name,
1963 				    const void *value, size_t size, int flags)
1964 {
1965 	if (strcmp(name, "") == 0)
1966 		return -EINVAL;
1967 	return security_inode_setsecurity(inode, name, value, size, flags);
1968 }
1969 
1970 static struct xattr_handler shmem_xattr_security_handler = {
1971 	.prefix = XATTR_SECURITY_PREFIX,
1972 	.list   = shmem_xattr_security_list,
1973 	.get    = shmem_xattr_security_get,
1974 	.set    = shmem_xattr_security_set,
1975 };
1976 
1977 static struct xattr_handler *shmem_xattr_handlers[] = {
1978 	&shmem_xattr_acl_access_handler,
1979 	&shmem_xattr_acl_default_handler,
1980 	&shmem_xattr_security_handler,
1981 	NULL
1982 };
1983 #endif
1984 
1985 static struct dentry *shmem_get_parent(struct dentry *child)
1986 {
1987 	return ERR_PTR(-ESTALE);
1988 }
1989 
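/*
 * tmpfs file handles, as packed by shmem_encode_fh() below, are three
 * 32-bit words: fh[0] = i_generation, fh[1] = low 32 bits of i_ino,
 * fh[2] = high 32 bits of i_ino.
 */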
1990 static int shmem_match(struct inode *ino, void *vfh)
1991 {
1992 	__u32 *fh = vfh;
1993 	__u64 inum = fh[2];
1994 	inum = (inum << 32) | fh[1];
1995 	return ino->i_ino == inum && fh[0] == ino->i_generation;
1996 }
1997 
1998 static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
1999 		struct fid *fid, int fh_len, int fh_type)
2000 {
2001 	struct inode *inode;
2002 	struct dentry *dentry = NULL;
2003 	u64 inum = fid->raw[2];
2004 	inum = (inum << 32) | fid->raw[1];
2005 
2006 	if (fh_len < 3)
2007 		return NULL;
2008 
2009 	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2010 			shmem_match, fid->raw);
2011 	if (inode) {
2012 		dentry = d_find_alias(inode);
2013 		iput(inode);
2014 	}
2015 
2016 	return dentry;
2017 }
2018 
2019 static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2020 				int connectable)
2021 {
2022 	struct inode *inode = dentry->d_inode;
2023 
2024 	if (*len < 3)
2025 		return 255;
2026 
2027 	if (hlist_unhashed(&inode->i_hash)) {
2028 		/* Unfortunately insert_inode_hash is not idempotent,
2029 		 * so as we hash inodes here rather than at creation
2030 		 * time, we need a lock to ensure we only try
2031 		 * to do it once.
2032 		 */
2033 		static DEFINE_SPINLOCK(lock);
2034 		spin_lock(&lock);
2035 		if (hlist_unhashed(&inode->i_hash))
2036 			__insert_inode_hash(inode,
2037 					    inode->i_ino + inode->i_generation);
2038 		spin_unlock(&lock);
2039 	}
2040 
2041 	fh[0] = inode->i_generation;
2042 	fh[1] = inode->i_ino;
2043 	fh[2] = ((__u64)inode->i_ino) >> 32;
2044 
2045 	*len = 3;
2046 	return 1;
2047 }
2048 
2049 static const struct export_operations shmem_export_ops = {
2050 	.get_parent     = shmem_get_parent,
2051 	.encode_fh      = shmem_encode_fh,
2052 	.fh_to_dentry	= shmem_fh_to_dentry,
2053 };
2054 
2055 static int shmem_parse_options(char *options, int *mode, uid_t *uid,
2056 	gid_t *gid, unsigned long *blocks, unsigned long *inodes,
2057 	int *policy, nodemask_t *policy_nodes)
2058 {
2059 	char *this_char, *value, *rest;
2060 
2061 	while (options != NULL) {
2062 		this_char = options;
2063 		for (;;) {
2064 			/*
2065 			 * NUL-terminate this option: unfortunately,
2066 			 * mount options form a comma-separated list,
2067 			 * but mpol's nodelist may also contain commas.
2068 			 */
2069 			options = strchr(options, ',');
2070 			if (options == NULL)
2071 				break;
2072 			options++;
2073 			if (!isdigit(*options)) {
2074 				options[-1] = '\0';
2075 				break;
2076 			}
2077 		}
2078 		if (!*this_char)
2079 			continue;
2080 		if ((value = strchr(this_char,'=')) != NULL) {
2081 			*value++ = 0;
2082 		} else {
2083 			printk(KERN_ERR
2084 			    "tmpfs: No value for mount option '%s'\n",
2085 			    this_char);
2086 			return 1;
2087 		}
2088 
2089 		if (!strcmp(this_char,"size")) {
2090 			unsigned long long size;
2091 			size = memparse(value,&rest);
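			/* A trailing '%' means a percentage of total RAM */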
2092 			if (*rest == '%') {
2093 				size <<= PAGE_SHIFT;
2094 				size *= totalram_pages;
2095 				do_div(size, 100);
2096 				rest++;
2097 			}
2098 			if (*rest)
2099 				goto bad_val;
2100 			*blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
2101 		} else if (!strcmp(this_char,"nr_blocks")) {
2102 			*blocks = memparse(value,&rest);
2103 			if (*rest)
2104 				goto bad_val;
2105 		} else if (!strcmp(this_char,"nr_inodes")) {
2106 			*inodes = memparse(value,&rest);
2107 			if (*rest)
2108 				goto bad_val;
2109 		} else if (!strcmp(this_char,"mode")) {
2110 			if (!mode)
2111 				continue;
2112 			*mode = simple_strtoul(value,&rest,8);
2113 			if (*rest)
2114 				goto bad_val;
2115 		} else if (!strcmp(this_char,"uid")) {
2116 			if (!uid)
2117 				continue;
2118 			*uid = simple_strtoul(value,&rest,0);
2119 			if (*rest)
2120 				goto bad_val;
2121 		} else if (!strcmp(this_char,"gid")) {
2122 			if (!gid)
2123 				continue;
2124 			*gid = simple_strtoul(value,&rest,0);
2125 			if (*rest)
2126 				goto bad_val;
2127 		} else if (!strcmp(this_char,"mpol")) {
2128 			if (shmem_parse_mpol(value,policy,policy_nodes))
2129 				goto bad_val;
2130 		} else {
2131 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2132 			       this_char);
2133 			return 1;
2134 		}
2135 	}
2136 	return 0;
2137 
2138 bad_val:
2139 	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2140 	       value, this_char);
2141 	return 1;
2142 
2143 }
2144 
2145 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2146 {
2147 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2148 	unsigned long max_blocks = sbinfo->max_blocks;
2149 	unsigned long max_inodes = sbinfo->max_inodes;
2150 	int policy = sbinfo->policy;
2151 	nodemask_t policy_nodes = sbinfo->policy_nodes;
2152 	unsigned long blocks;
2153 	unsigned long inodes;
2154 	int error = -EINVAL;
2155 
2156 	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
2157 				&max_inodes, &policy, &policy_nodes))
2158 		return error;
2159 
2160 	spin_lock(&sbinfo->stat_lock);
2161 	blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2162 	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2163 	if (max_blocks < blocks)
2164 		goto out;
2165 	if (max_inodes < inodes)
2166 		goto out;
2167 	/*
2168 	 * Those tests also disallow limited->unlimited while any are in
2169 	 * use, so i_blocks will always be zero when max_blocks is zero;
2170 	 * but we must separately disallow unlimited->limited, because
2171 	 * in that case we have no record of how much is already in use.
2172 	 */
2173 	if (max_blocks && !sbinfo->max_blocks)
2174 		goto out;
2175 	if (max_inodes && !sbinfo->max_inodes)
2176 		goto out;
2177 
2178 	error = 0;
2179 	sbinfo->max_blocks  = max_blocks;
2180 	sbinfo->free_blocks = max_blocks - blocks;
2181 	sbinfo->max_inodes  = max_inodes;
2182 	sbinfo->free_inodes = max_inodes - inodes;
2183 	sbinfo->policy = policy;
2184 	sbinfo->policy_nodes = policy_nodes;
2185 out:
2186 	spin_unlock(&sbinfo->stat_lock);
2187 	return error;
2188 }
2189 #endif
2190 
2191 static void shmem_put_super(struct super_block *sb)
2192 {
2193 	kfree(sb->s_fs_info);
2194 	sb->s_fs_info = NULL;
2195 }
2196 
2197 static int shmem_fill_super(struct super_block *sb,
2198 			    void *data, int silent)
2199 {
2200 	struct inode *inode;
2201 	struct dentry *root;
2202 	int mode   = S_IRWXUGO | S_ISVTX;
2203 	uid_t uid = current->fsuid;
2204 	gid_t gid = current->fsgid;
2205 	int err = -ENOMEM;
2206 	struct shmem_sb_info *sbinfo;
2207 	unsigned long blocks = 0;
2208 	unsigned long inodes = 0;
2209 	int policy = MPOL_DEFAULT;
2210 	nodemask_t policy_nodes = node_states[N_HIGH_MEMORY];
2211 
2212 #ifdef CONFIG_TMPFS
2213 	/*
2214 	 * By default we allow only half of the physical RAM per
2215 	 * tmpfs instance, limiting inodes to one per page of lowmem;
2216 	 * but the internal instance is left unlimited.
2217 	 */
2218 	if (!(sb->s_flags & MS_NOUSER)) {
2219 		blocks = totalram_pages / 2;
2220 		inodes = totalram_pages - totalhigh_pages;
2221 		if (inodes > blocks)
2222 			inodes = blocks;
2223 		if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
2224 					&inodes, &policy, &policy_nodes))
2225 			return -EINVAL;
2226 	}
2227 	sb->s_export_op = &shmem_export_ops;
2228 #else
2229 	sb->s_flags |= MS_NOUSER;
2230 #endif
2231 
2232 	/* Round up to L1_CACHE_BYTES to resist false sharing */
2233 	sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
2234 				L1_CACHE_BYTES), GFP_KERNEL);
2235 	if (!sbinfo)
2236 		return -ENOMEM;
2237 
2238 	spin_lock_init(&sbinfo->stat_lock);
2239 	sbinfo->max_blocks = blocks;
2240 	sbinfo->free_blocks = blocks;
2241 	sbinfo->max_inodes = inodes;
2242 	sbinfo->free_inodes = inodes;
2243 	sbinfo->policy = policy;
2244 	sbinfo->policy_nodes = policy_nodes;
2245 
2246 	sb->s_fs_info = sbinfo;
2247 	sb->s_maxbytes = SHMEM_MAX_BYTES;
2248 	sb->s_blocksize = PAGE_CACHE_SIZE;
2249 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2250 	sb->s_magic = TMPFS_MAGIC;
2251 	sb->s_op = &shmem_ops;
2252 	sb->s_time_gran = 1;
2253 #ifdef CONFIG_TMPFS_POSIX_ACL
2254 	sb->s_xattr = shmem_xattr_handlers;
2255 	sb->s_flags |= MS_POSIXACL;
2256 #endif
2257 
2258 	inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2259 	if (!inode)
2260 		goto failed;
2261 	inode->i_uid = uid;
2262 	inode->i_gid = gid;
2263 	root = d_alloc_root(inode);
2264 	if (!root)
2265 		goto failed_iput;
2266 	sb->s_root = root;
2267 	return 0;
2268 
2269 failed_iput:
2270 	iput(inode);
2271 failed:
2272 	shmem_put_super(sb);
2273 	return err;
2274 }
2275 
2276 static struct kmem_cache *shmem_inode_cachep;
2277 
2278 static struct inode *shmem_alloc_inode(struct super_block *sb)
2279 {
2280 	struct shmem_inode_info *p;
2281 	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2282 	if (!p)
2283 		return NULL;
2284 	return &p->vfs_inode;
2285 }
2286 
2287 static void shmem_destroy_inode(struct inode *inode)
2288 {
2289 	if ((inode->i_mode & S_IFMT) == S_IFREG) {
2290 		/* only struct inode is valid if it's an inline symlink */
2291 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2292 	}
2293 	shmem_acl_destroy_inode(inode);
2294 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2295 }
2296 
2297 static void init_once(struct kmem_cache *cachep, void *foo)
2298 {
2299 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2300 
2301 	inode_init_once(&p->vfs_inode);
2302 #ifdef CONFIG_TMPFS_POSIX_ACL
2303 	p->i_acl = NULL;
2304 	p->i_default_acl = NULL;
2305 #endif
2306 }
2307 
2308 static int init_inodecache(void)
2309 {
2310 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2311 				sizeof(struct shmem_inode_info),
2312 				0, SLAB_PANIC, init_once);
2313 	return 0;
2314 }
2315 
2316 static void destroy_inodecache(void)
2317 {
2318 	kmem_cache_destroy(shmem_inode_cachep);
2319 }
2320 
2321 static const struct address_space_operations shmem_aops = {
2322 	.writepage	= shmem_writepage,
2323 	.set_page_dirty	= __set_page_dirty_no_writeback,
2324 #ifdef CONFIG_TMPFS
2325 	.readpage	= shmem_readpage,
2326 	.write_begin	= shmem_write_begin,
2327 	.write_end	= shmem_write_end,
2328 #endif
2329 	.migratepage	= migrate_page,
2330 };
2331 
2332 static const struct file_operations shmem_file_operations = {
2333 	.mmap		= shmem_mmap,
2334 #ifdef CONFIG_TMPFS
2335 	.llseek		= generic_file_llseek,
2336 	.read		= shmem_file_read,
2337 	.write		= do_sync_write,
2338 	.aio_write	= generic_file_aio_write,
2339 	.fsync		= simple_sync_file,
2340 	.splice_read	= generic_file_splice_read,
2341 	.splice_write	= generic_file_splice_write,
2342 #endif
2343 };
2344 
2345 static const struct inode_operations shmem_inode_operations = {
2346 	.truncate	= shmem_truncate,
2347 	.setattr	= shmem_notify_change,
2348 	.truncate_range	= shmem_truncate_range,
2349 #ifdef CONFIG_TMPFS_POSIX_ACL
2350 	.setxattr	= generic_setxattr,
2351 	.getxattr	= generic_getxattr,
2352 	.listxattr	= generic_listxattr,
2353 	.removexattr	= generic_removexattr,
2354 	.permission	= shmem_permission,
2355 #endif
2356 
2357 };
2358 
2359 static const struct inode_operations shmem_dir_inode_operations = {
2360 #ifdef CONFIG_TMPFS
2361 	.create		= shmem_create,
2362 	.lookup		= simple_lookup,
2363 	.link		= shmem_link,
2364 	.unlink		= shmem_unlink,
2365 	.symlink	= shmem_symlink,
2366 	.mkdir		= shmem_mkdir,
2367 	.rmdir		= shmem_rmdir,
2368 	.mknod		= shmem_mknod,
2369 	.rename		= shmem_rename,
2370 #endif
2371 #ifdef CONFIG_TMPFS_POSIX_ACL
2372 	.setattr	= shmem_notify_change,
2373 	.setxattr	= generic_setxattr,
2374 	.getxattr	= generic_getxattr,
2375 	.listxattr	= generic_listxattr,
2376 	.removexattr	= generic_removexattr,
2377 	.permission	= shmem_permission,
2378 #endif
2379 };
2380 
2381 static const struct inode_operations shmem_special_inode_operations = {
2382 #ifdef CONFIG_TMPFS_POSIX_ACL
2383 	.setattr	= shmem_notify_change,
2384 	.setxattr	= generic_setxattr,
2385 	.getxattr	= generic_getxattr,
2386 	.listxattr	= generic_listxattr,
2387 	.removexattr	= generic_removexattr,
2388 	.permission	= shmem_permission,
2389 #endif
2390 };
2391 
2392 static const struct super_operations shmem_ops = {
2393 	.alloc_inode	= shmem_alloc_inode,
2394 	.destroy_inode	= shmem_destroy_inode,
2395 #ifdef CONFIG_TMPFS
2396 	.statfs		= shmem_statfs,
2397 	.remount_fs	= shmem_remount_fs,
2398 #endif
2399 	.delete_inode	= shmem_delete_inode,
2400 	.drop_inode	= generic_delete_inode,
2401 	.put_super	= shmem_put_super,
2402 };
2403 
2404 static struct vm_operations_struct shmem_vm_ops = {
2405 	.fault		= shmem_fault,
2406 #ifdef CONFIG_NUMA
2407 	.set_policy     = shmem_set_policy,
2408 	.get_policy     = shmem_get_policy,
2409 #endif
2410 };
2411 
2412 
2413 static int shmem_get_sb(struct file_system_type *fs_type,
2414 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2415 {
2416 	return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
2417 }
2418 
2419 static struct file_system_type tmpfs_fs_type = {
2420 	.owner		= THIS_MODULE,
2421 	.name		= "tmpfs",
2422 	.get_sb		= shmem_get_sb,
2423 	.kill_sb	= kill_litter_super,
2424 };
2425 static struct vfsmount *shm_mnt;
2426 
2427 static int __init init_tmpfs(void)
2428 {
2429 	int error;
2430 
2431 	error = bdi_init(&shmem_backing_dev_info);
2432 	if (error)
2433 		goto out4;
2434 
2435 	error = init_inodecache();
2436 	if (error)
2437 		goto out3;
2438 
2439 	error = register_filesystem(&tmpfs_fs_type);
2440 	if (error) {
2441 		printk(KERN_ERR "Could not register tmpfs\n");
2442 		goto out2;
2443 	}
2444 
2445 	shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
2446 				tmpfs_fs_type.name, NULL);
2447 	if (IS_ERR(shm_mnt)) {
2448 		error = PTR_ERR(shm_mnt);
2449 		printk(KERN_ERR "Could not kern_mount tmpfs\n");
2450 		goto out1;
2451 	}
2452 	return 0;
2453 
2454 out1:
2455 	unregister_filesystem(&tmpfs_fs_type);
2456 out2:
2457 	destroy_inodecache();
2458 out3:
2459 	bdi_destroy(&shmem_backing_dev_info);
2460 out4:
2461 	shm_mnt = ERR_PTR(error);
2462 	return error;
2463 }
2464 module_init(init_tmpfs)
2465 
2466 /*
2467  * shmem_file_setup - get an unlinked file living in tmpfs
2468  *
2469  * @name: name for dentry (to be seen in /proc/<pid>/maps)
2470  * @size: size to be set for the file
2471  * @flags: vm flags (only VM_ACCOUNT is used here)
2472  */
2473 struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2474 {
2475 	int error;
2476 	struct file *file;
2477 	struct inode *inode;
2478 	struct dentry *dentry, *root;
2479 	struct qstr this;
2480 
2481 	if (IS_ERR(shm_mnt))
2482 		return (void *)shm_mnt;
2483 
2484 	if (size < 0 || size > SHMEM_MAX_BYTES)
2485 		return ERR_PTR(-EINVAL);
2486 
2487 	if (shmem_acct_size(flags, size))
2488 		return ERR_PTR(-ENOMEM);
2489 
2490 	error = -ENOMEM;
2491 	this.name = name;
2492 	this.len = strlen(name);
2493 	this.hash = 0; /* will go */
2494 	root = shm_mnt->mnt_root;
2495 	dentry = d_alloc(root, &this);
2496 	if (!dentry)
2497 		goto put_memory;
2498 
2499 	error = -ENFILE;
2500 	file = get_empty_filp();
2501 	if (!file)
2502 		goto put_dentry;
2503 
2504 	error = -ENOSPC;
2505 	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
2506 	if (!inode)
2507 		goto close_file;
2508 
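	/* Record VM_ACCOUNT so the shmem_acct_size() charge can be undone later */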
2509 	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2510 	d_instantiate(dentry, inode);
2511 	inode->i_size = size;
2512 	inode->i_nlink = 0;	/* It is unlinked */
2513 	init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2514 			&shmem_file_operations);
2515 	return file;
2516 
2517 close_file:
2518 	put_filp(file);
2519 put_dentry:
2520 	dput(dentry);
2521 put_memory:
2522 	shmem_unacct_size(flags, size);
2523 	return ERR_PTR(error);
2524 }
2525 
2526 /*
2527  * shmem_zero_setup - setup a shared anonymous mapping
2528  *
2529  * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
2530  */
2531 int shmem_zero_setup(struct vm_area_struct *vma)
2532 {
2533 	struct file *file;
2534 	loff_t size = vma->vm_end - vma->vm_start;
2535 
2536 	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2537 	if (IS_ERR(file))
2538 		return PTR_ERR(file);
2539 
2540 	if (vma->vm_file)
2541 		fput(vma->vm_file);
2542 	vma->vm_file = file;
2543 	vma->vm_ops = &shmem_vm_ops;
2544 	return 0;
2545 }
2546