xref: /openbmc/linux/mm/filemap.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  *	linux/mm/filemap.c
3  *
4  * Copyright (C) 1994-1999  Linus Torvalds
5  */
6 
7 /*
8  * This file handles the generic file mmap semantics used by
9  * most "normal" filesystems (but you don't /have/ to use this:
10  * the NFS filesystem used to do this differently, for example)
11  */
12 #include <linux/config.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/compiler.h>
16 #include <linux/fs.h>
17 #include <linux/aio.h>
18 #include <linux/kernel_stat.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 #include <linux/mman.h>
22 #include <linux/pagemap.h>
23 #include <linux/file.h>
24 #include <linux/uio.h>
25 #include <linux/hash.h>
26 #include <linux/writeback.h>
27 #include <linux/pagevec.h>
28 #include <linux/blkdev.h>
29 #include <linux/security.h>
30 #include <linux/syscalls.h>
31 #include "filemap.h"
32 /*
33  * FIXME: remove all knowledge of the buffer layer from the core VM
34  */
35 #include <linux/buffer_head.h> /* for generic_osync_inode */
36 
37 #include <asm/uaccess.h>
38 #include <asm/mman.h>
39 
40 static ssize_t
41 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
42 	loff_t offset, unsigned long nr_segs);
43 
44 /*
45  * Shared mappings implemented 30.11.1994. It's not fully working yet,
46  * though.
47  *
48  * Shared mappings now work. 15.8.1995  Bruno.
49  *
50  * finished 'unifying' the page and buffer cache and SMP-threaded the
51  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
52  *
53  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
54  */
55 
56 /*
57  * Lock ordering:
58  *
59  *  ->i_mmap_lock		(vmtruncate)
60  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
61  *      ->swap_lock		(exclusive_swap_page, others)
62  *        ->mapping->tree_lock
63  *
64  *  ->i_sem
65  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
66  *
67  *  ->mmap_sem
68  *    ->i_mmap_lock
69  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
70  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
71  *
72  *  ->mmap_sem
73  *    ->lock_page		(access_process_vm)
74  *
75  *  ->mmap_sem
76  *    ->i_sem			(msync)
77  *
78  *  ->i_sem
79  *    ->i_alloc_sem             (various)
80  *
81  *  ->inode_lock
82  *    ->sb_lock			(fs/fs-writeback.c)
83  *    ->mapping->tree_lock	(__sync_single_inode)
84  *
85  *  ->i_mmap_lock
86  *    ->anon_vma.lock		(vma_adjust)
87  *
88  *  ->anon_vma.lock
89  *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
90  *
91  *  ->page_table_lock or pte_lock
92  *    ->swap_lock		(try_to_unmap_one)
93  *    ->private_lock		(try_to_unmap_one)
94  *    ->tree_lock		(try_to_unmap_one)
95  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
96  *    ->private_lock		(page_remove_rmap->set_page_dirty)
97  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
98  *    ->inode_lock		(page_remove_rmap->set_page_dirty)
99  *    ->inode_lock		(zap_pte_range->set_page_dirty)
100  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
101  *
102  *  ->task->proc_lock
103  *    ->dcache_lock		(proc_pid_lookup)
104  */
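
/*
 * Illustrative sketch (assumption, not part of the original file): a
 * hypothetical caller needing both a mapping's i_mmap_lock and its tree_lock
 * must honour the ordering documented above and take them outer-to-inner:
 *
 *	spin_lock(&mapping->i_mmap_lock);
 *	write_lock_irq(&mapping->tree_lock);
 *	... operate on the radix tree ...
 *	write_unlock_irq(&mapping->tree_lock);
 *	spin_unlock(&mapping->i_mmap_lock);
 *
 * Taking them in the opposite order risks an AB-BA deadlock against the
 * vmtruncate() path.
 */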
105 
106 /*
107  * Remove a page from the page cache and free it. Caller has to make
108  * sure the page is locked and that nobody else uses it - or that usage
109  * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
110  */
111 void __remove_from_page_cache(struct page *page)
112 {
113 	struct address_space *mapping = page->mapping;
114 
115 	radix_tree_delete(&mapping->page_tree, page->index);
116 	page->mapping = NULL;
117 	mapping->nrpages--;
118 	pagecache_acct(-1);
119 }
120 
121 void remove_from_page_cache(struct page *page)
122 {
123 	struct address_space *mapping = page->mapping;
124 
125 	BUG_ON(!PageLocked(page));
126 
127 	write_lock_irq(&mapping->tree_lock);
128 	__remove_from_page_cache(page);
129 	write_unlock_irq(&mapping->tree_lock);
130 }
131 
132 static int sync_page(void *word)
133 {
134 	struct address_space *mapping;
135 	struct page *page;
136 
137 	page = container_of((page_flags_t *)word, struct page, flags);
138 
139 	/*
140 	 * page_mapping() is being called without PG_locked held.
141 	 * Some knowledge of the state and use of the page is used to
142 	 * reduce the requirements down to a memory barrier.
143 	 * The danger here is of a stale page_mapping() return value
144 	 * indicating a struct address_space different from the one it's
145 	 * associated with when it is associated with one.
146 	 * After smp_mb(), it's either the correct page_mapping() for
147 	 * the page, or an old page_mapping() and the page's own
148 	 * page_mapping() has gone NULL.
149 	 * The ->sync_page() address_space operation must tolerate
150 	 * page_mapping() going NULL. By an amazing coincidence,
151 	 * this comes about because none of the users of the page
152 	 * in the ->sync_page() methods make essential use of the
153 	 * page_mapping(), merely passing the page down to the backing
154 	 * device's unplug functions when it's non-NULL, which in turn
155 	 * ignore it for all cases but swap, where only page_private(page) is
156 	 * of interest. When page_mapping() does go NULL, the entire
157 	 * call stack gracefully ignores the page and returns.
158 	 * -- wli
159 	 */
160 	smp_mb();
161 	mapping = page_mapping(page);
162 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
163 		mapping->a_ops->sync_page(page);
164 	io_schedule();
165 	return 0;
166 }
167 
168 /**
169  * filemap_fdatawrite_range - start writeback against all of a mapping's
170  * dirty pages that lie within the byte offsets <start, end>
171  * @mapping:	address space structure to write
172  * @start:	offset in bytes where the range starts
173  * @end:	offset in bytes where the range ends
174  * @sync_mode:	enable synchronous operation
175  *
176  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
177  * opposed to a regular memory-cleansing writeback.  The difference between
178  * these two operations is that if a dirty page/buffer is encountered, it must
179  * be waited upon, and not just skipped over.
180  */
181 static int __filemap_fdatawrite_range(struct address_space *mapping,
182 	loff_t start, loff_t end, int sync_mode)
183 {
184 	int ret;
185 	struct writeback_control wbc = {
186 		.sync_mode = sync_mode,
187 		.nr_to_write = mapping->nrpages * 2,
188 		.start = start,
189 		.end = end,
190 	};
191 
192 	if (!mapping_cap_writeback_dirty(mapping))
193 		return 0;
194 
195 	ret = do_writepages(mapping, &wbc);
196 	return ret;
197 }
198 
199 static inline int __filemap_fdatawrite(struct address_space *mapping,
200 	int sync_mode)
201 {
202 	return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
203 }
204 
205 int filemap_fdatawrite(struct address_space *mapping)
206 {
207 	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
208 }
209 EXPORT_SYMBOL(filemap_fdatawrite);
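
/*
 * Illustrative sketch (assumption, not from the original): a data-integrity
 * caller typically pairs the write side with a wait, e.g.
 *
 *	err = filemap_fdatawrite(mapping);
 *	if (!err)
 *		err = filemap_fdatawait(mapping);
 *
 * which is exactly what filemap_write_and_wait() below packages up.
 */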
210 
211 static int filemap_fdatawrite_range(struct address_space *mapping,
212 	loff_t start, loff_t end)
213 {
214 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
215 }
216 
217 /*
218  * This is a mostly non-blocking flush.  Not suitable for data-integrity
219  * purposes - I/O may not be started against all dirty pages.
220  */
221 int filemap_flush(struct address_space *mapping)
222 {
223 	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
224 }
225 EXPORT_SYMBOL(filemap_flush);
226 
227 /*
228  * Wait for writeback to complete against pages indexed by start->end
229  * inclusive
230  */
231 static int wait_on_page_writeback_range(struct address_space *mapping,
232 				pgoff_t start, pgoff_t end)
233 {
234 	struct pagevec pvec;
235 	int nr_pages;
236 	int ret = 0;
237 	pgoff_t index;
238 
239 	if (end < start)
240 		return 0;
241 
242 	pagevec_init(&pvec, 0);
243 	index = start;
244 	while ((index <= end) &&
245 			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
246 			PAGECACHE_TAG_WRITEBACK,
247 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
248 		unsigned i;
249 
250 		for (i = 0; i < nr_pages; i++) {
251 			struct page *page = pvec.pages[i];
252 
253 			/* until radix tree lookup accepts end_index */
254 			if (page->index > end)
255 				continue;
256 
257 			wait_on_page_writeback(page);
258 			if (PageError(page))
259 				ret = -EIO;
260 		}
261 		pagevec_release(&pvec);
262 		cond_resched();
263 	}
264 
265 	/* Check for outstanding write errors */
266 	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
267 		ret = -ENOSPC;
268 	if (test_and_clear_bit(AS_EIO, &mapping->flags))
269 		ret = -EIO;
270 
271 	return ret;
272 }
273 
274 /*
275  * Write and wait upon all the pages in the passed range.  This is a "data
276  * integrity" operation.  It waits upon in-flight writeout before starting and
277  * waiting upon new writeout.  If there was an IO error, return it.
278  *
279  * We need to re-take i_sem during the generic_osync_inode list walk because
280  * it is otherwise livelockable.
281  */
282 int sync_page_range(struct inode *inode, struct address_space *mapping,
283 			loff_t pos, size_t count)
284 {
285 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
286 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
287 	int ret;
288 
289 	if (!mapping_cap_writeback_dirty(mapping) || !count)
290 		return 0;
291 	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
292 	if (ret == 0) {
293 		down(&inode->i_sem);
294 		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
295 		up(&inode->i_sem);
296 	}
297 	if (ret == 0)
298 		ret = wait_on_page_writeback_range(mapping, start, end);
299 	return ret;
300 }
301 EXPORT_SYMBOL(sync_page_range);
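
/*
 * Illustrative sketch (assumption, not from the original): an O_SYNC write
 * path calls this after a successful buffered write, e.g.
 *
 *	err = sync_page_range(inode, mapping, pos, written);
 *	if (err < 0)
 *		written = err;
 */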
302 
303 /*
304  * Note: Holding i_sem across sync_page_range_nolock is not a good idea
305  * as it forces O_SYNC writers to different parts of the same file
306  * to be serialised right until I/O completion.
307  */
308 static int sync_page_range_nolock(struct inode *inode,
309 				  struct address_space *mapping,
310 				  loff_t pos, size_t count)
311 {
312 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
313 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
314 	int ret;
315 
316 	if (!mapping_cap_writeback_dirty(mapping) || !count)
317 		return 0;
318 	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
319 	if (ret == 0)
320 		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
321 	if (ret == 0)
322 		ret = wait_on_page_writeback_range(mapping, start, end);
323 	return ret;
324 }
325 
326 /**
327  * filemap_fdatawait - walk the list of under-writeback pages of the given
328  *     address space and wait for all of them.
329  *
330  * @mapping: address space structure to wait for
331  */
332 int filemap_fdatawait(struct address_space *mapping)
333 {
334 	loff_t i_size = i_size_read(mapping->host);
335 
336 	if (i_size == 0)
337 		return 0;
338 
339 	return wait_on_page_writeback_range(mapping, 0,
340 				(i_size - 1) >> PAGE_CACHE_SHIFT);
341 }
342 EXPORT_SYMBOL(filemap_fdatawait);
343 
344 int filemap_write_and_wait(struct address_space *mapping)
345 {
346 	int retval = 0;
347 
348 	if (mapping->nrpages) {
349 		retval = filemap_fdatawrite(mapping);
350 		if (retval == 0)
351 			retval = filemap_fdatawait(mapping);
352 	}
353 	return retval;
354 }
355 
356 int filemap_write_and_wait_range(struct address_space *mapping,
357 				 loff_t lstart, loff_t lend)
358 {
359 	int retval = 0;
360 
361 	if (mapping->nrpages) {
362 		retval = __filemap_fdatawrite_range(mapping, lstart, lend,
363 						    WB_SYNC_ALL);
364 		if (retval == 0)
365 			retval = wait_on_page_writeback_range(mapping,
366 						    lstart >> PAGE_CACHE_SHIFT,
367 						    lend >> PAGE_CACHE_SHIFT);
368 	}
369 	return retval;
370 }
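
/*
 * Illustrative sketch (assumption, not from the original): a caller about to
 * do direct I/O against [pos, pos + len) can push out and wait for any
 * overlapping dirty pagecache first:
 *
 *	err = filemap_write_and_wait_range(mapping, pos, pos + len - 1);
 *	if (err)
 *		return err;
 */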
371 
372 /*
373  * This function is used to add newly allocated pagecache pages:
374  * the page is new, so we can just run SetPageLocked() against it.
375  * The other page state flags were set by rmqueue().
376  *
377  * This function does not add the page to the LRU.  The caller must do that.
378  */
379 int add_to_page_cache(struct page *page, struct address_space *mapping,
380 		pgoff_t offset, gfp_t gfp_mask)
381 {
382 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
383 
384 	if (error == 0) {
385 		write_lock_irq(&mapping->tree_lock);
386 		error = radix_tree_insert(&mapping->page_tree, offset, page);
387 		if (!error) {
388 			page_cache_get(page);
389 			SetPageLocked(page);
390 			page->mapping = mapping;
391 			page->index = offset;
392 			mapping->nrpages++;
393 			pagecache_acct(1);
394 		}
395 		write_unlock_irq(&mapping->tree_lock);
396 		radix_tree_preload_end();
397 	}
398 	return error;
399 }
400 
401 EXPORT_SYMBOL(add_to_page_cache);
402 
403 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
404 				pgoff_t offset, gfp_t gfp_mask)
405 {
406 	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
407 	if (ret == 0)
408 		lru_cache_add(page);
409 	return ret;
410 }
411 
412 /*
413  * In order to wait for pages to become available there must be
414  * waitqueues associated with pages. By using a hash table of
415  * waitqueues where the bucket discipline is to maintain all
416  * waiters on the same queue and wake all when any of the pages
417  * become available, and for the woken contexts to check to be
418  * sure the appropriate page became available, this saves space
419  * at a cost of "thundering herd" phenomena during rare hash
420  * collisions.
421  */
422 static wait_queue_head_t *page_waitqueue(struct page *page)
423 {
424 	const struct zone *zone = page_zone(page);
425 
426 	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
427 }
428 
429 static inline void wake_up_page(struct page *page, int bit)
430 {
431 	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
432 }
433 
434 void fastcall wait_on_page_bit(struct page *page, int bit_nr)
435 {
436 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
437 
438 	if (test_bit(bit_nr, &page->flags))
439 		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
440 							TASK_UNINTERRUPTIBLE);
441 }
442 EXPORT_SYMBOL(wait_on_page_bit);
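
/*
 * Illustrative sketch (assumption, not from the original): the helpers in
 * pagemap.h are thin wrappers around this, roughly
 *
 *	if (PageLocked(page))
 *		wait_on_page_bit(page, PG_locked);
 *
 * and similarly for PG_writeback.
 */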
443 
444 /**
445  * unlock_page() - unlock a locked page
446  *
447  * @page: the page
448  *
449  * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
450  * Also wakes sleepers in wait_on_page_writeback() because the wakeup
451  * mechanism between PageLocked pages and PageWriteback pages is shared.
452  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
453  *
454  * The first mb is necessary to safely close the critical section opened by the
455  * TestSetPageLocked(), the second mb is necessary to enforce ordering between
456  * the clear_bit and the read of the waitqueue (to avoid SMP races with a
457  * parallel wait_on_page_locked()).
458  */
459 void fastcall unlock_page(struct page *page)
460 {
461 	smp_mb__before_clear_bit();
462 	if (!TestClearPageLocked(page))
463 		BUG();
464 	smp_mb__after_clear_bit();
465 	wake_up_page(page, PG_locked);
466 }
467 EXPORT_SYMBOL(unlock_page);
468 
469 /*
470  * End writeback against a page.
471  */
472 void end_page_writeback(struct page *page)
473 {
474 	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
475 		if (!test_clear_page_writeback(page))
476 			BUG();
477 	}
478 	smp_mb__after_clear_bit();
479 	wake_up_page(page, PG_writeback);
480 }
481 EXPORT_SYMBOL(end_page_writeback);
482 
483 /*
484  * Get a lock on the page, assuming we need to sleep to get it.
485  *
486  * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
487  * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
488  * chances are that on the second loop, the block layer's plug list is empty,
489  * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
490  */
491 void fastcall __lock_page(struct page *page)
492 {
493 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
494 
495 	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
496 							TASK_UNINTERRUPTIBLE);
497 }
498 EXPORT_SYMBOL(__lock_page);
499 
500 /*
501  * A rather lightweight function, finding and getting a reference to a
502  * pagecache page atomically.
503  */
504 struct page * find_get_page(struct address_space *mapping, unsigned long offset)
505 {
506 	struct page *page;
507 
508 	read_lock_irq(&mapping->tree_lock);
509 	page = radix_tree_lookup(&mapping->page_tree, offset);
510 	if (page)
511 		page_cache_get(page);
512 	read_unlock_irq(&mapping->tree_lock);
513 	return page;
514 }
515 
516 EXPORT_SYMBOL(find_get_page);
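
/*
 * Illustrative sketch (assumption, not from the original): callers own the
 * reference that find_get_page() returns and must drop it when done:
 *
 *	struct page *page = find_get_page(mapping, index);
 *	if (page) {
 *		...
 *		page_cache_release(page);
 *	}
 */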
517 
518 /*
519  * Same as above, but trylock it instead of incrementing the count.
520  */
521 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
522 {
523 	struct page *page;
524 
525 	read_lock_irq(&mapping->tree_lock);
526 	page = radix_tree_lookup(&mapping->page_tree, offset);
527 	if (page && TestSetPageLocked(page))
528 		page = NULL;
529 	read_unlock_irq(&mapping->tree_lock);
530 	return page;
531 }
532 
533 EXPORT_SYMBOL(find_trylock_page);
534 
535 /**
536  * find_lock_page - locate, pin and lock a pagecache page
537  *
538  * @mapping: the address_space to search
539  * @offset: the page index
540  *
541  * Locates the desired pagecache page, locks it, increments its reference
542  * count and returns its address.
543  *
544  * Returns NULL if the page was not present. find_lock_page() may sleep.
545  */
546 struct page *find_lock_page(struct address_space *mapping,
547 				unsigned long offset)
548 {
549 	struct page *page;
550 
551 	read_lock_irq(&mapping->tree_lock);
552 repeat:
553 	page = radix_tree_lookup(&mapping->page_tree, offset);
554 	if (page) {
555 		page_cache_get(page);
556 		if (TestSetPageLocked(page)) {
557 			read_unlock_irq(&mapping->tree_lock);
558 			lock_page(page);
559 			read_lock_irq(&mapping->tree_lock);
560 
561 			/* Has the page been truncated while we slept? */
562 			if (page->mapping != mapping || page->index != offset) {
563 				unlock_page(page);
564 				page_cache_release(page);
565 				goto repeat;
566 			}
567 		}
568 	}
569 	read_unlock_irq(&mapping->tree_lock);
570 	return page;
571 }
572 
573 EXPORT_SYMBOL(find_lock_page);
574 
575 /**
576  * find_or_create_page - locate or add a pagecache page
577  *
578  * @mapping: the page's address_space
579  * @index: the page's index into the mapping
580  * @gfp_mask: page allocation mode
581  *
582  * Locates a page in the pagecache.  If the page is not present, a new page
583  * is allocated using @gfp_mask and is added to the pagecache and to the VM's
584  * LRU list.  The returned page is locked and has its reference count
585  * incremented.
586  *
587  * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
588  * allocation!
589  *
590  * find_or_create_page() returns the desired page's address, or NULL on
591  * memory exhaustion.
592  */
593 struct page *find_or_create_page(struct address_space *mapping,
594 		unsigned long index, gfp_t gfp_mask)
595 {
596 	struct page *page, *cached_page = NULL;
597 	int err;
598 repeat:
599 	page = find_lock_page(mapping, index);
600 	if (!page) {
601 		if (!cached_page) {
602 			cached_page = alloc_page(gfp_mask);
603 			if (!cached_page)
604 				return NULL;
605 		}
606 		err = add_to_page_cache_lru(cached_page, mapping,
607 					index, gfp_mask);
608 		if (!err) {
609 			page = cached_page;
610 			cached_page = NULL;
611 		} else if (err == -EEXIST)
612 			goto repeat;
613 	}
614 	if (cached_page)
615 		page_cache_release(cached_page);
616 	return page;
617 }
618 
619 EXPORT_SYMBOL(find_or_create_page);
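
/*
 * Illustrative sketch (assumption, not from the original): grabbing the page
 * at @index locked, creating it if need be, then releasing both the lock and
 * the reference:
 *
 *	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
 *	if (!page)
 *		return -ENOMEM;
 *	...
 *	unlock_page(page);
 *	page_cache_release(page);
 */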
620 
621 /**
622  * find_get_pages - gang pagecache lookup
623  * @mapping:	The address_space to search
624  * @start:	The starting page index
625  * @nr_pages:	The maximum number of pages
626  * @pages:	Where the resulting pages are placed
627  *
628  * find_get_pages() will search for and return a group of up to
629  * @nr_pages pages in the mapping.  The pages are placed at @pages.
630  * find_get_pages() takes a reference against the returned pages.
631  *
632  * The search returns a group of mapping-contiguous pages with ascending
633  * indexes.  There may be holes in the indices due to not-present pages.
634  *
635  * find_get_pages() returns the number of pages which were found.
636  */
637 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
638 			    unsigned int nr_pages, struct page **pages)
639 {
640 	unsigned int i;
641 	unsigned int ret;
642 
643 	read_lock_irq(&mapping->tree_lock);
644 	ret = radix_tree_gang_lookup(&mapping->page_tree,
645 				(void **)pages, start, nr_pages);
646 	for (i = 0; i < ret; i++)
647 		page_cache_get(pages[i]);
648 	read_unlock_irq(&mapping->tree_lock);
649 	return ret;
650 }
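
/*
 * Illustrative sketch (assumption, not from the original): a gang-lookup
 * walk over a mapping, dropping each reference as it goes:
 *
 *	struct page *pages[PAGEVEC_SIZE];
 *	pgoff_t index = 0;
 *	unsigned nr, i;
 *
 *	while ((nr = find_get_pages(mapping, index, PAGEVEC_SIZE, pages))) {
 *		index = pages[nr - 1]->index + 1;
 *		for (i = 0; i < nr; i++)
 *			page_cache_release(pages[i]);
 *	}
 */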
651 
652 /*
653  * Like find_get_pages, except we only return pages which are tagged with
654  * `tag'.   We update *index to index the next page for the traversal.
655  */
656 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
657 			int tag, unsigned int nr_pages, struct page **pages)
658 {
659 	unsigned int i;
660 	unsigned int ret;
661 
662 	read_lock_irq(&mapping->tree_lock);
663 	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
664 				(void **)pages, *index, nr_pages, tag);
665 	for (i = 0; i < ret; i++)
666 		page_cache_get(pages[i]);
667 	if (ret)
668 		*index = pages[ret - 1]->index + 1;
669 	read_unlock_irq(&mapping->tree_lock);
670 	return ret;
671 }
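
/*
 * Illustrative sketch (assumption, not from the original): the same walk,
 * restricted to pages carrying a particular tag (here the dirty tag), with
 * the index advanced by the lookup itself:
 *
 *	pgoff_t index = 0;
 *	while ((nr = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
 *					PAGEVEC_SIZE, pages))) {
 *		for (i = 0; i < nr; i++)
 *			page_cache_release(pages[i]);
 *	}
 */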
672 
673 /*
674  * Same as grab_cache_page, but do not wait if the page is unavailable.
675  * This is intended for speculative data generators, where the data can
676  * be regenerated if the page couldn't be grabbed.  This routine should
677  * be safe to call while holding the lock for another page.
678  *
679  * Clear __GFP_FS when allocating the page to avoid recursion into the fs
680  * and deadlock against the caller's locked page.
681  */
682 struct page *
683 grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
684 {
685 	struct page *page = find_get_page(mapping, index);
686 	gfp_t gfp_mask;
687 
688 	if (page) {
689 		if (!TestSetPageLocked(page))
690 			return page;
691 		page_cache_release(page);
692 		return NULL;
693 	}
694 	gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
695 	page = alloc_pages(gfp_mask, 0);
696 	if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
697 		page_cache_release(page);
698 		page = NULL;
699 	}
700 	return page;
701 }
702 
703 EXPORT_SYMBOL(grab_cache_page_nowait);
704 
705 /*
706  * This is a generic file read routine, and uses the
707  * mapping->a_ops->readpage() function for the actual low-level
708  * stuff.
709  *
710  * This is really ugly. But the goto's actually try to clarify some
711  * of the logic when it comes to error handling etc.
712  *
713  * Note the struct file* is only passed for the use of readpage.  It may be
714  * NULL.
715  */
716 void do_generic_mapping_read(struct address_space *mapping,
717 			     struct file_ra_state *_ra,
718 			     struct file *filp,
719 			     loff_t *ppos,
720 			     read_descriptor_t *desc,
721 			     read_actor_t actor)
722 {
723 	struct inode *inode = mapping->host;
724 	unsigned long index;
725 	unsigned long end_index;
726 	unsigned long offset;
727 	unsigned long last_index;
728 	unsigned long next_index;
729 	unsigned long prev_index;
730 	loff_t isize;
731 	struct page *cached_page;
732 	int error;
733 	struct file_ra_state ra = *_ra;
734 
735 	cached_page = NULL;
736 	index = *ppos >> PAGE_CACHE_SHIFT;
737 	next_index = index;
738 	prev_index = ra.prev_page;
739 	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
740 	offset = *ppos & ~PAGE_CACHE_MASK;
741 
742 	isize = i_size_read(inode);
743 	if (!isize)
744 		goto out;
745 
746 	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
747 	for (;;) {
748 		struct page *page;
749 		unsigned long nr, ret;
750 
751 		/* nr is the maximum number of bytes to copy from this page */
752 		nr = PAGE_CACHE_SIZE;
753 		if (index >= end_index) {
754 			if (index > end_index)
755 				goto out;
756 			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
757 			if (nr <= offset) {
758 				goto out;
759 			}
760 		}
761 		nr = nr - offset;
762 
763 		cond_resched();
764 		if (index == next_index)
765 			next_index = page_cache_readahead(mapping, &ra, filp,
766 					index, last_index - index);
767 
768 find_page:
769 		page = find_get_page(mapping, index);
770 		if (unlikely(page == NULL)) {
771 			handle_ra_miss(mapping, &ra, index);
772 			goto no_cached_page;
773 		}
774 		if (!PageUptodate(page))
775 			goto page_not_up_to_date;
776 page_ok:
777 
778 		/* If users can be writing to this page using arbitrary
779 		 * virtual addresses, take care about potential aliasing
780 		 * before reading the page on the kernel side.
781 		 */
782 		if (mapping_writably_mapped(mapping))
783 			flush_dcache_page(page);
784 
785 		/*
786 		 * When (part of) the same page is read multiple times
787 		 * in succession, only mark it as accessed the first time.
788 		 */
789 		if (prev_index != index)
790 			mark_page_accessed(page);
791 		prev_index = index;
792 
793 		/*
794 		 * Ok, we have the page, and it's up-to-date, so
795 		 * now we can copy it to user space...
796 		 *
797 		 * The actor routine returns how many bytes were actually used..
798 		 * NOTE! This may not be the same as how much of a user buffer
799 		 * we filled up (we may be padding etc), so we can only update
800 		 * "pos" here (the actor routine has to update the user buffer
801 		 * pointers and the remaining count).
802 		 */
803 		ret = actor(desc, page, offset, nr);
804 		offset += ret;
805 		index += offset >> PAGE_CACHE_SHIFT;
806 		offset &= ~PAGE_CACHE_MASK;
807 
808 		page_cache_release(page);
809 		if (ret == nr && desc->count)
810 			continue;
811 		goto out;
812 
813 page_not_up_to_date:
814 		/* Get exclusive access to the page ... */
815 		lock_page(page);
816 
817 		/* Did it get unhashed before we got the lock? */
818 		if (!page->mapping) {
819 			unlock_page(page);
820 			page_cache_release(page);
821 			continue;
822 		}
823 
824 		/* Did somebody else fill it already? */
825 		if (PageUptodate(page)) {
826 			unlock_page(page);
827 			goto page_ok;
828 		}
829 
830 readpage:
831 		/* Start the actual read. The read will unlock the page. */
832 		error = mapping->a_ops->readpage(filp, page);
833 
834 		if (unlikely(error))
835 			goto readpage_error;
836 
837 		if (!PageUptodate(page)) {
838 			lock_page(page);
839 			if (!PageUptodate(page)) {
840 				if (page->mapping == NULL) {
841 					/*
842 					 * invalidate_inode_pages got it
843 					 */
844 					unlock_page(page);
845 					page_cache_release(page);
846 					goto find_page;
847 				}
848 				unlock_page(page);
849 				error = -EIO;
850 				goto readpage_error;
851 			}
852 			unlock_page(page);
853 		}
854 
855 		/*
856 		 * i_size must be checked after we have done ->readpage.
857 		 *
858 		 * Checking i_size after the readpage allows us to calculate
859 		 * the correct value for "nr", which means the zero-filled
860 		 * part of the page is not copied back to userspace (unless
861 		 * another truncate extends the file - this is desired though).
862 		 */
863 		isize = i_size_read(inode);
864 		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
865 		if (unlikely(!isize || index > end_index)) {
866 			page_cache_release(page);
867 			goto out;
868 		}
869 
870 		/* nr is the maximum number of bytes to copy from this page */
871 		nr = PAGE_CACHE_SIZE;
872 		if (index == end_index) {
873 			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
874 			if (nr <= offset) {
875 				page_cache_release(page);
876 				goto out;
877 			}
878 		}
879 		nr = nr - offset;
880 		goto page_ok;
881 
882 readpage_error:
883 		/* UHHUH! A synchronous read error occurred. Report it */
884 		desc->error = error;
885 		page_cache_release(page);
886 		goto out;
887 
888 no_cached_page:
889 		/*
890 		 * Ok, it wasn't cached, so we need to create a new
891 		 * page..
892 		 */
893 		if (!cached_page) {
894 			cached_page = page_cache_alloc_cold(mapping);
895 			if (!cached_page) {
896 				desc->error = -ENOMEM;
897 				goto out;
898 			}
899 		}
900 		error = add_to_page_cache_lru(cached_page, mapping,
901 						index, GFP_KERNEL);
902 		if (error) {
903 			if (error == -EEXIST)
904 				goto find_page;
905 			desc->error = error;
906 			goto out;
907 		}
908 		page = cached_page;
909 		cached_page = NULL;
910 		goto readpage;
911 	}
912 
913 out:
914 	*_ra = ra;
915 
916 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
917 	if (cached_page)
918 		page_cache_release(cached_page);
919 	if (filp)
920 		file_accessed(filp);
921 }
922 
923 EXPORT_SYMBOL(do_generic_mapping_read);
924 
925 int file_read_actor(read_descriptor_t *desc, struct page *page,
926 			unsigned long offset, unsigned long size)
927 {
928 	char *kaddr;
929 	unsigned long left, count = desc->count;
930 
931 	if (size > count)
932 		size = count;
933 
934 	/*
935 	 * Faults on the destination of a read are common, so do it before
936 	 * taking the kmap.
937 	 */
938 	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
939 		kaddr = kmap_atomic(page, KM_USER0);
940 		left = __copy_to_user_inatomic(desc->arg.buf,
941 						kaddr + offset, size);
942 		kunmap_atomic(kaddr, KM_USER0);
943 		if (left == 0)
944 			goto success;
945 	}
946 
947 	/* Do it the slow way */
948 	kaddr = kmap(page);
949 	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
950 	kunmap(page);
951 
952 	if (left) {
953 		size -= left;
954 		desc->error = -EFAULT;
955 	}
956 success:
957 	desc->count = count - size;
958 	desc->written += size;
959 	desc->arg.buf += size;
960 	return size;
961 }
962 
963 /*
964  * This is the "read()" routine for all filesystems
965  * that can use the page cache directly.
966  */
967 ssize_t
968 __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
969 		unsigned long nr_segs, loff_t *ppos)
970 {
971 	struct file *filp = iocb->ki_filp;
972 	ssize_t retval;
973 	unsigned long seg;
974 	size_t count;
975 
976 	count = 0;
977 	for (seg = 0; seg < nr_segs; seg++) {
978 		const struct iovec *iv = &iov[seg];
979 
980 		/*
981 		 * If any segment has a negative length, or the cumulative
982 		 * length ever wraps negative then return -EINVAL.
983 		 */
984 		count += iv->iov_len;
985 		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
986 			return -EINVAL;
987 		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
988 			continue;
989 		if (seg == 0)
990 			return -EFAULT;
991 		nr_segs = seg;
992 		count -= iv->iov_len;	/* This segment is no good */
993 		break;
994 	}
995 
996 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
997 	if (filp->f_flags & O_DIRECT) {
998 		loff_t pos = *ppos, size;
999 		struct address_space *mapping;
1000 		struct inode *inode;
1001 
1002 		mapping = filp->f_mapping;
1003 		inode = mapping->host;
1004 		retval = 0;
1005 		if (!count)
1006 			goto out; /* skip atime */
1007 		size = i_size_read(inode);
1008 		if (pos < size) {
1009 			retval = generic_file_direct_IO(READ, iocb,
1010 						iov, pos, nr_segs);
1011 			if (retval > 0 && !is_sync_kiocb(iocb))
1012 				retval = -EIOCBQUEUED;
1013 			if (retval > 0)
1014 				*ppos = pos + retval;
1015 		}
1016 		file_accessed(filp);
1017 		goto out;
1018 	}
1019 
1020 	retval = 0;
1021 	if (count) {
1022 		for (seg = 0; seg < nr_segs; seg++) {
1023 			read_descriptor_t desc;
1024 
1025 			desc.written = 0;
1026 			desc.arg.buf = iov[seg].iov_base;
1027 			desc.count = iov[seg].iov_len;
1028 			if (desc.count == 0)
1029 				continue;
1030 			desc.error = 0;
1031 			do_generic_file_read(filp,ppos,&desc,file_read_actor);
1032 			retval += desc.written;
1033 			if (desc.error) {
1034 				retval = retval ?: desc.error;
1035 				break;
1036 			}
1037 		}
1038 	}
1039 out:
1040 	return retval;
1041 }
1042 
1043 EXPORT_SYMBOL(__generic_file_aio_read);
1044 
1045 ssize_t
1046 generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
1047 {
1048 	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1049 
1050 	BUG_ON(iocb->ki_pos != pos);
1051 	return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1052 }
1053 
1054 EXPORT_SYMBOL(generic_file_aio_read);
1055 
1056 ssize_t
1057 generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1058 {
1059 	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1060 	struct kiocb kiocb;
1061 	ssize_t ret;
1062 
1063 	init_sync_kiocb(&kiocb, filp);
1064 	ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
1065 	if (-EIOCBQUEUED == ret)
1066 		ret = wait_on_sync_kiocb(&kiocb);
1067 	return ret;
1068 }
1069 
1070 EXPORT_SYMBOL(generic_file_read);
1071 
1072 int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1073 {
1074 	ssize_t written;
1075 	unsigned long count = desc->count;
1076 	struct file *file = desc->arg.data;
1077 
1078 	if (size > count)
1079 		size = count;
1080 
1081 	written = file->f_op->sendpage(file, page, offset,
1082 				       size, &file->f_pos, size<count);
1083 	if (written < 0) {
1084 		desc->error = written;
1085 		written = 0;
1086 	}
1087 	desc->count = count - written;
1088 	desc->written += written;
1089 	return written;
1090 }
1091 
1092 ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1093 			 size_t count, read_actor_t actor, void *target)
1094 {
1095 	read_descriptor_t desc;
1096 
1097 	if (!count)
1098 		return 0;
1099 
1100 	desc.written = 0;
1101 	desc.count = count;
1102 	desc.arg.data = target;
1103 	desc.error = 0;
1104 
1105 	do_generic_file_read(in_file, ppos, &desc, actor);
1106 	if (desc.written)
1107 		return desc.written;
1108 	return desc.error;
1109 }
1110 
1111 EXPORT_SYMBOL(generic_file_sendfile);
1112 
1113 static ssize_t
1114 do_readahead(struct address_space *mapping, struct file *filp,
1115 	     unsigned long index, unsigned long nr)
1116 {
1117 	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1118 		return -EINVAL;
1119 
1120 	force_page_cache_readahead(mapping, filp, index,
1121 					max_sane_readahead(nr));
1122 	return 0;
1123 }
1124 
1125 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1126 {
1127 	ssize_t ret;
1128 	struct file *file;
1129 
1130 	ret = -EBADF;
1131 	file = fget(fd);
1132 	if (file) {
1133 		if (file->f_mode & FMODE_READ) {
1134 			struct address_space *mapping = file->f_mapping;
1135 			unsigned long start = offset >> PAGE_CACHE_SHIFT;
1136 			unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1137 			unsigned long len = end - start + 1;
1138 			ret = do_readahead(mapping, file, start, len);
1139 		}
1140 		fput(file);
1141 	}
1142 	return ret;
1143 }
1144 
1145 #ifdef CONFIG_MMU
1146 /*
1147  * This adds the requested page to the page cache if it isn't already there,
1148  * and schedules an I/O to read in its contents from disk.
1149  */
1150 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1151 static int fastcall page_cache_read(struct file * file, unsigned long offset)
1152 {
1153 	struct address_space *mapping = file->f_mapping;
1154 	struct page *page;
1155 	int error;
1156 
1157 	page = page_cache_alloc_cold(mapping);
1158 	if (!page)
1159 		return -ENOMEM;
1160 
1161 	error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1162 	if (!error) {
1163 		error = mapping->a_ops->readpage(file, page);
1164 		page_cache_release(page);
1165 		return error;
1166 	}
1167 
1168 	/*
1169 	 * We arrive here in the unlikely event that someone
1170 	 * raced with us and added our page to the cache first
1171 	 * or we are out of memory for radix-tree nodes.
1172 	 */
1173 	page_cache_release(page);
1174 	return error == -EEXIST ? 0 : error;
1175 }
1176 
1177 #define MMAP_LOTSAMISS  (100)
1178 
1179 /*
1180  * filemap_nopage() is invoked via the vma operations vector for a
1181  * mapped memory region to read in file data during a page fault.
1182  *
1183  * The goto's are kind of ugly, but this streamlines the normal case of having
1184  * it in the page cache, and handles the special cases reasonably without
1185  * having a lot of duplicated code.
1186  */
1187 struct page *filemap_nopage(struct vm_area_struct *area,
1188 				unsigned long address, int *type)
1189 {
1190 	int error;
1191 	struct file *file = area->vm_file;
1192 	struct address_space *mapping = file->f_mapping;
1193 	struct file_ra_state *ra = &file->f_ra;
1194 	struct inode *inode = mapping->host;
1195 	struct page *page;
1196 	unsigned long size, pgoff;
1197 	int did_readaround = 0, majmin = VM_FAULT_MINOR;
1198 
1199 	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1200 
1201 retry_all:
1202 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1203 	if (pgoff >= size)
1204 		goto outside_data_content;
1205 
1206 	/* If we don't want any read-ahead, don't bother */
1207 	if (VM_RandomReadHint(area))
1208 		goto no_cached_page;
1209 
1210 	/*
1211 	 * The readahead code wants to be told about each and every page
1212 	 * so it can build and shrink its windows appropriately
1213 	 *
1214 	 * For sequential accesses, we use the generic readahead logic.
1215 	 */
1216 	if (VM_SequentialReadHint(area))
1217 		page_cache_readahead(mapping, ra, file, pgoff, 1);
1218 
1219 	/*
1220 	 * Do we have something in the page cache already?
1221 	 */
1222 retry_find:
1223 	page = find_get_page(mapping, pgoff);
1224 	if (!page) {
1225 		unsigned long ra_pages;
1226 
1227 		if (VM_SequentialReadHint(area)) {
1228 			handle_ra_miss(mapping, ra, pgoff);
1229 			goto no_cached_page;
1230 		}
1231 		ra->mmap_miss++;
1232 
1233 		/*
1234 		 * Do we miss much more than hit in this file? If so,
1235 		 * stop bothering with read-ahead. It will only hurt.
1236 		 */
1237 		if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
1238 			goto no_cached_page;
1239 
1240 		/*
1241 		 * To keep the pgmajfault counter straight, we need to
1242 		 * check did_readaround, as this is an inner loop.
1243 		 */
1244 		if (!did_readaround) {
1245 			majmin = VM_FAULT_MAJOR;
1246 			inc_page_state(pgmajfault);
1247 		}
1248 		did_readaround = 1;
1249 		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1250 		if (ra_pages) {
1251 			pgoff_t start = 0;
1252 
1253 			if (pgoff > ra_pages / 2)
1254 				start = pgoff - ra_pages / 2;
1255 			do_page_cache_readahead(mapping, file, start, ra_pages);
1256 		}
1257 		page = find_get_page(mapping, pgoff);
1258 		if (!page)
1259 			goto no_cached_page;
1260 	}
1261 
1262 	if (!did_readaround)
1263 		ra->mmap_hit++;
1264 
1265 	/*
1266 	 * Ok, found a page in the page cache, now we need to check
1267 	 * that it's up-to-date.
1268 	 */
1269 	if (!PageUptodate(page))
1270 		goto page_not_uptodate;
1271 
1272 success:
1273 	/*
1274 	 * Found the page and have a reference on it.
1275 	 */
1276 	mark_page_accessed(page);
1277 	if (type)
1278 		*type = majmin;
1279 	return page;
1280 
1281 outside_data_content:
1282 	/*
1283 	 * An external ptracer can access pages that normally aren't
1284 	 * accessible..
1285 	 */
1286 	if (area->vm_mm == current->mm)
1287 		return NULL;
1288 	/* Fall through to the non-read-ahead case */
1289 no_cached_page:
1290 	/*
1291 	 * We're only likely to ever get here if MADV_RANDOM is in
1292 	 * effect.
1293 	 */
1294 	error = page_cache_read(file, pgoff);
1295 	grab_swap_token();
1296 
1297 	/*
1298 	 * The page we want has now been added to the page cache.
1299 	 * In the unlikely event that someone removed it in the
1300 	 * meantime, we'll just come back here and read it again.
1301 	 */
1302 	if (error >= 0)
1303 		goto retry_find;
1304 
1305 	/*
1306 	 * An error return from page_cache_read can result if the
1307 	 * system is low on memory, or a problem occurs while trying
1308 	 * to schedule I/O.
1309 	 */
1310 	if (error == -ENOMEM)
1311 		return NOPAGE_OOM;
1312 	return NULL;
1313 
1314 page_not_uptodate:
1315 	if (!did_readaround) {
1316 		majmin = VM_FAULT_MAJOR;
1317 		inc_page_state(pgmajfault);
1318 	}
1319 	lock_page(page);
1320 
1321 	/* Did it get unhashed while we waited for it? */
1322 	if (!page->mapping) {
1323 		unlock_page(page);
1324 		page_cache_release(page);
1325 		goto retry_all;
1326 	}
1327 
1328 	/* Did somebody else get it up-to-date? */
1329 	if (PageUptodate(page)) {
1330 		unlock_page(page);
1331 		goto success;
1332 	}
1333 
1334 	if (!mapping->a_ops->readpage(file, page)) {
1335 		wait_on_page_locked(page);
1336 		if (PageUptodate(page))
1337 			goto success;
1338 	}
1339 
1340 	/*
1341 	 * Umm, take care of errors if the page isn't up-to-date.
1342 	 * Try to re-read it _once_. We do this synchronously,
1343 	 * because there really aren't any performance issues here
1344 	 * and we need to check for errors.
1345 	 */
1346 	lock_page(page);
1347 
1348 	/* Somebody truncated the page on us? */
1349 	if (!page->mapping) {
1350 		unlock_page(page);
1351 		page_cache_release(page);
1352 		goto retry_all;
1353 	}
1354 
1355 	/* Somebody else successfully read it in? */
1356 	if (PageUptodate(page)) {
1357 		unlock_page(page);
1358 		goto success;
1359 	}
1360 	ClearPageError(page);
1361 	if (!mapping->a_ops->readpage(file, page)) {
1362 		wait_on_page_locked(page);
1363 		if (PageUptodate(page))
1364 			goto success;
1365 	}
1366 
1367 	/*
1368 	 * Things didn't work out. Return zero to tell the
1369 	 * mm layer so, possibly freeing the page cache page first.
1370 	 */
1371 	page_cache_release(page);
1372 	return NULL;
1373 }
1374 
1375 EXPORT_SYMBOL(filemap_nopage);
1376 
1377 static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
1378 					int nonblock)
1379 {
1380 	struct address_space *mapping = file->f_mapping;
1381 	struct page *page;
1382 	int error;
1383 
1384 	/*
1385 	 * Do we have something in the page cache already?
1386 	 */
1387 retry_find:
1388 	page = find_get_page(mapping, pgoff);
1389 	if (!page) {
1390 		if (nonblock)
1391 			return NULL;
1392 		goto no_cached_page;
1393 	}
1394 
1395 	/*
1396 	 * Ok, found a page in the page cache, now we need to check
1397 	 * that it's up-to-date.
1398 	 */
1399 	if (!PageUptodate(page)) {
1400 		if (nonblock) {
1401 			page_cache_release(page);
1402 			return NULL;
1403 		}
1404 		goto page_not_uptodate;
1405 	}
1406 
1407 success:
1408 	/*
1409 	 * Found the page and have a reference on it.
1410 	 */
1411 	mark_page_accessed(page);
1412 	return page;
1413 
1414 no_cached_page:
1415 	error = page_cache_read(file, pgoff);
1416 
1417 	/*
1418 	 * The page we want has now been added to the page cache.
1419 	 * In the unlikely event that someone removed it in the
1420 	 * meantime, we'll just come back here and read it again.
1421 	 */
1422 	if (error >= 0)
1423 		goto retry_find;
1424 
1425 	/*
1426 	 * An error return from page_cache_read can result if the
1427 	 * system is low on memory, or a problem occurs while trying
1428 	 * to schedule I/O.
1429 	 */
1430 	return NULL;
1431 
1432 page_not_uptodate:
1433 	lock_page(page);
1434 
1435 	/* Did it get unhashed while we waited for it? */
1436 	if (!page->mapping) {
1437 		unlock_page(page);
1438 		goto err;
1439 	}
1440 
1441 	/* Did somebody else get it up-to-date? */
1442 	if (PageUptodate(page)) {
1443 		unlock_page(page);
1444 		goto success;
1445 	}
1446 
1447 	if (!mapping->a_ops->readpage(file, page)) {
1448 		wait_on_page_locked(page);
1449 		if (PageUptodate(page))
1450 			goto success;
1451 	}
1452 
1453 	/*
1454 	 * Umm, take care of errors if the page isn't up-to-date.
1455 	 * Try to re-read it _once_. We do this synchronously,
1456 	 * because there really aren't any performance issues here
1457 	 * and we need to check for errors.
1458 	 */
1459 	lock_page(page);
1460 
1461 	/* Somebody truncated the page on us? */
1462 	if (!page->mapping) {
1463 		unlock_page(page);
1464 		goto err;
1465 	}
1466 	/* Somebody else successfully read it in? */
1467 	if (PageUptodate(page)) {
1468 		unlock_page(page);
1469 		goto success;
1470 	}
1471 
1472 	ClearPageError(page);
1473 	if (!mapping->a_ops->readpage(file, page)) {
1474 		wait_on_page_locked(page);
1475 		if (PageUptodate(page))
1476 			goto success;
1477 	}
1478 
1479 	/*
1480 	 * Things didn't work out. Return zero to tell the
1481 	 * mm layer so, possibly freeing the page cache page first.
1482 	 */
1483 err:
1484 	page_cache_release(page);
1485 
1486 	return NULL;
1487 }
1488 
1489 int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
1490 		unsigned long len, pgprot_t prot, unsigned long pgoff,
1491 		int nonblock)
1492 {
1493 	struct file *file = vma->vm_file;
1494 	struct address_space *mapping = file->f_mapping;
1495 	struct inode *inode = mapping->host;
1496 	unsigned long size;
1497 	struct mm_struct *mm = vma->vm_mm;
1498 	struct page *page;
1499 	int err;
1500 
1501 	if (!nonblock)
1502 		force_page_cache_readahead(mapping, vma->vm_file,
1503 					pgoff, len >> PAGE_CACHE_SHIFT);
1504 
1505 repeat:
1506 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1507 	if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
1508 		return -EINVAL;
1509 
1510 	page = filemap_getpage(file, pgoff, nonblock);
1511 
1512 	/* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
1513 	 * done in shmem_populate calling shmem_getpage */
1514 	if (!page && !nonblock)
1515 		return -ENOMEM;
1516 
1517 	if (page) {
1518 		err = install_page(mm, vma, addr, page, prot);
1519 		if (err) {
1520 			page_cache_release(page);
1521 			return err;
1522 		}
1523 	} else if (vma->vm_flags & VM_NONLINEAR) {
1524 		/* No page was found just because we can't read it in now (being
1525 		 * here implies nonblock != 0), but the page may exist, so set
1526 		 * the PTE to fault it in later. */
1527 		err = install_file_pte(mm, vma, addr, pgoff, prot);
1528 		if (err)
1529 			return err;
1530 	}
1531 
1532 	len -= PAGE_SIZE;
1533 	addr += PAGE_SIZE;
1534 	pgoff++;
1535 	if (len)
1536 		goto repeat;
1537 
1538 	return 0;
1539 }
1540 EXPORT_SYMBOL(filemap_populate);
1541 
1542 struct vm_operations_struct generic_file_vm_ops = {
1543 	.nopage		= filemap_nopage,
1544 	.populate	= filemap_populate,
1545 };
1546 
1547 /* This is used for a general mmap of a disk file */
1548 
1549 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1550 {
1551 	struct address_space *mapping = file->f_mapping;
1552 
1553 	if (!mapping->a_ops->readpage)
1554 		return -ENOEXEC;
1555 	file_accessed(file);
1556 	vma->vm_ops = &generic_file_vm_ops;
1557 	return 0;
1558 }
1559 
1560 /*
1561  * This is for filesystems which do not implement ->writepage.
1562  */
1563 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1564 {
1565 	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1566 		return -EINVAL;
1567 	return generic_file_mmap(file, vma);
1568 }
1569 #else
1570 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1571 {
1572 	return -ENOSYS;
1573 }
1574 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1575 {
1576 	return -ENOSYS;
1577 }
1578 #endif /* CONFIG_MMU */
1579 
1580 EXPORT_SYMBOL(generic_file_mmap);
1581 EXPORT_SYMBOL(generic_file_readonly_mmap);
1582 
1583 static inline struct page *__read_cache_page(struct address_space *mapping,
1584 				unsigned long index,
1585 				int (*filler)(void *,struct page*),
1586 				void *data)
1587 {
1588 	struct page *page, *cached_page = NULL;
1589 	int err;
1590 repeat:
1591 	page = find_get_page(mapping, index);
1592 	if (!page) {
1593 		if (!cached_page) {
1594 			cached_page = page_cache_alloc_cold(mapping);
1595 			if (!cached_page)
1596 				return ERR_PTR(-ENOMEM);
1597 		}
1598 		err = add_to_page_cache_lru(cached_page, mapping,
1599 					index, GFP_KERNEL);
1600 		if (err == -EEXIST)
1601 			goto repeat;
1602 		if (err < 0) {
1603 			/* Presumably ENOMEM for radix tree node */
1604 			page_cache_release(cached_page);
1605 			return ERR_PTR(err);
1606 		}
1607 		page = cached_page;
1608 		cached_page = NULL;
1609 		err = filler(data, page);
1610 		if (err < 0) {
1611 			page_cache_release(page);
1612 			page = ERR_PTR(err);
1613 		}
1614 	}
1615 	if (cached_page)
1616 		page_cache_release(cached_page);
1617 	return page;
1618 }
1619 
1620 /*
1621  * Read into the page cache. If a page already exists,
1622  * and PageUptodate() is not set, try to fill the page.
1623  */
1624 struct page *read_cache_page(struct address_space *mapping,
1625 				unsigned long index,
1626 				int (*filler)(void *,struct page*),
1627 				void *data)
1628 {
1629 	struct page *page;
1630 	int err;
1631 
1632 retry:
1633 	page = __read_cache_page(mapping, index, filler, data);
1634 	if (IS_ERR(page))
1635 		goto out;
1636 	mark_page_accessed(page);
1637 	if (PageUptodate(page))
1638 		goto out;
1639 
1640 	lock_page(page);
1641 	if (!page->mapping) {
1642 		unlock_page(page);
1643 		page_cache_release(page);
1644 		goto retry;
1645 	}
1646 	if (PageUptodate(page)) {
1647 		unlock_page(page);
1648 		goto out;
1649 	}
1650 	err = filler(data, page);
1651 	if (err < 0) {
1652 		page_cache_release(page);
1653 		page = ERR_PTR(err);
1654 	}
1655  out:
1656 	return page;
1657 }
1658 
1659 EXPORT_SYMBOL(read_cache_page);
1660 
1661 /*
1662  * If the page was newly created, increment its refcount and add it to the
1663  * caller's lru-buffering pagevec.  This function is specifically for
1664  * generic_file_write().
1665  */
1666 static inline struct page *
1667 __grab_cache_page(struct address_space *mapping, unsigned long index,
1668 			struct page **cached_page, struct pagevec *lru_pvec)
1669 {
1670 	int err;
1671 	struct page *page;
1672 repeat:
1673 	page = find_lock_page(mapping, index);
1674 	if (!page) {
1675 		if (!*cached_page) {
1676 			*cached_page = page_cache_alloc(mapping);
1677 			if (!*cached_page)
1678 				return NULL;
1679 		}
1680 		err = add_to_page_cache(*cached_page, mapping,
1681 					index, GFP_KERNEL);
1682 		if (err == -EEXIST)
1683 			goto repeat;
1684 		if (err == 0) {
1685 			page = *cached_page;
1686 			page_cache_get(page);
1687 			if (!pagevec_add(lru_pvec, page))
1688 				__pagevec_lru_add(lru_pvec);
1689 			*cached_page = NULL;
1690 		}
1691 	}
1692 	return page;
1693 }
1694 
1695 /*
1696  * The logic we want is
1697  *
1698  *	if suid or (sgid and xgrp)
1699  *		remove privs
1700  */
1701 int remove_suid(struct dentry *dentry)
1702 {
1703 	mode_t mode = dentry->d_inode->i_mode;
1704 	int kill = 0;
1705 	int result = 0;
1706 
1707 	/* suid always must be killed */
1708 	if (unlikely(mode & S_ISUID))
1709 		kill = ATTR_KILL_SUID;
1710 
1711 	/*
1712 	 * sgid without any exec bits is just a mandatory locking mark; leave
1713 	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
1714 	 */
1715 	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1716 		kill |= ATTR_KILL_SGID;
1717 
1718 	if (unlikely(kill && !capable(CAP_FSETID))) {
1719 		struct iattr newattrs;
1720 
1721 		newattrs.ia_valid = ATTR_FORCE | kill;
1722 		result = notify_change(dentry, &newattrs);
1723 	}
1724 	return result;
1725 }
1726 EXPORT_SYMBOL(remove_suid);
1727 
1728 size_t
1729 __filemap_copy_from_user_iovec(char *vaddr,
1730 			const struct iovec *iov, size_t base, size_t bytes)
1731 {
1732 	size_t copied = 0, left = 0;
1733 
1734 	while (bytes) {
1735 		char __user *buf = iov->iov_base + base;
1736 		int copy = min(bytes, iov->iov_len - base);
1737 
1738 		base = 0;
1739 		left = __copy_from_user_inatomic(vaddr, buf, copy);
1740 		copied += copy;
1741 		bytes -= copy;
1742 		vaddr += copy;
1743 		iov++;
1744 
1745 		if (unlikely(left)) {
1746 			/* zero the rest of the target like __copy_from_user */
1747 			if (bytes)
1748 				memset(vaddr, 0, bytes);
1749 			break;
1750 		}
1751 	}
1752 	return copied - left;
1753 }
1754 
1755 /*
1756  * Performs necessary checks before doing a write
1757  *
1758  * Can adjust writing position or number of bytes to write.
1759  * Returns appropriate error code that caller should return or
1760  * zero in case that write should be allowed.
1761  */
1762 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1763 {
1764 	struct inode *inode = file->f_mapping->host;
1765 	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1766 
1767 	if (unlikely(*pos < 0))
1768 		return -EINVAL;
1769 
1770 	if (!isblk) {
1771 		/* FIXME: this is for backwards compatibility with 2.4 */
1772 		if (file->f_flags & O_APPEND)
1773 			*pos = i_size_read(inode);
1774 
1775 		if (limit != RLIM_INFINITY) {
1776 			if (*pos >= limit) {
1777 				send_sig(SIGXFSZ, current, 0);
1778 				return -EFBIG;
1779 			}
1780 			if (*count > limit - (typeof(limit))*pos) {
1781 				*count = limit - (typeof(limit))*pos;
1782 			}
1783 		}
1784 	}
1785 
1786 	/*
1787 	 * LFS rule
1788 	 */
1789 	if (unlikely(*pos + *count > MAX_NON_LFS &&
1790 				!(file->f_flags & O_LARGEFILE))) {
1791 		if (*pos >= MAX_NON_LFS) {
1792 			send_sig(SIGXFSZ, current, 0);
1793 			return -EFBIG;
1794 		}
1795 		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1796 			*count = MAX_NON_LFS - (unsigned long)*pos;
1797 		}
1798 	}
1799 
1800 	/*
1801 	 * Are we about to exceed the fs block limit ?
1802 	 *
1803 	 * If we have written data it becomes a short write.  If we have
1804 	 * exceeded without writing data we send a signal and return EFBIG.
1805  * Linus' frestrict idea will clean these up nicely.
1806 	 */
1807 	if (likely(!isblk)) {
1808 		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
1809 			if (*count || *pos > inode->i_sb->s_maxbytes) {
1810 				send_sig(SIGXFSZ, current, 0);
1811 				return -EFBIG;
1812 			}
1813 			/* zero-length writes at ->s_maxbytes are OK */
1814 		}
1815 
1816 		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
1817 			*count = inode->i_sb->s_maxbytes - *pos;
1818 	} else {
1819 		loff_t isize;
1820 		if (bdev_read_only(I_BDEV(inode)))
1821 			return -EPERM;
1822 		isize = i_size_read(inode);
1823 		if (*pos >= isize) {
1824 			if (*count || *pos > isize)
1825 				return -ENOSPC;
1826 		}
1827 
1828 		if (*pos + *count > isize)
1829 			*count = isize - *pos;
1830 	}
1831 	return 0;
1832 }
1833 EXPORT_SYMBOL(generic_write_checks);
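
/*
 * Illustrative sketch (assumption, not from the original): write paths call
 * this before doing any real work, honouring a possibly shortened count:
 *
 *	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 *	if (err)
 *		return err;
 *	if (!count)
 *		return 0;
 */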
1834 
1835 ssize_t
1836 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1837 		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
1838 		size_t count, size_t ocount)
1839 {
1840 	struct file	*file = iocb->ki_filp;
1841 	struct address_space *mapping = file->f_mapping;
1842 	struct inode	*inode = mapping->host;
1843 	ssize_t		written;
1844 
1845 	if (count != ocount)
1846 		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
1847 
1848 	written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
1849 	if (written > 0) {
1850 		loff_t end = pos + written;
1851 		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
1852 			i_size_write(inode,  end);
1853 			mark_inode_dirty(inode);
1854 		}
1855 		*ppos = end;
1856 	}
1857 
1858 	/*
1859 	 * Sync the fs metadata but not the minor inode changes and
1860 	 * of course not the data as we did direct DMA for the IO.
1861 	 * i_sem is held, which protects generic_osync_inode() from
1862 	 * livelocking.
1863 	 */
1864 	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1865 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
1866 		if (err < 0)
1867 			written = err;
1868 	}
1869 	if (written == count && !is_sync_kiocb(iocb))
1870 		written = -EIOCBQUEUED;
1871 	return written;
1872 }
1873 EXPORT_SYMBOL(generic_file_direct_write);
1874 
1875 ssize_t
1876 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1877 		unsigned long nr_segs, loff_t pos, loff_t *ppos,
1878 		size_t count, ssize_t written)
1879 {
1880 	struct file *file = iocb->ki_filp;
1881 	struct address_space * mapping = file->f_mapping;
1882 	struct address_space_operations *a_ops = mapping->a_ops;
1883 	struct inode 	*inode = mapping->host;
1884 	long		status = 0;
1885 	struct page	*page;
1886 	struct page	*cached_page = NULL;
1887 	size_t		bytes;
1888 	struct pagevec	lru_pvec;
1889 	const struct iovec *cur_iov = iov; /* current iovec */
1890 	size_t		iov_base = 0;	   /* offset in the current iovec */
1891 	char __user	*buf;
1892 
1893 	pagevec_init(&lru_pvec, 0);
1894 
1895 	/*
1896 	 * handle partial DIO write.  Adjust cur_iov if needed.
1897 	 */
1898 	if (likely(nr_segs == 1))
1899 		buf = iov->iov_base + written;
1900 	else {
1901 		filemap_set_next_iovec(&cur_iov, &iov_base, written);
1902 		buf = cur_iov->iov_base + iov_base;
1903 	}
1904 
1905 	do {
1906 		unsigned long index;
1907 		unsigned long offset;
1908 		unsigned long maxlen;
1909 		size_t copied;
1910 
1911 		offset = (pos & (PAGE_CACHE_SIZE - 1)); /* Within page */
1912 		index = pos >> PAGE_CACHE_SHIFT;
1913 		bytes = PAGE_CACHE_SIZE - offset;
1914 		if (bytes > count)
1915 			bytes = count;
1916 
1917 		/*
1918 		 * Bring in the user page that we will copy from _first_.
1919 		 * Otherwise there's a nasty deadlock on copying from the
1920 		 * same page as we're writing to, without it being marked
1921 		 * up-to-date.
1922 		 */
1923 		maxlen = cur_iov->iov_len - iov_base;
1924 		if (maxlen > bytes)
1925 			maxlen = bytes;
1926 		fault_in_pages_readable(buf, maxlen);
1927 
1928 		page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
1929 		if (!page) {
1930 			status = -ENOMEM;
1931 			break;
1932 		}
1933 
1934 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
1935 		if (unlikely(status)) {
1936 			loff_t isize = i_size_read(inode);
1937 			/*
1938 			 * prepare_write() may have instantiated a few blocks
1939 			 * outside i_size.  Trim these off again.
1940 			 */
1941 			unlock_page(page);
1942 			page_cache_release(page);
1943 			if (pos + bytes > isize)
1944 				vmtruncate(inode, isize);
1945 			break;
1946 		}
1947 		if (likely(nr_segs == 1))
1948 			copied = filemap_copy_from_user(page, offset,
1949 							buf, bytes);
1950 		else
1951 			copied = filemap_copy_from_user_iovec(page, offset,
1952 						cur_iov, iov_base, bytes);
1953 		flush_dcache_page(page);
1954 		status = a_ops->commit_write(file, page, offset, offset+bytes);
1955 		if (likely(copied > 0)) {
1956 			if (!status)
1957 				status = copied;
1958 
1959 			if (status >= 0) {
1960 				written += status;
1961 				count -= status;
1962 				pos += status;
1963 				buf += status;
1964 				if (unlikely(nr_segs > 1)) {
1965 					filemap_set_next_iovec(&cur_iov,
1966 							&iov_base, status);
1967 					if (count)
1968 						buf = cur_iov->iov_base +
1969 							iov_base;
1970 				} else {
1971 					iov_base += status;
1972 				}
1973 			}
1974 		}
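		/*
		 * A short copy means the source page became unreadable
		 * despite the fault_in_pages_readable() above; flag -EFAULT
		 * unless commit_write() already reported an error.
		 */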
1975 		if (unlikely(copied != bytes))
1976 			if (status >= 0)
1977 				status = -EFAULT;
1978 		unlock_page(page);
1979 		mark_page_accessed(page);
1980 		page_cache_release(page);
1981 		if (status < 0)
1982 			break;
1983 		balance_dirty_pages_ratelimited(mapping);
1984 		cond_resched();
1985 	} while (count);
1986 	*ppos = pos;
1987 
1988 	if (cached_page)
1989 		page_cache_release(cached_page);
1990 
1991 	/*
1992 	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
1993 	 */
1994 	if (likely(status >= 0)) {
1995 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1996 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
1997 				status = generic_osync_inode(inode, mapping,
1998 						OSYNC_METADATA|OSYNC_DATA);
1999 		}
2000 	}
2001 
2002 	/*
2003 	 * If we get here for O_DIRECT writes then we must have fallen through
2004 	 * to buffered writes (block instantiation inside i_size).  So we sync
2005 	 * the file data here, to try to honour O_DIRECT expectations.
2006 	 */
2007 	if (unlikely(file->f_flags & O_DIRECT) && written)
2008 		status = filemap_write_and_wait(mapping);
2009 
2010 	pagevec_lru_add(&lru_pvec);
2011 	return written ? written : status;
2012 }
2013 EXPORT_SYMBOL(generic_file_buffered_write);
2014 
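/*
 * Core of the generic write path.  Validates the iovec array (a negative
 * segment length or a wrapping total is -EINVAL, an unreadable first
 * segment is -EFAULT, later bad segments merely shorten the request), waits
 * for a frozen filesystem to thaw, applies generic_write_checks(), strips
 * the setuid/setgid bits and updates the file times.  O_DIRECT writes go
 * through generic_file_direct_write() and fall back to a buffered write for
 * any remainder (e.g. a write into a hole); everything else goes straight
 * to generic_file_buffered_write().
 */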
2015 static ssize_t
2016 __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2017 				unsigned long nr_segs, loff_t *ppos)
2018 {
2019 	struct file *file = iocb->ki_filp;
2020 	struct address_space *mapping = file->f_mapping;
2021 	size_t ocount;		/* original count */
2022 	size_t count;		/* after file limit checks */
2023 	struct inode 	*inode = mapping->host;
2024 	unsigned long	seg;
2025 	loff_t		pos;
2026 	ssize_t		written;
2027 	ssize_t		err;
2028 
2029 	ocount = 0;
2030 	for (seg = 0; seg < nr_segs; seg++) {
2031 		const struct iovec *iv = &iov[seg];
2032 
2033 		/*
2034 		 * If any segment has a negative length, or the cumulative
2035 		 * length ever wraps negative then return -EINVAL.
2036 		 */
2037 		ocount += iv->iov_len;
2038 		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
2039 			return -EINVAL;
2040 		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2041 			continue;
2042 		if (seg == 0)
2043 			return -EFAULT;
2044 		nr_segs = seg;
2045 		ocount -= iv->iov_len;	/* This segment is no good */
2046 		break;
2047 	}
2048 
2049 	count = ocount;
2050 	pos = *ppos;
2051 
2052 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2053 
2054 	/* We can write back this queue in page reclaim */
2055 	current->backing_dev_info = mapping->backing_dev_info;
2056 	written = 0;
2057 
2058 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2059 	if (err)
2060 		goto out;
2061 
2062 	if (count == 0)
2063 		goto out;
2064 
2065 	err = remove_suid(file->f_dentry);
2066 	if (err)
2067 		goto out;
2068 
2069 	inode_update_time(inode, 1);
2070 
2071 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2072 	if (unlikely(file->f_flags & O_DIRECT)) {
2073 		written = generic_file_direct_write(iocb, iov,
2074 				&nr_segs, pos, ppos, count, ocount);
2075 		if (written < 0 || written == count)
2076 			goto out;
2077 		/*
2078 		 * direct-io write to a hole: fall through to buffered I/O
2079 		 * for completing the rest of the request.
2080 		 */
2081 		pos += written;
2082 		count -= written;
2083 	}
2084 
2085 	written = generic_file_buffered_write(iocb, iov, nr_segs,
2086 			pos, ppos, count, written);
2087 out:
2088 	current->backing_dev_info = NULL;
2089 	return written ? written : err;
2090 }
2092 
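/*
 * Like __generic_file_aio_write_nolock() above, but for O_SYNC or IS_SYNC()
 * files the written range is flushed with sync_page_range_nolock().  As the
 * name says, i_sem is not taken here; any locking is the caller's business.
 */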
2093 ssize_t
2094 generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2095 				unsigned long nr_segs, loff_t *ppos)
2096 {
2097 	struct file *file = iocb->ki_filp;
2098 	struct address_space *mapping = file->f_mapping;
2099 	struct inode *inode = mapping->host;
2100 	ssize_t ret;
2101 	loff_t pos = *ppos;
2102 
2103 	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
2104 
2105 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2106 		int err;
2107 
2108 		err = sync_page_range_nolock(inode, mapping, pos, ret);
2109 		if (err < 0)
2110 			ret = err;
2111 	}
2112 	return ret;
2113 }
EXPORT_SYMBOL(generic_file_aio_write_nolock);
2114 
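/*
 * Synchronous helper: run the aio write path on a sync kiocb and wait for
 * the result if the write got queued.
 */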
2115 static ssize_t
2116 __generic_file_write_nolock(struct file *file, const struct iovec *iov,
2117 				unsigned long nr_segs, loff_t *ppos)
2118 {
2119 	struct kiocb kiocb;
2120 	ssize_t ret;
2121 
2122 	init_sync_kiocb(&kiocb, file);
2123 	ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2124 	if (ret == -EIOCBQUEUED)
2125 		ret = wait_on_sync_kiocb(&kiocb);
2126 	return ret;
2127 }
2128 
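/*
 * As above, but via generic_file_aio_write_nolock() so that the O_SYNC
 * flush of the written range is performed as well.
 */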
2129 ssize_t
2130 generic_file_write_nolock(struct file *file, const struct iovec *iov,
2131 				unsigned long nr_segs, loff_t *ppos)
2132 {
2133 	struct kiocb kiocb;
2134 	ssize_t ret;
2135 
2136 	init_sync_kiocb(&kiocb, file);
2137 	ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2138 	if (ret == -EIOCBQUEUED)
2139 		ret = wait_on_sync_kiocb(&kiocb);
2140 	return ret;
2141 }
2142 EXPORT_SYMBOL(generic_file_write_nolock);
2143 
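/*
 * generic_file_aio_write - ->aio_write entry for the generic write path.
 * Takes i_sem, writes at iocb->ki_pos (which must equal @pos), and for
 * O_SYNC or IS_SYNC() files syncs the written range with sync_page_range()
 * after i_sem has been dropped.
 */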
2144 ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2145 			       size_t count, loff_t pos)
2146 {
2147 	struct file *file = iocb->ki_filp;
2148 	struct address_space *mapping = file->f_mapping;
2149 	struct inode *inode = mapping->host;
2150 	ssize_t ret;
2151 	struct iovec local_iov = { .iov_base = (void __user *)buf,
2152 					.iov_len = count };
2153 
2154 	BUG_ON(iocb->ki_pos != pos);
2155 
2156 	down(&inode->i_sem);
2157 	ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
2158 						&iocb->ki_pos);
2159 	up(&inode->i_sem);
2160 
2161 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2162 		ssize_t err;
2163 
2164 		err = sync_page_range(inode, mapping, pos, ret);
2165 		if (err < 0)
2166 			ret = err;
2167 	}
2168 	return ret;
2169 }
2170 EXPORT_SYMBOL(generic_file_aio_write);
2171 
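/*
 * generic_file_write - synchronous ->write entry for the generic path.
 * Takes i_sem around the write and syncs the written range for O_SYNC or
 * IS_SYNC() files.
 */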
2172 ssize_t generic_file_write(struct file *file, const char __user *buf,
2173 			   size_t count, loff_t *ppos)
2174 {
2175 	struct address_space *mapping = file->f_mapping;
2176 	struct inode *inode = mapping->host;
2177 	ssize_t	ret;
2178 	struct iovec local_iov = { .iov_base = (void __user *)buf,
2179 					.iov_len = count };
2180 
2181 	down(&inode->i_sem);
2182 	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2183 	up(&inode->i_sem);
2184 
2185 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2186 		ssize_t err;
2187 
2188 		err = sync_page_range(inode, mapping, *ppos - ret, ret);
2189 		if (err < 0)
2190 			ret = err;
2191 	}
2192 	return ret;
2193 }
2194 EXPORT_SYMBOL(generic_file_write);
2195 
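/*
 * Vectored read: run __generic_file_aio_read() on a sync kiocb and wait for
 * the result if the read got queued.
 */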
2196 ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
2197 			unsigned long nr_segs, loff_t *ppos)
2198 {
2199 	struct kiocb kiocb;
2200 	ssize_t ret;
2201 
2202 	init_sync_kiocb(&kiocb, filp);
2203 	ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
2204 	if (ret == -EIOCBQUEUED)
2205 		ret = wait_on_sync_kiocb(&kiocb);
2206 	return ret;
2207 }
2208 EXPORT_SYMBOL(generic_file_readv);
2209 
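/*
 * Vectored write: same locking and O_SYNC behaviour as generic_file_write(),
 * but writing from an array of iovecs.
 */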
2210 ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2211 			unsigned long nr_segs, loff_t *ppos)
2212 {
2213 	struct address_space *mapping = file->f_mapping;
2214 	struct inode *inode = mapping->host;
2215 	ssize_t ret;
2216 
2217 	down(&inode->i_sem);
2218 	ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2219 	up(&inode->i_sem);
2220 
2221 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2222 		int err;
2223 
2224 		err = sync_page_range(inode, mapping, *ppos - ret, ret);
2225 		if (err < 0)
2226 			ret = err;
2227 	}
2228 	return ret;
2229 }
2230 EXPORT_SYMBOL(generic_file_writev);
2231 
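/*
 * Illustrative sketch, not part of this file: a filesystem that is content
 * with the generic paths above typically just points its file_operations at
 * them ("examplefs" is a hypothetical name):
 *
 *	static struct file_operations examplefs_file_ops = {
 *		.llseek		= generic_file_llseek,
 *		.read		= generic_file_read,
 *		.write		= generic_file_write,
 *		.aio_read	= generic_file_aio_read,
 *		.aio_write	= generic_file_aio_write,
 *		.readv		= generic_file_readv,
 *		.writev		= generic_file_writev,
 *		.mmap		= generic_file_mmap,
 *	};
 */
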
2232 /*
2233  * Called under i_sem for writes to S_ISREG files.  Returns -EIO if something
2234  * went wrong during pagecache shootdown.
2235  */
2236 static ssize_t
2237 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2238 	loff_t offset, unsigned long nr_segs)
2239 {
2240 	struct file *file = iocb->ki_filp;
2241 	struct address_space *mapping = file->f_mapping;
2242 	ssize_t retval;
2243 	size_t write_len = 0;
2244 
2245 	/*
2246 	 * If it's a write, unmap all mappings of the file up-front.  This
2247 	 * will cause any pte dirty bits to be propagated into the pageframes
2248 	 * for the subsequent filemap_write_and_wait().
2249 	 */
2250 	if (rw == WRITE) {
2251 		write_len = iov_length(iov, nr_segs);
2252 		if (mapping_mapped(mapping))
2253 			unmap_mapping_range(mapping, offset, write_len, 0);
2254 	}
2255 
2256 	retval = filemap_write_and_wait(mapping);
2257 	if (retval == 0) {
2258 		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
2259 						offset, nr_segs);
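		/*
		 * Pagecache pages may have been instantiated over the range
		 * we just wrote (e.g. by a concurrent read); drop them so
		 * that buffered readers do not see stale data.
		 */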
2260 		if (rw == WRITE && mapping->nrpages) {
2261 			pgoff_t end = (offset + write_len - 1)
2262 						>> PAGE_CACHE_SHIFT;
2263 			int err = invalidate_inode_pages2_range(mapping,
2264 					offset >> PAGE_CACHE_SHIFT, end);
2265 			if (err)
2266 				retval = err;
2267 		}
2268 	}
2269 	return retval;
2270 }
2271