xref: /openbmc/linux/mm/filemap.c (revision 727dede0)
1 /*
2  *	linux/mm/filemap.c
3  *
4  * Copyright (C) 1994-1999  Linus Torvalds
5  */
6 
7 /*
8  * This file handles the generic file mmap semantics used by
9  * most "normal" filesystems (but you don't /have/ to use this:
10  * the NFS filesystem used to do this differently, for example)
11  */
12 #include <linux/export.h>
13 #include <linux/compiler.h>
14 #include <linux/dax.h>
15 #include <linux/fs.h>
16 #include <linux/sched/signal.h>
17 #include <linux/uaccess.h>
18 #include <linux/capability.h>
19 #include <linux/kernel_stat.h>
20 #include <linux/gfp.h>
21 #include <linux/mm.h>
22 #include <linux/swap.h>
23 #include <linux/mman.h>
24 #include <linux/pagemap.h>
25 #include <linux/file.h>
26 #include <linux/uio.h>
27 #include <linux/hash.h>
28 #include <linux/writeback.h>
29 #include <linux/backing-dev.h>
30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h>
32 #include <linux/security.h>
33 #include <linux/cpuset.h>
34 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35 #include <linux/hugetlb.h>
36 #include <linux/memcontrol.h>
37 #include <linux/cleancache.h>
38 #include <linux/rmap.h>
39 #include "internal.h"
40 
41 #define CREATE_TRACE_POINTS
42 #include <trace/events/filemap.h>
43 
44 /*
45  * FIXME: remove all knowledge of the buffer layer from the core VM
46  */
47 #include <linux/buffer_head.h> /* for try_to_free_buffers */
48 
49 #include <asm/mman.h>
50 
51 /*
52  * Shared mappings implemented 30.11.1994. It's not fully working yet,
53  * though.
54  *
55  * Shared mappings now work. 15.8.1995  Bruno.
56  *
57  * finished 'unifying' the page and buffer cache and SMP-threaded the
58  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
59  *
60  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
61  */
62 
63 /*
64  * Lock ordering:
65  *
66  *  ->i_mmap_rwsem		(truncate_pagecache)
67  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
68  *      ->swap_lock		(exclusive_swap_page, others)
69  *        ->mapping->tree_lock
70  *
71  *  ->i_mutex
72  *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
73  *
74  *  ->mmap_sem
75  *    ->i_mmap_rwsem
76  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
77  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
78  *
79  *  ->mmap_sem
80  *    ->lock_page		(access_process_vm)
81  *
82  *  ->i_mutex			(generic_perform_write)
83  *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
84  *
85  *  bdi->wb.list_lock
86  *    sb_lock			(fs/fs-writeback.c)
87  *    ->mapping->tree_lock	(__sync_single_inode)
88  *
89  *  ->i_mmap_rwsem
90  *    ->anon_vma.lock		(vma_adjust)
91  *
92  *  ->anon_vma.lock
93  *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
94  *
95  *  ->page_table_lock or pte_lock
96  *    ->swap_lock		(try_to_unmap_one)
97  *    ->private_lock		(try_to_unmap_one)
98  *    ->tree_lock		(try_to_unmap_one)
99  *    ->zone_lru_lock(zone)	(follow_page->mark_page_accessed)
100  *    ->zone_lru_lock(zone)	(check_pte_range->isolate_lru_page)
101  *    ->private_lock		(page_remove_rmap->set_page_dirty)
102  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
103  *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
104  *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
105  *    ->memcg->move_lock	(page_remove_rmap->lock_page_memcg)
106  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
107  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
108  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
109  *
110  * ->i_mmap_rwsem
111  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
112  */
113 
114 static int page_cache_tree_insert(struct address_space *mapping,
115 				  struct page *page, void **shadowp)
116 {
117 	struct radix_tree_node *node;
118 	void **slot;
119 	int error;
120 
121 	error = __radix_tree_create(&mapping->page_tree, page->index, 0,
122 				    &node, &slot);
123 	if (error)
124 		return error;
125 	if (*slot) {
126 		void *p;
127 
128 		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
129 		if (!radix_tree_exceptional_entry(p))
130 			return -EEXIST;
131 
132 		mapping->nrexceptional--;
133 		if (shadowp)
134 			*shadowp = p;
135 	}
136 	__radix_tree_replace(&mapping->page_tree, node, slot, page,
137 			     workingset_update_node, mapping);
138 	mapping->nrpages++;
139 	return 0;
140 }
141 
142 static void page_cache_tree_delete(struct address_space *mapping,
143 				   struct page *page, void *shadow)
144 {
145 	int i, nr;
146 
147 	/* hugetlb pages are represented by one entry in the radix tree */
148 	nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
149 
150 	VM_BUG_ON_PAGE(!PageLocked(page), page);
151 	VM_BUG_ON_PAGE(PageTail(page), page);
152 	VM_BUG_ON_PAGE(nr != 1 && shadow, page);
153 
154 	for (i = 0; i < nr; i++) {
155 		struct radix_tree_node *node;
156 		void **slot;
157 
158 		__radix_tree_lookup(&mapping->page_tree, page->index + i,
159 				    &node, &slot);
160 
161 		VM_BUG_ON_PAGE(!node && nr != 1, page);
162 
163 		radix_tree_clear_tags(&mapping->page_tree, node, slot);
164 		__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
165 				     workingset_update_node, mapping);
166 	}
167 
168 	if (shadow) {
169 		mapping->nrexceptional += nr;
170 		/*
171 		 * Make sure the nrexceptional update is committed before
172 		 * the nrpages update so that final truncate racing
173 		 * with reclaim does not see both counters 0 at the
174 		 * same time and miss a shadow entry.
175 		 */
176 		smp_wmb();
177 	}
178 	mapping->nrpages -= nr;
179 }
180 
181 /*
182  * Delete a page from the page cache and free it. Caller has to make
183  * sure the page is locked and that nobody else uses it - or that usage
184  * is safe.  The caller must hold the mapping's tree_lock.
185  */
186 void __delete_from_page_cache(struct page *page, void *shadow)
187 {
188 	struct address_space *mapping = page->mapping;
189 	int nr = hpage_nr_pages(page);
190 
191 	trace_mm_filemap_delete_from_page_cache(page);
192 	/*
193 	 * if we're uptodate, flush out into the cleancache, otherwise
194 	 * invalidate any existing cleancache entries.  We can't leave
195 	 * stale data around in the cleancache once our page is gone
196 	 */
197 	if (PageUptodate(page) && PageMappedToDisk(page))
198 		cleancache_put_page(page);
199 	else
200 		cleancache_invalidate_page(mapping, page);
201 
202 	VM_BUG_ON_PAGE(PageTail(page), page);
203 	VM_BUG_ON_PAGE(page_mapped(page), page);
204 	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
205 		int mapcount;
206 
207 		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
208 			 current->comm, page_to_pfn(page));
209 		dump_page(page, "still mapped when deleted");
210 		dump_stack();
211 		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
212 
213 		mapcount = page_mapcount(page);
214 		if (mapping_exiting(mapping) &&
215 		    page_count(page) >= mapcount + 2) {
216 			/*
217 			 * All vmas have already been torn down, so it's
218 			 * a good bet that actually the page is unmapped,
219 			 * and we'd prefer not to leak it: if we're wrong,
220 			 * some other bad page check should catch it later.
221 			 */
222 			page_mapcount_reset(page);
223 			page_ref_sub(page, mapcount);
224 		}
225 	}
226 
227 	page_cache_tree_delete(mapping, page, shadow);
228 
229 	page->mapping = NULL;
230 	/* Leave page->index set: truncation lookup relies upon it */
231 
232 	/* hugetlb pages do not participate in page cache accounting. */
233 	if (PageHuge(page))
234 		return;
235 
236 	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
237 	if (PageSwapBacked(page)) {
238 		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
239 		if (PageTransHuge(page))
240 			__dec_node_page_state(page, NR_SHMEM_THPS);
241 	} else {
242 		VM_BUG_ON_PAGE(PageTransHuge(page), page);
243 	}
244 
245 	/*
246 	 * At this point page must be either written or cleaned by truncate.
247 	 * Dirty page here signals a bug and loss of unwritten data.
248 	 *
249 	 * This fixes dirty accounting after removing the page entirely but
250 	 * leaves PageDirty set: it has no effect for truncated page and
251 	 * anyway will be cleared before returning page into buddy allocator.
252 	 */
253 	if (WARN_ON_ONCE(PageDirty(page)))
254 		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
255 }
256 
257 /**
258  * delete_from_page_cache - delete page from page cache
259  * @page: the page which the kernel is trying to remove from page cache
260  *
261  * This must be called only on pages that have been verified to be in the page
262  * cache and locked.  It will never put the page into the free list, the caller
263  * has a reference on the page.
264  */
265 void delete_from_page_cache(struct page *page)
266 {
267 	struct address_space *mapping = page_mapping(page);
268 	unsigned long flags;
269 	void (*freepage)(struct page *);
270 
271 	BUG_ON(!PageLocked(page));
272 
273 	freepage = mapping->a_ops->freepage;
274 
275 	spin_lock_irqsave(&mapping->tree_lock, flags);
276 	__delete_from_page_cache(page, NULL);
277 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
278 
279 	if (freepage)
280 		freepage(page);
281 
282 	if (PageTransHuge(page) && !PageHuge(page)) {
283 		page_ref_sub(page, HPAGE_PMD_NR);
284 		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
285 	} else {
286 		put_page(page);
287 	}
288 }
289 EXPORT_SYMBOL(delete_from_page_cache);
290 
291 int filemap_check_errors(struct address_space *mapping)
292 {
293 	int ret = 0;
294 	/* Check for outstanding write errors */
295 	if (test_bit(AS_ENOSPC, &mapping->flags) &&
296 	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
297 		ret = -ENOSPC;
298 	if (test_bit(AS_EIO, &mapping->flags) &&
299 	    test_and_clear_bit(AS_EIO, &mapping->flags))
300 		ret = -EIO;
301 	return ret;
302 }
303 EXPORT_SYMBOL(filemap_check_errors);
304 
305 static int filemap_check_and_keep_errors(struct address_space *mapping)
306 {
307 	/* Check for outstanding write errors */
308 	if (test_bit(AS_EIO, &mapping->flags))
309 		return -EIO;
310 	if (test_bit(AS_ENOSPC, &mapping->flags))
311 		return -ENOSPC;
312 	return 0;
313 }
314 
315 /**
316  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
317  * @mapping:	address space structure to write
318  * @start:	offset in bytes where the range starts
319  * @end:	offset in bytes where the range ends (inclusive)
320  * @sync_mode:	enable synchronous operation
321  *
322  * Start writeback against all of a mapping's dirty pages that lie
323  * within the byte offsets <start, end> inclusive.
324  *
325  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
326  * opposed to a regular memory cleansing writeback.  The difference between
327  * these two operations is that if a dirty page/buffer is encountered, it must
328  * be waited upon, and not just skipped over.
329  */
330 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
331 				loff_t end, int sync_mode)
332 {
333 	int ret;
334 	struct writeback_control wbc = {
335 		.sync_mode = sync_mode,
336 		.nr_to_write = LONG_MAX,
337 		.range_start = start,
338 		.range_end = end,
339 	};
340 
341 	if (!mapping_cap_writeback_dirty(mapping))
342 		return 0;
343 
344 	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
345 	ret = do_writepages(mapping, &wbc);
346 	wbc_detach_inode(&wbc);
347 	return ret;
348 }
349 
350 static inline int __filemap_fdatawrite(struct address_space *mapping,
351 	int sync_mode)
352 {
353 	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
354 }
355 
356 int filemap_fdatawrite(struct address_space *mapping)
357 {
358 	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
359 }
360 EXPORT_SYMBOL(filemap_fdatawrite);
361 
362 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
363 				loff_t end)
364 {
365 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
366 }
367 EXPORT_SYMBOL(filemap_fdatawrite_range);
368 
369 /**
370  * filemap_flush - mostly a non-blocking flush
371  * @mapping:	target address_space
372  *
373  * This is a mostly non-blocking flush.  Not suitable for data-integrity
374  * purposes - I/O may not be started against all dirty pages.
375  */
376 int filemap_flush(struct address_space *mapping)
377 {
378 	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
379 }
380 EXPORT_SYMBOL(filemap_flush);
381 
382 /**
383  * filemap_range_has_page - check if a page exists in range.
384  * @mapping:           address space within which to check
385  * @start_byte:        offset in bytes where the range starts
386  * @end_byte:          offset in bytes where the range ends (inclusive)
387  *
388  * Find at least one page in the range supplied, usually used to check if
389  * direct writing in this range will trigger a writeback.
390  */
391 bool filemap_range_has_page(struct address_space *mapping,
392 			   loff_t start_byte, loff_t end_byte)
393 {
394 	pgoff_t index = start_byte >> PAGE_SHIFT;
395 	pgoff_t end = end_byte >> PAGE_SHIFT;
396 	struct page *page;
397 
398 	if (end_byte < start_byte)
399 		return false;
400 
401 	if (mapping->nrpages == 0)
402 		return false;
403 
404 	if (!find_get_pages_range(mapping, &index, end, 1, &page))
405 		return false;
406 	put_page(page);
407 	return true;
408 }
409 EXPORT_SYMBOL(filemap_range_has_page);
410 
411 static void __filemap_fdatawait_range(struct address_space *mapping,
412 				     loff_t start_byte, loff_t end_byte)
413 {
414 	pgoff_t index = start_byte >> PAGE_SHIFT;
415 	pgoff_t end = end_byte >> PAGE_SHIFT;
416 	struct pagevec pvec;
417 	int nr_pages;
418 
419 	if (end_byte < start_byte)
420 		return;
421 
422 	pagevec_init(&pvec, 0);
423 	while ((index <= end) &&
424 			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
425 			PAGECACHE_TAG_WRITEBACK,
426 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
427 		unsigned i;
428 
429 		for (i = 0; i < nr_pages; i++) {
430 			struct page *page = pvec.pages[i];
431 
432 			/* until radix tree lookup accepts end_index */
433 			if (page->index > end)
434 				continue;
435 
436 			wait_on_page_writeback(page);
437 			ClearPageError(page);
438 		}
439 		pagevec_release(&pvec);
440 		cond_resched();
441 	}
442 }
443 
444 /**
445  * filemap_fdatawait_range - wait for writeback to complete
446  * @mapping:		address space structure to wait for
447  * @start_byte:		offset in bytes where the range starts
448  * @end_byte:		offset in bytes where the range ends (inclusive)
449  *
450  * Walk the list of under-writeback pages of the given address space
451  * in the given range and wait for all of them.  Check error status of
452  * the address space and return it.
453  *
454  * Since the error status of the address space is cleared by this function,
455  * callers are responsible for checking the return value and handling and/or
456  * reporting the error.
457  */
458 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
459 			    loff_t end_byte)
460 {
461 	__filemap_fdatawait_range(mapping, start_byte, end_byte);
462 	return filemap_check_errors(mapping);
463 }
464 EXPORT_SYMBOL(filemap_fdatawait_range);
465 
466 /**
467  * file_fdatawait_range - wait for writeback to complete
468  * @file:		file pointing to address space structure to wait for
469  * @start_byte:		offset in bytes where the range starts
470  * @end_byte:		offset in bytes where the range ends (inclusive)
471  *
472  * Walk the list of under-writeback pages of the address space that file
473  * refers to, in the given range and wait for all of them.  Check error
474  * status of the address space vs. the file->f_wb_err cursor and return it.
475  *
476  * Since the error status of the file is advanced by this function,
477  * callers are responsible for checking the return value and handling and/or
478  * reporting the error.
479  */
480 int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
481 {
482 	struct address_space *mapping = file->f_mapping;
483 
484 	__filemap_fdatawait_range(mapping, start_byte, end_byte);
485 	return file_check_and_advance_wb_err(file);
486 }
487 EXPORT_SYMBOL(file_fdatawait_range);
488 
489 /**
490  * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
491  * @mapping: address space structure to wait for
492  *
493  * Walk the list of under-writeback pages of the given address space
494  * and wait for all of them.  Unlike filemap_fdatawait(), this function
495  * does not clear error status of the address space.
496  *
497  * Use this function if callers don't handle errors themselves.  Expected
498  * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
499  * fsfreeze(8)
500  */
501 int filemap_fdatawait_keep_errors(struct address_space *mapping)
502 {
503 	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
504 	return filemap_check_and_keep_errors(mapping);
505 }
506 EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
507 
508 static bool mapping_needs_writeback(struct address_space *mapping)
509 {
510 	return (!dax_mapping(mapping) && mapping->nrpages) ||
511 	    (dax_mapping(mapping) && mapping->nrexceptional);
512 }
513 
514 int filemap_write_and_wait(struct address_space *mapping)
515 {
516 	int err = 0;
517 
518 	if (mapping_needs_writeback(mapping)) {
519 		err = filemap_fdatawrite(mapping);
520 		/*
521 		 * Even if the above returned error, the pages may be
522 		 * written partially (e.g. -ENOSPC), so we wait for it.
523 		 * But the -EIO is special case, it may indicate the worst
524 		 * thing (e.g. bug) happened, so we avoid waiting for it.
525 		 */
526 		if (err != -EIO) {
527 			int err2 = filemap_fdatawait(mapping);
528 			if (!err)
529 				err = err2;
530 		} else {
531 			/* Clear any previously stored errors */
532 			filemap_check_errors(mapping);
533 		}
534 	} else {
535 		err = filemap_check_errors(mapping);
536 	}
537 	return err;
538 }
539 EXPORT_SYMBOL(filemap_write_and_wait);
540 
541 /**
542  * filemap_write_and_wait_range - write out & wait on a file range
543  * @mapping:	the address_space for the pages
544  * @lstart:	offset in bytes where the range starts
545  * @lend:	offset in bytes where the range ends (inclusive)
546  *
547  * Write out and wait upon file offsets lstart->lend, inclusive.
548  *
549  * Note that @lend is inclusive (describes the last byte to be written) so
550  * that this function can be used to write to the very end-of-file (end = -1).
551  */
552 int filemap_write_and_wait_range(struct address_space *mapping,
553 				 loff_t lstart, loff_t lend)
554 {
555 	int err = 0;
556 
557 	if (mapping_needs_writeback(mapping)) {
558 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
559 						 WB_SYNC_ALL);
560 		/* See comment of filemap_write_and_wait() */
561 		if (err != -EIO) {
562 			int err2 = filemap_fdatawait_range(mapping,
563 						lstart, lend);
564 			if (!err)
565 				err = err2;
566 		} else {
567 			/* Clear any previously stored errors */
568 			filemap_check_errors(mapping);
569 		}
570 	} else {
571 		err = filemap_check_errors(mapping);
572 	}
573 	return err;
574 }
575 EXPORT_SYMBOL(filemap_write_and_wait_range);
576 
577 void __filemap_set_wb_err(struct address_space *mapping, int err)
578 {
579 	errseq_t eseq = errseq_set(&mapping->wb_err, err);
580 
581 	trace_filemap_set_wb_err(mapping, eseq);
582 }
583 EXPORT_SYMBOL(__filemap_set_wb_err);
584 
585 /**
586  * file_check_and_advance_wb_err - report wb error (if any) that was previously
587  * 				   and advance wb_err to current one
588  * @file: struct file on which the error is being reported
589  *
590  * When userland calls fsync (or something like nfsd does the equivalent), we
591  * want to report any writeback errors that occurred since the last fsync (or
592  * since the file was opened if there haven't been any).
593  *
594  * Grab the wb_err from the mapping. If it matches what we have in the file,
595  * then just quickly return 0. The file is all caught up.
596  *
597  * If it doesn't match, then take the mapping value, set the "seen" flag in
598  * it and try to swap it into place. If it works, or another task beat us
599  * to it with the new value, then update the f_wb_err and return the error
600  * portion. The error at this point must be reported via proper channels
601  * (a'la fsync, or NFS COMMIT operation, etc.).
602  *
603  * While we handle mapping->wb_err with atomic operations, the f_wb_err
604  * value is protected by the f_lock since we must ensure that it reflects
605  * the latest value swapped in for this file descriptor.
606  */
607 int file_check_and_advance_wb_err(struct file *file)
608 {
609 	int err = 0;
610 	errseq_t old = READ_ONCE(file->f_wb_err);
611 	struct address_space *mapping = file->f_mapping;
612 
613 	/* Locklessly handle the common case where nothing has changed */
614 	if (errseq_check(&mapping->wb_err, old)) {
615 		/* Something changed, must use slow path */
616 		spin_lock(&file->f_lock);
617 		old = file->f_wb_err;
618 		err = errseq_check_and_advance(&mapping->wb_err,
619 						&file->f_wb_err);
620 		trace_file_check_and_advance_wb_err(file, old);
621 		spin_unlock(&file->f_lock);
622 	}
623 
624 	/*
625 	 * We're mostly using this function as a drop in replacement for
626 	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
627 	 * that the legacy code would have had on these flags.
628 	 */
629 	clear_bit(AS_EIO, &mapping->flags);
630 	clear_bit(AS_ENOSPC, &mapping->flags);
631 	return err;
632 }
633 EXPORT_SYMBOL(file_check_and_advance_wb_err);
634 
635 /**
636  * file_write_and_wait_range - write out & wait on a file range
637  * @file:	file pointing to address_space with pages
638  * @lstart:	offset in bytes where the range starts
639  * @lend:	offset in bytes where the range ends (inclusive)
640  *
641  * Write out and wait upon file offsets lstart->lend, inclusive.
642  *
643  * Note that @lend is inclusive (describes the last byte to be written) so
644  * that this function can be used to write to the very end-of-file (end = -1).
645  *
646  * After writing out and waiting on the data, we check and advance the
647  * f_wb_err cursor to the latest value, and return any errors detected there.
648  */
649 int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
650 {
651 	int err = 0, err2;
652 	struct address_space *mapping = file->f_mapping;
653 
654 	if (mapping_needs_writeback(mapping)) {
655 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
656 						 WB_SYNC_ALL);
657 		/* See comment of filemap_write_and_wait() */
658 		if (err != -EIO)
659 			__filemap_fdatawait_range(mapping, lstart, lend);
660 	}
661 	err2 = file_check_and_advance_wb_err(file);
662 	if (!err)
663 		err = err2;
664 	return err;
665 }
666 EXPORT_SYMBOL(file_write_and_wait_range);
667 
668 /**
669  * replace_page_cache_page - replace a pagecache page with a new one
670  * @old:	page to be replaced
671  * @new:	page to replace with
672  * @gfp_mask:	allocation mode
673  *
674  * This function replaces a page in the pagecache with a new one.  On
675  * success it acquires the pagecache reference for the new page and
676  * drops it for the old page.  Both the old and new pages must be
677  * locked.  This function does not add the new page to the LRU, the
678  * caller must do that.
679  *
680  * The remove + add is atomic.  The only way this function can fail is
681  * memory allocation failure.
682  */
683 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
684 {
685 	int error;
686 
687 	VM_BUG_ON_PAGE(!PageLocked(old), old);
688 	VM_BUG_ON_PAGE(!PageLocked(new), new);
689 	VM_BUG_ON_PAGE(new->mapping, new);
690 
691 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
692 	if (!error) {
693 		struct address_space *mapping = old->mapping;
694 		void (*freepage)(struct page *);
695 		unsigned long flags;
696 
697 		pgoff_t offset = old->index;
698 		freepage = mapping->a_ops->freepage;
699 
700 		get_page(new);
701 		new->mapping = mapping;
702 		new->index = offset;
703 
704 		spin_lock_irqsave(&mapping->tree_lock, flags);
705 		__delete_from_page_cache(old, NULL);
706 		error = page_cache_tree_insert(mapping, new, NULL);
707 		BUG_ON(error);
708 
709 		/*
710 		 * hugetlb pages do not participate in page cache accounting.
711 		 */
712 		if (!PageHuge(new))
713 			__inc_node_page_state(new, NR_FILE_PAGES);
714 		if (PageSwapBacked(new))
715 			__inc_node_page_state(new, NR_SHMEM);
716 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
717 		mem_cgroup_migrate(old, new);
718 		radix_tree_preload_end();
719 		if (freepage)
720 			freepage(old);
721 		put_page(old);
722 	}
723 
724 	return error;
725 }
726 EXPORT_SYMBOL_GPL(replace_page_cache_page);
727 
728 static int __add_to_page_cache_locked(struct page *page,
729 				      struct address_space *mapping,
730 				      pgoff_t offset, gfp_t gfp_mask,
731 				      void **shadowp)
732 {
733 	int huge = PageHuge(page);
734 	struct mem_cgroup *memcg;
735 	int error;
736 
737 	VM_BUG_ON_PAGE(!PageLocked(page), page);
738 	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
739 
740 	if (!huge) {
741 		error = mem_cgroup_try_charge(page, current->mm,
742 					      gfp_mask, &memcg, false);
743 		if (error)
744 			return error;
745 	}
746 
747 	error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
748 	if (error) {
749 		if (!huge)
750 			mem_cgroup_cancel_charge(page, memcg, false);
751 		return error;
752 	}
753 
754 	get_page(page);
755 	page->mapping = mapping;
756 	page->index = offset;
757 
758 	spin_lock_irq(&mapping->tree_lock);
759 	error = page_cache_tree_insert(mapping, page, shadowp);
760 	radix_tree_preload_end();
761 	if (unlikely(error))
762 		goto err_insert;
763 
764 	/* hugetlb pages do not participate in page cache accounting. */
765 	if (!huge)
766 		__inc_node_page_state(page, NR_FILE_PAGES);
767 	spin_unlock_irq(&mapping->tree_lock);
768 	if (!huge)
769 		mem_cgroup_commit_charge(page, memcg, false, false);
770 	trace_mm_filemap_add_to_page_cache(page);
771 	return 0;
772 err_insert:
773 	page->mapping = NULL;
774 	/* Leave page->index set: truncation relies upon it */
775 	spin_unlock_irq(&mapping->tree_lock);
776 	if (!huge)
777 		mem_cgroup_cancel_charge(page, memcg, false);
778 	put_page(page);
779 	return error;
780 }
781 
782 /**
783  * add_to_page_cache_locked - add a locked page to the pagecache
784  * @page:	page to add
785  * @mapping:	the page's address_space
786  * @offset:	page index
787  * @gfp_mask:	page allocation mode
788  *
789  * This function is used to add a page to the pagecache. It must be locked.
790  * This function does not add the page to the LRU.  The caller must do that.
791  */
792 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
793 		pgoff_t offset, gfp_t gfp_mask)
794 {
795 	return __add_to_page_cache_locked(page, mapping, offset,
796 					  gfp_mask, NULL);
797 }
798 EXPORT_SYMBOL(add_to_page_cache_locked);
799 
800 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
801 				pgoff_t offset, gfp_t gfp_mask)
802 {
803 	void *shadow = NULL;
804 	int ret;
805 
806 	__SetPageLocked(page);
807 	ret = __add_to_page_cache_locked(page, mapping, offset,
808 					 gfp_mask, &shadow);
809 	if (unlikely(ret))
810 		__ClearPageLocked(page);
811 	else {
812 		/*
813 		 * The page might have been evicted from cache only
814 		 * recently, in which case it should be activated like
815 		 * any other repeatedly accessed page.
816 		 * The exception is pages getting rewritten; evicting other
817 		 * data from the working set, only to cache data that will
818 		 * get overwritten with something else, is a waste of memory.
819 		 */
820 		if (!(gfp_mask & __GFP_WRITE) &&
821 		    shadow && workingset_refault(shadow)) {
822 			SetPageActive(page);
823 			workingset_activation(page);
824 		} else
825 			ClearPageActive(page);
826 		lru_cache_add(page);
827 	}
828 	return ret;
829 }
830 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
831 
832 #ifdef CONFIG_NUMA
833 struct page *__page_cache_alloc(gfp_t gfp)
834 {
835 	int n;
836 	struct page *page;
837 
838 	if (cpuset_do_page_mem_spread()) {
839 		unsigned int cpuset_mems_cookie;
840 		do {
841 			cpuset_mems_cookie = read_mems_allowed_begin();
842 			n = cpuset_mem_spread_node();
843 			page = __alloc_pages_node(n, gfp, 0);
844 		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
845 
846 		return page;
847 	}
848 	return alloc_pages(gfp, 0);
849 }
850 EXPORT_SYMBOL(__page_cache_alloc);
851 #endif
852 
853 /*
854  * In order to wait for pages to become available there must be
855  * waitqueues associated with pages. By using a hash table of
856  * waitqueues where the bucket discipline is to maintain all
857  * waiters on the same queue and wake all when any of the pages
858  * become available, and for the woken contexts to check to be
859  * sure the appropriate page became available, this saves space
860  * at a cost of "thundering herd" phenomena during rare hash
861  * collisions.
862  */
863 #define PAGE_WAIT_TABLE_BITS 8
864 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
865 static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
866 
867 static wait_queue_head_t *page_waitqueue(struct page *page)
868 {
869 	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
870 }
871 
872 void __init pagecache_init(void)
873 {
874 	int i;
875 
876 	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
877 		init_waitqueue_head(&page_wait_table[i]);
878 
879 	page_writeback_init();
880 }
881 
882 /* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
883 struct wait_page_key {
884 	struct page *page;
885 	int bit_nr;
886 	int page_match;
887 };
888 
889 struct wait_page_queue {
890 	struct page *page;
891 	int bit_nr;
892 	wait_queue_entry_t wait;
893 };
894 
895 static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
896 {
897 	struct wait_page_key *key = arg;
898 	struct wait_page_queue *wait_page
899 		= container_of(wait, struct wait_page_queue, wait);
900 
901 	if (wait_page->page != key->page)
902 	       return 0;
903 	key->page_match = 1;
904 
905 	if (wait_page->bit_nr != key->bit_nr)
906 		return 0;
907 
908 	/* Stop walking if it's locked */
909 	if (test_bit(key->bit_nr, &key->page->flags))
910 		return -1;
911 
912 	return autoremove_wake_function(wait, mode, sync, key);
913 }
914 
915 static void wake_up_page_bit(struct page *page, int bit_nr)
916 {
917 	wait_queue_head_t *q = page_waitqueue(page);
918 	struct wait_page_key key;
919 	unsigned long flags;
920 	wait_queue_entry_t bookmark;
921 
922 	key.page = page;
923 	key.bit_nr = bit_nr;
924 	key.page_match = 0;
925 
926 	bookmark.flags = 0;
927 	bookmark.private = NULL;
928 	bookmark.func = NULL;
929 	INIT_LIST_HEAD(&bookmark.entry);
930 
931 	spin_lock_irqsave(&q->lock, flags);
932 	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
933 
934 	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
935 		/*
936 		 * Take a breather from holding the lock,
937 		 * allow pages that finish wake up asynchronously
938 		 * to acquire the lock and remove themselves
939 		 * from wait queue
940 		 */
941 		spin_unlock_irqrestore(&q->lock, flags);
942 		cpu_relax();
943 		spin_lock_irqsave(&q->lock, flags);
944 		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
945 	}
946 
947 	/*
948 	 * It is possible for other pages to have collided on the waitqueue
949 	 * hash, so in that case check for a page match. That prevents a long-
950 	 * term waiter
951 	 *
952 	 * It is still possible to miss a case here, when we woke page waiters
953 	 * and removed them from the waitqueue, but there are still other
954 	 * page waiters.
955 	 */
956 	if (!waitqueue_active(q) || !key.page_match) {
957 		ClearPageWaiters(page);
958 		/*
959 		 * It's possible to miss clearing Waiters here, when we woke
960 		 * our page waiters, but the hashed waitqueue has waiters for
961 		 * other pages on it.
962 		 *
963 		 * That's okay, it's a rare case. The next waker will clear it.
964 		 */
965 	}
966 	spin_unlock_irqrestore(&q->lock, flags);
967 }
968 
969 static void wake_up_page(struct page *page, int bit)
970 {
971 	if (!PageWaiters(page))
972 		return;
973 	wake_up_page_bit(page, bit);
974 }
975 
976 static inline int wait_on_page_bit_common(wait_queue_head_t *q,
977 		struct page *page, int bit_nr, int state, bool lock)
978 {
979 	struct wait_page_queue wait_page;
980 	wait_queue_entry_t *wait = &wait_page.wait;
981 	int ret = 0;
982 
983 	init_wait(wait);
984 	wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
985 	wait->func = wake_page_function;
986 	wait_page.page = page;
987 	wait_page.bit_nr = bit_nr;
988 
989 	for (;;) {
990 		spin_lock_irq(&q->lock);
991 
992 		if (likely(list_empty(&wait->entry))) {
993 			__add_wait_queue_entry_tail(q, wait);
994 			SetPageWaiters(page);
995 		}
996 
997 		set_current_state(state);
998 
999 		spin_unlock_irq(&q->lock);
1000 
1001 		if (likely(test_bit(bit_nr, &page->flags))) {
1002 			io_schedule();
1003 		}
1004 
1005 		if (lock) {
1006 			if (!test_and_set_bit_lock(bit_nr, &page->flags))
1007 				break;
1008 		} else {
1009 			if (!test_bit(bit_nr, &page->flags))
1010 				break;
1011 		}
1012 
1013 		if (unlikely(signal_pending_state(state, current))) {
1014 			ret = -EINTR;
1015 			break;
1016 		}
1017 	}
1018 
1019 	finish_wait(q, wait);
1020 
1021 	/*
1022 	 * A signal could leave PageWaiters set. Clearing it here if
1023 	 * !waitqueue_active would be possible (by open-coding finish_wait),
1024 	 * but still fail to catch it in the case of wait hash collision. We
1025 	 * already can fail to clear wait hash collision cases, so don't
1026 	 * bother with signals either.
1027 	 */
1028 
1029 	return ret;
1030 }
1031 
1032 void wait_on_page_bit(struct page *page, int bit_nr)
1033 {
1034 	wait_queue_head_t *q = page_waitqueue(page);
1035 	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
1036 }
1037 EXPORT_SYMBOL(wait_on_page_bit);
1038 
1039 int wait_on_page_bit_killable(struct page *page, int bit_nr)
1040 {
1041 	wait_queue_head_t *q = page_waitqueue(page);
1042 	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
1043 }
1044 
1045 /**
1046  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
1047  * @page: Page defining the wait queue of interest
1048  * @waiter: Waiter to add to the queue
1049  *
1050  * Add an arbitrary @waiter to the wait queue for the nominated @page.
1051  */
1052 void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
1053 {
1054 	wait_queue_head_t *q = page_waitqueue(page);
1055 	unsigned long flags;
1056 
1057 	spin_lock_irqsave(&q->lock, flags);
1058 	__add_wait_queue_entry_tail(q, waiter);
1059 	SetPageWaiters(page);
1060 	spin_unlock_irqrestore(&q->lock, flags);
1061 }
1062 EXPORT_SYMBOL_GPL(add_page_wait_queue);
1063 
1064 #ifndef clear_bit_unlock_is_negative_byte
1065 
1066 /*
1067  * PG_waiters is the high bit in the same byte as PG_lock.
1068  *
1069  * On x86 (and on many other architectures), we can clear PG_lock and
1070  * test the sign bit at the same time. But if the architecture does
1071  * not support that special operation, we just do this all by hand
1072  * instead.
1073  *
1074  * The read of PG_waiters has to be after (or concurrently with) PG_locked
1075  * being cleared, but a memory barrier should be unneccssary since it is
1076  * in the same byte as PG_locked.
1077  */
1078 static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
1079 {
1080 	clear_bit_unlock(nr, mem);
1081 	/* smp_mb__after_atomic(); */
1082 	return test_bit(PG_waiters, mem);
1083 }
1084 
1085 #endif
1086 
1087 /**
1088  * unlock_page - unlock a locked page
1089  * @page: the page
1090  *
1091  * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
1092  * Also wakes sleepers in wait_on_page_writeback() because the wakeup
1093  * mechanism between PageLocked pages and PageWriteback pages is shared.
1094  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
1095  *
1096  * Note that this depends on PG_waiters being the sign bit in the byte
1097  * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
1098  * clear the PG_locked bit and test PG_waiters at the same time fairly
1099  * portably (architectures that do LL/SC can test any bit, while x86 can
1100  * test the sign bit).
1101  */
1102 void unlock_page(struct page *page)
1103 {
1104 	BUILD_BUG_ON(PG_waiters != 7);
1105 	page = compound_head(page);
1106 	VM_BUG_ON_PAGE(!PageLocked(page), page);
1107 	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
1108 		wake_up_page_bit(page, PG_locked);
1109 }
1110 EXPORT_SYMBOL(unlock_page);
1111 
1112 /**
1113  * end_page_writeback - end writeback against a page
1114  * @page: the page
1115  */
1116 void end_page_writeback(struct page *page)
1117 {
1118 	/*
1119 	 * TestClearPageReclaim could be used here but it is an atomic
1120 	 * operation and overkill in this particular case. Failing to
1121 	 * shuffle a page marked for immediate reclaim is too mild to
1122 	 * justify taking an atomic operation penalty at the end of
1123 	 * ever page writeback.
1124 	 */
1125 	if (PageReclaim(page)) {
1126 		ClearPageReclaim(page);
1127 		rotate_reclaimable_page(page);
1128 	}
1129 
1130 	if (!test_clear_page_writeback(page))
1131 		BUG();
1132 
1133 	smp_mb__after_atomic();
1134 	wake_up_page(page, PG_writeback);
1135 }
1136 EXPORT_SYMBOL(end_page_writeback);
1137 
1138 /*
1139  * After completing I/O on a page, call this routine to update the page
1140  * flags appropriately
1141  */
1142 void page_endio(struct page *page, bool is_write, int err)
1143 {
1144 	if (!is_write) {
1145 		if (!err) {
1146 			SetPageUptodate(page);
1147 		} else {
1148 			ClearPageUptodate(page);
1149 			SetPageError(page);
1150 		}
1151 		unlock_page(page);
1152 	} else {
1153 		if (err) {
1154 			struct address_space *mapping;
1155 
1156 			SetPageError(page);
1157 			mapping = page_mapping(page);
1158 			if (mapping)
1159 				mapping_set_error(mapping, err);
1160 		}
1161 		end_page_writeback(page);
1162 	}
1163 }
1164 EXPORT_SYMBOL_GPL(page_endio);
1165 
1166 /**
1167  * __lock_page - get a lock on the page, assuming we need to sleep to get it
1168  * @__page: the page to lock
1169  */
1170 void __lock_page(struct page *__page)
1171 {
1172 	struct page *page = compound_head(__page);
1173 	wait_queue_head_t *q = page_waitqueue(page);
1174 	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
1175 }
1176 EXPORT_SYMBOL(__lock_page);
1177 
1178 int __lock_page_killable(struct page *__page)
1179 {
1180 	struct page *page = compound_head(__page);
1181 	wait_queue_head_t *q = page_waitqueue(page);
1182 	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
1183 }
1184 EXPORT_SYMBOL_GPL(__lock_page_killable);
1185 
1186 /*
1187  * Return values:
1188  * 1 - page is locked; mmap_sem is still held.
1189  * 0 - page is not locked.
1190  *     mmap_sem has been released (up_read()), unless flags had both
1191  *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
1192  *     which case mmap_sem is still held.
1193  *
1194  * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
1195  * with the page locked and the mmap_sem unperturbed.
1196  */
1197 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
1198 			 unsigned int flags)
1199 {
1200 	if (flags & FAULT_FLAG_ALLOW_RETRY) {
1201 		/*
1202 		 * CAUTION! In this case, mmap_sem is not released
1203 		 * even though return 0.
1204 		 */
1205 		if (flags & FAULT_FLAG_RETRY_NOWAIT)
1206 			return 0;
1207 
1208 		up_read(&mm->mmap_sem);
1209 		if (flags & FAULT_FLAG_KILLABLE)
1210 			wait_on_page_locked_killable(page);
1211 		else
1212 			wait_on_page_locked(page);
1213 		return 0;
1214 	} else {
1215 		if (flags & FAULT_FLAG_KILLABLE) {
1216 			int ret;
1217 
1218 			ret = __lock_page_killable(page);
1219 			if (ret) {
1220 				up_read(&mm->mmap_sem);
1221 				return 0;
1222 			}
1223 		} else
1224 			__lock_page(page);
1225 		return 1;
1226 	}
1227 }
1228 
1229 /**
1230  * page_cache_next_hole - find the next hole (not-present entry)
1231  * @mapping: mapping
1232  * @index: index
1233  * @max_scan: maximum range to search
1234  *
1235  * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
1236  * lowest indexed hole.
1237  *
1238  * Returns: the index of the hole if found, otherwise returns an index
1239  * outside of the set specified (in which case 'return - index >=
1240  * max_scan' will be true). In rare cases of index wrap-around, 0 will
1241  * be returned.
1242  *
1243  * page_cache_next_hole may be called under rcu_read_lock. However,
1244  * like radix_tree_gang_lookup, this will not atomically search a
1245  * snapshot of the tree at a single point in time. For example, if a
1246  * hole is created at index 5, then subsequently a hole is created at
1247  * index 10, page_cache_next_hole covering both indexes may return 10
1248  * if called under rcu_read_lock.
1249  */
1250 pgoff_t page_cache_next_hole(struct address_space *mapping,
1251 			     pgoff_t index, unsigned long max_scan)
1252 {
1253 	unsigned long i;
1254 
1255 	for (i = 0; i < max_scan; i++) {
1256 		struct page *page;
1257 
1258 		page = radix_tree_lookup(&mapping->page_tree, index);
1259 		if (!page || radix_tree_exceptional_entry(page))
1260 			break;
1261 		index++;
1262 		if (index == 0)
1263 			break;
1264 	}
1265 
1266 	return index;
1267 }
1268 EXPORT_SYMBOL(page_cache_next_hole);
1269 
1270 /**
1271  * page_cache_prev_hole - find the prev hole (not-present entry)
1272  * @mapping: mapping
1273  * @index: index
1274  * @max_scan: maximum range to search
1275  *
1276  * Search backwards in the range [max(index-max_scan+1, 0), index] for
1277  * the first hole.
1278  *
1279  * Returns: the index of the hole if found, otherwise returns an index
1280  * outside of the set specified (in which case 'index - return >=
1281  * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
1282  * will be returned.
1283  *
1284  * page_cache_prev_hole may be called under rcu_read_lock. However,
1285  * like radix_tree_gang_lookup, this will not atomically search a
1286  * snapshot of the tree at a single point in time. For example, if a
1287  * hole is created at index 10, then subsequently a hole is created at
1288  * index 5, page_cache_prev_hole covering both indexes may return 5 if
1289  * called under rcu_read_lock.
1290  */
1291 pgoff_t page_cache_prev_hole(struct address_space *mapping,
1292 			     pgoff_t index, unsigned long max_scan)
1293 {
1294 	unsigned long i;
1295 
1296 	for (i = 0; i < max_scan; i++) {
1297 		struct page *page;
1298 
1299 		page = radix_tree_lookup(&mapping->page_tree, index);
1300 		if (!page || radix_tree_exceptional_entry(page))
1301 			break;
1302 		index--;
1303 		if (index == ULONG_MAX)
1304 			break;
1305 	}
1306 
1307 	return index;
1308 }
1309 EXPORT_SYMBOL(page_cache_prev_hole);
1310 
1311 /**
1312  * find_get_entry - find and get a page cache entry
1313  * @mapping: the address_space to search
1314  * @offset: the page cache index
1315  *
1316  * Looks up the page cache slot at @mapping & @offset.  If there is a
1317  * page cache page, it is returned with an increased refcount.
1318  *
1319  * If the slot holds a shadow entry of a previously evicted page, or a
1320  * swap entry from shmem/tmpfs, it is returned.
1321  *
1322  * Otherwise, %NULL is returned.
1323  */
1324 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
1325 {
1326 	void **pagep;
1327 	struct page *head, *page;
1328 
1329 	rcu_read_lock();
1330 repeat:
1331 	page = NULL;
1332 	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
1333 	if (pagep) {
1334 		page = radix_tree_deref_slot(pagep);
1335 		if (unlikely(!page))
1336 			goto out;
1337 		if (radix_tree_exception(page)) {
1338 			if (radix_tree_deref_retry(page))
1339 				goto repeat;
1340 			/*
1341 			 * A shadow entry of a recently evicted page,
1342 			 * or a swap entry from shmem/tmpfs.  Return
1343 			 * it without attempting to raise page count.
1344 			 */
1345 			goto out;
1346 		}
1347 
1348 		head = compound_head(page);
1349 		if (!page_cache_get_speculative(head))
1350 			goto repeat;
1351 
1352 		/* The page was split under us? */
1353 		if (compound_head(page) != head) {
1354 			put_page(head);
1355 			goto repeat;
1356 		}
1357 
1358 		/*
1359 		 * Has the page moved?
1360 		 * This is part of the lockless pagecache protocol. See
1361 		 * include/linux/pagemap.h for details.
1362 		 */
1363 		if (unlikely(page != *pagep)) {
1364 			put_page(head);
1365 			goto repeat;
1366 		}
1367 	}
1368 out:
1369 	rcu_read_unlock();
1370 
1371 	return page;
1372 }
1373 EXPORT_SYMBOL(find_get_entry);
1374 
1375 /**
1376  * find_lock_entry - locate, pin and lock a page cache entry
1377  * @mapping: the address_space to search
1378  * @offset: the page cache index
1379  *
1380  * Looks up the page cache slot at @mapping & @offset.  If there is a
1381  * page cache page, it is returned locked and with an increased
1382  * refcount.
1383  *
1384  * If the slot holds a shadow entry of a previously evicted page, or a
1385  * swap entry from shmem/tmpfs, it is returned.
1386  *
1387  * Otherwise, %NULL is returned.
1388  *
1389  * find_lock_entry() may sleep.
1390  */
1391 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
1392 {
1393 	struct page *page;
1394 
1395 repeat:
1396 	page = find_get_entry(mapping, offset);
1397 	if (page && !radix_tree_exception(page)) {
1398 		lock_page(page);
1399 		/* Has the page been truncated? */
1400 		if (unlikely(page_mapping(page) != mapping)) {
1401 			unlock_page(page);
1402 			put_page(page);
1403 			goto repeat;
1404 		}
1405 		VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
1406 	}
1407 	return page;
1408 }
1409 EXPORT_SYMBOL(find_lock_entry);
1410 
1411 /**
1412  * pagecache_get_page - find and get a page reference
1413  * @mapping: the address_space to search
1414  * @offset: the page index
1415  * @fgp_flags: PCG flags
1416  * @gfp_mask: gfp mask to use for the page cache data page allocation
1417  *
1418  * Looks up the page cache slot at @mapping & @offset.
1419  *
1420  * PCG flags modify how the page is returned.
1421  *
1422  * @fgp_flags can be:
1423  *
1424  * - FGP_ACCESSED: the page will be marked accessed
1425  * - FGP_LOCK: Page is return locked
1426  * - FGP_CREAT: If page is not present then a new page is allocated using
1427  *   @gfp_mask and added to the page cache and the VM's LRU
1428  *   list. The page is returned locked and with an increased
1429  *   refcount. Otherwise, NULL is returned.
1430  *
1431  * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
1432  * if the GFP flags specified for FGP_CREAT are atomic.
1433  *
1434  * If there is a page cache page, it is returned with an increased refcount.
1435  */
1436 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
1437 	int fgp_flags, gfp_t gfp_mask)
1438 {
1439 	struct page *page;
1440 
1441 repeat:
1442 	page = find_get_entry(mapping, offset);
1443 	if (radix_tree_exceptional_entry(page))
1444 		page = NULL;
1445 	if (!page)
1446 		goto no_page;
1447 
1448 	if (fgp_flags & FGP_LOCK) {
1449 		if (fgp_flags & FGP_NOWAIT) {
1450 			if (!trylock_page(page)) {
1451 				put_page(page);
1452 				return NULL;
1453 			}
1454 		} else {
1455 			lock_page(page);
1456 		}
1457 
1458 		/* Has the page been truncated? */
1459 		if (unlikely(page->mapping != mapping)) {
1460 			unlock_page(page);
1461 			put_page(page);
1462 			goto repeat;
1463 		}
1464 		VM_BUG_ON_PAGE(page->index != offset, page);
1465 	}
1466 
1467 	if (page && (fgp_flags & FGP_ACCESSED))
1468 		mark_page_accessed(page);
1469 
1470 no_page:
1471 	if (!page && (fgp_flags & FGP_CREAT)) {
1472 		int err;
1473 		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
1474 			gfp_mask |= __GFP_WRITE;
1475 		if (fgp_flags & FGP_NOFS)
1476 			gfp_mask &= ~__GFP_FS;
1477 
1478 		page = __page_cache_alloc(gfp_mask);
1479 		if (!page)
1480 			return NULL;
1481 
1482 		if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
1483 			fgp_flags |= FGP_LOCK;
1484 
1485 		/* Init accessed so avoid atomic mark_page_accessed later */
1486 		if (fgp_flags & FGP_ACCESSED)
1487 			__SetPageReferenced(page);
1488 
1489 		err = add_to_page_cache_lru(page, mapping, offset,
1490 				gfp_mask & GFP_RECLAIM_MASK);
1491 		if (unlikely(err)) {
1492 			put_page(page);
1493 			page = NULL;
1494 			if (err == -EEXIST)
1495 				goto repeat;
1496 		}
1497 	}
1498 
1499 	return page;
1500 }
1501 EXPORT_SYMBOL(pagecache_get_page);
1502 
1503 /**
1504  * find_get_entries - gang pagecache lookup
1505  * @mapping:	The address_space to search
1506  * @start:	The starting page cache index
1507  * @nr_entries:	The maximum number of entries
1508  * @entries:	Where the resulting entries are placed
1509  * @indices:	The cache indices corresponding to the entries in @entries
1510  *
1511  * find_get_entries() will search for and return a group of up to
1512  * @nr_entries entries in the mapping.  The entries are placed at
1513  * @entries.  find_get_entries() takes a reference against any actual
1514  * pages it returns.
1515  *
1516  * The search returns a group of mapping-contiguous page cache entries
1517  * with ascending indexes.  There may be holes in the indices due to
1518  * not-present pages.
1519  *
1520  * Any shadow entries of evicted pages, or swap entries from
1521  * shmem/tmpfs, are included in the returned array.
1522  *
1523  * find_get_entries() returns the number of pages and shadow entries
1524  * which were found.
1525  */
1526 unsigned find_get_entries(struct address_space *mapping,
1527 			  pgoff_t start, unsigned int nr_entries,
1528 			  struct page **entries, pgoff_t *indices)
1529 {
1530 	void **slot;
1531 	unsigned int ret = 0;
1532 	struct radix_tree_iter iter;
1533 
1534 	if (!nr_entries)
1535 		return 0;
1536 
1537 	rcu_read_lock();
1538 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
1539 		struct page *head, *page;
1540 repeat:
1541 		page = radix_tree_deref_slot(slot);
1542 		if (unlikely(!page))
1543 			continue;
1544 		if (radix_tree_exception(page)) {
1545 			if (radix_tree_deref_retry(page)) {
1546 				slot = radix_tree_iter_retry(&iter);
1547 				continue;
1548 			}
1549 			/*
1550 			 * A shadow entry of a recently evicted page, a swap
1551 			 * entry from shmem/tmpfs or a DAX entry.  Return it
1552 			 * without attempting to raise page count.
1553 			 */
1554 			goto export;
1555 		}
1556 
1557 		head = compound_head(page);
1558 		if (!page_cache_get_speculative(head))
1559 			goto repeat;
1560 
1561 		/* The page was split under us? */
1562 		if (compound_head(page) != head) {
1563 			put_page(head);
1564 			goto repeat;
1565 		}
1566 
1567 		/* Has the page moved? */
1568 		if (unlikely(page != *slot)) {
1569 			put_page(head);
1570 			goto repeat;
1571 		}
1572 export:
1573 		indices[ret] = iter.index;
1574 		entries[ret] = page;
1575 		if (++ret == nr_entries)
1576 			break;
1577 	}
1578 	rcu_read_unlock();
1579 	return ret;
1580 }
1581 
1582 /**
1583  * find_get_pages_range - gang pagecache lookup
1584  * @mapping:	The address_space to search
1585  * @start:	The starting page index
1586  * @end:	The final page index (inclusive)
1587  * @nr_pages:	The maximum number of pages
1588  * @pages:	Where the resulting pages are placed
1589  *
1590  * find_get_pages_range() will search for and return a group of up to @nr_pages
1591  * pages in the mapping starting at index @start and up to index @end
1592  * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
1593  * a reference against the returned pages.
1594  *
1595  * The search returns a group of mapping-contiguous pages with ascending
1596  * indexes.  There may be holes in the indices due to not-present pages.
1597  * We also update @start to index the next page for the traversal.
1598  *
1599  * find_get_pages_range() returns the number of pages which were found. If this
1600  * number is smaller than @nr_pages, the end of specified range has been
1601  * reached.
1602  */
1603 unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
1604 			      pgoff_t end, unsigned int nr_pages,
1605 			      struct page **pages)
1606 {
1607 	struct radix_tree_iter iter;
1608 	void **slot;
1609 	unsigned ret = 0;
1610 
1611 	if (unlikely(!nr_pages))
1612 		return 0;
1613 
1614 	rcu_read_lock();
1615 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
1616 		struct page *head, *page;
1617 
1618 		if (iter.index > end)
1619 			break;
1620 repeat:
1621 		page = radix_tree_deref_slot(slot);
1622 		if (unlikely(!page))
1623 			continue;
1624 
1625 		if (radix_tree_exception(page)) {
1626 			if (radix_tree_deref_retry(page)) {
1627 				slot = radix_tree_iter_retry(&iter);
1628 				continue;
1629 			}
1630 			/*
1631 			 * A shadow entry of a recently evicted page,
1632 			 * or a swap entry from shmem/tmpfs.  Skip
1633 			 * over it.
1634 			 */
1635 			continue;
1636 		}
1637 
1638 		head = compound_head(page);
1639 		if (!page_cache_get_speculative(head))
1640 			goto repeat;
1641 
1642 		/* The page was split under us? */
1643 		if (compound_head(page) != head) {
1644 			put_page(head);
1645 			goto repeat;
1646 		}
1647 
1648 		/* Has the page moved? */
1649 		if (unlikely(page != *slot)) {
1650 			put_page(head);
1651 			goto repeat;
1652 		}
1653 
1654 		pages[ret] = page;
1655 		if (++ret == nr_pages) {
1656 			*start = pages[ret - 1]->index + 1;
1657 			goto out;
1658 		}
1659 	}
1660 
1661 	/*
1662 	 * We come here when there is no page beyond @end. We take care to not
1663 	 * overflow the index @start as it confuses some of the callers. This
1664 	 * breaks the iteration when there is page at index -1 but that is
1665 	 * already broken anyway.
1666 	 */
1667 	if (end == (pgoff_t)-1)
1668 		*start = (pgoff_t)-1;
1669 	else
1670 		*start = end + 1;
1671 out:
1672 	rcu_read_unlock();
1673 
1674 	return ret;
1675 }
1676 
1677 /**
1678  * find_get_pages_contig - gang contiguous pagecache lookup
1679  * @mapping:	The address_space to search
1680  * @index:	The starting page index
1681  * @nr_pages:	The maximum number of pages
1682  * @pages:	Where the resulting pages are placed
1683  *
1684  * find_get_pages_contig() works exactly like find_get_pages(), except
1685  * that the returned number of pages are guaranteed to be contiguous.
1686  *
1687  * find_get_pages_contig() returns the number of pages which were found.
1688  */
1689 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
1690 			       unsigned int nr_pages, struct page **pages)
1691 {
1692 	struct radix_tree_iter iter;
1693 	void **slot;
1694 	unsigned int ret = 0;
1695 
1696 	if (unlikely(!nr_pages))
1697 		return 0;
1698 
1699 	rcu_read_lock();
1700 	radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
1701 		struct page *head, *page;
1702 repeat:
1703 		page = radix_tree_deref_slot(slot);
1704 		/* The hole, there no reason to continue */
1705 		if (unlikely(!page))
1706 			break;
1707 
1708 		if (radix_tree_exception(page)) {
1709 			if (radix_tree_deref_retry(page)) {
1710 				slot = radix_tree_iter_retry(&iter);
1711 				continue;
1712 			}
1713 			/*
1714 			 * A shadow entry of a recently evicted page,
1715 			 * or a swap entry from shmem/tmpfs.  Stop
1716 			 * looking for contiguous pages.
1717 			 */
1718 			break;
1719 		}
1720 
1721 		head = compound_head(page);
1722 		if (!page_cache_get_speculative(head))
1723 			goto repeat;
1724 
1725 		/* The page was split under us? */
1726 		if (compound_head(page) != head) {
1727 			put_page(head);
1728 			goto repeat;
1729 		}
1730 
1731 		/* Has the page moved? */
1732 		if (unlikely(page != *slot)) {
1733 			put_page(head);
1734 			goto repeat;
1735 		}
1736 
1737 		/*
1738 		 * must check mapping and index after taking the ref.
1739 		 * otherwise we can get both false positives and false
1740 		 * negatives, which is just confusing to the caller.
1741 		 */
1742 		if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
1743 			put_page(page);
1744 			break;
1745 		}
1746 
1747 		pages[ret] = page;
1748 		if (++ret == nr_pages)
1749 			break;
1750 	}
1751 	rcu_read_unlock();
1752 	return ret;
1753 }
1754 EXPORT_SYMBOL(find_get_pages_contig);
1755 
1756 /**
1757  * find_get_pages_tag - find and return pages that match @tag
1758  * @mapping:	the address_space to search
1759  * @index:	the starting page index
1760  * @tag:	the tag index
1761  * @nr_pages:	the maximum number of pages
1762  * @pages:	where the resulting pages are placed
1763  *
1764  * Like find_get_pages, except we only return pages which are tagged with
1765  * @tag.   We update @index to index the next page for the traversal.
1766  */
1767 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
1768 			int tag, unsigned int nr_pages, struct page **pages)
1769 {
1770 	struct radix_tree_iter iter;
1771 	void **slot;
1772 	unsigned ret = 0;
1773 
1774 	if (unlikely(!nr_pages))
1775 		return 0;
1776 
1777 	rcu_read_lock();
1778 	radix_tree_for_each_tagged(slot, &mapping->page_tree,
1779 				   &iter, *index, tag) {
1780 		struct page *head, *page;
1781 repeat:
1782 		page = radix_tree_deref_slot(slot);
1783 		if (unlikely(!page))
1784 			continue;
1785 
1786 		if (radix_tree_exception(page)) {
1787 			if (radix_tree_deref_retry(page)) {
1788 				slot = radix_tree_iter_retry(&iter);
1789 				continue;
1790 			}
1791 			/*
1792 			 * A shadow entry of a recently evicted page.
1793 			 *
1794 			 * Those entries should never be tagged, but
1795 			 * this tree walk is lockless and the tags are
1796 			 * looked up in bulk, one radix tree node at a
1797 			 * time, so there is a sizable window for page
1798 			 * reclaim to evict a page we saw tagged.
1799 			 *
1800 			 * Skip over it.
1801 			 */
1802 			continue;
1803 		}
1804 
1805 		head = compound_head(page);
1806 		if (!page_cache_get_speculative(head))
1807 			goto repeat;
1808 
1809 		/* The page was split under us? */
1810 		if (compound_head(page) != head) {
1811 			put_page(head);
1812 			goto repeat;
1813 		}
1814 
1815 		/* Has the page moved? */
1816 		if (unlikely(page != *slot)) {
1817 			put_page(head);
1818 			goto repeat;
1819 		}
1820 
1821 		pages[ret] = page;
1822 		if (++ret == nr_pages)
1823 			break;
1824 	}
1825 
1826 	rcu_read_unlock();
1827 
1828 	if (ret)
1829 		*index = pages[ret - 1]->index + 1;
1830 
1831 	return ret;
1832 }
1833 EXPORT_SYMBOL(find_get_pages_tag);
1834 
1835 /**
1836  * find_get_entries_tag - find and return entries that match @tag
1837  * @mapping:	the address_space to search
1838  * @start:	the starting page cache index
1839  * @tag:	the tag index
1840  * @nr_entries:	the maximum number of entries
1841  * @entries:	where the resulting entries are placed
1842  * @indices:	the cache indices corresponding to the entries in @entries
1843  *
1844  * Like find_get_entries, except we only return entries which are tagged with
1845  * @tag.
1846  */
1847 unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
1848 			int tag, unsigned int nr_entries,
1849 			struct page **entries, pgoff_t *indices)
1850 {
1851 	void **slot;
1852 	unsigned int ret = 0;
1853 	struct radix_tree_iter iter;
1854 
1855 	if (!nr_entries)
1856 		return 0;
1857 
1858 	rcu_read_lock();
1859 	radix_tree_for_each_tagged(slot, &mapping->page_tree,
1860 				   &iter, start, tag) {
1861 		struct page *head, *page;
1862 repeat:
1863 		page = radix_tree_deref_slot(slot);
1864 		if (unlikely(!page))
1865 			continue;
1866 		if (radix_tree_exception(page)) {
1867 			if (radix_tree_deref_retry(page)) {
1868 				slot = radix_tree_iter_retry(&iter);
1869 				continue;
1870 			}
1871 
1872 			/*
1873 			 * A shadow entry of a recently evicted page, a swap
1874 			 * entry from shmem/tmpfs or a DAX entry.  Return it
1875 			 * without attempting to raise page count.
1876 			 */
1877 			goto export;
1878 		}
1879 
1880 		head = compound_head(page);
1881 		if (!page_cache_get_speculative(head))
1882 			goto repeat;
1883 
1884 		/* The page was split under us? */
1885 		if (compound_head(page) != head) {
1886 			put_page(head);
1887 			goto repeat;
1888 		}
1889 
1890 		/* Has the page moved? */
1891 		if (unlikely(page != *slot)) {
1892 			put_page(head);
1893 			goto repeat;
1894 		}
1895 export:
1896 		indices[ret] = iter.index;
1897 		entries[ret] = page;
1898 		if (++ret == nr_entries)
1899 			break;
1900 	}
1901 	rcu_read_unlock();
1902 	return ret;
1903 }
1904 EXPORT_SYMBOL(find_get_entries_tag);
1905 
1906 /*
1907  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
1908  * a _large_ part of the i/o request. Imagine the worst scenario:
1909  *
1910  *      ---R__________________________________________B__________
1911  *         ^ reading here                             ^ bad block(assume 4k)
1912  *
1913  * read(R) => miss => readahead(R...B) => media error => frustrating retries
1914  * => failing the whole request => read(R) => read(R+1) =>
1915  * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
1916  * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
1917  * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
1918  *
1919  * It is going insane. Fix it by quickly scaling down the readahead size.
1920  */
1921 static void shrink_readahead_size_eio(struct file *filp,
1922 					struct file_ra_state *ra)
1923 {
1924 	ra->ra_pages /= 4;
1925 }
1926 
1927 /**
1928  * generic_file_buffered_read - generic file read routine
1929  * @iocb:	the iocb to read
1930  * @iter:	data destination
1931  * @written:	already copied
1932  *
1933  * This is a generic file read routine, and uses the
1934  * mapping->a_ops->readpage() function for the actual low-level stuff.
1935  *
1936  * This is really ugly. But the goto's actually try to clarify some
1937  * of the logic when it comes to error handling etc.
1938  */
1939 static ssize_t generic_file_buffered_read(struct kiocb *iocb,
1940 		struct iov_iter *iter, ssize_t written)
1941 {
1942 	struct file *filp = iocb->ki_filp;
1943 	struct address_space *mapping = filp->f_mapping;
1944 	struct inode *inode = mapping->host;
1945 	struct file_ra_state *ra = &filp->f_ra;
1946 	loff_t *ppos = &iocb->ki_pos;
1947 	pgoff_t index;
1948 	pgoff_t last_index;
1949 	pgoff_t prev_index;
1950 	unsigned long offset;      /* offset into pagecache page */
1951 	unsigned int prev_offset;
1952 	int error = 0;
1953 
1954 	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
1955 		return 0;
1956 	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
1957 
1958 	index = *ppos >> PAGE_SHIFT;
1959 	prev_index = ra->prev_pos >> PAGE_SHIFT;
1960 	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
1961 	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
1962 	offset = *ppos & ~PAGE_MASK;
1963 
1964 	for (;;) {
1965 		struct page *page;
1966 		pgoff_t end_index;
1967 		loff_t isize;
1968 		unsigned long nr, ret;
1969 
1970 		cond_resched();
1971 find_page:
1972 		if (fatal_signal_pending(current)) {
1973 			error = -EINTR;
1974 			goto out;
1975 		}
1976 
1977 		page = find_get_page(mapping, index);
1978 		if (!page) {
1979 			if (iocb->ki_flags & IOCB_NOWAIT)
1980 				goto would_block;
1981 			page_cache_sync_readahead(mapping,
1982 					ra, filp,
1983 					index, last_index - index);
1984 			page = find_get_page(mapping, index);
1985 			if (unlikely(page == NULL))
1986 				goto no_cached_page;
1987 		}
1988 		if (PageReadahead(page)) {
1989 			page_cache_async_readahead(mapping,
1990 					ra, filp, page,
1991 					index, last_index - index);
1992 		}
1993 		if (!PageUptodate(page)) {
1994 			if (iocb->ki_flags & IOCB_NOWAIT) {
1995 				put_page(page);
1996 				goto would_block;
1997 			}
1998 
1999 			/*
2000 			 * See comment in do_read_cache_page on why
2001 			 * wait_on_page_locked is used to avoid unnecessarily
2002 			 * serialisations and why it's safe.
2003 			 */
2004 			error = wait_on_page_locked_killable(page);
2005 			if (unlikely(error))
2006 				goto readpage_error;
2007 			if (PageUptodate(page))
2008 				goto page_ok;
2009 
2010 			if (inode->i_blkbits == PAGE_SHIFT ||
2011 					!mapping->a_ops->is_partially_uptodate)
2012 				goto page_not_up_to_date;
2013 			/* pipes can't handle partially uptodate pages */
2014 			if (unlikely(iter->type & ITER_PIPE))
2015 				goto page_not_up_to_date;
2016 			if (!trylock_page(page))
2017 				goto page_not_up_to_date;
2018 			/* Did it get truncated before we got the lock? */
2019 			if (!page->mapping)
2020 				goto page_not_up_to_date_locked;
2021 			if (!mapping->a_ops->is_partially_uptodate(page,
2022 							offset, iter->count))
2023 				goto page_not_up_to_date_locked;
2024 			unlock_page(page);
2025 		}
2026 page_ok:
2027 		/*
2028 		 * i_size must be checked after we know the page is Uptodate.
2029 		 *
2030 		 * Checking i_size after the check allows us to calculate
2031 		 * the correct value for "nr", which means the zero-filled
2032 		 * part of the page is not copied back to userspace (unless
2033 		 * another truncate extends the file - this is desired though).
2034 		 */
2035 
2036 		isize = i_size_read(inode);
2037 		end_index = (isize - 1) >> PAGE_SHIFT;
2038 		if (unlikely(!isize || index > end_index)) {
2039 			put_page(page);
2040 			goto out;
2041 		}
2042 
2043 		/* nr is the maximum number of bytes to copy from this page */
2044 		nr = PAGE_SIZE;
2045 		if (index == end_index) {
2046 			nr = ((isize - 1) & ~PAGE_MASK) + 1;
2047 			if (nr <= offset) {
2048 				put_page(page);
2049 				goto out;
2050 			}
2051 		}
2052 		nr = nr - offset;
2053 
2054 		/* If users can be writing to this page using arbitrary
2055 		 * virtual addresses, take care about potential aliasing
2056 		 * before reading the page on the kernel side.
2057 		 */
2058 		if (mapping_writably_mapped(mapping))
2059 			flush_dcache_page(page);
2060 
2061 		/*
2062 		 * When a sequential read accesses a page several times,
2063 		 * only mark it as accessed the first time.
2064 		 */
2065 		if (prev_index != index || offset != prev_offset)
2066 			mark_page_accessed(page);
2067 		prev_index = index;
2068 
2069 		/*
2070 		 * Ok, we have the page, and it's up-to-date, so
2071 		 * now we can copy it to user space...
2072 		 */
2073 
2074 		ret = copy_page_to_iter(page, offset, nr, iter);
2075 		offset += ret;
2076 		index += offset >> PAGE_SHIFT;
2077 		offset &= ~PAGE_MASK;
2078 		prev_offset = offset;
2079 
2080 		put_page(page);
2081 		written += ret;
2082 		if (!iov_iter_count(iter))
2083 			goto out;
2084 		if (ret < nr) {
2085 			error = -EFAULT;
2086 			goto out;
2087 		}
2088 		continue;
2089 
2090 page_not_up_to_date:
2091 		/* Get exclusive access to the page ... */
2092 		error = lock_page_killable(page);
2093 		if (unlikely(error))
2094 			goto readpage_error;
2095 
2096 page_not_up_to_date_locked:
2097 		/* Did it get truncated before we got the lock? */
2098 		if (!page->mapping) {
2099 			unlock_page(page);
2100 			put_page(page);
2101 			continue;
2102 		}
2103 
2104 		/* Did somebody else fill it already? */
2105 		if (PageUptodate(page)) {
2106 			unlock_page(page);
2107 			goto page_ok;
2108 		}
2109 
2110 readpage:
2111 		/*
2112 		 * A previous I/O error may have been due to temporary
2113 		 * failures, eg. multipath errors.
2114 		 * PG_error will be set again if readpage fails.
2115 		 */
2116 		ClearPageError(page);
2117 		/* Start the actual read. The read will unlock the page. */
2118 		error = mapping->a_ops->readpage(filp, page);
2119 
2120 		if (unlikely(error)) {
2121 			if (error == AOP_TRUNCATED_PAGE) {
2122 				put_page(page);
2123 				error = 0;
2124 				goto find_page;
2125 			}
2126 			goto readpage_error;
2127 		}
2128 
2129 		if (!PageUptodate(page)) {
2130 			error = lock_page_killable(page);
2131 			if (unlikely(error))
2132 				goto readpage_error;
2133 			if (!PageUptodate(page)) {
2134 				if (page->mapping == NULL) {
2135 					/*
2136 					 * invalidate_mapping_pages got it
2137 					 */
2138 					unlock_page(page);
2139 					put_page(page);
2140 					goto find_page;
2141 				}
2142 				unlock_page(page);
2143 				shrink_readahead_size_eio(filp, ra);
2144 				error = -EIO;
2145 				goto readpage_error;
2146 			}
2147 			unlock_page(page);
2148 		}
2149 
2150 		goto page_ok;
2151 
2152 readpage_error:
2153 		/* UHHUH! A synchronous read error occurred. Report it */
2154 		put_page(page);
2155 		goto out;
2156 
2157 no_cached_page:
2158 		/*
2159 		 * Ok, it wasn't cached, so we need to create a new
2160 		 * page..
2161 		 */
2162 		page = page_cache_alloc_cold(mapping);
2163 		if (!page) {
2164 			error = -ENOMEM;
2165 			goto out;
2166 		}
2167 		error = add_to_page_cache_lru(page, mapping, index,
2168 				mapping_gfp_constraint(mapping, GFP_KERNEL));
2169 		if (error) {
2170 			put_page(page);
2171 			if (error == -EEXIST) {
2172 				error = 0;
2173 				goto find_page;
2174 			}
2175 			goto out;
2176 		}
2177 		goto readpage;
2178 	}
2179 
2180 would_block:
2181 	error = -EAGAIN;
2182 out:
2183 	ra->prev_pos = prev_index;
2184 	ra->prev_pos <<= PAGE_SHIFT;
2185 	ra->prev_pos |= prev_offset;
2186 
2187 	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
2188 	file_accessed(filp);
2189 	return written ? written : error;
2190 }
2191 
2192 /**
2193  * generic_file_read_iter - generic filesystem read routine
2194  * @iocb:	kernel I/O control block
2195  * @iter:	destination for the data read
2196  *
2197  * This is the "read_iter()" routine for all filesystems
2198  * that can use the page cache directly.
2199  */
2200 ssize_t
2201 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
2202 {
2203 	size_t count = iov_iter_count(iter);
2204 	ssize_t retval = 0;
2205 
2206 	if (!count)
2207 		goto out; /* skip atime */
2208 
2209 	if (iocb->ki_flags & IOCB_DIRECT) {
2210 		struct file *file = iocb->ki_filp;
2211 		struct address_space *mapping = file->f_mapping;
2212 		struct inode *inode = mapping->host;
2213 		loff_t size;
2214 
2215 		size = i_size_read(inode);
2216 		if (iocb->ki_flags & IOCB_NOWAIT) {
2217 			if (filemap_range_has_page(mapping, iocb->ki_pos,
2218 						   iocb->ki_pos + count - 1))
2219 				return -EAGAIN;
2220 		} else {
2221 			retval = filemap_write_and_wait_range(mapping,
2222 						iocb->ki_pos,
2223 					        iocb->ki_pos + count - 1);
2224 			if (retval < 0)
2225 				goto out;
2226 		}
2227 
2228 		file_accessed(file);
2229 
2230 		retval = mapping->a_ops->direct_IO(iocb, iter);
2231 		if (retval >= 0) {
2232 			iocb->ki_pos += retval;
2233 			count -= retval;
2234 		}
2235 		iov_iter_revert(iter, count - iov_iter_count(iter));
2236 
2237 		/*
2238 		 * Btrfs can have a short DIO read if we encounter
2239 		 * compressed extents, so if there was an error, or if
2240 		 * we've already read everything we wanted to, or if
2241 		 * there was a short read because we hit EOF, go ahead
2242 		 * and return.  Otherwise fallthrough to buffered io for
2243 		 * the rest of the read.  Buffered reads will not work for
2244 		 * DAX files, so don't bother trying.
2245 		 */
2246 		if (retval < 0 || !count || iocb->ki_pos >= size ||
2247 		    IS_DAX(inode))
2248 			goto out;
2249 	}
2250 
2251 	retval = generic_file_buffered_read(iocb, iter, retval);
2252 out:
2253 	return retval;
2254 }
2255 EXPORT_SYMBOL(generic_file_read_iter);
2256 
2257 #ifdef CONFIG_MMU
2258 /**
2259  * page_cache_read - adds requested page to the page cache if not already there
2260  * @file:	file to read
2261  * @offset:	page index
2262  * @gfp_mask:	memory allocation flags
2263  *
2264  * This adds the requested page to the page cache if it isn't already there,
2265  * and schedules an I/O to read in its contents from disk.
2266  */
2267 static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
2268 {
2269 	struct address_space *mapping = file->f_mapping;
2270 	struct page *page;
2271 	int ret;
2272 
2273 	do {
2274 		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
2275 		if (!page)
2276 			return -ENOMEM;
2277 
2278 		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
2279 		if (ret == 0)
2280 			ret = mapping->a_ops->readpage(file, page);
2281 		else if (ret == -EEXIST)
2282 			ret = 0; /* losing race to add is OK */
2283 
2284 		put_page(page);
2285 
2286 	} while (ret == AOP_TRUNCATED_PAGE);
2287 
2288 	return ret;
2289 }
2290 
2291 #define MMAP_LOTSAMISS  (100)
2292 
2293 /*
2294  * Synchronous readahead happens when we don't even find
2295  * a page in the page cache at all.
2296  */
2297 static void do_sync_mmap_readahead(struct vm_area_struct *vma,
2298 				   struct file_ra_state *ra,
2299 				   struct file *file,
2300 				   pgoff_t offset)
2301 {
2302 	struct address_space *mapping = file->f_mapping;
2303 
2304 	/* If we don't want any read-ahead, don't bother */
2305 	if (vma->vm_flags & VM_RAND_READ)
2306 		return;
2307 	if (!ra->ra_pages)
2308 		return;
2309 
2310 	if (vma->vm_flags & VM_SEQ_READ) {
2311 		page_cache_sync_readahead(mapping, ra, file, offset,
2312 					  ra->ra_pages);
2313 		return;
2314 	}
2315 
2316 	/* Avoid banging the cache line if not needed */
2317 	if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
2318 		ra->mmap_miss++;
2319 
2320 	/*
2321 	 * Do we miss much more than hit in this file? If so,
2322 	 * stop bothering with read-ahead. It will only hurt.
2323 	 */
2324 	if (ra->mmap_miss > MMAP_LOTSAMISS)
2325 		return;
2326 
2327 	/*
2328 	 * mmap read-around
2329 	 */
2330 	ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
2331 	ra->size = ra->ra_pages;
2332 	ra->async_size = ra->ra_pages / 4;
2333 	ra_submit(ra, mapping, file);
2334 }
2335 
2336 /*
2337  * Asynchronous readahead happens when we find the page and PG_readahead,
2338  * so we want to possibly extend the readahead further..
2339  */
2340 static void do_async_mmap_readahead(struct vm_area_struct *vma,
2341 				    struct file_ra_state *ra,
2342 				    struct file *file,
2343 				    struct page *page,
2344 				    pgoff_t offset)
2345 {
2346 	struct address_space *mapping = file->f_mapping;
2347 
2348 	/* If we don't want any read-ahead, don't bother */
2349 	if (vma->vm_flags & VM_RAND_READ)
2350 		return;
2351 	if (ra->mmap_miss > 0)
2352 		ra->mmap_miss--;
2353 	if (PageReadahead(page))
2354 		page_cache_async_readahead(mapping, ra, file,
2355 					   page, offset, ra->ra_pages);
2356 }
2357 
2358 /**
2359  * filemap_fault - read in file data for page fault handling
2360  * @vmf:	struct vm_fault containing details of the fault
2361  *
2362  * filemap_fault() is invoked via the vma operations vector for a
2363  * mapped memory region to read in file data during a page fault.
2364  *
2365  * The goto's are kind of ugly, but this streamlines the normal case of having
2366  * it in the page cache, and handles the special cases reasonably without
2367  * having a lot of duplicated code.
2368  *
2369  * vma->vm_mm->mmap_sem must be held on entry.
2370  *
2371  * If our return value has VM_FAULT_RETRY set, it's because
2372  * lock_page_or_retry() returned 0.
2373  * The mmap_sem has usually been released in this case.
2374  * See __lock_page_or_retry() for the exception.
2375  *
2376  * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
2377  * has not been released.
2378  *
2379  * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
2380  */
2381 int filemap_fault(struct vm_fault *vmf)
2382 {
2383 	int error;
2384 	struct file *file = vmf->vma->vm_file;
2385 	struct address_space *mapping = file->f_mapping;
2386 	struct file_ra_state *ra = &file->f_ra;
2387 	struct inode *inode = mapping->host;
2388 	pgoff_t offset = vmf->pgoff;
2389 	pgoff_t max_off;
2390 	struct page *page;
2391 	int ret = 0;
2392 
2393 	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2394 	if (unlikely(offset >= max_off))
2395 		return VM_FAULT_SIGBUS;
2396 
2397 	/*
2398 	 * Do we have something in the page cache already?
2399 	 */
2400 	page = find_get_page(mapping, offset);
2401 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
2402 		/*
2403 		 * We found the page, so try async readahead before
2404 		 * waiting for the lock.
2405 		 */
2406 		do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
2407 	} else if (!page) {
2408 		/* No page in the page cache at all */
2409 		do_sync_mmap_readahead(vmf->vma, ra, file, offset);
2410 		count_vm_event(PGMAJFAULT);
2411 		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
2412 		ret = VM_FAULT_MAJOR;
2413 retry_find:
2414 		page = find_get_page(mapping, offset);
2415 		if (!page)
2416 			goto no_cached_page;
2417 	}
2418 
2419 	if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
2420 		put_page(page);
2421 		return ret | VM_FAULT_RETRY;
2422 	}
2423 
2424 	/* Did it get truncated? */
2425 	if (unlikely(page->mapping != mapping)) {
2426 		unlock_page(page);
2427 		put_page(page);
2428 		goto retry_find;
2429 	}
2430 	VM_BUG_ON_PAGE(page->index != offset, page);
2431 
2432 	/*
2433 	 * We have a locked page in the page cache, now we need to check
2434 	 * that it's up-to-date. If not, it is going to be due to an error.
2435 	 */
2436 	if (unlikely(!PageUptodate(page)))
2437 		goto page_not_uptodate;
2438 
2439 	/*
2440 	 * Found the page and have a reference on it.
2441 	 * We must recheck i_size under page lock.
2442 	 */
2443 	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2444 	if (unlikely(offset >= max_off)) {
2445 		unlock_page(page);
2446 		put_page(page);
2447 		return VM_FAULT_SIGBUS;
2448 	}
2449 
2450 	vmf->page = page;
2451 	return ret | VM_FAULT_LOCKED;
2452 
2453 no_cached_page:
2454 	/*
2455 	 * We're only likely to ever get here if MADV_RANDOM is in
2456 	 * effect.
2457 	 */
2458 	error = page_cache_read(file, offset, vmf->gfp_mask);
2459 
2460 	/*
2461 	 * The page we want has now been added to the page cache.
2462 	 * In the unlikely event that someone removed it in the
2463 	 * meantime, we'll just come back here and read it again.
2464 	 */
2465 	if (error >= 0)
2466 		goto retry_find;
2467 
2468 	/*
2469 	 * An error return from page_cache_read can result if the
2470 	 * system is low on memory, or a problem occurs while trying
2471 	 * to schedule I/O.
2472 	 */
2473 	if (error == -ENOMEM)
2474 		return VM_FAULT_OOM;
2475 	return VM_FAULT_SIGBUS;
2476 
2477 page_not_uptodate:
2478 	/*
2479 	 * Umm, take care of errors if the page isn't up-to-date.
2480 	 * Try to re-read it _once_. We do this synchronously,
2481 	 * because there really aren't any performance issues here
2482 	 * and we need to check for errors.
2483 	 */
2484 	ClearPageError(page);
2485 	error = mapping->a_ops->readpage(file, page);
2486 	if (!error) {
2487 		wait_on_page_locked(page);
2488 		if (!PageUptodate(page))
2489 			error = -EIO;
2490 	}
2491 	put_page(page);
2492 
2493 	if (!error || error == AOP_TRUNCATED_PAGE)
2494 		goto retry_find;
2495 
2496 	/* Things didn't work out. Return zero to tell the mm layer so. */
2497 	shrink_readahead_size_eio(file, ra);
2498 	return VM_FAULT_SIGBUS;
2499 }
2500 EXPORT_SYMBOL(filemap_fault);
2501 
2502 void filemap_map_pages(struct vm_fault *vmf,
2503 		pgoff_t start_pgoff, pgoff_t end_pgoff)
2504 {
2505 	struct radix_tree_iter iter;
2506 	void **slot;
2507 	struct file *file = vmf->vma->vm_file;
2508 	struct address_space *mapping = file->f_mapping;
2509 	pgoff_t last_pgoff = start_pgoff;
2510 	unsigned long max_idx;
2511 	struct page *head, *page;
2512 
2513 	rcu_read_lock();
2514 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
2515 			start_pgoff) {
2516 		if (iter.index > end_pgoff)
2517 			break;
2518 repeat:
2519 		page = radix_tree_deref_slot(slot);
2520 		if (unlikely(!page))
2521 			goto next;
2522 		if (radix_tree_exception(page)) {
2523 			if (radix_tree_deref_retry(page)) {
2524 				slot = radix_tree_iter_retry(&iter);
2525 				continue;
2526 			}
2527 			goto next;
2528 		}
2529 
2530 		head = compound_head(page);
2531 		if (!page_cache_get_speculative(head))
2532 			goto repeat;
2533 
2534 		/* The page was split under us? */
2535 		if (compound_head(page) != head) {
2536 			put_page(head);
2537 			goto repeat;
2538 		}
2539 
2540 		/* Has the page moved? */
2541 		if (unlikely(page != *slot)) {
2542 			put_page(head);
2543 			goto repeat;
2544 		}
2545 
2546 		if (!PageUptodate(page) ||
2547 				PageReadahead(page) ||
2548 				PageHWPoison(page))
2549 			goto skip;
2550 		if (!trylock_page(page))
2551 			goto skip;
2552 
2553 		if (page->mapping != mapping || !PageUptodate(page))
2554 			goto unlock;
2555 
2556 		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
2557 		if (page->index >= max_idx)
2558 			goto unlock;
2559 
2560 		if (file->f_ra.mmap_miss > 0)
2561 			file->f_ra.mmap_miss--;
2562 
2563 		vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
2564 		if (vmf->pte)
2565 			vmf->pte += iter.index - last_pgoff;
2566 		last_pgoff = iter.index;
2567 		if (alloc_set_pte(vmf, NULL, page))
2568 			goto unlock;
2569 		unlock_page(page);
2570 		goto next;
2571 unlock:
2572 		unlock_page(page);
2573 skip:
2574 		put_page(page);
2575 next:
2576 		/* Huge page is mapped? No need to proceed. */
2577 		if (pmd_trans_huge(*vmf->pmd))
2578 			break;
2579 		if (iter.index == end_pgoff)
2580 			break;
2581 	}
2582 	rcu_read_unlock();
2583 }
2584 EXPORT_SYMBOL(filemap_map_pages);
2585 
2586 int filemap_page_mkwrite(struct vm_fault *vmf)
2587 {
2588 	struct page *page = vmf->page;
2589 	struct inode *inode = file_inode(vmf->vma->vm_file);
2590 	int ret = VM_FAULT_LOCKED;
2591 
2592 	sb_start_pagefault(inode->i_sb);
2593 	file_update_time(vmf->vma->vm_file);
2594 	lock_page(page);
2595 	if (page->mapping != inode->i_mapping) {
2596 		unlock_page(page);
2597 		ret = VM_FAULT_NOPAGE;
2598 		goto out;
2599 	}
2600 	/*
2601 	 * We mark the page dirty already here so that when freeze is in
2602 	 * progress, we are guaranteed that writeback during freezing will
2603 	 * see the dirty page and writeprotect it again.
2604 	 */
2605 	set_page_dirty(page);
2606 	wait_for_stable_page(page);
2607 out:
2608 	sb_end_pagefault(inode->i_sb);
2609 	return ret;
2610 }
2611 EXPORT_SYMBOL(filemap_page_mkwrite);
2612 
2613 const struct vm_operations_struct generic_file_vm_ops = {
2614 	.fault		= filemap_fault,
2615 	.map_pages	= filemap_map_pages,
2616 	.page_mkwrite	= filemap_page_mkwrite,
2617 };
2618 
2619 /* This is used for a general mmap of a disk file */
2620 
2621 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2622 {
2623 	struct address_space *mapping = file->f_mapping;
2624 
2625 	if (!mapping->a_ops->readpage)
2626 		return -ENOEXEC;
2627 	file_accessed(file);
2628 	vma->vm_ops = &generic_file_vm_ops;
2629 	return 0;
2630 }
2631 
2632 /*
2633  * This is for filesystems which do not implement ->writepage.
2634  */
2635 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
2636 {
2637 	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
2638 		return -EINVAL;
2639 	return generic_file_mmap(file, vma);
2640 }
2641 #else
2642 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2643 {
2644 	return -ENOSYS;
2645 }
2646 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
2647 {
2648 	return -ENOSYS;
2649 }
2650 #endif /* CONFIG_MMU */
2651 
2652 EXPORT_SYMBOL(generic_file_mmap);
2653 EXPORT_SYMBOL(generic_file_readonly_mmap);
2654 
2655 static struct page *wait_on_page_read(struct page *page)
2656 {
2657 	if (!IS_ERR(page)) {
2658 		wait_on_page_locked(page);
2659 		if (!PageUptodate(page)) {
2660 			put_page(page);
2661 			page = ERR_PTR(-EIO);
2662 		}
2663 	}
2664 	return page;
2665 }
2666 
2667 static struct page *do_read_cache_page(struct address_space *mapping,
2668 				pgoff_t index,
2669 				int (*filler)(void *, struct page *),
2670 				void *data,
2671 				gfp_t gfp)
2672 {
2673 	struct page *page;
2674 	int err;
2675 repeat:
2676 	page = find_get_page(mapping, index);
2677 	if (!page) {
2678 		page = __page_cache_alloc(gfp | __GFP_COLD);
2679 		if (!page)
2680 			return ERR_PTR(-ENOMEM);
2681 		err = add_to_page_cache_lru(page, mapping, index, gfp);
2682 		if (unlikely(err)) {
2683 			put_page(page);
2684 			if (err == -EEXIST)
2685 				goto repeat;
2686 			/* Presumably ENOMEM for radix tree node */
2687 			return ERR_PTR(err);
2688 		}
2689 
2690 filler:
2691 		err = filler(data, page);
2692 		if (err < 0) {
2693 			put_page(page);
2694 			return ERR_PTR(err);
2695 		}
2696 
2697 		page = wait_on_page_read(page);
2698 		if (IS_ERR(page))
2699 			return page;
2700 		goto out;
2701 	}
2702 	if (PageUptodate(page))
2703 		goto out;
2704 
2705 	/*
2706 	 * Page is not up to date and may be locked due one of the following
2707 	 * case a: Page is being filled and the page lock is held
2708 	 * case b: Read/write error clearing the page uptodate status
2709 	 * case c: Truncation in progress (page locked)
2710 	 * case d: Reclaim in progress
2711 	 *
2712 	 * Case a, the page will be up to date when the page is unlocked.
2713 	 *    There is no need to serialise on the page lock here as the page
2714 	 *    is pinned so the lock gives no additional protection. Even if the
2715 	 *    the page is truncated, the data is still valid if PageUptodate as
2716 	 *    it's a race vs truncate race.
2717 	 * Case b, the page will not be up to date
2718 	 * Case c, the page may be truncated but in itself, the data may still
2719 	 *    be valid after IO completes as it's a read vs truncate race. The
2720 	 *    operation must restart if the page is not uptodate on unlock but
2721 	 *    otherwise serialising on page lock to stabilise the mapping gives
2722 	 *    no additional guarantees to the caller as the page lock is
2723 	 *    released before return.
2724 	 * Case d, similar to truncation. If reclaim holds the page lock, it
2725 	 *    will be a race with remove_mapping that determines if the mapping
2726 	 *    is valid on unlock but otherwise the data is valid and there is
2727 	 *    no need to serialise with page lock.
2728 	 *
2729 	 * As the page lock gives no additional guarantee, we optimistically
2730 	 * wait on the page to be unlocked and check if it's up to date and
2731 	 * use the page if it is. Otherwise, the page lock is required to
2732 	 * distinguish between the different cases. The motivation is that we
2733 	 * avoid spurious serialisations and wakeups when multiple processes
2734 	 * wait on the same page for IO to complete.
2735 	 */
2736 	wait_on_page_locked(page);
2737 	if (PageUptodate(page))
2738 		goto out;
2739 
2740 	/* Distinguish between all the cases under the safety of the lock */
2741 	lock_page(page);
2742 
2743 	/* Case c or d, restart the operation */
2744 	if (!page->mapping) {
2745 		unlock_page(page);
2746 		put_page(page);
2747 		goto repeat;
2748 	}
2749 
2750 	/* Someone else locked and filled the page in a very small window */
2751 	if (PageUptodate(page)) {
2752 		unlock_page(page);
2753 		goto out;
2754 	}
2755 	goto filler;
2756 
2757 out:
2758 	mark_page_accessed(page);
2759 	return page;
2760 }
2761 
2762 /**
2763  * read_cache_page - read into page cache, fill it if needed
2764  * @mapping:	the page's address_space
2765  * @index:	the page index
2766  * @filler:	function to perform the read
2767  * @data:	first arg to filler(data, page) function, often left as NULL
2768  *
2769  * Read into the page cache. If a page already exists, and PageUptodate() is
2770  * not set, try to fill the page and wait for it to become unlocked.
2771  *
2772  * If the page does not get brought uptodate, return -EIO.
2773  */
2774 struct page *read_cache_page(struct address_space *mapping,
2775 				pgoff_t index,
2776 				int (*filler)(void *, struct page *),
2777 				void *data)
2778 {
2779 	return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
2780 }
2781 EXPORT_SYMBOL(read_cache_page);
2782 
2783 /**
2784  * read_cache_page_gfp - read into page cache, using specified page allocation flags.
2785  * @mapping:	the page's address_space
2786  * @index:	the page index
2787  * @gfp:	the page allocator flags to use if allocating
2788  *
2789  * This is the same as "read_mapping_page(mapping, index, NULL)", but with
2790  * any new page allocations done using the specified allocation flags.
2791  *
2792  * If the page does not get brought uptodate, return -EIO.
2793  */
2794 struct page *read_cache_page_gfp(struct address_space *mapping,
2795 				pgoff_t index,
2796 				gfp_t gfp)
2797 {
2798 	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
2799 
2800 	return do_read_cache_page(mapping, index, filler, NULL, gfp);
2801 }
2802 EXPORT_SYMBOL(read_cache_page_gfp);
2803 
2804 /*
2805  * Performs necessary checks before doing a write
2806  *
2807  * Can adjust writing position or amount of bytes to write.
2808  * Returns appropriate error code that caller should return or
2809  * zero in case that write should be allowed.
2810  */
2811 inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
2812 {
2813 	struct file *file = iocb->ki_filp;
2814 	struct inode *inode = file->f_mapping->host;
2815 	unsigned long limit = rlimit(RLIMIT_FSIZE);
2816 	loff_t pos;
2817 
2818 	if (!iov_iter_count(from))
2819 		return 0;
2820 
2821 	/* FIXME: this is for backwards compatibility with 2.4 */
2822 	if (iocb->ki_flags & IOCB_APPEND)
2823 		iocb->ki_pos = i_size_read(inode);
2824 
2825 	pos = iocb->ki_pos;
2826 
2827 	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
2828 		return -EINVAL;
2829 
2830 	if (limit != RLIM_INFINITY) {
2831 		if (iocb->ki_pos >= limit) {
2832 			send_sig(SIGXFSZ, current, 0);
2833 			return -EFBIG;
2834 		}
2835 		iov_iter_truncate(from, limit - (unsigned long)pos);
2836 	}
2837 
2838 	/*
2839 	 * LFS rule
2840 	 */
2841 	if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
2842 				!(file->f_flags & O_LARGEFILE))) {
2843 		if (pos >= MAX_NON_LFS)
2844 			return -EFBIG;
2845 		iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
2846 	}
2847 
2848 	/*
2849 	 * Are we about to exceed the fs block limit ?
2850 	 *
2851 	 * If we have written data it becomes a short write.  If we have
2852 	 * exceeded without writing data we send a signal and return EFBIG.
2853 	 * Linus frestrict idea will clean these up nicely..
2854 	 */
2855 	if (unlikely(pos >= inode->i_sb->s_maxbytes))
2856 		return -EFBIG;
2857 
2858 	iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
2859 	return iov_iter_count(from);
2860 }
2861 EXPORT_SYMBOL(generic_write_checks);
2862 
2863 int pagecache_write_begin(struct file *file, struct address_space *mapping,
2864 				loff_t pos, unsigned len, unsigned flags,
2865 				struct page **pagep, void **fsdata)
2866 {
2867 	const struct address_space_operations *aops = mapping->a_ops;
2868 
2869 	return aops->write_begin(file, mapping, pos, len, flags,
2870 							pagep, fsdata);
2871 }
2872 EXPORT_SYMBOL(pagecache_write_begin);
2873 
2874 int pagecache_write_end(struct file *file, struct address_space *mapping,
2875 				loff_t pos, unsigned len, unsigned copied,
2876 				struct page *page, void *fsdata)
2877 {
2878 	const struct address_space_operations *aops = mapping->a_ops;
2879 
2880 	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
2881 }
2882 EXPORT_SYMBOL(pagecache_write_end);
2883 
2884 ssize_t
2885 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
2886 {
2887 	struct file	*file = iocb->ki_filp;
2888 	struct address_space *mapping = file->f_mapping;
2889 	struct inode	*inode = mapping->host;
2890 	loff_t		pos = iocb->ki_pos;
2891 	ssize_t		written;
2892 	size_t		write_len;
2893 	pgoff_t		end;
2894 
2895 	write_len = iov_iter_count(from);
2896 	end = (pos + write_len - 1) >> PAGE_SHIFT;
2897 
2898 	if (iocb->ki_flags & IOCB_NOWAIT) {
2899 		/* If there are pages to writeback, return */
2900 		if (filemap_range_has_page(inode->i_mapping, pos,
2901 					   pos + iov_iter_count(from)))
2902 			return -EAGAIN;
2903 	} else {
2904 		written = filemap_write_and_wait_range(mapping, pos,
2905 							pos + write_len - 1);
2906 		if (written)
2907 			goto out;
2908 	}
2909 
2910 	/*
2911 	 * After a write we want buffered reads to be sure to go to disk to get
2912 	 * the new data.  We invalidate clean cached page from the region we're
2913 	 * about to write.  We do this *before* the write so that we can return
2914 	 * without clobbering -EIOCBQUEUED from ->direct_IO().
2915 	 */
2916 	written = invalidate_inode_pages2_range(mapping,
2917 					pos >> PAGE_SHIFT, end);
2918 	/*
2919 	 * If a page can not be invalidated, return 0 to fall back
2920 	 * to buffered write.
2921 	 */
2922 	if (written) {
2923 		if (written == -EBUSY)
2924 			return 0;
2925 		goto out;
2926 	}
2927 
2928 	written = mapping->a_ops->direct_IO(iocb, from);
2929 
2930 	/*
2931 	 * Finally, try again to invalidate clean pages which might have been
2932 	 * cached by non-direct readahead, or faulted in by get_user_pages()
2933 	 * if the source of the write was an mmap'ed region of the file
2934 	 * we're writing.  Either one is a pretty crazy thing to do,
2935 	 * so we don't support it 100%.  If this invalidation
2936 	 * fails, tough, the write still worked...
2937 	 *
2938 	 * Most of the time we do not need this since dio_complete() will do
2939 	 * the invalidation for us. However there are some file systems that
2940 	 * do not end up with dio_complete() being called, so let's not break
2941 	 * them by removing it completely
2942 	 */
2943 	if (mapping->nrpages)
2944 		invalidate_inode_pages2_range(mapping,
2945 					pos >> PAGE_SHIFT, end);
2946 
2947 	if (written > 0) {
2948 		pos += written;
2949 		write_len -= written;
2950 		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2951 			i_size_write(inode, pos);
2952 			mark_inode_dirty(inode);
2953 		}
2954 		iocb->ki_pos = pos;
2955 	}
2956 	iov_iter_revert(from, write_len - iov_iter_count(from));
2957 out:
2958 	return written;
2959 }
2960 EXPORT_SYMBOL(generic_file_direct_write);
2961 
2962 /*
2963  * Find or create a page at the given pagecache position. Return the locked
2964  * page. This function is specifically for buffered writes.
2965  */
2966 struct page *grab_cache_page_write_begin(struct address_space *mapping,
2967 					pgoff_t index, unsigned flags)
2968 {
2969 	struct page *page;
2970 	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
2971 
2972 	if (flags & AOP_FLAG_NOFS)
2973 		fgp_flags |= FGP_NOFS;
2974 
2975 	page = pagecache_get_page(mapping, index, fgp_flags,
2976 			mapping_gfp_mask(mapping));
2977 	if (page)
2978 		wait_for_stable_page(page);
2979 
2980 	return page;
2981 }
2982 EXPORT_SYMBOL(grab_cache_page_write_begin);
2983 
2984 ssize_t generic_perform_write(struct file *file,
2985 				struct iov_iter *i, loff_t pos)
2986 {
2987 	struct address_space *mapping = file->f_mapping;
2988 	const struct address_space_operations *a_ops = mapping->a_ops;
2989 	long status = 0;
2990 	ssize_t written = 0;
2991 	unsigned int flags = 0;
2992 
2993 	do {
2994 		struct page *page;
2995 		unsigned long offset;	/* Offset into pagecache page */
2996 		unsigned long bytes;	/* Bytes to write to page */
2997 		size_t copied;		/* Bytes copied from user */
2998 		void *fsdata;
2999 
3000 		offset = (pos & (PAGE_SIZE - 1));
3001 		bytes = min_t(unsigned long, PAGE_SIZE - offset,
3002 						iov_iter_count(i));
3003 
3004 again:
3005 		/*
3006 		 * Bring in the user page that we will copy from _first_.
3007 		 * Otherwise there's a nasty deadlock on copying from the
3008 		 * same page as we're writing to, without it being marked
3009 		 * up-to-date.
3010 		 *
3011 		 * Not only is this an optimisation, but it is also required
3012 		 * to check that the address is actually valid, when atomic
3013 		 * usercopies are used, below.
3014 		 */
3015 		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
3016 			status = -EFAULT;
3017 			break;
3018 		}
3019 
3020 		if (fatal_signal_pending(current)) {
3021 			status = -EINTR;
3022 			break;
3023 		}
3024 
3025 		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
3026 						&page, &fsdata);
3027 		if (unlikely(status < 0))
3028 			break;
3029 
3030 		if (mapping_writably_mapped(mapping))
3031 			flush_dcache_page(page);
3032 
3033 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
3034 		flush_dcache_page(page);
3035 
3036 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
3037 						page, fsdata);
3038 		if (unlikely(status < 0))
3039 			break;
3040 		copied = status;
3041 
3042 		cond_resched();
3043 
3044 		iov_iter_advance(i, copied);
3045 		if (unlikely(copied == 0)) {
3046 			/*
3047 			 * If we were unable to copy any data at all, we must
3048 			 * fall back to a single segment length write.
3049 			 *
3050 			 * If we didn't fallback here, we could livelock
3051 			 * because not all segments in the iov can be copied at
3052 			 * once without a pagefault.
3053 			 */
3054 			bytes = min_t(unsigned long, PAGE_SIZE - offset,
3055 						iov_iter_single_seg_count(i));
3056 			goto again;
3057 		}
3058 		pos += copied;
3059 		written += copied;
3060 
3061 		balance_dirty_pages_ratelimited(mapping);
3062 	} while (iov_iter_count(i));
3063 
3064 	return written ? written : status;
3065 }
3066 EXPORT_SYMBOL(generic_perform_write);
3067 
3068 /**
3069  * __generic_file_write_iter - write data to a file
3070  * @iocb:	IO state structure (file, offset, etc.)
3071  * @from:	iov_iter with data to write
3072  *
3073  * This function does all the work needed for actually writing data to a
3074  * file. It does all basic checks, removes SUID from the file, updates
3075  * modification times and calls proper subroutines depending on whether we
3076  * do direct IO or a standard buffered write.
3077  *
3078  * It expects i_mutex to be grabbed unless we work on a block device or similar
3079  * object which does not need locking at all.
3080  *
3081  * This function does *not* take care of syncing data in case of O_SYNC write.
3082  * A caller has to handle it. This is mainly due to the fact that we want to
3083  * avoid syncing under i_mutex.
3084  */
3085 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
3086 {
3087 	struct file *file = iocb->ki_filp;
3088 	struct address_space * mapping = file->f_mapping;
3089 	struct inode 	*inode = mapping->host;
3090 	ssize_t		written = 0;
3091 	ssize_t		err;
3092 	ssize_t		status;
3093 
3094 	/* We can write back this queue in page reclaim */
3095 	current->backing_dev_info = inode_to_bdi(inode);
3096 	err = file_remove_privs(file);
3097 	if (err)
3098 		goto out;
3099 
3100 	err = file_update_time(file);
3101 	if (err)
3102 		goto out;
3103 
3104 	if (iocb->ki_flags & IOCB_DIRECT) {
3105 		loff_t pos, endbyte;
3106 
3107 		written = generic_file_direct_write(iocb, from);
3108 		/*
3109 		 * If the write stopped short of completing, fall back to
3110 		 * buffered writes.  Some filesystems do this for writes to
3111 		 * holes, for example.  For DAX files, a buffered write will
3112 		 * not succeed (even if it did, DAX does not handle dirty
3113 		 * page-cache pages correctly).
3114 		 */
3115 		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
3116 			goto out;
3117 
3118 		status = generic_perform_write(file, from, pos = iocb->ki_pos);
3119 		/*
3120 		 * If generic_perform_write() returned a synchronous error
3121 		 * then we want to return the number of bytes which were
3122 		 * direct-written, or the error code if that was zero.  Note
3123 		 * that this differs from normal direct-io semantics, which
3124 		 * will return -EFOO even if some bytes were written.
3125 		 */
3126 		if (unlikely(status < 0)) {
3127 			err = status;
3128 			goto out;
3129 		}
3130 		/*
3131 		 * We need to ensure that the page cache pages are written to
3132 		 * disk and invalidated to preserve the expected O_DIRECT
3133 		 * semantics.
3134 		 */
3135 		endbyte = pos + status - 1;
3136 		err = filemap_write_and_wait_range(mapping, pos, endbyte);
3137 		if (err == 0) {
3138 			iocb->ki_pos = endbyte + 1;
3139 			written += status;
3140 			invalidate_mapping_pages(mapping,
3141 						 pos >> PAGE_SHIFT,
3142 						 endbyte >> PAGE_SHIFT);
3143 		} else {
3144 			/*
3145 			 * We don't know how much we wrote, so just return
3146 			 * the number of bytes which were direct-written
3147 			 */
3148 		}
3149 	} else {
3150 		written = generic_perform_write(file, from, iocb->ki_pos);
3151 		if (likely(written > 0))
3152 			iocb->ki_pos += written;
3153 	}
3154 out:
3155 	current->backing_dev_info = NULL;
3156 	return written ? written : err;
3157 }
3158 EXPORT_SYMBOL(__generic_file_write_iter);
3159 
3160 /**
3161  * generic_file_write_iter - write data to a file
3162  * @iocb:	IO state structure
3163  * @from:	iov_iter with data to write
3164  *
3165  * This is a wrapper around __generic_file_write_iter() to be used by most
3166  * filesystems. It takes care of syncing the file in case of O_SYNC file
3167  * and acquires i_mutex as needed.
3168  */
3169 ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
3170 {
3171 	struct file *file = iocb->ki_filp;
3172 	struct inode *inode = file->f_mapping->host;
3173 	ssize_t ret;
3174 
3175 	inode_lock(inode);
3176 	ret = generic_write_checks(iocb, from);
3177 	if (ret > 0)
3178 		ret = __generic_file_write_iter(iocb, from);
3179 	inode_unlock(inode);
3180 
3181 	if (ret > 0)
3182 		ret = generic_write_sync(iocb, ret);
3183 	return ret;
3184 }
3185 EXPORT_SYMBOL(generic_file_write_iter);
3186 
3187 /**
3188  * try_to_release_page() - release old fs-specific metadata on a page
3189  *
3190  * @page: the page which the kernel is trying to free
3191  * @gfp_mask: memory allocation flags (and I/O mode)
3192  *
3193  * The address_space is to try to release any data against the page
3194  * (presumably at page->private).  If the release was successful, return '1'.
3195  * Otherwise return zero.
3196  *
3197  * This may also be called if PG_fscache is set on a page, indicating that the
3198  * page is known to the local caching routines.
3199  *
3200  * The @gfp_mask argument specifies whether I/O may be performed to release
3201  * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
3202  *
3203  */
3204 int try_to_release_page(struct page *page, gfp_t gfp_mask)
3205 {
3206 	struct address_space * const mapping = page->mapping;
3207 
3208 	BUG_ON(!PageLocked(page));
3209 	if (PageWriteback(page))
3210 		return 0;
3211 
3212 	if (mapping && mapping->a_ops->releasepage)
3213 		return mapping->a_ops->releasepage(page, gfp_mask);
3214 	return try_to_free_buffers(page);
3215 }
3216 
3217 EXPORT_SYMBOL(try_to_release_page);
3218