xref: /openbmc/linux/mm/truncate.c (revision 68198dca)
1 /*
2  * mm/truncate.c - code for taking down pages from address_spaces
3  *
4  * Copyright (C) 2002, Linus Torvalds
5  *
6  * 10Sep2002	Andrew Morton
7  *		Initial version.
8  */
9 
10 #include <linux/kernel.h>
11 #include <linux/backing-dev.h>
12 #include <linux/dax.h>
13 #include <linux/gfp.h>
14 #include <linux/mm.h>
15 #include <linux/swap.h>
16 #include <linux/export.h>
17 #include <linux/pagemap.h>
18 #include <linux/highmem.h>
19 #include <linux/pagevec.h>
20 #include <linux/task_io_accounting_ops.h>
21 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
22 				   do_invalidatepage */
23 #include <linux/shmem_fs.h>
24 #include <linux/cleancache.h>
25 #include <linux/rmap.h>
26 #include "internal.h"
27 
28 /*
29  * Regular page slots are stabilized by the page lock even without the tree
30  * itself locked.  These unlocked entries need verification under the tree
31  * lock.
32  */
33 static inline void __clear_shadow_entry(struct address_space *mapping,
34 				pgoff_t index, void *entry)
35 {
36 	struct radix_tree_node *node;
37 	void **slot;
38 
39 	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
40 		return;
41 	if (*slot != entry)
42 		return;
43 	__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
44 			     workingset_update_node);
45 	mapping->nrexceptional--;
46 }
47 
48 static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
49 			       void *entry)
50 {
51 	spin_lock_irq(&mapping->tree_lock);
52 	__clear_shadow_entry(mapping, index, entry);
53 	spin_unlock_irq(&mapping->tree_lock);
54 }
55 
56 /*
57  * Unconditionally remove exceptional entries.  Usually called from the
58  * truncate path.  Note that this function may alter the pagevec by removing
59  * exceptional entries, similar to what pagevec_remove_exceptionals() does.
60  */
61 static void truncate_exceptional_pvec_entries(struct address_space *mapping,
62 				struct pagevec *pvec, pgoff_t *indices,
63 				pgoff_t end)
64 {
65 	int i, j;
66 	bool dax, lock;
67 
68 	/* Handled by shmem itself */
69 	if (shmem_mapping(mapping))
70 		return;
71 
72 	for (j = 0; j < pagevec_count(pvec); j++)
73 		if (radix_tree_exceptional_entry(pvec->pages[j]))
74 			break;
75 
76 	if (j == pagevec_count(pvec))
77 		return;
78 
79 	dax = dax_mapping(mapping);
80 	lock = !dax && indices[j] < end;
81 	if (lock)
82 		spin_lock_irq(&mapping->tree_lock);
83 
84 	for (i = j; i < pagevec_count(pvec); i++) {
85 		struct page *page = pvec->pages[i];
86 		pgoff_t index = indices[i];
87 
88 		if (!radix_tree_exceptional_entry(page)) {
89 			pvec->pages[j++] = page;
90 			continue;
91 		}
92 
93 		if (index >= end)
94 			continue;
95 
96 		if (unlikely(dax)) {
97 			dax_delete_mapping_entry(mapping, index);
98 			continue;
99 		}
100 
101 		__clear_shadow_entry(mapping, index, page);
102 	}
103 
104 	if (lock)
105 		spin_unlock_irq(&mapping->tree_lock);
106 	pvec->nr = j;
107 }
108 
109 /*
110  * Invalidate an exceptional entry if easily possible. This handles
111  * exceptional entries for invalidate_mapping_pages().
112  */
113 static int invalidate_exceptional_entry(struct address_space *mapping,
114 					pgoff_t index, void *entry)
115 {
116 	/* Handled by shmem itself, or for DAX we do nothing. */
117 	if (shmem_mapping(mapping) || dax_mapping(mapping))
118 		return 1;
119 	clear_shadow_entry(mapping, index, entry);
120 	return 1;
121 }
122 
123 /*
124  * Invalidate an exceptional entry if it is clean. This handles exceptional
125  * entries for invalidate_inode_pages2(); for DAX it evicts only clean entries.
126  */
127 static int invalidate_exceptional_entry2(struct address_space *mapping,
128 					 pgoff_t index, void *entry)
129 {
130 	/* Handled by shmem itself */
131 	if (shmem_mapping(mapping))
132 		return 1;
133 	if (dax_mapping(mapping))
134 		return dax_invalidate_mapping_entry_sync(mapping, index);
135 	clear_shadow_entry(mapping, index, entry);
136 	return 1;
137 }
138 
139 /**
140  * do_invalidatepage - invalidate part or all of a page
141  * @page: the page which is affected
142  * @offset: start of the range to invalidate
143  * @length: length of the range to invalidate
144  *
145  * do_invalidatepage() is called when all or part of the page has become
146  * invalidated by a truncate operation.
147  *
148  * do_invalidatepage() does not have to release all buffers, but it must
149  * ensure that no dirty buffer is left outside @offset and that no I/O
150  * is underway against any of the blocks which are outside the truncation
151  * point, because the caller is about to free (and possibly reuse) those
152  * blocks on-disk.
153  */
154 void do_invalidatepage(struct page *page, unsigned int offset,
155 		       unsigned int length)
156 {
157 	void (*invalidatepage)(struct page *, unsigned int, unsigned int);
158 
159 	invalidatepage = page->mapping->a_ops->invalidatepage;
160 #ifdef CONFIG_BLOCK
161 	if (!invalidatepage)
162 		invalidatepage = block_invalidatepage;
163 #endif
164 	if (invalidatepage)
165 		(*invalidatepage)(page, offset, length);
166 }
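
/*
 * Illustrative sketch, not part of this file: a buffer_head based
 * filesystem with no private per-page metadata can satisfy the contract
 * above by leaving ->invalidatepage NULL (taking the block_invalidatepage
 * fallback) or by wiring it up explicitly.  "example_aops",
 * "example_readpage" and "example_writepage" are hypothetical names.
 *
 *	static const struct address_space_operations example_aops = {
 *		.readpage	= example_readpage,
 *		.writepage	= example_writepage,
 *		.invalidatepage	= block_invalidatepage,
 *	};
 */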
167 
168 /*
169  * If truncate cannot remove the fs-private metadata from the page, the page
170  * becomes orphaned.  It will be left on the LRU and may even be mapped into
171  * user pagetables if we're racing with filemap_fault().
172  *
173  * We need to bail out if page->mapping is no longer equal to the original
174  * mapping.  This happens a) when the VM reclaimed the page while we waited on
175  * its lock, b) when a concurrent invalidate_mapping_pages got there first and
176  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
177  */
178 static void
179 truncate_cleanup_page(struct address_space *mapping, struct page *page)
180 {
181 	if (page_mapped(page)) {
182 		loff_t holelen;
183 
184 		holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
185 		unmap_mapping_range(mapping,
186 				   (loff_t)page->index << PAGE_SHIFT,
187 				   holelen, 0);
188 	}
189 
190 	if (page_has_private(page))
191 		do_invalidatepage(page, 0, PAGE_SIZE);
192 
193 	/*
194 	 * Some filesystems seem to re-dirty the page even after
195 	 * the VM has canceled the dirty bit (e.g. ext3 journaling).
196 	 * Hence the dirty accounting check is placed after invalidation.
197 	 */
198 	cancel_dirty_page(page);
199 	ClearPageMappedToDisk(page);
200 }
201 
202 /*
203  * This is for invalidate_mapping_pages().  That function can be called at
204  * any time, and is not supposed to throw away dirty pages.  But pages can
205  * be marked dirty at any time too, so use remove_mapping which safely
206  * discards clean, unused pages.
207  *
208  * Returns non-zero if the page was successfully invalidated.
209  */
210 static int
211 invalidate_complete_page(struct address_space *mapping, struct page *page)
212 {
213 	int ret;
214 
215 	if (page->mapping != mapping)
216 		return 0;
217 
218 	if (page_has_private(page) && !try_to_release_page(page, 0))
219 		return 0;
220 
221 	ret = remove_mapping(mapping, page);
222 
223 	return ret;
224 }
225 
226 int truncate_inode_page(struct address_space *mapping, struct page *page)
227 {
228 	VM_BUG_ON_PAGE(PageTail(page), page);
229 
230 	if (page->mapping != mapping)
231 		return -EIO;
232 
233 	truncate_cleanup_page(mapping, page);
234 	delete_from_page_cache(page);
235 	return 0;
236 }
237 
238 /*
239  * Used to get rid of pages on hardware memory corruption.
240  */
241 int generic_error_remove_page(struct address_space *mapping, struct page *page)
242 {
243 	if (!mapping)
244 		return -EINVAL;
245 	/*
246 	 * Only punch for normal data pages for now.
247 	 * Handling other types like directories would need more auditing.
248 	 */
249 	if (!S_ISREG(mapping->host->i_mode))
250 		return -EIO;
251 	return truncate_inode_page(mapping, page);
252 }
253 EXPORT_SYMBOL(generic_error_remove_page);
254 
255 /*
256  * Safely invalidate one page from its pagecache mapping.
257  * It only drops clean, unused pages. The page must be locked.
258  *
259  * Returns 1 if the page is successfully invalidated, otherwise 0.
260  */
261 int invalidate_inode_page(struct page *page)
262 {
263 	struct address_space *mapping = page_mapping(page);
264 	if (!mapping)
265 		return 0;
266 	if (PageDirty(page) || PageWriteback(page))
267 		return 0;
268 	if (page_mapped(page))
269 		return 0;
270 	return invalidate_complete_page(mapping, page);
271 }
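
/*
 * Illustrative sketch, not part of this file: a caller holding a reference
 * to a page can try to drop it opportunistically.  The page lock must be
 * held across the call, mirroring what invalidate_mapping_pages() does
 * below; "page" and "invalidated" are hypothetical locals.
 *
 *	int invalidated = 0;
 *
 *	if (trylock_page(page)) {
 *		invalidated = invalidate_inode_page(page);
 *		unlock_page(page);
 *	}
 */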
272 
273 /**
274  * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
275  * @mapping: mapping to truncate
276  * @lstart: offset from which to truncate
277  * @lend: offset to which to truncate (inclusive)
278  *
279  * Truncate the page cache, removing the pages that are between
280  * specified offsets (and zeroing out partial pages
281  * if lstart or lend + 1 is not page aligned).
282  *
283  * Truncate takes two passes - the first pass is nonblocking.  It will not
284  * block on page locks and it will not block on writeback.  The second pass
285  * will wait.  This is to prevent as much IO as possible in the affected region.
286  * The first pass will remove most pages, so the search cost of the second pass
287  * is low.
288  *
289  * We pass down the cache-hot hint to the page freeing code.  Even if the
290  * mapping is large, it is probably the case that the final pages are the most
291  * recently touched, and freeing happens in ascending file offset order.
292  *
293  * Note that since ->invalidatepage() accepts a range to invalidate,
294  * truncate_inode_pages_range is able to handle cases where lend + 1 is not
295  * properly page aligned.
296  */
297 void truncate_inode_pages_range(struct address_space *mapping,
298 				loff_t lstart, loff_t lend)
299 {
300 	pgoff_t		start;		/* inclusive */
301 	pgoff_t		end;		/* exclusive */
302 	unsigned int	partial_start;	/* inclusive */
303 	unsigned int	partial_end;	/* exclusive */
304 	struct pagevec	pvec;
305 	pgoff_t		indices[PAGEVEC_SIZE];
306 	pgoff_t		index;
307 	int		i;
308 
309 	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
310 		goto out;
311 
312 	/* Offsets within partial pages */
313 	partial_start = lstart & (PAGE_SIZE - 1);
314 	partial_end = (lend + 1) & (PAGE_SIZE - 1);
315 
316 	/*
317 	 * 'start' and 'end' always cover the range of pages to be fully
318 	 * truncated. Partial pages are covered with 'partial_start' at the
319 	 * start of the range and 'partial_end' at the end of the range.
320 	 * Note that 'end' is exclusive while 'lend' is inclusive.
321 	 */
322 	start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
323 	if (lend == -1)
324 		/*
325 		 * lend == -1 indicates end-of-file, so we have to set 'end'
326 		 * to the highest possible pgoff_t; since the type is
327 		 * unsigned, -1 gives us that value.
328 		 */
329 		end = -1;
330 	else
331 		end = (lend + 1) >> PAGE_SHIFT;
332 
333 	pagevec_init(&pvec);
334 	index = start;
335 	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
336 			min(end - index, (pgoff_t)PAGEVEC_SIZE),
337 			indices)) {
338 		/*
339 		 * The pagevec may contain exceptional entries, and we may also
340 		 * fail to lock some pages, so we store the pages that can be
341 		 * deleted in a new pagevec.
342 		 */
343 		struct pagevec locked_pvec;
344 
345 		pagevec_init(&locked_pvec);
346 		for (i = 0; i < pagevec_count(&pvec); i++) {
347 			struct page *page = pvec.pages[i];
348 
349 			/* We rely upon deletion not changing page->index */
350 			index = indices[i];
351 			if (index >= end)
352 				break;
353 
354 			if (radix_tree_exceptional_entry(page))
355 				continue;
356 
357 			if (!trylock_page(page))
358 				continue;
359 			WARN_ON(page_to_index(page) != index);
360 			if (PageWriteback(page)) {
361 				unlock_page(page);
362 				continue;
363 			}
364 			if (page->mapping != mapping) {
365 				unlock_page(page);
366 				continue;
367 			}
368 			pagevec_add(&locked_pvec, page);
369 		}
370 		for (i = 0; i < pagevec_count(&locked_pvec); i++)
371 			truncate_cleanup_page(mapping, locked_pvec.pages[i]);
372 		delete_from_page_cache_batch(mapping, &locked_pvec);
373 		for (i = 0; i < pagevec_count(&locked_pvec); i++)
374 			unlock_page(locked_pvec.pages[i]);
375 		truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
376 		pagevec_release(&pvec);
377 		cond_resched();
378 		index++;
379 	}
380 	if (partial_start) {
381 		struct page *page = find_lock_page(mapping, start - 1);
382 		if (page) {
383 			unsigned int top = PAGE_SIZE;
384 			if (start > end) {
385 				/* Truncation within a single page */
386 				top = partial_end;
387 				partial_end = 0;
388 			}
389 			wait_on_page_writeback(page);
390 			zero_user_segment(page, partial_start, top);
391 			cleancache_invalidate_page(mapping, page);
392 			if (page_has_private(page))
393 				do_invalidatepage(page, partial_start,
394 						  top - partial_start);
395 			unlock_page(page);
396 			put_page(page);
397 		}
398 	}
399 	if (partial_end) {
400 		struct page *page = find_lock_page(mapping, end);
401 		if (page) {
402 			wait_on_page_writeback(page);
403 			zero_user_segment(page, 0, partial_end);
404 			cleancache_invalidate_page(mapping, page);
405 			if (page_has_private(page))
406 				do_invalidatepage(page, 0,
407 						  partial_end);
408 			unlock_page(page);
409 			put_page(page);
410 		}
411 	}
412 	/*
413 	 * If the truncation happened within a single page, no pages
414 	 * will be released, just zeroed, so we can bail out now.
415 	 */
416 	if (start >= end)
417 		goto out;
418 
419 	index = start;
420 	for ( ; ; ) {
421 		cond_resched();
422 		if (!pagevec_lookup_entries(&pvec, mapping, index,
423 			min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
424 			/* If all gone from start onwards, we're done */
425 			if (index == start)
426 				break;
427 			/* Otherwise restart to make sure all gone */
428 			index = start;
429 			continue;
430 		}
431 		if (index == start && indices[0] >= end) {
432 			/* All gone out of hole to be punched, we're done */
433 			pagevec_remove_exceptionals(&pvec);
434 			pagevec_release(&pvec);
435 			break;
436 		}
437 
438 		for (i = 0; i < pagevec_count(&pvec); i++) {
439 			struct page *page = pvec.pages[i];
440 
441 			/* We rely upon deletion not changing page->index */
442 			index = indices[i];
443 			if (index >= end) {
444 				/* Restart punch to make sure all gone */
445 				index = start - 1;
446 				break;
447 			}
448 
449 			if (radix_tree_exceptional_entry(page))
450 				continue;
451 
452 			lock_page(page);
453 			WARN_ON(page_to_index(page) != index);
454 			wait_on_page_writeback(page);
455 			truncate_inode_page(mapping, page);
456 			unlock_page(page);
457 		}
458 		truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
459 		pagevec_release(&pvec);
460 		index++;
461 	}
462 
463 out:
464 	cleancache_invalidate_inode(mapping);
465 }
466 EXPORT_SYMBOL(truncate_inode_pages_range);
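
/*
 * Worked example for the offset arithmetic above, assuming PAGE_SIZE is
 * 4096: truncating the byte range lstart = 1000, lend = 9999 yields
 *
 *	partial_start = 1000 & 4095         = 1000
 *	partial_end   = (9999 + 1) & 4095   = 1808
 *	start         = (1000 + 4095) >> 12 = 1
 *	end           = (9999 + 1) >> 12    = 2
 *
 * so page 1 is removed entirely, page 0 is zeroed from byte 1000 to the
 * end of the page, and page 2 is zeroed from byte 0 up to (but not
 * including) byte 1808.
 */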
467 
468 /**
469  * truncate_inode_pages - truncate *all* the pages from an offset
470  * @mapping: mapping to truncate
471  * @lstart: offset from which to truncate
472  *
473  * Called under (and serialised by) inode->i_mutex.
474  *
475  * Note: When this function returns, there can be a page in the process of
476  * deletion (inside __delete_from_page_cache()) in the specified range.  Thus
477  * mapping->nrpages can be non-zero when this function returns even after
478  * truncation of the whole mapping.
479  */
480 void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
481 {
482 	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
483 }
484 EXPORT_SYMBOL(truncate_inode_pages);
485 
486 /**
487  * truncate_inode_pages_final - truncate *all* pages before inode dies
488  * @mapping: mapping to truncate
489  *
490  * Called under (and serialized by) inode->i_mutex.
491  *
492  * Filesystems have to use this in the .evict_inode path to inform the
493  * VM that this is the final truncate and the inode is going away.
494  */
495 void truncate_inode_pages_final(struct address_space *mapping)
496 {
497 	unsigned long nrexceptional;
498 	unsigned long nrpages;
499 
500 	/*
501 	 * Page reclaim can not participate in regular inode lifetime
502 	 * management (can't call iput()) and thus can race with the
503 	 * inode teardown.  Tell it when the address space is exiting,
504 	 * so that it does not install eviction information after the
505 	 * final truncate has begun.
506 	 */
507 	mapping_set_exiting(mapping);
508 
509 	/*
510 	 * When reclaim installs eviction entries, it increases
511 	 * nrexceptional first, then decreases nrpages.  Make sure we see
512 	 * this in the right order or we might miss an entry.
513 	 */
514 	nrpages = mapping->nrpages;
515 	smp_rmb();
516 	nrexceptional = mapping->nrexceptional;
517 
518 	if (nrpages || nrexceptional) {
519 		/*
520 		 * As truncation uses a lockless tree lookup, cycle
521 		 * the tree lock to make sure any ongoing tree
522 		 * modification that does not see AS_EXITING is
523 		 * completed before starting the final truncate.
524 		 */
525 		spin_lock_irq(&mapping->tree_lock);
526 		spin_unlock_irq(&mapping->tree_lock);
527 
528 		truncate_inode_pages(mapping, 0);
529 	}
530 }
531 EXPORT_SYMBOL(truncate_inode_pages_final);
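
/*
 * Illustrative sketch, not part of this file: a minimal ->evict_inode for
 * a filesystem with no private resources to release.  "example_evict_inode"
 * is a hypothetical name.
 *
 *	static void example_evict_inode(struct inode *inode)
 *	{
 *		truncate_inode_pages_final(&inode->i_data);
 *		clear_inode(inode);
 *	}
 */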
532 
533 /**
534  * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
535  * @mapping: the address_space which holds the pages to invalidate
536  * @start: the offset 'from' which to invalidate
537  * @end: the offset 'to' which to invalidate (inclusive)
538  *
539  * This function only removes the unlocked pages; if you want to
540  * remove all the pages of one inode, you must call truncate_inode_pages().
541  *
542  * invalidate_mapping_pages() will not block on IO activity. It will not
543  * invalidate pages which are dirty, locked, under writeback or mapped into
544  * pagetables.
545  */
546 unsigned long invalidate_mapping_pages(struct address_space *mapping,
547 		pgoff_t start, pgoff_t end)
548 {
549 	pgoff_t indices[PAGEVEC_SIZE];
550 	struct pagevec pvec;
551 	pgoff_t index = start;
552 	unsigned long ret;
553 	unsigned long count = 0;
554 	int i;
555 
556 	pagevec_init(&pvec);
557 	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
558 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
559 			indices)) {
560 		for (i = 0; i < pagevec_count(&pvec); i++) {
561 			struct page *page = pvec.pages[i];
562 
563 			/* We rely upon deletion not changing page->index */
564 			index = indices[i];
565 			if (index > end)
566 				break;
567 
568 			if (radix_tree_exceptional_entry(page)) {
569 				invalidate_exceptional_entry(mapping, index,
570 							     page);
571 				continue;
572 			}
573 
574 			if (!trylock_page(page))
575 				continue;
576 
577 			WARN_ON(page_to_index(page) != index);
578 
579 			/* Middle of THP: skip */
580 			if (PageTransTail(page)) {
581 				unlock_page(page);
582 				continue;
583 			} else if (PageTransHuge(page)) {
584 				index += HPAGE_PMD_NR - 1;
585 				i += HPAGE_PMD_NR - 1;
586 				/*
587 				 * 'end' is in the middle of THP. Don't
588 				 * invalidate the page as the part outside of
589 				 * 'end' could be still useful.
590 				 */
591 				if (index > end) {
592 					unlock_page(page);
593 					continue;
594 				}
595 			}
596 
597 			ret = invalidate_inode_page(page);
598 			unlock_page(page);
599 			/*
600 			 * Invalidation is a hint that the page is no longer
601 			 * of interest, so try to speed up its reclaim.
602 			 */
603 			if (!ret)
604 				deactivate_file_page(page);
605 			count += ret;
606 		}
607 		pagevec_remove_exceptionals(&pvec);
608 		pagevec_release(&pvec);
609 		cond_resched();
610 		index++;
611 	}
612 	return count;
613 }
614 EXPORT_SYMBOL(invalidate_mapping_pages);
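
/*
 * Illustrative sketch, not part of this file: a POSIX_FADV_DONTNEED-style
 * caller that only wants to drop pages lying entirely inside the byte
 * range [pos, pos + len) might convert to page indices like this ("pos",
 * "len", "first" and "last" are hypothetical variables):
 *
 *	pgoff_t first = DIV_ROUND_UP(pos, PAGE_SIZE);
 *	pgoff_t last = (pos + len) >> PAGE_SHIFT;
 *
 *	if (last)
 *		invalidate_mapping_pages(mapping, first, last - 1);
 */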
615 
616 /*
617  * This is like invalidate_complete_page(), except it ignores the page's
618  * refcount.  We do this because invalidate_inode_pages2() needs stronger
619  * invalidation guarantees, and cannot afford to leave pages behind because
620  * shrink_page_list() has a temp ref on them, or because they're transiently
621  * sitting in the lru_cache_add() pagevecs.
622  */
623 static int
624 invalidate_complete_page2(struct address_space *mapping, struct page *page)
625 {
626 	unsigned long flags;
627 
628 	if (page->mapping != mapping)
629 		return 0;
630 
631 	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
632 		return 0;
633 
634 	spin_lock_irqsave(&mapping->tree_lock, flags);
635 	if (PageDirty(page))
636 		goto failed;
637 
638 	BUG_ON(page_has_private(page));
639 	__delete_from_page_cache(page, NULL);
640 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
641 
642 	if (mapping->a_ops->freepage)
643 		mapping->a_ops->freepage(page);
644 
645 	put_page(page);	/* pagecache ref */
646 	return 1;
647 failed:
648 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
649 	return 0;
650 }
651 
652 static int do_launder_page(struct address_space *mapping, struct page *page)
653 {
654 	if (!PageDirty(page))
655 		return 0;
656 	if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
657 		return 0;
658 	return mapping->a_ops->launder_page(page);
659 }
660 
661 /**
662  * invalidate_inode_pages2_range - remove range of pages from an address_space
663  * @mapping: the address_space
664  * @start: the page offset 'from' which to invalidate
665  * @end: the page offset 'to' which to invalidate (inclusive)
666  *
667  * Any pages which are found to be mapped into pagetables are unmapped prior to
668  * invalidation.
669  *
670  * Returns -EBUSY if any pages could not be invalidated.
671  */
672 int invalidate_inode_pages2_range(struct address_space *mapping,
673 				  pgoff_t start, pgoff_t end)
674 {
675 	pgoff_t indices[PAGEVEC_SIZE];
676 	struct pagevec pvec;
677 	pgoff_t index;
678 	int i;
679 	int ret = 0;
680 	int ret2 = 0;
681 	int did_range_unmap = 0;
682 
683 	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
684 		goto out;
685 
686 	pagevec_init(&pvec);
687 	index = start;
688 	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
689 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
690 			indices)) {
691 		for (i = 0; i < pagevec_count(&pvec); i++) {
692 			struct page *page = pvec.pages[i];
693 
694 			/* We rely upon deletion not changing page->index */
695 			index = indices[i];
696 			if (index > end)
697 				break;
698 
699 			if (radix_tree_exceptional_entry(page)) {
700 				if (!invalidate_exceptional_entry2(mapping,
701 								   index, page))
702 					ret = -EBUSY;
703 				continue;
704 			}
705 
706 			lock_page(page);
707 			WARN_ON(page_to_index(page) != index);
708 			if (page->mapping != mapping) {
709 				unlock_page(page);
710 				continue;
711 			}
712 			wait_on_page_writeback(page);
713 			if (page_mapped(page)) {
714 				if (!did_range_unmap) {
715 					/*
716 					 * Zap the rest of the file in one hit.
717 					 */
718 					unmap_mapping_range(mapping,
719 					   (loff_t)index << PAGE_SHIFT,
720 					   (loff_t)(1 + end - index)
721 							 << PAGE_SHIFT,
722 							 0);
723 					did_range_unmap = 1;
724 				} else {
725 					/*
726 					 * Just zap this page
727 					 */
728 					unmap_mapping_range(mapping,
729 					   (loff_t)index << PAGE_SHIFT,
730 					   PAGE_SIZE, 0);
731 				}
732 			}
733 			BUG_ON(page_mapped(page));
734 			ret2 = do_launder_page(mapping, page);
735 			if (ret2 == 0) {
736 				if (!invalidate_complete_page2(mapping, page))
737 					ret2 = -EBUSY;
738 			}
739 			if (ret2 < 0)
740 				ret = ret2;
741 			unlock_page(page);
742 		}
743 		pagevec_remove_exceptionals(&pvec);
744 		pagevec_release(&pvec);
745 		cond_resched();
746 		index++;
747 	}
748 	/*
749 	 * For DAX we invalidate page tables after invalidating the radix tree.
750 	 * We could invalidate page tables while invalidating each entry, but
751 	 * that would be expensive.  Doing the range unmapping up front doesn't
752 	 * work either, as we have no cheap way to tell whether a radix tree
753 	 * entry got remapped later.
754 	 */
755 	if (dax_mapping(mapping)) {
756 		unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
757 				    (loff_t)(end - start + 1) << PAGE_SHIFT, 0);
758 	}
759 out:
760 	cleancache_invalidate_inode(mapping);
761 	return ret;
762 }
763 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
764 
765 /**
766  * invalidate_inode_pages2 - remove all pages from an address_space
767  * @mapping: the address_space
768  *
769  * Any pages which are found to be mapped into pagetables are unmapped prior to
770  * invalidation.
771  *
772  * Returns -EBUSY if any pages could not be invalidated.
773  */
774 int invalidate_inode_pages2(struct address_space *mapping)
775 {
776 	return invalidate_inode_pages2_range(mapping, 0, -1);
777 }
778 EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
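
/*
 * Illustrative sketch, not part of this file: roughly what a direct I/O
 * write path does before bypassing the page cache for the byte range
 * [pos, pos + count): flush any dirty pagecache, then invalidate it so
 * that later buffered reads do not see stale data ("pos", "count" and
 * "err" are hypothetical variables):
 *
 *	err = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
 *	if (!err)
 *		err = invalidate_inode_pages2_range(mapping,
 *				pos >> PAGE_SHIFT,
 *				(pos + count - 1) >> PAGE_SHIFT);
 */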
779 
780 /**
781  * truncate_pagecache - unmap and remove pagecache that has been truncated
782  * @inode: inode
783  * @newsize: new file size
784  *
785  * The inode's new i_size must already be written before truncate_pagecache
786  * is called.
787  *
788  * This function should typically be called before the filesystem
789  * releases resources associated with the freed range (eg. deallocates
790  * blocks). This way, pagecache will always stay logically coherent
791  * with on-disk format, and the filesystem would not have to deal with
792  * situations such as writepage being called for a page that has already
793  * had its underlying blocks deallocated.
794  */
795 void truncate_pagecache(struct inode *inode, loff_t newsize)
796 {
797 	struct address_space *mapping = inode->i_mapping;
798 	loff_t holebegin = round_up(newsize, PAGE_SIZE);
799 
800 	/*
801 	 * unmap_mapping_range is called twice, first simply for
802 	 * efficiency so that truncate_inode_pages does fewer
803 	 * single-page unmaps.  However after this first call, and
804 	 * before truncate_inode_pages finishes, it is possible for
805 	 * private pages to be COWed, which remain after
806 	 * truncate_inode_pages finishes, hence the second
807 	 * unmap_mapping_range call must be made for correctness.
808 	 */
809 	unmap_mapping_range(mapping, holebegin, 0, 1);
810 	truncate_inode_pages(mapping, newsize);
811 	unmap_mapping_range(mapping, holebegin, 0, 1);
812 }
813 EXPORT_SYMBOL(truncate_pagecache);
814 
815 /**
816  * truncate_setsize - update inode and pagecache for a new file size
817  * @inode: inode
818  * @newsize: new file size
819  *
820  * truncate_setsize updates i_size and performs pagecache truncation (if
821  * necessary) to @newsize. It will typically be called from the filesystem's
822  * setattr function when ATTR_SIZE is passed in.
823  *
824  * Must be called with a lock serializing truncates and writes (generally
825  * i_mutex but e.g. xfs uses a different lock) and before all filesystem
826  * specific block truncation has been performed.
827  */
828 void truncate_setsize(struct inode *inode, loff_t newsize)
829 {
830 	loff_t oldsize = inode->i_size;
831 
832 	i_size_write(inode, newsize);
833 	if (newsize > oldsize)
834 		pagecache_isize_extended(inode, oldsize, newsize);
835 	truncate_pagecache(inode, newsize);
836 }
837 EXPORT_SYMBOL(truncate_setsize);
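
/*
 * Illustrative sketch, not part of this file: a simple ->setattr using
 * truncate_setsize() for ATTR_SIZE; filesystem-specific block truncation
 * would follow the truncate_setsize() call.  "example_setattr" is a
 * hypothetical name.
 *
 *	static int example_setattr(struct dentry *dentry, struct iattr *attr)
 *	{
 *		struct inode *inode = d_inode(dentry);
 *		int error;
 *
 *		error = setattr_prepare(dentry, attr);
 *		if (error)
 *			return error;
 *
 *		if ((attr->ia_valid & ATTR_SIZE) &&
 *		    attr->ia_size != i_size_read(inode))
 *			truncate_setsize(inode, attr->ia_size);
 *
 *		setattr_copy(inode, attr);
 *		mark_inode_dirty(inode);
 *		return 0;
 *	}
 */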
838 
839 /**
840  * pagecache_isize_extended - update pagecache after extension of i_size
841  * @inode:	inode for which i_size was extended
842  * @from:	original inode size
843  * @to:		new inode size
844  *
845  * Handle extension of inode size either caused by extending truncate or by
846  * a write starting after the current i_size. We mark the page straddling the
847  * current i_size RO so that page_mkwrite() is called on the nearest write
848  * access to the page.  This way the filesystem can be sure that page_mkwrite()
849  * is called on the page before a user writes to the page via mmap after the
850  * i_size has been changed.
851  *
852  * The function must be called after i_size is updated so that a page fault
853  * coming after we unlock the page will already see the new i_size.
854  * The function must be called while we still hold i_mutex - this not only
855  * makes sure i_size is stable, but also ensures that userspace cannot observe
856  * the new i_size value before we are prepared to store mmap writes at the new inode size.
857  */
858 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
859 {
860 	int bsize = i_blocksize(inode);
861 	loff_t rounded_from;
862 	struct page *page;
863 	pgoff_t index;
864 
865 	WARN_ON(to > inode->i_size);
866 
867 	if (from >= to || bsize == PAGE_SIZE)
868 		return;
869 	/* Page straddling @from will not have any hole block created? */
870 	rounded_from = round_up(from, bsize);
871 	if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
872 		return;
873 
874 	index = from >> PAGE_SHIFT;
875 	page = find_lock_page(inode->i_mapping, index);
876 	/* Page not cached? Nothing to do */
877 	if (!page)
878 		return;
879 	/*
880 	 * See clear_page_dirty_for_io() for details why set_page_dirty()
881 	 * is needed.
882 	 */
883 	if (page_mkclean(page))
884 		set_page_dirty(page);
885 	unlock_page(page);
886 	put_page(page);
887 }
888 EXPORT_SYMBOL(pagecache_isize_extended);
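
/*
 * Illustrative sketch, not part of this file: an extending write path in a
 * filesystem whose block size is smaller than PAGE_SIZE would update i_size
 * first and then let this helper write-protect the page straddling the old
 * size ("new_size" and "old_size" are hypothetical variables):
 *
 *	loff_t old_size = i_size_read(inode);
 *
 *	if (new_size > old_size) {
 *		i_size_write(inode, new_size);
 *		pagecache_isize_extended(inode, old_size, new_size);
 *	}
 */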
889 
890 /**
891  * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
892  * @inode: inode
893  * @lstart: offset of beginning of hole
894  * @lend: offset of last byte of hole
895  *
896  * This function should typically be called before the filesystem
897  * releases resources associated with the freed range (eg. deallocates
898  * blocks). This way, pagecache will always stay logically coherent
899  * with on-disk format, and the filesystem would not have to deal with
900  * situations such as writepage being called for a page that has already
901  * had its underlying blocks deallocated.
902  */
903 void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
904 {
905 	struct address_space *mapping = inode->i_mapping;
906 	loff_t unmap_start = round_up(lstart, PAGE_SIZE);
907 	loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
908 	/*
909 	 * This rounding is currently just for example: unmap_mapping_range
910 	 * expands its hole outwards, whereas we want it to contract the hole
911 	 * inwards.  However, existing callers of truncate_pagecache_range are
912 	 * doing their own page rounding first.  Note that unmap_mapping_range
913 	 * allows holelen 0 for all, and we allow lend -1 for end of file.
914 	 */
915 
916 	/*
917 	 * Unlike in truncate_pagecache, unmap_mapping_range is called only
918 	 * once (before truncating pagecache), and without "even_cows" flag:
919 	 * hole-punching should not remove private COWed pages from the hole.
920 	 */
921 	if ((u64)unmap_end > (u64)unmap_start)
922 		unmap_mapping_range(mapping, unmap_start,
923 				    1 + unmap_end - unmap_start, 0);
924 	truncate_inode_pages_range(mapping, lstart, lend);
925 }
926 EXPORT_SYMBOL(truncate_pagecache_range);
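
/*
 * Illustrative sketch, not part of this file: a FALLOC_FL_PUNCH_HOLE
 * implementation would typically remove the affected pagecache first and
 * only afterwards free the underlying blocks ("offset" and "len" are
 * hypothetical variables):
 *
 *	truncate_pagecache_range(inode, offset, offset + len - 1);
 *
 * followed by the filesystem-specific block deallocation for that range.
 */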
927