xref: /openbmc/linux/mm/swap.c (revision 6abeae2a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  linux/mm/swap.c
4  *
5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6  */
7 
8 /*
9  * This file contains the default values for the operation of the
10  * Linux VM subsystem. Fine-tuning documentation can be found in
11  * Documentation/admin-guide/sysctl/vm.rst.
12  * Started 18.12.91
13  * Swap aging added 23.2.95, Stephen Tweedie.
14  * Buffermem limits added 12.3.98, Rik van Riel.
15  */
16 
17 #include <linux/mm.h>
18 #include <linux/sched.h>
19 #include <linux/kernel_stat.h>
20 #include <linux/swap.h>
21 #include <linux/mman.h>
22 #include <linux/pagemap.h>
23 #include <linux/pagevec.h>
24 #include <linux/init.h>
25 #include <linux/export.h>
26 #include <linux/mm_inline.h>
27 #include <linux/percpu_counter.h>
28 #include <linux/memremap.h>
29 #include <linux/percpu.h>
30 #include <linux/cpu.h>
31 #include <linux/notifier.h>
32 #include <linux/backing-dev.h>
33 #include <linux/memcontrol.h>
34 #include <linux/gfp.h>
35 #include <linux/uio.h>
36 #include <linux/hugetlb.h>
37 #include <linux/page_idle.h>
38 #include <linux/local_lock.h>
39 
40 #include "internal.h"
41 
42 #define CREATE_TRACE_POINTS
43 #include <trace/events/pagemap.h>
44 
/*
 * Number of pages the VM tries to swap or page in/out together as one
 * cluster.
 */
int page_cluster;
47 
48 /* Protecting only lru_rotate.pvec which requires disabling interrupts */
49 struct lru_rotate {
50 	local_lock_t lock;
51 	struct pagevec pvec;
52 };
53 static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
54 	.lock = INIT_LOCAL_LOCK(lock),
55 };
56 
57 /*
58  * The following struct pagevec are grouped together because they are protected
59  * by disabling preemption (and interrupts remain enabled).
60  */
61 struct lru_pvecs {
62 	local_lock_t lock;
63 	struct pagevec lru_add;
64 	struct pagevec lru_deactivate_file;
65 	struct pagevec lru_deactivate;
66 	struct pagevec lru_lazyfree;
67 #ifdef CONFIG_SMP
68 	struct pagevec activate_page;
69 #endif
70 };
71 static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
72 	.lock = INIT_LOCAL_LOCK(lock),
73 };
74 
/*
 * Detach @page from its LRU list (if it is on one) and clear its waiters
 * flag, in preparation for freeing it.
 *
 * VM activity almost never comes through here because pages are normally
 * freed via pagevecs; networking, however, does use this path.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long irq_flags;
		struct lruvec *lruvec;

		lruvec = lock_page_lruvec_irqsave(page, &irq_flags);
		/* Re-check PG_lru now that the lruvec lock is held. */
		VM_BUG_ON_PAGE(!PageLRU(page), page);
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		unlock_page_lruvec_irqrestore(lruvec, irq_flags);
	}

	__ClearPageWaiters(page);
}
93 
/*
 * Release a non-compound page: take it off the LRU, uncharge it from its
 * memory cgroup, then give it back through free_unref_page().
 */
static void __put_single_page(struct page *page)
{
	/* LRU removal must come first; the page is freed by the last call. */
	__page_cache_release(page);
	mem_cgroup_uncharge(page);
	free_unref_page(page);
}
100 
101 static void __put_compound_page(struct page *page)
102 {
103 	/*
104 	 * __page_cache_release() is supposed to be called for thp, not for
105 	 * hugetlb. This is because hugetlb page does never have PageLRU set
106 	 * (it's never listed to any LRU lists) and no memcg routines should
107 	 * be called for hugetlb (it has a separate hugetlb_cgroup.)
108 	 */
109 	if (!PageHuge(page))
110 		__page_cache_release(page);
111 	destroy_compound_page(page);
112 }
113 
114 void __put_page(struct page *page)
115 {
116 	if (is_zone_device_page(page)) {
117 		put_dev_pagemap(page->pgmap);
118 
119 		/*
120 		 * The page belongs to the device that created pgmap. Do
121 		 * not return it to page allocator.
122 		 */
123 		return;
124 	}
125 
126 	if (unlikely(PageCompound(page)))
127 		__put_compound_page(page);
128 	else
129 		__put_single_page(page);
130 }
131 EXPORT_SYMBOL(__put_page);
132 
133 /**
134  * put_pages_list() - release a list of pages
135  * @pages: list of pages threaded on page->lru
136  *
137  * Release a list of pages which are strung together on page.lru.  Currently
138  * used by read_cache_pages() and related error recovery code.
139  */
140 void put_pages_list(struct list_head *pages)
141 {
142 	while (!list_empty(pages)) {
143 		struct page *victim;
144 
145 		victim = lru_to_page(pages);
146 		list_del(&victim->lru);
147 		put_page(victim);
148 	}
149 }
150 EXPORT_SYMBOL(put_pages_list);
151 
152 /*
153  * get_kernel_pages() - pin kernel pages in memory
154  * @kiov:	An array of struct kvec structures
155  * @nr_segs:	number of segments to pin
156  * @write:	pinning for read/write, currently ignored
157  * @pages:	array that receives pointers to the pages pinned.
158  *		Should be at least nr_segs long.
159  *
160  * Returns number of pages pinned. This may be fewer than the number
161  * requested. If nr_pages is 0 or negative, returns 0. If no pages
162  * were pinned, returns -errno. Each page returned must be released
163  * with a put_page() call when it is finished with.
164  */
165 int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
166 		struct page **pages)
167 {
168 	int seg;
169 
170 	for (seg = 0; seg < nr_segs; seg++) {
171 		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
172 			return seg;
173 
174 		pages[seg] = kmap_to_page(kiov[seg].iov_base);
175 		get_page(pages[seg]);
176 	}
177 
178 	return seg;
179 }
180 EXPORT_SYMBOL_GPL(get_kernel_pages);
181 
182 /*
183  * get_kernel_page() - pin a kernel page in memory
184  * @start:	starting kernel address
185  * @write:	pinning for read/write, currently ignored
186  * @pages:	array that receives pointer to the page pinned.
187  *		Must be at least nr_segs long.
188  *
189  * Returns 1 if page is pinned. If the page was not pinned, returns
190  * -errno. The page returned must be released with a put_page() call
191  * when it is finished with.
192  */
193 int get_kernel_page(unsigned long start, int write, struct page **pages)
194 {
195 	const struct kvec kiov = {
196 		.iov_base = (void *)start,
197 		.iov_len = PAGE_SIZE
198 	};
199 
200 	return get_kernel_pages(&kiov, 1, write, pages);
201 }
202 EXPORT_SYMBOL_GPL(get_kernel_page);
203 
204 static void pagevec_lru_move_fn(struct pagevec *pvec,
205 	void (*move_fn)(struct page *page, struct lruvec *lruvec))
206 {
207 	int i;
208 	struct lruvec *lruvec = NULL;
209 	unsigned long flags = 0;
210 
211 	for (i = 0; i < pagevec_count(pvec); i++) {
212 		struct page *page = pvec->pages[i];
213 
214 		/* block memcg migration during page moving between lru */
215 		if (!TestClearPageLRU(page))
216 			continue;
217 
218 		lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
219 		(*move_fn)(page, lruvec);
220 
221 		SetPageLRU(page);
222 	}
223 	if (lruvec)
224 		unlock_page_lruvec_irqrestore(lruvec, flags);
225 	release_pages(pvec->pages, pvec->nr);
226 	pagevec_reinit(pvec);
227 }
228 
229 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
230 {
231 	if (!PageUnevictable(page)) {
232 		del_page_from_lru_list(page, lruvec, page_lru(page));
233 		ClearPageActive(page);
234 		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
235 		__count_vm_events(PGROTATED, thp_nr_pages(page));
236 	}
237 }
238 
239 /*
240  * Writeback is about to end against a page which has been marked for immediate
241  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
242  * inactive list.
243  *
244  * rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
245  */
246 void rotate_reclaimable_page(struct page *page)
247 {
248 	if (!PageLocked(page) && !PageDirty(page) &&
249 	    !PageUnevictable(page) && PageLRU(page)) {
250 		struct pagevec *pvec;
251 		unsigned long flags;
252 
253 		get_page(page);
254 		local_lock_irqsave(&lru_rotate.lock, flags);
255 		pvec = this_cpu_ptr(&lru_rotate.pvec);
256 		if (!pagevec_add(pvec, page) || PageCompound(page))
257 			pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
258 		local_unlock_irqrestore(&lru_rotate.lock, flags);
259 	}
260 }
261 
262 void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
263 {
264 	do {
265 		unsigned long lrusize;
266 
267 		/*
268 		 * Hold lruvec->lru_lock is safe here, since
269 		 * 1) The pinned lruvec in reclaim, or
270 		 * 2) From a pre-LRU page during refault (which also holds the
271 		 *    rcu lock, so would be safe even if the page was on the LRU
272 		 *    and could move simultaneously to a new lruvec).
273 		 */
274 		spin_lock_irq(&lruvec->lru_lock);
275 		/* Record cost event */
276 		if (file)
277 			lruvec->file_cost += nr_pages;
278 		else
279 			lruvec->anon_cost += nr_pages;
280 
281 		/*
282 		 * Decay previous events
283 		 *
284 		 * Because workloads change over time (and to avoid
285 		 * overflow) we keep these statistics as a floating
286 		 * average, which ends up weighing recent refaults
287 		 * more than old ones.
288 		 */
289 		lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
290 			  lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
291 			  lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
292 			  lruvec_page_state(lruvec, NR_ACTIVE_FILE);
293 
294 		if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
295 			lruvec->file_cost /= 2;
296 			lruvec->anon_cost /= 2;
297 		}
298 		spin_unlock_irq(&lruvec->lru_lock);
299 	} while ((lruvec = parent_lruvec(lruvec)));
300 }
301 
/*
 * Convenience wrapper around lru_note_cost(): looks up the lruvec of @page
 * and charges it with the page's LRU type and its size in base pages.
 */
void lru_note_cost_page(struct page *page)
{
	struct lruvec *lruvec;

	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	lru_note_cost(lruvec, page_is_file_lru(page), thp_nr_pages(page));
}
307 
308 static void __activate_page(struct page *page, struct lruvec *lruvec)
309 {
310 	if (!PageActive(page) && !PageUnevictable(page)) {
311 		int lru = page_lru_base_type(page);
312 		int nr_pages = thp_nr_pages(page);
313 
314 		del_page_from_lru_list(page, lruvec, lru);
315 		SetPageActive(page);
316 		lru += LRU_ACTIVE;
317 		add_page_to_lru_list(page, lruvec, lru);
318 		trace_mm_lru_activate(page);
319 
320 		__count_vm_events(PGACTIVATE, nr_pages);
321 		__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
322 				     nr_pages);
323 	}
324 }
325 
326 #ifdef CONFIG_SMP
327 static void activate_page_drain(int cpu)
328 {
329 	struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
330 
331 	if (pagevec_count(pvec))
332 		pagevec_lru_move_fn(pvec, __activate_page);
333 }
334 
335 static bool need_activate_page_drain(int cpu)
336 {
337 	return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
338 }
339 
340 static void activate_page(struct page *page)
341 {
342 	page = compound_head(page);
343 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
344 		struct pagevec *pvec;
345 
346 		local_lock(&lru_pvecs.lock);
347 		pvec = this_cpu_ptr(&lru_pvecs.activate_page);
348 		get_page(page);
349 		if (!pagevec_add(pvec, page) || PageCompound(page))
350 			pagevec_lru_move_fn(pvec, __activate_page);
351 		local_unlock(&lru_pvecs.lock);
352 	}
353 }
354 
355 #else
/* !SMP: activations are not batched in a pagevec, so nothing to drain. */
static inline void activate_page_drain(int cpu)
{
}
359 
/*
 * !SMP variant: activate the head page of @page directly under the lruvec
 * lock.  PG_lru is cleared for the duration of the move and set again
 * afterwards; pages that did not have it set are skipped.
 */
static void activate_page(struct page *page)
{
	struct lruvec *lruvec;

	page = compound_head(page);
	if (!TestClearPageLRU(page))
		return;

	lruvec = lock_page_lruvec_irq(page);
	__activate_page(page, lruvec);
	unlock_page_lruvec_irq(lruvec);
	SetPageLRU(page);
}
372 #endif
373 
374 static void __lru_cache_activate_page(struct page *page)
375 {
376 	struct pagevec *pvec;
377 	int i;
378 
379 	local_lock(&lru_pvecs.lock);
380 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
381 
382 	/*
383 	 * Search backwards on the optimistic assumption that the page being
384 	 * activated has just been added to this pagevec. Note that only
385 	 * the local pagevec is examined as a !PageLRU page could be in the
386 	 * process of being released, reclaimed, migrated or on a remote
387 	 * pagevec that is currently being drained. Furthermore, marking
388 	 * a remote pagevec's page PageActive potentially hits a race where
389 	 * a page is marked PageActive just after it is added to the inactive
390 	 * list causing accounting errors and BUG_ON checks to trigger.
391 	 */
392 	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
393 		struct page *pagevec_page = pvec->pages[i];
394 
395 		if (pagevec_page == page) {
396 			SetPageActive(page);
397 			break;
398 		}
399 	}
400 
401 	local_unlock(&lru_pvecs.lock);
402 }
403 
/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * The operation is applied to the head page of @page.
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
	page = compound_head(page);

	if (!PageReferenced(page)) {
		SetPageReferenced(page);
	} else if (!PageUnevictable(page) && !PageActive(page)) {
		/*
		 * Already referenced, still inactive.  (Unevictable pages are
		 * excluded: they sit on the "LRU_UNEVICTABLE" list, which is
		 * never rotated or maintained, so marking an unevictable page
		 * accessed has no effect.)
		 *
		 * A page on the LRU is queued for activation through
		 * lru_pvecs.activate_page.  Otherwise the page is assumed to
		 * be on a pagevec: it is marked active there and moves to the
		 * active LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
		workingset_activation(page);
	}

	if (page_is_idle(page))
		clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);
444 
445 /**
446  * lru_cache_add - add a page to a page list
447  * @page: the page to be added to the LRU.
448  *
449  * Queue the page for addition to the LRU via pagevec. The decision on whether
450  * to add the page to the [in]active [file|anon] list is deferred until the
451  * pagevec is drained. This gives a chance for the caller of lru_cache_add()
452  * have the page added to the active list using mark_page_accessed().
453  */
454 void lru_cache_add(struct page *page)
455 {
456 	struct pagevec *pvec;
457 
458 	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
459 	VM_BUG_ON_PAGE(PageLRU(page), page);
460 
461 	get_page(page);
462 	local_lock(&lru_pvecs.lock);
463 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
464 	if (!pagevec_add(pvec, page) || PageCompound(page))
465 		__pagevec_lru_add(pvec);
466 	local_unlock(&lru_pvecs.lock);
467 }
468 EXPORT_SYMBOL(lru_cache_add);
469 
470 /**
471  * lru_cache_add_inactive_or_unevictable
472  * @page:  the page to be added to LRU
473  * @vma:   vma in which page is mapped for determining reclaimability
474  *
475  * Place @page on the inactive or unevictable LRU list, depending on its
476  * evictability.
477  */
478 void lru_cache_add_inactive_or_unevictable(struct page *page,
479 					 struct vm_area_struct *vma)
480 {
481 	bool unevictable;
482 
483 	VM_BUG_ON_PAGE(PageLRU(page), page);
484 
485 	unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
486 	if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
487 		int nr_pages = thp_nr_pages(page);
488 		/*
489 		 * We use the irq-unsafe __mod_zone_page_stat because this
490 		 * counter is not modified from interrupt context, and the pte
491 		 * lock is held(spinlock), which implies preemption disabled.
492 		 */
493 		__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
494 		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
495 	}
496 	lru_cache_add(page);
497 }
498 
499 /*
500  * If the page can not be invalidated, it is moved to the
501  * inactive list to speed up its reclaim.  It is moved to the
502  * head of the list, rather than the tail, to give the flusher
503  * threads some time to write it out, as this is much more
504  * effective than the single-page writeout from reclaim.
505  *
506  * If the page isn't page_mapped and dirty/writeback, the page
507  * could reclaim asap using PG_reclaim.
508  *
509  * 1. active, mapped page -> none
510  * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
511  * 3. inactive, mapped page -> none
512  * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
513  * 5. inactive, clean -> inactive, tail
514  * 6. Others -> none
515  *
516  * In 4, why it moves inactive's head, the VM expects the page would
517  * be write it out by flusher threads as this is much more effective
518  * than the single-page writeout from reclaim.
519  */
520 static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
521 {
522 	int lru;
523 	bool active;
524 	int nr_pages = thp_nr_pages(page);
525 
526 	if (PageUnevictable(page))
527 		return;
528 
529 	/* Some processes are using the page */
530 	if (page_mapped(page))
531 		return;
532 
533 	active = PageActive(page);
534 	lru = page_lru_base_type(page);
535 
536 	del_page_from_lru_list(page, lruvec, lru + active);
537 	ClearPageActive(page);
538 	ClearPageReferenced(page);
539 
540 	if (PageWriteback(page) || PageDirty(page)) {
541 		/*
542 		 * PG_reclaim could be raced with end_page_writeback
543 		 * It can make readahead confusing.  But race window
544 		 * is _really_ small and  it's non-critical problem.
545 		 */
546 		add_page_to_lru_list(page, lruvec, lru);
547 		SetPageReclaim(page);
548 	} else {
549 		/*
550 		 * The page's writeback ends up during pagevec
551 		 * We moves tha page into tail of inactive.
552 		 */
553 		add_page_to_lru_list_tail(page, lruvec, lru);
554 		__count_vm_events(PGROTATED, nr_pages);
555 	}
556 
557 	if (active) {
558 		__count_vm_events(PGDEACTIVATE, nr_pages);
559 		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
560 				     nr_pages);
561 	}
562 }
563 
564 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
565 {
566 	if (PageActive(page) && !PageUnevictable(page)) {
567 		int lru = page_lru_base_type(page);
568 		int nr_pages = thp_nr_pages(page);
569 
570 		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
571 		ClearPageActive(page);
572 		ClearPageReferenced(page);
573 		add_page_to_lru_list(page, lruvec, lru);
574 
575 		__count_vm_events(PGDEACTIVATE, nr_pages);
576 		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
577 				     nr_pages);
578 	}
579 }
580 
581 static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
582 {
583 	if (PageAnon(page) && PageSwapBacked(page) &&
584 	    !PageSwapCache(page) && !PageUnevictable(page)) {
585 		bool active = PageActive(page);
586 		int nr_pages = thp_nr_pages(page);
587 
588 		del_page_from_lru_list(page, lruvec,
589 				       LRU_INACTIVE_ANON + active);
590 		ClearPageActive(page);
591 		ClearPageReferenced(page);
592 		/*
593 		 * Lazyfree pages are clean anonymous pages.  They have
594 		 * PG_swapbacked flag cleared, to distinguish them from normal
595 		 * anonymous pages
596 		 */
597 		ClearPageSwapBacked(page);
598 		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
599 
600 		__count_vm_events(PGLAZYFREE, nr_pages);
601 		__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
602 				     nr_pages);
603 	}
604 }
605 
606 /*
607  * Drain pages out of the cpu's pagevecs.
608  * Either "cpu" is the current CPU, and preemption has already been
609  * disabled; or "cpu" is being hot-unplugged, and is already dead.
610  */
611 void lru_add_drain_cpu(int cpu)
612 {
613 	struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
614 
615 	if (pagevec_count(pvec))
616 		__pagevec_lru_add(pvec);
617 
618 	pvec = &per_cpu(lru_rotate.pvec, cpu);
619 	/* Disabling interrupts below acts as a compiler barrier. */
620 	if (data_race(pagevec_count(pvec))) {
621 		unsigned long flags;
622 
623 		/* No harm done if a racing interrupt already did this */
624 		local_lock_irqsave(&lru_rotate.lock, flags);
625 		pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
626 		local_unlock_irqrestore(&lru_rotate.lock, flags);
627 	}
628 
629 	pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
630 	if (pagevec_count(pvec))
631 		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
632 
633 	pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
634 	if (pagevec_count(pvec))
635 		pagevec_lru_move_fn(pvec, lru_deactivate_fn);
636 
637 	pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
638 	if (pagevec_count(pvec))
639 		pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
640 
641 	activate_page_drain(cpu);
642 }
643 
644 /**
645  * deactivate_file_page - forcefully deactivate a file page
646  * @page: page to deactivate
647  *
648  * This function hints the VM that @page is a good reclaim candidate,
649  * for example if its invalidation fails due to the page being dirty
650  * or under writeback.
651  */
652 void deactivate_file_page(struct page *page)
653 {
654 	/*
655 	 * In a workload with many unevictable page such as mprotect,
656 	 * unevictable page deactivation for accelerating reclaim is pointless.
657 	 */
658 	if (PageUnevictable(page))
659 		return;
660 
661 	if (likely(get_page_unless_zero(page))) {
662 		struct pagevec *pvec;
663 
664 		local_lock(&lru_pvecs.lock);
665 		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
666 
667 		if (!pagevec_add(pvec, page) || PageCompound(page))
668 			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
669 		local_unlock(&lru_pvecs.lock);
670 	}
671 }
672 
673 /*
674  * deactivate_page - deactivate a page
675  * @page: page to deactivate
676  *
677  * deactivate_page() moves @page to the inactive list if @page was on the active
678  * list and was not an unevictable page.  This is done to accelerate the reclaim
679  * of @page.
680  */
681 void deactivate_page(struct page *page)
682 {
683 	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
684 		struct pagevec *pvec;
685 
686 		local_lock(&lru_pvecs.lock);
687 		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
688 		get_page(page);
689 		if (!pagevec_add(pvec, page) || PageCompound(page))
690 			pagevec_lru_move_fn(pvec, lru_deactivate_fn);
691 		local_unlock(&lru_pvecs.lock);
692 	}
693 }
694 
695 /**
696  * mark_page_lazyfree - make an anon page lazyfree
697  * @page: page to deactivate
698  *
699  * mark_page_lazyfree() moves @page to the inactive file list.
700  * This is done to accelerate the reclaim of @page.
701  */
702 void mark_page_lazyfree(struct page *page)
703 {
704 	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
705 	    !PageSwapCache(page) && !PageUnevictable(page)) {
706 		struct pagevec *pvec;
707 
708 		local_lock(&lru_pvecs.lock);
709 		pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
710 		get_page(page);
711 		if (!pagevec_add(pvec, page) || PageCompound(page))
712 			pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
713 		local_unlock(&lru_pvecs.lock);
714 	}
715 }
716 
717 void lru_add_drain(void)
718 {
719 	local_lock(&lru_pvecs.lock);
720 	lru_add_drain_cpu(smp_processor_id());
721 	local_unlock(&lru_pvecs.lock);
722 }
723 
724 void lru_add_drain_cpu_zone(struct zone *zone)
725 {
726 	local_lock(&lru_pvecs.lock);
727 	lru_add_drain_cpu(smp_processor_id());
728 	drain_local_pages(zone);
729 	local_unlock(&lru_pvecs.lock);
730 }
731 
732 #ifdef CONFIG_SMP
733 
734 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
735 
736 static void lru_add_drain_per_cpu(struct work_struct *dummy)
737 {
738 	lru_add_drain();
739 }
740 
741 /*
742  * Doesn't need any cpu hotplug locking because we do rely on per-cpu
743  * kworkers being shut down before our page_alloc_cpu_dead callback is
744  * executed on the offlined cpu.
745  * Calling this function with cpu hotplug locks held can actually lead
746  * to obscure indirect dependencies via WQ context.
747  */
748 void lru_add_drain_all(void)
749 {
750 	/*
751 	 * lru_drain_gen - Global pages generation number
752 	 *
753 	 * (A) Definition: global lru_drain_gen = x implies that all generations
754 	 *     0 < n <= x are already *scheduled* for draining.
755 	 *
756 	 * This is an optimization for the highly-contended use case where a
757 	 * user space workload keeps constantly generating a flow of pages for
758 	 * each CPU.
759 	 */
760 	static unsigned int lru_drain_gen;
761 	static struct cpumask has_work;
762 	static DEFINE_MUTEX(lock);
763 	unsigned cpu, this_gen;
764 
765 	/*
766 	 * Make sure nobody triggers this path before mm_percpu_wq is fully
767 	 * initialized.
768 	 */
769 	if (WARN_ON(!mm_percpu_wq))
770 		return;
771 
772 	/*
773 	 * Guarantee pagevec counter stores visible by this CPU are visible to
774 	 * other CPUs before loading the current drain generation.
775 	 */
776 	smp_mb();
777 
778 	/*
779 	 * (B) Locally cache global LRU draining generation number
780 	 *
781 	 * The read barrier ensures that the counter is loaded before the mutex
782 	 * is taken. It pairs with smp_mb() inside the mutex critical section
783 	 * at (D).
784 	 */
785 	this_gen = smp_load_acquire(&lru_drain_gen);
786 
787 	mutex_lock(&lock);
788 
789 	/*
790 	 * (C) Exit the draining operation if a newer generation, from another
791 	 * lru_add_drain_all(), was already scheduled for draining. Check (A).
792 	 */
793 	if (unlikely(this_gen != lru_drain_gen))
794 		goto done;
795 
796 	/*
797 	 * (D) Increment global generation number
798 	 *
799 	 * Pairs with smp_load_acquire() at (B), outside of the critical
800 	 * section. Use a full memory barrier to guarantee that the new global
801 	 * drain generation number is stored before loading pagevec counters.
802 	 *
803 	 * This pairing must be done here, before the for_each_online_cpu loop
804 	 * below which drains the page vectors.
805 	 *
806 	 * Let x, y, and z represent some system CPU numbers, where x < y < z.
807 	 * Assume CPU #z is is in the middle of the for_each_online_cpu loop
808 	 * below and has already reached CPU #y's per-cpu data. CPU #x comes
809 	 * along, adds some pages to its per-cpu vectors, then calls
810 	 * lru_add_drain_all().
811 	 *
812 	 * If the paired barrier is done at any later step, e.g. after the
813 	 * loop, CPU #x will just exit at (C) and miss flushing out all of its
814 	 * added pages.
815 	 */
816 	WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
817 	smp_mb();
818 
819 	cpumask_clear(&has_work);
820 	for_each_online_cpu(cpu) {
821 		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
822 
823 		if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
824 		    data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
825 		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
826 		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
827 		    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
828 		    need_activate_page_drain(cpu)) {
829 			INIT_WORK(work, lru_add_drain_per_cpu);
830 			queue_work_on(cpu, mm_percpu_wq, work);
831 			__cpumask_set_cpu(cpu, &has_work);
832 		}
833 	}
834 
835 	for_each_cpu(cpu, &has_work)
836 		flush_work(&per_cpu(lru_add_drain_work, cpu));
837 
838 done:
839 	mutex_unlock(&lock);
840 }
841 #else
/* !SMP: "all CPUs" is just the current one, so drain it directly. */
void lru_add_drain_all(void)
{
	lru_add_drain();
}
846 #endif /* CONFIG_SMP */
847 
848 /**
849  * release_pages - batched put_page()
850  * @pages: array of pages to release
851  * @nr: number of pages
852  *
853  * Decrement the reference count on all the pages in @pages.  If it
854  * fell to zero, remove the page from the LRU and free it.
855  */
856 void release_pages(struct page **pages, int nr)
857 {
858 	int i;
859 	LIST_HEAD(pages_to_free);
860 	struct lruvec *lruvec = NULL;
861 	unsigned long flags;
862 	unsigned int lock_batch;
863 
864 	for (i = 0; i < nr; i++) {
865 		struct page *page = pages[i];
866 
867 		/*
868 		 * Make sure the IRQ-safe lock-holding time does not get
869 		 * excessive with a continuous string of pages from the
870 		 * same lruvec. The lock is held only if lruvec != NULL.
871 		 */
872 		if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
873 			unlock_page_lruvec_irqrestore(lruvec, flags);
874 			lruvec = NULL;
875 		}
876 
877 		page = compound_head(page);
878 		if (is_huge_zero_page(page))
879 			continue;
880 
881 		if (is_zone_device_page(page)) {
882 			if (lruvec) {
883 				unlock_page_lruvec_irqrestore(lruvec, flags);
884 				lruvec = NULL;
885 			}
886 			/*
887 			 * ZONE_DEVICE pages that return 'false' from
888 			 * page_is_devmap_managed() do not require special
889 			 * processing, and instead, expect a call to
890 			 * put_page_testzero().
891 			 */
892 			if (page_is_devmap_managed(page)) {
893 				put_devmap_managed_page(page);
894 				continue;
895 			}
896 			if (put_page_testzero(page))
897 				put_dev_pagemap(page->pgmap);
898 			continue;
899 		}
900 
901 		if (!put_page_testzero(page))
902 			continue;
903 
904 		if (PageCompound(page)) {
905 			if (lruvec) {
906 				unlock_page_lruvec_irqrestore(lruvec, flags);
907 				lruvec = NULL;
908 			}
909 			__put_compound_page(page);
910 			continue;
911 		}
912 
913 		if (PageLRU(page)) {
914 			struct lruvec *prev_lruvec = lruvec;
915 
916 			lruvec = relock_page_lruvec_irqsave(page, lruvec,
917 									&flags);
918 			if (prev_lruvec != lruvec)
919 				lock_batch = 0;
920 
921 			VM_BUG_ON_PAGE(!PageLRU(page), page);
922 			__ClearPageLRU(page);
923 			del_page_from_lru_list(page, lruvec, page_off_lru(page));
924 		}
925 
926 		__ClearPageWaiters(page);
927 
928 		list_add(&page->lru, &pages_to_free);
929 	}
930 	if (lruvec)
931 		unlock_page_lruvec_irqrestore(lruvec, flags);
932 
933 	mem_cgroup_uncharge_list(&pages_to_free);
934 	free_unref_page_list(&pages_to_free);
935 }
936 EXPORT_SYMBOL(release_pages);
937 
938 /*
939  * The pages which we're about to release may be in the deferred lru-addition
940  * queues.  That would prevent them from really being freed right now.  That's
941  * OK from a correctness point of view but is inefficient - those pages may be
942  * cache-warm and we want to give them back to the page allocator ASAP.
943  *
944  * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
945  * and __pagevec_lru_add_active() call release_pages() directly to avoid
946  * mutual recursion.
947  */
948 void __pagevec_release(struct pagevec *pvec)
949 {
950 	if (!pvec->percpu_pvec_drained) {
951 		lru_add_drain();
952 		pvec->percpu_pvec_drained = true;
953 	}
954 	release_pages(pvec->pages, pagevec_count(pvec));
955 	pagevec_reinit(pvec);
956 }
957 EXPORT_SYMBOL(__pagevec_release);
958 
959 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
960 {
961 	enum lru_list lru;
962 	int was_unevictable = TestClearPageUnevictable(page);
963 	int nr_pages = thp_nr_pages(page);
964 
965 	VM_BUG_ON_PAGE(PageLRU(page), page);
966 
967 	/*
968 	 * Page becomes evictable in two ways:
969 	 * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
970 	 * 2) Before acquiring LRU lock to put the page to correct LRU and then
971 	 *   a) do PageLRU check with lock [check_move_unevictable_pages]
972 	 *   b) do PageLRU check before lock [clear_page_mlock]
973 	 *
974 	 * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
975 	 * following strict ordering:
976 	 *
977 	 * #0: __pagevec_lru_add_fn		#1: clear_page_mlock
978 	 *
979 	 * SetPageLRU()				TestClearPageMlocked()
980 	 * smp_mb() // explicit ordering	// above provides strict
981 	 *					// ordering
982 	 * PageMlocked()			PageLRU()
983 	 *
984 	 *
985 	 * if '#1' does not observe setting of PG_lru by '#0' and fails
986 	 * isolation, the explicit barrier will make sure that page_evictable
987 	 * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
988 	 * can be reordered after PageMlocked check and can make '#1' to fail
989 	 * the isolation of the page whose Mlocked bit is cleared (#0 is also
990 	 * looking at the same page) and the evictable page will be stranded
991 	 * in an unevictable LRU.
992 	 */
993 	SetPageLRU(page);
994 	smp_mb__after_atomic();
995 
996 	if (page_evictable(page)) {
997 		lru = page_lru(page);
998 		if (was_unevictable)
999 			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
1000 	} else {
1001 		lru = LRU_UNEVICTABLE;
1002 		ClearPageActive(page);
1003 		SetPageUnevictable(page);
1004 		if (!was_unevictable)
1005 			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
1006 	}
1007 
1008 	add_page_to_lru_list(page, lruvec, lru);
1009 	trace_mm_lru_insertion(page, lru);
1010 }
1011 
1012 /*
1013  * Add the passed pages to the LRU, then drop the caller's refcount
1014  * on them.  Reinitialises the caller's pagevec.
1015  */
1016 void __pagevec_lru_add(struct pagevec *pvec)
1017 {
1018 	int i;
1019 	struct lruvec *lruvec = NULL;
1020 	unsigned long flags = 0;
1021 
1022 	for (i = 0; i < pagevec_count(pvec); i++) {
1023 		struct page *page = pvec->pages[i];
1024 
1025 		lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
1026 		__pagevec_lru_add_fn(page, lruvec);
1027 	}
1028 	if (lruvec)
1029 		unlock_page_lruvec_irqrestore(lruvec, flags);
1030 	release_pages(pvec->pages, pvec->nr);
1031 	pagevec_reinit(pvec);
1032 }
1033 
1034 /**
1035  * pagevec_lookup_entries - gang pagecache lookup
1036  * @pvec:	Where the resulting entries are placed
1037  * @mapping:	The address_space to search
1038  * @start:	The starting entry index
1039  * @nr_entries:	The maximum number of pages
1040  * @indices:	The cache indices corresponding to the entries in @pvec
1041  *
1042  * pagevec_lookup_entries() will search for and return a group of up
1043  * to @nr_pages pages and shadow entries in the mapping.  All
1044  * entries are placed in @pvec.  pagevec_lookup_entries() takes a
1045  * reference against actual pages in @pvec.
1046  *
1047  * The search returns a group of mapping-contiguous entries with
1048  * ascending indexes.  There may be holes in the indices due to
1049  * not-present entries.
1050  *
1051  * Only one subpage of a Transparent Huge Page is returned in one call:
1052  * allowing truncate_inode_pages_range() to evict the whole THP without
1053  * cycling through a pagevec of extra references.
1054  *
1055  * pagevec_lookup_entries() returns the number of entries which were
1056  * found.
1057  */
1058 unsigned pagevec_lookup_entries(struct pagevec *pvec,
1059 				struct address_space *mapping,
1060 				pgoff_t start, unsigned nr_entries,
1061 				pgoff_t *indices)
1062 {
1063 	pvec->nr = find_get_entries(mapping, start, nr_entries,
1064 				    pvec->pages, indices);
1065 	return pagevec_count(pvec);
1066 }
1067 
1068 /**
1069  * pagevec_remove_exceptionals - pagevec exceptionals pruning
1070  * @pvec:	The pagevec to prune
1071  *
1072  * pagevec_lookup_entries() fills both pages and exceptional radix
1073  * tree entries into the pagevec.  This function prunes all
1074  * exceptionals from @pvec without leaving holes, so that it can be
1075  * passed on to page-only pagevec operations.
1076  */
1077 void pagevec_remove_exceptionals(struct pagevec *pvec)
1078 {
1079 	int i, j;
1080 
1081 	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
1082 		struct page *page = pvec->pages[i];
1083 		if (!xa_is_value(page))
1084 			pvec->pages[j++] = page;
1085 	}
1086 	pvec->nr = j;
1087 }
1088 
1089 /**
1090  * pagevec_lookup_range - gang pagecache lookup
1091  * @pvec:	Where the resulting pages are placed
1092  * @mapping:	The address_space to search
1093  * @start:	The starting page index
1094  * @end:	The final page index
1095  *
1096  * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
1097  * pages in the mapping starting from index @start and upto index @end
1098  * (inclusive).  The pages are placed in @pvec.  pagevec_lookup() takes a
1099  * reference against the pages in @pvec.
1100  *
1101  * The search returns a group of mapping-contiguous pages with ascending
1102  * indexes.  There may be holes in the indices due to not-present pages. We
1103  * also update @start to index the next page for the traversal.
1104  *
1105  * pagevec_lookup_range() returns the number of pages which were found. If this
1106  * number is smaller than PAGEVEC_SIZE, the end of specified range has been
1107  * reached.
1108  */
1109 unsigned pagevec_lookup_range(struct pagevec *pvec,
1110 		struct address_space *mapping, pgoff_t *start, pgoff_t end)
1111 {
1112 	pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
1113 					pvec->pages);
1114 	return pagevec_count(pvec);
1115 }
1116 EXPORT_SYMBOL(pagevec_lookup_range);
1117 
1118 unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
1119 		struct address_space *mapping, pgoff_t *index, pgoff_t end,
1120 		xa_mark_t tag)
1121 {
1122 	pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1123 					PAGEVEC_SIZE, pvec->pages);
1124 	return pagevec_count(pvec);
1125 }
1126 EXPORT_SYMBOL(pagevec_lookup_range_tag);
1127 
1128 /*
1129  * Perform any setup for the swap system
1130  */
1131 void __init swap_setup(void)
1132 {
1133 	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
1134 
1135 	/* Use a smaller cluster for small-memory machines */
1136 	if (megs < 16)
1137 		page_cluster = 2;
1138 	else
1139 		page_cluster = 3;
1140 	/*
1141 	 * Right now other parts of the system means that we
1142 	 * _really_ don't want to cluster much more
1143 	 */
1144 }
1145 
#ifdef CONFIG_DEV_PAGEMAP_OPS
/*
 * Drop one reference on a devmap managed page.  Warns (once) and does
 * nothing if @page is not devmap managed.
 */
void put_devmap_managed_page(struct page *page)
{
	if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
		return;

	/*
	 * devmap page refcounts are 1-based, rather than 0-based: a
	 * refcount of 1 means the page is free, and the count is stable
	 * because nobody holds a reference on the page.
	 */
	switch (page_ref_dec_return(page)) {
	case 1:
		free_devmap_managed_page(page);
		break;
	case 0:
		__put_page(page);
		break;
	default:
		/* Other references remain; nothing to do. */
		break;
	}
}
EXPORT_SYMBOL(put_devmap_managed_page);
#endif
1168