xref: /openbmc/linux/mm/vmscan.c (revision 285e74ab4f94d921a56fcf66320b3cd65e35b6bc)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *  linux/mm/vmscan.c
4   *
5   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6   *
7   *  Swap reorganised 29.12.95, Stephen Tweedie.
8   *  kswapd added: 7.1.96  sct
9   *  Removed kswapd_ctl limits, and swap out as many pages as needed
10   *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
11   *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
12   *  Multiqueue VM started 5.8.00, Rik van Riel.
13   */
14  
15  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16  
17  #include <linux/mm.h>
18  #include <linux/sched/mm.h>
19  #include <linux/module.h>
20  #include <linux/gfp.h>
21  #include <linux/kernel_stat.h>
22  #include <linux/swap.h>
23  #include <linux/pagemap.h>
24  #include <linux/init.h>
25  #include <linux/highmem.h>
26  #include <linux/vmpressure.h>
27  #include <linux/vmstat.h>
28  #include <linux/file.h>
29  #include <linux/writeback.h>
30  #include <linux/blkdev.h>
31  #include <linux/buffer_head.h>	/* for try_to_release_page(),
32  					buffer_heads_over_limit */
33  #include <linux/mm_inline.h>
34  #include <linux/backing-dev.h>
35  #include <linux/rmap.h>
36  #include <linux/topology.h>
37  #include <linux/cpu.h>
38  #include <linux/cpuset.h>
39  #include <linux/compaction.h>
40  #include <linux/notifier.h>
41  #include <linux/rwsem.h>
42  #include <linux/delay.h>
43  #include <linux/kthread.h>
44  #include <linux/freezer.h>
45  #include <linux/memcontrol.h>
46  #include <linux/delayacct.h>
47  #include <linux/sysctl.h>
48  #include <linux/oom.h>
49  #include <linux/pagevec.h>
50  #include <linux/prefetch.h>
51  #include <linux/printk.h>
52  #include <linux/dax.h>
53  #include <linux/psi.h>
54  
55  #include <asm/tlbflush.h>
56  #include <asm/div64.h>
57  
58  #include <linux/swapops.h>
59  #include <linux/balloon_compaction.h>
60  
61  #include "internal.h"
62  
63  #define CREATE_TRACE_POINTS
64  #include <trace/events/vmscan.h>
65  
66  struct scan_control {
67  	/* How many pages shrink_list() should reclaim */
68  	unsigned long nr_to_reclaim;
69  
70  	/*
71  	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
72  	 * are scanned.
73  	 */
74  	nodemask_t	*nodemask;
75  
76  	/*
77  	 * The memory cgroup that hit its limit and as a result is the
78  	 * primary target of this reclaim invocation.
79  	 */
80  	struct mem_cgroup *target_mem_cgroup;
81  
82  	/*
83  	 * Scan pressure balancing between anon and file LRUs
84  	 */
85  	unsigned long	anon_cost;
86  	unsigned long	file_cost;
87  
88  	/* Can active pages be deactivated as part of reclaim? */
89  #define DEACTIVATE_ANON 1
90  #define DEACTIVATE_FILE 2
91  	unsigned int may_deactivate:2;
92  	unsigned int force_deactivate:1;
93  	unsigned int skipped_deactivate:1;
94  
95  	/* Writepage batching in laptop mode; RECLAIM_WRITE */
96  	unsigned int may_writepage:1;
97  
98  	/* Can mapped pages be reclaimed? */
99  	unsigned int may_unmap:1;
100  
101  	/* Can pages be swapped as part of reclaim? */
102  	unsigned int may_swap:1;
103  
104  	/*
105  	 * Cgroups are not reclaimed below their configured memory.low,
106  	 * unless we threaten to OOM. If any cgroups are skipped due to
107  	 * memory.low and nothing was reclaimed, go back for memory.low.
108  	 */
109  	unsigned int memcg_low_reclaim:1;
110  	unsigned int memcg_low_skipped:1;
111  
112  	unsigned int hibernation_mode:1;
113  
114  	/* One of the zones is ready for compaction */
115  	unsigned int compaction_ready:1;
116  
117  	/* There is easily reclaimable cold cache in the current node */
118  	unsigned int cache_trim_mode:1;
119  
120  	/* The file pages on the current node are dangerously low */
121  	unsigned int file_is_tiny:1;
122  
123  	/* Allocation order */
124  	s8 order;
125  
126  	/* Scan (total_size >> priority) pages at once */
127  	s8 priority;
128  
129  	/* The highest zone to isolate pages for reclaim from */
130  	s8 reclaim_idx;
131  
132  	/* This context's GFP mask */
133  	gfp_t gfp_mask;
134  
135  	/* Incremented by the number of inactive pages that were scanned */
136  	unsigned long nr_scanned;
137  
138  	/* Number of pages freed so far during a call to shrink_zones() */
139  	unsigned long nr_reclaimed;
140  
141  	struct {
142  		unsigned int dirty;
143  		unsigned int unqueued_dirty;
144  		unsigned int congested;
145  		unsigned int writeback;
146  		unsigned int immediate;
147  		unsigned int file_taken;
148  		unsigned int taken;
149  	} nr;
150  
151  	/* for recording the reclaimed slab by now */
152  	struct reclaim_state reclaim_state;
153  };
154  
155  #ifdef ARCH_HAS_PREFETCHW
156  #define prefetchw_prev_lru_page(_page, _base, _field)			\
157  	do {								\
158  		if ((_page)->lru.prev != _base) {			\
159  			struct page *prev;				\
160  									\
161  			prev = lru_to_page(&(_page->lru));		\
162  			prefetchw(&prev->_field);			\
163  		}							\
164  	} while (0)
165  #else
166  #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
167  #endif
168  
169  /*
170   * From 0 .. 200.  Higher means more swappy.
171   */
172  int vm_swappiness = 60;
173  /*
174   * The total number of pages which are beyond the high watermark within all
175   * zones.
176   */
177  unsigned long vm_total_pages;
178  
179  static void set_task_reclaim_state(struct task_struct *task,
180  				   struct reclaim_state *rs)
181  {
182  	/* Check for an overwrite */
183  	WARN_ON_ONCE(rs && task->reclaim_state);
184  
185  	/* Check for the nulling of an already-nulled member */
186  	WARN_ON_ONCE(!rs && !task->reclaim_state);
187  
188  	task->reclaim_state = rs;
189  }
190  
191  static LIST_HEAD(shrinker_list);
192  static DECLARE_RWSEM(shrinker_rwsem);
193  
194  #ifdef CONFIG_MEMCG
195  /*
196   * We allow subsystems to populate their shrinker-related
197   * LRU lists before register_shrinker_prepared() is called
198   * for the shrinker, since we don't want to impose
199   * restrictions on their internal registration order.
200   * In this case shrink_slab_memcg() may find corresponding
201   * bit is set in the shrinkers map.
202   *
203   * This value is used by the function to detect registering
204   * shrinkers and to skip do_shrink_slab() calls for them.
205   */
206  #define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
207  
208  static DEFINE_IDR(shrinker_idr);
209  static int shrinker_nr_max;
210  
211  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
212  {
213  	int id, ret = -ENOMEM;
214  
215  	down_write(&shrinker_rwsem);
216  	/* This may call shrinker, so it must use down_read_trylock() */
217  	id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
218  	if (id < 0)
219  		goto unlock;
220  
221  	if (id >= shrinker_nr_max) {
222  		if (memcg_expand_shrinker_maps(id)) {
223  			idr_remove(&shrinker_idr, id);
224  			goto unlock;
225  		}
226  
227  		shrinker_nr_max = id + 1;
228  	}
229  	shrinker->id = id;
230  	ret = 0;
231  unlock:
232  	up_write(&shrinker_rwsem);
233  	return ret;
234  }
235  
236  static void unregister_memcg_shrinker(struct shrinker *shrinker)
237  {
238  	int id = shrinker->id;
239  
240  	BUG_ON(id < 0);
241  
242  	down_write(&shrinker_rwsem);
243  	idr_remove(&shrinker_idr, id);
244  	up_write(&shrinker_rwsem);
245  }
246  
247  static bool cgroup_reclaim(struct scan_control *sc)
248  {
249  	return sc->target_mem_cgroup;
250  }
251  
252  /**
253   * writeback_throttling_sane - is the usual dirty throttling mechanism available?
254   * @sc: scan_control in question
255   *
256   * The normal page dirty throttling mechanism in balance_dirty_pages() is
257   * completely broken with the legacy memcg and direct stalling in
258   * shrink_page_list() is used for throttling instead, which lacks all the
259   * niceties such as fairness, adaptive pausing, bandwidth proportional
260   * allocation and configurability.
261   *
262   * This function tests whether the vmscan currently in progress can assume
263   * that the normal dirty throttling mechanism is operational.
264   */
265  static bool writeback_throttling_sane(struct scan_control *sc)
266  {
267  	if (!cgroup_reclaim(sc))
268  		return true;
269  #ifdef CONFIG_CGROUP_WRITEBACK
270  	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
271  		return true;
272  #endif
273  	return false;
274  }
275  #else
276  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
277  {
278  	return 0;
279  }
280  
281  static void unregister_memcg_shrinker(struct shrinker *shrinker)
282  {
283  }
284  
285  static bool cgroup_reclaim(struct scan_control *sc)
286  {
287  	return false;
288  }
289  
290  static bool writeback_throttling_sane(struct scan_control *sc)
291  {
292  	return true;
293  }
294  #endif
295  
296  /*
297   * This misses isolated pages which are not accounted for to save counters.
298   * As the data only determines if reclaim or compaction continues, it is
299   * not expected that isolated pages will be a dominating factor.
300   */
301  unsigned long zone_reclaimable_pages(struct zone *zone)
302  {
303  	unsigned long nr;
304  
305  	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
306  		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
307  	if (get_nr_swap_pages() > 0)
308  		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
309  			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
310  
311  	return nr;
312  }
313  
314  /**
315   * lruvec_lru_size -  Returns the number of pages on the given LRU list.
316   * @lruvec: lru vector
317   * @lru: lru to use
318   * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
319   */
320  unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
321  {
322  	unsigned long size = 0;
323  	int zid;
324  
325  	for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
326  		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
327  
328  		if (!managed_zone(zone))
329  			continue;
330  
331  		if (!mem_cgroup_disabled())
332  			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
333  		else
334  			size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
335  	}
336  	return size;
337  }
338  
339  /*
340   * Add a shrinker callback to be called from the vm.
341   */
342  int prealloc_shrinker(struct shrinker *shrinker)
343  {
344  	unsigned int size = sizeof(*shrinker->nr_deferred);
345  
346  	if (shrinker->flags & SHRINKER_NUMA_AWARE)
347  		size *= nr_node_ids;
348  
349  	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
350  	if (!shrinker->nr_deferred)
351  		return -ENOMEM;
352  
353  	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
354  		if (prealloc_memcg_shrinker(shrinker))
355  			goto free_deferred;
356  	}
357  
358  	return 0;
359  
360  free_deferred:
361  	kfree(shrinker->nr_deferred);
362  	shrinker->nr_deferred = NULL;
363  	return -ENOMEM;
364  }
365  
366  void free_prealloced_shrinker(struct shrinker *shrinker)
367  {
368  	if (!shrinker->nr_deferred)
369  		return;
370  
371  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
372  		unregister_memcg_shrinker(shrinker);
373  
374  	kfree(shrinker->nr_deferred);
375  	shrinker->nr_deferred = NULL;
376  }
377  
378  void register_shrinker_prepared(struct shrinker *shrinker)
379  {
380  	down_write(&shrinker_rwsem);
381  	list_add_tail(&shrinker->list, &shrinker_list);
382  #ifdef CONFIG_MEMCG
383  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
384  		idr_replace(&shrinker_idr, shrinker, shrinker->id);
385  #endif
386  	up_write(&shrinker_rwsem);
387  }
388  
389  int register_shrinker(struct shrinker *shrinker)
390  {
391  	int err = prealloc_shrinker(shrinker);
392  
393  	if (err)
394  		return err;
395  	register_shrinker_prepared(shrinker);
396  	return 0;
397  }
398  EXPORT_SYMBOL(register_shrinker);
399  
400  /*
401   * Remove one
402   */
403  void unregister_shrinker(struct shrinker *shrinker)
404  {
405  	if (!shrinker->nr_deferred)
406  		return;
407  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
408  		unregister_memcg_shrinker(shrinker);
409  	down_write(&shrinker_rwsem);
410  	list_del(&shrinker->list);
411  	up_write(&shrinker_rwsem);
412  	kfree(shrinker->nr_deferred);
413  	shrinker->nr_deferred = NULL;
414  }
415  EXPORT_SYMBOL(unregister_shrinker);
416  
417  #define SHRINK_BATCH 128
418  
419  static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
420  				    struct shrinker *shrinker, int priority)
421  {
422  	unsigned long freed = 0;
423  	unsigned long long delta;
424  	long total_scan;
425  	long freeable;
426  	long nr;
427  	long new_nr;
428  	int nid = shrinkctl->nid;
429  	long batch_size = shrinker->batch ? shrinker->batch
430  					  : SHRINK_BATCH;
431  	long scanned = 0, next_deferred;
432  
433  	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
434  		nid = 0;
435  
436  	freeable = shrinker->count_objects(shrinker, shrinkctl);
437  	if (freeable == 0 || freeable == SHRINK_EMPTY)
438  		return freeable;
439  
440  	/*
441  	 * copy the current shrinker scan count into a local variable
442  	 * and zero it so that other concurrent shrinker invocations
443  	 * don't also do this scanning work.
444  	 */
445  	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
446  
447  	total_scan = nr;
448  	if (shrinker->seeks) {
449  		delta = freeable >> priority;
450  		delta *= 4;
451  		do_div(delta, shrinker->seeks);
452  	} else {
453  		/*
454  		 * These objects don't require any IO to create. Trim
455  		 * them aggressively under memory pressure to keep
456  		 * them from causing refetches in the IO caches.
457  		 */
458  		delta = freeable / 2;
459  	}
460  
461  	total_scan += delta;
462  	if (total_scan < 0) {
463  		pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
464  		       shrinker->scan_objects, total_scan);
465  		total_scan = freeable;
466  		next_deferred = nr;
467  	} else
468  		next_deferred = total_scan;
469  
470  	/*
471  	 * We need to avoid excessive windup on filesystem shrinkers
472  	 * due to large numbers of GFP_NOFS allocations causing the
473  	 * shrinkers to return -1 all the time. This results in a large
474  	 * nr being built up so when a shrink that can do some work
475  	 * comes along it empties the entire cache due to nr >>>
476  	 * freeable. This is bad for sustaining a working set in
477  	 * memory.
478  	 *
479  	 * Hence only allow the shrinker to scan the entire cache when
480  	 * a large delta change is calculated directly.
481  	 */
482  	if (delta < freeable / 4)
483  		total_scan = min(total_scan, freeable / 2);
484  
485  	/*
486  	 * Avoid risking looping forever due to too large nr value:
487  	 * never try to free more than twice the estimate number of
488  	 * freeable entries.
489  	 */
490  	if (total_scan > freeable * 2)
491  		total_scan = freeable * 2;
492  
493  	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
494  				   freeable, delta, total_scan, priority);
495  
496  	/*
497  	 * Normally, we should not scan less than batch_size objects in one
498  	 * pass to avoid too frequent shrinker calls, but if the slab has less
499  	 * than batch_size objects in total and we are really tight on memory,
500  	 * we will try to reclaim all available objects, otherwise we can end
501  	 * up failing allocations although there are plenty of reclaimable
502  	 * objects spread over several slabs with usage less than the
503  	 * batch_size.
504  	 *
505  	 * We detect the "tight on memory" situations by looking at the total
506  	 * number of objects we want to scan (total_scan). If it is greater
507  	 * than the total number of objects on slab (freeable), we must be
508  	 * scanning at high prio and therefore should try to reclaim as much as
509  	 * possible.
510  	 */
511  	while (total_scan >= batch_size ||
512  	       total_scan >= freeable) {
513  		unsigned long ret;
514  		unsigned long nr_to_scan = min(batch_size, total_scan);
515  
516  		shrinkctl->nr_to_scan = nr_to_scan;
517  		shrinkctl->nr_scanned = nr_to_scan;
518  		ret = shrinker->scan_objects(shrinker, shrinkctl);
519  		if (ret == SHRINK_STOP)
520  			break;
521  		freed += ret;
522  
523  		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
524  		total_scan -= shrinkctl->nr_scanned;
525  		scanned += shrinkctl->nr_scanned;
526  
527  		cond_resched();
528  	}
529  
530  	if (next_deferred >= scanned)
531  		next_deferred -= scanned;
532  	else
533  		next_deferred = 0;
534  	/*
535  	 * move the unused scan count back into the shrinker in a
536  	 * manner that handles concurrent updates. If we exhausted the
537  	 * scan, there is no need to do an update.
538  	 */
539  	if (next_deferred > 0)
540  		new_nr = atomic_long_add_return(next_deferred,
541  						&shrinker->nr_deferred[nid]);
542  	else
543  		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
544  
545  	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
546  	return freed;
547  }
548  
549  #ifdef CONFIG_MEMCG
550  static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
551  			struct mem_cgroup *memcg, int priority)
552  {
553  	struct memcg_shrinker_map *map;
554  	unsigned long ret, freed = 0;
555  	int i;
556  
557  	if (!mem_cgroup_online(memcg))
558  		return 0;
559  
560  	if (!down_read_trylock(&shrinker_rwsem))
561  		return 0;
562  
563  	map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
564  					true);
565  	if (unlikely(!map))
566  		goto unlock;
567  
568  	for_each_set_bit(i, map->map, shrinker_nr_max) {
569  		struct shrink_control sc = {
570  			.gfp_mask = gfp_mask,
571  			.nid = nid,
572  			.memcg = memcg,
573  		};
574  		struct shrinker *shrinker;
575  
576  		shrinker = idr_find(&shrinker_idr, i);
577  		if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
578  			if (!shrinker)
579  				clear_bit(i, map->map);
580  			continue;
581  		}
582  
583  		/* Call non-slab shrinkers even though kmem is disabled */
584  		if (!memcg_kmem_enabled() &&
585  		    !(shrinker->flags & SHRINKER_NONSLAB))
586  			continue;
587  
588  		ret = do_shrink_slab(&sc, shrinker, priority);
589  		if (ret == SHRINK_EMPTY) {
590  			clear_bit(i, map->map);
591  			/*
592  			 * After the shrinker reported that it had no objects to
593  			 * free, but before we cleared the corresponding bit in
594  			 * the memcg shrinker map, a new object might have been
595  			 * added. To make sure, we have the bit set in this
596  			 * case, we invoke the shrinker one more time and reset
597  			 * the bit if it reports that it is not empty anymore.
598  			 * The memory barrier here pairs with the barrier in
599  			 * memcg_set_shrinker_bit():
600  			 *
601  			 * list_lru_add()     shrink_slab_memcg()
602  			 *   list_add_tail()    clear_bit()
603  			 *   <MB>               <MB>
604  			 *   set_bit()          do_shrink_slab()
605  			 */
606  			smp_mb__after_atomic();
607  			ret = do_shrink_slab(&sc, shrinker, priority);
608  			if (ret == SHRINK_EMPTY)
609  				ret = 0;
610  			else
611  				memcg_set_shrinker_bit(memcg, nid, i);
612  		}
613  		freed += ret;
614  
615  		if (rwsem_is_contended(&shrinker_rwsem)) {
616  			freed = freed ? : 1;
617  			break;
618  		}
619  	}
620  unlock:
621  	up_read(&shrinker_rwsem);
622  	return freed;
623  }
624  #else /* CONFIG_MEMCG */
625  static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
626  			struct mem_cgroup *memcg, int priority)
627  {
628  	return 0;
629  }
630  #endif /* CONFIG_MEMCG */
631  
632  /**
633   * shrink_slab - shrink slab caches
634   * @gfp_mask: allocation context
635   * @nid: node whose slab caches to target
636   * @memcg: memory cgroup whose slab caches to target
637   * @priority: the reclaim priority
638   *
639   * Call the shrink functions to age shrinkable caches.
640   *
641   * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
642   * unaware shrinkers will receive a node id of 0 instead.
643   *
644   * @memcg specifies the memory cgroup to target. Unaware shrinkers
645   * are called only if it is the root cgroup.
646   *
647   * @priority is sc->priority, we take the number of objects and >> by priority
648   * in order to get the scan target.
649   *
650   * Returns the number of reclaimed slab objects.
651   */
652  static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
653  				 struct mem_cgroup *memcg,
654  				 int priority)
655  {
656  	unsigned long ret, freed = 0;
657  	struct shrinker *shrinker;
658  
659  	/*
660  	 * The root memcg might be allocated even though memcg is disabled
661  	 * via "cgroup_disable=memory" boot parameter.  This could make
662  	 * mem_cgroup_is_root() return false, then just run memcg slab
663  	 * shrink, but skip global shrink.  This may result in premature
664  	 * oom.
665  	 */
666  	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
667  		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
668  
669  	if (!down_read_trylock(&shrinker_rwsem))
670  		goto out;
671  
672  	list_for_each_entry(shrinker, &shrinker_list, list) {
673  		struct shrink_control sc = {
674  			.gfp_mask = gfp_mask,
675  			.nid = nid,
676  			.memcg = memcg,
677  		};
678  
679  		ret = do_shrink_slab(&sc, shrinker, priority);
680  		if (ret == SHRINK_EMPTY)
681  			ret = 0;
682  		freed += ret;
683  		/*
684  		 * Bail out if someone want to register a new shrinker to
685  		 * prevent the registration from being stalled for long periods
686  		 * by parallel ongoing shrinking.
687  		 */
688  		if (rwsem_is_contended(&shrinker_rwsem)) {
689  			freed = freed ? : 1;
690  			break;
691  		}
692  	}
693  
694  	up_read(&shrinker_rwsem);
695  out:
696  	cond_resched();
697  	return freed;
698  }
699  
700  void drop_slab_node(int nid)
701  {
702  	unsigned long freed;
703  
704  	do {
705  		struct mem_cgroup *memcg = NULL;
706  
707  		freed = 0;
708  		memcg = mem_cgroup_iter(NULL, NULL, NULL);
709  		do {
710  			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
711  		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
712  	} while (freed > 10);
713  }
714  
715  void drop_slab(void)
716  {
717  	int nid;
718  
719  	for_each_online_node(nid)
720  		drop_slab_node(nid);
721  }
722  
723  static inline int is_page_cache_freeable(struct page *page)
724  {
725  	/*
726  	 * A freeable page cache page is referenced only by the caller
727  	 * that isolated the page, the page cache and optional buffer
728  	 * heads at page->private.
729  	 */
730  	int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
731  		HPAGE_PMD_NR : 1;
732  	return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
733  }
734  
735  static int may_write_to_inode(struct inode *inode)
736  {
737  	if (current->flags & PF_SWAPWRITE)
738  		return 1;
739  	if (!inode_write_congested(inode))
740  		return 1;
741  	if (inode_to_bdi(inode) == current->backing_dev_info)
742  		return 1;
743  	return 0;
744  }
745  
746  /*
747   * We detected a synchronous write error writing a page out.  Probably
748   * -ENOSPC.  We need to propagate that into the address_space for a subsequent
749   * fsync(), msync() or close().
750   *
751   * The tricky part is that after writepage we cannot touch the mapping: nothing
752   * prevents it from being freed up.  But we have a ref on the page and once
753   * that page is locked, the mapping is pinned.
754   *
755   * We're allowed to run sleeping lock_page() here because we know the caller has
756   * __GFP_FS.
757   */
758  static void handle_write_error(struct address_space *mapping,
759  				struct page *page, int error)
760  {
761  	lock_page(page);
762  	if (page_mapping(page) == mapping)
763  		mapping_set_error(mapping, error);
764  	unlock_page(page);
765  }
766  
767  /* possible outcome of pageout() */
768  typedef enum {
769  	/* failed to write page out, page is locked */
770  	PAGE_KEEP,
771  	/* move page to the active list, page is locked */
772  	PAGE_ACTIVATE,
773  	/* page has been sent to the disk successfully, page is unlocked */
774  	PAGE_SUCCESS,
775  	/* page is clean and locked */
776  	PAGE_CLEAN,
777  } pageout_t;
778  
779  /*
780   * pageout is called by shrink_page_list() for each dirty page.
781   * Calls ->writepage().
782   */
783  static pageout_t pageout(struct page *page, struct address_space *mapping)
784  {
785  	/*
786  	 * If the page is dirty, only perform writeback if that write
787  	 * will be non-blocking.  To prevent this allocation from being
788  	 * stalled by pagecache activity.  But note that there may be
789  	 * stalls if we need to run get_block().  We could test
790  	 * PagePrivate for that.
791  	 *
792  	 * If this process is currently in __generic_file_write_iter() against
793  	 * this page's queue, we can perform writeback even if that
794  	 * will block.
795  	 *
796  	 * If the page is swapcache, write it back even if that would
797  	 * block, for some throttling. This happens by accident, because
798  	 * swap_backing_dev_info is bust: it doesn't reflect the
799  	 * congestion state of the swapdevs.  Easy to fix, if needed.
800  	 */
801  	if (!is_page_cache_freeable(page))
802  		return PAGE_KEEP;
803  	if (!mapping) {
804  		/*
805  		 * Some data journaling orphaned pages can have
806  		 * page->mapping == NULL while being dirty with clean buffers.
807  		 */
808  		if (page_has_private(page)) {
809  			if (try_to_free_buffers(page)) {
810  				ClearPageDirty(page);
811  				pr_info("%s: orphaned page\n", __func__);
812  				return PAGE_CLEAN;
813  			}
814  		}
815  		return PAGE_KEEP;
816  	}
817  	if (mapping->a_ops->writepage == NULL)
818  		return PAGE_ACTIVATE;
819  	if (!may_write_to_inode(mapping->host))
820  		return PAGE_KEEP;
821  
822  	if (clear_page_dirty_for_io(page)) {
823  		int res;
824  		struct writeback_control wbc = {
825  			.sync_mode = WB_SYNC_NONE,
826  			.nr_to_write = SWAP_CLUSTER_MAX,
827  			.range_start = 0,
828  			.range_end = LLONG_MAX,
829  			.for_reclaim = 1,
830  		};
831  
832  		SetPageReclaim(page);
833  		res = mapping->a_ops->writepage(page, &wbc);
834  		if (res < 0)
835  			handle_write_error(mapping, page, res);
836  		if (res == AOP_WRITEPAGE_ACTIVATE) {
837  			ClearPageReclaim(page);
838  			return PAGE_ACTIVATE;
839  		}
840  
841  		if (!PageWriteback(page)) {
842  			/* synchronous write or broken a_ops? */
843  			ClearPageReclaim(page);
844  		}
845  		trace_mm_vmscan_writepage(page);
846  		inc_node_page_state(page, NR_VMSCAN_WRITE);
847  		return PAGE_SUCCESS;
848  	}
849  
850  	return PAGE_CLEAN;
851  }
852  
853  /*
854   * Same as remove_mapping, but if the page is removed from the mapping, it
855   * gets returned with a refcount of 0.
856   */
857  static int __remove_mapping(struct address_space *mapping, struct page *page,
858  			    bool reclaimed, struct mem_cgroup *target_memcg)
859  {
860  	unsigned long flags;
861  	int refcount;
862  
863  	BUG_ON(!PageLocked(page));
864  	BUG_ON(mapping != page_mapping(page));
865  
866  	xa_lock_irqsave(&mapping->i_pages, flags);
867  	/*
868  	 * The non racy check for a busy page.
869  	 *
870  	 * Must be careful with the order of the tests. When someone has
871  	 * a ref to the page, it may be possible that they dirty it then
872  	 * drop the reference. So if PageDirty is tested before page_count
873  	 * here, then the following race may occur:
874  	 *
875  	 * get_user_pages(&page);
876  	 * [user mapping goes away]
877  	 * write_to(page);
878  	 *				!PageDirty(page)    [good]
879  	 * SetPageDirty(page);
880  	 * put_page(page);
881  	 *				!page_count(page)   [good, discard it]
882  	 *
883  	 * [oops, our write_to data is lost]
884  	 *
885  	 * Reversing the order of the tests ensures such a situation cannot
886  	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
887  	 * load is not satisfied before that of page->_refcount.
888  	 *
889  	 * Note that if SetPageDirty is always performed via set_page_dirty,
890  	 * and thus under the i_pages lock, then this ordering is not required.
891  	 */
892  	refcount = 1 + compound_nr(page);
893  	if (!page_ref_freeze(page, refcount))
894  		goto cannot_free;
895  	/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
896  	if (unlikely(PageDirty(page))) {
897  		page_ref_unfreeze(page, refcount);
898  		goto cannot_free;
899  	}
900  
901  	if (PageSwapCache(page)) {
902  		swp_entry_t swap = { .val = page_private(page) };
903  		mem_cgroup_swapout(page, swap);
904  		__delete_from_swap_cache(page, swap);
905  		xa_unlock_irqrestore(&mapping->i_pages, flags);
906  		put_swap_page(page, swap);
907  	} else {
908  		void (*freepage)(struct page *);
909  		void *shadow = NULL;
910  
911  		freepage = mapping->a_ops->freepage;
912  		/*
913  		 * Remember a shadow entry for reclaimed file cache in
914  		 * order to detect refaults, thus thrashing, later on.
915  		 *
916  		 * But don't store shadows in an address space that is
917  		 * already exiting.  This is not just an optizimation,
918  		 * inode reclaim needs to empty out the radix tree or
919  		 * the nodes are lost.  Don't plant shadows behind its
920  		 * back.
921  		 *
922  		 * We also don't store shadows for DAX mappings because the
923  		 * only page cache pages found in these are zero pages
924  		 * covering holes, and because we don't want to mix DAX
925  		 * exceptional entries and shadow exceptional entries in the
926  		 * same address_space.
927  		 */
928  		if (reclaimed && page_is_file_lru(page) &&
929  		    !mapping_exiting(mapping) && !dax_mapping(mapping))
930  			shadow = workingset_eviction(page, target_memcg);
931  		__delete_from_page_cache(page, shadow);
932  		xa_unlock_irqrestore(&mapping->i_pages, flags);
933  
934  		if (freepage != NULL)
935  			freepage(page);
936  	}
937  
938  	return 1;
939  
940  cannot_free:
941  	xa_unlock_irqrestore(&mapping->i_pages, flags);
942  	return 0;
943  }
944  
945  /*
946   * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
947   * someone else has a ref on the page, abort and return 0.  If it was
948   * successfully detached, return 1.  Assumes the caller has a single ref on
949   * this page.
950   */
951  int remove_mapping(struct address_space *mapping, struct page *page)
952  {
953  	if (__remove_mapping(mapping, page, false, NULL)) {
954  		/*
955  		 * Unfreezing the refcount with 1 rather than 2 effectively
956  		 * drops the pagecache ref for us without requiring another
957  		 * atomic operation.
958  		 */
959  		page_ref_unfreeze(page, 1);
960  		return 1;
961  	}
962  	return 0;
963  }
964  
965  /**
966   * putback_lru_page - put previously isolated page onto appropriate LRU list
967   * @page: page to be put back to appropriate lru list
968   *
969   * Add previously isolated @page to appropriate LRU list.
970   * Page may still be unevictable for other reasons.
971   *
972   * lru_lock must not be held, interrupts must be enabled.
973   */
974  void putback_lru_page(struct page *page)
975  {
976  	lru_cache_add(page);
977  	put_page(page);		/* drop ref from isolate */
978  }
979  
980  enum page_references {
981  	PAGEREF_RECLAIM,
982  	PAGEREF_RECLAIM_CLEAN,
983  	PAGEREF_KEEP,
984  	PAGEREF_ACTIVATE,
985  };
986  
987  static enum page_references page_check_references(struct page *page,
988  						  struct scan_control *sc)
989  {
990  	int referenced_ptes, referenced_page;
991  	unsigned long vm_flags;
992  
993  	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
994  					  &vm_flags);
995  	referenced_page = TestClearPageReferenced(page);
996  
997  	/*
998  	 * Mlock lost the isolation race with us.  Let try_to_unmap()
999  	 * move the page to the unevictable list.
1000  	 */
1001  	if (vm_flags & VM_LOCKED)
1002  		return PAGEREF_RECLAIM;
1003  
1004  	if (referenced_ptes) {
1005  		if (PageSwapBacked(page))
1006  			return PAGEREF_ACTIVATE;
1007  		/*
1008  		 * All mapped pages start out with page table
1009  		 * references from the instantiating fault, so we need
1010  		 * to look twice if a mapped file page is used more
1011  		 * than once.
1012  		 *
1013  		 * Mark it and spare it for another trip around the
1014  		 * inactive list.  Another page table reference will
1015  		 * lead to its activation.
1016  		 *
1017  		 * Note: the mark is set for activated pages as well
1018  		 * so that recently deactivated but used pages are
1019  		 * quickly recovered.
1020  		 */
1021  		SetPageReferenced(page);
1022  
1023  		if (referenced_page || referenced_ptes > 1)
1024  			return PAGEREF_ACTIVATE;
1025  
1026  		/*
1027  		 * Activate file-backed executable pages after first usage.
1028  		 */
1029  		if (vm_flags & VM_EXEC)
1030  			return PAGEREF_ACTIVATE;
1031  
1032  		return PAGEREF_KEEP;
1033  	}
1034  
1035  	/* Reclaim if clean, defer dirty pages to writeback */
1036  	if (referenced_page && !PageSwapBacked(page))
1037  		return PAGEREF_RECLAIM_CLEAN;
1038  
1039  	return PAGEREF_RECLAIM;
1040  }
1041  
1042  /* Check if a page is dirty or under writeback */
1043  static void page_check_dirty_writeback(struct page *page,
1044  				       bool *dirty, bool *writeback)
1045  {
1046  	struct address_space *mapping;
1047  
1048  	/*
1049  	 * Anonymous pages are not handled by flushers and must be written
1050  	 * from reclaim context. Do not stall reclaim based on them
1051  	 */
1052  	if (!page_is_file_lru(page) ||
1053  	    (PageAnon(page) && !PageSwapBacked(page))) {
1054  		*dirty = false;
1055  		*writeback = false;
1056  		return;
1057  	}
1058  
1059  	/* By default assume that the page flags are accurate */
1060  	*dirty = PageDirty(page);
1061  	*writeback = PageWriteback(page);
1062  
1063  	/* Verify dirty/writeback state if the filesystem supports it */
1064  	if (!page_has_private(page))
1065  		return;
1066  
1067  	mapping = page_mapping(page);
1068  	if (mapping && mapping->a_ops->is_dirty_writeback)
1069  		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
1070  }
1071  
1072  /*
1073   * shrink_page_list() returns the number of reclaimed pages
1074   */
1075  static unsigned int shrink_page_list(struct list_head *page_list,
1076  				     struct pglist_data *pgdat,
1077  				     struct scan_control *sc,
1078  				     enum ttu_flags ttu_flags,
1079  				     struct reclaim_stat *stat,
1080  				     bool ignore_references)
1081  {
1082  	LIST_HEAD(ret_pages);
1083  	LIST_HEAD(free_pages);
1084  	unsigned int nr_reclaimed = 0;
1085  	unsigned int pgactivate = 0;
1086  
1087  	memset(stat, 0, sizeof(*stat));
1088  	cond_resched();
1089  
1090  	while (!list_empty(page_list)) {
1091  		struct address_space *mapping;
1092  		struct page *page;
1093  		enum page_references references = PAGEREF_RECLAIM;
1094  		bool dirty, writeback, may_enter_fs;
1095  		unsigned int nr_pages;
1096  
1097  		cond_resched();
1098  
1099  		page = lru_to_page(page_list);
1100  		list_del(&page->lru);
1101  
1102  		if (!trylock_page(page))
1103  			goto keep;
1104  
1105  		VM_BUG_ON_PAGE(PageActive(page), page);
1106  
1107  		nr_pages = compound_nr(page);
1108  
1109  		/* Account the number of base pages even though THP */
1110  		sc->nr_scanned += nr_pages;
1111  
1112  		if (unlikely(!page_evictable(page)))
1113  			goto activate_locked;
1114  
1115  		if (!sc->may_unmap && page_mapped(page))
1116  			goto keep_locked;
1117  
1118  		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1119  			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1120  
1121  		/*
1122  		 * The number of dirty pages determines if a node is marked
1123  		 * reclaim_congested which affects wait_iff_congested. kswapd
1124  		 * will stall and start writing pages if the tail of the LRU
1125  		 * is all dirty unqueued pages.
1126  		 */
1127  		page_check_dirty_writeback(page, &dirty, &writeback);
1128  		if (dirty || writeback)
1129  			stat->nr_dirty++;
1130  
1131  		if (dirty && !writeback)
1132  			stat->nr_unqueued_dirty++;
1133  
1134  		/*
1135  		 * Treat this page as congested if the underlying BDI is or if
1136  		 * pages are cycling through the LRU so quickly that the
1137  		 * pages marked for immediate reclaim are making it to the
1138  		 * end of the LRU a second time.
1139  		 */
1140  		mapping = page_mapping(page);
1141  		if (((dirty || writeback) && mapping &&
1142  		     inode_write_congested(mapping->host)) ||
1143  		    (writeback && PageReclaim(page)))
1144  			stat->nr_congested++;
1145  
1146  		/*
1147  		 * If a page at the tail of the LRU is under writeback, there
1148  		 * are three cases to consider.
1149  		 *
1150  		 * 1) If reclaim is encountering an excessive number of pages
1151  		 *    under writeback and this page is both under writeback and
1152  		 *    PageReclaim then it indicates that pages are being queued
1153  		 *    for IO but are being recycled through the LRU before the
1154  		 *    IO can complete. Waiting on the page itself risks an
1155  		 *    indefinite stall if it is impossible to writeback the
1156  		 *    page due to IO error or disconnected storage so instead
1157  		 *    note that the LRU is being scanned too quickly and the
1158  		 *    caller can stall after page list has been processed.
1159  		 *
1160  		 * 2) Global or new memcg reclaim encounters a page that is
1161  		 *    not marked for immediate reclaim, or the caller does not
1162  		 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
1163  		 *    not to fs). In this case mark the page for immediate
1164  		 *    reclaim and continue scanning.
1165  		 *
1166  		 *    Require may_enter_fs because we would wait on fs, which
1167  		 *    may not have submitted IO yet. And the loop driver might
1168  		 *    enter reclaim, and deadlock if it waits on a page for
1169  		 *    which it is needed to do the write (loop masks off
1170  		 *    __GFP_IO|__GFP_FS for this reason); but more thought
1171  		 *    would probably show more reasons.
1172  		 *
1173  		 * 3) Legacy memcg encounters a page that is already marked
1174  		 *    PageReclaim. memcg does not have any dirty pages
1175  		 *    throttling so we could easily OOM just because too many
1176  		 *    pages are in writeback and there is nothing else to
1177  		 *    reclaim. Wait for the writeback to complete.
1178  		 *
1179  		 * In cases 1) and 2) we activate the pages to get them out of
1180  		 * the way while we continue scanning for clean pages on the
1181  		 * inactive list and refilling from the active list. The
1182  		 * observation here is that waiting for disk writes is more
1183  		 * expensive than potentially causing reloads down the line.
1184  		 * Since they're marked for immediate reclaim, they won't put
1185  		 * memory pressure on the cache working set any longer than it
1186  		 * takes to write them to disk.
1187  		 */
1188  		if (PageWriteback(page)) {
1189  			/* Case 1 above */
1190  			if (current_is_kswapd() &&
1191  			    PageReclaim(page) &&
1192  			    test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1193  				stat->nr_immediate++;
1194  				goto activate_locked;
1195  
1196  			/* Case 2 above */
1197  			} else if (writeback_throttling_sane(sc) ||
1198  			    !PageReclaim(page) || !may_enter_fs) {
1199  				/*
1200  				 * This is slightly racy - end_page_writeback()
1201  				 * might have just cleared PageReclaim, then
1202  				 * setting PageReclaim here end up interpreted
1203  				 * as PageReadahead - but that does not matter
1204  				 * enough to care.  What we do want is for this
1205  				 * page to have PageReclaim set next time memcg
1206  				 * reclaim reaches the tests above, so it will
1207  				 * then wait_on_page_writeback() to avoid OOM;
1208  				 * and it's also appropriate in global reclaim.
1209  				 */
1210  				SetPageReclaim(page);
1211  				stat->nr_writeback++;
1212  				goto activate_locked;
1213  
1214  			/* Case 3 above */
1215  			} else {
1216  				unlock_page(page);
1217  				wait_on_page_writeback(page);
1218  				/* then go back and try same page again */
1219  				list_add_tail(&page->lru, page_list);
1220  				continue;
1221  			}
1222  		}
1223  
1224  		if (!ignore_references)
1225  			references = page_check_references(page, sc);
1226  
1227  		switch (references) {
1228  		case PAGEREF_ACTIVATE:
1229  			goto activate_locked;
1230  		case PAGEREF_KEEP:
1231  			stat->nr_ref_keep += nr_pages;
1232  			goto keep_locked;
1233  		case PAGEREF_RECLAIM:
1234  		case PAGEREF_RECLAIM_CLEAN:
1235  			; /* try to reclaim the page below */
1236  		}
1237  
1238  		/*
1239  		 * Anonymous process memory has backing store?
1240  		 * Try to allocate it some swap space here.
1241  		 * Lazyfree page could be freed directly
1242  		 */
1243  		if (PageAnon(page) && PageSwapBacked(page)) {
1244  			if (!PageSwapCache(page)) {
1245  				if (!(sc->gfp_mask & __GFP_IO))
1246  					goto keep_locked;
1247  				if (PageTransHuge(page)) {
1248  					/* cannot split THP, skip it */
1249  					if (!can_split_huge_page(page, NULL))
1250  						goto activate_locked;
1251  					/*
1252  					 * Split pages without a PMD map right
1253  					 * away. Chances are some or all of the
1254  					 * tail pages can be freed without IO.
1255  					 */
1256  					if (!compound_mapcount(page) &&
1257  					    split_huge_page_to_list(page,
1258  								    page_list))
1259  						goto activate_locked;
1260  				}
1261  				if (!add_to_swap(page)) {
1262  					if (!PageTransHuge(page))
1263  						goto activate_locked_split;
1264  					/* Fallback to swap normal pages */
1265  					if (split_huge_page_to_list(page,
1266  								    page_list))
1267  						goto activate_locked;
1268  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1269  					count_vm_event(THP_SWPOUT_FALLBACK);
1270  #endif
1271  					if (!add_to_swap(page))
1272  						goto activate_locked_split;
1273  				}
1274  
1275  				may_enter_fs = true;
1276  
1277  				/* Adding to swap updated mapping */
1278  				mapping = page_mapping(page);
1279  			}
1280  		} else if (unlikely(PageTransHuge(page))) {
1281  			/* Split file THP */
1282  			if (split_huge_page_to_list(page, page_list))
1283  				goto keep_locked;
1284  		}
1285  
1286  		/*
1287  		 * THP may get split above, need minus tail pages and update
1288  		 * nr_pages to avoid accounting tail pages twice.
1289  		 *
1290  		 * The tail pages that are added into swap cache successfully
1291  		 * reach here.
1292  		 */
1293  		if ((nr_pages > 1) && !PageTransHuge(page)) {
1294  			sc->nr_scanned -= (nr_pages - 1);
1295  			nr_pages = 1;
1296  		}
1297  
1298  		/*
1299  		 * The page is mapped into the page tables of one or more
1300  		 * processes. Try to unmap it here.
1301  		 */
1302  		if (page_mapped(page)) {
1303  			enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
1304  			bool was_swapbacked = PageSwapBacked(page);
1305  
1306  			if (unlikely(PageTransHuge(page)))
1307  				flags |= TTU_SPLIT_HUGE_PMD;
1308  
1309  			if (!try_to_unmap(page, flags)) {
1310  				stat->nr_unmap_fail += nr_pages;
1311  				if (!was_swapbacked && PageSwapBacked(page))
1312  					stat->nr_lazyfree_fail += nr_pages;
1313  				goto activate_locked;
1314  			}
1315  		}
1316  
1317  		if (PageDirty(page)) {
1318  			/*
1319  			 * Only kswapd can writeback filesystem pages
1320  			 * to avoid risk of stack overflow. But avoid
1321  			 * injecting inefficient single-page IO into
1322  			 * flusher writeback as much as possible: only
1323  			 * write pages when we've encountered many
1324  			 * dirty pages, and when we've already scanned
1325  			 * the rest of the LRU for clean pages and see
1326  			 * the same dirty pages again (PageReclaim).
1327  			 */
1328  			if (page_is_file_lru(page) &&
1329  			    (!current_is_kswapd() || !PageReclaim(page) ||
1330  			     !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1331  				/*
1332  				 * Immediately reclaim when written back.
1333  				 * Similar in principal to deactivate_page()
1334  				 * except we already have the page isolated
1335  				 * and know it's dirty
1336  				 */
1337  				inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1338  				SetPageReclaim(page);
1339  
1340  				goto activate_locked;
1341  			}
1342  
1343  			if (references == PAGEREF_RECLAIM_CLEAN)
1344  				goto keep_locked;
1345  			if (!may_enter_fs)
1346  				goto keep_locked;
1347  			if (!sc->may_writepage)
1348  				goto keep_locked;
1349  
1350  			/*
1351  			 * Page is dirty. Flush the TLB if a writable entry
1352  			 * potentially exists to avoid CPU writes after IO
1353  			 * starts and then write it out here.
1354  			 */
1355  			try_to_unmap_flush_dirty();
1356  			switch (pageout(page, mapping)) {
1357  			case PAGE_KEEP:
1358  				goto keep_locked;
1359  			case PAGE_ACTIVATE:
1360  				goto activate_locked;
1361  			case PAGE_SUCCESS:
1362  				stat->nr_pageout += hpage_nr_pages(page);
1363  
1364  				if (PageWriteback(page))
1365  					goto keep;
1366  				if (PageDirty(page))
1367  					goto keep;
1368  
1369  				/*
1370  				 * A synchronous write - probably a ramdisk.  Go
1371  				 * ahead and try to reclaim the page.
1372  				 */
1373  				if (!trylock_page(page))
1374  					goto keep;
1375  				if (PageDirty(page) || PageWriteback(page))
1376  					goto keep_locked;
1377  				mapping = page_mapping(page);
1378  			case PAGE_CLEAN:
1379  				; /* try to free the page below */
1380  			}
1381  		}
1382  
1383  		/*
1384  		 * If the page has buffers, try to free the buffer mappings
1385  		 * associated with this page. If we succeed we try to free
1386  		 * the page as well.
1387  		 *
1388  		 * We do this even if the page is PageDirty().
1389  		 * try_to_release_page() does not perform I/O, but it is
1390  		 * possible for a page to have PageDirty set, but it is actually
1391  		 * clean (all its buffers are clean).  This happens if the
1392  		 * buffers were written out directly, with submit_bh(). ext3
1393  		 * will do this, as well as the blockdev mapping.
1394  		 * try_to_release_page() will discover that cleanness and will
1395  		 * drop the buffers and mark the page clean - it can be freed.
1396  		 *
1397  		 * Rarely, pages can have buffers and no ->mapping.  These are
1398  		 * the pages which were not successfully invalidated in
1399  		 * truncate_complete_page().  We try to drop those buffers here
1400  		 * and if that worked, and the page is no longer mapped into
1401  		 * process address space (page_count == 1) it can be freed.
1402  		 * Otherwise, leave the page on the LRU so it is swappable.
1403  		 */
1404  		if (page_has_private(page)) {
1405  			if (!try_to_release_page(page, sc->gfp_mask))
1406  				goto activate_locked;
1407  			if (!mapping && page_count(page) == 1) {
1408  				unlock_page(page);
1409  				if (put_page_testzero(page))
1410  					goto free_it;
1411  				else {
1412  					/*
1413  					 * rare race with speculative reference.
1414  					 * the speculative reference will free
1415  					 * this page shortly, so we may
1416  					 * increment nr_reclaimed here (and
1417  					 * leave it off the LRU).
1418  					 */
1419  					nr_reclaimed++;
1420  					continue;
1421  				}
1422  			}
1423  		}
1424  
1425  		if (PageAnon(page) && !PageSwapBacked(page)) {
1426  			/* follow __remove_mapping for reference */
1427  			if (!page_ref_freeze(page, 1))
1428  				goto keep_locked;
1429  			if (PageDirty(page)) {
1430  				page_ref_unfreeze(page, 1);
1431  				goto keep_locked;
1432  			}
1433  
1434  			count_vm_event(PGLAZYFREED);
1435  			count_memcg_page_event(page, PGLAZYFREED);
1436  		} else if (!mapping || !__remove_mapping(mapping, page, true,
1437  							 sc->target_mem_cgroup))
1438  			goto keep_locked;
1439  
1440  		unlock_page(page);
1441  free_it:
1442  		/*
1443  		 * THP may get swapped out in a whole, need account
1444  		 * all base pages.
1445  		 */
1446  		nr_reclaimed += nr_pages;
1447  
1448  		/*
1449  		 * Is there need to periodically free_page_list? It would
1450  		 * appear not as the counts should be low
1451  		 */
1452  		if (unlikely(PageTransHuge(page)))
1453  			destroy_compound_page(page);
1454  		else
1455  			list_add(&page->lru, &free_pages);
1456  		continue;
1457  
1458  activate_locked_split:
1459  		/*
1460  		 * The tail pages that are failed to add into swap cache
1461  		 * reach here.  Fixup nr_scanned and nr_pages.
1462  		 */
1463  		if (nr_pages > 1) {
1464  			sc->nr_scanned -= (nr_pages - 1);
1465  			nr_pages = 1;
1466  		}
1467  activate_locked:
1468  		/* Not a candidate for swapping, so reclaim swap space. */
1469  		if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1470  						PageMlocked(page)))
1471  			try_to_free_swap(page);
1472  		VM_BUG_ON_PAGE(PageActive(page), page);
1473  		if (!PageMlocked(page)) {
1474  			int type = page_is_file_lru(page);
1475  			SetPageActive(page);
1476  			stat->nr_activate[type] += nr_pages;
1477  			count_memcg_page_event(page, PGACTIVATE);
1478  		}
1479  keep_locked:
1480  		unlock_page(page);
1481  keep:
1482  		list_add(&page->lru, &ret_pages);
1483  		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1484  	}
1485  
1486  	pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
1487  
1488  	mem_cgroup_uncharge_list(&free_pages);
1489  	try_to_unmap_flush();
1490  	free_unref_page_list(&free_pages);
1491  
1492  	list_splice(&ret_pages, page_list);
1493  	count_vm_events(PGACTIVATE, pgactivate);
1494  
1495  	return nr_reclaimed;
1496  }
1497  
1498  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1499  					    struct list_head *page_list)
1500  {
1501  	struct scan_control sc = {
1502  		.gfp_mask = GFP_KERNEL,
1503  		.priority = DEF_PRIORITY,
1504  		.may_unmap = 1,
1505  	};
1506  	struct reclaim_stat stat;
1507  	unsigned int nr_reclaimed;
1508  	struct page *page, *next;
1509  	LIST_HEAD(clean_pages);
1510  
1511  	list_for_each_entry_safe(page, next, page_list, lru) {
1512  		if (page_is_file_lru(page) && !PageDirty(page) &&
1513  		    !__PageMovable(page) && !PageUnevictable(page)) {
1514  			ClearPageActive(page);
1515  			list_move(&page->lru, &clean_pages);
1516  		}
1517  	}
1518  
1519  	nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1520  			TTU_IGNORE_ACCESS, &stat, true);
1521  	list_splice(&clean_pages, page_list);
1522  	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
1523  	/*
1524  	 * Since lazyfree pages are isolated from file LRU from the beginning,
1525  	 * they will rotate back to anonymous LRU in the end if it failed to
1526  	 * discard so isolated count will be mismatched.
1527  	 * Compensate the isolated count for both LRU lists.
1528  	 */
1529  	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
1530  			    stat.nr_lazyfree_fail);
1531  	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1532  			    -stat.nr_lazyfree_fail);
1533  	return nr_reclaimed;
1534  }
1535  
1536  /*
1537   * Attempt to remove the specified page from its LRU.  Only take this page
1538   * if it is of the appropriate PageActive status.  Pages which are being
1539   * freed elsewhere are also ignored.
1540   *
1541   * page:	page to consider
1542   * mode:	one of the LRU isolation modes defined above
1543   *
1544   * returns 0 on success, -ve errno on failure.
1545   */
1546  int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1547  {
1548  	int ret = -EINVAL;
1549  
1550  	/* Only take pages on the LRU. */
1551  	if (!PageLRU(page))
1552  		return ret;
1553  
1554  	/* Compaction should not handle unevictable pages but CMA can do so */
1555  	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1556  		return ret;
1557  
1558  	ret = -EBUSY;
1559  
1560  	/*
1561  	 * To minimise LRU disruption, the caller can indicate that it only
1562  	 * wants to isolate pages it will be able to operate on without
1563  	 * blocking - clean pages for the most part.
1564  	 *
1565  	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
1566  	 * that it is possible to migrate without blocking
1567  	 */
1568  	if (mode & ISOLATE_ASYNC_MIGRATE) {
1569  		/* All the caller can do on PageWriteback is block */
1570  		if (PageWriteback(page))
1571  			return ret;
1572  
1573  		if (PageDirty(page)) {
1574  			struct address_space *mapping;
1575  			bool migrate_dirty;
1576  
1577  			/*
1578  			 * Only pages without mappings or that have a
1579  			 * ->migratepage callback are possible to migrate
1580  			 * without blocking. However, we can be racing with
1581  			 * truncation so it's necessary to lock the page
1582  			 * to stabilise the mapping as truncation holds
1583  			 * the page lock until after the page is removed
1584  			 * from the page cache.
1585  			 */
1586  			if (!trylock_page(page))
1587  				return ret;
1588  
1589  			mapping = page_mapping(page);
1590  			migrate_dirty = !mapping || mapping->a_ops->migratepage;
1591  			unlock_page(page);
1592  			if (!migrate_dirty)
1593  				return ret;
1594  		}
1595  	}
1596  
1597  	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1598  		return ret;
1599  
1600  	if (likely(get_page_unless_zero(page))) {
1601  		/*
1602  		 * Be careful not to clear PageLRU until after we're
1603  		 * sure the page is not being freed elsewhere -- the
1604  		 * page release code relies on it.
1605  		 */
1606  		ClearPageLRU(page);
1607  		ret = 0;
1608  	}
1609  
1610  	return ret;
1611  }
1612  
1613  
1614  /*
1615   * Update LRU sizes after isolating pages. The LRU size updates must
1616   * be complete before mem_cgroup_update_lru_size due to a sanity check.
1617   */
1618  static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1619  			enum lru_list lru, unsigned long *nr_zone_taken)
1620  {
1621  	int zid;
1622  
1623  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1624  		if (!nr_zone_taken[zid])
1625  			continue;
1626  
1627  		update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1628  	}
1629  
1630  }
1631  
1632  /**
1633   * pgdat->lru_lock is heavily contended.  Some of the functions that
1634   * shrink the lists perform better by taking out a batch of pages
1635   * and working on them outside the LRU lock.
1636   *
1637   * For pagecache intensive workloads, this function is the hottest
1638   * spot in the kernel (apart from copy_*_user functions).
1639   *
1640   * Appropriate locks must be held before calling this function.
1641   *
1642   * @nr_to_scan:	The number of eligible pages to look through on the list.
1643   * @lruvec:	The LRU vector to pull pages from.
1644   * @dst:	The temp list to put pages on to.
1645   * @nr_scanned:	The number of pages that were scanned.
1646   * @sc:		The scan_control struct for this reclaim session
1647   * @lru:	LRU list id for isolating
1648   *
1649   * returns how many pages were moved onto *@dst.
1650   */
1651  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1652  		struct lruvec *lruvec, struct list_head *dst,
1653  		unsigned long *nr_scanned, struct scan_control *sc,
1654  		enum lru_list lru)
1655  {
1656  	struct list_head *src = &lruvec->lists[lru];
1657  	unsigned long nr_taken = 0;
1658  	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1659  	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1660  	unsigned long skipped = 0;
1661  	unsigned long scan, total_scan, nr_pages;
1662  	LIST_HEAD(pages_skipped);
1663  	isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
1664  
1665  	total_scan = 0;
1666  	scan = 0;
1667  	while (scan < nr_to_scan && !list_empty(src)) {
1668  		struct page *page;
1669  
1670  		page = lru_to_page(src);
1671  		prefetchw_prev_lru_page(page, src, flags);
1672  
1673  		VM_BUG_ON_PAGE(!PageLRU(page), page);
1674  
1675  		nr_pages = compound_nr(page);
1676  		total_scan += nr_pages;
1677  
1678  		if (page_zonenum(page) > sc->reclaim_idx) {
1679  			list_move(&page->lru, &pages_skipped);
1680  			nr_skipped[page_zonenum(page)] += nr_pages;
1681  			continue;
1682  		}
1683  
1684  		/*
1685  		 * Do not count skipped pages because that makes the function
1686  		 * return with no isolated pages if the LRU mostly contains
1687  		 * ineligible pages.  This causes the VM to not reclaim any
1688  		 * pages, triggering a premature OOM.
1689  		 *
1690  		 * Account all tail pages of THP.  This would not cause
1691  		 * premature OOM since __isolate_lru_page() returns -EBUSY
1692  		 * only when the page is being freed somewhere else.
1693  		 */
1694  		scan += nr_pages;
1695  		switch (__isolate_lru_page(page, mode)) {
1696  		case 0:
1697  			nr_taken += nr_pages;
1698  			nr_zone_taken[page_zonenum(page)] += nr_pages;
1699  			list_move(&page->lru, dst);
1700  			break;
1701  
1702  		case -EBUSY:
1703  			/* else it is being freed elsewhere */
1704  			list_move(&page->lru, src);
1705  			continue;
1706  
1707  		default:
1708  			BUG();
1709  		}
1710  	}
1711  
1712  	/*
1713  	 * Splice any skipped pages to the start of the LRU list. Note that
1714  	 * this disrupts the LRU order when reclaiming for lower zones but
1715  	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
1716  	 * scanning would soon rescan the same pages to skip and put the
1717  	 * system at risk of premature OOM.
1718  	 */
1719  	if (!list_empty(&pages_skipped)) {
1720  		int zid;
1721  
1722  		list_splice(&pages_skipped, src);
1723  		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1724  			if (!nr_skipped[zid])
1725  				continue;
1726  
1727  			__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1728  			skipped += nr_skipped[zid];
1729  		}
1730  	}
1731  	*nr_scanned = total_scan;
1732  	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1733  				    total_scan, skipped, nr_taken, mode, lru);
1734  	update_lru_sizes(lruvec, lru, nr_zone_taken);
1735  	return nr_taken;
1736  }
1737  
1738  /**
1739   * isolate_lru_page - tries to isolate a page from its LRU list
1740   * @page: page to isolate from its LRU list
1741   *
1742   * Isolates a @page from an LRU list, clears PageLRU and adjusts the
1743   * vmstat statistic corresponding to whatever LRU list the page was on.
1744   *
1745   * Returns 0 if the page was removed from an LRU list.
1746   * Returns -EBUSY if the page was not on an LRU list.
1747   *
1748   * The returned page will have PageLRU() cleared.  If it was found on
1749   * the active list, it will have PageActive set.  If it was found on
1750   * the unevictable list, it will have the PageUnevictable bit set. That flag
1751   * may need to be cleared by the caller before letting the page go.
1752   *
1753   * The vmstat statistic corresponding to the list on which the page was
1754   * found will be decremented.
1755   *
1756   * Restrictions:
1757   *
1758   * (1) Must be called with an elevated refcount on the page. This is a
1759   *     fundamentnal difference from isolate_lru_pages (which is called
1760   *     without a stable reference).
1761   * (2) the lru_lock must not be held.
1762   * (3) interrupts must be enabled.
1763   */
1764  int isolate_lru_page(struct page *page)
1765  {
1766  	int ret = -EBUSY;
1767  
1768  	VM_BUG_ON_PAGE(!page_count(page), page);
1769  	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1770  
1771  	if (PageLRU(page)) {
1772  		pg_data_t *pgdat = page_pgdat(page);
1773  		struct lruvec *lruvec;
1774  
1775  		spin_lock_irq(&pgdat->lru_lock);
1776  		lruvec = mem_cgroup_page_lruvec(page, pgdat);
1777  		if (PageLRU(page)) {
1778  			int lru = page_lru(page);
1779  			get_page(page);
1780  			ClearPageLRU(page);
1781  			del_page_from_lru_list(page, lruvec, lru);
1782  			ret = 0;
1783  		}
1784  		spin_unlock_irq(&pgdat->lru_lock);
1785  	}
1786  	return ret;
1787  }
1788  
1789  /*
1790   * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1791   * then get rescheduled. When there are massive number of tasks doing page
1792   * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1793   * the LRU list will go small and be scanned faster than necessary, leading to
1794   * unnecessary swapping, thrashing and OOM.
1795   */
1796  static int too_many_isolated(struct pglist_data *pgdat, int file,
1797  		struct scan_control *sc)
1798  {
1799  	unsigned long inactive, isolated;
1800  
1801  	if (current_is_kswapd())
1802  		return 0;
1803  
1804  	if (!writeback_throttling_sane(sc))
1805  		return 0;
1806  
1807  	if (file) {
1808  		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1809  		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1810  	} else {
1811  		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1812  		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1813  	}
1814  
1815  	/*
1816  	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1817  	 * won't get blocked by normal direct-reclaimers, forming a circular
1818  	 * deadlock.
1819  	 */
1820  	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
1821  		inactive >>= 3;
1822  
1823  	return isolated > inactive;
1824  }
1825  
1826  /*
1827   * This moves pages from @list to corresponding LRU list.
1828   *
1829   * We move them the other way if the page is referenced by one or more
1830   * processes, from rmap.
1831   *
1832   * If the pages are mostly unmapped, the processing is fast and it is
1833   * appropriate to hold zone_lru_lock across the whole operation.  But if
1834   * the pages are mapped, the processing is slow (page_referenced()) so we
1835   * should drop zone_lru_lock around each page.  It's impossible to balance
1836   * this, so instead we remove the pages from the LRU while processing them.
1837   * It is safe to rely on PG_active against the non-LRU pages in here because
1838   * nobody will play with that bit on a non-LRU page.
1839   *
1840   * The downside is that we have to touch page->_refcount against each page.
1841   * But we had to alter page->flags anyway.
1842   *
1843   * Returns the number of pages moved to the given lruvec.
1844   */
1845  
1846  static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
1847  						     struct list_head *list)
1848  {
1849  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1850  	int nr_pages, nr_moved = 0;
1851  	LIST_HEAD(pages_to_free);
1852  	struct page *page;
1853  	enum lru_list lru;
1854  
1855  	while (!list_empty(list)) {
1856  		page = lru_to_page(list);
1857  		VM_BUG_ON_PAGE(PageLRU(page), page);
1858  		if (unlikely(!page_evictable(page))) {
1859  			list_del(&page->lru);
1860  			spin_unlock_irq(&pgdat->lru_lock);
1861  			putback_lru_page(page);
1862  			spin_lock_irq(&pgdat->lru_lock);
1863  			continue;
1864  		}
1865  		lruvec = mem_cgroup_page_lruvec(page, pgdat);
1866  
1867  		SetPageLRU(page);
1868  		lru = page_lru(page);
1869  
1870  		nr_pages = hpage_nr_pages(page);
1871  		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
1872  		list_move(&page->lru, &lruvec->lists[lru]);
1873  
1874  		if (put_page_testzero(page)) {
1875  			__ClearPageLRU(page);
1876  			__ClearPageActive(page);
1877  			del_page_from_lru_list(page, lruvec, lru);
1878  
1879  			if (unlikely(PageCompound(page))) {
1880  				spin_unlock_irq(&pgdat->lru_lock);
1881  				destroy_compound_page(page);
1882  				spin_lock_irq(&pgdat->lru_lock);
1883  			} else
1884  				list_add(&page->lru, &pages_to_free);
1885  		} else {
1886  			nr_moved += nr_pages;
1887  		}
1888  	}
1889  
1890  	/*
1891  	 * To save our caller's stack, now use input list for pages to free.
1892  	 */
1893  	list_splice(&pages_to_free, list);
1894  
1895  	return nr_moved;
1896  }
1897  
1898  /*
1899   * If a kernel thread (such as nfsd for loop-back mounts) services
1900   * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
1901   * In that case we should only throttle if the backing device it is
1902   * writing to is congested.  In other cases it is safe to throttle.
1903   */
1904  static int current_may_throttle(void)
1905  {
1906  	return !(current->flags & PF_LOCAL_THROTTLE) ||
1907  		current->backing_dev_info == NULL ||
1908  		bdi_write_congested(current->backing_dev_info);
1909  }
1910  
1911  /*
1912   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
1913   * of reclaimed pages
1914   */
1915  static noinline_for_stack unsigned long
1916  shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1917  		     struct scan_control *sc, enum lru_list lru)
1918  {
1919  	LIST_HEAD(page_list);
1920  	unsigned long nr_scanned;
1921  	unsigned int nr_reclaimed = 0;
1922  	unsigned long nr_taken;
1923  	struct reclaim_stat stat;
1924  	bool file = is_file_lru(lru);
1925  	enum vm_event_item item;
1926  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1927  	bool stalled = false;
1928  
1929  	while (unlikely(too_many_isolated(pgdat, file, sc))) {
1930  		if (stalled)
1931  			return 0;
1932  
1933  		/* wait a bit for the reclaimer. */
1934  		msleep(100);
1935  		stalled = true;
1936  
1937  		/* We are about to die and free our memory. Return now. */
1938  		if (fatal_signal_pending(current))
1939  			return SWAP_CLUSTER_MAX;
1940  	}
1941  
1942  	lru_add_drain();
1943  
1944  	spin_lock_irq(&pgdat->lru_lock);
1945  
1946  	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1947  				     &nr_scanned, sc, lru);
1948  
1949  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1950  	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
1951  	if (!cgroup_reclaim(sc))
1952  		__count_vm_events(item, nr_scanned);
1953  	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
1954  	__count_vm_events(PGSCAN_ANON + file, nr_scanned);
1955  
1956  	spin_unlock_irq(&pgdat->lru_lock);
1957  
1958  	if (nr_taken == 0)
1959  		return 0;
1960  
1961  	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
1962  				&stat, false);
1963  
1964  	spin_lock_irq(&pgdat->lru_lock);
1965  
1966  	move_pages_to_lru(lruvec, &page_list);
1967  
1968  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1969  	lru_note_cost(lruvec, file, stat.nr_pageout);
1970  	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
1971  	if (!cgroup_reclaim(sc))
1972  		__count_vm_events(item, nr_reclaimed);
1973  	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
1974  	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
1975  
1976  	spin_unlock_irq(&pgdat->lru_lock);
1977  
1978  	mem_cgroup_uncharge_list(&page_list);
1979  	free_unref_page_list(&page_list);
1980  
1981  	/*
1982  	 * If dirty pages are scanned that are not queued for IO, it
1983  	 * implies that flushers are not doing their job. This can
1984  	 * happen when memory pressure pushes dirty pages to the end of
1985  	 * the LRU before the dirty limits are breached and the dirty
1986  	 * data has expired. It can also happen when the proportion of
1987  	 * dirty pages grows not through writes but through memory
1988  	 * pressure reclaiming all the clean cache. And in some cases,
1989  	 * the flushers simply cannot keep up with the allocation
1990  	 * rate. Nudge the flusher threads in case they are asleep.
1991  	 */
1992  	if (stat.nr_unqueued_dirty == nr_taken)
1993  		wakeup_flusher_threads(WB_REASON_VMSCAN);
1994  
1995  	sc->nr.dirty += stat.nr_dirty;
1996  	sc->nr.congested += stat.nr_congested;
1997  	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
1998  	sc->nr.writeback += stat.nr_writeback;
1999  	sc->nr.immediate += stat.nr_immediate;
2000  	sc->nr.taken += nr_taken;
2001  	if (file)
2002  		sc->nr.file_taken += nr_taken;
2003  
2004  	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2005  			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2006  	return nr_reclaimed;
2007  }
2008  
2009  static void shrink_active_list(unsigned long nr_to_scan,
2010  			       struct lruvec *lruvec,
2011  			       struct scan_control *sc,
2012  			       enum lru_list lru)
2013  {
2014  	unsigned long nr_taken;
2015  	unsigned long nr_scanned;
2016  	unsigned long vm_flags;
2017  	LIST_HEAD(l_hold);	/* The pages which were snipped off */
2018  	LIST_HEAD(l_active);
2019  	LIST_HEAD(l_inactive);
2020  	struct page *page;
2021  	unsigned nr_deactivate, nr_activate;
2022  	unsigned nr_rotated = 0;
2023  	int file = is_file_lru(lru);
2024  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2025  
2026  	lru_add_drain();
2027  
2028  	spin_lock_irq(&pgdat->lru_lock);
2029  
2030  	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2031  				     &nr_scanned, sc, lru);
2032  
2033  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2034  
2035  	__count_vm_events(PGREFILL, nr_scanned);
2036  	__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2037  
2038  	spin_unlock_irq(&pgdat->lru_lock);
2039  
2040  	while (!list_empty(&l_hold)) {
2041  		cond_resched();
2042  		page = lru_to_page(&l_hold);
2043  		list_del(&page->lru);
2044  
2045  		if (unlikely(!page_evictable(page))) {
2046  			putback_lru_page(page);
2047  			continue;
2048  		}
2049  
2050  		if (unlikely(buffer_heads_over_limit)) {
2051  			if (page_has_private(page) && trylock_page(page)) {
2052  				if (page_has_private(page))
2053  					try_to_release_page(page, 0);
2054  				unlock_page(page);
2055  			}
2056  		}
2057  
2058  		if (page_referenced(page, 0, sc->target_mem_cgroup,
2059  				    &vm_flags)) {
2060  			/*
2061  			 * Identify referenced, file-backed active pages and
2062  			 * give them one more trip around the active list. So
2063  			 * that executable code get better chances to stay in
2064  			 * memory under moderate memory pressure.  Anon pages
2065  			 * are not likely to be evicted by use-once streaming
2066  			 * IO, plus JVM can create lots of anon VM_EXEC pages,
2067  			 * so we ignore them here.
2068  			 */
2069  			if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
2070  				nr_rotated += hpage_nr_pages(page);
2071  				list_add(&page->lru, &l_active);
2072  				continue;
2073  			}
2074  		}
2075  
2076  		ClearPageActive(page);	/* we are de-activating */
2077  		SetPageWorkingset(page);
2078  		list_add(&page->lru, &l_inactive);
2079  	}
2080  
2081  	/*
2082  	 * Move pages back to the lru list.
2083  	 */
2084  	spin_lock_irq(&pgdat->lru_lock);
2085  
2086  	nr_activate = move_pages_to_lru(lruvec, &l_active);
2087  	nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
2088  	/* Keep all free pages in l_active list */
2089  	list_splice(&l_inactive, &l_active);
2090  
2091  	__count_vm_events(PGDEACTIVATE, nr_deactivate);
2092  	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2093  
2094  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2095  	spin_unlock_irq(&pgdat->lru_lock);
2096  
2097  	mem_cgroup_uncharge_list(&l_active);
2098  	free_unref_page_list(&l_active);
2099  	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2100  			nr_deactivate, nr_rotated, sc->priority, file);
2101  }
2102  
2103  unsigned long reclaim_pages(struct list_head *page_list)
2104  {
2105  	int nid = NUMA_NO_NODE;
2106  	unsigned int nr_reclaimed = 0;
2107  	LIST_HEAD(node_page_list);
2108  	struct reclaim_stat dummy_stat;
2109  	struct page *page;
2110  	struct scan_control sc = {
2111  		.gfp_mask = GFP_KERNEL,
2112  		.priority = DEF_PRIORITY,
2113  		.may_writepage = 1,
2114  		.may_unmap = 1,
2115  		.may_swap = 1,
2116  	};
2117  
2118  	while (!list_empty(page_list)) {
2119  		page = lru_to_page(page_list);
2120  		if (nid == NUMA_NO_NODE) {
2121  			nid = page_to_nid(page);
2122  			INIT_LIST_HEAD(&node_page_list);
2123  		}
2124  
2125  		if (nid == page_to_nid(page)) {
2126  			ClearPageActive(page);
2127  			list_move(&page->lru, &node_page_list);
2128  			continue;
2129  		}
2130  
2131  		nr_reclaimed += shrink_page_list(&node_page_list,
2132  						NODE_DATA(nid),
2133  						&sc, 0,
2134  						&dummy_stat, false);
2135  		while (!list_empty(&node_page_list)) {
2136  			page = lru_to_page(&node_page_list);
2137  			list_del(&page->lru);
2138  			putback_lru_page(page);
2139  		}
2140  
2141  		nid = NUMA_NO_NODE;
2142  	}
2143  
2144  	if (!list_empty(&node_page_list)) {
2145  		nr_reclaimed += shrink_page_list(&node_page_list,
2146  						NODE_DATA(nid),
2147  						&sc, 0,
2148  						&dummy_stat, false);
2149  		while (!list_empty(&node_page_list)) {
2150  			page = lru_to_page(&node_page_list);
2151  			list_del(&page->lru);
2152  			putback_lru_page(page);
2153  		}
2154  	}
2155  
2156  	return nr_reclaimed;
2157  }
2158  
2159  static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2160  				 struct lruvec *lruvec, struct scan_control *sc)
2161  {
2162  	if (is_active_lru(lru)) {
2163  		if (sc->may_deactivate & (1 << is_file_lru(lru)))
2164  			shrink_active_list(nr_to_scan, lruvec, sc, lru);
2165  		else
2166  			sc->skipped_deactivate = 1;
2167  		return 0;
2168  	}
2169  
2170  	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2171  }
2172  
2173  /*
2174   * The inactive anon list should be small enough that the VM never has
2175   * to do too much work.
2176   *
2177   * The inactive file list should be small enough to leave most memory
2178   * to the established workingset on the scan-resistant active list,
2179   * but large enough to avoid thrashing the aggregate readahead window.
2180   *
2181   * Both inactive lists should also be large enough that each inactive
2182   * page has a chance to be referenced again before it is reclaimed.
2183   *
2184   * If that fails and refaulting is observed, the inactive list grows.
2185   *
2186   * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
2187   * on this LRU, maintained by the pageout code. An inactive_ratio
2188   * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
2189   *
2190   * total     target    max
2191   * memory    ratio     inactive
2192   * -------------------------------------
2193   *   10MB       1         5MB
2194   *  100MB       1        50MB
2195   *    1GB       3       250MB
2196   *   10GB      10       0.9GB
2197   *  100GB      31         3GB
2198   *    1TB     101        10GB
2199   *   10TB     320        32GB
2200   */
2201  static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2202  {
2203  	enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2204  	unsigned long inactive, active;
2205  	unsigned long inactive_ratio;
2206  	unsigned long gb;
2207  
2208  	inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2209  	active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2210  
2211  	gb = (inactive + active) >> (30 - PAGE_SHIFT);
2212  	if (gb)
2213  		inactive_ratio = int_sqrt(10 * gb);
2214  	else
2215  		inactive_ratio = 1;
2216  
2217  	return inactive * inactive_ratio < active;
2218  }
2219  
2220  enum scan_balance {
2221  	SCAN_EQUAL,
2222  	SCAN_FRACT,
2223  	SCAN_ANON,
2224  	SCAN_FILE,
2225  };
2226  
2227  /*
2228   * Determine how aggressively the anon and file LRU lists should be
2229   * scanned.  The relative value of each set of LRU lists is determined
2230   * by looking at the fraction of the pages scanned we did rotate back
2231   * onto the active list instead of evict.
2232   *
2233   * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
2234   * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
2235   */
2236  static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
2237  			   unsigned long *nr)
2238  {
2239  	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2240  	unsigned long anon_cost, file_cost, total_cost;
2241  	int swappiness = mem_cgroup_swappiness(memcg);
2242  	u64 fraction[2];
2243  	u64 denominator = 0;	/* gcc */
2244  	enum scan_balance scan_balance;
2245  	unsigned long ap, fp;
2246  	enum lru_list lru;
2247  
2248  	/* If we have no swap space, do not bother scanning anon pages. */
2249  	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2250  		scan_balance = SCAN_FILE;
2251  		goto out;
2252  	}
2253  
2254  	/*
2255  	 * Global reclaim will swap to prevent OOM even with no
2256  	 * swappiness, but memcg users want to use this knob to
2257  	 * disable swapping for individual groups completely when
2258  	 * using the memory controller's swap limit feature would be
2259  	 * too expensive.
2260  	 */
2261  	if (cgroup_reclaim(sc) && !swappiness) {
2262  		scan_balance = SCAN_FILE;
2263  		goto out;
2264  	}
2265  
2266  	/*
2267  	 * Do not apply any pressure balancing cleverness when the
2268  	 * system is close to OOM, scan both anon and file equally
2269  	 * (unless the swappiness setting disagrees with swapping).
2270  	 */
2271  	if (!sc->priority && swappiness) {
2272  		scan_balance = SCAN_EQUAL;
2273  		goto out;
2274  	}
2275  
2276  	/*
2277  	 * If the system is almost out of file pages, force-scan anon.
2278  	 */
2279  	if (sc->file_is_tiny) {
2280  		scan_balance = SCAN_ANON;
2281  		goto out;
2282  	}
2283  
2284  	/*
2285  	 * If there is enough inactive page cache, we do not reclaim
2286  	 * anything from the anonymous working right now.
2287  	 */
2288  	if (sc->cache_trim_mode) {
2289  		scan_balance = SCAN_FILE;
2290  		goto out;
2291  	}
2292  
2293  	scan_balance = SCAN_FRACT;
2294  	/*
2295  	 * Calculate the pressure balance between anon and file pages.
2296  	 *
2297  	 * The amount of pressure we put on each LRU is inversely
2298  	 * proportional to the cost of reclaiming each list, as
2299  	 * determined by the share of pages that are refaulting, times
2300  	 * the relative IO cost of bringing back a swapped out
2301  	 * anonymous page vs reloading a filesystem page (swappiness).
2302  	 *
2303  	 * Although we limit that influence to ensure no list gets
2304  	 * left behind completely: at least a third of the pressure is
2305  	 * applied, before swappiness.
2306  	 *
2307  	 * With swappiness at 100, anon and file have equal IO cost.
2308  	 */
2309  	total_cost = sc->anon_cost + sc->file_cost;
2310  	anon_cost = total_cost + sc->anon_cost;
2311  	file_cost = total_cost + sc->file_cost;
2312  	total_cost = anon_cost + file_cost;
2313  
2314  	ap = swappiness * (total_cost + 1);
2315  	ap /= anon_cost + 1;
2316  
2317  	fp = (200 - swappiness) * (total_cost + 1);
2318  	fp /= file_cost + 1;
2319  
2320  	fraction[0] = ap;
2321  	fraction[1] = fp;
2322  	denominator = ap + fp;
2323  out:
2324  	for_each_evictable_lru(lru) {
2325  		int file = is_file_lru(lru);
2326  		unsigned long lruvec_size;
2327  		unsigned long scan;
2328  		unsigned long protection;
2329  
2330  		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2331  		protection = mem_cgroup_protection(memcg,
2332  						   sc->memcg_low_reclaim);
2333  
2334  		if (protection) {
2335  			/*
2336  			 * Scale a cgroup's reclaim pressure by proportioning
2337  			 * its current usage to its memory.low or memory.min
2338  			 * setting.
2339  			 *
2340  			 * This is important, as otherwise scanning aggression
2341  			 * becomes extremely binary -- from nothing as we
2342  			 * approach the memory protection threshold, to totally
2343  			 * nominal as we exceed it.  This results in requiring
2344  			 * setting extremely liberal protection thresholds. It
2345  			 * also means we simply get no protection at all if we
2346  			 * set it too low, which is not ideal.
2347  			 *
2348  			 * If there is any protection in place, we reduce scan
2349  			 * pressure by how much of the total memory used is
2350  			 * within protection thresholds.
2351  			 *
2352  			 * There is one special case: in the first reclaim pass,
2353  			 * we skip over all groups that are within their low
2354  			 * protection. If that fails to reclaim enough pages to
2355  			 * satisfy the reclaim goal, we come back and override
2356  			 * the best-effort low protection. However, we still
2357  			 * ideally want to honor how well-behaved groups are in
2358  			 * that case instead of simply punishing them all
2359  			 * equally. As such, we reclaim them based on how much
2360  			 * memory they are using, reducing the scan pressure
2361  			 * again by how much of the total memory used is under
2362  			 * hard protection.
2363  			 */
2364  			unsigned long cgroup_size = mem_cgroup_size(memcg);
2365  
2366  			/* Avoid TOCTOU with earlier protection check */
2367  			cgroup_size = max(cgroup_size, protection);
2368  
2369  			scan = lruvec_size - lruvec_size * protection /
2370  				cgroup_size;
2371  
2372  			/*
2373  			 * Minimally target SWAP_CLUSTER_MAX pages to keep
2374  			 * reclaim moving forwards, avoiding decrementing
2375  			 * sc->priority further than desirable.
2376  			 */
2377  			scan = max(scan, SWAP_CLUSTER_MAX);
2378  		} else {
2379  			scan = lruvec_size;
2380  		}
2381  
2382  		scan >>= sc->priority;
2383  
2384  		/*
2385  		 * If the cgroup's already been deleted, make sure to
2386  		 * scrape out the remaining cache.
2387  		 */
2388  		if (!scan && !mem_cgroup_online(memcg))
2389  			scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2390  
2391  		switch (scan_balance) {
2392  		case SCAN_EQUAL:
2393  			/* Scan lists relative to size */
2394  			break;
2395  		case SCAN_FRACT:
2396  			/*
2397  			 * Scan types proportional to swappiness and
2398  			 * their relative recent reclaim efficiency.
2399  			 * Make sure we don't miss the last page on
2400  			 * the offlined memory cgroups because of a
2401  			 * round-off error.
2402  			 */
2403  			scan = mem_cgroup_online(memcg) ?
2404  			       div64_u64(scan * fraction[file], denominator) :
2405  			       DIV64_U64_ROUND_UP(scan * fraction[file],
2406  						  denominator);
2407  			break;
2408  		case SCAN_FILE:
2409  		case SCAN_ANON:
2410  			/* Scan one type exclusively */
2411  			if ((scan_balance == SCAN_FILE) != file)
2412  				scan = 0;
2413  			break;
2414  		default:
2415  			/* Look ma, no brain */
2416  			BUG();
2417  		}
2418  
2419  		nr[lru] = scan;
2420  	}
2421  }
2422  
2423  static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2424  {
2425  	unsigned long nr[NR_LRU_LISTS];
2426  	unsigned long targets[NR_LRU_LISTS];
2427  	unsigned long nr_to_scan;
2428  	enum lru_list lru;
2429  	unsigned long nr_reclaimed = 0;
2430  	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2431  	struct blk_plug plug;
2432  	bool scan_adjusted;
2433  
2434  	get_scan_count(lruvec, sc, nr);
2435  
2436  	/* Record the original scan target for proportional adjustments later */
2437  	memcpy(targets, nr, sizeof(nr));
2438  
2439  	/*
2440  	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
2441  	 * event that can occur when there is little memory pressure e.g.
2442  	 * multiple streaming readers/writers. Hence, we do not abort scanning
2443  	 * when the requested number of pages are reclaimed when scanning at
2444  	 * DEF_PRIORITY on the assumption that the fact we are direct
2445  	 * reclaiming implies that kswapd is not keeping up and it is best to
2446  	 * do a batch of work at once. For memcg reclaim one check is made to
2447  	 * abort proportional reclaim if either the file or anon lru has already
2448  	 * dropped to zero at the first pass.
2449  	 */
2450  	scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
2451  			 sc->priority == DEF_PRIORITY);
2452  
2453  	blk_start_plug(&plug);
2454  	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2455  					nr[LRU_INACTIVE_FILE]) {
2456  		unsigned long nr_anon, nr_file, percentage;
2457  		unsigned long nr_scanned;
2458  
2459  		for_each_evictable_lru(lru) {
2460  			if (nr[lru]) {
2461  				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2462  				nr[lru] -= nr_to_scan;
2463  
2464  				nr_reclaimed += shrink_list(lru, nr_to_scan,
2465  							    lruvec, sc);
2466  			}
2467  		}
2468  
2469  		cond_resched();
2470  
2471  		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2472  			continue;
2473  
2474  		/*
2475  		 * For kswapd and memcg, reclaim at least the number of pages
2476  		 * requested. Ensure that the anon and file LRUs are scanned
2477  		 * proportionally what was requested by get_scan_count(). We
2478  		 * stop reclaiming one LRU and reduce the amount scanning
2479  		 * proportional to the original scan target.
2480  		 */
2481  		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2482  		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2483  
2484  		/*
2485  		 * It's just vindictive to attack the larger once the smaller
2486  		 * has gone to zero.  And given the way we stop scanning the
2487  		 * smaller below, this makes sure that we only make one nudge
2488  		 * towards proportionality once we've got nr_to_reclaim.
2489  		 */
2490  		if (!nr_file || !nr_anon)
2491  			break;
2492  
2493  		if (nr_file > nr_anon) {
2494  			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2495  						targets[LRU_ACTIVE_ANON] + 1;
2496  			lru = LRU_BASE;
2497  			percentage = nr_anon * 100 / scan_target;
2498  		} else {
2499  			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2500  						targets[LRU_ACTIVE_FILE] + 1;
2501  			lru = LRU_FILE;
2502  			percentage = nr_file * 100 / scan_target;
2503  		}
2504  
2505  		/* Stop scanning the smaller of the LRU */
2506  		nr[lru] = 0;
2507  		nr[lru + LRU_ACTIVE] = 0;
2508  
2509  		/*
2510  		 * Recalculate the other LRU scan count based on its original
2511  		 * scan target and the percentage scanning already complete
2512  		 */
2513  		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2514  		nr_scanned = targets[lru] - nr[lru];
2515  		nr[lru] = targets[lru] * (100 - percentage) / 100;
2516  		nr[lru] -= min(nr[lru], nr_scanned);
2517  
2518  		lru += LRU_ACTIVE;
2519  		nr_scanned = targets[lru] - nr[lru];
2520  		nr[lru] = targets[lru] * (100 - percentage) / 100;
2521  		nr[lru] -= min(nr[lru], nr_scanned);
2522  
2523  		scan_adjusted = true;
2524  	}
2525  	blk_finish_plug(&plug);
2526  	sc->nr_reclaimed += nr_reclaimed;
2527  
2528  	/*
2529  	 * Even if we did not try to evict anon pages at all, we want to
2530  	 * rebalance the anon lru active/inactive ratio.
2531  	 */
2532  	if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
2533  		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2534  				   sc, LRU_ACTIVE_ANON);
2535  }
2536  
2537  /* Use reclaim/compaction for costly allocs or under memory pressure */
2538  static bool in_reclaim_compaction(struct scan_control *sc)
2539  {
2540  	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2541  			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2542  			 sc->priority < DEF_PRIORITY - 2))
2543  		return true;
2544  
2545  	return false;
2546  }
2547  
2548  /*
2549   * Reclaim/compaction is used for high-order allocation requests. It reclaims
2550   * order-0 pages before compacting the zone. should_continue_reclaim() returns
2551   * true if more pages should be reclaimed such that when the page allocator
2552   * calls try_to_compact_pages() that it will have enough free pages to succeed.
2553   * It will give up earlier than that if there is difficulty reclaiming pages.
2554   */
2555  static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2556  					unsigned long nr_reclaimed,
2557  					struct scan_control *sc)
2558  {
2559  	unsigned long pages_for_compaction;
2560  	unsigned long inactive_lru_pages;
2561  	int z;
2562  
2563  	/* If not in reclaim/compaction mode, stop */
2564  	if (!in_reclaim_compaction(sc))
2565  		return false;
2566  
2567  	/*
2568  	 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
2569  	 * number of pages that were scanned. This will return to the caller
2570  	 * with the risk reclaim/compaction and the resulting allocation attempt
2571  	 * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
2572  	 * allocations through requiring that the full LRU list has been scanned
2573  	 * first, by assuming that zero delta of sc->nr_scanned means full LRU
2574  	 * scan, but that approximation was wrong, and there were corner cases
2575  	 * where always a non-zero amount of pages were scanned.
2576  	 */
2577  	if (!nr_reclaimed)
2578  		return false;
2579  
2580  	/* If compaction would go ahead or the allocation would succeed, stop */
2581  	for (z = 0; z <= sc->reclaim_idx; z++) {
2582  		struct zone *zone = &pgdat->node_zones[z];
2583  		if (!managed_zone(zone))
2584  			continue;
2585  
2586  		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2587  		case COMPACT_SUCCESS:
2588  		case COMPACT_CONTINUE:
2589  			return false;
2590  		default:
2591  			/* check next zone */
2592  			;
2593  		}
2594  	}
2595  
2596  	/*
2597  	 * If we have not reclaimed enough pages for compaction and the
2598  	 * inactive lists are large enough, continue reclaiming
2599  	 */
2600  	pages_for_compaction = compact_gap(sc->order);
2601  	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2602  	if (get_nr_swap_pages() > 0)
2603  		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2604  
2605  	return inactive_lru_pages > pages_for_compaction;
2606  }
2607  
2608  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
2609  {
2610  	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
2611  	struct mem_cgroup *memcg;
2612  
2613  	memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
2614  	do {
2615  		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
2616  		unsigned long reclaimed;
2617  		unsigned long scanned;
2618  
2619  		switch (mem_cgroup_protected(target_memcg, memcg)) {
2620  		case MEMCG_PROT_MIN:
2621  			/*
2622  			 * Hard protection.
2623  			 * If there is no reclaimable memory, OOM.
2624  			 */
2625  			continue;
2626  		case MEMCG_PROT_LOW:
2627  			/*
2628  			 * Soft protection.
2629  			 * Respect the protection only as long as
2630  			 * there is an unprotected supply
2631  			 * of reclaimable memory from other cgroups.
2632  			 */
2633  			if (!sc->memcg_low_reclaim) {
2634  				sc->memcg_low_skipped = 1;
2635  				continue;
2636  			}
2637  			memcg_memory_event(memcg, MEMCG_LOW);
2638  			break;
2639  		case MEMCG_PROT_NONE:
2640  			/*
2641  			 * All protection thresholds breached. We may
2642  			 * still choose to vary the scan pressure
2643  			 * applied based on by how much the cgroup in
2644  			 * question has exceeded its protection
2645  			 * thresholds (see get_scan_count).
2646  			 */
2647  			break;
2648  		}
2649  
2650  		reclaimed = sc->nr_reclaimed;
2651  		scanned = sc->nr_scanned;
2652  
2653  		shrink_lruvec(lruvec, sc);
2654  
2655  		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
2656  			    sc->priority);
2657  
2658  		/* Record the group's reclaim efficiency */
2659  		vmpressure(sc->gfp_mask, memcg, false,
2660  			   sc->nr_scanned - scanned,
2661  			   sc->nr_reclaimed - reclaimed);
2662  
2663  	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
2664  }
2665  
2666  static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2667  {
2668  	struct reclaim_state *reclaim_state = current->reclaim_state;
2669  	unsigned long nr_reclaimed, nr_scanned;
2670  	struct lruvec *target_lruvec;
2671  	bool reclaimable = false;
2672  	unsigned long file;
2673  
2674  	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
2675  
2676  again:
2677  	memset(&sc->nr, 0, sizeof(sc->nr));
2678  
2679  	nr_reclaimed = sc->nr_reclaimed;
2680  	nr_scanned = sc->nr_scanned;
2681  
2682  	/*
2683  	 * Determine the scan balance between anon and file LRUs.
2684  	 */
2685  	spin_lock_irq(&pgdat->lru_lock);
2686  	sc->anon_cost = target_lruvec->anon_cost;
2687  	sc->file_cost = target_lruvec->file_cost;
2688  	spin_unlock_irq(&pgdat->lru_lock);
2689  
2690  	/*
2691  	 * Target desirable inactive:active list ratios for the anon
2692  	 * and file LRU lists.
2693  	 */
2694  	if (!sc->force_deactivate) {
2695  		unsigned long refaults;
2696  
2697  		if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
2698  			sc->may_deactivate |= DEACTIVATE_ANON;
2699  		else
2700  			sc->may_deactivate &= ~DEACTIVATE_ANON;
2701  
2702  		/*
2703  		 * When refaults are being observed, it means a new
2704  		 * workingset is being established. Deactivate to get
2705  		 * rid of any stale active pages quickly.
2706  		 */
2707  		refaults = lruvec_page_state(target_lruvec,
2708  					     WORKINGSET_ACTIVATE);
2709  		if (refaults != target_lruvec->refaults ||
2710  		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
2711  			sc->may_deactivate |= DEACTIVATE_FILE;
2712  		else
2713  			sc->may_deactivate &= ~DEACTIVATE_FILE;
2714  	} else
2715  		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
2716  
2717  	/*
2718  	 * If we have plenty of inactive file pages that aren't
2719  	 * thrashing, try to reclaim those first before touching
2720  	 * anonymous pages.
2721  	 */
2722  	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
2723  	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
2724  		sc->cache_trim_mode = 1;
2725  	else
2726  		sc->cache_trim_mode = 0;
2727  
2728  	/*
2729  	 * Prevent the reclaimer from falling into the cache trap: as
2730  	 * cache pages start out inactive, every cache fault will tip
2731  	 * the scan balance towards the file LRU.  And as the file LRU
2732  	 * shrinks, so does the window for rotation from references.
2733  	 * This means we have a runaway feedback loop where a tiny
2734  	 * thrashing file LRU becomes infinitely more attractive than
2735  	 * anon pages.  Try to detect this based on file LRU size.
2736  	 */
2737  	if (!cgroup_reclaim(sc)) {
2738  		unsigned long total_high_wmark = 0;
2739  		unsigned long free, anon;
2740  		int z;
2741  
2742  		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2743  		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
2744  			   node_page_state(pgdat, NR_INACTIVE_FILE);
2745  
2746  		for (z = 0; z < MAX_NR_ZONES; z++) {
2747  			struct zone *zone = &pgdat->node_zones[z];
2748  			if (!managed_zone(zone))
2749  				continue;
2750  
2751  			total_high_wmark += high_wmark_pages(zone);
2752  		}
2753  
2754  		/*
2755  		 * Consider anon: if that's low too, this isn't a
2756  		 * runaway file reclaim problem, but rather just
2757  		 * extreme pressure. Reclaim as per usual then.
2758  		 */
2759  		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
2760  
2761  		sc->file_is_tiny =
2762  			file + free <= total_high_wmark &&
2763  			!(sc->may_deactivate & DEACTIVATE_ANON) &&
2764  			anon >> sc->priority;
2765  	}
2766  
2767  	shrink_node_memcgs(pgdat, sc);
2768  
2769  	if (reclaim_state) {
2770  		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2771  		reclaim_state->reclaimed_slab = 0;
2772  	}
2773  
2774  	/* Record the subtree's reclaim efficiency */
2775  	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2776  		   sc->nr_scanned - nr_scanned,
2777  		   sc->nr_reclaimed - nr_reclaimed);
2778  
2779  	if (sc->nr_reclaimed - nr_reclaimed)
2780  		reclaimable = true;
2781  
2782  	if (current_is_kswapd()) {
2783  		/*
2784  		 * If reclaim is isolating dirty pages under writeback,
2785  		 * it implies that the long-lived page allocation rate
2786  		 * is exceeding the page laundering rate. Either the
2787  		 * global limits are not being effective at throttling
2788  		 * processes due to the page distribution throughout
2789  		 * zones or there is heavy usage of a slow backing
2790  		 * device. The only option is to throttle from reclaim
2791  		 * context which is not ideal as there is no guarantee
2792  		 * the dirtying process is throttled in the same way
2793  		 * balance_dirty_pages() manages.
2794  		 *
2795  		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
2796  		 * count the number of pages under pages flagged for
2797  		 * immediate reclaim and stall if any are encountered
2798  		 * in the nr_immediate check below.
2799  		 */
2800  		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
2801  			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
2802  
2803  		/* Allow kswapd to start writing pages during reclaim.*/
2804  		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
2805  			set_bit(PGDAT_DIRTY, &pgdat->flags);
2806  
2807  		/*
2808  		 * If kswapd scans pages marked marked for immediate
2809  		 * reclaim and under writeback (nr_immediate), it
2810  		 * implies that pages are cycling through the LRU
2811  		 * faster than they are written so also forcibly stall.
2812  		 */
2813  		if (sc->nr.immediate)
2814  			congestion_wait(BLK_RW_ASYNC, HZ/10);
2815  	}
2816  
2817  	/*
2818  	 * Tag a node/memcg as congested if all the dirty pages
2819  	 * scanned were backed by a congested BDI and
2820  	 * wait_iff_congested will stall.
2821  	 *
2822  	 * Legacy memcg will stall in page writeback so avoid forcibly
2823  	 * stalling in wait_iff_congested().
2824  	 */
2825  	if ((current_is_kswapd() ||
2826  	     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
2827  	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2828  		set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
2829  
2830  	/*
2831  	 * Stall direct reclaim for IO completions if underlying BDIs
2832  	 * and node is congested. Allow kswapd to continue until it
2833  	 * starts encountering unqueued dirty pages or cycling through
2834  	 * the LRU too quickly.
2835  	 */
2836  	if (!current_is_kswapd() && current_may_throttle() &&
2837  	    !sc->hibernation_mode &&
2838  	    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
2839  		wait_iff_congested(BLK_RW_ASYNC, HZ/10);
2840  
2841  	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2842  				    sc))
2843  		goto again;
2844  
2845  	/*
2846  	 * Kswapd gives up on balancing particular nodes after too
2847  	 * many failures to reclaim anything from them and goes to
2848  	 * sleep. On reclaim progress, reset the failure counter. A
2849  	 * successful direct reclaim run will revive a dormant kswapd.
2850  	 */
2851  	if (reclaimable)
2852  		pgdat->kswapd_failures = 0;
2853  }
2854  
2855  /*
2856   * Returns true if compaction should go ahead for a costly-order request, or
2857   * the allocation would already succeed without compaction. Return false if we
2858   * should reclaim first.
2859   */
2860  static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2861  {
2862  	unsigned long watermark;
2863  	enum compact_result suitable;
2864  
2865  	suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2866  	if (suitable == COMPACT_SUCCESS)
2867  		/* Allocation should succeed already. Don't reclaim. */
2868  		return true;
2869  	if (suitable == COMPACT_SKIPPED)
2870  		/* Compaction cannot yet proceed. Do reclaim. */
2871  		return false;
2872  
2873  	/*
2874  	 * Compaction is already possible, but it takes time to run and there
2875  	 * are potentially other callers using the pages just freed. So proceed
2876  	 * with reclaim to make a buffer of free pages available to give
2877  	 * compaction a reasonable chance of completing and allocating the page.
2878  	 * Note that we won't actually reclaim the whole buffer in one attempt
2879  	 * as the target watermark in should_continue_reclaim() is lower. But if
2880  	 * we are already above the high+gap watermark, don't reclaim at all.
2881  	 */
2882  	watermark = high_wmark_pages(zone) + compact_gap(sc->order);
2883  
2884  	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2885  }
2886  
2887  /*
2888   * This is the direct reclaim path, for page-allocating processes.  We only
2889   * try to reclaim pages from zones which will satisfy the caller's allocation
2890   * request.
2891   *
2892   * If a zone is deemed to be full of pinned pages then just give it a light
2893   * scan then give up on it.
2894   */
2895  static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2896  {
2897  	struct zoneref *z;
2898  	struct zone *zone;
2899  	unsigned long nr_soft_reclaimed;
2900  	unsigned long nr_soft_scanned;
2901  	gfp_t orig_mask;
2902  	pg_data_t *last_pgdat = NULL;
2903  
2904  	/*
2905  	 * If the number of buffer_heads in the machine exceeds the maximum
2906  	 * allowed level, force direct reclaim to scan the highmem zone as
2907  	 * highmem pages could be pinning lowmem pages storing buffer_heads
2908  	 */
2909  	orig_mask = sc->gfp_mask;
2910  	if (buffer_heads_over_limit) {
2911  		sc->gfp_mask |= __GFP_HIGHMEM;
2912  		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
2913  	}
2914  
2915  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
2916  					sc->reclaim_idx, sc->nodemask) {
2917  		/*
2918  		 * Take care memory controller reclaiming has small influence
2919  		 * to global LRU.
2920  		 */
2921  		if (!cgroup_reclaim(sc)) {
2922  			if (!cpuset_zone_allowed(zone,
2923  						 GFP_KERNEL | __GFP_HARDWALL))
2924  				continue;
2925  
2926  			/*
2927  			 * If we already have plenty of memory free for
2928  			 * compaction in this zone, don't free any more.
2929  			 * Even though compaction is invoked for any
2930  			 * non-zero order, only frequent costly order
2931  			 * reclamation is disruptive enough to become a
2932  			 * noticeable problem, like transparent huge
2933  			 * page allocations.
2934  			 */
2935  			if (IS_ENABLED(CONFIG_COMPACTION) &&
2936  			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2937  			    compaction_ready(zone, sc)) {
2938  				sc->compaction_ready = true;
2939  				continue;
2940  			}
2941  
2942  			/*
2943  			 * Shrink each node in the zonelist once. If the
2944  			 * zonelist is ordered by zone (not the default) then a
2945  			 * node may be shrunk multiple times but in that case
2946  			 * the user prefers lower zones being preserved.
2947  			 */
2948  			if (zone->zone_pgdat == last_pgdat)
2949  				continue;
2950  
2951  			/*
2952  			 * This steals pages from memory cgroups over softlimit
2953  			 * and returns the number of reclaimed pages and
2954  			 * scanned pages. This works for global memory pressure
2955  			 * and balancing, not for a memcg's limit.
2956  			 */
2957  			nr_soft_scanned = 0;
2958  			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
2959  						sc->order, sc->gfp_mask,
2960  						&nr_soft_scanned);
2961  			sc->nr_reclaimed += nr_soft_reclaimed;
2962  			sc->nr_scanned += nr_soft_scanned;
2963  			/* need some check for avoid more shrink_zone() */
2964  		}
2965  
2966  		/* See comment about same check for global reclaim above */
2967  		if (zone->zone_pgdat == last_pgdat)
2968  			continue;
2969  		last_pgdat = zone->zone_pgdat;
2970  		shrink_node(zone->zone_pgdat, sc);
2971  	}
2972  
2973  	/*
2974  	 * Restore to original mask to avoid the impact on the caller if we
2975  	 * promoted it to __GFP_HIGHMEM.
2976  	 */
2977  	sc->gfp_mask = orig_mask;
2978  }
2979  
2980  static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
2981  {
2982  	struct lruvec *target_lruvec;
2983  	unsigned long refaults;
2984  
2985  	target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
2986  	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
2987  	target_lruvec->refaults = refaults;
2988  }
2989  
2990  /*
2991   * This is the main entry point to direct page reclaim.
2992   *
2993   * If a full scan of the inactive list fails to free enough memory then we
2994   * are "out of memory" and something needs to be killed.
2995   *
2996   * If the caller is !__GFP_FS then the probability of a failure is reasonably
2997   * high - the zone may be full of dirty or under-writeback pages, which this
2998   * caller can't do much about.  We kick the writeback threads and take explicit
2999   * naps in the hope that some of these pages can be written.  But if the
3000   * allocating task holds filesystem locks which prevent writeout this might not
3001   * work, and the allocation attempt will fail.
3002   *
3003   * returns:	0, if no pages reclaimed
3004   * 		else, the number of pages reclaimed
3005   */
3006  static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3007  					  struct scan_control *sc)
3008  {
3009  	int initial_priority = sc->priority;
3010  	pg_data_t *last_pgdat;
3011  	struct zoneref *z;
3012  	struct zone *zone;
3013  retry:
3014  	delayacct_freepages_start();
3015  
3016  	if (!cgroup_reclaim(sc))
3017  		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
3018  
3019  	do {
3020  		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
3021  				sc->priority);
3022  		sc->nr_scanned = 0;
3023  		shrink_zones(zonelist, sc);
3024  
3025  		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3026  			break;
3027  
3028  		if (sc->compaction_ready)
3029  			break;
3030  
3031  		/*
3032  		 * If we're getting trouble reclaiming, start doing
3033  		 * writepage even in laptop mode.
3034  		 */
3035  		if (sc->priority < DEF_PRIORITY - 2)
3036  			sc->may_writepage = 1;
3037  	} while (--sc->priority >= 0);
3038  
3039  	last_pgdat = NULL;
3040  	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3041  					sc->nodemask) {
3042  		if (zone->zone_pgdat == last_pgdat)
3043  			continue;
3044  		last_pgdat = zone->zone_pgdat;
3045  
3046  		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3047  
3048  		if (cgroup_reclaim(sc)) {
3049  			struct lruvec *lruvec;
3050  
3051  			lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
3052  						   zone->zone_pgdat);
3053  			clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
3054  		}
3055  	}
3056  
3057  	delayacct_freepages_end();
3058  
3059  	if (sc->nr_reclaimed)
3060  		return sc->nr_reclaimed;
3061  
3062  	/* Aborted reclaim to try compaction? don't OOM, then */
3063  	if (sc->compaction_ready)
3064  		return 1;
3065  
3066  	/*
3067  	 * We make inactive:active ratio decisions based on the node's
3068  	 * composition of memory, but a restrictive reclaim_idx or a
3069  	 * memory.low cgroup setting can exempt large amounts of
3070  	 * memory from reclaim. Neither of which are very common, so
3071  	 * instead of doing costly eligibility calculations of the
3072  	 * entire cgroup subtree up front, we assume the estimates are
3073  	 * good, and retry with forcible deactivation if that fails.
3074  	 */
3075  	if (sc->skipped_deactivate) {
3076  		sc->priority = initial_priority;
3077  		sc->force_deactivate = 1;
3078  		sc->skipped_deactivate = 0;
3079  		goto retry;
3080  	}
3081  
3082  	/* Untapped cgroup reserves?  Don't OOM, retry. */
3083  	if (sc->memcg_low_skipped) {
3084  		sc->priority = initial_priority;
3085  		sc->force_deactivate = 0;
3086  		sc->memcg_low_reclaim = 1;
3087  		sc->memcg_low_skipped = 0;
3088  		goto retry;
3089  	}
3090  
3091  	return 0;
3092  }
3093  
3094  static bool allow_direct_reclaim(pg_data_t *pgdat)
3095  {
3096  	struct zone *zone;
3097  	unsigned long pfmemalloc_reserve = 0;
3098  	unsigned long free_pages = 0;
3099  	int i;
3100  	bool wmark_ok;
3101  
3102  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3103  		return true;
3104  
3105  	for (i = 0; i <= ZONE_NORMAL; i++) {
3106  		zone = &pgdat->node_zones[i];
3107  		if (!managed_zone(zone))
3108  			continue;
3109  
3110  		if (!zone_reclaimable_pages(zone))
3111  			continue;
3112  
3113  		pfmemalloc_reserve += min_wmark_pages(zone);
3114  		free_pages += zone_page_state(zone, NR_FREE_PAGES);
3115  	}
3116  
3117  	/* If there are no reserves (unexpected config) then do not throttle */
3118  	if (!pfmemalloc_reserve)
3119  		return true;
3120  
3121  	wmark_ok = free_pages > pfmemalloc_reserve / 2;
3122  
3123  	/* kswapd must be awake if processes are being throttled */
3124  	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3125  		if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
3126  			WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
3127  
3128  		wake_up_interruptible(&pgdat->kswapd_wait);
3129  	}
3130  
3131  	return wmark_ok;
3132  }
3133  
3134  /*
3135   * Throttle direct reclaimers if backing storage is backed by the network
3136   * and the PFMEMALLOC reserve for the preferred node is getting dangerously
3137   * depleted. kswapd will continue to make progress and wake the processes
3138   * when the low watermark is reached.
3139   *
3140   * Returns true if a fatal signal was delivered during throttling. If this
3141   * happens, the page allocator should not consider triggering the OOM killer.
3142   */
3143  static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3144  					nodemask_t *nodemask)
3145  {
3146  	struct zoneref *z;
3147  	struct zone *zone;
3148  	pg_data_t *pgdat = NULL;
3149  
3150  	/*
3151  	 * Kernel threads should not be throttled as they may be indirectly
3152  	 * responsible for cleaning pages necessary for reclaim to make forward
3153  	 * progress. kjournald for example may enter direct reclaim while
3154  	 * committing a transaction where throttling it could forcing other
3155  	 * processes to block on log_wait_commit().
3156  	 */
3157  	if (current->flags & PF_KTHREAD)
3158  		goto out;
3159  
3160  	/*
3161  	 * If a fatal signal is pending, this process should not throttle.
3162  	 * It should return quickly so it can exit and free its memory
3163  	 */
3164  	if (fatal_signal_pending(current))
3165  		goto out;
3166  
3167  	/*
3168  	 * Check if the pfmemalloc reserves are ok by finding the first node
3169  	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
3170  	 * GFP_KERNEL will be required for allocating network buffers when
3171  	 * swapping over the network so ZONE_HIGHMEM is unusable.
3172  	 *
3173  	 * Throttling is based on the first usable node and throttled processes
3174  	 * wait on a queue until kswapd makes progress and wakes them. There
3175  	 * is an affinity then between processes waking up and where reclaim
3176  	 * progress has been made assuming the process wakes on the same node.
3177  	 * More importantly, processes running on remote nodes will not compete
3178  	 * for remote pfmemalloc reserves and processes on different nodes
3179  	 * should make reasonable progress.
3180  	 */
3181  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
3182  					gfp_zone(gfp_mask), nodemask) {
3183  		if (zone_idx(zone) > ZONE_NORMAL)
3184  			continue;
3185  
3186  		/* Throttle based on the first usable node */
3187  		pgdat = zone->zone_pgdat;
3188  		if (allow_direct_reclaim(pgdat))
3189  			goto out;
3190  		break;
3191  	}
3192  
3193  	/* If no zone was usable by the allocation flags then do not throttle */
3194  	if (!pgdat)
3195  		goto out;
3196  
3197  	/* Account for the throttling */
3198  	count_vm_event(PGSCAN_DIRECT_THROTTLE);
3199  
3200  	/*
3201  	 * If the caller cannot enter the filesystem, it's possible that it
3202  	 * is due to the caller holding an FS lock or performing a journal
3203  	 * transaction in the case of a filesystem like ext[3|4]. In this case,
3204  	 * it is not safe to block on pfmemalloc_wait as kswapd could be
3205  	 * blocked waiting on the same lock. Instead, throttle for up to a
3206  	 * second before continuing.
3207  	 */
3208  	if (!(gfp_mask & __GFP_FS)) {
3209  		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3210  			allow_direct_reclaim(pgdat), HZ);
3211  
3212  		goto check_pending;
3213  	}
3214  
3215  	/* Throttle until kswapd wakes the process */
3216  	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3217  		allow_direct_reclaim(pgdat));
3218  
3219  check_pending:
3220  	if (fatal_signal_pending(current))
3221  		return true;
3222  
3223  out:
3224  	return false;
3225  }
3226  
3227  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3228  				gfp_t gfp_mask, nodemask_t *nodemask)
3229  {
3230  	unsigned long nr_reclaimed;
3231  	struct scan_control sc = {
3232  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
3233  		.gfp_mask = current_gfp_context(gfp_mask),
3234  		.reclaim_idx = gfp_zone(gfp_mask),
3235  		.order = order,
3236  		.nodemask = nodemask,
3237  		.priority = DEF_PRIORITY,
3238  		.may_writepage = !laptop_mode,
3239  		.may_unmap = 1,
3240  		.may_swap = 1,
3241  	};
3242  
3243  	/*
3244  	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
3245  	 * Confirm they are large enough for max values.
3246  	 */
3247  	BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3248  	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3249  	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3250  
3251  	/*
3252  	 * Do not enter reclaim if fatal signal was delivered while throttled.
3253  	 * 1 is returned so that the page allocator does not OOM kill at this
3254  	 * point.
3255  	 */
3256  	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3257  		return 1;
3258  
3259  	set_task_reclaim_state(current, &sc.reclaim_state);
3260  	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
3261  
3262  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3263  
3264  	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3265  	set_task_reclaim_state(current, NULL);
3266  
3267  	return nr_reclaimed;
3268  }
3269  
3270  #ifdef CONFIG_MEMCG
3271  
3272  /* Only used by soft limit reclaim. Do not reuse for anything else. */
3273  unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3274  						gfp_t gfp_mask, bool noswap,
3275  						pg_data_t *pgdat,
3276  						unsigned long *nr_scanned)
3277  {
3278  	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
3279  	struct scan_control sc = {
3280  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
3281  		.target_mem_cgroup = memcg,
3282  		.may_writepage = !laptop_mode,
3283  		.may_unmap = 1,
3284  		.reclaim_idx = MAX_NR_ZONES - 1,
3285  		.may_swap = !noswap,
3286  	};
3287  
3288  	WARN_ON_ONCE(!current->reclaim_state);
3289  
3290  	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3291  			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3292  
3293  	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3294  						      sc.gfp_mask);
3295  
3296  	/*
3297  	 * NOTE: Although we can get the priority field, using it
3298  	 * here is not a good idea, since it limits the pages we can scan.
3299  	 * if we don't reclaim here, the shrink_node from balance_pgdat
3300  	 * will pick up pages from other mem cgroup's as well. We hack
3301  	 * the priority and make it zero.
3302  	 */
3303  	shrink_lruvec(lruvec, &sc);
3304  
3305  	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3306  
3307  	*nr_scanned = sc.nr_scanned;
3308  
3309  	return sc.nr_reclaimed;
3310  }
3311  
3312  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3313  					   unsigned long nr_pages,
3314  					   gfp_t gfp_mask,
3315  					   bool may_swap)
3316  {
3317  	unsigned long nr_reclaimed;
3318  	unsigned long pflags;
3319  	unsigned int noreclaim_flag;
3320  	struct scan_control sc = {
3321  		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3322  		.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3323  				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3324  		.reclaim_idx = MAX_NR_ZONES - 1,
3325  		.target_mem_cgroup = memcg,
3326  		.priority = DEF_PRIORITY,
3327  		.may_writepage = !laptop_mode,
3328  		.may_unmap = 1,
3329  		.may_swap = may_swap,
3330  	};
3331  	/*
3332  	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
3333  	 * equal pressure on all the nodes. This is based on the assumption that
3334  	 * the reclaim does not bail out early.
3335  	 */
3336  	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3337  
3338  	set_task_reclaim_state(current, &sc.reclaim_state);
3339  
3340  	trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
3341  
3342  	psi_memstall_enter(&pflags);
3343  	noreclaim_flag = memalloc_noreclaim_save();
3344  
3345  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3346  
3347  	memalloc_noreclaim_restore(noreclaim_flag);
3348  	psi_memstall_leave(&pflags);
3349  
3350  	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3351  	set_task_reclaim_state(current, NULL);
3352  
3353  	return nr_reclaimed;
3354  }
3355  #endif
3356  
3357  static void age_active_anon(struct pglist_data *pgdat,
3358  				struct scan_control *sc)
3359  {
3360  	struct mem_cgroup *memcg;
3361  	struct lruvec *lruvec;
3362  
3363  	if (!total_swap_pages)
3364  		return;
3365  
3366  	lruvec = mem_cgroup_lruvec(NULL, pgdat);
3367  	if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
3368  		return;
3369  
3370  	memcg = mem_cgroup_iter(NULL, NULL, NULL);
3371  	do {
3372  		lruvec = mem_cgroup_lruvec(memcg, pgdat);
3373  		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3374  				   sc, LRU_ACTIVE_ANON);
3375  		memcg = mem_cgroup_iter(NULL, memcg, NULL);
3376  	} while (memcg);
3377  }
3378  
3379  static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
3380  {
3381  	int i;
3382  	struct zone *zone;
3383  
3384  	/*
3385  	 * Check for watermark boosts top-down as the higher zones
3386  	 * are more likely to be boosted. Both watermarks and boosts
3387  	 * should not be checked at the time time as reclaim would
3388  	 * start prematurely when there is no boosting and a lower
3389  	 * zone is balanced.
3390  	 */
3391  	for (i = highest_zoneidx; i >= 0; i--) {
3392  		zone = pgdat->node_zones + i;
3393  		if (!managed_zone(zone))
3394  			continue;
3395  
3396  		if (zone->watermark_boost)
3397  			return true;
3398  	}
3399  
3400  	return false;
3401  }
3402  
3403  /*
3404   * Returns true if there is an eligible zone balanced for the request order
3405   * and highest_zoneidx
3406   */
3407  static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
3408  {
3409  	int i;
3410  	unsigned long mark = -1;
3411  	struct zone *zone;
3412  
3413  	/*
3414  	 * Check watermarks bottom-up as lower zones are more likely to
3415  	 * meet watermarks.
3416  	 */
3417  	for (i = 0; i <= highest_zoneidx; i++) {
3418  		zone = pgdat->node_zones + i;
3419  
3420  		if (!managed_zone(zone))
3421  			continue;
3422  
3423  		mark = high_wmark_pages(zone);
3424  		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
3425  			return true;
3426  	}
3427  
3428  	/*
3429  	 * If a node has no populated zone within highest_zoneidx, it does not
3430  	 * need balancing by definition. This can happen if a zone-restricted
3431  	 * allocation tries to wake a remote kswapd.
3432  	 */
3433  	if (mark == -1)
3434  		return true;
3435  
3436  	return false;
3437  }
3438  
3439  /* Clear pgdat state for congested, dirty or under writeback. */
3440  static void clear_pgdat_congested(pg_data_t *pgdat)
3441  {
3442  	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
3443  
3444  	clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
3445  	clear_bit(PGDAT_DIRTY, &pgdat->flags);
3446  	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3447  }
3448  
3449  /*
3450   * Prepare kswapd for sleeping. This verifies that there are no processes
3451   * waiting in throttle_direct_reclaim() and that watermarks have been met.
3452   *
3453   * Returns true if kswapd is ready to sleep
3454   */
3455  static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
3456  				int highest_zoneidx)
3457  {
3458  	/*
3459  	 * The throttled processes are normally woken up in balance_pgdat() as
3460  	 * soon as allow_direct_reclaim() is true. But there is a potential
3461  	 * race between when kswapd checks the watermarks and a process gets
3462  	 * throttled. There is also a potential race if processes get
3463  	 * throttled, kswapd wakes, a large process exits thereby balancing the
3464  	 * zones, which causes kswapd to exit balance_pgdat() before reaching
3465  	 * the wake up checks. If kswapd is going to sleep, no process should
3466  	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
3467  	 * the wake up is premature, processes will wake kswapd and get
3468  	 * throttled again. The difference from wake ups in balance_pgdat() is
3469  	 * that here we are under prepare_to_wait().
3470  	 */
3471  	if (waitqueue_active(&pgdat->pfmemalloc_wait))
3472  		wake_up_all(&pgdat->pfmemalloc_wait);
3473  
3474  	/* Hopeless node, leave it to direct reclaim */
3475  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3476  		return true;
3477  
3478  	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
3479  		clear_pgdat_congested(pgdat);
3480  		return true;
3481  	}
3482  
3483  	return false;
3484  }
3485  
3486  /*
3487   * kswapd shrinks a node of pages that are at or below the highest usable
3488   * zone that is currently unbalanced.
3489   *
3490   * Returns true if kswapd scanned at least the requested number of pages to
3491   * reclaim or if the lack of progress was due to pages under writeback.
3492   * This is used to determine if the scanning priority needs to be raised.
3493   */
3494  static bool kswapd_shrink_node(pg_data_t *pgdat,
3495  			       struct scan_control *sc)
3496  {
3497  	struct zone *zone;
3498  	int z;
3499  
3500  	/* Reclaim a number of pages proportional to the number of zones */
3501  	sc->nr_to_reclaim = 0;
3502  	for (z = 0; z <= sc->reclaim_idx; z++) {
3503  		zone = pgdat->node_zones + z;
3504  		if (!managed_zone(zone))
3505  			continue;
3506  
3507  		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3508  	}
3509  
3510  	/*
3511  	 * Historically care was taken to put equal pressure on all zones but
3512  	 * now pressure is applied based on node LRU order.
3513  	 */
3514  	shrink_node(pgdat, sc);
3515  
3516  	/*
3517  	 * Fragmentation may mean that the system cannot be rebalanced for
3518  	 * high-order allocations. If twice the allocation size has been
3519  	 * reclaimed then recheck watermarks only at order-0 to prevent
3520  	 * excessive reclaim. Assume that a process requested a high-order
3521  	 * can direct reclaim/compact.
3522  	 */
3523  	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3524  		sc->order = 0;
3525  
3526  	return sc->nr_scanned >= sc->nr_to_reclaim;
3527  }
3528  
3529  /*
3530   * For kswapd, balance_pgdat() will reclaim pages across a node from zones
3531   * that are eligible for use by the caller until at least one zone is
3532   * balanced.
3533   *
3534   * Returns the order kswapd finished reclaiming at.
3535   *
3536   * kswapd scans the zones in the highmem->normal->dma direction.  It skips
3537   * zones which have free_pages > high_wmark_pages(zone), but once a zone is
3538   * found to have free_pages <= high_wmark_pages(zone), any page in that zone
3539   * or lower is eligible for reclaim until at least one usable zone is
3540   * balanced.
3541   */
3542  static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
3543  {
3544  	int i;
3545  	unsigned long nr_soft_reclaimed;
3546  	unsigned long nr_soft_scanned;
3547  	unsigned long pflags;
3548  	unsigned long nr_boost_reclaim;
3549  	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
3550  	bool boosted;
3551  	struct zone *zone;
3552  	struct scan_control sc = {
3553  		.gfp_mask = GFP_KERNEL,
3554  		.order = order,
3555  		.may_unmap = 1,
3556  	};
3557  
3558  	set_task_reclaim_state(current, &sc.reclaim_state);
3559  	psi_memstall_enter(&pflags);
3560  	__fs_reclaim_acquire();
3561  
3562  	count_vm_event(PAGEOUTRUN);
3563  
3564  	/*
3565  	 * Account for the reclaim boost. Note that the zone boost is left in
3566  	 * place so that parallel allocations that are near the watermark will
3567  	 * stall or direct reclaim until kswapd is finished.
3568  	 */
3569  	nr_boost_reclaim = 0;
3570  	for (i = 0; i <= highest_zoneidx; i++) {
3571  		zone = pgdat->node_zones + i;
3572  		if (!managed_zone(zone))
3573  			continue;
3574  
3575  		nr_boost_reclaim += zone->watermark_boost;
3576  		zone_boosts[i] = zone->watermark_boost;
3577  	}
3578  	boosted = nr_boost_reclaim;
3579  
3580  restart:
3581  	sc.priority = DEF_PRIORITY;
3582  	do {
3583  		unsigned long nr_reclaimed = sc.nr_reclaimed;
3584  		bool raise_priority = true;
3585  		bool balanced;
3586  		bool ret;
3587  
3588  		sc.reclaim_idx = highest_zoneidx;
3589  
3590  		/*
3591  		 * If the number of buffer_heads exceeds the maximum allowed
3592  		 * then consider reclaiming from all zones. This has a dual
3593  		 * purpose -- on 64-bit systems it is expected that
3594  		 * buffer_heads are stripped during active rotation. On 32-bit
3595  		 * systems, highmem pages can pin lowmem memory and shrinking
3596  		 * buffers can relieve lowmem pressure. Reclaim may still not
3597  		 * go ahead if all eligible zones for the original allocation
3598  		 * request are balanced to avoid excessive reclaim from kswapd.
3599  		 */
3600  		if (buffer_heads_over_limit) {
3601  			for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3602  				zone = pgdat->node_zones + i;
3603  				if (!managed_zone(zone))
3604  					continue;
3605  
3606  				sc.reclaim_idx = i;
3607  				break;
3608  			}
3609  		}
3610  
3611  		/*
3612  		 * If the pgdat is imbalanced then ignore boosting and preserve
3613  		 * the watermarks for a later time and restart. Note that the
3614  		 * zone watermarks will be still reset at the end of balancing
3615  		 * on the grounds that the normal reclaim should be enough to
3616  		 * re-evaluate if boosting is required when kswapd next wakes.
3617  		 */
3618  		balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
3619  		if (!balanced && nr_boost_reclaim) {
3620  			nr_boost_reclaim = 0;
3621  			goto restart;
3622  		}
3623  
3624  		/*
3625  		 * If boosting is not active then only reclaim if there are no
3626  		 * eligible zones. Note that sc.reclaim_idx is not used as
3627  		 * buffer_heads_over_limit may have adjusted it.
3628  		 */
3629  		if (!nr_boost_reclaim && balanced)
3630  			goto out;
3631  
3632  		/* Limit the priority of boosting to avoid reclaim writeback */
3633  		if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
3634  			raise_priority = false;
3635  
3636  		/*
3637  		 * Do not writeback or swap pages for boosted reclaim. The
3638  		 * intent is to relieve pressure not issue sub-optimal IO
3639  		 * from reclaim context. If no pages are reclaimed, the
3640  		 * reclaim will be aborted.
3641  		 */
3642  		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
3643  		sc.may_swap = !nr_boost_reclaim;
3644  
3645  		/*
3646  		 * Do some background aging of the anon list, to give
3647  		 * pages a chance to be referenced before reclaiming. All
3648  		 * pages are rotated regardless of classzone as this is
3649  		 * about consistent aging.
3650  		 */
3651  		age_active_anon(pgdat, &sc);
3652  
3653  		/*
3654  		 * If we're getting trouble reclaiming, start doing writepage
3655  		 * even in laptop mode.
3656  		 */
3657  		if (sc.priority < DEF_PRIORITY - 2)
3658  			sc.may_writepage = 1;
3659  
3660  		/* Call soft limit reclaim before calling shrink_node. */
3661  		sc.nr_scanned = 0;
3662  		nr_soft_scanned = 0;
3663  		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3664  						sc.gfp_mask, &nr_soft_scanned);
3665  		sc.nr_reclaimed += nr_soft_reclaimed;
3666  
3667  		/*
3668  		 * There should be no need to raise the scanning priority if
3669  		 * enough pages are already being scanned that that high
3670  		 * watermark would be met at 100% efficiency.
3671  		 */
3672  		if (kswapd_shrink_node(pgdat, &sc))
3673  			raise_priority = false;
3674  
3675  		/*
3676  		 * If the low watermark is met there is no need for processes
3677  		 * to be throttled on pfmemalloc_wait as they should not be
3678  		 * able to safely make forward progress. Wake them
3679  		 */
3680  		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3681  				allow_direct_reclaim(pgdat))
3682  			wake_up_all(&pgdat->pfmemalloc_wait);
3683  
3684  		/* Check if kswapd should be suspending */
3685  		__fs_reclaim_release();
3686  		ret = try_to_freeze();
3687  		__fs_reclaim_acquire();
3688  		if (ret || kthread_should_stop())
3689  			break;
3690  
3691  		/*
3692  		 * Raise priority if scanning rate is too low or there was no
3693  		 * progress in reclaiming pages
3694  		 */
3695  		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3696  		nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
3697  
3698  		/*
3699  		 * If reclaim made no progress for a boost, stop reclaim as
3700  		 * IO cannot be queued and it could be an infinite loop in
3701  		 * extreme circumstances.
3702  		 */
3703  		if (nr_boost_reclaim && !nr_reclaimed)
3704  			break;
3705  
3706  		if (raise_priority || !nr_reclaimed)
3707  			sc.priority--;
3708  	} while (sc.priority >= 1);
3709  
3710  	if (!sc.nr_reclaimed)
3711  		pgdat->kswapd_failures++;
3712  
3713  out:
3714  	/* If reclaim was boosted, account for the reclaim done in this pass */
3715  	if (boosted) {
3716  		unsigned long flags;
3717  
3718  		for (i = 0; i <= highest_zoneidx; i++) {
3719  			if (!zone_boosts[i])
3720  				continue;
3721  
3722  			/* Increments are under the zone lock */
3723  			zone = pgdat->node_zones + i;
3724  			spin_lock_irqsave(&zone->lock, flags);
3725  			zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
3726  			spin_unlock_irqrestore(&zone->lock, flags);
3727  		}
3728  
3729  		/*
3730  		 * As there is now likely space, wakeup kcompact to defragment
3731  		 * pageblocks.
3732  		 */
3733  		wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
3734  	}
3735  
3736  	snapshot_refaults(NULL, pgdat);
3737  	__fs_reclaim_release();
3738  	psi_memstall_leave(&pflags);
3739  	set_task_reclaim_state(current, NULL);
3740  
3741  	/*
3742  	 * Return the order kswapd stopped reclaiming at as
3743  	 * prepare_kswapd_sleep() takes it into account. If another caller
3744  	 * entered the allocator slow path while kswapd was awake, order will
3745  	 * remain at the higher level.
3746  	 */
3747  	return sc.order;
3748  }
3749  
3750  /*
3751   * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
3752   * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
3753   * not a valid index then either kswapd runs for first time or kswapd couldn't
3754   * sleep after previous reclaim attempt (node is still unbalanced). In that
3755   * case return the zone index of the previous kswapd reclaim cycle.
3756   */
3757  static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
3758  					   enum zone_type prev_highest_zoneidx)
3759  {
3760  	enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
3761  
3762  	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
3763  }
3764  
3765  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3766  				unsigned int highest_zoneidx)
3767  {
3768  	long remaining = 0;
3769  	DEFINE_WAIT(wait);
3770  
3771  	if (freezing(current) || kthread_should_stop())
3772  		return;
3773  
3774  	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3775  
3776  	/*
3777  	 * Try to sleep for a short interval. Note that kcompactd will only be
3778  	 * woken if it is possible to sleep for a short interval. This is
3779  	 * deliberate on the assumption that if reclaim cannot keep an
3780  	 * eligible zone balanced that it's also unlikely that compaction will
3781  	 * succeed.
3782  	 */
3783  	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
3784  		/*
3785  		 * Compaction records what page blocks it recently failed to
3786  		 * isolate pages from and skips them in the future scanning.
3787  		 * When kswapd is going to sleep, it is reasonable to assume
3788  		 * that pages and compaction may succeed so reset the cache.
3789  		 */
3790  		reset_isolation_suitable(pgdat);
3791  
3792  		/*
3793  		 * We have freed the memory, now we should compact it to make
3794  		 * allocation of the requested order possible.
3795  		 */
3796  		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
3797  
3798  		remaining = schedule_timeout(HZ/10);
3799  
3800  		/*
3801  		 * If woken prematurely then reset kswapd_highest_zoneidx and
3802  		 * order. The values will either be from a wakeup request or
3803  		 * the previous request that slept prematurely.
3804  		 */
3805  		if (remaining) {
3806  			WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
3807  					kswapd_highest_zoneidx(pgdat,
3808  							highest_zoneidx));
3809  
3810  			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
3811  				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
3812  		}
3813  
3814  		finish_wait(&pgdat->kswapd_wait, &wait);
3815  		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3816  	}
3817  
3818  	/*
3819  	 * After a short sleep, check if it was a premature sleep. If not, then
3820  	 * go fully to sleep until explicitly woken up.
3821  	 */
3822  	if (!remaining &&
3823  	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
3824  		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3825  
3826  		/*
3827  		 * vmstat counters are not perfectly accurate and the estimated
3828  		 * value for counters such as NR_FREE_PAGES can deviate from the
3829  		 * true value by nr_online_cpus * threshold. To avoid the zone
3830  		 * watermarks being breached while under pressure, we reduce the
3831  		 * per-cpu vmstat threshold while kswapd is awake and restore
3832  		 * them before going back to sleep.
3833  		 */
3834  		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3835  
3836  		if (!kthread_should_stop())
3837  			schedule();
3838  
3839  		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3840  	} else {
3841  		if (remaining)
3842  			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3843  		else
3844  			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3845  	}
3846  	finish_wait(&pgdat->kswapd_wait, &wait);
3847  }
3848  
3849  /*
3850   * The background pageout daemon, started as a kernel thread
3851   * from the init process.
3852   *
3853   * This basically trickles out pages so that we have _some_
3854   * free memory available even if there is no other activity
3855   * that frees anything up. This is needed for things like routing
3856   * etc, where we otherwise might have all activity going on in
3857   * asynchronous contexts that cannot page things out.
3858   *
3859   * If there are applications that are active memory-allocators
3860   * (most normal use), this basically shouldn't matter.
3861   */
3862  static int kswapd(void *p)
3863  {
3864  	unsigned int alloc_order, reclaim_order;
3865  	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
3866  	pg_data_t *pgdat = (pg_data_t*)p;
3867  	struct task_struct *tsk = current;
3868  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3869  
3870  	if (!cpumask_empty(cpumask))
3871  		set_cpus_allowed_ptr(tsk, cpumask);
3872  
3873  	/*
3874  	 * Tell the memory management that we're a "memory allocator",
3875  	 * and that if we need more memory we should get access to it
3876  	 * regardless (see "__alloc_pages()"). "kswapd" should
3877  	 * never get caught in the normal page freeing logic.
3878  	 *
3879  	 * (Kswapd normally doesn't need memory anyway, but sometimes
3880  	 * you need a small amount of memory in order to be able to
3881  	 * page out something else, and this flag essentially protects
3882  	 * us from recursively trying to free more memory as we're
3883  	 * trying to free the first piece of memory in the first place).
3884  	 */
3885  	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3886  	set_freezable();
3887  
3888  	WRITE_ONCE(pgdat->kswapd_order, 0);
3889  	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
3890  	for ( ; ; ) {
3891  		bool ret;
3892  
3893  		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
3894  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
3895  							highest_zoneidx);
3896  
3897  kswapd_try_sleep:
3898  		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3899  					highest_zoneidx);
3900  
3901  		/* Read the new order and highest_zoneidx */
3902  		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
3903  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
3904  							highest_zoneidx);
3905  		WRITE_ONCE(pgdat->kswapd_order, 0);
3906  		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
3907  
3908  		ret = try_to_freeze();
3909  		if (kthread_should_stop())
3910  			break;
3911  
3912  		/*
3913  		 * We can speed up thawing tasks if we don't call balance_pgdat
3914  		 * after returning from the refrigerator
3915  		 */
3916  		if (ret)
3917  			continue;
3918  
3919  		/*
3920  		 * Reclaim begins at the requested order but if a high-order
3921  		 * reclaim fails then kswapd falls back to reclaiming for
3922  		 * order-0. If that happens, kswapd will consider sleeping
3923  		 * for the order it finished reclaiming at (reclaim_order)
3924  		 * but kcompactd is woken to compact for the original
3925  		 * request (alloc_order).
3926  		 */
3927  		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
3928  						alloc_order);
3929  		reclaim_order = balance_pgdat(pgdat, alloc_order,
3930  						highest_zoneidx);
3931  		if (reclaim_order < alloc_order)
3932  			goto kswapd_try_sleep;
3933  	}
3934  
3935  	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3936  
3937  	return 0;
3938  }
3939  
3940  /*
3941   * A zone is low on free memory or too fragmented for high-order memory.  If
3942   * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
3943   * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
3944   * has failed or is not needed, still wake up kcompactd if only compaction is
3945   * needed.
3946   */
3947  void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3948  		   enum zone_type highest_zoneidx)
3949  {
3950  	pg_data_t *pgdat;
3951  	enum zone_type curr_idx;
3952  
3953  	if (!managed_zone(zone))
3954  		return;
3955  
3956  	if (!cpuset_zone_allowed(zone, gfp_flags))
3957  		return;
3958  
3959  	pgdat = zone->zone_pgdat;
3960  	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
3961  
3962  	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
3963  		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
3964  
3965  	if (READ_ONCE(pgdat->kswapd_order) < order)
3966  		WRITE_ONCE(pgdat->kswapd_order, order);
3967  
3968  	if (!waitqueue_active(&pgdat->kswapd_wait))
3969  		return;
3970  
3971  	/* Hopeless node, leave it to direct reclaim if possible */
3972  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3973  	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
3974  	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
3975  		/*
3976  		 * There may be plenty of free memory available, but it's too
3977  		 * fragmented for high-order allocations.  Wake up kcompactd
3978  		 * and rely on compaction_suitable() to determine if it's
3979  		 * needed.  If it fails, it will defer subsequent attempts to
3980  		 * ratelimit its work.
3981  		 */
3982  		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
3983  			wakeup_kcompactd(pgdat, order, highest_zoneidx);
3984  		return;
3985  	}
3986  
3987  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
3988  				      gfp_flags);
3989  	wake_up_interruptible(&pgdat->kswapd_wait);
3990  }
3991  
3992  #ifdef CONFIG_HIBERNATION
3993  /*
3994   * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
3995   * freed pages.
3996   *
3997   * Rather than trying to age LRUs the aim is to preserve the overall
3998   * LRU order by reclaiming preferentially
3999   * inactive > active > active referenced > active mapped
4000   */
4001  unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
4002  {
4003  	struct scan_control sc = {
4004  		.nr_to_reclaim = nr_to_reclaim,
4005  		.gfp_mask = GFP_HIGHUSER_MOVABLE,
4006  		.reclaim_idx = MAX_NR_ZONES - 1,
4007  		.priority = DEF_PRIORITY,
4008  		.may_writepage = 1,
4009  		.may_unmap = 1,
4010  		.may_swap = 1,
4011  		.hibernation_mode = 1,
4012  	};
4013  	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
4014  	unsigned long nr_reclaimed;
4015  	unsigned int noreclaim_flag;
4016  
4017  	fs_reclaim_acquire(sc.gfp_mask);
4018  	noreclaim_flag = memalloc_noreclaim_save();
4019  	set_task_reclaim_state(current, &sc.reclaim_state);
4020  
4021  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4022  
4023  	set_task_reclaim_state(current, NULL);
4024  	memalloc_noreclaim_restore(noreclaim_flag);
4025  	fs_reclaim_release(sc.gfp_mask);
4026  
4027  	return nr_reclaimed;
4028  }
4029  #endif /* CONFIG_HIBERNATION */
4030  
4031  /*
4032   * This kswapd start function will be called by init and node-hot-add.
4033   * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
4034   */
4035  int kswapd_run(int nid)
4036  {
4037  	pg_data_t *pgdat = NODE_DATA(nid);
4038  	int ret = 0;
4039  
4040  	if (pgdat->kswapd)
4041  		return 0;
4042  
4043  	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
4044  	if (IS_ERR(pgdat->kswapd)) {
4045  		/* failure at boot is fatal */
4046  		BUG_ON(system_state < SYSTEM_RUNNING);
4047  		pr_err("Failed to start kswapd on node %d\n", nid);
4048  		ret = PTR_ERR(pgdat->kswapd);
4049  		pgdat->kswapd = NULL;
4050  	}
4051  	return ret;
4052  }
4053  
4054  /*
4055   * Called by memory hotplug when all memory in a node is offlined.  Caller must
4056   * hold mem_hotplug_begin/end().
4057   */
4058  void kswapd_stop(int nid)
4059  {
4060  	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
4061  
4062  	if (kswapd) {
4063  		kthread_stop(kswapd);
4064  		NODE_DATA(nid)->kswapd = NULL;
4065  	}
4066  }
4067  
4068  static int __init kswapd_init(void)
4069  {
4070  	int nid;
4071  
4072  	swap_setup();
4073  	for_each_node_state(nid, N_MEMORY)
4074   		kswapd_run(nid);
4075  	return 0;
4076  }
4077  
4078  module_init(kswapd_init)
4079  
4080  #ifdef CONFIG_NUMA
4081  /*
4082   * Node reclaim mode
4083   *
4084   * If non-zero call node_reclaim when the number of free pages falls below
4085   * the watermarks.
4086   */
4087  int node_reclaim_mode __read_mostly;
4088  
4089  #define RECLAIM_WRITE (1<<0)	/* Writeout pages during reclaim */
4090  #define RECLAIM_UNMAP (1<<1)	/* Unmap pages during reclaim */
4091  
4092  /*
4093   * Priority for NODE_RECLAIM. This determines the fraction of pages
4094   * of a node considered for each zone_reclaim. 4 scans 1/16th of
4095   * a zone.
4096   */
4097  #define NODE_RECLAIM_PRIORITY 4
4098  
4099  /*
4100   * Percentage of pages in a zone that must be unmapped for node_reclaim to
4101   * occur.
4102   */
4103  int sysctl_min_unmapped_ratio = 1;
4104  
4105  /*
4106   * If the number of slab pages in a zone grows beyond this percentage then
4107   * slab reclaim needs to occur.
4108   */
4109  int sysctl_min_slab_ratio = 5;
4110  
4111  static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4112  {
4113  	unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4114  	unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4115  		node_page_state(pgdat, NR_ACTIVE_FILE);
4116  
4117  	/*
4118  	 * It's possible for there to be more file mapped pages than
4119  	 * accounted for by the pages on the file LRU lists because
4120  	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
4121  	 */
4122  	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4123  }
4124  
4125  /* Work out how many page cache pages we can reclaim in this reclaim_mode */
4126  static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4127  {
4128  	unsigned long nr_pagecache_reclaimable;
4129  	unsigned long delta = 0;
4130  
4131  	/*
4132  	 * If RECLAIM_UNMAP is set, then all file pages are considered
4133  	 * potentially reclaimable. Otherwise, we have to worry about
4134  	 * pages like swapcache and node_unmapped_file_pages() provides
4135  	 * a better estimate
4136  	 */
4137  	if (node_reclaim_mode & RECLAIM_UNMAP)
4138  		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4139  	else
4140  		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4141  
4142  	/* If we can't clean pages, remove dirty pages from consideration */
4143  	if (!(node_reclaim_mode & RECLAIM_WRITE))
4144  		delta += node_page_state(pgdat, NR_FILE_DIRTY);
4145  
4146  	/* Watch for any possible underflows due to delta */
4147  	if (unlikely(delta > nr_pagecache_reclaimable))
4148  		delta = nr_pagecache_reclaimable;
4149  
4150  	return nr_pagecache_reclaimable - delta;
4151  }
4152  
4153  /*
4154   * Try to free up some pages from this node through reclaim.
4155   */
4156  static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4157  {
4158  	/* Minimum pages needed in order to stay on node */
4159  	const unsigned long nr_pages = 1 << order;
4160  	struct task_struct *p = current;
4161  	unsigned int noreclaim_flag;
4162  	struct scan_control sc = {
4163  		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4164  		.gfp_mask = current_gfp_context(gfp_mask),
4165  		.order = order,
4166  		.priority = NODE_RECLAIM_PRIORITY,
4167  		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4168  		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4169  		.may_swap = 1,
4170  		.reclaim_idx = gfp_zone(gfp_mask),
4171  	};
4172  
4173  	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
4174  					   sc.gfp_mask);
4175  
4176  	cond_resched();
4177  	fs_reclaim_acquire(sc.gfp_mask);
4178  	/*
4179  	 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
4180  	 * and we also need to be able to write out pages for RECLAIM_WRITE
4181  	 * and RECLAIM_UNMAP.
4182  	 */
4183  	noreclaim_flag = memalloc_noreclaim_save();
4184  	p->flags |= PF_SWAPWRITE;
4185  	set_task_reclaim_state(p, &sc.reclaim_state);
4186  
4187  	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4188  		/*
4189  		 * Free memory by calling shrink node with increasing
4190  		 * priorities until we have enough memory freed.
4191  		 */
4192  		do {
4193  			shrink_node(pgdat, &sc);
4194  		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4195  	}
4196  
4197  	set_task_reclaim_state(p, NULL);
4198  	current->flags &= ~PF_SWAPWRITE;
4199  	memalloc_noreclaim_restore(noreclaim_flag);
4200  	fs_reclaim_release(sc.gfp_mask);
4201  
4202  	trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
4203  
4204  	return sc.nr_reclaimed >= nr_pages;
4205  }
4206  
4207  int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4208  {
4209  	int ret;
4210  
4211  	/*
4212  	 * Node reclaim reclaims unmapped file backed pages and
4213  	 * slab pages if we are over the defined limits.
4214  	 *
4215  	 * A small portion of unmapped file backed pages is needed for
4216  	 * file I/O otherwise pages read by file I/O will be immediately
4217  	 * thrown out if the node is overallocated. So we do not reclaim
4218  	 * if less than a specified percentage of the node is used by
4219  	 * unmapped file backed pages.
4220  	 */
4221  	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4222  	    node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
4223  		return NODE_RECLAIM_FULL;
4224  
4225  	/*
4226  	 * Do not scan if the allocation should not be delayed.
4227  	 */
4228  	if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4229  		return NODE_RECLAIM_NOSCAN;
4230  
4231  	/*
4232  	 * Only run node reclaim on the local node or on nodes that do not
4233  	 * have associated processors. This will favor the local processor
4234  	 * over remote processors and spread off node memory allocations
4235  	 * as wide as possible.
4236  	 */
4237  	if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4238  		return NODE_RECLAIM_NOSCAN;
4239  
4240  	if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4241  		return NODE_RECLAIM_NOSCAN;
4242  
4243  	ret = __node_reclaim(pgdat, gfp_mask, order);
4244  	clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4245  
4246  	if (!ret)
4247  		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4248  
4249  	return ret;
4250  }
4251  #endif
4252  
4253  /**
4254   * check_move_unevictable_pages - check pages for evictability and move to
4255   * appropriate zone lru list
4256   * @pvec: pagevec with lru pages to check
4257   *
4258   * Checks pages for evictability, if an evictable page is in the unevictable
4259   * lru list, moves it to the appropriate evictable lru list. This function
4260   * should be only used for lru pages.
4261   */
4262  void check_move_unevictable_pages(struct pagevec *pvec)
4263  {
4264  	struct lruvec *lruvec;
4265  	struct pglist_data *pgdat = NULL;
4266  	int pgscanned = 0;
4267  	int pgrescued = 0;
4268  	int i;
4269  
4270  	for (i = 0; i < pvec->nr; i++) {
4271  		struct page *page = pvec->pages[i];
4272  		struct pglist_data *pagepgdat = page_pgdat(page);
4273  
4274  		pgscanned++;
4275  		if (pagepgdat != pgdat) {
4276  			if (pgdat)
4277  				spin_unlock_irq(&pgdat->lru_lock);
4278  			pgdat = pagepgdat;
4279  			spin_lock_irq(&pgdat->lru_lock);
4280  		}
4281  		lruvec = mem_cgroup_page_lruvec(page, pgdat);
4282  
4283  		if (!PageLRU(page) || !PageUnevictable(page))
4284  			continue;
4285  
4286  		if (page_evictable(page)) {
4287  			enum lru_list lru = page_lru_base_type(page);
4288  
4289  			VM_BUG_ON_PAGE(PageActive(page), page);
4290  			ClearPageUnevictable(page);
4291  			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
4292  			add_page_to_lru_list(page, lruvec, lru);
4293  			pgrescued++;
4294  		}
4295  	}
4296  
4297  	if (pgdat) {
4298  		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4299  		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4300  		spin_unlock_irq(&pgdat->lru_lock);
4301  	}
4302  }
4303  EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
4304