xref: /openbmc/linux/mm/swap.c (revision 6a6d6681ac1add9655b7ab5dd0b46b54aeb1b44f)
1  /*
2   *  linux/mm/swap.c
3   *
4   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5   */
6  
7  /*
8   * This file contains the default values for the operation of the
9   * Linux VM subsystem. Fine-tuning documentation can be found in
10   * Documentation/sysctl/vm.txt.
11   * Started 18.12.91
12   * Swap aging added 23.2.95, Stephen Tweedie.
13   * Buffermem limits added 12.3.98, Rik van Riel.
14   */
15  
16  #include <linux/mm.h>
17  #include <linux/sched.h>
18  #include <linux/kernel_stat.h>
19  #include <linux/swap.h>
20  #include <linux/mman.h>
21  #include <linux/pagemap.h>
22  #include <linux/pagevec.h>
23  #include <linux/init.h>
24  #include <linux/export.h>
25  #include <linux/mm_inline.h>
26  #include <linux/percpu_counter.h>
27  #include <linux/memremap.h>
28  #include <linux/percpu.h>
29  #include <linux/cpu.h>
30  #include <linux/notifier.h>
31  #include <linux/backing-dev.h>
32  #include <linux/memcontrol.h>
33  #include <linux/gfp.h>
34  #include <linux/uio.h>
35  #include <linux/hugetlb.h>
36  #include <linux/page_idle.h>
37  
38  #include "internal.h"
39  
40  #define CREATE_TRACE_POINTS
41  #include <trace/events/pagemap.h>
42  
43  /* How many pages do we try to swap or page in/out together? */
44  int page_cluster;
45  
46  static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
47  static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
48  static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
49  static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
50  #ifdef CONFIG_SMP
51  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
52  #endif
53  
54  /*
55   * This path almost never happens for VM activity - pages are normally
56   * freed via pagevecs.  But it gets used by networking.
57   */
58  static void __page_cache_release(struct page *page)
59  {
60  	if (PageLRU(page)) {
61  		struct zone *zone = page_zone(page);
62  		struct lruvec *lruvec;
63  		unsigned long flags;
64  
65  		spin_lock_irqsave(zone_lru_lock(zone), flags);
66  		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
67  		VM_BUG_ON_PAGE(!PageLRU(page), page);
68  		__ClearPageLRU(page);
69  		del_page_from_lru_list(page, lruvec, page_off_lru(page));
70  		spin_unlock_irqrestore(zone_lru_lock(zone), flags);
71  	}
72  	__ClearPageWaiters(page);
73  	mem_cgroup_uncharge(page);
74  }
75  
76  static void __put_single_page(struct page *page)
77  {
78  	__page_cache_release(page);
79  	free_unref_page(page);
80  }
81  
82  static void __put_compound_page(struct page *page)
83  {
84  	compound_page_dtor *dtor;
85  
86  	/*
87  	 * __page_cache_release() is supposed to be called for thp, not for
88  	 * hugetlb. This is because a hugetlb page never has PageLRU set
89  	 * (it is never placed on any LRU list) and no memcg routines should
90  	 * be called for hugetlb (it has a separate hugetlb_cgroup).
91  	 */
92  	if (!PageHuge(page))
93  		__page_cache_release(page);
94  	dtor = get_compound_page_dtor(page);
95  	(*dtor)(page);
96  }
97  
98  void __put_page(struct page *page)
99  {
100  	if (is_zone_device_page(page)) {
101  		put_dev_pagemap(page->pgmap);
102  
103  		/*
104  		 * The page belongs to the device that created pgmap. Do
105  		 * not return it to page allocator.
106  		 */
107  		return;
108  	}
109  
110  	if (unlikely(PageCompound(page)))
111  		__put_compound_page(page);
112  	else
113  		__put_single_page(page);
114  }
115  EXPORT_SYMBOL(__put_page);
116  
117  /**
118   * put_pages_list() - release a list of pages
119   * @pages: list of pages threaded on page->lru
120   *
121   * Release a list of pages which are strung together on page->lru.  Currently
122   * used by read_cache_pages() and related error recovery code.
123   */
124  void put_pages_list(struct list_head *pages)
125  {
126  	while (!list_empty(pages)) {
127  		struct page *victim;
128  
129  		victim = list_entry(pages->prev, struct page, lru);
130  		list_del(&victim->lru);
131  		put_page(victim);
132  	}
133  }
134  EXPORT_SYMBOL(put_pages_list);
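
/*
 * Hypothetical usage sketch, not part of the original file: a caller that
 * holds a reference on each page in a private array can thread the pages on
 * their ->lru links (as read_cache_pages() does) and drop all references in
 * one call.  The helper name example_put_page_array_as_list() is illustrative
 * only.
 */
static void example_put_page_array_as_list(struct page **pages, int nr)
{
	LIST_HEAD(list);
	int i;

	/* Build the list; the pages must not be on any LRU-managed list. */
	for (i = 0; i < nr; i++)
		list_add(&pages[i]->lru, &list);

	/* Drops one reference per page and leaves the list empty. */
	put_pages_list(&list);
}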
135  
136  /*
137   * get_kernel_pages() - pin kernel pages in memory
138   * @kiov:	An array of struct kvec structures
139   * @nr_segs:	number of segments to pin
140   * @write:	pinning for read/write, currently ignored
141   * @pages:	array that receives pointers to the pages pinned.
142   *		Should be at least nr_segs long.
143   *
144   * Returns the number of pages pinned, which may be fewer than the number
145   * requested (pinning stops at the first segment whose length is not
146   * exactly PAGE_SIZE). If @nr_segs is 0 or negative, returns 0. Each page
147   * returned must be released with a put_page() call when it is finished with.
148   */
149  int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
150  		struct page **pages)
151  {
152  	int seg;
153  
154  	for (seg = 0; seg < nr_segs; seg++) {
155  		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
156  			return seg;
157  
158  		pages[seg] = kmap_to_page(kiov[seg].iov_base);
159  		get_page(pages[seg]);
160  	}
161  
162  	return seg;
163  }
164  EXPORT_SYMBOL_GPL(get_kernel_pages);
165  
166  /*
167   * get_kernel_page() - pin a kernel page in memory
168   * @start:	starting kernel address
169   * @write:	pinning for read/write, currently ignored
170   * @pages:	array that receives a pointer to the page pinned.
171   *		Must have space for at least one page.
172   *
173   * Returns 1 if the page was pinned and 0 otherwise. The page returned
174   * must be released with a put_page() call when the caller is finished
175   * with it.
176   */
177  int get_kernel_page(unsigned long start, int write, struct page **pages)
178  {
179  	const struct kvec kiov = {
180  		.iov_base = (void *)start,
181  		.iov_len = PAGE_SIZE
182  	};
183  
184  	return get_kernel_pages(&kiov, 1, write, pages);
185  }
186  EXPORT_SYMBOL_GPL(get_kernel_page);
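
/*
 * Hypothetical usage sketch, not part of the original file: pinning two
 * page-aligned, PAGE_SIZE-sized kernel buffers.  The buffer parameters and
 * the helper name are illustrative; every page returned must eventually be
 * released with put_page().
 */
static int example_pin_two_kernel_buffers(void *buf0, void *buf1,
					  struct page **pages)
{
	const struct kvec kiov[2] = {
		{ .iov_base = buf0, .iov_len = PAGE_SIZE },
		{ .iov_base = buf1, .iov_len = PAGE_SIZE },
	};

	/* Returns how many of the two segments were actually pinned. */
	return get_kernel_pages(kiov, 2, 0, pages);
}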
187  
188  static void pagevec_lru_move_fn(struct pagevec *pvec,
189  	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
190  	void *arg)
191  {
192  	int i;
193  	struct pglist_data *pgdat = NULL;
194  	struct lruvec *lruvec;
195  	unsigned long flags = 0;
196  
197  	for (i = 0; i < pagevec_count(pvec); i++) {
198  		struct page *page = pvec->pages[i];
199  		struct pglist_data *pagepgdat = page_pgdat(page);
200  
201  		if (pagepgdat != pgdat) {
202  			if (pgdat)
203  				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
204  			pgdat = pagepgdat;
205  			spin_lock_irqsave(&pgdat->lru_lock, flags);
206  		}
207  
208  		lruvec = mem_cgroup_page_lruvec(page, pgdat);
209  		(*move_fn)(page, lruvec, arg);
210  	}
211  	if (pgdat)
212  		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
213  	release_pages(pvec->pages, pvec->nr);
214  	pagevec_reinit(pvec);
215  }
216  
217  static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
218  				 void *arg)
219  {
220  	int *pgmoved = arg;
221  
222  	if (PageLRU(page) && !PageUnevictable(page)) {
223  		del_page_from_lru_list(page, lruvec, page_lru(page));
224  		ClearPageActive(page);
225  		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
226  		(*pgmoved)++;
227  	}
228  }
229  
230  /*
231   * pagevec_move_tail() must be called with IRQ disabled.
232   * Otherwise this may cause nasty races.
233   */
234  static void pagevec_move_tail(struct pagevec *pvec)
235  {
236  	int pgmoved = 0;
237  
238  	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
239  	__count_vm_events(PGROTATED, pgmoved);
240  }
241  
242  /*
243   * Writeback is about to end against a page which has been marked for immediate
244   * reclaim.  If it still appears to be reclaimable, move it to the tail of the
245   * inactive list.
246   */
247  void rotate_reclaimable_page(struct page *page)
248  {
249  	if (!PageLocked(page) && !PageDirty(page) &&
250  	    !PageUnevictable(page) && PageLRU(page)) {
251  		struct pagevec *pvec;
252  		unsigned long flags;
253  
254  		get_page(page);
255  		local_irq_save(flags);
256  		pvec = this_cpu_ptr(&lru_rotate_pvecs);
257  		if (!pagevec_add(pvec, page) || PageCompound(page))
258  			pagevec_move_tail(pvec);
259  		local_irq_restore(flags);
260  	}
261  }
262  
263  static void update_page_reclaim_stat(struct lruvec *lruvec,
264  				     int file, int rotated)
265  {
266  	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
267  
268  	reclaim_stat->recent_scanned[file]++;
269  	if (rotated)
270  		reclaim_stat->recent_rotated[file]++;
271  }
272  
273  static void __activate_page(struct page *page, struct lruvec *lruvec,
274  			    void *arg)
275  {
276  	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
277  		int file = page_is_file_cache(page);
278  		int lru = page_lru_base_type(page);
279  
280  		del_page_from_lru_list(page, lruvec, lru);
281  		SetPageActive(page);
282  		lru += LRU_ACTIVE;
283  		add_page_to_lru_list(page, lruvec, lru);
284  		trace_mm_lru_activate(page);
285  
286  		__count_vm_event(PGACTIVATE);
287  		update_page_reclaim_stat(lruvec, file, 1);
288  	}
289  }
290  
291  #ifdef CONFIG_SMP
292  static void activate_page_drain(int cpu)
293  {
294  	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
295  
296  	if (pagevec_count(pvec))
297  		pagevec_lru_move_fn(pvec, __activate_page, NULL);
298  }
299  
300  static bool need_activate_page_drain(int cpu)
301  {
302  	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
303  }
304  
305  void activate_page(struct page *page)
306  {
307  	page = compound_head(page);
308  	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
309  		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
310  
311  		get_page(page);
312  		if (!pagevec_add(pvec, page) || PageCompound(page))
313  			pagevec_lru_move_fn(pvec, __activate_page, NULL);
314  		put_cpu_var(activate_page_pvecs);
315  	}
316  }
317  
318  #else
319  static inline void activate_page_drain(int cpu)
320  {
321  }
322  
323  static bool need_activate_page_drain(int cpu)
324  {
325  	return false;
326  }
327  
328  void activate_page(struct page *page)
329  {
330  	struct zone *zone = page_zone(page);
331  
332  	page = compound_head(page);
333  	spin_lock_irq(zone_lru_lock(zone));
334  	__activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
335  	spin_unlock_irq(zone_lru_lock(zone));
336  }
337  #endif
338  
339  static void __lru_cache_activate_page(struct page *page)
340  {
341  	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
342  	int i;
343  
344  	/*
345  	 * Search backwards on the optimistic assumption that the page being
346  	 * activated has just been added to this pagevec. Note that only
347  	 * the local pagevec is examined as a !PageLRU page could be in the
348  	 * process of being released, reclaimed, migrated or on a remote
349  	 * pagevec that is currently being drained. Furthermore, marking
350  	 * a remote pagevec's page PageActive potentially hits a race where
351  	 * a page is marked PageActive just after it is added to the inactive
352   * list, causing accounting errors and BUG_ON checks to trigger.
353  	 */
354  	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
355  		struct page *pagevec_page = pvec->pages[i];
356  
357  		if (pagevec_page == page) {
358  			SetPageActive(page);
359  			break;
360  		}
361  	}
362  
363  	put_cpu_var(lru_add_pvec);
364  }
365  
366  /*
367   * Mark a page as having seen activity.
368   *
369   * inactive,unreferenced	->	inactive,referenced
370   * inactive,referenced		->	active,unreferenced
371   * active,unreferenced		->	active,referenced
372   *
373   * When a newly allocated page is not yet visible, so safe for non-atomic ops,
374   * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
375   */
376  void mark_page_accessed(struct page *page)
377  {
378  	page = compound_head(page);
379  	if (!PageActive(page) && !PageUnevictable(page) &&
380  			PageReferenced(page)) {
381  
382  		/*
383  		 * If the page is on the LRU, queue it for activation via
384  		 * activate_page_pvecs. Otherwise, assume the page is on a
385  		 * pagevec, mark it active and it'll be moved to the active
386  		 * LRU on the next drain.
387  		 */
388  		if (PageLRU(page))
389  			activate_page(page);
390  		else
391  			__lru_cache_activate_page(page);
392  		ClearPageReferenced(page);
393  		if (page_is_file_cache(page))
394  			workingset_activation(page);
395  	} else if (!PageReferenced(page)) {
396  		SetPageReferenced(page);
397  	}
398  	if (page_is_idle(page))
399  		clear_page_idle(page);
400  }
401  EXPORT_SYMBOL(mark_page_accessed);
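
/*
 * Hypothetical illustration, not part of the original file: two consecutive
 * accesses walk a page through the state table above.  The helper name is
 * illustrative only.
 */
static void example_touch_page_twice(struct page *page)
{
	/* First touch: inactive,unreferenced -> inactive,referenced. */
	mark_page_accessed(page);

	/* Second touch: inactive,referenced -> active,unreferenced. */
	mark_page_accessed(page);
}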
402  
403  static void __lru_cache_add(struct page *page)
404  {
405  	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
406  
407  	get_page(page);
408  	if (!pagevec_add(pvec, page) || PageCompound(page))
409  		__pagevec_lru_add(pvec);
410  	put_cpu_var(lru_add_pvec);
411  }
412  
413  /**
414   * lru_cache_add_anon - add an anonymous page to the LRU lists
415   * @page: the page to add
416   */
417  void lru_cache_add_anon(struct page *page)
418  {
419  	if (PageActive(page))
420  		ClearPageActive(page);
421  	__lru_cache_add(page);
422  }
423  
424  void lru_cache_add_file(struct page *page)
425  {
426  	if (PageActive(page))
427  		ClearPageActive(page);
428  	__lru_cache_add(page);
429  }
430  EXPORT_SYMBOL(lru_cache_add_file);
431  
432  /**
433   * lru_cache_add - add a page to the LRU lists
434   * @page: the page to be added to the LRU.
435   *
436   * Queue the page for addition to the LRU via pagevec. The decision on whether
437   * to add the page to the [in]active [file|anon] list is deferred until the
438   * pagevec is drained. This gives the caller of lru_cache_add() a chance
439   * to have the page added to the active list using mark_page_accessed().
440   */
441  void lru_cache_add(struct page *page)
442  {
443  	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
444  	VM_BUG_ON_PAGE(PageLRU(page), page);
445  	__lru_cache_add(page);
446  }
447  
448  /**
449   * lru_cache_add_active_or_unevictable
450   * @page:  the page to be added to LRU
451   * @vma:   vma in which page is mapped for determining reclaimability
452   *
453   * Place @page on the active or unevictable LRU list, depending on its
454   * evictability.  Note that if the page is not evictable, it goes
455   * directly back onto its zone's unevictable list; it does NOT use a
456   * per-CPU pagevec.
457   */
458  void lru_cache_add_active_or_unevictable(struct page *page,
459  					 struct vm_area_struct *vma)
460  {
461  	VM_BUG_ON_PAGE(PageLRU(page), page);
462  
463  	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
464  		SetPageActive(page);
465  	else if (!TestSetPageMlocked(page)) {
466  		/*
467  		 * We use the irq-unsafe __mod_zone_page_state() because this
468  		 * counter is not modified from interrupt context, and the pte
469  		 * lock is held (a spinlock), which implies preemption is disabled.
470  		 */
471  		__mod_zone_page_state(page_zone(page), NR_MLOCK,
472  				    hpage_nr_pages(page));
473  		count_vm_event(UNEVICTABLE_PGMLOCKED);
474  	}
475  	lru_cache_add(page);
476  }
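
/*
 * Hypothetical usage sketch, not part of the original file: the LRU step at
 * the end of an anonymous-fault-style path.  Page table setup, rmap and
 * memcg charging are deliberately omitted; the helper name is illustrative.
 */
static void example_add_new_anon_page(struct page *page,
				      struct vm_area_struct *vma)
{
	/* A new anonymous page is swap backed and not yet on any LRU. */
	__SetPageSwapBacked(page);

	/* mlocked VMAs send the page straight to the unevictable list. */
	lru_cache_add_active_or_unevictable(page, vma);
}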
477  
478  /*
479   * If the page cannot be invalidated, it is moved to the
480   * inactive list to speed up its reclaim.  It is moved to the
481   * head of the list, rather than the tail, to give the flusher
482   * threads some time to write it out, as this is much more
483   * effective than the single-page writeout from reclaim.
484   *
485   * If the page isn't mapped and is dirty or under writeback, the page
486   * can be reclaimed ASAP by setting PG_reclaim.
487   *
488   * 1. active, mapped page -> none
489   * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
490   * 3. inactive, mapped page -> none
491   * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
492   * 5. inactive, clean -> inactive, tail
493   * 6. Others -> none
494   *
495   * In case 4, the page is moved to the head of the inactive list because
496   * the VM expects flusher threads to write it out, which is much more
497   * effective than the single-page writeout from reclaim.
498   */
499  static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
500  			      void *arg)
501  {
502  	int lru, file;
503  	bool active;
504  
505  	if (!PageLRU(page))
506  		return;
507  
508  	if (PageUnevictable(page))
509  		return;
510  
511  	/* Some processes are using the page */
512  	if (page_mapped(page))
513  		return;
514  
515  	active = PageActive(page);
516  	file = page_is_file_cache(page);
517  	lru = page_lru_base_type(page);
518  
519  	del_page_from_lru_list(page, lruvec, lru + active);
520  	ClearPageActive(page);
521  	ClearPageReferenced(page);
522  	add_page_to_lru_list(page, lruvec, lru);
523  
524  	if (PageWriteback(page) || PageDirty(page)) {
525  		/*
526  		 * PG_reclaim can race with end_page_writeback(), which
527  		 * can make readahead confusing.  But the race window
528  		 * is _really_ small and it is a non-critical problem.
529  		 */
530  		SetPageReclaim(page);
531  	} else {
532  		/*
533  		 * The page's writeback ended while it was in the pagevec,
534  		 * so move the page to the tail of the inactive list.
535  		 */
536  		list_move_tail(&page->lru, &lruvec->lists[lru]);
537  		__count_vm_event(PGROTATED);
538  	}
539  
540  	if (active)
541  		__count_vm_event(PGDEACTIVATE);
542  	update_page_reclaim_stat(lruvec, file, 0);
543  }
544  
545  
546  static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
547  			    void *arg)
548  {
549  	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
550  	    !PageSwapCache(page) && !PageUnevictable(page)) {
551  		bool active = PageActive(page);
552  
553  		del_page_from_lru_list(page, lruvec,
554  				       LRU_INACTIVE_ANON + active);
555  		ClearPageActive(page);
556  		ClearPageReferenced(page);
557  		/*
558  		 * Lazyfree pages are clean anonymous pages. They have the
559  		 * SwapBacked flag cleared to distinguish them from normal
560  		 * anonymous pages.
561  		 */
562  		ClearPageSwapBacked(page);
563  		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
564  
565  		__count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
566  		count_memcg_page_event(page, PGLAZYFREE);
567  		update_page_reclaim_stat(lruvec, 1, 0);
568  	}
569  }
570  
571  /*
572   * Drain pages out of the cpu's pagevecs.
573   * Either "cpu" is the current CPU, and preemption has already been
574   * disabled; or "cpu" is being hot-unplugged, and is already dead.
575   */
576  void lru_add_drain_cpu(int cpu)
577  {
578  	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
579  
580  	if (pagevec_count(pvec))
581  		__pagevec_lru_add(pvec);
582  
583  	pvec = &per_cpu(lru_rotate_pvecs, cpu);
584  	if (pagevec_count(pvec)) {
585  		unsigned long flags;
586  
587  		/* No harm done if a racing interrupt already did this */
588  		local_irq_save(flags);
589  		pagevec_move_tail(pvec);
590  		local_irq_restore(flags);
591  	}
592  
593  	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
594  	if (pagevec_count(pvec))
595  		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
596  
597  	pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
598  	if (pagevec_count(pvec))
599  		pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
600  
601  	activate_page_drain(cpu);
602  }
603  
604  /**
605   * deactivate_file_page - forcefully deactivate a file page
606   * @page: page to deactivate
607   *
608   * This function hints the VM that @page is a good reclaim candidate,
609   * for example if its invalidation fails due to the page being dirty
610   * or under writeback.
611   */
612  void deactivate_file_page(struct page *page)
613  {
614  	/*
615  	 * In a workload with many unevictable pages (such as an mprotect-heavy
616  	 * one), deactivating unevictable pages to accelerate reclaim is pointless.
617  	 */
618  	if (PageUnevictable(page))
619  		return;
620  
621  	if (likely(get_page_unless_zero(page))) {
622  		struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
623  
624  		if (!pagevec_add(pvec, page) || PageCompound(page))
625  			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
626  		put_cpu_var(lru_deactivate_file_pvecs);
627  	}
628  }
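
/*
 * Hypothetical usage sketch, not part of the original file: an
 * invalidate_mapping_pages()-style loop body.  When a page cannot be dropped
 * (it is dirty, under writeback or mapped), it is hinted as a good reclaim
 * candidate instead.  Page locking, as done by the real caller, is omitted,
 * and the helper name is illustrative only.
 */
static void example_invalidate_or_deactivate(struct page *page)
{
	/* invalidate_inode_page() returns 0 when the page could not be freed. */
	if (!invalidate_inode_page(page))
		deactivate_file_page(page);
}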
629  
630  /**
631   * mark_page_lazyfree - make an anon page lazyfree
632   * @page: anonymous page to mark lazyfree
633   *
634   * mark_page_lazyfree() moves @page to the inactive file list.
635   * This is done to accelerate the reclaim of @page.
636   */
637  void mark_page_lazyfree(struct page *page)
638  {
639  	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
640  	    !PageSwapCache(page) && !PageUnevictable(page)) {
641  		struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
642  
643  		get_page(page);
644  		if (!pagevec_add(pvec, page) || PageCompound(page))
645  			pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
646  		put_cpu_var(lru_lazyfree_pvecs);
647  	}
648  }
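
/*
 * Hypothetical usage sketch, not part of the original file: the MADV_FREE
 * pattern.  A clean anonymous page the application no longer needs is marked
 * lazyfree so reclaim can discard it without swap I/O; the page table and
 * locking details of the real madvise path are omitted.
 */
static void example_madv_free_one_page(struct page *page)
{
	/* Only clean, swap-backed, non-swap-cache anon pages are moved. */
	if (!PageDirty(page))
		mark_page_lazyfree(page);
}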
649  
650  void lru_add_drain(void)
651  {
652  	lru_add_drain_cpu(get_cpu());
653  	put_cpu();
654  }
655  
656  static void lru_add_drain_per_cpu(struct work_struct *dummy)
657  {
658  	lru_add_drain();
659  }
660  
661  static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
662  
663  /*
664   * Doesn't need any cpu hotplug locking because we do rely on per-cpu
665   * kworkers being shut down before our page_alloc_cpu_dead callback is
666   * executed on the offlined cpu.
667   * Calling this function with cpu hotplug locks held can actually lead
668   * to obscure indirect dependencies via WQ context.
669   */
670  void lru_add_drain_all(void)
671  {
672  	static DEFINE_MUTEX(lock);
673  	static struct cpumask has_work;
674  	int cpu;
675  
676  	/*
677  	 * Make sure nobody triggers this path before mm_percpu_wq is fully
678  	 * initialized.
679  	 */
680  	if (WARN_ON(!mm_percpu_wq))
681  		return;
682  
683  	mutex_lock(&lock);
684  	cpumask_clear(&has_work);
685  
686  	for_each_online_cpu(cpu) {
687  		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
688  
689  		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
690  		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
691  		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
692  		    pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
693  		    need_activate_page_drain(cpu)) {
694  			INIT_WORK(work, lru_add_drain_per_cpu);
695  			queue_work_on(cpu, mm_percpu_wq, work);
696  			cpumask_set_cpu(cpu, &has_work);
697  		}
698  	}
699  
700  	for_each_cpu(cpu, &has_work)
701  		flush_work(&per_cpu(lru_add_drain_work, cpu));
702  
703  	mutex_unlock(&lock);
704  }
705  
706  /**
707   * release_pages - batched put_page()
708   * @pages: array of pages to release
709   * @nr: number of pages
710   *
711   * Decrement the reference count on all the pages in @pages.  If it
712   * falls to zero, remove the page from the LRU and free it.
713   */
714  void release_pages(struct page **pages, int nr)
715  {
716  	int i;
717  	LIST_HEAD(pages_to_free);
718  	struct pglist_data *locked_pgdat = NULL;
719  	struct lruvec *lruvec;
720  	unsigned long uninitialized_var(flags);
721  	unsigned int uninitialized_var(lock_batch);
722  
723  	for (i = 0; i < nr; i++) {
724  		struct page *page = pages[i];
725  
726  		/*
727  		 * Make sure the IRQ-safe lock-holding time does not get
728  		 * excessive with a continuous string of pages from the
729  		 * same pgdat. The lock is held only if pgdat != NULL.
730  		 */
731  		if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
732  			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
733  			locked_pgdat = NULL;
734  		}
735  
736  		if (is_huge_zero_page(page))
737  			continue;
738  
739  		/* A device public page cannot be a huge page */
740  		if (is_device_public_page(page)) {
741  			if (locked_pgdat) {
742  				spin_unlock_irqrestore(&locked_pgdat->lru_lock,
743  						       flags);
744  				locked_pgdat = NULL;
745  			}
746  			put_devmap_managed_page(page);
747  			continue;
748  		}
749  
750  		page = compound_head(page);
751  		if (!put_page_testzero(page))
752  			continue;
753  
754  		if (PageCompound(page)) {
755  			if (locked_pgdat) {
756  				spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
757  				locked_pgdat = NULL;
758  			}
759  			__put_compound_page(page);
760  			continue;
761  		}
762  
763  		if (PageLRU(page)) {
764  			struct pglist_data *pgdat = page_pgdat(page);
765  
766  			if (pgdat != locked_pgdat) {
767  				if (locked_pgdat)
768  					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
769  									flags);
770  				lock_batch = 0;
771  				locked_pgdat = pgdat;
772  				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
773  			}
774  
775  			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
776  			VM_BUG_ON_PAGE(!PageLRU(page), page);
777  			__ClearPageLRU(page);
778  			del_page_from_lru_list(page, lruvec, page_off_lru(page));
779  		}
780  
781  		/* Clear Active bit in case of parallel mark_page_accessed */
782  		__ClearPageActive(page);
783  		__ClearPageWaiters(page);
784  
785  		list_add(&page->lru, &pages_to_free);
786  	}
787  	if (locked_pgdat)
788  		spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
789  
790  	mem_cgroup_uncharge_list(&pages_to_free);
791  	free_unref_page_list(&pages_to_free);
792  }
793  EXPORT_SYMBOL(release_pages);
794  
795  /*
796   * The pages which we're about to release may be in the deferred lru-addition
797   * queues.  That would prevent them from really being freed right now.  That's
798   * OK from a correctness point of view but is inefficient - those pages may be
799   * cache-warm and we want to give them back to the page allocator ASAP.
800   *
801   * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
802   * calls release_pages() directly to avoid
803   * mutual recursion.
804   */
805  void __pagevec_release(struct pagevec *pvec)
806  {
807  	if (!pvec->percpu_pvec_drained) {
808  		lru_add_drain();
809  		pvec->percpu_pvec_drained = true;
810  	}
811  	release_pages(pvec->pages, pagevec_count(pvec));
812  	pagevec_reinit(pvec);
813  }
814  EXPORT_SYMBOL(__pagevec_release);
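
/*
 * Hypothetical usage sketch, not part of the original file: batching the
 * release of pages the caller holds references on.  pagevec_add() returns the
 * space remaining, so a full pagevec is flushed with __pagevec_release() and
 * any remainder with pagevec_release().  The helper name is illustrative.
 */
static void example_batched_put(struct page **pages, int nr)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec);
	for (i = 0; i < nr; i++) {
		if (!pagevec_add(&pvec, pages[i]))
			__pagevec_release(&pvec);
	}
	pagevec_release(&pvec);
}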
815  
816  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
817  /* used by __split_huge_page_refcount() */
818  void lru_add_page_tail(struct page *page, struct page *page_tail,
819  		       struct lruvec *lruvec, struct list_head *list)
820  {
821  	const int file = 0;
822  
823  	VM_BUG_ON_PAGE(!PageHead(page), page);
824  	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
825  	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
826  	VM_BUG_ON(NR_CPUS != 1 &&
827  		  !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
828  
829  	if (!list)
830  		SetPageLRU(page_tail);
831  
832  	if (likely(PageLRU(page)))
833  		list_add_tail(&page_tail->lru, &page->lru);
834  	else if (list) {
835  		/* page reclaim is reclaiming a huge page */
836  		get_page(page_tail);
837  		list_add_tail(&page_tail->lru, list);
838  	} else {
839  		struct list_head *list_head;
840  		/*
841  		 * The head page has not yet been counted as an hpage,
842  		 * so we must account for each subpage individually.
843  		 *
844  		 * Use the standard add function to put page_tail on the list,
845  		 * but then correct its position so they all end up in order.
846  		 */
847  		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
848  		list_head = page_tail->lru.prev;
849  		list_move_tail(&page_tail->lru, list_head);
850  	}
851  
852  	if (!PageUnevictable(page))
853  		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
854  }
855  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
856  
857  static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
858  				 void *arg)
859  {
860  	enum lru_list lru;
861  	int was_unevictable = TestClearPageUnevictable(page);
862  
863  	VM_BUG_ON_PAGE(PageLRU(page), page);
864  
865  	SetPageLRU(page);
866  	/*
867  	 * Page becomes evictable in two ways:
868  	 * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()].
869  	 * 2) Before acquiring LRU lock to put the page to correct LRU and then
870  	 *   a) do PageLRU check with lock [check_move_unevictable_pages]
871  	 *   b) do PageLRU check before lock [clear_page_mlock]
872  	 *
873  	 * (1) & (2a) are OK, as the LRU lock will serialize them. For (2b), we
874  	 * need the following strict ordering:
875  	 *
876  	 * #0: __pagevec_lru_add_fn		#1: clear_page_mlock
877  	 *
878  	 * SetPageLRU()				TestClearPageMlocked()
879  	 * smp_mb() // explicit ordering	// above provides strict
880  	 *					// ordering
881  	 * PageMlocked()			PageLRU()
882  	 *
883  	 *
884  	 * If '#1' does not observe the setting of PG_lru by '#0' and fails the
885  	 * isolation, the explicit barrier makes sure that the page_evictable()
886  	 * check puts the page on the correct LRU. Without smp_mb(), SetPageLRU()
887  	 * can be reordered after the PageMlocked() check, which can make '#1'
888  	 * fail to isolate the page whose Mlocked bit is being cleared ('#0' is
889  	 * also looking at the same page), leaving the evictable page stranded
890  	 * on an unevictable LRU.
891  	 */
892  	smp_mb();
893  
894  	if (page_evictable(page)) {
895  		lru = page_lru(page);
896  		update_page_reclaim_stat(lruvec, page_is_file_cache(page),
897  					 PageActive(page));
898  		if (was_unevictable)
899  			count_vm_event(UNEVICTABLE_PGRESCUED);
900  	} else {
901  		lru = LRU_UNEVICTABLE;
902  		ClearPageActive(page);
903  		SetPageUnevictable(page);
904  		if (!was_unevictable)
905  			count_vm_event(UNEVICTABLE_PGCULLED);
906  	}
907  
908  	add_page_to_lru_list(page, lruvec, lru);
909  	trace_mm_lru_insertion(page, lru);
910  }
911  
912  /*
913   * Add the passed pages to the LRU, then drop the caller's refcount
914   * on them.  Reinitialises the caller's pagevec.
915   */
916  void __pagevec_lru_add(struct pagevec *pvec)
917  {
918  	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
919  }
920  EXPORT_SYMBOL(__pagevec_lru_add);
921  
922  /**
923   * pagevec_lookup_entries - gang pagecache lookup
924   * @pvec:	Where the resulting entries are placed
925   * @mapping:	The address_space to search
926   * @start:	The starting entry index
927   * @nr_entries:	The maximum number of entries
928   * @indices:	The cache indices corresponding to the entries in @pvec
929   *
930   * pagevec_lookup_entries() will search for and return a group of up
931   * to @nr_pages pages and shadow entries in the mapping.  All
932   * to @nr_entries pages and shadow entries in the mapping.  All
933   * reference against actual pages in @pvec.
934   *
935   * The search returns a group of mapping-contiguous entries with
936   * ascending indexes.  There may be holes in the indices due to
937   * not-present entries.
938   *
939   * pagevec_lookup_entries() returns the number of entries which were
940   * found.
941   */
942  unsigned pagevec_lookup_entries(struct pagevec *pvec,
943  				struct address_space *mapping,
944  				pgoff_t start, unsigned nr_entries,
945  				pgoff_t *indices)
946  {
947  	pvec->nr = find_get_entries(mapping, start, nr_entries,
948  				    pvec->pages, indices);
949  	return pagevec_count(pvec);
950  }
951  
952  /**
953   * pagevec_remove_exceptionals - pagevec exceptionals pruning
954   * @pvec:	The pagevec to prune
955   *
956   * pagevec_lookup_entries() fills both pages and exceptional radix
957   * tree entries into the pagevec.  This function prunes all
958   * exceptionals from @pvec without leaving holes, so that it can be
959   * passed on to page-only pagevec operations.
960   */
961  void pagevec_remove_exceptionals(struct pagevec *pvec)
962  {
963  	int i, j;
964  
965  	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
966  		struct page *page = pvec->pages[i];
967  		if (!xa_is_value(page))
968  			pvec->pages[j++] = page;
969  	}
970  	pvec->nr = j;
971  }
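
/*
 * Hypothetical usage sketch, not part of the original file: the
 * truncate/invalidate pattern that combines the two helpers above.  The
 * helper name and the per-page work placeholder are illustrative only.
 */
static void example_scan_mapping_entries(struct address_space *mapping,
					 pgoff_t start)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	unsigned nr;

	pagevec_init(&pvec);
	while ((nr = pagevec_lookup_entries(&pvec, mapping, start,
					    PAGEVEC_SIZE, indices))) {
		/* Advance past everything this batch covered. */
		start = indices[nr - 1] + 1;

		/* Drop shadow (value) entries; only real pages remain. */
		pagevec_remove_exceptionals(&pvec);

		/* ... per-page work on pvec.pages[] goes here ... */

		pagevec_release(&pvec);
		cond_resched();
	}
}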
972  
973  /**
974   * pagevec_lookup_range - gang pagecache lookup
975   * @pvec:	Where the resulting pages are placed
976   * @mapping:	The address_space to search
977   * @start:	The starting page index
978   * @end:	The final page index
979   *
980   * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
981   * pages in the mapping starting from index @start and up to index @end
982   * (inclusive).  The pages are placed in @pvec.  pagevec_lookup_range() takes a
983   * reference against the pages in @pvec.
984   *
985   * The search returns a group of mapping-contiguous pages with ascending
986   * indexes.  There may be holes in the indices due to not-present pages. We
987   * also update @start to index the next page for the traversal.
988   *
989   * pagevec_lookup_range() returns the number of pages which were found. If this
990   * number is smaller than PAGEVEC_SIZE, the end of specified range has been
991   * reached.
992   */
993  unsigned pagevec_lookup_range(struct pagevec *pvec,
994  		struct address_space *mapping, pgoff_t *start, pgoff_t end)
995  {
996  	pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
997  					pvec->pages);
998  	return pagevec_count(pvec);
999  }
1000  EXPORT_SYMBOL(pagevec_lookup_range);
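
/*
 * Hypothetical usage sketch, not part of the original file: counting the
 * dirty pages currently present in a range of a mapping, one pagevec at a
 * time.  @start is advanced by the lookup itself; the helper name is
 * illustrative only.
 */
static unsigned long example_count_dirty_in_range(struct address_space *mapping,
						  pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	unsigned long nr_dirty = 0;
	int i;

	pagevec_init(&pvec);
	while (pagevec_lookup_range(&pvec, mapping, &start, end)) {
		for (i = 0; i < pagevec_count(&pvec); i++)
			if (PageDirty(pvec.pages[i]))
				nr_dirty++;

		/* Drop the references the lookup took on the pages. */
		pagevec_release(&pvec);
		cond_resched();
	}
	return nr_dirty;
}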
1001  
1002  unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
1003  		struct address_space *mapping, pgoff_t *index, pgoff_t end,
1004  		xa_mark_t tag)
1005  {
1006  	pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1007  					PAGEVEC_SIZE, pvec->pages);
1008  	return pagevec_count(pvec);
1009  }
1010  EXPORT_SYMBOL(pagevec_lookup_range_tag);
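
/*
 * Hypothetical usage sketch, not part of the original file: a
 * write_cache_pages()-style walk over pages carrying a given xarray mark,
 * here PAGECACHE_TAG_DIRTY.  The per-page writeback work is elided and the
 * helper name is illustrative only.
 */
static void example_walk_dirty_tagged(struct address_space *mapping,
				      pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;

	pagevec_init(&pvec);
	while (pagevec_lookup_range_tag(&pvec, mapping, &index, end,
					PAGECACHE_TAG_DIRTY)) {
		/* ... lock each page, clear its dirty tag, start writeback ... */
		pagevec_release(&pvec);
		cond_resched();
	}
}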
1011  
1012  unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
1013  		struct address_space *mapping, pgoff_t *index, pgoff_t end,
1014  		xa_mark_t tag, unsigned max_pages)
1015  {
1016  	pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1017  		min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
1018  	return pagevec_count(pvec);
1019  }
1020  EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
1021  /*
1022   * Perform any setup for the swap system
1023   */
1024  void __init swap_setup(void)
1025  {
1026  	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
1027  
1028  	/* Use a smaller cluster for small-memory machines */
1029  	if (megs < 16)
1030  		page_cluster = 2;
1031  	else
1032  		page_cluster = 3;
1033  	/*
1034  	 * Right now other parts of the system mean that we
1035  	 * _really_ don't want to cluster much more.
1036  	 */
1037  }
1038
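
/*
 * Hypothetical illustration, not part of the original file: page_cluster is
 * consumed as a power of two by swap readahead, so the values chosen above
 * correspond to 4-page (2^2) and 8-page (2^3) clusters.  The helper name is
 * illustrative only.
 */
static inline unsigned long example_swap_cluster_pages(void)
{
	return 1UL << page_cluster;
}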