/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
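/*
 * Swap readahead works on clusters of up to 1 << page_cluster pages, so a
 * value of 3 means up to 8 pages are read around the target swap slot.
 * The value is tunable at runtime via the vm.page-cluster sysctl
 * (see Documentation/sysctl/vm.txt).
 */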
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(zone_lru_lock(zone), flags);
		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
		VM_BUG_ON_PAGE(!PageLRU(page), page);
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(zone_lru_lock(zone), flags);
	}
	__ClearPageWaiters(page);
	mem_cgroup_uncharge(page);
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, false);
}

static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	/*
	 * __page_cache_release() is supposed to be called for thp, not for
	 * hugetlb. This is because a hugetlb page never has PageLRU set
	 * (it is never added to any LRU list) and no memcg routines should
	 * be called for hugetlb (it has a separate hugetlb_cgroup.)
	 */
	if (!PageHuge(page))
		__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}

void __put_page(struct page *page)
{
	if (is_zone_device_page(page)) {
		put_dev_pagemap(page->pgmap);

		/*
		 * The page belongs to the device that created pgmap. Do
		 * not return it to the page allocator.
		 */
		return;
	}

	if (unlikely(PageCompound(page)))
		__put_compound_page(page);
	else
		__put_single_page(page);
}
EXPORT_SYMBOL(__put_page);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page->lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		put_page(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
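
/*
 * Usage sketch (hypothetical caller, not taken from this tree): collect
 * pages on a private list via page->lru and drop all the references in
 * one call.
 *
 *	LIST_HEAD(my_pages);
 *
 *	list_add(&page->lru, &my_pages);
 *	...
 *	put_pages_list(&my_pages);
 */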

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns the number of pages pinned, which may be fewer than the number
 * requested (pinning stops at the first kvec that does not cover exactly
 * one page).  If nr_segs is 0 or negative, returns 0.  Each page returned
 * must be released with a put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		get_page(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives a pointer to the page pinned.
 *		Must have room for at least one page.
 *
 * Returns 1 if the page was pinned, 0 otherwise.  The page returned must
 * be released with a put_page() call when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);
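
/*
 * Usage sketch, assuming "buf" is a page-aligned, page-sized kernel
 * buffer (local names are illustrative only):
 *
 *	struct page *page;
 *
 *	if (get_kernel_page((unsigned long)buf, 0, &page) == 1) {
 *		...
 *		put_page(page);
 *	}
 */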

static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct pglist_data *pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct pglist_data *pagepgdat = page_pgdat(page);

		if (pagepgdat != pgdat) {
			if (pgdat)
				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
			pgdat = pagepgdat;
			spin_lock_irqsave(&pgdat->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, pgdat);
		(*move_fn)(page, lruvec, arg);
	}
	if (pgdat)
		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageUnevictable(page)) {
		del_page_from_lru_list(page, lruvec, page_lru(page));
		ClearPageActive(page);
		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
		(*pgmoved)++;
	}
}

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		get_page(page);
		local_irq_save(flags);
		pvec = this_cpu_ptr(&lru_rotate_pvecs);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

static void update_page_reclaim_stat(struct lruvec *lruvec,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page);

		__count_vm_event(PGACTIVATE);
		update_page_reclaim_stat(lruvec, file, 1);
	}
}

#ifdef CONFIG_SMP
static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
	page = compound_head(page);
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		get_page(page);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

static bool need_activate_page_drain(int cpu)
{
	return false;
}

void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	page = compound_head(page);
	spin_lock_irq(zone_lru_lock(zone));
	__activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
	spin_unlock_irq(zone_lru_lock(zone));
}
#endif

static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
	int i;

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec. Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained. Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list, causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	put_cpu_var(lru_add_pvec);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * When a newly allocated page is not yet visible to others (and therefore
 * safe for non-atomic ops), __SetPageReferenced(page) may be substituted
 * for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
	page = compound_head(page);
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {

		/*
		 * If the page is on the LRU, queue it for activation via
		 * activate_page_pvecs. Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
		if (page_is_file_cache(page))
			workingset_activation(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
	if (page_is_idle(page))
		clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);
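
/*
 * Illustrative sketch of the two-touch promotion above (hypothetical
 * read-path caller): the first access only sets PG_referenced, the
 * second moves the page towards the active list.
 *
 *	page = find_get_page(mapping, index);
 *	if (page) {
 *		mark_page_accessed(page);
 *		...
 *		put_page(page);
 *	}
 */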

static void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	get_page(page);
	if (!pagevec_add(pvec, page) || PageCompound(page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvec);
}

/**
 * lru_cache_add_anon - add a page to the page lists
 * @page: the page to add
 */
void lru_cache_add_anon(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}

void lru_cache_add_file(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}
EXPORT_SYMBOL(lru_cache_add_file);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
 * to have the page added to the active list using mark_page_accessed().
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	__lru_cache_add(page);
}
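
/*
 * Typical pattern (sketch, not a verbatim caller from this tree): a
 * freshly allocated page is set up and inserted into its mapping first,
 * and only then queued for the LRU, so that mark_page_accessed() can
 * still promote it before the pagevec is drained.
 *
 *	page = alloc_page(GFP_KERNEL);
 *	...
 *	lru_cache_add(page);
 */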

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through e.g. munlock,
 * munmap or exit, while it's not on the LRU, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct pglist_data *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

	spin_lock_irq(&pgdat->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, pgdat);
	ClearPageActive(page);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
	spin_unlock_irq(&pgdat->lru_lock);
}

/**
 * lru_cache_add_active_or_unevictable
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the active or unevictable LRU list, depending on its
 * evictability.  Note that if the page is not evictable, it goes
 * directly back onto its zone's unevictable list; it does NOT use a
 * per-cpu pagevec.
 */
void lru_cache_add_active_or_unevictable(struct page *page,
					 struct vm_area_struct *vma)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);

	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
		SetPageActive(page);
		lru_cache_add(page);
		return;
	}

	if (!TestSetPageMlocked(page)) {
		/*
		 * We use the irq-unsafe __mod_zone_page_state because this
		 * counter is not modified from interrupt context, and the pte
		 * lock is held (a spinlock), which implies preemption is
		 * disabled.
		 */
		__mod_zone_page_state(page_zone(page), NR_MLOCK,
				    hpage_nr_pages(page));
		count_vm_event(UNEVICTABLE_PGMLOCKED);
	}
	add_page_to_unevictable_list(page);
}
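
/*
 * Sketch of the usual caller, the anonymous fault path (simplified from
 * do_anonymous_page(); the locals are illustrative):
 *
 *	page_add_new_anon_rmap(page, vma, address, false);
 *	mem_cgroup_commit_charge(page, memcg, false, false);
 *	lru_cache_add_active_or_unevictable(page, vma);
 */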

/*
 * If the page cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page is not mapped and is dirty/under writeback, it can be
 * reclaimed ASAP by setting PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page is moved to the head of the inactive list because
 * the VM expects the flusher threads to write it out, which is much more
 * effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
			      void *arg)
{
	int lru, file;
	bool active;

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	file = page_is_file_cache(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);
	add_page_to_lru_list(page, lruvec, lru);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * Setting PG_reclaim can race with end_page_writeback(),
		 * which can confuse readahead.  But the race window is
		 * really small and it is a non-critical problem.
		 */
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback ended while it was on the pagevec,
		 * so move the page to the tail of the inactive list.
		 */
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		__count_vm_event(PGROTATED);
	}

	if (active)
		__count_vm_event(PGDEACTIVATE);
	update_page_reclaim_stat(lruvec, file, 0);
}

static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
	    !PageUnevictable(page)) {
		bool active = PageActive(page);

		del_page_from_lru_list(page, lruvec,
				       LRU_INACTIVE_ANON + active);
		ClearPageActive(page);
		ClearPageReferenced(page);
		/*
		 * Lazyfree pages are clean anonymous pages.  They have the
		 * SwapBacked flag cleared, to distinguish them from normal
		 * anonymous pages.
		 */
		ClearPageSwapBacked(page);
		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);

		__count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
		count_memcg_page_event(page, PGLAZYFREE);
		update_page_reclaim_stat(lruvec, 1, 0);
	}
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}

	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

	pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);

	activate_page_drain(cpu);
}

/**
 * deactivate_file_page - forcefully deactivate a file page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_file_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages (e.g. one making heavy
	 * use of mlock/mprotect), deactivating unevictable pages to
	 * accelerate reclaim is pointless.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);

		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
		put_cpu_var(lru_deactivate_file_pvecs);
	}
}
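
/*
 * Sketch of the intended use (roughly what invalidate_mapping_pages()
 * does): when an invalidation attempt fails, hint that the page is a
 * good reclaim candidate instead.
 *
 *	if (!invalidate_inode_page(page))
 *		deactivate_file_page(page);
 */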

/**
 * mark_page_lazyfree - make an anon page lazyfree
 * @page: page to mark lazyfree
 *
 * mark_page_lazyfree() moves @page to the inactive file list.
 * This is done to accelerate the reclaim of @page.
 */
void mark_page_lazyfree(struct page *page)
{
	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
	    !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);

		get_page(page);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
		put_cpu_var(lru_lazyfree_pvecs);
	}
}
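
/*
 * The main user of this is the MADV_FREE path in madvise(), which marks
 * clean anonymous pages lazyfree so that reclaim can discard them
 * instead of swapping them out, e.g. from userspace:
 *
 *	madvise(addr, length, MADV_FREE);
 */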

void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

void lru_add_drain_all_cpuslocked(void)
{
	static DEFINE_MUTEX(lock);
	static struct cpumask has_work;
	int cpu;

	/*
	 * Make sure nobody triggers this path before mm_percpu_wq is fully
	 * initialized.
	 */
	if (WARN_ON(!mm_percpu_wq))
		return;

	mutex_lock(&lock);
	cpumask_clear(&has_work);

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
		    need_activate_page_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			queue_work_on(cpu, mm_percpu_wq, work);
			cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

	mutex_unlock(&lock);
}

void lru_add_drain_all(void)
{
	get_online_cpus();
	lru_add_drain_all_cpuslocked();
	put_online_cpus();
}
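
/*
 * Sketch: callers that need pages to actually be visible on the LRU
 * lists, for example before isolating them for migration or compaction
 * (cf. migrate_prep()), typically drain first:
 *
 *	lru_add_drain_all();
 *	...
 *	isolate_lru_page(page);
 */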

/**
 * release_pages - batched put_page()
 * @pages: array of pages to release
 * @nr: number of pages
 * @cold: whether the pages are cache cold
 *
 * Decrement the reference count on all the pages in @pages.  If a page's
 * count falls to zero, remove that page from the LRU and free it.
 */
void release_pages(struct page **pages, int nr, bool cold)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct pglist_data *locked_pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);
	unsigned int uninitialized_var(lock_batch);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		/*
		 * Make sure the IRQ-safe lock-holding time does not get
		 * excessive with a continuous string of pages from the
		 * same pgdat. The lock is held only if pgdat != NULL.
		 */
		if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
			locked_pgdat = NULL;
		}

		if (is_huge_zero_page(page))
			continue;

		page = compound_head(page);
		if (!put_page_testzero(page))
			continue;

		if (PageCompound(page)) {
			if (locked_pgdat) {
				spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
				locked_pgdat = NULL;
			}
			__put_compound_page(page);
			continue;
		}

		if (PageLRU(page)) {
			struct pglist_data *pgdat = page_pgdat(page);

			if (pgdat != locked_pgdat) {
				if (locked_pgdat)
					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
									flags);
				lock_batch = 0;
				locked_pgdat = pgdat;
				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
			VM_BUG_ON_PAGE(!PageLRU(page), page);
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		__ClearPageActive(page);
		__ClearPageWaiters(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (locked_pgdat)
		spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);

	mem_cgroup_uncharge_list(&pages_to_free);
	free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);
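
/*
 * Usage sketch (hypothetical "pages"/"nr", e.g. obtained from a
 * get_user_pages() call): dropping the whole batch in one call amortizes
 * the LRU lock over the batch instead of taking it per page.
 *
 *	release_pages(pages, nr, false);
 */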

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used when splitting a huge page; see __split_huge_page() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	const int file = 0;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
	VM_BUG_ON(NR_CPUS != 1 &&
		  !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		struct list_head *list_head;
		/*
		 * The head page has not yet been counted as a huge page,
		 * so we must account for each subpage individually.
		 *
		 * Use the standard add function to put page_tail on the list,
		 * but then correct its position so they all end up in order.
		 */
		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
		list_head = page_tail->lru.prev;
		list_move_tail(&page_tail->lru, list_head);
	}

	if (!PageUnevictable(page))
		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int file = page_is_file_cache(page);
	int active = PageActive(page);
	enum lru_list lru = page_lru(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, lru);
	update_page_reclaim_stat(lruvec, file, active);
	trace_mm_lru_insertion(page, lru);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec:	Where the resulting entries are placed
 * @mapping:	The address_space to search
 * @start:	The starting entry index
 * @nr_pages:	The maximum number of entries to return
 * @indices:	The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_pages pages and shadow entries in the mapping.  All
 * entries are placed in @pvec.  pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes.  There may be holes in the indices due to
 * not-present entries.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
				struct address_space *mapping,
				pgoff_t start, unsigned nr_pages,
				pgoff_t *indices)
{
	pvec->nr = find_get_entries(mapping, start, nr_pages,
				    pvec->pages, indices);
	return pagevec_count(pvec);
}

/**
 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 * @pvec:	The pagevec to prune
 *
 * pagevec_lookup_entries() fills both pages and exceptional radix
 * tree entries into the pagevec.  This function prunes all
 * exceptionals from @pvec without leaving holes, so that it can be
 * passed on to page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
	int i, j;

	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		if (!radix_tree_exceptional_entry(page))
			pvec->pages[j++] = page;
	}
	pvec->nr = j;
}
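
/*
 * Sketch of the usual pairing with pagevec_lookup_entries() in
 * truncate/invalidate style loops (locals are illustrative):
 *
 *	pgoff_t indices[PAGEVEC_SIZE];
 *
 *	while (pagevec_lookup_entries(&pvec, mapping, index,
 *				      PAGEVEC_SIZE, indices)) {
 *		... handle shadow entries using indices[] ...
 *		pagevec_remove_exceptionals(&pvec);
 *		... operate on the remaining real pages ...
 *		pagevec_release(&pvec);
 *	}
 */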

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);
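
/*
 * Classic lookup loop (sketch; note that with this interface the caller
 * advances the index itself, typically from page->index):
 *
 *	pgoff_t index = 0;
 *	struct pagevec pvec;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
 *		int i;
 *
 *		for (i = 0; i < pagevec_count(&pvec); i++) {
 *			struct page *page = pvec.pages[i];
 *
 *			index = page->index + 1;
 *			... examine page ...
 *		}
 *		pagevec_release(&pvec);
 *		cond_resched();
 *	}
 */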

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);
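
/*
 * Same idea for tagged lookups, e.g. walking dirty pages (sketch, locals
 * as in the loop above):
 *
 *	pgoff_t index = 0;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup_tag(&pvec, mapping, &index,
 *				  PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) {
 *		... write back or inspect pvec.pages[] ...
 *		pagevec_release(&pvec);
 *	}
 *
 * Unlike pagevec_lookup(), this variant advances @index for the caller.
 */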

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}