xref: /openbmc/linux/mm/page_alloc.c (revision cd238eff)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  linux/mm/page_alloc.c
4   *
5   *  Manages the free list, the system allocates free pages here.
6   *  Note that kmalloc() lives in slab.c
7   *
8   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
9   *  Swap reorganised 29.12.95, Stephen Tweedie
10   *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11   *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
12   *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
13   *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
14   *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
15   *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
16   */
17  
18  #include <linux/stddef.h>
19  #include <linux/mm.h>
20  #include <linux/highmem.h>
21  #include <linux/swap.h>
22  #include <linux/interrupt.h>
23  #include <linux/pagemap.h>
24  #include <linux/jiffies.h>
25  #include <linux/memblock.h>
26  #include <linux/compiler.h>
27  #include <linux/kernel.h>
28  #include <linux/kasan.h>
29  #include <linux/module.h>
30  #include <linux/suspend.h>
31  #include <linux/pagevec.h>
32  #include <linux/blkdev.h>
33  #include <linux/slab.h>
34  #include <linux/ratelimit.h>
35  #include <linux/oom.h>
36  #include <linux/topology.h>
37  #include <linux/sysctl.h>
38  #include <linux/cpu.h>
39  #include <linux/cpuset.h>
40  #include <linux/memory_hotplug.h>
41  #include <linux/nodemask.h>
42  #include <linux/vmalloc.h>
43  #include <linux/vmstat.h>
44  #include <linux/mempolicy.h>
45  #include <linux/memremap.h>
46  #include <linux/stop_machine.h>
47  #include <linux/random.h>
48  #include <linux/sort.h>
49  #include <linux/pfn.h>
50  #include <linux/backing-dev.h>
51  #include <linux/fault-inject.h>
52  #include <linux/page-isolation.h>
53  #include <linux/page_ext.h>
54  #include <linux/debugobjects.h>
55  #include <linux/kmemleak.h>
56  #include <linux/compaction.h>
57  #include <trace/events/kmem.h>
58  #include <trace/events/oom.h>
59  #include <linux/prefetch.h>
60  #include <linux/mm_inline.h>
61  #include <linux/migrate.h>
62  #include <linux/hugetlb.h>
63  #include <linux/sched/rt.h>
64  #include <linux/sched/mm.h>
65  #include <linux/page_owner.h>
66  #include <linux/kthread.h>
67  #include <linux/memcontrol.h>
68  #include <linux/ftrace.h>
69  #include <linux/lockdep.h>
70  #include <linux/nmi.h>
71  #include <linux/psi.h>
72  
73  #include <asm/sections.h>
74  #include <asm/tlbflush.h>
75  #include <asm/div64.h>
76  #include "internal.h"
77  #include "shuffle.h"
78  
79  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
80  static DEFINE_MUTEX(pcp_batch_high_lock);
81  #define MIN_PERCPU_PAGELIST_FRACTION	(8)
82  
83  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
84  DEFINE_PER_CPU(int, numa_node);
85  EXPORT_PER_CPU_SYMBOL(numa_node);
86  #endif
87  
88  DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
89  
90  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
91  /*
92   * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
93   * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
94   * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
95   * defined in <linux/topology.h>.
96   */
97  DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
98  EXPORT_PER_CPU_SYMBOL(_numa_mem_);
99  int _node_numa_mem_[MAX_NUMNODES];
100  #endif
101  
102  /* work_structs for global per-cpu drains */
103  struct pcpu_drain {
104  	struct zone *zone;
105  	struct work_struct work;
106  };
107  DEFINE_MUTEX(pcpu_drain_mutex);
108  DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
109  
110  #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
111  volatile unsigned long latent_entropy __latent_entropy;
112  EXPORT_SYMBOL(latent_entropy);
113  #endif
114  
115  /*
116   * Array of node states.
117   */
118  nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
119  	[N_POSSIBLE] = NODE_MASK_ALL,
120  	[N_ONLINE] = { { [0] = 1UL } },
121  #ifndef CONFIG_NUMA
122  	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
123  #ifdef CONFIG_HIGHMEM
124  	[N_HIGH_MEMORY] = { { [0] = 1UL } },
125  #endif
126  	[N_MEMORY] = { { [0] = 1UL } },
127  	[N_CPU] = { { [0] = 1UL } },
128  #endif	/* NUMA */
129  };
130  EXPORT_SYMBOL(node_states);
131  
132  atomic_long_t _totalram_pages __read_mostly;
133  EXPORT_SYMBOL(_totalram_pages);
134  unsigned long totalreserve_pages __read_mostly;
135  unsigned long totalcma_pages __read_mostly;
136  
137  int percpu_pagelist_fraction;
138  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
139  
140  /*
141   * A cached value of the page's pageblock's migratetype, used when the page is
142   * put on a pcplist. Used to avoid the pageblock migratetype lookup when
143   * freeing from pcplists in most cases, at the cost of possibly becoming stale.
144   * Also the migratetype set in the page does not necessarily match the pcplist
145   * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
146   * other index - this ensures that it will be put on the correct CMA freelist.
147   */
148  static inline int get_pcppage_migratetype(struct page *page)
149  {
150  	return page->index;
151  }
152  
153  static inline void set_pcppage_migratetype(struct page *page, int migratetype)
154  {
155  	page->index = migratetype;
156  }
157  
158  #ifdef CONFIG_PM_SLEEP
159  /*
160   * The following functions are used by the suspend/hibernate code to temporarily
161   * change gfp_allowed_mask in order to avoid using I/O during memory allocations
162   * while devices are suspended.  To avoid races with the suspend/hibernate code,
163   * they should always be called with system_transition_mutex held
164   * (gfp_allowed_mask also should only be modified with system_transition_mutex
165   * held, unless the suspend/hibernate code is guaranteed not to run in parallel
166   * with that modification).
167   */
168  
169  static gfp_t saved_gfp_mask;
170  
171  void pm_restore_gfp_mask(void)
172  {
173  	WARN_ON(!mutex_is_locked(&system_transition_mutex));
174  	if (saved_gfp_mask) {
175  		gfp_allowed_mask = saved_gfp_mask;
176  		saved_gfp_mask = 0;
177  	}
178  }
179  
180  void pm_restrict_gfp_mask(void)
181  {
182  	WARN_ON(!mutex_is_locked(&system_transition_mutex));
183  	WARN_ON(saved_gfp_mask);
184  	saved_gfp_mask = gfp_allowed_mask;
185  	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
186  }
187  
188  bool pm_suspended_storage(void)
189  {
190  	if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
191  		return false;
192  	return true;
193  }
194  #endif /* CONFIG_PM_SLEEP */
195  
196  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
197  unsigned int pageblock_order __read_mostly;
198  #endif
199  
200  static void __free_pages_ok(struct page *page, unsigned int order);
201  
202  /*
203   * results with 256, 32 in the lowmem_reserve sysctl:
204   *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
205   *	1G machine -> (16M dma, 784M normal, 224M high)
206   *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
207   *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
208   *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
209   *
210   * TBD: should special case ZONE_DMA32 machines here - in those we normally
211   * don't need any ZONE_NORMAL reservation
212   */
213  int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
214  #ifdef CONFIG_ZONE_DMA
215  	[ZONE_DMA] = 256,
216  #endif
217  #ifdef CONFIG_ZONE_DMA32
218  	[ZONE_DMA32] = 256,
219  #endif
220  	[ZONE_NORMAL] = 32,
221  #ifdef CONFIG_HIGHMEM
222  	[ZONE_HIGHMEM] = 0,
223  #endif
224  	[ZONE_MOVABLE] = 0,
225  };
226  
227  EXPORT_SYMBOL(totalram_pages);
228  
229  static char * const zone_names[MAX_NR_ZONES] = {
230  #ifdef CONFIG_ZONE_DMA
231  	 "DMA",
232  #endif
233  #ifdef CONFIG_ZONE_DMA32
234  	 "DMA32",
235  #endif
236  	 "Normal",
237  #ifdef CONFIG_HIGHMEM
238  	 "HighMem",
239  #endif
240  	 "Movable",
241  #ifdef CONFIG_ZONE_DEVICE
242  	 "Device",
243  #endif
244  };
245  
246  const char * const migratetype_names[MIGRATE_TYPES] = {
247  	"Unmovable",
248  	"Movable",
249  	"Reclaimable",
250  	"HighAtomic",
251  #ifdef CONFIG_CMA
252  	"CMA",
253  #endif
254  #ifdef CONFIG_MEMORY_ISOLATION
255  	"Isolate",
256  #endif
257  };
258  
259  compound_page_dtor * const compound_page_dtors[] = {
260  	NULL,
261  	free_compound_page,
262  #ifdef CONFIG_HUGETLB_PAGE
263  	free_huge_page,
264  #endif
265  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
266  	free_transhuge_page,
267  #endif
268  };
269  
270  int min_free_kbytes = 1024;
271  int user_min_free_kbytes = -1;
272  #ifdef CONFIG_DISCONTIGMEM
273  /*
274   * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
275   * are not on separate NUMA nodes. Functionally this works but with
276   * watermark_boost_factor, it can reclaim prematurely as the ranges can be
277   * quite small. By default, do not boost watermarks on discontigmem as in
278   * many cases very high-order allocations like THP are likely to be
279   * unsupported and the premature reclaim offsets the advantage of long-term
280   * fragmentation avoidance.
281   */
282  int watermark_boost_factor __read_mostly;
283  #else
284  int watermark_boost_factor __read_mostly = 15000;
285  #endif
286  int watermark_scale_factor = 10;
287  
288  static unsigned long nr_kernel_pages __initdata;
289  static unsigned long nr_all_pages __initdata;
290  static unsigned long dma_reserve __initdata;
291  
292  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
293  static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
294  static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
295  static unsigned long required_kernelcore __initdata;
296  static unsigned long required_kernelcore_percent __initdata;
297  static unsigned long required_movablecore __initdata;
298  static unsigned long required_movablecore_percent __initdata;
299  static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
300  static bool mirrored_kernelcore __meminitdata;
301  
302  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
303  int movable_zone;
304  EXPORT_SYMBOL(movable_zone);
305  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
306  
307  #if MAX_NUMNODES > 1
308  unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
309  unsigned int nr_online_nodes __read_mostly = 1;
310  EXPORT_SYMBOL(nr_node_ids);
311  EXPORT_SYMBOL(nr_online_nodes);
312  #endif
313  
314  int page_group_by_mobility_disabled __read_mostly;
315  
316  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
317  /*
318   * During boot we initialize deferred pages on-demand, as needed, but once
319   * page_alloc_init_late() has finished, the deferred pages are all initialized,
320   * and we can permanently disable that path.
321   */
322  static DEFINE_STATIC_KEY_TRUE(deferred_pages);
323  
324  /*
325   * Calling kasan_free_pages() only after deferred memory initialization
326   * has completed. Poisoning pages during deferred memory init will greatly
327   * lengthen the process and cause problem in large memory systems as the
328   * deferred pages initialization is done with interrupt disabled.
329   *
330   * Assuming that there will be no reference to those newly initialized
331   * pages before they are ever allocated, this should have no effect on
332   * KASAN memory tracking as the poison will be properly inserted at page
333   * allocation time. The only corner case is when pages are allocated by
334   * on-demand allocation and then freed again before the deferred pages
335   * initialization is done, but this is not likely to happen.
336   */
337  static inline void kasan_free_nondeferred_pages(struct page *page, int order)
338  {
339  	if (!static_branch_unlikely(&deferred_pages))
340  		kasan_free_pages(page, order);
341  }
342  
343  /* Returns true if the struct page for the pfn is uninitialised */
344  static inline bool __meminit early_page_uninitialised(unsigned long pfn)
345  {
346  	int nid = early_pfn_to_nid(pfn);
347  
348  	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
349  		return true;
350  
351  	return false;
352  }
353  
354  /*
355   * Returns true when the remaining initialisation should be deferred until
356   * later in the boot cycle when it can be parallelised.
357   */
358  static bool __meminit
359  defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
360  {
361  	static unsigned long prev_end_pfn, nr_initialised;
362  
363  	/*
364  	 * prev_end_pfn static that contains the end of previous zone
365  	 * No need to protect because called very early in boot before smp_init.
366  	 */
367  	if (prev_end_pfn != end_pfn) {
368  		prev_end_pfn = end_pfn;
369  		nr_initialised = 0;
370  	}
371  
372  	/* Always populate low zones for address-constrained allocations */
373  	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
374  		return false;
375  
376  	/*
377  	 * We start only with one section of pages, more pages are added as
378  	 * needed until the rest of deferred pages are initialized.
379  	 */
380  	nr_initialised++;
381  	if ((nr_initialised > PAGES_PER_SECTION) &&
382  	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
383  		NODE_DATA(nid)->first_deferred_pfn = pfn;
384  		return true;
385  	}
386  	return false;
387  }
388  #else
389  #define kasan_free_nondeferred_pages(p, o)	kasan_free_pages(p, o)
390  
391  static inline bool early_page_uninitialised(unsigned long pfn)
392  {
393  	return false;
394  }
395  
396  static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
397  {
398  	return false;
399  }
400  #endif
401  
402  /* Return a pointer to the bitmap storing bits affecting a block of pages */
403  static inline unsigned long *get_pageblock_bitmap(struct page *page,
404  							unsigned long pfn)
405  {
406  #ifdef CONFIG_SPARSEMEM
407  	return __pfn_to_section(pfn)->pageblock_flags;
408  #else
409  	return page_zone(page)->pageblock_flags;
410  #endif /* CONFIG_SPARSEMEM */
411  }
412  
413  static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
414  {
415  #ifdef CONFIG_SPARSEMEM
416  	pfn &= (PAGES_PER_SECTION-1);
417  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
418  #else
419  	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
420  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
421  #endif /* CONFIG_SPARSEMEM */
422  }
423  
424  /**
425   * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
426   * @page: The page within the block of interest
427   * @pfn: The target page frame number
428   * @end_bitidx: The last bit of interest to retrieve
429   * @mask: mask of bits that the caller is interested in
430   *
431   * Return: pageblock_bits flags
432   */
433  static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
434  					unsigned long pfn,
435  					unsigned long end_bitidx,
436  					unsigned long mask)
437  {
438  	unsigned long *bitmap;
439  	unsigned long bitidx, word_bitidx;
440  	unsigned long word;
441  
442  	bitmap = get_pageblock_bitmap(page, pfn);
443  	bitidx = pfn_to_bitidx(page, pfn);
444  	word_bitidx = bitidx / BITS_PER_LONG;
445  	bitidx &= (BITS_PER_LONG-1);
446  
447  	word = bitmap[word_bitidx];
448  	bitidx += end_bitidx;
449  	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
450  }
451  
452  unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
453  					unsigned long end_bitidx,
454  					unsigned long mask)
455  {
456  	return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
457  }
458  
459  static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
460  {
461  	return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
462  }
463  
464  /**
465   * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
466   * @page: The page within the block of interest
467   * @flags: The flags to set
468   * @pfn: The target page frame number
469   * @end_bitidx: The last bit of interest
470   * @mask: mask of bits that the caller is interested in
471   */
472  void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
473  					unsigned long pfn,
474  					unsigned long end_bitidx,
475  					unsigned long mask)
476  {
477  	unsigned long *bitmap;
478  	unsigned long bitidx, word_bitidx;
479  	unsigned long old_word, word;
480  
481  	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
482  	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
483  
484  	bitmap = get_pageblock_bitmap(page, pfn);
485  	bitidx = pfn_to_bitidx(page, pfn);
486  	word_bitidx = bitidx / BITS_PER_LONG;
487  	bitidx &= (BITS_PER_LONG-1);
488  
489  	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
490  
491  	bitidx += end_bitidx;
492  	mask <<= (BITS_PER_LONG - bitidx - 1);
493  	flags <<= (BITS_PER_LONG - bitidx - 1);
494  
495  	word = READ_ONCE(bitmap[word_bitidx]);
496  	for (;;) {
497  		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
498  		if (word == old_word)
499  			break;
500  		word = old_word;
501  	}
502  }
503  
504  void set_pageblock_migratetype(struct page *page, int migratetype)
505  {
506  	if (unlikely(page_group_by_mobility_disabled &&
507  		     migratetype < MIGRATE_PCPTYPES))
508  		migratetype = MIGRATE_UNMOVABLE;
509  
510  	set_pageblock_flags_group(page, (unsigned long)migratetype,
511  					PB_migrate, PB_migrate_end);
512  }
513  
514  #ifdef CONFIG_DEBUG_VM
515  static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
516  {
517  	int ret = 0;
518  	unsigned seq;
519  	unsigned long pfn = page_to_pfn(page);
520  	unsigned long sp, start_pfn;
521  
522  	do {
523  		seq = zone_span_seqbegin(zone);
524  		start_pfn = zone->zone_start_pfn;
525  		sp = zone->spanned_pages;
526  		if (!zone_spans_pfn(zone, pfn))
527  			ret = 1;
528  	} while (zone_span_seqretry(zone, seq));
529  
530  	if (ret)
531  		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
532  			pfn, zone_to_nid(zone), zone->name,
533  			start_pfn, start_pfn + sp);
534  
535  	return ret;
536  }
537  
538  static int page_is_consistent(struct zone *zone, struct page *page)
539  {
540  	if (!pfn_valid_within(page_to_pfn(page)))
541  		return 0;
542  	if (zone != page_zone(page))
543  		return 0;
544  
545  	return 1;
546  }
547  /*
548   * Temporary debugging check for pages not lying within a given zone.
549   */
550  static int __maybe_unused bad_range(struct zone *zone, struct page *page)
551  {
552  	if (page_outside_zone_boundaries(zone, page))
553  		return 1;
554  	if (!page_is_consistent(zone, page))
555  		return 1;
556  
557  	return 0;
558  }
559  #else
560  static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
561  {
562  	return 0;
563  }
564  #endif
565  
566  static void bad_page(struct page *page, const char *reason,
567  		unsigned long bad_flags)
568  {
569  	static unsigned long resume;
570  	static unsigned long nr_shown;
571  	static unsigned long nr_unshown;
572  
573  	/*
574  	 * Allow a burst of 60 reports, then keep quiet for that minute;
575  	 * or allow a steady drip of one report per second.
576  	 */
577  	if (nr_shown == 60) {
578  		if (time_before(jiffies, resume)) {
579  			nr_unshown++;
580  			goto out;
581  		}
582  		if (nr_unshown) {
583  			pr_alert(
584  			      "BUG: Bad page state: %lu messages suppressed\n",
585  				nr_unshown);
586  			nr_unshown = 0;
587  		}
588  		nr_shown = 0;
589  	}
590  	if (nr_shown++ == 0)
591  		resume = jiffies + 60 * HZ;
592  
593  	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
594  		current->comm, page_to_pfn(page));
595  	__dump_page(page, reason);
596  	bad_flags &= page->flags;
597  	if (bad_flags)
598  		pr_alert("bad because of flags: %#lx(%pGp)\n",
599  						bad_flags, &bad_flags);
600  	dump_page_owner(page);
601  
602  	print_modules();
603  	dump_stack();
604  out:
605  	/* Leave bad fields for debug, except PageBuddy could make trouble */
606  	page_mapcount_reset(page); /* remove PageBuddy */
607  	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
608  }
609  
610  /*
611   * Higher-order pages are called "compound pages".  They are structured thusly:
612   *
613   * The first PAGE_SIZE page is called the "head page" and have PG_head set.
614   *
615   * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
616   * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
617   *
618   * The first tail page's ->compound_dtor holds the offset in array of compound
619   * page destructors. See compound_page_dtors.
620   *
621   * The first tail page's ->compound_order holds the order of allocation.
622   * This usage means that zero-order pages may not be compound.
623   */
624  
625  void free_compound_page(struct page *page)
626  {
627  	__free_pages_ok(page, compound_order(page));
628  }
629  
630  void prep_compound_page(struct page *page, unsigned int order)
631  {
632  	int i;
633  	int nr_pages = 1 << order;
634  
635  	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
636  	set_compound_order(page, order);
637  	__SetPageHead(page);
638  	for (i = 1; i < nr_pages; i++) {
639  		struct page *p = page + i;
640  		set_page_count(p, 0);
641  		p->mapping = TAIL_MAPPING;
642  		set_compound_head(p, page);
643  	}
644  	atomic_set(compound_mapcount_ptr(page), -1);
645  }
646  
647  #ifdef CONFIG_DEBUG_PAGEALLOC
648  unsigned int _debug_guardpage_minorder;
649  bool _debug_pagealloc_enabled __read_mostly
650  			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
651  EXPORT_SYMBOL(_debug_pagealloc_enabled);
652  bool _debug_guardpage_enabled __read_mostly;
653  
654  static int __init early_debug_pagealloc(char *buf)
655  {
656  	if (!buf)
657  		return -EINVAL;
658  	return kstrtobool(buf, &_debug_pagealloc_enabled);
659  }
660  early_param("debug_pagealloc", early_debug_pagealloc);
661  
662  static bool need_debug_guardpage(void)
663  {
664  	/* If we don't use debug_pagealloc, we don't need guard page */
665  	if (!debug_pagealloc_enabled())
666  		return false;
667  
668  	if (!debug_guardpage_minorder())
669  		return false;
670  
671  	return true;
672  }
673  
674  static void init_debug_guardpage(void)
675  {
676  	if (!debug_pagealloc_enabled())
677  		return;
678  
679  	if (!debug_guardpage_minorder())
680  		return;
681  
682  	_debug_guardpage_enabled = true;
683  }
684  
685  struct page_ext_operations debug_guardpage_ops = {
686  	.need = need_debug_guardpage,
687  	.init = init_debug_guardpage,
688  };
689  
690  static int __init debug_guardpage_minorder_setup(char *buf)
691  {
692  	unsigned long res;
693  
694  	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
695  		pr_err("Bad debug_guardpage_minorder value\n");
696  		return 0;
697  	}
698  	_debug_guardpage_minorder = res;
699  	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
700  	return 0;
701  }
702  early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
703  
704  static inline bool set_page_guard(struct zone *zone, struct page *page,
705  				unsigned int order, int migratetype)
706  {
707  	struct page_ext *page_ext;
708  
709  	if (!debug_guardpage_enabled())
710  		return false;
711  
712  	if (order >= debug_guardpage_minorder())
713  		return false;
714  
715  	page_ext = lookup_page_ext(page);
716  	if (unlikely(!page_ext))
717  		return false;
718  
719  	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
720  
721  	INIT_LIST_HEAD(&page->lru);
722  	set_page_private(page, order);
723  	/* Guard pages are not available for any usage */
724  	__mod_zone_freepage_state(zone, -(1 << order), migratetype);
725  
726  	return true;
727  }
728  
729  static inline void clear_page_guard(struct zone *zone, struct page *page,
730  				unsigned int order, int migratetype)
731  {
732  	struct page_ext *page_ext;
733  
734  	if (!debug_guardpage_enabled())
735  		return;
736  
737  	page_ext = lookup_page_ext(page);
738  	if (unlikely(!page_ext))
739  		return;
740  
741  	__clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
742  
743  	set_page_private(page, 0);
744  	if (!is_migrate_isolate(migratetype))
745  		__mod_zone_freepage_state(zone, (1 << order), migratetype);
746  }
747  #else
748  struct page_ext_operations debug_guardpage_ops;
749  static inline bool set_page_guard(struct zone *zone, struct page *page,
750  			unsigned int order, int migratetype) { return false; }
751  static inline void clear_page_guard(struct zone *zone, struct page *page,
752  				unsigned int order, int migratetype) {}
753  #endif
754  
755  static inline void set_page_order(struct page *page, unsigned int order)
756  {
757  	set_page_private(page, order);
758  	__SetPageBuddy(page);
759  }
760  
761  /*
762   * This function checks whether a page is free && is the buddy
763   * we can coalesce a page and its buddy if
764   * (a) the buddy is not in a hole (check before calling!) &&
765   * (b) the buddy is in the buddy system &&
766   * (c) a page and its buddy have the same order &&
767   * (d) a page and its buddy are in the same zone.
768   *
769   * For recording whether a page is in the buddy system, we set PageBuddy.
770   * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
771   *
772   * For recording page's order, we use page_private(page).
773   */
774  static inline int page_is_buddy(struct page *page, struct page *buddy,
775  							unsigned int order)
776  {
777  	if (page_is_guard(buddy) && page_order(buddy) == order) {
778  		if (page_zone_id(page) != page_zone_id(buddy))
779  			return 0;
780  
781  		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
782  
783  		return 1;
784  	}
785  
786  	if (PageBuddy(buddy) && page_order(buddy) == order) {
787  		/*
788  		 * zone check is done late to avoid uselessly
789  		 * calculating zone/node ids for pages that could
790  		 * never merge.
791  		 */
792  		if (page_zone_id(page) != page_zone_id(buddy))
793  			return 0;
794  
795  		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
796  
797  		return 1;
798  	}
799  	return 0;
800  }
801  
802  #ifdef CONFIG_COMPACTION
803  static inline struct capture_control *task_capc(struct zone *zone)
804  {
805  	struct capture_control *capc = current->capture_control;
806  
807  	return capc &&
808  		!(current->flags & PF_KTHREAD) &&
809  		!capc->page &&
810  		capc->cc->zone == zone &&
811  		capc->cc->direct_compaction ? capc : NULL;
812  }
813  
814  static inline bool
815  compaction_capture(struct capture_control *capc, struct page *page,
816  		   int order, int migratetype)
817  {
818  	if (!capc || order != capc->cc->order)
819  		return false;
820  
821  	/* Do not accidentally pollute CMA or isolated regions*/
822  	if (is_migrate_cma(migratetype) ||
823  	    is_migrate_isolate(migratetype))
824  		return false;
825  
826  	/*
827  	 * Do not let lower order allocations polluate a movable pageblock.
828  	 * This might let an unmovable request use a reclaimable pageblock
829  	 * and vice-versa but no more than normal fallback logic which can
830  	 * have trouble finding a high-order free page.
831  	 */
832  	if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
833  		return false;
834  
835  	capc->page = page;
836  	return true;
837  }
838  
839  #else
840  static inline struct capture_control *task_capc(struct zone *zone)
841  {
842  	return NULL;
843  }
844  
845  static inline bool
846  compaction_capture(struct capture_control *capc, struct page *page,
847  		   int order, int migratetype)
848  {
849  	return false;
850  }
851  #endif /* CONFIG_COMPACTION */
852  
853  /*
854   * Freeing function for a buddy system allocator.
855   *
856   * The concept of a buddy system is to maintain direct-mapped table
857   * (containing bit values) for memory blocks of various "orders".
858   * The bottom level table contains the map for the smallest allocatable
859   * units of memory (here, pages), and each level above it describes
860   * pairs of units from the levels below, hence, "buddies".
861   * At a high level, all that happens here is marking the table entry
862   * at the bottom level available, and propagating the changes upward
863   * as necessary, plus some accounting needed to play nicely with other
864   * parts of the VM system.
865   * At each level, we keep a list of pages, which are heads of continuous
866   * free pages of length of (1 << order) and marked with PageBuddy.
867   * Page's order is recorded in page_private(page) field.
868   * So when we are allocating or freeing one, we can derive the state of the
869   * other.  That is, if we allocate a small block, and both were
870   * free, the remainder of the region must be split into blocks.
871   * If a block is freed, and its buddy is also free, then this
872   * triggers coalescing into a block of larger size.
873   *
874   * -- nyc
875   */
876  
877  static inline void __free_one_page(struct page *page,
878  		unsigned long pfn,
879  		struct zone *zone, unsigned int order,
880  		int migratetype)
881  {
882  	unsigned long combined_pfn;
883  	unsigned long uninitialized_var(buddy_pfn);
884  	struct page *buddy;
885  	unsigned int max_order;
886  	struct capture_control *capc = task_capc(zone);
887  
888  	max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
889  
890  	VM_BUG_ON(!zone_is_initialized(zone));
891  	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
892  
893  	VM_BUG_ON(migratetype == -1);
894  	if (likely(!is_migrate_isolate(migratetype)))
895  		__mod_zone_freepage_state(zone, 1 << order, migratetype);
896  
897  	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
898  	VM_BUG_ON_PAGE(bad_range(zone, page), page);
899  
900  continue_merging:
901  	while (order < max_order - 1) {
902  		if (compaction_capture(capc, page, order, migratetype)) {
903  			__mod_zone_freepage_state(zone, -(1 << order),
904  								migratetype);
905  			return;
906  		}
907  		buddy_pfn = __find_buddy_pfn(pfn, order);
908  		buddy = page + (buddy_pfn - pfn);
909  
910  		if (!pfn_valid_within(buddy_pfn))
911  			goto done_merging;
912  		if (!page_is_buddy(page, buddy, order))
913  			goto done_merging;
914  		/*
915  		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
916  		 * merge with it and move up one order.
917  		 */
918  		if (page_is_guard(buddy))
919  			clear_page_guard(zone, buddy, order, migratetype);
920  		else
921  			del_page_from_free_area(buddy, &zone->free_area[order]);
922  		combined_pfn = buddy_pfn & pfn;
923  		page = page + (combined_pfn - pfn);
924  		pfn = combined_pfn;
925  		order++;
926  	}
927  	if (max_order < MAX_ORDER) {
928  		/* If we are here, it means order is >= pageblock_order.
929  		 * We want to prevent merge between freepages on isolate
930  		 * pageblock and normal pageblock. Without this, pageblock
931  		 * isolation could cause incorrect freepage or CMA accounting.
932  		 *
933  		 * We don't want to hit this code for the more frequent
934  		 * low-order merging.
935  		 */
936  		if (unlikely(has_isolate_pageblock(zone))) {
937  			int buddy_mt;
938  
939  			buddy_pfn = __find_buddy_pfn(pfn, order);
940  			buddy = page + (buddy_pfn - pfn);
941  			buddy_mt = get_pageblock_migratetype(buddy);
942  
943  			if (migratetype != buddy_mt
944  					&& (is_migrate_isolate(migratetype) ||
945  						is_migrate_isolate(buddy_mt)))
946  				goto done_merging;
947  		}
948  		max_order++;
949  		goto continue_merging;
950  	}
951  
952  done_merging:
953  	set_page_order(page, order);
954  
955  	/*
956  	 * If this is not the largest possible page, check if the buddy
957  	 * of the next-highest order is free. If it is, it's possible
958  	 * that pages are being freed that will coalesce soon. In case,
959  	 * that is happening, add the free page to the tail of the list
960  	 * so it's less likely to be used soon and more likely to be merged
961  	 * as a higher order page
962  	 */
963  	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
964  			&& !is_shuffle_order(order)) {
965  		struct page *higher_page, *higher_buddy;
966  		combined_pfn = buddy_pfn & pfn;
967  		higher_page = page + (combined_pfn - pfn);
968  		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
969  		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
970  		if (pfn_valid_within(buddy_pfn) &&
971  		    page_is_buddy(higher_page, higher_buddy, order + 1)) {
972  			add_to_free_area_tail(page, &zone->free_area[order],
973  					      migratetype);
974  			return;
975  		}
976  	}
977  
978  	if (is_shuffle_order(order))
979  		add_to_free_area_random(page, &zone->free_area[order],
980  				migratetype);
981  	else
982  		add_to_free_area(page, &zone->free_area[order], migratetype);
983  
984  }
985  
986  /*
987   * A bad page could be due to a number of fields. Instead of multiple branches,
988   * try and check multiple fields with one check. The caller must do a detailed
989   * check if necessary.
990   */
991  static inline bool page_expected_state(struct page *page,
992  					unsigned long check_flags)
993  {
994  	if (unlikely(atomic_read(&page->_mapcount) != -1))
995  		return false;
996  
997  	if (unlikely((unsigned long)page->mapping |
998  			page_ref_count(page) |
999  #ifdef CONFIG_MEMCG
1000  			(unsigned long)page->mem_cgroup |
1001  #endif
1002  			(page->flags & check_flags)))
1003  		return false;
1004  
1005  	return true;
1006  }
1007  
1008  static void free_pages_check_bad(struct page *page)
1009  {
1010  	const char *bad_reason;
1011  	unsigned long bad_flags;
1012  
1013  	bad_reason = NULL;
1014  	bad_flags = 0;
1015  
1016  	if (unlikely(atomic_read(&page->_mapcount) != -1))
1017  		bad_reason = "nonzero mapcount";
1018  	if (unlikely(page->mapping != NULL))
1019  		bad_reason = "non-NULL mapping";
1020  	if (unlikely(page_ref_count(page) != 0))
1021  		bad_reason = "nonzero _refcount";
1022  	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
1023  		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1024  		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
1025  	}
1026  #ifdef CONFIG_MEMCG
1027  	if (unlikely(page->mem_cgroup))
1028  		bad_reason = "page still charged to cgroup";
1029  #endif
1030  	bad_page(page, bad_reason, bad_flags);
1031  }
1032  
1033  static inline int free_pages_check(struct page *page)
1034  {
1035  	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
1036  		return 0;
1037  
1038  	/* Something has gone sideways, find it */
1039  	free_pages_check_bad(page);
1040  	return 1;
1041  }
1042  
1043  static int free_tail_pages_check(struct page *head_page, struct page *page)
1044  {
1045  	int ret = 1;
1046  
1047  	/*
1048  	 * We rely page->lru.next never has bit 0 set, unless the page
1049  	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
1050  	 */
1051  	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
1052  
1053  	if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
1054  		ret = 0;
1055  		goto out;
1056  	}
1057  	switch (page - head_page) {
1058  	case 1:
1059  		/* the first tail page: ->mapping may be compound_mapcount() */
1060  		if (unlikely(compound_mapcount(page))) {
1061  			bad_page(page, "nonzero compound_mapcount", 0);
1062  			goto out;
1063  		}
1064  		break;
1065  	case 2:
1066  		/*
1067  		 * the second tail page: ->mapping is
1068  		 * deferred_list.next -- ignore value.
1069  		 */
1070  		break;
1071  	default:
1072  		if (page->mapping != TAIL_MAPPING) {
1073  			bad_page(page, "corrupted mapping in tail page", 0);
1074  			goto out;
1075  		}
1076  		break;
1077  	}
1078  	if (unlikely(!PageTail(page))) {
1079  		bad_page(page, "PageTail not set", 0);
1080  		goto out;
1081  	}
1082  	if (unlikely(compound_head(page) != head_page)) {
1083  		bad_page(page, "compound_head not consistent", 0);
1084  		goto out;
1085  	}
1086  	ret = 0;
1087  out:
1088  	page->mapping = NULL;
1089  	clear_compound_head(page);
1090  	return ret;
1091  }
1092  
1093  static __always_inline bool free_pages_prepare(struct page *page,
1094  					unsigned int order, bool check_free)
1095  {
1096  	int bad = 0;
1097  
1098  	VM_BUG_ON_PAGE(PageTail(page), page);
1099  
1100  	trace_mm_page_free(page, order);
1101  
1102  	/*
1103  	 * Check tail pages before head page information is cleared to
1104  	 * avoid checking PageCompound for order-0 pages.
1105  	 */
1106  	if (unlikely(order)) {
1107  		bool compound = PageCompound(page);
1108  		int i;
1109  
1110  		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1111  
1112  		if (compound)
1113  			ClearPageDoubleMap(page);
1114  		for (i = 1; i < (1 << order); i++) {
1115  			if (compound)
1116  				bad += free_tail_pages_check(page, page + i);
1117  			if (unlikely(free_pages_check(page + i))) {
1118  				bad++;
1119  				continue;
1120  			}
1121  			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1122  		}
1123  	}
1124  	if (PageMappingFlags(page))
1125  		page->mapping = NULL;
1126  	if (memcg_kmem_enabled() && PageKmemcg(page))
1127  		__memcg_kmem_uncharge(page, order);
1128  	if (check_free)
1129  		bad += free_pages_check(page);
1130  	if (bad)
1131  		return false;
1132  
1133  	page_cpupid_reset_last(page);
1134  	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1135  	reset_page_owner(page, order);
1136  
1137  	if (!PageHighMem(page)) {
1138  		debug_check_no_locks_freed(page_address(page),
1139  					   PAGE_SIZE << order);
1140  		debug_check_no_obj_freed(page_address(page),
1141  					   PAGE_SIZE << order);
1142  	}
1143  	arch_free_page(page, order);
1144  	kernel_poison_pages(page, 1 << order, 0);
1145  	if (debug_pagealloc_enabled())
1146  		kernel_map_pages(page, 1 << order, 0);
1147  
1148  	kasan_free_nondeferred_pages(page, order);
1149  
1150  	return true;
1151  }
1152  
1153  #ifdef CONFIG_DEBUG_VM
1154  static inline bool free_pcp_prepare(struct page *page)
1155  {
1156  	return free_pages_prepare(page, 0, true);
1157  }
1158  
1159  static inline bool bulkfree_pcp_prepare(struct page *page)
1160  {
1161  	return false;
1162  }
1163  #else
1164  static bool free_pcp_prepare(struct page *page)
1165  {
1166  	return free_pages_prepare(page, 0, false);
1167  }
1168  
1169  static bool bulkfree_pcp_prepare(struct page *page)
1170  {
1171  	return free_pages_check(page);
1172  }
1173  #endif /* CONFIG_DEBUG_VM */
1174  
1175  static inline void prefetch_buddy(struct page *page)
1176  {
1177  	unsigned long pfn = page_to_pfn(page);
1178  	unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
1179  	struct page *buddy = page + (buddy_pfn - pfn);
1180  
1181  	prefetch(buddy);
1182  }
1183  
1184  /*
1185   * Frees a number of pages from the PCP lists
1186   * Assumes all pages on list are in same zone, and of same order.
1187   * count is the number of pages to free.
1188   *
1189   * If the zone was previously in an "all pages pinned" state then look to
1190   * see if this freeing clears that state.
1191   *
1192   * And clear the zone's pages_scanned counter, to hold off the "all pages are
1193   * pinned" detection logic.
1194   */
1195  static void free_pcppages_bulk(struct zone *zone, int count,
1196  					struct per_cpu_pages *pcp)
1197  {
1198  	int migratetype = 0;
1199  	int batch_free = 0;
1200  	int prefetch_nr = 0;
1201  	bool isolated_pageblocks;
1202  	struct page *page, *tmp;
1203  	LIST_HEAD(head);
1204  
1205  	while (count) {
1206  		struct list_head *list;
1207  
1208  		/*
1209  		 * Remove pages from lists in a round-robin fashion. A
1210  		 * batch_free count is maintained that is incremented when an
1211  		 * empty list is encountered.  This is so more pages are freed
1212  		 * off fuller lists instead of spinning excessively around empty
1213  		 * lists
1214  		 */
1215  		do {
1216  			batch_free++;
1217  			if (++migratetype == MIGRATE_PCPTYPES)
1218  				migratetype = 0;
1219  			list = &pcp->lists[migratetype];
1220  		} while (list_empty(list));
1221  
1222  		/* This is the only non-empty list. Free them all. */
1223  		if (batch_free == MIGRATE_PCPTYPES)
1224  			batch_free = count;
1225  
1226  		do {
1227  			page = list_last_entry(list, struct page, lru);
1228  			/* must delete to avoid corrupting pcp list */
1229  			list_del(&page->lru);
1230  			pcp->count--;
1231  
1232  			if (bulkfree_pcp_prepare(page))
1233  				continue;
1234  
1235  			list_add_tail(&page->lru, &head);
1236  
1237  			/*
1238  			 * We are going to put the page back to the global
1239  			 * pool, prefetch its buddy to speed up later access
1240  			 * under zone->lock. It is believed the overhead of
1241  			 * an additional test and calculating buddy_pfn here
1242  			 * can be offset by reduced memory latency later. To
1243  			 * avoid excessive prefetching due to large count, only
1244  			 * prefetch buddy for the first pcp->batch nr of pages.
1245  			 */
1246  			if (prefetch_nr++ < pcp->batch)
1247  				prefetch_buddy(page);
1248  		} while (--count && --batch_free && !list_empty(list));
1249  	}
1250  
1251  	spin_lock(&zone->lock);
1252  	isolated_pageblocks = has_isolate_pageblock(zone);
1253  
1254  	/*
1255  	 * Use safe version since after __free_one_page(),
1256  	 * page->lru.next will not point to original list.
1257  	 */
1258  	list_for_each_entry_safe(page, tmp, &head, lru) {
1259  		int mt = get_pcppage_migratetype(page);
1260  		/* MIGRATE_ISOLATE page should not go to pcplists */
1261  		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1262  		/* Pageblock could have been isolated meanwhile */
1263  		if (unlikely(isolated_pageblocks))
1264  			mt = get_pageblock_migratetype(page);
1265  
1266  		__free_one_page(page, page_to_pfn(page), zone, 0, mt);
1267  		trace_mm_page_pcpu_drain(page, 0, mt);
1268  	}
1269  	spin_unlock(&zone->lock);
1270  }
1271  
1272  static void free_one_page(struct zone *zone,
1273  				struct page *page, unsigned long pfn,
1274  				unsigned int order,
1275  				int migratetype)
1276  {
1277  	spin_lock(&zone->lock);
1278  	if (unlikely(has_isolate_pageblock(zone) ||
1279  		is_migrate_isolate(migratetype))) {
1280  		migratetype = get_pfnblock_migratetype(page, pfn);
1281  	}
1282  	__free_one_page(page, pfn, zone, order, migratetype);
1283  	spin_unlock(&zone->lock);
1284  }
1285  
1286  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1287  				unsigned long zone, int nid)
1288  {
1289  	mm_zero_struct_page(page);
1290  	set_page_links(page, zone, nid, pfn);
1291  	init_page_count(page);
1292  	page_mapcount_reset(page);
1293  	page_cpupid_reset_last(page);
1294  	page_kasan_tag_reset(page);
1295  
1296  	INIT_LIST_HEAD(&page->lru);
1297  #ifdef WANT_PAGE_VIRTUAL
1298  	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
1299  	if (!is_highmem_idx(zone))
1300  		set_page_address(page, __va(pfn << PAGE_SHIFT));
1301  #endif
1302  }
1303  
1304  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1305  static void __meminit init_reserved_page(unsigned long pfn)
1306  {
1307  	pg_data_t *pgdat;
1308  	int nid, zid;
1309  
1310  	if (!early_page_uninitialised(pfn))
1311  		return;
1312  
1313  	nid = early_pfn_to_nid(pfn);
1314  	pgdat = NODE_DATA(nid);
1315  
1316  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1317  		struct zone *zone = &pgdat->node_zones[zid];
1318  
1319  		if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
1320  			break;
1321  	}
1322  	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
1323  }
1324  #else
1325  static inline void init_reserved_page(unsigned long pfn)
1326  {
1327  }
1328  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1329  
1330  /*
1331   * Initialised pages do not have PageReserved set. This function is
1332   * called for each range allocated by the bootmem allocator and
1333   * marks the pages PageReserved. The remaining valid pages are later
1334   * sent to the buddy page allocator.
1335   */
1336  void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1337  {
1338  	unsigned long start_pfn = PFN_DOWN(start);
1339  	unsigned long end_pfn = PFN_UP(end);
1340  
1341  	for (; start_pfn < end_pfn; start_pfn++) {
1342  		if (pfn_valid(start_pfn)) {
1343  			struct page *page = pfn_to_page(start_pfn);
1344  
1345  			init_reserved_page(start_pfn);
1346  
1347  			/* Avoid false-positive PageTail() */
1348  			INIT_LIST_HEAD(&page->lru);
1349  
1350  			/*
1351  			 * no need for atomic set_bit because the struct
1352  			 * page is not visible yet so nobody should
1353  			 * access it yet.
1354  			 */
1355  			__SetPageReserved(page);
1356  		}
1357  	}
1358  }
1359  
1360  static void __free_pages_ok(struct page *page, unsigned int order)
1361  {
1362  	unsigned long flags;
1363  	int migratetype;
1364  	unsigned long pfn = page_to_pfn(page);
1365  
1366  	if (!free_pages_prepare(page, order, true))
1367  		return;
1368  
1369  	migratetype = get_pfnblock_migratetype(page, pfn);
1370  	local_irq_save(flags);
1371  	__count_vm_events(PGFREE, 1 << order);
1372  	free_one_page(page_zone(page), page, pfn, order, migratetype);
1373  	local_irq_restore(flags);
1374  }
1375  
1376  void __free_pages_core(struct page *page, unsigned int order)
1377  {
1378  	unsigned int nr_pages = 1 << order;
1379  	struct page *p = page;
1380  	unsigned int loop;
1381  
1382  	prefetchw(p);
1383  	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1384  		prefetchw(p + 1);
1385  		__ClearPageReserved(p);
1386  		set_page_count(p, 0);
1387  	}
1388  	__ClearPageReserved(p);
1389  	set_page_count(p, 0);
1390  
1391  	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1392  	set_page_refcounted(page);
1393  	__free_pages(page, order);
1394  }
1395  
1396  #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1397  	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1398  
1399  static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1400  
1401  int __meminit early_pfn_to_nid(unsigned long pfn)
1402  {
1403  	static DEFINE_SPINLOCK(early_pfn_lock);
1404  	int nid;
1405  
1406  	spin_lock(&early_pfn_lock);
1407  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1408  	if (nid < 0)
1409  		nid = first_online_node;
1410  	spin_unlock(&early_pfn_lock);
1411  
1412  	return nid;
1413  }
1414  #endif
1415  
1416  #ifdef CONFIG_NODES_SPAN_OTHER_NODES
1417  /* Only safe to use early in boot when initialisation is single-threaded */
1418  static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1419  {
1420  	int nid;
1421  
1422  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1423  	if (nid >= 0 && nid != node)
1424  		return false;
1425  	return true;
1426  }
1427  
1428  #else
1429  static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1430  {
1431  	return true;
1432  }
1433  #endif
1434  
1435  
1436  void __init memblock_free_pages(struct page *page, unsigned long pfn,
1437  							unsigned int order)
1438  {
1439  	if (early_page_uninitialised(pfn))
1440  		return;
1441  	__free_pages_core(page, order);
1442  }
1443  
1444  /*
1445   * Check that the whole (or subset of) a pageblock given by the interval of
1446   * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1447   * with the migration of free compaction scanner. The scanners then need to
1448   * use only pfn_valid_within() check for arches that allow holes within
1449   * pageblocks.
1450   *
1451   * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1452   *
1453   * It's possible on some configurations to have a setup like node0 node1 node0
1454   * i.e. it's possible that all pages within a zones range of pages do not
1455   * belong to a single zone. We assume that a border between node0 and node1
1456   * can occur within a single pageblock, but not a node0 node1 node0
1457   * interleaving within a single pageblock. It is therefore sufficient to check
1458   * the first and last page of a pageblock and avoid checking each individual
1459   * page in a pageblock.
1460   */
1461  struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1462  				     unsigned long end_pfn, struct zone *zone)
1463  {
1464  	struct page *start_page;
1465  	struct page *end_page;
1466  
1467  	/* end_pfn is one past the range we are checking */
1468  	end_pfn--;
1469  
1470  	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1471  		return NULL;
1472  
1473  	start_page = pfn_to_online_page(start_pfn);
1474  	if (!start_page)
1475  		return NULL;
1476  
1477  	if (page_zone(start_page) != zone)
1478  		return NULL;
1479  
1480  	end_page = pfn_to_page(end_pfn);
1481  
1482  	/* This gives a shorter code than deriving page_zone(end_page) */
1483  	if (page_zone_id(start_page) != page_zone_id(end_page))
1484  		return NULL;
1485  
1486  	return start_page;
1487  }
1488  
1489  void set_zone_contiguous(struct zone *zone)
1490  {
1491  	unsigned long block_start_pfn = zone->zone_start_pfn;
1492  	unsigned long block_end_pfn;
1493  
1494  	block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1495  	for (; block_start_pfn < zone_end_pfn(zone);
1496  			block_start_pfn = block_end_pfn,
1497  			 block_end_pfn += pageblock_nr_pages) {
1498  
1499  		block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1500  
1501  		if (!__pageblock_pfn_to_page(block_start_pfn,
1502  					     block_end_pfn, zone))
1503  			return;
1504  	}
1505  
1506  	/* We confirm that there is no hole */
1507  	zone->contiguous = true;
1508  }
1509  
1510  void clear_zone_contiguous(struct zone *zone)
1511  {
1512  	zone->contiguous = false;
1513  }
1514  
1515  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1516  static void __init deferred_free_range(unsigned long pfn,
1517  				       unsigned long nr_pages)
1518  {
1519  	struct page *page;
1520  	unsigned long i;
1521  
1522  	if (!nr_pages)
1523  		return;
1524  
1525  	page = pfn_to_page(pfn);
1526  
1527  	/* Free a large naturally-aligned chunk if possible */
1528  	if (nr_pages == pageblock_nr_pages &&
1529  	    (pfn & (pageblock_nr_pages - 1)) == 0) {
1530  		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1531  		__free_pages_core(page, pageblock_order);
1532  		return;
1533  	}
1534  
1535  	for (i = 0; i < nr_pages; i++, page++, pfn++) {
1536  		if ((pfn & (pageblock_nr_pages - 1)) == 0)
1537  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1538  		__free_pages_core(page, 0);
1539  	}
1540  }
1541  
1542  /* Completion tracking for deferred_init_memmap() threads */
1543  static atomic_t pgdat_init_n_undone __initdata;
1544  static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1545  
1546  static inline void __init pgdat_init_report_one_done(void)
1547  {
1548  	if (atomic_dec_and_test(&pgdat_init_n_undone))
1549  		complete(&pgdat_init_all_done_comp);
1550  }
1551  
1552  /*
1553   * Returns true if page needs to be initialized or freed to buddy allocator.
1554   *
1555   * First we check if pfn is valid on architectures where it is possible to have
1556   * holes within pageblock_nr_pages. On systems where it is not possible, this
1557   * function is optimized out.
1558   *
1559   * Then, we check if a current large page is valid by only checking the validity
1560   * of the head pfn.
1561   */
1562  static inline bool __init deferred_pfn_valid(unsigned long pfn)
1563  {
1564  	if (!pfn_valid_within(pfn))
1565  		return false;
1566  	if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1567  		return false;
1568  	return true;
1569  }
1570  
1571  /*
1572   * Free pages to buddy allocator. Try to free aligned pages in
1573   * pageblock_nr_pages sizes.
1574   */
1575  static void __init deferred_free_pages(unsigned long pfn,
1576  				       unsigned long end_pfn)
1577  {
1578  	unsigned long nr_pgmask = pageblock_nr_pages - 1;
1579  	unsigned long nr_free = 0;
1580  
1581  	for (; pfn < end_pfn; pfn++) {
1582  		if (!deferred_pfn_valid(pfn)) {
1583  			deferred_free_range(pfn - nr_free, nr_free);
1584  			nr_free = 0;
1585  		} else if (!(pfn & nr_pgmask)) {
1586  			deferred_free_range(pfn - nr_free, nr_free);
1587  			nr_free = 1;
1588  			touch_nmi_watchdog();
1589  		} else {
1590  			nr_free++;
1591  		}
1592  	}
1593  	/* Free the last block of pages to allocator */
1594  	deferred_free_range(pfn - nr_free, nr_free);
1595  }
1596  
1597  /*
1598   * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
1599   * by performing it only once every pageblock_nr_pages.
1600   * Return number of pages initialized.
1601   */
1602  static unsigned long  __init deferred_init_pages(struct zone *zone,
1603  						 unsigned long pfn,
1604  						 unsigned long end_pfn)
1605  {
1606  	unsigned long nr_pgmask = pageblock_nr_pages - 1;
1607  	int nid = zone_to_nid(zone);
1608  	unsigned long nr_pages = 0;
1609  	int zid = zone_idx(zone);
1610  	struct page *page = NULL;
1611  
1612  	for (; pfn < end_pfn; pfn++) {
1613  		if (!deferred_pfn_valid(pfn)) {
1614  			page = NULL;
1615  			continue;
1616  		} else if (!page || !(pfn & nr_pgmask)) {
1617  			page = pfn_to_page(pfn);
1618  			touch_nmi_watchdog();
1619  		} else {
1620  			page++;
1621  		}
1622  		__init_single_page(page, pfn, zid, nid);
1623  		nr_pages++;
1624  	}
1625  	return (nr_pages);
1626  }
1627  
1628  /*
1629   * This function is meant to pre-load the iterator for the zone init.
1630   * Specifically it walks through the ranges until we are caught up to the
1631   * first_init_pfn value and exits there. If we never encounter the value we
1632   * return false indicating there are no valid ranges left.
1633   */
1634  static bool __init
1635  deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1636  				    unsigned long *spfn, unsigned long *epfn,
1637  				    unsigned long first_init_pfn)
1638  {
1639  	u64 j;
1640  
1641  	/*
1642  	 * Start out by walking through the ranges in this zone that have
1643  	 * already been initialized. We don't need to do anything with them
1644  	 * so we just need to flush them out of the system.
1645  	 */
1646  	for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1647  		if (*epfn <= first_init_pfn)
1648  			continue;
1649  		if (*spfn < first_init_pfn)
1650  			*spfn = first_init_pfn;
1651  		*i = j;
1652  		return true;
1653  	}
1654  
1655  	return false;
1656  }
1657  
1658  /*
1659   * Initialize and free pages. We do it in two loops: first we initialize
1660   * struct page, then free to buddy allocator, because while we are
1661   * freeing pages we can access pages that are ahead (computing buddy
1662   * page in __free_one_page()).
1663   *
1664   * In order to try and keep some memory in the cache we have the loop
1665   * broken along max page order boundaries. This way we will not cause
1666   * any issues with the buddy page computation.
1667   */
1668  static unsigned long __init
1669  deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1670  		       unsigned long *end_pfn)
1671  {
1672  	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1673  	unsigned long spfn = *start_pfn, epfn = *end_pfn;
1674  	unsigned long nr_pages = 0;
1675  	u64 j = *i;
1676  
1677  	/* First we loop through and initialize the page values */
1678  	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1679  		unsigned long t;
1680  
1681  		if (mo_pfn <= *start_pfn)
1682  			break;
1683  
1684  		t = min(mo_pfn, *end_pfn);
1685  		nr_pages += deferred_init_pages(zone, *start_pfn, t);
1686  
1687  		if (mo_pfn < *end_pfn) {
1688  			*start_pfn = mo_pfn;
1689  			break;
1690  		}
1691  	}
1692  
1693  	/* Reset values and now loop through freeing pages as needed */
1694  	swap(j, *i);
1695  
1696  	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1697  		unsigned long t;
1698  
1699  		if (mo_pfn <= spfn)
1700  			break;
1701  
1702  		t = min(mo_pfn, epfn);
1703  		deferred_free_pages(spfn, t);
1704  
1705  		if (mo_pfn <= epfn)
1706  			break;
1707  	}
1708  
1709  	return nr_pages;
1710  }
1711  
1712  /* Initialise remaining memory on a node */
1713  static int __init deferred_init_memmap(void *data)
1714  {
1715  	pg_data_t *pgdat = data;
1716  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1717  	unsigned long spfn = 0, epfn = 0, nr_pages = 0;
1718  	unsigned long first_init_pfn, flags;
1719  	unsigned long start = jiffies;
1720  	struct zone *zone;
1721  	int zid;
1722  	u64 i;
1723  
1724  	/* Bind memory initialisation thread to a local node if possible */
1725  	if (!cpumask_empty(cpumask))
1726  		set_cpus_allowed_ptr(current, cpumask);
1727  
1728  	pgdat_resize_lock(pgdat, &flags);
1729  	first_init_pfn = pgdat->first_deferred_pfn;
1730  	if (first_init_pfn == ULONG_MAX) {
1731  		pgdat_resize_unlock(pgdat, &flags);
1732  		pgdat_init_report_one_done();
1733  		return 0;
1734  	}
1735  
1736  	/* Sanity check boundaries */
1737  	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
1738  	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
1739  	pgdat->first_deferred_pfn = ULONG_MAX;
1740  
1741  	/* Only the highest zone is deferred so find it */
1742  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1743  		zone = pgdat->node_zones + zid;
1744  		if (first_init_pfn < zone_end_pfn(zone))
1745  			break;
1746  	}
1747  
1748  	/* If the zone is empty somebody else may have cleared out the zone */
1749  	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
1750  						 first_init_pfn))
1751  		goto zone_empty;
1752  
1753  	/*
1754  	 * Initialize and free pages in MAX_ORDER sized increments so
1755  	 * that we can avoid introducing any issues with the buddy
1756  	 * allocator.
1757  	 */
1758  	while (spfn < epfn)
1759  		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
1760  zone_empty:
1761  	pgdat_resize_unlock(pgdat, &flags);
1762  
1763  	/* Sanity check that the next zone really is unpopulated */
1764  	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
1765  
1766  	pr_info("node %d initialised, %lu pages in %ums\n",
1767  		pgdat->node_id,	nr_pages, jiffies_to_msecs(jiffies - start));
1768  
1769  	pgdat_init_report_one_done();
1770  	return 0;
1771  }
1772  
1773  /*
1774   * If this zone has deferred pages, try to grow it by initializing enough
1775   * deferred pages to satisfy the allocation specified by order, rounded up to
1776   * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
1777   * of SECTION_SIZE bytes by initializing struct pages in increments of
1778   * PAGES_PER_SECTION * sizeof(struct page) bytes.
1779   *
1780   * Return true when zone was grown, otherwise return false. We return true even
1781   * when we grow less than requested, to let the caller decide if there are
1782   * enough pages to satisfy the allocation.
1783   *
1784   * Note: We use noinline because this function is needed only during boot, and
1785   * it is called from a __ref function _deferred_grow_zone. This way we are
1786   * making sure that it is not inlined into permanent text section.
1787   */
1788  static noinline bool __init
1789  deferred_grow_zone(struct zone *zone, unsigned int order)
1790  {
1791  	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
1792  	pg_data_t *pgdat = zone->zone_pgdat;
1793  	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
1794  	unsigned long spfn, epfn, flags;
1795  	unsigned long nr_pages = 0;
1796  	u64 i;
1797  
1798  	/* Only the last zone may have deferred pages */
1799  	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
1800  		return false;
1801  
1802  	pgdat_resize_lock(pgdat, &flags);
1803  
1804  	/*
1805  	 * If deferred pages have been initialized while we were waiting for
1806  	 * the lock, return true, as the zone was grown.  The caller will retry
1807  	 * this zone.  We won't return to this function since the caller also
1808  	 * has this static branch.
1809  	 */
1810  	if (!static_branch_unlikely(&deferred_pages)) {
1811  		pgdat_resize_unlock(pgdat, &flags);
1812  		return true;
1813  	}
1814  
1815  	/*
1816  	 * If someone grew this zone while we were waiting for spinlock, return
1817  	 * true, as there might be enough pages already.
1818  	 */
1819  	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
1820  		pgdat_resize_unlock(pgdat, &flags);
1821  		return true;
1822  	}
1823  
1824  	/* If the zone is empty somebody else may have cleared out the zone */
1825  	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
1826  						 first_deferred_pfn)) {
1827  		pgdat->first_deferred_pfn = ULONG_MAX;
1828  		pgdat_resize_unlock(pgdat, &flags);
1829  		return true;
1830  	}
1831  
1832  	/*
1833  	 * Initialize and free pages in MAX_ORDER sized increments so
1834  	 * that we can avoid introducing any issues with the buddy
1835  	 * allocator.
1836  	 */
1837  	while (spfn < epfn) {
1838  		/* update our first deferred PFN for this section */
1839  		first_deferred_pfn = spfn;
1840  
1841  		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
1842  
1843  		/* We should only stop along section boundaries */
1844  		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
1845  			continue;
1846  
1847  		/* If our quota has been met we can stop here */
1848  		if (nr_pages >= nr_pages_needed)
1849  			break;
1850  	}
1851  
1852  	pgdat->first_deferred_pfn = spfn;
1853  	pgdat_resize_unlock(pgdat, &flags);
1854  
1855  	return nr_pages > 0;
1856  }
1857  
1858  /*
1859   * deferred_grow_zone() is __init, but it is called from
1860   * get_page_from_freelist() during early boot until deferred_pages permanently
1861   * disables this call. This is why we have refdata wrapper to avoid warning,
1862   * and to ensure that the function body gets unloaded.
1863   */
1864  static bool __ref
1865  _deferred_grow_zone(struct zone *zone, unsigned int order)
1866  {
1867  	return deferred_grow_zone(zone, order);
1868  }
1869  
1870  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1871  
1872  void __init page_alloc_init_late(void)
1873  {
1874  	struct zone *zone;
1875  	int nid;
1876  
1877  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1878  
1879  	/* There will be num_node_state(N_MEMORY) threads */
1880  	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
1881  	for_each_node_state(nid, N_MEMORY) {
1882  		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
1883  	}
1884  
1885  	/* Block until all are initialised */
1886  	wait_for_completion(&pgdat_init_all_done_comp);
1887  
1888  	/*
1889  	 * We initialized the rest of the deferred pages.  Permanently disable
1890  	 * on-demand struct page initialization.
1891  	 */
1892  	static_branch_disable(&deferred_pages);
1893  
1894  	/* Reinit limits that are based on free pages after the kernel is up */
1895  	files_maxfiles_init();
1896  #endif
1897  
1898  	/* Discard memblock private memory */
1899  	memblock_discard();
1900  
1901  	for_each_node_state(nid, N_MEMORY)
1902  		shuffle_free_memory(NODE_DATA(nid));
1903  
1904  	for_each_populated_zone(zone)
1905  		set_zone_contiguous(zone);
1906  }
1907  
1908  #ifdef CONFIG_CMA
1909  /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
1910  void __init init_cma_reserved_pageblock(struct page *page)
1911  {
1912  	unsigned i = pageblock_nr_pages;
1913  	struct page *p = page;
1914  
1915  	do {
1916  		__ClearPageReserved(p);
1917  		set_page_count(p, 0);
1918  	} while (++p, --i);
1919  
1920  	set_pageblock_migratetype(page, MIGRATE_CMA);
1921  
1922  	if (pageblock_order >= MAX_ORDER) {
1923  		i = pageblock_nr_pages;
1924  		p = page;
1925  		do {
1926  			set_page_refcounted(p);
1927  			__free_pages(p, MAX_ORDER - 1);
1928  			p += MAX_ORDER_NR_PAGES;
1929  		} while (i -= MAX_ORDER_NR_PAGES);
1930  	} else {
1931  		set_page_refcounted(page);
1932  		__free_pages(page, pageblock_order);
1933  	}
1934  
1935  	adjust_managed_page_count(page, pageblock_nr_pages);
1936  }
1937  #endif
1938  
1939  /*
1940   * The order of subdivision here is critical for the IO subsystem.
1941   * Please do not alter this order without good reasons and regression
1942   * testing. Specifically, as large blocks of memory are subdivided,
1943   * the order in which smaller blocks are delivered depends on the order
1944   * they're subdivided in this function. This is the primary factor
1945   * influencing the order in which pages are delivered to the IO
1946   * subsystem according to empirical testing, and this is also justified
1947   * by considering the behavior of a buddy system containing a single
1948   * large block of memory acted on by a series of small allocations.
1949   * This behavior is a critical factor in sglist merging's success.
1950   *
1951   * -- nyc
1952   */
1953  static inline void expand(struct zone *zone, struct page *page,
1954  	int low, int high, struct free_area *area,
1955  	int migratetype)
1956  {
1957  	unsigned long size = 1 << high;
1958  
1959  	while (high > low) {
1960  		area--;
1961  		high--;
1962  		size >>= 1;
1963  		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1964  
1965  		/*
1966  		 * Mark as guard pages (or page), that will allow to
1967  		 * merge back to allocator when buddy will be freed.
1968  		 * Corresponding page table entries will not be touched,
1969  		 * pages will stay not present in virtual address space
1970  		 */
1971  		if (set_page_guard(zone, &page[size], high, migratetype))
1972  			continue;
1973  
1974  		add_to_free_area(&page[size], area, migratetype);
1975  		set_page_order(&page[size], high);
1976  	}
1977  }
1978  
1979  static void check_new_page_bad(struct page *page)
1980  {
1981  	const char *bad_reason = NULL;
1982  	unsigned long bad_flags = 0;
1983  
1984  	if (unlikely(atomic_read(&page->_mapcount) != -1))
1985  		bad_reason = "nonzero mapcount";
1986  	if (unlikely(page->mapping != NULL))
1987  		bad_reason = "non-NULL mapping";
1988  	if (unlikely(page_ref_count(page) != 0))
1989  		bad_reason = "nonzero _refcount";
1990  	if (unlikely(page->flags & __PG_HWPOISON)) {
1991  		bad_reason = "HWPoisoned (hardware-corrupted)";
1992  		bad_flags = __PG_HWPOISON;
1993  		/* Don't complain about hwpoisoned pages */
1994  		page_mapcount_reset(page); /* remove PageBuddy */
1995  		return;
1996  	}
1997  	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1998  		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1999  		bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
2000  	}
2001  #ifdef CONFIG_MEMCG
2002  	if (unlikely(page->mem_cgroup))
2003  		bad_reason = "page still charged to cgroup";
2004  #endif
2005  	bad_page(page, bad_reason, bad_flags);
2006  }
2007  
2008  /*
2009   * This page is about to be returned from the page allocator
2010   */
2011  static inline int check_new_page(struct page *page)
2012  {
2013  	if (likely(page_expected_state(page,
2014  				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
2015  		return 0;
2016  
2017  	check_new_page_bad(page);
2018  	return 1;
2019  }
2020  
2021  static inline bool free_pages_prezeroed(void)
2022  {
2023  	return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
2024  		page_poisoning_enabled();
2025  }
2026  
2027  #ifdef CONFIG_DEBUG_VM
2028  static bool check_pcp_refill(struct page *page)
2029  {
2030  	return false;
2031  }
2032  
2033  static bool check_new_pcp(struct page *page)
2034  {
2035  	return check_new_page(page);
2036  }
2037  #else
2038  static bool check_pcp_refill(struct page *page)
2039  {
2040  	return check_new_page(page);
2041  }
2042  static bool check_new_pcp(struct page *page)
2043  {
2044  	return false;
2045  }
2046  #endif /* CONFIG_DEBUG_VM */
2047  
2048  static bool check_new_pages(struct page *page, unsigned int order)
2049  {
2050  	int i;
2051  	for (i = 0; i < (1 << order); i++) {
2052  		struct page *p = page + i;
2053  
2054  		if (unlikely(check_new_page(p)))
2055  			return true;
2056  	}
2057  
2058  	return false;
2059  }
2060  
2061  inline void post_alloc_hook(struct page *page, unsigned int order,
2062  				gfp_t gfp_flags)
2063  {
2064  	set_page_private(page, 0);
2065  	set_page_refcounted(page);
2066  
2067  	arch_alloc_page(page, order);
2068  	if (debug_pagealloc_enabled())
2069  		kernel_map_pages(page, 1 << order, 1);
2070  	kasan_alloc_pages(page, order);
2071  	kernel_poison_pages(page, 1 << order, 1);
2072  	set_page_owner(page, order, gfp_flags);
2073  }
2074  
2075  static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
2076  							unsigned int alloc_flags)
2077  {
2078  	int i;
2079  
2080  	post_alloc_hook(page, order, gfp_flags);
2081  
2082  	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
2083  		for (i = 0; i < (1 << order); i++)
2084  			clear_highpage(page + i);
2085  
2086  	if (order && (gfp_flags & __GFP_COMP))
2087  		prep_compound_page(page, order);
2088  
2089  	/*
2090  	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
2091  	 * allocate the page. The expectation is that the caller is taking
2092  	 * steps that will free more memory. The caller should avoid the page
2093  	 * being used for !PFMEMALLOC purposes.
2094  	 */
2095  	if (alloc_flags & ALLOC_NO_WATERMARKS)
2096  		set_page_pfmemalloc(page);
2097  	else
2098  		clear_page_pfmemalloc(page);
2099  }
2100  
2101  /*
2102   * Go through the free lists for the given migratetype and remove
2103   * the smallest available page from the freelists
2104   */
2105  static __always_inline
2106  struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
2107  						int migratetype)
2108  {
2109  	unsigned int current_order;
2110  	struct free_area *area;
2111  	struct page *page;
2112  
2113  	/* Find a page of the appropriate size in the preferred list */
2114  	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
2115  		area = &(zone->free_area[current_order]);
2116  		page = get_page_from_free_area(area, migratetype);
2117  		if (!page)
2118  			continue;
2119  		del_page_from_free_area(page, area);
2120  		expand(zone, page, order, current_order, area, migratetype);
2121  		set_pcppage_migratetype(page, migratetype);
2122  		return page;
2123  	}
2124  
2125  	return NULL;
2126  }
2127  
2128  
2129  /*
2130   * This array describes the order lists are fallen back to when
2131   * the free lists for the desirable migrate type are depleted
2132   */
2133  static int fallbacks[MIGRATE_TYPES][4] = {
2134  	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
2135  	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2136  	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
2137  #ifdef CONFIG_CMA
2138  	[MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
2139  #endif
2140  #ifdef CONFIG_MEMORY_ISOLATION
2141  	[MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
2142  #endif
2143  };
2144  
2145  #ifdef CONFIG_CMA
2146  static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
2147  					unsigned int order)
2148  {
2149  	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
2150  }
2151  #else
2152  static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
2153  					unsigned int order) { return NULL; }
2154  #endif
2155  
2156  /*
2157   * Move the free pages in a range to the free lists of the requested type.
2158   * Note that start_page and end_pages are not aligned on a pageblock
2159   * boundary. If alignment is required, use move_freepages_block()
2160   */
2161  static int move_freepages(struct zone *zone,
2162  			  struct page *start_page, struct page *end_page,
2163  			  int migratetype, int *num_movable)
2164  {
2165  	struct page *page;
2166  	unsigned int order;
2167  	int pages_moved = 0;
2168  
2169  #ifndef CONFIG_HOLES_IN_ZONE
2170  	/*
2171  	 * page_zone is not safe to call in this context when
2172  	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
2173  	 * anyway as we check zone boundaries in move_freepages_block().
2174  	 * Remove at a later date when no bug reports exist related to
2175  	 * grouping pages by mobility
2176  	 */
2177  	VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
2178  	          pfn_valid(page_to_pfn(end_page)) &&
2179  	          page_zone(start_page) != page_zone(end_page));
2180  #endif
2181  	for (page = start_page; page <= end_page;) {
2182  		if (!pfn_valid_within(page_to_pfn(page))) {
2183  			page++;
2184  			continue;
2185  		}
2186  
2187  		/* Make sure we are not inadvertently changing nodes */
2188  		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2189  
2190  		if (!PageBuddy(page)) {
2191  			/*
2192  			 * We assume that pages that could be isolated for
2193  			 * migration are movable. But we don't actually try
2194  			 * isolating, as that would be expensive.
2195  			 */
2196  			if (num_movable &&
2197  					(PageLRU(page) || __PageMovable(page)))
2198  				(*num_movable)++;
2199  
2200  			page++;
2201  			continue;
2202  		}
2203  
2204  		order = page_order(page);
2205  		move_to_free_area(page, &zone->free_area[order], migratetype);
2206  		page += 1 << order;
2207  		pages_moved += 1 << order;
2208  	}
2209  
2210  	return pages_moved;
2211  }
2212  
2213  int move_freepages_block(struct zone *zone, struct page *page,
2214  				int migratetype, int *num_movable)
2215  {
2216  	unsigned long start_pfn, end_pfn;
2217  	struct page *start_page, *end_page;
2218  
2219  	if (num_movable)
2220  		*num_movable = 0;
2221  
2222  	start_pfn = page_to_pfn(page);
2223  	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
2224  	start_page = pfn_to_page(start_pfn);
2225  	end_page = start_page + pageblock_nr_pages - 1;
2226  	end_pfn = start_pfn + pageblock_nr_pages - 1;
2227  
2228  	/* Do not cross zone boundaries */
2229  	if (!zone_spans_pfn(zone, start_pfn))
2230  		start_page = page;
2231  	if (!zone_spans_pfn(zone, end_pfn))
2232  		return 0;
2233  
2234  	return move_freepages(zone, start_page, end_page, migratetype,
2235  								num_movable);
2236  }
2237  
2238  static void change_pageblock_range(struct page *pageblock_page,
2239  					int start_order, int migratetype)
2240  {
2241  	int nr_pageblocks = 1 << (start_order - pageblock_order);
2242  
2243  	while (nr_pageblocks--) {
2244  		set_pageblock_migratetype(pageblock_page, migratetype);
2245  		pageblock_page += pageblock_nr_pages;
2246  	}
2247  }
2248  
2249  /*
2250   * When we are falling back to another migratetype during allocation, try to
2251   * steal extra free pages from the same pageblocks to satisfy further
2252   * allocations, instead of polluting multiple pageblocks.
2253   *
2254   * If we are stealing a relatively large buddy page, it is likely there will
2255   * be more free pages in the pageblock, so try to steal them all. For
2256   * reclaimable and unmovable allocations, we steal regardless of page size,
2257   * as fragmentation caused by those allocations polluting movable pageblocks
2258   * is worse than movable allocations stealing from unmovable and reclaimable
2259   * pageblocks.
2260   */
2261  static bool can_steal_fallback(unsigned int order, int start_mt)
2262  {
2263  	/*
2264  	 * Leaving this order check is intended, although there is
2265  	 * relaxed order check in next check. The reason is that
2266  	 * we can actually steal whole pageblock if this condition met,
2267  	 * but, below check doesn't guarantee it and that is just heuristic
2268  	 * so could be changed anytime.
2269  	 */
2270  	if (order >= pageblock_order)
2271  		return true;
2272  
2273  	if (order >= pageblock_order / 2 ||
2274  		start_mt == MIGRATE_RECLAIMABLE ||
2275  		start_mt == MIGRATE_UNMOVABLE ||
2276  		page_group_by_mobility_disabled)
2277  		return true;
2278  
2279  	return false;
2280  }
2281  
2282  static inline void boost_watermark(struct zone *zone)
2283  {
2284  	unsigned long max_boost;
2285  
2286  	if (!watermark_boost_factor)
2287  		return;
2288  
2289  	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2290  			watermark_boost_factor, 10000);
2291  
2292  	/*
2293  	 * high watermark may be uninitialised if fragmentation occurs
2294  	 * very early in boot so do not boost. We do not fall
2295  	 * through and boost by pageblock_nr_pages as failing
2296  	 * allocations that early means that reclaim is not going
2297  	 * to help and it may even be impossible to reclaim the
2298  	 * boosted watermark resulting in a hang.
2299  	 */
2300  	if (!max_boost)
2301  		return;
2302  
2303  	max_boost = max(pageblock_nr_pages, max_boost);
2304  
2305  	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2306  		max_boost);
2307  }
2308  
2309  /*
2310   * This function implements actual steal behaviour. If order is large enough,
2311   * we can steal whole pageblock. If not, we first move freepages in this
2312   * pageblock to our migratetype and determine how many already-allocated pages
2313   * are there in the pageblock with a compatible migratetype. If at least half
2314   * of pages are free or compatible, we can change migratetype of the pageblock
2315   * itself, so pages freed in the future will be put on the correct free list.
2316   */
2317  static void steal_suitable_fallback(struct zone *zone, struct page *page,
2318  		unsigned int alloc_flags, int start_type, bool whole_block)
2319  {
2320  	unsigned int current_order = page_order(page);
2321  	struct free_area *area;
2322  	int free_pages, movable_pages, alike_pages;
2323  	int old_block_type;
2324  
2325  	old_block_type = get_pageblock_migratetype(page);
2326  
2327  	/*
2328  	 * This can happen due to races and we want to prevent broken
2329  	 * highatomic accounting.
2330  	 */
2331  	if (is_migrate_highatomic(old_block_type))
2332  		goto single_page;
2333  
2334  	/* Take ownership for orders >= pageblock_order */
2335  	if (current_order >= pageblock_order) {
2336  		change_pageblock_range(page, current_order, start_type);
2337  		goto single_page;
2338  	}
2339  
2340  	/*
2341  	 * Boost watermarks to increase reclaim pressure to reduce the
2342  	 * likelihood of future fallbacks. Wake kswapd now as the node
2343  	 * may be balanced overall and kswapd will not wake naturally.
2344  	 */
2345  	boost_watermark(zone);
2346  	if (alloc_flags & ALLOC_KSWAPD)
2347  		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
2348  
2349  	/* We are not allowed to try stealing from the whole block */
2350  	if (!whole_block)
2351  		goto single_page;
2352  
2353  	free_pages = move_freepages_block(zone, page, start_type,
2354  						&movable_pages);
2355  	/*
2356  	 * Determine how many pages are compatible with our allocation.
2357  	 * For movable allocation, it's the number of movable pages which
2358  	 * we just obtained. For other types it's a bit more tricky.
2359  	 */
2360  	if (start_type == MIGRATE_MOVABLE) {
2361  		alike_pages = movable_pages;
2362  	} else {
2363  		/*
2364  		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2365  		 * to MOVABLE pageblock, consider all non-movable pages as
2366  		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2367  		 * vice versa, be conservative since we can't distinguish the
2368  		 * exact migratetype of non-movable pages.
2369  		 */
2370  		if (old_block_type == MIGRATE_MOVABLE)
2371  			alike_pages = pageblock_nr_pages
2372  						- (free_pages + movable_pages);
2373  		else
2374  			alike_pages = 0;
2375  	}
2376  
2377  	/* moving whole block can fail due to zone boundary conditions */
2378  	if (!free_pages)
2379  		goto single_page;
2380  
2381  	/*
2382  	 * If a sufficient number of pages in the block are either free or of
2383  	 * comparable migratability as our allocation, claim the whole block.
2384  	 */
2385  	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2386  			page_group_by_mobility_disabled)
2387  		set_pageblock_migratetype(page, start_type);
2388  
2389  	return;
2390  
2391  single_page:
2392  	area = &zone->free_area[current_order];
2393  	move_to_free_area(page, area, start_type);
2394  }
2395  
2396  /*
2397   * Check whether there is a suitable fallback freepage with requested order.
2398   * If only_stealable is true, this function returns fallback_mt only if
2399   * we can steal other freepages all together. This would help to reduce
2400   * fragmentation due to mixed migratetype pages in one pageblock.
2401   */
2402  int find_suitable_fallback(struct free_area *area, unsigned int order,
2403  			int migratetype, bool only_stealable, bool *can_steal)
2404  {
2405  	int i;
2406  	int fallback_mt;
2407  
2408  	if (area->nr_free == 0)
2409  		return -1;
2410  
2411  	*can_steal = false;
2412  	for (i = 0;; i++) {
2413  		fallback_mt = fallbacks[migratetype][i];
2414  		if (fallback_mt == MIGRATE_TYPES)
2415  			break;
2416  
2417  		if (free_area_empty(area, fallback_mt))
2418  			continue;
2419  
2420  		if (can_steal_fallback(order, migratetype))
2421  			*can_steal = true;
2422  
2423  		if (!only_stealable)
2424  			return fallback_mt;
2425  
2426  		if (*can_steal)
2427  			return fallback_mt;
2428  	}
2429  
2430  	return -1;
2431  }
2432  
2433  /*
2434   * Reserve a pageblock for exclusive use of high-order atomic allocations if
2435   * there are no empty page blocks that contain a page with a suitable order
2436   */
2437  static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2438  				unsigned int alloc_order)
2439  {
2440  	int mt;
2441  	unsigned long max_managed, flags;
2442  
2443  	/*
2444  	 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2445  	 * Check is race-prone but harmless.
2446  	 */
2447  	max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
2448  	if (zone->nr_reserved_highatomic >= max_managed)
2449  		return;
2450  
2451  	spin_lock_irqsave(&zone->lock, flags);
2452  
2453  	/* Recheck the nr_reserved_highatomic limit under the lock */
2454  	if (zone->nr_reserved_highatomic >= max_managed)
2455  		goto out_unlock;
2456  
2457  	/* Yoink! */
2458  	mt = get_pageblock_migratetype(page);
2459  	if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
2460  	    && !is_migrate_cma(mt)) {
2461  		zone->nr_reserved_highatomic += pageblock_nr_pages;
2462  		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2463  		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2464  	}
2465  
2466  out_unlock:
2467  	spin_unlock_irqrestore(&zone->lock, flags);
2468  }
2469  
2470  /*
2471   * Used when an allocation is about to fail under memory pressure. This
2472   * potentially hurts the reliability of high-order allocations when under
2473   * intense memory pressure but failed atomic allocations should be easier
2474   * to recover from than an OOM.
2475   *
2476   * If @force is true, try to unreserve a pageblock even though highatomic
2477   * pageblock is exhausted.
2478   */
2479  static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2480  						bool force)
2481  {
2482  	struct zonelist *zonelist = ac->zonelist;
2483  	unsigned long flags;
2484  	struct zoneref *z;
2485  	struct zone *zone;
2486  	struct page *page;
2487  	int order;
2488  	bool ret;
2489  
2490  	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2491  								ac->nodemask) {
2492  		/*
2493  		 * Preserve at least one pageblock unless memory pressure
2494  		 * is really high.
2495  		 */
2496  		if (!force && zone->nr_reserved_highatomic <=
2497  					pageblock_nr_pages)
2498  			continue;
2499  
2500  		spin_lock_irqsave(&zone->lock, flags);
2501  		for (order = 0; order < MAX_ORDER; order++) {
2502  			struct free_area *area = &(zone->free_area[order]);
2503  
2504  			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
2505  			if (!page)
2506  				continue;
2507  
2508  			/*
2509  			 * In page freeing path, migratetype change is racy so
2510  			 * we can counter several free pages in a pageblock
2511  			 * in this loop althoug we changed the pageblock type
2512  			 * from highatomic to ac->migratetype. So we should
2513  			 * adjust the count once.
2514  			 */
2515  			if (is_migrate_highatomic_page(page)) {
2516  				/*
2517  				 * It should never happen but changes to
2518  				 * locking could inadvertently allow a per-cpu
2519  				 * drain to add pages to MIGRATE_HIGHATOMIC
2520  				 * while unreserving so be safe and watch for
2521  				 * underflows.
2522  				 */
2523  				zone->nr_reserved_highatomic -= min(
2524  						pageblock_nr_pages,
2525  						zone->nr_reserved_highatomic);
2526  			}
2527  
2528  			/*
2529  			 * Convert to ac->migratetype and avoid the normal
2530  			 * pageblock stealing heuristics. Minimally, the caller
2531  			 * is doing the work and needs the pages. More
2532  			 * importantly, if the block was always converted to
2533  			 * MIGRATE_UNMOVABLE or another type then the number
2534  			 * of pageblocks that cannot be completely freed
2535  			 * may increase.
2536  			 */
2537  			set_pageblock_migratetype(page, ac->migratetype);
2538  			ret = move_freepages_block(zone, page, ac->migratetype,
2539  									NULL);
2540  			if (ret) {
2541  				spin_unlock_irqrestore(&zone->lock, flags);
2542  				return ret;
2543  			}
2544  		}
2545  		spin_unlock_irqrestore(&zone->lock, flags);
2546  	}
2547  
2548  	return false;
2549  }
2550  
2551  /*
2552   * Try finding a free buddy page on the fallback list and put it on the free
2553   * list of requested migratetype, possibly along with other pages from the same
2554   * block, depending on fragmentation avoidance heuristics. Returns true if
2555   * fallback was found so that __rmqueue_smallest() can grab it.
2556   *
2557   * The use of signed ints for order and current_order is a deliberate
2558   * deviation from the rest of this file, to make the for loop
2559   * condition simpler.
2560   */
2561  static __always_inline bool
2562  __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2563  						unsigned int alloc_flags)
2564  {
2565  	struct free_area *area;
2566  	int current_order;
2567  	int min_order = order;
2568  	struct page *page;
2569  	int fallback_mt;
2570  	bool can_steal;
2571  
2572  	/*
2573  	 * Do not steal pages from freelists belonging to other pageblocks
2574  	 * i.e. orders < pageblock_order. If there are no local zones free,
2575  	 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2576  	 */
2577  	if (alloc_flags & ALLOC_NOFRAGMENT)
2578  		min_order = pageblock_order;
2579  
2580  	/*
2581  	 * Find the largest available free page in the other list. This roughly
2582  	 * approximates finding the pageblock with the most free pages, which
2583  	 * would be too costly to do exactly.
2584  	 */
2585  	for (current_order = MAX_ORDER - 1; current_order >= min_order;
2586  				--current_order) {
2587  		area = &(zone->free_area[current_order]);
2588  		fallback_mt = find_suitable_fallback(area, current_order,
2589  				start_migratetype, false, &can_steal);
2590  		if (fallback_mt == -1)
2591  			continue;
2592  
2593  		/*
2594  		 * We cannot steal all free pages from the pageblock and the
2595  		 * requested migratetype is movable. In that case it's better to
2596  		 * steal and split the smallest available page instead of the
2597  		 * largest available page, because even if the next movable
2598  		 * allocation falls back into a different pageblock than this
2599  		 * one, it won't cause permanent fragmentation.
2600  		 */
2601  		if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2602  					&& current_order > order)
2603  			goto find_smallest;
2604  
2605  		goto do_steal;
2606  	}
2607  
2608  	return false;
2609  
2610  find_smallest:
2611  	for (current_order = order; current_order < MAX_ORDER;
2612  							current_order++) {
2613  		area = &(zone->free_area[current_order]);
2614  		fallback_mt = find_suitable_fallback(area, current_order,
2615  				start_migratetype, false, &can_steal);
2616  		if (fallback_mt != -1)
2617  			break;
2618  	}
2619  
2620  	/*
2621  	 * This should not happen - we already found a suitable fallback
2622  	 * when looking for the largest page.
2623  	 */
2624  	VM_BUG_ON(current_order == MAX_ORDER);
2625  
2626  do_steal:
2627  	page = get_page_from_free_area(area, fallback_mt);
2628  
2629  	steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2630  								can_steal);
2631  
2632  	trace_mm_page_alloc_extfrag(page, order, current_order,
2633  		start_migratetype, fallback_mt);
2634  
2635  	return true;
2636  
2637  }
2638  
2639  /*
2640   * Do the hard work of removing an element from the buddy allocator.
2641   * Call me with the zone->lock already held.
2642   */
2643  static __always_inline struct page *
2644  __rmqueue(struct zone *zone, unsigned int order, int migratetype,
2645  						unsigned int alloc_flags)
2646  {
2647  	struct page *page;
2648  
2649  retry:
2650  	page = __rmqueue_smallest(zone, order, migratetype);
2651  	if (unlikely(!page)) {
2652  		if (migratetype == MIGRATE_MOVABLE)
2653  			page = __rmqueue_cma_fallback(zone, order);
2654  
2655  		if (!page && __rmqueue_fallback(zone, order, migratetype,
2656  								alloc_flags))
2657  			goto retry;
2658  	}
2659  
2660  	trace_mm_page_alloc_zone_locked(page, order, migratetype);
2661  	return page;
2662  }
2663  
2664  /*
2665   * Obtain a specified number of elements from the buddy allocator, all under
2666   * a single hold of the lock, for efficiency.  Add them to the supplied list.
2667   * Returns the number of new pages which were placed at *list.
2668   */
2669  static int rmqueue_bulk(struct zone *zone, unsigned int order,
2670  			unsigned long count, struct list_head *list,
2671  			int migratetype, unsigned int alloc_flags)
2672  {
2673  	int i, alloced = 0;
2674  
2675  	spin_lock(&zone->lock);
2676  	for (i = 0; i < count; ++i) {
2677  		struct page *page = __rmqueue(zone, order, migratetype,
2678  								alloc_flags);
2679  		if (unlikely(page == NULL))
2680  			break;
2681  
2682  		if (unlikely(check_pcp_refill(page)))
2683  			continue;
2684  
2685  		/*
2686  		 * Split buddy pages returned by expand() are received here in
2687  		 * physical page order. The page is added to the tail of
2688  		 * caller's list. From the callers perspective, the linked list
2689  		 * is ordered by page number under some conditions. This is
2690  		 * useful for IO devices that can forward direction from the
2691  		 * head, thus also in the physical page order. This is useful
2692  		 * for IO devices that can merge IO requests if the physical
2693  		 * pages are ordered properly.
2694  		 */
2695  		list_add_tail(&page->lru, list);
2696  		alloced++;
2697  		if (is_migrate_cma(get_pcppage_migratetype(page)))
2698  			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2699  					      -(1 << order));
2700  	}
2701  
2702  	/*
2703  	 * i pages were removed from the buddy list even if some leak due
2704  	 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
2705  	 * on i. Do not confuse with 'alloced' which is the number of
2706  	 * pages added to the pcp list.
2707  	 */
2708  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2709  	spin_unlock(&zone->lock);
2710  	return alloced;
2711  }
2712  
2713  #ifdef CONFIG_NUMA
2714  /*
2715   * Called from the vmstat counter updater to drain pagesets of this
2716   * currently executing processor on remote nodes after they have
2717   * expired.
2718   *
2719   * Note that this function must be called with the thread pinned to
2720   * a single processor.
2721   */
2722  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2723  {
2724  	unsigned long flags;
2725  	int to_drain, batch;
2726  
2727  	local_irq_save(flags);
2728  	batch = READ_ONCE(pcp->batch);
2729  	to_drain = min(pcp->count, batch);
2730  	if (to_drain > 0)
2731  		free_pcppages_bulk(zone, to_drain, pcp);
2732  	local_irq_restore(flags);
2733  }
2734  #endif
2735  
2736  /*
2737   * Drain pcplists of the indicated processor and zone.
2738   *
2739   * The processor must either be the current processor and the
2740   * thread pinned to the current processor or a processor that
2741   * is not online.
2742   */
2743  static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2744  {
2745  	unsigned long flags;
2746  	struct per_cpu_pageset *pset;
2747  	struct per_cpu_pages *pcp;
2748  
2749  	local_irq_save(flags);
2750  	pset = per_cpu_ptr(zone->pageset, cpu);
2751  
2752  	pcp = &pset->pcp;
2753  	if (pcp->count)
2754  		free_pcppages_bulk(zone, pcp->count, pcp);
2755  	local_irq_restore(flags);
2756  }
2757  
2758  /*
2759   * Drain pcplists of all zones on the indicated processor.
2760   *
2761   * The processor must either be the current processor and the
2762   * thread pinned to the current processor or a processor that
2763   * is not online.
2764   */
2765  static void drain_pages(unsigned int cpu)
2766  {
2767  	struct zone *zone;
2768  
2769  	for_each_populated_zone(zone) {
2770  		drain_pages_zone(cpu, zone);
2771  	}
2772  }
2773  
2774  /*
2775   * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2776   *
2777   * The CPU has to be pinned. When zone parameter is non-NULL, spill just
2778   * the single zone's pages.
2779   */
2780  void drain_local_pages(struct zone *zone)
2781  {
2782  	int cpu = smp_processor_id();
2783  
2784  	if (zone)
2785  		drain_pages_zone(cpu, zone);
2786  	else
2787  		drain_pages(cpu);
2788  }
2789  
2790  static void drain_local_pages_wq(struct work_struct *work)
2791  {
2792  	struct pcpu_drain *drain;
2793  
2794  	drain = container_of(work, struct pcpu_drain, work);
2795  
2796  	/*
2797  	 * drain_all_pages doesn't use proper cpu hotplug protection so
2798  	 * we can race with cpu offline when the WQ can move this from
2799  	 * a cpu pinned worker to an unbound one. We can operate on a different
2800  	 * cpu which is allright but we also have to make sure to not move to
2801  	 * a different one.
2802  	 */
2803  	preempt_disable();
2804  	drain_local_pages(drain->zone);
2805  	preempt_enable();
2806  }
2807  
2808  /*
2809   * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2810   *
2811   * When zone parameter is non-NULL, spill just the single zone's pages.
2812   *
2813   * Note that this can be extremely slow as the draining happens in a workqueue.
2814   */
2815  void drain_all_pages(struct zone *zone)
2816  {
2817  	int cpu;
2818  
2819  	/*
2820  	 * Allocate in the BSS so we wont require allocation in
2821  	 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2822  	 */
2823  	static cpumask_t cpus_with_pcps;
2824  
2825  	/*
2826  	 * Make sure nobody triggers this path before mm_percpu_wq is fully
2827  	 * initialized.
2828  	 */
2829  	if (WARN_ON_ONCE(!mm_percpu_wq))
2830  		return;
2831  
2832  	/*
2833  	 * Do not drain if one is already in progress unless it's specific to
2834  	 * a zone. Such callers are primarily CMA and memory hotplug and need
2835  	 * the drain to be complete when the call returns.
2836  	 */
2837  	if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2838  		if (!zone)
2839  			return;
2840  		mutex_lock(&pcpu_drain_mutex);
2841  	}
2842  
2843  	/*
2844  	 * We don't care about racing with CPU hotplug event
2845  	 * as offline notification will cause the notified
2846  	 * cpu to drain that CPU pcps and on_each_cpu_mask
2847  	 * disables preemption as part of its processing
2848  	 */
2849  	for_each_online_cpu(cpu) {
2850  		struct per_cpu_pageset *pcp;
2851  		struct zone *z;
2852  		bool has_pcps = false;
2853  
2854  		if (zone) {
2855  			pcp = per_cpu_ptr(zone->pageset, cpu);
2856  			if (pcp->pcp.count)
2857  				has_pcps = true;
2858  		} else {
2859  			for_each_populated_zone(z) {
2860  				pcp = per_cpu_ptr(z->pageset, cpu);
2861  				if (pcp->pcp.count) {
2862  					has_pcps = true;
2863  					break;
2864  				}
2865  			}
2866  		}
2867  
2868  		if (has_pcps)
2869  			cpumask_set_cpu(cpu, &cpus_with_pcps);
2870  		else
2871  			cpumask_clear_cpu(cpu, &cpus_with_pcps);
2872  	}
2873  
2874  	for_each_cpu(cpu, &cpus_with_pcps) {
2875  		struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
2876  
2877  		drain->zone = zone;
2878  		INIT_WORK(&drain->work, drain_local_pages_wq);
2879  		queue_work_on(cpu, mm_percpu_wq, &drain->work);
2880  	}
2881  	for_each_cpu(cpu, &cpus_with_pcps)
2882  		flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
2883  
2884  	mutex_unlock(&pcpu_drain_mutex);
2885  }
2886  
2887  #ifdef CONFIG_HIBERNATION
2888  
2889  /*
2890   * Touch the watchdog for every WD_PAGE_COUNT pages.
2891   */
2892  #define WD_PAGE_COUNT	(128*1024)
2893  
2894  void mark_free_pages(struct zone *zone)
2895  {
2896  	unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
2897  	unsigned long flags;
2898  	unsigned int order, t;
2899  	struct page *page;
2900  
2901  	if (zone_is_empty(zone))
2902  		return;
2903  
2904  	spin_lock_irqsave(&zone->lock, flags);
2905  
2906  	max_zone_pfn = zone_end_pfn(zone);
2907  	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2908  		if (pfn_valid(pfn)) {
2909  			page = pfn_to_page(pfn);
2910  
2911  			if (!--page_count) {
2912  				touch_nmi_watchdog();
2913  				page_count = WD_PAGE_COUNT;
2914  			}
2915  
2916  			if (page_zone(page) != zone)
2917  				continue;
2918  
2919  			if (!swsusp_page_is_forbidden(page))
2920  				swsusp_unset_page_free(page);
2921  		}
2922  
2923  	for_each_migratetype_order(order, t) {
2924  		list_for_each_entry(page,
2925  				&zone->free_area[order].free_list[t], lru) {
2926  			unsigned long i;
2927  
2928  			pfn = page_to_pfn(page);
2929  			for (i = 0; i < (1UL << order); i++) {
2930  				if (!--page_count) {
2931  					touch_nmi_watchdog();
2932  					page_count = WD_PAGE_COUNT;
2933  				}
2934  				swsusp_set_page_free(pfn_to_page(pfn + i));
2935  			}
2936  		}
2937  	}
2938  	spin_unlock_irqrestore(&zone->lock, flags);
2939  }
2940  #endif /* CONFIG_PM */
2941  
2942  static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
2943  {
2944  	int migratetype;
2945  
2946  	if (!free_pcp_prepare(page))
2947  		return false;
2948  
2949  	migratetype = get_pfnblock_migratetype(page, pfn);
2950  	set_pcppage_migratetype(page, migratetype);
2951  	return true;
2952  }
2953  
2954  static void free_unref_page_commit(struct page *page, unsigned long pfn)
2955  {
2956  	struct zone *zone = page_zone(page);
2957  	struct per_cpu_pages *pcp;
2958  	int migratetype;
2959  
2960  	migratetype = get_pcppage_migratetype(page);
2961  	__count_vm_event(PGFREE);
2962  
2963  	/*
2964  	 * We only track unmovable, reclaimable and movable on pcp lists.
2965  	 * Free ISOLATE pages back to the allocator because they are being
2966  	 * offlined but treat HIGHATOMIC as movable pages so we can get those
2967  	 * areas back if necessary. Otherwise, we may have to free
2968  	 * excessively into the page allocator
2969  	 */
2970  	if (migratetype >= MIGRATE_PCPTYPES) {
2971  		if (unlikely(is_migrate_isolate(migratetype))) {
2972  			free_one_page(zone, page, pfn, 0, migratetype);
2973  			return;
2974  		}
2975  		migratetype = MIGRATE_MOVABLE;
2976  	}
2977  
2978  	pcp = &this_cpu_ptr(zone->pageset)->pcp;
2979  	list_add(&page->lru, &pcp->lists[migratetype]);
2980  	pcp->count++;
2981  	if (pcp->count >= pcp->high) {
2982  		unsigned long batch = READ_ONCE(pcp->batch);
2983  		free_pcppages_bulk(zone, batch, pcp);
2984  	}
2985  }
2986  
2987  /*
2988   * Free a 0-order page
2989   */
2990  void free_unref_page(struct page *page)
2991  {
2992  	unsigned long flags;
2993  	unsigned long pfn = page_to_pfn(page);
2994  
2995  	if (!free_unref_page_prepare(page, pfn))
2996  		return;
2997  
2998  	local_irq_save(flags);
2999  	free_unref_page_commit(page, pfn);
3000  	local_irq_restore(flags);
3001  }
3002  
3003  /*
3004   * Free a list of 0-order pages
3005   */
3006  void free_unref_page_list(struct list_head *list)
3007  {
3008  	struct page *page, *next;
3009  	unsigned long flags, pfn;
3010  	int batch_count = 0;
3011  
3012  	/* Prepare pages for freeing */
3013  	list_for_each_entry_safe(page, next, list, lru) {
3014  		pfn = page_to_pfn(page);
3015  		if (!free_unref_page_prepare(page, pfn))
3016  			list_del(&page->lru);
3017  		set_page_private(page, pfn);
3018  	}
3019  
3020  	local_irq_save(flags);
3021  	list_for_each_entry_safe(page, next, list, lru) {
3022  		unsigned long pfn = page_private(page);
3023  
3024  		set_page_private(page, 0);
3025  		trace_mm_page_free_batched(page);
3026  		free_unref_page_commit(page, pfn);
3027  
3028  		/*
3029  		 * Guard against excessive IRQ disabled times when we get
3030  		 * a large list of pages to free.
3031  		 */
3032  		if (++batch_count == SWAP_CLUSTER_MAX) {
3033  			local_irq_restore(flags);
3034  			batch_count = 0;
3035  			local_irq_save(flags);
3036  		}
3037  	}
3038  	local_irq_restore(flags);
3039  }
3040  
3041  /*
3042   * split_page takes a non-compound higher-order page, and splits it into
3043   * n (1<<order) sub-pages: page[0..n]
3044   * Each sub-page must be freed individually.
3045   *
3046   * Note: this is probably too low level an operation for use in drivers.
3047   * Please consult with lkml before using this in your driver.
3048   */
3049  void split_page(struct page *page, unsigned int order)
3050  {
3051  	int i;
3052  
3053  	VM_BUG_ON_PAGE(PageCompound(page), page);
3054  	VM_BUG_ON_PAGE(!page_count(page), page);
3055  
3056  	for (i = 1; i < (1 << order); i++)
3057  		set_page_refcounted(page + i);
3058  	split_page_owner(page, order);
3059  }
3060  EXPORT_SYMBOL_GPL(split_page);
3061  
3062  int __isolate_free_page(struct page *page, unsigned int order)
3063  {
3064  	struct free_area *area = &page_zone(page)->free_area[order];
3065  	unsigned long watermark;
3066  	struct zone *zone;
3067  	int mt;
3068  
3069  	BUG_ON(!PageBuddy(page));
3070  
3071  	zone = page_zone(page);
3072  	mt = get_pageblock_migratetype(page);
3073  
3074  	if (!is_migrate_isolate(mt)) {
3075  		/*
3076  		 * Obey watermarks as if the page was being allocated. We can
3077  		 * emulate a high-order watermark check with a raised order-0
3078  		 * watermark, because we already know our high-order page
3079  		 * exists.
3080  		 */
3081  		watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
3082  		if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
3083  			return 0;
3084  
3085  		__mod_zone_freepage_state(zone, -(1UL << order), mt);
3086  	}
3087  
3088  	/* Remove page from free list */
3089  
3090  	del_page_from_free_area(page, area);
3091  
3092  	/*
3093  	 * Set the pageblock if the isolated page is at least half of a
3094  	 * pageblock
3095  	 */
3096  	if (order >= pageblock_order - 1) {
3097  		struct page *endpage = page + (1 << order) - 1;
3098  		for (; page < endpage; page += pageblock_nr_pages) {
3099  			int mt = get_pageblock_migratetype(page);
3100  			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
3101  			    && !is_migrate_highatomic(mt))
3102  				set_pageblock_migratetype(page,
3103  							  MIGRATE_MOVABLE);
3104  		}
3105  	}
3106  
3107  
3108  	return 1UL << order;
3109  }
3110  
3111  /*
3112   * Update NUMA hit/miss statistics
3113   *
3114   * Must be called with interrupts disabled.
3115   */
3116  static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
3117  {
3118  #ifdef CONFIG_NUMA
3119  	enum numa_stat_item local_stat = NUMA_LOCAL;
3120  
3121  	/* skip numa counters update if numa stats is disabled */
3122  	if (!static_branch_likely(&vm_numa_stat_key))
3123  		return;
3124  
3125  	if (zone_to_nid(z) != numa_node_id())
3126  		local_stat = NUMA_OTHER;
3127  
3128  	if (zone_to_nid(z) == zone_to_nid(preferred_zone))
3129  		__inc_numa_state(z, NUMA_HIT);
3130  	else {
3131  		__inc_numa_state(z, NUMA_MISS);
3132  		__inc_numa_state(preferred_zone, NUMA_FOREIGN);
3133  	}
3134  	__inc_numa_state(z, local_stat);
3135  #endif
3136  }
3137  
3138  /* Remove page from the per-cpu list, caller must protect the list */
3139  static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3140  			unsigned int alloc_flags,
3141  			struct per_cpu_pages *pcp,
3142  			struct list_head *list)
3143  {
3144  	struct page *page;
3145  
3146  	do {
3147  		if (list_empty(list)) {
3148  			pcp->count += rmqueue_bulk(zone, 0,
3149  					pcp->batch, list,
3150  					migratetype, alloc_flags);
3151  			if (unlikely(list_empty(list)))
3152  				return NULL;
3153  		}
3154  
3155  		page = list_first_entry(list, struct page, lru);
3156  		list_del(&page->lru);
3157  		pcp->count--;
3158  	} while (check_new_pcp(page));
3159  
3160  	return page;
3161  }
3162  
3163  /* Lock and remove page from the per-cpu list */
3164  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3165  			struct zone *zone, gfp_t gfp_flags,
3166  			int migratetype, unsigned int alloc_flags)
3167  {
3168  	struct per_cpu_pages *pcp;
3169  	struct list_head *list;
3170  	struct page *page;
3171  	unsigned long flags;
3172  
3173  	local_irq_save(flags);
3174  	pcp = &this_cpu_ptr(zone->pageset)->pcp;
3175  	list = &pcp->lists[migratetype];
3176  	page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
3177  	if (page) {
3178  		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
3179  		zone_statistics(preferred_zone, zone);
3180  	}
3181  	local_irq_restore(flags);
3182  	return page;
3183  }
3184  
3185  /*
3186   * Allocate a page from the given zone. Use pcplists for order-0 allocations.
3187   */
3188  static inline
3189  struct page *rmqueue(struct zone *preferred_zone,
3190  			struct zone *zone, unsigned int order,
3191  			gfp_t gfp_flags, unsigned int alloc_flags,
3192  			int migratetype)
3193  {
3194  	unsigned long flags;
3195  	struct page *page;
3196  
3197  	if (likely(order == 0)) {
3198  		page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
3199  					migratetype, alloc_flags);
3200  		goto out;
3201  	}
3202  
3203  	/*
3204  	 * We most definitely don't want callers attempting to
3205  	 * allocate greater than order-1 page units with __GFP_NOFAIL.
3206  	 */
3207  	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3208  	spin_lock_irqsave(&zone->lock, flags);
3209  
3210  	do {
3211  		page = NULL;
3212  		if (alloc_flags & ALLOC_HARDER) {
3213  			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
3214  			if (page)
3215  				trace_mm_page_alloc_zone_locked(page, order, migratetype);
3216  		}
3217  		if (!page)
3218  			page = __rmqueue(zone, order, migratetype, alloc_flags);
3219  	} while (page && check_new_pages(page, order));
3220  	spin_unlock(&zone->lock);
3221  	if (!page)
3222  		goto failed;
3223  	__mod_zone_freepage_state(zone, -(1 << order),
3224  				  get_pcppage_migratetype(page));
3225  
3226  	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3227  	zone_statistics(preferred_zone, zone);
3228  	local_irq_restore(flags);
3229  
3230  out:
3231  	/* Separate test+clear to avoid unnecessary atomics */
3232  	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3233  		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3234  		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3235  	}
3236  
3237  	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
3238  	return page;
3239  
3240  failed:
3241  	local_irq_restore(flags);
3242  	return NULL;
3243  }
3244  
3245  #ifdef CONFIG_FAIL_PAGE_ALLOC
3246  
3247  static struct {
3248  	struct fault_attr attr;
3249  
3250  	bool ignore_gfp_highmem;
3251  	bool ignore_gfp_reclaim;
3252  	u32 min_order;
3253  } fail_page_alloc = {
3254  	.attr = FAULT_ATTR_INITIALIZER,
3255  	.ignore_gfp_reclaim = true,
3256  	.ignore_gfp_highmem = true,
3257  	.min_order = 1,
3258  };
3259  
3260  static int __init setup_fail_page_alloc(char *str)
3261  {
3262  	return setup_fault_attr(&fail_page_alloc.attr, str);
3263  }
3264  __setup("fail_page_alloc=", setup_fail_page_alloc);
3265  
3266  static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3267  {
3268  	if (order < fail_page_alloc.min_order)
3269  		return false;
3270  	if (gfp_mask & __GFP_NOFAIL)
3271  		return false;
3272  	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
3273  		return false;
3274  	if (fail_page_alloc.ignore_gfp_reclaim &&
3275  			(gfp_mask & __GFP_DIRECT_RECLAIM))
3276  		return false;
3277  
3278  	return should_fail(&fail_page_alloc.attr, 1 << order);
3279  }
3280  
3281  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3282  
3283  static int __init fail_page_alloc_debugfs(void)
3284  {
3285  	umode_t mode = S_IFREG | 0600;
3286  	struct dentry *dir;
3287  
3288  	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
3289  					&fail_page_alloc.attr);
3290  
3291  	debugfs_create_bool("ignore-gfp-wait", mode, dir,
3292  			    &fail_page_alloc.ignore_gfp_reclaim);
3293  	debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3294  			    &fail_page_alloc.ignore_gfp_highmem);
3295  	debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
3296  
3297  	return 0;
3298  }
3299  
3300  late_initcall(fail_page_alloc_debugfs);
3301  
3302  #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3303  
3304  #else /* CONFIG_FAIL_PAGE_ALLOC */
3305  
3306  static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3307  {
3308  	return false;
3309  }
3310  
3311  #endif /* CONFIG_FAIL_PAGE_ALLOC */
3312  
3313  static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3314  {
3315  	return __should_fail_alloc_page(gfp_mask, order);
3316  }
3317  ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3318  
3319  /*
3320   * Return true if free base pages are above 'mark'. For high-order checks it
3321   * will return true of the order-0 watermark is reached and there is at least
3322   * one free page of a suitable size. Checking now avoids taking the zone lock
3323   * to check in the allocation paths if no pages are free.
3324   */
3325  bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3326  			 int classzone_idx, unsigned int alloc_flags,
3327  			 long free_pages)
3328  {
3329  	long min = mark;
3330  	int o;
3331  	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3332  
3333  	/* free_pages may go negative - that's OK */
3334  	free_pages -= (1 << order) - 1;
3335  
3336  	if (alloc_flags & ALLOC_HIGH)
3337  		min -= min / 2;
3338  
3339  	/*
3340  	 * If the caller does not have rights to ALLOC_HARDER then subtract
3341  	 * the high-atomic reserves. This will over-estimate the size of the
3342  	 * atomic reserve but it avoids a search.
3343  	 */
3344  	if (likely(!alloc_harder)) {
3345  		free_pages -= z->nr_reserved_highatomic;
3346  	} else {
3347  		/*
3348  		 * OOM victims can try even harder than normal ALLOC_HARDER
3349  		 * users on the grounds that it's definitely going to be in
3350  		 * the exit path shortly and free memory. Any allocation it
3351  		 * makes during the free path will be small and short-lived.
3352  		 */
3353  		if (alloc_flags & ALLOC_OOM)
3354  			min -= min / 2;
3355  		else
3356  			min -= min / 4;
3357  	}
3358  
3359  
3360  #ifdef CONFIG_CMA
3361  	/* If allocation can't use CMA areas don't use free CMA pages */
3362  	if (!(alloc_flags & ALLOC_CMA))
3363  		free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
3364  #endif
3365  
3366  	/*
3367  	 * Check watermarks for an order-0 allocation request. If these
3368  	 * are not met, then a high-order request also cannot go ahead
3369  	 * even if a suitable page happened to be free.
3370  	 */
3371  	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3372  		return false;
3373  
3374  	/* If this is an order-0 request then the watermark is fine */
3375  	if (!order)
3376  		return true;
3377  
3378  	/* For a high-order request, check at least one suitable page is free */
3379  	for (o = order; o < MAX_ORDER; o++) {
3380  		struct free_area *area = &z->free_area[o];
3381  		int mt;
3382  
3383  		if (!area->nr_free)
3384  			continue;
3385  
3386  		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3387  			if (!free_area_empty(area, mt))
3388  				return true;
3389  		}
3390  
3391  #ifdef CONFIG_CMA
3392  		if ((alloc_flags & ALLOC_CMA) &&
3393  		    !free_area_empty(area, MIGRATE_CMA)) {
3394  			return true;
3395  		}
3396  #endif
3397  		if (alloc_harder &&
3398  			!list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3399  			return true;
3400  	}
3401  	return false;
3402  }
3403  
3404  bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3405  		      int classzone_idx, unsigned int alloc_flags)
3406  {
3407  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3408  					zone_page_state(z, NR_FREE_PAGES));
3409  }
3410  
3411  static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3412  		unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3413  {
3414  	long free_pages = zone_page_state(z, NR_FREE_PAGES);
3415  	long cma_pages = 0;
3416  
3417  #ifdef CONFIG_CMA
3418  	/* If allocation can't use CMA areas don't use free CMA pages */
3419  	if (!(alloc_flags & ALLOC_CMA))
3420  		cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3421  #endif
3422  
3423  	/*
3424  	 * Fast check for order-0 only. If this fails then the reserves
3425  	 * need to be calculated. There is a corner case where the check
3426  	 * passes but only the high-order atomic reserve are free. If
3427  	 * the caller is !atomic then it'll uselessly search the free
3428  	 * list. That corner case is then slower but it is harmless.
3429  	 */
3430  	if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3431  		return true;
3432  
3433  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3434  					free_pages);
3435  }
3436  
3437  bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3438  			unsigned long mark, int classzone_idx)
3439  {
3440  	long free_pages = zone_page_state(z, NR_FREE_PAGES);
3441  
3442  	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3443  		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3444  
3445  	return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3446  								free_pages);
3447  }
3448  
3449  #ifdef CONFIG_NUMA
3450  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3451  {
3452  	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3453  				RECLAIM_DISTANCE;
3454  }
3455  #else	/* CONFIG_NUMA */
3456  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3457  {
3458  	return true;
3459  }
3460  #endif	/* CONFIG_NUMA */
3461  
3462  /*
3463   * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3464   * fragmentation is subtle. If the preferred zone was HIGHMEM then
3465   * premature use of a lower zone may cause lowmem pressure problems that
3466   * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3467   * probably too small. It only makes sense to spread allocations to avoid
3468   * fragmentation between the Normal and DMA32 zones.
3469   */
3470  static inline unsigned int
3471  alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3472  {
3473  	unsigned int alloc_flags = 0;
3474  
3475  	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3476  		alloc_flags |= ALLOC_KSWAPD;
3477  
3478  #ifdef CONFIG_ZONE_DMA32
3479  	if (!zone)
3480  		return alloc_flags;
3481  
3482  	if (zone_idx(zone) != ZONE_NORMAL)
3483  		return alloc_flags;
3484  
3485  	/*
3486  	 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3487  	 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3488  	 * on UMA that if Normal is populated then so is DMA32.
3489  	 */
3490  	BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3491  	if (nr_online_nodes > 1 && !populated_zone(--zone))
3492  		return alloc_flags;
3493  
3494  	alloc_flags |= ALLOC_NOFRAGMENT;
3495  #endif /* CONFIG_ZONE_DMA32 */
3496  	return alloc_flags;
3497  }
3498  
3499  /*
3500   * get_page_from_freelist goes through the zonelist trying to allocate
3501   * a page.
3502   */
3503  static struct page *
3504  get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3505  						const struct alloc_context *ac)
3506  {
3507  	struct zoneref *z;
3508  	struct zone *zone;
3509  	struct pglist_data *last_pgdat_dirty_limit = NULL;
3510  	bool no_fallback;
3511  
3512  retry:
3513  	/*
3514  	 * Scan zonelist, looking for a zone with enough free.
3515  	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
3516  	 */
3517  	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3518  	z = ac->preferred_zoneref;
3519  	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3520  								ac->nodemask) {
3521  		struct page *page;
3522  		unsigned long mark;
3523  
3524  		if (cpusets_enabled() &&
3525  			(alloc_flags & ALLOC_CPUSET) &&
3526  			!__cpuset_zone_allowed(zone, gfp_mask))
3527  				continue;
3528  		/*
3529  		 * When allocating a page cache page for writing, we
3530  		 * want to get it from a node that is within its dirty
3531  		 * limit, such that no single node holds more than its
3532  		 * proportional share of globally allowed dirty pages.
3533  		 * The dirty limits take into account the node's
3534  		 * lowmem reserves and high watermark so that kswapd
3535  		 * should be able to balance it without having to
3536  		 * write pages from its LRU list.
3537  		 *
3538  		 * XXX: For now, allow allocations to potentially
3539  		 * exceed the per-node dirty limit in the slowpath
3540  		 * (spread_dirty_pages unset) before going into reclaim,
3541  		 * which is important when on a NUMA setup the allowed
3542  		 * nodes are together not big enough to reach the
3543  		 * global limit.  The proper fix for these situations
3544  		 * will require awareness of nodes in the
3545  		 * dirty-throttling and the flusher threads.
3546  		 */
3547  		if (ac->spread_dirty_pages) {
3548  			if (last_pgdat_dirty_limit == zone->zone_pgdat)
3549  				continue;
3550  
3551  			if (!node_dirty_ok(zone->zone_pgdat)) {
3552  				last_pgdat_dirty_limit = zone->zone_pgdat;
3553  				continue;
3554  			}
3555  		}
3556  
3557  		if (no_fallback && nr_online_nodes > 1 &&
3558  		    zone != ac->preferred_zoneref->zone) {
3559  			int local_nid;
3560  
3561  			/*
3562  			 * If moving to a remote node, retry but allow
3563  			 * fragmenting fallbacks. Locality is more important
3564  			 * than fragmentation avoidance.
3565  			 */
3566  			local_nid = zone_to_nid(ac->preferred_zoneref->zone);
3567  			if (zone_to_nid(zone) != local_nid) {
3568  				alloc_flags &= ~ALLOC_NOFRAGMENT;
3569  				goto retry;
3570  			}
3571  		}
3572  
3573  		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
3574  		if (!zone_watermark_fast(zone, order, mark,
3575  				       ac_classzone_idx(ac), alloc_flags)) {
3576  			int ret;
3577  
3578  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3579  			/*
3580  			 * Watermark failed for this zone, but see if we can
3581  			 * grow this zone if it contains deferred pages.
3582  			 */
3583  			if (static_branch_unlikely(&deferred_pages)) {
3584  				if (_deferred_grow_zone(zone, order))
3585  					goto try_this_zone;
3586  			}
3587  #endif
3588  			/* Checked here to keep the fast path fast */
3589  			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3590  			if (alloc_flags & ALLOC_NO_WATERMARKS)
3591  				goto try_this_zone;
3592  
3593  			if (node_reclaim_mode == 0 ||
3594  			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
3595  				continue;
3596  
3597  			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3598  			switch (ret) {
3599  			case NODE_RECLAIM_NOSCAN:
3600  				/* did not scan */
3601  				continue;
3602  			case NODE_RECLAIM_FULL:
3603  				/* scanned but unreclaimable */
3604  				continue;
3605  			default:
3606  				/* did we reclaim enough */
3607  				if (zone_watermark_ok(zone, order, mark,
3608  						ac_classzone_idx(ac), alloc_flags))
3609  					goto try_this_zone;
3610  
3611  				continue;
3612  			}
3613  		}
3614  
3615  try_this_zone:
3616  		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
3617  				gfp_mask, alloc_flags, ac->migratetype);
3618  		if (page) {
3619  			prep_new_page(page, order, gfp_mask, alloc_flags);
3620  
3621  			/*
3622  			 * If this is a high-order atomic allocation then check
3623  			 * if the pageblock should be reserved for the future
3624  			 */
3625  			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
3626  				reserve_highatomic_pageblock(page, zone, order);
3627  
3628  			return page;
3629  		} else {
3630  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3631  			/* Try again if zone has deferred pages */
3632  			if (static_branch_unlikely(&deferred_pages)) {
3633  				if (_deferred_grow_zone(zone, order))
3634  					goto try_this_zone;
3635  			}
3636  #endif
3637  		}
3638  	}
3639  
3640  	/*
3641  	 * It's possible on a UMA machine to get through all zones that are
3642  	 * fragmented. If avoiding fragmentation, reset and try again.
3643  	 */
3644  	if (no_fallback) {
3645  		alloc_flags &= ~ALLOC_NOFRAGMENT;
3646  		goto retry;
3647  	}
3648  
3649  	return NULL;
3650  }
3651  
3652  static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3653  {
3654  	unsigned int filter = SHOW_MEM_FILTER_NODES;
3655  	static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3656  
3657  	if (!__ratelimit(&show_mem_rs))
3658  		return;
3659  
3660  	/*
3661  	 * This documents exceptions given to allocations in certain
3662  	 * contexts that are allowed to allocate outside current's set
3663  	 * of allowed nodes.
3664  	 */
3665  	if (!(gfp_mask & __GFP_NOMEMALLOC))
3666  		if (tsk_is_oom_victim(current) ||
3667  		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
3668  			filter &= ~SHOW_MEM_FILTER_NODES;
3669  	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3670  		filter &= ~SHOW_MEM_FILTER_NODES;
3671  
3672  	show_mem(filter, nodemask);
3673  }
3674  
3675  void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3676  {
3677  	struct va_format vaf;
3678  	va_list args;
3679  	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3680  				      DEFAULT_RATELIMIT_BURST);
3681  
3682  	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3683  		return;
3684  
3685  	va_start(args, fmt);
3686  	vaf.fmt = fmt;
3687  	vaf.va = &args;
3688  	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
3689  			current->comm, &vaf, gfp_mask, &gfp_mask,
3690  			nodemask_pr_args(nodemask));
3691  	va_end(args);
3692  
3693  	cpuset_print_current_mems_allowed();
3694  	pr_cont("\n");
3695  	dump_stack();
3696  	warn_alloc_show_mem(gfp_mask, nodemask);
3697  }
3698  
3699  static inline struct page *
3700  __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3701  			      unsigned int alloc_flags,
3702  			      const struct alloc_context *ac)
3703  {
3704  	struct page *page;
3705  
3706  	page = get_page_from_freelist(gfp_mask, order,
3707  			alloc_flags|ALLOC_CPUSET, ac);
3708  	/*
3709  	 * fallback to ignore cpuset restriction if our nodes
3710  	 * are depleted
3711  	 */
3712  	if (!page)
3713  		page = get_page_from_freelist(gfp_mask, order,
3714  				alloc_flags, ac);
3715  
3716  	return page;
3717  }
3718  
3719  static inline struct page *
3720  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3721  	const struct alloc_context *ac, unsigned long *did_some_progress)
3722  {
3723  	struct oom_control oc = {
3724  		.zonelist = ac->zonelist,
3725  		.nodemask = ac->nodemask,
3726  		.memcg = NULL,
3727  		.gfp_mask = gfp_mask,
3728  		.order = order,
3729  	};
3730  	struct page *page;
3731  
3732  	*did_some_progress = 0;
3733  
3734  	/*
3735  	 * Acquire the oom lock.  If that fails, somebody else is
3736  	 * making progress for us.
3737  	 */
3738  	if (!mutex_trylock(&oom_lock)) {
3739  		*did_some_progress = 1;
3740  		schedule_timeout_uninterruptible(1);
3741  		return NULL;
3742  	}
3743  
3744  	/*
3745  	 * Go through the zonelist yet one more time, keep very high watermark
3746  	 * here, this is only to catch a parallel oom killing, we must fail if
3747  	 * we're still under heavy pressure. But make sure that this reclaim
3748  	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3749  	 * allocation which will never fail due to oom_lock already held.
3750  	 */
3751  	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
3752  				      ~__GFP_DIRECT_RECLAIM, order,
3753  				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
3754  	if (page)
3755  		goto out;
3756  
3757  	/* Coredumps can quickly deplete all memory reserves */
3758  	if (current->flags & PF_DUMPCORE)
3759  		goto out;
3760  	/* The OOM killer will not help higher order allocs */
3761  	if (order > PAGE_ALLOC_COSTLY_ORDER)
3762  		goto out;
3763  	/*
3764  	 * We have already exhausted all our reclaim opportunities without any
3765  	 * success so it is time to admit defeat. We will skip the OOM killer
3766  	 * because it is very likely that the caller has a more reasonable
3767  	 * fallback than shooting a random task.
3768  	 */
3769  	if (gfp_mask & __GFP_RETRY_MAYFAIL)
3770  		goto out;
3771  	/* The OOM killer does not needlessly kill tasks for lowmem */
3772  	if (ac->high_zoneidx < ZONE_NORMAL)
3773  		goto out;
3774  	if (pm_suspended_storage())
3775  		goto out;
3776  	/*
3777  	 * XXX: GFP_NOFS allocations should rather fail than rely on
3778  	 * other request to make a forward progress.
3779  	 * We are in an unfortunate situation where out_of_memory cannot
3780  	 * do much for this context but let's try it to at least get
3781  	 * access to memory reserved if the current task is killed (see
3782  	 * out_of_memory). Once filesystems are ready to handle allocation
3783  	 * failures more gracefully we should just bail out here.
3784  	 */
3785  
3786  	/* The OOM killer may not free memory on a specific node */
3787  	if (gfp_mask & __GFP_THISNODE)
3788  		goto out;
3789  
3790  	/* Exhausted what can be done so it's blame time */
3791  	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3792  		*did_some_progress = 1;
3793  
3794  		/*
3795  		 * Help non-failing allocations by giving them access to memory
3796  		 * reserves
3797  		 */
3798  		if (gfp_mask & __GFP_NOFAIL)
3799  			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3800  					ALLOC_NO_WATERMARKS, ac);
3801  	}
3802  out:
3803  	mutex_unlock(&oom_lock);
3804  	return page;
3805  }
3806  
3807  /*
3808   * Maximum number of compaction retries wit a progress before OOM
3809   * killer is consider as the only way to move forward.
3810   */
3811  #define MAX_COMPACT_RETRIES 16
3812  
3813  #ifdef CONFIG_COMPACTION
3814  /* Try memory compaction for high-order allocations before reclaim */
3815  static struct page *
3816  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3817  		unsigned int alloc_flags, const struct alloc_context *ac,
3818  		enum compact_priority prio, enum compact_result *compact_result)
3819  {
3820  	struct page *page = NULL;
3821  	unsigned long pflags;
3822  	unsigned int noreclaim_flag;
3823  
3824  	if (!order)
3825  		return NULL;
3826  
3827  	psi_memstall_enter(&pflags);
3828  	noreclaim_flag = memalloc_noreclaim_save();
3829  
3830  	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3831  								prio, &page);
3832  
3833  	memalloc_noreclaim_restore(noreclaim_flag);
3834  	psi_memstall_leave(&pflags);
3835  
3836  	/*
3837  	 * At least in one zone compaction wasn't deferred or skipped, so let's
3838  	 * count a compaction stall
3839  	 */
3840  	count_vm_event(COMPACTSTALL);
3841  
3842  	/* Prep a captured page if available */
3843  	if (page)
3844  		prep_new_page(page, order, gfp_mask, alloc_flags);
3845  
3846  	/* Try get a page from the freelist if available */
3847  	if (!page)
3848  		page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3849  
3850  	if (page) {
3851  		struct zone *zone = page_zone(page);
3852  
3853  		zone->compact_blockskip_flush = false;
3854  		compaction_defer_reset(zone, order, true);
3855  		count_vm_event(COMPACTSUCCESS);
3856  		return page;
3857  	}
3858  
3859  	/*
3860  	 * It's bad if compaction run occurs and fails. The most likely reason
3861  	 * is that pages exist, but not enough to satisfy watermarks.
3862  	 */
3863  	count_vm_event(COMPACTFAIL);
3864  
3865  	cond_resched();
3866  
3867  	return NULL;
3868  }
3869  
3870  static inline bool
3871  should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3872  		     enum compact_result compact_result,
3873  		     enum compact_priority *compact_priority,
3874  		     int *compaction_retries)
3875  {
3876  	int max_retries = MAX_COMPACT_RETRIES;
3877  	int min_priority;
3878  	bool ret = false;
3879  	int retries = *compaction_retries;
3880  	enum compact_priority priority = *compact_priority;
3881  
3882  	if (!order)
3883  		return false;
3884  
3885  	if (compaction_made_progress(compact_result))
3886  		(*compaction_retries)++;
3887  
3888  	/*
3889  	 * compaction considers all the zone as desperately out of memory
3890  	 * so it doesn't really make much sense to retry except when the
3891  	 * failure could be caused by insufficient priority
3892  	 */
3893  	if (compaction_failed(compact_result))
3894  		goto check_priority;
3895  
3896  	/*
3897  	 * make sure the compaction wasn't deferred or didn't bail out early
3898  	 * due to locks contention before we declare that we should give up.
3899  	 * But do not retry if the given zonelist is not suitable for
3900  	 * compaction.
3901  	 */
3902  	if (compaction_withdrawn(compact_result)) {
3903  		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3904  		goto out;
3905  	}
3906  
3907  	/*
3908  	 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
3909  	 * costly ones because they are de facto nofail and invoke OOM
3910  	 * killer to move on while costly can fail and users are ready
3911  	 * to cope with that. 1/4 retries is rather arbitrary but we
3912  	 * would need much more detailed feedback from compaction to
3913  	 * make a better decision.
3914  	 */
3915  	if (order > PAGE_ALLOC_COSTLY_ORDER)
3916  		max_retries /= 4;
3917  	if (*compaction_retries <= max_retries) {
3918  		ret = true;
3919  		goto out;
3920  	}
3921  
3922  	/*
3923  	 * Make sure there are attempts at the highest priority if we exhausted
3924  	 * all retries or failed at the lower priorities.
3925  	 */
3926  check_priority:
3927  	min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3928  			MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3929  
3930  	if (*compact_priority > min_priority) {
3931  		(*compact_priority)--;
3932  		*compaction_retries = 0;
3933  		ret = true;
3934  	}
3935  out:
3936  	trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3937  	return ret;
3938  }
3939  #else
3940  static inline struct page *
3941  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3942  		unsigned int alloc_flags, const struct alloc_context *ac,
3943  		enum compact_priority prio, enum compact_result *compact_result)
3944  {
3945  	*compact_result = COMPACT_SKIPPED;
3946  	return NULL;
3947  }
3948  
3949  static inline bool
3950  should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3951  		     enum compact_result compact_result,
3952  		     enum compact_priority *compact_priority,
3953  		     int *compaction_retries)
3954  {
3955  	struct zone *zone;
3956  	struct zoneref *z;
3957  
3958  	if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3959  		return false;
3960  
3961  	/*
3962  	 * There are setups with compaction disabled which would prefer to loop
3963  	 * inside the allocator rather than hit the oom killer prematurely.
3964  	 * Let's give them a good hope and keep retrying while the order-0
3965  	 * watermarks are OK.
3966  	 */
3967  	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3968  					ac->nodemask) {
3969  		if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3970  					ac_classzone_idx(ac), alloc_flags))
3971  			return true;
3972  	}
3973  	return false;
3974  }
3975  #endif /* CONFIG_COMPACTION */
3976  
3977  #ifdef CONFIG_LOCKDEP
3978  static struct lockdep_map __fs_reclaim_map =
3979  	STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3980  
3981  static bool __need_fs_reclaim(gfp_t gfp_mask)
3982  {
3983  	gfp_mask = current_gfp_context(gfp_mask);
3984  
3985  	/* no reclaim without waiting on it */
3986  	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3987  		return false;
3988  
3989  	/* this guy won't enter reclaim */
3990  	if (current->flags & PF_MEMALLOC)
3991  		return false;
3992  
3993  	/* We're only interested __GFP_FS allocations for now */
3994  	if (!(gfp_mask & __GFP_FS))
3995  		return false;
3996  
3997  	if (gfp_mask & __GFP_NOLOCKDEP)
3998  		return false;
3999  
4000  	return true;
4001  }
4002  
4003  void __fs_reclaim_acquire(void)
4004  {
4005  	lock_map_acquire(&__fs_reclaim_map);
4006  }
4007  
4008  void __fs_reclaim_release(void)
4009  {
4010  	lock_map_release(&__fs_reclaim_map);
4011  }
4012  
4013  void fs_reclaim_acquire(gfp_t gfp_mask)
4014  {
4015  	if (__need_fs_reclaim(gfp_mask))
4016  		__fs_reclaim_acquire();
4017  }
4018  EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
4019  
4020  void fs_reclaim_release(gfp_t gfp_mask)
4021  {
4022  	if (__need_fs_reclaim(gfp_mask))
4023  		__fs_reclaim_release();
4024  }
4025  EXPORT_SYMBOL_GPL(fs_reclaim_release);
4026  #endif
4027  
4028  /* Perform direct synchronous page reclaim */
4029  static int
4030  __perform_reclaim(gfp_t gfp_mask, unsigned int order,
4031  					const struct alloc_context *ac)
4032  {
4033  	struct reclaim_state reclaim_state;
4034  	int progress;
4035  	unsigned int noreclaim_flag;
4036  	unsigned long pflags;
4037  
4038  	cond_resched();
4039  
4040  	/* We now go into synchronous reclaim */
4041  	cpuset_memory_pressure_bump();
4042  	psi_memstall_enter(&pflags);
4043  	fs_reclaim_acquire(gfp_mask);
4044  	noreclaim_flag = memalloc_noreclaim_save();
4045  	reclaim_state.reclaimed_slab = 0;
4046  	current->reclaim_state = &reclaim_state;
4047  
4048  	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
4049  								ac->nodemask);
4050  
4051  	current->reclaim_state = NULL;
4052  	memalloc_noreclaim_restore(noreclaim_flag);
4053  	fs_reclaim_release(gfp_mask);
4054  	psi_memstall_leave(&pflags);
4055  
4056  	cond_resched();
4057  
4058  	return progress;
4059  }
4060  
4061  /* The really slow allocator path where we enter direct reclaim */
4062  static inline struct page *
4063  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
4064  		unsigned int alloc_flags, const struct alloc_context *ac,
4065  		unsigned long *did_some_progress)
4066  {
4067  	struct page *page = NULL;
4068  	bool drained = false;
4069  
4070  	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
4071  	if (unlikely(!(*did_some_progress)))
4072  		return NULL;
4073  
4074  retry:
4075  	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4076  
4077  	/*
4078  	 * If an allocation failed after direct reclaim, it could be because
4079  	 * pages are pinned on the per-cpu lists or in high alloc reserves.
4080  	 * Shrink them them and try again
4081  	 */
4082  	if (!page && !drained) {
4083  		unreserve_highatomic_pageblock(ac, false);
4084  		drain_all_pages(NULL);
4085  		drained = true;
4086  		goto retry;
4087  	}
4088  
4089  	return page;
4090  }
4091  
4092  static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
4093  			     const struct alloc_context *ac)
4094  {
4095  	struct zoneref *z;
4096  	struct zone *zone;
4097  	pg_data_t *last_pgdat = NULL;
4098  	enum zone_type high_zoneidx = ac->high_zoneidx;
4099  
4100  	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
4101  					ac->nodemask) {
4102  		if (last_pgdat != zone->zone_pgdat)
4103  			wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
4104  		last_pgdat = zone->zone_pgdat;
4105  	}
4106  }
4107  
4108  static inline unsigned int
4109  gfp_to_alloc_flags(gfp_t gfp_mask)
4110  {
4111  	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
4112  
4113  	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
4114  	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
4115  
4116  	/*
4117  	 * The caller may dip into page reserves a bit more if the caller
4118  	 * cannot run direct reclaim, or if the caller has realtime scheduling
4119  	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
4120  	 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
4121  	 */
4122  	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
4123  
4124  	if (gfp_mask & __GFP_ATOMIC) {
4125  		/*
4126  		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
4127  		 * if it can't schedule.
4128  		 */
4129  		if (!(gfp_mask & __GFP_NOMEMALLOC))
4130  			alloc_flags |= ALLOC_HARDER;
4131  		/*
4132  		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
4133  		 * comment for __cpuset_node_allowed().
4134  		 */
4135  		alloc_flags &= ~ALLOC_CPUSET;
4136  	} else if (unlikely(rt_task(current)) && !in_interrupt())
4137  		alloc_flags |= ALLOC_HARDER;
4138  
4139  	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4140  		alloc_flags |= ALLOC_KSWAPD;
4141  
4142  #ifdef CONFIG_CMA
4143  	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4144  		alloc_flags |= ALLOC_CMA;
4145  #endif
4146  	return alloc_flags;
4147  }
4148  
4149  static bool oom_reserves_allowed(struct task_struct *tsk)
4150  {
4151  	if (!tsk_is_oom_victim(tsk))
4152  		return false;
4153  
4154  	/*
4155  	 * !MMU doesn't have oom reaper so give access to memory reserves
4156  	 * only to the thread with TIF_MEMDIE set
4157  	 */
4158  	if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
4159  		return false;
4160  
4161  	return true;
4162  }
4163  
4164  /*
4165   * Distinguish requests which really need access to full memory
4166   * reserves from oom victims which can live with a portion of it
4167   */
4168  static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
4169  {
4170  	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
4171  		return 0;
4172  	if (gfp_mask & __GFP_MEMALLOC)
4173  		return ALLOC_NO_WATERMARKS;
4174  	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
4175  		return ALLOC_NO_WATERMARKS;
4176  	if (!in_interrupt()) {
4177  		if (current->flags & PF_MEMALLOC)
4178  			return ALLOC_NO_WATERMARKS;
4179  		else if (oom_reserves_allowed(current))
4180  			return ALLOC_OOM;
4181  	}
4182  
4183  	return 0;
4184  }
4185  
4186  bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
4187  {
4188  	return !!__gfp_pfmemalloc_flags(gfp_mask);
4189  }
4190  
4191  /*
4192   * Checks whether it makes sense to retry the reclaim to make a forward progress
4193   * for the given allocation request.
4194   *
4195   * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
4196   * without success, or when we couldn't even meet the watermark if we
4197   * reclaimed all remaining pages on the LRU lists.
4198   *
4199   * Returns true if a retry is viable or false to enter the oom path.
4200   */
4201  static inline bool
4202  should_reclaim_retry(gfp_t gfp_mask, unsigned order,
4203  		     struct alloc_context *ac, int alloc_flags,
4204  		     bool did_some_progress, int *no_progress_loops)
4205  {
4206  	struct zone *zone;
4207  	struct zoneref *z;
4208  	bool ret = false;
4209  
4210  	/*
4211  	 * Costly allocations might have made a progress but this doesn't mean
4212  	 * their order will become available due to high fragmentation so
4213  	 * always increment the no progress counter for them
4214  	 */
4215  	if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
4216  		*no_progress_loops = 0;
4217  	else
4218  		(*no_progress_loops)++;
4219  
4220  	/*
4221  	 * Make sure we converge to OOM if we cannot make any progress
4222  	 * several times in the row.
4223  	 */
4224  	if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
4225  		/* Before OOM, exhaust highatomic_reserve */
4226  		return unreserve_highatomic_pageblock(ac, true);
4227  	}
4228  
4229  	/*
4230  	 * Keep reclaiming pages while there is a chance this will lead
4231  	 * somewhere.  If none of the target zones can satisfy our allocation
4232  	 * request even if all reclaimable pages are considered then we are
4233  	 * screwed and have to go OOM.
4234  	 */
4235  	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
4236  					ac->nodemask) {
4237  		unsigned long available;
4238  		unsigned long reclaimable;
4239  		unsigned long min_wmark = min_wmark_pages(zone);
4240  		bool wmark;
4241  
4242  		available = reclaimable = zone_reclaimable_pages(zone);
4243  		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
4244  
4245  		/*
4246  		 * Would the allocation succeed if we reclaimed all
4247  		 * reclaimable pages?
4248  		 */
4249  		wmark = __zone_watermark_ok(zone, order, min_wmark,
4250  				ac_classzone_idx(ac), alloc_flags, available);
4251  		trace_reclaim_retry_zone(z, order, reclaimable,
4252  				available, min_wmark, *no_progress_loops, wmark);
4253  		if (wmark) {
4254  			/*
4255  			 * If we didn't make any progress and have a lot of
4256  			 * dirty + writeback pages then we should wait for
4257  			 * an IO to complete to slow down the reclaim and
4258  			 * prevent from pre mature OOM
4259  			 */
4260  			if (!did_some_progress) {
4261  				unsigned long write_pending;
4262  
4263  				write_pending = zone_page_state_snapshot(zone,
4264  							NR_ZONE_WRITE_PENDING);
4265  
4266  				if (2 * write_pending > reclaimable) {
4267  					congestion_wait(BLK_RW_ASYNC, HZ/10);
4268  					return true;
4269  				}
4270  			}
4271  
4272  			ret = true;
4273  			goto out;
4274  		}
4275  	}
4276  
4277  out:
4278  	/*
4279  	 * Memory allocation/reclaim might be called from a WQ context and the
4280  	 * current implementation of the WQ concurrency control doesn't
4281  	 * recognize that a particular WQ is congested if the worker thread is
4282  	 * looping without ever sleeping. Therefore we have to do a short sleep
4283  	 * here rather than calling cond_resched().
4284  	 */
4285  	if (current->flags & PF_WQ_WORKER)
4286  		schedule_timeout_uninterruptible(1);
4287  	else
4288  		cond_resched();
4289  	return ret;
4290  }
4291  
4292  static inline bool
4293  check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4294  {
4295  	/*
4296  	 * It's possible that cpuset's mems_allowed and the nodemask from
4297  	 * mempolicy don't intersect. This should be normally dealt with by
4298  	 * policy_nodemask(), but it's possible to race with cpuset update in
4299  	 * such a way the check therein was true, and then it became false
4300  	 * before we got our cpuset_mems_cookie here.
4301  	 * This assumes that for all allocations, ac->nodemask can come only
4302  	 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
4303  	 * when it does not intersect with the cpuset restrictions) or the
4304  	 * caller can deal with a violated nodemask.
4305  	 */
4306  	if (cpusets_enabled() && ac->nodemask &&
4307  			!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
4308  		ac->nodemask = NULL;
4309  		return true;
4310  	}
4311  
4312  	/*
4313  	 * When updating a task's mems_allowed or mempolicy nodemask, it is
4314  	 * possible to race with parallel threads in such a way that our
4315  	 * allocation can fail while the mask is being updated. If we are about
4316  	 * to fail, check if the cpuset changed during allocation and if so,
4317  	 * retry.
4318  	 */
4319  	if (read_mems_allowed_retry(cpuset_mems_cookie))
4320  		return true;
4321  
4322  	return false;
4323  }
4324  
4325  static inline struct page *
4326  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
4327  						struct alloc_context *ac)
4328  {
4329  	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
4330  	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
4331  	struct page *page = NULL;
4332  	unsigned int alloc_flags;
4333  	unsigned long did_some_progress;
4334  	enum compact_priority compact_priority;
4335  	enum compact_result compact_result;
4336  	int compaction_retries;
4337  	int no_progress_loops;
4338  	unsigned int cpuset_mems_cookie;
4339  	int reserve_flags;
4340  
4341  	/*
4342  	 * We also sanity check to catch abuse of atomic reserves being used by
4343  	 * callers that are not in atomic context.
4344  	 */
4345  	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
4346  				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
4347  		gfp_mask &= ~__GFP_ATOMIC;
4348  
4349  retry_cpuset:
4350  	compaction_retries = 0;
4351  	no_progress_loops = 0;
4352  	compact_priority = DEF_COMPACT_PRIORITY;
4353  	cpuset_mems_cookie = read_mems_allowed_begin();
4354  
4355  	/*
4356  	 * The fast path uses conservative alloc_flags to succeed only until
4357  	 * kswapd needs to be woken up, and to avoid the cost of setting up
4358  	 * alloc_flags precisely. So we do that now.
4359  	 */
4360  	alloc_flags = gfp_to_alloc_flags(gfp_mask);
4361  
4362  	/*
4363  	 * We need to recalculate the starting point for the zonelist iterator
4364  	 * because we might have used different nodemask in the fast path, or
4365  	 * there was a cpuset modification and we are retrying - otherwise we
4366  	 * could end up iterating over non-eligible zones endlessly.
4367  	 */
4368  	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4369  					ac->high_zoneidx, ac->nodemask);
4370  	if (!ac->preferred_zoneref->zone)
4371  		goto nopage;
4372  
4373  	if (alloc_flags & ALLOC_KSWAPD)
4374  		wake_all_kswapds(order, gfp_mask, ac);
4375  
4376  	/*
4377  	 * The adjusted alloc_flags might result in immediate success, so try
4378  	 * that first
4379  	 */
4380  	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4381  	if (page)
4382  		goto got_pg;
4383  
4384  	/*
4385  	 * For costly allocations, try direct compaction first, as it's likely
4386  	 * that we have enough base pages and don't need to reclaim. For non-
4387  	 * movable high-order allocations, do that as well, as compaction will
4388  	 * try prevent permanent fragmentation by migrating from blocks of the
4389  	 * same migratetype.
4390  	 * Don't try this for allocations that are allowed to ignore
4391  	 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
4392  	 */
4393  	if (can_direct_reclaim &&
4394  			(costly_order ||
4395  			   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
4396  			&& !gfp_pfmemalloc_allowed(gfp_mask)) {
4397  		page = __alloc_pages_direct_compact(gfp_mask, order,
4398  						alloc_flags, ac,
4399  						INIT_COMPACT_PRIORITY,
4400  						&compact_result);
4401  		if (page)
4402  			goto got_pg;
4403  
4404  		/*
4405  		 * Checks for costly allocations with __GFP_NORETRY, which
4406  		 * includes THP page fault allocations
4407  		 */
4408  		if (costly_order && (gfp_mask & __GFP_NORETRY)) {
4409  			/*
4410  			 * If compaction is deferred for high-order allocations,
4411  			 * it is because sync compaction recently failed. If
4412  			 * this is the case and the caller requested a THP
4413  			 * allocation, we do not want to heavily disrupt the
4414  			 * system, so we fail the allocation instead of entering
4415  			 * direct reclaim.
4416  			 */
4417  			if (compact_result == COMPACT_DEFERRED)
4418  				goto nopage;
4419  
4420  			/*
4421  			 * Looks like reclaim/compaction is worth trying, but
4422  			 * sync compaction could be very expensive, so keep
4423  			 * using async compaction.
4424  			 */
4425  			compact_priority = INIT_COMPACT_PRIORITY;
4426  		}
4427  	}
4428  
4429  retry:
4430  	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4431  	if (alloc_flags & ALLOC_KSWAPD)
4432  		wake_all_kswapds(order, gfp_mask, ac);
4433  
4434  	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
4435  	if (reserve_flags)
4436  		alloc_flags = reserve_flags;
4437  
4438  	/*
4439  	 * Reset the nodemask and zonelist iterators if memory policies can be
4440  	 * ignored. These allocations are high priority and system rather than
4441  	 * user oriented.
4442  	 */
4443  	if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
4444  		ac->nodemask = NULL;
4445  		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4446  					ac->high_zoneidx, ac->nodemask);
4447  	}
4448  
4449  	/* Attempt with potentially adjusted zonelist and alloc_flags */
4450  	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4451  	if (page)
4452  		goto got_pg;
4453  
4454  	/* Caller is not willing to reclaim, we can't balance anything */
4455  	if (!can_direct_reclaim)
4456  		goto nopage;
4457  
4458  	/* Avoid recursion of direct reclaim */
4459  	if (current->flags & PF_MEMALLOC)
4460  		goto nopage;
4461  
4462  	/* Try direct reclaim and then allocating */
4463  	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4464  							&did_some_progress);
4465  	if (page)
4466  		goto got_pg;
4467  
4468  	/* Try direct compaction and then allocating */
4469  	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4470  					compact_priority, &compact_result);
4471  	if (page)
4472  		goto got_pg;
4473  
4474  	/* Do not loop if specifically requested */
4475  	if (gfp_mask & __GFP_NORETRY)
4476  		goto nopage;
4477  
4478  	/*
4479  	 * Do not retry costly high order allocations unless they are
4480  	 * __GFP_RETRY_MAYFAIL
4481  	 */
4482  	if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4483  		goto nopage;
4484  
4485  	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4486  				 did_some_progress > 0, &no_progress_loops))
4487  		goto retry;
4488  
4489  	/*
4490  	 * It doesn't make any sense to retry for the compaction if the order-0
4491  	 * reclaim is not able to make any progress because the current
4492  	 * implementation of the compaction depends on the sufficient amount
4493  	 * of free memory (see __compaction_suitable)
4494  	 */
4495  	if (did_some_progress > 0 &&
4496  			should_compact_retry(ac, order, alloc_flags,
4497  				compact_result, &compact_priority,
4498  				&compaction_retries))
4499  		goto retry;
4500  
4501  
4502  	/* Deal with possible cpuset update races before we start OOM killing */
4503  	if (check_retry_cpuset(cpuset_mems_cookie, ac))
4504  		goto retry_cpuset;
4505  
4506  	/* Reclaim has failed us, start killing things */
4507  	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4508  	if (page)
4509  		goto got_pg;
4510  
4511  	/* Avoid allocations with no watermarks from looping endlessly */
4512  	if (tsk_is_oom_victim(current) &&
4513  	    (alloc_flags == ALLOC_OOM ||
4514  	     (gfp_mask & __GFP_NOMEMALLOC)))
4515  		goto nopage;
4516  
4517  	/* Retry as long as the OOM killer is making progress */
4518  	if (did_some_progress) {
4519  		no_progress_loops = 0;
4520  		goto retry;
4521  	}
4522  
4523  nopage:
4524  	/* Deal with possible cpuset update races before we fail */
4525  	if (check_retry_cpuset(cpuset_mems_cookie, ac))
4526  		goto retry_cpuset;
4527  
4528  	/*
4529  	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4530  	 * we always retry
4531  	 */
4532  	if (gfp_mask & __GFP_NOFAIL) {
4533  		/*
4534  		 * All existing users of the __GFP_NOFAIL are blockable, so warn
4535  		 * of any new users that actually require GFP_NOWAIT
4536  		 */
4537  		if (WARN_ON_ONCE(!can_direct_reclaim))
4538  			goto fail;
4539  
4540  		/*
4541  		 * PF_MEMALLOC request from this context is rather bizarre
4542  		 * because we cannot reclaim anything and only can loop waiting
4543  		 * for somebody to do a work for us
4544  		 */
4545  		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4546  
4547  		/*
4548  		 * non failing costly orders are a hard requirement which we
4549  		 * are not prepared for much so let's warn about these users
4550  		 * so that we can identify them and convert them to something
4551  		 * else.
4552  		 */
4553  		WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
4554  
4555  		/*
4556  		 * Help non-failing allocations by giving them access to memory
4557  		 * reserves but do not use ALLOC_NO_WATERMARKS because this
4558  		 * could deplete whole memory reserves which would just make
4559  		 * the situation worse
4560  		 */
4561  		page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
4562  		if (page)
4563  			goto got_pg;
4564  
4565  		cond_resched();
4566  		goto retry;
4567  	}
4568  fail:
4569  	warn_alloc(gfp_mask, ac->nodemask,
4570  			"page allocation failure: order:%u", order);
4571  got_pg:
4572  	return page;
4573  }
4574  
4575  static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4576  		int preferred_nid, nodemask_t *nodemask,
4577  		struct alloc_context *ac, gfp_t *alloc_mask,
4578  		unsigned int *alloc_flags)
4579  {
4580  	ac->high_zoneidx = gfp_zone(gfp_mask);
4581  	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
4582  	ac->nodemask = nodemask;
4583  	ac->migratetype = gfpflags_to_migratetype(gfp_mask);
4584  
4585  	if (cpusets_enabled()) {
4586  		*alloc_mask |= __GFP_HARDWALL;
4587  		if (!ac->nodemask)
4588  			ac->nodemask = &cpuset_current_mems_allowed;
4589  		else
4590  			*alloc_flags |= ALLOC_CPUSET;
4591  	}
4592  
4593  	fs_reclaim_acquire(gfp_mask);
4594  	fs_reclaim_release(gfp_mask);
4595  
4596  	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
4597  
4598  	if (should_fail_alloc_page(gfp_mask, order))
4599  		return false;
4600  
4601  	if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4602  		*alloc_flags |= ALLOC_CMA;
4603  
4604  	return true;
4605  }
4606  
4607  /* Determine whether to spread dirty pages and what the first usable zone */
4608  static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
4609  {
4610  	/* Dirty zone balancing only done in the fast path */
4611  	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4612  
4613  	/*
4614  	 * The preferred zone is used for statistics but crucially it is
4615  	 * also used as the starting point for the zonelist iterator. It
4616  	 * may get reset for allocations that ignore memory policies.
4617  	 */
4618  	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4619  					ac->high_zoneidx, ac->nodemask);
4620  }
4621  
4622  /*
4623   * This is the 'heart' of the zoned buddy allocator.
4624   */
4625  struct page *
4626  __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
4627  							nodemask_t *nodemask)
4628  {
4629  	struct page *page;
4630  	unsigned int alloc_flags = ALLOC_WMARK_LOW;
4631  	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
4632  	struct alloc_context ac = { };
4633  
4634  	/*
4635  	 * There are several places where we assume that the order value is sane
4636  	 * so bail out early if the request is out of bound.
4637  	 */
4638  	if (unlikely(order >= MAX_ORDER)) {
4639  		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
4640  		return NULL;
4641  	}
4642  
4643  	gfp_mask &= gfp_allowed_mask;
4644  	alloc_mask = gfp_mask;
4645  	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
4646  		return NULL;
4647  
4648  	finalise_ac(gfp_mask, &ac);
4649  
4650  	/*
4651  	 * Forbid the first pass from falling back to types that fragment
4652  	 * memory until all local zones are considered.
4653  	 */
4654  	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
4655  
4656  	/* First allocation attempt */
4657  	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
4658  	if (likely(page))
4659  		goto out;
4660  
4661  	/*
4662  	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
4663  	 * resp. GFP_NOIO which has to be inherited for all allocation requests
4664  	 * from a particular context which has been marked by
4665  	 * memalloc_no{fs,io}_{save,restore}.
4666  	 */
4667  	alloc_mask = current_gfp_context(gfp_mask);
4668  	ac.spread_dirty_pages = false;
4669  
4670  	/*
4671  	 * Restore the original nodemask if it was potentially replaced with
4672  	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4673  	 */
4674  	if (unlikely(ac.nodemask != nodemask))
4675  		ac.nodemask = nodemask;
4676  
4677  	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
4678  
4679  out:
4680  	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4681  	    unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
4682  		__free_pages(page, order);
4683  		page = NULL;
4684  	}
4685  
4686  	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4687  
4688  	return page;
4689  }
4690  EXPORT_SYMBOL(__alloc_pages_nodemask);
4691  
4692  /*
4693   * Common helper functions. Never use with __GFP_HIGHMEM because the returned
4694   * address cannot represent highmem pages. Use alloc_pages and then kmap if
4695   * you need to access high mem.
4696   */
4697  unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
4698  {
4699  	struct page *page;
4700  
4701  	page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
4702  	if (!page)
4703  		return 0;
4704  	return (unsigned long) page_address(page);
4705  }
4706  EXPORT_SYMBOL(__get_free_pages);
4707  
4708  unsigned long get_zeroed_page(gfp_t gfp_mask)
4709  {
4710  	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
4711  }
4712  EXPORT_SYMBOL(get_zeroed_page);
4713  
4714  static inline void free_the_page(struct page *page, unsigned int order)
4715  {
4716  	if (order == 0)		/* Via pcp? */
4717  		free_unref_page(page);
4718  	else
4719  		__free_pages_ok(page, order);
4720  }
4721  
4722  void __free_pages(struct page *page, unsigned int order)
4723  {
4724  	if (put_page_testzero(page))
4725  		free_the_page(page, order);
4726  }
4727  EXPORT_SYMBOL(__free_pages);
4728  
4729  void free_pages(unsigned long addr, unsigned int order)
4730  {
4731  	if (addr != 0) {
4732  		VM_BUG_ON(!virt_addr_valid((void *)addr));
4733  		__free_pages(virt_to_page((void *)addr), order);
4734  	}
4735  }
4736  
4737  EXPORT_SYMBOL(free_pages);
4738  
4739  /*
4740   * Page Fragment:
4741   *  An arbitrary-length arbitrary-offset area of memory which resides
4742   *  within a 0 or higher order page.  Multiple fragments within that page
4743   *  are individually refcounted, in the page's reference counter.
4744   *
4745   * The page_frag functions below provide a simple allocation framework for
4746   * page fragments.  This is used by the network stack and network device
4747   * drivers to provide a backing region of memory for use as either an
4748   * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
4749   */
4750  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
4751  					     gfp_t gfp_mask)
4752  {
4753  	struct page *page = NULL;
4754  	gfp_t gfp = gfp_mask;
4755  
4756  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4757  	gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
4758  		    __GFP_NOMEMALLOC;
4759  	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4760  				PAGE_FRAG_CACHE_MAX_ORDER);
4761  	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4762  #endif
4763  	if (unlikely(!page))
4764  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
4765  
4766  	nc->va = page ? page_address(page) : NULL;
4767  
4768  	return page;
4769  }
4770  
4771  void __page_frag_cache_drain(struct page *page, unsigned int count)
4772  {
4773  	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4774  
4775  	if (page_ref_sub_and_test(page, count))
4776  		free_the_page(page, compound_order(page));
4777  }
4778  EXPORT_SYMBOL(__page_frag_cache_drain);
4779  
4780  void *page_frag_alloc(struct page_frag_cache *nc,
4781  		      unsigned int fragsz, gfp_t gfp_mask)
4782  {
4783  	unsigned int size = PAGE_SIZE;
4784  	struct page *page;
4785  	int offset;
4786  
4787  	if (unlikely(!nc->va)) {
4788  refill:
4789  		page = __page_frag_cache_refill(nc, gfp_mask);
4790  		if (!page)
4791  			return NULL;
4792  
4793  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4794  		/* if size can vary use size else just use PAGE_SIZE */
4795  		size = nc->size;
4796  #endif
4797  		/* Even if we own the page, we do not use atomic_set().
4798  		 * This would break get_page_unless_zero() users.
4799  		 */
4800  		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
4801  
4802  		/* reset page count bias and offset to start of new frag */
4803  		nc->pfmemalloc = page_is_pfmemalloc(page);
4804  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
4805  		nc->offset = size;
4806  	}
4807  
4808  	offset = nc->offset - fragsz;
4809  	if (unlikely(offset < 0)) {
4810  		page = virt_to_page(nc->va);
4811  
4812  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
4813  			goto refill;
4814  
4815  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4816  		/* if size can vary use size else just use PAGE_SIZE */
4817  		size = nc->size;
4818  #endif
4819  		/* OK, page count is 0, we can safely set it */
4820  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
4821  
4822  		/* reset page count bias and offset to start of new frag */
4823  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
4824  		offset = size - fragsz;
4825  	}
4826  
4827  	nc->pagecnt_bias--;
4828  	nc->offset = offset;
4829  
4830  	return nc->va + offset;
4831  }
4832  EXPORT_SYMBOL(page_frag_alloc);
4833  
4834  /*
4835   * Frees a page fragment allocated out of either a compound or order 0 page.
4836   */
4837  void page_frag_free(void *addr)
4838  {
4839  	struct page *page = virt_to_head_page(addr);
4840  
4841  	if (unlikely(put_page_testzero(page)))
4842  		free_the_page(page, compound_order(page));
4843  }
4844  EXPORT_SYMBOL(page_frag_free);
4845  
4846  static void *make_alloc_exact(unsigned long addr, unsigned int order,
4847  		size_t size)
4848  {
4849  	if (addr) {
4850  		unsigned long alloc_end = addr + (PAGE_SIZE << order);
4851  		unsigned long used = addr + PAGE_ALIGN(size);
4852  
4853  		split_page(virt_to_page((void *)addr), order);
4854  		while (used < alloc_end) {
4855  			free_page(used);
4856  			used += PAGE_SIZE;
4857  		}
4858  	}
4859  	return (void *)addr;
4860  }
4861  
4862  /**
4863   * alloc_pages_exact - allocate an exact number physically-contiguous pages.
4864   * @size: the number of bytes to allocate
4865   * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
4866   *
4867   * This function is similar to alloc_pages(), except that it allocates the
4868   * minimum number of pages to satisfy the request.  alloc_pages() can only
4869   * allocate memory in power-of-two pages.
4870   *
4871   * This function is also limited by MAX_ORDER.
4872   *
4873   * Memory allocated by this function must be released by free_pages_exact().
4874   *
4875   * Return: pointer to the allocated area or %NULL in case of error.
4876   */
4877  void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4878  {
4879  	unsigned int order = get_order(size);
4880  	unsigned long addr;
4881  
4882  	if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
4883  		gfp_mask &= ~__GFP_COMP;
4884  
4885  	addr = __get_free_pages(gfp_mask, order);
4886  	return make_alloc_exact(addr, order, size);
4887  }
4888  EXPORT_SYMBOL(alloc_pages_exact);
4889  
4890  /**
4891   * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4892   *			   pages on a node.
4893   * @nid: the preferred node ID where memory should be allocated
4894   * @size: the number of bytes to allocate
4895   * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
4896   *
4897   * Like alloc_pages_exact(), but try to allocate on node nid first before falling
4898   * back.
4899   *
4900   * Return: pointer to the allocated area or %NULL in case of error.
4901   */
4902  void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
4903  {
4904  	unsigned int order = get_order(size);
4905  	struct page *p;
4906  
4907  	if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
4908  		gfp_mask &= ~__GFP_COMP;
4909  
4910  	p = alloc_pages_node(nid, gfp_mask, order);
4911  	if (!p)
4912  		return NULL;
4913  	return make_alloc_exact((unsigned long)page_address(p), order, size);
4914  }
4915  
4916  /**
4917   * free_pages_exact - release memory allocated via alloc_pages_exact()
4918   * @virt: the value returned by alloc_pages_exact.
4919   * @size: size of allocation, same value as passed to alloc_pages_exact().
4920   *
4921   * Release the memory allocated by a previous call to alloc_pages_exact.
4922   */
4923  void free_pages_exact(void *virt, size_t size)
4924  {
4925  	unsigned long addr = (unsigned long)virt;
4926  	unsigned long end = addr + PAGE_ALIGN(size);
4927  
4928  	while (addr < end) {
4929  		free_page(addr);
4930  		addr += PAGE_SIZE;
4931  	}
4932  }
4933  EXPORT_SYMBOL(free_pages_exact);
4934  
4935  /**
4936   * nr_free_zone_pages - count number of pages beyond high watermark
4937   * @offset: The zone index of the highest zone
4938   *
4939   * nr_free_zone_pages() counts the number of pages which are beyond the
4940   * high watermark within all zones at or below a given zone index.  For each
4941   * zone, the number of pages is calculated as:
4942   *
4943   *     nr_free_zone_pages = managed_pages - high_pages
4944   *
4945   * Return: number of pages beyond high watermark.
4946   */
4947  static unsigned long nr_free_zone_pages(int offset)
4948  {
4949  	struct zoneref *z;
4950  	struct zone *zone;
4951  
4952  	/* Just pick one node, since fallback list is circular */
4953  	unsigned long sum = 0;
4954  
4955  	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
4956  
4957  	for_each_zone_zonelist(zone, z, zonelist, offset) {
4958  		unsigned long size = zone_managed_pages(zone);
4959  		unsigned long high = high_wmark_pages(zone);
4960  		if (size > high)
4961  			sum += size - high;
4962  	}
4963  
4964  	return sum;
4965  }
4966  
4967  /**
4968   * nr_free_buffer_pages - count number of pages beyond high watermark
4969   *
4970   * nr_free_buffer_pages() counts the number of pages which are beyond the high
4971   * watermark within ZONE_DMA and ZONE_NORMAL.
4972   *
4973   * Return: number of pages beyond high watermark within ZONE_DMA and
4974   * ZONE_NORMAL.
4975   */
4976  unsigned long nr_free_buffer_pages(void)
4977  {
4978  	return nr_free_zone_pages(gfp_zone(GFP_USER));
4979  }
4980  EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4981  
4982  /**
4983   * nr_free_pagecache_pages - count number of pages beyond high watermark
4984   *
4985   * nr_free_pagecache_pages() counts the number of pages which are beyond the
4986   * high watermark within all zones.
4987   *
4988   * Return: number of pages beyond high watermark within all zones.
4989   */
4990  unsigned long nr_free_pagecache_pages(void)
4991  {
4992  	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4993  }
4994  
4995  static inline void show_node(struct zone *zone)
4996  {
4997  	if (IS_ENABLED(CONFIG_NUMA))
4998  		printk("Node %d ", zone_to_nid(zone));
4999  }
5000  
5001  long si_mem_available(void)
5002  {
5003  	long available;
5004  	unsigned long pagecache;
5005  	unsigned long wmark_low = 0;
5006  	unsigned long pages[NR_LRU_LISTS];
5007  	unsigned long reclaimable;
5008  	struct zone *zone;
5009  	int lru;
5010  
5011  	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
5012  		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
5013  
5014  	for_each_zone(zone)
5015  		wmark_low += low_wmark_pages(zone);
5016  
5017  	/*
5018  	 * Estimate the amount of memory available for userspace allocations,
5019  	 * without causing swapping.
5020  	 */
5021  	available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
5022  
5023  	/*
5024  	 * Not all the page cache can be freed, otherwise the system will
5025  	 * start swapping. Assume at least half of the page cache, or the
5026  	 * low watermark worth of cache, needs to stay.
5027  	 */
5028  	pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
5029  	pagecache -= min(pagecache / 2, wmark_low);
5030  	available += pagecache;
5031  
5032  	/*
5033  	 * Part of the reclaimable slab and other kernel memory consists of
5034  	 * items that are in use, and cannot be freed. Cap this estimate at the
5035  	 * low watermark.
5036  	 */
5037  	reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
5038  			global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5039  	available += reclaimable - min(reclaimable / 2, wmark_low);
5040  
5041  	if (available < 0)
5042  		available = 0;
5043  	return available;
5044  }
5045  EXPORT_SYMBOL_GPL(si_mem_available);
5046  
5047  void si_meminfo(struct sysinfo *val)
5048  {
5049  	val->totalram = totalram_pages();
5050  	val->sharedram = global_node_page_state(NR_SHMEM);
5051  	val->freeram = global_zone_page_state(NR_FREE_PAGES);
5052  	val->bufferram = nr_blockdev_pages();
5053  	val->totalhigh = totalhigh_pages();
5054  	val->freehigh = nr_free_highpages();
5055  	val->mem_unit = PAGE_SIZE;
5056  }
5057  
5058  EXPORT_SYMBOL(si_meminfo);
5059  
5060  #ifdef CONFIG_NUMA
5061  void si_meminfo_node(struct sysinfo *val, int nid)
5062  {
5063  	int zone_type;		/* needs to be signed */
5064  	unsigned long managed_pages = 0;
5065  	unsigned long managed_highpages = 0;
5066  	unsigned long free_highpages = 0;
5067  	pg_data_t *pgdat = NODE_DATA(nid);
5068  
5069  	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
5070  		managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
5071  	val->totalram = managed_pages;
5072  	val->sharedram = node_page_state(pgdat, NR_SHMEM);
5073  	val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
5074  #ifdef CONFIG_HIGHMEM
5075  	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
5076  		struct zone *zone = &pgdat->node_zones[zone_type];
5077  
5078  		if (is_highmem(zone)) {
5079  			managed_highpages += zone_managed_pages(zone);
5080  			free_highpages += zone_page_state(zone, NR_FREE_PAGES);
5081  		}
5082  	}
5083  	val->totalhigh = managed_highpages;
5084  	val->freehigh = free_highpages;
5085  #else
5086  	val->totalhigh = managed_highpages;
5087  	val->freehigh = free_highpages;
5088  #endif
5089  	val->mem_unit = PAGE_SIZE;
5090  }
5091  #endif
5092  
5093  /*
5094   * Determine whether the node should be displayed or not, depending on whether
5095   * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
5096   */
5097  static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
5098  {
5099  	if (!(flags & SHOW_MEM_FILTER_NODES))
5100  		return false;
5101  
5102  	/*
5103  	 * no node mask - aka implicit memory numa policy. Do not bother with
5104  	 * the synchronization - read_mems_allowed_begin - because we do not
5105  	 * have to be precise here.
5106  	 */
5107  	if (!nodemask)
5108  		nodemask = &cpuset_current_mems_allowed;
5109  
5110  	return !node_isset(nid, *nodemask);
5111  }
5112  
5113  #define K(x) ((x) << (PAGE_SHIFT-10))
5114  
5115  static void show_migration_types(unsigned char type)
5116  {
5117  	static const char types[MIGRATE_TYPES] = {
5118  		[MIGRATE_UNMOVABLE]	= 'U',
5119  		[MIGRATE_MOVABLE]	= 'M',
5120  		[MIGRATE_RECLAIMABLE]	= 'E',
5121  		[MIGRATE_HIGHATOMIC]	= 'H',
5122  #ifdef CONFIG_CMA
5123  		[MIGRATE_CMA]		= 'C',
5124  #endif
5125  #ifdef CONFIG_MEMORY_ISOLATION
5126  		[MIGRATE_ISOLATE]	= 'I',
5127  #endif
5128  	};
5129  	char tmp[MIGRATE_TYPES + 1];
5130  	char *p = tmp;
5131  	int i;
5132  
5133  	for (i = 0; i < MIGRATE_TYPES; i++) {
5134  		if (type & (1 << i))
5135  			*p++ = types[i];
5136  	}
5137  
5138  	*p = '\0';
5139  	printk(KERN_CONT "(%s) ", tmp);
5140  }
5141  
5142  /*
5143   * Show free area list (used inside shift_scroll-lock stuff)
5144   * We also calculate the percentage fragmentation. We do this by counting the
5145   * memory on each free list with the exception of the first item on the list.
5146   *
5147   * Bits in @filter:
5148   * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
5149   *   cpuset.
5150   */
5151  void show_free_areas(unsigned int filter, nodemask_t *nodemask)
5152  {
5153  	unsigned long free_pcp = 0;
5154  	int cpu;
5155  	struct zone *zone;
5156  	pg_data_t *pgdat;
5157  
5158  	for_each_populated_zone(zone) {
5159  		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5160  			continue;
5161  
5162  		for_each_online_cpu(cpu)
5163  			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
5164  	}
5165  
5166  	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
5167  		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5168  		" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
5169  		" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
5170  		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
5171  		" free:%lu free_pcp:%lu free_cma:%lu\n",
5172  		global_node_page_state(NR_ACTIVE_ANON),
5173  		global_node_page_state(NR_INACTIVE_ANON),
5174  		global_node_page_state(NR_ISOLATED_ANON),
5175  		global_node_page_state(NR_ACTIVE_FILE),
5176  		global_node_page_state(NR_INACTIVE_FILE),
5177  		global_node_page_state(NR_ISOLATED_FILE),
5178  		global_node_page_state(NR_UNEVICTABLE),
5179  		global_node_page_state(NR_FILE_DIRTY),
5180  		global_node_page_state(NR_WRITEBACK),
5181  		global_node_page_state(NR_UNSTABLE_NFS),
5182  		global_node_page_state(NR_SLAB_RECLAIMABLE),
5183  		global_node_page_state(NR_SLAB_UNRECLAIMABLE),
5184  		global_node_page_state(NR_FILE_MAPPED),
5185  		global_node_page_state(NR_SHMEM),
5186  		global_zone_page_state(NR_PAGETABLE),
5187  		global_zone_page_state(NR_BOUNCE),
5188  		global_zone_page_state(NR_FREE_PAGES),
5189  		free_pcp,
5190  		global_zone_page_state(NR_FREE_CMA_PAGES));
5191  
5192  	for_each_online_pgdat(pgdat) {
5193  		if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
5194  			continue;
5195  
5196  		printk("Node %d"
5197  			" active_anon:%lukB"
5198  			" inactive_anon:%lukB"
5199  			" active_file:%lukB"
5200  			" inactive_file:%lukB"
5201  			" unevictable:%lukB"
5202  			" isolated(anon):%lukB"
5203  			" isolated(file):%lukB"
5204  			" mapped:%lukB"
5205  			" dirty:%lukB"
5206  			" writeback:%lukB"
5207  			" shmem:%lukB"
5208  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5209  			" shmem_thp: %lukB"
5210  			" shmem_pmdmapped: %lukB"
5211  			" anon_thp: %lukB"
5212  #endif
5213  			" writeback_tmp:%lukB"
5214  			" unstable:%lukB"
5215  			" all_unreclaimable? %s"
5216  			"\n",
5217  			pgdat->node_id,
5218  			K(node_page_state(pgdat, NR_ACTIVE_ANON)),
5219  			K(node_page_state(pgdat, NR_INACTIVE_ANON)),
5220  			K(node_page_state(pgdat, NR_ACTIVE_FILE)),
5221  			K(node_page_state(pgdat, NR_INACTIVE_FILE)),
5222  			K(node_page_state(pgdat, NR_UNEVICTABLE)),
5223  			K(node_page_state(pgdat, NR_ISOLATED_ANON)),
5224  			K(node_page_state(pgdat, NR_ISOLATED_FILE)),
5225  			K(node_page_state(pgdat, NR_FILE_MAPPED)),
5226  			K(node_page_state(pgdat, NR_FILE_DIRTY)),
5227  			K(node_page_state(pgdat, NR_WRITEBACK)),
5228  			K(node_page_state(pgdat, NR_SHMEM)),
5229  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5230  			K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
5231  			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
5232  					* HPAGE_PMD_NR),
5233  			K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
5234  #endif
5235  			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
5236  			K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
5237  			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
5238  				"yes" : "no");
5239  	}
5240  
5241  	for_each_populated_zone(zone) {
5242  		int i;
5243  
5244  		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5245  			continue;
5246  
5247  		free_pcp = 0;
5248  		for_each_online_cpu(cpu)
5249  			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
5250  
5251  		show_node(zone);
5252  		printk(KERN_CONT
5253  			"%s"
5254  			" free:%lukB"
5255  			" min:%lukB"
5256  			" low:%lukB"
5257  			" high:%lukB"
5258  			" active_anon:%lukB"
5259  			" inactive_anon:%lukB"
5260  			" active_file:%lukB"
5261  			" inactive_file:%lukB"
5262  			" unevictable:%lukB"
5263  			" writepending:%lukB"
5264  			" present:%lukB"
5265  			" managed:%lukB"
5266  			" mlocked:%lukB"
5267  			" kernel_stack:%lukB"
5268  			" pagetables:%lukB"
5269  			" bounce:%lukB"
5270  			" free_pcp:%lukB"
5271  			" local_pcp:%ukB"
5272  			" free_cma:%lukB"
5273  			"\n",
5274  			zone->name,
5275  			K(zone_page_state(zone, NR_FREE_PAGES)),
5276  			K(min_wmark_pages(zone)),
5277  			K(low_wmark_pages(zone)),
5278  			K(high_wmark_pages(zone)),
5279  			K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
5280  			K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
5281  			K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
5282  			K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
5283  			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
5284  			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
5285  			K(zone->present_pages),
5286  			K(zone_managed_pages(zone)),
5287  			K(zone_page_state(zone, NR_MLOCK)),
5288  			zone_page_state(zone, NR_KERNEL_STACK_KB),
5289  			K(zone_page_state(zone, NR_PAGETABLE)),
5290  			K(zone_page_state(zone, NR_BOUNCE)),
5291  			K(free_pcp),
5292  			K(this_cpu_read(zone->pageset->pcp.count)),
5293  			K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
5294  		printk("lowmem_reserve[]:");
5295  		for (i = 0; i < MAX_NR_ZONES; i++)
5296  			printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
5297  		printk(KERN_CONT "\n");
5298  	}
5299  
5300  	for_each_populated_zone(zone) {
5301  		unsigned int order;
5302  		unsigned long nr[MAX_ORDER], flags, total = 0;
5303  		unsigned char types[MAX_ORDER];
5304  
5305  		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5306  			continue;
5307  		show_node(zone);
5308  		printk(KERN_CONT "%s: ", zone->name);
5309  
5310  		spin_lock_irqsave(&zone->lock, flags);
5311  		for (order = 0; order < MAX_ORDER; order++) {
5312  			struct free_area *area = &zone->free_area[order];
5313  			int type;
5314  
5315  			nr[order] = area->nr_free;
5316  			total += nr[order] << order;
5317  
5318  			types[order] = 0;
5319  			for (type = 0; type < MIGRATE_TYPES; type++) {
5320  				if (!free_area_empty(area, type))
5321  					types[order] |= 1 << type;
5322  			}
5323  		}
5324  		spin_unlock_irqrestore(&zone->lock, flags);
5325  		for (order = 0; order < MAX_ORDER; order++) {
5326  			printk(KERN_CONT "%lu*%lukB ",
5327  			       nr[order], K(1UL) << order);
5328  			if (nr[order])
5329  				show_migration_types(types[order]);
5330  		}
5331  		printk(KERN_CONT "= %lukB\n", K(total));
5332  	}
5333  
5334  	hugetlb_show_meminfo();
5335  
5336  	printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
5337  
5338  	show_swap_cache_info();
5339  }
5340  
5341  static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
5342  {
5343  	zoneref->zone = zone;
5344  	zoneref->zone_idx = zone_idx(zone);
5345  }
5346  
5347  /*
5348   * Builds allocation fallback zone lists.
5349   *
5350   * Add all populated zones of a node to the zonelist.
5351   */
5352  static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
5353  {
5354  	struct zone *zone;
5355  	enum zone_type zone_type = MAX_NR_ZONES;
5356  	int nr_zones = 0;
5357  
5358  	do {
5359  		zone_type--;
5360  		zone = pgdat->node_zones + zone_type;
5361  		if (managed_zone(zone)) {
5362  			zoneref_set_zone(zone, &zonerefs[nr_zones++]);
5363  			check_highest_zone(zone_type);
5364  		}
5365  	} while (zone_type);
5366  
5367  	return nr_zones;
5368  }
5369  
5370  #ifdef CONFIG_NUMA
5371  
5372  static int __parse_numa_zonelist_order(char *s)
5373  {
5374  	/*
5375  	 * We used to support different zonlists modes but they turned
5376  	 * out to be just not useful. Let's keep the warning in place
5377  	 * if somebody still use the cmd line parameter so that we do
5378  	 * not fail it silently
5379  	 */
5380  	if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
5381  		pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
5382  		return -EINVAL;
5383  	}
5384  	return 0;
5385  }
5386  
5387  static __init int setup_numa_zonelist_order(char *s)
5388  {
5389  	if (!s)
5390  		return 0;
5391  
5392  	return __parse_numa_zonelist_order(s);
5393  }
5394  early_param("numa_zonelist_order", setup_numa_zonelist_order);
5395  
5396  char numa_zonelist_order[] = "Node";
5397  
5398  /*
5399   * sysctl handler for numa_zonelist_order
5400   */
5401  int numa_zonelist_order_handler(struct ctl_table *table, int write,
5402  		void __user *buffer, size_t *length,
5403  		loff_t *ppos)
5404  {
5405  	char *str;
5406  	int ret;
5407  
5408  	if (!write)
5409  		return proc_dostring(table, write, buffer, length, ppos);
5410  	str = memdup_user_nul(buffer, 16);
5411  	if (IS_ERR(str))
5412  		return PTR_ERR(str);
5413  
5414  	ret = __parse_numa_zonelist_order(str);
5415  	kfree(str);
5416  	return ret;
5417  }
5418  
5419  
5420  #define MAX_NODE_LOAD (nr_online_nodes)
5421  static int node_load[MAX_NUMNODES];
5422  
5423  /**
5424   * find_next_best_node - find the next node that should appear in a given node's fallback list
5425   * @node: node whose fallback list we're appending
5426   * @used_node_mask: nodemask_t of already used nodes
5427   *
5428   * We use a number of factors to determine which is the next node that should
5429   * appear on a given node's fallback list.  The node should not have appeared
5430   * already in @node's fallback list, and it should be the next closest node
5431   * according to the distance array (which contains arbitrary distance values
5432   * from each node to each node in the system), and should also prefer nodes
5433   * with no CPUs, since presumably they'll have very little allocation pressure
5434   * on them otherwise.
5435   *
5436   * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
5437   */
5438  static int find_next_best_node(int node, nodemask_t *used_node_mask)
5439  {
5440  	int n, val;
5441  	int min_val = INT_MAX;
5442  	int best_node = NUMA_NO_NODE;
5443  	const struct cpumask *tmp = cpumask_of_node(0);
5444  
5445  	/* Use the local node if we haven't already */
5446  	if (!node_isset(node, *used_node_mask)) {
5447  		node_set(node, *used_node_mask);
5448  		return node;
5449  	}
5450  
5451  	for_each_node_state(n, N_MEMORY) {
5452  
5453  		/* Don't want a node to appear more than once */
5454  		if (node_isset(n, *used_node_mask))
5455  			continue;
5456  
5457  		/* Use the distance array to find the distance */
5458  		val = node_distance(node, n);
5459  
5460  		/* Penalize nodes under us ("prefer the next node") */
5461  		val += (n < node);
5462  
5463  		/* Give preference to headless and unused nodes */
5464  		tmp = cpumask_of_node(n);
5465  		if (!cpumask_empty(tmp))
5466  			val += PENALTY_FOR_NODE_WITH_CPUS;
5467  
5468  		/* Slight preference for less loaded node */
5469  		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
5470  		val += node_load[n];
5471  
5472  		if (val < min_val) {
5473  			min_val = val;
5474  			best_node = n;
5475  		}
5476  	}
5477  
5478  	if (best_node >= 0)
5479  		node_set(best_node, *used_node_mask);
5480  
5481  	return best_node;
5482  }
5483  
5484  
5485  /*
5486   * Build zonelists ordered by node and zones within node.
5487   * This results in maximum locality--normal zone overflows into local
5488   * DMA zone, if any--but risks exhausting DMA zone.
5489   */
5490  static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5491  		unsigned nr_nodes)
5492  {
5493  	struct zoneref *zonerefs;
5494  	int i;
5495  
5496  	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5497  
5498  	for (i = 0; i < nr_nodes; i++) {
5499  		int nr_zones;
5500  
5501  		pg_data_t *node = NODE_DATA(node_order[i]);
5502  
5503  		nr_zones = build_zonerefs_node(node, zonerefs);
5504  		zonerefs += nr_zones;
5505  	}
5506  	zonerefs->zone = NULL;
5507  	zonerefs->zone_idx = 0;
5508  }
5509  
5510  /*
5511   * Build gfp_thisnode zonelists
5512   */
5513  static void build_thisnode_zonelists(pg_data_t *pgdat)
5514  {
5515  	struct zoneref *zonerefs;
5516  	int nr_zones;
5517  
5518  	zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5519  	nr_zones = build_zonerefs_node(pgdat, zonerefs);
5520  	zonerefs += nr_zones;
5521  	zonerefs->zone = NULL;
5522  	zonerefs->zone_idx = 0;
5523  }
5524  
5525  /*
5526   * Build zonelists ordered by zone and nodes within zones.
5527   * This results in conserving DMA zone[s] until all Normal memory is
5528   * exhausted, but results in overflowing to remote node while memory
5529   * may still exist in local DMA zone.
5530   */
5531  
5532  static void build_zonelists(pg_data_t *pgdat)
5533  {
5534  	static int node_order[MAX_NUMNODES];
5535  	int node, load, nr_nodes = 0;
5536  	nodemask_t used_mask;
5537  	int local_node, prev_node;
5538  
5539  	/* NUMA-aware ordering of nodes */
5540  	local_node = pgdat->node_id;
5541  	load = nr_online_nodes;
5542  	prev_node = local_node;
5543  	nodes_clear(used_mask);
5544  
5545  	memset(node_order, 0, sizeof(node_order));
5546  	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5547  		/*
5548  		 * We don't want to pressure a particular node.
5549  		 * So adding penalty to the first node in same
5550  		 * distance group to make it round-robin.
5551  		 */
5552  		if (node_distance(local_node, node) !=
5553  		    node_distance(local_node, prev_node))
5554  			node_load[node] = load;
5555  
5556  		node_order[nr_nodes++] = node;
5557  		prev_node = node;
5558  		load--;
5559  	}
5560  
5561  	build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5562  	build_thisnode_zonelists(pgdat);
5563  }
5564  
5565  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5566  /*
5567   * Return node id of node used for "local" allocations.
5568   * I.e., first node id of first zone in arg node's generic zonelist.
5569   * Used for initializing percpu 'numa_mem', which is used primarily
5570   * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5571   */
5572  int local_memory_node(int node)
5573  {
5574  	struct zoneref *z;
5575  
5576  	z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5577  				   gfp_zone(GFP_KERNEL),
5578  				   NULL);
5579  	return zone_to_nid(z->zone);
5580  }
5581  #endif
5582  
5583  static void setup_min_unmapped_ratio(void);
5584  static void setup_min_slab_ratio(void);
5585  #else	/* CONFIG_NUMA */
5586  
5587  static void build_zonelists(pg_data_t *pgdat)
5588  {
5589  	int node, local_node;
5590  	struct zoneref *zonerefs;
5591  	int nr_zones;
5592  
5593  	local_node = pgdat->node_id;
5594  
5595  	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5596  	nr_zones = build_zonerefs_node(pgdat, zonerefs);
5597  	zonerefs += nr_zones;
5598  
5599  	/*
5600  	 * Now we build the zonelist so that it contains the zones
5601  	 * of all the other nodes.
5602  	 * We don't want to pressure a particular node, so when
5603  	 * building the zones for node N, we make sure that the
5604  	 * zones coming right after the local ones are those from
5605  	 * node N+1 (modulo N)
5606  	 */
5607  	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5608  		if (!node_online(node))
5609  			continue;
5610  		nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5611  		zonerefs += nr_zones;
5612  	}
5613  	for (node = 0; node < local_node; node++) {
5614  		if (!node_online(node))
5615  			continue;
5616  		nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5617  		zonerefs += nr_zones;
5618  	}
5619  
5620  	zonerefs->zone = NULL;
5621  	zonerefs->zone_idx = 0;
5622  }
5623  
5624  #endif	/* CONFIG_NUMA */
5625  
5626  /*
5627   * Boot pageset table. One per cpu which is going to be used for all
5628   * zones and all nodes. The parameters will be set in such a way
5629   * that an item put on a list will immediately be handed over to
5630   * the buddy list. This is safe since pageset manipulation is done
5631   * with interrupts disabled.
5632   *
5633   * The boot_pagesets must be kept even after bootup is complete for
5634   * unused processors and/or zones. They do play a role for bootstrapping
5635   * hotplugged processors.
5636   *
5637   * zoneinfo_show() and maybe other functions do
5638   * not check if the processor is online before following the pageset pointer.
5639   * Other parts of the kernel may not check if the zone is available.
5640   */
5641  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
5642  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
5643  static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
5644  
5645  static void __build_all_zonelists(void *data)
5646  {
5647  	int nid;
5648  	int __maybe_unused cpu;
5649  	pg_data_t *self = data;
5650  	static DEFINE_SPINLOCK(lock);
5651  
5652  	spin_lock(&lock);
5653  
5654  #ifdef CONFIG_NUMA
5655  	memset(node_load, 0, sizeof(node_load));
5656  #endif
5657  
5658  	/*
5659  	 * This node is hotadded and no memory is yet present.   So just
5660  	 * building zonelists is fine - no need to touch other nodes.
5661  	 */
5662  	if (self && !node_online(self->node_id)) {
5663  		build_zonelists(self);
5664  	} else {
5665  		for_each_online_node(nid) {
5666  			pg_data_t *pgdat = NODE_DATA(nid);
5667  
5668  			build_zonelists(pgdat);
5669  		}
5670  
5671  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5672  		/*
5673  		 * We now know the "local memory node" for each node--
5674  		 * i.e., the node of the first zone in the generic zonelist.
5675  		 * Set up numa_mem percpu variable for on-line cpus.  During
5676  		 * boot, only the boot cpu should be on-line;  we'll init the
5677  		 * secondary cpus' numa_mem as they come on-line.  During
5678  		 * node/memory hotplug, we'll fixup all on-line cpus.
5679  		 */
5680  		for_each_online_cpu(cpu)
5681  			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5682  #endif
5683  	}
5684  
5685  	spin_unlock(&lock);
5686  }
5687  
5688  static noinline void __init
5689  build_all_zonelists_init(void)
5690  {
5691  	int cpu;
5692  
5693  	__build_all_zonelists(NULL);
5694  
5695  	/*
5696  	 * Initialize the boot_pagesets that are going to be used
5697  	 * for bootstrapping processors. The real pagesets for
5698  	 * each zone will be allocated later when the per cpu
5699  	 * allocator is available.
5700  	 *
5701  	 * boot_pagesets are used also for bootstrapping offline
5702  	 * cpus if the system is already booted because the pagesets
5703  	 * are needed to initialize allocators on a specific cpu too.
5704  	 * F.e. the percpu allocator needs the page allocator which
5705  	 * needs the percpu allocator in order to allocate its pagesets
5706  	 * (a chicken-egg dilemma).
5707  	 */
5708  	for_each_possible_cpu(cpu)
5709  		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
5710  
5711  	mminit_verify_zonelist();
5712  	cpuset_init_current_mems_allowed();
5713  }
5714  
5715  /*
5716   * unless system_state == SYSTEM_BOOTING.
5717   *
5718   * __ref due to call of __init annotated helper build_all_zonelists_init
5719   * [protected by SYSTEM_BOOTING].
5720   */
5721  void __ref build_all_zonelists(pg_data_t *pgdat)
5722  {
5723  	if (system_state == SYSTEM_BOOTING) {
5724  		build_all_zonelists_init();
5725  	} else {
5726  		__build_all_zonelists(pgdat);
5727  		/* cpuset refresh routine should be here */
5728  	}
5729  	vm_total_pages = nr_free_pagecache_pages();
5730  	/*
5731  	 * Disable grouping by mobility if the number of pages in the
5732  	 * system is too low to allow the mechanism to work. It would be
5733  	 * more accurate, but expensive to check per-zone. This check is
5734  	 * made on memory-hotadd so a system can start with mobility
5735  	 * disabled and enable it later
5736  	 */
5737  	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5738  		page_group_by_mobility_disabled = 1;
5739  	else
5740  		page_group_by_mobility_disabled = 0;
5741  
5742  	pr_info("Built %u zonelists, mobility grouping %s.  Total pages: %ld\n",
5743  		nr_online_nodes,
5744  		page_group_by_mobility_disabled ? "off" : "on",
5745  		vm_total_pages);
5746  #ifdef CONFIG_NUMA
5747  	pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5748  #endif
5749  }
5750  
5751  /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
5752  static bool __meminit
5753  overlap_memmap_init(unsigned long zone, unsigned long *pfn)
5754  {
5755  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5756  	static struct memblock_region *r;
5757  
5758  	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5759  		if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
5760  			for_each_memblock(memory, r) {
5761  				if (*pfn < memblock_region_memory_end_pfn(r))
5762  					break;
5763  			}
5764  		}
5765  		if (*pfn >= memblock_region_memory_base_pfn(r) &&
5766  		    memblock_is_mirror(r)) {
5767  			*pfn = memblock_region_memory_end_pfn(r);
5768  			return true;
5769  		}
5770  	}
5771  #endif
5772  	return false;
5773  }
5774  
5775  /*
5776   * Initially all pages are reserved - free ones are freed
5777   * up by memblock_free_all() once the early boot process is
5778   * done. Non-atomic initialization, single-pass.
5779   */
5780  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5781  		unsigned long start_pfn, enum memmap_context context,
5782  		struct vmem_altmap *altmap)
5783  {
5784  	unsigned long pfn, end_pfn = start_pfn + size;
5785  	struct page *page;
5786  
5787  	if (highest_memmap_pfn < end_pfn - 1)
5788  		highest_memmap_pfn = end_pfn - 1;
5789  
5790  #ifdef CONFIG_ZONE_DEVICE
5791  	/*
5792  	 * Honor reservation requested by the driver for this ZONE_DEVICE
5793  	 * memory. We limit the total number of pages to initialize to just
5794  	 * those that might contain the memory mapping. We will defer the
5795  	 * ZONE_DEVICE page initialization until after we have released
5796  	 * the hotplug lock.
5797  	 */
5798  	if (zone == ZONE_DEVICE) {
5799  		if (!altmap)
5800  			return;
5801  
5802  		if (start_pfn == altmap->base_pfn)
5803  			start_pfn += altmap->reserve;
5804  		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
5805  	}
5806  #endif
5807  
5808  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5809  		/*
5810  		 * There can be holes in boot-time mem_map[]s handed to this
5811  		 * function.  They do not exist on hotplugged memory.
5812  		 */
5813  		if (context == MEMMAP_EARLY) {
5814  			if (!early_pfn_valid(pfn))
5815  				continue;
5816  			if (!early_pfn_in_nid(pfn, nid))
5817  				continue;
5818  			if (overlap_memmap_init(zone, &pfn))
5819  				continue;
5820  			if (defer_init(nid, pfn, end_pfn))
5821  				break;
5822  		}
5823  
5824  		page = pfn_to_page(pfn);
5825  		__init_single_page(page, pfn, zone, nid);
5826  		if (context == MEMMAP_HOTPLUG)
5827  			__SetPageReserved(page);
5828  
5829  		/*
5830  		 * Mark the block movable so that blocks are reserved for
5831  		 * movable at startup. This will force kernel allocations
5832  		 * to reserve their blocks rather than leaking throughout
5833  		 * the address space during boot when many long-lived
5834  		 * kernel allocations are made.
5835  		 *
5836  		 * bitmap is created for zone's valid pfn range. but memmap
5837  		 * can be created for invalid pages (for alignment)
5838  		 * check here not to call set_pageblock_migratetype() against
5839  		 * pfn out of zone.
5840  		 */
5841  		if (!(pfn & (pageblock_nr_pages - 1))) {
5842  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5843  			cond_resched();
5844  		}
5845  	}
5846  }
5847  
5848  #ifdef CONFIG_ZONE_DEVICE
5849  void __ref memmap_init_zone_device(struct zone *zone,
5850  				   unsigned long start_pfn,
5851  				   unsigned long size,
5852  				   struct dev_pagemap *pgmap)
5853  {
5854  	unsigned long pfn, end_pfn = start_pfn + size;
5855  	struct pglist_data *pgdat = zone->zone_pgdat;
5856  	unsigned long zone_idx = zone_idx(zone);
5857  	unsigned long start = jiffies;
5858  	int nid = pgdat->node_id;
5859  
5860  	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
5861  		return;
5862  
5863  	/*
5864  	 * The call to memmap_init_zone should have already taken care
5865  	 * of the pages reserved for the memmap, so we can just jump to
5866  	 * the end of that region and start processing the device pages.
5867  	 */
5868  	if (pgmap->altmap_valid) {
5869  		struct vmem_altmap *altmap = &pgmap->altmap;
5870  
5871  		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
5872  		size = end_pfn - start_pfn;
5873  	}
5874  
5875  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5876  		struct page *page = pfn_to_page(pfn);
5877  
5878  		__init_single_page(page, pfn, zone_idx, nid);
5879  
5880  		/*
5881  		 * Mark page reserved as it will need to wait for onlining
5882  		 * phase for it to be fully associated with a zone.
5883  		 *
5884  		 * We can use the non-atomic __set_bit operation for setting
5885  		 * the flag as we are still initializing the pages.
5886  		 */
5887  		__SetPageReserved(page);
5888  
5889  		/*
5890  		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
5891  		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
5892  		 * page is ever freed or placed on a driver-private list.
5893  		 */
5894  		page->pgmap = pgmap;
5895  		page->hmm_data = 0;
5896  
5897  		/*
5898  		 * Mark the block movable so that blocks are reserved for
5899  		 * movable at startup. This will force kernel allocations
5900  		 * to reserve their blocks rather than leaking throughout
5901  		 * the address space during boot when many long-lived
5902  		 * kernel allocations are made.
5903  		 *
5904  		 * bitmap is created for zone's valid pfn range. but memmap
5905  		 * can be created for invalid pages (for alignment)
5906  		 * check here not to call set_pageblock_migratetype() against
5907  		 * pfn out of zone.
5908  		 *
5909  		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
5910  		 * because this is done early in sparse_add_one_section
5911  		 */
5912  		if (!(pfn & (pageblock_nr_pages - 1))) {
5913  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5914  			cond_resched();
5915  		}
5916  	}
5917  
5918  	pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
5919  		size, jiffies_to_msecs(jiffies - start));
5920  }
5921  
5922  #endif
5923  static void __meminit zone_init_free_lists(struct zone *zone)
5924  {
5925  	unsigned int order, t;
5926  	for_each_migratetype_order(order, t) {
5927  		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
5928  		zone->free_area[order].nr_free = 0;
5929  	}
5930  }
5931  
5932  void __meminit __weak memmap_init(unsigned long size, int nid,
5933  				  unsigned long zone, unsigned long start_pfn)
5934  {
5935  	memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
5936  }
5937  
5938  static int zone_batchsize(struct zone *zone)
5939  {
5940  #ifdef CONFIG_MMU
5941  	int batch;
5942  
5943  	/*
5944  	 * The per-cpu-pages pools are set to around 1000th of the
5945  	 * size of the zone.
5946  	 */
5947  	batch = zone_managed_pages(zone) / 1024;
5948  	/* But no more than a meg. */
5949  	if (batch * PAGE_SIZE > 1024 * 1024)
5950  		batch = (1024 * 1024) / PAGE_SIZE;
5951  	batch /= 4;		/* We effectively *= 4 below */
5952  	if (batch < 1)
5953  		batch = 1;
5954  
5955  	/*
5956  	 * Clamp the batch to a 2^n - 1 value. Having a power
5957  	 * of 2 value was found to be more likely to have
5958  	 * suboptimal cache aliasing properties in some cases.
5959  	 *
5960  	 * For example if 2 tasks are alternately allocating
5961  	 * batches of pages, one task can end up with a lot
5962  	 * of pages of one half of the possible page colors
5963  	 * and the other with pages of the other colors.
5964  	 */
5965  	batch = rounddown_pow_of_two(batch + batch/2) - 1;
5966  
5967  	return batch;
5968  
5969  #else
5970  	/* The deferral and batching of frees should be suppressed under NOMMU
5971  	 * conditions.
5972  	 *
5973  	 * The problem is that NOMMU needs to be able to allocate large chunks
5974  	 * of contiguous memory as there's no hardware page translation to
5975  	 * assemble apparent contiguous memory from discontiguous pages.
5976  	 *
5977  	 * Queueing large contiguous runs of pages for batching, however,
5978  	 * causes the pages to actually be freed in smaller chunks.  As there
5979  	 * can be a significant delay between the individual batches being
5980  	 * recycled, this leads to the once large chunks of space being
5981  	 * fragmented and becoming unavailable for high-order allocations.
5982  	 */
5983  	return 0;
5984  #endif
5985  }
5986  
5987  /*
5988   * pcp->high and pcp->batch values are related and dependent on one another:
5989   * ->batch must never be higher then ->high.
5990   * The following function updates them in a safe manner without read side
5991   * locking.
5992   *
5993   * Any new users of pcp->batch and pcp->high should ensure they can cope with
5994   * those fields changing asynchronously (acording the the above rule).
5995   *
5996   * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
5997   * outside of boot time (or some other assurance that no concurrent updaters
5998   * exist).
5999   */
6000  static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
6001  		unsigned long batch)
6002  {
6003         /* start with a fail safe value for batch */
6004  	pcp->batch = 1;
6005  	smp_wmb();
6006  
6007         /* Update high, then batch, in order */
6008  	pcp->high = high;
6009  	smp_wmb();
6010  
6011  	pcp->batch = batch;
6012  }
6013  
6014  /* a companion to pageset_set_high() */
6015  static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
6016  {
6017  	pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
6018  }
6019  
6020  static void pageset_init(struct per_cpu_pageset *p)
6021  {
6022  	struct per_cpu_pages *pcp;
6023  	int migratetype;
6024  
6025  	memset(p, 0, sizeof(*p));
6026  
6027  	pcp = &p->pcp;
6028  	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
6029  		INIT_LIST_HEAD(&pcp->lists[migratetype]);
6030  }
6031  
6032  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
6033  {
6034  	pageset_init(p);
6035  	pageset_set_batch(p, batch);
6036  }
6037  
6038  /*
6039   * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
6040   * to the value high for the pageset p.
6041   */
6042  static void pageset_set_high(struct per_cpu_pageset *p,
6043  				unsigned long high)
6044  {
6045  	unsigned long batch = max(1UL, high / 4);
6046  	if ((high / 4) > (PAGE_SHIFT * 8))
6047  		batch = PAGE_SHIFT * 8;
6048  
6049  	pageset_update(&p->pcp, high, batch);
6050  }
6051  
6052  static void pageset_set_high_and_batch(struct zone *zone,
6053  				       struct per_cpu_pageset *pcp)
6054  {
6055  	if (percpu_pagelist_fraction)
6056  		pageset_set_high(pcp,
6057  			(zone_managed_pages(zone) /
6058  				percpu_pagelist_fraction));
6059  	else
6060  		pageset_set_batch(pcp, zone_batchsize(zone));
6061  }
6062  
6063  static void __meminit zone_pageset_init(struct zone *zone, int cpu)
6064  {
6065  	struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
6066  
6067  	pageset_init(pcp);
6068  	pageset_set_high_and_batch(zone, pcp);
6069  }
6070  
6071  void __meminit setup_zone_pageset(struct zone *zone)
6072  {
6073  	int cpu;
6074  	zone->pageset = alloc_percpu(struct per_cpu_pageset);
6075  	for_each_possible_cpu(cpu)
6076  		zone_pageset_init(zone, cpu);
6077  }
6078  
6079  /*
6080   * Allocate per cpu pagesets and initialize them.
6081   * Before this call only boot pagesets were available.
6082   */
6083  void __init setup_per_cpu_pageset(void)
6084  {
6085  	struct pglist_data *pgdat;
6086  	struct zone *zone;
6087  
6088  	for_each_populated_zone(zone)
6089  		setup_zone_pageset(zone);
6090  
6091  	for_each_online_pgdat(pgdat)
6092  		pgdat->per_cpu_nodestats =
6093  			alloc_percpu(struct per_cpu_nodestat);
6094  }
6095  
6096  static __meminit void zone_pcp_init(struct zone *zone)
6097  {
6098  	/*
6099  	 * per cpu subsystem is not up at this point. The following code
6100  	 * relies on the ability of the linker to provide the
6101  	 * offset of a (static) per cpu variable into the per cpu area.
6102  	 */
6103  	zone->pageset = &boot_pageset;
6104  
6105  	if (populated_zone(zone))
6106  		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
6107  			zone->name, zone->present_pages,
6108  					 zone_batchsize(zone));
6109  }
6110  
6111  void __meminit init_currently_empty_zone(struct zone *zone,
6112  					unsigned long zone_start_pfn,
6113  					unsigned long size)
6114  {
6115  	struct pglist_data *pgdat = zone->zone_pgdat;
6116  	int zone_idx = zone_idx(zone) + 1;
6117  
6118  	if (zone_idx > pgdat->nr_zones)
6119  		pgdat->nr_zones = zone_idx;
6120  
6121  	zone->zone_start_pfn = zone_start_pfn;
6122  
6123  	mminit_dprintk(MMINIT_TRACE, "memmap_init",
6124  			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
6125  			pgdat->node_id,
6126  			(unsigned long)zone_idx(zone),
6127  			zone_start_pfn, (zone_start_pfn + size));
6128  
6129  	zone_init_free_lists(zone);
6130  	zone->initialized = 1;
6131  }
6132  
6133  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6134  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
6135  
6136  /*
6137   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
6138   */
6139  int __meminit __early_pfn_to_nid(unsigned long pfn,
6140  					struct mminit_pfnnid_cache *state)
6141  {
6142  	unsigned long start_pfn, end_pfn;
6143  	int nid;
6144  
6145  	if (state->last_start <= pfn && pfn < state->last_end)
6146  		return state->last_nid;
6147  
6148  	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
6149  	if (nid != NUMA_NO_NODE) {
6150  		state->last_start = start_pfn;
6151  		state->last_end = end_pfn;
6152  		state->last_nid = nid;
6153  	}
6154  
6155  	return nid;
6156  }
6157  #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
6158  
6159  /**
6160   * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
6161   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
6162   * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
6163   *
6164   * If an architecture guarantees that all ranges registered contain no holes
6165   * and may be freed, this this function may be used instead of calling
6166   * memblock_free_early_nid() manually.
6167   */
6168  void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
6169  {
6170  	unsigned long start_pfn, end_pfn;
6171  	int i, this_nid;
6172  
6173  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
6174  		start_pfn = min(start_pfn, max_low_pfn);
6175  		end_pfn = min(end_pfn, max_low_pfn);
6176  
6177  		if (start_pfn < end_pfn)
6178  			memblock_free_early_nid(PFN_PHYS(start_pfn),
6179  					(end_pfn - start_pfn) << PAGE_SHIFT,
6180  					this_nid);
6181  	}
6182  }
6183  
6184  /**
6185   * sparse_memory_present_with_active_regions - Call memory_present for each active range
6186   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
6187   *
6188   * If an architecture guarantees that all ranges registered contain no holes and may
6189   * be freed, this function may be used instead of calling memory_present() manually.
6190   */
6191  void __init sparse_memory_present_with_active_regions(int nid)
6192  {
6193  	unsigned long start_pfn, end_pfn;
6194  	int i, this_nid;
6195  
6196  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
6197  		memory_present(this_nid, start_pfn, end_pfn);
6198  }
6199  
6200  /**
6201   * get_pfn_range_for_nid - Return the start and end page frames for a node
6202   * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
6203   * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
6204   * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
6205   *
6206   * It returns the start and end page frame of a node based on information
6207   * provided by memblock_set_node(). If called for a node
6208   * with no available memory, a warning is printed and the start and end
6209   * PFNs will be 0.
6210   */
6211  void __init get_pfn_range_for_nid(unsigned int nid,
6212  			unsigned long *start_pfn, unsigned long *end_pfn)
6213  {
6214  	unsigned long this_start_pfn, this_end_pfn;
6215  	int i;
6216  
6217  	*start_pfn = -1UL;
6218  	*end_pfn = 0;
6219  
6220  	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
6221  		*start_pfn = min(*start_pfn, this_start_pfn);
6222  		*end_pfn = max(*end_pfn, this_end_pfn);
6223  	}
6224  
6225  	if (*start_pfn == -1UL)
6226  		*start_pfn = 0;
6227  }
6228  
6229  /*
6230   * This finds a zone that can be used for ZONE_MOVABLE pages. The
6231   * assumption is made that zones within a node are ordered in monotonic
6232   * increasing memory addresses so that the "highest" populated zone is used
6233   */
6234  static void __init find_usable_zone_for_movable(void)
6235  {
6236  	int zone_index;
6237  	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
6238  		if (zone_index == ZONE_MOVABLE)
6239  			continue;
6240  
6241  		if (arch_zone_highest_possible_pfn[zone_index] >
6242  				arch_zone_lowest_possible_pfn[zone_index])
6243  			break;
6244  	}
6245  
6246  	VM_BUG_ON(zone_index == -1);
6247  	movable_zone = zone_index;
6248  }
6249  
6250  /*
6251   * The zone ranges provided by the architecture do not include ZONE_MOVABLE
6252   * because it is sized independent of architecture. Unlike the other zones,
6253   * the starting point for ZONE_MOVABLE is not fixed. It may be different
6254   * in each node depending on the size of each node and how evenly kernelcore
6255   * is distributed. This helper function adjusts the zone ranges
6256   * provided by the architecture for a given node by using the end of the
6257   * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
6258   * zones within a node are in order of monotonic increases memory addresses
6259   */
6260  static void __init adjust_zone_range_for_zone_movable(int nid,
6261  					unsigned long zone_type,
6262  					unsigned long node_start_pfn,
6263  					unsigned long node_end_pfn,
6264  					unsigned long *zone_start_pfn,
6265  					unsigned long *zone_end_pfn)
6266  {
6267  	/* Only adjust if ZONE_MOVABLE is on this node */
6268  	if (zone_movable_pfn[nid]) {
6269  		/* Size ZONE_MOVABLE */
6270  		if (zone_type == ZONE_MOVABLE) {
6271  			*zone_start_pfn = zone_movable_pfn[nid];
6272  			*zone_end_pfn = min(node_end_pfn,
6273  				arch_zone_highest_possible_pfn[movable_zone]);
6274  
6275  		/* Adjust for ZONE_MOVABLE starting within this range */
6276  		} else if (!mirrored_kernelcore &&
6277  			*zone_start_pfn < zone_movable_pfn[nid] &&
6278  			*zone_end_pfn > zone_movable_pfn[nid]) {
6279  			*zone_end_pfn = zone_movable_pfn[nid];
6280  
6281  		/* Check if this whole range is within ZONE_MOVABLE */
6282  		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
6283  			*zone_start_pfn = *zone_end_pfn;
6284  	}
6285  }
6286  
6287  /*
6288   * Return the number of pages a zone spans in a node, including holes
6289   * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
6290   */
6291  static unsigned long __init zone_spanned_pages_in_node(int nid,
6292  					unsigned long zone_type,
6293  					unsigned long node_start_pfn,
6294  					unsigned long node_end_pfn,
6295  					unsigned long *zone_start_pfn,
6296  					unsigned long *zone_end_pfn,
6297  					unsigned long *ignored)
6298  {
6299  	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
6300  	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
6301  	/* When hotadd a new node from cpu_up(), the node should be empty */
6302  	if (!node_start_pfn && !node_end_pfn)
6303  		return 0;
6304  
6305  	/* Get the start and end of the zone */
6306  	*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
6307  	*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
6308  	adjust_zone_range_for_zone_movable(nid, zone_type,
6309  				node_start_pfn, node_end_pfn,
6310  				zone_start_pfn, zone_end_pfn);
6311  
6312  	/* Check that this node has pages within the zone's required range */
6313  	if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
6314  		return 0;
6315  
6316  	/* Move the zone boundaries inside the node if necessary */
6317  	*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
6318  	*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
6319  
6320  	/* Return the spanned pages */
6321  	return *zone_end_pfn - *zone_start_pfn;
6322  }
6323  
6324  /*
6325   * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
6326   * then all holes in the requested range will be accounted for.
6327   */
6328  unsigned long __init __absent_pages_in_range(int nid,
6329  				unsigned long range_start_pfn,
6330  				unsigned long range_end_pfn)
6331  {
6332  	unsigned long nr_absent = range_end_pfn - range_start_pfn;
6333  	unsigned long start_pfn, end_pfn;
6334  	int i;
6335  
6336  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
6337  		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
6338  		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
6339  		nr_absent -= end_pfn - start_pfn;
6340  	}
6341  	return nr_absent;
6342  }
6343  
6344  /**
6345   * absent_pages_in_range - Return number of page frames in holes within a range
6346   * @start_pfn: The start PFN to start searching for holes
6347   * @end_pfn: The end PFN to stop searching for holes
6348   *
6349   * Return: the number of pages frames in memory holes within a range.
6350   */
6351  unsigned long __init absent_pages_in_range(unsigned long start_pfn,
6352  							unsigned long end_pfn)
6353  {
6354  	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
6355  }
6356  
6357  /* Return the number of page frames in holes in a zone on a node */
6358  static unsigned long __init zone_absent_pages_in_node(int nid,
6359  					unsigned long zone_type,
6360  					unsigned long node_start_pfn,
6361  					unsigned long node_end_pfn,
6362  					unsigned long *ignored)
6363  {
6364  	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
6365  	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
6366  	unsigned long zone_start_pfn, zone_end_pfn;
6367  	unsigned long nr_absent;
6368  
6369  	/* When hotadd a new node from cpu_up(), the node should be empty */
6370  	if (!node_start_pfn && !node_end_pfn)
6371  		return 0;
6372  
6373  	zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
6374  	zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
6375  
6376  	adjust_zone_range_for_zone_movable(nid, zone_type,
6377  			node_start_pfn, node_end_pfn,
6378  			&zone_start_pfn, &zone_end_pfn);
6379  	nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
6380  
6381  	/*
6382  	 * ZONE_MOVABLE handling.
6383  	 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
6384  	 * and vice versa.
6385  	 */
6386  	if (mirrored_kernelcore && zone_movable_pfn[nid]) {
6387  		unsigned long start_pfn, end_pfn;
6388  		struct memblock_region *r;
6389  
6390  		for_each_memblock(memory, r) {
6391  			start_pfn = clamp(memblock_region_memory_base_pfn(r),
6392  					  zone_start_pfn, zone_end_pfn);
6393  			end_pfn = clamp(memblock_region_memory_end_pfn(r),
6394  					zone_start_pfn, zone_end_pfn);
6395  
6396  			if (zone_type == ZONE_MOVABLE &&
6397  			    memblock_is_mirror(r))
6398  				nr_absent += end_pfn - start_pfn;
6399  
6400  			if (zone_type == ZONE_NORMAL &&
6401  			    !memblock_is_mirror(r))
6402  				nr_absent += end_pfn - start_pfn;
6403  		}
6404  	}
6405  
6406  	return nr_absent;
6407  }
6408  
6409  #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6410  static inline unsigned long __init zone_spanned_pages_in_node(int nid,
6411  					unsigned long zone_type,
6412  					unsigned long node_start_pfn,
6413  					unsigned long node_end_pfn,
6414  					unsigned long *zone_start_pfn,
6415  					unsigned long *zone_end_pfn,
6416  					unsigned long *zones_size)
6417  {
6418  	unsigned int zone;
6419  
6420  	*zone_start_pfn = node_start_pfn;
6421  	for (zone = 0; zone < zone_type; zone++)
6422  		*zone_start_pfn += zones_size[zone];
6423  
6424  	*zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
6425  
6426  	return zones_size[zone_type];
6427  }
6428  
6429  static inline unsigned long __init zone_absent_pages_in_node(int nid,
6430  						unsigned long zone_type,
6431  						unsigned long node_start_pfn,
6432  						unsigned long node_end_pfn,
6433  						unsigned long *zholes_size)
6434  {
6435  	if (!zholes_size)
6436  		return 0;
6437  
6438  	return zholes_size[zone_type];
6439  }
6440  
6441  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6442  
6443  static void __init calculate_node_totalpages(struct pglist_data *pgdat,
6444  						unsigned long node_start_pfn,
6445  						unsigned long node_end_pfn,
6446  						unsigned long *zones_size,
6447  						unsigned long *zholes_size)
6448  {
6449  	unsigned long realtotalpages = 0, totalpages = 0;
6450  	enum zone_type i;
6451  
6452  	for (i = 0; i < MAX_NR_ZONES; i++) {
6453  		struct zone *zone = pgdat->node_zones + i;
6454  		unsigned long zone_start_pfn, zone_end_pfn;
6455  		unsigned long size, real_size;
6456  
6457  		size = zone_spanned_pages_in_node(pgdat->node_id, i,
6458  						  node_start_pfn,
6459  						  node_end_pfn,
6460  						  &zone_start_pfn,
6461  						  &zone_end_pfn,
6462  						  zones_size);
6463  		real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
6464  						  node_start_pfn, node_end_pfn,
6465  						  zholes_size);
6466  		if (size)
6467  			zone->zone_start_pfn = zone_start_pfn;
6468  		else
6469  			zone->zone_start_pfn = 0;
6470  		zone->spanned_pages = size;
6471  		zone->present_pages = real_size;
6472  
6473  		totalpages += size;
6474  		realtotalpages += real_size;
6475  	}
6476  
6477  	pgdat->node_spanned_pages = totalpages;
6478  	pgdat->node_present_pages = realtotalpages;
6479  	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
6480  							realtotalpages);
6481  }
6482  
6483  #ifndef CONFIG_SPARSEMEM
6484  /*
6485   * Calculate the size of the zone->blockflags rounded to an unsigned long
6486   * Start by making sure zonesize is a multiple of pageblock_order by rounding
6487   * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
6488   * round what is now in bits to nearest long in bits, then return it in
6489   * bytes.
6490   */
6491  static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
6492  {
6493  	unsigned long usemapsize;
6494  
6495  	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
6496  	usemapsize = roundup(zonesize, pageblock_nr_pages);
6497  	usemapsize = usemapsize >> pageblock_order;
6498  	usemapsize *= NR_PAGEBLOCK_BITS;
6499  	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
6500  
6501  	return usemapsize / 8;
6502  }
6503  
6504  static void __ref setup_usemap(struct pglist_data *pgdat,
6505  				struct zone *zone,
6506  				unsigned long zone_start_pfn,
6507  				unsigned long zonesize)
6508  {
6509  	unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
6510  	zone->pageblock_flags = NULL;
6511  	if (usemapsize) {
6512  		zone->pageblock_flags =
6513  			memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
6514  					    pgdat->node_id);
6515  		if (!zone->pageblock_flags)
6516  			panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
6517  			      usemapsize, zone->name, pgdat->node_id);
6518  	}
6519  }
6520  #else
6521  static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
6522  				unsigned long zone_start_pfn, unsigned long zonesize) {}
6523  #endif /* CONFIG_SPARSEMEM */
6524  
6525  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
6526  
6527  /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
6528  void __init set_pageblock_order(void)
6529  {
6530  	unsigned int order;
6531  
6532  	/* Check that pageblock_nr_pages has not already been setup */
6533  	if (pageblock_order)
6534  		return;
6535  
6536  	if (HPAGE_SHIFT > PAGE_SHIFT)
6537  		order = HUGETLB_PAGE_ORDER;
6538  	else
6539  		order = MAX_ORDER - 1;
6540  
6541  	/*
6542  	 * Assume the largest contiguous order of interest is a huge page.
6543  	 * This value may be variable depending on boot parameters on IA64 and
6544  	 * powerpc.
6545  	 */
6546  	pageblock_order = order;
6547  }
6548  #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
6549  
6550  /*
6551   * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
6552   * is unused as pageblock_order is set at compile-time. See
6553   * include/linux/pageblock-flags.h for the values of pageblock_order based on
6554   * the kernel config
6555   */
6556  void __init set_pageblock_order(void)
6557  {
6558  }
6559  
6560  #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
6561  
6562  static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
6563  						unsigned long present_pages)
6564  {
6565  	unsigned long pages = spanned_pages;
6566  
6567  	/*
6568  	 * Provide a more accurate estimation if there are holes within
6569  	 * the zone and SPARSEMEM is in use. If there are holes within the
6570  	 * zone, each populated memory region may cost us one or two extra
6571  	 * memmap pages due to alignment because memmap pages for each
6572  	 * populated regions may not be naturally aligned on page boundary.
6573  	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
6574  	 */
6575  	if (spanned_pages > present_pages + (present_pages >> 4) &&
6576  	    IS_ENABLED(CONFIG_SPARSEMEM))
6577  		pages = present_pages;
6578  
6579  	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
6580  }
6581  
6582  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6583  static void pgdat_init_split_queue(struct pglist_data *pgdat)
6584  {
6585  	spin_lock_init(&pgdat->split_queue_lock);
6586  	INIT_LIST_HEAD(&pgdat->split_queue);
6587  	pgdat->split_queue_len = 0;
6588  }
6589  #else
6590  static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
6591  #endif
6592  
6593  #ifdef CONFIG_COMPACTION
6594  static void pgdat_init_kcompactd(struct pglist_data *pgdat)
6595  {
6596  	init_waitqueue_head(&pgdat->kcompactd_wait);
6597  }
6598  #else
6599  static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
6600  #endif
6601  
6602  static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
6603  {
6604  	pgdat_resize_init(pgdat);
6605  
6606  	pgdat_init_split_queue(pgdat);
6607  	pgdat_init_kcompactd(pgdat);
6608  
6609  	init_waitqueue_head(&pgdat->kswapd_wait);
6610  	init_waitqueue_head(&pgdat->pfmemalloc_wait);
6611  
6612  	pgdat_page_ext_init(pgdat);
6613  	spin_lock_init(&pgdat->lru_lock);
6614  	lruvec_init(node_lruvec(pgdat));
6615  }
6616  
6617  static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
6618  							unsigned long remaining_pages)
6619  {
6620  	atomic_long_set(&zone->managed_pages, remaining_pages);
6621  	zone_set_nid(zone, nid);
6622  	zone->name = zone_names[idx];
6623  	zone->zone_pgdat = NODE_DATA(nid);
6624  	spin_lock_init(&zone->lock);
6625  	zone_seqlock_init(zone);
6626  	zone_pcp_init(zone);
6627  }
6628  
6629  /*
6630   * Set up the zone data structures
6631   * - init pgdat internals
6632   * - init all zones belonging to this node
6633   *
6634   * NOTE: this function is only called during memory hotplug
6635   */
6636  #ifdef CONFIG_MEMORY_HOTPLUG
6637  void __ref free_area_init_core_hotplug(int nid)
6638  {
6639  	enum zone_type z;
6640  	pg_data_t *pgdat = NODE_DATA(nid);
6641  
6642  	pgdat_init_internals(pgdat);
6643  	for (z = 0; z < MAX_NR_ZONES; z++)
6644  		zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
6645  }
6646  #endif
6647  
6648  /*
6649   * Set up the zone data structures:
6650   *   - mark all pages reserved
6651   *   - mark all memory queues empty
6652   *   - clear the memory bitmaps
6653   *
6654   * NOTE: pgdat should get zeroed by caller.
6655   * NOTE: this function is only called during early init.
6656   */
6657  static void __init free_area_init_core(struct pglist_data *pgdat)
6658  {
6659  	enum zone_type j;
6660  	int nid = pgdat->node_id;
6661  
6662  	pgdat_init_internals(pgdat);
6663  	pgdat->per_cpu_nodestats = &boot_nodestats;
6664  
6665  	for (j = 0; j < MAX_NR_ZONES; j++) {
6666  		struct zone *zone = pgdat->node_zones + j;
6667  		unsigned long size, freesize, memmap_pages;
6668  		unsigned long zone_start_pfn = zone->zone_start_pfn;
6669  
6670  		size = zone->spanned_pages;
6671  		freesize = zone->present_pages;
6672  
6673  		/*
6674  		 * Adjust freesize so that it accounts for how much memory
6675  		 * is used by this zone for memmap. This affects the watermark
6676  		 * and per-cpu initialisations
6677  		 */
6678  		memmap_pages = calc_memmap_size(size, freesize);
6679  		if (!is_highmem_idx(j)) {
6680  			if (freesize >= memmap_pages) {
6681  				freesize -= memmap_pages;
6682  				if (memmap_pages)
6683  					printk(KERN_DEBUG
6684  					       "  %s zone: %lu pages used for memmap\n",
6685  					       zone_names[j], memmap_pages);
6686  			} else
6687  				pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
6688  					zone_names[j], memmap_pages, freesize);
6689  		}
6690  
6691  		/* Account for reserved pages */
6692  		if (j == 0 && freesize > dma_reserve) {
6693  			freesize -= dma_reserve;
6694  			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
6695  					zone_names[0], dma_reserve);
6696  		}
6697  
6698  		if (!is_highmem_idx(j))
6699  			nr_kernel_pages += freesize;
6700  		/* Charge for highmem memmap if there are enough kernel pages */
6701  		else if (nr_kernel_pages > memmap_pages * 2)
6702  			nr_kernel_pages -= memmap_pages;
6703  		nr_all_pages += freesize;
6704  
6705  		/*
6706  		 * Set an approximate value for lowmem here, it will be adjusted
6707  		 * when the bootmem allocator frees pages into the buddy system.
6708  		 * And all highmem pages will be managed by the buddy system.
6709  		 */
6710  		zone_init_internals(zone, j, nid, freesize);
6711  
6712  		if (!size)
6713  			continue;
6714  
6715  		set_pageblock_order();
6716  		setup_usemap(pgdat, zone, zone_start_pfn, size);
6717  		init_currently_empty_zone(zone, zone_start_pfn, size);
6718  		memmap_init(size, nid, j, zone_start_pfn);
6719  	}
6720  }
6721  
6722  #ifdef CONFIG_FLAT_NODE_MEM_MAP
6723  static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6724  {
6725  	unsigned long __maybe_unused start = 0;
6726  	unsigned long __maybe_unused offset = 0;
6727  
6728  	/* Skip empty nodes */
6729  	if (!pgdat->node_spanned_pages)
6730  		return;
6731  
6732  	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
6733  	offset = pgdat->node_start_pfn - start;
6734  	/* ia64 gets its own node_mem_map, before this, without bootmem */
6735  	if (!pgdat->node_mem_map) {
6736  		unsigned long size, end;
6737  		struct page *map;
6738  
6739  		/*
6740  		 * The zone's endpoints aren't required to be MAX_ORDER
6741  		 * aligned but the node_mem_map endpoints must be in order
6742  		 * for the buddy allocator to function correctly.
6743  		 */
6744  		end = pgdat_end_pfn(pgdat);
6745  		end = ALIGN(end, MAX_ORDER_NR_PAGES);
6746  		size =  (end - start) * sizeof(struct page);
6747  		map = memblock_alloc_node(size, SMP_CACHE_BYTES,
6748  					  pgdat->node_id);
6749  		if (!map)
6750  			panic("Failed to allocate %ld bytes for node %d memory map\n",
6751  			      size, pgdat->node_id);
6752  		pgdat->node_mem_map = map + offset;
6753  	}
6754  	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
6755  				__func__, pgdat->node_id, (unsigned long)pgdat,
6756  				(unsigned long)pgdat->node_mem_map);
6757  #ifndef CONFIG_NEED_MULTIPLE_NODES
6758  	/*
6759  	 * With no DISCONTIG, the global mem_map is just set as node 0's
6760  	 */
6761  	if (pgdat == NODE_DATA(0)) {
6762  		mem_map = NODE_DATA(0)->node_mem_map;
6763  #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
6764  		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
6765  			mem_map -= offset;
6766  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6767  	}
6768  #endif
6769  }
6770  #else
6771  static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
6772  #endif /* CONFIG_FLAT_NODE_MEM_MAP */
6773  
6774  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
6775  static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
6776  {
6777  	pgdat->first_deferred_pfn = ULONG_MAX;
6778  }
6779  #else
6780  static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
6781  #endif
6782  
6783  void __init free_area_init_node(int nid, unsigned long *zones_size,
6784  				   unsigned long node_start_pfn,
6785  				   unsigned long *zholes_size)
6786  {
6787  	pg_data_t *pgdat = NODE_DATA(nid);
6788  	unsigned long start_pfn = 0;
6789  	unsigned long end_pfn = 0;
6790  
6791  	/* pg_data_t should be reset to zero when it's allocated */
6792  	WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
6793  
6794  	pgdat->node_id = nid;
6795  	pgdat->node_start_pfn = node_start_pfn;
6796  	pgdat->per_cpu_nodestats = NULL;
6797  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6798  	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
6799  	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
6800  		(u64)start_pfn << PAGE_SHIFT,
6801  		end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6802  #else
6803  	start_pfn = node_start_pfn;
6804  #endif
6805  	calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6806  				  zones_size, zholes_size);
6807  
6808  	alloc_node_mem_map(pgdat);
6809  	pgdat_set_deferred_range(pgdat);
6810  
6811  	free_area_init_core(pgdat);
6812  }
6813  
6814  #if !defined(CONFIG_FLAT_NODE_MEM_MAP)
6815  /*
6816   * Zero all valid struct pages in range [spfn, epfn), return number of struct
6817   * pages zeroed
6818   */
6819  static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
6820  {
6821  	unsigned long pfn;
6822  	u64 pgcnt = 0;
6823  
6824  	for (pfn = spfn; pfn < epfn; pfn++) {
6825  		if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6826  			pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6827  				+ pageblock_nr_pages - 1;
6828  			continue;
6829  		}
6830  		mm_zero_struct_page(pfn_to_page(pfn));
6831  		pgcnt++;
6832  	}
6833  
6834  	return pgcnt;
6835  }
6836  
6837  /*
6838   * Only struct pages that are backed by physical memory are zeroed and
6839   * initialized by going through __init_single_page(). But, there are some
6840   * struct pages which are reserved in memblock allocator and their fields
6841   * may be accessed (for example page_to_pfn() on some configuration accesses
6842   * flags). We must explicitly zero those struct pages.
6843   *
6844   * This function also addresses a similar issue where struct pages are left
6845   * uninitialized because the physical address range is not covered by
6846   * memblock.memory or memblock.reserved. That could happen when memblock
6847   * layout is manually configured via memmap=.
6848   */
6849  void __init zero_resv_unavail(void)
6850  {
6851  	phys_addr_t start, end;
6852  	u64 i, pgcnt;
6853  	phys_addr_t next = 0;
6854  
6855  	/*
6856  	 * Loop through unavailable ranges not covered by memblock.memory.
6857  	 */
6858  	pgcnt = 0;
6859  	for_each_mem_range(i, &memblock.memory, NULL,
6860  			NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
6861  		if (next < start)
6862  			pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
6863  		next = end;
6864  	}
6865  	pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
6866  
6867  	/*
6868  	 * Struct pages that do not have backing memory. This could be because
6869  	 * firmware is using some of this memory, or for some other reasons.
6870  	 */
6871  	if (pgcnt)
6872  		pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
6873  }
6874  #endif /* !CONFIG_FLAT_NODE_MEM_MAP */
6875  
6876  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6877  
6878  #if MAX_NUMNODES > 1
6879  /*
6880   * Figure out the number of possible node ids.
6881   */
6882  void __init setup_nr_node_ids(void)
6883  {
6884  	unsigned int highest;
6885  
6886  	highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
6887  	nr_node_ids = highest + 1;
6888  }
6889  #endif
6890  
6891  /**
6892   * node_map_pfn_alignment - determine the maximum internode alignment
6893   *
6894   * This function should be called after node map is populated and sorted.
6895   * It calculates the maximum power of two alignment which can distinguish
6896   * all the nodes.
6897   *
6898   * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
6899   * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
6900   * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
6901   * shifted, 1GiB is enough and this function will indicate so.
6902   *
6903   * This is used to test whether pfn -> nid mapping of the chosen memory
6904   * model has fine enough granularity to avoid incorrect mapping for the
6905   * populated node map.
6906   *
6907   * Return: the determined alignment in pfn's.  0 if there is no alignment
6908   * requirement (single node).
6909   */
6910  unsigned long __init node_map_pfn_alignment(void)
6911  {
6912  	unsigned long accl_mask = 0, last_end = 0;
6913  	unsigned long start, end, mask;
6914  	int last_nid = NUMA_NO_NODE;
6915  	int i, nid;
6916  
6917  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
6918  		if (!start || last_nid < 0 || last_nid == nid) {
6919  			last_nid = nid;
6920  			last_end = end;
6921  			continue;
6922  		}
6923  
6924  		/*
6925  		 * Start with a mask granular enough to pin-point to the
6926  		 * start pfn and tick off bits one-by-one until it becomes
6927  		 * too coarse to separate the current node from the last.
6928  		 */
6929  		mask = ~((1 << __ffs(start)) - 1);
6930  		while (mask && last_end <= (start & (mask << 1)))
6931  			mask <<= 1;
6932  
6933  		/* accumulate all internode masks */
6934  		accl_mask |= mask;
6935  	}
6936  
6937  	/* convert mask to number of pages */
6938  	return ~accl_mask + 1;
6939  }
6940  
6941  /* Find the lowest pfn for a node */
6942  static unsigned long __init find_min_pfn_for_node(int nid)
6943  {
6944  	unsigned long min_pfn = ULONG_MAX;
6945  	unsigned long start_pfn;
6946  	int i;
6947  
6948  	for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6949  		min_pfn = min(min_pfn, start_pfn);
6950  
6951  	if (min_pfn == ULONG_MAX) {
6952  		pr_warn("Could not find start_pfn for node %d\n", nid);
6953  		return 0;
6954  	}
6955  
6956  	return min_pfn;
6957  }
6958  
6959  /**
6960   * find_min_pfn_with_active_regions - Find the minimum PFN registered
6961   *
6962   * Return: the minimum PFN based on information provided via
6963   * memblock_set_node().
6964   */
6965  unsigned long __init find_min_pfn_with_active_regions(void)
6966  {
6967  	return find_min_pfn_for_node(MAX_NUMNODES);
6968  }
6969  
6970  /*
6971   * early_calculate_totalpages()
6972   * Sum pages in active regions for movable zone.
6973   * Populate N_MEMORY for calculating usable_nodes.
6974   */
6975  static unsigned long __init early_calculate_totalpages(void)
6976  {
6977  	unsigned long totalpages = 0;
6978  	unsigned long start_pfn, end_pfn;
6979  	int i, nid;
6980  
6981  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6982  		unsigned long pages = end_pfn - start_pfn;
6983  
6984  		totalpages += pages;
6985  		if (pages)
6986  			node_set_state(nid, N_MEMORY);
6987  	}
6988  	return totalpages;
6989  }
6990  
6991  /*
6992   * Find the PFN the Movable zone begins in each node. Kernel memory
6993   * is spread evenly between nodes as long as the nodes have enough
6994   * memory. When they don't, some nodes will have more kernelcore than
6995   * others
6996   */
6997  static void __init find_zone_movable_pfns_for_nodes(void)
6998  {
6999  	int i, nid;
7000  	unsigned long usable_startpfn;
7001  	unsigned long kernelcore_node, kernelcore_remaining;
7002  	/* save the state before borrow the nodemask */
7003  	nodemask_t saved_node_state = node_states[N_MEMORY];
7004  	unsigned long totalpages = early_calculate_totalpages();
7005  	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
7006  	struct memblock_region *r;
7007  
7008  	/* Need to find movable_zone earlier when movable_node is specified. */
7009  	find_usable_zone_for_movable();
7010  
7011  	/*
7012  	 * If movable_node is specified, ignore kernelcore and movablecore
7013  	 * options.
7014  	 */
7015  	if (movable_node_is_enabled()) {
7016  		for_each_memblock(memory, r) {
7017  			if (!memblock_is_hotpluggable(r))
7018  				continue;
7019  
7020  			nid = r->nid;
7021  
7022  			usable_startpfn = PFN_DOWN(r->base);
7023  			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
7024  				min(usable_startpfn, zone_movable_pfn[nid]) :
7025  				usable_startpfn;
7026  		}
7027  
7028  		goto out2;
7029  	}
7030  
7031  	/*
7032  	 * If kernelcore=mirror is specified, ignore movablecore option
7033  	 */
7034  	if (mirrored_kernelcore) {
7035  		bool mem_below_4gb_not_mirrored = false;
7036  
7037  		for_each_memblock(memory, r) {
7038  			if (memblock_is_mirror(r))
7039  				continue;
7040  
7041  			nid = r->nid;
7042  
7043  			usable_startpfn = memblock_region_memory_base_pfn(r);
7044  
7045  			if (usable_startpfn < 0x100000) {
7046  				mem_below_4gb_not_mirrored = true;
7047  				continue;
7048  			}
7049  
7050  			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
7051  				min(usable_startpfn, zone_movable_pfn[nid]) :
7052  				usable_startpfn;
7053  		}
7054  
7055  		if (mem_below_4gb_not_mirrored)
7056  			pr_warn("This configuration results in unmirrored kernel memory.");
7057  
7058  		goto out2;
7059  	}
7060  
7061  	/*
7062  	 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
7063  	 * amount of necessary memory.
7064  	 */
7065  	if (required_kernelcore_percent)
7066  		required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
7067  				       10000UL;
7068  	if (required_movablecore_percent)
7069  		required_movablecore = (totalpages * 100 * required_movablecore_percent) /
7070  					10000UL;
7071  
7072  	/*
7073  	 * If movablecore= was specified, calculate what size of
7074  	 * kernelcore that corresponds so that memory usable for
7075  	 * any allocation type is evenly spread. If both kernelcore
7076  	 * and movablecore are specified, then the value of kernelcore
7077  	 * will be used for required_kernelcore if it's greater than
7078  	 * what movablecore would have allowed.
7079  	 */
7080  	if (required_movablecore) {
7081  		unsigned long corepages;
7082  
7083  		/*
7084  		 * Round-up so that ZONE_MOVABLE is at least as large as what
7085  		 * was requested by the user
7086  		 */
7087  		required_movablecore =
7088  			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
7089  		required_movablecore = min(totalpages, required_movablecore);
7090  		corepages = totalpages - required_movablecore;
7091  
7092  		required_kernelcore = max(required_kernelcore, corepages);
7093  	}
7094  
7095  	/*
7096  	 * If kernelcore was not specified or kernelcore size is larger
7097  	 * than totalpages, there is no ZONE_MOVABLE.
7098  	 */
7099  	if (!required_kernelcore || required_kernelcore >= totalpages)
7100  		goto out;
7101  
7102  	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
7103  	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
7104  
7105  restart:
7106  	/* Spread kernelcore memory as evenly as possible throughout nodes */
7107  	kernelcore_node = required_kernelcore / usable_nodes;
7108  	for_each_node_state(nid, N_MEMORY) {
7109  		unsigned long start_pfn, end_pfn;
7110  
7111  		/*
7112  		 * Recalculate kernelcore_node if the division per node
7113  		 * now exceeds what is necessary to satisfy the requested
7114  		 * amount of memory for the kernel
7115  		 */
7116  		if (required_kernelcore < kernelcore_node)
7117  			kernelcore_node = required_kernelcore / usable_nodes;
7118  
7119  		/*
7120  		 * As the map is walked, we track how much memory is usable
7121  		 * by the kernel using kernelcore_remaining. When it is
7122  		 * 0, the rest of the node is usable by ZONE_MOVABLE
7123  		 */
7124  		kernelcore_remaining = kernelcore_node;
7125  
7126  		/* Go through each range of PFNs within this node */
7127  		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
7128  			unsigned long size_pages;
7129  
7130  			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
7131  			if (start_pfn >= end_pfn)
7132  				continue;
7133  
7134  			/* Account for what is only usable for kernelcore */
7135  			if (start_pfn < usable_startpfn) {
7136  				unsigned long kernel_pages;
7137  				kernel_pages = min(end_pfn, usable_startpfn)
7138  								- start_pfn;
7139  
7140  				kernelcore_remaining -= min(kernel_pages,
7141  							kernelcore_remaining);
7142  				required_kernelcore -= min(kernel_pages,
7143  							required_kernelcore);
7144  
7145  				/* Continue if range is now fully accounted */
7146  				if (end_pfn <= usable_startpfn) {
7147  
7148  					/*
7149  					 * Push zone_movable_pfn to the end so
7150  					 * that if we have to rebalance
7151  					 * kernelcore across nodes, we will
7152  					 * not double account here
7153  					 */
7154  					zone_movable_pfn[nid] = end_pfn;
7155  					continue;
7156  				}
7157  				start_pfn = usable_startpfn;
7158  			}
7159  
7160  			/*
7161  			 * The usable PFN range for ZONE_MOVABLE is from
7162  			 * start_pfn->end_pfn. Calculate size_pages as the
7163  			 * number of pages used as kernelcore
7164  			 */
7165  			size_pages = end_pfn - start_pfn;
7166  			if (size_pages > kernelcore_remaining)
7167  				size_pages = kernelcore_remaining;
7168  			zone_movable_pfn[nid] = start_pfn + size_pages;
7169  
7170  			/*
7171  			 * Some kernelcore has been met, update counts and
7172  			 * break if the kernelcore for this node has been
7173  			 * satisfied
7174  			 */
7175  			required_kernelcore -= min(required_kernelcore,
7176  								size_pages);
7177  			kernelcore_remaining -= size_pages;
7178  			if (!kernelcore_remaining)
7179  				break;
7180  		}
7181  	}
7182  
7183  	/*
7184  	 * If there is still required_kernelcore, we do another pass with one
7185  	 * less node in the count. This will push zone_movable_pfn[nid] further
7186  	 * along on the nodes that still have memory until kernelcore is
7187  	 * satisfied
7188  	 */
7189  	usable_nodes--;
7190  	if (usable_nodes && required_kernelcore > usable_nodes)
7191  		goto restart;
7192  
7193  out2:
7194  	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
7195  	for (nid = 0; nid < MAX_NUMNODES; nid++)
7196  		zone_movable_pfn[nid] =
7197  			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
7198  
7199  out:
7200  	/* restore the node_state */
7201  	node_states[N_MEMORY] = saved_node_state;
7202  }
7203  
7204  /* Any regular or high memory on that node ? */
7205  static void check_for_memory(pg_data_t *pgdat, int nid)
7206  {
7207  	enum zone_type zone_type;
7208  
7209  	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
7210  		struct zone *zone = &pgdat->node_zones[zone_type];
7211  		if (populated_zone(zone)) {
7212  			if (IS_ENABLED(CONFIG_HIGHMEM))
7213  				node_set_state(nid, N_HIGH_MEMORY);
7214  			if (zone_type <= ZONE_NORMAL)
7215  				node_set_state(nid, N_NORMAL_MEMORY);
7216  			break;
7217  		}
7218  	}
7219  }
7220  
7221  /**
7222   * free_area_init_nodes - Initialise all pg_data_t and zone data
7223   * @max_zone_pfn: an array of max PFNs for each zone
7224   *
7225   * This will call free_area_init_node() for each active node in the system.
7226   * Using the page ranges provided by memblock_set_node(), the size of each
7227   * zone in each node and their holes is calculated. If the maximum PFN
7228   * between two adjacent zones match, it is assumed that the zone is empty.
7229   * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
7230   * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
7231   * starts where the previous one ended. For example, ZONE_DMA32 starts
7232   * at arch_max_dma_pfn.
7233   */
7234  void __init free_area_init_nodes(unsigned long *max_zone_pfn)
7235  {
7236  	unsigned long start_pfn, end_pfn;
7237  	int i, nid;
7238  
7239  	/* Record where the zone boundaries are */
7240  	memset(arch_zone_lowest_possible_pfn, 0,
7241  				sizeof(arch_zone_lowest_possible_pfn));
7242  	memset(arch_zone_highest_possible_pfn, 0,
7243  				sizeof(arch_zone_highest_possible_pfn));
7244  
7245  	start_pfn = find_min_pfn_with_active_regions();
7246  
7247  	for (i = 0; i < MAX_NR_ZONES; i++) {
7248  		if (i == ZONE_MOVABLE)
7249  			continue;
7250  
7251  		end_pfn = max(max_zone_pfn[i], start_pfn);
7252  		arch_zone_lowest_possible_pfn[i] = start_pfn;
7253  		arch_zone_highest_possible_pfn[i] = end_pfn;
7254  
7255  		start_pfn = end_pfn;
7256  	}
7257  
7258  	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
7259  	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
7260  	find_zone_movable_pfns_for_nodes();
7261  
7262  	/* Print out the zone ranges */
7263  	pr_info("Zone ranges:\n");
7264  	for (i = 0; i < MAX_NR_ZONES; i++) {
7265  		if (i == ZONE_MOVABLE)
7266  			continue;
7267  		pr_info("  %-8s ", zone_names[i]);
7268  		if (arch_zone_lowest_possible_pfn[i] ==
7269  				arch_zone_highest_possible_pfn[i])
7270  			pr_cont("empty\n");
7271  		else
7272  			pr_cont("[mem %#018Lx-%#018Lx]\n",
7273  				(u64)arch_zone_lowest_possible_pfn[i]
7274  					<< PAGE_SHIFT,
7275  				((u64)arch_zone_highest_possible_pfn[i]
7276  					<< PAGE_SHIFT) - 1);
7277  	}
7278  
7279  	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
7280  	pr_info("Movable zone start for each node\n");
7281  	for (i = 0; i < MAX_NUMNODES; i++) {
7282  		if (zone_movable_pfn[i])
7283  			pr_info("  Node %d: %#018Lx\n", i,
7284  			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
7285  	}
7286  
7287  	/* Print out the early node map */
7288  	pr_info("Early memory node ranges\n");
7289  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
7290  		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
7291  			(u64)start_pfn << PAGE_SHIFT,
7292  			((u64)end_pfn << PAGE_SHIFT) - 1);
7293  
7294  	/* Initialise every node */
7295  	mminit_verify_pageflags_layout();
7296  	setup_nr_node_ids();
7297  	zero_resv_unavail();
7298  	for_each_online_node(nid) {
7299  		pg_data_t *pgdat = NODE_DATA(nid);
7300  		free_area_init_node(nid, NULL,
7301  				find_min_pfn_for_node(nid), NULL);
7302  
7303  		/* Any memory on that node */
7304  		if (pgdat->node_present_pages)
7305  			node_set_state(nid, N_MEMORY);
7306  		check_for_memory(pgdat, nid);
7307  	}
7308  }
7309  
7310  static int __init cmdline_parse_core(char *p, unsigned long *core,
7311  				     unsigned long *percent)
7312  {
7313  	unsigned long long coremem;
7314  	char *endptr;
7315  
7316  	if (!p)
7317  		return -EINVAL;
7318  
7319  	/* Value may be a percentage of total memory, otherwise bytes */
7320  	coremem = simple_strtoull(p, &endptr, 0);
7321  	if (*endptr == '%') {
7322  		/* Paranoid check for percent values greater than 100 */
7323  		WARN_ON(coremem > 100);
7324  
7325  		*percent = coremem;
7326  	} else {
7327  		coremem = memparse(p, &p);
7328  		/* Paranoid check that UL is enough for the coremem value */
7329  		WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
7330  
7331  		*core = coremem >> PAGE_SHIFT;
7332  		*percent = 0UL;
7333  	}
7334  	return 0;
7335  }
7336  
7337  /*
7338   * kernelcore=size sets the amount of memory for use for allocations that
7339   * cannot be reclaimed or migrated.
7340   */
7341  static int __init cmdline_parse_kernelcore(char *p)
7342  {
7343  	/* parse kernelcore=mirror */
7344  	if (parse_option_str(p, "mirror")) {
7345  		mirrored_kernelcore = true;
7346  		return 0;
7347  	}
7348  
7349  	return cmdline_parse_core(p, &required_kernelcore,
7350  				  &required_kernelcore_percent);
7351  }
7352  
7353  /*
7354   * movablecore=size sets the amount of memory for use for allocations that
7355   * can be reclaimed or migrated.
7356   */
7357  static int __init cmdline_parse_movablecore(char *p)
7358  {
7359  	return cmdline_parse_core(p, &required_movablecore,
7360  				  &required_movablecore_percent);
7361  }
7362  
7363  early_param("kernelcore", cmdline_parse_kernelcore);
7364  early_param("movablecore", cmdline_parse_movablecore);
7365  
7366  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
7367  
7368  void adjust_managed_page_count(struct page *page, long count)
7369  {
7370  	atomic_long_add(count, &page_zone(page)->managed_pages);
7371  	totalram_pages_add(count);
7372  #ifdef CONFIG_HIGHMEM
7373  	if (PageHighMem(page))
7374  		totalhigh_pages_add(count);
7375  #endif
7376  }
7377  EXPORT_SYMBOL(adjust_managed_page_count);
7378  
7379  unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
7380  {
7381  	void *pos;
7382  	unsigned long pages = 0;
7383  
7384  	start = (void *)PAGE_ALIGN((unsigned long)start);
7385  	end = (void *)((unsigned long)end & PAGE_MASK);
7386  	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
7387  		struct page *page = virt_to_page(pos);
7388  		void *direct_map_addr;
7389  
7390  		/*
7391  		 * 'direct_map_addr' might be different from 'pos'
7392  		 * because some architectures' virt_to_page()
7393  		 * work with aliases.  Getting the direct map
7394  		 * address ensures that we get a _writeable_
7395  		 * alias for the memset().
7396  		 */
7397  		direct_map_addr = page_address(page);
7398  		if ((unsigned int)poison <= 0xFF)
7399  			memset(direct_map_addr, poison, PAGE_SIZE);
7400  
7401  		free_reserved_page(page);
7402  	}
7403  
7404  	if (pages && s)
7405  		pr_info("Freeing %s memory: %ldK\n",
7406  			s, pages << (PAGE_SHIFT - 10));
7407  
7408  	return pages;
7409  }
7410  
7411  #ifdef	CONFIG_HIGHMEM
7412  void free_highmem_page(struct page *page)
7413  {
7414  	__free_reserved_page(page);
7415  	totalram_pages_inc();
7416  	atomic_long_inc(&page_zone(page)->managed_pages);
7417  	totalhigh_pages_inc();
7418  }
7419  #endif
7420  
7421  
7422  void __init mem_init_print_info(const char *str)
7423  {
7424  	unsigned long physpages, codesize, datasize, rosize, bss_size;
7425  	unsigned long init_code_size, init_data_size;
7426  
7427  	physpages = get_num_physpages();
7428  	codesize = _etext - _stext;
7429  	datasize = _edata - _sdata;
7430  	rosize = __end_rodata - __start_rodata;
7431  	bss_size = __bss_stop - __bss_start;
7432  	init_data_size = __init_end - __init_begin;
7433  	init_code_size = _einittext - _sinittext;
7434  
7435  	/*
7436  	 * Detect special cases and adjust section sizes accordingly:
7437  	 * 1) .init.* may be embedded into .data sections
7438  	 * 2) .init.text.* may be out of [__init_begin, __init_end],
7439  	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
7440  	 * 3) .rodata.* may be embedded into .text or .data sections.
7441  	 */
7442  #define adj_init_size(start, end, size, pos, adj) \
7443  	do { \
7444  		if (start <= pos && pos < end && size > adj) \
7445  			size -= adj; \
7446  	} while (0)
7447  
7448  	adj_init_size(__init_begin, __init_end, init_data_size,
7449  		     _sinittext, init_code_size);
7450  	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
7451  	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
7452  	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
7453  	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
7454  
7455  #undef	adj_init_size
7456  
7457  	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
7458  #ifdef	CONFIG_HIGHMEM
7459  		", %luK highmem"
7460  #endif
7461  		"%s%s)\n",
7462  		nr_free_pages() << (PAGE_SHIFT - 10),
7463  		physpages << (PAGE_SHIFT - 10),
7464  		codesize >> 10, datasize >> 10, rosize >> 10,
7465  		(init_data_size + init_code_size) >> 10, bss_size >> 10,
7466  		(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
7467  		totalcma_pages << (PAGE_SHIFT - 10),
7468  #ifdef	CONFIG_HIGHMEM
7469  		totalhigh_pages() << (PAGE_SHIFT - 10),
7470  #endif
7471  		str ? ", " : "", str ? str : "");
7472  }
7473  
7474  /**
7475   * set_dma_reserve - set the specified number of pages reserved in the first zone
7476   * @new_dma_reserve: The number of pages to mark reserved
7477   *
7478   * The per-cpu batchsize and zone watermarks are determined by managed_pages.
7479   * In the DMA zone, a significant percentage may be consumed by kernel image
7480   * and other unfreeable allocations which can skew the watermarks badly. This
7481   * function may optionally be used to account for unfreeable pages in the
7482   * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
7483   * smaller per-cpu batchsize.
7484   */
7485  void __init set_dma_reserve(unsigned long new_dma_reserve)
7486  {
7487  	dma_reserve = new_dma_reserve;
7488  }
7489  
7490  void __init free_area_init(unsigned long *zones_size)
7491  {
7492  	zero_resv_unavail();
7493  	free_area_init_node(0, zones_size,
7494  			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
7495  }
7496  
7497  static int page_alloc_cpu_dead(unsigned int cpu)
7498  {
7499  
7500  	lru_add_drain_cpu(cpu);
7501  	drain_pages(cpu);
7502  
7503  	/*
7504  	 * Spill the event counters of the dead processor
7505  	 * into the current processors event counters.
7506  	 * This artificially elevates the count of the current
7507  	 * processor.
7508  	 */
7509  	vm_events_fold_cpu(cpu);
7510  
7511  	/*
7512  	 * Zero the differential counters of the dead processor
7513  	 * so that the vm statistics are consistent.
7514  	 *
7515  	 * This is only okay since the processor is dead and cannot
7516  	 * race with what we are doing.
7517  	 */
7518  	cpu_vm_stats_fold(cpu);
7519  	return 0;
7520  }
7521  
7522  void __init page_alloc_init(void)
7523  {
7524  	int ret;
7525  
7526  	ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
7527  					"mm/page_alloc:dead", NULL,
7528  					page_alloc_cpu_dead);
7529  	WARN_ON(ret < 0);
7530  }
7531  
7532  /*
7533   * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
7534   *	or min_free_kbytes changes.
7535   */
7536  static void calculate_totalreserve_pages(void)
7537  {
7538  	struct pglist_data *pgdat;
7539  	unsigned long reserve_pages = 0;
7540  	enum zone_type i, j;
7541  
7542  	for_each_online_pgdat(pgdat) {
7543  
7544  		pgdat->totalreserve_pages = 0;
7545  
7546  		for (i = 0; i < MAX_NR_ZONES; i++) {
7547  			struct zone *zone = pgdat->node_zones + i;
7548  			long max = 0;
7549  			unsigned long managed_pages = zone_managed_pages(zone);
7550  
7551  			/* Find valid and maximum lowmem_reserve in the zone */
7552  			for (j = i; j < MAX_NR_ZONES; j++) {
7553  				if (zone->lowmem_reserve[j] > max)
7554  					max = zone->lowmem_reserve[j];
7555  			}
7556  
7557  			/* we treat the high watermark as reserved pages. */
7558  			max += high_wmark_pages(zone);
7559  
7560  			if (max > managed_pages)
7561  				max = managed_pages;
7562  
7563  			pgdat->totalreserve_pages += max;
7564  
7565  			reserve_pages += max;
7566  		}
7567  	}
7568  	totalreserve_pages = reserve_pages;
7569  }
7570  
7571  /*
7572   * setup_per_zone_lowmem_reserve - called whenever
7573   *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
7574   *	has a correct pages reserved value, so an adequate number of
7575   *	pages are left in the zone after a successful __alloc_pages().
7576   */
7577  static void setup_per_zone_lowmem_reserve(void)
7578  {
7579  	struct pglist_data *pgdat;
7580  	enum zone_type j, idx;
7581  
7582  	for_each_online_pgdat(pgdat) {
7583  		for (j = 0; j < MAX_NR_ZONES; j++) {
7584  			struct zone *zone = pgdat->node_zones + j;
7585  			unsigned long managed_pages = zone_managed_pages(zone);
7586  
7587  			zone->lowmem_reserve[j] = 0;
7588  
7589  			idx = j;
7590  			while (idx) {
7591  				struct zone *lower_zone;
7592  
7593  				idx--;
7594  				lower_zone = pgdat->node_zones + idx;
7595  
7596  				if (sysctl_lowmem_reserve_ratio[idx] < 1) {
7597  					sysctl_lowmem_reserve_ratio[idx] = 0;
7598  					lower_zone->lowmem_reserve[j] = 0;
7599  				} else {
7600  					lower_zone->lowmem_reserve[j] =
7601  						managed_pages / sysctl_lowmem_reserve_ratio[idx];
7602  				}
7603  				managed_pages += zone_managed_pages(lower_zone);
7604  			}
7605  		}
7606  	}
7607  
7608  	/* update totalreserve_pages */
7609  	calculate_totalreserve_pages();
7610  }
7611  
7612  static void __setup_per_zone_wmarks(void)
7613  {
7614  	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
7615  	unsigned long lowmem_pages = 0;
7616  	struct zone *zone;
7617  	unsigned long flags;
7618  
7619  	/* Calculate total number of !ZONE_HIGHMEM pages */
7620  	for_each_zone(zone) {
7621  		if (!is_highmem(zone))
7622  			lowmem_pages += zone_managed_pages(zone);
7623  	}
7624  
7625  	for_each_zone(zone) {
7626  		u64 tmp;
7627  
7628  		spin_lock_irqsave(&zone->lock, flags);
7629  		tmp = (u64)pages_min * zone_managed_pages(zone);
7630  		do_div(tmp, lowmem_pages);
7631  		if (is_highmem(zone)) {
7632  			/*
7633  			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
7634  			 * need highmem pages, so cap pages_min to a small
7635  			 * value here.
7636  			 *
7637  			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
7638  			 * deltas control async page reclaim, and so should
7639  			 * not be capped for highmem.
7640  			 */
7641  			unsigned long min_pages;
7642  
7643  			min_pages = zone_managed_pages(zone) / 1024;
7644  			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
7645  			zone->_watermark[WMARK_MIN] = min_pages;
7646  		} else {
7647  			/*
7648  			 * If it's a lowmem zone, reserve a number of pages
7649  			 * proportionate to the zone's size.
7650  			 */
7651  			zone->_watermark[WMARK_MIN] = tmp;
7652  		}
7653  
7654  		/*
7655  		 * Set the kswapd watermarks distance according to the
7656  		 * scale factor in proportion to available memory, but
7657  		 * ensure a minimum size on small systems.
7658  		 */
7659  		tmp = max_t(u64, tmp >> 2,
7660  			    mult_frac(zone_managed_pages(zone),
7661  				      watermark_scale_factor, 10000));
7662  
7663  		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
7664  		zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
7665  		zone->watermark_boost = 0;
7666  
7667  		spin_unlock_irqrestore(&zone->lock, flags);
7668  	}
7669  
7670  	/* update totalreserve_pages */
7671  	calculate_totalreserve_pages();
7672  }
7673  
7674  /**
7675   * setup_per_zone_wmarks - called when min_free_kbytes changes
7676   * or when memory is hot-{added|removed}
7677   *
7678   * Ensures that the watermark[min,low,high] values for each zone are set
7679   * correctly with respect to min_free_kbytes.
7680   */
7681  void setup_per_zone_wmarks(void)
7682  {
7683  	static DEFINE_SPINLOCK(lock);
7684  
7685  	spin_lock(&lock);
7686  	__setup_per_zone_wmarks();
7687  	spin_unlock(&lock);
7688  }
7689  
7690  /*
7691   * Initialise min_free_kbytes.
7692   *
7693   * For small machines we want it small (128k min).  For large machines
7694   * we want it large (64MB max).  But it is not linear, because network
7695   * bandwidth does not increase linearly with machine size.  We use
7696   *
7697   *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
7698   *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
7699   *
7700   * which yields
7701   *
7702   * 16MB:	512k
7703   * 32MB:	724k
7704   * 64MB:	1024k
7705   * 128MB:	1448k
7706   * 256MB:	2048k
7707   * 512MB:	2896k
7708   * 1024MB:	4096k
7709   * 2048MB:	5792k
7710   * 4096MB:	8192k
7711   * 8192MB:	11584k
7712   * 16384MB:	16384k
7713   */
7714  int __meminit init_per_zone_wmark_min(void)
7715  {
7716  	unsigned long lowmem_kbytes;
7717  	int new_min_free_kbytes;
7718  
7719  	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
7720  	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
7721  
7722  	if (new_min_free_kbytes > user_min_free_kbytes) {
7723  		min_free_kbytes = new_min_free_kbytes;
7724  		if (min_free_kbytes < 128)
7725  			min_free_kbytes = 128;
7726  		if (min_free_kbytes > 65536)
7727  			min_free_kbytes = 65536;
7728  	} else {
7729  		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
7730  				new_min_free_kbytes, user_min_free_kbytes);
7731  	}
7732  	setup_per_zone_wmarks();
7733  	refresh_zone_stat_thresholds();
7734  	setup_per_zone_lowmem_reserve();
7735  
7736  #ifdef CONFIG_NUMA
7737  	setup_min_unmapped_ratio();
7738  	setup_min_slab_ratio();
7739  #endif
7740  
7741  	return 0;
7742  }
7743  core_initcall(init_per_zone_wmark_min)
7744  
7745  /*
7746   * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
7747   *	that we can call two helper functions whenever min_free_kbytes
7748   *	changes.
7749   */
7750  int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7751  	void __user *buffer, size_t *length, loff_t *ppos)
7752  {
7753  	int rc;
7754  
7755  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7756  	if (rc)
7757  		return rc;
7758  
7759  	if (write) {
7760  		user_min_free_kbytes = min_free_kbytes;
7761  		setup_per_zone_wmarks();
7762  	}
7763  	return 0;
7764  }
7765  
7766  int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
7767  	void __user *buffer, size_t *length, loff_t *ppos)
7768  {
7769  	int rc;
7770  
7771  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7772  	if (rc)
7773  		return rc;
7774  
7775  	return 0;
7776  }
7777  
7778  int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7779  	void __user *buffer, size_t *length, loff_t *ppos)
7780  {
7781  	int rc;
7782  
7783  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7784  	if (rc)
7785  		return rc;
7786  
7787  	if (write)
7788  		setup_per_zone_wmarks();
7789  
7790  	return 0;
7791  }
7792  
7793  #ifdef CONFIG_NUMA
7794  static void setup_min_unmapped_ratio(void)
7795  {
7796  	pg_data_t *pgdat;
7797  	struct zone *zone;
7798  
7799  	for_each_online_pgdat(pgdat)
7800  		pgdat->min_unmapped_pages = 0;
7801  
7802  	for_each_zone(zone)
7803  		zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
7804  						         sysctl_min_unmapped_ratio) / 100;
7805  }
7806  
7807  
7808  int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7809  	void __user *buffer, size_t *length, loff_t *ppos)
7810  {
7811  	int rc;
7812  
7813  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7814  	if (rc)
7815  		return rc;
7816  
7817  	setup_min_unmapped_ratio();
7818  
7819  	return 0;
7820  }
7821  
7822  static void setup_min_slab_ratio(void)
7823  {
7824  	pg_data_t *pgdat;
7825  	struct zone *zone;
7826  
7827  	for_each_online_pgdat(pgdat)
7828  		pgdat->min_slab_pages = 0;
7829  
7830  	for_each_zone(zone)
7831  		zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
7832  						     sysctl_min_slab_ratio) / 100;
7833  }
7834  
7835  int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7836  	void __user *buffer, size_t *length, loff_t *ppos)
7837  {
7838  	int rc;
7839  
7840  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7841  	if (rc)
7842  		return rc;
7843  
7844  	setup_min_slab_ratio();
7845  
7846  	return 0;
7847  }
7848  #endif
7849  
7850  /*
7851   * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
7852   *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
7853   *	whenever sysctl_lowmem_reserve_ratio changes.
7854   *
7855   * The reserve ratio obviously has absolutely no relation with the
7856   * minimum watermarks. The lowmem reserve ratio can only make sense
7857   * if in function of the boot time zone sizes.
7858   */
7859  int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7860  	void __user *buffer, size_t *length, loff_t *ppos)
7861  {
7862  	proc_dointvec_minmax(table, write, buffer, length, ppos);
7863  	setup_per_zone_lowmem_reserve();
7864  	return 0;
7865  }
7866  
7867  /*
7868   * percpu_pagelist_fraction - changes the pcp->high for each zone on each
7869   * cpu.  It is the fraction of total pages in each zone that a hot per cpu
7870   * pagelist can have before it gets flushed back to buddy allocator.
7871   */
7872  int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7873  	void __user *buffer, size_t *length, loff_t *ppos)
7874  {
7875  	struct zone *zone;
7876  	int old_percpu_pagelist_fraction;
7877  	int ret;
7878  
7879  	mutex_lock(&pcp_batch_high_lock);
7880  	old_percpu_pagelist_fraction = percpu_pagelist_fraction;
7881  
7882  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
7883  	if (!write || ret < 0)
7884  		goto out;
7885  
7886  	/* Sanity checking to avoid pcp imbalance */
7887  	if (percpu_pagelist_fraction &&
7888  	    percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
7889  		percpu_pagelist_fraction = old_percpu_pagelist_fraction;
7890  		ret = -EINVAL;
7891  		goto out;
7892  	}
7893  
7894  	/* No change? */
7895  	if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
7896  		goto out;
7897  
7898  	for_each_populated_zone(zone) {
7899  		unsigned int cpu;
7900  
7901  		for_each_possible_cpu(cpu)
7902  			pageset_set_high_and_batch(zone,
7903  					per_cpu_ptr(zone->pageset, cpu));
7904  	}
7905  out:
7906  	mutex_unlock(&pcp_batch_high_lock);
7907  	return ret;
7908  }
7909  
7910  #ifdef CONFIG_NUMA
7911  int hashdist = HASHDIST_DEFAULT;
7912  
7913  static int __init set_hashdist(char *str)
7914  {
7915  	if (!str)
7916  		return 0;
7917  	hashdist = simple_strtoul(str, &str, 0);
7918  	return 1;
7919  }
7920  __setup("hashdist=", set_hashdist);
7921  #endif
7922  
7923  #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
7924  /*
7925   * Returns the number of pages that arch has reserved but
7926   * is not known to alloc_large_system_hash().
7927   */
7928  static unsigned long __init arch_reserved_kernel_pages(void)
7929  {
7930  	return 0;
7931  }
7932  #endif
7933  
7934  /*
7935   * Adaptive scale is meant to reduce sizes of hash tables on large memory
7936   * machines. As memory size is increased the scale is also increased but at
7937   * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
7938   * quadruples the scale is increased by one, which means the size of hash table
7939   * only doubles, instead of quadrupling as well.
7940   * Because 32-bit systems cannot have large physical memory, where this scaling
7941   * makes sense, it is disabled on such platforms.
7942   */
7943  #if __BITS_PER_LONG > 32
7944  #define ADAPT_SCALE_BASE	(64ul << 30)
7945  #define ADAPT_SCALE_SHIFT	2
7946  #define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)
7947  #endif
7948  
7949  /*
7950   * allocate a large system hash table from bootmem
7951   * - it is assumed that the hash table must contain an exact power-of-2
7952   *   quantity of entries
7953   * - limit is the number of hash buckets, not the total allocation size
7954   */
7955  void *__init alloc_large_system_hash(const char *tablename,
7956  				     unsigned long bucketsize,
7957  				     unsigned long numentries,
7958  				     int scale,
7959  				     int flags,
7960  				     unsigned int *_hash_shift,
7961  				     unsigned int *_hash_mask,
7962  				     unsigned long low_limit,
7963  				     unsigned long high_limit)
7964  {
7965  	unsigned long long max = high_limit;
7966  	unsigned long log2qty, size;
7967  	void *table = NULL;
7968  	gfp_t gfp_flags;
7969  
7970  	/* allow the kernel cmdline to have a say */
7971  	if (!numentries) {
7972  		/* round applicable memory size up to nearest megabyte */
7973  		numentries = nr_kernel_pages;
7974  		numentries -= arch_reserved_kernel_pages();
7975  
7976  		/* It isn't necessary when PAGE_SIZE >= 1MB */
7977  		if (PAGE_SHIFT < 20)
7978  			numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
7979  
7980  #if __BITS_PER_LONG > 32
7981  		if (!high_limit) {
7982  			unsigned long adapt;
7983  
7984  			for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
7985  			     adapt <<= ADAPT_SCALE_SHIFT)
7986  				scale++;
7987  		}
7988  #endif
7989  
7990  		/* limit to 1 bucket per 2^scale bytes of low memory */
7991  		if (scale > PAGE_SHIFT)
7992  			numentries >>= (scale - PAGE_SHIFT);
7993  		else
7994  			numentries <<= (PAGE_SHIFT - scale);
7995  
7996  		/* Make sure we've got at least a 0-order allocation.. */
7997  		if (unlikely(flags & HASH_SMALL)) {
7998  			/* Makes no sense without HASH_EARLY */
7999  			WARN_ON(!(flags & HASH_EARLY));
8000  			if (!(numentries >> *_hash_shift)) {
8001  				numentries = 1UL << *_hash_shift;
8002  				BUG_ON(!numentries);
8003  			}
8004  		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
8005  			numentries = PAGE_SIZE / bucketsize;
8006  	}
8007  	numentries = roundup_pow_of_two(numentries);
8008  
8009  	/* limit allocation size to 1/16 total memory by default */
8010  	if (max == 0) {
8011  		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
8012  		do_div(max, bucketsize);
8013  	}
8014  	max = min(max, 0x80000000ULL);
8015  
8016  	if (numentries < low_limit)
8017  		numentries = low_limit;
8018  	if (numentries > max)
8019  		numentries = max;
8020  
8021  	log2qty = ilog2(numentries);
8022  
8023  	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
8024  	do {
8025  		size = bucketsize << log2qty;
8026  		if (flags & HASH_EARLY) {
8027  			if (flags & HASH_ZERO)
8028  				table = memblock_alloc(size, SMP_CACHE_BYTES);
8029  			else
8030  				table = memblock_alloc_raw(size,
8031  							   SMP_CACHE_BYTES);
8032  		} else if (hashdist) {
8033  			table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
8034  		} else {
8035  			/*
8036  			 * If bucketsize is not a power-of-two, we may free
8037  			 * some pages at the end of hash table which
8038  			 * alloc_pages_exact() automatically does
8039  			 */
8040  			if (get_order(size) < MAX_ORDER) {
8041  				table = alloc_pages_exact(size, gfp_flags);
8042  				kmemleak_alloc(table, size, 1, gfp_flags);
8043  			}
8044  		}
8045  	} while (!table && size > PAGE_SIZE && --log2qty);
8046  
8047  	if (!table)
8048  		panic("Failed to allocate %s hash table\n", tablename);
8049  
8050  	pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
8051  		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
8052  
8053  	if (_hash_shift)
8054  		*_hash_shift = log2qty;
8055  	if (_hash_mask)
8056  		*_hash_mask = (1 << log2qty) - 1;
8057  
8058  	return table;
8059  }
8060  
8061  /*
8062   * This function checks whether pageblock includes unmovable pages or not.
8063   * If @count is not zero, it is okay to include less @count unmovable pages
8064   *
8065   * PageLRU check without isolation or lru_lock could race so that
8066   * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
8067   * check without lock_page also may miss some movable non-lru pages at
8068   * race condition. So you can't expect this function should be exact.
8069   */
8070  bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
8071  			 int migratetype, int flags)
8072  {
8073  	unsigned long found;
8074  	unsigned long iter = 0;
8075  	unsigned long pfn = page_to_pfn(page);
8076  	const char *reason = "unmovable page";
8077  
8078  	/*
8079  	 * TODO we could make this much more efficient by not checking every
8080  	 * page in the range if we know all of them are in MOVABLE_ZONE and
8081  	 * that the movable zone guarantees that pages are migratable but
8082  	 * the later is not the case right now unfortunatelly. E.g. movablecore
8083  	 * can still lead to having bootmem allocations in zone_movable.
8084  	 */
8085  
8086  	if (is_migrate_cma_page(page)) {
8087  		/*
8088  		 * CMA allocations (alloc_contig_range) really need to mark
8089  		 * isolate CMA pageblocks even when they are not movable in fact
8090  		 * so consider them movable here.
8091  		 */
8092  		if (is_migrate_cma(migratetype))
8093  			return false;
8094  
8095  		reason = "CMA page";
8096  		goto unmovable;
8097  	}
8098  
8099  	for (found = 0; iter < pageblock_nr_pages; iter++) {
8100  		unsigned long check = pfn + iter;
8101  
8102  		if (!pfn_valid_within(check))
8103  			continue;
8104  
8105  		page = pfn_to_page(check);
8106  
8107  		if (PageReserved(page))
8108  			goto unmovable;
8109  
8110  		/*
8111  		 * If the zone is movable and we have ruled out all reserved
8112  		 * pages then it should be reasonably safe to assume the rest
8113  		 * is movable.
8114  		 */
8115  		if (zone_idx(zone) == ZONE_MOVABLE)
8116  			continue;
8117  
8118  		/*
8119  		 * Hugepages are not in LRU lists, but they're movable.
8120  		 * We need not scan over tail pages because we don't
8121  		 * handle each tail page individually in migration.
8122  		 */
8123  		if (PageHuge(page)) {
8124  			struct page *head = compound_head(page);
8125  			unsigned int skip_pages;
8126  
8127  			if (!hugepage_migration_supported(page_hstate(head)))
8128  				goto unmovable;
8129  
8130  			skip_pages = (1 << compound_order(head)) - (page - head);
8131  			iter += skip_pages - 1;
8132  			continue;
8133  		}
8134  
8135  		/*
8136  		 * We can't use page_count without pin a page
8137  		 * because another CPU can free compound page.
8138  		 * This check already skips compound tails of THP
8139  		 * because their page->_refcount is zero at all time.
8140  		 */
8141  		if (!page_ref_count(page)) {
8142  			if (PageBuddy(page))
8143  				iter += (1 << page_order(page)) - 1;
8144  			continue;
8145  		}
8146  
8147  		/*
8148  		 * The HWPoisoned page may be not in buddy system, and
8149  		 * page_count() is not 0.
8150  		 */
8151  		if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
8152  			continue;
8153  
8154  		if (__PageMovable(page))
8155  			continue;
8156  
8157  		if (!PageLRU(page))
8158  			found++;
8159  		/*
8160  		 * If there are RECLAIMABLE pages, we need to check
8161  		 * it.  But now, memory offline itself doesn't call
8162  		 * shrink_node_slabs() and it still to be fixed.
8163  		 */
8164  		/*
8165  		 * If the page is not RAM, page_count()should be 0.
8166  		 * we don't need more check. This is an _used_ not-movable page.
8167  		 *
8168  		 * The problematic thing here is PG_reserved pages. PG_reserved
8169  		 * is set to both of a memory hole page and a _used_ kernel
8170  		 * page at boot.
8171  		 */
8172  		if (found > count)
8173  			goto unmovable;
8174  	}
8175  	return false;
8176  unmovable:
8177  	WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
8178  	if (flags & REPORT_FAILURE)
8179  		dump_page(pfn_to_page(pfn + iter), reason);
8180  	return true;
8181  }
8182  
8183  #ifdef CONFIG_CONTIG_ALLOC
8184  static unsigned long pfn_max_align_down(unsigned long pfn)
8185  {
8186  	return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
8187  			     pageblock_nr_pages) - 1);
8188  }
8189  
8190  static unsigned long pfn_max_align_up(unsigned long pfn)
8191  {
8192  	return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
8193  				pageblock_nr_pages));
8194  }
8195  
8196  /* [start, end) must belong to a single zone. */
8197  static int __alloc_contig_migrate_range(struct compact_control *cc,
8198  					unsigned long start, unsigned long end)
8199  {
8200  	/* This function is based on compact_zone() from compaction.c. */
8201  	unsigned long nr_reclaimed;
8202  	unsigned long pfn = start;
8203  	unsigned int tries = 0;
8204  	int ret = 0;
8205  
8206  	migrate_prep();
8207  
8208  	while (pfn < end || !list_empty(&cc->migratepages)) {
8209  		if (fatal_signal_pending(current)) {
8210  			ret = -EINTR;
8211  			break;
8212  		}
8213  
8214  		if (list_empty(&cc->migratepages)) {
8215  			cc->nr_migratepages = 0;
8216  			pfn = isolate_migratepages_range(cc, pfn, end);
8217  			if (!pfn) {
8218  				ret = -EINTR;
8219  				break;
8220  			}
8221  			tries = 0;
8222  		} else if (++tries == 5) {
8223  			ret = ret < 0 ? ret : -EBUSY;
8224  			break;
8225  		}
8226  
8227  		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
8228  							&cc->migratepages);
8229  		cc->nr_migratepages -= nr_reclaimed;
8230  
8231  		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
8232  				    NULL, 0, cc->mode, MR_CONTIG_RANGE);
8233  	}
8234  	if (ret < 0) {
8235  		putback_movable_pages(&cc->migratepages);
8236  		return ret;
8237  	}
8238  	return 0;
8239  }
8240  
8241  /**
8242   * alloc_contig_range() -- tries to allocate given range of pages
8243   * @start:	start PFN to allocate
8244   * @end:	one-past-the-last PFN to allocate
8245   * @migratetype:	migratetype of the underlaying pageblocks (either
8246   *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
8247   *			in range must have the same migratetype and it must
8248   *			be either of the two.
8249   * @gfp_mask:	GFP mask to use during compaction
8250   *
8251   * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
8252   * aligned.  The PFN range must belong to a single zone.
8253   *
8254   * The first thing this routine does is attempt to MIGRATE_ISOLATE all
8255   * pageblocks in the range.  Once isolated, the pageblocks should not
8256   * be modified by others.
8257   *
8258   * Return: zero on success or negative error code.  On success all
8259   * pages which PFN is in [start, end) are allocated for the caller and
8260   * need to be freed with free_contig_range().
8261   */
8262  int alloc_contig_range(unsigned long start, unsigned long end,
8263  		       unsigned migratetype, gfp_t gfp_mask)
8264  {
8265  	unsigned long outer_start, outer_end;
8266  	unsigned int order;
8267  	int ret = 0;
8268  
8269  	struct compact_control cc = {
8270  		.nr_migratepages = 0,
8271  		.order = -1,
8272  		.zone = page_zone(pfn_to_page(start)),
8273  		.mode = MIGRATE_SYNC,
8274  		.ignore_skip_hint = true,
8275  		.no_set_skip_hint = true,
8276  		.gfp_mask = current_gfp_context(gfp_mask),
8277  	};
8278  	INIT_LIST_HEAD(&cc.migratepages);
8279  
8280  	/*
8281  	 * What we do here is we mark all pageblocks in range as
8282  	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
8283  	 * have different sizes, and due to the way page allocator
8284  	 * work, we align the range to biggest of the two pages so
8285  	 * that page allocator won't try to merge buddies from
8286  	 * different pageblocks and change MIGRATE_ISOLATE to some
8287  	 * other migration type.
8288  	 *
8289  	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
8290  	 * migrate the pages from an unaligned range (ie. pages that
8291  	 * we are interested in).  This will put all the pages in
8292  	 * range back to page allocator as MIGRATE_ISOLATE.
8293  	 *
8294  	 * When this is done, we take the pages in range from page
8295  	 * allocator removing them from the buddy system.  This way
8296  	 * page allocator will never consider using them.
8297  	 *
8298  	 * This lets us mark the pageblocks back as
8299  	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
8300  	 * aligned range but not in the unaligned, original range are
8301  	 * put back to page allocator so that buddy can use them.
8302  	 */
8303  
8304  	ret = start_isolate_page_range(pfn_max_align_down(start),
8305  				       pfn_max_align_up(end), migratetype, 0);
8306  	if (ret < 0)
8307  		return ret;
8308  
8309  	/*
8310  	 * In case of -EBUSY, we'd like to know which page causes problem.
8311  	 * So, just fall through. test_pages_isolated() has a tracepoint
8312  	 * which will report the busy page.
8313  	 *
8314  	 * It is possible that busy pages could become available before
8315  	 * the call to test_pages_isolated, and the range will actually be
8316  	 * allocated.  So, if we fall through be sure to clear ret so that
8317  	 * -EBUSY is not accidentally used or returned to caller.
8318  	 */
8319  	ret = __alloc_contig_migrate_range(&cc, start, end);
8320  	if (ret && ret != -EBUSY)
8321  		goto done;
8322  	ret =0;
8323  
8324  	/*
8325  	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
8326  	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
8327  	 * more, all pages in [start, end) are free in page allocator.
8328  	 * What we are going to do is to allocate all pages from
8329  	 * [start, end) (that is remove them from page allocator).
8330  	 *
8331  	 * The only problem is that pages at the beginning and at the
8332  	 * end of interesting range may be not aligned with pages that
8333  	 * page allocator holds, ie. they can be part of higher order
8334  	 * pages.  Because of this, we reserve the bigger range and
8335  	 * once this is done free the pages we are not interested in.
8336  	 *
8337  	 * We don't have to hold zone->lock here because the pages are
8338  	 * isolated thus they won't get removed from buddy.
8339  	 */
8340  
8341  	lru_add_drain_all();
8342  
8343  	order = 0;
8344  	outer_start = start;
8345  	while (!PageBuddy(pfn_to_page(outer_start))) {
8346  		if (++order >= MAX_ORDER) {
8347  			outer_start = start;
8348  			break;
8349  		}
8350  		outer_start &= ~0UL << order;
8351  	}
8352  
8353  	if (outer_start != start) {
8354  		order = page_order(pfn_to_page(outer_start));
8355  
8356  		/*
8357  		 * outer_start page could be small order buddy page and
8358  		 * it doesn't include start page. Adjust outer_start
8359  		 * in this case to report failed page properly
8360  		 * on tracepoint in test_pages_isolated()
8361  		 */
8362  		if (outer_start + (1UL << order) <= start)
8363  			outer_start = start;
8364  	}
8365  
8366  	/* Make sure the range is really isolated. */
8367  	if (test_pages_isolated(outer_start, end, false)) {
8368  		pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
8369  			__func__, outer_start, end);
8370  		ret = -EBUSY;
8371  		goto done;
8372  	}
8373  
8374  	/* Grab isolated pages from freelists. */
8375  	outer_end = isolate_freepages_range(&cc, outer_start, end);
8376  	if (!outer_end) {
8377  		ret = -EBUSY;
8378  		goto done;
8379  	}
8380  
8381  	/* Free head and tail (if any) */
8382  	if (start != outer_start)
8383  		free_contig_range(outer_start, start - outer_start);
8384  	if (end != outer_end)
8385  		free_contig_range(end, outer_end - end);
8386  
8387  done:
8388  	undo_isolate_page_range(pfn_max_align_down(start),
8389  				pfn_max_align_up(end), migratetype);
8390  	return ret;
8391  }
8392  #endif /* CONFIG_CONTIG_ALLOC */
8393  
8394  void free_contig_range(unsigned long pfn, unsigned int nr_pages)
8395  {
8396  	unsigned int count = 0;
8397  
8398  	for (; nr_pages--; pfn++) {
8399  		struct page *page = pfn_to_page(pfn);
8400  
8401  		count += page_count(page) != 1;
8402  		__free_page(page);
8403  	}
8404  	WARN(count != 0, "%d pages are still in use!\n", count);
8405  }
8406  
8407  #ifdef CONFIG_MEMORY_HOTPLUG
8408  /*
8409   * The zone indicated has a new number of managed_pages; batch sizes and percpu
8410   * page high values need to be recalulated.
8411   */
8412  void __meminit zone_pcp_update(struct zone *zone)
8413  {
8414  	unsigned cpu;
8415  	mutex_lock(&pcp_batch_high_lock);
8416  	for_each_possible_cpu(cpu)
8417  		pageset_set_high_and_batch(zone,
8418  				per_cpu_ptr(zone->pageset, cpu));
8419  	mutex_unlock(&pcp_batch_high_lock);
8420  }
8421  #endif
8422  
8423  void zone_pcp_reset(struct zone *zone)
8424  {
8425  	unsigned long flags;
8426  	int cpu;
8427  	struct per_cpu_pageset *pset;
8428  
8429  	/* avoid races with drain_pages()  */
8430  	local_irq_save(flags);
8431  	if (zone->pageset != &boot_pageset) {
8432  		for_each_online_cpu(cpu) {
8433  			pset = per_cpu_ptr(zone->pageset, cpu);
8434  			drain_zonestat(zone, pset);
8435  		}
8436  		free_percpu(zone->pageset);
8437  		zone->pageset = &boot_pageset;
8438  	}
8439  	local_irq_restore(flags);
8440  }
8441  
8442  #ifdef CONFIG_MEMORY_HOTREMOVE
8443  /*
8444   * All pages in the range must be in a single zone and isolated
8445   * before calling this.
8446   */
8447  unsigned long
8448  __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
8449  {
8450  	struct page *page;
8451  	struct zone *zone;
8452  	unsigned int order, i;
8453  	unsigned long pfn;
8454  	unsigned long flags;
8455  	unsigned long offlined_pages = 0;
8456  
8457  	/* find the first valid pfn */
8458  	for (pfn = start_pfn; pfn < end_pfn; pfn++)
8459  		if (pfn_valid(pfn))
8460  			break;
8461  	if (pfn == end_pfn)
8462  		return offlined_pages;
8463  
8464  	offline_mem_sections(pfn, end_pfn);
8465  	zone = page_zone(pfn_to_page(pfn));
8466  	spin_lock_irqsave(&zone->lock, flags);
8467  	pfn = start_pfn;
8468  	while (pfn < end_pfn) {
8469  		if (!pfn_valid(pfn)) {
8470  			pfn++;
8471  			continue;
8472  		}
8473  		page = pfn_to_page(pfn);
8474  		/*
8475  		 * The HWPoisoned page may be not in buddy system, and
8476  		 * page_count() is not 0.
8477  		 */
8478  		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
8479  			pfn++;
8480  			SetPageReserved(page);
8481  			offlined_pages++;
8482  			continue;
8483  		}
8484  
8485  		BUG_ON(page_count(page));
8486  		BUG_ON(!PageBuddy(page));
8487  		order = page_order(page);
8488  		offlined_pages += 1 << order;
8489  #ifdef CONFIG_DEBUG_VM
8490  		pr_info("remove from free list %lx %d %lx\n",
8491  			pfn, 1 << order, end_pfn);
8492  #endif
8493  		del_page_from_free_area(page, &zone->free_area[order]);
8494  		for (i = 0; i < (1 << order); i++)
8495  			SetPageReserved((page+i));
8496  		pfn += (1 << order);
8497  	}
8498  	spin_unlock_irqrestore(&zone->lock, flags);
8499  
8500  	return offlined_pages;
8501  }
8502  #endif
8503  
8504  bool is_free_buddy_page(struct page *page)
8505  {
8506  	struct zone *zone = page_zone(page);
8507  	unsigned long pfn = page_to_pfn(page);
8508  	unsigned long flags;
8509  	unsigned int order;
8510  
8511  	spin_lock_irqsave(&zone->lock, flags);
8512  	for (order = 0; order < MAX_ORDER; order++) {
8513  		struct page *page_head = page - (pfn & ((1 << order) - 1));
8514  
8515  		if (PageBuddy(page_head) && page_order(page_head) >= order)
8516  			break;
8517  	}
8518  	spin_unlock_irqrestore(&zone->lock, flags);
8519  
8520  	return order < MAX_ORDER;
8521  }
8522  
8523  #ifdef CONFIG_MEMORY_FAILURE
8524  /*
8525   * Set PG_hwpoison flag if a given page is confirmed to be a free page.  This
8526   * test is performed under the zone lock to prevent a race against page
8527   * allocation.
8528   */
8529  bool set_hwpoison_free_buddy_page(struct page *page)
8530  {
8531  	struct zone *zone = page_zone(page);
8532  	unsigned long pfn = page_to_pfn(page);
8533  	unsigned long flags;
8534  	unsigned int order;
8535  	bool hwpoisoned = false;
8536  
8537  	spin_lock_irqsave(&zone->lock, flags);
8538  	for (order = 0; order < MAX_ORDER; order++) {
8539  		struct page *page_head = page - (pfn & ((1 << order) - 1));
8540  
8541  		if (PageBuddy(page_head) && page_order(page_head) >= order) {
8542  			if (!TestSetPageHWPoison(page))
8543  				hwpoisoned = true;
8544  			break;
8545  		}
8546  	}
8547  	spin_unlock_irqrestore(&zone->lock, flags);
8548  
8549  	return hwpoisoned;
8550  }
8551  #endif
8552