1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/mm/page_alloc.c 4 * 5 * Manages the free list, the system allocates free pages here. 6 * Note that kmalloc() lives in slab.c 7 * 8 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 9 * Swap reorganised 29.12.95, Stephen Tweedie 10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 11 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 12 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 13 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 14 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 15 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 16 */ 17 18 #include <linux/stddef.h> 19 #include <linux/mm.h> 20 #include <linux/highmem.h> 21 #include <linux/swap.h> 22 #include <linux/interrupt.h> 23 #include <linux/pagemap.h> 24 #include <linux/jiffies.h> 25 #include <linux/memblock.h> 26 #include <linux/compiler.h> 27 #include <linux/kernel.h> 28 #include <linux/kasan.h> 29 #include <linux/module.h> 30 #include <linux/suspend.h> 31 #include <linux/pagevec.h> 32 #include <linux/blkdev.h> 33 #include <linux/slab.h> 34 #include <linux/ratelimit.h> 35 #include <linux/oom.h> 36 #include <linux/topology.h> 37 #include <linux/sysctl.h> 38 #include <linux/cpu.h> 39 #include <linux/cpuset.h> 40 #include <linux/memory_hotplug.h> 41 #include <linux/nodemask.h> 42 #include <linux/vmalloc.h> 43 #include <linux/vmstat.h> 44 #include <linux/mempolicy.h> 45 #include <linux/memremap.h> 46 #include <linux/stop_machine.h> 47 #include <linux/random.h> 48 #include <linux/sort.h> 49 #include <linux/pfn.h> 50 #include <linux/backing-dev.h> 51 #include <linux/fault-inject.h> 52 #include <linux/page-isolation.h> 53 #include <linux/debugobjects.h> 54 #include <linux/kmemleak.h> 55 #include <linux/compaction.h> 56 #include <trace/events/kmem.h> 57 #include <trace/events/oom.h> 58 #include <linux/prefetch.h> 59 #include <linux/mm_inline.h> 60 #include <linux/mmu_notifier.h> 61 #include <linux/migrate.h> 62 #include <linux/hugetlb.h> 63 #include <linux/sched/rt.h> 64 #include <linux/sched/mm.h> 65 #include <linux/page_owner.h> 66 #include <linux/kthread.h> 67 #include <linux/memcontrol.h> 68 #include <linux/ftrace.h> 69 #include <linux/lockdep.h> 70 #include <linux/nmi.h> 71 #include <linux/psi.h> 72 #include <linux/padata.h> 73 #include <linux/khugepaged.h> 74 #include <linux/buffer_head.h> 75 76 #include <asm/sections.h> 77 #include <asm/tlbflush.h> 78 #include <asm/div64.h> 79 #include "internal.h" 80 #include "shuffle.h" 81 #include "page_reporting.h" 82 83 /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ 84 typedef int __bitwise fpi_t; 85 86 /* No special request */ 87 #define FPI_NONE ((__force fpi_t)0) 88 89 /* 90 * Skip free page reporting notification for the (possibly merged) page. 91 * This does not hinder free page reporting from grabbing the page, 92 * reporting it and marking it "reported" - it only skips notifying 93 * the free page reporting infrastructure about a newly freed page. For 94 * example, used when temporarily pulling a page from a freelist and 95 * putting it back unmodified. 96 */ 97 #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) 98 99 /* 100 * Place the (possibly merged) page to the tail of the freelist. Will ignore 101 * page shuffling (relevant code - e.g., memory onlining - is expected to 102 * shuffle the whole zone). 
103 * 104 * Note: No code should rely on this flag for correctness - it's purely 105 * to allow for optimizations when handing back either fresh pages 106 * (memory onlining) or untouched pages (page isolation, free page 107 * reporting). 108 */ 109 #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) 110 111 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 112 static DEFINE_MUTEX(pcp_batch_high_lock); 113 #define MIN_PERCPU_PAGELIST_FRACTION (8) 114 115 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 116 DEFINE_PER_CPU(int, numa_node); 117 EXPORT_PER_CPU_SYMBOL(numa_node); 118 #endif 119 120 DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); 121 122 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 123 /* 124 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 125 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 126 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 127 * defined in <linux/topology.h>. 128 */ 129 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 130 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 131 #endif 132 133 /* work_structs for global per-cpu drains */ 134 struct pcpu_drain { 135 struct zone *zone; 136 struct work_struct work; 137 }; 138 static DEFINE_MUTEX(pcpu_drain_mutex); 139 static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); 140 141 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 142 volatile unsigned long latent_entropy __latent_entropy; 143 EXPORT_SYMBOL(latent_entropy); 144 #endif 145 146 /* 147 * Array of node states. 148 */ 149 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 150 [N_POSSIBLE] = NODE_MASK_ALL, 151 [N_ONLINE] = { { [0] = 1UL } }, 152 #ifndef CONFIG_NUMA 153 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 154 #ifdef CONFIG_HIGHMEM 155 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 156 #endif 157 [N_MEMORY] = { { [0] = 1UL } }, 158 [N_CPU] = { { [0] = 1UL } }, 159 #endif /* NUMA */ 160 }; 161 EXPORT_SYMBOL(node_states); 162 163 atomic_long_t _totalram_pages __read_mostly; 164 EXPORT_SYMBOL(_totalram_pages); 165 unsigned long totalreserve_pages __read_mostly; 166 unsigned long totalcma_pages __read_mostly; 167 168 int percpu_pagelist_fraction; 169 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 170 DEFINE_STATIC_KEY_FALSE(init_on_alloc); 171 EXPORT_SYMBOL(init_on_alloc); 172 173 DEFINE_STATIC_KEY_FALSE(init_on_free); 174 EXPORT_SYMBOL(init_on_free); 175 176 static bool _init_on_alloc_enabled_early __read_mostly 177 = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); 178 static int __init early_init_on_alloc(char *buf) 179 { 180 181 return kstrtobool(buf, &_init_on_alloc_enabled_early); 182 } 183 early_param("init_on_alloc", early_init_on_alloc); 184 185 static bool _init_on_free_enabled_early __read_mostly 186 = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); 187 static int __init early_init_on_free(char *buf) 188 { 189 return kstrtobool(buf, &_init_on_free_enabled_early); 190 } 191 early_param("init_on_free", early_init_on_free); 192 193 /* 194 * A cached value of the page's pageblock's migratetype, used when the page is 195 * put on a pcplist. Used to avoid the pageblock migratetype lookup when 196 * freeing from pcplists in most cases, at the cost of possibly becoming stale. 197 * Also the migratetype set in the page does not necessarily match the pcplist 198 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any 199 * other index - this ensures that it will be put on the correct CMA freelist. 
200 */ 201 static inline int get_pcppage_migratetype(struct page *page) 202 { 203 return page->index; 204 } 205 206 static inline void set_pcppage_migratetype(struct page *page, int migratetype) 207 { 208 page->index = migratetype; 209 } 210 211 #ifdef CONFIG_PM_SLEEP 212 /* 213 * The following functions are used by the suspend/hibernate code to temporarily 214 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 215 * while devices are suspended. To avoid races with the suspend/hibernate code, 216 * they should always be called with system_transition_mutex held 217 * (gfp_allowed_mask also should only be modified with system_transition_mutex 218 * held, unless the suspend/hibernate code is guaranteed not to run in parallel 219 * with that modification). 220 */ 221 222 static gfp_t saved_gfp_mask; 223 224 void pm_restore_gfp_mask(void) 225 { 226 WARN_ON(!mutex_is_locked(&system_transition_mutex)); 227 if (saved_gfp_mask) { 228 gfp_allowed_mask = saved_gfp_mask; 229 saved_gfp_mask = 0; 230 } 231 } 232 233 void pm_restrict_gfp_mask(void) 234 { 235 WARN_ON(!mutex_is_locked(&system_transition_mutex)); 236 WARN_ON(saved_gfp_mask); 237 saved_gfp_mask = gfp_allowed_mask; 238 gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); 239 } 240 241 bool pm_suspended_storage(void) 242 { 243 if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) 244 return false; 245 return true; 246 } 247 #endif /* CONFIG_PM_SLEEP */ 248 249 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 250 unsigned int pageblock_order __read_mostly; 251 #endif 252 253 static void __free_pages_ok(struct page *page, unsigned int order, 254 fpi_t fpi_flags); 255 256 /* 257 * results with 256, 32 in the lowmem_reserve sysctl: 258 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 259 * 1G machine -> (16M dma, 784M normal, 224M high) 260 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 261 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 262 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 263 * 264 * TBD: should special case ZONE_DMA32 machines here - in those we normally 265 * don't need any ZONE_NORMAL reservation 266 */ 267 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { 268 #ifdef CONFIG_ZONE_DMA 269 [ZONE_DMA] = 256, 270 #endif 271 #ifdef CONFIG_ZONE_DMA32 272 [ZONE_DMA32] = 256, 273 #endif 274 [ZONE_NORMAL] = 32, 275 #ifdef CONFIG_HIGHMEM 276 [ZONE_HIGHMEM] = 0, 277 #endif 278 [ZONE_MOVABLE] = 0, 279 }; 280 281 static char * const zone_names[MAX_NR_ZONES] = { 282 #ifdef CONFIG_ZONE_DMA 283 "DMA", 284 #endif 285 #ifdef CONFIG_ZONE_DMA32 286 "DMA32", 287 #endif 288 "Normal", 289 #ifdef CONFIG_HIGHMEM 290 "HighMem", 291 #endif 292 "Movable", 293 #ifdef CONFIG_ZONE_DEVICE 294 "Device", 295 #endif 296 }; 297 298 const char * const migratetype_names[MIGRATE_TYPES] = { 299 "Unmovable", 300 "Movable", 301 "Reclaimable", 302 "HighAtomic", 303 #ifdef CONFIG_CMA 304 "CMA", 305 #endif 306 #ifdef CONFIG_MEMORY_ISOLATION 307 "Isolate", 308 #endif 309 }; 310 311 compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { 312 [NULL_COMPOUND_DTOR] = NULL, 313 [COMPOUND_PAGE_DTOR] = free_compound_page, 314 #ifdef CONFIG_HUGETLB_PAGE 315 [HUGETLB_PAGE_DTOR] = free_huge_page, 316 #endif 317 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 318 [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, 319 #endif 320 }; 321 322 int min_free_kbytes = 1024; 323 int user_min_free_kbytes = -1; 324 #ifdef CONFIG_DISCONTIGMEM 325 /* 326 * DiscontigMem defines memory 
ranges as separate pg_data_t, even if the ranges
 * are not on separate NUMA nodes. Functionally this works, but with
 * watermark_boost_factor it can reclaim prematurely as the ranges can be
 * quite small. By default, do not boost watermarks on discontigmem as in
 * many cases very high-order allocations like THP are likely to be
 * unsupported and the premature reclaim offsets the advantage of long-term
 * fragmentation avoidance.
 */
int watermark_boost_factor __read_mostly;
#else
int watermark_boost_factor __read_mostly = 15000;
#endif
int watermark_scale_factor = 10;

static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;

static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);

#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
static DEFINE_STATIC_KEY_TRUE(deferred_pages);

/*
 * Call kasan_free_pages() only after deferred memory initialization
 * has completed. Poisoning pages during deferred memory init will greatly
 * lengthen the process and cause problems on large memory systems, as the
 * deferred pages initialization is done with interrupts disabled.
 *
 * Assuming that there will be no reference to those newly initialized
 * pages before they are ever allocated, this should have no effect on
 * KASAN memory tracking as the poison will be properly inserted at page
 * allocation time. The only corner case is when pages are allocated by
 * on-demand allocation and then freed again before the deferred pages
 * initialization is done, but this is not likely to happen.
 */
static inline void kasan_free_nondeferred_pages(struct page *page, int order)
{
	if (!static_branch_unlikely(&deferred_pages))
		kasan_free_pages(page, order);
}

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}
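/*
 * Illustrative example (not part of the original source): assume node 0
 * recorded first_deferred_pfn == 0x40000, i.e. early init covered the first
 * 1GiB with 4KiB pages. Then early_page_uninitialised(0x3ffff) is false and
 * that struct page may be used, while early_page_uninitialised(0x40000) is
 * true and that struct page must not be touched until deferred init runs.
 */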
/*
 * Returns true when the remaining initialisation should be deferred until
 * later in the boot cycle, when it can be parallelised.
 */
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	static unsigned long prev_end_pfn, nr_initialised;

	/*
	 * prev_end_pfn is a static that holds the end of the previous zone.
	 * No need to protect it because this is called very early in boot,
	 * before smp_init().
	 */
	if (prev_end_pfn != end_pfn) {
		prev_end_pfn = end_pfn;
		nr_initialised = 0;
	}

	/* Always populate low zones for address-constrained allocations */
	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
		return false;

	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
		return true;
	/*
	 * We start with only one section of pages; more pages are added as
	 * needed until the rest of the deferred pages are initialized.
	 */
	nr_initialised++;
	if ((nr_initialised > PAGES_PER_SECTION) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		NODE_DATA(nid)->first_deferred_pfn = pfn;
		return true;
	}
	return false;
}
#else
#define kasan_free_nondeferred_pages(p, o)	kasan_free_pages(p, o)

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	return false;
}
#endif

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return section_to_usemap(__pfn_to_section(pfn));
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
#else
	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
#endif /* CONFIG_SPARSEMEM */
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}

static __always_inline
unsigned long __get_pfnblock_flags_mask(struct page *page,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	word = bitmap[word_bitidx];
	return (word >> bitidx) & mask;
}

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
					unsigned long mask)
{
	return __get_pfnblock_flags_mask(page, pfn, mask);
}

static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
	return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
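/*
 * Worked example (illustrative, not part of the original source; assumes
 * pageblock_order == 9 and NR_PAGEBLOCK_BITS == 4): a pfn that is 1536 pages
 * into its section lies in pageblock 3 of that section, so pfn_to_bitidx()
 * returns 3 * 4 = 12, i.e. word_bitidx 0 with an in-word shift of 12 on a
 * 64-bit machine. The block's migratetype is then read back with
 * __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK).
 */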
/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long old_word, word;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	mask <<= bitidx;
	flags <<= bitidx;

	word = READ_ONCE(bitmap[word_bitidx]);
	for (;;) {
		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
		if (word == old_word)
			break;
		word = old_word;
	}
}

void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pfnblock_flags_mask(page, (unsigned long)migratetype,
				page_to_pfn(page), MIGRATETYPE_MASK);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert(
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	__dump_page(page, reason);
	dump_page_owner(page);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
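/*
 * Illustrative timeline for the rate limiting above (not part of the
 * original source): the first bad-page report arms resume = jiffies + 60*HZ.
 * Up to 60 reports print within that minute; any further bad pages in the
 * same minute only increment nr_unshown. The first report after the minute
 * expires prints a "%lu messages suppressed" line and the burst restarts.
 */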
/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits are a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset in the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

void free_compound_page(struct page *page)
{
	mem_cgroup_uncharge(page);
	__free_pages_ok(page, compound_order(page), FPI_NONE);
}

void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;
		set_page_count(p, 0);
		p->mapping = TAIL_MAPPING;
		set_compound_head(p, page);
	}

	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	set_compound_order(page, order);
	atomic_set(compound_mapcount_ptr(page), -1);
	if (hpage_pincount_available(page))
		atomic_set(compound_pincount_ptr(page), 0);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;

bool _debug_pagealloc_enabled_early __read_mostly
			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
EXPORT_SYMBOL(_debug_pagealloc_enabled);

DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

static int __init early_debug_pagealloc(char *buf)
{
	return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);

static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
		pr_err("Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);

static inline bool set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return false;

	if (order >= debug_guardpage_minorder())
		return false;

	__SetPageGuard(page);
	INIT_LIST_HEAD(&page->lru);
	set_page_private(page, order);
	/* Guard pages are not available for any usage */
	__mod_zone_freepage_state(zone, -(1 << order), migratetype);

	return true;
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return;

	__ClearPageGuard(page);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
#endif
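/*
 * Illustrative example (not part of the original source): after
 * prep_compound_page(page, 2) on a 4-page block, page 0 has PG_head set,
 * pages 1-3 have refcount 0, ->mapping == TAIL_MAPPING and
 * ->compound_head == (unsigned long)page | 1 (bit 0 marks PageTail), and the
 * first tail page records COMPOUND_PAGE_DTOR and the order (2 here), which
 * compound_order() and free_compound_page() later read back.
 */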
/*
 * Enable static keys related to various memory debugging and hardening options.
 * Some override others, and depend on early params that are evaluated in the
 * order of appearance. So we need to first gather the full picture of what was
 * enabled, and then make decisions.
 */
void init_mem_debugging_and_hardening(void)
{
	if (_init_on_alloc_enabled_early) {
		if (page_poisoning_enabled())
			pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
				"will take precedence over init_on_alloc\n");
		else
			static_branch_enable(&init_on_alloc);
	}
	if (_init_on_free_enabled_early) {
		if (page_poisoning_enabled())
			pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
				"will take precedence over init_on_free\n");
		else
			static_branch_enable(&init_on_free);
	}

#ifdef CONFIG_PAGE_POISONING
	/*
	 * Page poisoning is debug page alloc for some arches. If either
	 * of those options is enabled, enable poisoning.
	 */
	if (page_poisoning_enabled() ||
	     (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
	      debug_pagealloc_enabled()))
		static_branch_enable(&_page_poisoning_enabled);
#endif

#ifdef CONFIG_DEBUG_PAGEALLOC
	if (!debug_pagealloc_enabled())
		return;

	static_branch_enable(&_debug_pagealloc_enabled);

	if (!debug_guardpage_minorder())
		return;

	static_branch_enable(&_debug_guardpage_enabled);
#endif
}

static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) the page and its buddy have the same order &&
 * (d) the page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set PageBuddy.
 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 *
 * For recording a page's order, we use page_private(page).
 */
static inline bool page_is_buddy(struct page *page, struct page *buddy,
							unsigned int order)
{
	if (!page_is_guard(buddy) && !PageBuddy(buddy))
		return false;

	if (buddy_order(buddy) != order)
		return false;

	/*
	 * zone check is done late to avoid uselessly calculating
	 * zone/node ids for pages that could never merge.
	 */
	if (page_zone_id(page) != page_zone_id(buddy))
		return false;

	VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

	return true;
}

#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
	struct capture_control *capc = current->capture_control;

	return unlikely(capc) &&
		!(current->flags & PF_KTHREAD) &&
		!capc->page &&
		capc->cc->zone == zone ? capc : NULL;
}
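/*
 * Illustrative note (not part of the original source): capture works as
 * follows. A task direct-compacting for, say, an order-9 allocation
 * publishes a capture_control via current->capture_control; when
 * __free_one_page() is about to merge a block of exactly that order in the
 * zone being compacted, compaction_capture() below hands the page straight
 * to the compacting task instead of returning it to the freelists, saving a
 * round trip through the allocator.
 */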
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	if (!capc || order != capc->cc->order)
		return false;

	/* Do not accidentally pollute CMA or isolated regions */
	if (is_migrate_cma(migratetype) ||
	    is_migrate_isolate(migratetype))
		return false;

	/*
	 * Do not let lower order allocations pollute a movable pageblock.
	 * This might let an unmovable request use a reclaimable pageblock
	 * and vice-versa but no more than normal fallback logic which can
	 * have trouble finding a high-order free page.
	 */
	if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
		return false;

	capc->page = page;
	return true;
}

#else
static inline struct capture_control *task_capc(struct zone *zone)
{
	return NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

/* Used for pages not on another list */
static inline void add_to_free_list(struct page *page, struct zone *zone,
				    unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_add(&page->lru, &area->free_list[migratetype]);
	area->nr_free++;
}

/* Used for pages not on another list */
static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
					 unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_add_tail(&page->lru, &area->free_list[migratetype]);
	area->nr_free++;
}

/*
 * Used for pages which are on another list. Move the pages to the tail
 * of the list - so the moved pages won't immediately be considered for
 * allocation again (e.g., optimization for memory onlining).
 */
static inline void move_to_free_list(struct page *page, struct zone *zone,
				     unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_move_tail(&page->lru, &area->free_list[migratetype]);
}

static inline void del_page_from_free_list(struct page *page, struct zone *zone,
					   unsigned int order)
{
	/* clear reported state and update reported page count */
	if (page_reported(page))
		__ClearPageReported(page);

	list_del(&page->lru);
	__ClearPageBuddy(page);
	set_page_private(page, 0);
	zone->free_area[order].nr_free--;
}

/*
 * If this is not the largest possible page, check if the buddy
 * of the next-highest order is free. If it is, it's possible
 * that pages are being freed that will coalesce soon. In case that
 * is happening, add the free page to the tail of the list
 * so it's less likely to be used soon and more likely to be merged
 * as a higher order page.
 */
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
		   struct page *page, unsigned int order)
{
	struct page *higher_page, *higher_buddy;
	unsigned long combined_pfn;

	if (order >= MAX_ORDER - 2)
		return false;

	if (!pfn_valid_within(buddy_pfn))
		return false;

	combined_pfn = buddy_pfn & pfn;
	higher_page = page + (combined_pfn - pfn);
	buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
	higher_buddy = higher_page + (buddy_pfn - combined_pfn);

	return pfn_valid_within(buddy_pfn) &&
	       page_is_buddy(higher_page, higher_buddy, order + 1);
}
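/*
 * Worked example for the helper above (illustrative, not part of the
 * original source): an order-0 page at pfn 0x1001 is freed while its buddy
 * at 0x1000 is still allocated, so no merge happens now. The combined
 * order-1 block would start at 0x1000 and its order-1 buddy is the block at
 * 0x1002; if that block is already a free order-1 buddy, a merge is likely
 * once 0x1000 is freed, so the page is queued at the tail of the order-0
 * freelist.
 */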
/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are the heads of contiguous
 * runs of free pages of length (1 << order), marked with PageBuddy.
 * A page's order is recorded in the page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype, fpi_t fpi_flags)
{
	struct capture_control *capc = task_capc(zone);
	unsigned long buddy_pfn;
	unsigned long combined_pfn;
	unsigned int max_order;
	struct page *buddy;
	bool to_tail;

	max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (likely(!is_migrate_isolate(migratetype)))
		__mod_zone_freepage_state(zone, 1 << order, migratetype);

	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

continue_merging:
	while (order < max_order) {
		if (compaction_capture(capc, page, order, migratetype)) {
			__mod_zone_freepage_state(zone, -(1 << order),
								migratetype);
			return;
		}
		buddy_pfn = __find_buddy_pfn(pfn, order);
		buddy = page + (buddy_pfn - pfn);

		if (!pfn_valid_within(buddy_pfn))
			goto done_merging;
		if (!page_is_buddy(page, buddy, order))
			goto done_merging;
		/*
		 * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard
		 * page; merge with it and move up one order.
		 */
		if (page_is_guard(buddy))
			clear_page_guard(zone, buddy, order, migratetype);
		else
			del_page_from_free_list(buddy, zone, order);
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}
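	/*
	 * Worked example for the merging loop above (illustrative, not part
	 * of the original source): freeing the order-0 page at pfn 0x1003
	 * with its buddy 0x1002 free merges to an order-1 block at 0x1002;
	 * if that block's buddy at 0x1000 is also free, we merge again to an
	 * order-2 block at 0x1000; when the order-2 buddy at 0x1004 is busy,
	 * the loop stops and the block is recorded at order 2.
	 */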
	if (order < MAX_ORDER - 1) {
		/* If we are here, it means order is >= pageblock_order.
		 * We want to prevent merge between freepages on isolate
		 * pageblock and normal pageblock. Without this, pageblock
		 * isolation could cause incorrect freepage or CMA accounting.
		 *
		 * We don't want to hit this code for the more frequent
		 * low-order merging.
		 */
		if (unlikely(has_isolate_pageblock(zone))) {
			int buddy_mt;

			buddy_pfn = __find_buddy_pfn(pfn, order);
			buddy = page + (buddy_pfn - pfn);
			buddy_mt = get_pageblock_migratetype(buddy);

			if (migratetype != buddy_mt
					&& (is_migrate_isolate(migratetype) ||
						is_migrate_isolate(buddy_mt)))
				goto done_merging;
		}
		max_order = order + 1;
		goto continue_merging;
	}

done_merging:
	set_buddy_order(page, order);

	if (fpi_flags & FPI_TO_TAIL)
		to_tail = true;
	else if (is_shuffle_order(order))
		to_tail = shuffle_pick_tail();
	else
		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);

	if (to_tail)
		add_to_free_list_tail(page, zone, order, migratetype);
	else
		add_to_free_list(page, zone, order, migratetype);

	/* Notify page reporting subsystem of freed page */
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		page_reporting_notify_free(order);
}

/*
 * A bad page could be due to a number of fields. Instead of multiple branches,
 * try and check multiple fields with one check. The caller must do a detailed
 * check if necessary.
 */
static inline bool page_expected_state(struct page *page,
					unsigned long check_flags)
{
	if (unlikely(atomic_read(&page->_mapcount) != -1))
		return false;

	if (unlikely((unsigned long)page->mapping |
			page_ref_count(page) |
#ifdef CONFIG_MEMCG
			(unsigned long)page_memcg(page) |
#endif
			(page->flags & check_flags)))
		return false;

	return true;
}

static const char *page_bad_reason(struct page *page, unsigned long flags)
{
	const char *bad_reason = NULL;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(page_ref_count(page) != 0))
		bad_reason = "nonzero _refcount";
	if (unlikely(page->flags & flags)) {
		if (flags == PAGE_FLAGS_CHECK_AT_PREP)
			bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
		else
			bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page_memcg(page)))
		bad_reason = "page still charged to cgroup";
#endif
	return bad_reason;
}

static void check_free_page_bad(struct page *page)
{
	bad_page(page,
		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}

static inline int check_free_page(struct page *page)
{
	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
		return 0;

	/* Something has gone sideways, find it */
	check_free_page_bad(page);
	return 1;
}

static int free_tail_pages_check(struct page *head_page, struct page *page)
{
	int ret = 1;

	/*
	 * We rely on page->lru.next never having bit 0 set, unless the page
	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
	 */
	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);

	if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
		ret = 0;
		goto out;
	}
	switch (page - head_page) {
	case 1:
		/* the first tail page: ->mapping may be compound_mapcount() */
		if (unlikely(compound_mapcount(page))) {
			bad_page(page, "nonzero compound_mapcount");
			goto out;
		}
		break;
	case 2:
		/*
		 * the second tail page: ->mapping is
		 * deferred_list.next -- ignore value.
		 */
		break;
	default:
		if (page->mapping != TAIL_MAPPING) {
			bad_page(page, "corrupted mapping in tail page");
			goto out;
		}
		break;
	}
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set");
		goto out;
	}
	if (unlikely(compound_head(page) != head_page)) {
		bad_page(page, "compound_head not consistent");
		goto out;
	}
	ret = 0;
out:
	page->mapping = NULL;
	clear_compound_head(page);
	return ret;
}

static void kernel_init_free_pages(struct page *page, int numpages)
{
	int i;

	/* s390's use of memset() could override KASAN redzones. */
	kasan_disable_current();
	for (i = 0; i < numpages; i++) {
		u8 tag = page_kasan_tag(page + i);
		page_kasan_tag_reset(page + i);
		clear_highpage(page + i);
		page_kasan_tag_set(page + i, tag);
	}
	kasan_enable_current();
}
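/*
 * Illustrative note (not part of the original source): when the init_on_free
 * static key is enabled (see init_mem_debugging_and_hardening() above),
 * free_pages_prepare() below calls kernel_init_free_pages() so every page is
 * zeroed as it leaves the allocator and stale data does not linger on the
 * freelists; init_on_alloc instead zeroes pages at allocation time.
 */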
static __always_inline bool free_pages_prepare(struct page *page,
					unsigned int order, bool check_free)
{
	int bad = 0;

	VM_BUG_ON_PAGE(PageTail(page), page);

	trace_mm_page_free(page, order);

	if (unlikely(PageHWPoison(page)) && !order) {
		/*
		 * Do not let hwpoison pages hit pcplists/buddy.
		 * Untie memcg state and reset page's owner.
		 */
		if (memcg_kmem_enabled() && PageMemcgKmem(page))
			__memcg_kmem_uncharge_page(page, order);
		reset_page_owner(page, order);
		return false;
	}

	/*
	 * Check tail pages before head page information is cleared to
	 * avoid checking PageCompound for order-0 pages.
	 */
	if (unlikely(order)) {
		bool compound = PageCompound(page);
		int i;

		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

		if (compound)
			ClearPageDoubleMap(page);
		for (i = 1; i < (1 << order); i++) {
			if (compound)
				bad += free_tail_pages_check(page, page + i);
			if (unlikely(check_free_page(page + i))) {
				bad++;
				continue;
			}
			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
		}
	}
	if (PageMappingFlags(page))
		page->mapping = NULL;
	if (memcg_kmem_enabled() && PageMemcgKmem(page))
		__memcg_kmem_uncharge_page(page, order);
	if (check_free)
		bad += check_free_page(page);
	if (bad)
		return false;

	page_cpupid_reset_last(page);
	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	reset_page_owner(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}
	if (want_init_on_free())
		kernel_init_free_pages(page, 1 << order);

	kernel_poison_pages(page, 1 << order);

	/*
	 * With hardware tag-based KASAN, memory tags must be set before the
	 * page becomes unavailable via debug_pagealloc or arch_free_page.
	 */
	kasan_free_nondeferred_pages(page, order);

	/*
	 * arch_free_page() can make the page's contents inaccessible. s390
	 * does this. So nothing which can access the page's contents should
	 * happen after this.
	 */
	arch_free_page(page, order);

	debug_pagealloc_unmap_pages(page, 1 << order);

	return true;
}

#ifdef CONFIG_DEBUG_VM
/*
 * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
 * to the pcp lists. With debug_pagealloc also enabled, they are also rechecked
 * when moved from the pcp lists to the free lists.
 */
static bool free_pcp_prepare(struct page *page)
{
	return free_pages_prepare(page, 0, true);
}

static bool bulkfree_pcp_prepare(struct page *page)
{
	if (debug_pagealloc_enabled_static())
		return check_free_page(page);
	else
		return false;
}
#else
/*
 * With DEBUG_VM disabled, order-0 pages being freed are checked only when
 * moving from the pcp lists to the free lists, in order to reduce overhead.
 * With debug_pagealloc enabled, they are also checked immediately when being
 * freed to the pcp lists.
 */
static bool free_pcp_prepare(struct page *page)
{
	if (debug_pagealloc_enabled_static())
		return free_pages_prepare(page, 0, true);
	else
		return free_pages_prepare(page, 0, false);
}

static bool bulkfree_pcp_prepare(struct page *page)
{
	return check_free_page(page);
}
#endif /* CONFIG_DEBUG_VM */

static inline void prefetch_buddy(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
	struct page *buddy = page + (buddy_pfn - pfn);

	prefetch(buddy);
}

/*
 * Frees a number of pages from the PCP lists.
 * Assumes all pages on the list are in the same zone, and of the same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp)
{
	int migratetype = 0;
	int batch_free = 0;
	int prefetch_nr = READ_ONCE(pcp->batch);
	bool isolated_pageblocks;
	struct page *page, *tmp;
	LIST_HEAD(head);

	/*
	 * Ensure a proper count is passed, which otherwise would get stuck in
	 * the while (list_empty(list)) loop below.
	 */
	count = min(pcp->count, count);
	while (count) {
		struct list_head *list;

		/*
		 * Remove pages from lists in a round-robin fashion. A
		 * batch_free count is maintained that is incremented when an
		 * empty list is encountered. This is so more pages are freed
		 * off fuller lists instead of spinning excessively around
		 * empty lists.
		 */
		do {
			batch_free++;
			if (++migratetype == MIGRATE_PCPTYPES)
				migratetype = 0;
			list = &pcp->lists[migratetype];
		} while (list_empty(list));
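		/*
		 * Illustrative walk-through (not part of the original
		 * source): if the selector above has to skip two empty PCP
		 * lists before finding a populated one, batch_free typically
		 * reaches 3, so up to three pages are then taken from that
		 * fuller list before the round-robin moves on; and when
		 * batch_free reaches MIGRATE_PCPTYPES, every other list is
		 * empty and the whole remaining count is drained from the
		 * only non-empty list, as handled just below.
		 */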
		/* This is the only non-empty list. Free them all. */
		if (batch_free == MIGRATE_PCPTYPES)
			batch_free = count;

		do {
			page = list_last_entry(list, struct page, lru);
			/* must delete to avoid corrupting pcp list */
			list_del(&page->lru);
			pcp->count--;

			if (bulkfree_pcp_prepare(page))
				continue;

			list_add_tail(&page->lru, &head);

			/*
			 * We are going to put the page back to the global
			 * pool, prefetch its buddy to speed up later access
			 * under zone->lock. It is believed the overhead of
			 * an additional test and calculating buddy_pfn here
			 * can be offset by reduced memory latency later. To
			 * avoid excessive prefetching due to large count, only
			 * prefetch buddy for the first pcp->batch nr of pages.
			 */
			if (prefetch_nr) {
				prefetch_buddy(page);
				prefetch_nr--;
			}
		} while (--count && --batch_free && !list_empty(list));
	}

	spin_lock(&zone->lock);
	isolated_pageblocks = has_isolate_pageblock(zone);

	/*
	 * Use safe version since after __free_one_page(),
	 * page->lru.next will not point to original list.
	 */
	list_for_each_entry_safe(page, tmp, &head, lru) {
		int mt = get_pcppage_migratetype(page);
		/* MIGRATE_ISOLATE page should not go to pcplists */
		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
		/* Pageblock could have been isolated meanwhile */
		if (unlikely(isolated_pageblocks))
			mt = get_pageblock_migratetype(page);

		__free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
		trace_mm_page_pcpu_drain(page, 0, mt);
	}
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype, fpi_t fpi_flags)
{
	spin_lock(&zone->lock);
	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
	spin_unlock(&zone->lock);
}

static void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	mm_zero_struct_page(page);
	set_page_links(page, zone, nid, pfn);
	init_page_count(page);
	page_mapcount_reset(page);
	page_cpupid_reset_last(page);
	page_kasan_tag_reset(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __meminit init_reserved_page(unsigned long pfn)
{
	pg_data_t *pgdat;
	int nid, zid;

	if (!early_page_uninitialised(pfn))
		return;

	nid = early_pfn_to_nid(pfn);
	pgdat = NODE_DATA(nid);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
			break;
	}
	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
static inline void init_reserved_page(unsigned long pfn)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/*
 * Initialised pages do not have PageReserved set. This function is
 * called for each range allocated by the bootmem allocator and
 * marks the pages PageReserved.
The remaining valid pages are later 1507 * sent to the buddy page allocator. 1508 */ 1509 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) 1510 { 1511 unsigned long start_pfn = PFN_DOWN(start); 1512 unsigned long end_pfn = PFN_UP(end); 1513 1514 for (; start_pfn < end_pfn; start_pfn++) { 1515 if (pfn_valid(start_pfn)) { 1516 struct page *page = pfn_to_page(start_pfn); 1517 1518 init_reserved_page(start_pfn); 1519 1520 /* Avoid false-positive PageTail() */ 1521 INIT_LIST_HEAD(&page->lru); 1522 1523 /* 1524 * no need for atomic set_bit because the struct 1525 * page is not visible yet so nobody should 1526 * access it yet. 1527 */ 1528 __SetPageReserved(page); 1529 } 1530 } 1531 } 1532 1533 static void __free_pages_ok(struct page *page, unsigned int order, 1534 fpi_t fpi_flags) 1535 { 1536 unsigned long flags; 1537 int migratetype; 1538 unsigned long pfn = page_to_pfn(page); 1539 1540 if (!free_pages_prepare(page, order, true)) 1541 return; 1542 1543 migratetype = get_pfnblock_migratetype(page, pfn); 1544 local_irq_save(flags); 1545 __count_vm_events(PGFREE, 1 << order); 1546 free_one_page(page_zone(page), page, pfn, order, migratetype, 1547 fpi_flags); 1548 local_irq_restore(flags); 1549 } 1550 1551 void __free_pages_core(struct page *page, unsigned int order) 1552 { 1553 unsigned int nr_pages = 1 << order; 1554 struct page *p = page; 1555 unsigned int loop; 1556 1557 /* 1558 * When initializing the memmap, __init_single_page() sets the refcount 1559 * of all pages to 1 ("allocated"/"not free"). We have to set the 1560 * refcount of all involved pages to 0. 1561 */ 1562 prefetchw(p); 1563 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 1564 prefetchw(p + 1); 1565 __ClearPageReserved(p); 1566 set_page_count(p, 0); 1567 } 1568 __ClearPageReserved(p); 1569 set_page_count(p, 0); 1570 1571 atomic_long_add(nr_pages, &page_zone(page)->managed_pages); 1572 1573 /* 1574 * Bypass PCP and place fresh pages right to the tail, primarily 1575 * relevant for memory onlining. 1576 */ 1577 __free_pages_ok(page, order, FPI_TO_TAIL); 1578 } 1579 1580 #ifdef CONFIG_NEED_MULTIPLE_NODES 1581 1582 /* 1583 * During memory init memblocks map pfns to nids. The search is expensive and 1584 * this caches recent lookups. The implementation of __early_pfn_to_nid 1585 * treats start/end as pfns. 1586 */ 1587 struct mminit_pfnnid_cache { 1588 unsigned long last_start; 1589 unsigned long last_end; 1590 int last_nid; 1591 }; 1592 1593 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; 1594 1595 /* 1596 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 
 */
static int __meminit __early_pfn_to_nid(unsigned long pfn,
					struct mminit_pfnnid_cache *state)
{
	unsigned long start_pfn, end_pfn;
	int nid;

	if (state->last_start <= pfn && pfn < state->last_end)
		return state->last_nid;

	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
	if (nid != NUMA_NO_NODE) {
		state->last_start = start_pfn;
		state->last_end = end_pfn;
		state->last_nid = nid;
	}

	return nid;
}

int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	spin_lock(&early_pfn_lock);
	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = first_online_node;
	spin_unlock(&early_pfn_lock);

	return nid;
}
#endif /* CONFIG_NEED_MULTIPLE_NODES */

void __init memblock_free_pages(struct page *page, unsigned long pfn,
							unsigned int order)
{
	if (early_page_uninitialised(pfn))
		return;
	__free_pages_core(page, order);
}

/*
 * Check that the whole (or a subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration or free compaction scanner. The scanners then need only
 * use the pfn_valid_within() check for arches that allow holes within
 * pageblocks.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zone's range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
1656 */ 1657 struct page *__pageblock_pfn_to_page(unsigned long start_pfn, 1658 unsigned long end_pfn, struct zone *zone) 1659 { 1660 struct page *start_page; 1661 struct page *end_page; 1662 1663 /* end_pfn is one past the range we are checking */ 1664 end_pfn--; 1665 1666 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) 1667 return NULL; 1668 1669 start_page = pfn_to_online_page(start_pfn); 1670 if (!start_page) 1671 return NULL; 1672 1673 if (page_zone(start_page) != zone) 1674 return NULL; 1675 1676 end_page = pfn_to_page(end_pfn); 1677 1678 /* This gives a shorter code than deriving page_zone(end_page) */ 1679 if (page_zone_id(start_page) != page_zone_id(end_page)) 1680 return NULL; 1681 1682 return start_page; 1683 } 1684 1685 void set_zone_contiguous(struct zone *zone) 1686 { 1687 unsigned long block_start_pfn = zone->zone_start_pfn; 1688 unsigned long block_end_pfn; 1689 1690 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); 1691 for (; block_start_pfn < zone_end_pfn(zone); 1692 block_start_pfn = block_end_pfn, 1693 block_end_pfn += pageblock_nr_pages) { 1694 1695 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); 1696 1697 if (!__pageblock_pfn_to_page(block_start_pfn, 1698 block_end_pfn, zone)) 1699 return; 1700 cond_resched(); 1701 } 1702 1703 /* We confirm that there is no hole */ 1704 zone->contiguous = true; 1705 } 1706 1707 void clear_zone_contiguous(struct zone *zone) 1708 { 1709 zone->contiguous = false; 1710 } 1711 1712 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1713 static void __init deferred_free_range(unsigned long pfn, 1714 unsigned long nr_pages) 1715 { 1716 struct page *page; 1717 unsigned long i; 1718 1719 if (!nr_pages) 1720 return; 1721 1722 page = pfn_to_page(pfn); 1723 1724 /* Free a large naturally-aligned chunk if possible */ 1725 if (nr_pages == pageblock_nr_pages && 1726 (pfn & (pageblock_nr_pages - 1)) == 0) { 1727 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1728 __free_pages_core(page, pageblock_order); 1729 return; 1730 } 1731 1732 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1733 if ((pfn & (pageblock_nr_pages - 1)) == 0) 1734 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1735 __free_pages_core(page, 0); 1736 } 1737 } 1738 1739 /* Completion tracking for deferred_init_memmap() threads */ 1740 static atomic_t pgdat_init_n_undone __initdata; 1741 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); 1742 1743 static inline void __init pgdat_init_report_one_done(void) 1744 { 1745 if (atomic_dec_and_test(&pgdat_init_n_undone)) 1746 complete(&pgdat_init_all_done_comp); 1747 } 1748 1749 /* 1750 * Returns true if page needs to be initialized or freed to buddy allocator. 1751 * 1752 * First we check if pfn is valid on architectures where it is possible to have 1753 * holes within pageblock_nr_pages. On systems where it is not possible, this 1754 * function is optimized out. 1755 * 1756 * Then, we check if a current large page is valid by only checking the validity 1757 * of the head pfn. 1758 */ 1759 static inline bool __init deferred_pfn_valid(unsigned long pfn) 1760 { 1761 if (!pfn_valid_within(pfn)) 1762 return false; 1763 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) 1764 return false; 1765 return true; 1766 } 1767 1768 /* 1769 * Free pages to buddy allocator. Try to free aligned pages in 1770 * pageblock_nr_pages sizes. 
1771 */ 1772 static void __init deferred_free_pages(unsigned long pfn, 1773 unsigned long end_pfn) 1774 { 1775 unsigned long nr_pgmask = pageblock_nr_pages - 1; 1776 unsigned long nr_free = 0; 1777 1778 for (; pfn < end_pfn; pfn++) { 1779 if (!deferred_pfn_valid(pfn)) { 1780 deferred_free_range(pfn - nr_free, nr_free); 1781 nr_free = 0; 1782 } else if (!(pfn & nr_pgmask)) { 1783 deferred_free_range(pfn - nr_free, nr_free); 1784 nr_free = 1; 1785 } else { 1786 nr_free++; 1787 } 1788 } 1789 /* Free the last block of pages to allocator */ 1790 deferred_free_range(pfn - nr_free, nr_free); 1791 } 1792 1793 /* 1794 * Initialize struct pages. We minimize pfn page lookups and scheduler checks 1795 * by performing it only once every pageblock_nr_pages. 1796 * Return number of pages initialized. 1797 */ 1798 static unsigned long __init deferred_init_pages(struct zone *zone, 1799 unsigned long pfn, 1800 unsigned long end_pfn) 1801 { 1802 unsigned long nr_pgmask = pageblock_nr_pages - 1; 1803 int nid = zone_to_nid(zone); 1804 unsigned long nr_pages = 0; 1805 int zid = zone_idx(zone); 1806 struct page *page = NULL; 1807 1808 for (; pfn < end_pfn; pfn++) { 1809 if (!deferred_pfn_valid(pfn)) { 1810 page = NULL; 1811 continue; 1812 } else if (!page || !(pfn & nr_pgmask)) { 1813 page = pfn_to_page(pfn); 1814 } else { 1815 page++; 1816 } 1817 __init_single_page(page, pfn, zid, nid); 1818 nr_pages++; 1819 } 1820 return (nr_pages); 1821 } 1822 1823 /* 1824 * This function is meant to pre-load the iterator for the zone init. 1825 * Specifically it walks through the ranges until we are caught up to the 1826 * first_init_pfn value and exits there. If we never encounter the value we 1827 * return false indicating there are no valid ranges left. 1828 */ 1829 static bool __init 1830 deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, 1831 unsigned long *spfn, unsigned long *epfn, 1832 unsigned long first_init_pfn) 1833 { 1834 u64 j; 1835 1836 /* 1837 * Start out by walking through the ranges in this zone that have 1838 * already been initialized. We don't need to do anything with them 1839 * so we just need to flush them out of the system. 1840 */ 1841 for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { 1842 if (*epfn <= first_init_pfn) 1843 continue; 1844 if (*spfn < first_init_pfn) 1845 *spfn = first_init_pfn; 1846 *i = j; 1847 return true; 1848 } 1849 1850 return false; 1851 } 1852 1853 /* 1854 * Initialize and free pages. We do it in two loops: first we initialize 1855 * struct page, then free to buddy allocator, because while we are 1856 * freeing pages we can access pages that are ahead (computing buddy 1857 * page in __free_one_page()). 1858 * 1859 * In order to try and keep some memory in the cache we have the loop 1860 * broken along max page order boundaries. This way we will not cause 1861 * any issues with the buddy page computation. 
1862 */ 1863 static unsigned long __init 1864 deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, 1865 unsigned long *end_pfn) 1866 { 1867 unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); 1868 unsigned long spfn = *start_pfn, epfn = *end_pfn; 1869 unsigned long nr_pages = 0; 1870 u64 j = *i; 1871 1872 /* First we loop through and initialize the page values */ 1873 for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { 1874 unsigned long t; 1875 1876 if (mo_pfn <= *start_pfn) 1877 break; 1878 1879 t = min(mo_pfn, *end_pfn); 1880 nr_pages += deferred_init_pages(zone, *start_pfn, t); 1881 1882 if (mo_pfn < *end_pfn) { 1883 *start_pfn = mo_pfn; 1884 break; 1885 } 1886 } 1887 1888 /* Reset values and now loop through freeing pages as needed */ 1889 swap(j, *i); 1890 1891 for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { 1892 unsigned long t; 1893 1894 if (mo_pfn <= spfn) 1895 break; 1896 1897 t = min(mo_pfn, epfn); 1898 deferred_free_pages(spfn, t); 1899 1900 if (mo_pfn <= epfn) 1901 break; 1902 } 1903 1904 return nr_pages; 1905 } 1906 1907 static void __init 1908 deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, 1909 void *arg) 1910 { 1911 unsigned long spfn, epfn; 1912 struct zone *zone = arg; 1913 u64 i; 1914 1915 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); 1916 1917 /* 1918 * Initialize and free pages in MAX_ORDER sized increments so that we 1919 * can avoid introducing any issues with the buddy allocator. 1920 */ 1921 while (spfn < end_pfn) { 1922 deferred_init_maxorder(&i, zone, &spfn, &epfn); 1923 cond_resched(); 1924 } 1925 } 1926 1927 /* An arch may override for more concurrency. */ 1928 __weak int __init 1929 deferred_page_init_max_threads(const struct cpumask *node_cpumask) 1930 { 1931 return 1; 1932 } 1933 1934 /* Initialise remaining memory on a node */ 1935 static int __init deferred_init_memmap(void *data) 1936 { 1937 pg_data_t *pgdat = data; 1938 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1939 unsigned long spfn = 0, epfn = 0; 1940 unsigned long first_init_pfn, flags; 1941 unsigned long start = jiffies; 1942 struct zone *zone; 1943 int zid, max_threads; 1944 u64 i; 1945 1946 /* Bind memory initialisation thread to a local node if possible */ 1947 if (!cpumask_empty(cpumask)) 1948 set_cpus_allowed_ptr(current, cpumask); 1949 1950 pgdat_resize_lock(pgdat, &flags); 1951 first_init_pfn = pgdat->first_deferred_pfn; 1952 if (first_init_pfn == ULONG_MAX) { 1953 pgdat_resize_unlock(pgdat, &flags); 1954 pgdat_init_report_one_done(); 1955 return 0; 1956 } 1957 1958 /* Sanity check boundaries */ 1959 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); 1960 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); 1961 pgdat->first_deferred_pfn = ULONG_MAX; 1962 1963 /* 1964 * Once we unlock here, the zone cannot be grown anymore, thus if an 1965 * interrupt thread must allocate this early in boot, zone must be 1966 * pre-grown prior to start of deferred page initialization. 
1967 */ 1968 pgdat_resize_unlock(pgdat, &flags); 1969 1970 /* Only the highest zone is deferred so find it */ 1971 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1972 zone = pgdat->node_zones + zid; 1973 if (first_init_pfn < zone_end_pfn(zone)) 1974 break; 1975 } 1976 1977 /* If the zone is empty somebody else may have cleared out the zone */ 1978 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 1979 first_init_pfn)) 1980 goto zone_empty; 1981 1982 max_threads = deferred_page_init_max_threads(cpumask); 1983 1984 while (spfn < epfn) { 1985 unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); 1986 struct padata_mt_job job = { 1987 .thread_fn = deferred_init_memmap_chunk, 1988 .fn_arg = zone, 1989 .start = spfn, 1990 .size = epfn_align - spfn, 1991 .align = PAGES_PER_SECTION, 1992 .min_chunk = PAGES_PER_SECTION, 1993 .max_threads = max_threads, 1994 }; 1995 1996 padata_do_multithreaded(&job); 1997 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 1998 epfn_align); 1999 } 2000 zone_empty: 2001 /* Sanity check that the next zone really is unpopulated */ 2002 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); 2003 2004 pr_info("node %d deferred pages initialised in %ums\n", 2005 pgdat->node_id, jiffies_to_msecs(jiffies - start)); 2006 2007 pgdat_init_report_one_done(); 2008 return 0; 2009 } 2010 2011 /* 2012 * If this zone has deferred pages, try to grow it by initializing enough 2013 * deferred pages to satisfy the allocation specified by order, rounded up to 2014 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments 2015 * of SECTION_SIZE bytes by initializing struct pages in increments of 2016 * PAGES_PER_SECTION * sizeof(struct page) bytes. 2017 * 2018 * Return true when zone was grown, otherwise return false. We return true even 2019 * when we grow less than requested, to let the caller decide if there are 2020 * enough pages to satisfy the allocation. 2021 * 2022 * Note: We use noinline because this function is needed only during boot, and 2023 * it is called from a __ref function _deferred_grow_zone. This way we are 2024 * making sure that it is not inlined into permanent text section. 2025 */ 2026 static noinline bool __init 2027 deferred_grow_zone(struct zone *zone, unsigned int order) 2028 { 2029 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); 2030 pg_data_t *pgdat = zone->zone_pgdat; 2031 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; 2032 unsigned long spfn, epfn, flags; 2033 unsigned long nr_pages = 0; 2034 u64 i; 2035 2036 /* Only the last zone may have deferred pages */ 2037 if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) 2038 return false; 2039 2040 pgdat_resize_lock(pgdat, &flags); 2041 2042 /* 2043 * If someone grew this zone while we were waiting for spinlock, return 2044 * true, as there might be enough pages already. 2045 */ 2046 if (first_deferred_pfn != pgdat->first_deferred_pfn) { 2047 pgdat_resize_unlock(pgdat, &flags); 2048 return true; 2049 } 2050 2051 /* If the zone is empty somebody else may have cleared out the zone */ 2052 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 2053 first_deferred_pfn)) { 2054 pgdat->first_deferred_pfn = ULONG_MAX; 2055 pgdat_resize_unlock(pgdat, &flags); 2056 /* Retry only once. */ 2057 return first_deferred_pfn != ULONG_MAX; 2058 } 2059 2060 /* 2061 * Initialize and free pages in MAX_ORDER sized increments so 2062 * that we can avoid introducing any issues with the buddy 2063 * allocator. 
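 * The (first_deferred_pfn ^ spfn) < PAGES_PER_SECTION test below relies on
 * PAGES_PER_SECTION being a power of two: the XOR stays below it only
 * while both pfns are still in the same memory section, so the quota check
 * is only reached once deferred_init_maxorder() has stepped spfn across a
 * section boundary (e.g. every 32768 pfns with 4K pages and 128M sections
 * on x86-64).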
2064 */ 2065 while (spfn < epfn) { 2066 /* update our first deferred PFN for this section */ 2067 first_deferred_pfn = spfn; 2068 2069 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); 2070 touch_nmi_watchdog(); 2071 2072 /* We should only stop along section boundaries */ 2073 if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) 2074 continue; 2075 2076 /* If our quota has been met we can stop here */ 2077 if (nr_pages >= nr_pages_needed) 2078 break; 2079 } 2080 2081 pgdat->first_deferred_pfn = spfn; 2082 pgdat_resize_unlock(pgdat, &flags); 2083 2084 return nr_pages > 0; 2085 } 2086 2087 /* 2088 * deferred_grow_zone() is __init, but it is called from 2089 * get_page_from_freelist() during early boot until deferred_pages permanently 2090 * disables this call. This is why we have refdata wrapper to avoid warning, 2091 * and to ensure that the function body gets unloaded. 2092 */ 2093 static bool __ref 2094 _deferred_grow_zone(struct zone *zone, unsigned int order) 2095 { 2096 return deferred_grow_zone(zone, order); 2097 } 2098 2099 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 2100 2101 void __init page_alloc_init_late(void) 2102 { 2103 struct zone *zone; 2104 int nid; 2105 2106 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 2107 2108 /* There will be num_node_state(N_MEMORY) threads */ 2109 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); 2110 for_each_node_state(nid, N_MEMORY) { 2111 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); 2112 } 2113 2114 /* Block until all are initialised */ 2115 wait_for_completion(&pgdat_init_all_done_comp); 2116 2117 /* 2118 * The number of managed pages has changed due to the initialisation 2119 * so the pcpu batch and high limits needs to be updated or the limits 2120 * will be artificially small. 2121 */ 2122 for_each_populated_zone(zone) 2123 zone_pcp_update(zone); 2124 2125 /* 2126 * We initialized the rest of the deferred pages. Permanently disable 2127 * on-demand struct page initialization. 2128 */ 2129 static_branch_disable(&deferred_pages); 2130 2131 /* Reinit limits that are based on free pages after the kernel is up */ 2132 files_maxfiles_init(); 2133 #endif 2134 2135 buffer_init(); 2136 2137 /* Discard memblock private memory */ 2138 memblock_discard(); 2139 2140 for_each_node_state(nid, N_MEMORY) 2141 shuffle_free_memory(NODE_DATA(nid)); 2142 2143 for_each_populated_zone(zone) 2144 set_zone_contiguous(zone); 2145 } 2146 2147 #ifdef CONFIG_CMA 2148 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 2149 void __init init_cma_reserved_pageblock(struct page *page) 2150 { 2151 unsigned i = pageblock_nr_pages; 2152 struct page *p = page; 2153 2154 do { 2155 __ClearPageReserved(p); 2156 set_page_count(p, 0); 2157 } while (++p, --i); 2158 2159 set_pageblock_migratetype(page, MIGRATE_CMA); 2160 2161 if (pageblock_order >= MAX_ORDER) { 2162 i = pageblock_nr_pages; 2163 p = page; 2164 do { 2165 set_page_refcounted(p); 2166 __free_pages(p, MAX_ORDER - 1); 2167 p += MAX_ORDER_NR_PAGES; 2168 } while (i -= MAX_ORDER_NR_PAGES); 2169 } else { 2170 set_page_refcounted(page); 2171 __free_pages(page, pageblock_order); 2172 } 2173 2174 adjust_managed_page_count(page, pageblock_nr_pages); 2175 page_zone(page)->cma_pages += pageblock_nr_pages; 2176 } 2177 #endif 2178 2179 /* 2180 * The order of subdivision here is critical for the IO subsystem. 2181 * Please do not alter this order without good reasons and regression 2182 * testing. 
Specifically, as large blocks of memory are subdivided, 2183 * the order in which smaller blocks are delivered depends on the order 2184 * they're subdivided in this function. This is the primary factor 2185 * influencing the order in which pages are delivered to the IO 2186 * subsystem according to empirical testing, and this is also justified 2187 * by considering the behavior of a buddy system containing a single 2188 * large block of memory acted on by a series of small allocations. 2189 * This behavior is a critical factor in sglist merging's success. 2190 * 2191 * -- nyc 2192 */ 2193 static inline void expand(struct zone *zone, struct page *page, 2194 int low, int high, int migratetype) 2195 { 2196 unsigned long size = 1 << high; 2197 2198 while (high > low) { 2199 high--; 2200 size >>= 1; 2201 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 2202 2203 /* 2204 * Mark as guard pages (or page), that will allow to 2205 * merge back to allocator when buddy will be freed. 2206 * Corresponding page table entries will not be touched, 2207 * pages will stay not present in virtual address space 2208 */ 2209 if (set_page_guard(zone, &page[size], high, migratetype)) 2210 continue; 2211 2212 add_to_free_list(&page[size], zone, high, migratetype); 2213 set_buddy_order(&page[size], high); 2214 } 2215 } 2216 2217 static void check_new_page_bad(struct page *page) 2218 { 2219 if (unlikely(page->flags & __PG_HWPOISON)) { 2220 /* Don't complain about hwpoisoned pages */ 2221 page_mapcount_reset(page); /* remove PageBuddy */ 2222 return; 2223 } 2224 2225 bad_page(page, 2226 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); 2227 } 2228 2229 /* 2230 * This page is about to be returned from the page allocator 2231 */ 2232 static inline int check_new_page(struct page *page) 2233 { 2234 if (likely(page_expected_state(page, 2235 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) 2236 return 0; 2237 2238 check_new_page_bad(page); 2239 return 1; 2240 } 2241 2242 #ifdef CONFIG_DEBUG_VM 2243 /* 2244 * With DEBUG_VM enabled, order-0 pages are checked for expected state when 2245 * being allocated from pcp lists. With debug_pagealloc also enabled, they are 2246 * also checked when pcp lists are refilled from the free lists. 2247 */ 2248 static inline bool check_pcp_refill(struct page *page) 2249 { 2250 if (debug_pagealloc_enabled_static()) 2251 return check_new_page(page); 2252 else 2253 return false; 2254 } 2255 2256 static inline bool check_new_pcp(struct page *page) 2257 { 2258 return check_new_page(page); 2259 } 2260 #else 2261 /* 2262 * With DEBUG_VM disabled, free order-0 pages are checked for expected state 2263 * when pcp lists are being refilled from the free lists. With debug_pagealloc 2264 * enabled, they are also checked when being allocated from the pcp lists. 
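 * Either way, order-0 pages taken from the pcplists are validated at least
 * once on the allocation side: at refill time without DEBUG_VM, at
 * pcplist-allocation time with DEBUG_VM, and in both places when
 * debug_pagealloc is enabled as well.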
2265 */ 2266 static inline bool check_pcp_refill(struct page *page) 2267 { 2268 return check_new_page(page); 2269 } 2270 static inline bool check_new_pcp(struct page *page) 2271 { 2272 if (debug_pagealloc_enabled_static()) 2273 return check_new_page(page); 2274 else 2275 return false; 2276 } 2277 #endif /* CONFIG_DEBUG_VM */ 2278 2279 static bool check_new_pages(struct page *page, unsigned int order) 2280 { 2281 int i; 2282 for (i = 0; i < (1 << order); i++) { 2283 struct page *p = page + i; 2284 2285 if (unlikely(check_new_page(p))) 2286 return true; 2287 } 2288 2289 return false; 2290 } 2291 2292 inline void post_alloc_hook(struct page *page, unsigned int order, 2293 gfp_t gfp_flags) 2294 { 2295 set_page_private(page, 0); 2296 set_page_refcounted(page); 2297 2298 arch_alloc_page(page, order); 2299 debug_pagealloc_map_pages(page, 1 << order); 2300 kasan_alloc_pages(page, order); 2301 kernel_unpoison_pages(page, 1 << order); 2302 set_page_owner(page, order, gfp_flags); 2303 2304 if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) 2305 kernel_init_free_pages(page, 1 << order); 2306 } 2307 2308 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 2309 unsigned int alloc_flags) 2310 { 2311 post_alloc_hook(page, order, gfp_flags); 2312 2313 if (order && (gfp_flags & __GFP_COMP)) 2314 prep_compound_page(page, order); 2315 2316 /* 2317 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to 2318 * allocate the page. The expectation is that the caller is taking 2319 * steps that will free more memory. The caller should avoid the page 2320 * being used for !PFMEMALLOC purposes. 2321 */ 2322 if (alloc_flags & ALLOC_NO_WATERMARKS) 2323 set_page_pfmemalloc(page); 2324 else 2325 clear_page_pfmemalloc(page); 2326 } 2327 2328 /* 2329 * Go through the free lists for the given migratetype and remove 2330 * the smallest available page from the freelists 2331 */ 2332 static __always_inline 2333 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 2334 int migratetype) 2335 { 2336 unsigned int current_order; 2337 struct free_area *area; 2338 struct page *page; 2339 2340 /* Find a page of the appropriate size in the preferred list */ 2341 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 2342 area = &(zone->free_area[current_order]); 2343 page = get_page_from_free_area(area, migratetype); 2344 if (!page) 2345 continue; 2346 del_page_from_free_list(page, zone, current_order); 2347 expand(zone, page, order, current_order, migratetype); 2348 set_pcppage_migratetype(page, migratetype); 2349 return page; 2350 } 2351 2352 return NULL; 2353 } 2354 2355 2356 /* 2357 * This array describes the order lists are fallen back to when 2358 * the free lists for the desirable migrate type are depleted 2359 */ 2360 static int fallbacks[MIGRATE_TYPES][3] = { 2361 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 2362 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, 2363 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 2364 #ifdef CONFIG_CMA 2365 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ 2366 #endif 2367 #ifdef CONFIG_MEMORY_ISOLATION 2368 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ 2369 #endif 2370 }; 2371 2372 #ifdef CONFIG_CMA 2373 static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, 2374 unsigned int order) 2375 { 2376 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 2377 } 2378 #else 2379 static 
inline struct page *__rmqueue_cma_fallback(struct zone *zone, 2380 unsigned int order) { return NULL; } 2381 #endif 2382 2383 /* 2384 * Move the free pages in a range to the freelist tail of the requested type. 2385 * Note that start_page and end_pages are not aligned on a pageblock 2386 * boundary. If alignment is required, use move_freepages_block() 2387 */ 2388 static int move_freepages(struct zone *zone, 2389 struct page *start_page, struct page *end_page, 2390 int migratetype, int *num_movable) 2391 { 2392 struct page *page; 2393 unsigned int order; 2394 int pages_moved = 0; 2395 2396 for (page = start_page; page <= end_page;) { 2397 if (!pfn_valid_within(page_to_pfn(page))) { 2398 page++; 2399 continue; 2400 } 2401 2402 if (!PageBuddy(page)) { 2403 /* 2404 * We assume that pages that could be isolated for 2405 * migration are movable. But we don't actually try 2406 * isolating, as that would be expensive. 2407 */ 2408 if (num_movable && 2409 (PageLRU(page) || __PageMovable(page))) 2410 (*num_movable)++; 2411 2412 page++; 2413 continue; 2414 } 2415 2416 /* Make sure we are not inadvertently changing nodes */ 2417 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); 2418 VM_BUG_ON_PAGE(page_zone(page) != zone, page); 2419 2420 order = buddy_order(page); 2421 move_to_free_list(page, zone, order, migratetype); 2422 page += 1 << order; 2423 pages_moved += 1 << order; 2424 } 2425 2426 return pages_moved; 2427 } 2428 2429 int move_freepages_block(struct zone *zone, struct page *page, 2430 int migratetype, int *num_movable) 2431 { 2432 unsigned long start_pfn, end_pfn; 2433 struct page *start_page, *end_page; 2434 2435 if (num_movable) 2436 *num_movable = 0; 2437 2438 start_pfn = page_to_pfn(page); 2439 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 2440 start_page = pfn_to_page(start_pfn); 2441 end_page = start_page + pageblock_nr_pages - 1; 2442 end_pfn = start_pfn + pageblock_nr_pages - 1; 2443 2444 /* Do not cross zone boundaries */ 2445 if (!zone_spans_pfn(zone, start_pfn)) 2446 start_page = page; 2447 if (!zone_spans_pfn(zone, end_pfn)) 2448 return 0; 2449 2450 return move_freepages(zone, start_page, end_page, migratetype, 2451 num_movable); 2452 } 2453 2454 static void change_pageblock_range(struct page *pageblock_page, 2455 int start_order, int migratetype) 2456 { 2457 int nr_pageblocks = 1 << (start_order - pageblock_order); 2458 2459 while (nr_pageblocks--) { 2460 set_pageblock_migratetype(pageblock_page, migratetype); 2461 pageblock_page += pageblock_nr_pages; 2462 } 2463 } 2464 2465 /* 2466 * When we are falling back to another migratetype during allocation, try to 2467 * steal extra free pages from the same pageblocks to satisfy further 2468 * allocations, instead of polluting multiple pageblocks. 2469 * 2470 * If we are stealing a relatively large buddy page, it is likely there will 2471 * be more free pages in the pageblock, so try to steal them all. For 2472 * reclaimable and unmovable allocations, we steal regardless of page size, 2473 * as fragmentation caused by those allocations polluting movable pageblocks 2474 * is worse than movable allocations stealing from unmovable and reclaimable 2475 * pageblocks. 2476 */ 2477 static bool can_steal_fallback(unsigned int order, int start_mt) 2478 { 2479 /* 2480 * Leaving this order check is intended, although there is 2481 * relaxed order check in next check. 
The reason is that 2482 * we can actually steal whole pageblock if this condition met, 2483 * but, below check doesn't guarantee it and that is just heuristic 2484 * so could be changed anytime. 2485 */ 2486 if (order >= pageblock_order) 2487 return true; 2488 2489 if (order >= pageblock_order / 2 || 2490 start_mt == MIGRATE_RECLAIMABLE || 2491 start_mt == MIGRATE_UNMOVABLE || 2492 page_group_by_mobility_disabled) 2493 return true; 2494 2495 return false; 2496 } 2497 2498 static inline bool boost_watermark(struct zone *zone) 2499 { 2500 unsigned long max_boost; 2501 2502 if (!watermark_boost_factor) 2503 return false; 2504 /* 2505 * Don't bother in zones that are unlikely to produce results. 2506 * On small machines, including kdump capture kernels running 2507 * in a small area, boosting the watermark can cause an out of 2508 * memory situation immediately. 2509 */ 2510 if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) 2511 return false; 2512 2513 max_boost = mult_frac(zone->_watermark[WMARK_HIGH], 2514 watermark_boost_factor, 10000); 2515 2516 /* 2517 * high watermark may be uninitialised if fragmentation occurs 2518 * very early in boot so do not boost. We do not fall 2519 * through and boost by pageblock_nr_pages as failing 2520 * allocations that early means that reclaim is not going 2521 * to help and it may even be impossible to reclaim the 2522 * boosted watermark resulting in a hang. 2523 */ 2524 if (!max_boost) 2525 return false; 2526 2527 max_boost = max(pageblock_nr_pages, max_boost); 2528 2529 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, 2530 max_boost); 2531 2532 return true; 2533 } 2534 2535 /* 2536 * This function implements actual steal behaviour. If order is large enough, 2537 * we can steal whole pageblock. If not, we first move freepages in this 2538 * pageblock to our migratetype and determine how many already-allocated pages 2539 * are there in the pageblock with a compatible migratetype. If at least half 2540 * of pages are free or compatible, we can change migratetype of the pageblock 2541 * itself, so pages freed in the future will be put on the correct free list. 2542 */ 2543 static void steal_suitable_fallback(struct zone *zone, struct page *page, 2544 unsigned int alloc_flags, int start_type, bool whole_block) 2545 { 2546 unsigned int current_order = buddy_order(page); 2547 int free_pages, movable_pages, alike_pages; 2548 int old_block_type; 2549 2550 old_block_type = get_pageblock_migratetype(page); 2551 2552 /* 2553 * This can happen due to races and we want to prevent broken 2554 * highatomic accounting. 2555 */ 2556 if (is_migrate_highatomic(old_block_type)) 2557 goto single_page; 2558 2559 /* Take ownership for orders >= pageblock_order */ 2560 if (current_order >= pageblock_order) { 2561 change_pageblock_range(page, current_order, start_type); 2562 goto single_page; 2563 } 2564 2565 /* 2566 * Boost watermarks to increase reclaim pressure to reduce the 2567 * likelihood of future fallbacks. Wake kswapd now as the node 2568 * may be balanced overall and kswapd will not wake naturally. 2569 */ 2570 if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) 2571 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 2572 2573 /* We are not allowed to try stealing from the whole block */ 2574 if (!whole_block) 2575 goto single_page; 2576 2577 free_pages = move_freepages_block(zone, page, start_type, 2578 &movable_pages); 2579 /* 2580 * Determine how many pages are compatible with our allocation. 
2581 * For movable allocation, it's the number of movable pages which 2582 * we just obtained. For other types it's a bit more tricky. 2583 */ 2584 if (start_type == MIGRATE_MOVABLE) { 2585 alike_pages = movable_pages; 2586 } else { 2587 /* 2588 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation 2589 * to MOVABLE pageblock, consider all non-movable pages as 2590 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or 2591 * vice versa, be conservative since we can't distinguish the 2592 * exact migratetype of non-movable pages. 2593 */ 2594 if (old_block_type == MIGRATE_MOVABLE) 2595 alike_pages = pageblock_nr_pages 2596 - (free_pages + movable_pages); 2597 else 2598 alike_pages = 0; 2599 } 2600 2601 /* moving whole block can fail due to zone boundary conditions */ 2602 if (!free_pages) 2603 goto single_page; 2604 2605 /* 2606 * If a sufficient number of pages in the block are either free or of 2607 * comparable migratability as our allocation, claim the whole block. 2608 */ 2609 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || 2610 page_group_by_mobility_disabled) 2611 set_pageblock_migratetype(page, start_type); 2612 2613 return; 2614 2615 single_page: 2616 move_to_free_list(page, zone, current_order, start_type); 2617 } 2618 2619 /* 2620 * Check whether there is a suitable fallback freepage with requested order. 2621 * If only_stealable is true, this function returns fallback_mt only if 2622 * we can steal other freepages all together. This would help to reduce 2623 * fragmentation due to mixed migratetype pages in one pageblock. 2624 */ 2625 int find_suitable_fallback(struct free_area *area, unsigned int order, 2626 int migratetype, bool only_stealable, bool *can_steal) 2627 { 2628 int i; 2629 int fallback_mt; 2630 2631 if (area->nr_free == 0) 2632 return -1; 2633 2634 *can_steal = false; 2635 for (i = 0;; i++) { 2636 fallback_mt = fallbacks[migratetype][i]; 2637 if (fallback_mt == MIGRATE_TYPES) 2638 break; 2639 2640 if (free_area_empty(area, fallback_mt)) 2641 continue; 2642 2643 if (can_steal_fallback(order, migratetype)) 2644 *can_steal = true; 2645 2646 if (!only_stealable) 2647 return fallback_mt; 2648 2649 if (*can_steal) 2650 return fallback_mt; 2651 } 2652 2653 return -1; 2654 } 2655 2656 /* 2657 * Reserve a pageblock for exclusive use of high-order atomic allocations if 2658 * there are no empty page blocks that contain a page with a suitable order 2659 */ 2660 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, 2661 unsigned int alloc_order) 2662 { 2663 int mt; 2664 unsigned long max_managed, flags; 2665 2666 /* 2667 * Limit the number reserved to 1 pageblock or roughly 1% of a zone. 2668 * Check is race-prone but harmless. 2669 */ 2670 max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; 2671 if (zone->nr_reserved_highatomic >= max_managed) 2672 return; 2673 2674 spin_lock_irqsave(&zone->lock, flags); 2675 2676 /* Recheck the nr_reserved_highatomic limit under the lock */ 2677 if (zone->nr_reserved_highatomic >= max_managed) 2678 goto out_unlock; 2679 2680 /* Yoink! 
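 * Claim the whole pageblock for high-order atomic allocations: unless it
 * is already highatomic, isolated or CMA, bump the reservation by
 * pageblock_nr_pages, retag the block MIGRATE_HIGHATOMIC and move any
 * pages currently on its free lists over to the highatomic free list.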
*/ 2681 mt = get_pageblock_migratetype(page); 2682 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) 2683 && !is_migrate_cma(mt)) { 2684 zone->nr_reserved_highatomic += pageblock_nr_pages; 2685 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); 2686 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); 2687 } 2688 2689 out_unlock: 2690 spin_unlock_irqrestore(&zone->lock, flags); 2691 } 2692 2693 /* 2694 * Used when an allocation is about to fail under memory pressure. This 2695 * potentially hurts the reliability of high-order allocations when under 2696 * intense memory pressure but failed atomic allocations should be easier 2697 * to recover from than an OOM. 2698 * 2699 * If @force is true, try to unreserve pageblocks even if this leaves the 2700 * zone with no highatomic reserve at all. 2701 */ 2702 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, 2703 bool force) 2704 { 2705 struct zonelist *zonelist = ac->zonelist; 2706 unsigned long flags; 2707 struct zoneref *z; 2708 struct zone *zone; 2709 struct page *page; 2710 int order; 2711 bool ret; 2712 2713 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, 2714 ac->nodemask) { 2715 /* 2716 * Preserve at least one pageblock unless memory pressure 2717 * is really high. 2718 */ 2719 if (!force && zone->nr_reserved_highatomic <= 2720 pageblock_nr_pages) 2721 continue; 2722 2723 spin_lock_irqsave(&zone->lock, flags); 2724 for (order = 0; order < MAX_ORDER; order++) { 2725 struct free_area *area = &(zone->free_area[order]); 2726 2727 page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); 2728 if (!page) 2729 continue; 2730 2731 /* 2732 * In the page freeing path, migratetype changes are racy, so 2733 * we can encounter several free pages in a pageblock 2734 * in this loop although we changed the pageblock type 2735 * from highatomic to ac->migratetype. So we should 2736 * adjust the count only once. 2737 */ 2738 if (is_migrate_highatomic_page(page)) { 2739 /* 2740 * It should never happen but changes to 2741 * locking could inadvertently allow a per-cpu 2742 * drain to add pages to MIGRATE_HIGHATOMIC 2743 * while unreserving so be safe and watch for 2744 * underflows. 2745 */ 2746 zone->nr_reserved_highatomic -= min( 2747 pageblock_nr_pages, 2748 zone->nr_reserved_highatomic); 2749 } 2750 2751 /* 2752 * Convert to ac->migratetype and avoid the normal 2753 * pageblock stealing heuristics. Minimally, the caller 2754 * is doing the work and needs the pages. More 2755 * importantly, if the block was always converted to 2756 * MIGRATE_UNMOVABLE or another type then the number 2757 * of pageblocks that cannot be completely freed 2758 * may increase. 2759 */ 2760 set_pageblock_migratetype(page, ac->migratetype); 2761 ret = move_freepages_block(zone, page, ac->migratetype, 2762 NULL); 2763 if (ret) { 2764 spin_unlock_irqrestore(&zone->lock, flags); 2765 return ret; 2766 } 2767 } 2768 spin_unlock_irqrestore(&zone->lock, flags); 2769 } 2770 2771 return false; 2772 } 2773 2774 /* 2775 * Try finding a free buddy page on the fallback list and put it on the free 2776 * list of requested migratetype, possibly along with other pages from the same 2777 * block, depending on fragmentation avoidance heuristics. Returns true if 2778 * fallback was found so that __rmqueue_smallest() can grab it. 2779 * 2780 * The use of signed ints for order and current_order is a deliberate 2781 * deviation from the rest of this file, to make the for loop 2782 * condition simpler.
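 * The candidate migratetypes come from the fallbacks[] table above, in
 * order; e.g. an UNMOVABLE request tries RECLAIMABLE free lists before
 * MOVABLE ones, while a MOVABLE request prefers RECLAIMABLE over
 * UNMOVABLE blocks.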
2783 */ 2784 static __always_inline bool 2785 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, 2786 unsigned int alloc_flags) 2787 { 2788 struct free_area *area; 2789 int current_order; 2790 int min_order = order; 2791 struct page *page; 2792 int fallback_mt; 2793 bool can_steal; 2794 2795 /* 2796 * Do not steal pages from freelists belonging to other pageblocks 2797 * i.e. orders < pageblock_order. If there are no local zones free, 2798 * the zonelists will be reiterated without ALLOC_NOFRAGMENT. 2799 */ 2800 if (alloc_flags & ALLOC_NOFRAGMENT) 2801 min_order = pageblock_order; 2802 2803 /* 2804 * Find the largest available free page in the other list. This roughly 2805 * approximates finding the pageblock with the most free pages, which 2806 * would be too costly to do exactly. 2807 */ 2808 for (current_order = MAX_ORDER - 1; current_order >= min_order; 2809 --current_order) { 2810 area = &(zone->free_area[current_order]); 2811 fallback_mt = find_suitable_fallback(area, current_order, 2812 start_migratetype, false, &can_steal); 2813 if (fallback_mt == -1) 2814 continue; 2815 2816 /* 2817 * We cannot steal all free pages from the pageblock and the 2818 * requested migratetype is movable. In that case it's better to 2819 * steal and split the smallest available page instead of the 2820 * largest available page, because even if the next movable 2821 * allocation falls back into a different pageblock than this 2822 * one, it won't cause permanent fragmentation. 2823 */ 2824 if (!can_steal && start_migratetype == MIGRATE_MOVABLE 2825 && current_order > order) 2826 goto find_smallest; 2827 2828 goto do_steal; 2829 } 2830 2831 return false; 2832 2833 find_smallest: 2834 for (current_order = order; current_order < MAX_ORDER; 2835 current_order++) { 2836 area = &(zone->free_area[current_order]); 2837 fallback_mt = find_suitable_fallback(area, current_order, 2838 start_migratetype, false, &can_steal); 2839 if (fallback_mt != -1) 2840 break; 2841 } 2842 2843 /* 2844 * This should not happen - we already found a suitable fallback 2845 * when looking for the largest page. 2846 */ 2847 VM_BUG_ON(current_order == MAX_ORDER); 2848 2849 do_steal: 2850 page = get_page_from_free_area(area, fallback_mt); 2851 2852 steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, 2853 can_steal); 2854 2855 trace_mm_page_alloc_extfrag(page, order, current_order, 2856 start_migratetype, fallback_mt); 2857 2858 return true; 2859 2860 } 2861 2862 /* 2863 * Do the hard work of removing an element from the buddy allocator. 2864 * Call me with the zone->lock already held. 2865 */ 2866 static __always_inline struct page * 2867 __rmqueue(struct zone *zone, unsigned int order, int migratetype, 2868 unsigned int alloc_flags) 2869 { 2870 struct page *page; 2871 2872 if (IS_ENABLED(CONFIG_CMA)) { 2873 /* 2874 * Balance movable allocations between regular and CMA areas by 2875 * allocating from CMA when over half of the zone's free memory 2876 * is in the CMA area. 
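 * For example, in a zone with 1000 free pages of which 600 sit in CMA
 * pageblocks, a movable request with ALLOC_CMA is served from MIGRATE_CMA
 * first, so the regular areas are not drained while CMA pages sit idle.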
2877 */ 2878 if (alloc_flags & ALLOC_CMA && 2879 zone_page_state(zone, NR_FREE_CMA_PAGES) > 2880 zone_page_state(zone, NR_FREE_PAGES) / 2) { 2881 page = __rmqueue_cma_fallback(zone, order); 2882 if (page) 2883 goto out; 2884 } 2885 } 2886 retry: 2887 page = __rmqueue_smallest(zone, order, migratetype); 2888 if (unlikely(!page)) { 2889 if (alloc_flags & ALLOC_CMA) 2890 page = __rmqueue_cma_fallback(zone, order); 2891 2892 if (!page && __rmqueue_fallback(zone, order, migratetype, 2893 alloc_flags)) 2894 goto retry; 2895 } 2896 out: 2897 if (page) 2898 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2899 return page; 2900 } 2901 2902 /* 2903 * Obtain a specified number of elements from the buddy allocator, all under 2904 * a single hold of the lock, for efficiency. Add them to the supplied list. 2905 * Returns the number of new pages which were placed at *list. 2906 */ 2907 static int rmqueue_bulk(struct zone *zone, unsigned int order, 2908 unsigned long count, struct list_head *list, 2909 int migratetype, unsigned int alloc_flags) 2910 { 2911 int i, alloced = 0; 2912 2913 spin_lock(&zone->lock); 2914 for (i = 0; i < count; ++i) { 2915 struct page *page = __rmqueue(zone, order, migratetype, 2916 alloc_flags); 2917 if (unlikely(page == NULL)) 2918 break; 2919 2920 if (unlikely(check_pcp_refill(page))) 2921 continue; 2922 2923 /* 2924 * Split buddy pages returned by expand() are received here in 2925 * physical page order. The page is added to the tail of 2926 * caller's list. From the callers perspective, the linked list 2927 * is ordered by page number under some conditions. This is 2928 * useful for IO devices that can forward direction from the 2929 * head, thus also in the physical page order. This is useful 2930 * for IO devices that can merge IO requests if the physical 2931 * pages are ordered properly. 2932 */ 2933 list_add_tail(&page->lru, list); 2934 alloced++; 2935 if (is_migrate_cma(get_pcppage_migratetype(page))) 2936 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2937 -(1 << order)); 2938 } 2939 2940 /* 2941 * i pages were removed from the buddy list even if some leak due 2942 * to check_pcp_refill failing so adjust NR_FREE_PAGES based 2943 * on i. Do not confuse with 'alloced' which is the number of 2944 * pages added to the pcp list. 2945 */ 2946 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2947 spin_unlock(&zone->lock); 2948 return alloced; 2949 } 2950 2951 #ifdef CONFIG_NUMA 2952 /* 2953 * Called from the vmstat counter updater to drain pagesets of this 2954 * currently executing processor on remote nodes after they have 2955 * expired. 2956 * 2957 * Note that this function must be called with the thread pinned to 2958 * a single processor. 2959 */ 2960 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 2961 { 2962 unsigned long flags; 2963 int to_drain, batch; 2964 2965 local_irq_save(flags); 2966 batch = READ_ONCE(pcp->batch); 2967 to_drain = min(pcp->count, batch); 2968 if (to_drain > 0) 2969 free_pcppages_bulk(zone, to_drain, pcp); 2970 local_irq_restore(flags); 2971 } 2972 #endif 2973 2974 /* 2975 * Drain pcplists of the indicated processor and zone. 2976 * 2977 * The processor must either be the current processor and the 2978 * thread pinned to the current processor or a processor that 2979 * is not online. 
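 * This restriction exists because pcplists are protected only by disabling
 * interrupts on their owning CPU; touching another CPU's lists is safe
 * solely when that CPU is offline and cannot race with the drain.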
2980 */ 2981 static void drain_pages_zone(unsigned int cpu, struct zone *zone) 2982 { 2983 unsigned long flags; 2984 struct per_cpu_pageset *pset; 2985 struct per_cpu_pages *pcp; 2986 2987 local_irq_save(flags); 2988 pset = per_cpu_ptr(zone->pageset, cpu); 2989 2990 pcp = &pset->pcp; 2991 if (pcp->count) 2992 free_pcppages_bulk(zone, pcp->count, pcp); 2993 local_irq_restore(flags); 2994 } 2995 2996 /* 2997 * Drain pcplists of all zones on the indicated processor. 2998 * 2999 * The processor must either be the current processor and the 3000 * thread pinned to the current processor or a processor that 3001 * is not online. 3002 */ 3003 static void drain_pages(unsigned int cpu) 3004 { 3005 struct zone *zone; 3006 3007 for_each_populated_zone(zone) { 3008 drain_pages_zone(cpu, zone); 3009 } 3010 } 3011 3012 /* 3013 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 3014 * 3015 * The CPU has to be pinned. When zone parameter is non-NULL, spill just 3016 * the single zone's pages. 3017 */ 3018 void drain_local_pages(struct zone *zone) 3019 { 3020 int cpu = smp_processor_id(); 3021 3022 if (zone) 3023 drain_pages_zone(cpu, zone); 3024 else 3025 drain_pages(cpu); 3026 } 3027 3028 static void drain_local_pages_wq(struct work_struct *work) 3029 { 3030 struct pcpu_drain *drain; 3031 3032 drain = container_of(work, struct pcpu_drain, work); 3033 3034 /* 3035 * drain_all_pages doesn't use proper cpu hotplug protection so 3036 * we can race with cpu offline when the WQ can move this from 3037 * a cpu pinned worker to an unbound one. We can operate on a different 3038 * cpu, which is alright, but we also have to make sure not to move to 3039 * yet another one while draining. 3040 */ 3041 preempt_disable(); 3042 drain_local_pages(drain->zone); 3043 preempt_enable(); 3044 } 3045 3046 /* 3047 * The implementation of drain_all_pages(), exposing an extra parameter to 3048 * drain on all cpus. 3049 * 3050 * drain_all_pages() is optimized to only execute on cpus where pcplists are 3051 * not empty. The check for non-emptiness can however race with a free to 3052 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers 3053 * that need the guarantee that every CPU has drained can disable the 3054 * optimizing racy check. 3055 */ 3056 static void __drain_all_pages(struct zone *zone, bool force_all_cpus) 3057 { 3058 int cpu; 3059 3060 /* 3061 * Allocate in the BSS so we won't require allocation in 3062 * the direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 3063 */ 3064 static cpumask_t cpus_with_pcps; 3065 3066 /* 3067 * Make sure nobody triggers this path before mm_percpu_wq is fully 3068 * initialized. 3069 */ 3070 if (WARN_ON_ONCE(!mm_percpu_wq)) 3071 return; 3072 3073 /* 3074 * Do not drain if one is already in progress unless it's specific to 3075 * a zone. Such callers are primarily CMA and memory hotplug and need 3076 * the drain to be complete when the call returns. 3077 */ 3078 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { 3079 if (!zone) 3080 return; 3081 mutex_lock(&pcpu_drain_mutex); 3082 } 3083 3084 /* 3085 * We don't care about racing with CPU hotplug events, 3086 * as the offline notification will cause the notified 3087 * cpu to drain that CPU's pcps, and on_each_cpu_mask 3088 * disables preemption as part of its processing. 3089 */ 3090 for_each_online_cpu(cpu) { 3091 struct per_cpu_pageset *pcp; 3092 struct zone *z; 3093 bool has_pcps = false; 3094 3095 if (force_all_cpus) { 3096 /* 3097 * The pcp.count check is racy, some callers need a 3098 * guarantee that no cpu is missed.
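 * Marking every online CPU here trades a few drains of already-empty
 * pcplists for the guarantee that no CPU is skipped merely because the
 * racy pcp->count check happened to observe it as empty.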
3099 */ 3100 has_pcps = true; 3101 } else if (zone) { 3102 pcp = per_cpu_ptr(zone->pageset, cpu); 3103 if (pcp->pcp.count) 3104 has_pcps = true; 3105 } else { 3106 for_each_populated_zone(z) { 3107 pcp = per_cpu_ptr(z->pageset, cpu); 3108 if (pcp->pcp.count) { 3109 has_pcps = true; 3110 break; 3111 } 3112 } 3113 } 3114 3115 if (has_pcps) 3116 cpumask_set_cpu(cpu, &cpus_with_pcps); 3117 else 3118 cpumask_clear_cpu(cpu, &cpus_with_pcps); 3119 } 3120 3121 for_each_cpu(cpu, &cpus_with_pcps) { 3122 struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); 3123 3124 drain->zone = zone; 3125 INIT_WORK(&drain->work, drain_local_pages_wq); 3126 queue_work_on(cpu, mm_percpu_wq, &drain->work); 3127 } 3128 for_each_cpu(cpu, &cpus_with_pcps) 3129 flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); 3130 3131 mutex_unlock(&pcpu_drain_mutex); 3132 } 3133 3134 /* 3135 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 3136 * 3137 * When zone parameter is non-NULL, spill just the single zone's pages. 3138 * 3139 * Note that this can be extremely slow as the draining happens in a workqueue. 3140 */ 3141 void drain_all_pages(struct zone *zone) 3142 { 3143 __drain_all_pages(zone, false); 3144 } 3145 3146 #ifdef CONFIG_HIBERNATION 3147 3148 /* 3149 * Touch the watchdog for every WD_PAGE_COUNT pages. 3150 */ 3151 #define WD_PAGE_COUNT (128*1024) 3152 3153 void mark_free_pages(struct zone *zone) 3154 { 3155 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; 3156 unsigned long flags; 3157 unsigned int order, t; 3158 struct page *page; 3159 3160 if (zone_is_empty(zone)) 3161 return; 3162 3163 spin_lock_irqsave(&zone->lock, flags); 3164 3165 max_zone_pfn = zone_end_pfn(zone); 3166 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 3167 if (pfn_valid(pfn)) { 3168 page = pfn_to_page(pfn); 3169 3170 if (!--page_count) { 3171 touch_nmi_watchdog(); 3172 page_count = WD_PAGE_COUNT; 3173 } 3174 3175 if (page_zone(page) != zone) 3176 continue; 3177 3178 if (!swsusp_page_is_forbidden(page)) 3179 swsusp_unset_page_free(page); 3180 } 3181 3182 for_each_migratetype_order(order, t) { 3183 list_for_each_entry(page, 3184 &zone->free_area[order].free_list[t], lru) { 3185 unsigned long i; 3186 3187 pfn = page_to_pfn(page); 3188 for (i = 0; i < (1UL << order); i++) { 3189 if (!--page_count) { 3190 touch_nmi_watchdog(); 3191 page_count = WD_PAGE_COUNT; 3192 } 3193 swsusp_set_page_free(pfn_to_page(pfn + i)); 3194 } 3195 } 3196 } 3197 spin_unlock_irqrestore(&zone->lock, flags); 3198 } 3199 #endif /* CONFIG_PM */ 3200 3201 static bool free_unref_page_prepare(struct page *page, unsigned long pfn) 3202 { 3203 int migratetype; 3204 3205 if (!free_pcp_prepare(page)) 3206 return false; 3207 3208 migratetype = get_pfnblock_migratetype(page, pfn); 3209 set_pcppage_migratetype(page, migratetype); 3210 return true; 3211 } 3212 3213 static void free_unref_page_commit(struct page *page, unsigned long pfn) 3214 { 3215 struct zone *zone = page_zone(page); 3216 struct per_cpu_pages *pcp; 3217 int migratetype; 3218 3219 migratetype = get_pcppage_migratetype(page); 3220 __count_vm_event(PGFREE); 3221 3222 /* 3223 * We only track unmovable, reclaimable and movable on pcp lists. 3224 * Free ISOLATE pages back to the allocator because they are being 3225 * offlined but treat HIGHATOMIC as movable pages so we can get those 3226 * areas back if necessary. 
Otherwise, we may have to free 3227 * excessively into the page allocator 3228 */ 3229 if (migratetype >= MIGRATE_PCPTYPES) { 3230 if (unlikely(is_migrate_isolate(migratetype))) { 3231 free_one_page(zone, page, pfn, 0, migratetype, 3232 FPI_NONE); 3233 return; 3234 } 3235 migratetype = MIGRATE_MOVABLE; 3236 } 3237 3238 pcp = &this_cpu_ptr(zone->pageset)->pcp; 3239 list_add(&page->lru, &pcp->lists[migratetype]); 3240 pcp->count++; 3241 if (pcp->count >= READ_ONCE(pcp->high)) 3242 free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); 3243 } 3244 3245 /* 3246 * Free a 0-order page 3247 */ 3248 void free_unref_page(struct page *page) 3249 { 3250 unsigned long flags; 3251 unsigned long pfn = page_to_pfn(page); 3252 3253 if (!free_unref_page_prepare(page, pfn)) 3254 return; 3255 3256 local_irq_save(flags); 3257 free_unref_page_commit(page, pfn); 3258 local_irq_restore(flags); 3259 } 3260 3261 /* 3262 * Free a list of 0-order pages 3263 */ 3264 void free_unref_page_list(struct list_head *list) 3265 { 3266 struct page *page, *next; 3267 unsigned long flags, pfn; 3268 int batch_count = 0; 3269 3270 /* Prepare pages for freeing */ 3271 list_for_each_entry_safe(page, next, list, lru) { 3272 pfn = page_to_pfn(page); 3273 if (!free_unref_page_prepare(page, pfn)) 3274 list_del(&page->lru); 3275 set_page_private(page, pfn); 3276 } 3277 3278 local_irq_save(flags); 3279 list_for_each_entry_safe(page, next, list, lru) { 3280 unsigned long pfn = page_private(page); 3281 3282 set_page_private(page, 0); 3283 trace_mm_page_free_batched(page); 3284 free_unref_page_commit(page, pfn); 3285 3286 /* 3287 * Guard against excessive IRQ disabled times when we get 3288 * a large list of pages to free. 3289 */ 3290 if (++batch_count == SWAP_CLUSTER_MAX) { 3291 local_irq_restore(flags); 3292 batch_count = 0; 3293 local_irq_save(flags); 3294 } 3295 } 3296 local_irq_restore(flags); 3297 } 3298 3299 /* 3300 * split_page takes a non-compound higher-order page, and splits it into 3301 * n (1<<order) sub-pages: page[0..n] 3302 * Each sub-page must be freed individually. 3303 * 3304 * Note: this is probably too low level an operation for use in drivers. 3305 * Please consult with lkml before using this in your driver. 3306 */ 3307 void split_page(struct page *page, unsigned int order) 3308 { 3309 int i; 3310 3311 VM_BUG_ON_PAGE(PageCompound(page), page); 3312 VM_BUG_ON_PAGE(!page_count(page), page); 3313 3314 for (i = 1; i < (1 << order); i++) 3315 set_page_refcounted(page + i); 3316 split_page_owner(page, 1 << order); 3317 split_page_memcg(page, 1 << order); 3318 } 3319 EXPORT_SYMBOL_GPL(split_page); 3320 3321 int __isolate_free_page(struct page *page, unsigned int order) 3322 { 3323 unsigned long watermark; 3324 struct zone *zone; 3325 int mt; 3326 3327 BUG_ON(!PageBuddy(page)); 3328 3329 zone = page_zone(page); 3330 mt = get_pageblock_migratetype(page); 3331 3332 if (!is_migrate_isolate(mt)) { 3333 /* 3334 * Obey watermarks as if the page was being allocated. We can 3335 * emulate a high-order watermark check with a raised order-0 3336 * watermark, because we already know our high-order page 3337 * exists. 
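 * For example, isolating an order-3 page checks the zone against WMARK_MIN
 * plus eight extra pages, so removing the page cannot push the zone below
 * its minimum watermark.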
3338 */ 3339 watermark = zone->_watermark[WMARK_MIN] + (1UL << order); 3340 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 3341 return 0; 3342 3343 __mod_zone_freepage_state(zone, -(1UL << order), mt); 3344 } 3345 3346 /* Remove page from free list */ 3347 3348 del_page_from_free_list(page, zone, order); 3349 3350 /* 3351 * Set the pageblock if the isolated page is at least half of a 3352 * pageblock 3353 */ 3354 if (order >= pageblock_order - 1) { 3355 struct page *endpage = page + (1 << order) - 1; 3356 for (; page < endpage; page += pageblock_nr_pages) { 3357 int mt = get_pageblock_migratetype(page); 3358 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) 3359 && !is_migrate_highatomic(mt)) 3360 set_pageblock_migratetype(page, 3361 MIGRATE_MOVABLE); 3362 } 3363 } 3364 3365 3366 return 1UL << order; 3367 } 3368 3369 /** 3370 * __putback_isolated_page - Return a now-isolated page back where we got it 3371 * @page: Page that was isolated 3372 * @order: Order of the isolated page 3373 * @mt: The page's pageblock's migratetype 3374 * 3375 * This function is meant to return a page pulled from the free lists via 3376 * __isolate_free_page back to the free lists they were pulled from. 3377 */ 3378 void __putback_isolated_page(struct page *page, unsigned int order, int mt) 3379 { 3380 struct zone *zone = page_zone(page); 3381 3382 /* zone lock should be held when this function is called */ 3383 lockdep_assert_held(&zone->lock); 3384 3385 /* Return isolated page to tail of freelist. */ 3386 __free_one_page(page, page_to_pfn(page), zone, order, mt, 3387 FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); 3388 } 3389 3390 /* 3391 * Update NUMA hit/miss statistics 3392 * 3393 * Must be called with interrupts disabled. 3394 */ 3395 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) 3396 { 3397 #ifdef CONFIG_NUMA 3398 enum numa_stat_item local_stat = NUMA_LOCAL; 3399 3400 /* skip numa counters update if numa stats is disabled */ 3401 if (!static_branch_likely(&vm_numa_stat_key)) 3402 return; 3403 3404 if (zone_to_nid(z) != numa_node_id()) 3405 local_stat = NUMA_OTHER; 3406 3407 if (zone_to_nid(z) == zone_to_nid(preferred_zone)) 3408 __inc_numa_state(z, NUMA_HIT); 3409 else { 3410 __inc_numa_state(z, NUMA_MISS); 3411 __inc_numa_state(preferred_zone, NUMA_FOREIGN); 3412 } 3413 __inc_numa_state(z, local_stat); 3414 #endif 3415 } 3416 3417 /* Remove page from the per-cpu list, caller must protect the list */ 3418 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 3419 unsigned int alloc_flags, 3420 struct per_cpu_pages *pcp, 3421 struct list_head *list) 3422 { 3423 struct page *page; 3424 3425 do { 3426 if (list_empty(list)) { 3427 pcp->count += rmqueue_bulk(zone, 0, 3428 READ_ONCE(pcp->batch), list, 3429 migratetype, alloc_flags); 3430 if (unlikely(list_empty(list))) 3431 return NULL; 3432 } 3433 3434 page = list_first_entry(list, struct page, lru); 3435 list_del(&page->lru); 3436 pcp->count--; 3437 } while (check_new_pcp(page)); 3438 3439 return page; 3440 } 3441 3442 /* Lock and remove page from the per-cpu list */ 3443 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 3444 struct zone *zone, gfp_t gfp_flags, 3445 int migratetype, unsigned int alloc_flags) 3446 { 3447 struct per_cpu_pages *pcp; 3448 struct list_head *list; 3449 struct page *page; 3450 unsigned long flags; 3451 3452 local_irq_save(flags); 3453 pcp = &this_cpu_ptr(zone->pageset)->pcp; 3454 list = &pcp->lists[migratetype]; 3455 page = __rmqueue_pcplist(zone, migratetype, 
alloc_flags, pcp, list); 3456 if (page) { 3457 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); 3458 zone_statistics(preferred_zone, zone); 3459 } 3460 local_irq_restore(flags); 3461 return page; 3462 } 3463 3464 /* 3465 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 3466 */ 3467 static inline 3468 struct page *rmqueue(struct zone *preferred_zone, 3469 struct zone *zone, unsigned int order, 3470 gfp_t gfp_flags, unsigned int alloc_flags, 3471 int migratetype) 3472 { 3473 unsigned long flags; 3474 struct page *page; 3475 3476 if (likely(order == 0)) { 3477 /* 3478 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and 3479 * we need to skip it when CMA area isn't allowed. 3480 */ 3481 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || 3482 migratetype != MIGRATE_MOVABLE) { 3483 page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, 3484 migratetype, alloc_flags); 3485 goto out; 3486 } 3487 } 3488 3489 /* 3490 * We most definitely don't want callers attempting to 3491 * allocate greater than order-1 page units with __GFP_NOFAIL. 3492 */ 3493 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); 3494 spin_lock_irqsave(&zone->lock, flags); 3495 3496 do { 3497 page = NULL; 3498 /* 3499 * order-0 request can reach here when the pcplist is skipped 3500 * due to non-CMA allocation context. HIGHATOMIC area is 3501 * reserved for high-order atomic allocation, so order-0 3502 * request should skip it. 3503 */ 3504 if (order > 0 && alloc_flags & ALLOC_HARDER) { 3505 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 3506 if (page) 3507 trace_mm_page_alloc_zone_locked(page, order, migratetype); 3508 } 3509 if (!page) 3510 page = __rmqueue(zone, order, migratetype, alloc_flags); 3511 } while (page && check_new_pages(page, order)); 3512 spin_unlock(&zone->lock); 3513 if (!page) 3514 goto failed; 3515 __mod_zone_freepage_state(zone, -(1 << order), 3516 get_pcppage_migratetype(page)); 3517 3518 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 3519 zone_statistics(preferred_zone, zone); 3520 local_irq_restore(flags); 3521 3522 out: 3523 /* Separate test+clear to avoid unnecessary atomics */ 3524 if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { 3525 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 3526 wakeup_kswapd(zone, 0, 0, zone_idx(zone)); 3527 } 3528 3529 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 3530 return page; 3531 3532 failed: 3533 local_irq_restore(flags); 3534 return NULL; 3535 } 3536 3537 #ifdef CONFIG_FAIL_PAGE_ALLOC 3538 3539 static struct { 3540 struct fault_attr attr; 3541 3542 bool ignore_gfp_highmem; 3543 bool ignore_gfp_reclaim; 3544 u32 min_order; 3545 } fail_page_alloc = { 3546 .attr = FAULT_ATTR_INITIALIZER, 3547 .ignore_gfp_reclaim = true, 3548 .ignore_gfp_highmem = true, 3549 .min_order = 1, 3550 }; 3551 3552 static int __init setup_fail_page_alloc(char *str) 3553 { 3554 return setup_fault_attr(&fail_page_alloc.attr, str); 3555 } 3556 __setup("fail_page_alloc=", setup_fail_page_alloc); 3557 3558 static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3559 { 3560 if (order < fail_page_alloc.min_order) 3561 return false; 3562 if (gfp_mask & __GFP_NOFAIL) 3563 return false; 3564 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 3565 return false; 3566 if (fail_page_alloc.ignore_gfp_reclaim && 3567 (gfp_mask & __GFP_DIRECT_RECLAIM)) 3568 return false; 3569 3570 return should_fail(&fail_page_alloc.attr, 1 << order); 3571 } 3572 3573 #ifdef 
CONFIG_FAULT_INJECTION_DEBUG_FS 3574 3575 static int __init fail_page_alloc_debugfs(void) 3576 { 3577 umode_t mode = S_IFREG | 0600; 3578 struct dentry *dir; 3579 3580 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 3581 &fail_page_alloc.attr); 3582 3583 debugfs_create_bool("ignore-gfp-wait", mode, dir, 3584 &fail_page_alloc.ignore_gfp_reclaim); 3585 debugfs_create_bool("ignore-gfp-highmem", mode, dir, 3586 &fail_page_alloc.ignore_gfp_highmem); 3587 debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); 3588 3589 return 0; 3590 } 3591 3592 late_initcall(fail_page_alloc_debugfs); 3593 3594 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 3595 3596 #else /* CONFIG_FAIL_PAGE_ALLOC */ 3597 3598 static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3599 { 3600 return false; 3601 } 3602 3603 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 3604 3605 noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3606 { 3607 return __should_fail_alloc_page(gfp_mask, order); 3608 } 3609 ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); 3610 3611 static inline long __zone_watermark_unusable_free(struct zone *z, 3612 unsigned int order, unsigned int alloc_flags) 3613 { 3614 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); 3615 long unusable_free = (1 << order) - 1; 3616 3617 /* 3618 * If the caller does not have rights to ALLOC_HARDER then subtract 3619 * the high-atomic reserves. This will over-estimate the size of the 3620 * atomic reserve but it avoids a search. 3621 */ 3622 if (likely(!alloc_harder)) 3623 unusable_free += z->nr_reserved_highatomic; 3624 3625 #ifdef CONFIG_CMA 3626 /* If allocation can't use CMA areas don't use free CMA pages */ 3627 if (!(alloc_flags & ALLOC_CMA)) 3628 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); 3629 #endif 3630 3631 return unusable_free; 3632 } 3633 3634 /* 3635 * Return true if free base pages are above 'mark'. For high-order checks it 3636 * will return true of the order-0 watermark is reached and there is at least 3637 * one free page of a suitable size. Checking now avoids taking the zone lock 3638 * to check in the allocation paths if no pages are free. 3639 */ 3640 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3641 int highest_zoneidx, unsigned int alloc_flags, 3642 long free_pages) 3643 { 3644 long min = mark; 3645 int o; 3646 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); 3647 3648 /* free_pages may go negative - that's OK */ 3649 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); 3650 3651 if (alloc_flags & ALLOC_HIGH) 3652 min -= min / 2; 3653 3654 if (unlikely(alloc_harder)) { 3655 /* 3656 * OOM victims can try even harder than normal ALLOC_HARDER 3657 * users on the grounds that it's definitely going to be in 3658 * the exit path shortly and free memory. Any allocation it 3659 * makes during the free path will be small and short-lived. 3660 */ 3661 if (alloc_flags & ALLOC_OOM) 3662 min -= min / 2; 3663 else 3664 min -= min / 4; 3665 } 3666 3667 /* 3668 * Check watermarks for an order-0 allocation request. If these 3669 * are not met, then a high-order request also cannot go ahead 3670 * even if a suitable page happened to be free. 
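 * As a rough example, a request with both ALLOC_HIGH and ALLOC_HARDER ends
 * up being checked against about 3/8 of the original mark (halved, then
 * reduced by a further quarter) plus the lowmem reserve for
 * highest_zoneidx, while ALLOC_OOM victims get the mark halved a second
 * time instead.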
3671 */ 3672 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) 3673 return false; 3674 3675 /* If this is an order-0 request then the watermark is fine */ 3676 if (!order) 3677 return true; 3678 3679 /* For a high-order request, check at least one suitable page is free */ 3680 for (o = order; o < MAX_ORDER; o++) { 3681 struct free_area *area = &z->free_area[o]; 3682 int mt; 3683 3684 if (!area->nr_free) 3685 continue; 3686 3687 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3688 if (!free_area_empty(area, mt)) 3689 return true; 3690 } 3691 3692 #ifdef CONFIG_CMA 3693 if ((alloc_flags & ALLOC_CMA) && 3694 !free_area_empty(area, MIGRATE_CMA)) { 3695 return true; 3696 } 3697 #endif 3698 if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) 3699 return true; 3700 } 3701 return false; 3702 } 3703 3704 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3705 int highest_zoneidx, unsigned int alloc_flags) 3706 { 3707 return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 3708 zone_page_state(z, NR_FREE_PAGES)); 3709 } 3710 3711 static inline bool zone_watermark_fast(struct zone *z, unsigned int order, 3712 unsigned long mark, int highest_zoneidx, 3713 unsigned int alloc_flags, gfp_t gfp_mask) 3714 { 3715 long free_pages; 3716 3717 free_pages = zone_page_state(z, NR_FREE_PAGES); 3718 3719 /* 3720 * Fast check for order-0 only. If this fails then the reserves 3721 * need to be calculated. 3722 */ 3723 if (!order) { 3724 long fast_free; 3725 3726 fast_free = free_pages; 3727 fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags); 3728 if (fast_free > mark + z->lowmem_reserve[highest_zoneidx]) 3729 return true; 3730 } 3731 3732 if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 3733 free_pages)) 3734 return true; 3735 /* 3736 * Ignore watermark boosting for GFP_ATOMIC order-0 allocations 3737 * when checking the min watermark. The min watermark is the 3738 * point where boosting is ignored so that kswapd is woken up 3739 * when below the low watermark. 3740 */ 3741 if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost 3742 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { 3743 mark = z->_watermark[WMARK_MIN]; 3744 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 3745 alloc_flags, free_pages); 3746 } 3747 3748 return false; 3749 } 3750 3751 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 3752 unsigned long mark, int highest_zoneidx) 3753 { 3754 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3755 3756 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 3757 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 3758 3759 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, 3760 free_pages); 3761 } 3762 3763 #ifdef CONFIG_NUMA 3764 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3765 { 3766 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3767 node_reclaim_distance; 3768 } 3769 #else /* CONFIG_NUMA */ 3770 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3771 { 3772 return true; 3773 } 3774 #endif /* CONFIG_NUMA */ 3775 3776 /* 3777 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid 3778 * fragmentation is subtle. If the preferred zone was HIGHMEM then 3779 * premature use of a lower zone may cause lowmem pressure problems that 3780 * are worse than fragmentation. If the next zone is ZONE_DMA then it is 3781 * probably too small. 
It only makes sense to spread allocations to avoid 3782 * fragmentation between the Normal and DMA32 zones. 3783 */ 3784 static inline unsigned int 3785 alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) 3786 { 3787 unsigned int alloc_flags; 3788 3789 /* 3790 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 3791 * to save a branch. 3792 */ 3793 alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); 3794 3795 #ifdef CONFIG_ZONE_DMA32 3796 if (!zone) 3797 return alloc_flags; 3798 3799 if (zone_idx(zone) != ZONE_NORMAL) 3800 return alloc_flags; 3801 3802 /* 3803 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and 3804 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume 3805 * on UMA that if Normal is populated then so is DMA32. 3806 */ 3807 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); 3808 if (nr_online_nodes > 1 && !populated_zone(--zone)) 3809 return alloc_flags; 3810 3811 alloc_flags |= ALLOC_NOFRAGMENT; 3812 #endif /* CONFIG_ZONE_DMA32 */ 3813 return alloc_flags; 3814 } 3815 3816 static inline unsigned int current_alloc_flags(gfp_t gfp_mask, 3817 unsigned int alloc_flags) 3818 { 3819 #ifdef CONFIG_CMA 3820 unsigned int pflags = current->flags; 3821 3822 if (!(pflags & PF_MEMALLOC_NOCMA) && 3823 gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) 3824 alloc_flags |= ALLOC_CMA; 3825 3826 #endif 3827 return alloc_flags; 3828 } 3829 3830 /* 3831 * get_page_from_freelist goes through the zonelist trying to allocate 3832 * a page. 3833 */ 3834 static struct page * 3835 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 3836 const struct alloc_context *ac) 3837 { 3838 struct zoneref *z; 3839 struct zone *zone; 3840 struct pglist_data *last_pgdat_dirty_limit = NULL; 3841 bool no_fallback; 3842 3843 retry: 3844 /* 3845 * Scan zonelist, looking for a zone with enough free. 3846 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 3847 */ 3848 no_fallback = alloc_flags & ALLOC_NOFRAGMENT; 3849 z = ac->preferred_zoneref; 3850 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, 3851 ac->nodemask) { 3852 struct page *page; 3853 unsigned long mark; 3854 3855 if (cpusets_enabled() && 3856 (alloc_flags & ALLOC_CPUSET) && 3857 !__cpuset_zone_allowed(zone, gfp_mask)) 3858 continue; 3859 /* 3860 * When allocating a page cache page for writing, we 3861 * want to get it from a node that is within its dirty 3862 * limit, such that no single node holds more than its 3863 * proportional share of globally allowed dirty pages. 3864 * The dirty limits take into account the node's 3865 * lowmem reserves and high watermark so that kswapd 3866 * should be able to balance it without having to 3867 * write pages from its LRU list. 3868 * 3869 * XXX: For now, allow allocations to potentially 3870 * exceed the per-node dirty limit in the slowpath 3871 * (spread_dirty_pages unset) before going into reclaim, 3872 * which is important when on a NUMA setup the allowed 3873 * nodes are together not big enough to reach the 3874 * global limit. The proper fix for these situations 3875 * will require awareness of nodes in the 3876 * dirty-throttling and the flusher threads. 
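 *
 * Rough numeric illustration (made-up values): if the global dirty
 * threshold works out to about 4GB and a node contributes a quarter of
 * the dirtyable memory, node_dirty_ok() for that node starts failing
 * once it holds on the order of 1GB of dirty and writeback pages, and
 * the check below then skips all of that node's zones for as long as
 * spread_dirty_pages is set for this allocation.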
3877 */ 3878 if (ac->spread_dirty_pages) { 3879 if (last_pgdat_dirty_limit == zone->zone_pgdat) 3880 continue; 3881 3882 if (!node_dirty_ok(zone->zone_pgdat)) { 3883 last_pgdat_dirty_limit = zone->zone_pgdat; 3884 continue; 3885 } 3886 } 3887 3888 if (no_fallback && nr_online_nodes > 1 && 3889 zone != ac->preferred_zoneref->zone) { 3890 int local_nid; 3891 3892 /* 3893 * If moving to a remote node, retry but allow 3894 * fragmenting fallbacks. Locality is more important 3895 * than fragmentation avoidance. 3896 */ 3897 local_nid = zone_to_nid(ac->preferred_zoneref->zone); 3898 if (zone_to_nid(zone) != local_nid) { 3899 alloc_flags &= ~ALLOC_NOFRAGMENT; 3900 goto retry; 3901 } 3902 } 3903 3904 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); 3905 if (!zone_watermark_fast(zone, order, mark, 3906 ac->highest_zoneidx, alloc_flags, 3907 gfp_mask)) { 3908 int ret; 3909 3910 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 3911 /* 3912 * Watermark failed for this zone, but see if we can 3913 * grow this zone if it contains deferred pages. 3914 */ 3915 if (static_branch_unlikely(&deferred_pages)) { 3916 if (_deferred_grow_zone(zone, order)) 3917 goto try_this_zone; 3918 } 3919 #endif 3920 /* Checked here to keep the fast path fast */ 3921 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 3922 if (alloc_flags & ALLOC_NO_WATERMARKS) 3923 goto try_this_zone; 3924 3925 if (node_reclaim_mode == 0 || 3926 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) 3927 continue; 3928 3929 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); 3930 switch (ret) { 3931 case NODE_RECLAIM_NOSCAN: 3932 /* did not scan */ 3933 continue; 3934 case NODE_RECLAIM_FULL: 3935 /* scanned but unreclaimable */ 3936 continue; 3937 default: 3938 /* did we reclaim enough */ 3939 if (zone_watermark_ok(zone, order, mark, 3940 ac->highest_zoneidx, alloc_flags)) 3941 goto try_this_zone; 3942 3943 continue; 3944 } 3945 } 3946 3947 try_this_zone: 3948 page = rmqueue(ac->preferred_zoneref->zone, zone, order, 3949 gfp_mask, alloc_flags, ac->migratetype); 3950 if (page) { 3951 prep_new_page(page, order, gfp_mask, alloc_flags); 3952 3953 /* 3954 * If this is a high-order atomic allocation then check 3955 * if the pageblock should be reserved for the future 3956 */ 3957 if (unlikely(order && (alloc_flags & ALLOC_HARDER))) 3958 reserve_highatomic_pageblock(page, zone, order); 3959 3960 return page; 3961 } else { 3962 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 3963 /* Try again if zone has deferred pages */ 3964 if (static_branch_unlikely(&deferred_pages)) { 3965 if (_deferred_grow_zone(zone, order)) 3966 goto try_this_zone; 3967 } 3968 #endif 3969 } 3970 } 3971 3972 /* 3973 * It's possible on a UMA machine to get through all zones that are 3974 * fragmented. If avoiding fragmentation, reset and try again. 3975 */ 3976 if (no_fallback) { 3977 alloc_flags &= ~ALLOC_NOFRAGMENT; 3978 goto retry; 3979 } 3980 3981 return NULL; 3982 } 3983 3984 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 3985 { 3986 unsigned int filter = SHOW_MEM_FILTER_NODES; 3987 3988 /* 3989 * This documents exceptions given to allocations in certain 3990 * contexts that are allowed to allocate outside current's set 3991 * of allowed nodes. 
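 *
 * In short (an informal restatement of the checks below): the node
 * filter is dropped either because the task may legitimately allocate
 * outside its cpuset (an OOM victim, PF_MEMALLOC or PF_EXITING task
 * that did not pass __GFP_NOMEMALLOC), or because the context could
 * not have honoured the cpuset in the first place (interrupt context
 * or no __GFP_DIRECT_RECLAIM).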
3992 */ 3993 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3994 if (tsk_is_oom_victim(current) || 3995 (current->flags & (PF_MEMALLOC | PF_EXITING))) 3996 filter &= ~SHOW_MEM_FILTER_NODES; 3997 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3998 filter &= ~SHOW_MEM_FILTER_NODES; 3999 4000 show_mem(filter, nodemask); 4001 } 4002 4003 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 4004 { 4005 struct va_format vaf; 4006 va_list args; 4007 static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); 4008 4009 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 4010 return; 4011 4012 va_start(args, fmt); 4013 vaf.fmt = fmt; 4014 vaf.va = &args; 4015 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", 4016 current->comm, &vaf, gfp_mask, &gfp_mask, 4017 nodemask_pr_args(nodemask)); 4018 va_end(args); 4019 4020 cpuset_print_current_mems_allowed(); 4021 pr_cont("\n"); 4022 dump_stack(); 4023 warn_alloc_show_mem(gfp_mask, nodemask); 4024 } 4025 4026 static inline struct page * 4027 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, 4028 unsigned int alloc_flags, 4029 const struct alloc_context *ac) 4030 { 4031 struct page *page; 4032 4033 page = get_page_from_freelist(gfp_mask, order, 4034 alloc_flags|ALLOC_CPUSET, ac); 4035 /* 4036 * fallback to ignore cpuset restriction if our nodes 4037 * are depleted 4038 */ 4039 if (!page) 4040 page = get_page_from_freelist(gfp_mask, order, 4041 alloc_flags, ac); 4042 4043 return page; 4044 } 4045 4046 static inline struct page * 4047 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 4048 const struct alloc_context *ac, unsigned long *did_some_progress) 4049 { 4050 struct oom_control oc = { 4051 .zonelist = ac->zonelist, 4052 .nodemask = ac->nodemask, 4053 .memcg = NULL, 4054 .gfp_mask = gfp_mask, 4055 .order = order, 4056 }; 4057 struct page *page; 4058 4059 *did_some_progress = 0; 4060 4061 /* 4062 * Acquire the oom lock. If that fails, somebody else is 4063 * making progress for us. 4064 */ 4065 if (!mutex_trylock(&oom_lock)) { 4066 *did_some_progress = 1; 4067 schedule_timeout_uninterruptible(1); 4068 return NULL; 4069 } 4070 4071 /* 4072 * Go through the zonelist yet one more time, keep very high watermark 4073 * here, this is only to catch a parallel oom killing, we must fail if 4074 * we're still under heavy pressure. But make sure that this reclaim 4075 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY 4076 * allocation which will never fail due to oom_lock already held. 4077 */ 4078 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & 4079 ~__GFP_DIRECT_RECLAIM, order, 4080 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 4081 if (page) 4082 goto out; 4083 4084 /* Coredumps can quickly deplete all memory reserves */ 4085 if (current->flags & PF_DUMPCORE) 4086 goto out; 4087 /* The OOM killer will not help higher order allocs */ 4088 if (order > PAGE_ALLOC_COSTLY_ORDER) 4089 goto out; 4090 /* 4091 * We have already exhausted all our reclaim opportunities without any 4092 * success so it is time to admit defeat. We will skip the OOM killer 4093 * because it is very likely that the caller has a more reasonable 4094 * fallback than shooting a random task. 4095 * 4096 * The OOM killer may not free memory on a specific node. 
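 *
 * Pulling the bail-out conditions of this function together (an
 * informal summary of the checks above and below, not extra logic):
 *
 *	if ((gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) ||
 *	    (current->flags & PF_DUMPCORE) ||
 *	    order > PAGE_ALLOC_COSTLY_ORDER ||
 *	    ac->highest_zoneidx < ZONE_NORMAL ||
 *	    pm_suspended_storage())
 *		goto out;	// do not call out_of_memory()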
4097 */
4098 if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
4099 goto out;
4100 /* The OOM killer does not needlessly kill tasks for lowmem */
4101 if (ac->highest_zoneidx < ZONE_NORMAL)
4102 goto out;
4103 if (pm_suspended_storage())
4104 goto out;
4105 /*
4106 * XXX: GFP_NOFS allocations should rather fail than rely on
4107 * other requests to make forward progress.
4108 * We are in an unfortunate situation where out_of_memory cannot
4109 * do much for this context but let's try it to at least get
4110 * access to memory reserves if the current task is killed (see
4111 * out_of_memory). Once filesystems are ready to handle allocation
4112 * failures more gracefully we should just bail out here.
4113 */
4114 
4115 /* Exhausted what can be done so it's blame time */
4116 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
4117 *did_some_progress = 1;
4118 
4119 /*
4120 * Help non-failing allocations by giving them access to memory
4121 * reserves
4122 */
4123 if (gfp_mask & __GFP_NOFAIL)
4124 page = __alloc_pages_cpuset_fallback(gfp_mask, order,
4125 ALLOC_NO_WATERMARKS, ac);
4126 }
4127 out:
4128 mutex_unlock(&oom_lock);
4129 return page;
4130 }
4131 
4132 /*
4133 * Maximum number of compaction retries with progress before the OOM
4134 * killer is considered the only way to move forward.
4135 */
4136 #define MAX_COMPACT_RETRIES 16
4137 
4138 #ifdef CONFIG_COMPACTION
4139 /* Try memory compaction for high-order allocations before reclaim */
4140 static struct page *
4141 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
4142 unsigned int alloc_flags, const struct alloc_context *ac,
4143 enum compact_priority prio, enum compact_result *compact_result)
4144 {
4145 struct page *page = NULL;
4146 unsigned long pflags;
4147 unsigned int noreclaim_flag;
4148 
4149 if (!order)
4150 return NULL;
4151 
4152 psi_memstall_enter(&pflags);
4153 noreclaim_flag = memalloc_noreclaim_save();
4154 
4155 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
4156 prio, &page);
4157 
4158 memalloc_noreclaim_restore(noreclaim_flag);
4159 psi_memstall_leave(&pflags);
4160 
4161 /*
4162 * Compaction wasn't deferred or skipped in at least one zone, so let's
4163 * count a compaction stall
4164 */
4165 count_vm_event(COMPACTSTALL);
4166 
4167 /* Prep a captured page if available */
4168 if (page)
4169 prep_new_page(page, order, gfp_mask, alloc_flags);
4170 
4171 /* Try to get a page from the freelist if available */
4172 if (!page)
4173 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4174 
4175 if (page) {
4176 struct zone *zone = page_zone(page);
4177 
4178 zone->compact_blockskip_flush = false;
4179 compaction_defer_reset(zone, order, true);
4180 count_vm_event(COMPACTSUCCESS);
4181 return page;
4182 }
4183 
4184 /*
4185 * It's bad if a compaction run occurs and fails. The most likely reason
4186 * is that pages exist, but not enough to satisfy watermarks.
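 *
 * The flow above, condensed (informal sketch, not a separate helper):
 *
 *	try_to_compact_pages()          -> COMPACTSTALL is counted
 *	captured page or freelist hit   -> COMPACTSUCCESS, defer state
 *	                                   reset, return the page
 *	otherwise                       -> COMPACTFAIL (below), then
 *	                                   cond_resched() and NULL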
4187 */ 4188 count_vm_event(COMPACTFAIL); 4189 4190 cond_resched(); 4191 4192 return NULL; 4193 } 4194 4195 static inline bool 4196 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, 4197 enum compact_result compact_result, 4198 enum compact_priority *compact_priority, 4199 int *compaction_retries) 4200 { 4201 int max_retries = MAX_COMPACT_RETRIES; 4202 int min_priority; 4203 bool ret = false; 4204 int retries = *compaction_retries; 4205 enum compact_priority priority = *compact_priority; 4206 4207 if (!order) 4208 return false; 4209 4210 if (compaction_made_progress(compact_result)) 4211 (*compaction_retries)++; 4212 4213 /* 4214 * compaction considers all the zone as desperately out of memory 4215 * so it doesn't really make much sense to retry except when the 4216 * failure could be caused by insufficient priority 4217 */ 4218 if (compaction_failed(compact_result)) 4219 goto check_priority; 4220 4221 /* 4222 * compaction was skipped because there are not enough order-0 pages 4223 * to work with, so we retry only if it looks like reclaim can help. 4224 */ 4225 if (compaction_needs_reclaim(compact_result)) { 4226 ret = compaction_zonelist_suitable(ac, order, alloc_flags); 4227 goto out; 4228 } 4229 4230 /* 4231 * make sure the compaction wasn't deferred or didn't bail out early 4232 * due to locks contention before we declare that we should give up. 4233 * But the next retry should use a higher priority if allowed, so 4234 * we don't just keep bailing out endlessly. 4235 */ 4236 if (compaction_withdrawn(compact_result)) { 4237 goto check_priority; 4238 } 4239 4240 /* 4241 * !costly requests are much more important than __GFP_RETRY_MAYFAIL 4242 * costly ones because they are de facto nofail and invoke OOM 4243 * killer to move on while costly can fail and users are ready 4244 * to cope with that. 1/4 retries is rather arbitrary but we 4245 * would need much more detailed feedback from compaction to 4246 * make a better decision. 4247 */ 4248 if (order > PAGE_ALLOC_COSTLY_ORDER) 4249 max_retries /= 4; 4250 if (*compaction_retries <= max_retries) { 4251 ret = true; 4252 goto out; 4253 } 4254 4255 /* 4256 * Make sure there are attempts at the highest priority if we exhausted 4257 * all retries or failed at the lower priorities. 4258 */ 4259 check_priority: 4260 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 4261 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 4262 4263 if (*compact_priority > min_priority) { 4264 (*compact_priority)--; 4265 *compaction_retries = 0; 4266 ret = true; 4267 } 4268 out: 4269 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); 4270 return ret; 4271 } 4272 #else 4273 static inline struct page * 4274 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 4275 unsigned int alloc_flags, const struct alloc_context *ac, 4276 enum compact_priority prio, enum compact_result *compact_result) 4277 { 4278 *compact_result = COMPACT_SKIPPED; 4279 return NULL; 4280 } 4281 4282 static inline bool 4283 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 4284 enum compact_result compact_result, 4285 enum compact_priority *compact_priority, 4286 int *compaction_retries) 4287 { 4288 struct zone *zone; 4289 struct zoneref *z; 4290 4291 if (!order || order > PAGE_ALLOC_COSTLY_ORDER) 4292 return false; 4293 4294 /* 4295 * There are setups with compaction disabled which would prefer to loop 4296 * inside the allocator rather than hit the oom killer prematurely. 
4297 * Let's give them a good hope and keep retrying while the order-0 4298 * watermarks are OK. 4299 */ 4300 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 4301 ac->highest_zoneidx, ac->nodemask) { 4302 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), 4303 ac->highest_zoneidx, alloc_flags)) 4304 return true; 4305 } 4306 return false; 4307 } 4308 #endif /* CONFIG_COMPACTION */ 4309 4310 #ifdef CONFIG_LOCKDEP 4311 static struct lockdep_map __fs_reclaim_map = 4312 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); 4313 4314 static bool __need_reclaim(gfp_t gfp_mask) 4315 { 4316 /* no reclaim without waiting on it */ 4317 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 4318 return false; 4319 4320 /* this guy won't enter reclaim */ 4321 if (current->flags & PF_MEMALLOC) 4322 return false; 4323 4324 if (gfp_mask & __GFP_NOLOCKDEP) 4325 return false; 4326 4327 return true; 4328 } 4329 4330 void __fs_reclaim_acquire(void) 4331 { 4332 lock_map_acquire(&__fs_reclaim_map); 4333 } 4334 4335 void __fs_reclaim_release(void) 4336 { 4337 lock_map_release(&__fs_reclaim_map); 4338 } 4339 4340 void fs_reclaim_acquire(gfp_t gfp_mask) 4341 { 4342 gfp_mask = current_gfp_context(gfp_mask); 4343 4344 if (__need_reclaim(gfp_mask)) { 4345 if (gfp_mask & __GFP_FS) 4346 __fs_reclaim_acquire(); 4347 4348 #ifdef CONFIG_MMU_NOTIFIER 4349 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); 4350 lock_map_release(&__mmu_notifier_invalidate_range_start_map); 4351 #endif 4352 4353 } 4354 } 4355 EXPORT_SYMBOL_GPL(fs_reclaim_acquire); 4356 4357 void fs_reclaim_release(gfp_t gfp_mask) 4358 { 4359 gfp_mask = current_gfp_context(gfp_mask); 4360 4361 if (__need_reclaim(gfp_mask)) { 4362 if (gfp_mask & __GFP_FS) 4363 __fs_reclaim_release(); 4364 } 4365 } 4366 EXPORT_SYMBOL_GPL(fs_reclaim_release); 4367 #endif 4368 4369 /* Perform direct synchronous page reclaim */ 4370 static unsigned long 4371 __perform_reclaim(gfp_t gfp_mask, unsigned int order, 4372 const struct alloc_context *ac) 4373 { 4374 unsigned int noreclaim_flag; 4375 unsigned long pflags, progress; 4376 4377 cond_resched(); 4378 4379 /* We now go into synchronous reclaim */ 4380 cpuset_memory_pressure_bump(); 4381 psi_memstall_enter(&pflags); 4382 fs_reclaim_acquire(gfp_mask); 4383 noreclaim_flag = memalloc_noreclaim_save(); 4384 4385 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, 4386 ac->nodemask); 4387 4388 memalloc_noreclaim_restore(noreclaim_flag); 4389 fs_reclaim_release(gfp_mask); 4390 psi_memstall_leave(&pflags); 4391 4392 cond_resched(); 4393 4394 return progress; 4395 } 4396 4397 /* The really slow allocator path where we enter direct reclaim */ 4398 static inline struct page * 4399 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 4400 unsigned int alloc_flags, const struct alloc_context *ac, 4401 unsigned long *did_some_progress) 4402 { 4403 struct page *page = NULL; 4404 bool drained = false; 4405 4406 *did_some_progress = __perform_reclaim(gfp_mask, order, ac); 4407 if (unlikely(!(*did_some_progress))) 4408 return NULL; 4409 4410 retry: 4411 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4412 4413 /* 4414 * If an allocation failed after direct reclaim, it could be because 4415 * pages are pinned on the per-cpu lists or in high alloc reserves. 
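 * (Pages on the per-cpu free lists are not visible to the buddy
 * freelists until they are drained back, and the highatomic reserve is
 * normally off-limits to this request, so a one-off unreserve plus
 * drain_all_pages() pass is cheap insurance against a spurious failure
 * right after reclaim made progress.)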
4416 * Shrink them and try again 4417 */ 4418 if (!page && !drained) { 4419 unreserve_highatomic_pageblock(ac, false); 4420 drain_all_pages(NULL); 4421 drained = true; 4422 goto retry; 4423 } 4424 4425 return page; 4426 } 4427 4428 static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, 4429 const struct alloc_context *ac) 4430 { 4431 struct zoneref *z; 4432 struct zone *zone; 4433 pg_data_t *last_pgdat = NULL; 4434 enum zone_type highest_zoneidx = ac->highest_zoneidx; 4435 4436 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, 4437 ac->nodemask) { 4438 if (last_pgdat != zone->zone_pgdat) 4439 wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); 4440 last_pgdat = zone->zone_pgdat; 4441 } 4442 } 4443 4444 static inline unsigned int 4445 gfp_to_alloc_flags(gfp_t gfp_mask) 4446 { 4447 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 4448 4449 /* 4450 * __GFP_HIGH is assumed to be the same as ALLOC_HIGH 4451 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 4452 * to save two branches. 4453 */ 4454 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 4455 BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); 4456 4457 /* 4458 * The caller may dip into page reserves a bit more if the caller 4459 * cannot run direct reclaim, or if the caller has realtime scheduling 4460 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 4461 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). 4462 */ 4463 alloc_flags |= (__force int) 4464 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); 4465 4466 if (gfp_mask & __GFP_ATOMIC) { 4467 /* 4468 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 4469 * if it can't schedule. 4470 */ 4471 if (!(gfp_mask & __GFP_NOMEMALLOC)) 4472 alloc_flags |= ALLOC_HARDER; 4473 /* 4474 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 4475 * comment for __cpuset_node_allowed(). 4476 */ 4477 alloc_flags &= ~ALLOC_CPUSET; 4478 } else if (unlikely(rt_task(current)) && !in_interrupt()) 4479 alloc_flags |= ALLOC_HARDER; 4480 4481 alloc_flags = current_alloc_flags(gfp_mask, alloc_flags); 4482 4483 return alloc_flags; 4484 } 4485 4486 static bool oom_reserves_allowed(struct task_struct *tsk) 4487 { 4488 if (!tsk_is_oom_victim(tsk)) 4489 return false; 4490 4491 /* 4492 * !MMU doesn't have oom reaper so give access to memory reserves 4493 * only to the thread with TIF_MEMDIE set 4494 */ 4495 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) 4496 return false; 4497 4498 return true; 4499 } 4500 4501 /* 4502 * Distinguish requests which really need access to full memory 4503 * reserves from oom victims which can live with a portion of it 4504 */ 4505 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) 4506 { 4507 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 4508 return 0; 4509 if (gfp_mask & __GFP_MEMALLOC) 4510 return ALLOC_NO_WATERMARKS; 4511 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 4512 return ALLOC_NO_WATERMARKS; 4513 if (!in_interrupt()) { 4514 if (current->flags & PF_MEMALLOC) 4515 return ALLOC_NO_WATERMARKS; 4516 else if (oom_reserves_allowed(current)) 4517 return ALLOC_OOM; 4518 } 4519 4520 return 0; 4521 } 4522 4523 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 4524 { 4525 return !!__gfp_pfmemalloc_flags(gfp_mask); 4526 } 4527 4528 /* 4529 * Checks whether it makes sense to retry the reclaim to make a forward progress 4530 * for the given allocation request. 
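 *
 * Informally, the retry condition can be modelled as (an illustrative
 * restatement, not a separate helper): keep retrying while
 *
 *	no_progress_loops <= MAX_RECLAIM_RETRIES
 *
 * and at least one eligible zone would pass its min watermark if all
 * of its reclaimable pages were freed, i.e. if
 *
 *	__zone_watermark_ok(zone, order, min_wmark_pages(zone),
 *			    ac->highest_zoneidx, alloc_flags,
 *			    free + reclaimable)
 *
 * is true for some zone in the zonelist.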
4531 *
4532 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
4533 * without success, or when we couldn't even meet the watermark if we
4534 * reclaimed all remaining pages on the LRU lists.
4535 *
4536 * Returns true if a retry is viable or false to enter the oom path.
4537 */
4538 static inline bool
4539 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
4540 struct alloc_context *ac, int alloc_flags,
4541 bool did_some_progress, int *no_progress_loops)
4542 {
4543 struct zone *zone;
4544 struct zoneref *z;
4545 bool ret = false;
4546 
4547 /*
4548 * Costly allocations might have made progress but this doesn't mean
4549 * their order will become available due to high fragmentation so
4550 * always increment the no progress counter for them
4551 */
4552 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
4553 *no_progress_loops = 0;
4554 else
4555 (*no_progress_loops)++;
4556 
4557 /*
4558 * Make sure we converge to OOM if we cannot make any progress
4559 * several times in a row.
4560 */
4561 if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
4562 /* Before OOM, exhaust highatomic_reserve */
4563 return unreserve_highatomic_pageblock(ac, true);
4564 }
4565 
4566 /*
4567 * Keep reclaiming pages while there is a chance this will lead
4568 * somewhere. If none of the target zones can satisfy our allocation
4569 * request even if all reclaimable pages are considered then we are
4570 * screwed and have to go OOM.
4571 */
4572 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4573 ac->highest_zoneidx, ac->nodemask) {
4574 unsigned long available;
4575 unsigned long reclaimable;
4576 unsigned long min_wmark = min_wmark_pages(zone);
4577 bool wmark;
4578 
4579 available = reclaimable = zone_reclaimable_pages(zone);
4580 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
4581 
4582 /*
4583 * Would the allocation succeed if we reclaimed all
4584 * reclaimable pages?
4585 */
4586 wmark = __zone_watermark_ok(zone, order, min_wmark,
4587 ac->highest_zoneidx, alloc_flags, available);
4588 trace_reclaim_retry_zone(z, order, reclaimable,
4589 available, min_wmark, *no_progress_loops, wmark);
4590 if (wmark) {
4591 /*
4592 * If we didn't make any progress and have a lot of
4593 * dirty + writeback pages then we should wait for
4594 * an IO to complete to slow down the reclaim and
4595 * prevent a premature OOM
4596 */
4597 if (!did_some_progress) {
4598 unsigned long write_pending;
4599 
4600 write_pending = zone_page_state_snapshot(zone,
4601 NR_ZONE_WRITE_PENDING);
4602 
4603 if (2 * write_pending > reclaimable) {
4604 congestion_wait(BLK_RW_ASYNC, HZ/10);
4605 return true;
4606 }
4607 }
4608 
4609 ret = true;
4610 goto out;
4611 }
4612 }
4613 
4614 out:
4615 /*
4616 * Memory allocation/reclaim might be called from a WQ context and the
4617 * current implementation of the WQ concurrency control doesn't
4618 * recognize that a particular WQ is congested if the worker thread is
4619 * looping without ever sleeping. Therefore we have to do a short sleep
4620 * here rather than calling cond_resched().
4621 */
4622 if (current->flags & PF_WQ_WORKER)
4623 schedule_timeout_uninterruptible(1);
4624 else
4625 cond_resched();
4626 return ret;
4627 }
4628 
4629 static inline bool
4630 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4631 {
4632 /*
4633 * It's possible that cpuset's mems_allowed and the nodemask from
4634 * mempolicy don't intersect.
This should be normally dealt with by 4635 * policy_nodemask(), but it's possible to race with cpuset update in 4636 * such a way the check therein was true, and then it became false 4637 * before we got our cpuset_mems_cookie here. 4638 * This assumes that for all allocations, ac->nodemask can come only 4639 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored 4640 * when it does not intersect with the cpuset restrictions) or the 4641 * caller can deal with a violated nodemask. 4642 */ 4643 if (cpusets_enabled() && ac->nodemask && 4644 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { 4645 ac->nodemask = NULL; 4646 return true; 4647 } 4648 4649 /* 4650 * When updating a task's mems_allowed or mempolicy nodemask, it is 4651 * possible to race with parallel threads in such a way that our 4652 * allocation can fail while the mask is being updated. If we are about 4653 * to fail, check if the cpuset changed during allocation and if so, 4654 * retry. 4655 */ 4656 if (read_mems_allowed_retry(cpuset_mems_cookie)) 4657 return true; 4658 4659 return false; 4660 } 4661 4662 static inline struct page * 4663 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 4664 struct alloc_context *ac) 4665 { 4666 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; 4667 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; 4668 struct page *page = NULL; 4669 unsigned int alloc_flags; 4670 unsigned long did_some_progress; 4671 enum compact_priority compact_priority; 4672 enum compact_result compact_result; 4673 int compaction_retries; 4674 int no_progress_loops; 4675 unsigned int cpuset_mems_cookie; 4676 int reserve_flags; 4677 4678 /* 4679 * We also sanity check to catch abuse of atomic reserves being used by 4680 * callers that are not in atomic context. 4681 */ 4682 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == 4683 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) 4684 gfp_mask &= ~__GFP_ATOMIC; 4685 4686 retry_cpuset: 4687 compaction_retries = 0; 4688 no_progress_loops = 0; 4689 compact_priority = DEF_COMPACT_PRIORITY; 4690 cpuset_mems_cookie = read_mems_allowed_begin(); 4691 4692 /* 4693 * The fast path uses conservative alloc_flags to succeed only until 4694 * kswapd needs to be woken up, and to avoid the cost of setting up 4695 * alloc_flags precisely. So we do that now. 4696 */ 4697 alloc_flags = gfp_to_alloc_flags(gfp_mask); 4698 4699 /* 4700 * We need to recalculate the starting point for the zonelist iterator 4701 * because we might have used different nodemask in the fast path, or 4702 * there was a cpuset modification and we are retrying - otherwise we 4703 * could end up iterating over non-eligible zones endlessly. 4704 */ 4705 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4706 ac->highest_zoneidx, ac->nodemask); 4707 if (!ac->preferred_zoneref->zone) 4708 goto nopage; 4709 4710 if (alloc_flags & ALLOC_KSWAPD) 4711 wake_all_kswapds(order, gfp_mask, ac); 4712 4713 /* 4714 * The adjusted alloc_flags might result in immediate success, so try 4715 * that first 4716 */ 4717 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4718 if (page) 4719 goto got_pg; 4720 4721 /* 4722 * For costly allocations, try direct compaction first, as it's likely 4723 * that we have enough base pages and don't need to reclaim. For non- 4724 * movable high-order allocations, do that as well, as compaction will 4725 * try prevent permanent fragmentation by migrating from blocks of the 4726 * same migratetype. 
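 *
 * Condensed form of the condition used just below (illustrative only):
 *
 *	if (can_direct_reclaim && !gfp_pfmemalloc_allowed(gfp_mask) &&
 *	    (costly_order ||
 *	     (order > 0 && ac->migratetype != MIGRATE_MOVABLE)))
 *		... try __alloc_pages_direct_compact() before reclaim ...
 *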
4727 * Don't try this for allocations that are allowed to ignore 4728 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 4729 */ 4730 if (can_direct_reclaim && 4731 (costly_order || 4732 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) 4733 && !gfp_pfmemalloc_allowed(gfp_mask)) { 4734 page = __alloc_pages_direct_compact(gfp_mask, order, 4735 alloc_flags, ac, 4736 INIT_COMPACT_PRIORITY, 4737 &compact_result); 4738 if (page) 4739 goto got_pg; 4740 4741 /* 4742 * Checks for costly allocations with __GFP_NORETRY, which 4743 * includes some THP page fault allocations 4744 */ 4745 if (costly_order && (gfp_mask & __GFP_NORETRY)) { 4746 /* 4747 * If allocating entire pageblock(s) and compaction 4748 * failed because all zones are below low watermarks 4749 * or is prohibited because it recently failed at this 4750 * order, fail immediately unless the allocator has 4751 * requested compaction and reclaim retry. 4752 * 4753 * Reclaim is 4754 * - potentially very expensive because zones are far 4755 * below their low watermarks or this is part of very 4756 * bursty high order allocations, 4757 * - not guaranteed to help because isolate_freepages() 4758 * may not iterate over freed pages as part of its 4759 * linear scan, and 4760 * - unlikely to make entire pageblocks free on its 4761 * own. 4762 */ 4763 if (compact_result == COMPACT_SKIPPED || 4764 compact_result == COMPACT_DEFERRED) 4765 goto nopage; 4766 4767 /* 4768 * Looks like reclaim/compaction is worth trying, but 4769 * sync compaction could be very expensive, so keep 4770 * using async compaction. 4771 */ 4772 compact_priority = INIT_COMPACT_PRIORITY; 4773 } 4774 } 4775 4776 retry: 4777 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 4778 if (alloc_flags & ALLOC_KSWAPD) 4779 wake_all_kswapds(order, gfp_mask, ac); 4780 4781 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 4782 if (reserve_flags) 4783 alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); 4784 4785 /* 4786 * Reset the nodemask and zonelist iterators if memory policies can be 4787 * ignored. These allocations are high priority and system rather than 4788 * user oriented. 
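 *
 * Concretely: clearing ac->nodemask lets the zonelist iterator below
 * consider every node again, and recomputing preferred_zoneref keeps
 * get_page_from_freelist() from starting its scan at a zone picked by
 * the now-ignored policy.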
4789 */ 4790 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { 4791 ac->nodemask = NULL; 4792 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4793 ac->highest_zoneidx, ac->nodemask); 4794 } 4795 4796 /* Attempt with potentially adjusted zonelist and alloc_flags */ 4797 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4798 if (page) 4799 goto got_pg; 4800 4801 /* Caller is not willing to reclaim, we can't balance anything */ 4802 if (!can_direct_reclaim) 4803 goto nopage; 4804 4805 /* Avoid recursion of direct reclaim */ 4806 if (current->flags & PF_MEMALLOC) 4807 goto nopage; 4808 4809 /* Try direct reclaim and then allocating */ 4810 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 4811 &did_some_progress); 4812 if (page) 4813 goto got_pg; 4814 4815 /* Try direct compaction and then allocating */ 4816 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 4817 compact_priority, &compact_result); 4818 if (page) 4819 goto got_pg; 4820 4821 /* Do not loop if specifically requested */ 4822 if (gfp_mask & __GFP_NORETRY) 4823 goto nopage; 4824 4825 /* 4826 * Do not retry costly high order allocations unless they are 4827 * __GFP_RETRY_MAYFAIL 4828 */ 4829 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) 4830 goto nopage; 4831 4832 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 4833 did_some_progress > 0, &no_progress_loops)) 4834 goto retry; 4835 4836 /* 4837 * It doesn't make any sense to retry for the compaction if the order-0 4838 * reclaim is not able to make any progress because the current 4839 * implementation of the compaction depends on the sufficient amount 4840 * of free memory (see __compaction_suitable) 4841 */ 4842 if (did_some_progress > 0 && 4843 should_compact_retry(ac, order, alloc_flags, 4844 compact_result, &compact_priority, 4845 &compaction_retries)) 4846 goto retry; 4847 4848 4849 /* Deal with possible cpuset update races before we start OOM killing */ 4850 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 4851 goto retry_cpuset; 4852 4853 /* Reclaim has failed us, start killing things */ 4854 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 4855 if (page) 4856 goto got_pg; 4857 4858 /* Avoid allocations with no watermarks from looping endlessly */ 4859 if (tsk_is_oom_victim(current) && 4860 (alloc_flags & ALLOC_OOM || 4861 (gfp_mask & __GFP_NOMEMALLOC))) 4862 goto nopage; 4863 4864 /* Retry as long as the OOM killer is making progress */ 4865 if (did_some_progress) { 4866 no_progress_loops = 0; 4867 goto retry; 4868 } 4869 4870 nopage: 4871 /* Deal with possible cpuset update races before we fail */ 4872 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 4873 goto retry_cpuset; 4874 4875 /* 4876 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure 4877 * we always retry 4878 */ 4879 if (gfp_mask & __GFP_NOFAIL) { 4880 /* 4881 * All existing users of the __GFP_NOFAIL are blockable, so warn 4882 * of any new users that actually require GFP_NOWAIT 4883 */ 4884 if (WARN_ON_ONCE(!can_direct_reclaim)) 4885 goto fail; 4886 4887 /* 4888 * PF_MEMALLOC request from this context is rather bizarre 4889 * because we cannot reclaim anything and only can loop waiting 4890 * for somebody to do a work for us 4891 */ 4892 WARN_ON_ONCE(current->flags & PF_MEMALLOC); 4893 4894 /* 4895 * non failing costly orders are a hard requirement which we 4896 * are not prepared for much so let's warn about these users 4897 * so that we can identify them and convert them to something 
4898 * else. 4899 */ 4900 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); 4901 4902 /* 4903 * Help non-failing allocations by giving them access to memory 4904 * reserves but do not use ALLOC_NO_WATERMARKS because this 4905 * could deplete whole memory reserves which would just make 4906 * the situation worse 4907 */ 4908 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); 4909 if (page) 4910 goto got_pg; 4911 4912 cond_resched(); 4913 goto retry; 4914 } 4915 fail: 4916 warn_alloc(gfp_mask, ac->nodemask, 4917 "page allocation failure: order:%u", order); 4918 got_pg: 4919 return page; 4920 } 4921 4922 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, 4923 int preferred_nid, nodemask_t *nodemask, 4924 struct alloc_context *ac, gfp_t *alloc_mask, 4925 unsigned int *alloc_flags) 4926 { 4927 ac->highest_zoneidx = gfp_zone(gfp_mask); 4928 ac->zonelist = node_zonelist(preferred_nid, gfp_mask); 4929 ac->nodemask = nodemask; 4930 ac->migratetype = gfp_migratetype(gfp_mask); 4931 4932 if (cpusets_enabled()) { 4933 *alloc_mask |= __GFP_HARDWALL; 4934 /* 4935 * When we are in the interrupt context, it is irrelevant 4936 * to the current task context. It means that any node ok. 4937 */ 4938 if (!in_interrupt() && !ac->nodemask) 4939 ac->nodemask = &cpuset_current_mems_allowed; 4940 else 4941 *alloc_flags |= ALLOC_CPUSET; 4942 } 4943 4944 fs_reclaim_acquire(gfp_mask); 4945 fs_reclaim_release(gfp_mask); 4946 4947 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 4948 4949 if (should_fail_alloc_page(gfp_mask, order)) 4950 return false; 4951 4952 *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); 4953 4954 /* Dirty zone balancing only done in the fast path */ 4955 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); 4956 4957 /* 4958 * The preferred zone is used for statistics but crucially it is 4959 * also used as the starting point for the zonelist iterator. It 4960 * may get reset for allocations that ignore memory policies. 4961 */ 4962 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4963 ac->highest_zoneidx, ac->nodemask); 4964 4965 return true; 4966 } 4967 4968 /* 4969 * This is the 'heart' of the zoned buddy allocator. 4970 */ 4971 struct page * 4972 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, 4973 nodemask_t *nodemask) 4974 { 4975 struct page *page; 4976 unsigned int alloc_flags = ALLOC_WMARK_LOW; 4977 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ 4978 struct alloc_context ac = { }; 4979 4980 /* 4981 * There are several places where we assume that the order value is sane 4982 * so bail out early if the request is out of bound. 4983 */ 4984 if (unlikely(order >= MAX_ORDER)) { 4985 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 4986 return NULL; 4987 } 4988 4989 gfp_mask &= gfp_allowed_mask; 4990 alloc_mask = gfp_mask; 4991 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) 4992 return NULL; 4993 4994 /* 4995 * Forbid the first pass from falling back to types that fragment 4996 * memory until all local zones are considered. 4997 */ 4998 alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); 4999 5000 /* First allocation attempt */ 5001 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 5002 if (likely(page)) 5003 goto out; 5004 5005 /* 5006 * Apply scoped allocation constraints. This is mainly about GFP_NOFS 5007 * resp. 
GFP_NOIO which has to be inherited for all allocation requests 5008 * from a particular context which has been marked by 5009 * memalloc_no{fs,io}_{save,restore}. 5010 */ 5011 alloc_mask = current_gfp_context(gfp_mask); 5012 ac.spread_dirty_pages = false; 5013 5014 /* 5015 * Restore the original nodemask if it was potentially replaced with 5016 * &cpuset_current_mems_allowed to optimize the fast-path attempt. 5017 */ 5018 ac.nodemask = nodemask; 5019 5020 page = __alloc_pages_slowpath(alloc_mask, order, &ac); 5021 5022 out: 5023 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && 5024 unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) { 5025 __free_pages(page, order); 5026 page = NULL; 5027 } 5028 5029 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 5030 5031 return page; 5032 } 5033 EXPORT_SYMBOL(__alloc_pages_nodemask); 5034 5035 /* 5036 * Common helper functions. Never use with __GFP_HIGHMEM because the returned 5037 * address cannot represent highmem pages. Use alloc_pages and then kmap if 5038 * you need to access high mem. 5039 */ 5040 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 5041 { 5042 struct page *page; 5043 5044 page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); 5045 if (!page) 5046 return 0; 5047 return (unsigned long) page_address(page); 5048 } 5049 EXPORT_SYMBOL(__get_free_pages); 5050 5051 unsigned long get_zeroed_page(gfp_t gfp_mask) 5052 { 5053 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 5054 } 5055 EXPORT_SYMBOL(get_zeroed_page); 5056 5057 static inline void free_the_page(struct page *page, unsigned int order) 5058 { 5059 if (order == 0) /* Via pcp? */ 5060 free_unref_page(page); 5061 else 5062 __free_pages_ok(page, order, FPI_NONE); 5063 } 5064 5065 /** 5066 * __free_pages - Free pages allocated with alloc_pages(). 5067 * @page: The page pointer returned from alloc_pages(). 5068 * @order: The order of the allocation. 5069 * 5070 * This function can free multi-page allocations that are not compound 5071 * pages. It does not check that the @order passed in matches that of 5072 * the allocation, so it is easy to leak memory. Freeing more memory 5073 * than was allocated will probably emit a warning. 5074 * 5075 * If the last reference to this page is speculative, it will be released 5076 * by put_page() which only frees the first page of a non-compound 5077 * allocation. To prevent the remaining pages from being leaked, we free 5078 * the subsequent pages here. If you want to use the page's reference 5079 * count to decide when to free the allocation, you should allocate a 5080 * compound page, and use put_page() instead of __free_pages(). 5081 * 5082 * Context: May be called in interrupt context or while holding a normal 5083 * spinlock, but not in NMI context or while holding a raw spinlock. 5084 */ 5085 void __free_pages(struct page *page, unsigned int order) 5086 { 5087 if (put_page_testzero(page)) 5088 free_the_page(page, order); 5089 else if (!PageHead(page)) 5090 while (order-- > 0) 5091 free_the_page(page + (1 << order), order); 5092 } 5093 EXPORT_SYMBOL(__free_pages); 5094 5095 void free_pages(unsigned long addr, unsigned int order) 5096 { 5097 if (addr != 0) { 5098 VM_BUG_ON(!virt_addr_valid((void *)addr)); 5099 __free_pages(virt_to_page((void *)addr), order); 5100 } 5101 } 5102 5103 EXPORT_SYMBOL(free_pages); 5104 5105 /* 5106 * Page Fragment: 5107 * An arbitrary-length arbitrary-offset area of memory which resides 5108 * within a 0 or higher order page. 
Multiple fragments within that page 5109 * are individually refcounted, in the page's reference counter. 5110 * 5111 * The page_frag functions below provide a simple allocation framework for 5112 * page fragments. This is used by the network stack and network device 5113 * drivers to provide a backing region of memory for use as either an 5114 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. 5115 */ 5116 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, 5117 gfp_t gfp_mask) 5118 { 5119 struct page *page = NULL; 5120 gfp_t gfp = gfp_mask; 5121 5122 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 5123 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 5124 __GFP_NOMEMALLOC; 5125 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 5126 PAGE_FRAG_CACHE_MAX_ORDER); 5127 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; 5128 #endif 5129 if (unlikely(!page)) 5130 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 5131 5132 nc->va = page ? page_address(page) : NULL; 5133 5134 return page; 5135 } 5136 5137 void __page_frag_cache_drain(struct page *page, unsigned int count) 5138 { 5139 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 5140 5141 if (page_ref_sub_and_test(page, count)) 5142 free_the_page(page, compound_order(page)); 5143 } 5144 EXPORT_SYMBOL(__page_frag_cache_drain); 5145 5146 void *page_frag_alloc_align(struct page_frag_cache *nc, 5147 unsigned int fragsz, gfp_t gfp_mask, 5148 unsigned int align_mask) 5149 { 5150 unsigned int size = PAGE_SIZE; 5151 struct page *page; 5152 int offset; 5153 5154 if (unlikely(!nc->va)) { 5155 refill: 5156 page = __page_frag_cache_refill(nc, gfp_mask); 5157 if (!page) 5158 return NULL; 5159 5160 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 5161 /* if size can vary use size else just use PAGE_SIZE */ 5162 size = nc->size; 5163 #endif 5164 /* Even if we own the page, we do not use atomic_set(). 5165 * This would break get_page_unless_zero() users. 5166 */ 5167 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); 5168 5169 /* reset page count bias and offset to start of new frag */ 5170 nc->pfmemalloc = page_is_pfmemalloc(page); 5171 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; 5172 nc->offset = size; 5173 } 5174 5175 offset = nc->offset - fragsz; 5176 if (unlikely(offset < 0)) { 5177 page = virt_to_page(nc->va); 5178 5179 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) 5180 goto refill; 5181 5182 if (unlikely(nc->pfmemalloc)) { 5183 free_the_page(page, compound_order(page)); 5184 goto refill; 5185 } 5186 5187 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 5188 /* if size can vary use size else just use PAGE_SIZE */ 5189 size = nc->size; 5190 #endif 5191 /* OK, page count is 0, we can safely set it */ 5192 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); 5193 5194 /* reset page count bias and offset to start of new frag */ 5195 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; 5196 offset = size - fragsz; 5197 } 5198 5199 nc->pagecnt_bias--; 5200 offset &= align_mask; 5201 nc->offset = offset; 5202 5203 return nc->va + offset; 5204 } 5205 EXPORT_SYMBOL(page_frag_alloc_align); 5206 5207 /* 5208 * Frees a page fragment allocated out of either a compound or order 0 page. 
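 *
 * Typical usage pattern (an illustrative sketch, not taken from an
 * in-tree user; the cache is assumed to be per-CPU or otherwise
 * serialised by the caller, and 256/GFP_ATOMIC are arbitrary choices):
 *
 *	struct page_frag_cache nc = {};
 *	void *buf;
 *
 *	buf = page_frag_alloc_align(&nc, 256, GFP_ATOMIC, ~0u);
 *	if (buf) {
 *		... fill the 256 byte fragment ...
 *		page_frag_free(buf);
 *	}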
5209 */ 5210 void page_frag_free(void *addr) 5211 { 5212 struct page *page = virt_to_head_page(addr); 5213 5214 if (unlikely(put_page_testzero(page))) 5215 free_the_page(page, compound_order(page)); 5216 } 5217 EXPORT_SYMBOL(page_frag_free); 5218 5219 static void *make_alloc_exact(unsigned long addr, unsigned int order, 5220 size_t size) 5221 { 5222 if (addr) { 5223 unsigned long alloc_end = addr + (PAGE_SIZE << order); 5224 unsigned long used = addr + PAGE_ALIGN(size); 5225 5226 split_page(virt_to_page((void *)addr), order); 5227 while (used < alloc_end) { 5228 free_page(used); 5229 used += PAGE_SIZE; 5230 } 5231 } 5232 return (void *)addr; 5233 } 5234 5235 /** 5236 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 5237 * @size: the number of bytes to allocate 5238 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 5239 * 5240 * This function is similar to alloc_pages(), except that it allocates the 5241 * minimum number of pages to satisfy the request. alloc_pages() can only 5242 * allocate memory in power-of-two pages. 5243 * 5244 * This function is also limited by MAX_ORDER. 5245 * 5246 * Memory allocated by this function must be released by free_pages_exact(). 5247 * 5248 * Return: pointer to the allocated area or %NULL in case of error. 5249 */ 5250 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 5251 { 5252 unsigned int order = get_order(size); 5253 unsigned long addr; 5254 5255 if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) 5256 gfp_mask &= ~__GFP_COMP; 5257 5258 addr = __get_free_pages(gfp_mask, order); 5259 return make_alloc_exact(addr, order, size); 5260 } 5261 EXPORT_SYMBOL(alloc_pages_exact); 5262 5263 /** 5264 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 5265 * pages on a node. 5266 * @nid: the preferred node ID where memory should be allocated 5267 * @size: the number of bytes to allocate 5268 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 5269 * 5270 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 5271 * back. 5272 * 5273 * Return: pointer to the allocated area or %NULL in case of error. 5274 */ 5275 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 5276 { 5277 unsigned int order = get_order(size); 5278 struct page *p; 5279 5280 if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) 5281 gfp_mask &= ~__GFP_COMP; 5282 5283 p = alloc_pages_node(nid, gfp_mask, order); 5284 if (!p) 5285 return NULL; 5286 return make_alloc_exact((unsigned long)page_address(p), order, size); 5287 } 5288 5289 /** 5290 * free_pages_exact - release memory allocated via alloc_pages_exact() 5291 * @virt: the value returned by alloc_pages_exact. 5292 * @size: size of allocation, same value as passed to alloc_pages_exact(). 5293 * 5294 * Release the memory allocated by a previous call to alloc_pages_exact. 5295 */ 5296 void free_pages_exact(void *virt, size_t size) 5297 { 5298 unsigned long addr = (unsigned long)virt; 5299 unsigned long end = addr + PAGE_ALIGN(size); 5300 5301 while (addr < end) { 5302 free_page(addr); 5303 addr += PAGE_SIZE; 5304 } 5305 } 5306 EXPORT_SYMBOL(free_pages_exact); 5307 5308 /** 5309 * nr_free_zone_pages - count number of pages beyond high watermark 5310 * @offset: The zone index of the highest zone 5311 * 5312 * nr_free_zone_pages() counts the number of pages which are beyond the 5313 * high watermark within all zones at or below a given zone index. 
For each 5314 * zone, the number of pages is calculated as: 5315 * 5316 * nr_free_zone_pages = managed_pages - high_pages 5317 * 5318 * Return: number of pages beyond high watermark. 5319 */ 5320 static unsigned long nr_free_zone_pages(int offset) 5321 { 5322 struct zoneref *z; 5323 struct zone *zone; 5324 5325 /* Just pick one node, since fallback list is circular */ 5326 unsigned long sum = 0; 5327 5328 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 5329 5330 for_each_zone_zonelist(zone, z, zonelist, offset) { 5331 unsigned long size = zone_managed_pages(zone); 5332 unsigned long high = high_wmark_pages(zone); 5333 if (size > high) 5334 sum += size - high; 5335 } 5336 5337 return sum; 5338 } 5339 5340 /** 5341 * nr_free_buffer_pages - count number of pages beyond high watermark 5342 * 5343 * nr_free_buffer_pages() counts the number of pages which are beyond the high 5344 * watermark within ZONE_DMA and ZONE_NORMAL. 5345 * 5346 * Return: number of pages beyond high watermark within ZONE_DMA and 5347 * ZONE_NORMAL. 5348 */ 5349 unsigned long nr_free_buffer_pages(void) 5350 { 5351 return nr_free_zone_pages(gfp_zone(GFP_USER)); 5352 } 5353 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 5354 5355 static inline void show_node(struct zone *zone) 5356 { 5357 if (IS_ENABLED(CONFIG_NUMA)) 5358 printk("Node %d ", zone_to_nid(zone)); 5359 } 5360 5361 long si_mem_available(void) 5362 { 5363 long available; 5364 unsigned long pagecache; 5365 unsigned long wmark_low = 0; 5366 unsigned long pages[NR_LRU_LISTS]; 5367 unsigned long reclaimable; 5368 struct zone *zone; 5369 int lru; 5370 5371 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 5372 pages[lru] = global_node_page_state(NR_LRU_BASE + lru); 5373 5374 for_each_zone(zone) 5375 wmark_low += low_wmark_pages(zone); 5376 5377 /* 5378 * Estimate the amount of memory available for userspace allocations, 5379 * without causing swapping. 5380 */ 5381 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; 5382 5383 /* 5384 * Not all the page cache can be freed, otherwise the system will 5385 * start swapping. Assume at least half of the page cache, or the 5386 * low watermark worth of cache, needs to stay. 5387 */ 5388 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; 5389 pagecache -= min(pagecache / 2, wmark_low); 5390 available += pagecache; 5391 5392 /* 5393 * Part of the reclaimable slab and other kernel memory consists of 5394 * items that are in use, and cannot be freed. Cap this estimate at the 5395 * low watermark. 
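 *
 * Worked example with made-up numbers (in pages): free = 1000,
 * totalreserve = 100, file LRU = 600, reclaimable slab + misc = 200,
 * wmark_low = 150.  The estimate starts at 1000 - 100 = 900, the page
 * cache adds 600 - min(300, 150) = 450, and the step below adds
 * 200 - min(100, 150) = 100, for roughly 1450 available pages.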
5396 */ 5397 reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + 5398 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); 5399 available += reclaimable - min(reclaimable / 2, wmark_low); 5400 5401 if (available < 0) 5402 available = 0; 5403 return available; 5404 } 5405 EXPORT_SYMBOL_GPL(si_mem_available); 5406 5407 void si_meminfo(struct sysinfo *val) 5408 { 5409 val->totalram = totalram_pages(); 5410 val->sharedram = global_node_page_state(NR_SHMEM); 5411 val->freeram = global_zone_page_state(NR_FREE_PAGES); 5412 val->bufferram = nr_blockdev_pages(); 5413 val->totalhigh = totalhigh_pages(); 5414 val->freehigh = nr_free_highpages(); 5415 val->mem_unit = PAGE_SIZE; 5416 } 5417 5418 EXPORT_SYMBOL(si_meminfo); 5419 5420 #ifdef CONFIG_NUMA 5421 void si_meminfo_node(struct sysinfo *val, int nid) 5422 { 5423 int zone_type; /* needs to be signed */ 5424 unsigned long managed_pages = 0; 5425 unsigned long managed_highpages = 0; 5426 unsigned long free_highpages = 0; 5427 pg_data_t *pgdat = NODE_DATA(nid); 5428 5429 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 5430 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); 5431 val->totalram = managed_pages; 5432 val->sharedram = node_page_state(pgdat, NR_SHMEM); 5433 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); 5434 #ifdef CONFIG_HIGHMEM 5435 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 5436 struct zone *zone = &pgdat->node_zones[zone_type]; 5437 5438 if (is_highmem(zone)) { 5439 managed_highpages += zone_managed_pages(zone); 5440 free_highpages += zone_page_state(zone, NR_FREE_PAGES); 5441 } 5442 } 5443 val->totalhigh = managed_highpages; 5444 val->freehigh = free_highpages; 5445 #else 5446 val->totalhigh = managed_highpages; 5447 val->freehigh = free_highpages; 5448 #endif 5449 val->mem_unit = PAGE_SIZE; 5450 } 5451 #endif 5452 5453 /* 5454 * Determine whether the node should be displayed or not, depending on whether 5455 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 5456 */ 5457 static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) 5458 { 5459 if (!(flags & SHOW_MEM_FILTER_NODES)) 5460 return false; 5461 5462 /* 5463 * no node mask - aka implicit memory numa policy. Do not bother with 5464 * the synchronization - read_mems_allowed_begin - because we do not 5465 * have to be precise here. 5466 */ 5467 if (!nodemask) 5468 nodemask = &cpuset_current_mems_allowed; 5469 5470 return !node_isset(nid, *nodemask); 5471 } 5472 5473 #define K(x) ((x) << (PAGE_SHIFT-10)) 5474 5475 static void show_migration_types(unsigned char type) 5476 { 5477 static const char types[MIGRATE_TYPES] = { 5478 [MIGRATE_UNMOVABLE] = 'U', 5479 [MIGRATE_MOVABLE] = 'M', 5480 [MIGRATE_RECLAIMABLE] = 'E', 5481 [MIGRATE_HIGHATOMIC] = 'H', 5482 #ifdef CONFIG_CMA 5483 [MIGRATE_CMA] = 'C', 5484 #endif 5485 #ifdef CONFIG_MEMORY_ISOLATION 5486 [MIGRATE_ISOLATE] = 'I', 5487 #endif 5488 }; 5489 char tmp[MIGRATE_TYPES + 1]; 5490 char *p = tmp; 5491 int i; 5492 5493 for (i = 0; i < MIGRATE_TYPES; i++) { 5494 if (type & (1 << i)) 5495 *p++ = types[i]; 5496 } 5497 5498 *p = '\0'; 5499 printk(KERN_CONT "(%s) ", tmp); 5500 } 5501 5502 /* 5503 * Show free area list (used inside shift_scroll-lock stuff) 5504 * We also calculate the percentage fragmentation. We do this by counting the 5505 * memory on each free list with the exception of the first item on the list. 5506 * 5507 * Bits in @filter: 5508 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's 5509 * cpuset. 
5510 */ 5511 void show_free_areas(unsigned int filter, nodemask_t *nodemask) 5512 { 5513 unsigned long free_pcp = 0; 5514 int cpu; 5515 struct zone *zone; 5516 pg_data_t *pgdat; 5517 5518 for_each_populated_zone(zone) { 5519 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 5520 continue; 5521 5522 for_each_online_cpu(cpu) 5523 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 5524 } 5525 5526 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 5527 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 5528 " unevictable:%lu dirty:%lu writeback:%lu\n" 5529 " slab_reclaimable:%lu slab_unreclaimable:%lu\n" 5530 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 5531 " free:%lu free_pcp:%lu free_cma:%lu\n", 5532 global_node_page_state(NR_ACTIVE_ANON), 5533 global_node_page_state(NR_INACTIVE_ANON), 5534 global_node_page_state(NR_ISOLATED_ANON), 5535 global_node_page_state(NR_ACTIVE_FILE), 5536 global_node_page_state(NR_INACTIVE_FILE), 5537 global_node_page_state(NR_ISOLATED_FILE), 5538 global_node_page_state(NR_UNEVICTABLE), 5539 global_node_page_state(NR_FILE_DIRTY), 5540 global_node_page_state(NR_WRITEBACK), 5541 global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), 5542 global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), 5543 global_node_page_state(NR_FILE_MAPPED), 5544 global_node_page_state(NR_SHMEM), 5545 global_node_page_state(NR_PAGETABLE), 5546 global_zone_page_state(NR_BOUNCE), 5547 global_zone_page_state(NR_FREE_PAGES), 5548 free_pcp, 5549 global_zone_page_state(NR_FREE_CMA_PAGES)); 5550 5551 for_each_online_pgdat(pgdat) { 5552 if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) 5553 continue; 5554 5555 printk("Node %d" 5556 " active_anon:%lukB" 5557 " inactive_anon:%lukB" 5558 " active_file:%lukB" 5559 " inactive_file:%lukB" 5560 " unevictable:%lukB" 5561 " isolated(anon):%lukB" 5562 " isolated(file):%lukB" 5563 " mapped:%lukB" 5564 " dirty:%lukB" 5565 " writeback:%lukB" 5566 " shmem:%lukB" 5567 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5568 " shmem_thp: %lukB" 5569 " shmem_pmdmapped: %lukB" 5570 " anon_thp: %lukB" 5571 #endif 5572 " writeback_tmp:%lukB" 5573 " kernel_stack:%lukB" 5574 #ifdef CONFIG_SHADOW_CALL_STACK 5575 " shadow_call_stack:%lukB" 5576 #endif 5577 " pagetables:%lukB" 5578 " all_unreclaimable? %s" 5579 "\n", 5580 pgdat->node_id, 5581 K(node_page_state(pgdat, NR_ACTIVE_ANON)), 5582 K(node_page_state(pgdat, NR_INACTIVE_ANON)), 5583 K(node_page_state(pgdat, NR_ACTIVE_FILE)), 5584 K(node_page_state(pgdat, NR_INACTIVE_FILE)), 5585 K(node_page_state(pgdat, NR_UNEVICTABLE)), 5586 K(node_page_state(pgdat, NR_ISOLATED_ANON)), 5587 K(node_page_state(pgdat, NR_ISOLATED_FILE)), 5588 K(node_page_state(pgdat, NR_FILE_MAPPED)), 5589 K(node_page_state(pgdat, NR_FILE_DIRTY)), 5590 K(node_page_state(pgdat, NR_WRITEBACK)), 5591 K(node_page_state(pgdat, NR_SHMEM)), 5592 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5593 K(node_page_state(pgdat, NR_SHMEM_THPS)), 5594 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), 5595 K(node_page_state(pgdat, NR_ANON_THPS)), 5596 #endif 5597 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), 5598 node_page_state(pgdat, NR_KERNEL_STACK_KB), 5599 #ifdef CONFIG_SHADOW_CALL_STACK 5600 node_page_state(pgdat, NR_KERNEL_SCS_KB), 5601 #endif 5602 K(node_page_state(pgdat, NR_PAGETABLE)), 5603 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 
5604 "yes" : "no"); 5605 } 5606 5607 for_each_populated_zone(zone) { 5608 int i; 5609 5610 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 5611 continue; 5612 5613 free_pcp = 0; 5614 for_each_online_cpu(cpu) 5615 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 5616 5617 show_node(zone); 5618 printk(KERN_CONT 5619 "%s" 5620 " free:%lukB" 5621 " min:%lukB" 5622 " low:%lukB" 5623 " high:%lukB" 5624 " reserved_highatomic:%luKB" 5625 " active_anon:%lukB" 5626 " inactive_anon:%lukB" 5627 " active_file:%lukB" 5628 " inactive_file:%lukB" 5629 " unevictable:%lukB" 5630 " writepending:%lukB" 5631 " present:%lukB" 5632 " managed:%lukB" 5633 " mlocked:%lukB" 5634 " bounce:%lukB" 5635 " free_pcp:%lukB" 5636 " local_pcp:%ukB" 5637 " free_cma:%lukB" 5638 "\n", 5639 zone->name, 5640 K(zone_page_state(zone, NR_FREE_PAGES)), 5641 K(min_wmark_pages(zone)), 5642 K(low_wmark_pages(zone)), 5643 K(high_wmark_pages(zone)), 5644 K(zone->nr_reserved_highatomic), 5645 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), 5646 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), 5647 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), 5648 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), 5649 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), 5650 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), 5651 K(zone->present_pages), 5652 K(zone_managed_pages(zone)), 5653 K(zone_page_state(zone, NR_MLOCK)), 5654 K(zone_page_state(zone, NR_BOUNCE)), 5655 K(free_pcp), 5656 K(this_cpu_read(zone->pageset->pcp.count)), 5657 K(zone_page_state(zone, NR_FREE_CMA_PAGES))); 5658 printk("lowmem_reserve[]:"); 5659 for (i = 0; i < MAX_NR_ZONES; i++) 5660 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); 5661 printk(KERN_CONT "\n"); 5662 } 5663 5664 for_each_populated_zone(zone) { 5665 unsigned int order; 5666 unsigned long nr[MAX_ORDER], flags, total = 0; 5667 unsigned char types[MAX_ORDER]; 5668 5669 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 5670 continue; 5671 show_node(zone); 5672 printk(KERN_CONT "%s: ", zone->name); 5673 5674 spin_lock_irqsave(&zone->lock, flags); 5675 for (order = 0; order < MAX_ORDER; order++) { 5676 struct free_area *area = &zone->free_area[order]; 5677 int type; 5678 5679 nr[order] = area->nr_free; 5680 total += nr[order] << order; 5681 5682 types[order] = 0; 5683 for (type = 0; type < MIGRATE_TYPES; type++) { 5684 if (!free_area_empty(area, type)) 5685 types[order] |= 1 << type; 5686 } 5687 } 5688 spin_unlock_irqrestore(&zone->lock, flags); 5689 for (order = 0; order < MAX_ORDER; order++) { 5690 printk(KERN_CONT "%lu*%lukB ", 5691 nr[order], K(1UL) << order); 5692 if (nr[order]) 5693 show_migration_types(types[order]); 5694 } 5695 printk(KERN_CONT "= %lukB\n", K(total)); 5696 } 5697 5698 hugetlb_show_meminfo(); 5699 5700 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); 5701 5702 show_swap_cache_info(); 5703 } 5704 5705 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 5706 { 5707 zoneref->zone = zone; 5708 zoneref->zone_idx = zone_idx(zone); 5709 } 5710 5711 /* 5712 * Builds allocation fallback zone lists. 5713 * 5714 * Add all populated zones of a node to the zonelist. 
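 *
 * Zones are added from the highest index downwards, so a node with (say)
 * Normal, DMA32 and DMA memory contributes its zonerefs in the order
 *
 *	Normal, DMA32, DMA
 *
 * i.e. allocations fall back from the most preferred zone to the least.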
5715 */ 5716 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) 5717 { 5718 struct zone *zone; 5719 enum zone_type zone_type = MAX_NR_ZONES; 5720 int nr_zones = 0; 5721 5722 do { 5723 zone_type--; 5724 zone = pgdat->node_zones + zone_type; 5725 if (managed_zone(zone)) { 5726 zoneref_set_zone(zone, &zonerefs[nr_zones++]); 5727 check_highest_zone(zone_type); 5728 } 5729 } while (zone_type); 5730 5731 return nr_zones; 5732 } 5733 5734 #ifdef CONFIG_NUMA 5735 5736 static int __parse_numa_zonelist_order(char *s) 5737 { 5738 /* 5739 * We used to support different zonlists modes but they turned 5740 * out to be just not useful. Let's keep the warning in place 5741 * if somebody still use the cmd line parameter so that we do 5742 * not fail it silently 5743 */ 5744 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { 5745 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); 5746 return -EINVAL; 5747 } 5748 return 0; 5749 } 5750 5751 char numa_zonelist_order[] = "Node"; 5752 5753 /* 5754 * sysctl handler for numa_zonelist_order 5755 */ 5756 int numa_zonelist_order_handler(struct ctl_table *table, int write, 5757 void *buffer, size_t *length, loff_t *ppos) 5758 { 5759 if (write) 5760 return __parse_numa_zonelist_order(buffer); 5761 return proc_dostring(table, write, buffer, length, ppos); 5762 } 5763 5764 5765 #define MAX_NODE_LOAD (nr_online_nodes) 5766 static int node_load[MAX_NUMNODES]; 5767 5768 /** 5769 * find_next_best_node - find the next node that should appear in a given node's fallback list 5770 * @node: node whose fallback list we're appending 5771 * @used_node_mask: nodemask_t of already used nodes 5772 * 5773 * We use a number of factors to determine which is the next node that should 5774 * appear on a given node's fallback list. The node should not have appeared 5775 * already in @node's fallback list, and it should be the next closest node 5776 * according to the distance array (which contains arbitrary distance values 5777 * from each node to each node in the system), and should also prefer nodes 5778 * with no CPUs, since presumably they'll have very little allocation pressure 5779 * on them otherwise. 5780 * 5781 * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 5782 */ 5783 static int find_next_best_node(int node, nodemask_t *used_node_mask) 5784 { 5785 int n, val; 5786 int min_val = INT_MAX; 5787 int best_node = NUMA_NO_NODE; 5788 5789 /* Use the local node if we haven't already */ 5790 if (!node_isset(node, *used_node_mask)) { 5791 node_set(node, *used_node_mask); 5792 return node; 5793 } 5794 5795 for_each_node_state(n, N_MEMORY) { 5796 5797 /* Don't want a node to appear more than once */ 5798 if (node_isset(n, *used_node_mask)) 5799 continue; 5800 5801 /* Use the distance array to find the distance */ 5802 val = node_distance(node, n); 5803 5804 /* Penalize nodes under us ("prefer the next node") */ 5805 val += (n < node); 5806 5807 /* Give preference to headless and unused nodes */ 5808 if (!cpumask_empty(cpumask_of_node(n))) 5809 val += PENALTY_FOR_NODE_WITH_CPUS; 5810 5811 /* Slight preference for less loaded node */ 5812 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 5813 val += node_load[n]; 5814 5815 if (val < min_val) { 5816 min_val = val; 5817 best_node = n; 5818 } 5819 } 5820 5821 if (best_node >= 0) 5822 node_set(best_node, *used_node_mask); 5823 5824 return best_node; 5825 } 5826 5827 5828 /* 5829 * Build zonelists ordered by node and zones within node. 
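 * For example, if find_next_best_node() produced the order {0, 2, 1} for
 * node 0, the fallback list holds all of node 0's zones first (highest to
 * lowest as above), then node 2's, then node 1's.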
5830 * This results in maximum locality--normal zone overflows into local 5831 * DMA zone, if any--but risks exhausting DMA zone. 5832 */ 5833 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, 5834 unsigned nr_nodes) 5835 { 5836 struct zoneref *zonerefs; 5837 int i; 5838 5839 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 5840 5841 for (i = 0; i < nr_nodes; i++) { 5842 int nr_zones; 5843 5844 pg_data_t *node = NODE_DATA(node_order[i]); 5845 5846 nr_zones = build_zonerefs_node(node, zonerefs); 5847 zonerefs += nr_zones; 5848 } 5849 zonerefs->zone = NULL; 5850 zonerefs->zone_idx = 0; 5851 } 5852 5853 /* 5854 * Build gfp_thisnode zonelists 5855 */ 5856 static void build_thisnode_zonelists(pg_data_t *pgdat) 5857 { 5858 struct zoneref *zonerefs; 5859 int nr_zones; 5860 5861 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; 5862 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5863 zonerefs += nr_zones; 5864 zonerefs->zone = NULL; 5865 zonerefs->zone_idx = 0; 5866 } 5867 5868 /* 5869 * Build zonelists ordered by zone and nodes within zones. 5870 * This results in conserving DMA zone[s] until all Normal memory is 5871 * exhausted, but results in overflowing to remote node while memory 5872 * may still exist in local DMA zone. 5873 */ 5874 5875 static void build_zonelists(pg_data_t *pgdat) 5876 { 5877 static int node_order[MAX_NUMNODES]; 5878 int node, load, nr_nodes = 0; 5879 nodemask_t used_mask = NODE_MASK_NONE; 5880 int local_node, prev_node; 5881 5882 /* NUMA-aware ordering of nodes */ 5883 local_node = pgdat->node_id; 5884 load = nr_online_nodes; 5885 prev_node = local_node; 5886 5887 memset(node_order, 0, sizeof(node_order)); 5888 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 5889 /* 5890 * We don't want to pressure a particular node. 5891 * So adding penalty to the first node in same 5892 * distance group to make it round-robin. 5893 */ 5894 if (node_distance(local_node, node) != 5895 node_distance(local_node, prev_node)) 5896 node_load[node] = load; 5897 5898 node_order[nr_nodes++] = node; 5899 prev_node = node; 5900 load--; 5901 } 5902 5903 build_zonelists_in_node_order(pgdat, node_order, nr_nodes); 5904 build_thisnode_zonelists(pgdat); 5905 } 5906 5907 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5908 /* 5909 * Return node id of node used for "local" allocations. 5910 * I.e., first node id of first zone in arg node's generic zonelist. 5911 * Used for initializing percpu 'numa_mem', which is used primarily 5912 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 5913 */ 5914 int local_memory_node(int node) 5915 { 5916 struct zoneref *z; 5917 5918 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 5919 gfp_zone(GFP_KERNEL), 5920 NULL); 5921 return zone_to_nid(z->zone); 5922 } 5923 #endif 5924 5925 static void setup_min_unmapped_ratio(void); 5926 static void setup_min_slab_ratio(void); 5927 #else /* CONFIG_NUMA */ 5928 5929 static void build_zonelists(pg_data_t *pgdat) 5930 { 5931 int node, local_node; 5932 struct zoneref *zonerefs; 5933 int nr_zones; 5934 5935 local_node = pgdat->node_id; 5936 5937 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 5938 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5939 zonerefs += nr_zones; 5940 5941 /* 5942 * Now we build the zonelist so that it contains the zones 5943 * of all the other nodes. 
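 * For example, with four online nodes the zonelist built here for node 2
 * visits the nodes in the order 2, 3, 0, 1.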
5944 * We don't want to pressure a particular node, so when 5945 * building the zones for node N, we make sure that the 5946 * zones coming right after the local ones are those from 5947 * node N+1 (modulo N) 5948 */ 5949 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 5950 if (!node_online(node)) 5951 continue; 5952 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 5953 zonerefs += nr_zones; 5954 } 5955 for (node = 0; node < local_node; node++) { 5956 if (!node_online(node)) 5957 continue; 5958 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 5959 zonerefs += nr_zones; 5960 } 5961 5962 zonerefs->zone = NULL; 5963 zonerefs->zone_idx = 0; 5964 } 5965 5966 #endif /* CONFIG_NUMA */ 5967 5968 /* 5969 * Boot pageset table. One per cpu which is going to be used for all 5970 * zones and all nodes. The parameters will be set in such a way 5971 * that an item put on a list will immediately be handed over to 5972 * the buddy list. This is safe since pageset manipulation is done 5973 * with interrupts disabled. 5974 * 5975 * The boot_pagesets must be kept even after bootup is complete for 5976 * unused processors and/or zones. They do play a role for bootstrapping 5977 * hotplugged processors. 5978 * 5979 * zoneinfo_show() and maybe other functions do 5980 * not check if the processor is online before following the pageset pointer. 5981 * Other parts of the kernel may not check if the zone is available. 5982 */ 5983 static void pageset_init(struct per_cpu_pageset *p); 5984 /* These effectively disable the pcplists in the boot pageset completely */ 5985 #define BOOT_PAGESET_HIGH 0 5986 #define BOOT_PAGESET_BATCH 1 5987 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 5988 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); 5989 5990 static void __build_all_zonelists(void *data) 5991 { 5992 int nid; 5993 int __maybe_unused cpu; 5994 pg_data_t *self = data; 5995 static DEFINE_SPINLOCK(lock); 5996 5997 spin_lock(&lock); 5998 5999 #ifdef CONFIG_NUMA 6000 memset(node_load, 0, sizeof(node_load)); 6001 #endif 6002 6003 /* 6004 * This node is hotadded and no memory is yet present. So just 6005 * building zonelists is fine - no need to touch other nodes. 6006 */ 6007 if (self && !node_online(self->node_id)) { 6008 build_zonelists(self); 6009 } else { 6010 for_each_online_node(nid) { 6011 pg_data_t *pgdat = NODE_DATA(nid); 6012 6013 build_zonelists(pgdat); 6014 } 6015 6016 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 6017 /* 6018 * We now know the "local memory node" for each node-- 6019 * i.e., the node of the first zone in the generic zonelist. 6020 * Set up numa_mem percpu variable for on-line cpus. During 6021 * boot, only the boot cpu should be on-line; we'll init the 6022 * secondary cpus' numa_mem as they come on-line. During 6023 * node/memory hotplug, we'll fixup all on-line cpus. 6024 */ 6025 for_each_online_cpu(cpu) 6026 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 6027 #endif 6028 } 6029 6030 spin_unlock(&lock); 6031 } 6032 6033 static noinline void __init 6034 build_all_zonelists_init(void) 6035 { 6036 int cpu; 6037 6038 __build_all_zonelists(NULL); 6039 6040 /* 6041 * Initialize the boot_pagesets that are going to be used 6042 * for bootstrapping processors. The real pagesets for 6043 * each zone will be allocated later when the per cpu 6044 * allocator is available. 
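 * With BOOT_PAGESET_HIGH == 0 and BOOT_PAGESET_BATCH == 1, freeing a page
 * to a boot pageset immediately trips the ->high limit and the page is
 * flushed straight to the buddy lists, so the pcplists stay effectively
 * disabled until the real pagesets exist.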
6045 * 6046 * boot_pagesets are used also for bootstrapping offline 6047 * cpus if the system is already booted because the pagesets 6048 * are needed to initialize allocators on a specific cpu too. 6049 * F.e. the percpu allocator needs the page allocator which 6050 * needs the percpu allocator in order to allocate its pagesets 6051 * (a chicken-egg dilemma). 6052 */ 6053 for_each_possible_cpu(cpu) 6054 pageset_init(&per_cpu(boot_pageset, cpu)); 6055 6056 mminit_verify_zonelist(); 6057 cpuset_init_current_mems_allowed(); 6058 } 6059 6060 /* 6061 * unless system_state == SYSTEM_BOOTING. 6062 * 6063 * __ref due to call of __init annotated helper build_all_zonelists_init 6064 * [protected by SYSTEM_BOOTING]. 6065 */ 6066 void __ref build_all_zonelists(pg_data_t *pgdat) 6067 { 6068 unsigned long vm_total_pages; 6069 6070 if (system_state == SYSTEM_BOOTING) { 6071 build_all_zonelists_init(); 6072 } else { 6073 __build_all_zonelists(pgdat); 6074 /* cpuset refresh routine should be here */ 6075 } 6076 /* Get the number of free pages beyond high watermark in all zones. */ 6077 vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 6078 /* 6079 * Disable grouping by mobility if the number of pages in the 6080 * system is too low to allow the mechanism to work. It would be 6081 * more accurate, but expensive to check per-zone. This check is 6082 * made on memory-hotadd so a system can start with mobility 6083 * disabled and enable it later 6084 */ 6085 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 6086 page_group_by_mobility_disabled = 1; 6087 else 6088 page_group_by_mobility_disabled = 0; 6089 6090 pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", 6091 nr_online_nodes, 6092 page_group_by_mobility_disabled ? "off" : "on", 6093 vm_total_pages); 6094 #ifdef CONFIG_NUMA 6095 pr_info("Policy zone: %s\n", zone_names[policy_zone]); 6096 #endif 6097 } 6098 6099 /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ 6100 static bool __meminit 6101 overlap_memmap_init(unsigned long zone, unsigned long *pfn) 6102 { 6103 static struct memblock_region *r; 6104 6105 if (mirrored_kernelcore && zone == ZONE_MOVABLE) { 6106 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { 6107 for_each_mem_region(r) { 6108 if (*pfn < memblock_region_memory_end_pfn(r)) 6109 break; 6110 } 6111 } 6112 if (*pfn >= memblock_region_memory_base_pfn(r) && 6113 memblock_is_mirror(r)) { 6114 *pfn = memblock_region_memory_end_pfn(r); 6115 return true; 6116 } 6117 } 6118 return false; 6119 } 6120 6121 /* 6122 * Initially all pages are reserved - free ones are freed 6123 * up by memblock_free_all() once the early boot process is 6124 * done. Non-atomic initialization, single-pass. 6125 * 6126 * All aligned pageblocks are initialized to the specified migratetype 6127 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related 6128 * zone stats (e.g., nr_isolate_pageblock) are touched. 6129 */ 6130 void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, 6131 unsigned long start_pfn, unsigned long zone_end_pfn, 6132 enum meminit_context context, 6133 struct vmem_altmap *altmap, int migratetype) 6134 { 6135 unsigned long pfn, end_pfn = start_pfn + size; 6136 struct page *page; 6137 6138 if (highest_memmap_pfn < end_pfn - 1) 6139 highest_memmap_pfn = end_pfn - 1; 6140 6141 #ifdef CONFIG_ZONE_DEVICE 6142 /* 6143 * Honor reservation requested by the driver for this ZONE_DEVICE 6144 * memory. 
We limit the total number of pages to initialize to just 6145 * those that might contain the memory mapping. We will defer the 6146 * ZONE_DEVICE page initialization until after we have released 6147 * the hotplug lock. 6148 */ 6149 if (zone == ZONE_DEVICE) { 6150 if (!altmap) 6151 return; 6152 6153 if (start_pfn == altmap->base_pfn) 6154 start_pfn += altmap->reserve; 6155 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); 6156 } 6157 #endif 6158 6159 for (pfn = start_pfn; pfn < end_pfn; ) { 6160 /* 6161 * There can be holes in boot-time mem_map[]s handed to this 6162 * function. They do not exist on hotplugged memory. 6163 */ 6164 if (context == MEMINIT_EARLY) { 6165 if (overlap_memmap_init(zone, &pfn)) 6166 continue; 6167 if (defer_init(nid, pfn, zone_end_pfn)) 6168 break; 6169 } 6170 6171 page = pfn_to_page(pfn); 6172 __init_single_page(page, pfn, zone, nid); 6173 if (context == MEMINIT_HOTPLUG) 6174 __SetPageReserved(page); 6175 6176 /* 6177 * Usually, we want to mark the pageblock MIGRATE_MOVABLE, 6178 * such that unmovable allocations won't be scattered all 6179 * over the place during system boot. 6180 */ 6181 if (IS_ALIGNED(pfn, pageblock_nr_pages)) { 6182 set_pageblock_migratetype(page, migratetype); 6183 cond_resched(); 6184 } 6185 pfn++; 6186 } 6187 } 6188 6189 #ifdef CONFIG_ZONE_DEVICE 6190 void __ref memmap_init_zone_device(struct zone *zone, 6191 unsigned long start_pfn, 6192 unsigned long nr_pages, 6193 struct dev_pagemap *pgmap) 6194 { 6195 unsigned long pfn, end_pfn = start_pfn + nr_pages; 6196 struct pglist_data *pgdat = zone->zone_pgdat; 6197 struct vmem_altmap *altmap = pgmap_altmap(pgmap); 6198 unsigned long zone_idx = zone_idx(zone); 6199 unsigned long start = jiffies; 6200 int nid = pgdat->node_id; 6201 6202 if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) 6203 return; 6204 6205 /* 6206 * The call to memmap_init_zone should have already taken care 6207 * of the pages reserved for the memmap, so we can just jump to 6208 * the end of that region and start processing the device pages. 6209 */ 6210 if (altmap) { 6211 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); 6212 nr_pages = end_pfn - start_pfn; 6213 } 6214 6215 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 6216 struct page *page = pfn_to_page(pfn); 6217 6218 __init_single_page(page, pfn, zone_idx, nid); 6219 6220 /* 6221 * Mark page reserved as it will need to wait for onlining 6222 * phase for it to be fully associated with a zone. 6223 * 6224 * We can use the non-atomic __set_bit operation for setting 6225 * the flag as we are still initializing the pages. 6226 */ 6227 __SetPageReserved(page); 6228 6229 /* 6230 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer 6231 * and zone_device_data. It is a bug if a ZONE_DEVICE page is 6232 * ever freed or placed on a driver-private list. 6233 */ 6234 page->pgmap = pgmap; 6235 page->zone_device_data = NULL; 6236 6237 /* 6238 * Mark the block movable so that blocks are reserved for 6239 * movable at startup. This will force kernel allocations 6240 * to reserve their blocks rather than leaking throughout 6241 * the address space during boot when many long-lived 6242 * kernel allocations are made. 
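 * As a rough illustration: with 4 KiB pages and a pageblock_order of 9
 * (a common configuration), the IS_ALIGNED() check below fires once per
 * 512 pfns, so the migratetype bits are written once per 2 MiB pageblock
 * rather than once per page.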
6243 * 6244 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap 6245 * because this is done early in section_activate() 6246 */ 6247 if (IS_ALIGNED(pfn, pageblock_nr_pages)) { 6248 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 6249 cond_resched(); 6250 } 6251 } 6252 6253 pr_info("%s initialised %lu pages in %ums\n", __func__, 6254 nr_pages, jiffies_to_msecs(jiffies - start)); 6255 } 6256 6257 #endif 6258 static void __meminit zone_init_free_lists(struct zone *zone) 6259 { 6260 unsigned int order, t; 6261 for_each_migratetype_order(order, t) { 6262 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 6263 zone->free_area[order].nr_free = 0; 6264 } 6265 } 6266 6267 #if !defined(CONFIG_FLAT_NODE_MEM_MAP) 6268 /* 6269 * Only struct pages that correspond to ranges defined by memblock.memory 6270 * are zeroed and initialized by going through __init_single_page() during 6271 * memmap_init_zone(). 6272 * 6273 * But, there could be struct pages that correspond to holes in 6274 * memblock.memory. This can happen because of the following reasons: 6275 * - physical memory bank size is not necessarily the exact multiple of the 6276 * arbitrary section size 6277 * - early reserved memory may not be listed in memblock.memory 6278 * - memory layouts defined with memmap= kernel parameter may not align 6279 * nicely with memmap sections 6280 * 6281 * Explicitly initialize those struct pages so that: 6282 * - PG_Reserved is set 6283 * - zone and node links point to zone and node that span the page if the 6284 * hole is in the middle of a zone 6285 * - zone and node links point to adjacent zone/node if the hole falls on 6286 * the zone boundary; the pages in such holes will be prepended to the 6287 * zone/node above the hole except for the trailing pages in the last 6288 * section that will be appended to the zone/node below. 6289 */ 6290 static u64 __meminit init_unavailable_range(unsigned long spfn, 6291 unsigned long epfn, 6292 int zone, int node) 6293 { 6294 unsigned long pfn; 6295 u64 pgcnt = 0; 6296 6297 for (pfn = spfn; pfn < epfn; pfn++) { 6298 if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { 6299 pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) 6300 + pageblock_nr_pages - 1; 6301 continue; 6302 } 6303 __init_single_page(pfn_to_page(pfn), pfn, zone, node); 6304 __SetPageReserved(pfn_to_page(pfn)); 6305 pgcnt++; 6306 } 6307 6308 return pgcnt; 6309 } 6310 #else 6311 static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn, 6312 int zone, int node) 6313 { 6314 return 0; 6315 } 6316 #endif 6317 6318 void __meminit __weak memmap_init_zone(struct zone *zone) 6319 { 6320 unsigned long zone_start_pfn = zone->zone_start_pfn; 6321 unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; 6322 int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); 6323 static unsigned long hole_pfn; 6324 unsigned long start_pfn, end_pfn; 6325 u64 pgcnt = 0; 6326 6327 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 6328 start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); 6329 end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); 6330 6331 if (end_pfn > start_pfn) 6332 memmap_init_range(end_pfn - start_pfn, nid, 6333 zone_id, start_pfn, zone_end_pfn, 6334 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); 6335 6336 if (hole_pfn < start_pfn) 6337 pgcnt += init_unavailable_range(hole_pfn, start_pfn, 6338 zone_id, nid); 6339 hole_pfn = end_pfn; 6340 } 6341 6342 #ifdef CONFIG_SPARSEMEM 6343 /* 6344 * Initialize the hole in the range [zone_end_pfn, section_end]. 
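 * For instance, with 128 MiB sections (32768 pages of 4 KiB each) a zone
 * ending at pfn 0x27c00 leaves pfns 0x27c00-0x27fff of its final section
 * to be initialized here.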
6345 * If zone boundary falls in the middle of a section, this hole 6346 * will be re-initialized during the call to this function for the 6347 * higher zone. 6348 */ 6349 end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION); 6350 if (hole_pfn < end_pfn) 6351 pgcnt += init_unavailable_range(hole_pfn, end_pfn, 6352 zone_id, nid); 6353 #endif 6354 6355 if (pgcnt) 6356 pr_info(" %s zone: %llu pages in unavailable ranges\n", 6357 zone->name, pgcnt); 6358 } 6359 6360 static int zone_batchsize(struct zone *zone) 6361 { 6362 #ifdef CONFIG_MMU 6363 int batch; 6364 6365 /* 6366 * The per-cpu-pages pools are set to around 1000th of the 6367 * size of the zone. 6368 */ 6369 batch = zone_managed_pages(zone) / 1024; 6370 /* But no more than a meg. */ 6371 if (batch * PAGE_SIZE > 1024 * 1024) 6372 batch = (1024 * 1024) / PAGE_SIZE; 6373 batch /= 4; /* We effectively *= 4 below */ 6374 if (batch < 1) 6375 batch = 1; 6376 6377 /* 6378 * Clamp the batch to a 2^n - 1 value. Having a power 6379 * of 2 value was found to be more likely to have 6380 * suboptimal cache aliasing properties in some cases. 6381 * 6382 * For example if 2 tasks are alternately allocating 6383 * batches of pages, one task can end up with a lot 6384 * of pages of one half of the possible page colors 6385 * and the other with pages of the other colors. 6386 */ 6387 batch = rounddown_pow_of_two(batch + batch/2) - 1; 6388 6389 return batch; 6390 6391 #else 6392 /* The deferral and batching of frees should be suppressed under NOMMU 6393 * conditions. 6394 * 6395 * The problem is that NOMMU needs to be able to allocate large chunks 6396 * of contiguous memory as there's no hardware page translation to 6397 * assemble apparent contiguous memory from discontiguous pages. 6398 * 6399 * Queueing large contiguous runs of pages for batching, however, 6400 * causes the pages to actually be freed in smaller chunks. As there 6401 * can be a significant delay between the individual batches being 6402 * recycled, this leads to the once large chunks of space being 6403 * fragmented and becoming unavailable for high-order allocations. 6404 */ 6405 return 0; 6406 #endif 6407 } 6408 6409 /* 6410 * pcp->high and pcp->batch values are related and generally batch is lower 6411 * than high. They are also related to pcp->count such that count is lower 6412 * than high, and as soon as it reaches high, the pcplist is flushed. 6413 * 6414 * However, guaranteeing these relations at all times would require e.g. write 6415 * barriers here but also careful usage of read barriers at the read side, and 6416 * thus be prone to error and bad for performance. Thus the update only prevents 6417 * store tearing. Any new users of pcp->batch and pcp->high should ensure they 6418 * can cope with those fields changing asynchronously, and fully trust only the 6419 * pcp->count field on the local CPU with interrupts disabled. 6420 * 6421 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 6422 * outside of boot time (or some other assurance that no concurrent updaters 6423 * exist). 
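 *
 * As a ballpark illustration only (4 KiB pages, percpu_pagelist_fraction
 * left at its default of 0), a 4 GiB zone ends up with roughly
 *
 *	batch = zone_batchsize() = (1M / PAGE_SIZE) / 4, clamped to 2^n - 1 = 63
 *	high  = 6 * batch = 378 pages per CPU
 *
 * via zone_set_pageset_high_and_batch() below.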
6424 */ 6425 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 6426 unsigned long batch) 6427 { 6428 WRITE_ONCE(pcp->batch, batch); 6429 WRITE_ONCE(pcp->high, high); 6430 } 6431 6432 static void pageset_init(struct per_cpu_pageset *p) 6433 { 6434 struct per_cpu_pages *pcp; 6435 int migratetype; 6436 6437 memset(p, 0, sizeof(*p)); 6438 6439 pcp = &p->pcp; 6440 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 6441 INIT_LIST_HEAD(&pcp->lists[migratetype]); 6442 6443 /* 6444 * Set batch and high values safe for a boot pageset. A true percpu 6445 * pageset's initialization will update them subsequently. Here we don't 6446 * need to be as careful as pageset_update() as nobody can access the 6447 * pageset yet. 6448 */ 6449 pcp->high = BOOT_PAGESET_HIGH; 6450 pcp->batch = BOOT_PAGESET_BATCH; 6451 } 6452 6453 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, 6454 unsigned long batch) 6455 { 6456 struct per_cpu_pageset *p; 6457 int cpu; 6458 6459 for_each_possible_cpu(cpu) { 6460 p = per_cpu_ptr(zone->pageset, cpu); 6461 pageset_update(&p->pcp, high, batch); 6462 } 6463 } 6464 6465 /* 6466 * Calculate and set new high and batch values for all per-cpu pagesets of a 6467 * zone, based on the zone's size and the percpu_pagelist_fraction sysctl. 6468 */ 6469 static void zone_set_pageset_high_and_batch(struct zone *zone) 6470 { 6471 unsigned long new_high, new_batch; 6472 6473 if (percpu_pagelist_fraction) { 6474 new_high = zone_managed_pages(zone) / percpu_pagelist_fraction; 6475 new_batch = max(1UL, new_high / 4); 6476 if ((new_high / 4) > (PAGE_SHIFT * 8)) 6477 new_batch = PAGE_SHIFT * 8; 6478 } else { 6479 new_batch = zone_batchsize(zone); 6480 new_high = 6 * new_batch; 6481 new_batch = max(1UL, 1 * new_batch); 6482 } 6483 6484 if (zone->pageset_high == new_high && 6485 zone->pageset_batch == new_batch) 6486 return; 6487 6488 zone->pageset_high = new_high; 6489 zone->pageset_batch = new_batch; 6490 6491 __zone_set_pageset_high_and_batch(zone, new_high, new_batch); 6492 } 6493 6494 void __meminit setup_zone_pageset(struct zone *zone) 6495 { 6496 struct per_cpu_pageset *p; 6497 int cpu; 6498 6499 zone->pageset = alloc_percpu(struct per_cpu_pageset); 6500 for_each_possible_cpu(cpu) { 6501 p = per_cpu_ptr(zone->pageset, cpu); 6502 pageset_init(p); 6503 } 6504 6505 zone_set_pageset_high_and_batch(zone); 6506 } 6507 6508 /* 6509 * Allocate per cpu pagesets and initialize them. 6510 * Before this call only boot pagesets were available. 6511 */ 6512 void __init setup_per_cpu_pageset(void) 6513 { 6514 struct pglist_data *pgdat; 6515 struct zone *zone; 6516 int __maybe_unused cpu; 6517 6518 for_each_populated_zone(zone) 6519 setup_zone_pageset(zone); 6520 6521 #ifdef CONFIG_NUMA 6522 /* 6523 * Unpopulated zones continue using the boot pagesets. 6524 * The numa stats for these pagesets need to be reset. 6525 * Otherwise, they will end up skewing the stats of 6526 * the nodes these zones are associated with. 6527 */ 6528 for_each_possible_cpu(cpu) { 6529 struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); 6530 memset(pcp->vm_numa_stat_diff, 0, 6531 sizeof(pcp->vm_numa_stat_diff)); 6532 } 6533 #endif 6534 6535 for_each_online_pgdat(pgdat) 6536 pgdat->per_cpu_nodestats = 6537 alloc_percpu(struct per_cpu_nodestat); 6538 } 6539 6540 static __meminit void zone_pcp_init(struct zone *zone) 6541 { 6542 /* 6543 * per cpu subsystem is not up at this point. 
The following code 6544 * relies on the ability of the linker to provide the 6545 * offset of a (static) per cpu variable into the per cpu area. 6546 */ 6547 zone->pageset = &boot_pageset; 6548 zone->pageset_high = BOOT_PAGESET_HIGH; 6549 zone->pageset_batch = BOOT_PAGESET_BATCH; 6550 6551 if (populated_zone(zone)) 6552 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 6553 zone->name, zone->present_pages, 6554 zone_batchsize(zone)); 6555 } 6556 6557 void __meminit init_currently_empty_zone(struct zone *zone, 6558 unsigned long zone_start_pfn, 6559 unsigned long size) 6560 { 6561 struct pglist_data *pgdat = zone->zone_pgdat; 6562 int zone_idx = zone_idx(zone) + 1; 6563 6564 if (zone_idx > pgdat->nr_zones) 6565 pgdat->nr_zones = zone_idx; 6566 6567 zone->zone_start_pfn = zone_start_pfn; 6568 6569 mminit_dprintk(MMINIT_TRACE, "memmap_init", 6570 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 6571 pgdat->node_id, 6572 (unsigned long)zone_idx(zone), 6573 zone_start_pfn, (zone_start_pfn + size)); 6574 6575 zone_init_free_lists(zone); 6576 zone->initialized = 1; 6577 } 6578 6579 /** 6580 * get_pfn_range_for_nid - Return the start and end page frames for a node 6581 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 6582 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 6583 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 6584 * 6585 * It returns the start and end page frame of a node based on information 6586 * provided by memblock_set_node(). If called for a node 6587 * with no available memory, a warning is printed and the start and end 6588 * PFNs will be 0. 6589 */ 6590 void __init get_pfn_range_for_nid(unsigned int nid, 6591 unsigned long *start_pfn, unsigned long *end_pfn) 6592 { 6593 unsigned long this_start_pfn, this_end_pfn; 6594 int i; 6595 6596 *start_pfn = -1UL; 6597 *end_pfn = 0; 6598 6599 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 6600 *start_pfn = min(*start_pfn, this_start_pfn); 6601 *end_pfn = max(*end_pfn, this_end_pfn); 6602 } 6603 6604 if (*start_pfn == -1UL) 6605 *start_pfn = 0; 6606 } 6607 6608 /* 6609 * This finds a zone that can be used for ZONE_MOVABLE pages. The 6610 * assumption is made that zones within a node are ordered in monotonic 6611 * increasing memory addresses so that the "highest" populated zone is used 6612 */ 6613 static void __init find_usable_zone_for_movable(void) 6614 { 6615 int zone_index; 6616 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 6617 if (zone_index == ZONE_MOVABLE) 6618 continue; 6619 6620 if (arch_zone_highest_possible_pfn[zone_index] > 6621 arch_zone_lowest_possible_pfn[zone_index]) 6622 break; 6623 } 6624 6625 VM_BUG_ON(zone_index == -1); 6626 movable_zone = zone_index; 6627 } 6628 6629 /* 6630 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 6631 * because it is sized independent of architecture. Unlike the other zones, 6632 * the starting point for ZONE_MOVABLE is not fixed. It may be different 6633 * in each node depending on the size of each node and how evenly kernelcore 6634 * is distributed. This helper function adjusts the zone ranges 6635 * provided by the architecture for a given node by using the end of the 6636 * highest usable zone for ZONE_MOVABLE. 
This preserves the assumption that 6637 * zones within a node are in order of monotonic increases memory addresses 6638 */ 6639 static void __init adjust_zone_range_for_zone_movable(int nid, 6640 unsigned long zone_type, 6641 unsigned long node_start_pfn, 6642 unsigned long node_end_pfn, 6643 unsigned long *zone_start_pfn, 6644 unsigned long *zone_end_pfn) 6645 { 6646 /* Only adjust if ZONE_MOVABLE is on this node */ 6647 if (zone_movable_pfn[nid]) { 6648 /* Size ZONE_MOVABLE */ 6649 if (zone_type == ZONE_MOVABLE) { 6650 *zone_start_pfn = zone_movable_pfn[nid]; 6651 *zone_end_pfn = min(node_end_pfn, 6652 arch_zone_highest_possible_pfn[movable_zone]); 6653 6654 /* Adjust for ZONE_MOVABLE starting within this range */ 6655 } else if (!mirrored_kernelcore && 6656 *zone_start_pfn < zone_movable_pfn[nid] && 6657 *zone_end_pfn > zone_movable_pfn[nid]) { 6658 *zone_end_pfn = zone_movable_pfn[nid]; 6659 6660 /* Check if this whole range is within ZONE_MOVABLE */ 6661 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 6662 *zone_start_pfn = *zone_end_pfn; 6663 } 6664 } 6665 6666 /* 6667 * Return the number of pages a zone spans in a node, including holes 6668 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 6669 */ 6670 static unsigned long __init zone_spanned_pages_in_node(int nid, 6671 unsigned long zone_type, 6672 unsigned long node_start_pfn, 6673 unsigned long node_end_pfn, 6674 unsigned long *zone_start_pfn, 6675 unsigned long *zone_end_pfn) 6676 { 6677 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 6678 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 6679 /* When hotadd a new node from cpu_up(), the node should be empty */ 6680 if (!node_start_pfn && !node_end_pfn) 6681 return 0; 6682 6683 /* Get the start and end of the zone */ 6684 *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 6685 *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 6686 adjust_zone_range_for_zone_movable(nid, zone_type, 6687 node_start_pfn, node_end_pfn, 6688 zone_start_pfn, zone_end_pfn); 6689 6690 /* Check that this node has pages within the zone's required range */ 6691 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) 6692 return 0; 6693 6694 /* Move the zone boundaries inside the node if necessary */ 6695 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); 6696 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); 6697 6698 /* Return the spanned pages */ 6699 return *zone_end_pfn - *zone_start_pfn; 6700 } 6701 6702 /* 6703 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 6704 * then all holes in the requested range will be accounted for. 6705 */ 6706 unsigned long __init __absent_pages_in_range(int nid, 6707 unsigned long range_start_pfn, 6708 unsigned long range_end_pfn) 6709 { 6710 unsigned long nr_absent = range_end_pfn - range_start_pfn; 6711 unsigned long start_pfn, end_pfn; 6712 int i; 6713 6714 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 6715 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 6716 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 6717 nr_absent -= end_pfn - start_pfn; 6718 } 6719 return nr_absent; 6720 } 6721 6722 /** 6723 * absent_pages_in_range - Return number of page frames in holes within a range 6724 * @start_pfn: The start PFN to start searching for holes 6725 * @end_pfn: The end PFN to stop searching for holes 6726 * 6727 * Return: the number of pages frames in memory holes within a range. 
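 *
 * For example, absent_pages_in_range(0, 0x100000) counts every pfn below
 * 4 GiB (with 4 KiB pages) that is not covered by memblock.memory.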
6728 */ 6729 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 6730 unsigned long end_pfn) 6731 { 6732 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 6733 } 6734 6735 /* Return the number of page frames in holes in a zone on a node */ 6736 static unsigned long __init zone_absent_pages_in_node(int nid, 6737 unsigned long zone_type, 6738 unsigned long node_start_pfn, 6739 unsigned long node_end_pfn) 6740 { 6741 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 6742 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 6743 unsigned long zone_start_pfn, zone_end_pfn; 6744 unsigned long nr_absent; 6745 6746 /* When hotadd a new node from cpu_up(), the node should be empty */ 6747 if (!node_start_pfn && !node_end_pfn) 6748 return 0; 6749 6750 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 6751 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 6752 6753 adjust_zone_range_for_zone_movable(nid, zone_type, 6754 node_start_pfn, node_end_pfn, 6755 &zone_start_pfn, &zone_end_pfn); 6756 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 6757 6758 /* 6759 * ZONE_MOVABLE handling. 6760 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages 6761 * and vice versa. 6762 */ 6763 if (mirrored_kernelcore && zone_movable_pfn[nid]) { 6764 unsigned long start_pfn, end_pfn; 6765 struct memblock_region *r; 6766 6767 for_each_mem_region(r) { 6768 start_pfn = clamp(memblock_region_memory_base_pfn(r), 6769 zone_start_pfn, zone_end_pfn); 6770 end_pfn = clamp(memblock_region_memory_end_pfn(r), 6771 zone_start_pfn, zone_end_pfn); 6772 6773 if (zone_type == ZONE_MOVABLE && 6774 memblock_is_mirror(r)) 6775 nr_absent += end_pfn - start_pfn; 6776 6777 if (zone_type == ZONE_NORMAL && 6778 !memblock_is_mirror(r)) 6779 nr_absent += end_pfn - start_pfn; 6780 } 6781 } 6782 6783 return nr_absent; 6784 } 6785 6786 static void __init calculate_node_totalpages(struct pglist_data *pgdat, 6787 unsigned long node_start_pfn, 6788 unsigned long node_end_pfn) 6789 { 6790 unsigned long realtotalpages = 0, totalpages = 0; 6791 enum zone_type i; 6792 6793 for (i = 0; i < MAX_NR_ZONES; i++) { 6794 struct zone *zone = pgdat->node_zones + i; 6795 unsigned long zone_start_pfn, zone_end_pfn; 6796 unsigned long spanned, absent; 6797 unsigned long size, real_size; 6798 6799 spanned = zone_spanned_pages_in_node(pgdat->node_id, i, 6800 node_start_pfn, 6801 node_end_pfn, 6802 &zone_start_pfn, 6803 &zone_end_pfn); 6804 absent = zone_absent_pages_in_node(pgdat->node_id, i, 6805 node_start_pfn, 6806 node_end_pfn); 6807 6808 size = spanned; 6809 real_size = size - absent; 6810 6811 if (size) 6812 zone->zone_start_pfn = zone_start_pfn; 6813 else 6814 zone->zone_start_pfn = 0; 6815 zone->spanned_pages = size; 6816 zone->present_pages = real_size; 6817 6818 totalpages += size; 6819 realtotalpages += real_size; 6820 } 6821 6822 pgdat->node_spanned_pages = totalpages; 6823 pgdat->node_present_pages = realtotalpages; 6824 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 6825 realtotalpages); 6826 } 6827 6828 #ifndef CONFIG_SPARSEMEM 6829 /* 6830 * Calculate the size of the zone->blockflags rounded to an unsigned long 6831 * Start by making sure zonesize is a multiple of pageblock_order by rounding 6832 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 6833 * round what is now in bits to nearest long in bits, then return it in 6834 * bytes. 
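 *
 * For example (4 KiB pages, pageblock_order 9, NR_PAGEBLOCK_BITS 4): a
 * 1 GiB zone has 512 pageblocks, needing 2048 bits, which round up to
 * 256 bytes of blockflags.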
6835 */ 6836 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 6837 { 6838 unsigned long usemapsize; 6839 6840 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 6841 usemapsize = roundup(zonesize, pageblock_nr_pages); 6842 usemapsize = usemapsize >> pageblock_order; 6843 usemapsize *= NR_PAGEBLOCK_BITS; 6844 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 6845 6846 return usemapsize / 8; 6847 } 6848 6849 static void __ref setup_usemap(struct zone *zone) 6850 { 6851 unsigned long usemapsize = usemap_size(zone->zone_start_pfn, 6852 zone->spanned_pages); 6853 zone->pageblock_flags = NULL; 6854 if (usemapsize) { 6855 zone->pageblock_flags = 6856 memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, 6857 zone_to_nid(zone)); 6858 if (!zone->pageblock_flags) 6859 panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", 6860 usemapsize, zone->name, zone_to_nid(zone)); 6861 } 6862 } 6863 #else 6864 static inline void setup_usemap(struct zone *zone) {} 6865 #endif /* CONFIG_SPARSEMEM */ 6866 6867 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 6868 6869 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 6870 void __init set_pageblock_order(void) 6871 { 6872 unsigned int order; 6873 6874 /* Check that pageblock_nr_pages has not already been setup */ 6875 if (pageblock_order) 6876 return; 6877 6878 if (HPAGE_SHIFT > PAGE_SHIFT) 6879 order = HUGETLB_PAGE_ORDER; 6880 else 6881 order = MAX_ORDER - 1; 6882 6883 /* 6884 * Assume the largest contiguous order of interest is a huge page. 6885 * This value may be variable depending on boot parameters on IA64 and 6886 * powerpc. 6887 */ 6888 pageblock_order = order; 6889 } 6890 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 6891 6892 /* 6893 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 6894 * is unused as pageblock_order is set at compile-time. See 6895 * include/linux/pageblock-flags.h for the values of pageblock_order based on 6896 * the kernel config 6897 */ 6898 void __init set_pageblock_order(void) 6899 { 6900 } 6901 6902 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 6903 6904 static unsigned long __init calc_memmap_size(unsigned long spanned_pages, 6905 unsigned long present_pages) 6906 { 6907 unsigned long pages = spanned_pages; 6908 6909 /* 6910 * Provide a more accurate estimation if there are holes within 6911 * the zone and SPARSEMEM is in use. If there are holes within the 6912 * zone, each populated memory region may cost us one or two extra 6913 * memmap pages due to alignment because memmap pages for each 6914 * populated regions may not be naturally aligned on page boundary. 6915 * So the (present_pages >> 4) heuristic is a tradeoff for that. 
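 *
 * For example, a zone spanning 1 GiB of which only 256 MiB is present is
 * charged for 65536 struct pages - roughly 4 MiB of memmap with a 64-byte
 * struct page - rather than 16 MiB for the whole span.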
6916 */ 6917 if (spanned_pages > present_pages + (present_pages >> 4) && 6918 IS_ENABLED(CONFIG_SPARSEMEM)) 6919 pages = present_pages; 6920 6921 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 6922 } 6923 6924 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6925 static void pgdat_init_split_queue(struct pglist_data *pgdat) 6926 { 6927 struct deferred_split *ds_queue = &pgdat->deferred_split_queue; 6928 6929 spin_lock_init(&ds_queue->split_queue_lock); 6930 INIT_LIST_HEAD(&ds_queue->split_queue); 6931 ds_queue->split_queue_len = 0; 6932 } 6933 #else 6934 static void pgdat_init_split_queue(struct pglist_data *pgdat) {} 6935 #endif 6936 6937 #ifdef CONFIG_COMPACTION 6938 static void pgdat_init_kcompactd(struct pglist_data *pgdat) 6939 { 6940 init_waitqueue_head(&pgdat->kcompactd_wait); 6941 } 6942 #else 6943 static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} 6944 #endif 6945 6946 static void __meminit pgdat_init_internals(struct pglist_data *pgdat) 6947 { 6948 pgdat_resize_init(pgdat); 6949 6950 pgdat_init_split_queue(pgdat); 6951 pgdat_init_kcompactd(pgdat); 6952 6953 init_waitqueue_head(&pgdat->kswapd_wait); 6954 init_waitqueue_head(&pgdat->pfmemalloc_wait); 6955 6956 pgdat_page_ext_init(pgdat); 6957 lruvec_init(&pgdat->__lruvec); 6958 } 6959 6960 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, 6961 unsigned long remaining_pages) 6962 { 6963 atomic_long_set(&zone->managed_pages, remaining_pages); 6964 zone_set_nid(zone, nid); 6965 zone->name = zone_names[idx]; 6966 zone->zone_pgdat = NODE_DATA(nid); 6967 spin_lock_init(&zone->lock); 6968 zone_seqlock_init(zone); 6969 zone_pcp_init(zone); 6970 } 6971 6972 /* 6973 * Set up the zone data structures 6974 * - init pgdat internals 6975 * - init all zones belonging to this node 6976 * 6977 * NOTE: this function is only called during memory hotplug 6978 */ 6979 #ifdef CONFIG_MEMORY_HOTPLUG 6980 void __ref free_area_init_core_hotplug(int nid) 6981 { 6982 enum zone_type z; 6983 pg_data_t *pgdat = NODE_DATA(nid); 6984 6985 pgdat_init_internals(pgdat); 6986 for (z = 0; z < MAX_NR_ZONES; z++) 6987 zone_init_internals(&pgdat->node_zones[z], z, nid, 0); 6988 } 6989 #endif 6990 6991 /* 6992 * Set up the zone data structures: 6993 * - mark all pages reserved 6994 * - mark all memory queues empty 6995 * - clear the memory bitmaps 6996 * 6997 * NOTE: pgdat should get zeroed by caller. 6998 * NOTE: this function is only called during early init. 6999 */ 7000 static void __init free_area_init_core(struct pglist_data *pgdat) 7001 { 7002 enum zone_type j; 7003 int nid = pgdat->node_id; 7004 7005 pgdat_init_internals(pgdat); 7006 pgdat->per_cpu_nodestats = &boot_nodestats; 7007 7008 for (j = 0; j < MAX_NR_ZONES; j++) { 7009 struct zone *zone = pgdat->node_zones + j; 7010 unsigned long size, freesize, memmap_pages; 7011 7012 size = zone->spanned_pages; 7013 freesize = zone->present_pages; 7014 7015 /* 7016 * Adjust freesize so that it accounts for how much memory 7017 * is used by this zone for memmap. 
This affects the watermark 7018 * and per-cpu initialisations 7019 */ 7020 memmap_pages = calc_memmap_size(size, freesize); 7021 if (!is_highmem_idx(j)) { 7022 if (freesize >= memmap_pages) { 7023 freesize -= memmap_pages; 7024 if (memmap_pages) 7025 printk(KERN_DEBUG 7026 " %s zone: %lu pages used for memmap\n", 7027 zone_names[j], memmap_pages); 7028 } else 7029 pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", 7030 zone_names[j], memmap_pages, freesize); 7031 } 7032 7033 /* Account for reserved pages */ 7034 if (j == 0 && freesize > dma_reserve) { 7035 freesize -= dma_reserve; 7036 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 7037 zone_names[0], dma_reserve); 7038 } 7039 7040 if (!is_highmem_idx(j)) 7041 nr_kernel_pages += freesize; 7042 /* Charge for highmem memmap if there are enough kernel pages */ 7043 else if (nr_kernel_pages > memmap_pages * 2) 7044 nr_kernel_pages -= memmap_pages; 7045 nr_all_pages += freesize; 7046 7047 /* 7048 * Set an approximate value for lowmem here, it will be adjusted 7049 * when the bootmem allocator frees pages into the buddy system. 7050 * And all highmem pages will be managed by the buddy system. 7051 */ 7052 zone_init_internals(zone, j, nid, freesize); 7053 7054 if (!size) 7055 continue; 7056 7057 set_pageblock_order(); 7058 setup_usemap(zone); 7059 init_currently_empty_zone(zone, zone->zone_start_pfn, size); 7060 memmap_init_zone(zone); 7061 } 7062 } 7063 7064 #ifdef CONFIG_FLAT_NODE_MEM_MAP 7065 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) 7066 { 7067 unsigned long __maybe_unused start = 0; 7068 unsigned long __maybe_unused offset = 0; 7069 7070 /* Skip empty nodes */ 7071 if (!pgdat->node_spanned_pages) 7072 return; 7073 7074 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 7075 offset = pgdat->node_start_pfn - start; 7076 /* ia64 gets its own node_mem_map, before this, without bootmem */ 7077 if (!pgdat->node_mem_map) { 7078 unsigned long size, end; 7079 struct page *map; 7080 7081 /* 7082 * The zone's endpoints aren't required to be MAX_ORDER 7083 * aligned but the node_mem_map endpoints must be in order 7084 * for the buddy allocator to function correctly. 
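 *
 * For example, with MAX_ORDER == 11 (MAX_ORDER_NR_PAGES == 1024) a node
 * spanning pfns 0x1080-0x1f80 gets a mem_map covering pfns 0x1000-0x2000,
 * and node_mem_map points 0x80 entries into that map.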
7085 */ 7086 end = pgdat_end_pfn(pgdat); 7087 end = ALIGN(end, MAX_ORDER_NR_PAGES); 7088 size = (end - start) * sizeof(struct page); 7089 map = memblock_alloc_node(size, SMP_CACHE_BYTES, 7090 pgdat->node_id); 7091 if (!map) 7092 panic("Failed to allocate %ld bytes for node %d memory map\n", 7093 size, pgdat->node_id); 7094 pgdat->node_mem_map = map + offset; 7095 } 7096 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", 7097 __func__, pgdat->node_id, (unsigned long)pgdat, 7098 (unsigned long)pgdat->node_mem_map); 7099 #ifndef CONFIG_NEED_MULTIPLE_NODES 7100 /* 7101 * With no DISCONTIG, the global mem_map is just set as node 0's 7102 */ 7103 if (pgdat == NODE_DATA(0)) { 7104 mem_map = NODE_DATA(0)->node_mem_map; 7105 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 7106 mem_map -= offset; 7107 } 7108 #endif 7109 } 7110 #else 7111 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } 7112 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 7113 7114 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 7115 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) 7116 { 7117 pgdat->first_deferred_pfn = ULONG_MAX; 7118 } 7119 #else 7120 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} 7121 #endif 7122 7123 static void __init free_area_init_node(int nid) 7124 { 7125 pg_data_t *pgdat = NODE_DATA(nid); 7126 unsigned long start_pfn = 0; 7127 unsigned long end_pfn = 0; 7128 7129 /* pg_data_t should be reset to zero when it's allocated */ 7130 WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); 7131 7132 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 7133 7134 pgdat->node_id = nid; 7135 pgdat->node_start_pfn = start_pfn; 7136 pgdat->per_cpu_nodestats = NULL; 7137 7138 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 7139 (u64)start_pfn << PAGE_SHIFT, 7140 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 7141 calculate_node_totalpages(pgdat, start_pfn, end_pfn); 7142 7143 alloc_node_mem_map(pgdat); 7144 pgdat_set_deferred_range(pgdat); 7145 7146 free_area_init_core(pgdat); 7147 } 7148 7149 void __init free_area_init_memoryless_node(int nid) 7150 { 7151 free_area_init_node(nid); 7152 } 7153 7154 #if MAX_NUMNODES > 1 7155 /* 7156 * Figure out the number of possible node ids. 7157 */ 7158 void __init setup_nr_node_ids(void) 7159 { 7160 unsigned int highest; 7161 7162 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); 7163 nr_node_ids = highest + 1; 7164 } 7165 #endif 7166 7167 /** 7168 * node_map_pfn_alignment - determine the maximum internode alignment 7169 * 7170 * This function should be called after node map is populated and sorted. 7171 * It calculates the maximum power of two alignment which can distinguish 7172 * all the nodes. 7173 * 7174 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 7175 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 7176 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 7177 * shifted, 1GiB is enough and this function will indicate so. 7178 * 7179 * This is used to test whether pfn -> nid mapping of the chosen memory 7180 * model has fine enough granularity to avoid incorrect mapping for the 7181 * populated node map. 7182 * 7183 * Return: the determined alignment in pfn's. 0 if there is no alignment 7184 * requirement (single node). 
7185 */ 7186 unsigned long __init node_map_pfn_alignment(void) 7187 { 7188 unsigned long accl_mask = 0, last_end = 0; 7189 unsigned long start, end, mask; 7190 int last_nid = NUMA_NO_NODE; 7191 int i, nid; 7192 7193 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 7194 if (!start || last_nid < 0 || last_nid == nid) { 7195 last_nid = nid; 7196 last_end = end; 7197 continue; 7198 } 7199 7200 /* 7201 * Start with a mask granular enough to pin-point to the 7202 * start pfn and tick off bits one-by-one until it becomes 7203 * too coarse to separate the current node from the last. 7204 */ 7205 mask = ~((1 << __ffs(start)) - 1); 7206 while (mask && last_end <= (start & (mask << 1))) 7207 mask <<= 1; 7208 7209 /* accumulate all internode masks */ 7210 accl_mask |= mask; 7211 } 7212 7213 /* convert mask to number of pages */ 7214 return ~accl_mask + 1; 7215 } 7216 7217 /** 7218 * find_min_pfn_with_active_regions - Find the minimum PFN registered 7219 * 7220 * Return: the minimum PFN based on information provided via 7221 * memblock_set_node(). 7222 */ 7223 unsigned long __init find_min_pfn_with_active_regions(void) 7224 { 7225 return PHYS_PFN(memblock_start_of_DRAM()); 7226 } 7227 7228 /* 7229 * early_calculate_totalpages() 7230 * Sum pages in active regions for movable zone. 7231 * Populate N_MEMORY for calculating usable_nodes. 7232 */ 7233 static unsigned long __init early_calculate_totalpages(void) 7234 { 7235 unsigned long totalpages = 0; 7236 unsigned long start_pfn, end_pfn; 7237 int i, nid; 7238 7239 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 7240 unsigned long pages = end_pfn - start_pfn; 7241 7242 totalpages += pages; 7243 if (pages) 7244 node_set_state(nid, N_MEMORY); 7245 } 7246 return totalpages; 7247 } 7248 7249 /* 7250 * Find the PFN the Movable zone begins in each node. Kernel memory 7251 * is spread evenly between nodes as long as the nodes have enough 7252 * memory. When they don't, some nodes will have more kernelcore than 7253 * others 7254 */ 7255 static void __init find_zone_movable_pfns_for_nodes(void) 7256 { 7257 int i, nid; 7258 unsigned long usable_startpfn; 7259 unsigned long kernelcore_node, kernelcore_remaining; 7260 /* save the state before borrow the nodemask */ 7261 nodemask_t saved_node_state = node_states[N_MEMORY]; 7262 unsigned long totalpages = early_calculate_totalpages(); 7263 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 7264 struct memblock_region *r; 7265 7266 /* Need to find movable_zone earlier when movable_node is specified. */ 7267 find_usable_zone_for_movable(); 7268 7269 /* 7270 * If movable_node is specified, ignore kernelcore and movablecore 7271 * options. 7272 */ 7273 if (movable_node_is_enabled()) { 7274 for_each_mem_region(r) { 7275 if (!memblock_is_hotpluggable(r)) 7276 continue; 7277 7278 nid = memblock_get_region_node(r); 7279 7280 usable_startpfn = PFN_DOWN(r->base); 7281 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 
7282 min(usable_startpfn, zone_movable_pfn[nid]) : 7283 usable_startpfn; 7284 } 7285 7286 goto out2; 7287 } 7288 7289 /* 7290 * If kernelcore=mirror is specified, ignore movablecore option 7291 */ 7292 if (mirrored_kernelcore) { 7293 bool mem_below_4gb_not_mirrored = false; 7294 7295 for_each_mem_region(r) { 7296 if (memblock_is_mirror(r)) 7297 continue; 7298 7299 nid = memblock_get_region_node(r); 7300 7301 usable_startpfn = memblock_region_memory_base_pfn(r); 7302 7303 if (usable_startpfn < 0x100000) { 7304 mem_below_4gb_not_mirrored = true; 7305 continue; 7306 } 7307 7308 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 7309 min(usable_startpfn, zone_movable_pfn[nid]) : 7310 usable_startpfn; 7311 } 7312 7313 if (mem_below_4gb_not_mirrored) 7314 pr_warn("This configuration results in unmirrored kernel memory.\n"); 7315 7316 goto out2; 7317 } 7318 7319 /* 7320 * If kernelcore=nn% or movablecore=nn% was specified, calculate the 7321 * amount of necessary memory. 7322 */ 7323 if (required_kernelcore_percent) 7324 required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / 7325 10000UL; 7326 if (required_movablecore_percent) 7327 required_movablecore = (totalpages * 100 * required_movablecore_percent) / 7328 10000UL; 7329 7330 /* 7331 * If movablecore= was specified, calculate what size of 7332 * kernelcore that corresponds so that memory usable for 7333 * any allocation type is evenly spread. If both kernelcore 7334 * and movablecore are specified, then the value of kernelcore 7335 * will be used for required_kernelcore if it's greater than 7336 * what movablecore would have allowed. 7337 */ 7338 if (required_movablecore) { 7339 unsigned long corepages; 7340 7341 /* 7342 * Round-up so that ZONE_MOVABLE is at least as large as what 7343 * was requested by the user 7344 */ 7345 required_movablecore = 7346 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 7347 required_movablecore = min(totalpages, required_movablecore); 7348 corepages = totalpages - required_movablecore; 7349 7350 required_kernelcore = max(required_kernelcore, corepages); 7351 } 7352 7353 /* 7354 * If kernelcore was not specified or kernelcore size is larger 7355 * than totalpages, there is no ZONE_MOVABLE. 7356 */ 7357 if (!required_kernelcore || required_kernelcore >= totalpages) 7358 goto out; 7359 7360 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 7361 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 7362 7363 restart: 7364 /* Spread kernelcore memory as evenly as possible throughout nodes */ 7365 kernelcore_node = required_kernelcore / usable_nodes; 7366 for_each_node_state(nid, N_MEMORY) { 7367 unsigned long start_pfn, end_pfn; 7368 7369 /* 7370 * Recalculate kernelcore_node if the division per node 7371 * now exceeds what is necessary to satisfy the requested 7372 * amount of memory for the kernel 7373 */ 7374 if (required_kernelcore < kernelcore_node) 7375 kernelcore_node = required_kernelcore / usable_nodes; 7376 7377 /* 7378 * As the map is walked, we track how much memory is usable 7379 * by the kernel using kernelcore_remaining. 
When it is 7380 * 0, the rest of the node is usable by ZONE_MOVABLE 7381 */ 7382 kernelcore_remaining = kernelcore_node; 7383 7384 /* Go through each range of PFNs within this node */ 7385 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 7386 unsigned long size_pages; 7387 7388 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 7389 if (start_pfn >= end_pfn) 7390 continue; 7391 7392 /* Account for what is only usable for kernelcore */ 7393 if (start_pfn < usable_startpfn) { 7394 unsigned long kernel_pages; 7395 kernel_pages = min(end_pfn, usable_startpfn) 7396 - start_pfn; 7397 7398 kernelcore_remaining -= min(kernel_pages, 7399 kernelcore_remaining); 7400 required_kernelcore -= min(kernel_pages, 7401 required_kernelcore); 7402 7403 /* Continue if range is now fully accounted */ 7404 if (end_pfn <= usable_startpfn) { 7405 7406 /* 7407 * Push zone_movable_pfn to the end so 7408 * that if we have to rebalance 7409 * kernelcore across nodes, we will 7410 * not double account here 7411 */ 7412 zone_movable_pfn[nid] = end_pfn; 7413 continue; 7414 } 7415 start_pfn = usable_startpfn; 7416 } 7417 7418 /* 7419 * The usable PFN range for ZONE_MOVABLE is from 7420 * start_pfn->end_pfn. Calculate size_pages as the 7421 * number of pages used as kernelcore 7422 */ 7423 size_pages = end_pfn - start_pfn; 7424 if (size_pages > kernelcore_remaining) 7425 size_pages = kernelcore_remaining; 7426 zone_movable_pfn[nid] = start_pfn + size_pages; 7427 7428 /* 7429 * Some kernelcore has been met, update counts and 7430 * break if the kernelcore for this node has been 7431 * satisfied 7432 */ 7433 required_kernelcore -= min(required_kernelcore, 7434 size_pages); 7435 kernelcore_remaining -= size_pages; 7436 if (!kernelcore_remaining) 7437 break; 7438 } 7439 } 7440 7441 /* 7442 * If there is still required_kernelcore, we do another pass with one 7443 * less node in the count. This will push zone_movable_pfn[nid] further 7444 * along on the nodes that still have memory until kernelcore is 7445 * satisfied 7446 */ 7447 usable_nodes--; 7448 if (usable_nodes && required_kernelcore > usable_nodes) 7449 goto restart; 7450 7451 out2: 7452 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 7453 for (nid = 0; nid < MAX_NUMNODES; nid++) 7454 zone_movable_pfn[nid] = 7455 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 7456 7457 out: 7458 /* restore the node_state */ 7459 node_states[N_MEMORY] = saved_node_state; 7460 } 7461 7462 /* Any regular or high memory on that node ? */ 7463 static void check_for_memory(pg_data_t *pgdat, int nid) 7464 { 7465 enum zone_type zone_type; 7466 7467 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 7468 struct zone *zone = &pgdat->node_zones[zone_type]; 7469 if (populated_zone(zone)) { 7470 if (IS_ENABLED(CONFIG_HIGHMEM)) 7471 node_set_state(nid, N_HIGH_MEMORY); 7472 if (zone_type <= ZONE_NORMAL) 7473 node_set_state(nid, N_NORMAL_MEMORY); 7474 break; 7475 } 7476 } 7477 } 7478 7479 /* 7480 * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For 7481 * such cases we allow max_zone_pfn sorted in the descending order 7482 */ 7483 bool __weak arch_has_descending_max_zone_pfns(void) 7484 { 7485 return false; 7486 } 7487 7488 /** 7489 * free_area_init - Initialise all pg_data_t and zone data 7490 * @max_zone_pfn: an array of max PFNs for each zone 7491 * 7492 * This will call free_area_init_node() for each active node in the system. 
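 *
 * For illustration only (hypothetical, architecture-specific limits): an
 * architecture typically fills in max_zone_pfn[] and then calls this
 * function, roughly like
 *
 *	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
 *
 *	max_zone_pfn[ZONE_DMA32]  = MAX_DMA32_PFN;
 *	max_zone_pfn[ZONE_NORMAL] = max_pfn;
 *	free_area_init(max_zone_pfn);
 *
 * The exact set of zones and their PFN limits are architecture specific;
 * the limit macros above are placeholders.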
7493 * Using the page ranges provided by memblock_set_node(), the size of each 7494 * zone in each node and their holes is calculated. If the maximum PFN 7495 * between two adjacent zones match, it is assumed that the zone is empty. 7496 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 7497 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 7498 * starts where the previous one ended. For example, ZONE_DMA32 starts 7499 * at arch_max_dma_pfn. 7500 */ 7501 void __init free_area_init(unsigned long *max_zone_pfn) 7502 { 7503 unsigned long start_pfn, end_pfn; 7504 int i, nid, zone; 7505 bool descending; 7506 7507 /* Record where the zone boundaries are */ 7508 memset(arch_zone_lowest_possible_pfn, 0, 7509 sizeof(arch_zone_lowest_possible_pfn)); 7510 memset(arch_zone_highest_possible_pfn, 0, 7511 sizeof(arch_zone_highest_possible_pfn)); 7512 7513 start_pfn = find_min_pfn_with_active_regions(); 7514 descending = arch_has_descending_max_zone_pfns(); 7515 7516 for (i = 0; i < MAX_NR_ZONES; i++) { 7517 if (descending) 7518 zone = MAX_NR_ZONES - i - 1; 7519 else 7520 zone = i; 7521 7522 if (zone == ZONE_MOVABLE) 7523 continue; 7524 7525 end_pfn = max(max_zone_pfn[zone], start_pfn); 7526 arch_zone_lowest_possible_pfn[zone] = start_pfn; 7527 arch_zone_highest_possible_pfn[zone] = end_pfn; 7528 7529 start_pfn = end_pfn; 7530 } 7531 7532 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 7533 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 7534 find_zone_movable_pfns_for_nodes(); 7535 7536 /* Print out the zone ranges */ 7537 pr_info("Zone ranges:\n"); 7538 for (i = 0; i < MAX_NR_ZONES; i++) { 7539 if (i == ZONE_MOVABLE) 7540 continue; 7541 pr_info(" %-8s ", zone_names[i]); 7542 if (arch_zone_lowest_possible_pfn[i] == 7543 arch_zone_highest_possible_pfn[i]) 7544 pr_cont("empty\n"); 7545 else 7546 pr_cont("[mem %#018Lx-%#018Lx]\n", 7547 (u64)arch_zone_lowest_possible_pfn[i] 7548 << PAGE_SHIFT, 7549 ((u64)arch_zone_highest_possible_pfn[i] 7550 << PAGE_SHIFT) - 1); 7551 } 7552 7553 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 7554 pr_info("Movable zone start for each node\n"); 7555 for (i = 0; i < MAX_NUMNODES; i++) { 7556 if (zone_movable_pfn[i]) 7557 pr_info(" Node %d: %#018Lx\n", i, 7558 (u64)zone_movable_pfn[i] << PAGE_SHIFT); 7559 } 7560 7561 /* 7562 * Print out the early node map, and initialize the 7563 * subsection-map relative to active online memory ranges to 7564 * enable future "sub-section" extensions of the memory map. 
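 *
 * For illustration (hypothetical PFNs), each range below is printed in
 * the form
 *
 *	node   0: [mem 0x0000000000001000-0x000000007fffffff]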
7565 */ 7566 pr_info("Early memory node ranges\n"); 7567 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 7568 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, 7569 (u64)start_pfn << PAGE_SHIFT, 7570 ((u64)end_pfn << PAGE_SHIFT) - 1); 7571 subsection_map_init(start_pfn, end_pfn - start_pfn); 7572 } 7573 7574 /* Initialise every node */ 7575 mminit_verify_pageflags_layout(); 7576 setup_nr_node_ids(); 7577 for_each_online_node(nid) { 7578 pg_data_t *pgdat = NODE_DATA(nid); 7579 free_area_init_node(nid); 7580 7581 /* Any memory on that node */ 7582 if (pgdat->node_present_pages) 7583 node_set_state(nid, N_MEMORY); 7584 check_for_memory(pgdat, nid); 7585 } 7586 } 7587 7588 static int __init cmdline_parse_core(char *p, unsigned long *core, 7589 unsigned long *percent) 7590 { 7591 unsigned long long coremem; 7592 char *endptr; 7593 7594 if (!p) 7595 return -EINVAL; 7596 7597 /* Value may be a percentage of total memory, otherwise bytes */ 7598 coremem = simple_strtoull(p, &endptr, 0); 7599 if (*endptr == '%') { 7600 /* Paranoid check for percent values greater than 100 */ 7601 WARN_ON(coremem > 100); 7602 7603 *percent = coremem; 7604 } else { 7605 coremem = memparse(p, &p); 7606 /* Paranoid check that UL is enough for the coremem value */ 7607 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 7608 7609 *core = coremem >> PAGE_SHIFT; 7610 *percent = 0UL; 7611 } 7612 return 0; 7613 } 7614 7615 /* 7616 * kernelcore=size sets the amount of memory for use for allocations that 7617 * cannot be reclaimed or migrated. 7618 */ 7619 static int __init cmdline_parse_kernelcore(char *p) 7620 { 7621 /* parse kernelcore=mirror */ 7622 if (parse_option_str(p, "mirror")) { 7623 mirrored_kernelcore = true; 7624 return 0; 7625 } 7626 7627 return cmdline_parse_core(p, &required_kernelcore, 7628 &required_kernelcore_percent); 7629 } 7630 7631 /* 7632 * movablecore=size sets the amount of memory for use for allocations that 7633 * can be reclaimed or migrated. 7634 */ 7635 static int __init cmdline_parse_movablecore(char *p) 7636 { 7637 return cmdline_parse_core(p, &required_movablecore, 7638 &required_movablecore_percent); 7639 } 7640 7641 early_param("kernelcore", cmdline_parse_kernelcore); 7642 early_param("movablecore", cmdline_parse_movablecore); 7643 7644 void adjust_managed_page_count(struct page *page, long count) 7645 { 7646 atomic_long_add(count, &page_zone(page)->managed_pages); 7647 totalram_pages_add(count); 7648 #ifdef CONFIG_HIGHMEM 7649 if (PageHighMem(page)) 7650 totalhigh_pages_add(count); 7651 #endif 7652 } 7653 EXPORT_SYMBOL(adjust_managed_page_count); 7654 7655 unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) 7656 { 7657 void *pos; 7658 unsigned long pages = 0; 7659 7660 start = (void *)PAGE_ALIGN((unsigned long)start); 7661 end = (void *)((unsigned long)end & PAGE_MASK); 7662 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 7663 struct page *page = virt_to_page(pos); 7664 void *direct_map_addr; 7665 7666 /* 7667 * 'direct_map_addr' might be different from 'pos' 7668 * because some architectures' virt_to_page() 7669 * work with aliases. Getting the direct map 7670 * address ensures that we get a _writeable_ 7671 * alias for the memset(). 7672 */ 7673 direct_map_addr = page_address(page); 7674 /* 7675 * Perform a kasan-unchecked memset() since this memory 7676 * has not been initialized. 
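		 *
		 * Note on the 'poison' argument: the cast-and-compare below
		 * treats any value outside 0..0xFF (e.g. a caller passing -1)
		 * as "do not poison", so only the freeing and accounting are
		 * done in that case.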
7677 */ 7678 direct_map_addr = kasan_reset_tag(direct_map_addr); 7679 if ((unsigned int)poison <= 0xFF) 7680 memset(direct_map_addr, poison, PAGE_SIZE); 7681 7682 free_reserved_page(page); 7683 } 7684 7685 if (pages && s) 7686 pr_info("Freeing %s memory: %ldK\n", 7687 s, pages << (PAGE_SHIFT - 10)); 7688 7689 return pages; 7690 } 7691 7692 void __init mem_init_print_info(const char *str) 7693 { 7694 unsigned long physpages, codesize, datasize, rosize, bss_size; 7695 unsigned long init_code_size, init_data_size; 7696 7697 physpages = get_num_physpages(); 7698 codesize = _etext - _stext; 7699 datasize = _edata - _sdata; 7700 rosize = __end_rodata - __start_rodata; 7701 bss_size = __bss_stop - __bss_start; 7702 init_data_size = __init_end - __init_begin; 7703 init_code_size = _einittext - _sinittext; 7704 7705 /* 7706 * Detect special cases and adjust section sizes accordingly: 7707 * 1) .init.* may be embedded into .data sections 7708 * 2) .init.text.* may be out of [__init_begin, __init_end], 7709 * please refer to arch/tile/kernel/vmlinux.lds.S. 7710 * 3) .rodata.* may be embedded into .text or .data sections. 7711 */ 7712 #define adj_init_size(start, end, size, pos, adj) \ 7713 do { \ 7714 if (start <= pos && pos < end && size > adj) \ 7715 size -= adj; \ 7716 } while (0) 7717 7718 adj_init_size(__init_begin, __init_end, init_data_size, 7719 _sinittext, init_code_size); 7720 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 7721 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 7722 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 7723 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 7724 7725 #undef adj_init_size 7726 7727 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" 7728 #ifdef CONFIG_HIGHMEM 7729 ", %luK highmem" 7730 #endif 7731 "%s%s)\n", 7732 nr_free_pages() << (PAGE_SHIFT - 10), 7733 physpages << (PAGE_SHIFT - 10), 7734 codesize >> 10, datasize >> 10, rosize >> 10, 7735 (init_data_size + init_code_size) >> 10, bss_size >> 10, 7736 (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), 7737 totalcma_pages << (PAGE_SHIFT - 10), 7738 #ifdef CONFIG_HIGHMEM 7739 totalhigh_pages() << (PAGE_SHIFT - 10), 7740 #endif 7741 str ? ", " : "", str ? str : ""); 7742 } 7743 7744 /** 7745 * set_dma_reserve - set the specified number of pages reserved in the first zone 7746 * @new_dma_reserve: The number of pages to mark reserved 7747 * 7748 * The per-cpu batchsize and zone watermarks are determined by managed_pages. 7749 * In the DMA zone, a significant percentage may be consumed by kernel image 7750 * and other unfreeable allocations which can skew the watermarks badly. This 7751 * function may optionally be used to account for unfreeable pages in the 7752 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 7753 * smaller per-cpu batchsize. 7754 */ 7755 void __init set_dma_reserve(unsigned long new_dma_reserve) 7756 { 7757 dma_reserve = new_dma_reserve; 7758 } 7759 7760 static int page_alloc_cpu_dead(unsigned int cpu) 7761 { 7762 7763 lru_add_drain_cpu(cpu); 7764 drain_pages(cpu); 7765 7766 /* 7767 * Spill the event counters of the dead processor 7768 * into the current processors event counters. 7769 * This artificially elevates the count of the current 7770 * processor. 
7771 */ 7772 vm_events_fold_cpu(cpu); 7773 7774 /* 7775 * Zero the differential counters of the dead processor 7776 * so that the vm statistics are consistent. 7777 * 7778 * This is only okay since the processor is dead and cannot 7779 * race with what we are doing. 7780 */ 7781 cpu_vm_stats_fold(cpu); 7782 return 0; 7783 } 7784 7785 #ifdef CONFIG_NUMA 7786 int hashdist = HASHDIST_DEFAULT; 7787 7788 static int __init set_hashdist(char *str) 7789 { 7790 if (!str) 7791 return 0; 7792 hashdist = simple_strtoul(str, &str, 0); 7793 return 1; 7794 } 7795 __setup("hashdist=", set_hashdist); 7796 #endif 7797 7798 void __init page_alloc_init(void) 7799 { 7800 int ret; 7801 7802 #ifdef CONFIG_NUMA 7803 if (num_node_state(N_MEMORY) == 1) 7804 hashdist = 0; 7805 #endif 7806 7807 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, 7808 "mm/page_alloc:dead", NULL, 7809 page_alloc_cpu_dead); 7810 WARN_ON(ret < 0); 7811 } 7812 7813 /* 7814 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 7815 * or min_free_kbytes changes. 7816 */ 7817 static void calculate_totalreserve_pages(void) 7818 { 7819 struct pglist_data *pgdat; 7820 unsigned long reserve_pages = 0; 7821 enum zone_type i, j; 7822 7823 for_each_online_pgdat(pgdat) { 7824 7825 pgdat->totalreserve_pages = 0; 7826 7827 for (i = 0; i < MAX_NR_ZONES; i++) { 7828 struct zone *zone = pgdat->node_zones + i; 7829 long max = 0; 7830 unsigned long managed_pages = zone_managed_pages(zone); 7831 7832 /* Find valid and maximum lowmem_reserve in the zone */ 7833 for (j = i; j < MAX_NR_ZONES; j++) { 7834 if (zone->lowmem_reserve[j] > max) 7835 max = zone->lowmem_reserve[j]; 7836 } 7837 7838 /* we treat the high watermark as reserved pages. */ 7839 max += high_wmark_pages(zone); 7840 7841 if (max > managed_pages) 7842 max = managed_pages; 7843 7844 pgdat->totalreserve_pages += max; 7845 7846 reserve_pages += max; 7847 } 7848 } 7849 totalreserve_pages = reserve_pages; 7850 } 7851 7852 /* 7853 * setup_per_zone_lowmem_reserve - called whenever 7854 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 7855 * has a correct pages reserved value, so an adequate number of 7856 * pages are left in the zone after a successful __alloc_pages(). 
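 *
 * Worked example (hypothetical zone sizes; 256 is the typical default
 * ratio for ZONE_DMA32): with sysctl_lowmem_reserve_ratio[ZONE_DMA32] == 256
 * and a single higher zone, ZONE_NORMAL, managing 1048576 pages (4 GiB with
 * 4 KiB pages), the loop below sets the DMA32 zone's
 * lowmem_reserve[ZONE_NORMAL] to 1048576 / 256 = 4096 pages, i.e. 16 MiB of
 * ZONE_DMA32 is kept back from allocations that could have been satisfied
 * from ZONE_NORMAL.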
7857 */ 7858 static void setup_per_zone_lowmem_reserve(void) 7859 { 7860 struct pglist_data *pgdat; 7861 enum zone_type i, j; 7862 7863 for_each_online_pgdat(pgdat) { 7864 for (i = 0; i < MAX_NR_ZONES - 1; i++) { 7865 struct zone *zone = &pgdat->node_zones[i]; 7866 int ratio = sysctl_lowmem_reserve_ratio[i]; 7867 bool clear = !ratio || !zone_managed_pages(zone); 7868 unsigned long managed_pages = 0; 7869 7870 for (j = i + 1; j < MAX_NR_ZONES; j++) { 7871 if (clear) { 7872 zone->lowmem_reserve[j] = 0; 7873 } else { 7874 struct zone *upper_zone = &pgdat->node_zones[j]; 7875 7876 managed_pages += zone_managed_pages(upper_zone); 7877 zone->lowmem_reserve[j] = managed_pages / ratio; 7878 } 7879 } 7880 } 7881 } 7882 7883 /* update totalreserve_pages */ 7884 calculate_totalreserve_pages(); 7885 } 7886 7887 static void __setup_per_zone_wmarks(void) 7888 { 7889 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 7890 unsigned long lowmem_pages = 0; 7891 struct zone *zone; 7892 unsigned long flags; 7893 7894 /* Calculate total number of !ZONE_HIGHMEM pages */ 7895 for_each_zone(zone) { 7896 if (!is_highmem(zone)) 7897 lowmem_pages += zone_managed_pages(zone); 7898 } 7899 7900 for_each_zone(zone) { 7901 u64 tmp; 7902 7903 spin_lock_irqsave(&zone->lock, flags); 7904 tmp = (u64)pages_min * zone_managed_pages(zone); 7905 do_div(tmp, lowmem_pages); 7906 if (is_highmem(zone)) { 7907 /* 7908 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 7909 * need highmem pages, so cap pages_min to a small 7910 * value here. 7911 * 7912 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 7913 * deltas control async page reclaim, and so should 7914 * not be capped for highmem. 7915 */ 7916 unsigned long min_pages; 7917 7918 min_pages = zone_managed_pages(zone) / 1024; 7919 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 7920 zone->_watermark[WMARK_MIN] = min_pages; 7921 } else { 7922 /* 7923 * If it's a lowmem zone, reserve a number of pages 7924 * proportionate to the zone's size. 7925 */ 7926 zone->_watermark[WMARK_MIN] = tmp; 7927 } 7928 7929 /* 7930 * Set the kswapd watermarks distance according to the 7931 * scale factor in proportion to available memory, but 7932 * ensure a minimum size on small systems. 7933 */ 7934 tmp = max_t(u64, tmp >> 2, 7935 mult_frac(zone_managed_pages(zone), 7936 watermark_scale_factor, 10000)); 7937 7938 zone->watermark_boost = 0; 7939 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; 7940 zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; 7941 7942 spin_unlock_irqrestore(&zone->lock, flags); 7943 } 7944 7945 /* update totalreserve_pages */ 7946 calculate_totalreserve_pages(); 7947 } 7948 7949 /** 7950 * setup_per_zone_wmarks - called when min_free_kbytes changes 7951 * or when memory is hot-{added|removed} 7952 * 7953 * Ensures that the watermark[min,low,high] values for each zone are set 7954 * correctly with respect to min_free_kbytes. 7955 */ 7956 void setup_per_zone_wmarks(void) 7957 { 7958 static DEFINE_SPINLOCK(lock); 7959 7960 spin_lock(&lock); 7961 __setup_per_zone_wmarks(); 7962 spin_unlock(&lock); 7963 } 7964 7965 /* 7966 * Initialise min_free_kbytes. 7967 * 7968 * For small machines we want it small (128k min). For large machines 7969 * we want it large (256MB max). But it is not linear, because network 7970 * bandwidth does not increase linearly with machine size. 
We use 7971 * 7972 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 7973 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 7974 * 7975 * which yields 7976 * 7977 * 16MB: 512k 7978 * 32MB: 724k 7979 * 64MB: 1024k 7980 * 128MB: 1448k 7981 * 256MB: 2048k 7982 * 512MB: 2896k 7983 * 1024MB: 4096k 7984 * 2048MB: 5792k 7985 * 4096MB: 8192k 7986 * 8192MB: 11584k 7987 * 16384MB: 16384k 7988 */ 7989 int __meminit init_per_zone_wmark_min(void) 7990 { 7991 unsigned long lowmem_kbytes; 7992 int new_min_free_kbytes; 7993 7994 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 7995 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 7996 7997 if (new_min_free_kbytes > user_min_free_kbytes) { 7998 min_free_kbytes = new_min_free_kbytes; 7999 if (min_free_kbytes < 128) 8000 min_free_kbytes = 128; 8001 if (min_free_kbytes > 262144) 8002 min_free_kbytes = 262144; 8003 } else { 8004 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 8005 new_min_free_kbytes, user_min_free_kbytes); 8006 } 8007 setup_per_zone_wmarks(); 8008 refresh_zone_stat_thresholds(); 8009 setup_per_zone_lowmem_reserve(); 8010 8011 #ifdef CONFIG_NUMA 8012 setup_min_unmapped_ratio(); 8013 setup_min_slab_ratio(); 8014 #endif 8015 8016 khugepaged_min_free_kbytes_update(); 8017 8018 return 0; 8019 } 8020 postcore_initcall(init_per_zone_wmark_min) 8021 8022 /* 8023 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 8024 * that we can call two helper functions whenever min_free_kbytes 8025 * changes. 8026 */ 8027 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, 8028 void *buffer, size_t *length, loff_t *ppos) 8029 { 8030 int rc; 8031 8032 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8033 if (rc) 8034 return rc; 8035 8036 if (write) { 8037 user_min_free_kbytes = min_free_kbytes; 8038 setup_per_zone_wmarks(); 8039 } 8040 return 0; 8041 } 8042 8043 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, 8044 void *buffer, size_t *length, loff_t *ppos) 8045 { 8046 int rc; 8047 8048 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8049 if (rc) 8050 return rc; 8051 8052 if (write) 8053 setup_per_zone_wmarks(); 8054 8055 return 0; 8056 } 8057 8058 #ifdef CONFIG_NUMA 8059 static void setup_min_unmapped_ratio(void) 8060 { 8061 pg_data_t *pgdat; 8062 struct zone *zone; 8063 8064 for_each_online_pgdat(pgdat) 8065 pgdat->min_unmapped_pages = 0; 8066 8067 for_each_zone(zone) 8068 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * 8069 sysctl_min_unmapped_ratio) / 100; 8070 } 8071 8072 8073 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, 8074 void *buffer, size_t *length, loff_t *ppos) 8075 { 8076 int rc; 8077 8078 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8079 if (rc) 8080 return rc; 8081 8082 setup_min_unmapped_ratio(); 8083 8084 return 0; 8085 } 8086 8087 static void setup_min_slab_ratio(void) 8088 { 8089 pg_data_t *pgdat; 8090 struct zone *zone; 8091 8092 for_each_online_pgdat(pgdat) 8093 pgdat->min_slab_pages = 0; 8094 8095 for_each_zone(zone) 8096 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * 8097 sysctl_min_slab_ratio) / 100; 8098 } 8099 8100 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 8101 void *buffer, size_t *length, loff_t *ppos) 8102 { 8103 int rc; 8104 8105 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8106 if (rc) 8107 return rc; 8108 8109 
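	/*
	 * Recompute each pgdat's min_slab_pages from the (possibly just
	 * updated) sysctl_min_slab_ratio; see setup_min_slab_ratio() above.
	 */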
setup_min_slab_ratio(); 8110 8111 return 0; 8112 } 8113 #endif 8114 8115 /* 8116 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 8117 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 8118 * whenever sysctl_lowmem_reserve_ratio changes. 8119 * 8120 * The reserve ratio obviously has absolutely no relation with the 8121 * minimum watermarks. The lowmem reserve ratio can only make sense 8122 * if in function of the boot time zone sizes. 8123 */ 8124 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, 8125 void *buffer, size_t *length, loff_t *ppos) 8126 { 8127 int i; 8128 8129 proc_dointvec_minmax(table, write, buffer, length, ppos); 8130 8131 for (i = 0; i < MAX_NR_ZONES; i++) { 8132 if (sysctl_lowmem_reserve_ratio[i] < 1) 8133 sysctl_lowmem_reserve_ratio[i] = 0; 8134 } 8135 8136 setup_per_zone_lowmem_reserve(); 8137 return 0; 8138 } 8139 8140 /* 8141 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 8142 * cpu. It is the fraction of total pages in each zone that a hot per cpu 8143 * pagelist can have before it gets flushed back to buddy allocator. 8144 */ 8145 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, 8146 void *buffer, size_t *length, loff_t *ppos) 8147 { 8148 struct zone *zone; 8149 int old_percpu_pagelist_fraction; 8150 int ret; 8151 8152 mutex_lock(&pcp_batch_high_lock); 8153 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 8154 8155 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 8156 if (!write || ret < 0) 8157 goto out; 8158 8159 /* Sanity checking to avoid pcp imbalance */ 8160 if (percpu_pagelist_fraction && 8161 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 8162 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 8163 ret = -EINVAL; 8164 goto out; 8165 } 8166 8167 /* No change? */ 8168 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 8169 goto out; 8170 8171 for_each_populated_zone(zone) 8172 zone_set_pageset_high_and_batch(zone); 8173 out: 8174 mutex_unlock(&pcp_batch_high_lock); 8175 return ret; 8176 } 8177 8178 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES 8179 /* 8180 * Returns the number of pages that arch has reserved but 8181 * is not known to alloc_large_system_hash(). 8182 */ 8183 static unsigned long __init arch_reserved_kernel_pages(void) 8184 { 8185 return 0; 8186 } 8187 #endif 8188 8189 /* 8190 * Adaptive scale is meant to reduce sizes of hash tables on large memory 8191 * machines. As memory size is increased the scale is also increased but at 8192 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory 8193 * quadruples the scale is increased by one, which means the size of hash table 8194 * only doubles, instead of quadrupling as well. 8195 * Because 32-bit systems cannot have large physical memory, where this scaling 8196 * makes sense, it is disabled on such platforms. 
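 *
 * For illustration: growing from the 64G base to 256G of memory bumps the
 * scale by one, so a hash table that would otherwise have grown 4x only
 * grows 2x; at 1T the scale is bumped twice, and so on.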
8197 */ 8198 #if __BITS_PER_LONG > 32 8199 #define ADAPT_SCALE_BASE (64ul << 30) 8200 #define ADAPT_SCALE_SHIFT 2 8201 #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) 8202 #endif 8203 8204 /* 8205 * allocate a large system hash table from bootmem 8206 * - it is assumed that the hash table must contain an exact power-of-2 8207 * quantity of entries 8208 * - limit is the number of hash buckets, not the total allocation size 8209 */ 8210 void *__init alloc_large_system_hash(const char *tablename, 8211 unsigned long bucketsize, 8212 unsigned long numentries, 8213 int scale, 8214 int flags, 8215 unsigned int *_hash_shift, 8216 unsigned int *_hash_mask, 8217 unsigned long low_limit, 8218 unsigned long high_limit) 8219 { 8220 unsigned long long max = high_limit; 8221 unsigned long log2qty, size; 8222 void *table = NULL; 8223 gfp_t gfp_flags; 8224 bool virt; 8225 8226 /* allow the kernel cmdline to have a say */ 8227 if (!numentries) { 8228 /* round applicable memory size up to nearest megabyte */ 8229 numentries = nr_kernel_pages; 8230 numentries -= arch_reserved_kernel_pages(); 8231 8232 /* It isn't necessary when PAGE_SIZE >= 1MB */ 8233 if (PAGE_SHIFT < 20) 8234 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 8235 8236 #if __BITS_PER_LONG > 32 8237 if (!high_limit) { 8238 unsigned long adapt; 8239 8240 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; 8241 adapt <<= ADAPT_SCALE_SHIFT) 8242 scale++; 8243 } 8244 #endif 8245 8246 /* limit to 1 bucket per 2^scale bytes of low memory */ 8247 if (scale > PAGE_SHIFT) 8248 numentries >>= (scale - PAGE_SHIFT); 8249 else 8250 numentries <<= (PAGE_SHIFT - scale); 8251 8252 /* Make sure we've got at least a 0-order allocation.. */ 8253 if (unlikely(flags & HASH_SMALL)) { 8254 /* Makes no sense without HASH_EARLY */ 8255 WARN_ON(!(flags & HASH_EARLY)); 8256 if (!(numentries >> *_hash_shift)) { 8257 numentries = 1UL << *_hash_shift; 8258 BUG_ON(!numentries); 8259 } 8260 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 8261 numentries = PAGE_SIZE / bucketsize; 8262 } 8263 numentries = roundup_pow_of_two(numentries); 8264 8265 /* limit allocation size to 1/16 total memory by default */ 8266 if (max == 0) { 8267 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 8268 do_div(max, bucketsize); 8269 } 8270 max = min(max, 0x80000000ULL); 8271 8272 if (numentries < low_limit) 8273 numentries = low_limit; 8274 if (numentries > max) 8275 numentries = max; 8276 8277 log2qty = ilog2(numentries); 8278 8279 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 8280 do { 8281 virt = false; 8282 size = bucketsize << log2qty; 8283 if (flags & HASH_EARLY) { 8284 if (flags & HASH_ZERO) 8285 table = memblock_alloc(size, SMP_CACHE_BYTES); 8286 else 8287 table = memblock_alloc_raw(size, 8288 SMP_CACHE_BYTES); 8289 } else if (get_order(size) >= MAX_ORDER || hashdist) { 8290 table = __vmalloc(size, gfp_flags); 8291 virt = true; 8292 } else { 8293 /* 8294 * If bucketsize is not a power-of-two, we may free 8295 * some pages at the end of hash table which 8296 * alloc_pages_exact() automatically does 8297 */ 8298 table = alloc_pages_exact(size, gfp_flags); 8299 kmemleak_alloc(table, size, 1, gfp_flags); 8300 } 8301 } while (!table && size > PAGE_SIZE && --log2qty); 8302 8303 if (!table) 8304 panic("Failed to allocate %s hash table\n", tablename); 8305 8306 pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", 8307 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, 8308 virt ? 
"vmalloc" : "linear"); 8309 8310 if (_hash_shift) 8311 *_hash_shift = log2qty; 8312 if (_hash_mask) 8313 *_hash_mask = (1 << log2qty) - 1; 8314 8315 return table; 8316 } 8317 8318 /* 8319 * This function checks whether pageblock includes unmovable pages or not. 8320 * 8321 * PageLRU check without isolation or lru_lock could race so that 8322 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable 8323 * check without lock_page also may miss some movable non-lru pages at 8324 * race condition. So you can't expect this function should be exact. 8325 * 8326 * Returns a page without holding a reference. If the caller wants to 8327 * dereference that page (e.g., dumping), it has to make sure that it 8328 * cannot get removed (e.g., via memory unplug) concurrently. 8329 * 8330 */ 8331 struct page *has_unmovable_pages(struct zone *zone, struct page *page, 8332 int migratetype, int flags) 8333 { 8334 unsigned long iter = 0; 8335 unsigned long pfn = page_to_pfn(page); 8336 unsigned long offset = pfn % pageblock_nr_pages; 8337 8338 if (is_migrate_cma_page(page)) { 8339 /* 8340 * CMA allocations (alloc_contig_range) really need to mark 8341 * isolate CMA pageblocks even when they are not movable in fact 8342 * so consider them movable here. 8343 */ 8344 if (is_migrate_cma(migratetype)) 8345 return NULL; 8346 8347 return page; 8348 } 8349 8350 for (; iter < pageblock_nr_pages - offset; iter++) { 8351 if (!pfn_valid_within(pfn + iter)) 8352 continue; 8353 8354 page = pfn_to_page(pfn + iter); 8355 8356 /* 8357 * Both, bootmem allocations and memory holes are marked 8358 * PG_reserved and are unmovable. We can even have unmovable 8359 * allocations inside ZONE_MOVABLE, for example when 8360 * specifying "movablecore". 8361 */ 8362 if (PageReserved(page)) 8363 return page; 8364 8365 /* 8366 * If the zone is movable and we have ruled out all reserved 8367 * pages then it should be reasonably safe to assume the rest 8368 * is movable. 8369 */ 8370 if (zone_idx(zone) == ZONE_MOVABLE) 8371 continue; 8372 8373 /* 8374 * Hugepages are not in LRU lists, but they're movable. 8375 * THPs are on the LRU, but need to be counted as #small pages. 8376 * We need not scan over tail pages because we don't 8377 * handle each tail page individually in migration. 8378 */ 8379 if (PageHuge(page) || PageTransCompound(page)) { 8380 struct page *head = compound_head(page); 8381 unsigned int skip_pages; 8382 8383 if (PageHuge(page)) { 8384 if (!hugepage_migration_supported(page_hstate(head))) 8385 return page; 8386 } else if (!PageLRU(head) && !__PageMovable(head)) { 8387 return page; 8388 } 8389 8390 skip_pages = compound_nr(head) - (page - head); 8391 iter += skip_pages - 1; 8392 continue; 8393 } 8394 8395 /* 8396 * We can't use page_count without pin a page 8397 * because another CPU can free compound page. 8398 * This check already skips compound tails of THP 8399 * because their page->_refcount is zero at all time. 8400 */ 8401 if (!page_ref_count(page)) { 8402 if (PageBuddy(page)) 8403 iter += (1 << buddy_order(page)) - 1; 8404 continue; 8405 } 8406 8407 /* 8408 * The HWPoisoned page may be not in buddy system, and 8409 * page_count() is not 0. 8410 */ 8411 if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) 8412 continue; 8413 8414 /* 8415 * We treat all PageOffline() pages as movable when offlining 8416 * to give drivers a chance to decrement their reference count 8417 * in MEM_GOING_OFFLINE in order to indicate that these pages 8418 * can be offlined as there are no direct references anymore. 
8419 		 * For actually unmovable PageOffline() pages, where the driver
8420 		 * does not support this, we will fail later when trying to
8421 		 * actually move these pages that still have a reference
8422 		 * count > 0. (false negatives in this function only)
8423 		 */
8424 		if ((flags & MEMORY_OFFLINE) && PageOffline(page))
8425 			continue;
8426
8427 		if (__PageMovable(page) || PageLRU(page))
8428 			continue;
8429
8430 		/*
8431 		 * If there are RECLAIMABLE pages, we need to check them.
8432 		 * But for now, memory offlining itself doesn't call
8433 		 * shrink_node_slabs(), so this still needs to be fixed.
8434 		 */
8435 		return page;
8436 	}
8437 	return NULL;
8438 }
8439
8440 #ifdef CONFIG_CONTIG_ALLOC
8441 static unsigned long pfn_max_align_down(unsigned long pfn)
8442 {
8443 	return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
8444 			     pageblock_nr_pages) - 1);
8445 }
8446
8447 static unsigned long pfn_max_align_up(unsigned long pfn)
8448 {
8449 	return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
8450 				pageblock_nr_pages));
8451 }
8452
8453 /* [start, end) must belong to a single zone. */
8454 static int __alloc_contig_migrate_range(struct compact_control *cc,
8455 					unsigned long start, unsigned long end)
8456 {
8457 	/* This function is based on compact_zone() from compaction.c. */
8458 	unsigned int nr_reclaimed;
8459 	unsigned long pfn = start;
8460 	unsigned int tries = 0;
8461 	int ret = 0;
8462 	struct migration_target_control mtc = {
8463 		.nid = zone_to_nid(cc->zone),
8464 		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
8465 	};
8466
8467 	migrate_prep();
8468
8469 	while (pfn < end || !list_empty(&cc->migratepages)) {
8470 		if (fatal_signal_pending(current)) {
8471 			ret = -EINTR;
8472 			break;
8473 		}
8474
8475 		if (list_empty(&cc->migratepages)) {
8476 			cc->nr_migratepages = 0;
8477 			pfn = isolate_migratepages_range(cc, pfn, end);
8478 			if (!pfn) {
8479 				ret = -EINTR;
8480 				break;
8481 			}
8482 			tries = 0;
8483 		} else if (++tries == 5) {
8484 			ret = ret < 0 ? ret : -EBUSY;
8485 			break;
8486 		}
8487
8488 		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
8489 							&cc->migratepages);
8490 		cc->nr_migratepages -= nr_reclaimed;
8491
8492 		ret = migrate_pages(&cc->migratepages, alloc_migration_target,
8493 				NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
8494 	}
8495 	if (ret < 0) {
8496 		putback_movable_pages(&cc->migratepages);
8497 		return ret;
8498 	}
8499 	return 0;
8500 }
8501
8502 /**
8503  * alloc_contig_range() -- tries to allocate a given range of pages
8504  * @start:	start PFN to allocate
8505  * @end:	one-past-the-last PFN to allocate
8506  * @migratetype:	migratetype of the underlying pageblocks (either
8507  *			#MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
8508  *			in the range must have the same migratetype and it
8509  *			must be either of the two.
8510  * @gfp_mask:	GFP mask to use during compaction
8511  *
8512  * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
8513  * aligned. The PFN range must belong to a single zone.
8514  *
8515  * The first thing this routine does is attempt to MIGRATE_ISOLATE all
8516  * pageblocks in the range. Once isolated, the pageblocks should not
8517  * be modified by others.
8518  *
8519  * Return: zero on success or a negative error code. On success all
8520  * pages whose PFN is in [start, end) are allocated for the caller and
8521  * need to be freed with free_contig_range().
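 *
 * Illustrative usage sketch (hypothetical values, not taken from an
 * in-tree caller; real users such as CMA do additional setup and checks):
 *
 *	With pfn being the first PFN of a suitable MIGRATE_MOVABLE range:
 *
 *	ret = alloc_contig_range(pfn, pfn + 512, MIGRATE_MOVABLE, GFP_KERNEL);
 *	if (!ret) {
 *		... use pfn_to_page(pfn) ...
 *		free_contig_range(pfn, 512);
 *	}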
8522  */
8523 int alloc_contig_range(unsigned long start, unsigned long end,
8524 		       unsigned migratetype, gfp_t gfp_mask)
8525 {
8526 	unsigned long outer_start, outer_end;
8527 	unsigned int order;
8528 	int ret = 0;
8529
8530 	struct compact_control cc = {
8531 		.nr_migratepages = 0,
8532 		.order = -1,
8533 		.zone = page_zone(pfn_to_page(start)),
8534 		.mode = MIGRATE_SYNC,
8535 		.ignore_skip_hint = true,
8536 		.no_set_skip_hint = true,
8537 		.gfp_mask = current_gfp_context(gfp_mask),
8538 		.alloc_contig = true,
8539 	};
8540 	INIT_LIST_HEAD(&cc.migratepages);
8541
8542 	/*
8543 	 * What we do here is mark all pageblocks in the range as
8544 	 * MIGRATE_ISOLATE. Because pageblock and max order pages may
8545 	 * have different sizes, and due to the way the page allocator
8546 	 * works, we align the range to the biggest of the two so
8547 	 * that the page allocator won't try to merge buddies from
8548 	 * different pageblocks and change MIGRATE_ISOLATE to some
8549 	 * other migration type.
8550 	 *
8551 	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
8552 	 * migrate the pages from the unaligned range (i.e. the pages
8553 	 * we are interested in). This will put all the pages in the
8554 	 * range back to the page allocator as MIGRATE_ISOLATE.
8555 	 *
8556 	 * When this is done, we take the pages in the range from the
8557 	 * page allocator, removing them from the buddy system. This
8558 	 * way the page allocator will never consider using them.
8559 	 *
8560 	 * This lets us mark the pageblocks back as
8561 	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
8562 	 * aligned range but not in the unaligned, original range are
8563 	 * put back to the page allocator so that buddy can use them.
8564 	 */
8565
8566 	ret = start_isolate_page_range(pfn_max_align_down(start),
8567 				       pfn_max_align_up(end), migratetype, 0);
8568 	if (ret)
8569 		return ret;
8570
8571 	drain_all_pages(cc.zone);
8572
8573 	/*
8574 	 * In case of -EBUSY, we'd like to know which page causes the
8575 	 * problem, so just fall through. test_pages_isolated() has a
8576 	 * tracepoint which will report the busy page.
8577 	 *
8578 	 * It is possible that busy pages could become available before
8579 	 * the call to test_pages_isolated(), and the range will actually
8580 	 * be allocated. So, if we fall through, be sure to clear ret so
8581 	 * that -EBUSY is not accidentally used or returned to the caller.
8582 	 */
8583 	ret = __alloc_contig_migrate_range(&cc, start, end);
8584 	if (ret && ret != -EBUSY)
8585 		goto done;
8586 	ret = 0;
8587
8588 	/*
8589 	 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
8590 	 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
8591 	 * more, all pages in [start, end) are free in the page allocator.
8592 	 * What we are going to do is to allocate all pages from
8593 	 * [start, end) (that is, remove them from the page allocator).
8594 	 *
8595 	 * The only problem is that pages at the beginning and at the
8596 	 * end of the range of interest may not be aligned with pages
8597 	 * that the page allocator holds, i.e. they can be part of higher
8598 	 * order pages. Because of this, we reserve the bigger range and
8599 	 * once this is done free the pages we are not interested in.
8600 	 *
8601 	 * We don't have to hold zone->lock here because the pages are
8602 	 * isolated and thus won't get removed from the buddy system.
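	 *
	 * Worked example (hypothetical PFNs): if start == 0x1003 and that
	 * page sits inside a free order-4 buddy whose head page is at
	 * 0x1000, the search below walks outer_start down to 0x1000, the
	 * whole buddy block is taken, and the three leading pages are
	 * handed back afterwards via
	 * free_contig_range(outer_start, start - outer_start).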
8603 */ 8604 8605 lru_add_drain_all(); 8606 8607 order = 0; 8608 outer_start = start; 8609 while (!PageBuddy(pfn_to_page(outer_start))) { 8610 if (++order >= MAX_ORDER) { 8611 outer_start = start; 8612 break; 8613 } 8614 outer_start &= ~0UL << order; 8615 } 8616 8617 if (outer_start != start) { 8618 order = buddy_order(pfn_to_page(outer_start)); 8619 8620 /* 8621 * outer_start page could be small order buddy page and 8622 * it doesn't include start page. Adjust outer_start 8623 * in this case to report failed page properly 8624 * on tracepoint in test_pages_isolated() 8625 */ 8626 if (outer_start + (1UL << order) <= start) 8627 outer_start = start; 8628 } 8629 8630 /* Make sure the range is really isolated. */ 8631 if (test_pages_isolated(outer_start, end, 0)) { 8632 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", 8633 __func__, outer_start, end); 8634 ret = -EBUSY; 8635 goto done; 8636 } 8637 8638 /* Grab isolated pages from freelists. */ 8639 outer_end = isolate_freepages_range(&cc, outer_start, end); 8640 if (!outer_end) { 8641 ret = -EBUSY; 8642 goto done; 8643 } 8644 8645 /* Free head and tail (if any) */ 8646 if (start != outer_start) 8647 free_contig_range(outer_start, start - outer_start); 8648 if (end != outer_end) 8649 free_contig_range(end, outer_end - end); 8650 8651 done: 8652 undo_isolate_page_range(pfn_max_align_down(start), 8653 pfn_max_align_up(end), migratetype); 8654 return ret; 8655 } 8656 EXPORT_SYMBOL(alloc_contig_range); 8657 8658 static int __alloc_contig_pages(unsigned long start_pfn, 8659 unsigned long nr_pages, gfp_t gfp_mask) 8660 { 8661 unsigned long end_pfn = start_pfn + nr_pages; 8662 8663 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 8664 gfp_mask); 8665 } 8666 8667 static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, 8668 unsigned long nr_pages) 8669 { 8670 unsigned long i, end_pfn = start_pfn + nr_pages; 8671 struct page *page; 8672 8673 for (i = start_pfn; i < end_pfn; i++) { 8674 page = pfn_to_online_page(i); 8675 if (!page) 8676 return false; 8677 8678 if (page_zone(page) != z) 8679 return false; 8680 8681 if (PageReserved(page)) 8682 return false; 8683 8684 if (page_count(page) > 0) 8685 return false; 8686 8687 if (PageHuge(page)) 8688 return false; 8689 } 8690 return true; 8691 } 8692 8693 static bool zone_spans_last_pfn(const struct zone *zone, 8694 unsigned long start_pfn, unsigned long nr_pages) 8695 { 8696 unsigned long last_pfn = start_pfn + nr_pages - 1; 8697 8698 return zone_spans_pfn(zone, last_pfn); 8699 } 8700 8701 /** 8702 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages 8703 * @nr_pages: Number of contiguous pages to allocate 8704 * @gfp_mask: GFP mask to limit search and used during compaction 8705 * @nid: Target node 8706 * @nodemask: Mask for other possible nodes 8707 * 8708 * This routine is a wrapper around alloc_contig_range(). It scans over zones 8709 * on an applicable zonelist to find a contiguous pfn range which can then be 8710 * tried for allocation with alloc_contig_range(). This routine is intended 8711 * for allocation requests which can not be fulfilled with the buddy allocator. 8712 * 8713 * The allocated memory is always aligned to a page boundary. If nr_pages is a 8714 * power of two then the alignment is guaranteed to be to the given nr_pages 8715 * (e.g. 1GB request would be aligned to 1GB). 8716 * 8717 * Allocated pages can be freed with free_contig_range() or by manually calling 8718 * __free_page() on each allocated page. 
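 *
 * Illustrative usage sketch (hypothetical sizes, assumes CONFIG_CONTIG_ALLOC):
 *
 *	262144 pages are 1 GiB with 4 KiB pages:
 *
 *	struct page *pages = alloc_contig_pages(262144, GFP_KERNEL,
 *						numa_node_id(), NULL);
 *	if (pages)
 *		free_contig_range(page_to_pfn(pages), 262144);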
8719 * 8720 * Return: pointer to contiguous pages on success, or NULL if not successful. 8721 */ 8722 struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, 8723 int nid, nodemask_t *nodemask) 8724 { 8725 unsigned long ret, pfn, flags; 8726 struct zonelist *zonelist; 8727 struct zone *zone; 8728 struct zoneref *z; 8729 8730 zonelist = node_zonelist(nid, gfp_mask); 8731 for_each_zone_zonelist_nodemask(zone, z, zonelist, 8732 gfp_zone(gfp_mask), nodemask) { 8733 spin_lock_irqsave(&zone->lock, flags); 8734 8735 pfn = ALIGN(zone->zone_start_pfn, nr_pages); 8736 while (zone_spans_last_pfn(zone, pfn, nr_pages)) { 8737 if (pfn_range_valid_contig(zone, pfn, nr_pages)) { 8738 /* 8739 * We release the zone lock here because 8740 * alloc_contig_range() will also lock the zone 8741 * at some point. If there's an allocation 8742 * spinning on this lock, it may win the race 8743 * and cause alloc_contig_range() to fail... 8744 */ 8745 spin_unlock_irqrestore(&zone->lock, flags); 8746 ret = __alloc_contig_pages(pfn, nr_pages, 8747 gfp_mask); 8748 if (!ret) 8749 return pfn_to_page(pfn); 8750 spin_lock_irqsave(&zone->lock, flags); 8751 } 8752 pfn += nr_pages; 8753 } 8754 spin_unlock_irqrestore(&zone->lock, flags); 8755 } 8756 return NULL; 8757 } 8758 #endif /* CONFIG_CONTIG_ALLOC */ 8759 8760 void free_contig_range(unsigned long pfn, unsigned int nr_pages) 8761 { 8762 unsigned int count = 0; 8763 8764 for (; nr_pages--; pfn++) { 8765 struct page *page = pfn_to_page(pfn); 8766 8767 count += page_count(page) != 1; 8768 __free_page(page); 8769 } 8770 WARN(count != 0, "%d pages are still in use!\n", count); 8771 } 8772 EXPORT_SYMBOL(free_contig_range); 8773 8774 /* 8775 * The zone indicated has a new number of managed_pages; batch sizes and percpu 8776 * page high values need to be recalulated. 8777 */ 8778 void __meminit zone_pcp_update(struct zone *zone) 8779 { 8780 mutex_lock(&pcp_batch_high_lock); 8781 zone_set_pageset_high_and_batch(zone); 8782 mutex_unlock(&pcp_batch_high_lock); 8783 } 8784 8785 /* 8786 * Effectively disable pcplists for the zone by setting the high limit to 0 8787 * and draining all cpus. A concurrent page freeing on another CPU that's about 8788 * to put the page on pcplist will either finish before the drain and the page 8789 * will be drained, or observe the new high limit and skip the pcplist. 8790 * 8791 * Must be paired with a call to zone_pcp_enable(). 8792 */ 8793 void zone_pcp_disable(struct zone *zone) 8794 { 8795 mutex_lock(&pcp_batch_high_lock); 8796 __zone_set_pageset_high_and_batch(zone, 0, 1); 8797 __drain_all_pages(zone, true); 8798 } 8799 8800 void zone_pcp_enable(struct zone *zone) 8801 { 8802 __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch); 8803 mutex_unlock(&pcp_batch_high_lock); 8804 } 8805 8806 void zone_pcp_reset(struct zone *zone) 8807 { 8808 unsigned long flags; 8809 int cpu; 8810 struct per_cpu_pageset *pset; 8811 8812 /* avoid races with drain_pages() */ 8813 local_irq_save(flags); 8814 if (zone->pageset != &boot_pageset) { 8815 for_each_online_cpu(cpu) { 8816 pset = per_cpu_ptr(zone->pageset, cpu); 8817 drain_zonestat(zone, pset); 8818 } 8819 free_percpu(zone->pageset); 8820 zone->pageset = &boot_pageset; 8821 } 8822 local_irq_restore(flags); 8823 } 8824 8825 #ifdef CONFIG_MEMORY_HOTREMOVE 8826 /* 8827 * All pages in the range must be in a single zone, must not contain holes, 8828 * must span full sections, and must be isolated before calling this function. 
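 *
 * (In practice this is invoked from the memory-offlining path once the
 * range has been isolated and emptied of allocated pages.)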
8829 */ 8830 void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 8831 { 8832 unsigned long pfn = start_pfn; 8833 struct page *page; 8834 struct zone *zone; 8835 unsigned int order; 8836 unsigned long flags; 8837 8838 offline_mem_sections(pfn, end_pfn); 8839 zone = page_zone(pfn_to_page(pfn)); 8840 spin_lock_irqsave(&zone->lock, flags); 8841 while (pfn < end_pfn) { 8842 page = pfn_to_page(pfn); 8843 /* 8844 * The HWPoisoned page may be not in buddy system, and 8845 * page_count() is not 0. 8846 */ 8847 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 8848 pfn++; 8849 continue; 8850 } 8851 /* 8852 * At this point all remaining PageOffline() pages have a 8853 * reference count of 0 and can simply be skipped. 8854 */ 8855 if (PageOffline(page)) { 8856 BUG_ON(page_count(page)); 8857 BUG_ON(PageBuddy(page)); 8858 pfn++; 8859 continue; 8860 } 8861 8862 BUG_ON(page_count(page)); 8863 BUG_ON(!PageBuddy(page)); 8864 order = buddy_order(page); 8865 del_page_from_free_list(page, zone, order); 8866 pfn += (1 << order); 8867 } 8868 spin_unlock_irqrestore(&zone->lock, flags); 8869 } 8870 #endif 8871 8872 bool is_free_buddy_page(struct page *page) 8873 { 8874 struct zone *zone = page_zone(page); 8875 unsigned long pfn = page_to_pfn(page); 8876 unsigned long flags; 8877 unsigned int order; 8878 8879 spin_lock_irqsave(&zone->lock, flags); 8880 for (order = 0; order < MAX_ORDER; order++) { 8881 struct page *page_head = page - (pfn & ((1 << order) - 1)); 8882 8883 if (PageBuddy(page_head) && buddy_order(page_head) >= order) 8884 break; 8885 } 8886 spin_unlock_irqrestore(&zone->lock, flags); 8887 8888 return order < MAX_ORDER; 8889 } 8890 8891 #ifdef CONFIG_MEMORY_FAILURE 8892 /* 8893 * Break down a higher-order page in sub-pages, and keep our target out of 8894 * buddy allocator. 8895 */ 8896 static void break_down_buddy_pages(struct zone *zone, struct page *page, 8897 struct page *target, int low, int high, 8898 int migratetype) 8899 { 8900 unsigned long size = 1 << high; 8901 struct page *current_buddy, *next_page; 8902 8903 while (high > low) { 8904 high--; 8905 size >>= 1; 8906 8907 if (target >= &page[size]) { 8908 next_page = page + size; 8909 current_buddy = page; 8910 } else { 8911 next_page = page; 8912 current_buddy = page + size; 8913 } 8914 8915 if (set_page_guard(zone, current_buddy, high, migratetype)) 8916 continue; 8917 8918 if (current_buddy != target) { 8919 add_to_free_list(current_buddy, zone, high, migratetype); 8920 set_buddy_order(current_buddy, high); 8921 page = next_page; 8922 } 8923 } 8924 } 8925 8926 /* 8927 * Take a page that will be marked as poisoned off the buddy allocator. 
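 *
 * For example (hypothetical): if the target page lies inside a free
 * order-3 block, break_down_buddy_pages() above splits that block into an
 * order-2, an order-1 and an order-0 buddy, returns the pieces that do not
 * contain the target to the free lists, and leaves the target page itself
 * out of the allocator.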
8928 */ 8929 bool take_page_off_buddy(struct page *page) 8930 { 8931 struct zone *zone = page_zone(page); 8932 unsigned long pfn = page_to_pfn(page); 8933 unsigned long flags; 8934 unsigned int order; 8935 bool ret = false; 8936 8937 spin_lock_irqsave(&zone->lock, flags); 8938 for (order = 0; order < MAX_ORDER; order++) { 8939 struct page *page_head = page - (pfn & ((1 << order) - 1)); 8940 int page_order = buddy_order(page_head); 8941 8942 if (PageBuddy(page_head) && page_order >= order) { 8943 unsigned long pfn_head = page_to_pfn(page_head); 8944 int migratetype = get_pfnblock_migratetype(page_head, 8945 pfn_head); 8946 8947 del_page_from_free_list(page_head, zone, page_order); 8948 break_down_buddy_pages(zone, page_head, page, 0, 8949 page_order, migratetype); 8950 ret = true; 8951 break; 8952 } 8953 if (page_count(page_head) > 0) 8954 break; 8955 } 8956 spin_unlock_irqrestore(&zone->lock, flags); 8957 return ret; 8958 } 8959 #endif 8960