1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/mm/page_alloc.c 4 * 5 * Manages the free list, the system allocates free pages here. 6 * Note that kmalloc() lives in slab.c 7 * 8 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 9 * Swap reorganised 29.12.95, Stephen Tweedie 10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 11 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 12 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 13 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 14 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 15 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 16 */ 17 18 #include <linux/stddef.h> 19 #include <linux/mm.h> 20 #include <linux/highmem.h> 21 #include <linux/swap.h> 22 #include <linux/swapops.h> 23 #include <linux/interrupt.h> 24 #include <linux/pagemap.h> 25 #include <linux/jiffies.h> 26 #include <linux/memblock.h> 27 #include <linux/compiler.h> 28 #include <linux/kernel.h> 29 #include <linux/kasan.h> 30 #include <linux/module.h> 31 #include <linux/suspend.h> 32 #include <linux/pagevec.h> 33 #include <linux/blkdev.h> 34 #include <linux/slab.h> 35 #include <linux/ratelimit.h> 36 #include <linux/oom.h> 37 #include <linux/topology.h> 38 #include <linux/sysctl.h> 39 #include <linux/cpu.h> 40 #include <linux/cpuset.h> 41 #include <linux/memory_hotplug.h> 42 #include <linux/nodemask.h> 43 #include <linux/vmalloc.h> 44 #include <linux/vmstat.h> 45 #include <linux/mempolicy.h> 46 #include <linux/memremap.h> 47 #include <linux/stop_machine.h> 48 #include <linux/random.h> 49 #include <linux/sort.h> 50 #include <linux/pfn.h> 51 #include <linux/backing-dev.h> 52 #include <linux/fault-inject.h> 53 #include <linux/page-isolation.h> 54 #include <linux/debugobjects.h> 55 #include <linux/kmemleak.h> 56 #include <linux/compaction.h> 57 #include <trace/events/kmem.h> 58 #include <trace/events/oom.h> 59 #include <linux/prefetch.h> 60 #include <linux/mm_inline.h> 61 #include <linux/mmu_notifier.h> 62 #include <linux/migrate.h> 63 #include <linux/hugetlb.h> 64 #include <linux/sched/rt.h> 65 #include <linux/sched/mm.h> 66 #include <linux/page_owner.h> 67 #include <linux/page_table_check.h> 68 #include <linux/kthread.h> 69 #include <linux/memcontrol.h> 70 #include <linux/ftrace.h> 71 #include <linux/lockdep.h> 72 #include <linux/nmi.h> 73 #include <linux/psi.h> 74 #include <linux/padata.h> 75 #include <linux/khugepaged.h> 76 #include <linux/buffer_head.h> 77 #include <linux/delayacct.h> 78 #include <asm/sections.h> 79 #include <asm/tlbflush.h> 80 #include <asm/div64.h> 81 #include "internal.h" 82 #include "shuffle.h" 83 #include "page_reporting.h" 84 #include "swap.h" 85 86 /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ 87 typedef int __bitwise fpi_t; 88 89 /* No special request */ 90 #define FPI_NONE ((__force fpi_t)0) 91 92 /* 93 * Skip free page reporting notification for the (possibly merged) page. 94 * This does not hinder free page reporting from grabbing the page, 95 * reporting it and marking it "reported" - it only skips notifying 96 * the free page reporting infrastructure about a newly freed page. For 97 * example, used when temporarily pulling a page from a freelist and 98 * putting it back unmodified. 99 */ 100 #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) 101 102 /* 103 * Place the (possibly merged) page to the tail of the freelist. 
Will ignore 104 * page shuffling (relevant code - e.g., memory onlining - is expected to 105 * shuffle the whole zone). 106 * 107 * Note: No code should rely on this flag for correctness - it's purely 108 * to allow for optimizations when handing back either fresh pages 109 * (memory onlining) or untouched pages (page isolation, free page 110 * reporting). 111 */ 112 #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) 113 114 /* 115 * Don't poison memory with KASAN (only for the tag-based modes). 116 * During boot, all non-reserved memblock memory is exposed to page_alloc. 117 * Poisoning all that memory lengthens boot time, especially on systems with 118 * large amount of RAM. This flag is used to skip that poisoning. 119 * This is only done for the tag-based KASAN modes, as those are able to 120 * detect memory corruptions with the memory tags assigned by default. 121 * All memory allocated normally after boot gets poisoned as usual. 122 */ 123 #define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) 124 125 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 126 static DEFINE_MUTEX(pcp_batch_high_lock); 127 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) 128 129 struct pagesets { 130 local_lock_t lock; 131 }; 132 static DEFINE_PER_CPU(struct pagesets, pagesets) = { 133 .lock = INIT_LOCAL_LOCK(lock), 134 }; 135 136 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 137 DEFINE_PER_CPU(int, numa_node); 138 EXPORT_PER_CPU_SYMBOL(numa_node); 139 #endif 140 141 DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); 142 143 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 144 /* 145 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 146 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 147 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 148 * defined in <linux/topology.h>. 149 */ 150 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 151 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 152 #endif 153 154 /* work_structs for global per-cpu drains */ 155 struct pcpu_drain { 156 struct zone *zone; 157 struct work_struct work; 158 }; 159 static DEFINE_MUTEX(pcpu_drain_mutex); 160 static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); 161 162 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 163 volatile unsigned long latent_entropy __latent_entropy; 164 EXPORT_SYMBOL(latent_entropy); 165 #endif 166 167 /* 168 * Array of node states. 
169 */ 170 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 171 [N_POSSIBLE] = NODE_MASK_ALL, 172 [N_ONLINE] = { { [0] = 1UL } }, 173 #ifndef CONFIG_NUMA 174 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 175 #ifdef CONFIG_HIGHMEM 176 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 177 #endif 178 [N_MEMORY] = { { [0] = 1UL } }, 179 [N_CPU] = { { [0] = 1UL } }, 180 #endif /* NUMA */ 181 }; 182 EXPORT_SYMBOL(node_states); 183 184 atomic_long_t _totalram_pages __read_mostly; 185 EXPORT_SYMBOL(_totalram_pages); 186 unsigned long totalreserve_pages __read_mostly; 187 unsigned long totalcma_pages __read_mostly; 188 189 int percpu_pagelist_high_fraction; 190 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 191 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); 192 EXPORT_SYMBOL(init_on_alloc); 193 194 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); 195 EXPORT_SYMBOL(init_on_free); 196 197 static bool _init_on_alloc_enabled_early __read_mostly 198 = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); 199 static int __init early_init_on_alloc(char *buf) 200 { 201 202 return kstrtobool(buf, &_init_on_alloc_enabled_early); 203 } 204 early_param("init_on_alloc", early_init_on_alloc); 205 206 static bool _init_on_free_enabled_early __read_mostly 207 = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); 208 static int __init early_init_on_free(char *buf) 209 { 210 return kstrtobool(buf, &_init_on_free_enabled_early); 211 } 212 early_param("init_on_free", early_init_on_free); 213 214 /* 215 * A cached value of the page's pageblock's migratetype, used when the page is 216 * put on a pcplist. Used to avoid the pageblock migratetype lookup when 217 * freeing from pcplists in most cases, at the cost of possibly becoming stale. 218 * Also the migratetype set in the page does not necessarily match the pcplist 219 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any 220 * other index - this ensures that it will be put on the correct CMA freelist. 221 */ 222 static inline int get_pcppage_migratetype(struct page *page) 223 { 224 return page->index; 225 } 226 227 static inline void set_pcppage_migratetype(struct page *page, int migratetype) 228 { 229 page->index = migratetype; 230 } 231 232 #ifdef CONFIG_PM_SLEEP 233 /* 234 * The following functions are used by the suspend/hibernate code to temporarily 235 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 236 * while devices are suspended. To avoid races with the suspend/hibernate code, 237 * they should always be called with system_transition_mutex held 238 * (gfp_allowed_mask also should only be modified with system_transition_mutex 239 * held, unless the suspend/hibernate code is guaranteed not to run in parallel 240 * with that modification). 
241 */ 242 243 static gfp_t saved_gfp_mask; 244 245 void pm_restore_gfp_mask(void) 246 { 247 WARN_ON(!mutex_is_locked(&system_transition_mutex)); 248 if (saved_gfp_mask) { 249 gfp_allowed_mask = saved_gfp_mask; 250 saved_gfp_mask = 0; 251 } 252 } 253 254 void pm_restrict_gfp_mask(void) 255 { 256 WARN_ON(!mutex_is_locked(&system_transition_mutex)); 257 WARN_ON(saved_gfp_mask); 258 saved_gfp_mask = gfp_allowed_mask; 259 gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); 260 } 261 262 bool pm_suspended_storage(void) 263 { 264 if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) 265 return false; 266 return true; 267 } 268 #endif /* CONFIG_PM_SLEEP */ 269 270 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 271 unsigned int pageblock_order __read_mostly; 272 #endif 273 274 static void __free_pages_ok(struct page *page, unsigned int order, 275 fpi_t fpi_flags); 276 277 /* 278 * results with 256, 32 in the lowmem_reserve sysctl: 279 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 280 * 1G machine -> (16M dma, 784M normal, 224M high) 281 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 282 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 283 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 284 * 285 * TBD: should special case ZONE_DMA32 machines here - in those we normally 286 * don't need any ZONE_NORMAL reservation 287 */ 288 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { 289 #ifdef CONFIG_ZONE_DMA 290 [ZONE_DMA] = 256, 291 #endif 292 #ifdef CONFIG_ZONE_DMA32 293 [ZONE_DMA32] = 256, 294 #endif 295 [ZONE_NORMAL] = 32, 296 #ifdef CONFIG_HIGHMEM 297 [ZONE_HIGHMEM] = 0, 298 #endif 299 [ZONE_MOVABLE] = 0, 300 }; 301 302 static char * const zone_names[MAX_NR_ZONES] = { 303 #ifdef CONFIG_ZONE_DMA 304 "DMA", 305 #endif 306 #ifdef CONFIG_ZONE_DMA32 307 "DMA32", 308 #endif 309 "Normal", 310 #ifdef CONFIG_HIGHMEM 311 "HighMem", 312 #endif 313 "Movable", 314 #ifdef CONFIG_ZONE_DEVICE 315 "Device", 316 #endif 317 }; 318 319 const char * const migratetype_names[MIGRATE_TYPES] = { 320 "Unmovable", 321 "Movable", 322 "Reclaimable", 323 "HighAtomic", 324 #ifdef CONFIG_CMA 325 "CMA", 326 #endif 327 #ifdef CONFIG_MEMORY_ISOLATION 328 "Isolate", 329 #endif 330 }; 331 332 compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { 333 [NULL_COMPOUND_DTOR] = NULL, 334 [COMPOUND_PAGE_DTOR] = free_compound_page, 335 #ifdef CONFIG_HUGETLB_PAGE 336 [HUGETLB_PAGE_DTOR] = free_huge_page, 337 #endif 338 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 339 [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, 340 #endif 341 }; 342 343 int min_free_kbytes = 1024; 344 int user_min_free_kbytes = -1; 345 int watermark_boost_factor __read_mostly = 15000; 346 int watermark_scale_factor = 10; 347 348 static unsigned long nr_kernel_pages __initdata; 349 static unsigned long nr_all_pages __initdata; 350 static unsigned long dma_reserve __initdata; 351 352 static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; 353 static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; 354 static unsigned long required_kernelcore __initdata; 355 static unsigned long required_kernelcore_percent __initdata; 356 static unsigned long required_movablecore __initdata; 357 static unsigned long required_movablecore_percent __initdata; 358 static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; 359 static bool mirrored_kernelcore __meminitdata; 360 361 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 362 int 
movable_zone; 363 EXPORT_SYMBOL(movable_zone); 364 365 #if MAX_NUMNODES > 1 366 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; 367 unsigned int nr_online_nodes __read_mostly = 1; 368 EXPORT_SYMBOL(nr_node_ids); 369 EXPORT_SYMBOL(nr_online_nodes); 370 #endif 371 372 int page_group_by_mobility_disabled __read_mostly; 373 374 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 375 /* 376 * During boot we initialize deferred pages on-demand, as needed, but once 377 * page_alloc_init_late() has finished, the deferred pages are all initialized, 378 * and we can permanently disable that path. 379 */ 380 static DEFINE_STATIC_KEY_TRUE(deferred_pages); 381 382 static inline bool deferred_pages_enabled(void) 383 { 384 return static_branch_unlikely(&deferred_pages); 385 } 386 387 /* Returns true if the struct page for the pfn is uninitialised */ 388 static inline bool __meminit early_page_uninitialised(unsigned long pfn) 389 { 390 int nid = early_pfn_to_nid(pfn); 391 392 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) 393 return true; 394 395 return false; 396 } 397 398 /* 399 * Returns true when the remaining initialisation should be deferred until 400 * later in the boot cycle when it can be parallelised. 401 */ 402 static bool __meminit 403 defer_init(int nid, unsigned long pfn, unsigned long end_pfn) 404 { 405 static unsigned long prev_end_pfn, nr_initialised; 406 407 /* 408 * prev_end_pfn static that contains the end of previous zone 409 * No need to protect because called very early in boot before smp_init. 410 */ 411 if (prev_end_pfn != end_pfn) { 412 prev_end_pfn = end_pfn; 413 nr_initialised = 0; 414 } 415 416 /* Always populate low zones for address-constrained allocations */ 417 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) 418 return false; 419 420 if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) 421 return true; 422 /* 423 * We start only with one section of pages, more pages are added as 424 * needed until the rest of deferred pages are initialized. 
425 */ 426 nr_initialised++; 427 if ((nr_initialised > PAGES_PER_SECTION) && 428 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 429 NODE_DATA(nid)->first_deferred_pfn = pfn; 430 return true; 431 } 432 return false; 433 } 434 #else 435 static inline bool deferred_pages_enabled(void) 436 { 437 return false; 438 } 439 440 static inline bool early_page_uninitialised(unsigned long pfn) 441 { 442 return false; 443 } 444 445 static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) 446 { 447 return false; 448 } 449 #endif 450 451 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 452 static inline unsigned long *get_pageblock_bitmap(const struct page *page, 453 unsigned long pfn) 454 { 455 #ifdef CONFIG_SPARSEMEM 456 return section_to_usemap(__pfn_to_section(pfn)); 457 #else 458 return page_zone(page)->pageblock_flags; 459 #endif /* CONFIG_SPARSEMEM */ 460 } 461 462 static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) 463 { 464 #ifdef CONFIG_SPARSEMEM 465 pfn &= (PAGES_PER_SECTION-1); 466 #else 467 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); 468 #endif /* CONFIG_SPARSEMEM */ 469 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 470 } 471 472 static __always_inline 473 unsigned long __get_pfnblock_flags_mask(const struct page *page, 474 unsigned long pfn, 475 unsigned long mask) 476 { 477 unsigned long *bitmap; 478 unsigned long bitidx, word_bitidx; 479 unsigned long word; 480 481 bitmap = get_pageblock_bitmap(page, pfn); 482 bitidx = pfn_to_bitidx(page, pfn); 483 word_bitidx = bitidx / BITS_PER_LONG; 484 bitidx &= (BITS_PER_LONG-1); 485 486 word = bitmap[word_bitidx]; 487 return (word >> bitidx) & mask; 488 } 489 490 /** 491 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 492 * @page: The page within the block of interest 493 * @pfn: The target page frame number 494 * @mask: mask of bits that the caller is interested in 495 * 496 * Return: pageblock_bits flags 497 */ 498 unsigned long get_pfnblock_flags_mask(const struct page *page, 499 unsigned long pfn, unsigned long mask) 500 { 501 return __get_pfnblock_flags_mask(page, pfn, mask); 502 } 503 504 static __always_inline int get_pfnblock_migratetype(const struct page *page, 505 unsigned long pfn) 506 { 507 return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); 508 } 509 510 /** 511 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 512 * @page: The page within the block of interest 513 * @flags: The flags to set 514 * @pfn: The target page frame number 515 * @mask: mask of bits that the caller is interested in 516 */ 517 void set_pfnblock_flags_mask(struct page *page, unsigned long flags, 518 unsigned long pfn, 519 unsigned long mask) 520 { 521 unsigned long *bitmap; 522 unsigned long bitidx, word_bitidx; 523 unsigned long old_word, word; 524 525 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 526 BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); 527 528 bitmap = get_pageblock_bitmap(page, pfn); 529 bitidx = pfn_to_bitidx(page, pfn); 530 word_bitidx = bitidx / BITS_PER_LONG; 531 bitidx &= (BITS_PER_LONG-1); 532 533 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); 534 535 mask <<= bitidx; 536 flags <<= bitidx; 537 538 word = READ_ONCE(bitmap[word_bitidx]); 539 for (;;) { 540 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 541 if (word == old_word) 542 break; 543 word = old_word; 544 } 545 } 546 547 
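/*
 * Worked example (illustrative only; assumes SPARSEMEM, 64-bit longs,
 * NR_PAGEBLOCK_BITS == 4, pageblock_order == 9 and
 * PAGES_PER_SECTION == 1 << 15, as in a common x86-64 configuration):
 *
 *   pfn = 0x12345
 *   pfn &= PAGES_PER_SECTION - 1;            -> 9029, offset in the section
 *   bitidx = (9029 >> pageblock_order) * 4;  -> 17 * 4 = 68
 *   word_bitidx = 68 / BITS_PER_LONG;        -> word 1 of the usemap
 *   bitidx &= BITS_PER_LONG - 1;             -> bits 4..7 of that word
 *
 * set_pfnblock_flags_mask() above then rewrites those four bits with a
 * cmpxchg() retry loop, so a racing update to another pageblock that shares
 * the same bitmap word is not lost. The loop is equivalent to:
 *
 *   word = READ_ONCE(bitmap[word_bitidx]);
 *   do {
 *       old = word;
 *       word = cmpxchg(&bitmap[word_bitidx], old, (old & ~mask) | flags);
 *   } while (word != old);
 */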
void set_pageblock_migratetype(struct page *page, int migratetype) 548 { 549 if (unlikely(page_group_by_mobility_disabled && 550 migratetype < MIGRATE_PCPTYPES)) 551 migratetype = MIGRATE_UNMOVABLE; 552 553 set_pfnblock_flags_mask(page, (unsigned long)migratetype, 554 page_to_pfn(page), MIGRATETYPE_MASK); 555 } 556 557 #ifdef CONFIG_DEBUG_VM 558 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 559 { 560 int ret = 0; 561 unsigned seq; 562 unsigned long pfn = page_to_pfn(page); 563 unsigned long sp, start_pfn; 564 565 do { 566 seq = zone_span_seqbegin(zone); 567 start_pfn = zone->zone_start_pfn; 568 sp = zone->spanned_pages; 569 if (!zone_spans_pfn(zone, pfn)) 570 ret = 1; 571 } while (zone_span_seqretry(zone, seq)); 572 573 if (ret) 574 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", 575 pfn, zone_to_nid(zone), zone->name, 576 start_pfn, start_pfn + sp); 577 578 return ret; 579 } 580 581 static int page_is_consistent(struct zone *zone, struct page *page) 582 { 583 if (zone != page_zone(page)) 584 return 0; 585 586 return 1; 587 } 588 /* 589 * Temporary debugging check for pages not lying within a given zone. 590 */ 591 static int __maybe_unused bad_range(struct zone *zone, struct page *page) 592 { 593 if (page_outside_zone_boundaries(zone, page)) 594 return 1; 595 if (!page_is_consistent(zone, page)) 596 return 1; 597 598 return 0; 599 } 600 #else 601 static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) 602 { 603 return 0; 604 } 605 #endif 606 607 static void bad_page(struct page *page, const char *reason) 608 { 609 static unsigned long resume; 610 static unsigned long nr_shown; 611 static unsigned long nr_unshown; 612 613 /* 614 * Allow a burst of 60 reports, then keep quiet for that minute; 615 * or allow a steady drip of one report per second. 
 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert(
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	dump_page(page, reason);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

static inline unsigned int order_to_pindex(int migratetype, int order)
{
	int base = order;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order > PAGE_ALLOC_COSTLY_ORDER) {
		VM_BUG_ON(order != pageblock_order);
		base = PAGE_ALLOC_COSTLY_ORDER + 1;
	}
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return (MIGRATE_PCPTYPES * base) + migratetype;
}

static inline int pindex_to_order(unsigned int pindex)
{
	int order = pindex / MIGRATE_PCPTYPES;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		order = pageblock_order;
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return order;
}

static inline bool pcp_allowed_order(unsigned int order)
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order == pageblock_order)
		return true;
#endif
	return false;
}

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (pcp_allowed_order(order))		/* Via pcp? */
		free_unref_page(page, order);
	else
		__free_pages_ok(page, order, FPI_NONE);
}

/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits is a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset in the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
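 *
 * For example (illustrative), an order-2 compound page built by
 * prep_compound_page() below spans four struct pages:
 *
 *   page[0]:    the head page, PG_head set
 *   page[1..3]: tail pages; PageTail() is true because bit 0 of
 *               ->compound_head is set, and the remaining bits point
 *               back at page[0]
 *   page[1]:    additionally carries ->compound_dtor and
 *               ->compound_order (here 2), as described above
 *
 * compound_head() on any tail page recovers page[0] by clearing bit 0.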
707 */ 708 709 void free_compound_page(struct page *page) 710 { 711 mem_cgroup_uncharge(page_folio(page)); 712 free_the_page(page, compound_order(page)); 713 } 714 715 static void prep_compound_head(struct page *page, unsigned int order) 716 { 717 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); 718 set_compound_order(page, order); 719 atomic_set(compound_mapcount_ptr(page), -1); 720 atomic_set(compound_pincount_ptr(page), 0); 721 } 722 723 static void prep_compound_tail(struct page *head, int tail_idx) 724 { 725 struct page *p = head + tail_idx; 726 727 p->mapping = TAIL_MAPPING; 728 set_compound_head(p, head); 729 } 730 731 void prep_compound_page(struct page *page, unsigned int order) 732 { 733 int i; 734 int nr_pages = 1 << order; 735 736 __SetPageHead(page); 737 for (i = 1; i < nr_pages; i++) 738 prep_compound_tail(page, i); 739 740 prep_compound_head(page, order); 741 } 742 743 #ifdef CONFIG_DEBUG_PAGEALLOC 744 unsigned int _debug_guardpage_minorder; 745 746 bool _debug_pagealloc_enabled_early __read_mostly 747 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); 748 EXPORT_SYMBOL(_debug_pagealloc_enabled_early); 749 DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); 750 EXPORT_SYMBOL(_debug_pagealloc_enabled); 751 752 DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); 753 754 static int __init early_debug_pagealloc(char *buf) 755 { 756 return kstrtobool(buf, &_debug_pagealloc_enabled_early); 757 } 758 early_param("debug_pagealloc", early_debug_pagealloc); 759 760 static int __init debug_guardpage_minorder_setup(char *buf) 761 { 762 unsigned long res; 763 764 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 765 pr_err("Bad debug_guardpage_minorder value\n"); 766 return 0; 767 } 768 _debug_guardpage_minorder = res; 769 pr_info("Setting debug_guardpage_minorder to %lu\n", res); 770 return 0; 771 } 772 early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); 773 774 static inline bool set_page_guard(struct zone *zone, struct page *page, 775 unsigned int order, int migratetype) 776 { 777 if (!debug_guardpage_enabled()) 778 return false; 779 780 if (order >= debug_guardpage_minorder()) 781 return false; 782 783 __SetPageGuard(page); 784 INIT_LIST_HEAD(&page->lru); 785 set_page_private(page, order); 786 /* Guard pages are not available for any usage */ 787 __mod_zone_freepage_state(zone, -(1 << order), migratetype); 788 789 return true; 790 } 791 792 static inline void clear_page_guard(struct zone *zone, struct page *page, 793 unsigned int order, int migratetype) 794 { 795 if (!debug_guardpage_enabled()) 796 return; 797 798 __ClearPageGuard(page); 799 800 set_page_private(page, 0); 801 if (!is_migrate_isolate(migratetype)) 802 __mod_zone_freepage_state(zone, (1 << order), migratetype); 803 } 804 #else 805 static inline bool set_page_guard(struct zone *zone, struct page *page, 806 unsigned int order, int migratetype) { return false; } 807 static inline void clear_page_guard(struct zone *zone, struct page *page, 808 unsigned int order, int migratetype) {} 809 #endif 810 811 /* 812 * Enable static keys related to various memory debugging and hardening options. 813 * Some override others, and depend on early params that are evaluated in the 814 * order of appearance. So we need to first gather the full picture of what was 815 * enabled, and then make decisions. 816 */ 817 void init_mem_debugging_and_hardening(void) 818 { 819 bool page_poisoning_requested = false; 820 821 #ifdef CONFIG_PAGE_POISONING 822 /* 823 * Page poisoning is debug page alloc for some arches. 
If 824 * either of those options are enabled, enable poisoning. 825 */ 826 if (page_poisoning_enabled() || 827 (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && 828 debug_pagealloc_enabled())) { 829 static_branch_enable(&_page_poisoning_enabled); 830 page_poisoning_requested = true; 831 } 832 #endif 833 834 if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && 835 page_poisoning_requested) { 836 pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " 837 "will take precedence over init_on_alloc and init_on_free\n"); 838 _init_on_alloc_enabled_early = false; 839 _init_on_free_enabled_early = false; 840 } 841 842 if (_init_on_alloc_enabled_early) 843 static_branch_enable(&init_on_alloc); 844 else 845 static_branch_disable(&init_on_alloc); 846 847 if (_init_on_free_enabled_early) 848 static_branch_enable(&init_on_free); 849 else 850 static_branch_disable(&init_on_free); 851 852 #ifdef CONFIG_DEBUG_PAGEALLOC 853 if (!debug_pagealloc_enabled()) 854 return; 855 856 static_branch_enable(&_debug_pagealloc_enabled); 857 858 if (!debug_guardpage_minorder()) 859 return; 860 861 static_branch_enable(&_debug_guardpage_enabled); 862 #endif 863 } 864 865 static inline void set_buddy_order(struct page *page, unsigned int order) 866 { 867 set_page_private(page, order); 868 __SetPageBuddy(page); 869 } 870 871 #ifdef CONFIG_COMPACTION 872 static inline struct capture_control *task_capc(struct zone *zone) 873 { 874 struct capture_control *capc = current->capture_control; 875 876 return unlikely(capc) && 877 !(current->flags & PF_KTHREAD) && 878 !capc->page && 879 capc->cc->zone == zone ? capc : NULL; 880 } 881 882 static inline bool 883 compaction_capture(struct capture_control *capc, struct page *page, 884 int order, int migratetype) 885 { 886 if (!capc || order != capc->cc->order) 887 return false; 888 889 /* Do not accidentally pollute CMA or isolated regions*/ 890 if (is_migrate_cma(migratetype) || 891 is_migrate_isolate(migratetype)) 892 return false; 893 894 /* 895 * Do not let lower order allocations pollute a movable pageblock. 896 * This might let an unmovable request use a reclaimable pageblock 897 * and vice-versa but no more than normal fallback logic which can 898 * have trouble finding a high-order free page. 899 */ 900 if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) 901 return false; 902 903 capc->page = page; 904 return true; 905 } 906 907 #else 908 static inline struct capture_control *task_capc(struct zone *zone) 909 { 910 return NULL; 911 } 912 913 static inline bool 914 compaction_capture(struct capture_control *capc, struct page *page, 915 int order, int migratetype) 916 { 917 return false; 918 } 919 #endif /* CONFIG_COMPACTION */ 920 921 /* Used for pages not on another list */ 922 static inline void add_to_free_list(struct page *page, struct zone *zone, 923 unsigned int order, int migratetype) 924 { 925 struct free_area *area = &zone->free_area[order]; 926 927 list_add(&page->lru, &area->free_list[migratetype]); 928 area->nr_free++; 929 } 930 931 /* Used for pages not on another list */ 932 static inline void add_to_free_list_tail(struct page *page, struct zone *zone, 933 unsigned int order, int migratetype) 934 { 935 struct free_area *area = &zone->free_area[order]; 936 937 list_add_tail(&page->lru, &area->free_list[migratetype]); 938 area->nr_free++; 939 } 940 941 /* 942 * Used for pages which are on another list. 
Move the pages to the tail 943 * of the list - so the moved pages won't immediately be considered for 944 * allocation again (e.g., optimization for memory onlining). 945 */ 946 static inline void move_to_free_list(struct page *page, struct zone *zone, 947 unsigned int order, int migratetype) 948 { 949 struct free_area *area = &zone->free_area[order]; 950 951 list_move_tail(&page->lru, &area->free_list[migratetype]); 952 } 953 954 static inline void del_page_from_free_list(struct page *page, struct zone *zone, 955 unsigned int order) 956 { 957 /* clear reported state and update reported page count */ 958 if (page_reported(page)) 959 __ClearPageReported(page); 960 961 list_del(&page->lru); 962 __ClearPageBuddy(page); 963 set_page_private(page, 0); 964 zone->free_area[order].nr_free--; 965 } 966 967 /* 968 * If this is not the largest possible page, check if the buddy 969 * of the next-highest order is free. If it is, it's possible 970 * that pages are being freed that will coalesce soon. In case, 971 * that is happening, add the free page to the tail of the list 972 * so it's less likely to be used soon and more likely to be merged 973 * as a higher order page 974 */ 975 static inline bool 976 buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, 977 struct page *page, unsigned int order) 978 { 979 unsigned long higher_page_pfn; 980 struct page *higher_page; 981 982 if (order >= MAX_ORDER - 2) 983 return false; 984 985 higher_page_pfn = buddy_pfn & pfn; 986 higher_page = page + (higher_page_pfn - pfn); 987 988 return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1, 989 NULL) != NULL; 990 } 991 992 /* 993 * Freeing function for a buddy system allocator. 994 * 995 * The concept of a buddy system is to maintain direct-mapped table 996 * (containing bit values) for memory blocks of various "orders". 997 * The bottom level table contains the map for the smallest allocatable 998 * units of memory (here, pages), and each level above it describes 999 * pairs of units from the levels below, hence, "buddies". 1000 * At a high level, all that happens here is marking the table entry 1001 * at the bottom level available, and propagating the changes upward 1002 * as necessary, plus some accounting needed to play nicely with other 1003 * parts of the VM system. 1004 * At each level, we keep a list of pages, which are heads of continuous 1005 * free pages of length of (1 << order) and marked with PageBuddy. 1006 * Page's order is recorded in page_private(page) field. 1007 * So when we are allocating or freeing one, we can derive the state of the 1008 * other. That is, if we allocate a small block, and both were 1009 * free, the remainder of the region must be split into blocks. 1010 * If a block is freed, and its buddy is also free, then this 1011 * triggers coalescing into a block of larger size. 
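 *
 * For example (illustrative): freeing an order-0 page at pfn 12 looks for
 * its order-0 buddy at pfn 13 (12 ^ 1). If that buddy is free, the two
 * merge into an order-1 block starting at pfn 12 (12 & 13). The order-1
 * buddy of that block starts at pfn 14 (12 ^ 2); if it is free as well,
 * the result is an order-2 block at pfn 12, and so on, up to at most
 * order MAX_ORDER - 1.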
1012 * 1013 * -- nyc 1014 */ 1015 1016 static inline void __free_one_page(struct page *page, 1017 unsigned long pfn, 1018 struct zone *zone, unsigned int order, 1019 int migratetype, fpi_t fpi_flags) 1020 { 1021 struct capture_control *capc = task_capc(zone); 1022 unsigned long buddy_pfn; 1023 unsigned long combined_pfn; 1024 struct page *buddy; 1025 bool to_tail; 1026 1027 VM_BUG_ON(!zone_is_initialized(zone)); 1028 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); 1029 1030 VM_BUG_ON(migratetype == -1); 1031 if (likely(!is_migrate_isolate(migratetype))) 1032 __mod_zone_freepage_state(zone, 1 << order, migratetype); 1033 1034 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); 1035 VM_BUG_ON_PAGE(bad_range(zone, page), page); 1036 1037 while (order < MAX_ORDER - 1) { 1038 if (compaction_capture(capc, page, order, migratetype)) { 1039 __mod_zone_freepage_state(zone, -(1 << order), 1040 migratetype); 1041 return; 1042 } 1043 1044 buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); 1045 if (!buddy) 1046 goto done_merging; 1047 1048 if (unlikely(order >= pageblock_order)) { 1049 /* 1050 * We want to prevent merge between freepages on pageblock 1051 * without fallbacks and normal pageblock. Without this, 1052 * pageblock isolation could cause incorrect freepage or CMA 1053 * accounting or HIGHATOMIC accounting. 1054 */ 1055 int buddy_mt = get_pageblock_migratetype(buddy); 1056 1057 if (migratetype != buddy_mt 1058 && (!migratetype_is_mergeable(migratetype) || 1059 !migratetype_is_mergeable(buddy_mt))) 1060 goto done_merging; 1061 } 1062 1063 /* 1064 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 1065 * merge with it and move up one order. 1066 */ 1067 if (page_is_guard(buddy)) 1068 clear_page_guard(zone, buddy, order, migratetype); 1069 else 1070 del_page_from_free_list(buddy, zone, order); 1071 combined_pfn = buddy_pfn & pfn; 1072 page = page + (combined_pfn - pfn); 1073 pfn = combined_pfn; 1074 order++; 1075 } 1076 1077 done_merging: 1078 set_buddy_order(page, order); 1079 1080 if (fpi_flags & FPI_TO_TAIL) 1081 to_tail = true; 1082 else if (is_shuffle_order(order)) 1083 to_tail = shuffle_pick_tail(); 1084 else 1085 to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); 1086 1087 if (to_tail) 1088 add_to_free_list_tail(page, zone, order, migratetype); 1089 else 1090 add_to_free_list(page, zone, order, migratetype); 1091 1092 /* Notify page reporting subsystem of freed page */ 1093 if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) 1094 page_reporting_notify_free(order); 1095 } 1096 1097 /* 1098 * A bad page could be due to a number of fields. Instead of multiple branches, 1099 * try and check multiple fields with one check. The caller must do a detailed 1100 * check if necessary. 
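 *
 * Sketch of the idea: page_expected_state() below folds ->mapping, the
 * reference count, the memcg data and the flag bits of interest into a
 * single bitwise OR, roughly
 *
 *   if (mapping | refcount | memcg_data | (flags & check_flags)) -> bad
 *
 * so the common good-page case costs a couple of branches rather than one
 * branch per field.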
1101 */ 1102 static inline bool page_expected_state(struct page *page, 1103 unsigned long check_flags) 1104 { 1105 if (unlikely(atomic_read(&page->_mapcount) != -1)) 1106 return false; 1107 1108 if (unlikely((unsigned long)page->mapping | 1109 page_ref_count(page) | 1110 #ifdef CONFIG_MEMCG 1111 page->memcg_data | 1112 #endif 1113 (page->flags & check_flags))) 1114 return false; 1115 1116 return true; 1117 } 1118 1119 static const char *page_bad_reason(struct page *page, unsigned long flags) 1120 { 1121 const char *bad_reason = NULL; 1122 1123 if (unlikely(atomic_read(&page->_mapcount) != -1)) 1124 bad_reason = "nonzero mapcount"; 1125 if (unlikely(page->mapping != NULL)) 1126 bad_reason = "non-NULL mapping"; 1127 if (unlikely(page_ref_count(page) != 0)) 1128 bad_reason = "nonzero _refcount"; 1129 if (unlikely(page->flags & flags)) { 1130 if (flags == PAGE_FLAGS_CHECK_AT_PREP) 1131 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; 1132 else 1133 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 1134 } 1135 #ifdef CONFIG_MEMCG 1136 if (unlikely(page->memcg_data)) 1137 bad_reason = "page still charged to cgroup"; 1138 #endif 1139 return bad_reason; 1140 } 1141 1142 static void check_free_page_bad(struct page *page) 1143 { 1144 bad_page(page, 1145 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); 1146 } 1147 1148 static inline int check_free_page(struct page *page) 1149 { 1150 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) 1151 return 0; 1152 1153 /* Something has gone sideways, find it */ 1154 check_free_page_bad(page); 1155 return 1; 1156 } 1157 1158 static int free_tail_pages_check(struct page *head_page, struct page *page) 1159 { 1160 int ret = 1; 1161 1162 /* 1163 * We rely page->lru.next never has bit 0 set, unless the page 1164 * is PageTail(). Let's make sure that's true even for poisoned ->lru. 1165 */ 1166 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); 1167 1168 if (!IS_ENABLED(CONFIG_DEBUG_VM)) { 1169 ret = 0; 1170 goto out; 1171 } 1172 switch (page - head_page) { 1173 case 1: 1174 /* the first tail page: ->mapping may be compound_mapcount() */ 1175 if (unlikely(compound_mapcount(page))) { 1176 bad_page(page, "nonzero compound_mapcount"); 1177 goto out; 1178 } 1179 break; 1180 case 2: 1181 /* 1182 * the second tail page: ->mapping is 1183 * deferred_list.next -- ignore value. 1184 */ 1185 break; 1186 default: 1187 if (page->mapping != TAIL_MAPPING) { 1188 bad_page(page, "corrupted mapping in tail page"); 1189 goto out; 1190 } 1191 break; 1192 } 1193 if (unlikely(!PageTail(page))) { 1194 bad_page(page, "PageTail not set"); 1195 goto out; 1196 } 1197 if (unlikely(compound_head(page) != head_page)) { 1198 bad_page(page, "compound_head not consistent"); 1199 goto out; 1200 } 1201 ret = 0; 1202 out: 1203 page->mapping = NULL; 1204 clear_compound_head(page); 1205 return ret; 1206 } 1207 1208 /* 1209 * Skip KASAN memory poisoning when either: 1210 * 1211 * 1. Deferred memory initialization has not yet completed, 1212 * see the explanation below. 1213 * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, 1214 * see the comment next to it. 1215 * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, 1216 * see the comment next to it. 1217 * 1218 * Poisoning pages during deferred memory init will greatly lengthen the 1219 * process and cause problem in large memory systems as the deferred pages 1220 * initialization is done with interrupt disabled. 
1221 * 1222 * Assuming that there will be no reference to those newly initialized 1223 * pages before they are ever allocated, this should have no effect on 1224 * KASAN memory tracking as the poison will be properly inserted at page 1225 * allocation time. The only corner case is when pages are allocated by 1226 * on-demand allocation and then freed again before the deferred pages 1227 * initialization is done, but this is not likely to happen. 1228 */ 1229 static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) 1230 { 1231 return deferred_pages_enabled() || 1232 (!IS_ENABLED(CONFIG_KASAN_GENERIC) && 1233 (fpi_flags & FPI_SKIP_KASAN_POISON)) || 1234 PageSkipKASanPoison(page); 1235 } 1236 1237 static void kernel_init_free_pages(struct page *page, int numpages) 1238 { 1239 int i; 1240 1241 /* s390's use of memset() could override KASAN redzones. */ 1242 kasan_disable_current(); 1243 for (i = 0; i < numpages; i++) { 1244 u8 tag = page_kasan_tag(page + i); 1245 page_kasan_tag_reset(page + i); 1246 clear_highpage(page + i); 1247 page_kasan_tag_set(page + i, tag); 1248 } 1249 kasan_enable_current(); 1250 } 1251 1252 static __always_inline bool free_pages_prepare(struct page *page, 1253 unsigned int order, bool check_free, fpi_t fpi_flags) 1254 { 1255 int bad = 0; 1256 bool init = want_init_on_free(); 1257 1258 VM_BUG_ON_PAGE(PageTail(page), page); 1259 1260 trace_mm_page_free(page, order); 1261 1262 if (unlikely(PageHWPoison(page)) && !order) { 1263 /* 1264 * Do not let hwpoison pages hit pcplists/buddy 1265 * Untie memcg state and reset page's owner 1266 */ 1267 if (memcg_kmem_enabled() && PageMemcgKmem(page)) 1268 __memcg_kmem_uncharge_page(page, order); 1269 reset_page_owner(page, order); 1270 page_table_check_free(page, order); 1271 return false; 1272 } 1273 1274 /* 1275 * Check tail pages before head page information is cleared to 1276 * avoid checking PageCompound for order-0 pages. 1277 */ 1278 if (unlikely(order)) { 1279 bool compound = PageCompound(page); 1280 int i; 1281 1282 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); 1283 1284 if (compound) { 1285 ClearPageDoubleMap(page); 1286 ClearPageHasHWPoisoned(page); 1287 } 1288 for (i = 1; i < (1 << order); i++) { 1289 if (compound) 1290 bad += free_tail_pages_check(page, page + i); 1291 if (unlikely(check_free_page(page + i))) { 1292 bad++; 1293 continue; 1294 } 1295 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1296 } 1297 } 1298 if (PageMappingFlags(page)) 1299 page->mapping = NULL; 1300 if (memcg_kmem_enabled() && PageMemcgKmem(page)) 1301 __memcg_kmem_uncharge_page(page, order); 1302 if (check_free) 1303 bad += check_free_page(page); 1304 if (bad) 1305 return false; 1306 1307 page_cpupid_reset_last(page); 1308 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1309 reset_page_owner(page, order); 1310 page_table_check_free(page, order); 1311 1312 if (!PageHighMem(page)) { 1313 debug_check_no_locks_freed(page_address(page), 1314 PAGE_SIZE << order); 1315 debug_check_no_obj_freed(page_address(page), 1316 PAGE_SIZE << order); 1317 } 1318 1319 kernel_poison_pages(page, 1 << order); 1320 1321 /* 1322 * As memory initialization might be integrated into KASAN, 1323 * KASAN poisoning and memory initialization code must be 1324 * kept together to avoid discrepancies in behavior. 1325 * 1326 * With hardware tag-based KASAN, memory tags must be set before the 1327 * page becomes unavailable via debug_pagealloc or arch_free_page. 
1328 */ 1329 if (!should_skip_kasan_poison(page, fpi_flags)) { 1330 kasan_poison_pages(page, order, init); 1331 1332 /* Memory is already initialized if KASAN did it internally. */ 1333 if (kasan_has_integrated_init()) 1334 init = false; 1335 } 1336 if (init) 1337 kernel_init_free_pages(page, 1 << order); 1338 1339 /* 1340 * arch_free_page() can make the page's contents inaccessible. s390 1341 * does this. So nothing which can access the page's contents should 1342 * happen after this. 1343 */ 1344 arch_free_page(page, order); 1345 1346 debug_pagealloc_unmap_pages(page, 1 << order); 1347 1348 return true; 1349 } 1350 1351 #ifdef CONFIG_DEBUG_VM 1352 /* 1353 * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed 1354 * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when 1355 * moved from pcp lists to free lists. 1356 */ 1357 static bool free_pcp_prepare(struct page *page, unsigned int order) 1358 { 1359 return free_pages_prepare(page, order, true, FPI_NONE); 1360 } 1361 1362 static bool bulkfree_pcp_prepare(struct page *page) 1363 { 1364 if (debug_pagealloc_enabled_static()) 1365 return check_free_page(page); 1366 else 1367 return false; 1368 } 1369 #else 1370 /* 1371 * With DEBUG_VM disabled, order-0 pages being freed are checked only when 1372 * moving from pcp lists to free list in order to reduce overhead. With 1373 * debug_pagealloc enabled, they are checked also immediately when being freed 1374 * to the pcp lists. 1375 */ 1376 static bool free_pcp_prepare(struct page *page, unsigned int order) 1377 { 1378 if (debug_pagealloc_enabled_static()) 1379 return free_pages_prepare(page, order, true, FPI_NONE); 1380 else 1381 return free_pages_prepare(page, order, false, FPI_NONE); 1382 } 1383 1384 static bool bulkfree_pcp_prepare(struct page *page) 1385 { 1386 return check_free_page(page); 1387 } 1388 #endif /* CONFIG_DEBUG_VM */ 1389 1390 /* 1391 * Frees a number of pages from the PCP lists 1392 * Assumes all pages on list are in same zone. 1393 * count is the number of pages to free. 1394 */ 1395 static void free_pcppages_bulk(struct zone *zone, int count, 1396 struct per_cpu_pages *pcp, 1397 int pindex) 1398 { 1399 int min_pindex = 0; 1400 int max_pindex = NR_PCP_LISTS - 1; 1401 unsigned int order; 1402 bool isolated_pageblocks; 1403 struct page *page; 1404 1405 /* 1406 * Ensure proper count is passed which otherwise would stuck in the 1407 * below while (list_empty(list)) loop. 1408 */ 1409 count = min(pcp->count, count); 1410 1411 /* Ensure requested pindex is drained first. */ 1412 pindex = pindex - 1; 1413 1414 /* 1415 * local_lock_irq held so equivalent to spin_lock_irqsave for 1416 * both PREEMPT_RT and non-PREEMPT_RT configurations. 1417 */ 1418 spin_lock(&zone->lock); 1419 isolated_pageblocks = has_isolate_pageblock(zone); 1420 1421 while (count > 0) { 1422 struct list_head *list; 1423 int nr_pages; 1424 1425 /* Remove pages from lists in a round-robin fashion. 
*/ 1426 do { 1427 if (++pindex > max_pindex) 1428 pindex = min_pindex; 1429 list = &pcp->lists[pindex]; 1430 if (!list_empty(list)) 1431 break; 1432 1433 if (pindex == max_pindex) 1434 max_pindex--; 1435 if (pindex == min_pindex) 1436 min_pindex++; 1437 } while (1); 1438 1439 order = pindex_to_order(pindex); 1440 nr_pages = 1 << order; 1441 BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH)); 1442 do { 1443 int mt; 1444 1445 page = list_last_entry(list, struct page, lru); 1446 mt = get_pcppage_migratetype(page); 1447 1448 /* must delete to avoid corrupting pcp list */ 1449 list_del(&page->lru); 1450 count -= nr_pages; 1451 pcp->count -= nr_pages; 1452 1453 if (bulkfree_pcp_prepare(page)) 1454 continue; 1455 1456 /* MIGRATE_ISOLATE page should not go to pcplists */ 1457 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); 1458 /* Pageblock could have been isolated meanwhile */ 1459 if (unlikely(isolated_pageblocks)) 1460 mt = get_pageblock_migratetype(page); 1461 1462 __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); 1463 trace_mm_page_pcpu_drain(page, order, mt); 1464 } while (count > 0 && !list_empty(list)); 1465 } 1466 1467 spin_unlock(&zone->lock); 1468 } 1469 1470 static void free_one_page(struct zone *zone, 1471 struct page *page, unsigned long pfn, 1472 unsigned int order, 1473 int migratetype, fpi_t fpi_flags) 1474 { 1475 unsigned long flags; 1476 1477 spin_lock_irqsave(&zone->lock, flags); 1478 if (unlikely(has_isolate_pageblock(zone) || 1479 is_migrate_isolate(migratetype))) { 1480 migratetype = get_pfnblock_migratetype(page, pfn); 1481 } 1482 __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); 1483 spin_unlock_irqrestore(&zone->lock, flags); 1484 } 1485 1486 static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1487 unsigned long zone, int nid) 1488 { 1489 mm_zero_struct_page(page); 1490 set_page_links(page, zone, nid, pfn); 1491 init_page_count(page); 1492 page_mapcount_reset(page); 1493 page_cpupid_reset_last(page); 1494 page_kasan_tag_reset(page); 1495 1496 INIT_LIST_HEAD(&page->lru); 1497 #ifdef WANT_PAGE_VIRTUAL 1498 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1499 if (!is_highmem_idx(zone)) 1500 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1501 #endif 1502 } 1503 1504 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1505 static void __meminit init_reserved_page(unsigned long pfn) 1506 { 1507 pg_data_t *pgdat; 1508 int nid, zid; 1509 1510 if (!early_page_uninitialised(pfn)) 1511 return; 1512 1513 nid = early_pfn_to_nid(pfn); 1514 pgdat = NODE_DATA(nid); 1515 1516 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1517 struct zone *zone = &pgdat->node_zones[zid]; 1518 1519 if (zone_spans_pfn(zone, pfn)) 1520 break; 1521 } 1522 __init_single_page(pfn_to_page(pfn), pfn, zid, nid); 1523 } 1524 #else 1525 static inline void init_reserved_page(unsigned long pfn) 1526 { 1527 } 1528 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1529 1530 /* 1531 * Initialised pages do not have PageReserved set. This function is 1532 * called for each range allocated by the bootmem allocator and 1533 * marks the pages PageReserved. The remaining valid pages are later 1534 * sent to the buddy page allocator. 
1535 */ 1536 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) 1537 { 1538 unsigned long start_pfn = PFN_DOWN(start); 1539 unsigned long end_pfn = PFN_UP(end); 1540 1541 for (; start_pfn < end_pfn; start_pfn++) { 1542 if (pfn_valid(start_pfn)) { 1543 struct page *page = pfn_to_page(start_pfn); 1544 1545 init_reserved_page(start_pfn); 1546 1547 /* Avoid false-positive PageTail() */ 1548 INIT_LIST_HEAD(&page->lru); 1549 1550 /* 1551 * no need for atomic set_bit because the struct 1552 * page is not visible yet so nobody should 1553 * access it yet. 1554 */ 1555 __SetPageReserved(page); 1556 } 1557 } 1558 } 1559 1560 static void __free_pages_ok(struct page *page, unsigned int order, 1561 fpi_t fpi_flags) 1562 { 1563 unsigned long flags; 1564 int migratetype; 1565 unsigned long pfn = page_to_pfn(page); 1566 struct zone *zone = page_zone(page); 1567 1568 if (!free_pages_prepare(page, order, true, fpi_flags)) 1569 return; 1570 1571 migratetype = get_pfnblock_migratetype(page, pfn); 1572 1573 spin_lock_irqsave(&zone->lock, flags); 1574 if (unlikely(has_isolate_pageblock(zone) || 1575 is_migrate_isolate(migratetype))) { 1576 migratetype = get_pfnblock_migratetype(page, pfn); 1577 } 1578 __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); 1579 spin_unlock_irqrestore(&zone->lock, flags); 1580 1581 __count_vm_events(PGFREE, 1 << order); 1582 } 1583 1584 void __free_pages_core(struct page *page, unsigned int order) 1585 { 1586 unsigned int nr_pages = 1 << order; 1587 struct page *p = page; 1588 unsigned int loop; 1589 1590 /* 1591 * When initializing the memmap, __init_single_page() sets the refcount 1592 * of all pages to 1 ("allocated"/"not free"). We have to set the 1593 * refcount of all involved pages to 0. 1594 */ 1595 prefetchw(p); 1596 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 1597 prefetchw(p + 1); 1598 __ClearPageReserved(p); 1599 set_page_count(p, 0); 1600 } 1601 __ClearPageReserved(p); 1602 set_page_count(p, 0); 1603 1604 atomic_long_add(nr_pages, &page_zone(page)->managed_pages); 1605 1606 /* 1607 * Bypass PCP and place fresh pages right to the tail, primarily 1608 * relevant for memory onlining. 1609 */ 1610 __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); 1611 } 1612 1613 #ifdef CONFIG_NUMA 1614 1615 /* 1616 * During memory init memblocks map pfns to nids. The search is expensive and 1617 * this caches recent lookups. The implementation of __early_pfn_to_nid 1618 * treats start/end as pfns. 1619 */ 1620 struct mminit_pfnnid_cache { 1621 unsigned long last_start; 1622 unsigned long last_end; 1623 int last_nid; 1624 }; 1625 1626 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; 1627 1628 /* 1629 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 
 */
static int __meminit __early_pfn_to_nid(unsigned long pfn,
					struct mminit_pfnnid_cache *state)
{
	unsigned long start_pfn, end_pfn;
	int nid;

	if (state->last_start <= pfn && pfn < state->last_end)
		return state->last_nid;

	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
	if (nid != NUMA_NO_NODE) {
		state->last_start = start_pfn;
		state->last_end = end_pfn;
		state->last_nid = nid;
	}

	return nid;
}

int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	spin_lock(&early_pfn_lock);
	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = first_online_node;
	spin_unlock(&early_pfn_lock);

	return nid;
}
#endif /* CONFIG_NUMA */

void __init memblock_free_pages(struct page *page, unsigned long pfn,
							unsigned int order)
{
	if (early_page_uninitialised(pfn))
		return;
	__free_pages_core(page, order);
}

/*
 * Check that the whole (or a subset) of a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with compaction's migration or free scanner.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zone's range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
1687 */ 1688 struct page *__pageblock_pfn_to_page(unsigned long start_pfn, 1689 unsigned long end_pfn, struct zone *zone) 1690 { 1691 struct page *start_page; 1692 struct page *end_page; 1693 1694 /* end_pfn is one past the range we are checking */ 1695 end_pfn--; 1696 1697 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) 1698 return NULL; 1699 1700 start_page = pfn_to_online_page(start_pfn); 1701 if (!start_page) 1702 return NULL; 1703 1704 if (page_zone(start_page) != zone) 1705 return NULL; 1706 1707 end_page = pfn_to_page(end_pfn); 1708 1709 /* This gives a shorter code than deriving page_zone(end_page) */ 1710 if (page_zone_id(start_page) != page_zone_id(end_page)) 1711 return NULL; 1712 1713 return start_page; 1714 } 1715 1716 void set_zone_contiguous(struct zone *zone) 1717 { 1718 unsigned long block_start_pfn = zone->zone_start_pfn; 1719 unsigned long block_end_pfn; 1720 1721 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); 1722 for (; block_start_pfn < zone_end_pfn(zone); 1723 block_start_pfn = block_end_pfn, 1724 block_end_pfn += pageblock_nr_pages) { 1725 1726 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); 1727 1728 if (!__pageblock_pfn_to_page(block_start_pfn, 1729 block_end_pfn, zone)) 1730 return; 1731 cond_resched(); 1732 } 1733 1734 /* We confirm that there is no hole */ 1735 zone->contiguous = true; 1736 } 1737 1738 void clear_zone_contiguous(struct zone *zone) 1739 { 1740 zone->contiguous = false; 1741 } 1742 1743 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1744 static void __init deferred_free_range(unsigned long pfn, 1745 unsigned long nr_pages) 1746 { 1747 struct page *page; 1748 unsigned long i; 1749 1750 if (!nr_pages) 1751 return; 1752 1753 page = pfn_to_page(pfn); 1754 1755 /* Free a large naturally-aligned chunk if possible */ 1756 if (nr_pages == pageblock_nr_pages && 1757 (pfn & (pageblock_nr_pages - 1)) == 0) { 1758 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1759 __free_pages_core(page, pageblock_order); 1760 return; 1761 } 1762 1763 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1764 if ((pfn & (pageblock_nr_pages - 1)) == 0) 1765 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1766 __free_pages_core(page, 0); 1767 } 1768 } 1769 1770 /* Completion tracking for deferred_init_memmap() threads */ 1771 static atomic_t pgdat_init_n_undone __initdata; 1772 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); 1773 1774 static inline void __init pgdat_init_report_one_done(void) 1775 { 1776 if (atomic_dec_and_test(&pgdat_init_n_undone)) 1777 complete(&pgdat_init_all_done_comp); 1778 } 1779 1780 /* 1781 * Returns true if page needs to be initialized or freed to buddy allocator. 1782 * 1783 * First we check if pfn is valid on architectures where it is possible to have 1784 * holes within pageblock_nr_pages. On systems where it is not possible, this 1785 * function is optimized out. 1786 * 1787 * Then, we check if a current large page is valid by only checking the validity 1788 * of the head pfn. 1789 */ 1790 static inline bool __init deferred_pfn_valid(unsigned long pfn) 1791 { 1792 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) 1793 return false; 1794 return true; 1795 } 1796 1797 /* 1798 * Free pages to buddy allocator. Try to free aligned pages in 1799 * pageblock_nr_pages sizes. 
1800 */ 1801 static void __init deferred_free_pages(unsigned long pfn, 1802 unsigned long end_pfn) 1803 { 1804 unsigned long nr_pgmask = pageblock_nr_pages - 1; 1805 unsigned long nr_free = 0; 1806 1807 for (; pfn < end_pfn; pfn++) { 1808 if (!deferred_pfn_valid(pfn)) { 1809 deferred_free_range(pfn - nr_free, nr_free); 1810 nr_free = 0; 1811 } else if (!(pfn & nr_pgmask)) { 1812 deferred_free_range(pfn - nr_free, nr_free); 1813 nr_free = 1; 1814 } else { 1815 nr_free++; 1816 } 1817 } 1818 /* Free the last block of pages to allocator */ 1819 deferred_free_range(pfn - nr_free, nr_free); 1820 } 1821 1822 /* 1823 * Initialize struct pages. We minimize pfn page lookups and scheduler checks 1824 * by performing it only once every pageblock_nr_pages. 1825 * Return number of pages initialized. 1826 */ 1827 static unsigned long __init deferred_init_pages(struct zone *zone, 1828 unsigned long pfn, 1829 unsigned long end_pfn) 1830 { 1831 unsigned long nr_pgmask = pageblock_nr_pages - 1; 1832 int nid = zone_to_nid(zone); 1833 unsigned long nr_pages = 0; 1834 int zid = zone_idx(zone); 1835 struct page *page = NULL; 1836 1837 for (; pfn < end_pfn; pfn++) { 1838 if (!deferred_pfn_valid(pfn)) { 1839 page = NULL; 1840 continue; 1841 } else if (!page || !(pfn & nr_pgmask)) { 1842 page = pfn_to_page(pfn); 1843 } else { 1844 page++; 1845 } 1846 __init_single_page(page, pfn, zid, nid); 1847 nr_pages++; 1848 } 1849 return (nr_pages); 1850 } 1851 1852 /* 1853 * This function is meant to pre-load the iterator for the zone init. 1854 * Specifically it walks through the ranges until we are caught up to the 1855 * first_init_pfn value and exits there. If we never encounter the value we 1856 * return false indicating there are no valid ranges left. 1857 */ 1858 static bool __init 1859 deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, 1860 unsigned long *spfn, unsigned long *epfn, 1861 unsigned long first_init_pfn) 1862 { 1863 u64 j; 1864 1865 /* 1866 * Start out by walking through the ranges in this zone that have 1867 * already been initialized. We don't need to do anything with them 1868 * so we just need to flush them out of the system. 1869 */ 1870 for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { 1871 if (*epfn <= first_init_pfn) 1872 continue; 1873 if (*spfn < first_init_pfn) 1874 *spfn = first_init_pfn; 1875 *i = j; 1876 return true; 1877 } 1878 1879 return false; 1880 } 1881 1882 /* 1883 * Initialize and free pages. We do it in two loops: first we initialize 1884 * struct page, then free to buddy allocator, because while we are 1885 * freeing pages we can access pages that are ahead (computing buddy 1886 * page in __free_one_page()). 1887 * 1888 * In order to try and keep some memory in the cache we have the loop 1889 * broken along max page order boundaries. This way we will not cause 1890 * any issues with the buddy page computation. 
1891 */ 1892 static unsigned long __init 1893 deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, 1894 unsigned long *end_pfn) 1895 { 1896 unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); 1897 unsigned long spfn = *start_pfn, epfn = *end_pfn; 1898 unsigned long nr_pages = 0; 1899 u64 j = *i; 1900 1901 /* First we loop through and initialize the page values */ 1902 for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { 1903 unsigned long t; 1904 1905 if (mo_pfn <= *start_pfn) 1906 break; 1907 1908 t = min(mo_pfn, *end_pfn); 1909 nr_pages += deferred_init_pages(zone, *start_pfn, t); 1910 1911 if (mo_pfn < *end_pfn) { 1912 *start_pfn = mo_pfn; 1913 break; 1914 } 1915 } 1916 1917 /* Reset values and now loop through freeing pages as needed */ 1918 swap(j, *i); 1919 1920 for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { 1921 unsigned long t; 1922 1923 if (mo_pfn <= spfn) 1924 break; 1925 1926 t = min(mo_pfn, epfn); 1927 deferred_free_pages(spfn, t); 1928 1929 if (mo_pfn <= epfn) 1930 break; 1931 } 1932 1933 return nr_pages; 1934 } 1935 1936 static void __init 1937 deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, 1938 void *arg) 1939 { 1940 unsigned long spfn, epfn; 1941 struct zone *zone = arg; 1942 u64 i; 1943 1944 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); 1945 1946 /* 1947 * Initialize and free pages in MAX_ORDER sized increments so that we 1948 * can avoid introducing any issues with the buddy allocator. 1949 */ 1950 while (spfn < end_pfn) { 1951 deferred_init_maxorder(&i, zone, &spfn, &epfn); 1952 cond_resched(); 1953 } 1954 } 1955 1956 /* An arch may override for more concurrency. */ 1957 __weak int __init 1958 deferred_page_init_max_threads(const struct cpumask *node_cpumask) 1959 { 1960 return 1; 1961 } 1962 1963 /* Initialise remaining memory on a node */ 1964 static int __init deferred_init_memmap(void *data) 1965 { 1966 pg_data_t *pgdat = data; 1967 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1968 unsigned long spfn = 0, epfn = 0; 1969 unsigned long first_init_pfn, flags; 1970 unsigned long start = jiffies; 1971 struct zone *zone; 1972 int zid, max_threads; 1973 u64 i; 1974 1975 /* Bind memory initialisation thread to a local node if possible */ 1976 if (!cpumask_empty(cpumask)) 1977 set_cpus_allowed_ptr(current, cpumask); 1978 1979 pgdat_resize_lock(pgdat, &flags); 1980 first_init_pfn = pgdat->first_deferred_pfn; 1981 if (first_init_pfn == ULONG_MAX) { 1982 pgdat_resize_unlock(pgdat, &flags); 1983 pgdat_init_report_one_done(); 1984 return 0; 1985 } 1986 1987 /* Sanity check boundaries */ 1988 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); 1989 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); 1990 pgdat->first_deferred_pfn = ULONG_MAX; 1991 1992 /* 1993 * Once we unlock here, the zone cannot be grown anymore, thus if an 1994 * interrupt thread must allocate this early in boot, zone must be 1995 * pre-grown prior to start of deferred page initialization. 
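 *
 * Editor's note: first_deferred_pfn was reset to ULONG_MAX above, so once
 * this lock is dropped the on-demand path in _deferred_grow_zone() (further
 * below) backs off and this thread becomes the only initializer. The
 * remaining [spfn, epfn) ranges are then handed to padata_do_multithreaded()
 * in section-aligned chunks (.align and .min_chunk are PAGES_PER_SECTION),
 * so each worker started per deferred_page_init_max_threads() always covers
 * whole memory sections, for example 32768 pages (128 MiB with 4 KiB pages)
 * per section on a typical x86-64 configuration.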
1996 */ 1997 pgdat_resize_unlock(pgdat, &flags); 1998 1999 /* Only the highest zone is deferred so find it */ 2000 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2001 zone = pgdat->node_zones + zid; 2002 if (first_init_pfn < zone_end_pfn(zone)) 2003 break; 2004 } 2005 2006 /* If the zone is empty somebody else may have cleared out the zone */ 2007 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 2008 first_init_pfn)) 2009 goto zone_empty; 2010 2011 max_threads = deferred_page_init_max_threads(cpumask); 2012 2013 while (spfn < epfn) { 2014 unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); 2015 struct padata_mt_job job = { 2016 .thread_fn = deferred_init_memmap_chunk, 2017 .fn_arg = zone, 2018 .start = spfn, 2019 .size = epfn_align - spfn, 2020 .align = PAGES_PER_SECTION, 2021 .min_chunk = PAGES_PER_SECTION, 2022 .max_threads = max_threads, 2023 }; 2024 2025 padata_do_multithreaded(&job); 2026 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 2027 epfn_align); 2028 } 2029 zone_empty: 2030 /* Sanity check that the next zone really is unpopulated */ 2031 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); 2032 2033 pr_info("node %d deferred pages initialised in %ums\n", 2034 pgdat->node_id, jiffies_to_msecs(jiffies - start)); 2035 2036 pgdat_init_report_one_done(); 2037 return 0; 2038 } 2039 2040 /* 2041 * If this zone has deferred pages, try to grow it by initializing enough 2042 * deferred pages to satisfy the allocation specified by order, rounded up to 2043 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments 2044 * of SECTION_SIZE bytes by initializing struct pages in increments of 2045 * PAGES_PER_SECTION * sizeof(struct page) bytes. 2046 * 2047 * Return true when zone was grown, otherwise return false. We return true even 2048 * when we grow less than requested, to let the caller decide if there are 2049 * enough pages to satisfy the allocation. 2050 * 2051 * Note: We use noinline because this function is needed only during boot, and 2052 * it is called from a __ref function _deferred_grow_zone. This way we are 2053 * making sure that it is not inlined into permanent text section. 2054 */ 2055 static noinline bool __init 2056 deferred_grow_zone(struct zone *zone, unsigned int order) 2057 { 2058 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); 2059 pg_data_t *pgdat = zone->zone_pgdat; 2060 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; 2061 unsigned long spfn, epfn, flags; 2062 unsigned long nr_pages = 0; 2063 u64 i; 2064 2065 /* Only the last zone may have deferred pages */ 2066 if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) 2067 return false; 2068 2069 pgdat_resize_lock(pgdat, &flags); 2070 2071 /* 2072 * If someone grew this zone while we were waiting for spinlock, return 2073 * true, as there might be enough pages already. 2074 */ 2075 if (first_deferred_pfn != pgdat->first_deferred_pfn) { 2076 pgdat_resize_unlock(pgdat, &flags); 2077 return true; 2078 } 2079 2080 /* If the zone is empty somebody else may have cleared out the zone */ 2081 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 2082 first_deferred_pfn)) { 2083 pgdat->first_deferred_pfn = ULONG_MAX; 2084 pgdat_resize_unlock(pgdat, &flags); 2085 /* Retry only once. */ 2086 return first_deferred_pfn != ULONG_MAX; 2087 } 2088 2089 /* 2090 * Initialize and free pages in MAX_ORDER sized increments so 2091 * that we can avoid introducing any issues with the buddy 2092 * allocator. 
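 *
 * Editor's illustration (not part of the kernel sources): an order-3 request
 * (8 pages) arriving here has already been rounded up above to
 * nr_pages_needed = ALIGN(8, PAGES_PER_SECTION), i.e. one full section
 * (32768 pages on a typical x86-64 configuration). The loop below keeps
 * calling deferred_init_maxorder() and refuses to stop while a step stayed
 * within a single section; only after crossing a section boundary is the
 * nr_pages quota actually checked and the loop allowed to break.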
2093 */ 2094 while (spfn < epfn) { 2095 /* update our first deferred PFN for this section */ 2096 first_deferred_pfn = spfn; 2097 2098 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); 2099 touch_nmi_watchdog(); 2100 2101 /* We should only stop along section boundaries */ 2102 if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) 2103 continue; 2104 2105 /* If our quota has been met we can stop here */ 2106 if (nr_pages >= nr_pages_needed) 2107 break; 2108 } 2109 2110 pgdat->first_deferred_pfn = spfn; 2111 pgdat_resize_unlock(pgdat, &flags); 2112 2113 return nr_pages > 0; 2114 } 2115 2116 /* 2117 * deferred_grow_zone() is __init, but it is called from 2118 * get_page_from_freelist() during early boot until deferred_pages permanently 2119 * disables this call. This is why we have refdata wrapper to avoid warning, 2120 * and to ensure that the function body gets unloaded. 2121 */ 2122 static bool __ref 2123 _deferred_grow_zone(struct zone *zone, unsigned int order) 2124 { 2125 return deferred_grow_zone(zone, order); 2126 } 2127 2128 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 2129 2130 void __init page_alloc_init_late(void) 2131 { 2132 struct zone *zone; 2133 int nid; 2134 2135 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 2136 2137 /* There will be num_node_state(N_MEMORY) threads */ 2138 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); 2139 for_each_node_state(nid, N_MEMORY) { 2140 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); 2141 } 2142 2143 /* Block until all are initialised */ 2144 wait_for_completion(&pgdat_init_all_done_comp); 2145 2146 /* 2147 * We initialized the rest of the deferred pages. Permanently disable 2148 * on-demand struct page initialization. 2149 */ 2150 static_branch_disable(&deferred_pages); 2151 2152 /* Reinit limits that are based on free pages after the kernel is up */ 2153 files_maxfiles_init(); 2154 #endif 2155 2156 buffer_init(); 2157 2158 /* Discard memblock private memory */ 2159 memblock_discard(); 2160 2161 for_each_node_state(nid, N_MEMORY) 2162 shuffle_free_memory(NODE_DATA(nid)); 2163 2164 for_each_populated_zone(zone) 2165 set_zone_contiguous(zone); 2166 } 2167 2168 #ifdef CONFIG_CMA 2169 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 2170 void __init init_cma_reserved_pageblock(struct page *page) 2171 { 2172 unsigned i = pageblock_nr_pages; 2173 struct page *p = page; 2174 2175 do { 2176 __ClearPageReserved(p); 2177 set_page_count(p, 0); 2178 } while (++p, --i); 2179 2180 set_pageblock_migratetype(page, MIGRATE_CMA); 2181 set_page_refcounted(page); 2182 __free_pages(page, pageblock_order); 2183 2184 adjust_managed_page_count(page, pageblock_nr_pages); 2185 page_zone(page)->cma_pages += pageblock_nr_pages; 2186 } 2187 #endif 2188 2189 /* 2190 * The order of subdivision here is critical for the IO subsystem. 2191 * Please do not alter this order without good reasons and regression 2192 * testing. Specifically, as large blocks of memory are subdivided, 2193 * the order in which smaller blocks are delivered depends on the order 2194 * they're subdivided in this function. This is the primary factor 2195 * influencing the order in which pages are delivered to the IO 2196 * subsystem according to empirical testing, and this is also justified 2197 * by considering the behavior of a buddy system containing a single 2198 * large block of memory acted on by a series of small allocations. 2199 * This behavior is a critical factor in sglist merging's success. 
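 *
 * Editor's illustration (not part of the kernel sources): expand() below,
 * called with low == 0 and high == 3, hands back the unused halves while
 * walking the orders downwards:
 *
 *	start:	size = 8, page[0..7] detached as one order-3 block
 *	step 1:	high = 2, size = 4, page[4..7] goes back as an order-2 buddy
 *	step 2:	high = 1, size = 2, page[2..3] goes back as an order-1 buddy
 *	step 3:	high = 0, size = 1, page[1] goes back as an order-0 buddy
 *
 * leaving page[0] as the order-0 page the caller actually allocates. The
 * halves are queued largest first, which is the delivery order the text
 * above is concerned with.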
2200 * 2201 * -- nyc 2202 */ 2203 static inline void expand(struct zone *zone, struct page *page, 2204 int low, int high, int migratetype) 2205 { 2206 unsigned long size = 1 << high; 2207 2208 while (high > low) { 2209 high--; 2210 size >>= 1; 2211 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 2212 2213 /* 2214 * Mark as guard pages (or page), that will allow to 2215 * merge back to allocator when buddy will be freed. 2216 * Corresponding page table entries will not be touched, 2217 * pages will stay not present in virtual address space 2218 */ 2219 if (set_page_guard(zone, &page[size], high, migratetype)) 2220 continue; 2221 2222 add_to_free_list(&page[size], zone, high, migratetype); 2223 set_buddy_order(&page[size], high); 2224 } 2225 } 2226 2227 static void check_new_page_bad(struct page *page) 2228 { 2229 if (unlikely(page->flags & __PG_HWPOISON)) { 2230 /* Don't complain about hwpoisoned pages */ 2231 page_mapcount_reset(page); /* remove PageBuddy */ 2232 return; 2233 } 2234 2235 bad_page(page, 2236 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); 2237 } 2238 2239 /* 2240 * This page is about to be returned from the page allocator 2241 */ 2242 static inline int check_new_page(struct page *page) 2243 { 2244 if (likely(page_expected_state(page, 2245 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) 2246 return 0; 2247 2248 check_new_page_bad(page); 2249 return 1; 2250 } 2251 2252 static bool check_new_pages(struct page *page, unsigned int order) 2253 { 2254 int i; 2255 for (i = 0; i < (1 << order); i++) { 2256 struct page *p = page + i; 2257 2258 if (unlikely(check_new_page(p))) 2259 return true; 2260 } 2261 2262 return false; 2263 } 2264 2265 #ifdef CONFIG_DEBUG_VM 2266 /* 2267 * With DEBUG_VM enabled, order-0 pages are checked for expected state when 2268 * being allocated from pcp lists. With debug_pagealloc also enabled, they are 2269 * also checked when pcp lists are refilled from the free lists. 2270 */ 2271 static inline bool check_pcp_refill(struct page *page, unsigned int order) 2272 { 2273 if (debug_pagealloc_enabled_static()) 2274 return check_new_pages(page, order); 2275 else 2276 return false; 2277 } 2278 2279 static inline bool check_new_pcp(struct page *page, unsigned int order) 2280 { 2281 return check_new_pages(page, order); 2282 } 2283 #else 2284 /* 2285 * With DEBUG_VM disabled, free order-0 pages are checked for expected state 2286 * when pcp lists are being refilled from the free lists. With debug_pagealloc 2287 * enabled, they are also checked when being allocated from the pcp lists. 2288 */ 2289 static inline bool check_pcp_refill(struct page *page, unsigned int order) 2290 { 2291 return check_new_pages(page, order); 2292 } 2293 static inline bool check_new_pcp(struct page *page, unsigned int order) 2294 { 2295 if (debug_pagealloc_enabled_static()) 2296 return check_new_pages(page, order); 2297 else 2298 return false; 2299 } 2300 #endif /* CONFIG_DEBUG_VM */ 2301 2302 static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) 2303 { 2304 /* Don't skip if a software KASAN mode is enabled. */ 2305 if (IS_ENABLED(CONFIG_KASAN_GENERIC) || 2306 IS_ENABLED(CONFIG_KASAN_SW_TAGS)) 2307 return false; 2308 2309 /* Skip, if hardware tag-based KASAN is not enabled. */ 2310 if (!kasan_hw_tags_enabled()) 2311 return true; 2312 2313 /* 2314 * With hardware tag-based KASAN enabled, skip if either: 2315 * 2316 * 1. Memory tags have already been cleared via tag_clear_highpage(). 2317 * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON. 
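 *
 * Editor's summary of the check below (hardware tag-based mode only, the
 * software modes never skip):
 *
 *	init_tags	__GFP_SKIP_KASAN_UNPOISON	unpoison skipped?
 *	true		don't care			yes
 *	false		set				yes
 *	false		clear				no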
2318 */ 2319 return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); 2320 } 2321 2322 static inline bool should_skip_init(gfp_t flags) 2323 { 2324 /* Don't skip, if hardware tag-based KASAN is not enabled. */ 2325 if (!kasan_hw_tags_enabled()) 2326 return false; 2327 2328 /* For hardware tag-based KASAN, skip if requested. */ 2329 return (flags & __GFP_SKIP_ZERO); 2330 } 2331 2332 inline void post_alloc_hook(struct page *page, unsigned int order, 2333 gfp_t gfp_flags) 2334 { 2335 bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && 2336 !should_skip_init(gfp_flags); 2337 bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); 2338 2339 set_page_private(page, 0); 2340 set_page_refcounted(page); 2341 2342 arch_alloc_page(page, order); 2343 debug_pagealloc_map_pages(page, 1 << order); 2344 2345 /* 2346 * Page unpoisoning must happen before memory initialization. 2347 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO 2348 * allocations and the page unpoisoning code will complain. 2349 */ 2350 kernel_unpoison_pages(page, 1 << order); 2351 2352 /* 2353 * As memory initialization might be integrated into KASAN, 2354 * KASAN unpoisoning and memory initializion code must be 2355 * kept together to avoid discrepancies in behavior. 2356 */ 2357 2358 /* 2359 * If memory tags should be zeroed (which happens only when memory 2360 * should be initialized as well). 2361 */ 2362 if (init_tags) { 2363 int i; 2364 2365 /* Initialize both memory and tags. */ 2366 for (i = 0; i != 1 << order; ++i) 2367 tag_clear_highpage(page + i); 2368 2369 /* Note that memory is already initialized by the loop above. */ 2370 init = false; 2371 } 2372 if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) { 2373 /* Unpoison shadow memory or set memory tags. */ 2374 kasan_unpoison_pages(page, order, init); 2375 2376 /* Note that memory is already initialized by KASAN. */ 2377 if (kasan_has_integrated_init()) 2378 init = false; 2379 } 2380 /* If memory is still not initialized, do it now. */ 2381 if (init) 2382 kernel_init_free_pages(page, 1 << order); 2383 /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ 2384 if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) 2385 SetPageSkipKASanPoison(page); 2386 2387 set_page_owner(page, order, gfp_flags); 2388 page_table_check_alloc(page, order); 2389 } 2390 2391 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 2392 unsigned int alloc_flags) 2393 { 2394 post_alloc_hook(page, order, gfp_flags); 2395 2396 if (order && (gfp_flags & __GFP_COMP)) 2397 prep_compound_page(page, order); 2398 2399 /* 2400 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to 2401 * allocate the page. The expectation is that the caller is taking 2402 * steps that will free more memory. The caller should avoid the page 2403 * being used for !PFMEMALLOC purposes. 
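 *
 * Editor's note: consumers can test for this with page_is_pfmemalloc(); the
 * network stack, for example, propagates the flag into skbs so that such
 * memory only ends up servicing memory-reclaim-related sockets rather than
 * ordinary traffic.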
2404 */ 2405 if (alloc_flags & ALLOC_NO_WATERMARKS) 2406 set_page_pfmemalloc(page); 2407 else 2408 clear_page_pfmemalloc(page); 2409 } 2410 2411 /* 2412 * Go through the free lists for the given migratetype and remove 2413 * the smallest available page from the freelists 2414 */ 2415 static __always_inline 2416 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 2417 int migratetype) 2418 { 2419 unsigned int current_order; 2420 struct free_area *area; 2421 struct page *page; 2422 2423 /* Find a page of the appropriate size in the preferred list */ 2424 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 2425 area = &(zone->free_area[current_order]); 2426 page = get_page_from_free_area(area, migratetype); 2427 if (!page) 2428 continue; 2429 del_page_from_free_list(page, zone, current_order); 2430 expand(zone, page, order, current_order, migratetype); 2431 set_pcppage_migratetype(page, migratetype); 2432 return page; 2433 } 2434 2435 return NULL; 2436 } 2437 2438 2439 /* 2440 * This array describes the order lists are fallen back to when 2441 * the free lists for the desirable migrate type are depleted 2442 * 2443 * The other migratetypes do not have fallbacks. 2444 */ 2445 static int fallbacks[MIGRATE_TYPES][3] = { 2446 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 2447 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, 2448 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 2449 }; 2450 2451 #ifdef CONFIG_CMA 2452 static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, 2453 unsigned int order) 2454 { 2455 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 2456 } 2457 #else 2458 static inline struct page *__rmqueue_cma_fallback(struct zone *zone, 2459 unsigned int order) { return NULL; } 2460 #endif 2461 2462 /* 2463 * Move the free pages in a range to the freelist tail of the requested type. 2464 * Note that start_page and end_pages are not aligned on a pageblock 2465 * boundary. If alignment is required, use move_freepages_block() 2466 */ 2467 static int move_freepages(struct zone *zone, 2468 unsigned long start_pfn, unsigned long end_pfn, 2469 int migratetype, int *num_movable) 2470 { 2471 struct page *page; 2472 unsigned long pfn; 2473 unsigned int order; 2474 int pages_moved = 0; 2475 2476 for (pfn = start_pfn; pfn <= end_pfn;) { 2477 page = pfn_to_page(pfn); 2478 if (!PageBuddy(page)) { 2479 /* 2480 * We assume that pages that could be isolated for 2481 * migration are movable. But we don't actually try 2482 * isolating, as that would be expensive. 
2483 */ 2484 if (num_movable && 2485 (PageLRU(page) || __PageMovable(page))) 2486 (*num_movable)++; 2487 pfn++; 2488 continue; 2489 } 2490 2491 /* Make sure we are not inadvertently changing nodes */ 2492 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); 2493 VM_BUG_ON_PAGE(page_zone(page) != zone, page); 2494 2495 order = buddy_order(page); 2496 move_to_free_list(page, zone, order, migratetype); 2497 pfn += 1 << order; 2498 pages_moved += 1 << order; 2499 } 2500 2501 return pages_moved; 2502 } 2503 2504 int move_freepages_block(struct zone *zone, struct page *page, 2505 int migratetype, int *num_movable) 2506 { 2507 unsigned long start_pfn, end_pfn, pfn; 2508 2509 if (num_movable) 2510 *num_movable = 0; 2511 2512 pfn = page_to_pfn(page); 2513 start_pfn = pfn & ~(pageblock_nr_pages - 1); 2514 end_pfn = start_pfn + pageblock_nr_pages - 1; 2515 2516 /* Do not cross zone boundaries */ 2517 if (!zone_spans_pfn(zone, start_pfn)) 2518 start_pfn = pfn; 2519 if (!zone_spans_pfn(zone, end_pfn)) 2520 return 0; 2521 2522 return move_freepages(zone, start_pfn, end_pfn, migratetype, 2523 num_movable); 2524 } 2525 2526 static void change_pageblock_range(struct page *pageblock_page, 2527 int start_order, int migratetype) 2528 { 2529 int nr_pageblocks = 1 << (start_order - pageblock_order); 2530 2531 while (nr_pageblocks--) { 2532 set_pageblock_migratetype(pageblock_page, migratetype); 2533 pageblock_page += pageblock_nr_pages; 2534 } 2535 } 2536 2537 /* 2538 * When we are falling back to another migratetype during allocation, try to 2539 * steal extra free pages from the same pageblocks to satisfy further 2540 * allocations, instead of polluting multiple pageblocks. 2541 * 2542 * If we are stealing a relatively large buddy page, it is likely there will 2543 * be more free pages in the pageblock, so try to steal them all. For 2544 * reclaimable and unmovable allocations, we steal regardless of page size, 2545 * as fragmentation caused by those allocations polluting movable pageblocks 2546 * is worse than movable allocations stealing from unmovable and reclaimable 2547 * pageblocks. 2548 */ 2549 static bool can_steal_fallback(unsigned int order, int start_mt) 2550 { 2551 /* 2552 * Leaving this order check is intended, although there is 2553 * relaxed order check in next check. The reason is that 2554 * we can actually steal whole pageblock if this condition met, 2555 * but, below check doesn't guarantee it and that is just heuristic 2556 * so could be changed anytime. 2557 */ 2558 if (order >= pageblock_order) 2559 return true; 2560 2561 if (order >= pageblock_order / 2 || 2562 start_mt == MIGRATE_RECLAIMABLE || 2563 start_mt == MIGRATE_UNMOVABLE || 2564 page_group_by_mobility_disabled) 2565 return true; 2566 2567 return false; 2568 } 2569 2570 static inline bool boost_watermark(struct zone *zone) 2571 { 2572 unsigned long max_boost; 2573 2574 if (!watermark_boost_factor) 2575 return false; 2576 /* 2577 * Don't bother in zones that are unlikely to produce results. 2578 * On small machines, including kdump capture kernels running 2579 * in a small area, boosting the watermark can cause an out of 2580 * memory situation immediately. 2581 */ 2582 if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) 2583 return false; 2584 2585 max_boost = mult_frac(zone->_watermark[WMARK_HIGH], 2586 watermark_boost_factor, 10000); 2587 2588 /* 2589 * high watermark may be uninitialised if fragmentation occurs 2590 * very early in boot so do not boost. 
We do not fall 2591 * through and boost by pageblock_nr_pages as failing 2592 * allocations that early means that reclaim is not going 2593 * to help and it may even be impossible to reclaim the 2594 * boosted watermark resulting in a hang. 2595 */ 2596 if (!max_boost) 2597 return false; 2598 2599 max_boost = max(pageblock_nr_pages, max_boost); 2600 2601 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, 2602 max_boost); 2603 2604 return true; 2605 } 2606 2607 /* 2608 * This function implements actual steal behaviour. If order is large enough, 2609 * we can steal whole pageblock. If not, we first move freepages in this 2610 * pageblock to our migratetype and determine how many already-allocated pages 2611 * are there in the pageblock with a compatible migratetype. If at least half 2612 * of pages are free or compatible, we can change migratetype of the pageblock 2613 * itself, so pages freed in the future will be put on the correct free list. 2614 */ 2615 static void steal_suitable_fallback(struct zone *zone, struct page *page, 2616 unsigned int alloc_flags, int start_type, bool whole_block) 2617 { 2618 unsigned int current_order = buddy_order(page); 2619 int free_pages, movable_pages, alike_pages; 2620 int old_block_type; 2621 2622 old_block_type = get_pageblock_migratetype(page); 2623 2624 /* 2625 * This can happen due to races and we want to prevent broken 2626 * highatomic accounting. 2627 */ 2628 if (is_migrate_highatomic(old_block_type)) 2629 goto single_page; 2630 2631 /* Take ownership for orders >= pageblock_order */ 2632 if (current_order >= pageblock_order) { 2633 change_pageblock_range(page, current_order, start_type); 2634 goto single_page; 2635 } 2636 2637 /* 2638 * Boost watermarks to increase reclaim pressure to reduce the 2639 * likelihood of future fallbacks. Wake kswapd now as the node 2640 * may be balanced overall and kswapd will not wake naturally. 2641 */ 2642 if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) 2643 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 2644 2645 /* We are not allowed to try stealing from the whole block */ 2646 if (!whole_block) 2647 goto single_page; 2648 2649 free_pages = move_freepages_block(zone, page, start_type, 2650 &movable_pages); 2651 /* 2652 * Determine how many pages are compatible with our allocation. 2653 * For movable allocation, it's the number of movable pages which 2654 * we just obtained. For other types it's a bit more tricky. 2655 */ 2656 if (start_type == MIGRATE_MOVABLE) { 2657 alike_pages = movable_pages; 2658 } else { 2659 /* 2660 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation 2661 * to MOVABLE pageblock, consider all non-movable pages as 2662 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or 2663 * vice versa, be conservative since we can't distinguish the 2664 * exact migratetype of non-movable pages. 2665 */ 2666 if (old_block_type == MIGRATE_MOVABLE) 2667 alike_pages = pageblock_nr_pages 2668 - (free_pages + movable_pages); 2669 else 2670 alike_pages = 0; 2671 } 2672 2673 /* moving whole block can fail due to zone boundary conditions */ 2674 if (!free_pages) 2675 goto single_page; 2676 2677 /* 2678 * If a sufficient number of pages in the block are either free or of 2679 * comparable migratability as our allocation, claim the whole block. 
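 *
 * Editor's illustration (not part of the kernel sources): with
 * pageblock_order == 9 the threshold below is 1 << 8 == 256 pages, half of
 * a 512-page pageblock. So if free_pages + alike_pages reaches 256 (or page
 * grouping by mobility is disabled altogether), the pageblock's migratetype
 * is rewritten to start_type and future frees in it land on the desired
 * free list.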
2680 */ 2681 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || 2682 page_group_by_mobility_disabled) 2683 set_pageblock_migratetype(page, start_type); 2684 2685 return; 2686 2687 single_page: 2688 move_to_free_list(page, zone, current_order, start_type); 2689 } 2690 2691 /* 2692 * Check whether there is a suitable fallback freepage with requested order. 2693 * If only_stealable is true, this function returns fallback_mt only if 2694 * we can steal other freepages all together. This would help to reduce 2695 * fragmentation due to mixed migratetype pages in one pageblock. 2696 */ 2697 int find_suitable_fallback(struct free_area *area, unsigned int order, 2698 int migratetype, bool only_stealable, bool *can_steal) 2699 { 2700 int i; 2701 int fallback_mt; 2702 2703 if (area->nr_free == 0) 2704 return -1; 2705 2706 *can_steal = false; 2707 for (i = 0;; i++) { 2708 fallback_mt = fallbacks[migratetype][i]; 2709 if (fallback_mt == MIGRATE_TYPES) 2710 break; 2711 2712 if (free_area_empty(area, fallback_mt)) 2713 continue; 2714 2715 if (can_steal_fallback(order, migratetype)) 2716 *can_steal = true; 2717 2718 if (!only_stealable) 2719 return fallback_mt; 2720 2721 if (*can_steal) 2722 return fallback_mt; 2723 } 2724 2725 return -1; 2726 } 2727 2728 /* 2729 * Reserve a pageblock for exclusive use of high-order atomic allocations if 2730 * there are no empty page blocks that contain a page with a suitable order 2731 */ 2732 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, 2733 unsigned int alloc_order) 2734 { 2735 int mt; 2736 unsigned long max_managed, flags; 2737 2738 /* 2739 * Limit the number reserved to 1 pageblock or roughly 1% of a zone. 2740 * Check is race-prone but harmless. 2741 */ 2742 max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; 2743 if (zone->nr_reserved_highatomic >= max_managed) 2744 return; 2745 2746 spin_lock_irqsave(&zone->lock, flags); 2747 2748 /* Recheck the nr_reserved_highatomic limit under the lock */ 2749 if (zone->nr_reserved_highatomic >= max_managed) 2750 goto out_unlock; 2751 2752 /* Yoink! */ 2753 mt = get_pageblock_migratetype(page); 2754 /* Only reserve normal pageblocks (i.e., they can merge with others) */ 2755 if (migratetype_is_mergeable(mt)) { 2756 zone->nr_reserved_highatomic += pageblock_nr_pages; 2757 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); 2758 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); 2759 } 2760 2761 out_unlock: 2762 spin_unlock_irqrestore(&zone->lock, flags); 2763 } 2764 2765 /* 2766 * Used when an allocation is about to fail under memory pressure. This 2767 * potentially hurts the reliability of high-order allocations when under 2768 * intense memory pressure but failed atomic allocations should be easier 2769 * to recover from than an OOM. 2770 * 2771 * If @force is true, try to unreserve a pageblock even though highatomic 2772 * pageblock is exhausted. 2773 */ 2774 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, 2775 bool force) 2776 { 2777 struct zonelist *zonelist = ac->zonelist; 2778 unsigned long flags; 2779 struct zoneref *z; 2780 struct zone *zone; 2781 struct page *page; 2782 int order; 2783 bool ret; 2784 2785 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, 2786 ac->nodemask) { 2787 /* 2788 * Preserve at least one pageblock unless memory pressure 2789 * is really high. 
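 *
 * Editor's illustration (not part of the kernel sources): with 512-page
 * pageblocks, a zone holding nr_reserved_highatomic == 512 (a single
 * reserved block) is skipped here unless @force is set; only zones with
 * more than one pageblock's worth of highatomic reserve are eligible for
 * ordinary unreserving.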
2790 */ 2791 if (!force && zone->nr_reserved_highatomic <= 2792 pageblock_nr_pages) 2793 continue; 2794 2795 spin_lock_irqsave(&zone->lock, flags); 2796 for (order = 0; order < MAX_ORDER; order++) { 2797 struct free_area *area = &(zone->free_area[order]); 2798 2799 page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); 2800 if (!page) 2801 continue; 2802 2803 /* 2804 * In page freeing path, migratetype change is racy so 2805 * we can counter several free pages in a pageblock 2806 * in this loop although we changed the pageblock type 2807 * from highatomic to ac->migratetype. So we should 2808 * adjust the count once. 2809 */ 2810 if (is_migrate_highatomic_page(page)) { 2811 /* 2812 * It should never happen but changes to 2813 * locking could inadvertently allow a per-cpu 2814 * drain to add pages to MIGRATE_HIGHATOMIC 2815 * while unreserving so be safe and watch for 2816 * underflows. 2817 */ 2818 zone->nr_reserved_highatomic -= min( 2819 pageblock_nr_pages, 2820 zone->nr_reserved_highatomic); 2821 } 2822 2823 /* 2824 * Convert to ac->migratetype and avoid the normal 2825 * pageblock stealing heuristics. Minimally, the caller 2826 * is doing the work and needs the pages. More 2827 * importantly, if the block was always converted to 2828 * MIGRATE_UNMOVABLE or another type then the number 2829 * of pageblocks that cannot be completely freed 2830 * may increase. 2831 */ 2832 set_pageblock_migratetype(page, ac->migratetype); 2833 ret = move_freepages_block(zone, page, ac->migratetype, 2834 NULL); 2835 if (ret) { 2836 spin_unlock_irqrestore(&zone->lock, flags); 2837 return ret; 2838 } 2839 } 2840 spin_unlock_irqrestore(&zone->lock, flags); 2841 } 2842 2843 return false; 2844 } 2845 2846 /* 2847 * Try finding a free buddy page on the fallback list and put it on the free 2848 * list of requested migratetype, possibly along with other pages from the same 2849 * block, depending on fragmentation avoidance heuristics. Returns true if 2850 * fallback was found so that __rmqueue_smallest() can grab it. 2851 * 2852 * The use of signed ints for order and current_order is a deliberate 2853 * deviation from the rest of this file, to make the for loop 2854 * condition simpler. 2855 */ 2856 static __always_inline bool 2857 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, 2858 unsigned int alloc_flags) 2859 { 2860 struct free_area *area; 2861 int current_order; 2862 int min_order = order; 2863 struct page *page; 2864 int fallback_mt; 2865 bool can_steal; 2866 2867 /* 2868 * Do not steal pages from freelists belonging to other pageblocks 2869 * i.e. orders < pageblock_order. If there are no local zones free, 2870 * the zonelists will be reiterated without ALLOC_NOFRAGMENT. 2871 */ 2872 if (alloc_flags & ALLOC_NOFRAGMENT) 2873 min_order = pageblock_order; 2874 2875 /* 2876 * Find the largest available free page in the other list. This roughly 2877 * approximates finding the pageblock with the most free pages, which 2878 * would be too costly to do exactly. 2879 */ 2880 for (current_order = MAX_ORDER - 1; current_order >= min_order; 2881 --current_order) { 2882 area = &(zone->free_area[current_order]); 2883 fallback_mt = find_suitable_fallback(area, current_order, 2884 start_migratetype, false, &can_steal); 2885 if (fallback_mt == -1) 2886 continue; 2887 2888 /* 2889 * We cannot steal all free pages from the pageblock and the 2890 * requested migratetype is movable. 
In that case it's better to 2891 * steal and split the smallest available page instead of the 2892 * largest available page, because even if the next movable 2893 * allocation falls back into a different pageblock than this 2894 * one, it won't cause permanent fragmentation. 2895 */ 2896 if (!can_steal && start_migratetype == MIGRATE_MOVABLE 2897 && current_order > order) 2898 goto find_smallest; 2899 2900 goto do_steal; 2901 } 2902 2903 return false; 2904 2905 find_smallest: 2906 for (current_order = order; current_order < MAX_ORDER; 2907 current_order++) { 2908 area = &(zone->free_area[current_order]); 2909 fallback_mt = find_suitable_fallback(area, current_order, 2910 start_migratetype, false, &can_steal); 2911 if (fallback_mt != -1) 2912 break; 2913 } 2914 2915 /* 2916 * This should not happen - we already found a suitable fallback 2917 * when looking for the largest page. 2918 */ 2919 VM_BUG_ON(current_order == MAX_ORDER); 2920 2921 do_steal: 2922 page = get_page_from_free_area(area, fallback_mt); 2923 2924 steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, 2925 can_steal); 2926 2927 trace_mm_page_alloc_extfrag(page, order, current_order, 2928 start_migratetype, fallback_mt); 2929 2930 return true; 2931 2932 } 2933 2934 /* 2935 * Do the hard work of removing an element from the buddy allocator. 2936 * Call me with the zone->lock already held. 2937 */ 2938 static __always_inline struct page * 2939 __rmqueue(struct zone *zone, unsigned int order, int migratetype, 2940 unsigned int alloc_flags) 2941 { 2942 struct page *page; 2943 2944 if (IS_ENABLED(CONFIG_CMA)) { 2945 /* 2946 * Balance movable allocations between regular and CMA areas by 2947 * allocating from CMA when over half of the zone's free memory 2948 * is in the CMA area. 2949 */ 2950 if (alloc_flags & ALLOC_CMA && 2951 zone_page_state(zone, NR_FREE_CMA_PAGES) > 2952 zone_page_state(zone, NR_FREE_PAGES) / 2) { 2953 page = __rmqueue_cma_fallback(zone, order); 2954 if (page) 2955 goto out; 2956 } 2957 } 2958 retry: 2959 page = __rmqueue_smallest(zone, order, migratetype); 2960 if (unlikely(!page)) { 2961 if (alloc_flags & ALLOC_CMA) 2962 page = __rmqueue_cma_fallback(zone, order); 2963 2964 if (!page && __rmqueue_fallback(zone, order, migratetype, 2965 alloc_flags)) 2966 goto retry; 2967 } 2968 out: 2969 if (page) 2970 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2971 return page; 2972 } 2973 2974 /* 2975 * Obtain a specified number of elements from the buddy allocator, all under 2976 * a single hold of the lock, for efficiency. Add them to the supplied list. 2977 * Returns the number of new pages which were placed at *list. 2978 */ 2979 static int rmqueue_bulk(struct zone *zone, unsigned int order, 2980 unsigned long count, struct list_head *list, 2981 int migratetype, unsigned int alloc_flags) 2982 { 2983 int i, allocated = 0; 2984 2985 /* 2986 * local_lock_irq held so equivalent to spin_lock_irqsave for 2987 * both PREEMPT_RT and non-PREEMPT_RT configurations. 2988 */ 2989 spin_lock(&zone->lock); 2990 for (i = 0; i < count; ++i) { 2991 struct page *page = __rmqueue(zone, order, migratetype, 2992 alloc_flags); 2993 if (unlikely(page == NULL)) 2994 break; 2995 2996 if (unlikely(check_pcp_refill(page, order))) 2997 continue; 2998 2999 /* 3000 * Split buddy pages returned by expand() are received here in 3001 * physical page order. The page is added to the tail of 3002 * caller's list. From the callers perspective, the linked list 3003 * is ordered by page number under some conditions. 
This is 3004 * useful for IO devices that can forward direction from the 3005 * head, thus also in the physical page order. This is useful 3006 * for IO devices that can merge IO requests if the physical 3007 * pages are ordered properly. 3008 */ 3009 list_add_tail(&page->lru, list); 3010 allocated++; 3011 if (is_migrate_cma(get_pcppage_migratetype(page))) 3012 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 3013 -(1 << order)); 3014 } 3015 3016 /* 3017 * i pages were removed from the buddy list even if some leak due 3018 * to check_pcp_refill failing so adjust NR_FREE_PAGES based 3019 * on i. Do not confuse with 'allocated' which is the number of 3020 * pages added to the pcp list. 3021 */ 3022 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 3023 spin_unlock(&zone->lock); 3024 return allocated; 3025 } 3026 3027 #ifdef CONFIG_NUMA 3028 /* 3029 * Called from the vmstat counter updater to drain pagesets of this 3030 * currently executing processor on remote nodes after they have 3031 * expired. 3032 * 3033 * Note that this function must be called with the thread pinned to 3034 * a single processor. 3035 */ 3036 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 3037 { 3038 unsigned long flags; 3039 int to_drain, batch; 3040 3041 local_lock_irqsave(&pagesets.lock, flags); 3042 batch = READ_ONCE(pcp->batch); 3043 to_drain = min(pcp->count, batch); 3044 if (to_drain > 0) 3045 free_pcppages_bulk(zone, to_drain, pcp, 0); 3046 local_unlock_irqrestore(&pagesets.lock, flags); 3047 } 3048 #endif 3049 3050 /* 3051 * Drain pcplists of the indicated processor and zone. 3052 * 3053 * The processor must either be the current processor and the 3054 * thread pinned to the current processor or a processor that 3055 * is not online. 3056 */ 3057 static void drain_pages_zone(unsigned int cpu, struct zone *zone) 3058 { 3059 unsigned long flags; 3060 struct per_cpu_pages *pcp; 3061 3062 local_lock_irqsave(&pagesets.lock, flags); 3063 3064 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 3065 if (pcp->count) 3066 free_pcppages_bulk(zone, pcp->count, pcp, 0); 3067 3068 local_unlock_irqrestore(&pagesets.lock, flags); 3069 } 3070 3071 /* 3072 * Drain pcplists of all zones on the indicated processor. 3073 * 3074 * The processor must either be the current processor and the 3075 * thread pinned to the current processor or a processor that 3076 * is not online. 3077 */ 3078 static void drain_pages(unsigned int cpu) 3079 { 3080 struct zone *zone; 3081 3082 for_each_populated_zone(zone) { 3083 drain_pages_zone(cpu, zone); 3084 } 3085 } 3086 3087 /* 3088 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 3089 * 3090 * The CPU has to be pinned. When zone parameter is non-NULL, spill just 3091 * the single zone's pages. 3092 */ 3093 void drain_local_pages(struct zone *zone) 3094 { 3095 int cpu = smp_processor_id(); 3096 3097 if (zone) 3098 drain_pages_zone(cpu, zone); 3099 else 3100 drain_pages(cpu); 3101 } 3102 3103 static void drain_local_pages_wq(struct work_struct *work) 3104 { 3105 struct pcpu_drain *drain; 3106 3107 drain = container_of(work, struct pcpu_drain, work); 3108 3109 /* 3110 * drain_all_pages doesn't use proper cpu hotplug protection so 3111 * we can race with cpu offline when the WQ can move this from 3112 * a cpu pinned worker to an unbound one. We can operate on a different 3113 * cpu which is alright but we also have to make sure to not move to 3114 * a different one. 
3115 */ 3116 migrate_disable(); 3117 drain_local_pages(drain->zone); 3118 migrate_enable(); 3119 } 3120 3121 /* 3122 * The implementation of drain_all_pages(), exposing an extra parameter to 3123 * drain on all cpus. 3124 * 3125 * drain_all_pages() is optimized to only execute on cpus where pcplists are 3126 * not empty. The check for non-emptiness can however race with a free to 3127 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers 3128 * that need the guarantee that every CPU has drained can disable the 3129 * optimizing racy check. 3130 */ 3131 static void __drain_all_pages(struct zone *zone, bool force_all_cpus) 3132 { 3133 int cpu; 3134 3135 /* 3136 * Allocate in the BSS so we won't require allocation in 3137 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 3138 */ 3139 static cpumask_t cpus_with_pcps; 3140 3141 /* 3142 * Make sure nobody triggers this path before mm_percpu_wq is fully 3143 * initialized. 3144 */ 3145 if (WARN_ON_ONCE(!mm_percpu_wq)) 3146 return; 3147 3148 /* 3149 * Do not drain if one is already in progress unless it's specific to 3150 * a zone. Such callers are primarily CMA and memory hotplug and need 3151 * the drain to be complete when the call returns. 3152 */ 3153 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { 3154 if (!zone) 3155 return; 3156 mutex_lock(&pcpu_drain_mutex); 3157 } 3158 3159 /* 3160 * We don't care about racing with CPU hotplug event 3161 * as offline notification will cause the notified 3162 * cpu to drain that CPU pcps and on_each_cpu_mask 3163 * disables preemption as part of its processing 3164 */ 3165 for_each_online_cpu(cpu) { 3166 struct per_cpu_pages *pcp; 3167 struct zone *z; 3168 bool has_pcps = false; 3169 3170 if (force_all_cpus) { 3171 /* 3172 * The pcp.count check is racy, some callers need a 3173 * guarantee that no cpu is missed. 3174 */ 3175 has_pcps = true; 3176 } else if (zone) { 3177 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 3178 if (pcp->count) 3179 has_pcps = true; 3180 } else { 3181 for_each_populated_zone(z) { 3182 pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); 3183 if (pcp->count) { 3184 has_pcps = true; 3185 break; 3186 } 3187 } 3188 } 3189 3190 if (has_pcps) 3191 cpumask_set_cpu(cpu, &cpus_with_pcps); 3192 else 3193 cpumask_clear_cpu(cpu, &cpus_with_pcps); 3194 } 3195 3196 for_each_cpu(cpu, &cpus_with_pcps) { 3197 struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); 3198 3199 drain->zone = zone; 3200 INIT_WORK(&drain->work, drain_local_pages_wq); 3201 queue_work_on(cpu, mm_percpu_wq, &drain->work); 3202 } 3203 for_each_cpu(cpu, &cpus_with_pcps) 3204 flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); 3205 3206 mutex_unlock(&pcpu_drain_mutex); 3207 } 3208 3209 /* 3210 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 3211 * 3212 * When zone parameter is non-NULL, spill just the single zone's pages. 3213 * 3214 * Note that this can be extremely slow as the draining happens in a workqueue. 3215 */ 3216 void drain_all_pages(struct zone *zone) 3217 { 3218 __drain_all_pages(zone, false); 3219 } 3220 3221 #ifdef CONFIG_HIBERNATION 3222 3223 /* 3224 * Touch the watchdog for every WD_PAGE_COUNT pages. 
3225 */ 3226 #define WD_PAGE_COUNT (128*1024) 3227 3228 void mark_free_pages(struct zone *zone) 3229 { 3230 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; 3231 unsigned long flags; 3232 unsigned int order, t; 3233 struct page *page; 3234 3235 if (zone_is_empty(zone)) 3236 return; 3237 3238 spin_lock_irqsave(&zone->lock, flags); 3239 3240 max_zone_pfn = zone_end_pfn(zone); 3241 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 3242 if (pfn_valid(pfn)) { 3243 page = pfn_to_page(pfn); 3244 3245 if (!--page_count) { 3246 touch_nmi_watchdog(); 3247 page_count = WD_PAGE_COUNT; 3248 } 3249 3250 if (page_zone(page) != zone) 3251 continue; 3252 3253 if (!swsusp_page_is_forbidden(page)) 3254 swsusp_unset_page_free(page); 3255 } 3256 3257 for_each_migratetype_order(order, t) { 3258 list_for_each_entry(page, 3259 &zone->free_area[order].free_list[t], lru) { 3260 unsigned long i; 3261 3262 pfn = page_to_pfn(page); 3263 for (i = 0; i < (1UL << order); i++) { 3264 if (!--page_count) { 3265 touch_nmi_watchdog(); 3266 page_count = WD_PAGE_COUNT; 3267 } 3268 swsusp_set_page_free(pfn_to_page(pfn + i)); 3269 } 3270 } 3271 } 3272 spin_unlock_irqrestore(&zone->lock, flags); 3273 } 3274 #endif /* CONFIG_PM */ 3275 3276 static bool free_unref_page_prepare(struct page *page, unsigned long pfn, 3277 unsigned int order) 3278 { 3279 int migratetype; 3280 3281 if (!free_pcp_prepare(page, order)) 3282 return false; 3283 3284 migratetype = get_pfnblock_migratetype(page, pfn); 3285 set_pcppage_migratetype(page, migratetype); 3286 return true; 3287 } 3288 3289 static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch, 3290 bool free_high) 3291 { 3292 int min_nr_free, max_nr_free; 3293 3294 /* Free everything if batch freeing high-order pages. */ 3295 if (unlikely(free_high)) 3296 return pcp->count; 3297 3298 /* Check for PCP disabled or boot pageset */ 3299 if (unlikely(high < batch)) 3300 return 1; 3301 3302 /* Leave at least pcp->batch pages on the list */ 3303 min_nr_free = batch; 3304 max_nr_free = high - batch; 3305 3306 /* 3307 * Double the number of pages freed each time there is subsequent 3308 * freeing of pages without any allocation. 3309 */ 3310 batch <<= pcp->free_factor; 3311 if (batch < max_nr_free) 3312 pcp->free_factor++; 3313 batch = clamp(batch, min_nr_free, max_nr_free); 3314 3315 return batch; 3316 } 3317 3318 static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, 3319 bool free_high) 3320 { 3321 int high = READ_ONCE(pcp->high); 3322 3323 if (unlikely(!high || free_high)) 3324 return 0; 3325 3326 if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) 3327 return high; 3328 3329 /* 3330 * If reclaim is active, limit the number of pages that can be 3331 * stored on pcp lists 3332 */ 3333 return min(READ_ONCE(pcp->batch) << 2, high); 3334 } 3335 3336 static void free_unref_page_commit(struct page *page, int migratetype, 3337 unsigned int order) 3338 { 3339 struct zone *zone = page_zone(page); 3340 struct per_cpu_pages *pcp; 3341 int high; 3342 int pindex; 3343 bool free_high; 3344 3345 __count_vm_event(PGFREE); 3346 pcp = this_cpu_ptr(zone->per_cpu_pageset); 3347 pindex = order_to_pindex(migratetype, order); 3348 list_add(&page->lru, &pcp->lists[pindex]); 3349 pcp->count += 1 << order; 3350 3351 /* 3352 * As high-order pages other than THP's stored on PCP can contribute 3353 * to fragmentation, limit the number stored when PCP is heavily 3354 * freeing without allocation. 
The remainder after bulk freeing 3355 * stops will be drained from vmstat refresh context. 3356 */ 3357 free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER); 3358 3359 high = nr_pcp_high(pcp, zone, free_high); 3360 if (pcp->count >= high) { 3361 int batch = READ_ONCE(pcp->batch); 3362 3363 free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); 3364 } 3365 } 3366 3367 /* 3368 * Free a pcp page 3369 */ 3370 void free_unref_page(struct page *page, unsigned int order) 3371 { 3372 unsigned long flags; 3373 unsigned long pfn = page_to_pfn(page); 3374 int migratetype; 3375 3376 if (!free_unref_page_prepare(page, pfn, order)) 3377 return; 3378 3379 /* 3380 * We only track unmovable, reclaimable and movable on pcp lists. 3381 * Place ISOLATE pages on the isolated list because they are being 3382 * offlined but treat HIGHATOMIC as movable pages so we can get those 3383 * areas back if necessary. Otherwise, we may have to free 3384 * excessively into the page allocator 3385 */ 3386 migratetype = get_pcppage_migratetype(page); 3387 if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { 3388 if (unlikely(is_migrate_isolate(migratetype))) { 3389 free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); 3390 return; 3391 } 3392 migratetype = MIGRATE_MOVABLE; 3393 } 3394 3395 local_lock_irqsave(&pagesets.lock, flags); 3396 free_unref_page_commit(page, migratetype, order); 3397 local_unlock_irqrestore(&pagesets.lock, flags); 3398 } 3399 3400 /* 3401 * Free a list of 0-order pages 3402 */ 3403 void free_unref_page_list(struct list_head *list) 3404 { 3405 struct page *page, *next; 3406 unsigned long flags; 3407 int batch_count = 0; 3408 int migratetype; 3409 3410 /* Prepare pages for freeing */ 3411 list_for_each_entry_safe(page, next, list, lru) { 3412 unsigned long pfn = page_to_pfn(page); 3413 if (!free_unref_page_prepare(page, pfn, 0)) { 3414 list_del(&page->lru); 3415 continue; 3416 } 3417 3418 /* 3419 * Free isolated pages directly to the allocator, see 3420 * comment in free_unref_page. 3421 */ 3422 migratetype = get_pcppage_migratetype(page); 3423 if (unlikely(is_migrate_isolate(migratetype))) { 3424 list_del(&page->lru); 3425 free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); 3426 continue; 3427 } 3428 } 3429 3430 local_lock_irqsave(&pagesets.lock, flags); 3431 list_for_each_entry_safe(page, next, list, lru) { 3432 /* 3433 * Non-isolated types over MIGRATE_PCPTYPES get added 3434 * to the MIGRATE_MOVABLE pcp list. 3435 */ 3436 migratetype = get_pcppage_migratetype(page); 3437 if (unlikely(migratetype >= MIGRATE_PCPTYPES)) 3438 migratetype = MIGRATE_MOVABLE; 3439 3440 trace_mm_page_free_batched(page); 3441 free_unref_page_commit(page, migratetype, 0); 3442 3443 /* 3444 * Guard against excessive IRQ disabled times when we get 3445 * a large list of pages to free. 3446 */ 3447 if (++batch_count == SWAP_CLUSTER_MAX) { 3448 local_unlock_irqrestore(&pagesets.lock, flags); 3449 batch_count = 0; 3450 local_lock_irqsave(&pagesets.lock, flags); 3451 } 3452 } 3453 local_unlock_irqrestore(&pagesets.lock, flags); 3454 } 3455 3456 /* 3457 * split_page takes a non-compound higher-order page, and splits it into 3458 * n (1<<order) sub-pages: page[0..n] 3459 * Each sub-page must be freed individually. 3460 * 3461 * Note: this is probably too low level an operation for use in drivers. 3462 * Please consult with lkml before using this in your driver. 
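 *
 * Editor's illustration of a typical caller (hypothetical, not taken from an
 * in-tree user; error handling trimmed):
 *
 *	page = alloc_pages(GFP_KERNEL, 2);	// non-compound order-2 page
 *	if (!page)
 *		return -ENOMEM;
 *	split_page(page, 2);			// now 4 independent order-0 pages
 *	for (i = 0; i < 4; i++)
 *		__free_page(page + i);		// each page is freed on its own
 *
 * Passing a compound page (one allocated with __GFP_COMP) here is a bug and
 * trips the VM_BUG_ON_PAGE() below.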
3463 */ 3464 void split_page(struct page *page, unsigned int order) 3465 { 3466 int i; 3467 3468 VM_BUG_ON_PAGE(PageCompound(page), page); 3469 VM_BUG_ON_PAGE(!page_count(page), page); 3470 3471 for (i = 1; i < (1 << order); i++) 3472 set_page_refcounted(page + i); 3473 split_page_owner(page, 1 << order); 3474 split_page_memcg(page, 1 << order); 3475 } 3476 EXPORT_SYMBOL_GPL(split_page); 3477 3478 int __isolate_free_page(struct page *page, unsigned int order) 3479 { 3480 unsigned long watermark; 3481 struct zone *zone; 3482 int mt; 3483 3484 BUG_ON(!PageBuddy(page)); 3485 3486 zone = page_zone(page); 3487 mt = get_pageblock_migratetype(page); 3488 3489 if (!is_migrate_isolate(mt)) { 3490 /* 3491 * Obey watermarks as if the page was being allocated. We can 3492 * emulate a high-order watermark check with a raised order-0 3493 * watermark, because we already know our high-order page 3494 * exists. 3495 */ 3496 watermark = zone->_watermark[WMARK_MIN] + (1UL << order); 3497 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 3498 return 0; 3499 3500 __mod_zone_freepage_state(zone, -(1UL << order), mt); 3501 } 3502 3503 /* Remove page from free list */ 3504 3505 del_page_from_free_list(page, zone, order); 3506 3507 /* 3508 * Set the pageblock if the isolated page is at least half of a 3509 * pageblock 3510 */ 3511 if (order >= pageblock_order - 1) { 3512 struct page *endpage = page + (1 << order) - 1; 3513 for (; page < endpage; page += pageblock_nr_pages) { 3514 int mt = get_pageblock_migratetype(page); 3515 /* 3516 * Only change normal pageblocks (i.e., they can merge 3517 * with others) 3518 */ 3519 if (migratetype_is_mergeable(mt)) 3520 set_pageblock_migratetype(page, 3521 MIGRATE_MOVABLE); 3522 } 3523 } 3524 3525 3526 return 1UL << order; 3527 } 3528 3529 /** 3530 * __putback_isolated_page - Return a now-isolated page back where we got it 3531 * @page: Page that was isolated 3532 * @order: Order of the isolated page 3533 * @mt: The page's pageblock's migratetype 3534 * 3535 * This function is meant to return a page pulled from the free lists via 3536 * __isolate_free_page back to the free lists they were pulled from. 3537 */ 3538 void __putback_isolated_page(struct page *page, unsigned int order, int mt) 3539 { 3540 struct zone *zone = page_zone(page); 3541 3542 /* zone lock should be held when this function is called */ 3543 lockdep_assert_held(&zone->lock); 3544 3545 /* Return isolated page to tail of freelist. */ 3546 __free_one_page(page, page_to_pfn(page), zone, order, mt, 3547 FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); 3548 } 3549 3550 /* 3551 * Update NUMA hit/miss statistics 3552 * 3553 * Must be called with interrupts disabled. 
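 *
 * Editor's summary of the accounting below, for an allocation satisfied from
 * zone z with preferred zone p:
 *
 *	z on the same node as p:	NUMA_HIT on z
 *	z on a different node than p:	NUMA_MISS on z, NUMA_FOREIGN on p
 *	z on the local node:		NUMA_LOCAL on z, otherwise NUMA_OTHER on z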
3554 */ 3555 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, 3556 long nr_account) 3557 { 3558 #ifdef CONFIG_NUMA 3559 enum numa_stat_item local_stat = NUMA_LOCAL; 3560 3561 /* skip numa counters update if numa stats is disabled */ 3562 if (!static_branch_likely(&vm_numa_stat_key)) 3563 return; 3564 3565 if (zone_to_nid(z) != numa_node_id()) 3566 local_stat = NUMA_OTHER; 3567 3568 if (zone_to_nid(z) == zone_to_nid(preferred_zone)) 3569 __count_numa_events(z, NUMA_HIT, nr_account); 3570 else { 3571 __count_numa_events(z, NUMA_MISS, nr_account); 3572 __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); 3573 } 3574 __count_numa_events(z, local_stat, nr_account); 3575 #endif 3576 } 3577 3578 /* Remove page from the per-cpu list, caller must protect the list */ 3579 static inline 3580 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, 3581 int migratetype, 3582 unsigned int alloc_flags, 3583 struct per_cpu_pages *pcp, 3584 struct list_head *list) 3585 { 3586 struct page *page; 3587 3588 do { 3589 if (list_empty(list)) { 3590 int batch = READ_ONCE(pcp->batch); 3591 int alloced; 3592 3593 /* 3594 * Scale batch relative to order if batch implies 3595 * free pages can be stored on the PCP. Batch can 3596 * be 1 for small zones or for boot pagesets which 3597 * should never store free pages as the pages may 3598 * belong to arbitrary zones. 3599 */ 3600 if (batch > 1) 3601 batch = max(batch >> order, 2); 3602 alloced = rmqueue_bulk(zone, order, 3603 batch, list, 3604 migratetype, alloc_flags); 3605 3606 pcp->count += alloced << order; 3607 if (unlikely(list_empty(list))) 3608 return NULL; 3609 } 3610 3611 page = list_first_entry(list, struct page, lru); 3612 list_del(&page->lru); 3613 pcp->count -= 1 << order; 3614 } while (check_new_pcp(page, order)); 3615 3616 return page; 3617 } 3618 3619 /* Lock and remove page from the per-cpu list */ 3620 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 3621 struct zone *zone, unsigned int order, 3622 gfp_t gfp_flags, int migratetype, 3623 unsigned int alloc_flags) 3624 { 3625 struct per_cpu_pages *pcp; 3626 struct list_head *list; 3627 struct page *page; 3628 unsigned long flags; 3629 3630 local_lock_irqsave(&pagesets.lock, flags); 3631 3632 /* 3633 * On allocation, reduce the number of pages that are batch freed. 3634 * See nr_pcp_free() where free_factor is increased for subsequent 3635 * frees. 3636 */ 3637 pcp = this_cpu_ptr(zone->per_cpu_pageset); 3638 pcp->free_factor >>= 1; 3639 list = &pcp->lists[order_to_pindex(migratetype, order)]; 3640 page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); 3641 local_unlock_irqrestore(&pagesets.lock, flags); 3642 if (page) { 3643 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); 3644 zone_statistics(preferred_zone, zone, 1); 3645 } 3646 return page; 3647 } 3648 3649 /* 3650 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 3651 */ 3652 static inline 3653 struct page *rmqueue(struct zone *preferred_zone, 3654 struct zone *zone, unsigned int order, 3655 gfp_t gfp_flags, unsigned int alloc_flags, 3656 int migratetype) 3657 { 3658 unsigned long flags; 3659 struct page *page; 3660 3661 if (likely(pcp_allowed_order(order))) { 3662 /* 3663 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and 3664 * we need to skip it when CMA area isn't allowed. 
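 *
 * Editor's note: concretely, the pcplist fast path below is taken unless all
 * three hold at once: CONFIG_CMA is built in, the caller did not pass
 * ALLOC_CMA, and the request is MIGRATE_MOVABLE. In that one case the
 * movable pcplist cannot be trusted (it may hold CMA pages) and the
 * allocation falls through to the buddy lists under zone->lock.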
3665 */ 3666 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || 3667 migratetype != MIGRATE_MOVABLE) { 3668 page = rmqueue_pcplist(preferred_zone, zone, order, 3669 gfp_flags, migratetype, alloc_flags); 3670 goto out; 3671 } 3672 } 3673 3674 /* 3675 * We most definitely don't want callers attempting to 3676 * allocate greater than order-1 page units with __GFP_NOFAIL. 3677 */ 3678 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); 3679 3680 do { 3681 page = NULL; 3682 spin_lock_irqsave(&zone->lock, flags); 3683 /* 3684 * order-0 request can reach here when the pcplist is skipped 3685 * due to non-CMA allocation context. HIGHATOMIC area is 3686 * reserved for high-order atomic allocation, so order-0 3687 * request should skip it. 3688 */ 3689 if (order > 0 && alloc_flags & ALLOC_HARDER) { 3690 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 3691 if (page) 3692 trace_mm_page_alloc_zone_locked(page, order, migratetype); 3693 } 3694 if (!page) { 3695 page = __rmqueue(zone, order, migratetype, alloc_flags); 3696 if (!page) 3697 goto failed; 3698 } 3699 __mod_zone_freepage_state(zone, -(1 << order), 3700 get_pcppage_migratetype(page)); 3701 spin_unlock_irqrestore(&zone->lock, flags); 3702 } while (check_new_pages(page, order)); 3703 3704 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 3705 zone_statistics(preferred_zone, zone, 1); 3706 3707 out: 3708 /* Separate test+clear to avoid unnecessary atomics */ 3709 if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { 3710 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 3711 wakeup_kswapd(zone, 0, 0, zone_idx(zone)); 3712 } 3713 3714 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 3715 return page; 3716 3717 failed: 3718 spin_unlock_irqrestore(&zone->lock, flags); 3719 return NULL; 3720 } 3721 3722 #ifdef CONFIG_FAIL_PAGE_ALLOC 3723 3724 static struct { 3725 struct fault_attr attr; 3726 3727 bool ignore_gfp_highmem; 3728 bool ignore_gfp_reclaim; 3729 u32 min_order; 3730 } fail_page_alloc = { 3731 .attr = FAULT_ATTR_INITIALIZER, 3732 .ignore_gfp_reclaim = true, 3733 .ignore_gfp_highmem = true, 3734 .min_order = 1, 3735 }; 3736 3737 static int __init setup_fail_page_alloc(char *str) 3738 { 3739 return setup_fault_attr(&fail_page_alloc.attr, str); 3740 } 3741 __setup("fail_page_alloc=", setup_fail_page_alloc); 3742 3743 static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3744 { 3745 if (order < fail_page_alloc.min_order) 3746 return false; 3747 if (gfp_mask & __GFP_NOFAIL) 3748 return false; 3749 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 3750 return false; 3751 if (fail_page_alloc.ignore_gfp_reclaim && 3752 (gfp_mask & __GFP_DIRECT_RECLAIM)) 3753 return false; 3754 3755 return should_fail(&fail_page_alloc.attr, 1 << order); 3756 } 3757 3758 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 3759 3760 static int __init fail_page_alloc_debugfs(void) 3761 { 3762 umode_t mode = S_IFREG | 0600; 3763 struct dentry *dir; 3764 3765 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 3766 &fail_page_alloc.attr); 3767 3768 debugfs_create_bool("ignore-gfp-wait", mode, dir, 3769 &fail_page_alloc.ignore_gfp_reclaim); 3770 debugfs_create_bool("ignore-gfp-highmem", mode, dir, 3771 &fail_page_alloc.ignore_gfp_highmem); 3772 debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); 3773 3774 return 0; 3775 } 3776 3777 late_initcall(fail_page_alloc_debugfs); 3778 3779 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 3780 3781 #else /* CONFIG_FAIL_PAGE_ALLOC */ 3782 3783 
static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3784 { 3785 return false; 3786 } 3787 3788 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 3789 3790 noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3791 { 3792 return __should_fail_alloc_page(gfp_mask, order); 3793 } 3794 ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); 3795 3796 static inline long __zone_watermark_unusable_free(struct zone *z, 3797 unsigned int order, unsigned int alloc_flags) 3798 { 3799 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); 3800 long unusable_free = (1 << order) - 1; 3801 3802 /* 3803 * If the caller does not have rights to ALLOC_HARDER then subtract 3804 * the high-atomic reserves. This will over-estimate the size of the 3805 * atomic reserve but it avoids a search. 3806 */ 3807 if (likely(!alloc_harder)) 3808 unusable_free += z->nr_reserved_highatomic; 3809 3810 #ifdef CONFIG_CMA 3811 /* If allocation can't use CMA areas don't use free CMA pages */ 3812 if (!(alloc_flags & ALLOC_CMA)) 3813 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); 3814 #endif 3815 3816 return unusable_free; 3817 } 3818 3819 /* 3820 * Return true if free base pages are above 'mark'. For high-order checks it 3821 * will return true if the order-0 watermark is reached and there is at least 3822 * one free page of a suitable size. Checking now avoids taking the zone lock 3823 * to check in the allocation paths if no pages are free. 3824 */ 3825 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3826 int highest_zoneidx, unsigned int alloc_flags, 3827 long free_pages) 3828 { 3829 long min = mark; 3830 int o; 3831 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); 3832 3833 /* free_pages may go negative - that's OK */ 3834 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); 3835 3836 if (alloc_flags & ALLOC_HIGH) 3837 min -= min / 2; 3838 3839 if (unlikely(alloc_harder)) { 3840 /* 3841 * OOM victims can try even harder than normal ALLOC_HARDER 3842 * users on the grounds that it's definitely going to be in 3843 * the exit path shortly and free memory. Any allocation it 3844 * makes during the free path will be small and short-lived. 3845 */ 3846 if (alloc_flags & ALLOC_OOM) 3847 min -= min / 2; 3848 else 3849 min -= min / 4; 3850 } 3851 3852 /* 3853 * Check watermarks for an order-0 allocation request. If these 3854 * are not met, then a high-order request also cannot go ahead 3855 * even if a suitable page happened to be free.
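 * At this point free_pages has already had the "unusable" portion
 * subtracted by __zone_watermark_unusable_free() (high-atomic reserves,
 * CMA pages the caller may not use, and (1 << order) - 1), so the check
 * below effectively requires that the usable free pages exceed the
 * (possibly reduced) watermark plus the lowmem reserve for the
 * requested zone index.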
3856 */ 3857 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) 3858 return false; 3859 3860 /* If this is an order-0 request then the watermark is fine */ 3861 if (!order) 3862 return true; 3863 3864 /* For a high-order request, check at least one suitable page is free */ 3865 for (o = order; o < MAX_ORDER; o++) { 3866 struct free_area *area = &z->free_area[o]; 3867 int mt; 3868 3869 if (!area->nr_free) 3870 continue; 3871 3872 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3873 if (!free_area_empty(area, mt)) 3874 return true; 3875 } 3876 3877 #ifdef CONFIG_CMA 3878 if ((alloc_flags & ALLOC_CMA) && 3879 !free_area_empty(area, MIGRATE_CMA)) { 3880 return true; 3881 } 3882 #endif 3883 if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) 3884 return true; 3885 } 3886 return false; 3887 } 3888 3889 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3890 int highest_zoneidx, unsigned int alloc_flags) 3891 { 3892 return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 3893 zone_page_state(z, NR_FREE_PAGES)); 3894 } 3895 3896 static inline bool zone_watermark_fast(struct zone *z, unsigned int order, 3897 unsigned long mark, int highest_zoneidx, 3898 unsigned int alloc_flags, gfp_t gfp_mask) 3899 { 3900 long free_pages; 3901 3902 free_pages = zone_page_state(z, NR_FREE_PAGES); 3903 3904 /* 3905 * Fast check for order-0 only. If this fails then the reserves 3906 * need to be calculated. 3907 */ 3908 if (!order) { 3909 long fast_free; 3910 3911 fast_free = free_pages; 3912 fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags); 3913 if (fast_free > mark + z->lowmem_reserve[highest_zoneidx]) 3914 return true; 3915 } 3916 3917 if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 3918 free_pages)) 3919 return true; 3920 /* 3921 * Ignore watermark boosting for GFP_ATOMIC order-0 allocations 3922 * when checking the min watermark. The min watermark is the 3923 * point where boosting is ignored so that kswapd is woken up 3924 * when below the low watermark. 3925 */ 3926 if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost 3927 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { 3928 mark = z->_watermark[WMARK_MIN]; 3929 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 3930 alloc_flags, free_pages); 3931 } 3932 3933 return false; 3934 } 3935 3936 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 3937 unsigned long mark, int highest_zoneidx) 3938 { 3939 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3940 3941 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 3942 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 3943 3944 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, 3945 free_pages); 3946 } 3947 3948 #ifdef CONFIG_NUMA 3949 int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; 3950 3951 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3952 { 3953 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3954 node_reclaim_distance; 3955 } 3956 #else /* CONFIG_NUMA */ 3957 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3958 { 3959 return true; 3960 } 3961 #endif /* CONFIG_NUMA */ 3962 3963 /* 3964 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid 3965 * fragmentation is subtle. If the preferred zone was HIGHMEM then 3966 * premature use of a lower zone may cause lowmem pressure problems that 3967 * are worse than fragmentation. 
If the next zone is ZONE_DMA then it is 3968 * probably too small. It only makes sense to spread allocations to avoid 3969 * fragmentation between the Normal and DMA32 zones. 3970 */ 3971 static inline unsigned int 3972 alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) 3973 { 3974 unsigned int alloc_flags; 3975 3976 /* 3977 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 3978 * to save a branch. 3979 */ 3980 alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); 3981 3982 #ifdef CONFIG_ZONE_DMA32 3983 if (!zone) 3984 return alloc_flags; 3985 3986 if (zone_idx(zone) != ZONE_NORMAL) 3987 return alloc_flags; 3988 3989 /* 3990 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and 3991 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume 3992 * on UMA that if Normal is populated then so is DMA32. 3993 */ 3994 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); 3995 if (nr_online_nodes > 1 && !populated_zone(--zone)) 3996 return alloc_flags; 3997 3998 alloc_flags |= ALLOC_NOFRAGMENT; 3999 #endif /* CONFIG_ZONE_DMA32 */ 4000 return alloc_flags; 4001 } 4002 4003 /* Must be called after current_gfp_context() which can change gfp_mask */ 4004 static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, 4005 unsigned int alloc_flags) 4006 { 4007 #ifdef CONFIG_CMA 4008 if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) 4009 alloc_flags |= ALLOC_CMA; 4010 #endif 4011 return alloc_flags; 4012 } 4013 4014 /* 4015 * get_page_from_freelist goes through the zonelist trying to allocate 4016 * a page. 4017 */ 4018 static struct page * 4019 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 4020 const struct alloc_context *ac) 4021 { 4022 struct zoneref *z; 4023 struct zone *zone; 4024 struct pglist_data *last_pgdat_dirty_limit = NULL; 4025 bool no_fallback; 4026 4027 retry: 4028 /* 4029 * Scan zonelist, looking for a zone with enough free. 4030 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 4031 */ 4032 no_fallback = alloc_flags & ALLOC_NOFRAGMENT; 4033 z = ac->preferred_zoneref; 4034 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, 4035 ac->nodemask) { 4036 struct page *page; 4037 unsigned long mark; 4038 4039 if (cpusets_enabled() && 4040 (alloc_flags & ALLOC_CPUSET) && 4041 !__cpuset_zone_allowed(zone, gfp_mask)) 4042 continue; 4043 /* 4044 * When allocating a page cache page for writing, we 4045 * want to get it from a node that is within its dirty 4046 * limit, such that no single node holds more than its 4047 * proportional share of globally allowed dirty pages. 4048 * The dirty limits take into account the node's 4049 * lowmem reserves and high watermark so that kswapd 4050 * should be able to balance it without having to 4051 * write pages from its LRU list. 4052 * 4053 * XXX: For now, allow allocations to potentially 4054 * exceed the per-node dirty limit in the slowpath 4055 * (spread_dirty_pages unset) before going into reclaim, 4056 * which is important when on a NUMA setup the allowed 4057 * nodes are together not big enough to reach the 4058 * global limit. The proper fix for these situations 4059 * will require awareness of nodes in the 4060 * dirty-throttling and the flusher threads. 
4061 */ 4062 if (ac->spread_dirty_pages) { 4063 if (last_pgdat_dirty_limit == zone->zone_pgdat) 4064 continue; 4065 4066 if (!node_dirty_ok(zone->zone_pgdat)) { 4067 last_pgdat_dirty_limit = zone->zone_pgdat; 4068 continue; 4069 } 4070 } 4071 4072 if (no_fallback && nr_online_nodes > 1 && 4073 zone != ac->preferred_zoneref->zone) { 4074 int local_nid; 4075 4076 /* 4077 * If moving to a remote node, retry but allow 4078 * fragmenting fallbacks. Locality is more important 4079 * than fragmentation avoidance. 4080 */ 4081 local_nid = zone_to_nid(ac->preferred_zoneref->zone); 4082 if (zone_to_nid(zone) != local_nid) { 4083 alloc_flags &= ~ALLOC_NOFRAGMENT; 4084 goto retry; 4085 } 4086 } 4087 4088 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); 4089 if (!zone_watermark_fast(zone, order, mark, 4090 ac->highest_zoneidx, alloc_flags, 4091 gfp_mask)) { 4092 int ret; 4093 4094 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 4095 /* 4096 * Watermark failed for this zone, but see if we can 4097 * grow this zone if it contains deferred pages. 4098 */ 4099 if (static_branch_unlikely(&deferred_pages)) { 4100 if (_deferred_grow_zone(zone, order)) 4101 goto try_this_zone; 4102 } 4103 #endif 4104 /* Checked here to keep the fast path fast */ 4105 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 4106 if (alloc_flags & ALLOC_NO_WATERMARKS) 4107 goto try_this_zone; 4108 4109 if (!node_reclaim_enabled() || 4110 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) 4111 continue; 4112 4113 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); 4114 switch (ret) { 4115 case NODE_RECLAIM_NOSCAN: 4116 /* did not scan */ 4117 continue; 4118 case NODE_RECLAIM_FULL: 4119 /* scanned but unreclaimable */ 4120 continue; 4121 default: 4122 /* did we reclaim enough */ 4123 if (zone_watermark_ok(zone, order, mark, 4124 ac->highest_zoneidx, alloc_flags)) 4125 goto try_this_zone; 4126 4127 continue; 4128 } 4129 } 4130 4131 try_this_zone: 4132 page = rmqueue(ac->preferred_zoneref->zone, zone, order, 4133 gfp_mask, alloc_flags, ac->migratetype); 4134 if (page) { 4135 prep_new_page(page, order, gfp_mask, alloc_flags); 4136 4137 /* 4138 * If this is a high-order atomic allocation then check 4139 * if the pageblock should be reserved for the future 4140 */ 4141 if (unlikely(order && (alloc_flags & ALLOC_HARDER))) 4142 reserve_highatomic_pageblock(page, zone, order); 4143 4144 return page; 4145 } else { 4146 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 4147 /* Try again if zone has deferred pages */ 4148 if (static_branch_unlikely(&deferred_pages)) { 4149 if (_deferred_grow_zone(zone, order)) 4150 goto try_this_zone; 4151 } 4152 #endif 4153 } 4154 } 4155 4156 /* 4157 * It's possible on a UMA machine to get through all zones that are 4158 * fragmented. If avoiding fragmentation, reset and try again. 4159 */ 4160 if (no_fallback) { 4161 alloc_flags &= ~ALLOC_NOFRAGMENT; 4162 goto retry; 4163 } 4164 4165 return NULL; 4166 } 4167 4168 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 4169 { 4170 unsigned int filter = SHOW_MEM_FILTER_NODES; 4171 4172 /* 4173 * This documents exceptions given to allocations in certain 4174 * contexts that are allowed to allocate outside current's set 4175 * of allowed nodes. 
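 * Specifically (a descriptive summary of the checks below): OOM victims
 * and PF_MEMALLOC/PF_EXITING tasks that did not pass __GFP_NOMEMALLOC,
 * as well as interrupt-context and non-reclaiming allocations, may have
 * allocated outside the current nodemask, so the node filter is dropped
 * for their dumps.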
4176 */ 4177 if (!(gfp_mask & __GFP_NOMEMALLOC)) 4178 if (tsk_is_oom_victim(current) || 4179 (current->flags & (PF_MEMALLOC | PF_EXITING))) 4180 filter &= ~SHOW_MEM_FILTER_NODES; 4181 if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 4182 filter &= ~SHOW_MEM_FILTER_NODES; 4183 4184 show_mem(filter, nodemask); 4185 } 4186 4187 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 4188 { 4189 struct va_format vaf; 4190 va_list args; 4191 static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); 4192 4193 if ((gfp_mask & __GFP_NOWARN) || 4194 !__ratelimit(&nopage_rs) || 4195 ((gfp_mask & __GFP_DMA) && !has_managed_dma())) 4196 return; 4197 4198 va_start(args, fmt); 4199 vaf.fmt = fmt; 4200 vaf.va = &args; 4201 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", 4202 current->comm, &vaf, gfp_mask, &gfp_mask, 4203 nodemask_pr_args(nodemask)); 4204 va_end(args); 4205 4206 cpuset_print_current_mems_allowed(); 4207 pr_cont("\n"); 4208 dump_stack(); 4209 warn_alloc_show_mem(gfp_mask, nodemask); 4210 } 4211 4212 static inline struct page * 4213 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, 4214 unsigned int alloc_flags, 4215 const struct alloc_context *ac) 4216 { 4217 struct page *page; 4218 4219 page = get_page_from_freelist(gfp_mask, order, 4220 alloc_flags|ALLOC_CPUSET, ac); 4221 /* 4222 * fallback to ignore cpuset restriction if our nodes 4223 * are depleted 4224 */ 4225 if (!page) 4226 page = get_page_from_freelist(gfp_mask, order, 4227 alloc_flags, ac); 4228 4229 return page; 4230 } 4231 4232 static inline struct page * 4233 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 4234 const struct alloc_context *ac, unsigned long *did_some_progress) 4235 { 4236 struct oom_control oc = { 4237 .zonelist = ac->zonelist, 4238 .nodemask = ac->nodemask, 4239 .memcg = NULL, 4240 .gfp_mask = gfp_mask, 4241 .order = order, 4242 }; 4243 struct page *page; 4244 4245 *did_some_progress = 0; 4246 4247 /* 4248 * Acquire the oom lock. If that fails, somebody else is 4249 * making progress for us. 4250 */ 4251 if (!mutex_trylock(&oom_lock)) { 4252 *did_some_progress = 1; 4253 schedule_timeout_uninterruptible(1); 4254 return NULL; 4255 } 4256 4257 /* 4258 * Go through the zonelist yet one more time, keep very high watermark 4259 * here, this is only to catch a parallel oom killing, we must fail if 4260 * we're still under heavy pressure. But make sure that this reclaim 4261 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY 4262 * allocation which will never fail due to oom_lock already held. 4263 */ 4264 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & 4265 ~__GFP_DIRECT_RECLAIM, order, 4266 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 4267 if (page) 4268 goto out; 4269 4270 /* Coredumps can quickly deplete all memory reserves */ 4271 if (current->flags & PF_DUMPCORE) 4272 goto out; 4273 /* The OOM killer will not help higher order allocs */ 4274 if (order > PAGE_ALLOC_COSTLY_ORDER) 4275 goto out; 4276 /* 4277 * We have already exhausted all our reclaim opportunities without any 4278 * success so it is time to admit defeat. We will skip the OOM killer 4279 * because it is very likely that the caller has a more reasonable 4280 * fallback than shooting a random task. 4281 * 4282 * The OOM killer may not free memory on a specific node. 
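 * Bailing out here therefore also covers __GFP_THISNODE callers, for
 * whom killing an unrelated task is unlikely to free memory on the
 * requested node.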
4283 */ 4284 if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) 4285 goto out; 4286 /* The OOM killer does not needlessly kill tasks for lowmem */ 4287 if (ac->highest_zoneidx < ZONE_NORMAL) 4288 goto out; 4289 if (pm_suspended_storage()) 4290 goto out; 4291 /* 4292 * XXX: GFP_NOFS allocations should rather fail than rely on 4293 * other requests to make forward progress. 4294 * We are in an unfortunate situation where out_of_memory cannot 4295 * do much for this context but let's try it to at least get 4296 * access to memory reserved if the current task is killed (see 4297 * out_of_memory). Once filesystems are ready to handle allocation 4298 * failures more gracefully we should just bail out here. 4299 */ 4300 4301 /* Exhausted what can be done so it's blame time */ 4302 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { 4303 *did_some_progress = 1; 4304 4305 /* 4306 * Help non-failing allocations by giving them access to memory 4307 * reserves 4308 */ 4309 if (gfp_mask & __GFP_NOFAIL) 4310 page = __alloc_pages_cpuset_fallback(gfp_mask, order, 4311 ALLOC_NO_WATERMARKS, ac); 4312 } 4313 out: 4314 mutex_unlock(&oom_lock); 4315 return page; 4316 } 4317 4318 /* 4319 * Maximum number of compaction retries with progress before the OOM 4320 * killer is considered the only way to move forward. 4321 */ 4322 #define MAX_COMPACT_RETRIES 16 4323 4324 #ifdef CONFIG_COMPACTION 4325 /* Try memory compaction for high-order allocations before reclaim */ 4326 static struct page * 4327 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 4328 unsigned int alloc_flags, const struct alloc_context *ac, 4329 enum compact_priority prio, enum compact_result *compact_result) 4330 { 4331 struct page *page = NULL; 4332 unsigned long pflags; 4333 unsigned int noreclaim_flag; 4334 4335 if (!order) 4336 return NULL; 4337 4338 psi_memstall_enter(&pflags); 4339 delayacct_compact_start(); 4340 noreclaim_flag = memalloc_noreclaim_save(); 4341 4342 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 4343 prio, &page); 4344 4345 memalloc_noreclaim_restore(noreclaim_flag); 4346 psi_memstall_leave(&pflags); 4347 delayacct_compact_end(); 4348 4349 if (*compact_result == COMPACT_SKIPPED) 4350 return NULL; 4351 /* 4352 * At least in one zone compaction wasn't deferred or skipped, so let's 4353 * count a compaction stall 4354 */ 4355 count_vm_event(COMPACTSTALL); 4356 4357 /* Prep a captured page if available */ 4358 if (page) 4359 prep_new_page(page, order, gfp_mask, alloc_flags); 4360 4361 /* Try to get a page from the freelist if available */ 4362 if (!page) 4363 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4364 4365 if (page) { 4366 struct zone *zone = page_zone(page); 4367 4368 zone->compact_blockskip_flush = false; 4369 compaction_defer_reset(zone, order, true); 4370 count_vm_event(COMPACTSUCCESS); 4371 return page; 4372 } 4373 4374 /* 4375 * It's bad if a compaction run occurs and fails. The most likely reason 4376 * is that pages exist, but not enough to satisfy watermarks.
4377 */ 4378 count_vm_event(COMPACTFAIL); 4379 4380 cond_resched(); 4381 4382 return NULL; 4383 } 4384 4385 static inline bool 4386 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, 4387 enum compact_result compact_result, 4388 enum compact_priority *compact_priority, 4389 int *compaction_retries) 4390 { 4391 int max_retries = MAX_COMPACT_RETRIES; 4392 int min_priority; 4393 bool ret = false; 4394 int retries = *compaction_retries; 4395 enum compact_priority priority = *compact_priority; 4396 4397 if (!order) 4398 return false; 4399 4400 if (fatal_signal_pending(current)) 4401 return false; 4402 4403 if (compaction_made_progress(compact_result)) 4404 (*compaction_retries)++; 4405 4406 /* 4407 * compaction considers all the zones as desperately out of memory 4408 * so it doesn't really make much sense to retry except when the 4409 * failure could be caused by insufficient priority 4410 */ 4411 if (compaction_failed(compact_result)) 4412 goto check_priority; 4413 4414 /* 4415 * compaction was skipped because there are not enough order-0 pages 4416 * to work with, so we retry only if it looks like reclaim can help. 4417 */ 4418 if (compaction_needs_reclaim(compact_result)) { 4419 ret = compaction_zonelist_suitable(ac, order, alloc_flags); 4420 goto out; 4421 } 4422 4423 /* 4424 * make sure the compaction wasn't deferred or didn't bail out early 4425 * due to lock contention before we declare that we should give up. 4426 * But the next retry should use a higher priority if allowed, so 4427 * we don't just keep bailing out endlessly. 4428 */ 4429 if (compaction_withdrawn(compact_result)) { 4430 goto check_priority; 4431 } 4432 4433 /* 4434 * !costly requests are much more important than __GFP_RETRY_MAYFAIL 4435 * costly ones because they are de facto nofail and invoke the OOM 4436 * killer to move on while costly ones can fail and their users are ready 4437 * to cope with that. 1/4 retries is rather arbitrary but we 4438 * would need much more detailed feedback from compaction to 4439 * make a better decision. 4440 */ 4441 if (order > PAGE_ALLOC_COSTLY_ORDER) 4442 max_retries /= 4; 4443 if (*compaction_retries <= max_retries) { 4444 ret = true; 4445 goto out; 4446 } 4447 4448 /* 4449 * Make sure there are attempts at the highest priority if we exhausted 4450 * all retries or failed at the lower priorities. 4451 */ 4452 check_priority: 4453 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
4454 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 4455 4456 if (*compact_priority > min_priority) { 4457 (*compact_priority)--; 4458 *compaction_retries = 0; 4459 ret = true; 4460 } 4461 out: 4462 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); 4463 return ret; 4464 } 4465 #else 4466 static inline struct page * 4467 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 4468 unsigned int alloc_flags, const struct alloc_context *ac, 4469 enum compact_priority prio, enum compact_result *compact_result) 4470 { 4471 *compact_result = COMPACT_SKIPPED; 4472 return NULL; 4473 } 4474 4475 static inline bool 4476 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 4477 enum compact_result compact_result, 4478 enum compact_priority *compact_priority, 4479 int *compaction_retries) 4480 { 4481 struct zone *zone; 4482 struct zoneref *z; 4483 4484 if (!order || order > PAGE_ALLOC_COSTLY_ORDER) 4485 return false; 4486 4487 /* 4488 * There are setups with compaction disabled which would prefer to loop 4489 * inside the allocator rather than hit the oom killer prematurely. 4490 * Let's give them a good hope and keep retrying while the order-0 4491 * watermarks are OK. 4492 */ 4493 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 4494 ac->highest_zoneidx, ac->nodemask) { 4495 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), 4496 ac->highest_zoneidx, alloc_flags)) 4497 return true; 4498 } 4499 return false; 4500 } 4501 #endif /* CONFIG_COMPACTION */ 4502 4503 #ifdef CONFIG_LOCKDEP 4504 static struct lockdep_map __fs_reclaim_map = 4505 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); 4506 4507 static bool __need_reclaim(gfp_t gfp_mask) 4508 { 4509 /* no reclaim without waiting on it */ 4510 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 4511 return false; 4512 4513 /* this guy won't enter reclaim */ 4514 if (current->flags & PF_MEMALLOC) 4515 return false; 4516 4517 if (gfp_mask & __GFP_NOLOCKDEP) 4518 return false; 4519 4520 return true; 4521 } 4522 4523 void __fs_reclaim_acquire(unsigned long ip) 4524 { 4525 lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); 4526 } 4527 4528 void __fs_reclaim_release(unsigned long ip) 4529 { 4530 lock_release(&__fs_reclaim_map, ip); 4531 } 4532 4533 void fs_reclaim_acquire(gfp_t gfp_mask) 4534 { 4535 gfp_mask = current_gfp_context(gfp_mask); 4536 4537 if (__need_reclaim(gfp_mask)) { 4538 if (gfp_mask & __GFP_FS) 4539 __fs_reclaim_acquire(_RET_IP_); 4540 4541 #ifdef CONFIG_MMU_NOTIFIER 4542 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); 4543 lock_map_release(&__mmu_notifier_invalidate_range_start_map); 4544 #endif 4545 4546 } 4547 } 4548 EXPORT_SYMBOL_GPL(fs_reclaim_acquire); 4549 4550 void fs_reclaim_release(gfp_t gfp_mask) 4551 { 4552 gfp_mask = current_gfp_context(gfp_mask); 4553 4554 if (__need_reclaim(gfp_mask)) { 4555 if (gfp_mask & __GFP_FS) 4556 __fs_reclaim_release(_RET_IP_); 4557 } 4558 } 4559 EXPORT_SYMBOL_GPL(fs_reclaim_release); 4560 #endif 4561 4562 /* Perform direct synchronous page reclaim */ 4563 static unsigned long 4564 __perform_reclaim(gfp_t gfp_mask, unsigned int order, 4565 const struct alloc_context *ac) 4566 { 4567 unsigned int noreclaim_flag; 4568 unsigned long progress; 4569 4570 cond_resched(); 4571 4572 /* We now go into synchronous reclaim */ 4573 cpuset_memory_pressure_bump(); 4574 fs_reclaim_acquire(gfp_mask); 4575 noreclaim_flag = memalloc_noreclaim_save(); 4576 4577 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, 
4578 ac->nodemask); 4579 4580 memalloc_noreclaim_restore(noreclaim_flag); 4581 fs_reclaim_release(gfp_mask); 4582 4583 cond_resched(); 4584 4585 return progress; 4586 } 4587 4588 /* The really slow allocator path where we enter direct reclaim */ 4589 static inline struct page * 4590 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 4591 unsigned int alloc_flags, const struct alloc_context *ac, 4592 unsigned long *did_some_progress) 4593 { 4594 struct page *page = NULL; 4595 unsigned long pflags; 4596 bool drained = false; 4597 4598 psi_memstall_enter(&pflags); 4599 *did_some_progress = __perform_reclaim(gfp_mask, order, ac); 4600 if (unlikely(!(*did_some_progress))) 4601 goto out; 4602 4603 retry: 4604 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4605 4606 /* 4607 * If an allocation failed after direct reclaim, it could be because 4608 * pages are pinned on the per-cpu lists or in high alloc reserves. 4609 * Shrink them and try again 4610 */ 4611 if (!page && !drained) { 4612 unreserve_highatomic_pageblock(ac, false); 4613 drain_all_pages(NULL); 4614 drained = true; 4615 goto retry; 4616 } 4617 out: 4618 psi_memstall_leave(&pflags); 4619 4620 return page; 4621 } 4622 4623 static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, 4624 const struct alloc_context *ac) 4625 { 4626 struct zoneref *z; 4627 struct zone *zone; 4628 pg_data_t *last_pgdat = NULL; 4629 enum zone_type highest_zoneidx = ac->highest_zoneidx; 4630 4631 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, 4632 ac->nodemask) { 4633 if (!managed_zone(zone)) 4634 continue; 4635 if (last_pgdat != zone->zone_pgdat) { 4636 wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); 4637 last_pgdat = zone->zone_pgdat; 4638 } 4639 } 4640 } 4641 4642 static inline unsigned int 4643 gfp_to_alloc_flags(gfp_t gfp_mask) 4644 { 4645 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 4646 4647 /* 4648 * __GFP_HIGH is assumed to be the same as ALLOC_HIGH 4649 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 4650 * to save two branches. 4651 */ 4652 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 4653 BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); 4654 4655 /* 4656 * The caller may dip into page reserves a bit more if the caller 4657 * cannot run direct reclaim, or if the caller has realtime scheduling 4658 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 4659 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). 4660 */ 4661 alloc_flags |= (__force int) 4662 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); 4663 4664 if (gfp_mask & __GFP_ATOMIC) { 4665 /* 4666 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 4667 * if it can't schedule. 4668 */ 4669 if (!(gfp_mask & __GFP_NOMEMALLOC)) 4670 alloc_flags |= ALLOC_HARDER; 4671 /* 4672 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 4673 * comment for __cpuset_node_allowed(). 
4674 */ 4675 alloc_flags &= ~ALLOC_CPUSET; 4676 } else if (unlikely(rt_task(current)) && in_task()) 4677 alloc_flags |= ALLOC_HARDER; 4678 4679 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); 4680 4681 return alloc_flags; 4682 } 4683 4684 static bool oom_reserves_allowed(struct task_struct *tsk) 4685 { 4686 if (!tsk_is_oom_victim(tsk)) 4687 return false; 4688 4689 /* 4690 * !MMU doesn't have oom reaper so give access to memory reserves 4691 * only to the thread with TIF_MEMDIE set 4692 */ 4693 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) 4694 return false; 4695 4696 return true; 4697 } 4698 4699 /* 4700 * Distinguish requests which really need access to full memory 4701 * reserves from oom victims which can live with a portion of it 4702 */ 4703 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) 4704 { 4705 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 4706 return 0; 4707 if (gfp_mask & __GFP_MEMALLOC) 4708 return ALLOC_NO_WATERMARKS; 4709 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 4710 return ALLOC_NO_WATERMARKS; 4711 if (!in_interrupt()) { 4712 if (current->flags & PF_MEMALLOC) 4713 return ALLOC_NO_WATERMARKS; 4714 else if (oom_reserves_allowed(current)) 4715 return ALLOC_OOM; 4716 } 4717 4718 return 0; 4719 } 4720 4721 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 4722 { 4723 return !!__gfp_pfmemalloc_flags(gfp_mask); 4724 } 4725 4726 /* 4727 * Checks whether it makes sense to retry the reclaim to make forward progress 4728 * for the given allocation request. 4729 * 4730 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row 4731 * without success, or when we couldn't even meet the watermark if we 4732 * reclaimed all remaining pages on the LRU lists. 4733 * 4734 * Returns true if a retry is viable or false to enter the oom path. 4735 */ 4736 static inline bool 4737 should_reclaim_retry(gfp_t gfp_mask, unsigned order, 4738 struct alloc_context *ac, int alloc_flags, 4739 bool did_some_progress, int *no_progress_loops) 4740 { 4741 struct zone *zone; 4742 struct zoneref *z; 4743 bool ret = false; 4744 4745 /* 4746 * Costly allocations might have made progress but this doesn't mean 4747 * their order will become available due to high fragmentation so 4748 * always increment the no progress counter for them 4749 */ 4750 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) 4751 *no_progress_loops = 0; 4752 else 4753 (*no_progress_loops)++; 4754 4755 /* 4756 * Make sure we converge to OOM if we cannot make any progress 4757 * several times in a row. 4758 */ 4759 if (*no_progress_loops > MAX_RECLAIM_RETRIES) { 4760 /* Before OOM, exhaust highatomic_reserve */ 4761 return unreserve_highatomic_pageblock(ac, true); 4762 } 4763 4764 /* 4765 * Keep reclaiming pages while there is a chance this will lead 4766 * somewhere. If none of the target zones can satisfy our allocation 4767 * request even if all reclaimable pages are considered then we are 4768 * screwed and have to go OOM. 4769 */ 4770 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 4771 ac->highest_zoneidx, ac->nodemask) { 4772 unsigned long available; 4773 unsigned long reclaimable; 4774 unsigned long min_wmark = min_wmark_pages(zone); 4775 bool wmark; 4776 4777 available = reclaimable = zone_reclaimable_pages(zone); 4778 available += zone_page_state_snapshot(zone, NR_FREE_PAGES); 4779 4780 /* 4781 * Would the allocation succeed if we reclaimed all 4782 * reclaimable pages?
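 * "available" (free + reclaimable) is an optimistic upper bound on what
 * reclaim could achieve, so checking it against the min watermark tells
 * us whether retrying reclaim can possibly help.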
4783 */ 4784 wmark = __zone_watermark_ok(zone, order, min_wmark, 4785 ac->highest_zoneidx, alloc_flags, available); 4786 trace_reclaim_retry_zone(z, order, reclaimable, 4787 available, min_wmark, *no_progress_loops, wmark); 4788 if (wmark) { 4789 ret = true; 4790 break; 4791 } 4792 } 4793 4794 /* 4795 * Memory allocation/reclaim might be called from a WQ context and the 4796 * current implementation of the WQ concurrency control doesn't 4797 * recognize that a particular WQ is congested if the worker thread is 4798 * looping without ever sleeping. Therefore we have to do a short sleep 4799 * here rather than calling cond_resched(). 4800 */ 4801 if (current->flags & PF_WQ_WORKER) 4802 schedule_timeout_uninterruptible(1); 4803 else 4804 cond_resched(); 4805 return ret; 4806 } 4807 4808 static inline bool 4809 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) 4810 { 4811 /* 4812 * It's possible that cpuset's mems_allowed and the nodemask from 4813 * mempolicy don't intersect. This should be normally dealt with by 4814 * policy_nodemask(), but it's possible to race with cpuset update in 4815 * such a way the check therein was true, and then it became false 4816 * before we got our cpuset_mems_cookie here. 4817 * This assumes that for all allocations, ac->nodemask can come only 4818 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored 4819 * when it does not intersect with the cpuset restrictions) or the 4820 * caller can deal with a violated nodemask. 4821 */ 4822 if (cpusets_enabled() && ac->nodemask && 4823 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { 4824 ac->nodemask = NULL; 4825 return true; 4826 } 4827 4828 /* 4829 * When updating a task's mems_allowed or mempolicy nodemask, it is 4830 * possible to race with parallel threads in such a way that our 4831 * allocation can fail while the mask is being updated. If we are about 4832 * to fail, check if the cpuset changed during allocation and if so, 4833 * retry. 4834 */ 4835 if (read_mems_allowed_retry(cpuset_mems_cookie)) 4836 return true; 4837 4838 return false; 4839 } 4840 4841 static inline struct page * 4842 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 4843 struct alloc_context *ac) 4844 { 4845 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; 4846 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; 4847 struct page *page = NULL; 4848 unsigned int alloc_flags; 4849 unsigned long did_some_progress; 4850 enum compact_priority compact_priority; 4851 enum compact_result compact_result; 4852 int compaction_retries; 4853 int no_progress_loops; 4854 unsigned int cpuset_mems_cookie; 4855 int reserve_flags; 4856 4857 /* 4858 * We also sanity check to catch abuse of atomic reserves being used by 4859 * callers that are not in atomic context. 4860 */ 4861 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == 4862 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) 4863 gfp_mask &= ~__GFP_ATOMIC; 4864 4865 retry_cpuset: 4866 compaction_retries = 0; 4867 no_progress_loops = 0; 4868 compact_priority = DEF_COMPACT_PRIORITY; 4869 cpuset_mems_cookie = read_mems_allowed_begin(); 4870 4871 /* 4872 * The fast path uses conservative alloc_flags to succeed only until 4873 * kswapd needs to be woken up, and to avoid the cost of setting up 4874 * alloc_flags precisely. So we do that now. 
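 * gfp_to_alloc_flags() below rebuilds the ALLOC_* bits from the gfp
 * mask, e.g. ALLOC_HIGH for __GFP_HIGH and ALLOC_HARDER for atomic or
 * realtime callers (see that helper for the details).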
4875 */ 4876 alloc_flags = gfp_to_alloc_flags(gfp_mask); 4877 4878 /* 4879 * We need to recalculate the starting point for the zonelist iterator 4880 * because we might have used different nodemask in the fast path, or 4881 * there was a cpuset modification and we are retrying - otherwise we 4882 * could end up iterating over non-eligible zones endlessly. 4883 */ 4884 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4885 ac->highest_zoneidx, ac->nodemask); 4886 if (!ac->preferred_zoneref->zone) 4887 goto nopage; 4888 4889 /* 4890 * Check for insane configurations where the cpuset doesn't contain 4891 * any suitable zone to satisfy the request - e.g. non-movable 4892 * GFP_HIGHUSER allocations from MOVABLE nodes only. 4893 */ 4894 if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) { 4895 struct zoneref *z = first_zones_zonelist(ac->zonelist, 4896 ac->highest_zoneidx, 4897 &cpuset_current_mems_allowed); 4898 if (!z->zone) 4899 goto nopage; 4900 } 4901 4902 if (alloc_flags & ALLOC_KSWAPD) 4903 wake_all_kswapds(order, gfp_mask, ac); 4904 4905 /* 4906 * The adjusted alloc_flags might result in immediate success, so try 4907 * that first 4908 */ 4909 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4910 if (page) 4911 goto got_pg; 4912 4913 /* 4914 * For costly allocations, try direct compaction first, as it's likely 4915 * that we have enough base pages and don't need to reclaim. For non- 4916 * movable high-order allocations, do that as well, as compaction will 4917 * try prevent permanent fragmentation by migrating from blocks of the 4918 * same migratetype. 4919 * Don't try this for allocations that are allowed to ignore 4920 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 4921 */ 4922 if (can_direct_reclaim && 4923 (costly_order || 4924 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) 4925 && !gfp_pfmemalloc_allowed(gfp_mask)) { 4926 page = __alloc_pages_direct_compact(gfp_mask, order, 4927 alloc_flags, ac, 4928 INIT_COMPACT_PRIORITY, 4929 &compact_result); 4930 if (page) 4931 goto got_pg; 4932 4933 /* 4934 * Checks for costly allocations with __GFP_NORETRY, which 4935 * includes some THP page fault allocations 4936 */ 4937 if (costly_order && (gfp_mask & __GFP_NORETRY)) { 4938 /* 4939 * If allocating entire pageblock(s) and compaction 4940 * failed because all zones are below low watermarks 4941 * or is prohibited because it recently failed at this 4942 * order, fail immediately unless the allocator has 4943 * requested compaction and reclaim retry. 4944 * 4945 * Reclaim is 4946 * - potentially very expensive because zones are far 4947 * below their low watermarks or this is part of very 4948 * bursty high order allocations, 4949 * - not guaranteed to help because isolate_freepages() 4950 * may not iterate over freed pages as part of its 4951 * linear scan, and 4952 * - unlikely to make entire pageblocks free on its 4953 * own. 4954 */ 4955 if (compact_result == COMPACT_SKIPPED || 4956 compact_result == COMPACT_DEFERRED) 4957 goto nopage; 4958 4959 /* 4960 * Looks like reclaim/compaction is worth trying, but 4961 * sync compaction could be very expensive, so keep 4962 * using async compaction. 
4963 */ 4964 compact_priority = INIT_COMPACT_PRIORITY; 4965 } 4966 } 4967 4968 retry: 4969 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 4970 if (alloc_flags & ALLOC_KSWAPD) 4971 wake_all_kswapds(order, gfp_mask, ac); 4972 4973 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 4974 if (reserve_flags) 4975 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); 4976 4977 /* 4978 * Reset the nodemask and zonelist iterators if memory policies can be 4979 * ignored. These allocations are high priority and system rather than 4980 * user oriented. 4981 */ 4982 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { 4983 ac->nodemask = NULL; 4984 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4985 ac->highest_zoneidx, ac->nodemask); 4986 } 4987 4988 /* Attempt with potentially adjusted zonelist and alloc_flags */ 4989 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4990 if (page) 4991 goto got_pg; 4992 4993 /* Caller is not willing to reclaim, we can't balance anything */ 4994 if (!can_direct_reclaim) 4995 goto nopage; 4996 4997 /* Avoid recursion of direct reclaim */ 4998 if (current->flags & PF_MEMALLOC) 4999 goto nopage; 5000 5001 /* Try direct reclaim and then allocating */ 5002 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 5003 &did_some_progress); 5004 if (page) 5005 goto got_pg; 5006 5007 /* Try direct compaction and then allocating */ 5008 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 5009 compact_priority, &compact_result); 5010 if (page) 5011 goto got_pg; 5012 5013 /* Do not loop if specifically requested */ 5014 if (gfp_mask & __GFP_NORETRY) 5015 goto nopage; 5016 5017 /* 5018 * Do not retry costly high order allocations unless they are 5019 * __GFP_RETRY_MAYFAIL 5020 */ 5021 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) 5022 goto nopage; 5023 5024 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 5025 did_some_progress > 0, &no_progress_loops)) 5026 goto retry; 5027 5028 /* 5029 * It doesn't make any sense to retry for the compaction if the order-0 5030 * reclaim is not able to make any progress because the current 5031 * implementation of the compaction depends on the sufficient amount 5032 * of free memory (see __compaction_suitable) 5033 */ 5034 if (did_some_progress > 0 && 5035 should_compact_retry(ac, order, alloc_flags, 5036 compact_result, &compact_priority, 5037 &compaction_retries)) 5038 goto retry; 5039 5040 5041 /* Deal with possible cpuset update races before we start OOM killing */ 5042 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 5043 goto retry_cpuset; 5044 5045 /* Reclaim has failed us, start killing things */ 5046 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 5047 if (page) 5048 goto got_pg; 5049 5050 /* Avoid allocations with no watermarks from looping endlessly */ 5051 if (tsk_is_oom_victim(current) && 5052 (alloc_flags & ALLOC_OOM || 5053 (gfp_mask & __GFP_NOMEMALLOC))) 5054 goto nopage; 5055 5056 /* Retry as long as the OOM killer is making progress */ 5057 if (did_some_progress) { 5058 no_progress_loops = 0; 5059 goto retry; 5060 } 5061 5062 nopage: 5063 /* Deal with possible cpuset update races before we fail */ 5064 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 5065 goto retry_cpuset; 5066 5067 /* 5068 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure 5069 * we always retry 5070 */ 5071 if (gfp_mask & __GFP_NOFAIL) { 5072 /* 5073 * All existing users of the __GFP_NOFAIL are blockable, so 
warn 5074 * of any new users that actually require GFP_NOWAIT 5075 */ 5076 if (WARN_ON_ONCE(!can_direct_reclaim)) 5077 goto fail; 5078 5079 /* 5080 * A PF_MEMALLOC request from this context is rather bizarre 5081 * because we cannot reclaim anything and can only loop waiting 5082 * for somebody to do the work for us 5083 */ 5084 WARN_ON_ONCE(current->flags & PF_MEMALLOC); 5085 5086 /* 5087 * non-failing costly orders are a hard requirement which we 5088 * are not well prepared for, so let's warn about these users 5089 * so that we can identify them and convert them to something 5090 * else. 5091 */ 5092 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); 5093 5094 /* 5095 * Help non-failing allocations by giving them access to memory 5096 * reserves but do not use ALLOC_NO_WATERMARKS because this 5097 * could deplete whole memory reserves which would just make 5098 * the situation worse 5099 */ 5100 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); 5101 if (page) 5102 goto got_pg; 5103 5104 cond_resched(); 5105 goto retry; 5106 } 5107 fail: 5108 warn_alloc(gfp_mask, ac->nodemask, 5109 "page allocation failure: order:%u", order); 5110 got_pg: 5111 return page; 5112 } 5113 5114 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, 5115 int preferred_nid, nodemask_t *nodemask, 5116 struct alloc_context *ac, gfp_t *alloc_gfp, 5117 unsigned int *alloc_flags) 5118 { 5119 ac->highest_zoneidx = gfp_zone(gfp_mask); 5120 ac->zonelist = node_zonelist(preferred_nid, gfp_mask); 5121 ac->nodemask = nodemask; 5122 ac->migratetype = gfp_migratetype(gfp_mask); 5123 5124 if (cpusets_enabled()) { 5125 *alloc_gfp |= __GFP_HARDWALL; 5126 /* 5127 * When we are in interrupt context, it is irrelevant 5128 * to the current task context. It means that any node is ok. 5129 */ 5130 if (in_task() && !ac->nodemask) 5131 ac->nodemask = &cpuset_current_mems_allowed; 5132 else 5133 *alloc_flags |= ALLOC_CPUSET; 5134 } 5135 5136 fs_reclaim_acquire(gfp_mask); 5137 fs_reclaim_release(gfp_mask); 5138 5139 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 5140 5141 if (should_fail_alloc_page(gfp_mask, order)) 5142 return false; 5143 5144 *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); 5145 5146 /* Dirty zone balancing only done in the fast path */ 5147 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); 5148 5149 /* 5150 * The preferred zone is used for statistics but crucially it is 5151 * also used as the starting point for the zonelist iterator. It 5152 * may get reset for allocations that ignore memory policies. 5153 */ 5154 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 5155 ac->highest_zoneidx, ac->nodemask); 5156 5157 return true; 5158 } 5159 5160 /* 5161 * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array 5162 * @gfp: GFP flags for the allocation 5163 * @preferred_nid: The preferred NUMA node ID to allocate from 5164 * @nodemask: Set of nodes to allocate from, may be NULL 5165 * @nr_pages: The number of pages desired on the list or array 5166 * @page_list: Optional list to store the allocated pages 5167 * @page_array: Optional array to store the pages 5168 * 5169 * This is a batched version of the page allocator that attempts to 5170 * allocate nr_pages quickly. Pages are added to page_list if page_list 5171 * is not NULL, otherwise it is assumed that the page_array is valid. 5172 * 5173 * For lists, nr_pages is the number of pages that should be allocated.
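 *
 * An illustrative list-based caller (example only, not lifted from an
 * in-tree user; GFP_KERNEL and numa_node_id() are just plausible
 * choices) might look like:
 *
 *	LIST_HEAD(pages);
 *	unsigned long nr;
 *
 *	nr = __alloc_pages_bulk(GFP_KERNEL, numa_node_id(), NULL, 8,
 *				&pages, NULL);
 *	if (nr < 8)
 *		... fall back to the single page allocator ...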
5174 * 5175 * For arrays, only NULL elements are populated with pages and nr_pages 5176 * is the maximum number of pages that will be stored in the array. 5177 * 5178 * Returns the number of pages on the list or array. 5179 */ 5180 unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, 5181 nodemask_t *nodemask, int nr_pages, 5182 struct list_head *page_list, 5183 struct page **page_array) 5184 { 5185 struct page *page; 5186 unsigned long flags; 5187 struct zone *zone; 5188 struct zoneref *z; 5189 struct per_cpu_pages *pcp; 5190 struct list_head *pcp_list; 5191 struct alloc_context ac; 5192 gfp_t alloc_gfp; 5193 unsigned int alloc_flags = ALLOC_WMARK_LOW; 5194 int nr_populated = 0, nr_account = 0; 5195 5196 /* 5197 * Skip populated array elements to determine if any pages need 5198 * to be allocated before disabling IRQs. 5199 */ 5200 while (page_array && nr_populated < nr_pages && page_array[nr_populated]) 5201 nr_populated++; 5202 5203 /* No pages requested? */ 5204 if (unlikely(nr_pages <= 0)) 5205 goto out; 5206 5207 /* Already populated array? */ 5208 if (unlikely(page_array && nr_pages - nr_populated == 0)) 5209 goto out; 5210 5211 /* Bulk allocator does not support memcg accounting. */ 5212 if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT)) 5213 goto failed; 5214 5215 /* Use the single page allocator for one page. */ 5216 if (nr_pages - nr_populated == 1) 5217 goto failed; 5218 5219 #ifdef CONFIG_PAGE_OWNER 5220 /* 5221 * PAGE_OWNER may recurse into the allocator to allocate space to 5222 * save the stack with pagesets.lock held. Releasing/reacquiring 5223 * removes much of the performance benefit of bulk allocation so 5224 * force the caller to allocate one page at a time, as it will perform 5225 * similarly to adding that complexity to the bulk allocator. 5226 */ 5227 if (static_branch_unlikely(&page_owner_inited)) 5228 goto failed; 5229 #endif 5230 5231 /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ 5232 gfp &= gfp_allowed_mask; 5233 alloc_gfp = gfp; 5234 if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) 5235 goto out; 5236 gfp = alloc_gfp; 5237 5238 /* Find an allowed local zone that meets the low watermark. */ 5239 for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { 5240 unsigned long mark; 5241 5242 if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && 5243 !__cpuset_zone_allowed(zone, gfp)) { 5244 continue; 5245 } 5246 5247 if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && 5248 zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { 5249 goto failed; 5250 } 5251 5252 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; 5253 if (zone_watermark_fast(zone, 0, mark, 5254 zonelist_zone_idx(ac.preferred_zoneref), 5255 alloc_flags, gfp)) { 5256 break; 5257 } 5258 } 5259 5260 /* 5261 * If there are no allowed local zones that meet the watermarks then 5262 * try to allocate a single page and reclaim if necessary.
5263 */ 5264 if (unlikely(!zone)) 5265 goto failed; 5266 5267 /* Attempt the batch allocation */ 5268 local_lock_irqsave(&pagesets.lock, flags); 5269 pcp = this_cpu_ptr(zone->per_cpu_pageset); 5270 pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; 5271 5272 while (nr_populated < nr_pages) { 5273 5274 /* Skip existing pages */ 5275 if (page_array && page_array[nr_populated]) { 5276 nr_populated++; 5277 continue; 5278 } 5279 5280 page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, 5281 pcp, pcp_list); 5282 if (unlikely(!page)) { 5283 /* Try and get at least one page */ 5284 if (!nr_populated) 5285 goto failed_irq; 5286 break; 5287 } 5288 nr_account++; 5289 5290 prep_new_page(page, 0, gfp, 0); 5291 if (page_list) 5292 list_add(&page->lru, page_list); 5293 else 5294 page_array[nr_populated] = page; 5295 nr_populated++; 5296 } 5297 5298 local_unlock_irqrestore(&pagesets.lock, flags); 5299 5300 __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); 5301 zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); 5302 5303 out: 5304 return nr_populated; 5305 5306 failed_irq: 5307 local_unlock_irqrestore(&pagesets.lock, flags); 5308 5309 failed: 5310 page = __alloc_pages(gfp, 0, preferred_nid, nodemask); 5311 if (page) { 5312 if (page_list) 5313 list_add(&page->lru, page_list); 5314 else 5315 page_array[nr_populated] = page; 5316 nr_populated++; 5317 } 5318 5319 goto out; 5320 } 5321 EXPORT_SYMBOL_GPL(__alloc_pages_bulk); 5322 5323 /* 5324 * This is the 'heart' of the zoned buddy allocator. 5325 */ 5326 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, 5327 nodemask_t *nodemask) 5328 { 5329 struct page *page; 5330 unsigned int alloc_flags = ALLOC_WMARK_LOW; 5331 gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ 5332 struct alloc_context ac = { }; 5333 5334 /* 5335 * There are several places where we assume that the order value is sane 5336 * so bail out early if the request is out of bound. 5337 */ 5338 if (unlikely(order >= MAX_ORDER)) { 5339 WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); 5340 return NULL; 5341 } 5342 5343 gfp &= gfp_allowed_mask; 5344 /* 5345 * Apply scoped allocation constraints. This is mainly about GFP_NOFS 5346 * resp. GFP_NOIO which has to be inherited for all allocation requests 5347 * from a particular context which has been marked by 5348 * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures 5349 * movable zones are not used during allocation. 5350 */ 5351 gfp = current_gfp_context(gfp); 5352 alloc_gfp = gfp; 5353 if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, 5354 &alloc_gfp, &alloc_flags)) 5355 return NULL; 5356 5357 /* 5358 * Forbid the first pass from falling back to types that fragment 5359 * memory until all local zones are considered. 5360 */ 5361 alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); 5362 5363 /* First allocation attempt */ 5364 page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); 5365 if (likely(page)) 5366 goto out; 5367 5368 alloc_gfp = gfp; 5369 ac.spread_dirty_pages = false; 5370 5371 /* 5372 * Restore the original nodemask if it was potentially replaced with 5373 * &cpuset_current_mems_allowed to optimize the fast-path attempt. 
5374 */ 5375 ac.nodemask = nodemask; 5376 5377 page = __alloc_pages_slowpath(alloc_gfp, order, &ac); 5378 5379 out: 5380 if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && 5381 unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { 5382 __free_pages(page, order); 5383 page = NULL; 5384 } 5385 5386 trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); 5387 5388 return page; 5389 } 5390 EXPORT_SYMBOL(__alloc_pages); 5391 5392 struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, 5393 nodemask_t *nodemask) 5394 { 5395 struct page *page = __alloc_pages(gfp | __GFP_COMP, order, 5396 preferred_nid, nodemask); 5397 5398 if (page && order > 1) 5399 prep_transhuge_page(page); 5400 return (struct folio *)page; 5401 } 5402 EXPORT_SYMBOL(__folio_alloc); 5403 5404 /* 5405 * Common helper functions. Never use with __GFP_HIGHMEM because the returned 5406 * address cannot represent highmem pages. Use alloc_pages and then kmap if 5407 * you need to access high mem. 5408 */ 5409 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 5410 { 5411 struct page *page; 5412 5413 page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); 5414 if (!page) 5415 return 0; 5416 return (unsigned long) page_address(page); 5417 } 5418 EXPORT_SYMBOL(__get_free_pages); 5419 5420 unsigned long get_zeroed_page(gfp_t gfp_mask) 5421 { 5422 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 5423 } 5424 EXPORT_SYMBOL(get_zeroed_page); 5425 5426 /** 5427 * __free_pages - Free pages allocated with alloc_pages(). 5428 * @page: The page pointer returned from alloc_pages(). 5429 * @order: The order of the allocation. 5430 * 5431 * This function can free multi-page allocations that are not compound 5432 * pages. It does not check that the @order passed in matches that of 5433 * the allocation, so it is easy to leak memory. Freeing more memory 5434 * than was allocated will probably emit a warning. 5435 * 5436 * If the last reference to this page is speculative, it will be released 5437 * by put_page() which only frees the first page of a non-compound 5438 * allocation. To prevent the remaining pages from being leaked, we free 5439 * the subsequent pages here. If you want to use the page's reference 5440 * count to decide when to free the allocation, you should allocate a 5441 * compound page, and use put_page() instead of __free_pages(). 5442 * 5443 * Context: May be called in interrupt context or while holding a normal 5444 * spinlock, but not in NMI context or while holding a raw spinlock. 5445 */ 5446 void __free_pages(struct page *page, unsigned int order) 5447 { 5448 if (put_page_testzero(page)) 5449 free_the_page(page, order); 5450 else if (!PageHead(page)) 5451 while (order-- > 0) 5452 free_the_page(page + (1 << order), order); 5453 } 5454 EXPORT_SYMBOL(__free_pages); 5455 5456 void free_pages(unsigned long addr, unsigned int order) 5457 { 5458 if (addr != 0) { 5459 VM_BUG_ON(!virt_addr_valid((void *)addr)); 5460 __free_pages(virt_to_page((void *)addr), order); 5461 } 5462 } 5463 5464 EXPORT_SYMBOL(free_pages); 5465 5466 /* 5467 * Page Fragment: 5468 * An arbitrary-length arbitrary-offset area of memory which resides 5469 * within a 0 or higher order page. Multiple fragments within that page 5470 * are individually refcounted, in the page's reference counter. 5471 * 5472 * The page_frag functions below provide a simple allocation framework for 5473 * page fragments. 
This is used by the network stack and network device 5474 * drivers to provide a backing region of memory for use as either an 5475 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. 5476 */ 5477 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, 5478 gfp_t gfp_mask) 5479 { 5480 struct page *page = NULL; 5481 gfp_t gfp = gfp_mask; 5482 5483 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 5484 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 5485 __GFP_NOMEMALLOC; 5486 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 5487 PAGE_FRAG_CACHE_MAX_ORDER); 5488 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; 5489 #endif 5490 if (unlikely(!page)) 5491 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 5492 5493 nc->va = page ? page_address(page) : NULL; 5494 5495 return page; 5496 } 5497 5498 void __page_frag_cache_drain(struct page *page, unsigned int count) 5499 { 5500 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 5501 5502 if (page_ref_sub_and_test(page, count)) 5503 free_the_page(page, compound_order(page)); 5504 } 5505 EXPORT_SYMBOL(__page_frag_cache_drain); 5506 5507 void *page_frag_alloc_align(struct page_frag_cache *nc, 5508 unsigned int fragsz, gfp_t gfp_mask, 5509 unsigned int align_mask) 5510 { 5511 unsigned int size = PAGE_SIZE; 5512 struct page *page; 5513 int offset; 5514 5515 if (unlikely(!nc->va)) { 5516 refill: 5517 page = __page_frag_cache_refill(nc, gfp_mask); 5518 if (!page) 5519 return NULL; 5520 5521 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 5522 /* if size can vary use size else just use PAGE_SIZE */ 5523 size = nc->size; 5524 #endif 5525 /* Even if we own the page, we do not use atomic_set(). 5526 * This would break get_page_unless_zero() users. 5527 */ 5528 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); 5529 5530 /* reset page count bias and offset to start of new frag */ 5531 nc->pfmemalloc = page_is_pfmemalloc(page); 5532 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; 5533 nc->offset = size; 5534 } 5535 5536 offset = nc->offset - fragsz; 5537 if (unlikely(offset < 0)) { 5538 page = virt_to_page(nc->va); 5539 5540 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) 5541 goto refill; 5542 5543 if (unlikely(nc->pfmemalloc)) { 5544 free_the_page(page, compound_order(page)); 5545 goto refill; 5546 } 5547 5548 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 5549 /* if size can vary use size else just use PAGE_SIZE */ 5550 size = nc->size; 5551 #endif 5552 /* OK, page count is 0, we can safely set it */ 5553 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); 5554 5555 /* reset page count bias and offset to start of new frag */ 5556 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; 5557 offset = size - fragsz; 5558 } 5559 5560 nc->pagecnt_bias--; 5561 offset &= align_mask; 5562 nc->offset = offset; 5563 5564 return nc->va + offset; 5565 } 5566 EXPORT_SYMBOL(page_frag_alloc_align); 5567 5568 /* 5569 * Frees a page fragment allocated out of either a compound or order 0 page. 
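 * An illustrative pairing, assuming a caller-owned struct
 * page_frag_cache "cache" and the page_frag_alloc() wrapper around
 * page_frag_alloc_align():
 *
 *	void *buf = page_frag_alloc(&cache, 256, GFP_ATOMIC);
 *	...
 *	page_frag_free(buf);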
5570 */ 5571 void page_frag_free(void *addr) 5572 { 5573 struct page *page = virt_to_head_page(addr); 5574 5575 if (unlikely(put_page_testzero(page))) 5576 free_the_page(page, compound_order(page)); 5577 } 5578 EXPORT_SYMBOL(page_frag_free); 5579 5580 static void *make_alloc_exact(unsigned long addr, unsigned int order, 5581 size_t size) 5582 { 5583 if (addr) { 5584 unsigned long alloc_end = addr + (PAGE_SIZE << order); 5585 unsigned long used = addr + PAGE_ALIGN(size); 5586 5587 split_page(virt_to_page((void *)addr), order); 5588 while (used < alloc_end) { 5589 free_page(used); 5590 used += PAGE_SIZE; 5591 } 5592 } 5593 return (void *)addr; 5594 } 5595 5596 /** 5597 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 5598 * @size: the number of bytes to allocate 5599 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 5600 * 5601 * This function is similar to alloc_pages(), except that it allocates the 5602 * minimum number of pages to satisfy the request. alloc_pages() can only 5603 * allocate memory in power-of-two pages. 5604 * 5605 * This function is also limited by MAX_ORDER. 5606 * 5607 * Memory allocated by this function must be released by free_pages_exact(). 5608 * 5609 * Return: pointer to the allocated area or %NULL in case of error. 5610 */ 5611 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 5612 { 5613 unsigned int order = get_order(size); 5614 unsigned long addr; 5615 5616 if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) 5617 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); 5618 5619 addr = __get_free_pages(gfp_mask, order); 5620 return make_alloc_exact(addr, order, size); 5621 } 5622 EXPORT_SYMBOL(alloc_pages_exact); 5623 5624 /** 5625 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 5626 * pages on a node. 5627 * @nid: the preferred node ID where memory should be allocated 5628 * @size: the number of bytes to allocate 5629 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 5630 * 5631 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 5632 * back. 5633 * 5634 * Return: pointer to the allocated area or %NULL in case of error. 5635 */ 5636 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 5637 { 5638 unsigned int order = get_order(size); 5639 struct page *p; 5640 5641 if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) 5642 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); 5643 5644 p = alloc_pages_node(nid, gfp_mask, order); 5645 if (!p) 5646 return NULL; 5647 return make_alloc_exact((unsigned long)page_address(p), order, size); 5648 } 5649 5650 /** 5651 * free_pages_exact - release memory allocated via alloc_pages_exact() 5652 * @virt: the value returned by alloc_pages_exact. 5653 * @size: size of allocation, same value as passed to alloc_pages_exact(). 5654 * 5655 * Release the memory allocated by a previous call to alloc_pages_exact. 5656 */ 5657 void free_pages_exact(void *virt, size_t size) 5658 { 5659 unsigned long addr = (unsigned long)virt; 5660 unsigned long end = addr + PAGE_ALIGN(size); 5661 5662 while (addr < end) { 5663 free_page(addr); 5664 addr += PAGE_SIZE; 5665 } 5666 } 5667 EXPORT_SYMBOL(free_pages_exact); 5668 5669 /** 5670 * nr_free_zone_pages - count number of pages beyond high watermark 5671 * @offset: The zone index of the highest zone 5672 * 5673 * nr_free_zone_pages() counts the number of pages which are beyond the 5674 * high watermark within all zones at or below a given zone index. 
For each 5675 * zone, the number of pages is calculated as: 5676 * 5677 * nr_free_zone_pages = managed_pages - high_pages 5678 * 5679 * Return: number of pages beyond high watermark. 5680 */ 5681 static unsigned long nr_free_zone_pages(int offset) 5682 { 5683 struct zoneref *z; 5684 struct zone *zone; 5685 5686 /* Just pick one node, since fallback list is circular */ 5687 unsigned long sum = 0; 5688 5689 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 5690 5691 for_each_zone_zonelist(zone, z, zonelist, offset) { 5692 unsigned long size = zone_managed_pages(zone); 5693 unsigned long high = high_wmark_pages(zone); 5694 if (size > high) 5695 sum += size - high; 5696 } 5697 5698 return sum; 5699 } 5700 5701 /** 5702 * nr_free_buffer_pages - count number of pages beyond high watermark 5703 * 5704 * nr_free_buffer_pages() counts the number of pages which are beyond the high 5705 * watermark within ZONE_DMA and ZONE_NORMAL. 5706 * 5707 * Return: number of pages beyond high watermark within ZONE_DMA and 5708 * ZONE_NORMAL. 5709 */ 5710 unsigned long nr_free_buffer_pages(void) 5711 { 5712 return nr_free_zone_pages(gfp_zone(GFP_USER)); 5713 } 5714 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 5715 5716 static inline void show_node(struct zone *zone) 5717 { 5718 if (IS_ENABLED(CONFIG_NUMA)) 5719 printk("Node %d ", zone_to_nid(zone)); 5720 } 5721 5722 long si_mem_available(void) 5723 { 5724 long available; 5725 unsigned long pagecache; 5726 unsigned long wmark_low = 0; 5727 unsigned long pages[NR_LRU_LISTS]; 5728 unsigned long reclaimable; 5729 struct zone *zone; 5730 int lru; 5731 5732 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 5733 pages[lru] = global_node_page_state(NR_LRU_BASE + lru); 5734 5735 for_each_zone(zone) 5736 wmark_low += low_wmark_pages(zone); 5737 5738 /* 5739 * Estimate the amount of memory available for userspace allocations, 5740 * without causing swapping. 5741 */ 5742 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; 5743 5744 /* 5745 * Not all the page cache can be freed, otherwise the system will 5746 * start swapping. Assume at least half of the page cache, or the 5747 * low watermark worth of cache, needs to stay. 5748 */ 5749 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; 5750 pagecache -= min(pagecache / 2, wmark_low); 5751 available += pagecache; 5752 5753 /* 5754 * Part of the reclaimable slab and other kernel memory consists of 5755 * items that are in use, and cannot be freed. Cap this estimate at the 5756 * low watermark. 
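 *
 * Putting the pieces together, the value computed here is roughly (a sketch
 * of the code in this function, not a guarantee):
 *
 *	available = free - totalreserve_pages
 *		  + pagecache   - min(pagecache / 2,   wmark_low)
 *		  + reclaimable - min(reclaimable / 2, wmark_low)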
5757 */ 5758 reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + 5759 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); 5760 available += reclaimable - min(reclaimable / 2, wmark_low); 5761 5762 if (available < 0) 5763 available = 0; 5764 return available; 5765 } 5766 EXPORT_SYMBOL_GPL(si_mem_available); 5767 5768 void si_meminfo(struct sysinfo *val) 5769 { 5770 val->totalram = totalram_pages(); 5771 val->sharedram = global_node_page_state(NR_SHMEM); 5772 val->freeram = global_zone_page_state(NR_FREE_PAGES); 5773 val->bufferram = nr_blockdev_pages(); 5774 val->totalhigh = totalhigh_pages(); 5775 val->freehigh = nr_free_highpages(); 5776 val->mem_unit = PAGE_SIZE; 5777 } 5778 5779 EXPORT_SYMBOL(si_meminfo); 5780 5781 #ifdef CONFIG_NUMA 5782 void si_meminfo_node(struct sysinfo *val, int nid) 5783 { 5784 int zone_type; /* needs to be signed */ 5785 unsigned long managed_pages = 0; 5786 unsigned long managed_highpages = 0; 5787 unsigned long free_highpages = 0; 5788 pg_data_t *pgdat = NODE_DATA(nid); 5789 5790 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 5791 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); 5792 val->totalram = managed_pages; 5793 val->sharedram = node_page_state(pgdat, NR_SHMEM); 5794 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); 5795 #ifdef CONFIG_HIGHMEM 5796 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 5797 struct zone *zone = &pgdat->node_zones[zone_type]; 5798 5799 if (is_highmem(zone)) { 5800 managed_highpages += zone_managed_pages(zone); 5801 free_highpages += zone_page_state(zone, NR_FREE_PAGES); 5802 } 5803 } 5804 val->totalhigh = managed_highpages; 5805 val->freehigh = free_highpages; 5806 #else 5807 val->totalhigh = managed_highpages; 5808 val->freehigh = free_highpages; 5809 #endif 5810 val->mem_unit = PAGE_SIZE; 5811 } 5812 #endif 5813 5814 /* 5815 * Determine whether the node should be displayed or not, depending on whether 5816 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 5817 */ 5818 static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) 5819 { 5820 if (!(flags & SHOW_MEM_FILTER_NODES)) 5821 return false; 5822 5823 /* 5824 * no node mask - aka implicit memory numa policy. Do not bother with 5825 * the synchronization - read_mems_allowed_begin - because we do not 5826 * have to be precise here. 5827 */ 5828 if (!nodemask) 5829 nodemask = &cpuset_current_mems_allowed; 5830 5831 return !node_isset(nid, *nodemask); 5832 } 5833 5834 #define K(x) ((x) << (PAGE_SHIFT-10)) 5835 5836 static void show_migration_types(unsigned char type) 5837 { 5838 static const char types[MIGRATE_TYPES] = { 5839 [MIGRATE_UNMOVABLE] = 'U', 5840 [MIGRATE_MOVABLE] = 'M', 5841 [MIGRATE_RECLAIMABLE] = 'E', 5842 [MIGRATE_HIGHATOMIC] = 'H', 5843 #ifdef CONFIG_CMA 5844 [MIGRATE_CMA] = 'C', 5845 #endif 5846 #ifdef CONFIG_MEMORY_ISOLATION 5847 [MIGRATE_ISOLATE] = 'I', 5848 #endif 5849 }; 5850 char tmp[MIGRATE_TYPES + 1]; 5851 char *p = tmp; 5852 int i; 5853 5854 for (i = 0; i < MIGRATE_TYPES; i++) { 5855 if (type & (1 << i)) 5856 *p++ = types[i]; 5857 } 5858 5859 *p = '\0'; 5860 printk(KERN_CONT "(%s) ", tmp); 5861 } 5862 5863 /* 5864 * Show free area list (used inside shift_scroll-lock stuff) 5865 * We also calculate the percentage fragmentation. We do this by counting the 5866 * memory on each free list with the exception of the first item on the list. 5867 * 5868 * Bits in @filter: 5869 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's 5870 * cpuset. 
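 *
 * The per-order free counts printed at the end are annotated with the
 * migratetypes present on each free list, using the single-letter codes from
 * show_migration_types(): U (unmovable), M (movable), E (reclaimable),
 * H (highatomic), C (CMA) and I (isolate).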
5871 */ 5872 void show_free_areas(unsigned int filter, nodemask_t *nodemask) 5873 { 5874 unsigned long free_pcp = 0; 5875 int cpu; 5876 struct zone *zone; 5877 pg_data_t *pgdat; 5878 5879 for_each_populated_zone(zone) { 5880 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 5881 continue; 5882 5883 for_each_online_cpu(cpu) 5884 free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; 5885 } 5886 5887 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 5888 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 5889 " unevictable:%lu dirty:%lu writeback:%lu\n" 5890 " slab_reclaimable:%lu slab_unreclaimable:%lu\n" 5891 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 5892 " kernel_misc_reclaimable:%lu\n" 5893 " free:%lu free_pcp:%lu free_cma:%lu\n", 5894 global_node_page_state(NR_ACTIVE_ANON), 5895 global_node_page_state(NR_INACTIVE_ANON), 5896 global_node_page_state(NR_ISOLATED_ANON), 5897 global_node_page_state(NR_ACTIVE_FILE), 5898 global_node_page_state(NR_INACTIVE_FILE), 5899 global_node_page_state(NR_ISOLATED_FILE), 5900 global_node_page_state(NR_UNEVICTABLE), 5901 global_node_page_state(NR_FILE_DIRTY), 5902 global_node_page_state(NR_WRITEBACK), 5903 global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), 5904 global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), 5905 global_node_page_state(NR_FILE_MAPPED), 5906 global_node_page_state(NR_SHMEM), 5907 global_node_page_state(NR_PAGETABLE), 5908 global_zone_page_state(NR_BOUNCE), 5909 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), 5910 global_zone_page_state(NR_FREE_PAGES), 5911 free_pcp, 5912 global_zone_page_state(NR_FREE_CMA_PAGES)); 5913 5914 for_each_online_pgdat(pgdat) { 5915 if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) 5916 continue; 5917 5918 printk("Node %d" 5919 " active_anon:%lukB" 5920 " inactive_anon:%lukB" 5921 " active_file:%lukB" 5922 " inactive_file:%lukB" 5923 " unevictable:%lukB" 5924 " isolated(anon):%lukB" 5925 " isolated(file):%lukB" 5926 " mapped:%lukB" 5927 " dirty:%lukB" 5928 " writeback:%lukB" 5929 " shmem:%lukB" 5930 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5931 " shmem_thp: %lukB" 5932 " shmem_pmdmapped: %lukB" 5933 " anon_thp: %lukB" 5934 #endif 5935 " writeback_tmp:%lukB" 5936 " kernel_stack:%lukB" 5937 #ifdef CONFIG_SHADOW_CALL_STACK 5938 " shadow_call_stack:%lukB" 5939 #endif 5940 " pagetables:%lukB" 5941 " all_unreclaimable? %s" 5942 "\n", 5943 pgdat->node_id, 5944 K(node_page_state(pgdat, NR_ACTIVE_ANON)), 5945 K(node_page_state(pgdat, NR_INACTIVE_ANON)), 5946 K(node_page_state(pgdat, NR_ACTIVE_FILE)), 5947 K(node_page_state(pgdat, NR_INACTIVE_FILE)), 5948 K(node_page_state(pgdat, NR_UNEVICTABLE)), 5949 K(node_page_state(pgdat, NR_ISOLATED_ANON)), 5950 K(node_page_state(pgdat, NR_ISOLATED_FILE)), 5951 K(node_page_state(pgdat, NR_FILE_MAPPED)), 5952 K(node_page_state(pgdat, NR_FILE_DIRTY)), 5953 K(node_page_state(pgdat, NR_WRITEBACK)), 5954 K(node_page_state(pgdat, NR_SHMEM)), 5955 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5956 K(node_page_state(pgdat, NR_SHMEM_THPS)), 5957 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), 5958 K(node_page_state(pgdat, NR_ANON_THPS)), 5959 #endif 5960 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), 5961 node_page_state(pgdat, NR_KERNEL_STACK_KB), 5962 #ifdef CONFIG_SHADOW_CALL_STACK 5963 node_page_state(pgdat, NR_KERNEL_SCS_KB), 5964 #endif 5965 K(node_page_state(pgdat, NR_PAGETABLE)), 5966 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 
5967 "yes" : "no"); 5968 } 5969 5970 for_each_populated_zone(zone) { 5971 int i; 5972 5973 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 5974 continue; 5975 5976 free_pcp = 0; 5977 for_each_online_cpu(cpu) 5978 free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; 5979 5980 show_node(zone); 5981 printk(KERN_CONT 5982 "%s" 5983 " free:%lukB" 5984 " boost:%lukB" 5985 " min:%lukB" 5986 " low:%lukB" 5987 " high:%lukB" 5988 " reserved_highatomic:%luKB" 5989 " active_anon:%lukB" 5990 " inactive_anon:%lukB" 5991 " active_file:%lukB" 5992 " inactive_file:%lukB" 5993 " unevictable:%lukB" 5994 " writepending:%lukB" 5995 " present:%lukB" 5996 " managed:%lukB" 5997 " mlocked:%lukB" 5998 " bounce:%lukB" 5999 " free_pcp:%lukB" 6000 " local_pcp:%ukB" 6001 " free_cma:%lukB" 6002 "\n", 6003 zone->name, 6004 K(zone_page_state(zone, NR_FREE_PAGES)), 6005 K(zone->watermark_boost), 6006 K(min_wmark_pages(zone)), 6007 K(low_wmark_pages(zone)), 6008 K(high_wmark_pages(zone)), 6009 K(zone->nr_reserved_highatomic), 6010 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), 6011 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), 6012 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), 6013 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), 6014 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), 6015 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), 6016 K(zone->present_pages), 6017 K(zone_managed_pages(zone)), 6018 K(zone_page_state(zone, NR_MLOCK)), 6019 K(zone_page_state(zone, NR_BOUNCE)), 6020 K(free_pcp), 6021 K(this_cpu_read(zone->per_cpu_pageset->count)), 6022 K(zone_page_state(zone, NR_FREE_CMA_PAGES))); 6023 printk("lowmem_reserve[]:"); 6024 for (i = 0; i < MAX_NR_ZONES; i++) 6025 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); 6026 printk(KERN_CONT "\n"); 6027 } 6028 6029 for_each_populated_zone(zone) { 6030 unsigned int order; 6031 unsigned long nr[MAX_ORDER], flags, total = 0; 6032 unsigned char types[MAX_ORDER]; 6033 6034 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 6035 continue; 6036 show_node(zone); 6037 printk(KERN_CONT "%s: ", zone->name); 6038 6039 spin_lock_irqsave(&zone->lock, flags); 6040 for (order = 0; order < MAX_ORDER; order++) { 6041 struct free_area *area = &zone->free_area[order]; 6042 int type; 6043 6044 nr[order] = area->nr_free; 6045 total += nr[order] << order; 6046 6047 types[order] = 0; 6048 for (type = 0; type < MIGRATE_TYPES; type++) { 6049 if (!free_area_empty(area, type)) 6050 types[order] |= 1 << type; 6051 } 6052 } 6053 spin_unlock_irqrestore(&zone->lock, flags); 6054 for (order = 0; order < MAX_ORDER; order++) { 6055 printk(KERN_CONT "%lu*%lukB ", 6056 nr[order], K(1UL) << order); 6057 if (nr[order]) 6058 show_migration_types(types[order]); 6059 } 6060 printk(KERN_CONT "= %lukB\n", K(total)); 6061 } 6062 6063 hugetlb_show_meminfo(); 6064 6065 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); 6066 6067 show_swap_cache_info(); 6068 } 6069 6070 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 6071 { 6072 zoneref->zone = zone; 6073 zoneref->zone_idx = zone_idx(zone); 6074 } 6075 6076 /* 6077 * Builds allocation fallback zone lists. 6078 * 6079 * Add all populated zones of a node to the zonelist. 
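 * Zones are added from the highest populated zone index down to the lowest,
 * so a zonelist built from these entries falls back from e.g. ZONE_NORMAL
 * towards ZONE_DMA32/ZONE_DMA rather than the other way around.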
6080 */ 6081 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) 6082 { 6083 struct zone *zone; 6084 enum zone_type zone_type = MAX_NR_ZONES; 6085 int nr_zones = 0; 6086 6087 do { 6088 zone_type--; 6089 zone = pgdat->node_zones + zone_type; 6090 if (populated_zone(zone)) { 6091 zoneref_set_zone(zone, &zonerefs[nr_zones++]); 6092 check_highest_zone(zone_type); 6093 } 6094 } while (zone_type); 6095 6096 return nr_zones; 6097 } 6098 6099 #ifdef CONFIG_NUMA 6100 6101 static int __parse_numa_zonelist_order(char *s) 6102 { 6103 /* 6104 * We used to support different zonelists modes but they turned 6105 * out to be just not useful. Let's keep the warning in place 6106 * if somebody still use the cmd line parameter so that we do 6107 * not fail it silently 6108 */ 6109 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { 6110 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); 6111 return -EINVAL; 6112 } 6113 return 0; 6114 } 6115 6116 char numa_zonelist_order[] = "Node"; 6117 6118 /* 6119 * sysctl handler for numa_zonelist_order 6120 */ 6121 int numa_zonelist_order_handler(struct ctl_table *table, int write, 6122 void *buffer, size_t *length, loff_t *ppos) 6123 { 6124 if (write) 6125 return __parse_numa_zonelist_order(buffer); 6126 return proc_dostring(table, write, buffer, length, ppos); 6127 } 6128 6129 6130 static int node_load[MAX_NUMNODES]; 6131 6132 /** 6133 * find_next_best_node - find the next node that should appear in a given node's fallback list 6134 * @node: node whose fallback list we're appending 6135 * @used_node_mask: nodemask_t of already used nodes 6136 * 6137 * We use a number of factors to determine which is the next node that should 6138 * appear on a given node's fallback list. The node should not have appeared 6139 * already in @node's fallback list, and it should be the next closest node 6140 * according to the distance array (which contains arbitrary distance values 6141 * from each node to each node in the system), and should also prefer nodes 6142 * with no CPUs, since presumably they'll have very little allocation pressure 6143 * on them otherwise. 6144 * 6145 * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 6146 */ 6147 int find_next_best_node(int node, nodemask_t *used_node_mask) 6148 { 6149 int n, val; 6150 int min_val = INT_MAX; 6151 int best_node = NUMA_NO_NODE; 6152 6153 /* Use the local node if we haven't already */ 6154 if (!node_isset(node, *used_node_mask)) { 6155 node_set(node, *used_node_mask); 6156 return node; 6157 } 6158 6159 for_each_node_state(n, N_MEMORY) { 6160 6161 /* Don't want a node to appear more than once */ 6162 if (node_isset(n, *used_node_mask)) 6163 continue; 6164 6165 /* Use the distance array to find the distance */ 6166 val = node_distance(node, n); 6167 6168 /* Penalize nodes under us ("prefer the next node") */ 6169 val += (n < node); 6170 6171 /* Give preference to headless and unused nodes */ 6172 if (!cpumask_empty(cpumask_of_node(n))) 6173 val += PENALTY_FOR_NODE_WITH_CPUS; 6174 6175 /* Slight preference for less loaded node */ 6176 val *= MAX_NUMNODES; 6177 val += node_load[n]; 6178 6179 if (val < min_val) { 6180 min_val = val; 6181 best_node = n; 6182 } 6183 } 6184 6185 if (best_node >= 0) 6186 node_set(best_node, *used_node_mask); 6187 6188 return best_node; 6189 } 6190 6191 6192 /* 6193 * Build zonelists ordered by node and zones within node. 
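 * (For instance, on a hypothetical two-node machine, node 0's fallback list
 * holds node 0's zones from highest to lowest, followed by node 1's zones.)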
6194 * This results in maximum locality--normal zone overflows into local 6195 * DMA zone, if any--but risks exhausting DMA zone. 6196 */ 6197 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, 6198 unsigned nr_nodes) 6199 { 6200 struct zoneref *zonerefs; 6201 int i; 6202 6203 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 6204 6205 for (i = 0; i < nr_nodes; i++) { 6206 int nr_zones; 6207 6208 pg_data_t *node = NODE_DATA(node_order[i]); 6209 6210 nr_zones = build_zonerefs_node(node, zonerefs); 6211 zonerefs += nr_zones; 6212 } 6213 zonerefs->zone = NULL; 6214 zonerefs->zone_idx = 0; 6215 } 6216 6217 /* 6218 * Build gfp_thisnode zonelists 6219 */ 6220 static void build_thisnode_zonelists(pg_data_t *pgdat) 6221 { 6222 struct zoneref *zonerefs; 6223 int nr_zones; 6224 6225 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; 6226 nr_zones = build_zonerefs_node(pgdat, zonerefs); 6227 zonerefs += nr_zones; 6228 zonerefs->zone = NULL; 6229 zonerefs->zone_idx = 0; 6230 } 6231 6232 /* 6233 * Build zonelists ordered by zone and nodes within zones. 6234 * This results in conserving DMA zone[s] until all Normal memory is 6235 * exhausted, but results in overflowing to remote node while memory 6236 * may still exist in local DMA zone. 6237 */ 6238 6239 static void build_zonelists(pg_data_t *pgdat) 6240 { 6241 static int node_order[MAX_NUMNODES]; 6242 int node, nr_nodes = 0; 6243 nodemask_t used_mask = NODE_MASK_NONE; 6244 int local_node, prev_node; 6245 6246 /* NUMA-aware ordering of nodes */ 6247 local_node = pgdat->node_id; 6248 prev_node = local_node; 6249 6250 memset(node_order, 0, sizeof(node_order)); 6251 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 6252 /* 6253 * We don't want to pressure a particular node. 6254 * So adding penalty to the first node in same 6255 * distance group to make it round-robin. 6256 */ 6257 if (node_distance(local_node, node) != 6258 node_distance(local_node, prev_node)) 6259 node_load[node] += 1; 6260 6261 node_order[nr_nodes++] = node; 6262 prev_node = node; 6263 } 6264 6265 build_zonelists_in_node_order(pgdat, node_order, nr_nodes); 6266 build_thisnode_zonelists(pgdat); 6267 pr_info("Fallback order for Node %d: ", local_node); 6268 for (node = 0; node < nr_nodes; node++) 6269 pr_cont("%d ", node_order[node]); 6270 pr_cont("\n"); 6271 } 6272 6273 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 6274 /* 6275 * Return node id of node used for "local" allocations. 6276 * I.e., first node id of first zone in arg node's generic zonelist. 6277 * Used for initializing percpu 'numa_mem', which is used primarily 6278 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 6279 */ 6280 int local_memory_node(int node) 6281 { 6282 struct zoneref *z; 6283 6284 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 6285 gfp_zone(GFP_KERNEL), 6286 NULL); 6287 return zone_to_nid(z->zone); 6288 } 6289 #endif 6290 6291 static void setup_min_unmapped_ratio(void); 6292 static void setup_min_slab_ratio(void); 6293 #else /* CONFIG_NUMA */ 6294 6295 static void build_zonelists(pg_data_t *pgdat) 6296 { 6297 int node, local_node; 6298 struct zoneref *zonerefs; 6299 int nr_zones; 6300 6301 local_node = pgdat->node_id; 6302 6303 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 6304 nr_zones = build_zonerefs_node(pgdat, zonerefs); 6305 zonerefs += nr_zones; 6306 6307 /* 6308 * Now we build the zonelist so that it contains the zones 6309 * of all the other nodes. 
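 * (With CONFIG_NUMA disabled there is normally only a single online node, so
 * the two fallback loops below typically contribute nothing beyond the local
 * node's zones.)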
6310 * We don't want to pressure a particular node, so when 6311 * building the zones for node N, we make sure that the 6312 * zones coming right after the local ones are those from 6313 * node N+1 (modulo N) 6314 */ 6315 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 6316 if (!node_online(node)) 6317 continue; 6318 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 6319 zonerefs += nr_zones; 6320 } 6321 for (node = 0; node < local_node; node++) { 6322 if (!node_online(node)) 6323 continue; 6324 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 6325 zonerefs += nr_zones; 6326 } 6327 6328 zonerefs->zone = NULL; 6329 zonerefs->zone_idx = 0; 6330 } 6331 6332 #endif /* CONFIG_NUMA */ 6333 6334 /* 6335 * Boot pageset table. One per cpu which is going to be used for all 6336 * zones and all nodes. The parameters will be set in such a way 6337 * that an item put on a list will immediately be handed over to 6338 * the buddy list. This is safe since pageset manipulation is done 6339 * with interrupts disabled. 6340 * 6341 * The boot_pagesets must be kept even after bootup is complete for 6342 * unused processors and/or zones. They do play a role for bootstrapping 6343 * hotplugged processors. 6344 * 6345 * zoneinfo_show() and maybe other functions do 6346 * not check if the processor is online before following the pageset pointer. 6347 * Other parts of the kernel may not check if the zone is available. 6348 */ 6349 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); 6350 /* These effectively disable the pcplists in the boot pageset completely */ 6351 #define BOOT_PAGESET_HIGH 0 6352 #define BOOT_PAGESET_BATCH 1 6353 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); 6354 static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); 6355 DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); 6356 6357 static void __build_all_zonelists(void *data) 6358 { 6359 int nid; 6360 int __maybe_unused cpu; 6361 pg_data_t *self = data; 6362 static DEFINE_SPINLOCK(lock); 6363 6364 spin_lock(&lock); 6365 6366 #ifdef CONFIG_NUMA 6367 memset(node_load, 0, sizeof(node_load)); 6368 #endif 6369 6370 /* 6371 * This node is hotadded and no memory is yet present. So just 6372 * building zonelists is fine - no need to touch other nodes. 6373 */ 6374 if (self && !node_online(self->node_id)) { 6375 build_zonelists(self); 6376 } else { 6377 /* 6378 * All possible nodes have pgdat preallocated 6379 * in free_area_init 6380 */ 6381 for_each_node(nid) { 6382 pg_data_t *pgdat = NODE_DATA(nid); 6383 6384 build_zonelists(pgdat); 6385 } 6386 6387 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 6388 /* 6389 * We now know the "local memory node" for each node-- 6390 * i.e., the node of the first zone in the generic zonelist. 6391 * Set up numa_mem percpu variable for on-line cpus. During 6392 * boot, only the boot cpu should be on-line; we'll init the 6393 * secondary cpus' numa_mem as they come on-line. During 6394 * node/memory hotplug, we'll fixup all on-line cpus. 6395 */ 6396 for_each_online_cpu(cpu) 6397 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 6398 #endif 6399 } 6400 6401 spin_unlock(&lock); 6402 } 6403 6404 static noinline void __init 6405 build_all_zonelists_init(void) 6406 { 6407 int cpu; 6408 6409 __build_all_zonelists(NULL); 6410 6411 /* 6412 * Initialize the boot_pagesets that are going to be used 6413 * for bootstrapping processors. 
The real pagesets for 6414 * each zone will be allocated later when the per cpu 6415 * allocator is available. 6416 * 6417 * boot_pagesets are used also for bootstrapping offline 6418 * cpus if the system is already booted because the pagesets 6419 * are needed to initialize allocators on a specific cpu too. 6420 * F.e. the percpu allocator needs the page allocator which 6421 * needs the percpu allocator in order to allocate its pagesets 6422 * (a chicken-egg dilemma). 6423 */ 6424 for_each_possible_cpu(cpu) 6425 per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu)); 6426 6427 mminit_verify_zonelist(); 6428 cpuset_init_current_mems_allowed(); 6429 } 6430 6431 /* 6432 * unless system_state == SYSTEM_BOOTING. 6433 * 6434 * __ref due to call of __init annotated helper build_all_zonelists_init 6435 * [protected by SYSTEM_BOOTING]. 6436 */ 6437 void __ref build_all_zonelists(pg_data_t *pgdat) 6438 { 6439 unsigned long vm_total_pages; 6440 6441 if (system_state == SYSTEM_BOOTING) { 6442 build_all_zonelists_init(); 6443 } else { 6444 __build_all_zonelists(pgdat); 6445 /* cpuset refresh routine should be here */ 6446 } 6447 /* Get the number of free pages beyond high watermark in all zones. */ 6448 vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 6449 /* 6450 * Disable grouping by mobility if the number of pages in the 6451 * system is too low to allow the mechanism to work. It would be 6452 * more accurate, but expensive to check per-zone. This check is 6453 * made on memory-hotadd so a system can start with mobility 6454 * disabled and enable it later 6455 */ 6456 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 6457 page_group_by_mobility_disabled = 1; 6458 else 6459 page_group_by_mobility_disabled = 0; 6460 6461 pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", 6462 nr_online_nodes, 6463 page_group_by_mobility_disabled ? "off" : "on", 6464 vm_total_pages); 6465 #ifdef CONFIG_NUMA 6466 pr_info("Policy zone: %s\n", zone_names[policy_zone]); 6467 #endif 6468 } 6469 6470 /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ 6471 static bool __meminit 6472 overlap_memmap_init(unsigned long zone, unsigned long *pfn) 6473 { 6474 static struct memblock_region *r; 6475 6476 if (mirrored_kernelcore && zone == ZONE_MOVABLE) { 6477 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { 6478 for_each_mem_region(r) { 6479 if (*pfn < memblock_region_memory_end_pfn(r)) 6480 break; 6481 } 6482 } 6483 if (*pfn >= memblock_region_memory_base_pfn(r) && 6484 memblock_is_mirror(r)) { 6485 *pfn = memblock_region_memory_end_pfn(r); 6486 return true; 6487 } 6488 } 6489 return false; 6490 } 6491 6492 /* 6493 * Initially all pages are reserved - free ones are freed 6494 * up by memblock_free_all() once the early boot process is 6495 * done. Non-atomic initialization, single-pass. 6496 * 6497 * All aligned pageblocks are initialized to the specified migratetype 6498 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related 6499 * zone stats (e.g., nr_isolate_pageblock) are touched. 
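 * The migratetype is tracked per pageblock (pageblock_nr_pages pages), which
 * is why set_pageblock_migratetype() is only called below for
 * pageblock-aligned pfns.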
6500 */ 6501 void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, 6502 unsigned long start_pfn, unsigned long zone_end_pfn, 6503 enum meminit_context context, 6504 struct vmem_altmap *altmap, int migratetype) 6505 { 6506 unsigned long pfn, end_pfn = start_pfn + size; 6507 struct page *page; 6508 6509 if (highest_memmap_pfn < end_pfn - 1) 6510 highest_memmap_pfn = end_pfn - 1; 6511 6512 #ifdef CONFIG_ZONE_DEVICE 6513 /* 6514 * Honor reservation requested by the driver for this ZONE_DEVICE 6515 * memory. We limit the total number of pages to initialize to just 6516 * those that might contain the memory mapping. We will defer the 6517 * ZONE_DEVICE page initialization until after we have released 6518 * the hotplug lock. 6519 */ 6520 if (zone == ZONE_DEVICE) { 6521 if (!altmap) 6522 return; 6523 6524 if (start_pfn == altmap->base_pfn) 6525 start_pfn += altmap->reserve; 6526 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); 6527 } 6528 #endif 6529 6530 for (pfn = start_pfn; pfn < end_pfn; ) { 6531 /* 6532 * There can be holes in boot-time mem_map[]s handed to this 6533 * function. They do not exist on hotplugged memory. 6534 */ 6535 if (context == MEMINIT_EARLY) { 6536 if (overlap_memmap_init(zone, &pfn)) 6537 continue; 6538 if (defer_init(nid, pfn, zone_end_pfn)) 6539 break; 6540 } 6541 6542 page = pfn_to_page(pfn); 6543 __init_single_page(page, pfn, zone, nid); 6544 if (context == MEMINIT_HOTPLUG) 6545 __SetPageReserved(page); 6546 6547 /* 6548 * Usually, we want to mark the pageblock MIGRATE_MOVABLE, 6549 * such that unmovable allocations won't be scattered all 6550 * over the place during system boot. 6551 */ 6552 if (IS_ALIGNED(pfn, pageblock_nr_pages)) { 6553 set_pageblock_migratetype(page, migratetype); 6554 cond_resched(); 6555 } 6556 pfn++; 6557 } 6558 } 6559 6560 #ifdef CONFIG_ZONE_DEVICE 6561 static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, 6562 unsigned long zone_idx, int nid, 6563 struct dev_pagemap *pgmap) 6564 { 6565 6566 __init_single_page(page, pfn, zone_idx, nid); 6567 6568 /* 6569 * Mark page reserved as it will need to wait for onlining 6570 * phase for it to be fully associated with a zone. 6571 * 6572 * We can use the non-atomic __set_bit operation for setting 6573 * the flag as we are still initializing the pages. 6574 */ 6575 __SetPageReserved(page); 6576 6577 /* 6578 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer 6579 * and zone_device_data. It is a bug if a ZONE_DEVICE page is 6580 * ever freed or placed on a driver-private list. 6581 */ 6582 page->pgmap = pgmap; 6583 page->zone_device_data = NULL; 6584 6585 /* 6586 * Mark the block movable so that blocks are reserved for 6587 * movable at startup. This will force kernel allocations 6588 * to reserve their blocks rather than leaking throughout 6589 * the address space during boot when many long-lived 6590 * kernel allocations are made. 6591 * 6592 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap 6593 * because this is done early in section_activate() 6594 */ 6595 if (IS_ALIGNED(pfn, pageblock_nr_pages)) { 6596 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 6597 cond_resched(); 6598 } 6599 } 6600 6601 /* 6602 * With compound page geometry and when struct pages are stored in ram most 6603 * tail pages are reused. Consequently, the amount of unique struct pages to 6604 * initialize is a lot smaller that the total amount of struct pages being 6605 * mapped. 
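 * (As an illustration, assuming 4K base pages and a 64-byte struct page:
 * only 2 * (PAGE_SIZE / sizeof(struct page)) = 128 struct pages per compound
 * page need to be initialized, however large the compound page is.)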
This is a paired / mild layering violation with explicit knowledge 6606 * of how the sparse_vmemmap internals handle compound pages in the lack 6607 * of an altmap. See vmemmap_populate_compound_pages(). 6608 */ 6609 static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, 6610 unsigned long nr_pages) 6611 { 6612 return is_power_of_2(sizeof(struct page)) && 6613 !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages; 6614 } 6615 6616 static void __ref memmap_init_compound(struct page *head, 6617 unsigned long head_pfn, 6618 unsigned long zone_idx, int nid, 6619 struct dev_pagemap *pgmap, 6620 unsigned long nr_pages) 6621 { 6622 unsigned long pfn, end_pfn = head_pfn + nr_pages; 6623 unsigned int order = pgmap->vmemmap_shift; 6624 6625 __SetPageHead(head); 6626 for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { 6627 struct page *page = pfn_to_page(pfn); 6628 6629 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); 6630 prep_compound_tail(head, pfn - head_pfn); 6631 set_page_count(page, 0); 6632 6633 /* 6634 * The first tail page stores compound_mapcount_ptr() and 6635 * compound_order() and the second tail page stores 6636 * compound_pincount_ptr(). Call prep_compound_head() after 6637 * the first and second tail pages have been initialized to 6638 * not have the data overwritten. 6639 */ 6640 if (pfn == head_pfn + 2) 6641 prep_compound_head(head, order); 6642 } 6643 } 6644 6645 void __ref memmap_init_zone_device(struct zone *zone, 6646 unsigned long start_pfn, 6647 unsigned long nr_pages, 6648 struct dev_pagemap *pgmap) 6649 { 6650 unsigned long pfn, end_pfn = start_pfn + nr_pages; 6651 struct pglist_data *pgdat = zone->zone_pgdat; 6652 struct vmem_altmap *altmap = pgmap_altmap(pgmap); 6653 unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap); 6654 unsigned long zone_idx = zone_idx(zone); 6655 unsigned long start = jiffies; 6656 int nid = pgdat->node_id; 6657 6658 if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) 6659 return; 6660 6661 /* 6662 * The call to memmap_init should have already taken care 6663 * of the pages reserved for the memmap, so we can just jump to 6664 * the end of that region and start processing the device pages. 6665 */ 6666 if (altmap) { 6667 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); 6668 nr_pages = end_pfn - start_pfn; 6669 } 6670 6671 for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) { 6672 struct page *page = pfn_to_page(pfn); 6673 6674 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); 6675 6676 if (pfns_per_compound == 1) 6677 continue; 6678 6679 memmap_init_compound(page, pfn, zone_idx, nid, pgmap, 6680 compound_nr_pages(altmap, pfns_per_compound)); 6681 } 6682 6683 pr_info("%s initialised %lu pages in %ums\n", __func__, 6684 nr_pages, jiffies_to_msecs(jiffies - start)); 6685 } 6686 6687 #endif 6688 static void __meminit zone_init_free_lists(struct zone *zone) 6689 { 6690 unsigned int order, t; 6691 for_each_migratetype_order(order, t) { 6692 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 6693 zone->free_area[order].nr_free = 0; 6694 } 6695 } 6696 6697 /* 6698 * Only struct pages that correspond to ranges defined by memblock.memory 6699 * are zeroed and initialized by going through __init_single_page() during 6700 * memmap_init_zone_range(). 6701 * 6702 * But, there could be struct pages that correspond to holes in 6703 * memblock.memory. 
This can happen because of the following reasons: 6704 * - physical memory bank size is not necessarily the exact multiple of the 6705 * arbitrary section size 6706 * - early reserved memory may not be listed in memblock.memory 6707 * - memory layouts defined with memmap= kernel parameter may not align 6708 * nicely with memmap sections 6709 * 6710 * Explicitly initialize those struct pages so that: 6711 * - PG_Reserved is set 6712 * - zone and node links point to zone and node that span the page if the 6713 * hole is in the middle of a zone 6714 * - zone and node links point to adjacent zone/node if the hole falls on 6715 * the zone boundary; the pages in such holes will be prepended to the 6716 * zone/node above the hole except for the trailing pages in the last 6717 * section that will be appended to the zone/node below. 6718 */ 6719 static void __init init_unavailable_range(unsigned long spfn, 6720 unsigned long epfn, 6721 int zone, int node) 6722 { 6723 unsigned long pfn; 6724 u64 pgcnt = 0; 6725 6726 for (pfn = spfn; pfn < epfn; pfn++) { 6727 if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { 6728 pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) 6729 + pageblock_nr_pages - 1; 6730 continue; 6731 } 6732 __init_single_page(pfn_to_page(pfn), pfn, zone, node); 6733 __SetPageReserved(pfn_to_page(pfn)); 6734 pgcnt++; 6735 } 6736 6737 if (pgcnt) 6738 pr_info("On node %d, zone %s: %lld pages in unavailable ranges", 6739 node, zone_names[zone], pgcnt); 6740 } 6741 6742 static void __init memmap_init_zone_range(struct zone *zone, 6743 unsigned long start_pfn, 6744 unsigned long end_pfn, 6745 unsigned long *hole_pfn) 6746 { 6747 unsigned long zone_start_pfn = zone->zone_start_pfn; 6748 unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; 6749 int nid = zone_to_nid(zone), zone_id = zone_idx(zone); 6750 6751 start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); 6752 end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); 6753 6754 if (start_pfn >= end_pfn) 6755 return; 6756 6757 memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, 6758 zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); 6759 6760 if (*hole_pfn < start_pfn) 6761 init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); 6762 6763 *hole_pfn = end_pfn; 6764 } 6765 6766 static void __init memmap_init(void) 6767 { 6768 unsigned long start_pfn, end_pfn; 6769 unsigned long hole_pfn = 0; 6770 int i, j, zone_id = 0, nid; 6771 6772 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 6773 struct pglist_data *node = NODE_DATA(nid); 6774 6775 for (j = 0; j < MAX_NR_ZONES; j++) { 6776 struct zone *zone = node->node_zones + j; 6777 6778 if (!populated_zone(zone)) 6779 continue; 6780 6781 memmap_init_zone_range(zone, start_pfn, end_pfn, 6782 &hole_pfn); 6783 zone_id = j; 6784 } 6785 } 6786 6787 #ifdef CONFIG_SPARSEMEM 6788 /* 6789 * Initialize the memory map for hole in the range [memory_end, 6790 * section_end]. 6791 * Append the pages in this hole to the highest zone in the last 6792 * node. 
6793 * The call to init_unavailable_range() is outside the ifdef to 6794 * silence the compiler warining about zone_id set but not used; 6795 * for FLATMEM it is a nop anyway 6796 */ 6797 end_pfn = round_up(end_pfn, PAGES_PER_SECTION); 6798 if (hole_pfn < end_pfn) 6799 #endif 6800 init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); 6801 } 6802 6803 void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, 6804 phys_addr_t min_addr, int nid, bool exact_nid) 6805 { 6806 void *ptr; 6807 6808 if (exact_nid) 6809 ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, 6810 MEMBLOCK_ALLOC_ACCESSIBLE, 6811 nid); 6812 else 6813 ptr = memblock_alloc_try_nid_raw(size, align, min_addr, 6814 MEMBLOCK_ALLOC_ACCESSIBLE, 6815 nid); 6816 6817 if (ptr && size > 0) 6818 page_init_poison(ptr, size); 6819 6820 return ptr; 6821 } 6822 6823 static int zone_batchsize(struct zone *zone) 6824 { 6825 #ifdef CONFIG_MMU 6826 int batch; 6827 6828 /* 6829 * The number of pages to batch allocate is either ~0.1% 6830 * of the zone or 1MB, whichever is smaller. The batch 6831 * size is striking a balance between allocation latency 6832 * and zone lock contention. 6833 */ 6834 batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); 6835 batch /= 4; /* We effectively *= 4 below */ 6836 if (batch < 1) 6837 batch = 1; 6838 6839 /* 6840 * Clamp the batch to a 2^n - 1 value. Having a power 6841 * of 2 value was found to be more likely to have 6842 * suboptimal cache aliasing properties in some cases. 6843 * 6844 * For example if 2 tasks are alternately allocating 6845 * batches of pages, one task can end up with a lot 6846 * of pages of one half of the possible page colors 6847 * and the other with pages of the other colors. 6848 */ 6849 batch = rounddown_pow_of_two(batch + batch/2) - 1; 6850 6851 return batch; 6852 6853 #else 6854 /* The deferral and batching of frees should be suppressed under NOMMU 6855 * conditions. 6856 * 6857 * The problem is that NOMMU needs to be able to allocate large chunks 6858 * of contiguous memory as there's no hardware page translation to 6859 * assemble apparent contiguous memory from discontiguous pages. 6860 * 6861 * Queueing large contiguous runs of pages for batching, however, 6862 * causes the pages to actually be freed in smaller chunks. As there 6863 * can be a significant delay between the individual batches being 6864 * recycled, this leads to the once large chunks of space being 6865 * fragmented and becoming unavailable for high-order allocations. 6866 */ 6867 return 0; 6868 #endif 6869 } 6870 6871 static int zone_highsize(struct zone *zone, int batch, int cpu_online) 6872 { 6873 #ifdef CONFIG_MMU 6874 int high; 6875 int nr_split_cpus; 6876 unsigned long total_pages; 6877 6878 if (!percpu_pagelist_high_fraction) { 6879 /* 6880 * By default, the high value of the pcp is based on the zone 6881 * low watermark so that if they are full then background 6882 * reclaim will not be started prematurely. 6883 */ 6884 total_pages = low_wmark_pages(zone); 6885 } else { 6886 /* 6887 * If percpu_pagelist_high_fraction is configured, the high 6888 * value is based on a fraction of the managed pages in the 6889 * zone. 6890 */ 6891 total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction; 6892 } 6893 6894 /* 6895 * Split the high value across all online CPUs local to the zone. Note 6896 * that early in boot that CPUs may not be online yet and that during 6897 * CPU hotplug that the cpumask is not yet updated when a CPU is being 6898 * onlined. 
For memory nodes that have no CPUs, split pcp->high across 6899 * all online CPUs to mitigate the risk that reclaim is triggered 6900 * prematurely due to pages stored on pcp lists. 6901 */ 6902 nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; 6903 if (!nr_split_cpus) 6904 nr_split_cpus = num_online_cpus(); 6905 high = total_pages / nr_split_cpus; 6906 6907 /* 6908 * Ensure high is at least batch*4. The multiple is based on the 6909 * historical relationship between high and batch. 6910 */ 6911 high = max(high, batch << 2); 6912 6913 return high; 6914 #else 6915 return 0; 6916 #endif 6917 } 6918 6919 /* 6920 * pcp->high and pcp->batch values are related and generally batch is lower 6921 * than high. They are also related to pcp->count such that count is lower 6922 * than high, and as soon as it reaches high, the pcplist is flushed. 6923 * 6924 * However, guaranteeing these relations at all times would require e.g. write 6925 * barriers here but also careful usage of read barriers at the read side, and 6926 * thus be prone to error and bad for performance. Thus the update only prevents 6927 * store tearing. Any new users of pcp->batch and pcp->high should ensure they 6928 * can cope with those fields changing asynchronously, and fully trust only the 6929 * pcp->count field on the local CPU with interrupts disabled. 6930 * 6931 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 6932 * outside of boot time (or some other assurance that no concurrent updaters 6933 * exist). 6934 */ 6935 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 6936 unsigned long batch) 6937 { 6938 WRITE_ONCE(pcp->batch, batch); 6939 WRITE_ONCE(pcp->high, high); 6940 } 6941 6942 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) 6943 { 6944 int pindex; 6945 6946 memset(pcp, 0, sizeof(*pcp)); 6947 memset(pzstats, 0, sizeof(*pzstats)); 6948 6949 for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) 6950 INIT_LIST_HEAD(&pcp->lists[pindex]); 6951 6952 /* 6953 * Set batch and high values safe for a boot pageset. A true percpu 6954 * pageset's initialization will update them subsequently. Here we don't 6955 * need to be as careful as pageset_update() as nobody can access the 6956 * pageset yet. 6957 */ 6958 pcp->high = BOOT_PAGESET_HIGH; 6959 pcp->batch = BOOT_PAGESET_BATCH; 6960 pcp->free_factor = 0; 6961 } 6962 6963 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, 6964 unsigned long batch) 6965 { 6966 struct per_cpu_pages *pcp; 6967 int cpu; 6968 6969 for_each_possible_cpu(cpu) { 6970 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 6971 pageset_update(pcp, high, batch); 6972 } 6973 } 6974 6975 /* 6976 * Calculate and set new high and batch values for all per-cpu pagesets of a 6977 * zone based on the zone's size. 
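 *
 * A rough worked example (illustrative numbers, not a guarantee): for a zone
 * with 1GiB of managed memory and 4K pages, zone_batchsize() starts from
 * min(262144 >> 10, 256) = 256, divides by 4 to get 64, and
 * rounddown_pow_of_two(64 + 32) - 1 then yields a batch of 63. With the
 * default percpu_pagelist_high_fraction of 0, a low watermark of (say) 4096
 * pages and four CPUs local to the zone, zone_highsize() splits that into a
 * per-cpu high of max(4096 / 4, 4 * 63) = 1024 pages.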
6978 */ 6979 static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) 6980 { 6981 int new_high, new_batch; 6982 6983 new_batch = max(1, zone_batchsize(zone)); 6984 new_high = zone_highsize(zone, new_batch, cpu_online); 6985 6986 if (zone->pageset_high == new_high && 6987 zone->pageset_batch == new_batch) 6988 return; 6989 6990 zone->pageset_high = new_high; 6991 zone->pageset_batch = new_batch; 6992 6993 __zone_set_pageset_high_and_batch(zone, new_high, new_batch); 6994 } 6995 6996 void __meminit setup_zone_pageset(struct zone *zone) 6997 { 6998 int cpu; 6999 7000 /* Size may be 0 on !SMP && !NUMA */ 7001 if (sizeof(struct per_cpu_zonestat) > 0) 7002 zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); 7003 7004 zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); 7005 for_each_possible_cpu(cpu) { 7006 struct per_cpu_pages *pcp; 7007 struct per_cpu_zonestat *pzstats; 7008 7009 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 7010 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); 7011 per_cpu_pages_init(pcp, pzstats); 7012 } 7013 7014 zone_set_pageset_high_and_batch(zone, 0); 7015 } 7016 7017 /* 7018 * Allocate per cpu pagesets and initialize them. 7019 * Before this call only boot pagesets were available. 7020 */ 7021 void __init setup_per_cpu_pageset(void) 7022 { 7023 struct pglist_data *pgdat; 7024 struct zone *zone; 7025 int __maybe_unused cpu; 7026 7027 for_each_populated_zone(zone) 7028 setup_zone_pageset(zone); 7029 7030 #ifdef CONFIG_NUMA 7031 /* 7032 * Unpopulated zones continue using the boot pagesets. 7033 * The numa stats for these pagesets need to be reset. 7034 * Otherwise, they will end up skewing the stats of 7035 * the nodes these zones are associated with. 7036 */ 7037 for_each_possible_cpu(cpu) { 7038 struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); 7039 memset(pzstats->vm_numa_event, 0, 7040 sizeof(pzstats->vm_numa_event)); 7041 } 7042 #endif 7043 7044 for_each_online_pgdat(pgdat) 7045 pgdat->per_cpu_nodestats = 7046 alloc_percpu(struct per_cpu_nodestat); 7047 } 7048 7049 static __meminit void zone_pcp_init(struct zone *zone) 7050 { 7051 /* 7052 * per cpu subsystem is not up at this point. The following code 7053 * relies on the ability of the linker to provide the 7054 * offset of a (static) per cpu variable into the per cpu area. 7055 */ 7056 zone->per_cpu_pageset = &boot_pageset; 7057 zone->per_cpu_zonestats = &boot_zonestats; 7058 zone->pageset_high = BOOT_PAGESET_HIGH; 7059 zone->pageset_batch = BOOT_PAGESET_BATCH; 7060 7061 if (populated_zone(zone)) 7062 pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name, 7063 zone->present_pages, zone_batchsize(zone)); 7064 } 7065 7066 void __meminit init_currently_empty_zone(struct zone *zone, 7067 unsigned long zone_start_pfn, 7068 unsigned long size) 7069 { 7070 struct pglist_data *pgdat = zone->zone_pgdat; 7071 int zone_idx = zone_idx(zone) + 1; 7072 7073 if (zone_idx > pgdat->nr_zones) 7074 pgdat->nr_zones = zone_idx; 7075 7076 zone->zone_start_pfn = zone_start_pfn; 7077 7078 mminit_dprintk(MMINIT_TRACE, "memmap_init", 7079 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 7080 pgdat->node_id, 7081 (unsigned long)zone_idx(zone), 7082 zone_start_pfn, (zone_start_pfn + size)); 7083 7084 zone_init_free_lists(zone); 7085 zone->initialized = 1; 7086 } 7087 7088 /** 7089 * get_pfn_range_for_nid - Return the start and end page frames for a node 7090 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 
7091 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 7092 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 7093 * 7094 * It returns the start and end page frame of a node based on information 7095 * provided by memblock_set_node(). If called for a node 7096 * with no available memory, a warning is printed and the start and end 7097 * PFNs will be 0. 7098 */ 7099 void __init get_pfn_range_for_nid(unsigned int nid, 7100 unsigned long *start_pfn, unsigned long *end_pfn) 7101 { 7102 unsigned long this_start_pfn, this_end_pfn; 7103 int i; 7104 7105 *start_pfn = -1UL; 7106 *end_pfn = 0; 7107 7108 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 7109 *start_pfn = min(*start_pfn, this_start_pfn); 7110 *end_pfn = max(*end_pfn, this_end_pfn); 7111 } 7112 7113 if (*start_pfn == -1UL) 7114 *start_pfn = 0; 7115 } 7116 7117 /* 7118 * This finds a zone that can be used for ZONE_MOVABLE pages. The 7119 * assumption is made that zones within a node are ordered in monotonic 7120 * increasing memory addresses so that the "highest" populated zone is used 7121 */ 7122 static void __init find_usable_zone_for_movable(void) 7123 { 7124 int zone_index; 7125 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 7126 if (zone_index == ZONE_MOVABLE) 7127 continue; 7128 7129 if (arch_zone_highest_possible_pfn[zone_index] > 7130 arch_zone_lowest_possible_pfn[zone_index]) 7131 break; 7132 } 7133 7134 VM_BUG_ON(zone_index == -1); 7135 movable_zone = zone_index; 7136 } 7137 7138 /* 7139 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 7140 * because it is sized independent of architecture. Unlike the other zones, 7141 * the starting point for ZONE_MOVABLE is not fixed. It may be different 7142 * in each node depending on the size of each node and how evenly kernelcore 7143 * is distributed. This helper function adjusts the zone ranges 7144 * provided by the architecture for a given node by using the end of the 7145 * highest usable zone for ZONE_MOVABLE. 
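 * (Example with made-up numbers: if a node's ZONE_NORMAL would span pfns
 * 0x100000-0x200000 but zone_movable_pfn[nid] is 0x180000, ZONE_NORMAL is
 * clipped to end at 0x180000 and ZONE_MOVABLE takes over from 0x180000 to
 * the end of the usable range on that node.)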
This preserves the assumption that 7146 * zones within a node are in order of monotonic increases memory addresses 7147 */ 7148 static void __init adjust_zone_range_for_zone_movable(int nid, 7149 unsigned long zone_type, 7150 unsigned long node_start_pfn, 7151 unsigned long node_end_pfn, 7152 unsigned long *zone_start_pfn, 7153 unsigned long *zone_end_pfn) 7154 { 7155 /* Only adjust if ZONE_MOVABLE is on this node */ 7156 if (zone_movable_pfn[nid]) { 7157 /* Size ZONE_MOVABLE */ 7158 if (zone_type == ZONE_MOVABLE) { 7159 *zone_start_pfn = zone_movable_pfn[nid]; 7160 *zone_end_pfn = min(node_end_pfn, 7161 arch_zone_highest_possible_pfn[movable_zone]); 7162 7163 /* Adjust for ZONE_MOVABLE starting within this range */ 7164 } else if (!mirrored_kernelcore && 7165 *zone_start_pfn < zone_movable_pfn[nid] && 7166 *zone_end_pfn > zone_movable_pfn[nid]) { 7167 *zone_end_pfn = zone_movable_pfn[nid]; 7168 7169 /* Check if this whole range is within ZONE_MOVABLE */ 7170 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 7171 *zone_start_pfn = *zone_end_pfn; 7172 } 7173 } 7174 7175 /* 7176 * Return the number of pages a zone spans in a node, including holes 7177 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 7178 */ 7179 static unsigned long __init zone_spanned_pages_in_node(int nid, 7180 unsigned long zone_type, 7181 unsigned long node_start_pfn, 7182 unsigned long node_end_pfn, 7183 unsigned long *zone_start_pfn, 7184 unsigned long *zone_end_pfn) 7185 { 7186 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 7187 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 7188 /* When hotadd a new node from cpu_up(), the node should be empty */ 7189 if (!node_start_pfn && !node_end_pfn) 7190 return 0; 7191 7192 /* Get the start and end of the zone */ 7193 *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 7194 *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 7195 adjust_zone_range_for_zone_movable(nid, zone_type, 7196 node_start_pfn, node_end_pfn, 7197 zone_start_pfn, zone_end_pfn); 7198 7199 /* Check that this node has pages within the zone's required range */ 7200 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) 7201 return 0; 7202 7203 /* Move the zone boundaries inside the node if necessary */ 7204 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); 7205 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); 7206 7207 /* Return the spanned pages */ 7208 return *zone_end_pfn - *zone_start_pfn; 7209 } 7210 7211 /* 7212 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 7213 * then all holes in the requested range will be accounted for. 7214 */ 7215 unsigned long __init __absent_pages_in_range(int nid, 7216 unsigned long range_start_pfn, 7217 unsigned long range_end_pfn) 7218 { 7219 unsigned long nr_absent = range_end_pfn - range_start_pfn; 7220 unsigned long start_pfn, end_pfn; 7221 int i; 7222 7223 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 7224 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 7225 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 7226 nr_absent -= end_pfn - start_pfn; 7227 } 7228 return nr_absent; 7229 } 7230 7231 /** 7232 * absent_pages_in_range - Return number of page frames in holes within a range 7233 * @start_pfn: The start PFN to start searching for holes 7234 * @end_pfn: The end PFN to stop searching for holes 7235 * 7236 * Return: the number of pages frames in memory holes within a range. 
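 *
 * For example (made-up pfns): if memblock.memory covers 0x1000-0x2000 and
 * 0x3000-0x5000, absent_pages_in_range(0x1000, 0x5000) reports the 0x1000
 * page frames of the 0x2000-0x3000 hole.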
7237 */ 7238 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 7239 unsigned long end_pfn) 7240 { 7241 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 7242 } 7243 7244 /* Return the number of page frames in holes in a zone on a node */ 7245 static unsigned long __init zone_absent_pages_in_node(int nid, 7246 unsigned long zone_type, 7247 unsigned long node_start_pfn, 7248 unsigned long node_end_pfn) 7249 { 7250 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 7251 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 7252 unsigned long zone_start_pfn, zone_end_pfn; 7253 unsigned long nr_absent; 7254 7255 /* When hotadd a new node from cpu_up(), the node should be empty */ 7256 if (!node_start_pfn && !node_end_pfn) 7257 return 0; 7258 7259 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 7260 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 7261 7262 adjust_zone_range_for_zone_movable(nid, zone_type, 7263 node_start_pfn, node_end_pfn, 7264 &zone_start_pfn, &zone_end_pfn); 7265 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 7266 7267 /* 7268 * ZONE_MOVABLE handling. 7269 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages 7270 * and vice versa. 7271 */ 7272 if (mirrored_kernelcore && zone_movable_pfn[nid]) { 7273 unsigned long start_pfn, end_pfn; 7274 struct memblock_region *r; 7275 7276 for_each_mem_region(r) { 7277 start_pfn = clamp(memblock_region_memory_base_pfn(r), 7278 zone_start_pfn, zone_end_pfn); 7279 end_pfn = clamp(memblock_region_memory_end_pfn(r), 7280 zone_start_pfn, zone_end_pfn); 7281 7282 if (zone_type == ZONE_MOVABLE && 7283 memblock_is_mirror(r)) 7284 nr_absent += end_pfn - start_pfn; 7285 7286 if (zone_type == ZONE_NORMAL && 7287 !memblock_is_mirror(r)) 7288 nr_absent += end_pfn - start_pfn; 7289 } 7290 } 7291 7292 return nr_absent; 7293 } 7294 7295 static void __init calculate_node_totalpages(struct pglist_data *pgdat, 7296 unsigned long node_start_pfn, 7297 unsigned long node_end_pfn) 7298 { 7299 unsigned long realtotalpages = 0, totalpages = 0; 7300 enum zone_type i; 7301 7302 for (i = 0; i < MAX_NR_ZONES; i++) { 7303 struct zone *zone = pgdat->node_zones + i; 7304 unsigned long zone_start_pfn, zone_end_pfn; 7305 unsigned long spanned, absent; 7306 unsigned long size, real_size; 7307 7308 spanned = zone_spanned_pages_in_node(pgdat->node_id, i, 7309 node_start_pfn, 7310 node_end_pfn, 7311 &zone_start_pfn, 7312 &zone_end_pfn); 7313 absent = zone_absent_pages_in_node(pgdat->node_id, i, 7314 node_start_pfn, 7315 node_end_pfn); 7316 7317 size = spanned; 7318 real_size = size - absent; 7319 7320 if (size) 7321 zone->zone_start_pfn = zone_start_pfn; 7322 else 7323 zone->zone_start_pfn = 0; 7324 zone->spanned_pages = size; 7325 zone->present_pages = real_size; 7326 #if defined(CONFIG_MEMORY_HOTPLUG) 7327 zone->present_early_pages = real_size; 7328 #endif 7329 7330 totalpages += size; 7331 realtotalpages += real_size; 7332 } 7333 7334 pgdat->node_spanned_pages = totalpages; 7335 pgdat->node_present_pages = realtotalpages; 7336 pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); 7337 } 7338 7339 #ifndef CONFIG_SPARSEMEM 7340 /* 7341 * Calculate the size of the zone->blockflags rounded to an unsigned long 7342 * Start by making sure zonesize is a multiple of pageblock_order by rounding 7343 * up. 
Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 7344 * round what is now in bits to nearest long in bits, then return it in 7345 * bytes. 7346 */ 7347 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 7348 { 7349 unsigned long usemapsize; 7350 7351 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 7352 usemapsize = roundup(zonesize, pageblock_nr_pages); 7353 usemapsize = usemapsize >> pageblock_order; 7354 usemapsize *= NR_PAGEBLOCK_BITS; 7355 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 7356 7357 return usemapsize / 8; 7358 } 7359 7360 static void __ref setup_usemap(struct zone *zone) 7361 { 7362 unsigned long usemapsize = usemap_size(zone->zone_start_pfn, 7363 zone->spanned_pages); 7364 zone->pageblock_flags = NULL; 7365 if (usemapsize) { 7366 zone->pageblock_flags = 7367 memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, 7368 zone_to_nid(zone)); 7369 if (!zone->pageblock_flags) 7370 panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", 7371 usemapsize, zone->name, zone_to_nid(zone)); 7372 } 7373 } 7374 #else 7375 static inline void setup_usemap(struct zone *zone) {} 7376 #endif /* CONFIG_SPARSEMEM */ 7377 7378 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 7379 7380 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 7381 void __init set_pageblock_order(void) 7382 { 7383 unsigned int order = MAX_ORDER - 1; 7384 7385 /* Check that pageblock_nr_pages has not already been setup */ 7386 if (pageblock_order) 7387 return; 7388 7389 /* Don't let pageblocks exceed the maximum allocation granularity. */ 7390 if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order) 7391 order = HUGETLB_PAGE_ORDER; 7392 7393 /* 7394 * Assume the largest contiguous order of interest is a huge page. 7395 * This value may be variable depending on boot parameters on IA64 and 7396 * powerpc. 7397 */ 7398 pageblock_order = order; 7399 } 7400 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 7401 7402 /* 7403 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 7404 * is unused as pageblock_order is set at compile-time. See 7405 * include/linux/pageblock-flags.h for the values of pageblock_order based on 7406 * the kernel config 7407 */ 7408 void __init set_pageblock_order(void) 7409 { 7410 } 7411 7412 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 7413 7414 static unsigned long __init calc_memmap_size(unsigned long spanned_pages, 7415 unsigned long present_pages) 7416 { 7417 unsigned long pages = spanned_pages; 7418 7419 /* 7420 * Provide a more accurate estimation if there are holes within 7421 * the zone and SPARSEMEM is in use. If there are holes within the 7422 * zone, each populated memory region may cost us one or two extra 7423 * memmap pages due to alignment because memmap pages for each 7424 * populated regions may not be naturally aligned on page boundary. 7425 * So the (present_pages >> 4) heuristic is a tradeoff for that. 
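 *
 * Worked example (illustrative numbers, assuming 4 KiB pages, a 64-byte
 * struct page and SPARSEMEM): with spanned_pages = 1048576 (4 GiB) and
 * present_pages = 917504 (3.5 GiB), 1048576 > 917504 + 57344, so the
 * estimate below is based on present_pages and works out to
 * PAGE_ALIGN(917504 * 64) >> PAGE_SHIFT == 14336 memmap pages.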
7426 */ 7427 if (spanned_pages > present_pages + (present_pages >> 4) && 7428 IS_ENABLED(CONFIG_SPARSEMEM)) 7429 pages = present_pages; 7430 7431 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 7432 } 7433 7434 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 7435 static void pgdat_init_split_queue(struct pglist_data *pgdat) 7436 { 7437 struct deferred_split *ds_queue = &pgdat->deferred_split_queue; 7438 7439 spin_lock_init(&ds_queue->split_queue_lock); 7440 INIT_LIST_HEAD(&ds_queue->split_queue); 7441 ds_queue->split_queue_len = 0; 7442 } 7443 #else 7444 static void pgdat_init_split_queue(struct pglist_data *pgdat) {} 7445 #endif 7446 7447 #ifdef CONFIG_COMPACTION 7448 static void pgdat_init_kcompactd(struct pglist_data *pgdat) 7449 { 7450 init_waitqueue_head(&pgdat->kcompactd_wait); 7451 } 7452 #else 7453 static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} 7454 #endif 7455 7456 static void __meminit pgdat_init_internals(struct pglist_data *pgdat) 7457 { 7458 int i; 7459 7460 pgdat_resize_init(pgdat); 7461 7462 pgdat_init_split_queue(pgdat); 7463 pgdat_init_kcompactd(pgdat); 7464 7465 init_waitqueue_head(&pgdat->kswapd_wait); 7466 init_waitqueue_head(&pgdat->pfmemalloc_wait); 7467 7468 for (i = 0; i < NR_VMSCAN_THROTTLE; i++) 7469 init_waitqueue_head(&pgdat->reclaim_wait[i]); 7470 7471 pgdat_page_ext_init(pgdat); 7472 lruvec_init(&pgdat->__lruvec); 7473 } 7474 7475 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, 7476 unsigned long remaining_pages) 7477 { 7478 atomic_long_set(&zone->managed_pages, remaining_pages); 7479 zone_set_nid(zone, nid); 7480 zone->name = zone_names[idx]; 7481 zone->zone_pgdat = NODE_DATA(nid); 7482 spin_lock_init(&zone->lock); 7483 zone_seqlock_init(zone); 7484 zone_pcp_init(zone); 7485 } 7486 7487 /* 7488 * Set up the zone data structures 7489 * - init pgdat internals 7490 * - init all zones belonging to this node 7491 * 7492 * NOTE: this function is only called during memory hotplug 7493 */ 7494 #ifdef CONFIG_MEMORY_HOTPLUG 7495 void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) 7496 { 7497 int nid = pgdat->node_id; 7498 enum zone_type z; 7499 int cpu; 7500 7501 pgdat_init_internals(pgdat); 7502 7503 if (pgdat->per_cpu_nodestats == &boot_nodestats) 7504 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); 7505 7506 /* 7507 * Reset the nr_zones, order and highest_zoneidx before reuse. 7508 * Note that kswapd will init kswapd_highest_zoneidx properly 7509 * when it starts in the near future. 7510 */ 7511 pgdat->nr_zones = 0; 7512 pgdat->kswapd_order = 0; 7513 pgdat->kswapd_highest_zoneidx = 0; 7514 pgdat->node_start_pfn = 0; 7515 for_each_online_cpu(cpu) { 7516 struct per_cpu_nodestat *p; 7517 7518 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); 7519 memset(p, 0, sizeof(*p)); 7520 } 7521 7522 for (z = 0; z < MAX_NR_ZONES; z++) 7523 zone_init_internals(&pgdat->node_zones[z], z, nid, 0); 7524 } 7525 #endif 7526 7527 /* 7528 * Set up the zone data structures: 7529 * - mark all pages reserved 7530 * - mark all memory queues empty 7531 * - clear the memory bitmaps 7532 * 7533 * NOTE: pgdat should get zeroed by caller. 7534 * NOTE: this function is only called during early init. 
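 *
 * Illustrative note (hypothetical value): if the architecture called
 * set_dma_reserve(1024) earlier, the loop below subtracts those 1024
 * pages from the first zone's freesize, in addition to the pages
 * consumed by the zone's memmap, before zone_init_internals() records
 * what is left as the zone's managed_pages.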
7535 */ 7536 static void __init free_area_init_core(struct pglist_data *pgdat) 7537 { 7538 enum zone_type j; 7539 int nid = pgdat->node_id; 7540 7541 pgdat_init_internals(pgdat); 7542 pgdat->per_cpu_nodestats = &boot_nodestats; 7543 7544 for (j = 0; j < MAX_NR_ZONES; j++) { 7545 struct zone *zone = pgdat->node_zones + j; 7546 unsigned long size, freesize, memmap_pages; 7547 7548 size = zone->spanned_pages; 7549 freesize = zone->present_pages; 7550 7551 /* 7552 * Adjust freesize so that it accounts for how much memory 7553 * is used by this zone for memmap. This affects the watermark 7554 * and per-cpu initialisations 7555 */ 7556 memmap_pages = calc_memmap_size(size, freesize); 7557 if (!is_highmem_idx(j)) { 7558 if (freesize >= memmap_pages) { 7559 freesize -= memmap_pages; 7560 if (memmap_pages) 7561 pr_debug(" %s zone: %lu pages used for memmap\n", 7562 zone_names[j], memmap_pages); 7563 } else 7564 pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n", 7565 zone_names[j], memmap_pages, freesize); 7566 } 7567 7568 /* Account for reserved pages */ 7569 if (j == 0 && freesize > dma_reserve) { 7570 freesize -= dma_reserve; 7571 pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); 7572 } 7573 7574 if (!is_highmem_idx(j)) 7575 nr_kernel_pages += freesize; 7576 /* Charge for highmem memmap if there are enough kernel pages */ 7577 else if (nr_kernel_pages > memmap_pages * 2) 7578 nr_kernel_pages -= memmap_pages; 7579 nr_all_pages += freesize; 7580 7581 /* 7582 * Set an approximate value for lowmem here, it will be adjusted 7583 * when the bootmem allocator frees pages into the buddy system. 7584 * And all highmem pages will be managed by the buddy system. 7585 */ 7586 zone_init_internals(zone, j, nid, freesize); 7587 7588 if (!size) 7589 continue; 7590 7591 set_pageblock_order(); 7592 setup_usemap(zone); 7593 init_currently_empty_zone(zone, zone->zone_start_pfn, size); 7594 } 7595 } 7596 7597 #ifdef CONFIG_FLATMEM 7598 static void __init alloc_node_mem_map(struct pglist_data *pgdat) 7599 { 7600 unsigned long __maybe_unused start = 0; 7601 unsigned long __maybe_unused offset = 0; 7602 7603 /* Skip empty nodes */ 7604 if (!pgdat->node_spanned_pages) 7605 return; 7606 7607 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 7608 offset = pgdat->node_start_pfn - start; 7609 /* ia64 gets its own node_mem_map, before this, without bootmem */ 7610 if (!pgdat->node_mem_map) { 7611 unsigned long size, end; 7612 struct page *map; 7613 7614 /* 7615 * The zone's endpoints aren't required to be MAX_ORDER 7616 * aligned but the node_mem_map endpoints must be in order 7617 * for the buddy allocator to function correctly. 
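 *
 * Worked example (hypothetical values, assuming MAX_ORDER_NR_PAGES ==
 * 0x400 and a 64-byte struct page): node_start_pfn = 0x1234 gives
 * start = 0x1000 and offset = 0x234; a pgdat_end_pfn() of 0x5678 is
 * rounded up to 0x5800, so 0x4800 struct pages (0x120000 bytes) are
 * allocated and node_mem_map points at map + 0x234.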
7618 */ 7619 end = pgdat_end_pfn(pgdat); 7620 end = ALIGN(end, MAX_ORDER_NR_PAGES); 7621 size = (end - start) * sizeof(struct page); 7622 map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, 7623 pgdat->node_id, false); 7624 if (!map) 7625 panic("Failed to allocate %ld bytes for node %d memory map\n", 7626 size, pgdat->node_id); 7627 pgdat->node_mem_map = map + offset; 7628 } 7629 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", 7630 __func__, pgdat->node_id, (unsigned long)pgdat, 7631 (unsigned long)pgdat->node_mem_map); 7632 #ifndef CONFIG_NUMA 7633 /* 7634 * With no DISCONTIG, the global mem_map is just set as node 0's 7635 */ 7636 if (pgdat == NODE_DATA(0)) { 7637 mem_map = NODE_DATA(0)->node_mem_map; 7638 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 7639 mem_map -= offset; 7640 } 7641 #endif 7642 } 7643 #else 7644 static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } 7645 #endif /* CONFIG_FLATMEM */ 7646 7647 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 7648 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) 7649 { 7650 pgdat->first_deferred_pfn = ULONG_MAX; 7651 } 7652 #else 7653 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} 7654 #endif 7655 7656 static void __init free_area_init_node(int nid) 7657 { 7658 pg_data_t *pgdat = NODE_DATA(nid); 7659 unsigned long start_pfn = 0; 7660 unsigned long end_pfn = 0; 7661 7662 /* pg_data_t should be reset to zero when it's allocated */ 7663 WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); 7664 7665 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 7666 7667 pgdat->node_id = nid; 7668 pgdat->node_start_pfn = start_pfn; 7669 pgdat->per_cpu_nodestats = NULL; 7670 7671 if (start_pfn != end_pfn) { 7672 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 7673 (u64)start_pfn << PAGE_SHIFT, 7674 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 7675 } else { 7676 pr_info("Initmem setup node %d as memoryless\n", nid); 7677 } 7678 7679 calculate_node_totalpages(pgdat, start_pfn, end_pfn); 7680 7681 alloc_node_mem_map(pgdat); 7682 pgdat_set_deferred_range(pgdat); 7683 7684 free_area_init_core(pgdat); 7685 } 7686 7687 static void __init free_area_init_memoryless_node(int nid) 7688 { 7689 free_area_init_node(nid); 7690 } 7691 7692 #if MAX_NUMNODES > 1 7693 /* 7694 * Figure out the number of possible node ids. 7695 */ 7696 void __init setup_nr_node_ids(void) 7697 { 7698 unsigned int highest; 7699 7700 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); 7701 nr_node_ids = highest + 1; 7702 } 7703 #endif 7704 7705 /** 7706 * node_map_pfn_alignment - determine the maximum internode alignment 7707 * 7708 * This function should be called after node map is populated and sorted. 7709 * It calculates the maximum power of two alignment which can distinguish 7710 * all the nodes. 7711 * 7712 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 7713 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 7714 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 7715 * shifted, 1GiB is enough and this function will indicate so. 7716 * 7717 * This is used to test whether pfn -> nid mapping of the chosen memory 7718 * model has fine enough granularity to avoid incorrect mapping for the 7719 * populated node map. 7720 * 7721 * Return: the determined alignment in pfn's. 0 if there is no alignment 7722 * requirement (single node). 
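 *
 * Illustrative reading of the return value (assuming 4 KiB pages): a
 * result of 0x10000 pfns corresponds to a 256MiB internode alignment,
 * i.e. 1 << (28 - PAGE_SHIFT), matching the shifted-by-256MiB case
 * described above.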
7723 */ 7724 unsigned long __init node_map_pfn_alignment(void) 7725 { 7726 unsigned long accl_mask = 0, last_end = 0; 7727 unsigned long start, end, mask; 7728 int last_nid = NUMA_NO_NODE; 7729 int i, nid; 7730 7731 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 7732 if (!start || last_nid < 0 || last_nid == nid) { 7733 last_nid = nid; 7734 last_end = end; 7735 continue; 7736 } 7737 7738 /* 7739 * Start with a mask granular enough to pin-point to the 7740 * start pfn and tick off bits one-by-one until it becomes 7741 * too coarse to separate the current node from the last. 7742 */ 7743 mask = ~((1 << __ffs(start)) - 1); 7744 while (mask && last_end <= (start & (mask << 1))) 7745 mask <<= 1; 7746 7747 /* accumulate all internode masks */ 7748 accl_mask |= mask; 7749 } 7750 7751 /* convert mask to number of pages */ 7752 return ~accl_mask + 1; 7753 } 7754 7755 /** 7756 * find_min_pfn_with_active_regions - Find the minimum PFN registered 7757 * 7758 * Return: the minimum PFN based on information provided via 7759 * memblock_set_node(). 7760 */ 7761 unsigned long __init find_min_pfn_with_active_regions(void) 7762 { 7763 return PHYS_PFN(memblock_start_of_DRAM()); 7764 } 7765 7766 /* 7767 * early_calculate_totalpages() 7768 * Sum pages in active regions for movable zone. 7769 * Populate N_MEMORY for calculating usable_nodes. 7770 */ 7771 static unsigned long __init early_calculate_totalpages(void) 7772 { 7773 unsigned long totalpages = 0; 7774 unsigned long start_pfn, end_pfn; 7775 int i, nid; 7776 7777 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 7778 unsigned long pages = end_pfn - start_pfn; 7779 7780 totalpages += pages; 7781 if (pages) 7782 node_set_state(nid, N_MEMORY); 7783 } 7784 return totalpages; 7785 } 7786 7787 /* 7788 * Find the PFN the Movable zone begins in each node. Kernel memory 7789 * is spread evenly between nodes as long as the nodes have enough 7790 * memory. When they don't, some nodes will have more kernelcore than 7791 * others 7792 */ 7793 static void __init find_zone_movable_pfns_for_nodes(void) 7794 { 7795 int i, nid; 7796 unsigned long usable_startpfn; 7797 unsigned long kernelcore_node, kernelcore_remaining; 7798 /* save the state before borrow the nodemask */ 7799 nodemask_t saved_node_state = node_states[N_MEMORY]; 7800 unsigned long totalpages = early_calculate_totalpages(); 7801 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 7802 struct memblock_region *r; 7803 7804 /* Need to find movable_zone earlier when movable_node is specified. */ 7805 find_usable_zone_for_movable(); 7806 7807 /* 7808 * If movable_node is specified, ignore kernelcore and movablecore 7809 * options. 7810 */ 7811 if (movable_node_is_enabled()) { 7812 for_each_mem_region(r) { 7813 if (!memblock_is_hotpluggable(r)) 7814 continue; 7815 7816 nid = memblock_get_region_node(r); 7817 7818 usable_startpfn = PFN_DOWN(r->base); 7819 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 
7820 min(usable_startpfn, zone_movable_pfn[nid]) : 7821 usable_startpfn; 7822 } 7823 7824 goto out2; 7825 } 7826 7827 /* 7828 * If kernelcore=mirror is specified, ignore movablecore option 7829 */ 7830 if (mirrored_kernelcore) { 7831 bool mem_below_4gb_not_mirrored = false; 7832 7833 for_each_mem_region(r) { 7834 if (memblock_is_mirror(r)) 7835 continue; 7836 7837 nid = memblock_get_region_node(r); 7838 7839 usable_startpfn = memblock_region_memory_base_pfn(r); 7840 7841 if (usable_startpfn < PHYS_PFN(SZ_4G)) { 7842 mem_below_4gb_not_mirrored = true; 7843 continue; 7844 } 7845 7846 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 7847 min(usable_startpfn, zone_movable_pfn[nid]) : 7848 usable_startpfn; 7849 } 7850 7851 if (mem_below_4gb_not_mirrored) 7852 pr_warn("This configuration results in unmirrored kernel memory.\n"); 7853 7854 goto out2; 7855 } 7856 7857 /* 7858 * If kernelcore=nn% or movablecore=nn% was specified, calculate the 7859 * amount of necessary memory. 7860 */ 7861 if (required_kernelcore_percent) 7862 required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / 7863 10000UL; 7864 if (required_movablecore_percent) 7865 required_movablecore = (totalpages * 100 * required_movablecore_percent) / 7866 10000UL; 7867 7868 /* 7869 * If movablecore= was specified, calculate what size of 7870 * kernelcore that corresponds so that memory usable for 7871 * any allocation type is evenly spread. If both kernelcore 7872 * and movablecore are specified, then the value of kernelcore 7873 * will be used for required_kernelcore if it's greater than 7874 * what movablecore would have allowed. 7875 */ 7876 if (required_movablecore) { 7877 unsigned long corepages; 7878 7879 /* 7880 * Round-up so that ZONE_MOVABLE is at least as large as what 7881 * was requested by the user 7882 */ 7883 required_movablecore = 7884 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 7885 required_movablecore = min(totalpages, required_movablecore); 7886 corepages = totalpages - required_movablecore; 7887 7888 required_kernelcore = max(required_kernelcore, corepages); 7889 } 7890 7891 /* 7892 * If kernelcore was not specified or kernelcore size is larger 7893 * than totalpages, there is no ZONE_MOVABLE. 7894 */ 7895 if (!required_kernelcore || required_kernelcore >= totalpages) 7896 goto out; 7897 7898 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 7899 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 7900 7901 restart: 7902 /* Spread kernelcore memory as evenly as possible throughout nodes */ 7903 kernelcore_node = required_kernelcore / usable_nodes; 7904 for_each_node_state(nid, N_MEMORY) { 7905 unsigned long start_pfn, end_pfn; 7906 7907 /* 7908 * Recalculate kernelcore_node if the division per node 7909 * now exceeds what is necessary to satisfy the requested 7910 * amount of memory for the kernel 7911 */ 7912 if (required_kernelcore < kernelcore_node) 7913 kernelcore_node = required_kernelcore / usable_nodes; 7914 7915 /* 7916 * As the map is walked, we track how much memory is usable 7917 * by the kernel using kernelcore_remaining. 
When it is 7918 * 0, the rest of the node is usable by ZONE_MOVABLE 7919 */ 7920 kernelcore_remaining = kernelcore_node; 7921 7922 /* Go through each range of PFNs within this node */ 7923 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 7924 unsigned long size_pages; 7925 7926 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 7927 if (start_pfn >= end_pfn) 7928 continue; 7929 7930 /* Account for what is only usable for kernelcore */ 7931 if (start_pfn < usable_startpfn) { 7932 unsigned long kernel_pages; 7933 kernel_pages = min(end_pfn, usable_startpfn) 7934 - start_pfn; 7935 7936 kernelcore_remaining -= min(kernel_pages, 7937 kernelcore_remaining); 7938 required_kernelcore -= min(kernel_pages, 7939 required_kernelcore); 7940 7941 /* Continue if range is now fully accounted */ 7942 if (end_pfn <= usable_startpfn) { 7943 7944 /* 7945 * Push zone_movable_pfn to the end so 7946 * that if we have to rebalance 7947 * kernelcore across nodes, we will 7948 * not double account here 7949 */ 7950 zone_movable_pfn[nid] = end_pfn; 7951 continue; 7952 } 7953 start_pfn = usable_startpfn; 7954 } 7955 7956 /* 7957 * The usable PFN range for ZONE_MOVABLE is from 7958 * start_pfn->end_pfn. Calculate size_pages as the 7959 * number of pages used as kernelcore 7960 */ 7961 size_pages = end_pfn - start_pfn; 7962 if (size_pages > kernelcore_remaining) 7963 size_pages = kernelcore_remaining; 7964 zone_movable_pfn[nid] = start_pfn + size_pages; 7965 7966 /* 7967 * Some kernelcore has been met, update counts and 7968 * break if the kernelcore for this node has been 7969 * satisfied 7970 */ 7971 required_kernelcore -= min(required_kernelcore, 7972 size_pages); 7973 kernelcore_remaining -= size_pages; 7974 if (!kernelcore_remaining) 7975 break; 7976 } 7977 } 7978 7979 /* 7980 * If there is still required_kernelcore, we do another pass with one 7981 * less node in the count. This will push zone_movable_pfn[nid] further 7982 * along on the nodes that still have memory until kernelcore is 7983 * satisfied 7984 */ 7985 usable_nodes--; 7986 if (usable_nodes && required_kernelcore > usable_nodes) 7987 goto restart; 7988 7989 out2: 7990 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 7991 for (nid = 0; nid < MAX_NUMNODES; nid++) { 7992 unsigned long start_pfn, end_pfn; 7993 7994 zone_movable_pfn[nid] = 7995 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 7996 7997 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 7998 if (zone_movable_pfn[nid] >= end_pfn) 7999 zone_movable_pfn[nid] = 0; 8000 } 8001 8002 out: 8003 /* restore the node_state */ 8004 node_states[N_MEMORY] = saved_node_state; 8005 } 8006 8007 /* Any regular or high memory on that node ? */ 8008 static void check_for_memory(pg_data_t *pgdat, int nid) 8009 { 8010 enum zone_type zone_type; 8011 8012 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 8013 struct zone *zone = &pgdat->node_zones[zone_type]; 8014 if (populated_zone(zone)) { 8015 if (IS_ENABLED(CONFIG_HIGHMEM)) 8016 node_set_state(nid, N_HIGH_MEMORY); 8017 if (zone_type <= ZONE_NORMAL) 8018 node_set_state(nid, N_NORMAL_MEMORY); 8019 break; 8020 } 8021 } 8022 } 8023 8024 /* 8025 * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. 
For 8026 * such cases we allow max_zone_pfn sorted in the descending order 8027 */ 8028 bool __weak arch_has_descending_max_zone_pfns(void) 8029 { 8030 return false; 8031 } 8032 8033 /** 8034 * free_area_init - Initialise all pg_data_t and zone data 8035 * @max_zone_pfn: an array of max PFNs for each zone 8036 * 8037 * This will call free_area_init_node() for each active node in the system. 8038 * Using the page ranges provided by memblock_set_node(), the size of each 8039 * zone in each node and their holes is calculated. If the maximum PFN 8040 * between two adjacent zones match, it is assumed that the zone is empty. 8041 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 8042 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 8043 * starts where the previous one ended. For example, ZONE_DMA32 starts 8044 * at arch_max_dma_pfn. 8045 */ 8046 void __init free_area_init(unsigned long *max_zone_pfn) 8047 { 8048 unsigned long start_pfn, end_pfn; 8049 int i, nid, zone; 8050 bool descending; 8051 8052 /* Record where the zone boundaries are */ 8053 memset(arch_zone_lowest_possible_pfn, 0, 8054 sizeof(arch_zone_lowest_possible_pfn)); 8055 memset(arch_zone_highest_possible_pfn, 0, 8056 sizeof(arch_zone_highest_possible_pfn)); 8057 8058 start_pfn = find_min_pfn_with_active_regions(); 8059 descending = arch_has_descending_max_zone_pfns(); 8060 8061 for (i = 0; i < MAX_NR_ZONES; i++) { 8062 if (descending) 8063 zone = MAX_NR_ZONES - i - 1; 8064 else 8065 zone = i; 8066 8067 if (zone == ZONE_MOVABLE) 8068 continue; 8069 8070 end_pfn = max(max_zone_pfn[zone], start_pfn); 8071 arch_zone_lowest_possible_pfn[zone] = start_pfn; 8072 arch_zone_highest_possible_pfn[zone] = end_pfn; 8073 8074 start_pfn = end_pfn; 8075 } 8076 8077 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 8078 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 8079 find_zone_movable_pfns_for_nodes(); 8080 8081 /* Print out the zone ranges */ 8082 pr_info("Zone ranges:\n"); 8083 for (i = 0; i < MAX_NR_ZONES; i++) { 8084 if (i == ZONE_MOVABLE) 8085 continue; 8086 pr_info(" %-8s ", zone_names[i]); 8087 if (arch_zone_lowest_possible_pfn[i] == 8088 arch_zone_highest_possible_pfn[i]) 8089 pr_cont("empty\n"); 8090 else 8091 pr_cont("[mem %#018Lx-%#018Lx]\n", 8092 (u64)arch_zone_lowest_possible_pfn[i] 8093 << PAGE_SHIFT, 8094 ((u64)arch_zone_highest_possible_pfn[i] 8095 << PAGE_SHIFT) - 1); 8096 } 8097 8098 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 8099 pr_info("Movable zone start for each node\n"); 8100 for (i = 0; i < MAX_NUMNODES; i++) { 8101 if (zone_movable_pfn[i]) 8102 pr_info(" Node %d: %#018Lx\n", i, 8103 (u64)zone_movable_pfn[i] << PAGE_SHIFT); 8104 } 8105 8106 /* 8107 * Print out the early node map, and initialize the 8108 * subsection-map relative to active online memory ranges to 8109 * enable future "sub-section" extensions of the memory map. 
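 *
 * Each range is printed in the format used by the pr_info() below, for
 * example (hypothetical addresses):
 *
 *	node   0: [mem 0x0000000080000000-0x00000000ffffffff]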
8110 */ 8111 pr_info("Early memory node ranges\n"); 8112 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 8113 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, 8114 (u64)start_pfn << PAGE_SHIFT, 8115 ((u64)end_pfn << PAGE_SHIFT) - 1); 8116 subsection_map_init(start_pfn, end_pfn - start_pfn); 8117 } 8118 8119 /* Initialise every node */ 8120 mminit_verify_pageflags_layout(); 8121 setup_nr_node_ids(); 8122 for_each_node(nid) { 8123 pg_data_t *pgdat; 8124 8125 if (!node_online(nid)) { 8126 pr_info("Initializing node %d as memoryless\n", nid); 8127 8128 /* Allocator not initialized yet */ 8129 pgdat = arch_alloc_nodedata(nid); 8130 if (!pgdat) { 8131 pr_err("Cannot allocate %zuB for node %d.\n", 8132 sizeof(*pgdat), nid); 8133 continue; 8134 } 8135 arch_refresh_nodedata(nid, pgdat); 8136 free_area_init_memoryless_node(nid); 8137 8138 /* 8139 * We do not want to confuse userspace by sysfs 8140 * files/directories for node without any memory 8141 * attached to it, so this node is not marked as 8142 * N_MEMORY and not marked online so that no sysfs 8143 * hierarchy will be created via register_one_node for 8144 * it. The pgdat will get fully initialized by 8145 * hotadd_init_pgdat() when memory is hotplugged into 8146 * this node. 8147 */ 8148 continue; 8149 } 8150 8151 pgdat = NODE_DATA(nid); 8152 free_area_init_node(nid); 8153 8154 /* Any memory on that node */ 8155 if (pgdat->node_present_pages) 8156 node_set_state(nid, N_MEMORY); 8157 check_for_memory(pgdat, nid); 8158 } 8159 8160 memmap_init(); 8161 } 8162 8163 static int __init cmdline_parse_core(char *p, unsigned long *core, 8164 unsigned long *percent) 8165 { 8166 unsigned long long coremem; 8167 char *endptr; 8168 8169 if (!p) 8170 return -EINVAL; 8171 8172 /* Value may be a percentage of total memory, otherwise bytes */ 8173 coremem = simple_strtoull(p, &endptr, 0); 8174 if (*endptr == '%') { 8175 /* Paranoid check for percent values greater than 100 */ 8176 WARN_ON(coremem > 100); 8177 8178 *percent = coremem; 8179 } else { 8180 coremem = memparse(p, &p); 8181 /* Paranoid check that UL is enough for the coremem value */ 8182 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 8183 8184 *core = coremem >> PAGE_SHIFT; 8185 *percent = 0UL; 8186 } 8187 return 0; 8188 } 8189 8190 /* 8191 * kernelcore=size sets the amount of memory for use for allocations that 8192 * cannot be reclaimed or migrated. 8193 */ 8194 static int __init cmdline_parse_kernelcore(char *p) 8195 { 8196 /* parse kernelcore=mirror */ 8197 if (parse_option_str(p, "mirror")) { 8198 mirrored_kernelcore = true; 8199 return 0; 8200 } 8201 8202 return cmdline_parse_core(p, &required_kernelcore, 8203 &required_kernelcore_percent); 8204 } 8205 8206 /* 8207 * movablecore=size sets the amount of memory for use for allocations that 8208 * can be reclaimed or migrated. 
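 *
 * For example (illustrative command lines): "movablecore=512M" asks for
 * roughly 512 MiB of ZONE_MOVABLE, while "movablecore=5%" sizes it as a
 * percentage of total memory; both forms are handled by
 * cmdline_parse_core() above.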
8209 */ 8210 static int __init cmdline_parse_movablecore(char *p) 8211 { 8212 return cmdline_parse_core(p, &required_movablecore, 8213 &required_movablecore_percent); 8214 } 8215 8216 early_param("kernelcore", cmdline_parse_kernelcore); 8217 early_param("movablecore", cmdline_parse_movablecore); 8218 8219 void adjust_managed_page_count(struct page *page, long count) 8220 { 8221 atomic_long_add(count, &page_zone(page)->managed_pages); 8222 totalram_pages_add(count); 8223 #ifdef CONFIG_HIGHMEM 8224 if (PageHighMem(page)) 8225 totalhigh_pages_add(count); 8226 #endif 8227 } 8228 EXPORT_SYMBOL(adjust_managed_page_count); 8229 8230 unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) 8231 { 8232 void *pos; 8233 unsigned long pages = 0; 8234 8235 start = (void *)PAGE_ALIGN((unsigned long)start); 8236 end = (void *)((unsigned long)end & PAGE_MASK); 8237 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 8238 struct page *page = virt_to_page(pos); 8239 void *direct_map_addr; 8240 8241 /* 8242 * 'direct_map_addr' might be different from 'pos' 8243 * because some architectures' virt_to_page() 8244 * work with aliases. Getting the direct map 8245 * address ensures that we get a _writeable_ 8246 * alias for the memset(). 8247 */ 8248 direct_map_addr = page_address(page); 8249 /* 8250 * Perform a kasan-unchecked memset() since this memory 8251 * has not been initialized. 8252 */ 8253 direct_map_addr = kasan_reset_tag(direct_map_addr); 8254 if ((unsigned int)poison <= 0xFF) 8255 memset(direct_map_addr, poison, PAGE_SIZE); 8256 8257 free_reserved_page(page); 8258 } 8259 8260 if (pages && s) 8261 pr_info("Freeing %s memory: %ldK\n", s, K(pages)); 8262 8263 return pages; 8264 } 8265 8266 void __init mem_init_print_info(void) 8267 { 8268 unsigned long physpages, codesize, datasize, rosize, bss_size; 8269 unsigned long init_code_size, init_data_size; 8270 8271 physpages = get_num_physpages(); 8272 codesize = _etext - _stext; 8273 datasize = _edata - _sdata; 8274 rosize = __end_rodata - __start_rodata; 8275 bss_size = __bss_stop - __bss_start; 8276 init_data_size = __init_end - __init_begin; 8277 init_code_size = _einittext - _sinittext; 8278 8279 /* 8280 * Detect special cases and adjust section sizes accordingly: 8281 * 1) .init.* may be embedded into .data sections 8282 * 2) .init.text.* may be out of [__init_begin, __init_end], 8283 * please refer to arch/tile/kernel/vmlinux.lds.S. 8284 * 3) .rodata.* may be embedded into .text or .data sections. 
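 *
 * For instance, case 1 above means that if __init_begin falls inside
 * [_sdata, _edata], the adj_init_size() helper below subtracts
 * init_data_size from datasize so those pages are not counted twice in
 * the "Memory:" summary line.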
8285 */ 8286 #define adj_init_size(start, end, size, pos, adj) \ 8287 do { \ 8288 if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ 8289 size -= adj; \ 8290 } while (0) 8291 8292 adj_init_size(__init_begin, __init_end, init_data_size, 8293 _sinittext, init_code_size); 8294 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 8295 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 8296 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 8297 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 8298 8299 #undef adj_init_size 8300 8301 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" 8302 #ifdef CONFIG_HIGHMEM 8303 ", %luK highmem" 8304 #endif 8305 ")\n", 8306 K(nr_free_pages()), K(physpages), 8307 codesize >> 10, datasize >> 10, rosize >> 10, 8308 (init_data_size + init_code_size) >> 10, bss_size >> 10, 8309 K(physpages - totalram_pages() - totalcma_pages), 8310 K(totalcma_pages) 8311 #ifdef CONFIG_HIGHMEM 8312 , K(totalhigh_pages()) 8313 #endif 8314 ); 8315 } 8316 8317 /** 8318 * set_dma_reserve - set the specified number of pages reserved in the first zone 8319 * @new_dma_reserve: The number of pages to mark reserved 8320 * 8321 * The per-cpu batchsize and zone watermarks are determined by managed_pages. 8322 * In the DMA zone, a significant percentage may be consumed by kernel image 8323 * and other unfreeable allocations which can skew the watermarks badly. This 8324 * function may optionally be used to account for unfreeable pages in the 8325 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 8326 * smaller per-cpu batchsize. 8327 */ 8328 void __init set_dma_reserve(unsigned long new_dma_reserve) 8329 { 8330 dma_reserve = new_dma_reserve; 8331 } 8332 8333 static int page_alloc_cpu_dead(unsigned int cpu) 8334 { 8335 struct zone *zone; 8336 8337 lru_add_drain_cpu(cpu); 8338 mlock_page_drain_remote(cpu); 8339 drain_pages(cpu); 8340 8341 /* 8342 * Spill the event counters of the dead processor 8343 * into the current processors event counters. 8344 * This artificially elevates the count of the current 8345 * processor. 8346 */ 8347 vm_events_fold_cpu(cpu); 8348 8349 /* 8350 * Zero the differential counters of the dead processor 8351 * so that the vm statistics are consistent. 8352 * 8353 * This is only okay since the processor is dead and cannot 8354 * race with what we are doing. 
8355 */ 8356 cpu_vm_stats_fold(cpu); 8357 8358 for_each_populated_zone(zone) 8359 zone_pcp_update(zone, 0); 8360 8361 return 0; 8362 } 8363 8364 static int page_alloc_cpu_online(unsigned int cpu) 8365 { 8366 struct zone *zone; 8367 8368 for_each_populated_zone(zone) 8369 zone_pcp_update(zone, 1); 8370 return 0; 8371 } 8372 8373 #ifdef CONFIG_NUMA 8374 int hashdist = HASHDIST_DEFAULT; 8375 8376 static int __init set_hashdist(char *str) 8377 { 8378 if (!str) 8379 return 0; 8380 hashdist = simple_strtoul(str, &str, 0); 8381 return 1; 8382 } 8383 __setup("hashdist=", set_hashdist); 8384 #endif 8385 8386 void __init page_alloc_init(void) 8387 { 8388 int ret; 8389 8390 #ifdef CONFIG_NUMA 8391 if (num_node_state(N_MEMORY) == 1) 8392 hashdist = 0; 8393 #endif 8394 8395 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC, 8396 "mm/page_alloc:pcp", 8397 page_alloc_cpu_online, 8398 page_alloc_cpu_dead); 8399 WARN_ON(ret < 0); 8400 } 8401 8402 /* 8403 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 8404 * or min_free_kbytes changes. 8405 */ 8406 static void calculate_totalreserve_pages(void) 8407 { 8408 struct pglist_data *pgdat; 8409 unsigned long reserve_pages = 0; 8410 enum zone_type i, j; 8411 8412 for_each_online_pgdat(pgdat) { 8413 8414 pgdat->totalreserve_pages = 0; 8415 8416 for (i = 0; i < MAX_NR_ZONES; i++) { 8417 struct zone *zone = pgdat->node_zones + i; 8418 long max = 0; 8419 unsigned long managed_pages = zone_managed_pages(zone); 8420 8421 /* Find valid and maximum lowmem_reserve in the zone */ 8422 for (j = i; j < MAX_NR_ZONES; j++) { 8423 if (zone->lowmem_reserve[j] > max) 8424 max = zone->lowmem_reserve[j]; 8425 } 8426 8427 /* we treat the high watermark as reserved pages. */ 8428 max += high_wmark_pages(zone); 8429 8430 if (max > managed_pages) 8431 max = managed_pages; 8432 8433 pgdat->totalreserve_pages += max; 8434 8435 reserve_pages += max; 8436 } 8437 } 8438 totalreserve_pages = reserve_pages; 8439 } 8440 8441 /* 8442 * setup_per_zone_lowmem_reserve - called whenever 8443 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 8444 * has a correct pages reserved value, so an adequate number of 8445 * pages are left in the zone after a successful __alloc_pages(). 
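 *
 * Worked example (illustrative numbers): with
 * sysctl_lowmem_reserve_ratio[ZONE_DMA] == 256 and 1048576 managed
 * pages (4 GiB at 4 KiB pages) accumulated in the zones above ZONE_DMA,
 * ZONE_DMA's lowmem_reserve[] entry for the topmost of those zones
 * works out to 1048576 / 256 == 4096 pages, i.e. 16 MiB kept out of
 * reach of allocations that could have been satisfied from the higher
 * zones.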
8446 */ 8447 static void setup_per_zone_lowmem_reserve(void) 8448 { 8449 struct pglist_data *pgdat; 8450 enum zone_type i, j; 8451 8452 for_each_online_pgdat(pgdat) { 8453 for (i = 0; i < MAX_NR_ZONES - 1; i++) { 8454 struct zone *zone = &pgdat->node_zones[i]; 8455 int ratio = sysctl_lowmem_reserve_ratio[i]; 8456 bool clear = !ratio || !zone_managed_pages(zone); 8457 unsigned long managed_pages = 0; 8458 8459 for (j = i + 1; j < MAX_NR_ZONES; j++) { 8460 struct zone *upper_zone = &pgdat->node_zones[j]; 8461 8462 managed_pages += zone_managed_pages(upper_zone); 8463 8464 if (clear) 8465 zone->lowmem_reserve[j] = 0; 8466 else 8467 zone->lowmem_reserve[j] = managed_pages / ratio; 8468 } 8469 } 8470 } 8471 8472 /* update totalreserve_pages */ 8473 calculate_totalreserve_pages(); 8474 } 8475 8476 static void __setup_per_zone_wmarks(void) 8477 { 8478 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 8479 unsigned long lowmem_pages = 0; 8480 struct zone *zone; 8481 unsigned long flags; 8482 8483 /* Calculate total number of !ZONE_HIGHMEM pages */ 8484 for_each_zone(zone) { 8485 if (!is_highmem(zone)) 8486 lowmem_pages += zone_managed_pages(zone); 8487 } 8488 8489 for_each_zone(zone) { 8490 u64 tmp; 8491 8492 spin_lock_irqsave(&zone->lock, flags); 8493 tmp = (u64)pages_min * zone_managed_pages(zone); 8494 do_div(tmp, lowmem_pages); 8495 if (is_highmem(zone)) { 8496 /* 8497 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 8498 * need highmem pages, so cap pages_min to a small 8499 * value here. 8500 * 8501 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 8502 * deltas control async page reclaim, and so should 8503 * not be capped for highmem. 8504 */ 8505 unsigned long min_pages; 8506 8507 min_pages = zone_managed_pages(zone) / 1024; 8508 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 8509 zone->_watermark[WMARK_MIN] = min_pages; 8510 } else { 8511 /* 8512 * If it's a lowmem zone, reserve a number of pages 8513 * proportionate to the zone's size. 8514 */ 8515 zone->_watermark[WMARK_MIN] = tmp; 8516 } 8517 8518 /* 8519 * Set the kswapd watermarks distance according to the 8520 * scale factor in proportion to available memory, but 8521 * ensure a minimum size on small systems. 8522 */ 8523 tmp = max_t(u64, tmp >> 2, 8524 mult_frac(zone_managed_pages(zone), 8525 watermark_scale_factor, 10000)); 8526 8527 zone->watermark_boost = 0; 8528 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; 8529 zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; 8530 zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; 8531 8532 spin_unlock_irqrestore(&zone->lock, flags); 8533 } 8534 8535 /* update totalreserve_pages */ 8536 calculate_totalreserve_pages(); 8537 } 8538 8539 /** 8540 * setup_per_zone_wmarks - called when min_free_kbytes changes 8541 * or when memory is hot-{added|removed} 8542 * 8543 * Ensures that the watermark[min,low,high] values for each zone are set 8544 * correctly with respect to min_free_kbytes. 8545 */ 8546 void setup_per_zone_wmarks(void) 8547 { 8548 struct zone *zone; 8549 static DEFINE_SPINLOCK(lock); 8550 8551 spin_lock(&lock); 8552 __setup_per_zone_wmarks(); 8553 spin_unlock(&lock); 8554 8555 /* 8556 * The watermark size have changed so update the pcpu batch 8557 * and high limits or the limits may be inappropriate. 8558 */ 8559 for_each_zone(zone) 8560 zone_pcp_update(zone, 0); 8561 } 8562 8563 /* 8564 * Initialise min_free_kbytes. 8565 * 8566 * For small machines we want it small (128k min). 
For large machines 8567 * we want it large (256MB max). But it is not linear, because network 8568 * bandwidth does not increase linearly with machine size. We use 8569 * 8570 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 8571 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 8572 * 8573 * which yields 8574 * 8575 * 16MB: 512k 8576 * 32MB: 724k 8577 * 64MB: 1024k 8578 * 128MB: 1448k 8579 * 256MB: 2048k 8580 * 512MB: 2896k 8581 * 1024MB: 4096k 8582 * 2048MB: 5792k 8583 * 4096MB: 8192k 8584 * 8192MB: 11584k 8585 * 16384MB: 16384k 8586 */ 8587 void calculate_min_free_kbytes(void) 8588 { 8589 unsigned long lowmem_kbytes; 8590 int new_min_free_kbytes; 8591 8592 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 8593 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 8594 8595 if (new_min_free_kbytes > user_min_free_kbytes) 8596 min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); 8597 else 8598 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 8599 new_min_free_kbytes, user_min_free_kbytes); 8600 8601 } 8602 8603 int __meminit init_per_zone_wmark_min(void) 8604 { 8605 calculate_min_free_kbytes(); 8606 setup_per_zone_wmarks(); 8607 refresh_zone_stat_thresholds(); 8608 setup_per_zone_lowmem_reserve(); 8609 8610 #ifdef CONFIG_NUMA 8611 setup_min_unmapped_ratio(); 8612 setup_min_slab_ratio(); 8613 #endif 8614 8615 khugepaged_min_free_kbytes_update(); 8616 8617 return 0; 8618 } 8619 postcore_initcall(init_per_zone_wmark_min) 8620 8621 /* 8622 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 8623 * that we can call two helper functions whenever min_free_kbytes 8624 * changes. 8625 */ 8626 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, 8627 void *buffer, size_t *length, loff_t *ppos) 8628 { 8629 int rc; 8630 8631 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8632 if (rc) 8633 return rc; 8634 8635 if (write) { 8636 user_min_free_kbytes = min_free_kbytes; 8637 setup_per_zone_wmarks(); 8638 } 8639 return 0; 8640 } 8641 8642 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, 8643 void *buffer, size_t *length, loff_t *ppos) 8644 { 8645 int rc; 8646 8647 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8648 if (rc) 8649 return rc; 8650 8651 if (write) 8652 setup_per_zone_wmarks(); 8653 8654 return 0; 8655 } 8656 8657 #ifdef CONFIG_NUMA 8658 static void setup_min_unmapped_ratio(void) 8659 { 8660 pg_data_t *pgdat; 8661 struct zone *zone; 8662 8663 for_each_online_pgdat(pgdat) 8664 pgdat->min_unmapped_pages = 0; 8665 8666 for_each_zone(zone) 8667 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * 8668 sysctl_min_unmapped_ratio) / 100; 8669 } 8670 8671 8672 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, 8673 void *buffer, size_t *length, loff_t *ppos) 8674 { 8675 int rc; 8676 8677 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8678 if (rc) 8679 return rc; 8680 8681 setup_min_unmapped_ratio(); 8682 8683 return 0; 8684 } 8685 8686 static void setup_min_slab_ratio(void) 8687 { 8688 pg_data_t *pgdat; 8689 struct zone *zone; 8690 8691 for_each_online_pgdat(pgdat) 8692 pgdat->min_slab_pages = 0; 8693 8694 for_each_zone(zone) 8695 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * 8696 sysctl_min_slab_ratio) / 100; 8697 } 8698 8699 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 8700 void *buffer, size_t *length, loff_t *ppos) 8701 
{ 8702 int rc; 8703 8704 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8705 if (rc) 8706 return rc; 8707 8708 setup_min_slab_ratio(); 8709 8710 return 0; 8711 } 8712 #endif 8713 8714 /* 8715 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 8716 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 8717 * whenever sysctl_lowmem_reserve_ratio changes. 8718 * 8719 * The reserve ratio obviously has absolutely no relation with the 8720 * minimum watermarks. The lowmem reserve ratio can only make sense 8721 * if in function of the boot time zone sizes. 8722 */ 8723 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, 8724 void *buffer, size_t *length, loff_t *ppos) 8725 { 8726 int i; 8727 8728 proc_dointvec_minmax(table, write, buffer, length, ppos); 8729 8730 for (i = 0; i < MAX_NR_ZONES; i++) { 8731 if (sysctl_lowmem_reserve_ratio[i] < 1) 8732 sysctl_lowmem_reserve_ratio[i] = 0; 8733 } 8734 8735 setup_per_zone_lowmem_reserve(); 8736 return 0; 8737 } 8738 8739 /* 8740 * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each 8741 * cpu. It is the fraction of total pages in each zone that a hot per cpu 8742 * pagelist can have before it gets flushed back to buddy allocator. 8743 */ 8744 int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, 8745 int write, void *buffer, size_t *length, loff_t *ppos) 8746 { 8747 struct zone *zone; 8748 int old_percpu_pagelist_high_fraction; 8749 int ret; 8750 8751 mutex_lock(&pcp_batch_high_lock); 8752 old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; 8753 8754 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 8755 if (!write || ret < 0) 8756 goto out; 8757 8758 /* Sanity checking to avoid pcp imbalance */ 8759 if (percpu_pagelist_high_fraction && 8760 percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) { 8761 percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction; 8762 ret = -EINVAL; 8763 goto out; 8764 } 8765 8766 /* No change? */ 8767 if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction) 8768 goto out; 8769 8770 for_each_populated_zone(zone) 8771 zone_set_pageset_high_and_batch(zone, 0); 8772 out: 8773 mutex_unlock(&pcp_batch_high_lock); 8774 return ret; 8775 } 8776 8777 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES 8778 /* 8779 * Returns the number of pages that arch has reserved but 8780 * is not known to alloc_large_system_hash(). 8781 */ 8782 static unsigned long __init arch_reserved_kernel_pages(void) 8783 { 8784 return 0; 8785 } 8786 #endif 8787 8788 /* 8789 * Adaptive scale is meant to reduce sizes of hash tables on large memory 8790 * machines. As memory size is increased the scale is also increased but at 8791 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory 8792 * quadruples the scale is increased by one, which means the size of hash table 8793 * only doubles, instead of quadrupling as well. 8794 * Because 32-bit systems cannot have large physical memory, where this scaling 8795 * makes sense, it is disabled on such platforms. 
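 *
 * Worked example (illustrative, 64-bit with 4 KiB pages): with 256 GiB
 * of memory, numentries starts at roughly 64M pages; the loop below
 * runs once (16M -> 64M), bumping scale by one, so the resulting hash
 * table has half the entries a purely linear scaling would have given.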
8796 */ 8797 #if __BITS_PER_LONG > 32 8798 #define ADAPT_SCALE_BASE (64ul << 30) 8799 #define ADAPT_SCALE_SHIFT 2 8800 #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) 8801 #endif 8802 8803 /* 8804 * allocate a large system hash table from bootmem 8805 * - it is assumed that the hash table must contain an exact power-of-2 8806 * quantity of entries 8807 * - limit is the number of hash buckets, not the total allocation size 8808 */ 8809 void *__init alloc_large_system_hash(const char *tablename, 8810 unsigned long bucketsize, 8811 unsigned long numentries, 8812 int scale, 8813 int flags, 8814 unsigned int *_hash_shift, 8815 unsigned int *_hash_mask, 8816 unsigned long low_limit, 8817 unsigned long high_limit) 8818 { 8819 unsigned long long max = high_limit; 8820 unsigned long log2qty, size; 8821 void *table = NULL; 8822 gfp_t gfp_flags; 8823 bool virt; 8824 bool huge; 8825 8826 /* allow the kernel cmdline to have a say */ 8827 if (!numentries) { 8828 /* round applicable memory size up to nearest megabyte */ 8829 numentries = nr_kernel_pages; 8830 numentries -= arch_reserved_kernel_pages(); 8831 8832 /* It isn't necessary when PAGE_SIZE >= 1MB */ 8833 if (PAGE_SHIFT < 20) 8834 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 8835 8836 #if __BITS_PER_LONG > 32 8837 if (!high_limit) { 8838 unsigned long adapt; 8839 8840 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; 8841 adapt <<= ADAPT_SCALE_SHIFT) 8842 scale++; 8843 } 8844 #endif 8845 8846 /* limit to 1 bucket per 2^scale bytes of low memory */ 8847 if (scale > PAGE_SHIFT) 8848 numentries >>= (scale - PAGE_SHIFT); 8849 else 8850 numentries <<= (PAGE_SHIFT - scale); 8851 8852 /* Make sure we've got at least a 0-order allocation.. */ 8853 if (unlikely(flags & HASH_SMALL)) { 8854 /* Makes no sense without HASH_EARLY */ 8855 WARN_ON(!(flags & HASH_EARLY)); 8856 if (!(numentries >> *_hash_shift)) { 8857 numentries = 1UL << *_hash_shift; 8858 BUG_ON(!numentries); 8859 } 8860 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 8861 numentries = PAGE_SIZE / bucketsize; 8862 } 8863 numentries = roundup_pow_of_two(numentries); 8864 8865 /* limit allocation size to 1/16 total memory by default */ 8866 if (max == 0) { 8867 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 8868 do_div(max, bucketsize); 8869 } 8870 max = min(max, 0x80000000ULL); 8871 8872 if (numentries < low_limit) 8873 numentries = low_limit; 8874 if (numentries > max) 8875 numentries = max; 8876 8877 log2qty = ilog2(numentries); 8878 8879 gfp_flags = (flags & HASH_ZERO) ? 
GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 8880 do { 8881 virt = false; 8882 size = bucketsize << log2qty; 8883 if (flags & HASH_EARLY) { 8884 if (flags & HASH_ZERO) 8885 table = memblock_alloc(size, SMP_CACHE_BYTES); 8886 else 8887 table = memblock_alloc_raw(size, 8888 SMP_CACHE_BYTES); 8889 } else if (get_order(size) >= MAX_ORDER || hashdist) { 8890 table = vmalloc_huge(size, gfp_flags); 8891 virt = true; 8892 if (table) 8893 huge = is_vm_area_hugepages(table); 8894 } else { 8895 /* 8896 * If bucketsize is not a power-of-two, we may free 8897 * some pages at the end of hash table which 8898 * alloc_pages_exact() automatically does 8899 */ 8900 table = alloc_pages_exact(size, gfp_flags); 8901 kmemleak_alloc(table, size, 1, gfp_flags); 8902 } 8903 } while (!table && size > PAGE_SIZE && --log2qty); 8904 8905 if (!table) 8906 panic("Failed to allocate %s hash table\n", tablename); 8907 8908 pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", 8909 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, 8910 virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); 8911 8912 if (_hash_shift) 8913 *_hash_shift = log2qty; 8914 if (_hash_mask) 8915 *_hash_mask = (1 << log2qty) - 1; 8916 8917 return table; 8918 } 8919 8920 /* 8921 * This function checks whether pageblock includes unmovable pages or not. 8922 * 8923 * PageLRU check without isolation or lru_lock could race so that 8924 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable 8925 * check without lock_page also may miss some movable non-lru pages at 8926 * race condition. So you can't expect this function should be exact. 8927 * 8928 * Returns a page without holding a reference. If the caller wants to 8929 * dereference that page (e.g., dumping), it has to make sure that it 8930 * cannot get removed (e.g., via memory unplug) concurrently. 8931 * 8932 */ 8933 struct page *has_unmovable_pages(struct zone *zone, struct page *page, 8934 int migratetype, int flags) 8935 { 8936 unsigned long iter = 0; 8937 unsigned long pfn = page_to_pfn(page); 8938 unsigned long offset = pfn % pageblock_nr_pages; 8939 8940 if (is_migrate_cma_page(page)) { 8941 /* 8942 * CMA allocations (alloc_contig_range) really need to mark 8943 * isolate CMA pageblocks even when they are not movable in fact 8944 * so consider them movable here. 8945 */ 8946 if (is_migrate_cma(migratetype)) 8947 return NULL; 8948 8949 return page; 8950 } 8951 8952 for (; iter < pageblock_nr_pages - offset; iter++) { 8953 page = pfn_to_page(pfn + iter); 8954 8955 /* 8956 * Both, bootmem allocations and memory holes are marked 8957 * PG_reserved and are unmovable. We can even have unmovable 8958 * allocations inside ZONE_MOVABLE, for example when 8959 * specifying "movablecore". 8960 */ 8961 if (PageReserved(page)) 8962 return page; 8963 8964 /* 8965 * If the zone is movable and we have ruled out all reserved 8966 * pages then it should be reasonably safe to assume the rest 8967 * is movable. 8968 */ 8969 if (zone_idx(zone) == ZONE_MOVABLE) 8970 continue; 8971 8972 /* 8973 * Hugepages are not in LRU lists, but they're movable. 8974 * THPs are on the LRU, but need to be counted as #small pages. 8975 * We need not scan over tail pages because we don't 8976 * handle each tail page individually in migration. 
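 *
 * Illustrative example (assuming a 2 MiB THP, i.e. 512 subpages): if
 * the scan lands on the third subpage, compound_nr(head) == 512 and
 * page - head == 2, so skip_pages == 510 and the iterator advances
 * past the rest of the compound page in a single step.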
8977 */ 8978 if (PageHuge(page) || PageTransCompound(page)) { 8979 struct page *head = compound_head(page); 8980 unsigned int skip_pages; 8981 8982 if (PageHuge(page)) { 8983 if (!hugepage_migration_supported(page_hstate(head))) 8984 return page; 8985 } else if (!PageLRU(head) && !__PageMovable(head)) { 8986 return page; 8987 } 8988 8989 skip_pages = compound_nr(head) - (page - head); 8990 iter += skip_pages - 1; 8991 continue; 8992 } 8993 8994 /* 8995 * We can't use page_count without pin a page 8996 * because another CPU can free compound page. 8997 * This check already skips compound tails of THP 8998 * because their page->_refcount is zero at all time. 8999 */ 9000 if (!page_ref_count(page)) { 9001 if (PageBuddy(page)) 9002 iter += (1 << buddy_order(page)) - 1; 9003 continue; 9004 } 9005 9006 /* 9007 * The HWPoisoned page may be not in buddy system, and 9008 * page_count() is not 0. 9009 */ 9010 if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) 9011 continue; 9012 9013 /* 9014 * We treat all PageOffline() pages as movable when offlining 9015 * to give drivers a chance to decrement their reference count 9016 * in MEM_GOING_OFFLINE in order to indicate that these pages 9017 * can be offlined as there are no direct references anymore. 9018 * For actually unmovable PageOffline() where the driver does 9019 * not support this, we will fail later when trying to actually 9020 * move these pages that still have a reference count > 0. 9021 * (false negatives in this function only) 9022 */ 9023 if ((flags & MEMORY_OFFLINE) && PageOffline(page)) 9024 continue; 9025 9026 if (__PageMovable(page) || PageLRU(page)) 9027 continue; 9028 9029 /* 9030 * If there are RECLAIMABLE pages, we need to check 9031 * it. But now, memory offline itself doesn't call 9032 * shrink_node_slabs() and it still to be fixed. 9033 */ 9034 return page; 9035 } 9036 return NULL; 9037 } 9038 9039 #ifdef CONFIG_CONTIG_ALLOC 9040 static unsigned long pfn_max_align_down(unsigned long pfn) 9041 { 9042 return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES); 9043 } 9044 9045 static unsigned long pfn_max_align_up(unsigned long pfn) 9046 { 9047 return ALIGN(pfn, MAX_ORDER_NR_PAGES); 9048 } 9049 9050 #if defined(CONFIG_DYNAMIC_DEBUG) || \ 9051 (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) 9052 /* Usage: See admin-guide/dynamic-debug-howto.rst */ 9053 static void alloc_contig_dump_pages(struct list_head *page_list) 9054 { 9055 DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); 9056 9057 if (DYNAMIC_DEBUG_BRANCH(descriptor)) { 9058 struct page *page; 9059 9060 dump_stack(); 9061 list_for_each_entry(page, page_list, lru) 9062 dump_page(page, "migration failure"); 9063 } 9064 } 9065 #else 9066 static inline void alloc_contig_dump_pages(struct list_head *page_list) 9067 { 9068 } 9069 #endif 9070 9071 /* [start, end) must belong to a single zone. */ 9072 static int __alloc_contig_migrate_range(struct compact_control *cc, 9073 unsigned long start, unsigned long end) 9074 { 9075 /* This function is based on compact_zone() from compaction.c. 
*/ 9076 unsigned int nr_reclaimed; 9077 unsigned long pfn = start; 9078 unsigned int tries = 0; 9079 int ret = 0; 9080 struct migration_target_control mtc = { 9081 .nid = zone_to_nid(cc->zone), 9082 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 9083 }; 9084 9085 lru_cache_disable(); 9086 9087 while (pfn < end || !list_empty(&cc->migratepages)) { 9088 if (fatal_signal_pending(current)) { 9089 ret = -EINTR; 9090 break; 9091 } 9092 9093 if (list_empty(&cc->migratepages)) { 9094 cc->nr_migratepages = 0; 9095 ret = isolate_migratepages_range(cc, pfn, end); 9096 if (ret && ret != -EAGAIN) 9097 break; 9098 pfn = cc->migrate_pfn; 9099 tries = 0; 9100 } else if (++tries == 5) { 9101 ret = -EBUSY; 9102 break; 9103 } 9104 9105 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 9106 &cc->migratepages); 9107 cc->nr_migratepages -= nr_reclaimed; 9108 9109 ret = migrate_pages(&cc->migratepages, alloc_migration_target, 9110 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); 9111 9112 /* 9113 * On -ENOMEM, migrate_pages() bails out right away. It is pointless 9114 * to retry again over this error, so do the same here. 9115 */ 9116 if (ret == -ENOMEM) 9117 break; 9118 } 9119 9120 lru_cache_enable(); 9121 if (ret < 0) { 9122 if (ret == -EBUSY) 9123 alloc_contig_dump_pages(&cc->migratepages); 9124 putback_movable_pages(&cc->migratepages); 9125 return ret; 9126 } 9127 return 0; 9128 } 9129 9130 /** 9131 * alloc_contig_range() -- tries to allocate given range of pages 9132 * @start: start PFN to allocate 9133 * @end: one-past-the-last PFN to allocate 9134 * @migratetype: migratetype of the underlying pageblocks (either 9135 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 9136 * in range must have the same migratetype and it must 9137 * be either of the two. 9138 * @gfp_mask: GFP mask to use during compaction 9139 * 9140 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 9141 * aligned. The PFN range must belong to a single zone. 9142 * 9143 * The first thing this routine does is attempt to MIGRATE_ISOLATE all 9144 * pageblocks in the range. Once isolated, the pageblocks should not 9145 * be modified by others. 9146 * 9147 * Return: zero on success or negative error code. On success all 9148 * pages which PFN is in [start, end) are allocated for the caller and 9149 * need to be freed with free_contig_range(). 9150 */ 9151 int alloc_contig_range(unsigned long start, unsigned long end, 9152 unsigned migratetype, gfp_t gfp_mask) 9153 { 9154 unsigned long outer_start, outer_end; 9155 unsigned int order; 9156 int ret = 0; 9157 9158 struct compact_control cc = { 9159 .nr_migratepages = 0, 9160 .order = -1, 9161 .zone = page_zone(pfn_to_page(start)), 9162 .mode = MIGRATE_SYNC, 9163 .ignore_skip_hint = true, 9164 .no_set_skip_hint = true, 9165 .gfp_mask = current_gfp_context(gfp_mask), 9166 .alloc_contig = true, 9167 }; 9168 INIT_LIST_HEAD(&cc.migratepages); 9169 9170 /* 9171 * What we do here is we mark all pageblocks in range as 9172 * MIGRATE_ISOLATE. Because pageblock and max order pages may 9173 * have different sizes, and due to the way page allocator 9174 * work, we align the range to biggest of the two pages so 9175 * that page allocator won't try to merge buddies from 9176 * different pageblocks and change MIGRATE_ISOLATE to some 9177 * other migration type. 9178 * 9179 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 9180 * migrate the pages from an unaligned range (ie. pages that 9181 * we are interested in). 

/**
 * alloc_contig_range() -- tries to allocate given range of pages
 * @start:	start PFN to allocate
 * @end:	one-past-the-last PFN to allocate
 * @migratetype:	migratetype of the underlying pageblocks (either
 *			#MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
 *			in range must have the same migratetype and it must
 *			be either of the two.
 * @gfp_mask:	GFP mask to use during compaction
 *
 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
 * aligned. The PFN range must belong to a single zone.
 *
 * The first thing this routine does is attempt to MIGRATE_ISOLATE all
 * pageblocks in the range. Once isolated, the pageblocks should not
 * be modified by others.
 *
 * Return: zero on success or negative error code. On success all
 * pages whose PFN is in [start, end) are allocated for the caller and
 * need to be freed with free_contig_range().
 */
int alloc_contig_range(unsigned long start, unsigned long end,
		       unsigned migratetype, gfp_t gfp_mask)
{
	unsigned long outer_start, outer_end;
	unsigned int order;
	int ret = 0;

	struct compact_control cc = {
		.nr_migratepages = 0,
		.order = -1,
		.zone = page_zone(pfn_to_page(start)),
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
		.no_set_skip_hint = true,
		.gfp_mask = current_gfp_context(gfp_mask),
		.alloc_contig = true,
	};
	INIT_LIST_HEAD(&cc.migratepages);

	/*
	 * What we do here is mark all pageblocks in range as
	 * MIGRATE_ISOLATE. Because pageblock and max order pages may
	 * have different sizes, and due to the way the page allocator
	 * works, we align the range to the bigger of the two so that
	 * the page allocator won't try to merge buddies from
	 * different pageblocks and change MIGRATE_ISOLATE to some
	 * other migration type.
	 *
	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
	 * migrate the pages from the unaligned range (ie. the pages that
	 * we are interested in). This will put all the pages in
	 * range back to the page allocator as MIGRATE_ISOLATE.
	 *
	 * When this is done, we take the pages in range from the page
	 * allocator, removing them from the buddy system. This way
	 * the page allocator will never consider using them.
	 *
	 * This lets us mark the pageblocks back as
	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
	 * aligned range but not in the unaligned, original range are
	 * put back to the page allocator so that the buddy can use them.
	 */

	ret = start_isolate_page_range(pfn_max_align_down(start),
				       pfn_max_align_up(end), migratetype, 0);
	if (ret)
		return ret;

	drain_all_pages(cc.zone);

	/*
	 * In case of -EBUSY, we'd like to know which page causes the problem.
	 * So, just fall through. test_pages_isolated() has a tracepoint
	 * which will report the busy page.
	 *
	 * It is possible that busy pages could become available before
	 * the call to test_pages_isolated, and the range will actually be
	 * allocated. So, if we fall through be sure to clear ret so that
	 * -EBUSY is not accidentally used or returned to the caller.
	 */
	ret = __alloc_contig_migrate_range(&cc, start, end);
	if (ret && ret != -EBUSY)
		goto done;
	ret = 0;

	/*
	 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
	 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
	 * more, all pages in [start, end) are free in the page allocator.
	 * What we are going to do is allocate all pages from
	 * [start, end) (that is, remove them from the page allocator).
	 *
	 * The only problem is that pages at the beginning and at the
	 * end of the interesting range may not be aligned with pages that
	 * the page allocator holds, ie. they can be part of higher order
	 * pages. Because of this, we reserve the bigger range and
	 * once this is done free the pages we are not interested in.
	 *
	 * We don't have to hold zone->lock here because the pages are
	 * isolated thus they won't get removed from buddy.
	 */

	order = 0;
	outer_start = start;
	while (!PageBuddy(pfn_to_page(outer_start))) {
		if (++order >= MAX_ORDER) {
			outer_start = start;
			break;
		}
		outer_start &= ~0UL << order;
	}

	if (outer_start != start) {
		order = buddy_order(pfn_to_page(outer_start));

		/*
		 * outer_start page could be a small order buddy page that
		 * doesn't include the start page. Adjust outer_start
		 * in this case to report the failed page properly
		 * on the tracepoint in test_pages_isolated().
		 */
		if (outer_start + (1UL << order) <= start)
			outer_start = start;
	}

	/* Make sure the range is really isolated. */
	if (test_pages_isolated(outer_start, end, 0)) {
		ret = -EBUSY;
		goto done;
	}

	/* Grab isolated pages from freelists. */
	outer_end = isolate_freepages_range(&cc, outer_start, end);
	if (!outer_end) {
		ret = -EBUSY;
		goto done;
	}

	/* Free head and tail (if any) */
	if (start != outer_start)
		free_contig_range(outer_start, start - outer_start);
	if (end != outer_end)
		free_contig_range(end, outer_end - end);

done:
	undo_isolate_page_range(pfn_max_align_down(start),
				pfn_max_align_up(end), migratetype);
	return ret;
}
EXPORT_SYMBOL(alloc_contig_range);
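
/*
 * Illustrative sketch only (hypothetical helper, not called anywhere): the
 * basic contract of alloc_contig_range() above. The caller names an exact
 * [start, end) PFN range inside a single zone whose pageblocks are
 * MIGRATE_MOVABLE (CMA areas would pass MIGRATE_CMA instead) and, on
 * success, owns every page in the range until free_contig_range() hands
 * them back. The gfp flags are just an example choice.
 */
static __maybe_unused int claim_pfn_range_sketch(unsigned long start_pfn,
						 unsigned long nr_pages)
{
	int ret;

	ret = alloc_contig_range(start_pfn, start_pfn + nr_pages,
				 MIGRATE_MOVABLE, GFP_KERNEL);
	if (ret)
		return ret;	/* typically -EBUSY if migration failed */

	/* ... use pfn_to_page(start_pfn) .. pfn_to_page(start_pfn + nr_pages - 1) ... */

	free_contig_range(start_pfn, nr_pages);
	return 0;
}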

static int __alloc_contig_pages(unsigned long start_pfn,
				unsigned long nr_pages, gfp_t gfp_mask)
{
	unsigned long end_pfn = start_pfn + nr_pages;

	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
				  gfp_mask);
}

static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
				   unsigned long nr_pages)
{
	unsigned long i, end_pfn = start_pfn + nr_pages;
	struct page *page;

	for (i = start_pfn; i < end_pfn; i++) {
		page = pfn_to_online_page(i);
		if (!page)
			return false;

		if (page_zone(page) != z)
			return false;

		if (PageReserved(page))
			return false;
	}
	return true;
}

static bool zone_spans_last_pfn(const struct zone *zone,
				unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long last_pfn = start_pfn + nr_pages - 1;

	return zone_spans_pfn(zone, last_pfn);
}

/**
 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
 * @nr_pages:	Number of contiguous pages to allocate
 * @gfp_mask:	GFP mask to limit search and used during compaction
 * @nid:	Target node
 * @nodemask:	Mask for other possible nodes
 *
 * This routine is a wrapper around alloc_contig_range(). It scans over zones
 * on an applicable zonelist to find a contiguous pfn range which can then be
 * tried for allocation with alloc_contig_range(). This routine is intended
 * for allocation requests which can not be fulfilled with the buddy allocator.
 *
 * The allocated memory is always aligned to a page boundary. If nr_pages is a
 * power of two, then the allocated range is also guaranteed to be aligned to
 * nr_pages (e.g. a 1GB request would be aligned to 1GB).
 *
 * Allocated pages can be freed with free_contig_range() or by manually calling
 * __free_page() on each allocated page.
 *
 * Return: pointer to contiguous pages on success, or NULL if not successful.
 */
struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
				int nid, nodemask_t *nodemask)
{
	unsigned long ret, pfn, flags;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;

	zonelist = node_zonelist(nid, gfp_mask);
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(gfp_mask), nodemask) {
		spin_lock_irqsave(&zone->lock, flags);

		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
				/*
				 * We release the zone lock here because
				 * alloc_contig_range() will also lock the zone
				 * at some point. If there's an allocation
				 * spinning on this lock, it may win the race
				 * and cause alloc_contig_range() to fail...
				 */
				spin_unlock_irqrestore(&zone->lock, flags);
				ret = __alloc_contig_pages(pfn, nr_pages,
							   gfp_mask);
				if (!ret)
					return pfn_to_page(pfn);
				spin_lock_irqsave(&zone->lock, flags);
			}
			pfn += nr_pages;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
	}
	return NULL;
}
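
/*
 * Illustrative sketch only (hypothetical callers, not part of this file's
 * API): letting the allocator find a suitable contiguous range instead of
 * naming one, as described in the kerneldoc above, and releasing it with
 * free_contig_range(). The function names and gfp flags are example choices.
 */
static __maybe_unused struct page *contig_buffer_alloc_sketch(unsigned long nr_pages,
							      int nid)
{
	return alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN, nid, NULL);
}

static __maybe_unused void contig_buffer_free_sketch(struct page *page,
						     unsigned long nr_pages)
{
	free_contig_range(page_to_pfn(page), nr_pages);
}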

#endif /* CONFIG_CONTIG_ALLOC */

void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long count = 0;

	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		count += page_count(page) != 1;
		__free_page(page);
	}
	WARN(count != 0, "%lu pages are still in use!\n", count);
}
EXPORT_SYMBOL(free_contig_range);

/*
 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
 */
void zone_pcp_update(struct zone *zone, int cpu_online)
{
	mutex_lock(&pcp_batch_high_lock);
	zone_set_pageset_high_and_batch(zone, cpu_online);
	mutex_unlock(&pcp_batch_high_lock);
}

/*
 * Effectively disable pcplists for the zone by setting the high limit to 0
 * and draining all cpus. A concurrent page freeing on another CPU that's about
 * to put the page on pcplist will either finish before the drain and the page
 * will be drained, or observe the new high limit and skip the pcplist.
 *
 * Must be paired with a call to zone_pcp_enable().
 */
void zone_pcp_disable(struct zone *zone)
{
	mutex_lock(&pcp_batch_high_lock);
	__zone_set_pageset_high_and_batch(zone, 0, 1);
	__drain_all_pages(zone, true);
}

void zone_pcp_enable(struct zone *zone)
{
	__zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
	mutex_unlock(&pcp_batch_high_lock);
}
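
/*
 * Illustrative sketch only (hypothetical helper, not called anywhere): the
 * calling pattern the pair above expects. A real caller such as memory
 * offlining keeps the pcplists disabled for its whole critical section so
 * that freed pages go straight to the buddy lists.
 */
static __maybe_unused void zone_pcp_quiesce_sketch(struct zone *zone)
{
	zone_pcp_disable(zone);		/* high = 0, batch = 1, then drain all CPUs */

	/* ... isolate or offline pages without pcplist interference ... */

	zone_pcp_enable(zone);		/* restore high/batch and drop the mutex */
}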

void zone_pcp_reset(struct zone *zone)
{
	int cpu;
	struct per_cpu_zonestat *pzstats;

	if (zone->per_cpu_pageset != &boot_pageset) {
		for_each_online_cpu(cpu) {
			pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
			drain_zonestat(zone, pzstats);
		}
		free_percpu(zone->per_cpu_pageset);
		free_percpu(zone->per_cpu_zonestats);
		zone->per_cpu_pageset = &boot_pageset;
		zone->per_cpu_zonestats = &boot_zonestats;
	}
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be in a single zone, must not contain holes,
 * must span full sections, and must be isolated before calling this function.
 */
void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn = start_pfn;
	struct page *page;
	struct zone *zone;
	unsigned int order;
	unsigned long flags;

	offline_mem_sections(pfn, end_pfn);
	zone = page_zone(pfn_to_page(pfn));
	spin_lock_irqsave(&zone->lock, flags);
	while (pfn < end_pfn) {
		page = pfn_to_page(pfn);
		/*
		 * The HWPoisoned page may not be in the buddy system, and
		 * page_count() is not 0.
		 */
		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
			pfn++;
			continue;
		}
		/*
		 * At this point all remaining PageOffline() pages have a
		 * reference count of 0 and can simply be skipped.
		 */
		if (PageOffline(page)) {
			BUG_ON(page_count(page));
			BUG_ON(PageBuddy(page));
			pfn++;
			continue;
		}

		BUG_ON(page_count(page));
		BUG_ON(!PageBuddy(page));
		order = buddy_order(page);
		del_page_from_free_list(page, zone, order);
		pfn += (1 << order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif

/*
 * This function returns a stable result only if called under zone lock.
 */
bool is_free_buddy_page(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned int order;

	for (order = 0; order < MAX_ORDER; order++) {
		struct page *page_head = page - (pfn & ((1 << order) - 1));

		if (PageBuddy(page_head) &&
		    buddy_order_unsafe(page_head) >= order)
			break;
	}

	return order < MAX_ORDER;
}
EXPORT_SYMBOL(is_free_buddy_page);

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Break down a higher-order page into sub-pages, and keep our target page
 * out of the buddy allocator.
 */
static void break_down_buddy_pages(struct zone *zone, struct page *page,
				   struct page *target, int low, int high,
				   int migratetype)
{
	unsigned long size = 1 << high;
	struct page *current_buddy, *next_page;

	while (high > low) {
		high--;
		size >>= 1;

		if (target >= &page[size]) {
			next_page = page + size;
			current_buddy = page;
		} else {
			next_page = page;
			current_buddy = page + size;
		}

		if (set_page_guard(zone, current_buddy, high, migratetype))
			continue;

		if (current_buddy != target) {
			add_to_free_list(current_buddy, zone, high, migratetype);
			set_buddy_order(current_buddy, high);
			page = next_page;
		}
	}
}

/*
 * Take a page that will be marked as poisoned off the buddy allocator.
 */
bool take_page_off_buddy(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long flags;
	unsigned int order;
	bool ret = false;

	spin_lock_irqsave(&zone->lock, flags);
	for (order = 0; order < MAX_ORDER; order++) {
		struct page *page_head = page - (pfn & ((1 << order) - 1));
		int page_order = buddy_order(page_head);

		if (PageBuddy(page_head) && page_order >= order) {
			unsigned long pfn_head = page_to_pfn(page_head);
			int migratetype = get_pfnblock_migratetype(page_head,
								   pfn_head);

			del_page_from_free_list(page_head, zone, page_order);
			break_down_buddy_pages(zone, page_head, page, 0,
					       page_order, migratetype);
			SetPageHWPoisonTakenOff(page);
			if (!is_migrate_isolate(migratetype))
				__mod_zone_freepage_state(zone, -1, migratetype);
			ret = true;
			break;
		}
		if (page_count(page_head) > 0)
			break;
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	return ret;
}

/*
 * Cancel a takeoff done by take_page_off_buddy().
 */
bool put_page_back_buddy(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long flags;
	int migratetype = get_pfnblock_migratetype(page, pfn);
	bool ret = false;

	spin_lock_irqsave(&zone->lock, flags);
	if (put_page_testzero(page)) {
		ClearPageHWPoisonTakenOff(page);
		__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
		if (TestClearPageHWPoison(page)) {
			num_poisoned_pages_dec();
			ret = true;
		}
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	return ret;
}
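
/*
 * Illustrative sketch only (hypothetical helper, not called anywhere):
 * roughly how the hwpoison code pulls a free page out of the buddy
 * allocator, mirroring the take_page_off_buddy() + page_ref_inc() pattern
 * used by memory_failure() in mm/memory-failure.c. put_page_back_buddy()
 * above is the inverse, called when the poisoning is undone.
 */
static __maybe_unused bool isolate_free_poisoned_page_sketch(struct page *page)
{
	if (!is_free_buddy_page(page))
		return false;

	if (!take_page_off_buddy(page))
		return false;	/* lost the race: the page got allocated */

	/*
	 * The page now has a zero refcount and is invisible to the
	 * allocator; pin it so it stays that way until poison handling
	 * (or put_page_back_buddy()) releases it.
	 */
	page_ref_inc(page);
	return true;
}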

#endif

#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void)
{
	struct pglist_data *pgdat;

	for_each_online_pgdat(pgdat) {
		struct zone *zone = &pgdat->node_zones[ZONE_DMA];

		if (managed_zone(zone))
			return true;
	}
	return false;
}
#endif /* CONFIG_ZONE_DMA */
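
/*
 * Illustrative sketch only (hypothetical caller, assuming the usual
 * has_managed_dma() declaration/stub in <linux/mm.h>): the helper above lets
 * callers skip pointless ZONE_DMA allocations on systems where the zone is
 * configured but ends up with no managed pages. Clearing __GFP_DMA as a
 * fallback is just an example policy, not something this file prescribes.
 */
static __maybe_unused gfp_t dma_gfp_fixup_sketch(gfp_t gfp)
{
	if ((gfp & __GFP_DMA) && !has_managed_dma())
		gfp &= ~__GFP_DMA;	/* no usable ZONE_DMA pages; don't insist */

	return gfp;
}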