// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <linux/buffer_head.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"

/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" - it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page to the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 * to allow for optimizations when handing back either fresh pages
 * (memory onlining) or untouched pages (page isolation, free page
 * reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

/*
 * Don't poison memory with KASAN (only for the tag-based modes).
 * During boot, all non-reserved memblock memory is exposed to page_alloc.
 * Poisoning all that memory lengthens boot time, especially on systems with
 * a large amount of RAM. This flag is used to skip that poisoning.
 * This is only done for the tag-based KASAN modes, as those are able to
 * detect memory corruptions with the memory tags assigned by default.
 * All memory allocated normally after boot gets poisoned as usual.
 */
#define FPI_SKIP_KASAN_POISON	((__force fpi_t)BIT(2))

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)

struct pagesets {
	local_lock_t lock;
#if defined(CONFIG_DEBUG_INFO_BTF) &&				\
	!defined(CONFIG_DEBUG_LOCK_ALLOC) &&			\
	!defined(CONFIG_PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT)
	/*
	 * pahole 1.21 and earlier gets confused by zero-sized per-CPU
	 * variables and produces invalid BTF. Ensure that
	 * sizeof(struct pagesets) != 0 for older versions of pahole.
	 */
	char __pahole_hack;
	#warning "pahole too old to support zero-sized struct pagesets"
#endif
};
static DEFINE_PER_CPU(struct pagesets, pagesets) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

/* work_structs for global per-cpu drains */
struct pcpu_drain {
	struct zone *zone;
	struct work_struct work;
};
static DEFINE_MUTEX(pcpu_drain_mutex);
static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

atomic_long_t _totalram_pages __read_mostly;
EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;

int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);

DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);

static bool _init_on_alloc_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
{
	return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);

static bool _init_on_free_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
static int __init early_init_on_free(char *buf)
{
	return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);

/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
	return page->index;
}

static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended. To avoid races with the suspend/hibernate code,
 * they should always be called with system_transition_mutex held
 * (gfp_allowed_mask also should only be modified with system_transition_mutex
 * held, unless the suspend/hibernate code is guaranteed not to run in parallel
 * with that modification).
 */

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&system_transition_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&system_transition_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		return false;
	return true;
}
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};

const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
	[NULL_COMPOUND_DTOR] = NULL,
	[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	[HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	[TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_boost_factor __read_mostly = 15000;
int watermark_scale_factor = 10;

static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;

static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);

#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
static DEFINE_STATIC_KEY_TRUE(deferred_pages);

/*
 * kasan_poison_pages() is only called after deferred memory initialization
 * has completed. Poisoning pages during deferred memory init would greatly
 * lengthen the process and cause problems on large-memory systems, as
 * deferred page initialization is done with interrupts disabled.
 *
 * Assuming that there will be no reference to those newly initialized
 * pages before they are ever allocated, this should have no effect on
 * KASAN memory tracking as the poison will be properly inserted at page
 * allocation time. The only corner case is when pages are allocated by
 * on-demand allocation and then freed again before the deferred pages
 * initialization is done, but this is not likely to happen.
 */
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
	return static_branch_unlikely(&deferred_pages) ||
	       (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
		(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
	       PageSkipKASanPoison(page);
}

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}

/*
 * Returns true when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	static unsigned long prev_end_pfn, nr_initialised;

	/*
	 * The prev_end_pfn static contains the end of the previous zone.
	 * No need to protect because called very early in boot before smp_init.
	 */
	if (prev_end_pfn != end_pfn) {
		prev_end_pfn = end_pfn;
		nr_initialised = 0;
	}

	/* Always populate low zones for address-constrained allocations */
	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
		return false;

	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
		return true;
	/*
	 * We start only with one section of pages, more pages are added as
	 * needed until the rest of deferred pages are initialized.
	 */
	nr_initialised++;
	if ((nr_initialised > PAGES_PER_SECTION) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		NODE_DATA(nid)->first_deferred_pfn = pfn;
		return true;
	}
	return false;
}
#else
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
	return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
		(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
	       PageSkipKASanPoison(page);
}

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	return false;
}
#endif

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(const struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return section_to_usemap(__pfn_to_section(pfn));
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
#else
	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
#endif /* CONFIG_SPARSEMEM */
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}

static __always_inline
unsigned long __get_pfnblock_flags_mask(const struct page *page,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	word = bitmap[word_bitidx];
	return (word >> bitidx) & mask;
}

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
unsigned long get_pfnblock_flags_mask(const struct page *page,
					unsigned long pfn, unsigned long mask)
{
	return __get_pfnblock_flags_mask(page, pfn, mask);
}

static __always_inline int get_pfnblock_migratetype(const struct page *page,
						    unsigned long pfn)
{
	return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}

/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long old_word, word;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	mask <<= bitidx;
	flags <<= bitidx;

	word = READ_ONCE(bitmap[word_bitidx]);
	for (;;) {
		old_word = cmpxchg(&bitmap[word_bitidx], word,
				   (word & ~mask) | flags);
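		/*
		 * Illustrative note (added comment): cmpxchg() returns the
		 * value that was actually in bitmap[word_bitidx]. If another
		 * CPU modified the word since the READ_ONCE() above, old_word
		 * differs from word and the loop retries against the freshly
		 * observed value.
		 */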
		if (word == old_word)
			break;
		word = old_word;
	}
}

void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pfnblock_flags_mask(page, (unsigned long)migratetype,
				page_to_pfn(page), MIGRATETYPE_MASK);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page state: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	dump_page(page, reason);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

static inline unsigned int order_to_pindex(int migratetype, int order)
{
	int base = order;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order > PAGE_ALLOC_COSTLY_ORDER) {
		VM_BUG_ON(order != pageblock_order);
		base = PAGE_ALLOC_COSTLY_ORDER + 1;
	}
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return (MIGRATE_PCPTYPES * base) + migratetype;
}

static inline int pindex_to_order(unsigned int pindex)
{
	int order = pindex / MIGRATE_PCPTYPES;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order > PAGE_ALLOC_COSTLY_ORDER) {
		order = pageblock_order;
		VM_BUG_ON(order != pageblock_order);
	}
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return order;
}

static inline bool pcp_allowed_order(unsigned int order)
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order == pageblock_order)
		return true;
#endif
	return false;
}

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (pcp_allowed_order(order))		/* Via pcp? */
		free_unref_page(page, order);
	else
		__free_pages_ok(page, order, FPI_NONE);
}

/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits point to the head page.
 *
 * The first tail page's ->compound_dtor holds the offset in the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
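
/*
 * Illustrative sketch (not part of the original source): for an order-2
 * compound page the layout described above is
 *
 *	page[0]		head page, PG_head set
 *	page[1]		first tail: compound_head = &page[0] | 1,
 *			->compound_dtor and ->compound_order stored here
 *	page[2..3]	tail pages: compound_head = &page[0] | 1
 *
 * so compound_head() resolves to page[0] for every page in the block, and
 * PageTail() is true for page[1..3] because bit 0 of ->compound_head is set.
 */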

void free_compound_page(struct page *page)
{
	mem_cgroup_uncharge(page);
	free_the_page(page, compound_order(page));
}

void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		p->mapping = TAIL_MAPPING;
		set_compound_head(p, page);
	}

	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	set_compound_order(page, order);
	atomic_set(compound_mapcount_ptr(page), -1);
	if (hpage_pincount_available(page))
		atomic_set(compound_pincount_ptr(page), 0);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;

bool _debug_pagealloc_enabled_early __read_mostly
			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
EXPORT_SYMBOL(_debug_pagealloc_enabled);

DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

static int __init early_debug_pagealloc(char *buf)
{
	return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);

static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
		pr_err("Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);

static inline bool set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return false;

	if (order >= debug_guardpage_minorder())
		return false;

	__SetPageGuard(page);
	INIT_LIST_HEAD(&page->lru);
	set_page_private(page, order);
	/* Guard pages are not available for any usage */
	__mod_zone_freepage_state(zone, -(1 << order), migratetype);

	return true;
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return;

	__ClearPageGuard(page);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
#endif

/*
 * Enable static keys related to various memory debugging and hardening options.
 * Some override others, and depend on early params that are evaluated in the
 * order of appearance. So we need to first gather the full picture of what was
 * enabled, and then make decisions.
 */
void init_mem_debugging_and_hardening(void)
{
	bool page_poisoning_requested = false;

#ifdef CONFIG_PAGE_POISONING
	/*
	 * Page poisoning is debug page alloc for some arches. If
	 * either of those options are enabled, enable poisoning.
	 */
	if (page_poisoning_enabled() ||
	     (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
	      debug_pagealloc_enabled())) {
		static_branch_enable(&_page_poisoning_enabled);
		page_poisoning_requested = true;
	}
#endif

	if (_init_on_alloc_enabled_early) {
		if (page_poisoning_requested)
			pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
				"will take precedence over init_on_alloc\n");
		else
			static_branch_enable(&init_on_alloc);
	}
	if (_init_on_free_enabled_early) {
		if (page_poisoning_requested)
			pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
				"will take precedence over init_on_free\n");
		else
			static_branch_enable(&init_on_free);
	}

#ifdef CONFIG_DEBUG_PAGEALLOC
	if (!debug_pagealloc_enabled())
		return;

	static_branch_enable(&_debug_pagealloc_enabled);

	if (!debug_guardpage_minorder())
		return;

	static_branch_enable(&_debug_guardpage_enabled);
#endif
}

static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set PageBuddy.
 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline bool page_is_buddy(struct page *page, struct page *buddy,
							unsigned int order)
{
	if (!page_is_guard(buddy) && !PageBuddy(buddy))
		return false;

	if (buddy_order(buddy) != order)
		return false;

	/*
	 * zone check is done late to avoid uselessly calculating
	 * zone/node ids for pages that could never merge.
	 */
	if (page_zone_id(page) != page_zone_id(buddy))
		return false;

	VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

	return true;
}

#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
	struct capture_control *capc = current->capture_control;

	return unlikely(capc) &&
		!(current->flags & PF_KTHREAD) &&
		!capc->page &&
		capc->cc->zone == zone ? capc : NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	if (!capc || order != capc->cc->order)
		return false;

	/* Do not accidentally pollute CMA or isolated regions */
	if (is_migrate_cma(migratetype) ||
	    is_migrate_isolate(migratetype))
		return false;

	/*
	 * Do not let lower order allocations pollute a movable pageblock.
	 * This might let an unmovable request use a reclaimable pageblock
	 * and vice-versa but no more than normal fallback logic which can
	 * have trouble finding a high-order free page.
	 */
	if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
		return false;

	capc->page = page;
	return true;
}

#else
static inline struct capture_control *task_capc(struct zone *zone)
{
	return NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

/* Used for pages not on another list */
static inline void add_to_free_list(struct page *page, struct zone *zone,
				    unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_add(&page->lru, &area->free_list[migratetype]);
	area->nr_free++;
}

/* Used for pages not on another list */
static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
					 unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_add_tail(&page->lru, &area->free_list[migratetype]);
	area->nr_free++;
}

/*
 * Used for pages which are on another list. Move the pages to the tail
 * of the list - so the moved pages won't immediately be considered for
 * allocation again (e.g., optimization for memory onlining).
 */
static inline void move_to_free_list(struct page *page, struct zone *zone,
				     unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_move_tail(&page->lru, &area->free_list[migratetype]);
}

static inline void del_page_from_free_list(struct page *page, struct zone *zone,
					   unsigned int order)
{
	/* clear reported state and update reported page count */
	if (page_reported(page))
		__ClearPageReported(page);

	list_del(&page->lru);
	__ClearPageBuddy(page);
	set_page_private(page, 0);
	zone->free_area[order].nr_free--;
}

/*
 * If this is not the largest possible page, check if the buddy
 * of the next-highest order is free. If it is, it's possible
 * that pages are being freed that will coalesce soon. In case
 * that is happening, add the free page to the tail of the list
 * so it's less likely to be used soon and more likely to be merged
 * as a higher order page.
 */
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
		   struct page *page, unsigned int order)
{
	struct page *higher_page, *higher_buddy;
	unsigned long combined_pfn;

	if (order >= MAX_ORDER - 2)
		return false;

	if (!pfn_valid_within(buddy_pfn))
		return false;

	combined_pfn = buddy_pfn & pfn;
	higher_page = page + (combined_pfn - pfn);
	buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
	higher_buddy = higher_page + (buddy_pfn - combined_pfn);

	return pfn_valid_within(buddy_pfn) &&
	       page_is_buddy(higher_page, higher_buddy, order + 1);
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length (1 << order) and marked with PageBuddy.
 * The page's order is recorded in the page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
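
/*
 * Worked example (illustrative, not part of the original source), assuming
 * the usual __find_buddy_pfn() definition buddy_pfn = pfn ^ (1 << order):
 * freeing an order-0 page at pfn 8 gives buddy_pfn 9; if that page is free,
 * the pair merges into an order-1 block at combined_pfn = 8 & 9 = 8. The
 * order-1 buddy of pfn 8 is pfn 10, so another merge would produce an
 * order-2 block at pfn 8, and so on until the buddy is not free or the
 * max_order limit below is reached.
 */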

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype, fpi_t fpi_flags)
{
	struct capture_control *capc = task_capc(zone);
	unsigned long buddy_pfn;
	unsigned long combined_pfn;
	unsigned int max_order;
	struct page *buddy;
	bool to_tail;

	max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (likely(!is_migrate_isolate(migratetype)))
		__mod_zone_freepage_state(zone, 1 << order, migratetype);

	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

continue_merging:
	while (order < max_order) {
		if (compaction_capture(capc, page, order, migratetype)) {
			__mod_zone_freepage_state(zone, -(1 << order),
								migratetype);
			return;
		}
		buddy_pfn = __find_buddy_pfn(pfn, order);
		buddy = page + (buddy_pfn - pfn);

		if (!pfn_valid_within(buddy_pfn))
			goto done_merging;
		if (!page_is_buddy(page, buddy, order))
			goto done_merging;
		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy))
			clear_page_guard(zone, buddy, order, migratetype);
		else
			del_page_from_free_list(buddy, zone, order);
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}
	if (order < MAX_ORDER - 1) {
		/* If we are here, it means order is >= pageblock_order.
		 * We want to prevent merge between freepages on isolate
		 * pageblock and normal pageblock. Without this, pageblock
		 * isolation could cause incorrect freepage or CMA accounting.
		 *
		 * We don't want to hit this code for the more frequent
		 * low-order merging.
		 */
		if (unlikely(has_isolate_pageblock(zone))) {
			int buddy_mt;

			buddy_pfn = __find_buddy_pfn(pfn, order);
			buddy = page + (buddy_pfn - pfn);
			buddy_mt = get_pageblock_migratetype(buddy);

			if (migratetype != buddy_mt
					&& (is_migrate_isolate(migratetype) ||
						is_migrate_isolate(buddy_mt)))
				goto done_merging;
		}
		max_order = order + 1;
		goto continue_merging;
	}

done_merging:
	set_buddy_order(page, order);

	if (fpi_flags & FPI_TO_TAIL)
		to_tail = true;
	else if (is_shuffle_order(order))
		to_tail = shuffle_pick_tail();
	else
		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);

	if (to_tail)
		add_to_free_list_tail(page, zone, order, migratetype);
	else
		add_to_free_list(page, zone, order, migratetype);

	/* Notify page reporting subsystem of freed page */
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		page_reporting_notify_free(order);
}

/*
 * A bad page could be due to a number of fields. Instead of multiple branches,
 * try and check multiple fields with one check. The caller must do a detailed
 * check if necessary.
 */
static inline bool page_expected_state(struct page *page,
					unsigned long check_flags)
{
	if (unlikely(atomic_read(&page->_mapcount) != -1))
		return false;

	if (unlikely((unsigned long)page->mapping |
			page_ref_count(page) |
#ifdef CONFIG_MEMCG
			page->memcg_data |
#endif
			(page->flags & check_flags)))
		return false;

	return true;
}

static const char *page_bad_reason(struct page *page, unsigned long flags)
{
	const char *bad_reason = NULL;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(page_ref_count(page) != 0))
		bad_reason = "nonzero _refcount";
	if (unlikely(page->flags & flags)) {
		if (flags == PAGE_FLAGS_CHECK_AT_PREP)
			bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
		else
			bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->memcg_data))
		bad_reason = "page still charged to cgroup";
#endif
	return bad_reason;
}

static void check_free_page_bad(struct page *page)
{
	bad_page(page,
		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}

static inline int check_free_page(struct page *page)
{
	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
		return 0;

	/* Something has gone sideways, find it */
	check_free_page_bad(page);
	return 1;
}

static int free_tail_pages_check(struct page *head_page, struct page *page)
{
	int ret = 1;

	/*
	 * We rely on page->lru.next never having bit 0 set, unless the page
	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
	 */
	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);

	if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
		ret = 0;
		goto out;
	}
	switch (page - head_page) {
	case 1:
		/* the first tail page: ->mapping may be compound_mapcount() */
		if (unlikely(compound_mapcount(page))) {
			bad_page(page, "nonzero compound_mapcount");
			goto out;
		}
		break;
	case 2:
		/*
		 * the second tail page: ->mapping is
		 * deferred_list.next -- ignore value.
		 */
		break;
	default:
		if (page->mapping != TAIL_MAPPING) {
			bad_page(page, "corrupted mapping in tail page");
			goto out;
		}
		break;
	}
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set");
		goto out;
	}
	if (unlikely(compound_head(page) != head_page)) {
		bad_page(page, "compound_head not consistent");
		goto out;
	}
	ret = 0;
out:
	page->mapping = NULL;
	clear_compound_head(page);
	return ret;
}

static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
{
	int i;

	if (zero_tags) {
		for (i = 0; i < numpages; i++)
			tag_clear_highpage(page + i);
		return;
	}

	/* s390's use of memset() could override KASAN redzones. */
	kasan_disable_current();
	for (i = 0; i < numpages; i++) {
		u8 tag = page_kasan_tag(page + i);

		page_kasan_tag_reset(page + i);
		clear_highpage(page + i);
		page_kasan_tag_set(page + i, tag);
	}
	kasan_enable_current();
}

static __always_inline bool free_pages_prepare(struct page *page,
			unsigned int order, bool check_free, fpi_t fpi_flags)
{
	int bad = 0;
	bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);

	VM_BUG_ON_PAGE(PageTail(page), page);

	trace_mm_page_free(page, order);

	if (unlikely(PageHWPoison(page)) && !order) {
		/*
		 * Do not let hwpoison pages hit pcplists/buddy.
		 * Untie memcg state and reset page's owner.
		 */
		if (memcg_kmem_enabled() && PageMemcgKmem(page))
			__memcg_kmem_uncharge_page(page, order);
		reset_page_owner(page, order);
		return false;
	}

	/*
	 * Check tail pages before head page information is cleared to
	 * avoid checking PageCompound for order-0 pages.
	 */
	if (unlikely(order)) {
		bool compound = PageCompound(page);
		int i;

		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

		if (compound)
			ClearPageDoubleMap(page);
		for (i = 1; i < (1 << order); i++) {
			if (compound)
				bad += free_tail_pages_check(page, page + i);
			if (unlikely(check_free_page(page + i))) {
				bad++;
				continue;
			}
			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
		}
	}
	if (PageMappingFlags(page))
		page->mapping = NULL;
	if (memcg_kmem_enabled() && PageMemcgKmem(page))
		__memcg_kmem_uncharge_page(page, order);
	if (check_free)
		bad += check_free_page(page);
	if (bad)
		return false;

	page_cpupid_reset_last(page);
	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	reset_page_owner(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}

	kernel_poison_pages(page, 1 << order);

	/*
	 * As memory initialization might be integrated into KASAN,
	 * kasan_free_pages and kernel_init_free_pages must be
	 * kept together to avoid discrepancies in behavior.
	 *
	 * With hardware tag-based KASAN, memory tags must be set before the
	 * page becomes unavailable via debug_pagealloc or arch_free_page.
	 */
	if (kasan_has_integrated_init()) {
		if (!skip_kasan_poison)
			kasan_free_pages(page, order);
	} else {
		bool init = want_init_on_free();

		if (init)
			kernel_init_free_pages(page, 1 << order, false);
		if (!skip_kasan_poison)
			kasan_poison_pages(page, order, init);
	}

	/*
	 * arch_free_page() can make the page's contents inaccessible. s390
	 * does this. So nothing which can access the page's contents should
	 * happen after this.
	 */
	arch_free_page(page, order);

	debug_pagealloc_unmap_pages(page, 1 << order);

	return true;
}

#ifdef CONFIG_DEBUG_VM
/*
 * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
 * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
 * moved from pcp lists to free lists.
 */
static bool free_pcp_prepare(struct page *page, unsigned int order)
{
	return free_pages_prepare(page, order, true, FPI_NONE);
}

static bool bulkfree_pcp_prepare(struct page *page)
{
	if (debug_pagealloc_enabled_static())
		return check_free_page(page);
	else
		return false;
}
#else
/*
 * With DEBUG_VM disabled, order-0 pages being freed are checked only when
 * moving from pcp lists to free list in order to reduce overhead. With
 * debug_pagealloc enabled, they are checked also immediately when being freed
 * to the pcp lists.
 */
static bool free_pcp_prepare(struct page *page, unsigned int order)
{
	if (debug_pagealloc_enabled_static())
		return free_pages_prepare(page, order, true, FPI_NONE);
	else
		return free_pages_prepare(page, order, false, FPI_NONE);
}

static bool bulkfree_pcp_prepare(struct page *page)
{
	return check_free_page(page);
}
#endif /* CONFIG_DEBUG_VM */

static inline void prefetch_buddy(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
	struct page *buddy = page + (buddy_pfn - pfn);

	prefetch(buddy);
}

/*
 * Frees a number of pages from the PCP lists.
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp)
{
	int pindex = 0;
	int batch_free = 0;
	int nr_freed = 0;
	unsigned int order;
	int prefetch_nr = READ_ONCE(pcp->batch);
	bool isolated_pageblocks;
	struct page *page, *tmp;
	LIST_HEAD(head);

	/*
	 * Ensure a proper count is passed which otherwise would get stuck in
	 * the below while (list_empty(list)) loop.
	 */
	count = min(pcp->count, count);
	while (count > 0) {
		struct list_head *list;

		/*
		 * Remove pages from lists in a round-robin fashion. A
		 * batch_free count is maintained that is incremented when an
		 * empty list is encountered. This is so more pages are freed
		 * off fuller lists instead of spinning excessively around
		 * empty lists.
		 */
		do {
			batch_free++;
			if (++pindex == NR_PCP_LISTS)
				pindex = 0;
			list = &pcp->lists[pindex];
		} while (list_empty(list));

		/* This is the only non-empty list. Free them all. */
		if (batch_free == NR_PCP_LISTS)
			batch_free = count;

		order = pindex_to_order(pindex);
		BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
		do {
			page = list_last_entry(list, struct page, lru);
			/* must delete to avoid corrupting pcp list */
			list_del(&page->lru);
			nr_freed += 1 << order;
			count -= 1 << order;

			if (bulkfree_pcp_prepare(page))
				continue;

			/* Encode order with the migratetype */
			page->index <<= NR_PCP_ORDER_WIDTH;
			page->index |= order;

			list_add_tail(&page->lru, &head);

			/*
			 * We are going to put the page back to the global
			 * pool, prefetch its buddy to speed up later access
			 * under zone->lock. It is believed the overhead of
			 * an additional test and calculating buddy_pfn here
			 * can be offset by reduced memory latency later. To
			 * avoid excessive prefetching due to large count, only
			 * prefetch buddy for the first pcp->batch nr of pages.
			 */
			if (prefetch_nr) {
				prefetch_buddy(page);
				prefetch_nr--;
			}
		} while (count > 0 && --batch_free && !list_empty(list));
	}
	pcp->count -= nr_freed;

	/*
	 * local_lock_irq held so equivalent to spin_lock_irqsave for
	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
	 */
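	/*
	 * Illustrative note (not part of the original source): each page on
	 * the local "head" list now carries its pcp migratetype and order
	 * packed into page->index as (migratetype << NR_PCP_ORDER_WIDTH) |
	 * order. For example, assuming NR_PCP_ORDER_WIDTH were 8, a
	 * MIGRATE_MOVABLE (value 1) order-3 page would carry 0x103. The loop
	 * below undoes this with NR_PCP_ORDER_MASK and a right shift.
	 */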
	spin_lock(&zone->lock);
	isolated_pageblocks = has_isolate_pageblock(zone);

	/*
	 * Use safe version since after __free_one_page(),
	 * page->lru.next will not point to original list.
	 */
	list_for_each_entry_safe(page, tmp, &head, lru) {
		int mt = get_pcppage_migratetype(page);

		/* mt has been encoded with the order (see above) */
		order = mt & NR_PCP_ORDER_MASK;
		mt >>= NR_PCP_ORDER_WIDTH;

		/* MIGRATE_ISOLATE page should not go to pcplists */
		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
		/* Pageblock could have been isolated meanwhile */
		if (unlikely(isolated_pageblocks))
			mt = get_pageblock_migratetype(page);

		__free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
		trace_mm_page_pcpu_drain(page, order, mt);
	}
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype, fpi_t fpi_flags)
{
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);
	if (unlikely(has_isolate_pageblock(zone) ||
		     is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
	spin_unlock_irqrestore(&zone->lock, flags);
}

static void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	mm_zero_struct_page(page);
	set_page_links(page, zone, nid, pfn);
	init_page_count(page);
	page_mapcount_reset(page);
	page_cpupid_reset_last(page);
	page_kasan_tag_reset(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __meminit init_reserved_page(unsigned long pfn)
{
	pg_data_t *pgdat;
	int nid, zid;

	if (!early_page_uninitialised(pfn))
		return;

	nid = early_pfn_to_nid(pfn);
	pgdat = NODE_DATA(nid);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
			break;
	}
	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
static inline void init_reserved_page(unsigned long pfn)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/*
 * Initialised pages do not have PageReserved set. This function is
 * called for each range allocated by the bootmem allocator and
 * marks the pages PageReserved. The remaining valid pages are later
 * sent to the buddy page allocator.
 */
void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long end_pfn = PFN_UP(end);

	for (; start_pfn < end_pfn; start_pfn++) {
		if (pfn_valid(start_pfn)) {
			struct page *page = pfn_to_page(start_pfn);

			init_reserved_page(start_pfn);

			/* Avoid false-positive PageTail() */
			INIT_LIST_HEAD(&page->lru);

			/*
			 * no need for atomic set_bit because the struct
			 * page is not visible yet so nobody should
			 * access it yet.
			 */
			__SetPageReserved(page);
		}
	}
}

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);
	struct zone *zone = page_zone(page);

	if (!free_pages_prepare(page, order, true, fpi_flags))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);

	spin_lock_irqsave(&zone->lock, flags);
	if (unlikely(has_isolate_pageblock(zone) ||
		     is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
	spin_unlock_irqrestore(&zone->lock, flags);

	__count_vm_events(PGFREE, 1 << order);
}

void __free_pages_core(struct page *page, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	struct page *p = page;
	unsigned int loop;

	/*
	 * When initializing the memmap, __init_single_page() sets the refcount
	 * of all pages to 1 ("allocated"/"not free"). We have to set the
	 * refcount of all involved pages to 0.
	 */
	prefetchw(p);
	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
		prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}
	__ClearPageReserved(p);
	set_page_count(p, 0);

	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);

	/*
	 * Bypass PCP and place fresh pages right to the tail, primarily
	 * relevant for memory onlining.
	 */
	__free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
}

#ifdef CONFIG_NUMA

/*
 * During memory init memblocks map pfns to nids. The search is expensive and
 * this caches recent lookups. The implementation of __early_pfn_to_nid
 * treats start/end as pfns.
 */
struct mminit_pfnnid_cache {
	unsigned long last_start;
	unsigned long last_end;
	int last_nid;
};

static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

/*
 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
 */
static int __meminit __early_pfn_to_nid(unsigned long pfn,
					struct mminit_pfnnid_cache *state)
{
	unsigned long start_pfn, end_pfn;
	int nid;

	if (state->last_start <= pfn && pfn < state->last_end)
		return state->last_nid;

	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
	if (nid != NUMA_NO_NODE) {
		state->last_start = start_pfn;
		state->last_end = end_pfn;
		state->last_nid = nid;
	}

	return nid;
}

int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	spin_lock(&early_pfn_lock);
	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = first_online_node;
	spin_unlock(&early_pfn_lock);

	return nid;
}
#endif /* CONFIG_NUMA */

void __init memblock_free_pages(struct page *page, unsigned long pfn,
							unsigned int order)
{
	if (early_page_uninitialised(pfn))
		return;
	__free_pages_core(page, order);
}

/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration or free compaction scanner.
 * The scanners then need to use only the pfn_valid_within() check for arches
 * that allow holes within pageblocks.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zone's range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
 */
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
				     unsigned long end_pfn, struct zone *zone)
{
	struct page *start_page;
	struct page *end_page;

	/* end_pfn is one past the range we are checking */
	end_pfn--;

	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
		return NULL;

	start_page = pfn_to_online_page(start_pfn);
	if (!start_page)
		return NULL;

	if (page_zone(start_page) != zone)
		return NULL;

	end_page = pfn_to_page(end_pfn);

	/* This gives a shorter code than deriving page_zone(end_page) */
	if (page_zone_id(start_page) != page_zone_id(end_page))
		return NULL;

	return start_page;
}

void set_zone_contiguous(struct zone *zone)
{
	unsigned long block_start_pfn = zone->zone_start_pfn;
	unsigned long block_end_pfn;

	block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
	for (; block_start_pfn < zone_end_pfn(zone);
			block_start_pfn = block_end_pfn,
			 block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));

		if (!__pageblock_pfn_to_page(block_start_pfn,
					     block_end_pfn, zone))
			return;
		cond_resched();
	}

	/* We confirm that there is no hole */
	zone->contiguous = true;
}

void clear_zone_contiguous(struct zone *zone)
{
	zone->contiguous = false;
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(unsigned long pfn,
				       unsigned long nr_pages)
{
	struct page *page;
	unsigned long i;

	if (!nr_pages)
		return;

	page = pfn_to_page(pfn);

	/* Free a large naturally-aligned chunk if possible */
	if (nr_pages == pageblock_nr_pages &&
	    (pfn & (pageblock_nr_pages - 1)) == 0) {
		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		__free_pages_core(page, pageblock_order);
		return;
	}

	for (i = 0; i < nr_pages; i++, page++, pfn++) {
		if ((pfn & (pageblock_nr_pages - 1)) == 0)
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		__free_pages_core(page, 0);
	}
}

/* Completion tracking for deferred_init_memmap() threads */
static atomic_t pgdat_init_n_undone __initdata;
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);

static inline void __init pgdat_init_report_one_done(void)
{
	if (atomic_dec_and_test(&pgdat_init_n_undone))
		complete(&pgdat_init_all_done_comp);
}

/*
 * Returns true if page needs to be initialized or freed to buddy allocator.
 *
 * First we check if pfn is valid on architectures where it is possible to have
 * holes within pageblock_nr_pages.
On systems where it is not possible, this 1876 * function is optimized out. 1877 * 1878 * Then, we check if a current large page is valid by only checking the validity 1879 * of the head pfn. 1880 */ 1881 static inline bool __init deferred_pfn_valid(unsigned long pfn) 1882 { 1883 if (!pfn_valid_within(pfn)) 1884 return false; 1885 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) 1886 return false; 1887 return true; 1888 } 1889 1890 /* 1891 * Free pages to buddy allocator. Try to free aligned pages in 1892 * pageblock_nr_pages sizes. 1893 */ 1894 static void __init deferred_free_pages(unsigned long pfn, 1895 unsigned long end_pfn) 1896 { 1897 unsigned long nr_pgmask = pageblock_nr_pages - 1; 1898 unsigned long nr_free = 0; 1899 1900 for (; pfn < end_pfn; pfn++) { 1901 if (!deferred_pfn_valid(pfn)) { 1902 deferred_free_range(pfn - nr_free, nr_free); 1903 nr_free = 0; 1904 } else if (!(pfn & nr_pgmask)) { 1905 deferred_free_range(pfn - nr_free, nr_free); 1906 nr_free = 1; 1907 } else { 1908 nr_free++; 1909 } 1910 } 1911 /* Free the last block of pages to allocator */ 1912 deferred_free_range(pfn - nr_free, nr_free); 1913 } 1914 1915 /* 1916 * Initialize struct pages. We minimize pfn page lookups and scheduler checks 1917 * by performing it only once every pageblock_nr_pages. 1918 * Return number of pages initialized. 1919 */ 1920 static unsigned long __init deferred_init_pages(struct zone *zone, 1921 unsigned long pfn, 1922 unsigned long end_pfn) 1923 { 1924 unsigned long nr_pgmask = pageblock_nr_pages - 1; 1925 int nid = zone_to_nid(zone); 1926 unsigned long nr_pages = 0; 1927 int zid = zone_idx(zone); 1928 struct page *page = NULL; 1929 1930 for (; pfn < end_pfn; pfn++) { 1931 if (!deferred_pfn_valid(pfn)) { 1932 page = NULL; 1933 continue; 1934 } else if (!page || !(pfn & nr_pgmask)) { 1935 page = pfn_to_page(pfn); 1936 } else { 1937 page++; 1938 } 1939 __init_single_page(page, pfn, zid, nid); 1940 nr_pages++; 1941 } 1942 return (nr_pages); 1943 } 1944 1945 /* 1946 * This function is meant to pre-load the iterator for the zone init. 1947 * Specifically it walks through the ranges until we are caught up to the 1948 * first_init_pfn value and exits there. If we never encounter the value we 1949 * return false indicating there are no valid ranges left. 1950 */ 1951 static bool __init 1952 deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, 1953 unsigned long *spfn, unsigned long *epfn, 1954 unsigned long first_init_pfn) 1955 { 1956 u64 j; 1957 1958 /* 1959 * Start out by walking through the ranges in this zone that have 1960 * already been initialized. We don't need to do anything with them 1961 * so we just need to flush them out of the system. 1962 */ 1963 for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { 1964 if (*epfn <= first_init_pfn) 1965 continue; 1966 if (*spfn < first_init_pfn) 1967 *spfn = first_init_pfn; 1968 *i = j; 1969 return true; 1970 } 1971 1972 return false; 1973 } 1974 1975 /* 1976 * Initialize and free pages. We do it in two loops: first we initialize 1977 * struct page, then free to buddy allocator, because while we are 1978 * freeing pages we can access pages that are ahead (computing buddy 1979 * page in __free_one_page()). 1980 * 1981 * In order to try and keep some memory in the cache we have the loop 1982 * broken along max page order boundaries. This way we will not cause 1983 * any issues with the buddy page computation. 
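 * With the default MAX_ORDER of 11 and 4KiB pages, MAX_ORDER_NR_PAGES is
 * 1024, so each call below initialises and frees at most 1024 pages (4MiB),
 * stopping on the next such boundary.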
1984 */ 1985 static unsigned long __init 1986 deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, 1987 unsigned long *end_pfn) 1988 { 1989 unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); 1990 unsigned long spfn = *start_pfn, epfn = *end_pfn; 1991 unsigned long nr_pages = 0; 1992 u64 j = *i; 1993 1994 /* First we loop through and initialize the page values */ 1995 for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { 1996 unsigned long t; 1997 1998 if (mo_pfn <= *start_pfn) 1999 break; 2000 2001 t = min(mo_pfn, *end_pfn); 2002 nr_pages += deferred_init_pages(zone, *start_pfn, t); 2003 2004 if (mo_pfn < *end_pfn) { 2005 *start_pfn = mo_pfn; 2006 break; 2007 } 2008 } 2009 2010 /* Reset values and now loop through freeing pages as needed */ 2011 swap(j, *i); 2012 2013 for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { 2014 unsigned long t; 2015 2016 if (mo_pfn <= spfn) 2017 break; 2018 2019 t = min(mo_pfn, epfn); 2020 deferred_free_pages(spfn, t); 2021 2022 if (mo_pfn <= epfn) 2023 break; 2024 } 2025 2026 return nr_pages; 2027 } 2028 2029 static void __init 2030 deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, 2031 void *arg) 2032 { 2033 unsigned long spfn, epfn; 2034 struct zone *zone = arg; 2035 u64 i; 2036 2037 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); 2038 2039 /* 2040 * Initialize and free pages in MAX_ORDER sized increments so that we 2041 * can avoid introducing any issues with the buddy allocator. 2042 */ 2043 while (spfn < end_pfn) { 2044 deferred_init_maxorder(&i, zone, &spfn, &epfn); 2045 cond_resched(); 2046 } 2047 } 2048 2049 /* An arch may override for more concurrency. */ 2050 __weak int __init 2051 deferred_page_init_max_threads(const struct cpumask *node_cpumask) 2052 { 2053 return 1; 2054 } 2055 2056 /* Initialise remaining memory on a node */ 2057 static int __init deferred_init_memmap(void *data) 2058 { 2059 pg_data_t *pgdat = data; 2060 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 2061 unsigned long spfn = 0, epfn = 0; 2062 unsigned long first_init_pfn, flags; 2063 unsigned long start = jiffies; 2064 struct zone *zone; 2065 int zid, max_threads; 2066 u64 i; 2067 2068 /* Bind memory initialisation thread to a local node if possible */ 2069 if (!cpumask_empty(cpumask)) 2070 set_cpus_allowed_ptr(current, cpumask); 2071 2072 pgdat_resize_lock(pgdat, &flags); 2073 first_init_pfn = pgdat->first_deferred_pfn; 2074 if (first_init_pfn == ULONG_MAX) { 2075 pgdat_resize_unlock(pgdat, &flags); 2076 pgdat_init_report_one_done(); 2077 return 0; 2078 } 2079 2080 /* Sanity check boundaries */ 2081 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); 2082 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); 2083 pgdat->first_deferred_pfn = ULONG_MAX; 2084 2085 /* 2086 * Once we unlock here, the zone cannot be grown anymore, thus if an 2087 * interrupt thread must allocate this early in boot, zone must be 2088 * pre-grown prior to start of deferred page initialization. 
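 * That pre-growing is done by _deferred_grow_zone(), which the allocator
 * calls from get_page_from_freelist() until the deferred_pages static key
 * is disabled at the end of initialisation.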
2089 */ 2090 pgdat_resize_unlock(pgdat, &flags); 2091 2092 /* Only the highest zone is deferred so find it */ 2093 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2094 zone = pgdat->node_zones + zid; 2095 if (first_init_pfn < zone_end_pfn(zone)) 2096 break; 2097 } 2098 2099 /* If the zone is empty somebody else may have cleared out the zone */ 2100 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 2101 first_init_pfn)) 2102 goto zone_empty; 2103 2104 max_threads = deferred_page_init_max_threads(cpumask); 2105 2106 while (spfn < epfn) { 2107 unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); 2108 struct padata_mt_job job = { 2109 .thread_fn = deferred_init_memmap_chunk, 2110 .fn_arg = zone, 2111 .start = spfn, 2112 .size = epfn_align - spfn, 2113 .align = PAGES_PER_SECTION, 2114 .min_chunk = PAGES_PER_SECTION, 2115 .max_threads = max_threads, 2116 }; 2117 2118 padata_do_multithreaded(&job); 2119 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 2120 epfn_align); 2121 } 2122 zone_empty: 2123 /* Sanity check that the next zone really is unpopulated */ 2124 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); 2125 2126 pr_info("node %d deferred pages initialised in %ums\n", 2127 pgdat->node_id, jiffies_to_msecs(jiffies - start)); 2128 2129 pgdat_init_report_one_done(); 2130 return 0; 2131 } 2132 2133 /* 2134 * If this zone has deferred pages, try to grow it by initializing enough 2135 * deferred pages to satisfy the allocation specified by order, rounded up to 2136 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments 2137 * of SECTION_SIZE bytes by initializing struct pages in increments of 2138 * PAGES_PER_SECTION * sizeof(struct page) bytes. 2139 * 2140 * Return true when zone was grown, otherwise return false. We return true even 2141 * when we grow less than requested, to let the caller decide if there are 2142 * enough pages to satisfy the allocation. 2143 * 2144 * Note: We use noinline because this function is needed only during boot, and 2145 * it is called from a __ref function _deferred_grow_zone. This way we are 2146 * making sure that it is not inlined into permanent text section. 2147 */ 2148 static noinline bool __init 2149 deferred_grow_zone(struct zone *zone, unsigned int order) 2150 { 2151 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); 2152 pg_data_t *pgdat = zone->zone_pgdat; 2153 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; 2154 unsigned long spfn, epfn, flags; 2155 unsigned long nr_pages = 0; 2156 u64 i; 2157 2158 /* Only the last zone may have deferred pages */ 2159 if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) 2160 return false; 2161 2162 pgdat_resize_lock(pgdat, &flags); 2163 2164 /* 2165 * If someone grew this zone while we were waiting for spinlock, return 2166 * true, as there might be enough pages already. 2167 */ 2168 if (first_deferred_pfn != pgdat->first_deferred_pfn) { 2169 pgdat_resize_unlock(pgdat, &flags); 2170 return true; 2171 } 2172 2173 /* If the zone is empty somebody else may have cleared out the zone */ 2174 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 2175 first_deferred_pfn)) { 2176 pgdat->first_deferred_pfn = ULONG_MAX; 2177 pgdat_resize_unlock(pgdat, &flags); 2178 /* Retry only once. */ 2179 return first_deferred_pfn != ULONG_MAX; 2180 } 2181 2182 /* 2183 * Initialize and free pages in MAX_ORDER sized increments so 2184 * that we can avoid introducing any issues with the buddy 2185 * allocator. 
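 * The loop below stops only on PAGES_PER_SECTION boundaries, so a
 * successful grow initialises at least one section's worth of struct pages
 * (32768 pages, i.e. 128MiB, with the usual x86-64 section size) unless the
 * zone's remaining ranges run out first.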
2186 */ 2187 while (spfn < epfn) { 2188 /* update our first deferred PFN for this section */ 2189 first_deferred_pfn = spfn; 2190 2191 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); 2192 touch_nmi_watchdog(); 2193 2194 /* We should only stop along section boundaries */ 2195 if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) 2196 continue; 2197 2198 /* If our quota has been met we can stop here */ 2199 if (nr_pages >= nr_pages_needed) 2200 break; 2201 } 2202 2203 pgdat->first_deferred_pfn = spfn; 2204 pgdat_resize_unlock(pgdat, &flags); 2205 2206 return nr_pages > 0; 2207 } 2208 2209 /* 2210 * deferred_grow_zone() is __init, but it is called from 2211 * get_page_from_freelist() during early boot until deferred_pages permanently 2212 * disables this call. This is why we have refdata wrapper to avoid warning, 2213 * and to ensure that the function body gets unloaded. 2214 */ 2215 static bool __ref 2216 _deferred_grow_zone(struct zone *zone, unsigned int order) 2217 { 2218 return deferred_grow_zone(zone, order); 2219 } 2220 2221 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 2222 2223 void __init page_alloc_init_late(void) 2224 { 2225 struct zone *zone; 2226 int nid; 2227 2228 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 2229 2230 /* There will be num_node_state(N_MEMORY) threads */ 2231 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); 2232 for_each_node_state(nid, N_MEMORY) { 2233 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); 2234 } 2235 2236 /* Block until all are initialised */ 2237 wait_for_completion(&pgdat_init_all_done_comp); 2238 2239 /* 2240 * We initialized the rest of the deferred pages. Permanently disable 2241 * on-demand struct page initialization. 2242 */ 2243 static_branch_disable(&deferred_pages); 2244 2245 /* Reinit limits that are based on free pages after the kernel is up */ 2246 files_maxfiles_init(); 2247 #endif 2248 2249 buffer_init(); 2250 2251 /* Discard memblock private memory */ 2252 memblock_discard(); 2253 2254 for_each_node_state(nid, N_MEMORY) 2255 shuffle_free_memory(NODE_DATA(nid)); 2256 2257 for_each_populated_zone(zone) 2258 set_zone_contiguous(zone); 2259 } 2260 2261 #ifdef CONFIG_CMA 2262 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 2263 void __init init_cma_reserved_pageblock(struct page *page) 2264 { 2265 unsigned i = pageblock_nr_pages; 2266 struct page *p = page; 2267 2268 do { 2269 __ClearPageReserved(p); 2270 set_page_count(p, 0); 2271 } while (++p, --i); 2272 2273 set_pageblock_migratetype(page, MIGRATE_CMA); 2274 2275 if (pageblock_order >= MAX_ORDER) { 2276 i = pageblock_nr_pages; 2277 p = page; 2278 do { 2279 set_page_refcounted(p); 2280 __free_pages(p, MAX_ORDER - 1); 2281 p += MAX_ORDER_NR_PAGES; 2282 } while (i -= MAX_ORDER_NR_PAGES); 2283 } else { 2284 set_page_refcounted(page); 2285 __free_pages(page, pageblock_order); 2286 } 2287 2288 adjust_managed_page_count(page, pageblock_nr_pages); 2289 page_zone(page)->cma_pages += pageblock_nr_pages; 2290 } 2291 #endif 2292 2293 /* 2294 * The order of subdivision here is critical for the IO subsystem. 2295 * Please do not alter this order without good reasons and regression 2296 * testing. Specifically, as large blocks of memory are subdivided, 2297 * the order in which smaller blocks are delivered depends on the order 2298 * they're subdivided in this function. 
This is the primary factor 2299 * influencing the order in which pages are delivered to the IO 2300 * subsystem according to empirical testing, and this is also justified 2301 * by considering the behavior of a buddy system containing a single 2302 * large block of memory acted on by a series of small allocations. 2303 * This behavior is a critical factor in sglist merging's success. 2304 * 2305 * -- nyc 2306 */ 2307 static inline void expand(struct zone *zone, struct page *page, 2308 int low, int high, int migratetype) 2309 { 2310 unsigned long size = 1 << high; 2311 2312 while (high > low) { 2313 high--; 2314 size >>= 1; 2315 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 2316 2317 /* 2318 * Mark as guard pages (or page), that will allow to 2319 * merge back to allocator when buddy will be freed. 2320 * Corresponding page table entries will not be touched, 2321 * pages will stay not present in virtual address space 2322 */ 2323 if (set_page_guard(zone, &page[size], high, migratetype)) 2324 continue; 2325 2326 add_to_free_list(&page[size], zone, high, migratetype); 2327 set_buddy_order(&page[size], high); 2328 } 2329 } 2330 2331 static void check_new_page_bad(struct page *page) 2332 { 2333 if (unlikely(page->flags & __PG_HWPOISON)) { 2334 /* Don't complain about hwpoisoned pages */ 2335 page_mapcount_reset(page); /* remove PageBuddy */ 2336 return; 2337 } 2338 2339 bad_page(page, 2340 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); 2341 } 2342 2343 /* 2344 * This page is about to be returned from the page allocator 2345 */ 2346 static inline int check_new_page(struct page *page) 2347 { 2348 if (likely(page_expected_state(page, 2349 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) 2350 return 0; 2351 2352 check_new_page_bad(page); 2353 return 1; 2354 } 2355 2356 #ifdef CONFIG_DEBUG_VM 2357 /* 2358 * With DEBUG_VM enabled, order-0 pages are checked for expected state when 2359 * being allocated from pcp lists. With debug_pagealloc also enabled, they are 2360 * also checked when pcp lists are refilled from the free lists. 2361 */ 2362 static inline bool check_pcp_refill(struct page *page) 2363 { 2364 if (debug_pagealloc_enabled_static()) 2365 return check_new_page(page); 2366 else 2367 return false; 2368 } 2369 2370 static inline bool check_new_pcp(struct page *page) 2371 { 2372 return check_new_page(page); 2373 } 2374 #else 2375 /* 2376 * With DEBUG_VM disabled, free order-0 pages are checked for expected state 2377 * when pcp lists are being refilled from the free lists. With debug_pagealloc 2378 * enabled, they are also checked when being allocated from the pcp lists. 
2379 */ 2380 static inline bool check_pcp_refill(struct page *page) 2381 { 2382 return check_new_page(page); 2383 } 2384 static inline bool check_new_pcp(struct page *page) 2385 { 2386 if (debug_pagealloc_enabled_static()) 2387 return check_new_page(page); 2388 else 2389 return false; 2390 } 2391 #endif /* CONFIG_DEBUG_VM */ 2392 2393 static bool check_new_pages(struct page *page, unsigned int order) 2394 { 2395 int i; 2396 for (i = 0; i < (1 << order); i++) { 2397 struct page *p = page + i; 2398 2399 if (unlikely(check_new_page(p))) 2400 return true; 2401 } 2402 2403 return false; 2404 } 2405 2406 inline void post_alloc_hook(struct page *page, unsigned int order, 2407 gfp_t gfp_flags) 2408 { 2409 set_page_private(page, 0); 2410 set_page_refcounted(page); 2411 2412 arch_alloc_page(page, order); 2413 debug_pagealloc_map_pages(page, 1 << order); 2414 2415 /* 2416 * Page unpoisoning must happen before memory initialization. 2417 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO 2418 * allocations and the page unpoisoning code will complain. 2419 */ 2420 kernel_unpoison_pages(page, 1 << order); 2421 2422 /* 2423 * As memory initialization might be integrated into KASAN, 2424 * kasan_alloc_pages and kernel_init_free_pages must be 2425 * kept together to avoid discrepancies in behavior. 2426 */ 2427 if (kasan_has_integrated_init()) { 2428 kasan_alloc_pages(page, order, gfp_flags); 2429 } else { 2430 bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); 2431 2432 kasan_unpoison_pages(page, order, init); 2433 if (init) 2434 kernel_init_free_pages(page, 1 << order, 2435 gfp_flags & __GFP_ZEROTAGS); 2436 } 2437 2438 set_page_owner(page, order, gfp_flags); 2439 } 2440 2441 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 2442 unsigned int alloc_flags) 2443 { 2444 post_alloc_hook(page, order, gfp_flags); 2445 2446 if (order && (gfp_flags & __GFP_COMP)) 2447 prep_compound_page(page, order); 2448 2449 /* 2450 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to 2451 * allocate the page. The expectation is that the caller is taking 2452 * steps that will free more memory. The caller should avoid the page 2453 * being used for !PFMEMALLOC purposes. 
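 * Consumers can recognise such pages via page_is_pfmemalloc(); the slab
 * allocator and the networking stack use that to keep these pages for
 * memory-reclaim related work only.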
2454 */ 2455 if (alloc_flags & ALLOC_NO_WATERMARKS) 2456 set_page_pfmemalloc(page); 2457 else 2458 clear_page_pfmemalloc(page); 2459 } 2460 2461 /* 2462 * Go through the free lists for the given migratetype and remove 2463 * the smallest available page from the freelists 2464 */ 2465 static __always_inline 2466 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 2467 int migratetype) 2468 { 2469 unsigned int current_order; 2470 struct free_area *area; 2471 struct page *page; 2472 2473 /* Find a page of the appropriate size in the preferred list */ 2474 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 2475 area = &(zone->free_area[current_order]); 2476 page = get_page_from_free_area(area, migratetype); 2477 if (!page) 2478 continue; 2479 del_page_from_free_list(page, zone, current_order); 2480 expand(zone, page, order, current_order, migratetype); 2481 set_pcppage_migratetype(page, migratetype); 2482 return page; 2483 } 2484 2485 return NULL; 2486 } 2487 2488 2489 /* 2490 * This array describes the order lists are fallen back to when 2491 * the free lists for the desirable migrate type are depleted 2492 */ 2493 static int fallbacks[MIGRATE_TYPES][3] = { 2494 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 2495 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, 2496 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 2497 #ifdef CONFIG_CMA 2498 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ 2499 #endif 2500 #ifdef CONFIG_MEMORY_ISOLATION 2501 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ 2502 #endif 2503 }; 2504 2505 #ifdef CONFIG_CMA 2506 static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, 2507 unsigned int order) 2508 { 2509 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 2510 } 2511 #else 2512 static inline struct page *__rmqueue_cma_fallback(struct zone *zone, 2513 unsigned int order) { return NULL; } 2514 #endif 2515 2516 /* 2517 * Move the free pages in a range to the freelist tail of the requested type. 2518 * Note that start_page and end_pages are not aligned on a pageblock 2519 * boundary. If alignment is required, use move_freepages_block() 2520 */ 2521 static int move_freepages(struct zone *zone, 2522 unsigned long start_pfn, unsigned long end_pfn, 2523 int migratetype, int *num_movable) 2524 { 2525 struct page *page; 2526 unsigned long pfn; 2527 unsigned int order; 2528 int pages_moved = 0; 2529 2530 for (pfn = start_pfn; pfn <= end_pfn;) { 2531 if (!pfn_valid_within(pfn)) { 2532 pfn++; 2533 continue; 2534 } 2535 2536 page = pfn_to_page(pfn); 2537 if (!PageBuddy(page)) { 2538 /* 2539 * We assume that pages that could be isolated for 2540 * migration are movable. But we don't actually try 2541 * isolating, as that would be expensive. 
2542 */ 2543 if (num_movable && 2544 (PageLRU(page) || __PageMovable(page))) 2545 (*num_movable)++; 2546 pfn++; 2547 continue; 2548 } 2549 2550 /* Make sure we are not inadvertently changing nodes */ 2551 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); 2552 VM_BUG_ON_PAGE(page_zone(page) != zone, page); 2553 2554 order = buddy_order(page); 2555 move_to_free_list(page, zone, order, migratetype); 2556 pfn += 1 << order; 2557 pages_moved += 1 << order; 2558 } 2559 2560 return pages_moved; 2561 } 2562 2563 int move_freepages_block(struct zone *zone, struct page *page, 2564 int migratetype, int *num_movable) 2565 { 2566 unsigned long start_pfn, end_pfn, pfn; 2567 2568 if (num_movable) 2569 *num_movable = 0; 2570 2571 pfn = page_to_pfn(page); 2572 start_pfn = pfn & ~(pageblock_nr_pages - 1); 2573 end_pfn = start_pfn + pageblock_nr_pages - 1; 2574 2575 /* Do not cross zone boundaries */ 2576 if (!zone_spans_pfn(zone, start_pfn)) 2577 start_pfn = pfn; 2578 if (!zone_spans_pfn(zone, end_pfn)) 2579 return 0; 2580 2581 return move_freepages(zone, start_pfn, end_pfn, migratetype, 2582 num_movable); 2583 } 2584 2585 static void change_pageblock_range(struct page *pageblock_page, 2586 int start_order, int migratetype) 2587 { 2588 int nr_pageblocks = 1 << (start_order - pageblock_order); 2589 2590 while (nr_pageblocks--) { 2591 set_pageblock_migratetype(pageblock_page, migratetype); 2592 pageblock_page += pageblock_nr_pages; 2593 } 2594 } 2595 2596 /* 2597 * When we are falling back to another migratetype during allocation, try to 2598 * steal extra free pages from the same pageblocks to satisfy further 2599 * allocations, instead of polluting multiple pageblocks. 2600 * 2601 * If we are stealing a relatively large buddy page, it is likely there will 2602 * be more free pages in the pageblock, so try to steal them all. For 2603 * reclaimable and unmovable allocations, we steal regardless of page size, 2604 * as fragmentation caused by those allocations polluting movable pageblocks 2605 * is worse than movable allocations stealing from unmovable and reclaimable 2606 * pageblocks. 2607 */ 2608 static bool can_steal_fallback(unsigned int order, int start_mt) 2609 { 2610 /* 2611 * Leaving this order check is intended, although there is 2612 * relaxed order check in next check. The reason is that 2613 * we can actually steal whole pageblock if this condition met, 2614 * but, below check doesn't guarantee it and that is just heuristic 2615 * so could be changed anytime. 2616 */ 2617 if (order >= pageblock_order) 2618 return true; 2619 2620 if (order >= pageblock_order / 2 || 2621 start_mt == MIGRATE_RECLAIMABLE || 2622 start_mt == MIGRATE_UNMOVABLE || 2623 page_group_by_mobility_disabled) 2624 return true; 2625 2626 return false; 2627 } 2628 2629 static inline bool boost_watermark(struct zone *zone) 2630 { 2631 unsigned long max_boost; 2632 2633 if (!watermark_boost_factor) 2634 return false; 2635 /* 2636 * Don't bother in zones that are unlikely to produce results. 2637 * On small machines, including kdump capture kernels running 2638 * in a small area, boosting the watermark can cause an out of 2639 * memory situation immediately. 2640 */ 2641 if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) 2642 return false; 2643 2644 max_boost = mult_frac(zone->_watermark[WMARK_HIGH], 2645 watermark_boost_factor, 10000); 2646 2647 /* 2648 * high watermark may be uninitialised if fragmentation occurs 2649 * very early in boot so do not boost. 
We do not fall 2650 * through and boost by pageblock_nr_pages as failing 2651 * allocations that early means that reclaim is not going 2652 * to help and it may even be impossible to reclaim the 2653 * boosted watermark resulting in a hang. 2654 */ 2655 if (!max_boost) 2656 return false; 2657 2658 max_boost = max(pageblock_nr_pages, max_boost); 2659 2660 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, 2661 max_boost); 2662 2663 return true; 2664 } 2665 2666 /* 2667 * This function implements actual steal behaviour. If order is large enough, 2668 * we can steal whole pageblock. If not, we first move freepages in this 2669 * pageblock to our migratetype and determine how many already-allocated pages 2670 * are there in the pageblock with a compatible migratetype. If at least half 2671 * of pages are free or compatible, we can change migratetype of the pageblock 2672 * itself, so pages freed in the future will be put on the correct free list. 2673 */ 2674 static void steal_suitable_fallback(struct zone *zone, struct page *page, 2675 unsigned int alloc_flags, int start_type, bool whole_block) 2676 { 2677 unsigned int current_order = buddy_order(page); 2678 int free_pages, movable_pages, alike_pages; 2679 int old_block_type; 2680 2681 old_block_type = get_pageblock_migratetype(page); 2682 2683 /* 2684 * This can happen due to races and we want to prevent broken 2685 * highatomic accounting. 2686 */ 2687 if (is_migrate_highatomic(old_block_type)) 2688 goto single_page; 2689 2690 /* Take ownership for orders >= pageblock_order */ 2691 if (current_order >= pageblock_order) { 2692 change_pageblock_range(page, current_order, start_type); 2693 goto single_page; 2694 } 2695 2696 /* 2697 * Boost watermarks to increase reclaim pressure to reduce the 2698 * likelihood of future fallbacks. Wake kswapd now as the node 2699 * may be balanced overall and kswapd will not wake naturally. 2700 */ 2701 if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) 2702 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 2703 2704 /* We are not allowed to try stealing from the whole block */ 2705 if (!whole_block) 2706 goto single_page; 2707 2708 free_pages = move_freepages_block(zone, page, start_type, 2709 &movable_pages); 2710 /* 2711 * Determine how many pages are compatible with our allocation. 2712 * For movable allocation, it's the number of movable pages which 2713 * we just obtained. For other types it's a bit more tricky. 2714 */ 2715 if (start_type == MIGRATE_MOVABLE) { 2716 alike_pages = movable_pages; 2717 } else { 2718 /* 2719 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation 2720 * to MOVABLE pageblock, consider all non-movable pages as 2721 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or 2722 * vice versa, be conservative since we can't distinguish the 2723 * exact migratetype of non-movable pages. 2724 */ 2725 if (old_block_type == MIGRATE_MOVABLE) 2726 alike_pages = pageblock_nr_pages 2727 - (free_pages + movable_pages); 2728 else 2729 alike_pages = 0; 2730 } 2731 2732 /* moving whole block can fail due to zone boundary conditions */ 2733 if (!free_pages) 2734 goto single_page; 2735 2736 /* 2737 * If a sufficient number of pages in the block are either free or of 2738 * comparable migratability as our allocation, claim the whole block. 
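 * With the typical 2MiB pageblock (pageblock_order 9, i.e. 512 pages) the
 * threshold below works out to 256 pages.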
2739 */ 2740 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || 2741 page_group_by_mobility_disabled) 2742 set_pageblock_migratetype(page, start_type); 2743 2744 return; 2745 2746 single_page: 2747 move_to_free_list(page, zone, current_order, start_type); 2748 } 2749 2750 /* 2751 * Check whether there is a suitable fallback freepage with requested order. 2752 * If only_stealable is true, this function returns fallback_mt only if 2753 * we can steal other freepages all together. This would help to reduce 2754 * fragmentation due to mixed migratetype pages in one pageblock. 2755 */ 2756 int find_suitable_fallback(struct free_area *area, unsigned int order, 2757 int migratetype, bool only_stealable, bool *can_steal) 2758 { 2759 int i; 2760 int fallback_mt; 2761 2762 if (area->nr_free == 0) 2763 return -1; 2764 2765 *can_steal = false; 2766 for (i = 0;; i++) { 2767 fallback_mt = fallbacks[migratetype][i]; 2768 if (fallback_mt == MIGRATE_TYPES) 2769 break; 2770 2771 if (free_area_empty(area, fallback_mt)) 2772 continue; 2773 2774 if (can_steal_fallback(order, migratetype)) 2775 *can_steal = true; 2776 2777 if (!only_stealable) 2778 return fallback_mt; 2779 2780 if (*can_steal) 2781 return fallback_mt; 2782 } 2783 2784 return -1; 2785 } 2786 2787 /* 2788 * Reserve a pageblock for exclusive use of high-order atomic allocations if 2789 * there are no empty page blocks that contain a page with a suitable order 2790 */ 2791 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, 2792 unsigned int alloc_order) 2793 { 2794 int mt; 2795 unsigned long max_managed, flags; 2796 2797 /* 2798 * Limit the number reserved to 1 pageblock or roughly 1% of a zone. 2799 * Check is race-prone but harmless. 2800 */ 2801 max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; 2802 if (zone->nr_reserved_highatomic >= max_managed) 2803 return; 2804 2805 spin_lock_irqsave(&zone->lock, flags); 2806 2807 /* Recheck the nr_reserved_highatomic limit under the lock */ 2808 if (zone->nr_reserved_highatomic >= max_managed) 2809 goto out_unlock; 2810 2811 /* Yoink! */ 2812 mt = get_pageblock_migratetype(page); 2813 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) 2814 && !is_migrate_cma(mt)) { 2815 zone->nr_reserved_highatomic += pageblock_nr_pages; 2816 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); 2817 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); 2818 } 2819 2820 out_unlock: 2821 spin_unlock_irqrestore(&zone->lock, flags); 2822 } 2823 2824 /* 2825 * Used when an allocation is about to fail under memory pressure. This 2826 * potentially hurts the reliability of high-order allocations when under 2827 * intense memory pressure but failed atomic allocations should be easier 2828 * to recover from than an OOM. 2829 * 2830 * If @force is true, try to unreserve a pageblock even though highatomic 2831 * pageblock is exhausted. 2832 */ 2833 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, 2834 bool force) 2835 { 2836 struct zonelist *zonelist = ac->zonelist; 2837 unsigned long flags; 2838 struct zoneref *z; 2839 struct zone *zone; 2840 struct page *page; 2841 int order; 2842 bool ret; 2843 2844 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, 2845 ac->nodemask) { 2846 /* 2847 * Preserve at least one pageblock unless memory pressure 2848 * is really high. 
2849 */ 2850 if (!force && zone->nr_reserved_highatomic <= 2851 pageblock_nr_pages) 2852 continue; 2853 2854 spin_lock_irqsave(&zone->lock, flags); 2855 for (order = 0; order < MAX_ORDER; order++) { 2856 struct free_area *area = &(zone->free_area[order]); 2857 2858 page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); 2859 if (!page) 2860 continue; 2861 2862 /* 2863 * In page freeing path, migratetype change is racy so 2864 * we can counter several free pages in a pageblock 2865 * in this loop although we changed the pageblock type 2866 * from highatomic to ac->migratetype. So we should 2867 * adjust the count once. 2868 */ 2869 if (is_migrate_highatomic_page(page)) { 2870 /* 2871 * It should never happen but changes to 2872 * locking could inadvertently allow a per-cpu 2873 * drain to add pages to MIGRATE_HIGHATOMIC 2874 * while unreserving so be safe and watch for 2875 * underflows. 2876 */ 2877 zone->nr_reserved_highatomic -= min( 2878 pageblock_nr_pages, 2879 zone->nr_reserved_highatomic); 2880 } 2881 2882 /* 2883 * Convert to ac->migratetype and avoid the normal 2884 * pageblock stealing heuristics. Minimally, the caller 2885 * is doing the work and needs the pages. More 2886 * importantly, if the block was always converted to 2887 * MIGRATE_UNMOVABLE or another type then the number 2888 * of pageblocks that cannot be completely freed 2889 * may increase. 2890 */ 2891 set_pageblock_migratetype(page, ac->migratetype); 2892 ret = move_freepages_block(zone, page, ac->migratetype, 2893 NULL); 2894 if (ret) { 2895 spin_unlock_irqrestore(&zone->lock, flags); 2896 return ret; 2897 } 2898 } 2899 spin_unlock_irqrestore(&zone->lock, flags); 2900 } 2901 2902 return false; 2903 } 2904 2905 /* 2906 * Try finding a free buddy page on the fallback list and put it on the free 2907 * list of requested migratetype, possibly along with other pages from the same 2908 * block, depending on fragmentation avoidance heuristics. Returns true if 2909 * fallback was found so that __rmqueue_smallest() can grab it. 2910 * 2911 * The use of signed ints for order and current_order is a deliberate 2912 * deviation from the rest of this file, to make the for loop 2913 * condition simpler. 2914 */ 2915 static __always_inline bool 2916 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, 2917 unsigned int alloc_flags) 2918 { 2919 struct free_area *area; 2920 int current_order; 2921 int min_order = order; 2922 struct page *page; 2923 int fallback_mt; 2924 bool can_steal; 2925 2926 /* 2927 * Do not steal pages from freelists belonging to other pageblocks 2928 * i.e. orders < pageblock_order. If there are no local zones free, 2929 * the zonelists will be reiterated without ALLOC_NOFRAGMENT. 2930 */ 2931 if (alloc_flags & ALLOC_NOFRAGMENT) 2932 min_order = pageblock_order; 2933 2934 /* 2935 * Find the largest available free page in the other list. This roughly 2936 * approximates finding the pageblock with the most free pages, which 2937 * would be too costly to do exactly. 2938 */ 2939 for (current_order = MAX_ORDER - 1; current_order >= min_order; 2940 --current_order) { 2941 area = &(zone->free_area[current_order]); 2942 fallback_mt = find_suitable_fallback(area, current_order, 2943 start_migratetype, false, &can_steal); 2944 if (fallback_mt == -1) 2945 continue; 2946 2947 /* 2948 * We cannot steal all free pages from the pageblock and the 2949 * requested migratetype is movable. 
In that case it's better to 2950 * steal and split the smallest available page instead of the 2951 * largest available page, because even if the next movable 2952 * allocation falls back into a different pageblock than this 2953 * one, it won't cause permanent fragmentation. 2954 */ 2955 if (!can_steal && start_migratetype == MIGRATE_MOVABLE 2956 && current_order > order) 2957 goto find_smallest; 2958 2959 goto do_steal; 2960 } 2961 2962 return false; 2963 2964 find_smallest: 2965 for (current_order = order; current_order < MAX_ORDER; 2966 current_order++) { 2967 area = &(zone->free_area[current_order]); 2968 fallback_mt = find_suitable_fallback(area, current_order, 2969 start_migratetype, false, &can_steal); 2970 if (fallback_mt != -1) 2971 break; 2972 } 2973 2974 /* 2975 * This should not happen - we already found a suitable fallback 2976 * when looking for the largest page. 2977 */ 2978 VM_BUG_ON(current_order == MAX_ORDER); 2979 2980 do_steal: 2981 page = get_page_from_free_area(area, fallback_mt); 2982 2983 steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, 2984 can_steal); 2985 2986 trace_mm_page_alloc_extfrag(page, order, current_order, 2987 start_migratetype, fallback_mt); 2988 2989 return true; 2990 2991 } 2992 2993 /* 2994 * Do the hard work of removing an element from the buddy allocator. 2995 * Call me with the zone->lock already held. 2996 */ 2997 static __always_inline struct page * 2998 __rmqueue(struct zone *zone, unsigned int order, int migratetype, 2999 unsigned int alloc_flags) 3000 { 3001 struct page *page; 3002 3003 if (IS_ENABLED(CONFIG_CMA)) { 3004 /* 3005 * Balance movable allocations between regular and CMA areas by 3006 * allocating from CMA when over half of the zone's free memory 3007 * is in the CMA area. 3008 */ 3009 if (alloc_flags & ALLOC_CMA && 3010 zone_page_state(zone, NR_FREE_CMA_PAGES) > 3011 zone_page_state(zone, NR_FREE_PAGES) / 2) { 3012 page = __rmqueue_cma_fallback(zone, order); 3013 if (page) 3014 goto out; 3015 } 3016 } 3017 retry: 3018 page = __rmqueue_smallest(zone, order, migratetype); 3019 if (unlikely(!page)) { 3020 if (alloc_flags & ALLOC_CMA) 3021 page = __rmqueue_cma_fallback(zone, order); 3022 3023 if (!page && __rmqueue_fallback(zone, order, migratetype, 3024 alloc_flags)) 3025 goto retry; 3026 } 3027 out: 3028 if (page) 3029 trace_mm_page_alloc_zone_locked(page, order, migratetype); 3030 return page; 3031 } 3032 3033 /* 3034 * Obtain a specified number of elements from the buddy allocator, all under 3035 * a single hold of the lock, for efficiency. Add them to the supplied list. 3036 * Returns the number of new pages which were placed at *list. 3037 */ 3038 static int rmqueue_bulk(struct zone *zone, unsigned int order, 3039 unsigned long count, struct list_head *list, 3040 int migratetype, unsigned int alloc_flags) 3041 { 3042 int i, allocated = 0; 3043 3044 /* 3045 * local_lock_irq held so equivalent to spin_lock_irqsave for 3046 * both PREEMPT_RT and non-PREEMPT_RT configurations. 3047 */ 3048 spin_lock(&zone->lock); 3049 for (i = 0; i < count; ++i) { 3050 struct page *page = __rmqueue(zone, order, migratetype, 3051 alloc_flags); 3052 if (unlikely(page == NULL)) 3053 break; 3054 3055 if (unlikely(check_pcp_refill(page))) 3056 continue; 3057 3058 /* 3059 * Split buddy pages returned by expand() are received here in 3060 * physical page order. The page is added to the tail of 3061 * caller's list. From the callers perspective, the linked list 3062 * is ordered by page number under some conditions. 
This is 3063 * useful for IO devices that can forward direction from the 3064 * head, thus also in the physical page order. This is useful 3065 * for IO devices that can merge IO requests if the physical 3066 * pages are ordered properly. 3067 */ 3068 list_add_tail(&page->lru, list); 3069 allocated++; 3070 if (is_migrate_cma(get_pcppage_migratetype(page))) 3071 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 3072 -(1 << order)); 3073 } 3074 3075 /* 3076 * i pages were removed from the buddy list even if some leak due 3077 * to check_pcp_refill failing so adjust NR_FREE_PAGES based 3078 * on i. Do not confuse with 'allocated' which is the number of 3079 * pages added to the pcp list. 3080 */ 3081 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 3082 spin_unlock(&zone->lock); 3083 return allocated; 3084 } 3085 3086 #ifdef CONFIG_NUMA 3087 /* 3088 * Called from the vmstat counter updater to drain pagesets of this 3089 * currently executing processor on remote nodes after they have 3090 * expired. 3091 * 3092 * Note that this function must be called with the thread pinned to 3093 * a single processor. 3094 */ 3095 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 3096 { 3097 unsigned long flags; 3098 int to_drain, batch; 3099 3100 local_lock_irqsave(&pagesets.lock, flags); 3101 batch = READ_ONCE(pcp->batch); 3102 to_drain = min(pcp->count, batch); 3103 if (to_drain > 0) 3104 free_pcppages_bulk(zone, to_drain, pcp); 3105 local_unlock_irqrestore(&pagesets.lock, flags); 3106 } 3107 #endif 3108 3109 /* 3110 * Drain pcplists of the indicated processor and zone. 3111 * 3112 * The processor must either be the current processor and the 3113 * thread pinned to the current processor or a processor that 3114 * is not online. 3115 */ 3116 static void drain_pages_zone(unsigned int cpu, struct zone *zone) 3117 { 3118 unsigned long flags; 3119 struct per_cpu_pages *pcp; 3120 3121 local_lock_irqsave(&pagesets.lock, flags); 3122 3123 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 3124 if (pcp->count) 3125 free_pcppages_bulk(zone, pcp->count, pcp); 3126 3127 local_unlock_irqrestore(&pagesets.lock, flags); 3128 } 3129 3130 /* 3131 * Drain pcplists of all zones on the indicated processor. 3132 * 3133 * The processor must either be the current processor and the 3134 * thread pinned to the current processor or a processor that 3135 * is not online. 3136 */ 3137 static void drain_pages(unsigned int cpu) 3138 { 3139 struct zone *zone; 3140 3141 for_each_populated_zone(zone) { 3142 drain_pages_zone(cpu, zone); 3143 } 3144 } 3145 3146 /* 3147 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 3148 * 3149 * The CPU has to be pinned. When zone parameter is non-NULL, spill just 3150 * the single zone's pages. 3151 */ 3152 void drain_local_pages(struct zone *zone) 3153 { 3154 int cpu = smp_processor_id(); 3155 3156 if (zone) 3157 drain_pages_zone(cpu, zone); 3158 else 3159 drain_pages(cpu); 3160 } 3161 3162 static void drain_local_pages_wq(struct work_struct *work) 3163 { 3164 struct pcpu_drain *drain; 3165 3166 drain = container_of(work, struct pcpu_drain, work); 3167 3168 /* 3169 * drain_all_pages doesn't use proper cpu hotplug protection so 3170 * we can race with cpu offline when the WQ can move this from 3171 * a cpu pinned worker to an unbound one. We can operate on a different 3172 * cpu which is alright but we also have to make sure to not move to 3173 * a different one. 
3174 */ 3175 preempt_disable(); 3176 drain_local_pages(drain->zone); 3177 preempt_enable(); 3178 } 3179 3180 /* 3181 * The implementation of drain_all_pages(), exposing an extra parameter to 3182 * drain on all cpus. 3183 * 3184 * drain_all_pages() is optimized to only execute on cpus where pcplists are 3185 * not empty. The check for non-emptiness can however race with a free to 3186 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers 3187 * that need the guarantee that every CPU has drained can disable the 3188 * optimizing racy check. 3189 */ 3190 static void __drain_all_pages(struct zone *zone, bool force_all_cpus) 3191 { 3192 int cpu; 3193 3194 /* 3195 * Allocate in the BSS so we won't require allocation in 3196 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 3197 */ 3198 static cpumask_t cpus_with_pcps; 3199 3200 /* 3201 * Make sure nobody triggers this path before mm_percpu_wq is fully 3202 * initialized. 3203 */ 3204 if (WARN_ON_ONCE(!mm_percpu_wq)) 3205 return; 3206 3207 /* 3208 * Do not drain if one is already in progress unless it's specific to 3209 * a zone. Such callers are primarily CMA and memory hotplug and need 3210 * the drain to be complete when the call returns. 3211 */ 3212 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { 3213 if (!zone) 3214 return; 3215 mutex_lock(&pcpu_drain_mutex); 3216 } 3217 3218 /* 3219 * We don't care about racing with CPU hotplug event 3220 * as offline notification will cause the notified 3221 * cpu to drain that CPU pcps and on_each_cpu_mask 3222 * disables preemption as part of its processing 3223 */ 3224 for_each_online_cpu(cpu) { 3225 struct per_cpu_pages *pcp; 3226 struct zone *z; 3227 bool has_pcps = false; 3228 3229 if (force_all_cpus) { 3230 /* 3231 * The pcp.count check is racy, some callers need a 3232 * guarantee that no cpu is missed. 3233 */ 3234 has_pcps = true; 3235 } else if (zone) { 3236 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 3237 if (pcp->count) 3238 has_pcps = true; 3239 } else { 3240 for_each_populated_zone(z) { 3241 pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); 3242 if (pcp->count) { 3243 has_pcps = true; 3244 break; 3245 } 3246 } 3247 } 3248 3249 if (has_pcps) 3250 cpumask_set_cpu(cpu, &cpus_with_pcps); 3251 else 3252 cpumask_clear_cpu(cpu, &cpus_with_pcps); 3253 } 3254 3255 for_each_cpu(cpu, &cpus_with_pcps) { 3256 struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); 3257 3258 drain->zone = zone; 3259 INIT_WORK(&drain->work, drain_local_pages_wq); 3260 queue_work_on(cpu, mm_percpu_wq, &drain->work); 3261 } 3262 for_each_cpu(cpu, &cpus_with_pcps) 3263 flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); 3264 3265 mutex_unlock(&pcpu_drain_mutex); 3266 } 3267 3268 /* 3269 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 3270 * 3271 * When zone parameter is non-NULL, spill just the single zone's pages. 3272 * 3273 * Note that this can be extremely slow as the draining happens in a workqueue. 3274 */ 3275 void drain_all_pages(struct zone *zone) 3276 { 3277 __drain_all_pages(zone, false); 3278 } 3279 3280 #ifdef CONFIG_HIBERNATION 3281 3282 /* 3283 * Touch the watchdog for every WD_PAGE_COUNT pages. 
3284 */ 3285 #define WD_PAGE_COUNT (128*1024) 3286 3287 void mark_free_pages(struct zone *zone) 3288 { 3289 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; 3290 unsigned long flags; 3291 unsigned int order, t; 3292 struct page *page; 3293 3294 if (zone_is_empty(zone)) 3295 return; 3296 3297 spin_lock_irqsave(&zone->lock, flags); 3298 3299 max_zone_pfn = zone_end_pfn(zone); 3300 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 3301 if (pfn_valid(pfn)) { 3302 page = pfn_to_page(pfn); 3303 3304 if (!--page_count) { 3305 touch_nmi_watchdog(); 3306 page_count = WD_PAGE_COUNT; 3307 } 3308 3309 if (page_zone(page) != zone) 3310 continue; 3311 3312 if (!swsusp_page_is_forbidden(page)) 3313 swsusp_unset_page_free(page); 3314 } 3315 3316 for_each_migratetype_order(order, t) { 3317 list_for_each_entry(page, 3318 &zone->free_area[order].free_list[t], lru) { 3319 unsigned long i; 3320 3321 pfn = page_to_pfn(page); 3322 for (i = 0; i < (1UL << order); i++) { 3323 if (!--page_count) { 3324 touch_nmi_watchdog(); 3325 page_count = WD_PAGE_COUNT; 3326 } 3327 swsusp_set_page_free(pfn_to_page(pfn + i)); 3328 } 3329 } 3330 } 3331 spin_unlock_irqrestore(&zone->lock, flags); 3332 } 3333 #endif /* CONFIG_PM */ 3334 3335 static bool free_unref_page_prepare(struct page *page, unsigned long pfn, 3336 unsigned int order) 3337 { 3338 int migratetype; 3339 3340 if (!free_pcp_prepare(page, order)) 3341 return false; 3342 3343 migratetype = get_pfnblock_migratetype(page, pfn); 3344 set_pcppage_migratetype(page, migratetype); 3345 return true; 3346 } 3347 3348 static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) 3349 { 3350 int min_nr_free, max_nr_free; 3351 3352 /* Check for PCP disabled or boot pageset */ 3353 if (unlikely(high < batch)) 3354 return 1; 3355 3356 /* Leave at least pcp->batch pages on the list */ 3357 min_nr_free = batch; 3358 max_nr_free = high - batch; 3359 3360 /* 3361 * Double the number of pages freed each time there is subsequent 3362 * freeing of pages without any allocation. 
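 * e.g. with pcp->batch of 63, consecutive flushes free 63, 126, 252, ...
 * pages at a time, clamped between batch and (high - batch) below.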
3363 */ 3364 batch <<= pcp->free_factor; 3365 if (batch < max_nr_free) 3366 pcp->free_factor++; 3367 batch = clamp(batch, min_nr_free, max_nr_free); 3368 3369 return batch; 3370 } 3371 3372 static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) 3373 { 3374 int high = READ_ONCE(pcp->high); 3375 3376 if (unlikely(!high)) 3377 return 0; 3378 3379 if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) 3380 return high; 3381 3382 /* 3383 * If reclaim is active, limit the number of pages that can be 3384 * stored on pcp lists 3385 */ 3386 return min(READ_ONCE(pcp->batch) << 2, high); 3387 } 3388 3389 static void free_unref_page_commit(struct page *page, unsigned long pfn, 3390 int migratetype, unsigned int order) 3391 { 3392 struct zone *zone = page_zone(page); 3393 struct per_cpu_pages *pcp; 3394 int high; 3395 int pindex; 3396 3397 __count_vm_event(PGFREE); 3398 pcp = this_cpu_ptr(zone->per_cpu_pageset); 3399 pindex = order_to_pindex(migratetype, order); 3400 list_add(&page->lru, &pcp->lists[pindex]); 3401 pcp->count += 1 << order; 3402 high = nr_pcp_high(pcp, zone); 3403 if (pcp->count >= high) { 3404 int batch = READ_ONCE(pcp->batch); 3405 3406 free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp); 3407 } 3408 } 3409 3410 /* 3411 * Free a pcp page 3412 */ 3413 void free_unref_page(struct page *page, unsigned int order) 3414 { 3415 unsigned long flags; 3416 unsigned long pfn = page_to_pfn(page); 3417 int migratetype; 3418 3419 if (!free_unref_page_prepare(page, pfn, order)) 3420 return; 3421 3422 /* 3423 * We only track unmovable, reclaimable and movable on pcp lists. 3424 * Place ISOLATE pages on the isolated list because they are being 3425 * offlined but treat HIGHATOMIC as movable pages so we can get those 3426 * areas back if necessary. Otherwise, we may have to free 3427 * excessively into the page allocator 3428 */ 3429 migratetype = get_pcppage_migratetype(page); 3430 if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { 3431 if (unlikely(is_migrate_isolate(migratetype))) { 3432 free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); 3433 return; 3434 } 3435 migratetype = MIGRATE_MOVABLE; 3436 } 3437 3438 local_lock_irqsave(&pagesets.lock, flags); 3439 free_unref_page_commit(page, pfn, migratetype, order); 3440 local_unlock_irqrestore(&pagesets.lock, flags); 3441 } 3442 3443 /* 3444 * Free a list of 0-order pages 3445 */ 3446 void free_unref_page_list(struct list_head *list) 3447 { 3448 struct page *page, *next; 3449 unsigned long flags, pfn; 3450 int batch_count = 0; 3451 int migratetype; 3452 3453 /* Prepare pages for freeing */ 3454 list_for_each_entry_safe(page, next, list, lru) { 3455 pfn = page_to_pfn(page); 3456 if (!free_unref_page_prepare(page, pfn, 0)) 3457 list_del(&page->lru); 3458 3459 /* 3460 * Free isolated pages directly to the allocator, see 3461 * comment in free_unref_page. 3462 */ 3463 migratetype = get_pcppage_migratetype(page); 3464 if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { 3465 if (unlikely(is_migrate_isolate(migratetype))) { 3466 list_del(&page->lru); 3467 free_one_page(page_zone(page), page, pfn, 0, 3468 migratetype, FPI_NONE); 3469 continue; 3470 } 3471 3472 /* 3473 * Non-isolated types over MIGRATE_PCPTYPES get added 3474 * to the MIGRATE_MOVABLE pcp list. 
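 * In practice that means MIGRATE_HIGHATOMIC pages and, when CONFIG_CMA is
 * enabled, MIGRATE_CMA pages.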
3475 */ 3476 set_pcppage_migratetype(page, MIGRATE_MOVABLE); 3477 } 3478 3479 set_page_private(page, pfn); 3480 } 3481 3482 local_lock_irqsave(&pagesets.lock, flags); 3483 list_for_each_entry_safe(page, next, list, lru) { 3484 pfn = page_private(page); 3485 set_page_private(page, 0); 3486 migratetype = get_pcppage_migratetype(page); 3487 trace_mm_page_free_batched(page); 3488 free_unref_page_commit(page, pfn, migratetype, 0); 3489 3490 /* 3491 * Guard against excessive IRQ disabled times when we get 3492 * a large list of pages to free. 3493 */ 3494 if (++batch_count == SWAP_CLUSTER_MAX) { 3495 local_unlock_irqrestore(&pagesets.lock, flags); 3496 batch_count = 0; 3497 local_lock_irqsave(&pagesets.lock, flags); 3498 } 3499 } 3500 local_unlock_irqrestore(&pagesets.lock, flags); 3501 } 3502 3503 /* 3504 * split_page takes a non-compound higher-order page, and splits it into 3505 * n (1<<order) sub-pages: page[0..n] 3506 * Each sub-page must be freed individually. 3507 * 3508 * Note: this is probably too low level an operation for use in drivers. 3509 * Please consult with lkml before using this in your driver. 3510 */ 3511 void split_page(struct page *page, unsigned int order) 3512 { 3513 int i; 3514 3515 VM_BUG_ON_PAGE(PageCompound(page), page); 3516 VM_BUG_ON_PAGE(!page_count(page), page); 3517 3518 for (i = 1; i < (1 << order); i++) 3519 set_page_refcounted(page + i); 3520 split_page_owner(page, 1 << order); 3521 split_page_memcg(page, 1 << order); 3522 } 3523 EXPORT_SYMBOL_GPL(split_page); 3524 3525 int __isolate_free_page(struct page *page, unsigned int order) 3526 { 3527 unsigned long watermark; 3528 struct zone *zone; 3529 int mt; 3530 3531 BUG_ON(!PageBuddy(page)); 3532 3533 zone = page_zone(page); 3534 mt = get_pageblock_migratetype(page); 3535 3536 if (!is_migrate_isolate(mt)) { 3537 /* 3538 * Obey watermarks as if the page was being allocated. We can 3539 * emulate a high-order watermark check with a raised order-0 3540 * watermark, because we already know our high-order page 3541 * exists. 3542 */ 3543 watermark = zone->_watermark[WMARK_MIN] + (1UL << order); 3544 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 3545 return 0; 3546 3547 __mod_zone_freepage_state(zone, -(1UL << order), mt); 3548 } 3549 3550 /* Remove page from free list */ 3551 3552 del_page_from_free_list(page, zone, order); 3553 3554 /* 3555 * Set the pageblock if the isolated page is at least half of a 3556 * pageblock 3557 */ 3558 if (order >= pageblock_order - 1) { 3559 struct page *endpage = page + (1 << order) - 1; 3560 for (; page < endpage; page += pageblock_nr_pages) { 3561 int mt = get_pageblock_migratetype(page); 3562 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) 3563 && !is_migrate_highatomic(mt)) 3564 set_pageblock_migratetype(page, 3565 MIGRATE_MOVABLE); 3566 } 3567 } 3568 3569 3570 return 1UL << order; 3571 } 3572 3573 /** 3574 * __putback_isolated_page - Return a now-isolated page back where we got it 3575 * @page: Page that was isolated 3576 * @order: Order of the isolated page 3577 * @mt: The page's pageblock's migratetype 3578 * 3579 * This function is meant to return a page pulled from the free lists via 3580 * __isolate_free_page back to the free lists they were pulled from. 3581 */ 3582 void __putback_isolated_page(struct page *page, unsigned int order, int mt) 3583 { 3584 struct zone *zone = page_zone(page); 3585 3586 /* zone lock should be held when this function is called */ 3587 lockdep_assert_held(&zone->lock); 3588 3589 /* Return isolated page to tail of freelist. 
*/ 3590 __free_one_page(page, page_to_pfn(page), zone, order, mt, 3591 FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); 3592 } 3593 3594 /* 3595 * Update NUMA hit/miss statistics 3596 * 3597 * Must be called with interrupts disabled. 3598 */ 3599 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, 3600 long nr_account) 3601 { 3602 #ifdef CONFIG_NUMA 3603 enum numa_stat_item local_stat = NUMA_LOCAL; 3604 3605 /* skip numa counters update if numa stats is disabled */ 3606 if (!static_branch_likely(&vm_numa_stat_key)) 3607 return; 3608 3609 if (zone_to_nid(z) != numa_node_id()) 3610 local_stat = NUMA_OTHER; 3611 3612 if (zone_to_nid(z) == zone_to_nid(preferred_zone)) 3613 __count_numa_events(z, NUMA_HIT, nr_account); 3614 else { 3615 __count_numa_events(z, NUMA_MISS, nr_account); 3616 __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); 3617 } 3618 __count_numa_events(z, local_stat, nr_account); 3619 #endif 3620 } 3621 3622 /* Remove page from the per-cpu list, caller must protect the list */ 3623 static inline 3624 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, 3625 int migratetype, 3626 unsigned int alloc_flags, 3627 struct per_cpu_pages *pcp, 3628 struct list_head *list) 3629 { 3630 struct page *page; 3631 3632 do { 3633 if (list_empty(list)) { 3634 int batch = READ_ONCE(pcp->batch); 3635 int alloced; 3636 3637 /* 3638 * Scale batch relative to order if batch implies 3639 * free pages can be stored on the PCP. Batch can 3640 * be 1 for small zones or for boot pagesets which 3641 * should never store free pages as the pages may 3642 * belong to arbitrary zones. 3643 */ 3644 if (batch > 1) 3645 batch = max(batch >> order, 2); 3646 alloced = rmqueue_bulk(zone, order, 3647 batch, list, 3648 migratetype, alloc_flags); 3649 3650 pcp->count += alloced << order; 3651 if (unlikely(list_empty(list))) 3652 return NULL; 3653 } 3654 3655 page = list_first_entry(list, struct page, lru); 3656 list_del(&page->lru); 3657 pcp->count -= 1 << order; 3658 } while (check_new_pcp(page)); 3659 3660 return page; 3661 } 3662 3663 /* Lock and remove page from the per-cpu list */ 3664 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 3665 struct zone *zone, unsigned int order, 3666 gfp_t gfp_flags, int migratetype, 3667 unsigned int alloc_flags) 3668 { 3669 struct per_cpu_pages *pcp; 3670 struct list_head *list; 3671 struct page *page; 3672 unsigned long flags; 3673 3674 local_lock_irqsave(&pagesets.lock, flags); 3675 3676 /* 3677 * On allocation, reduce the number of pages that are batch freed. 3678 * See nr_pcp_free() where free_factor is increased for subsequent 3679 * frees. 3680 */ 3681 pcp = this_cpu_ptr(zone->per_cpu_pageset); 3682 pcp->free_factor >>= 1; 3683 list = &pcp->lists[order_to_pindex(migratetype, order)]; 3684 page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); 3685 local_unlock_irqrestore(&pagesets.lock, flags); 3686 if (page) { 3687 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); 3688 zone_statistics(preferred_zone, zone, 1); 3689 } 3690 return page; 3691 } 3692 3693 /* 3694 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 
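 * (pcp_allowed_order() actually admits all orders up to
 * PAGE_ALLOC_COSTLY_ORDER, plus the THP order when transparent hugepages
 * are enabled, so those allocations are served from the per-cpu lists too.)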
3695 */ 3696 static inline 3697 struct page *rmqueue(struct zone *preferred_zone, 3698 struct zone *zone, unsigned int order, 3699 gfp_t gfp_flags, unsigned int alloc_flags, 3700 int migratetype) 3701 { 3702 unsigned long flags; 3703 struct page *page; 3704 3705 if (likely(pcp_allowed_order(order))) { 3706 /* 3707 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and 3708 * we need to skip it when CMA area isn't allowed. 3709 */ 3710 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || 3711 migratetype != MIGRATE_MOVABLE) { 3712 page = rmqueue_pcplist(preferred_zone, zone, order, 3713 gfp_flags, migratetype, alloc_flags); 3714 goto out; 3715 } 3716 } 3717 3718 /* 3719 * We most definitely don't want callers attempting to 3720 * allocate greater than order-1 page units with __GFP_NOFAIL. 3721 */ 3722 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); 3723 spin_lock_irqsave(&zone->lock, flags); 3724 3725 do { 3726 page = NULL; 3727 /* 3728 * order-0 request can reach here when the pcplist is skipped 3729 * due to non-CMA allocation context. HIGHATOMIC area is 3730 * reserved for high-order atomic allocation, so order-0 3731 * request should skip it. 3732 */ 3733 if (order > 0 && alloc_flags & ALLOC_HARDER) { 3734 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 3735 if (page) 3736 trace_mm_page_alloc_zone_locked(page, order, migratetype); 3737 } 3738 if (!page) 3739 page = __rmqueue(zone, order, migratetype, alloc_flags); 3740 } while (page && check_new_pages(page, order)); 3741 if (!page) 3742 goto failed; 3743 3744 __mod_zone_freepage_state(zone, -(1 << order), 3745 get_pcppage_migratetype(page)); 3746 spin_unlock_irqrestore(&zone->lock, flags); 3747 3748 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 3749 zone_statistics(preferred_zone, zone, 1); 3750 3751 out: 3752 /* Separate test+clear to avoid unnecessary atomics */ 3753 if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { 3754 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 3755 wakeup_kswapd(zone, 0, 0, zone_idx(zone)); 3756 } 3757 3758 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 3759 return page; 3760 3761 failed: 3762 spin_unlock_irqrestore(&zone->lock, flags); 3763 return NULL; 3764 } 3765 3766 #ifdef CONFIG_FAIL_PAGE_ALLOC 3767 3768 static struct { 3769 struct fault_attr attr; 3770 3771 bool ignore_gfp_highmem; 3772 bool ignore_gfp_reclaim; 3773 u32 min_order; 3774 } fail_page_alloc = { 3775 .attr = FAULT_ATTR_INITIALIZER, 3776 .ignore_gfp_reclaim = true, 3777 .ignore_gfp_highmem = true, 3778 .min_order = 1, 3779 }; 3780 3781 static int __init setup_fail_page_alloc(char *str) 3782 { 3783 return setup_fault_attr(&fail_page_alloc.attr, str); 3784 } 3785 __setup("fail_page_alloc=", setup_fail_page_alloc); 3786 3787 static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3788 { 3789 if (order < fail_page_alloc.min_order) 3790 return false; 3791 if (gfp_mask & __GFP_NOFAIL) 3792 return false; 3793 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 3794 return false; 3795 if (fail_page_alloc.ignore_gfp_reclaim && 3796 (gfp_mask & __GFP_DIRECT_RECLAIM)) 3797 return false; 3798 3799 return should_fail(&fail_page_alloc.attr, 1 << order); 3800 } 3801 3802 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 3803 3804 static int __init fail_page_alloc_debugfs(void) 3805 { 3806 umode_t mode = S_IFREG | 0600; 3807 struct dentry *dir; 3808 3809 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 3810 &fail_page_alloc.attr); 3811 3812 
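	/*
	 * The entries below extend the generic fault_attr debugfs files.
	 * Hedged usage sketch (paths assume debugfs is mounted at
	 * /sys/kernel/debug, see Documentation/fault-injection/):
	 *
	 *   echo 10 > /sys/kernel/debug/fail_page_alloc/probability
	 *   echo -1 > /sys/kernel/debug/fail_page_alloc/times
	 *   echo 0  > /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait
	 */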
debugfs_create_bool("ignore-gfp-wait", mode, dir, 3813 &fail_page_alloc.ignore_gfp_reclaim); 3814 debugfs_create_bool("ignore-gfp-highmem", mode, dir, 3815 &fail_page_alloc.ignore_gfp_highmem); 3816 debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); 3817 3818 return 0; 3819 } 3820 3821 late_initcall(fail_page_alloc_debugfs); 3822 3823 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 3824 3825 #else /* CONFIG_FAIL_PAGE_ALLOC */ 3826 3827 static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3828 { 3829 return false; 3830 } 3831 3832 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 3833 3834 static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3835 { 3836 return __should_fail_alloc_page(gfp_mask, order); 3837 } 3838 ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); 3839 3840 static inline long __zone_watermark_unusable_free(struct zone *z, 3841 unsigned int order, unsigned int alloc_flags) 3842 { 3843 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); 3844 long unusable_free = (1 << order) - 1; 3845 3846 /* 3847 * If the caller does not have rights to ALLOC_HARDER then subtract 3848 * the high-atomic reserves. This will over-estimate the size of the 3849 * atomic reserve but it avoids a search. 3850 */ 3851 if (likely(!alloc_harder)) 3852 unusable_free += z->nr_reserved_highatomic; 3853 3854 #ifdef CONFIG_CMA 3855 /* If allocation can't use CMA areas don't use free CMA pages */ 3856 if (!(alloc_flags & ALLOC_CMA)) 3857 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); 3858 #endif 3859 3860 return unusable_free; 3861 } 3862 3863 /* 3864 * Return true if free base pages are above 'mark'. For high-order checks it 3865 * will return true of the order-0 watermark is reached and there is at least 3866 * one free page of a suitable size. Checking now avoids taking the zone lock 3867 * to check in the allocation paths if no pages are free. 3868 */ 3869 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3870 int highest_zoneidx, unsigned int alloc_flags, 3871 long free_pages) 3872 { 3873 long min = mark; 3874 int o; 3875 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); 3876 3877 /* free_pages may go negative - that's OK */ 3878 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); 3879 3880 if (alloc_flags & ALLOC_HIGH) 3881 min -= min / 2; 3882 3883 if (unlikely(alloc_harder)) { 3884 /* 3885 * OOM victims can try even harder than normal ALLOC_HARDER 3886 * users on the grounds that it's definitely going to be in 3887 * the exit path shortly and free memory. Any allocation it 3888 * makes during the free path will be small and short-lived. 3889 */ 3890 if (alloc_flags & ALLOC_OOM) 3891 min -= min / 2; 3892 else 3893 min -= min / 4; 3894 } 3895 3896 /* 3897 * Check watermarks for an order-0 allocation request. If these 3898 * are not met, then a high-order request also cannot go ahead 3899 * even if a suitable page happened to be free. 
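 *
 * Rough numeric sketch (illustrative numbers only, derived from the
 * adjustments above): with mark = 1024 pages, an ALLOC_HIGH request is
 * checked against min = 512, an ALLOC_HARDER one against 768, an
 * ALLOC_OOM one against 512, and a GFP_ATOMIC request (ALLOC_HIGH plus
 * ALLOC_HARDER) against 384; the request only passes if free_pages,
 * already reduced by the unusable free pages, exceeds that min plus
 * the zone's lowmem_reserve for the requested highest_zoneidx.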
3900 */ 3901 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) 3902 return false; 3903 3904 /* If this is an order-0 request then the watermark is fine */ 3905 if (!order) 3906 return true; 3907 3908 /* For a high-order request, check at least one suitable page is free */ 3909 for (o = order; o < MAX_ORDER; o++) { 3910 struct free_area *area = &z->free_area[o]; 3911 int mt; 3912 3913 if (!area->nr_free) 3914 continue; 3915 3916 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3917 if (!free_area_empty(area, mt)) 3918 return true; 3919 } 3920 3921 #ifdef CONFIG_CMA 3922 if ((alloc_flags & ALLOC_CMA) && 3923 !free_area_empty(area, MIGRATE_CMA)) { 3924 return true; 3925 } 3926 #endif 3927 if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) 3928 return true; 3929 } 3930 return false; 3931 } 3932 3933 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3934 int highest_zoneidx, unsigned int alloc_flags) 3935 { 3936 return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 3937 zone_page_state(z, NR_FREE_PAGES)); 3938 } 3939 3940 static inline bool zone_watermark_fast(struct zone *z, unsigned int order, 3941 unsigned long mark, int highest_zoneidx, 3942 unsigned int alloc_flags, gfp_t gfp_mask) 3943 { 3944 long free_pages; 3945 3946 free_pages = zone_page_state(z, NR_FREE_PAGES); 3947 3948 /* 3949 * Fast check for order-0 only. If this fails then the reserves 3950 * need to be calculated. 3951 */ 3952 if (!order) { 3953 long fast_free; 3954 3955 fast_free = free_pages; 3956 fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags); 3957 if (fast_free > mark + z->lowmem_reserve[highest_zoneidx]) 3958 return true; 3959 } 3960 3961 if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 3962 free_pages)) 3963 return true; 3964 /* 3965 * Ignore watermark boosting for GFP_ATOMIC order-0 allocations 3966 * when checking the min watermark. The min watermark is the 3967 * point where boosting is ignored so that kswapd is woken up 3968 * when below the low watermark. 3969 */ 3970 if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost 3971 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { 3972 mark = z->_watermark[WMARK_MIN]; 3973 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 3974 alloc_flags, free_pages); 3975 } 3976 3977 return false; 3978 } 3979 3980 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 3981 unsigned long mark, int highest_zoneidx) 3982 { 3983 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3984 3985 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 3986 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 3987 3988 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, 3989 free_pages); 3990 } 3991 3992 #ifdef CONFIG_NUMA 3993 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3994 { 3995 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3996 node_reclaim_distance; 3997 } 3998 #else /* CONFIG_NUMA */ 3999 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 4000 { 4001 return true; 4002 } 4003 #endif /* CONFIG_NUMA */ 4004 4005 /* 4006 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid 4007 * fragmentation is subtle. If the preferred zone was HIGHMEM then 4008 * premature use of a lower zone may cause lowmem pressure problems that 4009 * are worse than fragmentation. If the next zone is ZONE_DMA then it is 4010 * probably too small. 
It only makes sense to spread allocations to avoid 4011 * fragmentation between the Normal and DMA32 zones. 4012 */ 4013 static inline unsigned int 4014 alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) 4015 { 4016 unsigned int alloc_flags; 4017 4018 /* 4019 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 4020 * to save a branch. 4021 */ 4022 alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); 4023 4024 #ifdef CONFIG_ZONE_DMA32 4025 if (!zone) 4026 return alloc_flags; 4027 4028 if (zone_idx(zone) != ZONE_NORMAL) 4029 return alloc_flags; 4030 4031 /* 4032 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and 4033 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume 4034 * on UMA that if Normal is populated then so is DMA32. 4035 */ 4036 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); 4037 if (nr_online_nodes > 1 && !populated_zone(--zone)) 4038 return alloc_flags; 4039 4040 alloc_flags |= ALLOC_NOFRAGMENT; 4041 #endif /* CONFIG_ZONE_DMA32 */ 4042 return alloc_flags; 4043 } 4044 4045 /* Must be called after current_gfp_context() which can change gfp_mask */ 4046 static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, 4047 unsigned int alloc_flags) 4048 { 4049 #ifdef CONFIG_CMA 4050 if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) 4051 alloc_flags |= ALLOC_CMA; 4052 #endif 4053 return alloc_flags; 4054 } 4055 4056 /* 4057 * get_page_from_freelist goes through the zonelist trying to allocate 4058 * a page. 4059 */ 4060 static struct page * 4061 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 4062 const struct alloc_context *ac) 4063 { 4064 struct zoneref *z; 4065 struct zone *zone; 4066 struct pglist_data *last_pgdat_dirty_limit = NULL; 4067 bool no_fallback; 4068 4069 retry: 4070 /* 4071 * Scan zonelist, looking for a zone with enough free. 4072 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 4073 */ 4074 no_fallback = alloc_flags & ALLOC_NOFRAGMENT; 4075 z = ac->preferred_zoneref; 4076 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, 4077 ac->nodemask) { 4078 struct page *page; 4079 unsigned long mark; 4080 4081 if (cpusets_enabled() && 4082 (alloc_flags & ALLOC_CPUSET) && 4083 !__cpuset_zone_allowed(zone, gfp_mask)) 4084 continue; 4085 /* 4086 * When allocating a page cache page for writing, we 4087 * want to get it from a node that is within its dirty 4088 * limit, such that no single node holds more than its 4089 * proportional share of globally allowed dirty pages. 4090 * The dirty limits take into account the node's 4091 * lowmem reserves and high watermark so that kswapd 4092 * should be able to balance it without having to 4093 * write pages from its LRU list. 4094 * 4095 * XXX: For now, allow allocations to potentially 4096 * exceed the per-node dirty limit in the slowpath 4097 * (spread_dirty_pages unset) before going into reclaim, 4098 * which is important when on a NUMA setup the allowed 4099 * nodes are together not big enough to reach the 4100 * global limit. The proper fix for these situations 4101 * will require awareness of nodes in the 4102 * dirty-throttling and the flusher threads. 
4103 */ 4104 if (ac->spread_dirty_pages) { 4105 if (last_pgdat_dirty_limit == zone->zone_pgdat) 4106 continue; 4107 4108 if (!node_dirty_ok(zone->zone_pgdat)) { 4109 last_pgdat_dirty_limit = zone->zone_pgdat; 4110 continue; 4111 } 4112 } 4113 4114 if (no_fallback && nr_online_nodes > 1 && 4115 zone != ac->preferred_zoneref->zone) { 4116 int local_nid; 4117 4118 /* 4119 * If moving to a remote node, retry but allow 4120 * fragmenting fallbacks. Locality is more important 4121 * than fragmentation avoidance. 4122 */ 4123 local_nid = zone_to_nid(ac->preferred_zoneref->zone); 4124 if (zone_to_nid(zone) != local_nid) { 4125 alloc_flags &= ~ALLOC_NOFRAGMENT; 4126 goto retry; 4127 } 4128 } 4129 4130 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); 4131 if (!zone_watermark_fast(zone, order, mark, 4132 ac->highest_zoneidx, alloc_flags, 4133 gfp_mask)) { 4134 int ret; 4135 4136 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 4137 /* 4138 * Watermark failed for this zone, but see if we can 4139 * grow this zone if it contains deferred pages. 4140 */ 4141 if (static_branch_unlikely(&deferred_pages)) { 4142 if (_deferred_grow_zone(zone, order)) 4143 goto try_this_zone; 4144 } 4145 #endif 4146 /* Checked here to keep the fast path fast */ 4147 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 4148 if (alloc_flags & ALLOC_NO_WATERMARKS) 4149 goto try_this_zone; 4150 4151 if (!node_reclaim_enabled() || 4152 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) 4153 continue; 4154 4155 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); 4156 switch (ret) { 4157 case NODE_RECLAIM_NOSCAN: 4158 /* did not scan */ 4159 continue; 4160 case NODE_RECLAIM_FULL: 4161 /* scanned but unreclaimable */ 4162 continue; 4163 default: 4164 /* did we reclaim enough */ 4165 if (zone_watermark_ok(zone, order, mark, 4166 ac->highest_zoneidx, alloc_flags)) 4167 goto try_this_zone; 4168 4169 continue; 4170 } 4171 } 4172 4173 try_this_zone: 4174 page = rmqueue(ac->preferred_zoneref->zone, zone, order, 4175 gfp_mask, alloc_flags, ac->migratetype); 4176 if (page) { 4177 prep_new_page(page, order, gfp_mask, alloc_flags); 4178 4179 /* 4180 * If this is a high-order atomic allocation then check 4181 * if the pageblock should be reserved for the future 4182 */ 4183 if (unlikely(order && (alloc_flags & ALLOC_HARDER))) 4184 reserve_highatomic_pageblock(page, zone, order); 4185 4186 return page; 4187 } else { 4188 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 4189 /* Try again if zone has deferred pages */ 4190 if (static_branch_unlikely(&deferred_pages)) { 4191 if (_deferred_grow_zone(zone, order)) 4192 goto try_this_zone; 4193 } 4194 #endif 4195 } 4196 } 4197 4198 /* 4199 * It's possible on a UMA machine to get through all zones that are 4200 * fragmented. If avoiding fragmentation, reset and try again. 4201 */ 4202 if (no_fallback) { 4203 alloc_flags &= ~ALLOC_NOFRAGMENT; 4204 goto retry; 4205 } 4206 4207 return NULL; 4208 } 4209 4210 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 4211 { 4212 unsigned int filter = SHOW_MEM_FILTER_NODES; 4213 4214 /* 4215 * This documents exceptions given to allocations in certain 4216 * contexts that are allowed to allocate outside current's set 4217 * of allowed nodes. 
4218 */ 4219 if (!(gfp_mask & __GFP_NOMEMALLOC)) 4220 if (tsk_is_oom_victim(current) || 4221 (current->flags & (PF_MEMALLOC | PF_EXITING))) 4222 filter &= ~SHOW_MEM_FILTER_NODES; 4223 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 4224 filter &= ~SHOW_MEM_FILTER_NODES; 4225 4226 show_mem(filter, nodemask); 4227 } 4228 4229 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 4230 { 4231 struct va_format vaf; 4232 va_list args; 4233 static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); 4234 4235 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 4236 return; 4237 4238 va_start(args, fmt); 4239 vaf.fmt = fmt; 4240 vaf.va = &args; 4241 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", 4242 current->comm, &vaf, gfp_mask, &gfp_mask, 4243 nodemask_pr_args(nodemask)); 4244 va_end(args); 4245 4246 cpuset_print_current_mems_allowed(); 4247 pr_cont("\n"); 4248 dump_stack(); 4249 warn_alloc_show_mem(gfp_mask, nodemask); 4250 } 4251 4252 static inline struct page * 4253 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, 4254 unsigned int alloc_flags, 4255 const struct alloc_context *ac) 4256 { 4257 struct page *page; 4258 4259 page = get_page_from_freelist(gfp_mask, order, 4260 alloc_flags|ALLOC_CPUSET, ac); 4261 /* 4262 * fallback to ignore cpuset restriction if our nodes 4263 * are depleted 4264 */ 4265 if (!page) 4266 page = get_page_from_freelist(gfp_mask, order, 4267 alloc_flags, ac); 4268 4269 return page; 4270 } 4271 4272 static inline struct page * 4273 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 4274 const struct alloc_context *ac, unsigned long *did_some_progress) 4275 { 4276 struct oom_control oc = { 4277 .zonelist = ac->zonelist, 4278 .nodemask = ac->nodemask, 4279 .memcg = NULL, 4280 .gfp_mask = gfp_mask, 4281 .order = order, 4282 }; 4283 struct page *page; 4284 4285 *did_some_progress = 0; 4286 4287 /* 4288 * Acquire the oom lock. If that fails, somebody else is 4289 * making progress for us. 4290 */ 4291 if (!mutex_trylock(&oom_lock)) { 4292 *did_some_progress = 1; 4293 schedule_timeout_uninterruptible(1); 4294 return NULL; 4295 } 4296 4297 /* 4298 * Go through the zonelist yet one more time, keep very high watermark 4299 * here, this is only to catch a parallel oom killing, we must fail if 4300 * we're still under heavy pressure. But make sure that this reclaim 4301 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY 4302 * allocation which will never fail due to oom_lock already held. 4303 */ 4304 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & 4305 ~__GFP_DIRECT_RECLAIM, order, 4306 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 4307 if (page) 4308 goto out; 4309 4310 /* Coredumps can quickly deplete all memory reserves */ 4311 if (current->flags & PF_DUMPCORE) 4312 goto out; 4313 /* The OOM killer will not help higher order allocs */ 4314 if (order > PAGE_ALLOC_COSTLY_ORDER) 4315 goto out; 4316 /* 4317 * We have already exhausted all our reclaim opportunities without any 4318 * success so it is time to admit defeat. We will skip the OOM killer 4319 * because it is very likely that the caller has a more reasonable 4320 * fallback than shooting a random task. 4321 * 4322 * The OOM killer may not free memory on a specific node. 
4323 */ 4324 if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) 4325 goto out; 4326 /* The OOM killer does not needlessly kill tasks for lowmem */ 4327 if (ac->highest_zoneidx < ZONE_NORMAL) 4328 goto out; 4329 if (pm_suspended_storage()) 4330 goto out; 4331 /* 4332 * XXX: GFP_NOFS allocations should rather fail than rely on 4333 * other request to make a forward progress. 4334 * We are in an unfortunate situation where out_of_memory cannot 4335 * do much for this context but let's try it to at least get 4336 * access to memory reserved if the current task is killed (see 4337 * out_of_memory). Once filesystems are ready to handle allocation 4338 * failures more gracefully we should just bail out here. 4339 */ 4340 4341 /* Exhausted what can be done so it's blame time */ 4342 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { 4343 *did_some_progress = 1; 4344 4345 /* 4346 * Help non-failing allocations by giving them access to memory 4347 * reserves 4348 */ 4349 if (gfp_mask & __GFP_NOFAIL) 4350 page = __alloc_pages_cpuset_fallback(gfp_mask, order, 4351 ALLOC_NO_WATERMARKS, ac); 4352 } 4353 out: 4354 mutex_unlock(&oom_lock); 4355 return page; 4356 } 4357 4358 /* 4359 * Maximum number of compaction retries with a progress before OOM 4360 * killer is consider as the only way to move forward. 4361 */ 4362 #define MAX_COMPACT_RETRIES 16 4363 4364 #ifdef CONFIG_COMPACTION 4365 /* Try memory compaction for high-order allocations before reclaim */ 4366 static struct page * 4367 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 4368 unsigned int alloc_flags, const struct alloc_context *ac, 4369 enum compact_priority prio, enum compact_result *compact_result) 4370 { 4371 struct page *page = NULL; 4372 unsigned long pflags; 4373 unsigned int noreclaim_flag; 4374 4375 if (!order) 4376 return NULL; 4377 4378 psi_memstall_enter(&pflags); 4379 noreclaim_flag = memalloc_noreclaim_save(); 4380 4381 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 4382 prio, &page); 4383 4384 memalloc_noreclaim_restore(noreclaim_flag); 4385 psi_memstall_leave(&pflags); 4386 4387 if (*compact_result == COMPACT_SKIPPED) 4388 return NULL; 4389 /* 4390 * At least in one zone compaction wasn't deferred or skipped, so let's 4391 * count a compaction stall 4392 */ 4393 count_vm_event(COMPACTSTALL); 4394 4395 /* Prep a captured page if available */ 4396 if (page) 4397 prep_new_page(page, order, gfp_mask, alloc_flags); 4398 4399 /* Try get a page from the freelist if available */ 4400 if (!page) 4401 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4402 4403 if (page) { 4404 struct zone *zone = page_zone(page); 4405 4406 zone->compact_blockskip_flush = false; 4407 compaction_defer_reset(zone, order, true); 4408 count_vm_event(COMPACTSUCCESS); 4409 return page; 4410 } 4411 4412 /* 4413 * It's bad if compaction run occurs and fails. The most likely reason 4414 * is that pages exist, but not enough to satisfy watermarks. 
4415  */
4416 	count_vm_event(COMPACTFAIL);
4417 
4418 	cond_resched();
4419 
4420 	return NULL;
4421 }
4422 
4423 static inline bool
4424 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
4425 		     enum compact_result compact_result,
4426 		     enum compact_priority *compact_priority,
4427 		     int *compaction_retries)
4428 {
4429 	int max_retries = MAX_COMPACT_RETRIES;
4430 	int min_priority;
4431 	bool ret = false;
4432 	int retries = *compaction_retries;
4433 	enum compact_priority priority = *compact_priority;
4434 
4435 	if (!order)
4436 		return false;
4437 
4438 	if (fatal_signal_pending(current))
4439 		return false;
4440 
4441 	if (compaction_made_progress(compact_result))
4442 		(*compaction_retries)++;
4443 
4444 	/*
4445 	 * Compaction considers all the zones as desperately out of memory,
4446 	 * so it doesn't really make much sense to retry except when the
4447 	 * failure could be caused by insufficient priority.
4448 	 */
4449 	if (compaction_failed(compact_result))
4450 		goto check_priority;
4451 
4452 	/*
4453 	 * Compaction was skipped because there are not enough order-0 pages
4454 	 * to work with, so we retry only if it looks like reclaim can help.
4455 	 */
4456 	if (compaction_needs_reclaim(compact_result)) {
4457 		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
4458 		goto out;
4459 	}
4460 
4461 	/*
4462 	 * If compaction was withdrawn (deferred, or bailed out early due to
4463 	 * lock contention), don't declare defeat yet. The next retry should
4464 	 * use a higher priority if allowed, so we don't just keep bailing
4465 	 * out endlessly.
4466 	 */
4467 	if (compaction_withdrawn(compact_result)) {
4468 		goto check_priority;
4469 	}
4470 
4471 	/*
4472 	 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
4473 	 * costly ones, because they are de facto nofail and invoke the OOM
4474 	 * killer to move on, while costly requests can fail and their users
4475 	 * are prepared to cope with that. 1/4 of the retries is rather
4476 	 * arbitrary, but we would need much more detailed feedback from
4477 	 * compaction to make a better decision.
4478 	 */
4479 	if (order > PAGE_ALLOC_COSTLY_ORDER)
4480 		max_retries /= 4;
4481 	if (*compaction_retries <= max_retries) {
4482 		ret = true;
4483 		goto out;
4484 	}
4485 
4486 	/*
4487 	 * Make sure there are attempts at the highest priority if we exhausted
4488 	 * all retries or failed at the lower priorities.
4489 	 */
4490 check_priority:
4491 	min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
4492 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 4493 4494 if (*compact_priority > min_priority) { 4495 (*compact_priority)--; 4496 *compaction_retries = 0; 4497 ret = true; 4498 } 4499 out: 4500 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); 4501 return ret; 4502 } 4503 #else 4504 static inline struct page * 4505 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 4506 unsigned int alloc_flags, const struct alloc_context *ac, 4507 enum compact_priority prio, enum compact_result *compact_result) 4508 { 4509 *compact_result = COMPACT_SKIPPED; 4510 return NULL; 4511 } 4512 4513 static inline bool 4514 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 4515 enum compact_result compact_result, 4516 enum compact_priority *compact_priority, 4517 int *compaction_retries) 4518 { 4519 struct zone *zone; 4520 struct zoneref *z; 4521 4522 if (!order || order > PAGE_ALLOC_COSTLY_ORDER) 4523 return false; 4524 4525 /* 4526 * There are setups with compaction disabled which would prefer to loop 4527 * inside the allocator rather than hit the oom killer prematurely. 4528 * Let's give them a good hope and keep retrying while the order-0 4529 * watermarks are OK. 4530 */ 4531 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 4532 ac->highest_zoneidx, ac->nodemask) { 4533 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), 4534 ac->highest_zoneidx, alloc_flags)) 4535 return true; 4536 } 4537 return false; 4538 } 4539 #endif /* CONFIG_COMPACTION */ 4540 4541 #ifdef CONFIG_LOCKDEP 4542 static struct lockdep_map __fs_reclaim_map = 4543 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); 4544 4545 static bool __need_reclaim(gfp_t gfp_mask) 4546 { 4547 /* no reclaim without waiting on it */ 4548 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 4549 return false; 4550 4551 /* this guy won't enter reclaim */ 4552 if (current->flags & PF_MEMALLOC) 4553 return false; 4554 4555 if (gfp_mask & __GFP_NOLOCKDEP) 4556 return false; 4557 4558 return true; 4559 } 4560 4561 void __fs_reclaim_acquire(void) 4562 { 4563 lock_map_acquire(&__fs_reclaim_map); 4564 } 4565 4566 void __fs_reclaim_release(void) 4567 { 4568 lock_map_release(&__fs_reclaim_map); 4569 } 4570 4571 void fs_reclaim_acquire(gfp_t gfp_mask) 4572 { 4573 gfp_mask = current_gfp_context(gfp_mask); 4574 4575 if (__need_reclaim(gfp_mask)) { 4576 if (gfp_mask & __GFP_FS) 4577 __fs_reclaim_acquire(); 4578 4579 #ifdef CONFIG_MMU_NOTIFIER 4580 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); 4581 lock_map_release(&__mmu_notifier_invalidate_range_start_map); 4582 #endif 4583 4584 } 4585 } 4586 EXPORT_SYMBOL_GPL(fs_reclaim_acquire); 4587 4588 void fs_reclaim_release(gfp_t gfp_mask) 4589 { 4590 gfp_mask = current_gfp_context(gfp_mask); 4591 4592 if (__need_reclaim(gfp_mask)) { 4593 if (gfp_mask & __GFP_FS) 4594 __fs_reclaim_release(); 4595 } 4596 } 4597 EXPORT_SYMBOL_GPL(fs_reclaim_release); 4598 #endif 4599 4600 /* Perform direct synchronous page reclaim */ 4601 static unsigned long 4602 __perform_reclaim(gfp_t gfp_mask, unsigned int order, 4603 const struct alloc_context *ac) 4604 { 4605 unsigned int noreclaim_flag; 4606 unsigned long pflags, progress; 4607 4608 cond_resched(); 4609 4610 /* We now go into synchronous reclaim */ 4611 cpuset_memory_pressure_bump(); 4612 psi_memstall_enter(&pflags); 4613 fs_reclaim_acquire(gfp_mask); 4614 noreclaim_flag = memalloc_noreclaim_save(); 4615 4616 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, 4617 ac->nodemask); 
4618 4619 memalloc_noreclaim_restore(noreclaim_flag); 4620 fs_reclaim_release(gfp_mask); 4621 psi_memstall_leave(&pflags); 4622 4623 cond_resched(); 4624 4625 return progress; 4626 } 4627 4628 /* The really slow allocator path where we enter direct reclaim */ 4629 static inline struct page * 4630 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 4631 unsigned int alloc_flags, const struct alloc_context *ac, 4632 unsigned long *did_some_progress) 4633 { 4634 struct page *page = NULL; 4635 bool drained = false; 4636 4637 *did_some_progress = __perform_reclaim(gfp_mask, order, ac); 4638 if (unlikely(!(*did_some_progress))) 4639 return NULL; 4640 4641 retry: 4642 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4643 4644 /* 4645 * If an allocation failed after direct reclaim, it could be because 4646 * pages are pinned on the per-cpu lists or in high alloc reserves. 4647 * Shrink them and try again 4648 */ 4649 if (!page && !drained) { 4650 unreserve_highatomic_pageblock(ac, false); 4651 drain_all_pages(NULL); 4652 drained = true; 4653 goto retry; 4654 } 4655 4656 return page; 4657 } 4658 4659 static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, 4660 const struct alloc_context *ac) 4661 { 4662 struct zoneref *z; 4663 struct zone *zone; 4664 pg_data_t *last_pgdat = NULL; 4665 enum zone_type highest_zoneidx = ac->highest_zoneidx; 4666 4667 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, 4668 ac->nodemask) { 4669 if (last_pgdat != zone->zone_pgdat) 4670 wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); 4671 last_pgdat = zone->zone_pgdat; 4672 } 4673 } 4674 4675 static inline unsigned int 4676 gfp_to_alloc_flags(gfp_t gfp_mask) 4677 { 4678 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 4679 4680 /* 4681 * __GFP_HIGH is assumed to be the same as ALLOC_HIGH 4682 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 4683 * to save two branches. 4684 */ 4685 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 4686 BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); 4687 4688 /* 4689 * The caller may dip into page reserves a bit more if the caller 4690 * cannot run direct reclaim, or if the caller has realtime scheduling 4691 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 4692 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). 4693 */ 4694 alloc_flags |= (__force int) 4695 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); 4696 4697 if (gfp_mask & __GFP_ATOMIC) { 4698 /* 4699 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 4700 * if it can't schedule. 4701 */ 4702 if (!(gfp_mask & __GFP_NOMEMALLOC)) 4703 alloc_flags |= ALLOC_HARDER; 4704 /* 4705 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 4706 * comment for __cpuset_node_allowed(). 
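 *
 * Worked example (a sketch derived from the branches above and below):
 * a plain GFP_ATOMIC request, i.e. __GFP_HIGH | __GFP_ATOMIC |
 * __GFP_KSWAPD_RECLAIM, leaves this function with ALLOC_HIGH,
 * ALLOC_KSWAPD and ALLOC_HARDER set on top of the ALLOC_WMARK_MIN
 * default, and with ALLOC_CPUSET cleared.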
4707 */ 4708 alloc_flags &= ~ALLOC_CPUSET; 4709 } else if (unlikely(rt_task(current)) && !in_interrupt()) 4710 alloc_flags |= ALLOC_HARDER; 4711 4712 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); 4713 4714 return alloc_flags; 4715 } 4716 4717 static bool oom_reserves_allowed(struct task_struct *tsk) 4718 { 4719 if (!tsk_is_oom_victim(tsk)) 4720 return false; 4721 4722 /* 4723 * !MMU doesn't have oom reaper so give access to memory reserves 4724 * only to the thread with TIF_MEMDIE set 4725 */ 4726 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) 4727 return false; 4728 4729 return true; 4730 } 4731 4732 /* 4733 * Distinguish requests which really need access to full memory 4734 * reserves from oom victims which can live with a portion of it 4735 */ 4736 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) 4737 { 4738 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 4739 return 0; 4740 if (gfp_mask & __GFP_MEMALLOC) 4741 return ALLOC_NO_WATERMARKS; 4742 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 4743 return ALLOC_NO_WATERMARKS; 4744 if (!in_interrupt()) { 4745 if (current->flags & PF_MEMALLOC) 4746 return ALLOC_NO_WATERMARKS; 4747 else if (oom_reserves_allowed(current)) 4748 return ALLOC_OOM; 4749 } 4750 4751 return 0; 4752 } 4753 4754 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 4755 { 4756 return !!__gfp_pfmemalloc_flags(gfp_mask); 4757 } 4758 4759 /* 4760 * Checks whether it makes sense to retry the reclaim to make a forward progress 4761 * for the given allocation request. 4762 * 4763 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row 4764 * without success, or when we couldn't even meet the watermark if we 4765 * reclaimed all remaining pages on the LRU lists. 4766 * 4767 * Returns true if a retry is viable or false to enter the oom path. 4768 */ 4769 static inline bool 4770 should_reclaim_retry(gfp_t gfp_mask, unsigned order, 4771 struct alloc_context *ac, int alloc_flags, 4772 bool did_some_progress, int *no_progress_loops) 4773 { 4774 struct zone *zone; 4775 struct zoneref *z; 4776 bool ret = false; 4777 4778 /* 4779 * Costly allocations might have made a progress but this doesn't mean 4780 * their order will become available due to high fragmentation so 4781 * always increment the no progress counter for them 4782 */ 4783 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) 4784 *no_progress_loops = 0; 4785 else 4786 (*no_progress_loops)++; 4787 4788 /* 4789 * Make sure we converge to OOM if we cannot make any progress 4790 * several times in the row. 4791 */ 4792 if (*no_progress_loops > MAX_RECLAIM_RETRIES) { 4793 /* Before OOM, exhaust highatomic_reserve */ 4794 return unreserve_highatomic_pageblock(ac, true); 4795 } 4796 4797 /* 4798 * Keep reclaiming pages while there is a chance this will lead 4799 * somewhere. If none of the target zones can satisfy our allocation 4800 * request even if all reclaimable pages are considered then we are 4801 * screwed and have to go OOM. 4802 */ 4803 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 4804 ac->highest_zoneidx, ac->nodemask) { 4805 unsigned long available; 4806 unsigned long reclaimable; 4807 unsigned long min_wmark = min_wmark_pages(zone); 4808 bool wmark; 4809 4810 available = reclaimable = zone_reclaimable_pages(zone); 4811 available += zone_page_state_snapshot(zone, NR_FREE_PAGES); 4812 4813 /* 4814 * Would the allocation succeed if we reclaimed all 4815 * reclaimable pages? 
4816 */ 4817 wmark = __zone_watermark_ok(zone, order, min_wmark, 4818 ac->highest_zoneidx, alloc_flags, available); 4819 trace_reclaim_retry_zone(z, order, reclaimable, 4820 available, min_wmark, *no_progress_loops, wmark); 4821 if (wmark) { 4822 /* 4823 * If we didn't make any progress and have a lot of 4824 * dirty + writeback pages then we should wait for 4825 * an IO to complete to slow down the reclaim and 4826 * prevent from pre mature OOM 4827 */ 4828 if (!did_some_progress) { 4829 unsigned long write_pending; 4830 4831 write_pending = zone_page_state_snapshot(zone, 4832 NR_ZONE_WRITE_PENDING); 4833 4834 if (2 * write_pending > reclaimable) { 4835 congestion_wait(BLK_RW_ASYNC, HZ/10); 4836 return true; 4837 } 4838 } 4839 4840 ret = true; 4841 goto out; 4842 } 4843 } 4844 4845 out: 4846 /* 4847 * Memory allocation/reclaim might be called from a WQ context and the 4848 * current implementation of the WQ concurrency control doesn't 4849 * recognize that a particular WQ is congested if the worker thread is 4850 * looping without ever sleeping. Therefore we have to do a short sleep 4851 * here rather than calling cond_resched(). 4852 */ 4853 if (current->flags & PF_WQ_WORKER) 4854 schedule_timeout_uninterruptible(1); 4855 else 4856 cond_resched(); 4857 return ret; 4858 } 4859 4860 static inline bool 4861 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) 4862 { 4863 /* 4864 * It's possible that cpuset's mems_allowed and the nodemask from 4865 * mempolicy don't intersect. This should be normally dealt with by 4866 * policy_nodemask(), but it's possible to race with cpuset update in 4867 * such a way the check therein was true, and then it became false 4868 * before we got our cpuset_mems_cookie here. 4869 * This assumes that for all allocations, ac->nodemask can come only 4870 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored 4871 * when it does not intersect with the cpuset restrictions) or the 4872 * caller can deal with a violated nodemask. 4873 */ 4874 if (cpusets_enabled() && ac->nodemask && 4875 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { 4876 ac->nodemask = NULL; 4877 return true; 4878 } 4879 4880 /* 4881 * When updating a task's mems_allowed or mempolicy nodemask, it is 4882 * possible to race with parallel threads in such a way that our 4883 * allocation can fail while the mask is being updated. If we are about 4884 * to fail, check if the cpuset changed during allocation and if so, 4885 * retry. 4886 */ 4887 if (read_mems_allowed_retry(cpuset_mems_cookie)) 4888 return true; 4889 4890 return false; 4891 } 4892 4893 static inline struct page * 4894 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 4895 struct alloc_context *ac) 4896 { 4897 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; 4898 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; 4899 struct page *page = NULL; 4900 unsigned int alloc_flags; 4901 unsigned long did_some_progress; 4902 enum compact_priority compact_priority; 4903 enum compact_result compact_result; 4904 int compaction_retries; 4905 int no_progress_loops; 4906 unsigned int cpuset_mems_cookie; 4907 int reserve_flags; 4908 4909 /* 4910 * We also sanity check to catch abuse of atomic reserves being used by 4911 * callers that are not in atomic context. 
4912 */ 4913 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == 4914 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) 4915 gfp_mask &= ~__GFP_ATOMIC; 4916 4917 retry_cpuset: 4918 compaction_retries = 0; 4919 no_progress_loops = 0; 4920 compact_priority = DEF_COMPACT_PRIORITY; 4921 cpuset_mems_cookie = read_mems_allowed_begin(); 4922 4923 /* 4924 * The fast path uses conservative alloc_flags to succeed only until 4925 * kswapd needs to be woken up, and to avoid the cost of setting up 4926 * alloc_flags precisely. So we do that now. 4927 */ 4928 alloc_flags = gfp_to_alloc_flags(gfp_mask); 4929 4930 /* 4931 * We need to recalculate the starting point for the zonelist iterator 4932 * because we might have used different nodemask in the fast path, or 4933 * there was a cpuset modification and we are retrying - otherwise we 4934 * could end up iterating over non-eligible zones endlessly. 4935 */ 4936 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4937 ac->highest_zoneidx, ac->nodemask); 4938 if (!ac->preferred_zoneref->zone) 4939 goto nopage; 4940 4941 if (alloc_flags & ALLOC_KSWAPD) 4942 wake_all_kswapds(order, gfp_mask, ac); 4943 4944 /* 4945 * The adjusted alloc_flags might result in immediate success, so try 4946 * that first 4947 */ 4948 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4949 if (page) 4950 goto got_pg; 4951 4952 /* 4953 * For costly allocations, try direct compaction first, as it's likely 4954 * that we have enough base pages and don't need to reclaim. For non- 4955 * movable high-order allocations, do that as well, as compaction will 4956 * try prevent permanent fragmentation by migrating from blocks of the 4957 * same migratetype. 4958 * Don't try this for allocations that are allowed to ignore 4959 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 4960 */ 4961 if (can_direct_reclaim && 4962 (costly_order || 4963 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) 4964 && !gfp_pfmemalloc_allowed(gfp_mask)) { 4965 page = __alloc_pages_direct_compact(gfp_mask, order, 4966 alloc_flags, ac, 4967 INIT_COMPACT_PRIORITY, 4968 &compact_result); 4969 if (page) 4970 goto got_pg; 4971 4972 /* 4973 * Checks for costly allocations with __GFP_NORETRY, which 4974 * includes some THP page fault allocations 4975 */ 4976 if (costly_order && (gfp_mask & __GFP_NORETRY)) { 4977 /* 4978 * If allocating entire pageblock(s) and compaction 4979 * failed because all zones are below low watermarks 4980 * or is prohibited because it recently failed at this 4981 * order, fail immediately unless the allocator has 4982 * requested compaction and reclaim retry. 4983 * 4984 * Reclaim is 4985 * - potentially very expensive because zones are far 4986 * below their low watermarks or this is part of very 4987 * bursty high order allocations, 4988 * - not guaranteed to help because isolate_freepages() 4989 * may not iterate over freed pages as part of its 4990 * linear scan, and 4991 * - unlikely to make entire pageblocks free on its 4992 * own. 4993 */ 4994 if (compact_result == COMPACT_SKIPPED || 4995 compact_result == COMPACT_DEFERRED) 4996 goto nopage; 4997 4998 /* 4999 * Looks like reclaim/compaction is worth trying, but 5000 * sync compaction could be very expensive, so keep 5001 * using async compaction. 
5002 */ 5003 compact_priority = INIT_COMPACT_PRIORITY; 5004 } 5005 } 5006 5007 retry: 5008 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 5009 if (alloc_flags & ALLOC_KSWAPD) 5010 wake_all_kswapds(order, gfp_mask, ac); 5011 5012 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 5013 if (reserve_flags) 5014 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); 5015 5016 /* 5017 * Reset the nodemask and zonelist iterators if memory policies can be 5018 * ignored. These allocations are high priority and system rather than 5019 * user oriented. 5020 */ 5021 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { 5022 ac->nodemask = NULL; 5023 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 5024 ac->highest_zoneidx, ac->nodemask); 5025 } 5026 5027 /* Attempt with potentially adjusted zonelist and alloc_flags */ 5028 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 5029 if (page) 5030 goto got_pg; 5031 5032 /* Caller is not willing to reclaim, we can't balance anything */ 5033 if (!can_direct_reclaim) 5034 goto nopage; 5035 5036 /* Avoid recursion of direct reclaim */ 5037 if (current->flags & PF_MEMALLOC) 5038 goto nopage; 5039 5040 /* Try direct reclaim and then allocating */ 5041 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 5042 &did_some_progress); 5043 if (page) 5044 goto got_pg; 5045 5046 /* Try direct compaction and then allocating */ 5047 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 5048 compact_priority, &compact_result); 5049 if (page) 5050 goto got_pg; 5051 5052 /* Do not loop if specifically requested */ 5053 if (gfp_mask & __GFP_NORETRY) 5054 goto nopage; 5055 5056 /* 5057 * Do not retry costly high order allocations unless they are 5058 * __GFP_RETRY_MAYFAIL 5059 */ 5060 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) 5061 goto nopage; 5062 5063 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 5064 did_some_progress > 0, &no_progress_loops)) 5065 goto retry; 5066 5067 /* 5068 * It doesn't make any sense to retry for the compaction if the order-0 5069 * reclaim is not able to make any progress because the current 5070 * implementation of the compaction depends on the sufficient amount 5071 * of free memory (see __compaction_suitable) 5072 */ 5073 if (did_some_progress > 0 && 5074 should_compact_retry(ac, order, alloc_flags, 5075 compact_result, &compact_priority, 5076 &compaction_retries)) 5077 goto retry; 5078 5079 5080 /* Deal with possible cpuset update races before we start OOM killing */ 5081 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 5082 goto retry_cpuset; 5083 5084 /* Reclaim has failed us, start killing things */ 5085 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 5086 if (page) 5087 goto got_pg; 5088 5089 /* Avoid allocations with no watermarks from looping endlessly */ 5090 if (tsk_is_oom_victim(current) && 5091 (alloc_flags & ALLOC_OOM || 5092 (gfp_mask & __GFP_NOMEMALLOC))) 5093 goto nopage; 5094 5095 /* Retry as long as the OOM killer is making progress */ 5096 if (did_some_progress) { 5097 no_progress_loops = 0; 5098 goto retry; 5099 } 5100 5101 nopage: 5102 /* Deal with possible cpuset update races before we fail */ 5103 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 5104 goto retry_cpuset; 5105 5106 /* 5107 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure 5108 * we always retry 5109 */ 5110 if (gfp_mask & __GFP_NOFAIL) { 5111 /* 5112 * All existing users of the __GFP_NOFAIL are blockable, so 
warn
5113 		 * of any new users that actually require GFP_NOWAIT
5114 		 */
5115 		if (WARN_ON_ONCE(!can_direct_reclaim))
5116 			goto fail;
5117 
5118 		/*
5119 		 * A PF_MEMALLOC request from this context is rather bizarre,
5120 		 * because we cannot reclaim anything and can only loop waiting
5121 		 * for somebody to do the work for us.
5122 		 */
5123 		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
5124 
5125 		/*
5126 		 * Non-failing costly orders are a hard requirement which we
5127 		 * are not well prepared for, so let's warn about these users
5128 		 * so that we can identify them and convert them to something
5129 		 * else.
5130 		 */
5131 		WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
5132 
5133 		/*
5134 		 * Help non-failing allocations by giving them access to memory
5135 		 * reserves, but do not use ALLOC_NO_WATERMARKS because this
5136 		 * could deplete the whole memory reserves, which would just
5137 		 * make the situation worse.
5138 		 */
5139 		page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
5140 		if (page)
5141 			goto got_pg;
5142 
5143 		cond_resched();
5144 		goto retry;
5145 	}
5146 fail:
5147 	warn_alloc(gfp_mask, ac->nodemask,
5148 			"page allocation failure: order:%u", order);
5149 got_pg:
5150 	return page;
5151 }
5152 
5153 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
5154 		int preferred_nid, nodemask_t *nodemask,
5155 		struct alloc_context *ac, gfp_t *alloc_gfp,
5156 		unsigned int *alloc_flags)
5157 {
5158 	ac->highest_zoneidx = gfp_zone(gfp_mask);
5159 	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
5160 	ac->nodemask = nodemask;
5161 	ac->migratetype = gfp_migratetype(gfp_mask);
5162 
5163 	if (cpusets_enabled()) {
5164 		*alloc_gfp |= __GFP_HARDWALL;
5165 		/*
5166 		 * When we are in interrupt context, the cpuset of the current
5167 		 * task is irrelevant, which means that any node is OK.
5168 		 */
5169 		if (!in_interrupt() && !ac->nodemask)
5170 			ac->nodemask = &cpuset_current_mems_allowed;
5171 		else
5172 			*alloc_flags |= ALLOC_CPUSET;
5173 	}
5174 
5175 	fs_reclaim_acquire(gfp_mask);
5176 	fs_reclaim_release(gfp_mask);
5177 
5178 	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
5179 
5180 	if (should_fail_alloc_page(gfp_mask, order))
5181 		return false;
5182 
5183 	*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
5184 
5185 	/* Dirty zone balancing is only done in the fast path */
5186 	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
5187 
5188 	/*
5189 	 * The preferred zone is used for statistics, but crucially it is
5190 	 * also used as the starting point for the zonelist iterator. It
5191 	 * may get reset for allocations that ignore memory policies.
5192 	 */
5193 	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
5194 					ac->highest_zoneidx, ac->nodemask);
5195 
5196 	return true;
5197 }
5198 
5199 /*
5200  * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
5201  * @gfp: GFP flags for the allocation
5202  * @preferred_nid: The preferred NUMA node ID to allocate from
5203  * @nodemask: Set of nodes to allocate from, may be NULL
5204  * @nr_pages: The number of pages desired on the list or array
5205  * @page_list: Optional list to store the allocated pages
5206  * @page_array: Optional array to store the pages
5207  *
5208  * This is a batched version of the page allocator that attempts to
5209  * allocate nr_pages quickly. Pages are added to page_list if page_list
5210  * is not NULL, otherwise it is assumed that the page_array is valid.
5211  *
5212  * For lists, nr_pages is the number of pages that should be allocated.
5213 * 5214 * For arrays, only NULL elements are populated with pages and nr_pages 5215 * is the maximum number of pages that will be stored in the array. 5216 * 5217 * Returns the number of pages on the list or array. 5218 */ 5219 unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, 5220 nodemask_t *nodemask, int nr_pages, 5221 struct list_head *page_list, 5222 struct page **page_array) 5223 { 5224 struct page *page; 5225 unsigned long flags; 5226 struct zone *zone; 5227 struct zoneref *z; 5228 struct per_cpu_pages *pcp; 5229 struct list_head *pcp_list; 5230 struct alloc_context ac; 5231 gfp_t alloc_gfp; 5232 unsigned int alloc_flags = ALLOC_WMARK_LOW; 5233 int nr_populated = 0, nr_account = 0; 5234 5235 if (unlikely(nr_pages <= 0)) 5236 return 0; 5237 5238 /* 5239 * Skip populated array elements to determine if any pages need 5240 * to be allocated before disabling IRQs. 5241 */ 5242 while (page_array && nr_populated < nr_pages && page_array[nr_populated]) 5243 nr_populated++; 5244 5245 /* Already populated array? */ 5246 if (unlikely(page_array && nr_pages - nr_populated == 0)) 5247 return nr_populated; 5248 5249 /* Use the single page allocator for one page. */ 5250 if (nr_pages - nr_populated == 1) 5251 goto failed; 5252 5253 /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ 5254 gfp &= gfp_allowed_mask; 5255 alloc_gfp = gfp; 5256 if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) 5257 return 0; 5258 gfp = alloc_gfp; 5259 5260 /* Find an allowed local zone that meets the low watermark. */ 5261 for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { 5262 unsigned long mark; 5263 5264 if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && 5265 !__cpuset_zone_allowed(zone, gfp)) { 5266 continue; 5267 } 5268 5269 if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && 5270 zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { 5271 goto failed; 5272 } 5273 5274 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; 5275 if (zone_watermark_fast(zone, 0, mark, 5276 zonelist_zone_idx(ac.preferred_zoneref), 5277 alloc_flags, gfp)) { 5278 break; 5279 } 5280 } 5281 5282 /* 5283 * If there are no allowed local zones that meets the watermarks then 5284 * try to allocate a single page and reclaim if necessary. 
5285 */ 5286 if (unlikely(!zone)) 5287 goto failed; 5288 5289 /* Attempt the batch allocation */ 5290 local_lock_irqsave(&pagesets.lock, flags); 5291 pcp = this_cpu_ptr(zone->per_cpu_pageset); 5292 pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; 5293 5294 while (nr_populated < nr_pages) { 5295 5296 /* Skip existing pages */ 5297 if (page_array && page_array[nr_populated]) { 5298 nr_populated++; 5299 continue; 5300 } 5301 5302 page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, 5303 pcp, pcp_list); 5304 if (unlikely(!page)) { 5305 /* Try and get at least one page */ 5306 if (!nr_populated) 5307 goto failed_irq; 5308 break; 5309 } 5310 nr_account++; 5311 5312 prep_new_page(page, 0, gfp, 0); 5313 if (page_list) 5314 list_add(&page->lru, page_list); 5315 else 5316 page_array[nr_populated] = page; 5317 nr_populated++; 5318 } 5319 5320 local_unlock_irqrestore(&pagesets.lock, flags); 5321 5322 __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); 5323 zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); 5324 5325 return nr_populated; 5326 5327 failed_irq: 5328 local_unlock_irqrestore(&pagesets.lock, flags); 5329 5330 failed: 5331 page = __alloc_pages(gfp, 0, preferred_nid, nodemask); 5332 if (page) { 5333 if (page_list) 5334 list_add(&page->lru, page_list); 5335 else 5336 page_array[nr_populated] = page; 5337 nr_populated++; 5338 } 5339 5340 return nr_populated; 5341 } 5342 EXPORT_SYMBOL_GPL(__alloc_pages_bulk); 5343 5344 /* 5345 * This is the 'heart' of the zoned buddy allocator. 5346 */ 5347 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, 5348 nodemask_t *nodemask) 5349 { 5350 struct page *page; 5351 unsigned int alloc_flags = ALLOC_WMARK_LOW; 5352 gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ 5353 struct alloc_context ac = { }; 5354 5355 /* 5356 * There are several places where we assume that the order value is sane 5357 * so bail out early if the request is out of bound. 5358 */ 5359 if (unlikely(order >= MAX_ORDER)) { 5360 WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); 5361 return NULL; 5362 } 5363 5364 gfp &= gfp_allowed_mask; 5365 /* 5366 * Apply scoped allocation constraints. This is mainly about GFP_NOFS 5367 * resp. GFP_NOIO which has to be inherited for all allocation requests 5368 * from a particular context which has been marked by 5369 * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures 5370 * movable zones are not used during allocation. 5371 */ 5372 gfp = current_gfp_context(gfp); 5373 alloc_gfp = gfp; 5374 if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, 5375 &alloc_gfp, &alloc_flags)) 5376 return NULL; 5377 5378 /* 5379 * Forbid the first pass from falling back to types that fragment 5380 * memory until all local zones are considered. 5381 */ 5382 alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); 5383 5384 /* First allocation attempt */ 5385 page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); 5386 if (likely(page)) 5387 goto out; 5388 5389 alloc_gfp = gfp; 5390 ac.spread_dirty_pages = false; 5391 5392 /* 5393 * Restore the original nodemask if it was potentially replaced with 5394 * &cpuset_current_mems_allowed to optimize the fast-path attempt. 
5395 */ 5396 ac.nodemask = nodemask; 5397 5398 page = __alloc_pages_slowpath(alloc_gfp, order, &ac); 5399 5400 out: 5401 if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && 5402 unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { 5403 __free_pages(page, order); 5404 page = NULL; 5405 } 5406 5407 trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); 5408 5409 return page; 5410 } 5411 EXPORT_SYMBOL(__alloc_pages); 5412 5413 /* 5414 * Common helper functions. Never use with __GFP_HIGHMEM because the returned 5415 * address cannot represent highmem pages. Use alloc_pages and then kmap if 5416 * you need to access high mem. 5417 */ 5418 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 5419 { 5420 struct page *page; 5421 5422 page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); 5423 if (!page) 5424 return 0; 5425 return (unsigned long) page_address(page); 5426 } 5427 EXPORT_SYMBOL(__get_free_pages); 5428 5429 unsigned long get_zeroed_page(gfp_t gfp_mask) 5430 { 5431 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 5432 } 5433 EXPORT_SYMBOL(get_zeroed_page); 5434 5435 /** 5436 * __free_pages - Free pages allocated with alloc_pages(). 5437 * @page: The page pointer returned from alloc_pages(). 5438 * @order: The order of the allocation. 5439 * 5440 * This function can free multi-page allocations that are not compound 5441 * pages. It does not check that the @order passed in matches that of 5442 * the allocation, so it is easy to leak memory. Freeing more memory 5443 * than was allocated will probably emit a warning. 5444 * 5445 * If the last reference to this page is speculative, it will be released 5446 * by put_page() which only frees the first page of a non-compound 5447 * allocation. To prevent the remaining pages from being leaked, we free 5448 * the subsequent pages here. If you want to use the page's reference 5449 * count to decide when to free the allocation, you should allocate a 5450 * compound page, and use put_page() instead of __free_pages(). 5451 * 5452 * Context: May be called in interrupt context or while holding a normal 5453 * spinlock, but not in NMI context or while holding a raw spinlock. 5454 */ 5455 void __free_pages(struct page *page, unsigned int order) 5456 { 5457 if (put_page_testzero(page)) 5458 free_the_page(page, order); 5459 else if (!PageHead(page)) 5460 while (order-- > 0) 5461 free_the_page(page + (1 << order), order); 5462 } 5463 EXPORT_SYMBOL(__free_pages); 5464 5465 void free_pages(unsigned long addr, unsigned int order) 5466 { 5467 if (addr != 0) { 5468 VM_BUG_ON(!virt_addr_valid((void *)addr)); 5469 __free_pages(virt_to_page((void *)addr), order); 5470 } 5471 } 5472 5473 EXPORT_SYMBOL(free_pages); 5474 5475 /* 5476 * Page Fragment: 5477 * An arbitrary-length arbitrary-offset area of memory which resides 5478 * within a 0 or higher order page. Multiple fragments within that page 5479 * are individually refcounted, in the page's reference counter. 5480 * 5481 * The page_frag functions below provide a simple allocation framework for 5482 * page fragments. This is used by the network stack and network device 5483 * drivers to provide a backing region of memory for use as either an 5484 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. 
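 *
 * Hedged usage sketch (illustrative only, the real users live in the
 * networking code):
 *
 *	struct page_frag_cache nc = {};
 *	void *buf = page_frag_alloc(&nc, 256, GFP_ATOMIC);
 *
 *	if (buf) {
 *		... fill the 256 byte fragment ...
 *		page_frag_free(buf);
 *	}
 *
 * Fragments handed out from one cache share the backing page's reference
 * count, so every fragment must eventually be released with
 * page_frag_free() (or by an skb that took ownership of it).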
5485 */ 5486 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, 5487 gfp_t gfp_mask) 5488 { 5489 struct page *page = NULL; 5490 gfp_t gfp = gfp_mask; 5491 5492 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 5493 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 5494 __GFP_NOMEMALLOC; 5495 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 5496 PAGE_FRAG_CACHE_MAX_ORDER); 5497 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; 5498 #endif 5499 if (unlikely(!page)) 5500 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 5501 5502 nc->va = page ? page_address(page) : NULL; 5503 5504 return page; 5505 } 5506 5507 void __page_frag_cache_drain(struct page *page, unsigned int count) 5508 { 5509 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 5510 5511 if (page_ref_sub_and_test(page, count)) 5512 free_the_page(page, compound_order(page)); 5513 } 5514 EXPORT_SYMBOL(__page_frag_cache_drain); 5515 5516 void *page_frag_alloc_align(struct page_frag_cache *nc, 5517 unsigned int fragsz, gfp_t gfp_mask, 5518 unsigned int align_mask) 5519 { 5520 unsigned int size = PAGE_SIZE; 5521 struct page *page; 5522 int offset; 5523 5524 if (unlikely(!nc->va)) { 5525 refill: 5526 page = __page_frag_cache_refill(nc, gfp_mask); 5527 if (!page) 5528 return NULL; 5529 5530 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 5531 /* if size can vary use size else just use PAGE_SIZE */ 5532 size = nc->size; 5533 #endif 5534 /* Even if we own the page, we do not use atomic_set(). 5535 * This would break get_page_unless_zero() users. 5536 */ 5537 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); 5538 5539 /* reset page count bias and offset to start of new frag */ 5540 nc->pfmemalloc = page_is_pfmemalloc(page); 5541 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; 5542 nc->offset = size; 5543 } 5544 5545 offset = nc->offset - fragsz; 5546 if (unlikely(offset < 0)) { 5547 page = virt_to_page(nc->va); 5548 5549 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) 5550 goto refill; 5551 5552 if (unlikely(nc->pfmemalloc)) { 5553 free_the_page(page, compound_order(page)); 5554 goto refill; 5555 } 5556 5557 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 5558 /* if size can vary use size else just use PAGE_SIZE */ 5559 size = nc->size; 5560 #endif 5561 /* OK, page count is 0, we can safely set it */ 5562 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); 5563 5564 /* reset page count bias and offset to start of new frag */ 5565 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; 5566 offset = size - fragsz; 5567 } 5568 5569 nc->pagecnt_bias--; 5570 offset &= align_mask; 5571 nc->offset = offset; 5572 5573 return nc->va + offset; 5574 } 5575 EXPORT_SYMBOL(page_frag_alloc_align); 5576 5577 /* 5578 * Frees a page fragment allocated out of either a compound or order 0 page. 5579 */ 5580 void page_frag_free(void *addr) 5581 { 5582 struct page *page = virt_to_head_page(addr); 5583 5584 if (unlikely(put_page_testzero(page))) 5585 free_the_page(page, compound_order(page)); 5586 } 5587 EXPORT_SYMBOL(page_frag_free); 5588 5589 static void *make_alloc_exact(unsigned long addr, unsigned int order, 5590 size_t size) 5591 { 5592 if (addr) { 5593 unsigned long alloc_end = addr + (PAGE_SIZE << order); 5594 unsigned long used = addr + PAGE_ALIGN(size); 5595 5596 split_page(virt_to_page((void *)addr), order); 5597 while (used < alloc_end) { 5598 free_page(used); 5599 used += PAGE_SIZE; 5600 } 5601 } 5602 return (void *)addr; 5603 } 5604 5605 /** 5606 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 
5607 * @size: the number of bytes to allocate 5608 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 5609 * 5610 * This function is similar to alloc_pages(), except that it allocates the 5611 * minimum number of pages to satisfy the request. alloc_pages() can only 5612 * allocate memory in power-of-two pages. 5613 * 5614 * This function is also limited by MAX_ORDER. 5615 * 5616 * Memory allocated by this function must be released by free_pages_exact(). 5617 * 5618 * Return: pointer to the allocated area or %NULL in case of error. 5619 */ 5620 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 5621 { 5622 unsigned int order = get_order(size); 5623 unsigned long addr; 5624 5625 if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) 5626 gfp_mask &= ~__GFP_COMP; 5627 5628 addr = __get_free_pages(gfp_mask, order); 5629 return make_alloc_exact(addr, order, size); 5630 } 5631 EXPORT_SYMBOL(alloc_pages_exact); 5632 5633 /** 5634 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 5635 * pages on a node. 5636 * @nid: the preferred node ID where memory should be allocated 5637 * @size: the number of bytes to allocate 5638 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 5639 * 5640 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 5641 * back. 5642 * 5643 * Return: pointer to the allocated area or %NULL in case of error. 5644 */ 5645 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 5646 { 5647 unsigned int order = get_order(size); 5648 struct page *p; 5649 5650 if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) 5651 gfp_mask &= ~__GFP_COMP; 5652 5653 p = alloc_pages_node(nid, gfp_mask, order); 5654 if (!p) 5655 return NULL; 5656 return make_alloc_exact((unsigned long)page_address(p), order, size); 5657 } 5658 5659 /** 5660 * free_pages_exact - release memory allocated via alloc_pages_exact() 5661 * @virt: the value returned by alloc_pages_exact. 5662 * @size: size of allocation, same value as passed to alloc_pages_exact(). 5663 * 5664 * Release the memory allocated by a previous call to alloc_pages_exact. 5665 */ 5666 void free_pages_exact(void *virt, size_t size) 5667 { 5668 unsigned long addr = (unsigned long)virt; 5669 unsigned long end = addr + PAGE_ALIGN(size); 5670 5671 while (addr < end) { 5672 free_page(addr); 5673 addr += PAGE_SIZE; 5674 } 5675 } 5676 EXPORT_SYMBOL(free_pages_exact); 5677 5678 /** 5679 * nr_free_zone_pages - count number of pages beyond high watermark 5680 * @offset: The zone index of the highest zone 5681 * 5682 * nr_free_zone_pages() counts the number of pages which are beyond the 5683 * high watermark within all zones at or below a given zone index. For each 5684 * zone, the number of pages is calculated as: 5685 * 5686 * nr_free_zone_pages = managed_pages - high_pages 5687 * 5688 * Return: number of pages beyond high watermark. 
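 *
 * For illustration, with made-up numbers: a zone with 1,000,000 managed
 * pages and a high watermark of 4,000 pages contributes 996,000 pages to
 * the sum, while a zone whose high watermark is at or above its managed
 * page count contributes nothing.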
5689 */ 5690 static unsigned long nr_free_zone_pages(int offset) 5691 { 5692 struct zoneref *z; 5693 struct zone *zone; 5694 5695 /* Just pick one node, since fallback list is circular */ 5696 unsigned long sum = 0; 5697 5698 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 5699 5700 for_each_zone_zonelist(zone, z, zonelist, offset) { 5701 unsigned long size = zone_managed_pages(zone); 5702 unsigned long high = high_wmark_pages(zone); 5703 if (size > high) 5704 sum += size - high; 5705 } 5706 5707 return sum; 5708 } 5709 5710 /** 5711 * nr_free_buffer_pages - count number of pages beyond high watermark 5712 * 5713 * nr_free_buffer_pages() counts the number of pages which are beyond the high 5714 * watermark within ZONE_DMA and ZONE_NORMAL. 5715 * 5716 * Return: number of pages beyond high watermark within ZONE_DMA and 5717 * ZONE_NORMAL. 5718 */ 5719 unsigned long nr_free_buffer_pages(void) 5720 { 5721 return nr_free_zone_pages(gfp_zone(GFP_USER)); 5722 } 5723 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 5724 5725 static inline void show_node(struct zone *zone) 5726 { 5727 if (IS_ENABLED(CONFIG_NUMA)) 5728 printk("Node %d ", zone_to_nid(zone)); 5729 } 5730 5731 long si_mem_available(void) 5732 { 5733 long available; 5734 unsigned long pagecache; 5735 unsigned long wmark_low = 0; 5736 unsigned long pages[NR_LRU_LISTS]; 5737 unsigned long reclaimable; 5738 struct zone *zone; 5739 int lru; 5740 5741 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 5742 pages[lru] = global_node_page_state(NR_LRU_BASE + lru); 5743 5744 for_each_zone(zone) 5745 wmark_low += low_wmark_pages(zone); 5746 5747 /* 5748 * Estimate the amount of memory available for userspace allocations, 5749 * without causing swapping. 5750 */ 5751 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; 5752 5753 /* 5754 * Not all the page cache can be freed, otherwise the system will 5755 * start swapping. Assume at least half of the page cache, or the 5756 * low watermark worth of cache, needs to stay. 5757 */ 5758 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; 5759 pagecache -= min(pagecache / 2, wmark_low); 5760 available += pagecache; 5761 5762 /* 5763 * Part of the reclaimable slab and other kernel memory consists of 5764 * items that are in use, and cannot be freed. Cap this estimate at the 5765 * low watermark. 
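	 *
	 * For illustration, with made-up numbers: given 100,000 free pages,
	 * 20,000 reserved pages, 50,000 file LRU pages, 30,000 reclaimable
	 * kernel pages and a 10,000 page low watermark, the estimate is
	 * (100,000 - 20,000) + (50,000 - 10,000) + (30,000 - 10,000)
	 * = 140,000 pages.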
5766 */ 5767 reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + 5768 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); 5769 available += reclaimable - min(reclaimable / 2, wmark_low); 5770 5771 if (available < 0) 5772 available = 0; 5773 return available; 5774 } 5775 EXPORT_SYMBOL_GPL(si_mem_available); 5776 5777 void si_meminfo(struct sysinfo *val) 5778 { 5779 val->totalram = totalram_pages(); 5780 val->sharedram = global_node_page_state(NR_SHMEM); 5781 val->freeram = global_zone_page_state(NR_FREE_PAGES); 5782 val->bufferram = nr_blockdev_pages(); 5783 val->totalhigh = totalhigh_pages(); 5784 val->freehigh = nr_free_highpages(); 5785 val->mem_unit = PAGE_SIZE; 5786 } 5787 5788 EXPORT_SYMBOL(si_meminfo); 5789 5790 #ifdef CONFIG_NUMA 5791 void si_meminfo_node(struct sysinfo *val, int nid) 5792 { 5793 int zone_type; /* needs to be signed */ 5794 unsigned long managed_pages = 0; 5795 unsigned long managed_highpages = 0; 5796 unsigned long free_highpages = 0; 5797 pg_data_t *pgdat = NODE_DATA(nid); 5798 5799 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 5800 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); 5801 val->totalram = managed_pages; 5802 val->sharedram = node_page_state(pgdat, NR_SHMEM); 5803 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); 5804 #ifdef CONFIG_HIGHMEM 5805 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 5806 struct zone *zone = &pgdat->node_zones[zone_type]; 5807 5808 if (is_highmem(zone)) { 5809 managed_highpages += zone_managed_pages(zone); 5810 free_highpages += zone_page_state(zone, NR_FREE_PAGES); 5811 } 5812 } 5813 val->totalhigh = managed_highpages; 5814 val->freehigh = free_highpages; 5815 #else 5816 val->totalhigh = managed_highpages; 5817 val->freehigh = free_highpages; 5818 #endif 5819 val->mem_unit = PAGE_SIZE; 5820 } 5821 #endif 5822 5823 /* 5824 * Determine whether the node should be displayed or not, depending on whether 5825 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 5826 */ 5827 static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) 5828 { 5829 if (!(flags & SHOW_MEM_FILTER_NODES)) 5830 return false; 5831 5832 /* 5833 * no node mask - aka implicit memory numa policy. Do not bother with 5834 * the synchronization - read_mems_allowed_begin - because we do not 5835 * have to be precise here. 5836 */ 5837 if (!nodemask) 5838 nodemask = &cpuset_current_mems_allowed; 5839 5840 return !node_isset(nid, *nodemask); 5841 } 5842 5843 #define K(x) ((x) << (PAGE_SHIFT-10)) 5844 5845 static void show_migration_types(unsigned char type) 5846 { 5847 static const char types[MIGRATE_TYPES] = { 5848 [MIGRATE_UNMOVABLE] = 'U', 5849 [MIGRATE_MOVABLE] = 'M', 5850 [MIGRATE_RECLAIMABLE] = 'E', 5851 [MIGRATE_HIGHATOMIC] = 'H', 5852 #ifdef CONFIG_CMA 5853 [MIGRATE_CMA] = 'C', 5854 #endif 5855 #ifdef CONFIG_MEMORY_ISOLATION 5856 [MIGRATE_ISOLATE] = 'I', 5857 #endif 5858 }; 5859 char tmp[MIGRATE_TYPES + 1]; 5860 char *p = tmp; 5861 int i; 5862 5863 for (i = 0; i < MIGRATE_TYPES; i++) { 5864 if (type & (1 << i)) 5865 *p++ = types[i]; 5866 } 5867 5868 *p = '\0'; 5869 printk(KERN_CONT "(%s) ", tmp); 5870 } 5871 5872 /* 5873 * Show free area list (used inside shift_scroll-lock stuff) 5874 * We also calculate the percentage fragmentation. We do this by counting the 5875 * memory on each free list with the exception of the first item on the list. 5876 * 5877 * Bits in @filter: 5878 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's 5879 * cpuset. 
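 *
 * For illustration, the per-order output printed below looks like
 * "123*4kB (UME)", meaning 123 free 4kB blocks spread over the
 * unmovable, movable and reclaimable free lists; the letters are the
 * ones defined in show_migration_types() above.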
5880 */ 5881 void show_free_areas(unsigned int filter, nodemask_t *nodemask) 5882 { 5883 unsigned long free_pcp = 0; 5884 int cpu; 5885 struct zone *zone; 5886 pg_data_t *pgdat; 5887 5888 for_each_populated_zone(zone) { 5889 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 5890 continue; 5891 5892 for_each_online_cpu(cpu) 5893 free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; 5894 } 5895 5896 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 5897 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 5898 " unevictable:%lu dirty:%lu writeback:%lu\n" 5899 " slab_reclaimable:%lu slab_unreclaimable:%lu\n" 5900 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 5901 " free:%lu free_pcp:%lu free_cma:%lu\n", 5902 global_node_page_state(NR_ACTIVE_ANON), 5903 global_node_page_state(NR_INACTIVE_ANON), 5904 global_node_page_state(NR_ISOLATED_ANON), 5905 global_node_page_state(NR_ACTIVE_FILE), 5906 global_node_page_state(NR_INACTIVE_FILE), 5907 global_node_page_state(NR_ISOLATED_FILE), 5908 global_node_page_state(NR_UNEVICTABLE), 5909 global_node_page_state(NR_FILE_DIRTY), 5910 global_node_page_state(NR_WRITEBACK), 5911 global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), 5912 global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), 5913 global_node_page_state(NR_FILE_MAPPED), 5914 global_node_page_state(NR_SHMEM), 5915 global_node_page_state(NR_PAGETABLE), 5916 global_zone_page_state(NR_BOUNCE), 5917 global_zone_page_state(NR_FREE_PAGES), 5918 free_pcp, 5919 global_zone_page_state(NR_FREE_CMA_PAGES)); 5920 5921 for_each_online_pgdat(pgdat) { 5922 if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) 5923 continue; 5924 5925 printk("Node %d" 5926 " active_anon:%lukB" 5927 " inactive_anon:%lukB" 5928 " active_file:%lukB" 5929 " inactive_file:%lukB" 5930 " unevictable:%lukB" 5931 " isolated(anon):%lukB" 5932 " isolated(file):%lukB" 5933 " mapped:%lukB" 5934 " dirty:%lukB" 5935 " writeback:%lukB" 5936 " shmem:%lukB" 5937 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5938 " shmem_thp: %lukB" 5939 " shmem_pmdmapped: %lukB" 5940 " anon_thp: %lukB" 5941 #endif 5942 " writeback_tmp:%lukB" 5943 " kernel_stack:%lukB" 5944 #ifdef CONFIG_SHADOW_CALL_STACK 5945 " shadow_call_stack:%lukB" 5946 #endif 5947 " pagetables:%lukB" 5948 " all_unreclaimable? %s" 5949 "\n", 5950 pgdat->node_id, 5951 K(node_page_state(pgdat, NR_ACTIVE_ANON)), 5952 K(node_page_state(pgdat, NR_INACTIVE_ANON)), 5953 K(node_page_state(pgdat, NR_ACTIVE_FILE)), 5954 K(node_page_state(pgdat, NR_INACTIVE_FILE)), 5955 K(node_page_state(pgdat, NR_UNEVICTABLE)), 5956 K(node_page_state(pgdat, NR_ISOLATED_ANON)), 5957 K(node_page_state(pgdat, NR_ISOLATED_FILE)), 5958 K(node_page_state(pgdat, NR_FILE_MAPPED)), 5959 K(node_page_state(pgdat, NR_FILE_DIRTY)), 5960 K(node_page_state(pgdat, NR_WRITEBACK)), 5961 K(node_page_state(pgdat, NR_SHMEM)), 5962 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5963 K(node_page_state(pgdat, NR_SHMEM_THPS)), 5964 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), 5965 K(node_page_state(pgdat, NR_ANON_THPS)), 5966 #endif 5967 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), 5968 node_page_state(pgdat, NR_KERNEL_STACK_KB), 5969 #ifdef CONFIG_SHADOW_CALL_STACK 5970 node_page_state(pgdat, NR_KERNEL_SCS_KB), 5971 #endif 5972 K(node_page_state(pgdat, NR_PAGETABLE)), 5973 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 
5974 "yes" : "no"); 5975 } 5976 5977 for_each_populated_zone(zone) { 5978 int i; 5979 5980 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 5981 continue; 5982 5983 free_pcp = 0; 5984 for_each_online_cpu(cpu) 5985 free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; 5986 5987 show_node(zone); 5988 printk(KERN_CONT 5989 "%s" 5990 " free:%lukB" 5991 " min:%lukB" 5992 " low:%lukB" 5993 " high:%lukB" 5994 " reserved_highatomic:%luKB" 5995 " active_anon:%lukB" 5996 " inactive_anon:%lukB" 5997 " active_file:%lukB" 5998 " inactive_file:%lukB" 5999 " unevictable:%lukB" 6000 " writepending:%lukB" 6001 " present:%lukB" 6002 " managed:%lukB" 6003 " mlocked:%lukB" 6004 " bounce:%lukB" 6005 " free_pcp:%lukB" 6006 " local_pcp:%ukB" 6007 " free_cma:%lukB" 6008 "\n", 6009 zone->name, 6010 K(zone_page_state(zone, NR_FREE_PAGES)), 6011 K(min_wmark_pages(zone)), 6012 K(low_wmark_pages(zone)), 6013 K(high_wmark_pages(zone)), 6014 K(zone->nr_reserved_highatomic), 6015 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), 6016 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), 6017 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), 6018 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), 6019 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), 6020 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), 6021 K(zone->present_pages), 6022 K(zone_managed_pages(zone)), 6023 K(zone_page_state(zone, NR_MLOCK)), 6024 K(zone_page_state(zone, NR_BOUNCE)), 6025 K(free_pcp), 6026 K(this_cpu_read(zone->per_cpu_pageset->count)), 6027 K(zone_page_state(zone, NR_FREE_CMA_PAGES))); 6028 printk("lowmem_reserve[]:"); 6029 for (i = 0; i < MAX_NR_ZONES; i++) 6030 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); 6031 printk(KERN_CONT "\n"); 6032 } 6033 6034 for_each_populated_zone(zone) { 6035 unsigned int order; 6036 unsigned long nr[MAX_ORDER], flags, total = 0; 6037 unsigned char types[MAX_ORDER]; 6038 6039 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 6040 continue; 6041 show_node(zone); 6042 printk(KERN_CONT "%s: ", zone->name); 6043 6044 spin_lock_irqsave(&zone->lock, flags); 6045 for (order = 0; order < MAX_ORDER; order++) { 6046 struct free_area *area = &zone->free_area[order]; 6047 int type; 6048 6049 nr[order] = area->nr_free; 6050 total += nr[order] << order; 6051 6052 types[order] = 0; 6053 for (type = 0; type < MIGRATE_TYPES; type++) { 6054 if (!free_area_empty(area, type)) 6055 types[order] |= 1 << type; 6056 } 6057 } 6058 spin_unlock_irqrestore(&zone->lock, flags); 6059 for (order = 0; order < MAX_ORDER; order++) { 6060 printk(KERN_CONT "%lu*%lukB ", 6061 nr[order], K(1UL) << order); 6062 if (nr[order]) 6063 show_migration_types(types[order]); 6064 } 6065 printk(KERN_CONT "= %lukB\n", K(total)); 6066 } 6067 6068 hugetlb_show_meminfo(); 6069 6070 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); 6071 6072 show_swap_cache_info(); 6073 } 6074 6075 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 6076 { 6077 zoneref->zone = zone; 6078 zoneref->zone_idx = zone_idx(zone); 6079 } 6080 6081 /* 6082 * Builds allocation fallback zone lists. 6083 * 6084 * Add all populated zones of a node to the zonelist. 
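 *
 * Zones are added highest first, so, for illustration, a node with
 * managed Normal and DMA32 zones ends up with the zoneref order
 * Normal, DMA32.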
6085 */ 6086 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) 6087 { 6088 struct zone *zone; 6089 enum zone_type zone_type = MAX_NR_ZONES; 6090 int nr_zones = 0; 6091 6092 do { 6093 zone_type--; 6094 zone = pgdat->node_zones + zone_type; 6095 if (managed_zone(zone)) { 6096 zoneref_set_zone(zone, &zonerefs[nr_zones++]); 6097 check_highest_zone(zone_type); 6098 } 6099 } while (zone_type); 6100 6101 return nr_zones; 6102 } 6103 6104 #ifdef CONFIG_NUMA 6105 6106 static int __parse_numa_zonelist_order(char *s) 6107 { 6108 /* 6109 * We used to support different zonelists modes but they turned 6110 * out to be just not useful. Let's keep the warning in place 6111 * if somebody still use the cmd line parameter so that we do 6112 * not fail it silently 6113 */ 6114 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { 6115 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); 6116 return -EINVAL; 6117 } 6118 return 0; 6119 } 6120 6121 char numa_zonelist_order[] = "Node"; 6122 6123 /* 6124 * sysctl handler for numa_zonelist_order 6125 */ 6126 int numa_zonelist_order_handler(struct ctl_table *table, int write, 6127 void *buffer, size_t *length, loff_t *ppos) 6128 { 6129 if (write) 6130 return __parse_numa_zonelist_order(buffer); 6131 return proc_dostring(table, write, buffer, length, ppos); 6132 } 6133 6134 6135 #define MAX_NODE_LOAD (nr_online_nodes) 6136 static int node_load[MAX_NUMNODES]; 6137 6138 /** 6139 * find_next_best_node - find the next node that should appear in a given node's fallback list 6140 * @node: node whose fallback list we're appending 6141 * @used_node_mask: nodemask_t of already used nodes 6142 * 6143 * We use a number of factors to determine which is the next node that should 6144 * appear on a given node's fallback list. The node should not have appeared 6145 * already in @node's fallback list, and it should be the next closest node 6146 * according to the distance array (which contains arbitrary distance values 6147 * from each node to each node in the system), and should also prefer nodes 6148 * with no CPUs, since presumably they'll have very little allocation pressure 6149 * on them otherwise. 6150 * 6151 * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 6152 */ 6153 static int find_next_best_node(int node, nodemask_t *used_node_mask) 6154 { 6155 int n, val; 6156 int min_val = INT_MAX; 6157 int best_node = NUMA_NO_NODE; 6158 6159 /* Use the local node if we haven't already */ 6160 if (!node_isset(node, *used_node_mask)) { 6161 node_set(node, *used_node_mask); 6162 return node; 6163 } 6164 6165 for_each_node_state(n, N_MEMORY) { 6166 6167 /* Don't want a node to appear more than once */ 6168 if (node_isset(n, *used_node_mask)) 6169 continue; 6170 6171 /* Use the distance array to find the distance */ 6172 val = node_distance(node, n); 6173 6174 /* Penalize nodes under us ("prefer the next node") */ 6175 val += (n < node); 6176 6177 /* Give preference to headless and unused nodes */ 6178 if (!cpumask_empty(cpumask_of_node(n))) 6179 val += PENALTY_FOR_NODE_WITH_CPUS; 6180 6181 /* Slight preference for less loaded node */ 6182 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 6183 val += node_load[n]; 6184 6185 if (val < min_val) { 6186 min_val = val; 6187 best_node = n; 6188 } 6189 } 6190 6191 if (best_node >= 0) 6192 node_set(best_node, *used_node_mask); 6193 6194 return best_node; 6195 } 6196 6197 6198 /* 6199 * Build zonelists ordered by node and zones within node. 
6200 * This results in maximum locality--normal zone overflows into local 6201 * DMA zone, if any--but risks exhausting DMA zone. 6202 */ 6203 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, 6204 unsigned nr_nodes) 6205 { 6206 struct zoneref *zonerefs; 6207 int i; 6208 6209 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 6210 6211 for (i = 0; i < nr_nodes; i++) { 6212 int nr_zones; 6213 6214 pg_data_t *node = NODE_DATA(node_order[i]); 6215 6216 nr_zones = build_zonerefs_node(node, zonerefs); 6217 zonerefs += nr_zones; 6218 } 6219 zonerefs->zone = NULL; 6220 zonerefs->zone_idx = 0; 6221 } 6222 6223 /* 6224 * Build gfp_thisnode zonelists 6225 */ 6226 static void build_thisnode_zonelists(pg_data_t *pgdat) 6227 { 6228 struct zoneref *zonerefs; 6229 int nr_zones; 6230 6231 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; 6232 nr_zones = build_zonerefs_node(pgdat, zonerefs); 6233 zonerefs += nr_zones; 6234 zonerefs->zone = NULL; 6235 zonerefs->zone_idx = 0; 6236 } 6237 6238 /* 6239 * Build zonelists ordered by zone and nodes within zones. 6240 * This results in conserving DMA zone[s] until all Normal memory is 6241 * exhausted, but results in overflowing to remote node while memory 6242 * may still exist in local DMA zone. 6243 */ 6244 6245 static void build_zonelists(pg_data_t *pgdat) 6246 { 6247 static int node_order[MAX_NUMNODES]; 6248 int node, load, nr_nodes = 0; 6249 nodemask_t used_mask = NODE_MASK_NONE; 6250 int local_node, prev_node; 6251 6252 /* NUMA-aware ordering of nodes */ 6253 local_node = pgdat->node_id; 6254 load = nr_online_nodes; 6255 prev_node = local_node; 6256 6257 memset(node_order, 0, sizeof(node_order)); 6258 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 6259 /* 6260 * We don't want to pressure a particular node. 6261 * So adding penalty to the first node in same 6262 * distance group to make it round-robin. 6263 */ 6264 if (node_distance(local_node, node) != 6265 node_distance(local_node, prev_node)) 6266 node_load[node] = load; 6267 6268 node_order[nr_nodes++] = node; 6269 prev_node = node; 6270 load--; 6271 } 6272 6273 build_zonelists_in_node_order(pgdat, node_order, nr_nodes); 6274 build_thisnode_zonelists(pgdat); 6275 } 6276 6277 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 6278 /* 6279 * Return node id of node used for "local" allocations. 6280 * I.e., first node id of first zone in arg node's generic zonelist. 6281 * Used for initializing percpu 'numa_mem', which is used primarily 6282 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 6283 */ 6284 int local_memory_node(int node) 6285 { 6286 struct zoneref *z; 6287 6288 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 6289 gfp_zone(GFP_KERNEL), 6290 NULL); 6291 return zone_to_nid(z->zone); 6292 } 6293 #endif 6294 6295 static void setup_min_unmapped_ratio(void); 6296 static void setup_min_slab_ratio(void); 6297 #else /* CONFIG_NUMA */ 6298 6299 static void build_zonelists(pg_data_t *pgdat) 6300 { 6301 int node, local_node; 6302 struct zoneref *zonerefs; 6303 int nr_zones; 6304 6305 local_node = pgdat->node_id; 6306 6307 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 6308 nr_zones = build_zonerefs_node(pgdat, zonerefs); 6309 zonerefs += nr_zones; 6310 6311 /* 6312 * Now we build the zonelist so that it contains the zones 6313 * of all the other nodes. 
6314 * We don't want to pressure a particular node, so when 6315 * building the zones for node N, we make sure that the 6316 * zones coming right after the local ones are those from 6317 * node N+1 (modulo N) 6318 */ 6319 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 6320 if (!node_online(node)) 6321 continue; 6322 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 6323 zonerefs += nr_zones; 6324 } 6325 for (node = 0; node < local_node; node++) { 6326 if (!node_online(node)) 6327 continue; 6328 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 6329 zonerefs += nr_zones; 6330 } 6331 6332 zonerefs->zone = NULL; 6333 zonerefs->zone_idx = 0; 6334 } 6335 6336 #endif /* CONFIG_NUMA */ 6337 6338 /* 6339 * Boot pageset table. One per cpu which is going to be used for all 6340 * zones and all nodes. The parameters will be set in such a way 6341 * that an item put on a list will immediately be handed over to 6342 * the buddy list. This is safe since pageset manipulation is done 6343 * with interrupts disabled. 6344 * 6345 * The boot_pagesets must be kept even after bootup is complete for 6346 * unused processors and/or zones. They do play a role for bootstrapping 6347 * hotplugged processors. 6348 * 6349 * zoneinfo_show() and maybe other functions do 6350 * not check if the processor is online before following the pageset pointer. 6351 * Other parts of the kernel may not check if the zone is available. 6352 */ 6353 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); 6354 /* These effectively disable the pcplists in the boot pageset completely */ 6355 #define BOOT_PAGESET_HIGH 0 6356 #define BOOT_PAGESET_BATCH 1 6357 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); 6358 static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); 6359 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); 6360 6361 static void __build_all_zonelists(void *data) 6362 { 6363 int nid; 6364 int __maybe_unused cpu; 6365 pg_data_t *self = data; 6366 static DEFINE_SPINLOCK(lock); 6367 6368 spin_lock(&lock); 6369 6370 #ifdef CONFIG_NUMA 6371 memset(node_load, 0, sizeof(node_load)); 6372 #endif 6373 6374 /* 6375 * This node is hotadded and no memory is yet present. So just 6376 * building zonelists is fine - no need to touch other nodes. 6377 */ 6378 if (self && !node_online(self->node_id)) { 6379 build_zonelists(self); 6380 } else { 6381 for_each_online_node(nid) { 6382 pg_data_t *pgdat = NODE_DATA(nid); 6383 6384 build_zonelists(pgdat); 6385 } 6386 6387 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 6388 /* 6389 * We now know the "local memory node" for each node-- 6390 * i.e., the node of the first zone in the generic zonelist. 6391 * Set up numa_mem percpu variable for on-line cpus. During 6392 * boot, only the boot cpu should be on-line; we'll init the 6393 * secondary cpus' numa_mem as they come on-line. During 6394 * node/memory hotplug, we'll fixup all on-line cpus. 6395 */ 6396 for_each_online_cpu(cpu) 6397 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 6398 #endif 6399 } 6400 6401 spin_unlock(&lock); 6402 } 6403 6404 static noinline void __init 6405 build_all_zonelists_init(void) 6406 { 6407 int cpu; 6408 6409 __build_all_zonelists(NULL); 6410 6411 /* 6412 * Initialize the boot_pagesets that are going to be used 6413 * for bootstrapping processors. The real pagesets for 6414 * each zone will be allocated later when the per cpu 6415 * allocator is available. 
6416 * 6417 * boot_pagesets are used also for bootstrapping offline 6418 * cpus if the system is already booted because the pagesets 6419 * are needed to initialize allocators on a specific cpu too. 6420 * F.e. the percpu allocator needs the page allocator which 6421 * needs the percpu allocator in order to allocate its pagesets 6422 * (a chicken-egg dilemma). 6423 */ 6424 for_each_possible_cpu(cpu) 6425 per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu)); 6426 6427 mminit_verify_zonelist(); 6428 cpuset_init_current_mems_allowed(); 6429 } 6430 6431 /* 6432 * unless system_state == SYSTEM_BOOTING. 6433 * 6434 * __ref due to call of __init annotated helper build_all_zonelists_init 6435 * [protected by SYSTEM_BOOTING]. 6436 */ 6437 void __ref build_all_zonelists(pg_data_t *pgdat) 6438 { 6439 unsigned long vm_total_pages; 6440 6441 if (system_state == SYSTEM_BOOTING) { 6442 build_all_zonelists_init(); 6443 } else { 6444 __build_all_zonelists(pgdat); 6445 /* cpuset refresh routine should be here */ 6446 } 6447 /* Get the number of free pages beyond high watermark in all zones. */ 6448 vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 6449 /* 6450 * Disable grouping by mobility if the number of pages in the 6451 * system is too low to allow the mechanism to work. It would be 6452 * more accurate, but expensive to check per-zone. This check is 6453 * made on memory-hotadd so a system can start with mobility 6454 * disabled and enable it later 6455 */ 6456 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 6457 page_group_by_mobility_disabled = 1; 6458 else 6459 page_group_by_mobility_disabled = 0; 6460 6461 pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", 6462 nr_online_nodes, 6463 page_group_by_mobility_disabled ? "off" : "on", 6464 vm_total_pages); 6465 #ifdef CONFIG_NUMA 6466 pr_info("Policy zone: %s\n", zone_names[policy_zone]); 6467 #endif 6468 } 6469 6470 /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ 6471 static bool __meminit 6472 overlap_memmap_init(unsigned long zone, unsigned long *pfn) 6473 { 6474 static struct memblock_region *r; 6475 6476 if (mirrored_kernelcore && zone == ZONE_MOVABLE) { 6477 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { 6478 for_each_mem_region(r) { 6479 if (*pfn < memblock_region_memory_end_pfn(r)) 6480 break; 6481 } 6482 } 6483 if (*pfn >= memblock_region_memory_base_pfn(r) && 6484 memblock_is_mirror(r)) { 6485 *pfn = memblock_region_memory_end_pfn(r); 6486 return true; 6487 } 6488 } 6489 return false; 6490 } 6491 6492 /* 6493 * Initially all pages are reserved - free ones are freed 6494 * up by memblock_free_all() once the early boot process is 6495 * done. Non-atomic initialization, single-pass. 6496 * 6497 * All aligned pageblocks are initialized to the specified migratetype 6498 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related 6499 * zone stats (e.g., nr_isolate_pageblock) are touched. 6500 */ 6501 void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, 6502 unsigned long start_pfn, unsigned long zone_end_pfn, 6503 enum meminit_context context, 6504 struct vmem_altmap *altmap, int migratetype) 6505 { 6506 unsigned long pfn, end_pfn = start_pfn + size; 6507 struct page *page; 6508 6509 if (highest_memmap_pfn < end_pfn - 1) 6510 highest_memmap_pfn = end_pfn - 1; 6511 6512 #ifdef CONFIG_ZONE_DEVICE 6513 /* 6514 * Honor reservation requested by the driver for this ZONE_DEVICE 6515 * memory. 
We limit the total number of pages to initialize to just 6516 * those that might contain the memory mapping. We will defer the 6517 * ZONE_DEVICE page initialization until after we have released 6518 * the hotplug lock. 6519 */ 6520 if (zone == ZONE_DEVICE) { 6521 if (!altmap) 6522 return; 6523 6524 if (start_pfn == altmap->base_pfn) 6525 start_pfn += altmap->reserve; 6526 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); 6527 } 6528 #endif 6529 6530 for (pfn = start_pfn; pfn < end_pfn; ) { 6531 /* 6532 * There can be holes in boot-time mem_map[]s handed to this 6533 * function. They do not exist on hotplugged memory. 6534 */ 6535 if (context == MEMINIT_EARLY) { 6536 if (overlap_memmap_init(zone, &pfn)) 6537 continue; 6538 if (defer_init(nid, pfn, zone_end_pfn)) 6539 break; 6540 } 6541 6542 page = pfn_to_page(pfn); 6543 __init_single_page(page, pfn, zone, nid); 6544 if (context == MEMINIT_HOTPLUG) 6545 __SetPageReserved(page); 6546 6547 /* 6548 * Usually, we want to mark the pageblock MIGRATE_MOVABLE, 6549 * such that unmovable allocations won't be scattered all 6550 * over the place during system boot. 6551 */ 6552 if (IS_ALIGNED(pfn, pageblock_nr_pages)) { 6553 set_pageblock_migratetype(page, migratetype); 6554 cond_resched(); 6555 } 6556 pfn++; 6557 } 6558 } 6559 6560 #ifdef CONFIG_ZONE_DEVICE 6561 void __ref memmap_init_zone_device(struct zone *zone, 6562 unsigned long start_pfn, 6563 unsigned long nr_pages, 6564 struct dev_pagemap *pgmap) 6565 { 6566 unsigned long pfn, end_pfn = start_pfn + nr_pages; 6567 struct pglist_data *pgdat = zone->zone_pgdat; 6568 struct vmem_altmap *altmap = pgmap_altmap(pgmap); 6569 unsigned long zone_idx = zone_idx(zone); 6570 unsigned long start = jiffies; 6571 int nid = pgdat->node_id; 6572 6573 if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) 6574 return; 6575 6576 /* 6577 * The call to memmap_init should have already taken care 6578 * of the pages reserved for the memmap, so we can just jump to 6579 * the end of that region and start processing the device pages. 6580 */ 6581 if (altmap) { 6582 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); 6583 nr_pages = end_pfn - start_pfn; 6584 } 6585 6586 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 6587 struct page *page = pfn_to_page(pfn); 6588 6589 __init_single_page(page, pfn, zone_idx, nid); 6590 6591 /* 6592 * Mark page reserved as it will need to wait for onlining 6593 * phase for it to be fully associated with a zone. 6594 * 6595 * We can use the non-atomic __set_bit operation for setting 6596 * the flag as we are still initializing the pages. 6597 */ 6598 __SetPageReserved(page); 6599 6600 /* 6601 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer 6602 * and zone_device_data. It is a bug if a ZONE_DEVICE page is 6603 * ever freed or placed on a driver-private list. 6604 */ 6605 page->pgmap = pgmap; 6606 page->zone_device_data = NULL; 6607 6608 /* 6609 * Mark the block movable so that blocks are reserved for 6610 * movable at startup. This will force kernel allocations 6611 * to reserve their blocks rather than leaking throughout 6612 * the address space during boot when many long-lived 6613 * kernel allocations are made. 
6614 * 6615 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap 6616 * because this is done early in section_activate() 6617 */ 6618 if (IS_ALIGNED(pfn, pageblock_nr_pages)) { 6619 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 6620 cond_resched(); 6621 } 6622 } 6623 6624 pr_info("%s initialised %lu pages in %ums\n", __func__, 6625 nr_pages, jiffies_to_msecs(jiffies - start)); 6626 } 6627 6628 #endif 6629 static void __meminit zone_init_free_lists(struct zone *zone) 6630 { 6631 unsigned int order, t; 6632 for_each_migratetype_order(order, t) { 6633 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 6634 zone->free_area[order].nr_free = 0; 6635 } 6636 } 6637 6638 #if !defined(CONFIG_FLATMEM) 6639 /* 6640 * Only struct pages that correspond to ranges defined by memblock.memory 6641 * are zeroed and initialized by going through __init_single_page() during 6642 * memmap_init_zone_range(). 6643 * 6644 * But, there could be struct pages that correspond to holes in 6645 * memblock.memory. This can happen because of the following reasons: 6646 * - physical memory bank size is not necessarily the exact multiple of the 6647 * arbitrary section size 6648 * - early reserved memory may not be listed in memblock.memory 6649 * - memory layouts defined with memmap= kernel parameter may not align 6650 * nicely with memmap sections 6651 * 6652 * Explicitly initialize those struct pages so that: 6653 * - PG_Reserved is set 6654 * - zone and node links point to zone and node that span the page if the 6655 * hole is in the middle of a zone 6656 * - zone and node links point to adjacent zone/node if the hole falls on 6657 * the zone boundary; the pages in such holes will be prepended to the 6658 * zone/node above the hole except for the trailing pages in the last 6659 * section that will be appended to the zone/node below. 
6660 */ 6661 static void __init init_unavailable_range(unsigned long spfn, 6662 unsigned long epfn, 6663 int zone, int node) 6664 { 6665 unsigned long pfn; 6666 u64 pgcnt = 0; 6667 6668 for (pfn = spfn; pfn < epfn; pfn++) { 6669 if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { 6670 pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) 6671 + pageblock_nr_pages - 1; 6672 continue; 6673 } 6674 __init_single_page(pfn_to_page(pfn), pfn, zone, node); 6675 __SetPageReserved(pfn_to_page(pfn)); 6676 pgcnt++; 6677 } 6678 6679 if (pgcnt) 6680 pr_info("On node %d, zone %s: %lld pages in unavailable ranges", 6681 node, zone_names[zone], pgcnt); 6682 } 6683 #else 6684 static inline void init_unavailable_range(unsigned long spfn, 6685 unsigned long epfn, 6686 int zone, int node) 6687 { 6688 } 6689 #endif 6690 6691 static void __init memmap_init_zone_range(struct zone *zone, 6692 unsigned long start_pfn, 6693 unsigned long end_pfn, 6694 unsigned long *hole_pfn) 6695 { 6696 unsigned long zone_start_pfn = zone->zone_start_pfn; 6697 unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; 6698 int nid = zone_to_nid(zone), zone_id = zone_idx(zone); 6699 6700 start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); 6701 end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); 6702 6703 if (start_pfn >= end_pfn) 6704 return; 6705 6706 memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, 6707 zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); 6708 6709 if (*hole_pfn < start_pfn) 6710 init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); 6711 6712 *hole_pfn = end_pfn; 6713 } 6714 6715 static void __init memmap_init(void) 6716 { 6717 unsigned long start_pfn, end_pfn; 6718 unsigned long hole_pfn = 0; 6719 int i, j, zone_id, nid; 6720 6721 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 6722 struct pglist_data *node = NODE_DATA(nid); 6723 6724 for (j = 0; j < MAX_NR_ZONES; j++) { 6725 struct zone *zone = node->node_zones + j; 6726 6727 if (!populated_zone(zone)) 6728 continue; 6729 6730 memmap_init_zone_range(zone, start_pfn, end_pfn, 6731 &hole_pfn); 6732 zone_id = j; 6733 } 6734 } 6735 6736 #ifdef CONFIG_SPARSEMEM 6737 /* 6738 * Initialize the memory map for the hole in the range [memory_end, 6739 * section_end]. 6740 * Append the pages in this hole to the highest zone in the last 6741 * node. 6742 * The call to init_unavailable_range() is outside the ifdef to 6743 * silence the compiler warning about zone_id set but not used; 6744 * for FLATMEM it is a nop anyway 6745 */ 6746 end_pfn = round_up(end_pfn, PAGES_PER_SECTION); 6747 if (hole_pfn < end_pfn) 6748 #endif 6749 init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); 6750 } 6751 6752 static int zone_batchsize(struct zone *zone) 6753 { 6754 #ifdef CONFIG_MMU 6755 int batch; 6756 6757 /* 6758 * The number of pages to batch allocate is either ~0.1% 6759 * of the zone or 1MB, whichever is smaller. The batch 6760 * size is striking a balance between allocation latency 6761 * and zone lock contention. 6762 */ 6763 batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); 6764 batch /= 4; /* We effectively *= 4 below */ 6765 if (batch < 1) 6766 batch = 1; 6767 6768 /* 6769 * Clamp the batch to a 2^n - 1 value. Having a power 6770 * of 2 value was found to be more likely to have 6771 * suboptimal cache aliasing properties in some cases.
6772 * 6773 * For example if 2 tasks are alternately allocating 6774 * batches of pages, one task can end up with a lot 6775 * of pages of one half of the possible page colors 6776 * and the other with pages of the other colors. 6777 */ 6778 batch = rounddown_pow_of_two(batch + batch/2) - 1; 6779 6780 return batch; 6781 6782 #else 6783 /* The deferral and batching of frees should be suppressed under NOMMU 6784 * conditions. 6785 * 6786 * The problem is that NOMMU needs to be able to allocate large chunks 6787 * of contiguous memory as there's no hardware page translation to 6788 * assemble apparent contiguous memory from discontiguous pages. 6789 * 6790 * Queueing large contiguous runs of pages for batching, however, 6791 * causes the pages to actually be freed in smaller chunks. As there 6792 * can be a significant delay between the individual batches being 6793 * recycled, this leads to the once large chunks of space being 6794 * fragmented and becoming unavailable for high-order allocations. 6795 */ 6796 return 0; 6797 #endif 6798 } 6799 6800 static int zone_highsize(struct zone *zone, int batch, int cpu_online) 6801 { 6802 #ifdef CONFIG_MMU 6803 int high; 6804 int nr_split_cpus; 6805 unsigned long total_pages; 6806 6807 if (!percpu_pagelist_high_fraction) { 6808 /* 6809 * By default, the high value of the pcp is based on the zone 6810 * low watermark so that if they are full then background 6811 * reclaim will not be started prematurely. 6812 */ 6813 total_pages = low_wmark_pages(zone); 6814 } else { 6815 /* 6816 * If percpu_pagelist_high_fraction is configured, the high 6817 * value is based on a fraction of the managed pages in the 6818 * zone. 6819 */ 6820 total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction; 6821 } 6822 6823 /* 6824 * Split the high value across all online CPUs local to the zone. Note 6825 * that early in boot that CPUs may not be online yet and that during 6826 * CPU hotplug that the cpumask is not yet updated when a CPU is being 6827 * onlined. For memory nodes that have no CPUs, split pcp->high across 6828 * all online CPUs to mitigate the risk that reclaim is triggered 6829 * prematurely due to pages stored on pcp lists. 6830 */ 6831 nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; 6832 if (!nr_split_cpus) 6833 nr_split_cpus = num_online_cpus(); 6834 high = total_pages / nr_split_cpus; 6835 6836 /* 6837 * Ensure high is at least batch*4. The multiple is based on the 6838 * historical relationship between high and batch. 6839 */ 6840 high = max(high, batch << 2); 6841 6842 return high; 6843 #else 6844 return 0; 6845 #endif 6846 } 6847 6848 /* 6849 * pcp->high and pcp->batch values are related and generally batch is lower 6850 * than high. They are also related to pcp->count such that count is lower 6851 * than high, and as soon as it reaches high, the pcplist is flushed. 6852 * 6853 * However, guaranteeing these relations at all times would require e.g. write 6854 * barriers here but also careful usage of read barriers at the read side, and 6855 * thus be prone to error and bad for performance. Thus the update only prevents 6856 * store tearing. Any new users of pcp->batch and pcp->high should ensure they 6857 * can cope with those fields changing asynchronously, and fully trust only the 6858 * pcp->count field on the local CPU with interrupts disabled. 
6859 * 6860 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 6861 * outside of boot time (or some other assurance that no concurrent updaters 6862 * exist). 6863 */ 6864 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 6865 unsigned long batch) 6866 { 6867 WRITE_ONCE(pcp->batch, batch); 6868 WRITE_ONCE(pcp->high, high); 6869 } 6870 6871 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) 6872 { 6873 int pindex; 6874 6875 memset(pcp, 0, sizeof(*pcp)); 6876 memset(pzstats, 0, sizeof(*pzstats)); 6877 6878 for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) 6879 INIT_LIST_HEAD(&pcp->lists[pindex]); 6880 6881 /* 6882 * Set batch and high values safe for a boot pageset. A true percpu 6883 * pageset's initialization will update them subsequently. Here we don't 6884 * need to be as careful as pageset_update() as nobody can access the 6885 * pageset yet. 6886 */ 6887 pcp->high = BOOT_PAGESET_HIGH; 6888 pcp->batch = BOOT_PAGESET_BATCH; 6889 pcp->free_factor = 0; 6890 } 6891 6892 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, 6893 unsigned long batch) 6894 { 6895 struct per_cpu_pages *pcp; 6896 int cpu; 6897 6898 for_each_possible_cpu(cpu) { 6899 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 6900 pageset_update(pcp, high, batch); 6901 } 6902 } 6903 6904 /* 6905 * Calculate and set new high and batch values for all per-cpu pagesets of a 6906 * zone based on the zone's size. 6907 */ 6908 static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) 6909 { 6910 int new_high, new_batch; 6911 6912 new_batch = max(1, zone_batchsize(zone)); 6913 new_high = zone_highsize(zone, new_batch, cpu_online); 6914 6915 if (zone->pageset_high == new_high && 6916 zone->pageset_batch == new_batch) 6917 return; 6918 6919 zone->pageset_high = new_high; 6920 zone->pageset_batch = new_batch; 6921 6922 __zone_set_pageset_high_and_batch(zone, new_high, new_batch); 6923 } 6924 6925 void __meminit setup_zone_pageset(struct zone *zone) 6926 { 6927 int cpu; 6928 6929 /* Size may be 0 on !SMP && !NUMA */ 6930 if (sizeof(struct per_cpu_zonestat) > 0) 6931 zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); 6932 6933 zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); 6934 for_each_possible_cpu(cpu) { 6935 struct per_cpu_pages *pcp; 6936 struct per_cpu_zonestat *pzstats; 6937 6938 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 6939 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); 6940 per_cpu_pages_init(pcp, pzstats); 6941 } 6942 6943 zone_set_pageset_high_and_batch(zone, 0); 6944 } 6945 6946 /* 6947 * Allocate per cpu pagesets and initialize them. 6948 * Before this call only boot pagesets were available. 6949 */ 6950 void __init setup_per_cpu_pageset(void) 6951 { 6952 struct pglist_data *pgdat; 6953 struct zone *zone; 6954 int __maybe_unused cpu; 6955 6956 for_each_populated_zone(zone) 6957 setup_zone_pageset(zone); 6958 6959 #ifdef CONFIG_NUMA 6960 /* 6961 * Unpopulated zones continue using the boot pagesets. 6962 * The numa stats for these pagesets need to be reset. 6963 * Otherwise, they will end up skewing the stats of 6964 * the nodes these zones are associated with. 
6965 */ 6966 for_each_possible_cpu(cpu) { 6967 struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); 6968 memset(pzstats->vm_numa_event, 0, 6969 sizeof(pzstats->vm_numa_event)); 6970 } 6971 #endif 6972 6973 for_each_online_pgdat(pgdat) 6974 pgdat->per_cpu_nodestats = 6975 alloc_percpu(struct per_cpu_nodestat); 6976 } 6977 6978 static __meminit void zone_pcp_init(struct zone *zone) 6979 { 6980 /* 6981 * per cpu subsystem is not up at this point. The following code 6982 * relies on the ability of the linker to provide the 6983 * offset of a (static) per cpu variable into the per cpu area. 6984 */ 6985 zone->per_cpu_pageset = &boot_pageset; 6986 zone->per_cpu_zonestats = &boot_zonestats; 6987 zone->pageset_high = BOOT_PAGESET_HIGH; 6988 zone->pageset_batch = BOOT_PAGESET_BATCH; 6989 6990 if (populated_zone(zone)) 6991 pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name, 6992 zone->present_pages, zone_batchsize(zone)); 6993 } 6994 6995 void __meminit init_currently_empty_zone(struct zone *zone, 6996 unsigned long zone_start_pfn, 6997 unsigned long size) 6998 { 6999 struct pglist_data *pgdat = zone->zone_pgdat; 7000 int zone_idx = zone_idx(zone) + 1; 7001 7002 if (zone_idx > pgdat->nr_zones) 7003 pgdat->nr_zones = zone_idx; 7004 7005 zone->zone_start_pfn = zone_start_pfn; 7006 7007 mminit_dprintk(MMINIT_TRACE, "memmap_init", 7008 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 7009 pgdat->node_id, 7010 (unsigned long)zone_idx(zone), 7011 zone_start_pfn, (zone_start_pfn + size)); 7012 7013 zone_init_free_lists(zone); 7014 zone->initialized = 1; 7015 } 7016 7017 /** 7018 * get_pfn_range_for_nid - Return the start and end page frames for a node 7019 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 7020 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 7021 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 7022 * 7023 * It returns the start and end page frame of a node based on information 7024 * provided by memblock_set_node(). If called for a node 7025 * with no available memory, a warning is printed and the start and end 7026 * PFNs will be 0. 7027 */ 7028 void __init get_pfn_range_for_nid(unsigned int nid, 7029 unsigned long *start_pfn, unsigned long *end_pfn) 7030 { 7031 unsigned long this_start_pfn, this_end_pfn; 7032 int i; 7033 7034 *start_pfn = -1UL; 7035 *end_pfn = 0; 7036 7037 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 7038 *start_pfn = min(*start_pfn, this_start_pfn); 7039 *end_pfn = max(*end_pfn, this_end_pfn); 7040 } 7041 7042 if (*start_pfn == -1UL) 7043 *start_pfn = 0; 7044 } 7045 7046 /* 7047 * This finds a zone that can be used for ZONE_MOVABLE pages. The 7048 * assumption is made that zones within a node are ordered in monotonic 7049 * increasing memory addresses so that the "highest" populated zone is used 7050 */ 7051 static void __init find_usable_zone_for_movable(void) 7052 { 7053 int zone_index; 7054 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 7055 if (zone_index == ZONE_MOVABLE) 7056 continue; 7057 7058 if (arch_zone_highest_possible_pfn[zone_index] > 7059 arch_zone_lowest_possible_pfn[zone_index]) 7060 break; 7061 } 7062 7063 VM_BUG_ON(zone_index == -1); 7064 movable_zone = zone_index; 7065 } 7066 7067 /* 7068 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 7069 * because it is sized independent of architecture. 
Unlike the other zones, 7070 * the starting point for ZONE_MOVABLE is not fixed. It may be different 7071 * in each node depending on the size of each node and how evenly kernelcore 7072 * is distributed. This helper function adjusts the zone ranges 7073 * provided by the architecture for a given node by using the end of the 7074 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 7075 * zones within a node are in order of monotonically increasing memory addresses 7076 */ 7077 static void __init adjust_zone_range_for_zone_movable(int nid, 7078 unsigned long zone_type, 7079 unsigned long node_start_pfn, 7080 unsigned long node_end_pfn, 7081 unsigned long *zone_start_pfn, 7082 unsigned long *zone_end_pfn) 7083 { 7084 /* Only adjust if ZONE_MOVABLE is on this node */ 7085 if (zone_movable_pfn[nid]) { 7086 /* Size ZONE_MOVABLE */ 7087 if (zone_type == ZONE_MOVABLE) { 7088 *zone_start_pfn = zone_movable_pfn[nid]; 7089 *zone_end_pfn = min(node_end_pfn, 7090 arch_zone_highest_possible_pfn[movable_zone]); 7091 7092 /* Adjust for ZONE_MOVABLE starting within this range */ 7093 } else if (!mirrored_kernelcore && 7094 *zone_start_pfn < zone_movable_pfn[nid] && 7095 *zone_end_pfn > zone_movable_pfn[nid]) { 7096 *zone_end_pfn = zone_movable_pfn[nid]; 7097 7098 /* Check if this whole range is within ZONE_MOVABLE */ 7099 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 7100 *zone_start_pfn = *zone_end_pfn; 7101 } 7102 } 7103 7104 /* 7105 * Return the number of pages a zone spans in a node, including holes 7106 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 7107 */ 7108 static unsigned long __init zone_spanned_pages_in_node(int nid, 7109 unsigned long zone_type, 7110 unsigned long node_start_pfn, 7111 unsigned long node_end_pfn, 7112 unsigned long *zone_start_pfn, 7113 unsigned long *zone_end_pfn) 7114 { 7115 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 7116 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 7117 /* When hotadding a new node from cpu_up(), the node should be empty */ 7118 if (!node_start_pfn && !node_end_pfn) 7119 return 0; 7120 7121 /* Get the start and end of the zone */ 7122 *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 7123 *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 7124 adjust_zone_range_for_zone_movable(nid, zone_type, 7125 node_start_pfn, node_end_pfn, 7126 zone_start_pfn, zone_end_pfn); 7127 7128 /* Check that this node has pages within the zone's required range */ 7129 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) 7130 return 0; 7131 7132 /* Move the zone boundaries inside the node if necessary */ 7133 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); 7134 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); 7135 7136 /* Return the spanned pages */ 7137 return *zone_end_pfn - *zone_start_pfn; 7138 } 7139 7140 /* 7141 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 7142 * then all holes in the requested range will be accounted for.
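 *
 * For illustration, with made-up numbers: for a 1,000 pfn range in which
 * memblock lists memory regions covering 800 pfns, the remaining
 * 200 pfns are reported as absent.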
7143 */ 7144 unsigned long __init __absent_pages_in_range(int nid, 7145 unsigned long range_start_pfn, 7146 unsigned long range_end_pfn) 7147 { 7148 unsigned long nr_absent = range_end_pfn - range_start_pfn; 7149 unsigned long start_pfn, end_pfn; 7150 int i; 7151 7152 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 7153 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 7154 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 7155 nr_absent -= end_pfn - start_pfn; 7156 } 7157 return nr_absent; 7158 } 7159 7160 /** 7161 * absent_pages_in_range - Return number of page frames in holes within a range 7162 * @start_pfn: The start PFN to start searching for holes 7163 * @end_pfn: The end PFN to stop searching for holes 7164 * 7165 * Return: the number of page frames in memory holes within a range. 7166 */ 7167 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 7168 unsigned long end_pfn) 7169 { 7170 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 7171 } 7172 7173 /* Return the number of page frames in holes in a zone on a node */ 7174 static unsigned long __init zone_absent_pages_in_node(int nid, 7175 unsigned long zone_type, 7176 unsigned long node_start_pfn, 7177 unsigned long node_end_pfn) 7178 { 7179 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 7180 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 7181 unsigned long zone_start_pfn, zone_end_pfn; 7182 unsigned long nr_absent; 7183 7184 /* When hotadding a new node from cpu_up(), the node should be empty */ 7185 if (!node_start_pfn && !node_end_pfn) 7186 return 0; 7187 7188 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 7189 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 7190 7191 adjust_zone_range_for_zone_movable(nid, zone_type, 7192 node_start_pfn, node_end_pfn, 7193 &zone_start_pfn, &zone_end_pfn); 7194 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 7195 7196 /* 7197 * ZONE_MOVABLE handling. 7198 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages 7199 * and vice versa.
7200 */ 7201 if (mirrored_kernelcore && zone_movable_pfn[nid]) { 7202 unsigned long start_pfn, end_pfn; 7203 struct memblock_region *r; 7204 7205 for_each_mem_region(r) { 7206 start_pfn = clamp(memblock_region_memory_base_pfn(r), 7207 zone_start_pfn, zone_end_pfn); 7208 end_pfn = clamp(memblock_region_memory_end_pfn(r), 7209 zone_start_pfn, zone_end_pfn); 7210 7211 if (zone_type == ZONE_MOVABLE && 7212 memblock_is_mirror(r)) 7213 nr_absent += end_pfn - start_pfn; 7214 7215 if (zone_type == ZONE_NORMAL && 7216 !memblock_is_mirror(r)) 7217 nr_absent += end_pfn - start_pfn; 7218 } 7219 } 7220 7221 return nr_absent; 7222 } 7223 7224 static void __init calculate_node_totalpages(struct pglist_data *pgdat, 7225 unsigned long node_start_pfn, 7226 unsigned long node_end_pfn) 7227 { 7228 unsigned long realtotalpages = 0, totalpages = 0; 7229 enum zone_type i; 7230 7231 for (i = 0; i < MAX_NR_ZONES; i++) { 7232 struct zone *zone = pgdat->node_zones + i; 7233 unsigned long zone_start_pfn, zone_end_pfn; 7234 unsigned long spanned, absent; 7235 unsigned long size, real_size; 7236 7237 spanned = zone_spanned_pages_in_node(pgdat->node_id, i, 7238 node_start_pfn, 7239 node_end_pfn, 7240 &zone_start_pfn, 7241 &zone_end_pfn); 7242 absent = zone_absent_pages_in_node(pgdat->node_id, i, 7243 node_start_pfn, 7244 node_end_pfn); 7245 7246 size = spanned; 7247 real_size = size - absent; 7248 7249 if (size) 7250 zone->zone_start_pfn = zone_start_pfn; 7251 else 7252 zone->zone_start_pfn = 0; 7253 zone->spanned_pages = size; 7254 zone->present_pages = real_size; 7255 7256 totalpages += size; 7257 realtotalpages += real_size; 7258 } 7259 7260 pgdat->node_spanned_pages = totalpages; 7261 pgdat->node_present_pages = realtotalpages; 7262 pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); 7263 } 7264 7265 #ifndef CONFIG_SPARSEMEM 7266 /* 7267 * Calculate the size of the zone->blockflags rounded to an unsigned long 7268 * Start by making sure zonesize is a multiple of pageblock_order by rounding 7269 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 7270 * round what is now in bits to nearest long in bits, then return it in 7271 * bytes. 
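 *
 * For illustration, with made-up numbers: assuming 4KiB pages, 2MiB
 * pageblocks (pageblock_order = 9) and NR_PAGEBLOCK_BITS = 4, a 1GiB
 * zone spans 512 pageblocks and therefore needs 2048 bits, i.e. 256
 * bytes, of pageblock flags.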
7272 */ 7273 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 7274 { 7275 unsigned long usemapsize; 7276 7277 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 7278 usemapsize = roundup(zonesize, pageblock_nr_pages); 7279 usemapsize = usemapsize >> pageblock_order; 7280 usemapsize *= NR_PAGEBLOCK_BITS; 7281 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 7282 7283 return usemapsize / 8; 7284 } 7285 7286 static void __ref setup_usemap(struct zone *zone) 7287 { 7288 unsigned long usemapsize = usemap_size(zone->zone_start_pfn, 7289 zone->spanned_pages); 7290 zone->pageblock_flags = NULL; 7291 if (usemapsize) { 7292 zone->pageblock_flags = 7293 memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, 7294 zone_to_nid(zone)); 7295 if (!zone->pageblock_flags) 7296 panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", 7297 usemapsize, zone->name, zone_to_nid(zone)); 7298 } 7299 } 7300 #else 7301 static inline void setup_usemap(struct zone *zone) {} 7302 #endif /* CONFIG_SPARSEMEM */ 7303 7304 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 7305 7306 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 7307 void __init set_pageblock_order(void) 7308 { 7309 unsigned int order; 7310 7311 /* Check that pageblock_nr_pages has not already been setup */ 7312 if (pageblock_order) 7313 return; 7314 7315 if (HPAGE_SHIFT > PAGE_SHIFT) 7316 order = HUGETLB_PAGE_ORDER; 7317 else 7318 order = MAX_ORDER - 1; 7319 7320 /* 7321 * Assume the largest contiguous order of interest is a huge page. 7322 * This value may be variable depending on boot parameters on IA64 and 7323 * powerpc. 7324 */ 7325 pageblock_order = order; 7326 } 7327 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 7328 7329 /* 7330 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 7331 * is unused as pageblock_order is set at compile-time. See 7332 * include/linux/pageblock-flags.h for the values of pageblock_order based on 7333 * the kernel config 7334 */ 7335 void __init set_pageblock_order(void) 7336 { 7337 } 7338 7339 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 7340 7341 static unsigned long __init calc_memmap_size(unsigned long spanned_pages, 7342 unsigned long present_pages) 7343 { 7344 unsigned long pages = spanned_pages; 7345 7346 /* 7347 * Provide a more accurate estimation if there are holes within 7348 * the zone and SPARSEMEM is in use. If there are holes within the 7349 * zone, each populated memory region may cost us one or two extra 7350 * memmap pages due to alignment because memmap pages for each 7351 * populated regions may not be naturally aligned on page boundary. 7352 * So the (present_pages >> 4) heuristic is a tradeoff for that. 
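 * For example (illustrative numbers): with spanned_pages = 1048576 and
 * present_pages = 786432, the holes (262144 pages) exceed the slack of
 * present_pages + (present_pages >> 4) = 835584, so the estimate below is
 * based on present_pages rather than spanned_pages.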
7353 */ 7354 if (spanned_pages > present_pages + (present_pages >> 4) && 7355 IS_ENABLED(CONFIG_SPARSEMEM)) 7356 pages = present_pages; 7357 7358 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 7359 } 7360 7361 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 7362 static void pgdat_init_split_queue(struct pglist_data *pgdat) 7363 { 7364 struct deferred_split *ds_queue = &pgdat->deferred_split_queue; 7365 7366 spin_lock_init(&ds_queue->split_queue_lock); 7367 INIT_LIST_HEAD(&ds_queue->split_queue); 7368 ds_queue->split_queue_len = 0; 7369 } 7370 #else 7371 static void pgdat_init_split_queue(struct pglist_data *pgdat) {} 7372 #endif 7373 7374 #ifdef CONFIG_COMPACTION 7375 static void pgdat_init_kcompactd(struct pglist_data *pgdat) 7376 { 7377 init_waitqueue_head(&pgdat->kcompactd_wait); 7378 } 7379 #else 7380 static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} 7381 #endif 7382 7383 static void __meminit pgdat_init_internals(struct pglist_data *pgdat) 7384 { 7385 pgdat_resize_init(pgdat); 7386 7387 pgdat_init_split_queue(pgdat); 7388 pgdat_init_kcompactd(pgdat); 7389 7390 init_waitqueue_head(&pgdat->kswapd_wait); 7391 init_waitqueue_head(&pgdat->pfmemalloc_wait); 7392 7393 pgdat_page_ext_init(pgdat); 7394 lruvec_init(&pgdat->__lruvec); 7395 } 7396 7397 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, 7398 unsigned long remaining_pages) 7399 { 7400 atomic_long_set(&zone->managed_pages, remaining_pages); 7401 zone_set_nid(zone, nid); 7402 zone->name = zone_names[idx]; 7403 zone->zone_pgdat = NODE_DATA(nid); 7404 spin_lock_init(&zone->lock); 7405 zone_seqlock_init(zone); 7406 zone_pcp_init(zone); 7407 } 7408 7409 /* 7410 * Set up the zone data structures 7411 * - init pgdat internals 7412 * - init all zones belonging to this node 7413 * 7414 * NOTE: this function is only called during memory hotplug 7415 */ 7416 #ifdef CONFIG_MEMORY_HOTPLUG 7417 void __ref free_area_init_core_hotplug(int nid) 7418 { 7419 enum zone_type z; 7420 pg_data_t *pgdat = NODE_DATA(nid); 7421 7422 pgdat_init_internals(pgdat); 7423 for (z = 0; z < MAX_NR_ZONES; z++) 7424 zone_init_internals(&pgdat->node_zones[z], z, nid, 0); 7425 } 7426 #endif 7427 7428 /* 7429 * Set up the zone data structures: 7430 * - mark all pages reserved 7431 * - mark all memory queues empty 7432 * - clear the memory bitmaps 7433 * 7434 * NOTE: pgdat should get zeroed by caller. 7435 * NOTE: this function is only called during early init. 7436 */ 7437 static void __init free_area_init_core(struct pglist_data *pgdat) 7438 { 7439 enum zone_type j; 7440 int nid = pgdat->node_id; 7441 7442 pgdat_init_internals(pgdat); 7443 pgdat->per_cpu_nodestats = &boot_nodestats; 7444 7445 for (j = 0; j < MAX_NR_ZONES; j++) { 7446 struct zone *zone = pgdat->node_zones + j; 7447 unsigned long size, freesize, memmap_pages; 7448 7449 size = zone->spanned_pages; 7450 freesize = zone->present_pages; 7451 7452 /* 7453 * Adjust freesize so that it accounts for how much memory 7454 * is used by this zone for memmap. 
This affects the watermark 7455 * and per-cpu initialisations 7456 */ 7457 memmap_pages = calc_memmap_size(size, freesize); 7458 if (!is_highmem_idx(j)) { 7459 if (freesize >= memmap_pages) { 7460 freesize -= memmap_pages; 7461 if (memmap_pages) 7462 pr_debug(" %s zone: %lu pages used for memmap\n", 7463 zone_names[j], memmap_pages); 7464 } else 7465 pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n", 7466 zone_names[j], memmap_pages, freesize); 7467 } 7468 7469 /* Account for reserved pages */ 7470 if (j == 0 && freesize > dma_reserve) { 7471 freesize -= dma_reserve; 7472 pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); 7473 } 7474 7475 if (!is_highmem_idx(j)) 7476 nr_kernel_pages += freesize; 7477 /* Charge for highmem memmap if there are enough kernel pages */ 7478 else if (nr_kernel_pages > memmap_pages * 2) 7479 nr_kernel_pages -= memmap_pages; 7480 nr_all_pages += freesize; 7481 7482 /* 7483 * Set an approximate value for lowmem here, it will be adjusted 7484 * when the bootmem allocator frees pages into the buddy system. 7485 * And all highmem pages will be managed by the buddy system. 7486 */ 7487 zone_init_internals(zone, j, nid, freesize); 7488 7489 if (!size) 7490 continue; 7491 7492 set_pageblock_order(); 7493 setup_usemap(zone); 7494 init_currently_empty_zone(zone, zone->zone_start_pfn, size); 7495 } 7496 } 7497 7498 #ifdef CONFIG_FLATMEM 7499 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) 7500 { 7501 unsigned long __maybe_unused start = 0; 7502 unsigned long __maybe_unused offset = 0; 7503 7504 /* Skip empty nodes */ 7505 if (!pgdat->node_spanned_pages) 7506 return; 7507 7508 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 7509 offset = pgdat->node_start_pfn - start; 7510 /* ia64 gets its own node_mem_map, before this, without bootmem */ 7511 if (!pgdat->node_mem_map) { 7512 unsigned long size, end; 7513 struct page *map; 7514 7515 /* 7516 * The zone's endpoints aren't required to be MAX_ORDER 7517 * aligned but the node_mem_map endpoints must be in order 7518 * for the buddy allocator to function correctly. 
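 * For example (illustrative, assuming MAX_ORDER_NR_PAGES = 1024): a node
 * spanning pfns [34567, 99000) is given a map covering the rounded range
 * [33792, 99328), and node_mem_map is set to that map plus the 775-page
 * offset so that pfn-to-page arithmetic for node_start_pfn stays correct.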
7519 */ 7520 end = pgdat_end_pfn(pgdat); 7521 end = ALIGN(end, MAX_ORDER_NR_PAGES); 7522 size = (end - start) * sizeof(struct page); 7523 map = memblock_alloc_node(size, SMP_CACHE_BYTES, 7524 pgdat->node_id); 7525 if (!map) 7526 panic("Failed to allocate %ld bytes for node %d memory map\n", 7527 size, pgdat->node_id); 7528 pgdat->node_mem_map = map + offset; 7529 } 7530 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", 7531 __func__, pgdat->node_id, (unsigned long)pgdat, 7532 (unsigned long)pgdat->node_mem_map); 7533 #ifndef CONFIG_NUMA 7534 /* 7535 * With no DISCONTIG, the global mem_map is just set as node 0's 7536 */ 7537 if (pgdat == NODE_DATA(0)) { 7538 mem_map = NODE_DATA(0)->node_mem_map; 7539 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 7540 mem_map -= offset; 7541 } 7542 #endif 7543 } 7544 #else 7545 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } 7546 #endif /* CONFIG_FLATMEM */ 7547 7548 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 7549 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) 7550 { 7551 pgdat->first_deferred_pfn = ULONG_MAX; 7552 } 7553 #else 7554 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} 7555 #endif 7556 7557 static void __init free_area_init_node(int nid) 7558 { 7559 pg_data_t *pgdat = NODE_DATA(nid); 7560 unsigned long start_pfn = 0; 7561 unsigned long end_pfn = 0; 7562 7563 /* pg_data_t should be reset to zero when it's allocated */ 7564 WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); 7565 7566 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 7567 7568 pgdat->node_id = nid; 7569 pgdat->node_start_pfn = start_pfn; 7570 pgdat->per_cpu_nodestats = NULL; 7571 7572 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 7573 (u64)start_pfn << PAGE_SHIFT, 7574 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 7575 calculate_node_totalpages(pgdat, start_pfn, end_pfn); 7576 7577 alloc_node_mem_map(pgdat); 7578 pgdat_set_deferred_range(pgdat); 7579 7580 free_area_init_core(pgdat); 7581 } 7582 7583 void __init free_area_init_memoryless_node(int nid) 7584 { 7585 free_area_init_node(nid); 7586 } 7587 7588 #if MAX_NUMNODES > 1 7589 /* 7590 * Figure out the number of possible node ids. 7591 */ 7592 void __init setup_nr_node_ids(void) 7593 { 7594 unsigned int highest; 7595 7596 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); 7597 nr_node_ids = highest + 1; 7598 } 7599 #endif 7600 7601 /** 7602 * node_map_pfn_alignment - determine the maximum internode alignment 7603 * 7604 * This function should be called after node map is populated and sorted. 7605 * It calculates the maximum power of two alignment which can distinguish 7606 * all the nodes. 7607 * 7608 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 7609 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 7610 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 7611 * shifted, 1GiB is enough and this function will indicate so. 7612 * 7613 * This is used to test whether pfn -> nid mapping of the chosen memory 7614 * model has fine enough granularity to avoid incorrect mapping for the 7615 * populated node map. 7616 * 7617 * Return: the determined alignment in pfn's. 0 if there is no alignment 7618 * requirement (single node). 
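 * A small worked example (illustrative): if node 0 spans pfns [0, 0x8000)
 * and node 1 starts at pfn 0xc000, the result is 0x8000 pages - a
 * 0x8000-aligned block never mixes pages of both nodes, whereas a
 * 0x10000-aligned block starting at pfn 0 would.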
7619 */ 7620 unsigned long __init node_map_pfn_alignment(void) 7621 { 7622 unsigned long accl_mask = 0, last_end = 0; 7623 unsigned long start, end, mask; 7624 int last_nid = NUMA_NO_NODE; 7625 int i, nid; 7626 7627 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 7628 if (!start || last_nid < 0 || last_nid == nid) { 7629 last_nid = nid; 7630 last_end = end; 7631 continue; 7632 } 7633 7634 /* 7635 * Start with a mask granular enough to pin-point to the 7636 * start pfn and tick off bits one-by-one until it becomes 7637 * too coarse to separate the current node from the last. 7638 */ 7639 mask = ~((1 << __ffs(start)) - 1); 7640 while (mask && last_end <= (start & (mask << 1))) 7641 mask <<= 1; 7642 7643 /* accumulate all internode masks */ 7644 accl_mask |= mask; 7645 } 7646 7647 /* convert mask to number of pages */ 7648 return ~accl_mask + 1; 7649 } 7650 7651 /** 7652 * find_min_pfn_with_active_regions - Find the minimum PFN registered 7653 * 7654 * Return: the minimum PFN based on information provided via 7655 * memblock_set_node(). 7656 */ 7657 unsigned long __init find_min_pfn_with_active_regions(void) 7658 { 7659 return PHYS_PFN(memblock_start_of_DRAM()); 7660 } 7661 7662 /* 7663 * early_calculate_totalpages() 7664 * Sum pages in active regions for movable zone. 7665 * Populate N_MEMORY for calculating usable_nodes. 7666 */ 7667 static unsigned long __init early_calculate_totalpages(void) 7668 { 7669 unsigned long totalpages = 0; 7670 unsigned long start_pfn, end_pfn; 7671 int i, nid; 7672 7673 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 7674 unsigned long pages = end_pfn - start_pfn; 7675 7676 totalpages += pages; 7677 if (pages) 7678 node_set_state(nid, N_MEMORY); 7679 } 7680 return totalpages; 7681 } 7682 7683 /* 7684 * Find the PFN the Movable zone begins in each node. Kernel memory 7685 * is spread evenly between nodes as long as the nodes have enough 7686 * memory. When they don't, some nodes will have more kernelcore than 7687 * others 7688 */ 7689 static void __init find_zone_movable_pfns_for_nodes(void) 7690 { 7691 int i, nid; 7692 unsigned long usable_startpfn; 7693 unsigned long kernelcore_node, kernelcore_remaining; 7694 /* save the state before borrow the nodemask */ 7695 nodemask_t saved_node_state = node_states[N_MEMORY]; 7696 unsigned long totalpages = early_calculate_totalpages(); 7697 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 7698 struct memblock_region *r; 7699 7700 /* Need to find movable_zone earlier when movable_node is specified. */ 7701 find_usable_zone_for_movable(); 7702 7703 /* 7704 * If movable_node is specified, ignore kernelcore and movablecore 7705 * options. 7706 */ 7707 if (movable_node_is_enabled()) { 7708 for_each_mem_region(r) { 7709 if (!memblock_is_hotpluggable(r)) 7710 continue; 7711 7712 nid = memblock_get_region_node(r); 7713 7714 usable_startpfn = PFN_DOWN(r->base); 7715 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 
7716 min(usable_startpfn, zone_movable_pfn[nid]) : 7717 usable_startpfn; 7718 } 7719 7720 goto out2; 7721 } 7722 7723 /* 7724 * If kernelcore=mirror is specified, ignore movablecore option 7725 */ 7726 if (mirrored_kernelcore) { 7727 bool mem_below_4gb_not_mirrored = false; 7728 7729 for_each_mem_region(r) { 7730 if (memblock_is_mirror(r)) 7731 continue; 7732 7733 nid = memblock_get_region_node(r); 7734 7735 usable_startpfn = memblock_region_memory_base_pfn(r); 7736 7737 if (usable_startpfn < 0x100000) { 7738 mem_below_4gb_not_mirrored = true; 7739 continue; 7740 } 7741 7742 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 7743 min(usable_startpfn, zone_movable_pfn[nid]) : 7744 usable_startpfn; 7745 } 7746 7747 if (mem_below_4gb_not_mirrored) 7748 pr_warn("This configuration results in unmirrored kernel memory.\n"); 7749 7750 goto out2; 7751 } 7752 7753 /* 7754 * If kernelcore=nn% or movablecore=nn% was specified, calculate the 7755 * amount of necessary memory. 7756 */ 7757 if (required_kernelcore_percent) 7758 required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / 7759 10000UL; 7760 if (required_movablecore_percent) 7761 required_movablecore = (totalpages * 100 * required_movablecore_percent) / 7762 10000UL; 7763 7764 /* 7765 * If movablecore= was specified, calculate what size of 7766 * kernelcore that corresponds so that memory usable for 7767 * any allocation type is evenly spread. If both kernelcore 7768 * and movablecore are specified, then the value of kernelcore 7769 * will be used for required_kernelcore if it's greater than 7770 * what movablecore would have allowed. 7771 */ 7772 if (required_movablecore) { 7773 unsigned long corepages; 7774 7775 /* 7776 * Round-up so that ZONE_MOVABLE is at least as large as what 7777 * was requested by the user 7778 */ 7779 required_movablecore = 7780 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 7781 required_movablecore = min(totalpages, required_movablecore); 7782 corepages = totalpages - required_movablecore; 7783 7784 required_kernelcore = max(required_kernelcore, corepages); 7785 } 7786 7787 /* 7788 * If kernelcore was not specified or kernelcore size is larger 7789 * than totalpages, there is no ZONE_MOVABLE. 7790 */ 7791 if (!required_kernelcore || required_kernelcore >= totalpages) 7792 goto out; 7793 7794 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 7795 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 7796 7797 restart: 7798 /* Spread kernelcore memory as evenly as possible throughout nodes */ 7799 kernelcore_node = required_kernelcore / usable_nodes; 7800 for_each_node_state(nid, N_MEMORY) { 7801 unsigned long start_pfn, end_pfn; 7802 7803 /* 7804 * Recalculate kernelcore_node if the division per node 7805 * now exceeds what is necessary to satisfy the requested 7806 * amount of memory for the kernel 7807 */ 7808 if (required_kernelcore < kernelcore_node) 7809 kernelcore_node = required_kernelcore / usable_nodes; 7810 7811 /* 7812 * As the map is walked, we track how much memory is usable 7813 * by the kernel using kernelcore_remaining. 
When it is 7814 * 0, the rest of the node is usable by ZONE_MOVABLE 7815 */ 7816 kernelcore_remaining = kernelcore_node; 7817 7818 /* Go through each range of PFNs within this node */ 7819 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 7820 unsigned long size_pages; 7821 7822 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 7823 if (start_pfn >= end_pfn) 7824 continue; 7825 7826 /* Account for what is only usable for kernelcore */ 7827 if (start_pfn < usable_startpfn) { 7828 unsigned long kernel_pages; 7829 kernel_pages = min(end_pfn, usable_startpfn) 7830 - start_pfn; 7831 7832 kernelcore_remaining -= min(kernel_pages, 7833 kernelcore_remaining); 7834 required_kernelcore -= min(kernel_pages, 7835 required_kernelcore); 7836 7837 /* Continue if range is now fully accounted */ 7838 if (end_pfn <= usable_startpfn) { 7839 7840 /* 7841 * Push zone_movable_pfn to the end so 7842 * that if we have to rebalance 7843 * kernelcore across nodes, we will 7844 * not double account here 7845 */ 7846 zone_movable_pfn[nid] = end_pfn; 7847 continue; 7848 } 7849 start_pfn = usable_startpfn; 7850 } 7851 7852 /* 7853 * The usable PFN range for ZONE_MOVABLE is from 7854 * start_pfn->end_pfn. Calculate size_pages as the 7855 * number of pages used as kernelcore 7856 */ 7857 size_pages = end_pfn - start_pfn; 7858 if (size_pages > kernelcore_remaining) 7859 size_pages = kernelcore_remaining; 7860 zone_movable_pfn[nid] = start_pfn + size_pages; 7861 7862 /* 7863 * Some kernelcore has been met, update counts and 7864 * break if the kernelcore for this node has been 7865 * satisfied 7866 */ 7867 required_kernelcore -= min(required_kernelcore, 7868 size_pages); 7869 kernelcore_remaining -= size_pages; 7870 if (!kernelcore_remaining) 7871 break; 7872 } 7873 } 7874 7875 /* 7876 * If there is still required_kernelcore, we do another pass with one 7877 * less node in the count. This will push zone_movable_pfn[nid] further 7878 * along on the nodes that still have memory until kernelcore is 7879 * satisfied 7880 */ 7881 usable_nodes--; 7882 if (usable_nodes && required_kernelcore > usable_nodes) 7883 goto restart; 7884 7885 out2: 7886 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 7887 for (nid = 0; nid < MAX_NUMNODES; nid++) 7888 zone_movable_pfn[nid] = 7889 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 7890 7891 out: 7892 /* restore the node_state */ 7893 node_states[N_MEMORY] = saved_node_state; 7894 } 7895 7896 /* Any regular or high memory on that node ? */ 7897 static void check_for_memory(pg_data_t *pgdat, int nid) 7898 { 7899 enum zone_type zone_type; 7900 7901 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 7902 struct zone *zone = &pgdat->node_zones[zone_type]; 7903 if (populated_zone(zone)) { 7904 if (IS_ENABLED(CONFIG_HIGHMEM)) 7905 node_set_state(nid, N_HIGH_MEMORY); 7906 if (zone_type <= ZONE_NORMAL) 7907 node_set_state(nid, N_NORMAL_MEMORY); 7908 break; 7909 } 7910 } 7911 } 7912 7913 /* 7914 * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For 7915 * such cases we allow max_zone_pfn sorted in the descending order 7916 */ 7917 bool __weak arch_has_descending_max_zone_pfns(void) 7918 { 7919 return false; 7920 } 7921 7922 /** 7923 * free_area_init - Initialise all pg_data_t and zone data 7924 * @max_zone_pfn: an array of max PFNs for each zone 7925 * 7926 * This will call free_area_init_node() for each active node in the system. 
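 * A typical architecture caller fills the array up to its highest zone and
 * leaves the remaining entries zero, roughly as follows (illustrative
 * sketch only, loosely modelled on x86's zone_sizes_init()):
 *
 *	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };
 *
 *	max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
 *	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 *	free_area_init(max_zone_pfns);
 *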
7927 * Using the page ranges provided by memblock_set_node(), the size of each 7928 * zone in each node and their holes is calculated. If the maximum PFN 7929 * between two adjacent zones match, it is assumed that the zone is empty. 7930 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 7931 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 7932 * starts where the previous one ended. For example, ZONE_DMA32 starts 7933 * at arch_max_dma_pfn. 7934 */ 7935 void __init free_area_init(unsigned long *max_zone_pfn) 7936 { 7937 unsigned long start_pfn, end_pfn; 7938 int i, nid, zone; 7939 bool descending; 7940 7941 /* Record where the zone boundaries are */ 7942 memset(arch_zone_lowest_possible_pfn, 0, 7943 sizeof(arch_zone_lowest_possible_pfn)); 7944 memset(arch_zone_highest_possible_pfn, 0, 7945 sizeof(arch_zone_highest_possible_pfn)); 7946 7947 start_pfn = find_min_pfn_with_active_regions(); 7948 descending = arch_has_descending_max_zone_pfns(); 7949 7950 for (i = 0; i < MAX_NR_ZONES; i++) { 7951 if (descending) 7952 zone = MAX_NR_ZONES - i - 1; 7953 else 7954 zone = i; 7955 7956 if (zone == ZONE_MOVABLE) 7957 continue; 7958 7959 end_pfn = max(max_zone_pfn[zone], start_pfn); 7960 arch_zone_lowest_possible_pfn[zone] = start_pfn; 7961 arch_zone_highest_possible_pfn[zone] = end_pfn; 7962 7963 start_pfn = end_pfn; 7964 } 7965 7966 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 7967 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 7968 find_zone_movable_pfns_for_nodes(); 7969 7970 /* Print out the zone ranges */ 7971 pr_info("Zone ranges:\n"); 7972 for (i = 0; i < MAX_NR_ZONES; i++) { 7973 if (i == ZONE_MOVABLE) 7974 continue; 7975 pr_info(" %-8s ", zone_names[i]); 7976 if (arch_zone_lowest_possible_pfn[i] == 7977 arch_zone_highest_possible_pfn[i]) 7978 pr_cont("empty\n"); 7979 else 7980 pr_cont("[mem %#018Lx-%#018Lx]\n", 7981 (u64)arch_zone_lowest_possible_pfn[i] 7982 << PAGE_SHIFT, 7983 ((u64)arch_zone_highest_possible_pfn[i] 7984 << PAGE_SHIFT) - 1); 7985 } 7986 7987 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 7988 pr_info("Movable zone start for each node\n"); 7989 for (i = 0; i < MAX_NUMNODES; i++) { 7990 if (zone_movable_pfn[i]) 7991 pr_info(" Node %d: %#018Lx\n", i, 7992 (u64)zone_movable_pfn[i] << PAGE_SHIFT); 7993 } 7994 7995 /* 7996 * Print out the early node map, and initialize the 7997 * subsection-map relative to active online memory ranges to 7998 * enable future "sub-section" extensions of the memory map. 
7999 */ 8000 pr_info("Early memory node ranges\n"); 8001 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 8002 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, 8003 (u64)start_pfn << PAGE_SHIFT, 8004 ((u64)end_pfn << PAGE_SHIFT) - 1); 8005 subsection_map_init(start_pfn, end_pfn - start_pfn); 8006 } 8007 8008 /* Initialise every node */ 8009 mminit_verify_pageflags_layout(); 8010 setup_nr_node_ids(); 8011 for_each_online_node(nid) { 8012 pg_data_t *pgdat = NODE_DATA(nid); 8013 free_area_init_node(nid); 8014 8015 /* Any memory on that node */ 8016 if (pgdat->node_present_pages) 8017 node_set_state(nid, N_MEMORY); 8018 check_for_memory(pgdat, nid); 8019 } 8020 8021 memmap_init(); 8022 } 8023 8024 static int __init cmdline_parse_core(char *p, unsigned long *core, 8025 unsigned long *percent) 8026 { 8027 unsigned long long coremem; 8028 char *endptr; 8029 8030 if (!p) 8031 return -EINVAL; 8032 8033 /* Value may be a percentage of total memory, otherwise bytes */ 8034 coremem = simple_strtoull(p, &endptr, 0); 8035 if (*endptr == '%') { 8036 /* Paranoid check for percent values greater than 100 */ 8037 WARN_ON(coremem > 100); 8038 8039 *percent = coremem; 8040 } else { 8041 coremem = memparse(p, &p); 8042 /* Paranoid check that UL is enough for the coremem value */ 8043 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 8044 8045 *core = coremem >> PAGE_SHIFT; 8046 *percent = 0UL; 8047 } 8048 return 0; 8049 } 8050 8051 /* 8052 * kernelcore=size sets the amount of memory for use for allocations that 8053 * cannot be reclaimed or migrated. 8054 */ 8055 static int __init cmdline_parse_kernelcore(char *p) 8056 { 8057 /* parse kernelcore=mirror */ 8058 if (parse_option_str(p, "mirror")) { 8059 mirrored_kernelcore = true; 8060 return 0; 8061 } 8062 8063 return cmdline_parse_core(p, &required_kernelcore, 8064 &required_kernelcore_percent); 8065 } 8066 8067 /* 8068 * movablecore=size sets the amount of memory for use for allocations that 8069 * can be reclaimed or migrated. 8070 */ 8071 static int __init cmdline_parse_movablecore(char *p) 8072 { 8073 return cmdline_parse_core(p, &required_movablecore, 8074 &required_movablecore_percent); 8075 } 8076 8077 early_param("kernelcore", cmdline_parse_kernelcore); 8078 early_param("movablecore", cmdline_parse_movablecore); 8079 8080 void adjust_managed_page_count(struct page *page, long count) 8081 { 8082 atomic_long_add(count, &page_zone(page)->managed_pages); 8083 totalram_pages_add(count); 8084 #ifdef CONFIG_HIGHMEM 8085 if (PageHighMem(page)) 8086 totalhigh_pages_add(count); 8087 #endif 8088 } 8089 EXPORT_SYMBOL(adjust_managed_page_count); 8090 8091 unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) 8092 { 8093 void *pos; 8094 unsigned long pages = 0; 8095 8096 start = (void *)PAGE_ALIGN((unsigned long)start); 8097 end = (void *)((unsigned long)end & PAGE_MASK); 8098 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 8099 struct page *page = virt_to_page(pos); 8100 void *direct_map_addr; 8101 8102 /* 8103 * 'direct_map_addr' might be different from 'pos' 8104 * because some architectures' virt_to_page() 8105 * work with aliases. Getting the direct map 8106 * address ensures that we get a _writeable_ 8107 * alias for the memset(). 8108 */ 8109 direct_map_addr = page_address(page); 8110 /* 8111 * Perform a kasan-unchecked memset() since this memory 8112 * has not been initialized. 
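 * (Illustrative: callers may pass a poison pattern such as
 * POISON_FREE_INITMEM so that stale references into the freed range show
 * up quickly, while a negative poison value skips the memset below
 * entirely.)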
8113 */ 8114 direct_map_addr = kasan_reset_tag(direct_map_addr); 8115 if ((unsigned int)poison <= 0xFF) 8116 memset(direct_map_addr, poison, PAGE_SIZE); 8117 8118 free_reserved_page(page); 8119 } 8120 8121 if (pages && s) 8122 pr_info("Freeing %s memory: %ldK\n", 8123 s, pages << (PAGE_SHIFT - 10)); 8124 8125 return pages; 8126 } 8127 8128 void __init mem_init_print_info(void) 8129 { 8130 unsigned long physpages, codesize, datasize, rosize, bss_size; 8131 unsigned long init_code_size, init_data_size; 8132 8133 physpages = get_num_physpages(); 8134 codesize = _etext - _stext; 8135 datasize = _edata - _sdata; 8136 rosize = __end_rodata - __start_rodata; 8137 bss_size = __bss_stop - __bss_start; 8138 init_data_size = __init_end - __init_begin; 8139 init_code_size = _einittext - _sinittext; 8140 8141 /* 8142 * Detect special cases and adjust section sizes accordingly: 8143 * 1) .init.* may be embedded into .data sections 8144 * 2) .init.text.* may be out of [__init_begin, __init_end], 8145 * please refer to arch/tile/kernel/vmlinux.lds.S. 8146 * 3) .rodata.* may be embedded into .text or .data sections. 8147 */ 8148 #define adj_init_size(start, end, size, pos, adj) \ 8149 do { \ 8150 if (start <= pos && pos < end && size > adj) \ 8151 size -= adj; \ 8152 } while (0) 8153 8154 adj_init_size(__init_begin, __init_end, init_data_size, 8155 _sinittext, init_code_size); 8156 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 8157 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 8158 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 8159 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 8160 8161 #undef adj_init_size 8162 8163 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" 8164 #ifdef CONFIG_HIGHMEM 8165 ", %luK highmem" 8166 #endif 8167 ")\n", 8168 nr_free_pages() << (PAGE_SHIFT - 10), 8169 physpages << (PAGE_SHIFT - 10), 8170 codesize >> 10, datasize >> 10, rosize >> 10, 8171 (init_data_size + init_code_size) >> 10, bss_size >> 10, 8172 (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), 8173 totalcma_pages << (PAGE_SHIFT - 10) 8174 #ifdef CONFIG_HIGHMEM 8175 , totalhigh_pages() << (PAGE_SHIFT - 10) 8176 #endif 8177 ); 8178 } 8179 8180 /** 8181 * set_dma_reserve - set the specified number of pages reserved in the first zone 8182 * @new_dma_reserve: The number of pages to mark reserved 8183 * 8184 * The per-cpu batchsize and zone watermarks are determined by managed_pages. 8185 * In the DMA zone, a significant percentage may be consumed by kernel image 8186 * and other unfreeable allocations which can skew the watermarks badly. This 8187 * function may optionally be used to account for unfreeable pages in the 8188 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 8189 * smaller per-cpu batchsize. 8190 */ 8191 void __init set_dma_reserve(unsigned long new_dma_reserve) 8192 { 8193 dma_reserve = new_dma_reserve; 8194 } 8195 8196 static int page_alloc_cpu_dead(unsigned int cpu) 8197 { 8198 struct zone *zone; 8199 8200 lru_add_drain_cpu(cpu); 8201 drain_pages(cpu); 8202 8203 /* 8204 * Spill the event counters of the dead processor 8205 * into the current processors event counters. 8206 * This artificially elevates the count of the current 8207 * processor. 
8208 */ 8209 vm_events_fold_cpu(cpu); 8210 8211 /* 8212 * Zero the differential counters of the dead processor 8213 * so that the vm statistics are consistent. 8214 * 8215 * This is only okay since the processor is dead and cannot 8216 * race with what we are doing. 8217 */ 8218 cpu_vm_stats_fold(cpu); 8219 8220 for_each_populated_zone(zone) 8221 zone_pcp_update(zone, 0); 8222 8223 return 0; 8224 } 8225 8226 static int page_alloc_cpu_online(unsigned int cpu) 8227 { 8228 struct zone *zone; 8229 8230 for_each_populated_zone(zone) 8231 zone_pcp_update(zone, 1); 8232 return 0; 8233 } 8234 8235 #ifdef CONFIG_NUMA 8236 int hashdist = HASHDIST_DEFAULT; 8237 8238 static int __init set_hashdist(char *str) 8239 { 8240 if (!str) 8241 return 0; 8242 hashdist = simple_strtoul(str, &str, 0); 8243 return 1; 8244 } 8245 __setup("hashdist=", set_hashdist); 8246 #endif 8247 8248 void __init page_alloc_init(void) 8249 { 8250 int ret; 8251 8252 #ifdef CONFIG_NUMA 8253 if (num_node_state(N_MEMORY) == 1) 8254 hashdist = 0; 8255 #endif 8256 8257 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC, 8258 "mm/page_alloc:pcp", 8259 page_alloc_cpu_online, 8260 page_alloc_cpu_dead); 8261 WARN_ON(ret < 0); 8262 } 8263 8264 /* 8265 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 8266 * or min_free_kbytes changes. 8267 */ 8268 static void calculate_totalreserve_pages(void) 8269 { 8270 struct pglist_data *pgdat; 8271 unsigned long reserve_pages = 0; 8272 enum zone_type i, j; 8273 8274 for_each_online_pgdat(pgdat) { 8275 8276 pgdat->totalreserve_pages = 0; 8277 8278 for (i = 0; i < MAX_NR_ZONES; i++) { 8279 struct zone *zone = pgdat->node_zones + i; 8280 long max = 0; 8281 unsigned long managed_pages = zone_managed_pages(zone); 8282 8283 /* Find valid and maximum lowmem_reserve in the zone */ 8284 for (j = i; j < MAX_NR_ZONES; j++) { 8285 if (zone->lowmem_reserve[j] > max) 8286 max = zone->lowmem_reserve[j]; 8287 } 8288 8289 /* we treat the high watermark as reserved pages. */ 8290 max += high_wmark_pages(zone); 8291 8292 if (max > managed_pages) 8293 max = managed_pages; 8294 8295 pgdat->totalreserve_pages += max; 8296 8297 reserve_pages += max; 8298 } 8299 } 8300 totalreserve_pages = reserve_pages; 8301 } 8302 8303 /* 8304 * setup_per_zone_lowmem_reserve - called whenever 8305 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 8306 * has a correct pages reserved value, so an adequate number of 8307 * pages are left in the zone after a successful __alloc_pages(). 
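 * Worked example (illustrative, assuming a lowmem_reserve_ratio of 256 for
 * ZONE_DMA32): if the zones above ZONE_DMA32 manage 4194304 pages (16 GiB
 * with 4 KiB pages), ZONE_DMA32 keeps 4194304 / 256 = 16384 pages (64 MiB)
 * in reserve that allocations which could have been satisfied from the
 * higher zones are not allowed to consume.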
8308 */ 8309 static void setup_per_zone_lowmem_reserve(void) 8310 { 8311 struct pglist_data *pgdat; 8312 enum zone_type i, j; 8313 8314 for_each_online_pgdat(pgdat) { 8315 for (i = 0; i < MAX_NR_ZONES - 1; i++) { 8316 struct zone *zone = &pgdat->node_zones[i]; 8317 int ratio = sysctl_lowmem_reserve_ratio[i]; 8318 bool clear = !ratio || !zone_managed_pages(zone); 8319 unsigned long managed_pages = 0; 8320 8321 for (j = i + 1; j < MAX_NR_ZONES; j++) { 8322 struct zone *upper_zone = &pgdat->node_zones[j]; 8323 8324 managed_pages += zone_managed_pages(upper_zone); 8325 8326 if (clear) 8327 zone->lowmem_reserve[j] = 0; 8328 else 8329 zone->lowmem_reserve[j] = managed_pages / ratio; 8330 } 8331 } 8332 } 8333 8334 /* update totalreserve_pages */ 8335 calculate_totalreserve_pages(); 8336 } 8337 8338 static void __setup_per_zone_wmarks(void) 8339 { 8340 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 8341 unsigned long lowmem_pages = 0; 8342 struct zone *zone; 8343 unsigned long flags; 8344 8345 /* Calculate total number of !ZONE_HIGHMEM pages */ 8346 for_each_zone(zone) { 8347 if (!is_highmem(zone)) 8348 lowmem_pages += zone_managed_pages(zone); 8349 } 8350 8351 for_each_zone(zone) { 8352 u64 tmp; 8353 8354 spin_lock_irqsave(&zone->lock, flags); 8355 tmp = (u64)pages_min * zone_managed_pages(zone); 8356 do_div(tmp, lowmem_pages); 8357 if (is_highmem(zone)) { 8358 /* 8359 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 8360 * need highmem pages, so cap pages_min to a small 8361 * value here. 8362 * 8363 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 8364 * deltas control async page reclaim, and so should 8365 * not be capped for highmem. 8366 */ 8367 unsigned long min_pages; 8368 8369 min_pages = zone_managed_pages(zone) / 1024; 8370 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 8371 zone->_watermark[WMARK_MIN] = min_pages; 8372 } else { 8373 /* 8374 * If it's a lowmem zone, reserve a number of pages 8375 * proportionate to the zone's size. 8376 */ 8377 zone->_watermark[WMARK_MIN] = tmp; 8378 } 8379 8380 /* 8381 * Set the kswapd watermarks distance according to the 8382 * scale factor in proportion to available memory, but 8383 * ensure a minimum size on small systems. 8384 */ 8385 tmp = max_t(u64, tmp >> 2, 8386 mult_frac(zone_managed_pages(zone), 8387 watermark_scale_factor, 10000)); 8388 8389 zone->watermark_boost = 0; 8390 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; 8391 zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; 8392 8393 spin_unlock_irqrestore(&zone->lock, flags); 8394 } 8395 8396 /* update totalreserve_pages */ 8397 calculate_totalreserve_pages(); 8398 } 8399 8400 /** 8401 * setup_per_zone_wmarks - called when min_free_kbytes changes 8402 * or when memory is hot-{added|removed} 8403 * 8404 * Ensures that the watermark[min,low,high] values for each zone are set 8405 * correctly with respect to min_free_kbytes. 8406 */ 8407 void setup_per_zone_wmarks(void) 8408 { 8409 struct zone *zone; 8410 static DEFINE_SPINLOCK(lock); 8411 8412 spin_lock(&lock); 8413 __setup_per_zone_wmarks(); 8414 spin_unlock(&lock); 8415 8416 /* 8417 * The watermark size have changed so update the pcpu batch 8418 * and high limits or the limits may be inappropriate. 8419 */ 8420 for_each_zone(zone) 8421 zone_pcp_update(zone, 0); 8422 } 8423 8424 /* 8425 * Initialise min_free_kbytes. 8426 * 8427 * For small machines we want it small (128k min). For large machines 8428 * we want it large (256MB max). 
But it is not linear, because network 8429 * bandwidth does not increase linearly with machine size. We use 8430 * 8431 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 8432 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 8433 * 8434 * which yields 8435 * 8436 * 16MB: 512k 8437 * 32MB: 724k 8438 * 64MB: 1024k 8439 * 128MB: 1448k 8440 * 256MB: 2048k 8441 * 512MB: 2896k 8442 * 1024MB: 4096k 8443 * 2048MB: 5792k 8444 * 4096MB: 8192k 8445 * 8192MB: 11584k 8446 * 16384MB: 16384k 8447 */ 8448 int __meminit init_per_zone_wmark_min(void) 8449 { 8450 unsigned long lowmem_kbytes; 8451 int new_min_free_kbytes; 8452 8453 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 8454 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 8455 8456 if (new_min_free_kbytes > user_min_free_kbytes) { 8457 min_free_kbytes = new_min_free_kbytes; 8458 if (min_free_kbytes < 128) 8459 min_free_kbytes = 128; 8460 if (min_free_kbytes > 262144) 8461 min_free_kbytes = 262144; 8462 } else { 8463 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 8464 new_min_free_kbytes, user_min_free_kbytes); 8465 } 8466 setup_per_zone_wmarks(); 8467 refresh_zone_stat_thresholds(); 8468 setup_per_zone_lowmem_reserve(); 8469 8470 #ifdef CONFIG_NUMA 8471 setup_min_unmapped_ratio(); 8472 setup_min_slab_ratio(); 8473 #endif 8474 8475 khugepaged_min_free_kbytes_update(); 8476 8477 return 0; 8478 } 8479 postcore_initcall(init_per_zone_wmark_min) 8480 8481 /* 8482 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 8483 * that we can call two helper functions whenever min_free_kbytes 8484 * changes. 8485 */ 8486 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, 8487 void *buffer, size_t *length, loff_t *ppos) 8488 { 8489 int rc; 8490 8491 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8492 if (rc) 8493 return rc; 8494 8495 if (write) { 8496 user_min_free_kbytes = min_free_kbytes; 8497 setup_per_zone_wmarks(); 8498 } 8499 return 0; 8500 } 8501 8502 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, 8503 void *buffer, size_t *length, loff_t *ppos) 8504 { 8505 int rc; 8506 8507 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8508 if (rc) 8509 return rc; 8510 8511 if (write) 8512 setup_per_zone_wmarks(); 8513 8514 return 0; 8515 } 8516 8517 #ifdef CONFIG_NUMA 8518 static void setup_min_unmapped_ratio(void) 8519 { 8520 pg_data_t *pgdat; 8521 struct zone *zone; 8522 8523 for_each_online_pgdat(pgdat) 8524 pgdat->min_unmapped_pages = 0; 8525 8526 for_each_zone(zone) 8527 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * 8528 sysctl_min_unmapped_ratio) / 100; 8529 } 8530 8531 8532 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, 8533 void *buffer, size_t *length, loff_t *ppos) 8534 { 8535 int rc; 8536 8537 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 8538 if (rc) 8539 return rc; 8540 8541 setup_min_unmapped_ratio(); 8542 8543 return 0; 8544 } 8545 8546 static void setup_min_slab_ratio(void) 8547 { 8548 pg_data_t *pgdat; 8549 struct zone *zone; 8550 8551 for_each_online_pgdat(pgdat) 8552 pgdat->min_slab_pages = 0; 8553 8554 for_each_zone(zone) 8555 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * 8556 sysctl_min_slab_ratio) / 100; 8557 } 8558 8559 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 8560 void *buffer, size_t *length, loff_t *ppos) 8561 { 8562 int rc; 8563 8564 rc = 
proc_dointvec_minmax(table, write, buffer, length, ppos); 8565 if (rc) 8566 return rc; 8567 8568 setup_min_slab_ratio(); 8569 8570 return 0; 8571 } 8572 #endif 8573 8574 /* 8575 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 8576 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 8577 * whenever sysctl_lowmem_reserve_ratio changes. 8578 * 8579 * The reserve ratio obviously has absolutely no relation with the 8580 * minimum watermarks. The lowmem reserve ratio can only make sense 8581 * if in function of the boot time zone sizes. 8582 */ 8583 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, 8584 void *buffer, size_t *length, loff_t *ppos) 8585 { 8586 int i; 8587 8588 proc_dointvec_minmax(table, write, buffer, length, ppos); 8589 8590 for (i = 0; i < MAX_NR_ZONES; i++) { 8591 if (sysctl_lowmem_reserve_ratio[i] < 1) 8592 sysctl_lowmem_reserve_ratio[i] = 0; 8593 } 8594 8595 setup_per_zone_lowmem_reserve(); 8596 return 0; 8597 } 8598 8599 /* 8600 * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each 8601 * cpu. It is the fraction of total pages in each zone that a hot per cpu 8602 * pagelist can have before it gets flushed back to buddy allocator. 8603 */ 8604 int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, 8605 int write, void *buffer, size_t *length, loff_t *ppos) 8606 { 8607 struct zone *zone; 8608 int old_percpu_pagelist_high_fraction; 8609 int ret; 8610 8611 mutex_lock(&pcp_batch_high_lock); 8612 old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; 8613 8614 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 8615 if (!write || ret < 0) 8616 goto out; 8617 8618 /* Sanity checking to avoid pcp imbalance */ 8619 if (percpu_pagelist_high_fraction && 8620 percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) { 8621 percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction; 8622 ret = -EINVAL; 8623 goto out; 8624 } 8625 8626 /* No change? */ 8627 if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction) 8628 goto out; 8629 8630 for_each_populated_zone(zone) 8631 zone_set_pageset_high_and_batch(zone, 0); 8632 out: 8633 mutex_unlock(&pcp_batch_high_lock); 8634 return ret; 8635 } 8636 8637 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES 8638 /* 8639 * Returns the number of pages that arch has reserved but 8640 * is not known to alloc_large_system_hash(). 8641 */ 8642 static unsigned long __init arch_reserved_kernel_pages(void) 8643 { 8644 return 0; 8645 } 8646 #endif 8647 8648 /* 8649 * Adaptive scale is meant to reduce sizes of hash tables on large memory 8650 * machines. As memory size is increased the scale is also increased but at 8651 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory 8652 * quadruples the scale is increased by one, which means the size of hash table 8653 * only doubles, instead of quadrupling as well. 8654 * Because 32-bit systems cannot have large physical memory, where this scaling 8655 * makes sense, it is disabled on such platforms. 
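 * Worked example (illustrative, 4 KiB pages): with 256 GiB of memory the
 * loop below bumps scale once (one quadrupling past ADAPT_SCALE_BASE), so
 * the table gets half the entries that the non-adaptive sizing would give;
 * with 1 TiB it is bumped twice, giving a quarter.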
8656 */ 8657 #if __BITS_PER_LONG > 32 8658 #define ADAPT_SCALE_BASE (64ul << 30) 8659 #define ADAPT_SCALE_SHIFT 2 8660 #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) 8661 #endif 8662 8663 /* 8664 * allocate a large system hash table from bootmem 8665 * - it is assumed that the hash table must contain an exact power-of-2 8666 * quantity of entries 8667 * - limit is the number of hash buckets, not the total allocation size 8668 */ 8669 void *__init alloc_large_system_hash(const char *tablename, 8670 unsigned long bucketsize, 8671 unsigned long numentries, 8672 int scale, 8673 int flags, 8674 unsigned int *_hash_shift, 8675 unsigned int *_hash_mask, 8676 unsigned long low_limit, 8677 unsigned long high_limit) 8678 { 8679 unsigned long long max = high_limit; 8680 unsigned long log2qty, size; 8681 void *table = NULL; 8682 gfp_t gfp_flags; 8683 bool virt; 8684 bool huge; 8685 8686 /* allow the kernel cmdline to have a say */ 8687 if (!numentries) { 8688 /* round applicable memory size up to nearest megabyte */ 8689 numentries = nr_kernel_pages; 8690 numentries -= arch_reserved_kernel_pages(); 8691 8692 /* It isn't necessary when PAGE_SIZE >= 1MB */ 8693 if (PAGE_SHIFT < 20) 8694 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 8695 8696 #if __BITS_PER_LONG > 32 8697 if (!high_limit) { 8698 unsigned long adapt; 8699 8700 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; 8701 adapt <<= ADAPT_SCALE_SHIFT) 8702 scale++; 8703 } 8704 #endif 8705 8706 /* limit to 1 bucket per 2^scale bytes of low memory */ 8707 if (scale > PAGE_SHIFT) 8708 numentries >>= (scale - PAGE_SHIFT); 8709 else 8710 numentries <<= (PAGE_SHIFT - scale); 8711 8712 /* Make sure we've got at least a 0-order allocation.. */ 8713 if (unlikely(flags & HASH_SMALL)) { 8714 /* Makes no sense without HASH_EARLY */ 8715 WARN_ON(!(flags & HASH_EARLY)); 8716 if (!(numentries >> *_hash_shift)) { 8717 numentries = 1UL << *_hash_shift; 8718 BUG_ON(!numentries); 8719 } 8720 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 8721 numentries = PAGE_SIZE / bucketsize; 8722 } 8723 numentries = roundup_pow_of_two(numentries); 8724 8725 /* limit allocation size to 1/16 total memory by default */ 8726 if (max == 0) { 8727 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 8728 do_div(max, bucketsize); 8729 } 8730 max = min(max, 0x80000000ULL); 8731 8732 if (numentries < low_limit) 8733 numentries = low_limit; 8734 if (numentries > max) 8735 numentries = max; 8736 8737 log2qty = ilog2(numentries); 8738 8739 gfp_flags = (flags & HASH_ZERO) ? 
GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 8740 do { 8741 virt = false; 8742 size = bucketsize << log2qty; 8743 if (flags & HASH_EARLY) { 8744 if (flags & HASH_ZERO) 8745 table = memblock_alloc(size, SMP_CACHE_BYTES); 8746 else 8747 table = memblock_alloc_raw(size, 8748 SMP_CACHE_BYTES); 8749 } else if (get_order(size) >= MAX_ORDER || hashdist) { 8750 table = __vmalloc(size, gfp_flags); 8751 virt = true; 8752 huge = is_vm_area_hugepages(table); 8753 } else { 8754 /* 8755 * If bucketsize is not a power-of-two, we may free 8756 * some pages at the end of hash table which 8757 * alloc_pages_exact() automatically does 8758 */ 8759 table = alloc_pages_exact(size, gfp_flags); 8760 kmemleak_alloc(table, size, 1, gfp_flags); 8761 } 8762 } while (!table && size > PAGE_SIZE && --log2qty); 8763 8764 if (!table) 8765 panic("Failed to allocate %s hash table\n", tablename); 8766 8767 pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", 8768 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, 8769 virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); 8770 8771 if (_hash_shift) 8772 *_hash_shift = log2qty; 8773 if (_hash_mask) 8774 *_hash_mask = (1 << log2qty) - 1; 8775 8776 return table; 8777 } 8778 8779 /* 8780 * This function checks whether pageblock includes unmovable pages or not. 8781 * 8782 * PageLRU check without isolation or lru_lock could race so that 8783 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable 8784 * check without lock_page also may miss some movable non-lru pages at 8785 * race condition. So you can't expect this function should be exact. 8786 * 8787 * Returns a page without holding a reference. If the caller wants to 8788 * dereference that page (e.g., dumping), it has to make sure that it 8789 * cannot get removed (e.g., via memory unplug) concurrently. 8790 * 8791 */ 8792 struct page *has_unmovable_pages(struct zone *zone, struct page *page, 8793 int migratetype, int flags) 8794 { 8795 unsigned long iter = 0; 8796 unsigned long pfn = page_to_pfn(page); 8797 unsigned long offset = pfn % pageblock_nr_pages; 8798 8799 if (is_migrate_cma_page(page)) { 8800 /* 8801 * CMA allocations (alloc_contig_range) really need to mark 8802 * isolate CMA pageblocks even when they are not movable in fact 8803 * so consider them movable here. 8804 */ 8805 if (is_migrate_cma(migratetype)) 8806 return NULL; 8807 8808 return page; 8809 } 8810 8811 for (; iter < pageblock_nr_pages - offset; iter++) { 8812 if (!pfn_valid_within(pfn + iter)) 8813 continue; 8814 8815 page = pfn_to_page(pfn + iter); 8816 8817 /* 8818 * Both, bootmem allocations and memory holes are marked 8819 * PG_reserved and are unmovable. We can even have unmovable 8820 * allocations inside ZONE_MOVABLE, for example when 8821 * specifying "movablecore". 8822 */ 8823 if (PageReserved(page)) 8824 return page; 8825 8826 /* 8827 * If the zone is movable and we have ruled out all reserved 8828 * pages then it should be reasonably safe to assume the rest 8829 * is movable. 8830 */ 8831 if (zone_idx(zone) == ZONE_MOVABLE) 8832 continue; 8833 8834 /* 8835 * Hugepages are not in LRU lists, but they're movable. 8836 * THPs are on the LRU, but need to be counted as #small pages. 8837 * We need not scan over tail pages because we don't 8838 * handle each tail page individually in migration. 
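 * For example (illustrative): if the scan lands on the third tail page of
 * a 512-page THP, skip_pages = compound_nr(head) - (page - head) = 509 and
 * the loop below advances straight past the rest of that compound page.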
8839 */ 8840 if (PageHuge(page) || PageTransCompound(page)) { 8841 struct page *head = compound_head(page); 8842 unsigned int skip_pages; 8843 8844 if (PageHuge(page)) { 8845 if (!hugepage_migration_supported(page_hstate(head))) 8846 return page; 8847 } else if (!PageLRU(head) && !__PageMovable(head)) { 8848 return page; 8849 } 8850 8851 skip_pages = compound_nr(head) - (page - head); 8852 iter += skip_pages - 1; 8853 continue; 8854 } 8855 8856 /* 8857 * We can't use page_count without pin a page 8858 * because another CPU can free compound page. 8859 * This check already skips compound tails of THP 8860 * because their page->_refcount is zero at all time. 8861 */ 8862 if (!page_ref_count(page)) { 8863 if (PageBuddy(page)) 8864 iter += (1 << buddy_order(page)) - 1; 8865 continue; 8866 } 8867 8868 /* 8869 * The HWPoisoned page may be not in buddy system, and 8870 * page_count() is not 0. 8871 */ 8872 if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) 8873 continue; 8874 8875 /* 8876 * We treat all PageOffline() pages as movable when offlining 8877 * to give drivers a chance to decrement their reference count 8878 * in MEM_GOING_OFFLINE in order to indicate that these pages 8879 * can be offlined as there are no direct references anymore. 8880 * For actually unmovable PageOffline() where the driver does 8881 * not support this, we will fail later when trying to actually 8882 * move these pages that still have a reference count > 0. 8883 * (false negatives in this function only) 8884 */ 8885 if ((flags & MEMORY_OFFLINE) && PageOffline(page)) 8886 continue; 8887 8888 if (__PageMovable(page) || PageLRU(page)) 8889 continue; 8890 8891 /* 8892 * If there are RECLAIMABLE pages, we need to check 8893 * it. But now, memory offline itself doesn't call 8894 * shrink_node_slabs() and it still to be fixed. 8895 */ 8896 return page; 8897 } 8898 return NULL; 8899 } 8900 8901 #ifdef CONFIG_CONTIG_ALLOC 8902 static unsigned long pfn_max_align_down(unsigned long pfn) 8903 { 8904 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 8905 pageblock_nr_pages) - 1); 8906 } 8907 8908 static unsigned long pfn_max_align_up(unsigned long pfn) 8909 { 8910 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 8911 pageblock_nr_pages)); 8912 } 8913 8914 #if defined(CONFIG_DYNAMIC_DEBUG) || \ 8915 (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) 8916 /* Usage: See admin-guide/dynamic-debug-howto.rst */ 8917 static void alloc_contig_dump_pages(struct list_head *page_list) 8918 { 8919 DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); 8920 8921 if (DYNAMIC_DEBUG_BRANCH(descriptor)) { 8922 struct page *page; 8923 8924 dump_stack(); 8925 list_for_each_entry(page, page_list, lru) 8926 dump_page(page, "migration failure"); 8927 } 8928 } 8929 #else 8930 static inline void alloc_contig_dump_pages(struct list_head *page_list) 8931 { 8932 } 8933 #endif 8934 8935 /* [start, end) must belong to a single zone. */ 8936 static int __alloc_contig_migrate_range(struct compact_control *cc, 8937 unsigned long start, unsigned long end) 8938 { 8939 /* This function is based on compact_zone() from compaction.c. 
*/ 8940 unsigned int nr_reclaimed; 8941 unsigned long pfn = start; 8942 unsigned int tries = 0; 8943 int ret = 0; 8944 struct migration_target_control mtc = { 8945 .nid = zone_to_nid(cc->zone), 8946 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 8947 }; 8948 8949 lru_cache_disable(); 8950 8951 while (pfn < end || !list_empty(&cc->migratepages)) { 8952 if (fatal_signal_pending(current)) { 8953 ret = -EINTR; 8954 break; 8955 } 8956 8957 if (list_empty(&cc->migratepages)) { 8958 cc->nr_migratepages = 0; 8959 ret = isolate_migratepages_range(cc, pfn, end); 8960 if (ret && ret != -EAGAIN) 8961 break; 8962 pfn = cc->migrate_pfn; 8963 tries = 0; 8964 } else if (++tries == 5) { 8965 ret = -EBUSY; 8966 break; 8967 } 8968 8969 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 8970 &cc->migratepages); 8971 cc->nr_migratepages -= nr_reclaimed; 8972 8973 ret = migrate_pages(&cc->migratepages, alloc_migration_target, 8974 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); 8975 8976 /* 8977 * On -ENOMEM, migrate_pages() bails out right away. It is pointless 8978 * to retry again over this error, so do the same here. 8979 */ 8980 if (ret == -ENOMEM) 8981 break; 8982 } 8983 8984 lru_cache_enable(); 8985 if (ret < 0) { 8986 if (ret == -EBUSY) 8987 alloc_contig_dump_pages(&cc->migratepages); 8988 putback_movable_pages(&cc->migratepages); 8989 return ret; 8990 } 8991 return 0; 8992 } 8993 8994 /** 8995 * alloc_contig_range() -- tries to allocate given range of pages 8996 * @start: start PFN to allocate 8997 * @end: one-past-the-last PFN to allocate 8998 * @migratetype: migratetype of the underlying pageblocks (either 8999 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 9000 * in range must have the same migratetype and it must 9001 * be either of the two. 9002 * @gfp_mask: GFP mask to use during compaction 9003 * 9004 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 9005 * aligned. The PFN range must belong to a single zone. 9006 * 9007 * The first thing this routine does is attempt to MIGRATE_ISOLATE all 9008 * pageblocks in the range. Once isolated, the pageblocks should not 9009 * be modified by others. 9010 * 9011 * Return: zero on success or negative error code. On success all 9012 * pages which PFN is in [start, end) are allocated for the caller and 9013 * need to be freed with free_contig_range(). 9014 */ 9015 int alloc_contig_range(unsigned long start, unsigned long end, 9016 unsigned migratetype, gfp_t gfp_mask) 9017 { 9018 unsigned long outer_start, outer_end; 9019 unsigned int order; 9020 int ret = 0; 9021 9022 struct compact_control cc = { 9023 .nr_migratepages = 0, 9024 .order = -1, 9025 .zone = page_zone(pfn_to_page(start)), 9026 .mode = MIGRATE_SYNC, 9027 .ignore_skip_hint = true, 9028 .no_set_skip_hint = true, 9029 .gfp_mask = current_gfp_context(gfp_mask), 9030 .alloc_contig = true, 9031 }; 9032 INIT_LIST_HEAD(&cc.migratepages); 9033 9034 /* 9035 * What we do here is we mark all pageblocks in range as 9036 * MIGRATE_ISOLATE. Because pageblock and max order pages may 9037 * have different sizes, and due to the way page allocator 9038 * work, we align the range to biggest of the two pages so 9039 * that page allocator won't try to merge buddies from 9040 * different pageblocks and change MIGRATE_ISOLATE to some 9041 * other migration type. 9042 * 9043 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 9044 * migrate the pages from an unaligned range (ie. pages that 9045 * we are interested in). 
This will put all the pages in 9046 * range back to page allocator as MIGRATE_ISOLATE. 9047 * 9048 * When this is done, we take the pages in range from page 9049 * allocator removing them from the buddy system. This way 9050 * page allocator will never consider using them. 9051 * 9052 * This lets us mark the pageblocks back as 9053 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 9054 * aligned range but not in the unaligned, original range are 9055 * put back to page allocator so that buddy can use them. 9056 */ 9057 9058 ret = start_isolate_page_range(pfn_max_align_down(start), 9059 pfn_max_align_up(end), migratetype, 0); 9060 if (ret) 9061 return ret; 9062 9063 drain_all_pages(cc.zone); 9064 9065 /* 9066 * In case of -EBUSY, we'd like to know which page causes problem. 9067 * So, just fall through. test_pages_isolated() has a tracepoint 9068 * which will report the busy page. 9069 * 9070 * It is possible that busy pages could become available before 9071 * the call to test_pages_isolated, and the range will actually be 9072 * allocated. So, if we fall through be sure to clear ret so that 9073 * -EBUSY is not accidentally used or returned to caller. 9074 */ 9075 ret = __alloc_contig_migrate_range(&cc, start, end); 9076 if (ret && ret != -EBUSY) 9077 goto done; 9078 ret = 0; 9079 9080 /* 9081 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 9082 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 9083 * more, all pages in [start, end) are free in page allocator. 9084 * What we are going to do is to allocate all pages from 9085 * [start, end) (that is remove them from page allocator). 9086 * 9087 * The only problem is that pages at the beginning and at the 9088 * end of interesting range may be not aligned with pages that 9089 * page allocator holds, ie. they can be part of higher order 9090 * pages. Because of this, we reserve the bigger range and 9091 * once this is done free the pages we are not interested in. 9092 * 9093 * We don't have to hold zone->lock here because the pages are 9094 * isolated thus they won't get removed from buddy. 9095 */ 9096 9097 order = 0; 9098 outer_start = start; 9099 while (!PageBuddy(pfn_to_page(outer_start))) { 9100 if (++order >= MAX_ORDER) { 9101 outer_start = start; 9102 break; 9103 } 9104 outer_start &= ~0UL << order; 9105 } 9106 9107 if (outer_start != start) { 9108 order = buddy_order(pfn_to_page(outer_start)); 9109 9110 /* 9111 * outer_start page could be small order buddy page and 9112 * it doesn't include start page. Adjust outer_start 9113 * in this case to report failed page properly 9114 * on tracepoint in test_pages_isolated() 9115 */ 9116 if (outer_start + (1UL << order) <= start) 9117 outer_start = start; 9118 } 9119 9120 /* Make sure the range is really isolated. */ 9121 if (test_pages_isolated(outer_start, end, 0)) { 9122 ret = -EBUSY; 9123 goto done; 9124 } 9125 9126 /* Grab isolated pages from freelists. 
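 * (Illustrative: if [start, end) is pfns [1000, 1200) but outer_start had
 * to drop to 768 because that is where the containing buddy page begins,
 * and isolate_freepages_range() returns outer_end = 1280, the head
 * [768, 1000) and tail [1200, 1280) are handed back below via
 * free_contig_range().)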
static int __alloc_contig_pages(unsigned long start_pfn,
				unsigned long nr_pages, gfp_t gfp_mask)
{
	unsigned long end_pfn = start_pfn + nr_pages;

	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
				  gfp_mask);
}

static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
				   unsigned long nr_pages)
{
	unsigned long i, end_pfn = start_pfn + nr_pages;
	struct page *page;

	for (i = start_pfn; i < end_pfn; i++) {
		page = pfn_to_online_page(i);
		if (!page)
			return false;

		if (page_zone(page) != z)
			return false;

		if (PageReserved(page))
			return false;
	}
	return true;
}

static bool zone_spans_last_pfn(const struct zone *zone,
				unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long last_pfn = start_pfn + nr_pages - 1;

	return zone_spans_pfn(zone, last_pfn);
}

/**
 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
 * @nr_pages:	Number of contiguous pages to allocate
 * @gfp_mask:	GFP mask to limit search and used during compaction
 * @nid:	Target node
 * @nodemask:	Mask for other possible nodes
 *
 * This routine is a wrapper around alloc_contig_range(). It scans over zones
 * on an applicable zonelist to find a contiguous pfn range which can then be
 * tried for allocation with alloc_contig_range(). This routine is intended
 * for allocation requests which cannot be fulfilled with the buddy allocator.
 *
 * The allocated memory is always aligned to a page boundary. If nr_pages is a
 * power of two, then the alignment is guaranteed to be to the given nr_pages
 * (e.g. a 1GB request would be aligned to 1GB).
 *
 * Allocated pages can be freed with free_contig_range() or by manually calling
 * __free_page() on each allocated page.
 *
 * Return: pointer to contiguous pages on success, or NULL if not successful.
 */
struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
				int nid, nodemask_t *nodemask)
{
	unsigned long ret, pfn, flags;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;

	zonelist = node_zonelist(nid, gfp_mask);
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(gfp_mask), nodemask) {
		spin_lock_irqsave(&zone->lock, flags);

		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
				/*
				 * We release the zone lock here because
				 * alloc_contig_range() will also lock the zone
				 * at some point.  If there's an allocation
				 * spinning on this lock, it may win the race
				 * and cause alloc_contig_range() to fail...
				 */
				spin_unlock_irqrestore(&zone->lock, flags);
				ret = __alloc_contig_pages(pfn, nr_pages,
							   gfp_mask);
				if (!ret)
					return pfn_to_page(pfn);
				spin_lock_irqsave(&zone->lock, flags);
			}
			pfn += nr_pages;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
	}
	return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */
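
/*
 * Illustrative sketch (not built): how a caller might request a large
 * physically contiguous buffer when CONFIG_CONTIG_ALLOC is enabled,
 * letting alloc_contig_pages() search the zonelists for a suitable PFN
 * range.  The helper name is hypothetical.
 */
#if 0
static int example_alloc_big_buffer(unsigned long nr_pages, int nid)
{
	struct page *page;

	page = alloc_contig_pages(nr_pages, GFP_KERNEL, nid, NULL);
	if (!page)
		return -ENOMEM;	/* no suitable contiguous range was found */

	/* ... use the nr_pages pages starting at page_to_pfn(page) ... */

	free_contig_range(page_to_pfn(page), nr_pages);
	return 0;
}
#endif
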
void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long count = 0;

	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		count += page_count(page) != 1;
		__free_page(page);
	}
	WARN(count != 0, "%lu pages are still in use!\n", count);
}
EXPORT_SYMBOL(free_contig_range);

/*
 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
 */
void zone_pcp_update(struct zone *zone, int cpu_online)
{
	mutex_lock(&pcp_batch_high_lock);
	zone_set_pageset_high_and_batch(zone, cpu_online);
	mutex_unlock(&pcp_batch_high_lock);
}

/*
 * Effectively disable pcplists for the zone by setting the high limit to 0
 * and draining all cpus.  A page freeing that is concurrently executing on
 * another CPU and is about to put the page on the pcplist will either finish
 * before the drain, in which case the page gets drained, or observe the new
 * high limit and skip the pcplist.
 *
 * Must be paired with a call to zone_pcp_enable().
 */
void zone_pcp_disable(struct zone *zone)
{
	mutex_lock(&pcp_batch_high_lock);
	__zone_set_pageset_high_and_batch(zone, 0, 1);
	__drain_all_pages(zone, true);
}

void zone_pcp_enable(struct zone *zone)
{
	__zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
	mutex_unlock(&pcp_batch_high_lock);
}

void zone_pcp_reset(struct zone *zone)
{
	int cpu;
	struct per_cpu_zonestat *pzstats;

	if (zone->per_cpu_pageset != &boot_pageset) {
		for_each_online_cpu(cpu) {
			pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
			drain_zonestat(zone, pzstats);
		}
		free_percpu(zone->per_cpu_pageset);
		free_percpu(zone->per_cpu_zonestats);
		zone->per_cpu_pageset = &boot_pageset;
		zone->per_cpu_zonestats = &boot_zonestats;
	}
}
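
/*
 * Illustrative sketch (not built): the disable/enable pair above is meant
 * to bracket code that must not race with pcplist allocations, e.g. page
 * isolation during memory offlining.  The helper name below is
 * hypothetical; only the call ordering matters.
 */
#if 0
static void example_pcp_quiesced_operation(struct zone *zone)
{
	zone_pcp_disable(zone);	/* takes pcp_batch_high_lock, drains pcplists */

	/* ... operate on the zone while freed pages bypass the pcplists ... */

	zone_pcp_enable(zone);	/* restores high/batch, releases the lock */
}
#endif
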
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be in a single zone, must not contain holes,
 * must span full sections, and must be isolated before calling this function.
 */
void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn = start_pfn;
	struct page *page;
	struct zone *zone;
	unsigned int order;
	unsigned long flags;

	offline_mem_sections(pfn, end_pfn);
	zone = page_zone(pfn_to_page(pfn));
	spin_lock_irqsave(&zone->lock, flags);
	while (pfn < end_pfn) {
		page = pfn_to_page(pfn);
		/*
		 * An HWPoisoned page may not be in the buddy system, and its
		 * page_count() may not be 0.
		 */
		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
			pfn++;
			continue;
		}
		/*
		 * At this point all remaining PageOffline() pages have a
		 * reference count of 0 and can simply be skipped.
		 */
		if (PageOffline(page)) {
			BUG_ON(page_count(page));
			BUG_ON(PageBuddy(page));
			pfn++;
			continue;
		}

		BUG_ON(page_count(page));
		BUG_ON(!PageBuddy(page));
		order = buddy_order(page);
		del_page_from_free_list(page, zone, order);
		pfn += (1 << order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif

bool is_free_buddy_page(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long flags;
	unsigned int order;

	spin_lock_irqsave(&zone->lock, flags);
	for (order = 0; order < MAX_ORDER; order++) {
		struct page *page_head = page - (pfn & ((1 << order) - 1));

		if (PageBuddy(page_head) && buddy_order(page_head) >= order)
			break;
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	return order < MAX_ORDER;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Break down a higher-order page into sub-pages, and keep our target page
 * out of the buddy allocator.
 */
static void break_down_buddy_pages(struct zone *zone, struct page *page,
				   struct page *target, int low, int high,
				   int migratetype)
{
	unsigned long size = 1 << high;
	struct page *current_buddy, *next_page;

	while (high > low) {
		high--;
		size >>= 1;

		if (target >= &page[size]) {
			next_page = page + size;
			current_buddy = page;
		} else {
			next_page = page;
			current_buddy = page + size;
		}

		if (set_page_guard(zone, current_buddy, high, migratetype))
			continue;

		if (current_buddy != target) {
			add_to_free_list(current_buddy, zone, high, migratetype);
			set_buddy_order(current_buddy, high);
			page = next_page;
		}
	}
}

/*
 * Take a page that will be marked as poisoned off the buddy allocator.
 */
bool take_page_off_buddy(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long flags;
	unsigned int order;
	bool ret = false;

	spin_lock_irqsave(&zone->lock, flags);
	for (order = 0; order < MAX_ORDER; order++) {
		struct page *page_head = page - (pfn & ((1 << order) - 1));
		int page_order = buddy_order(page_head);

		if (PageBuddy(page_head) && page_order >= order) {
			unsigned long pfn_head = page_to_pfn(page_head);
			int migratetype = get_pfnblock_migratetype(page_head,
								   pfn_head);

			del_page_from_free_list(page_head, zone, page_order);
			break_down_buddy_pages(zone, page_head, page, 0,
					       page_order, migratetype);
			if (!is_migrate_isolate(migratetype))
				__mod_zone_freepage_state(zone, -1, migratetype);
			ret = true;
			break;
		}
		if (page_count(page_head) > 0)
			break;
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	return ret;
}
#endif
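
/*
 * Illustrative sketch (not built): both is_free_buddy_page() and
 * take_page_off_buddy() locate the candidate head of the order-N buddy
 * block containing a page by clearing the low N bits of its pfn.  The
 * helper below only restates that arithmetic for clarity; its name is
 * hypothetical and it is not used anywhere.
 */
#if 0
static inline struct page *buddy_block_head(struct page *page,
					    unsigned long pfn,
					    unsigned int order)
{
	/*
	 * pfn & ((1 << order) - 1) is the page's offset within a naturally
	 * aligned 2^order block, so subtracting it yields the block head.
	 * E.g. pfn 0x1234d at order 3 belongs to the block headed by
	 * pfn 0x12348 (offset 5).
	 */
	return page - (pfn & ((1UL << order) - 1));
}
#endif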