1 /* 2 * linux/mm/page_alloc.c 3 * 4 * Manages the free list, the system allocates free pages here. 5 * Note that kmalloc() lives in slab.c 6 * 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 8 * Swap reorganised 29.12.95, Stephen Tweedie 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 15 */ 16 17 #include <linux/stddef.h> 18 #include <linux/mm.h> 19 #include <linux/swap.h> 20 #include <linux/interrupt.h> 21 #include <linux/pagemap.h> 22 #include <linux/jiffies.h> 23 #include <linux/bootmem.h> 24 #include <linux/memblock.h> 25 #include <linux/compiler.h> 26 #include <linux/kernel.h> 27 #include <linux/kmemcheck.h> 28 #include <linux/kasan.h> 29 #include <linux/module.h> 30 #include <linux/suspend.h> 31 #include <linux/pagevec.h> 32 #include <linux/blkdev.h> 33 #include <linux/slab.h> 34 #include <linux/ratelimit.h> 35 #include <linux/oom.h> 36 #include <linux/notifier.h> 37 #include <linux/topology.h> 38 #include <linux/sysctl.h> 39 #include <linux/cpu.h> 40 #include <linux/cpuset.h> 41 #include <linux/memory_hotplug.h> 42 #include <linux/nodemask.h> 43 #include <linux/vmalloc.h> 44 #include <linux/vmstat.h> 45 #include <linux/mempolicy.h> 46 #include <linux/memremap.h> 47 #include <linux/stop_machine.h> 48 #include <linux/sort.h> 49 #include <linux/pfn.h> 50 #include <linux/backing-dev.h> 51 #include <linux/fault-inject.h> 52 #include <linux/page-isolation.h> 53 #include <linux/page_ext.h> 54 #include <linux/debugobjects.h> 55 #include <linux/kmemleak.h> 56 #include <linux/compaction.h> 57 #include <trace/events/kmem.h> 58 #include <trace/events/oom.h> 59 #include <linux/prefetch.h> 60 #include <linux/mm_inline.h> 61 #include <linux/migrate.h> 62 #include <linux/hugetlb.h> 63 #include <linux/sched/rt.h> 64 #include <linux/sched/mm.h> 65 #include <linux/page_owner.h> 66 #include <linux/kthread.h> 67 #include <linux/memcontrol.h> 68 #include <linux/ftrace.h> 69 70 #include <asm/sections.h> 71 #include <asm/tlbflush.h> 72 #include <asm/div64.h> 73 #include "internal.h" 74 75 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 76 static DEFINE_MUTEX(pcp_batch_high_lock); 77 #define MIN_PERCPU_PAGELIST_FRACTION (8) 78 79 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 80 DEFINE_PER_CPU(int, numa_node); 81 EXPORT_PER_CPU_SYMBOL(numa_node); 82 #endif 83 84 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 85 /* 86 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 87 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 88 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 89 * defined in <linux/topology.h>. 90 */ 91 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 92 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 93 int _node_numa_mem_[MAX_NUMNODES]; 94 #endif 95 96 /* work_structs for global per-cpu drains */ 97 DEFINE_MUTEX(pcpu_drain_mutex); 98 DEFINE_PER_CPU(struct work_struct, pcpu_drain); 99 100 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 101 volatile unsigned long latent_entropy __latent_entropy; 102 EXPORT_SYMBOL(latent_entropy); 103 #endif 104 105 /* 106 * Array of node states. 
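 *
 * An illustrative note (not part of the original file): code elsewhere in the
 * kernel normally consults this array through node_state() and
 * for_each_node_state() rather than indexing it directly, e.g.
 *
 *	int nid;
 *
 *	for_each_node_state(nid, N_MEMORY)
 *		pr_info("node %d has memory\n", nid);
 *
 * page_alloc_init_late() later in this file uses the same N_MEMORY iteration
 * to start one deferred-init thread per node with memory.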
107 */ 108 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 109 [N_POSSIBLE] = NODE_MASK_ALL, 110 [N_ONLINE] = { { [0] = 1UL } }, 111 #ifndef CONFIG_NUMA 112 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 113 #ifdef CONFIG_HIGHMEM 114 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 115 #endif 116 #ifdef CONFIG_MOVABLE_NODE 117 [N_MEMORY] = { { [0] = 1UL } }, 118 #endif 119 [N_CPU] = { { [0] = 1UL } }, 120 #endif /* NUMA */ 121 }; 122 EXPORT_SYMBOL(node_states); 123 124 /* Protect totalram_pages and zone->managed_pages */ 125 static DEFINE_SPINLOCK(managed_page_count_lock); 126 127 unsigned long totalram_pages __read_mostly; 128 unsigned long totalreserve_pages __read_mostly; 129 unsigned long totalcma_pages __read_mostly; 130 131 int percpu_pagelist_fraction; 132 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 133 134 /* 135 * A cached value of the page's pageblock's migratetype, used when the page is 136 * put on a pcplist. Used to avoid the pageblock migratetype lookup when 137 * freeing from pcplists in most cases, at the cost of possibly becoming stale. 138 * Also the migratetype set in the page does not necessarily match the pcplist 139 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any 140 * other index - this ensures that it will be put on the correct CMA freelist. 141 */ 142 static inline int get_pcppage_migratetype(struct page *page) 143 { 144 return page->index; 145 } 146 147 static inline void set_pcppage_migratetype(struct page *page, int migratetype) 148 { 149 page->index = migratetype; 150 } 151 152 #ifdef CONFIG_PM_SLEEP 153 /* 154 * The following functions are used by the suspend/hibernate code to temporarily 155 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 156 * while devices are suspended. To avoid races with the suspend/hibernate code, 157 * they should always be called with pm_mutex held (gfp_allowed_mask also should 158 * only be modified with pm_mutex held, unless the suspend/hibernate code is 159 * guaranteed not to run in parallel with that modification). 
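 *
 * A minimal call-sequence sketch (roughly what the hibernation core does; the
 * exact locking helpers shown here are an assumption, not taken from this file):
 *
 *	lock_system_sleep();		// takes pm_mutex
 *	pm_restrict_gfp_mask();		// clears __GFP_IO | __GFP_FS
 *	...suspend devices / write the image without issuing new I/O...
 *	pm_restore_gfp_mask();		// restores the saved mask
 *	unlock_system_sleep();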
160 */ 161 162 static gfp_t saved_gfp_mask; 163 164 void pm_restore_gfp_mask(void) 165 { 166 WARN_ON(!mutex_is_locked(&pm_mutex)); 167 if (saved_gfp_mask) { 168 gfp_allowed_mask = saved_gfp_mask; 169 saved_gfp_mask = 0; 170 } 171 } 172 173 void pm_restrict_gfp_mask(void) 174 { 175 WARN_ON(!mutex_is_locked(&pm_mutex)); 176 WARN_ON(saved_gfp_mask); 177 saved_gfp_mask = gfp_allowed_mask; 178 gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); 179 } 180 181 bool pm_suspended_storage(void) 182 { 183 if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) 184 return false; 185 return true; 186 } 187 #endif /* CONFIG_PM_SLEEP */ 188 189 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 190 unsigned int pageblock_order __read_mostly; 191 #endif 192 193 static void __free_pages_ok(struct page *page, unsigned int order); 194 195 /* 196 * results with 256, 32 in the lowmem_reserve sysctl: 197 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 198 * 1G machine -> (16M dma, 784M normal, 224M high) 199 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 200 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 201 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 202 * 203 * TBD: should special case ZONE_DMA32 machines here - in those we normally 204 * don't need any ZONE_NORMAL reservation 205 */ 206 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 207 #ifdef CONFIG_ZONE_DMA 208 256, 209 #endif 210 #ifdef CONFIG_ZONE_DMA32 211 256, 212 #endif 213 #ifdef CONFIG_HIGHMEM 214 32, 215 #endif 216 32, 217 }; 218 219 EXPORT_SYMBOL(totalram_pages); 220 221 static char * const zone_names[MAX_NR_ZONES] = { 222 #ifdef CONFIG_ZONE_DMA 223 "DMA", 224 #endif 225 #ifdef CONFIG_ZONE_DMA32 226 "DMA32", 227 #endif 228 "Normal", 229 #ifdef CONFIG_HIGHMEM 230 "HighMem", 231 #endif 232 "Movable", 233 #ifdef CONFIG_ZONE_DEVICE 234 "Device", 235 #endif 236 }; 237 238 char * const migratetype_names[MIGRATE_TYPES] = { 239 "Unmovable", 240 "Movable", 241 "Reclaimable", 242 "HighAtomic", 243 #ifdef CONFIG_CMA 244 "CMA", 245 #endif 246 #ifdef CONFIG_MEMORY_ISOLATION 247 "Isolate", 248 #endif 249 }; 250 251 compound_page_dtor * const compound_page_dtors[] = { 252 NULL, 253 free_compound_page, 254 #ifdef CONFIG_HUGETLB_PAGE 255 free_huge_page, 256 #endif 257 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 258 free_transhuge_page, 259 #endif 260 }; 261 262 int min_free_kbytes = 1024; 263 int user_min_free_kbytes = -1; 264 int watermark_scale_factor = 10; 265 266 static unsigned long __meminitdata nr_kernel_pages; 267 static unsigned long __meminitdata nr_all_pages; 268 static unsigned long __meminitdata dma_reserve; 269 270 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 271 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 272 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 273 static unsigned long __initdata required_kernelcore; 274 static unsigned long __initdata required_movablecore; 275 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 276 static bool mirrored_kernelcore; 277 278 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 279 int movable_zone; 280 EXPORT_SYMBOL(movable_zone); 281 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 282 283 #if MAX_NUMNODES > 1 284 int nr_node_ids __read_mostly = MAX_NUMNODES; 285 int nr_online_nodes __read_mostly = 1; 286 EXPORT_SYMBOL(nr_node_ids); 287 EXPORT_SYMBOL(nr_online_nodes); 288 #endif 289 290 int page_group_by_mobility_disabled 
__read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
	unsigned long max_initialise;
	unsigned long reserved_lowmem;

	/*
	 * Initialise at least 2G of a node, but also take into account that
	 * two large system hashes may take up to 1GB for 0.25TB/node.
	 */
	max_initialise = max(2UL << (30 - PAGE_SHIFT),
		(pgdat->node_spanned_pages >> 8));

	/*
	 * Compensate for all the memblock reservations (e.g. crash kernel)
	 * from the initial estimation to make sure we will initialize enough
	 * memory to boot.
	 */
	reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
			pgdat->node_start_pfn + max_initialise);
	max_initialise += reserved_lowmem;

	pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
	pgdat->first_deferred_pfn = ULONG_MAX;
}

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}

/*
 * Returns false when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	/* Always populate low zones for address-constrained allocations */
	if (zone_end < pgdat_end_pfn(pgdat))
		return true;
	(*nr_initialised)++;
	if ((*nr_initialised > pgdat->static_init_size) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		pgdat->first_deferred_pfn = pfn;
		return false;
	}

	return true;
}
#else
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
}

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	return true;
}
#endif

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return __pfn_to_section(pfn)->pageblock_flags;
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest to retrieve
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
					unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
403 unsigned long *bitmap; 404 unsigned long bitidx, word_bitidx; 405 unsigned long word; 406 407 bitmap = get_pageblock_bitmap(page, pfn); 408 bitidx = pfn_to_bitidx(page, pfn); 409 word_bitidx = bitidx / BITS_PER_LONG; 410 bitidx &= (BITS_PER_LONG-1); 411 412 word = bitmap[word_bitidx]; 413 bitidx += end_bitidx; 414 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 415 } 416 417 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 418 unsigned long end_bitidx, 419 unsigned long mask) 420 { 421 return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); 422 } 423 424 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) 425 { 426 return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); 427 } 428 429 /** 430 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 431 * @page: The page within the block of interest 432 * @flags: The flags to set 433 * @pfn: The target page frame number 434 * @end_bitidx: The last bit of interest 435 * @mask: mask of bits that the caller is interested in 436 */ 437 void set_pfnblock_flags_mask(struct page *page, unsigned long flags, 438 unsigned long pfn, 439 unsigned long end_bitidx, 440 unsigned long mask) 441 { 442 unsigned long *bitmap; 443 unsigned long bitidx, word_bitidx; 444 unsigned long old_word, word; 445 446 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 447 448 bitmap = get_pageblock_bitmap(page, pfn); 449 bitidx = pfn_to_bitidx(page, pfn); 450 word_bitidx = bitidx / BITS_PER_LONG; 451 bitidx &= (BITS_PER_LONG-1); 452 453 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); 454 455 bitidx += end_bitidx; 456 mask <<= (BITS_PER_LONG - bitidx - 1); 457 flags <<= (BITS_PER_LONG - bitidx - 1); 458 459 word = READ_ONCE(bitmap[word_bitidx]); 460 for (;;) { 461 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 462 if (word == old_word) 463 break; 464 word = old_word; 465 } 466 } 467 468 void set_pageblock_migratetype(struct page *page, int migratetype) 469 { 470 if (unlikely(page_group_by_mobility_disabled && 471 migratetype < MIGRATE_PCPTYPES)) 472 migratetype = MIGRATE_UNMOVABLE; 473 474 set_pageblock_flags_group(page, (unsigned long)migratetype, 475 PB_migrate, PB_migrate_end); 476 } 477 478 #ifdef CONFIG_DEBUG_VM 479 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 480 { 481 int ret = 0; 482 unsigned seq; 483 unsigned long pfn = page_to_pfn(page); 484 unsigned long sp, start_pfn; 485 486 do { 487 seq = zone_span_seqbegin(zone); 488 start_pfn = zone->zone_start_pfn; 489 sp = zone->spanned_pages; 490 if (!zone_spans_pfn(zone, pfn)) 491 ret = 1; 492 } while (zone_span_seqretry(zone, seq)); 493 494 if (ret) 495 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", 496 pfn, zone_to_nid(zone), zone->name, 497 start_pfn, start_pfn + sp); 498 499 return ret; 500 } 501 502 static int page_is_consistent(struct zone *zone, struct page *page) 503 { 504 if (!pfn_valid_within(page_to_pfn(page))) 505 return 0; 506 if (zone != page_zone(page)) 507 return 0; 508 509 return 1; 510 } 511 /* 512 * Temporary debugging check for pages not lying within a given zone. 
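 * Callers treat it as a cheap sanity assertion: for example, __free_one_page()
 * and expand() below do
 *
 *	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 *
 * With CONFIG_DEBUG_VM disabled the stub version always returns 0, so the
 * check has no cost.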
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason,
		unsigned long bad_flags)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert(
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	__dump_page(page, reason);
	bad_flags &= page->flags;
	if (bad_flags)
		pr_alert("bad because of flags: %#lx(%pGp)\n",
						bad_flags, &bad_flags);
	dump_page_owner(page);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits are a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset into the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
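 *
 * A usage sketch (illustrative only, not taken from this file): a caller that
 * wants a compound page asks for __GFP_COMP and can then navigate it with the
 * compound helpers:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL | __GFP_COMP, 4);
 *
 *	if (page) {
 *		WARN_ON(compound_order(page) != 4);
 *		WARN_ON(compound_head(page + 3) != page);	// tail -> head
 *		__free_pages(page, 4);
 *	}
 *
 * prep_compound_page() below is what sets ->compound_head, ->compound_order
 * and the destructor when __GFP_COMP is passed for order > 0.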
587 */ 588 589 void free_compound_page(struct page *page) 590 { 591 __free_pages_ok(page, compound_order(page)); 592 } 593 594 void prep_compound_page(struct page *page, unsigned int order) 595 { 596 int i; 597 int nr_pages = 1 << order; 598 599 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); 600 set_compound_order(page, order); 601 __SetPageHead(page); 602 for (i = 1; i < nr_pages; i++) { 603 struct page *p = page + i; 604 set_page_count(p, 0); 605 p->mapping = TAIL_MAPPING; 606 set_compound_head(p, page); 607 } 608 atomic_set(compound_mapcount_ptr(page), -1); 609 } 610 611 #ifdef CONFIG_DEBUG_PAGEALLOC 612 unsigned int _debug_guardpage_minorder; 613 bool _debug_pagealloc_enabled __read_mostly 614 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); 615 EXPORT_SYMBOL(_debug_pagealloc_enabled); 616 bool _debug_guardpage_enabled __read_mostly; 617 618 static int __init early_debug_pagealloc(char *buf) 619 { 620 if (!buf) 621 return -EINVAL; 622 return kstrtobool(buf, &_debug_pagealloc_enabled); 623 } 624 early_param("debug_pagealloc", early_debug_pagealloc); 625 626 static bool need_debug_guardpage(void) 627 { 628 /* If we don't use debug_pagealloc, we don't need guard page */ 629 if (!debug_pagealloc_enabled()) 630 return false; 631 632 if (!debug_guardpage_minorder()) 633 return false; 634 635 return true; 636 } 637 638 static void init_debug_guardpage(void) 639 { 640 if (!debug_pagealloc_enabled()) 641 return; 642 643 if (!debug_guardpage_minorder()) 644 return; 645 646 _debug_guardpage_enabled = true; 647 } 648 649 struct page_ext_operations debug_guardpage_ops = { 650 .need = need_debug_guardpage, 651 .init = init_debug_guardpage, 652 }; 653 654 static int __init debug_guardpage_minorder_setup(char *buf) 655 { 656 unsigned long res; 657 658 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 659 pr_err("Bad debug_guardpage_minorder value\n"); 660 return 0; 661 } 662 _debug_guardpage_minorder = res; 663 pr_info("Setting debug_guardpage_minorder to %lu\n", res); 664 return 0; 665 } 666 early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); 667 668 static inline bool set_page_guard(struct zone *zone, struct page *page, 669 unsigned int order, int migratetype) 670 { 671 struct page_ext *page_ext; 672 673 if (!debug_guardpage_enabled()) 674 return false; 675 676 if (order >= debug_guardpage_minorder()) 677 return false; 678 679 page_ext = lookup_page_ext(page); 680 if (unlikely(!page_ext)) 681 return false; 682 683 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); 684 685 INIT_LIST_HEAD(&page->lru); 686 set_page_private(page, order); 687 /* Guard pages are not available for any usage */ 688 __mod_zone_freepage_state(zone, -(1 << order), migratetype); 689 690 return true; 691 } 692 693 static inline void clear_page_guard(struct zone *zone, struct page *page, 694 unsigned int order, int migratetype) 695 { 696 struct page_ext *page_ext; 697 698 if (!debug_guardpage_enabled()) 699 return; 700 701 page_ext = lookup_page_ext(page); 702 if (unlikely(!page_ext)) 703 return; 704 705 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); 706 707 set_page_private(page, 0); 708 if (!is_migrate_isolate(migratetype)) 709 __mod_zone_freepage_state(zone, (1 << order), migratetype); 710 } 711 #else 712 struct page_ext_operations debug_guardpage_ops; 713 static inline bool set_page_guard(struct zone *zone, struct page *page, 714 unsigned int order, int migratetype) { return false; } 715 static inline void clear_page_guard(struct zone *zone, struct page *page, 716 unsigned int order, 
				int migratetype) {}
#endif

static inline void set_page_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set ->_mapcount
 * to PAGE_BUDDY_MAPCOUNT_VALUE.
 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
 * serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
							unsigned int order)
{
	if (page_is_guard(buddy) && page_order(buddy) == order) {
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		/*
		 * zone check is done late to avoid uselessly
		 * calculating zone/node ids for pages that could
		 * never merge.
		 */
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length (1 << order) and marked with _mapcount
 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
 * field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
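 *
 * A small worked example of the arithmetic used below, assuming
 * __find_buddy_pfn(pfn, order) == pfn ^ (1 << order) as defined in
 * mm/internal.h: freeing an order-0 page at pfn 12 gives
 *
 *	buddy_pfn    = 12 ^ (1 << 0) = 13
 *	combined_pfn = 13 & 12       = 12	// head of the order-1 pair
 *
 * If pfn 13 passes page_is_buddy() (free, same order, same zone), the pair
 * merges into an order-1 block at pfn 12 and the loop repeats with
 *
 *	buddy_pfn    = 12 ^ (1 << 1) = 14
 *	combined_pfn = 14 & 12       = 12
 *
 * and so on until just below max_order.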
795 * 796 * -- nyc 797 */ 798 799 static inline void __free_one_page(struct page *page, 800 unsigned long pfn, 801 struct zone *zone, unsigned int order, 802 int migratetype) 803 { 804 unsigned long combined_pfn; 805 unsigned long uninitialized_var(buddy_pfn); 806 struct page *buddy; 807 unsigned int max_order; 808 809 max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); 810 811 VM_BUG_ON(!zone_is_initialized(zone)); 812 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); 813 814 VM_BUG_ON(migratetype == -1); 815 if (likely(!is_migrate_isolate(migratetype))) 816 __mod_zone_freepage_state(zone, 1 << order, migratetype); 817 818 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); 819 VM_BUG_ON_PAGE(bad_range(zone, page), page); 820 821 continue_merging: 822 while (order < max_order - 1) { 823 buddy_pfn = __find_buddy_pfn(pfn, order); 824 buddy = page + (buddy_pfn - pfn); 825 826 if (!pfn_valid_within(buddy_pfn)) 827 goto done_merging; 828 if (!page_is_buddy(page, buddy, order)) 829 goto done_merging; 830 /* 831 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 832 * merge with it and move up one order. 833 */ 834 if (page_is_guard(buddy)) { 835 clear_page_guard(zone, buddy, order, migratetype); 836 } else { 837 list_del(&buddy->lru); 838 zone->free_area[order].nr_free--; 839 rmv_page_order(buddy); 840 } 841 combined_pfn = buddy_pfn & pfn; 842 page = page + (combined_pfn - pfn); 843 pfn = combined_pfn; 844 order++; 845 } 846 if (max_order < MAX_ORDER) { 847 /* If we are here, it means order is >= pageblock_order. 848 * We want to prevent merge between freepages on isolate 849 * pageblock and normal pageblock. Without this, pageblock 850 * isolation could cause incorrect freepage or CMA accounting. 851 * 852 * We don't want to hit this code for the more frequent 853 * low-order merging. 854 */ 855 if (unlikely(has_isolate_pageblock(zone))) { 856 int buddy_mt; 857 858 buddy_pfn = __find_buddy_pfn(pfn, order); 859 buddy = page + (buddy_pfn - pfn); 860 buddy_mt = get_pageblock_migratetype(buddy); 861 862 if (migratetype != buddy_mt 863 && (is_migrate_isolate(migratetype) || 864 is_migrate_isolate(buddy_mt))) 865 goto done_merging; 866 } 867 max_order++; 868 goto continue_merging; 869 } 870 871 done_merging: 872 set_page_order(page, order); 873 874 /* 875 * If this is not the largest possible page, check if the buddy 876 * of the next-highest order is free. If it is, it's possible 877 * that pages are being freed that will coalesce soon. In case, 878 * that is happening, add the free page to the tail of the list 879 * so it's less likely to be used soon and more likely to be merged 880 * as a higher order page 881 */ 882 if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { 883 struct page *higher_page, *higher_buddy; 884 combined_pfn = buddy_pfn & pfn; 885 higher_page = page + (combined_pfn - pfn); 886 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); 887 higher_buddy = higher_page + (buddy_pfn - combined_pfn); 888 if (pfn_valid_within(buddy_pfn) && 889 page_is_buddy(higher_page, higher_buddy, order + 1)) { 890 list_add_tail(&page->lru, 891 &zone->free_area[order].free_list[migratetype]); 892 goto out; 893 } 894 } 895 896 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 897 out: 898 zone->free_area[order].nr_free++; 899 } 900 901 /* 902 * A bad page could be due to a number of fields. Instead of multiple branches, 903 * try and check multiple fields with one check. The caller must do a detailed 904 * check if necessary. 
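 * For example, page_expected_state() below checks ->_mapcount first, then ORs
 * together ->mapping, the reference count, (with CONFIG_MEMCG) ->mem_cgroup
 * and the unexpected flag bits; only when that single combined value is
 * non-zero does the caller drop into the per-field checks in
 * free_pages_check_bad() to build a precise report.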
905 */ 906 static inline bool page_expected_state(struct page *page, 907 unsigned long check_flags) 908 { 909 if (unlikely(atomic_read(&page->_mapcount) != -1)) 910 return false; 911 912 if (unlikely((unsigned long)page->mapping | 913 page_ref_count(page) | 914 #ifdef CONFIG_MEMCG 915 (unsigned long)page->mem_cgroup | 916 #endif 917 (page->flags & check_flags))) 918 return false; 919 920 return true; 921 } 922 923 static void free_pages_check_bad(struct page *page) 924 { 925 const char *bad_reason; 926 unsigned long bad_flags; 927 928 bad_reason = NULL; 929 bad_flags = 0; 930 931 if (unlikely(atomic_read(&page->_mapcount) != -1)) 932 bad_reason = "nonzero mapcount"; 933 if (unlikely(page->mapping != NULL)) 934 bad_reason = "non-NULL mapping"; 935 if (unlikely(page_ref_count(page) != 0)) 936 bad_reason = "nonzero _refcount"; 937 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { 938 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 939 bad_flags = PAGE_FLAGS_CHECK_AT_FREE; 940 } 941 #ifdef CONFIG_MEMCG 942 if (unlikely(page->mem_cgroup)) 943 bad_reason = "page still charged to cgroup"; 944 #endif 945 bad_page(page, bad_reason, bad_flags); 946 } 947 948 static inline int free_pages_check(struct page *page) 949 { 950 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) 951 return 0; 952 953 /* Something has gone sideways, find it */ 954 free_pages_check_bad(page); 955 return 1; 956 } 957 958 static int free_tail_pages_check(struct page *head_page, struct page *page) 959 { 960 int ret = 1; 961 962 /* 963 * We rely page->lru.next never has bit 0 set, unless the page 964 * is PageTail(). Let's make sure that's true even for poisoned ->lru. 965 */ 966 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); 967 968 if (!IS_ENABLED(CONFIG_DEBUG_VM)) { 969 ret = 0; 970 goto out; 971 } 972 switch (page - head_page) { 973 case 1: 974 /* the first tail page: ->mapping is compound_mapcount() */ 975 if (unlikely(compound_mapcount(page))) { 976 bad_page(page, "nonzero compound_mapcount", 0); 977 goto out; 978 } 979 break; 980 case 2: 981 /* 982 * the second tail page: ->mapping is 983 * page_deferred_list().next -- ignore value. 984 */ 985 break; 986 default: 987 if (page->mapping != TAIL_MAPPING) { 988 bad_page(page, "corrupted mapping in tail page", 0); 989 goto out; 990 } 991 break; 992 } 993 if (unlikely(!PageTail(page))) { 994 bad_page(page, "PageTail not set", 0); 995 goto out; 996 } 997 if (unlikely(compound_head(page) != head_page)) { 998 bad_page(page, "compound_head not consistent", 0); 999 goto out; 1000 } 1001 ret = 0; 1002 out: 1003 page->mapping = NULL; 1004 clear_compound_head(page); 1005 return ret; 1006 } 1007 1008 static __always_inline bool free_pages_prepare(struct page *page, 1009 unsigned int order, bool check_free) 1010 { 1011 int bad = 0; 1012 1013 VM_BUG_ON_PAGE(PageTail(page), page); 1014 1015 trace_mm_page_free(page, order); 1016 kmemcheck_free_shadow(page, order); 1017 1018 /* 1019 * Check tail pages before head page information is cleared to 1020 * avoid checking PageCompound for order-0 pages. 
1021 */ 1022 if (unlikely(order)) { 1023 bool compound = PageCompound(page); 1024 int i; 1025 1026 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); 1027 1028 if (compound) 1029 ClearPageDoubleMap(page); 1030 for (i = 1; i < (1 << order); i++) { 1031 if (compound) 1032 bad += free_tail_pages_check(page, page + i); 1033 if (unlikely(free_pages_check(page + i))) { 1034 bad++; 1035 continue; 1036 } 1037 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1038 } 1039 } 1040 if (PageMappingFlags(page)) 1041 page->mapping = NULL; 1042 if (memcg_kmem_enabled() && PageKmemcg(page)) 1043 memcg_kmem_uncharge(page, order); 1044 if (check_free) 1045 bad += free_pages_check(page); 1046 if (bad) 1047 return false; 1048 1049 page_cpupid_reset_last(page); 1050 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1051 reset_page_owner(page, order); 1052 1053 if (!PageHighMem(page)) { 1054 debug_check_no_locks_freed(page_address(page), 1055 PAGE_SIZE << order); 1056 debug_check_no_obj_freed(page_address(page), 1057 PAGE_SIZE << order); 1058 } 1059 arch_free_page(page, order); 1060 kernel_poison_pages(page, 1 << order, 0); 1061 kernel_map_pages(page, 1 << order, 0); 1062 kasan_free_pages(page, order); 1063 1064 return true; 1065 } 1066 1067 #ifdef CONFIG_DEBUG_VM 1068 static inline bool free_pcp_prepare(struct page *page) 1069 { 1070 return free_pages_prepare(page, 0, true); 1071 } 1072 1073 static inline bool bulkfree_pcp_prepare(struct page *page) 1074 { 1075 return false; 1076 } 1077 #else 1078 static bool free_pcp_prepare(struct page *page) 1079 { 1080 return free_pages_prepare(page, 0, false); 1081 } 1082 1083 static bool bulkfree_pcp_prepare(struct page *page) 1084 { 1085 return free_pages_check(page); 1086 } 1087 #endif /* CONFIG_DEBUG_VM */ 1088 1089 /* 1090 * Frees a number of pages from the PCP lists 1091 * Assumes all pages on list are in same zone, and of same order. 1092 * count is the number of pages to free. 1093 * 1094 * If the zone was previously in an "all pages pinned" state then look to 1095 * see if this freeing clears that state. 1096 * 1097 * And clear the zone's pages_scanned counter, to hold off the "all pages are 1098 * pinned" detection logic. 1099 */ 1100 static void free_pcppages_bulk(struct zone *zone, int count, 1101 struct per_cpu_pages *pcp) 1102 { 1103 int migratetype = 0; 1104 int batch_free = 0; 1105 bool isolated_pageblocks; 1106 1107 spin_lock(&zone->lock); 1108 isolated_pageblocks = has_isolate_pageblock(zone); 1109 1110 while (count) { 1111 struct page *page; 1112 struct list_head *list; 1113 1114 /* 1115 * Remove pages from lists in a round-robin fashion. A 1116 * batch_free count is maintained that is incremented when an 1117 * empty list is encountered. This is so more pages are freed 1118 * off fuller lists instead of spinning excessively around empty 1119 * lists 1120 */ 1121 do { 1122 batch_free++; 1123 if (++migratetype == MIGRATE_PCPTYPES) 1124 migratetype = 0; 1125 list = &pcp->lists[migratetype]; 1126 } while (list_empty(list)); 1127 1128 /* This is the only non-empty list. Free them all. 
*/ 1129 if (batch_free == MIGRATE_PCPTYPES) 1130 batch_free = count; 1131 1132 do { 1133 int mt; /* migratetype of the to-be-freed page */ 1134 1135 page = list_last_entry(list, struct page, lru); 1136 /* must delete as __free_one_page list manipulates */ 1137 list_del(&page->lru); 1138 1139 mt = get_pcppage_migratetype(page); 1140 /* MIGRATE_ISOLATE page should not go to pcplists */ 1141 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); 1142 /* Pageblock could have been isolated meanwhile */ 1143 if (unlikely(isolated_pageblocks)) 1144 mt = get_pageblock_migratetype(page); 1145 1146 if (bulkfree_pcp_prepare(page)) 1147 continue; 1148 1149 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 1150 trace_mm_page_pcpu_drain(page, 0, mt); 1151 } while (--count && --batch_free && !list_empty(list)); 1152 } 1153 spin_unlock(&zone->lock); 1154 } 1155 1156 static void free_one_page(struct zone *zone, 1157 struct page *page, unsigned long pfn, 1158 unsigned int order, 1159 int migratetype) 1160 { 1161 spin_lock(&zone->lock); 1162 if (unlikely(has_isolate_pageblock(zone) || 1163 is_migrate_isolate(migratetype))) { 1164 migratetype = get_pfnblock_migratetype(page, pfn); 1165 } 1166 __free_one_page(page, pfn, zone, order, migratetype); 1167 spin_unlock(&zone->lock); 1168 } 1169 1170 static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1171 unsigned long zone, int nid) 1172 { 1173 set_page_links(page, zone, nid, pfn); 1174 init_page_count(page); 1175 page_mapcount_reset(page); 1176 page_cpupid_reset_last(page); 1177 1178 INIT_LIST_HEAD(&page->lru); 1179 #ifdef WANT_PAGE_VIRTUAL 1180 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1181 if (!is_highmem_idx(zone)) 1182 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1183 #endif 1184 } 1185 1186 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, 1187 int nid) 1188 { 1189 return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); 1190 } 1191 1192 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1193 static void init_reserved_page(unsigned long pfn) 1194 { 1195 pg_data_t *pgdat; 1196 int nid, zid; 1197 1198 if (!early_page_uninitialised(pfn)) 1199 return; 1200 1201 nid = early_pfn_to_nid(pfn); 1202 pgdat = NODE_DATA(nid); 1203 1204 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1205 struct zone *zone = &pgdat->node_zones[zid]; 1206 1207 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) 1208 break; 1209 } 1210 __init_single_pfn(pfn, zid, nid); 1211 } 1212 #else 1213 static inline void init_reserved_page(unsigned long pfn) 1214 { 1215 } 1216 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1217 1218 /* 1219 * Initialised pages do not have PageReserved set. This function is 1220 * called for each range allocated by the bootmem allocator and 1221 * marks the pages PageReserved. The remaining valid pages are later 1222 * sent to the buddy page allocator. 
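 *
 * The pfn rounding below is deliberately conservative: PFN_DOWN(start) rounds
 * the start of the range down and PFN_UP(end) rounds the end up, so every page
 * that overlaps [start, end) is marked reserved. For example, with 4K pages a
 * reservation of [0x1800, 0x3800) covers pfns 1, 2 and 3.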
1223 */ 1224 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) 1225 { 1226 unsigned long start_pfn = PFN_DOWN(start); 1227 unsigned long end_pfn = PFN_UP(end); 1228 1229 for (; start_pfn < end_pfn; start_pfn++) { 1230 if (pfn_valid(start_pfn)) { 1231 struct page *page = pfn_to_page(start_pfn); 1232 1233 init_reserved_page(start_pfn); 1234 1235 /* Avoid false-positive PageTail() */ 1236 INIT_LIST_HEAD(&page->lru); 1237 1238 SetPageReserved(page); 1239 } 1240 } 1241 } 1242 1243 static void __free_pages_ok(struct page *page, unsigned int order) 1244 { 1245 unsigned long flags; 1246 int migratetype; 1247 unsigned long pfn = page_to_pfn(page); 1248 1249 if (!free_pages_prepare(page, order, true)) 1250 return; 1251 1252 migratetype = get_pfnblock_migratetype(page, pfn); 1253 local_irq_save(flags); 1254 __count_vm_events(PGFREE, 1 << order); 1255 free_one_page(page_zone(page), page, pfn, order, migratetype); 1256 local_irq_restore(flags); 1257 } 1258 1259 static void __init __free_pages_boot_core(struct page *page, unsigned int order) 1260 { 1261 unsigned int nr_pages = 1 << order; 1262 struct page *p = page; 1263 unsigned int loop; 1264 1265 prefetchw(p); 1266 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 1267 prefetchw(p + 1); 1268 __ClearPageReserved(p); 1269 set_page_count(p, 0); 1270 } 1271 __ClearPageReserved(p); 1272 set_page_count(p, 0); 1273 1274 page_zone(page)->managed_pages += nr_pages; 1275 set_page_refcounted(page); 1276 __free_pages(page, order); 1277 } 1278 1279 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ 1280 defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) 1281 1282 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; 1283 1284 int __meminit early_pfn_to_nid(unsigned long pfn) 1285 { 1286 static DEFINE_SPINLOCK(early_pfn_lock); 1287 int nid; 1288 1289 spin_lock(&early_pfn_lock); 1290 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); 1291 if (nid < 0) 1292 nid = first_online_node; 1293 spin_unlock(&early_pfn_lock); 1294 1295 return nid; 1296 } 1297 #endif 1298 1299 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 1300 static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, 1301 struct mminit_pfnnid_cache *state) 1302 { 1303 int nid; 1304 1305 nid = __early_pfn_to_nid(pfn, state); 1306 if (nid >= 0 && nid != node) 1307 return false; 1308 return true; 1309 } 1310 1311 /* Only safe to use early in boot when initialisation is single-threaded */ 1312 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1313 { 1314 return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); 1315 } 1316 1317 #else 1318 1319 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1320 { 1321 return true; 1322 } 1323 static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, 1324 struct mminit_pfnnid_cache *state) 1325 { 1326 return true; 1327 } 1328 #endif 1329 1330 1331 void __init __free_pages_bootmem(struct page *page, unsigned long pfn, 1332 unsigned int order) 1333 { 1334 if (early_page_uninitialised(pfn)) 1335 return; 1336 return __free_pages_boot_core(page, order); 1337 } 1338 1339 /* 1340 * Check that the whole (or subset of) a pageblock given by the interval of 1341 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it 1342 * with the migration of free compaction scanner. The scanners then need to 1343 * use only pfn_valid_within() check for arches that allow holes within 1344 * pageblocks. 
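 * An illustrative caller (a simplified sketch of how set_zone_contiguous()
 * below and the compaction scanners walk a zone pageblock by pageblock):
 *
 *	for (pfn = zone->zone_start_pfn; pfn < zone_end_pfn(zone);
 *	     pfn += pageblock_nr_pages) {
 *		page = __pageblock_pfn_to_page(pfn,
 *			min(pfn + pageblock_nr_pages, zone_end_pfn(zone)), zone);
 *		if (!page)
 *			continue;	// hole or zone boundary: skip the block
 *		...scan the pageblock, using pfn_valid_within() per page...
 *	}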
1345 * 1346 * Return struct page pointer of start_pfn, or NULL if checks were not passed. 1347 * 1348 * It's possible on some configurations to have a setup like node0 node1 node0 1349 * i.e. it's possible that all pages within a zones range of pages do not 1350 * belong to a single zone. We assume that a border between node0 and node1 1351 * can occur within a single pageblock, but not a node0 node1 node0 1352 * interleaving within a single pageblock. It is therefore sufficient to check 1353 * the first and last page of a pageblock and avoid checking each individual 1354 * page in a pageblock. 1355 */ 1356 struct page *__pageblock_pfn_to_page(unsigned long start_pfn, 1357 unsigned long end_pfn, struct zone *zone) 1358 { 1359 struct page *start_page; 1360 struct page *end_page; 1361 1362 /* end_pfn is one past the range we are checking */ 1363 end_pfn--; 1364 1365 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) 1366 return NULL; 1367 1368 start_page = pfn_to_page(start_pfn); 1369 1370 if (page_zone(start_page) != zone) 1371 return NULL; 1372 1373 end_page = pfn_to_page(end_pfn); 1374 1375 /* This gives a shorter code than deriving page_zone(end_page) */ 1376 if (page_zone_id(start_page) != page_zone_id(end_page)) 1377 return NULL; 1378 1379 return start_page; 1380 } 1381 1382 void set_zone_contiguous(struct zone *zone) 1383 { 1384 unsigned long block_start_pfn = zone->zone_start_pfn; 1385 unsigned long block_end_pfn; 1386 1387 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); 1388 for (; block_start_pfn < zone_end_pfn(zone); 1389 block_start_pfn = block_end_pfn, 1390 block_end_pfn += pageblock_nr_pages) { 1391 1392 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); 1393 1394 if (!__pageblock_pfn_to_page(block_start_pfn, 1395 block_end_pfn, zone)) 1396 return; 1397 } 1398 1399 /* We confirm that there is no hole */ 1400 zone->contiguous = true; 1401 } 1402 1403 void clear_zone_contiguous(struct zone *zone) 1404 { 1405 zone->contiguous = false; 1406 } 1407 1408 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1409 static void __init deferred_free_range(struct page *page, 1410 unsigned long pfn, int nr_pages) 1411 { 1412 int i; 1413 1414 if (!page) 1415 return; 1416 1417 /* Free a large naturally-aligned chunk if possible */ 1418 if (nr_pages == pageblock_nr_pages && 1419 (pfn & (pageblock_nr_pages - 1)) == 0) { 1420 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1421 __free_pages_boot_core(page, pageblock_order); 1422 return; 1423 } 1424 1425 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1426 if ((pfn & (pageblock_nr_pages - 1)) == 0) 1427 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1428 __free_pages_boot_core(page, 0); 1429 } 1430 } 1431 1432 /* Completion tracking for deferred_init_memmap() threads */ 1433 static atomic_t pgdat_init_n_undone __initdata; 1434 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); 1435 1436 static inline void __init pgdat_init_report_one_done(void) 1437 { 1438 if (atomic_dec_and_test(&pgdat_init_n_undone)) 1439 complete(&pgdat_init_all_done_comp); 1440 } 1441 1442 /* Initialise remaining memory on a node */ 1443 static int __init deferred_init_memmap(void *data) 1444 { 1445 pg_data_t *pgdat = data; 1446 int nid = pgdat->node_id; 1447 struct mminit_pfnnid_cache nid_init_state = { }; 1448 unsigned long start = jiffies; 1449 unsigned long nr_pages = 0; 1450 unsigned long walk_start, walk_end; 1451 int i, zid; 1452 struct zone *zone; 1453 unsigned long first_init_pfn = pgdat->first_deferred_pfn; 1454 const struct cpumask *cpumask 
= cpumask_of_node(pgdat->node_id); 1455 1456 if (first_init_pfn == ULONG_MAX) { 1457 pgdat_init_report_one_done(); 1458 return 0; 1459 } 1460 1461 /* Bind memory initialisation thread to a local node if possible */ 1462 if (!cpumask_empty(cpumask)) 1463 set_cpus_allowed_ptr(current, cpumask); 1464 1465 /* Sanity check boundaries */ 1466 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); 1467 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); 1468 pgdat->first_deferred_pfn = ULONG_MAX; 1469 1470 /* Only the highest zone is deferred so find it */ 1471 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1472 zone = pgdat->node_zones + zid; 1473 if (first_init_pfn < zone_end_pfn(zone)) 1474 break; 1475 } 1476 1477 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { 1478 unsigned long pfn, end_pfn; 1479 struct page *page = NULL; 1480 struct page *free_base_page = NULL; 1481 unsigned long free_base_pfn = 0; 1482 int nr_to_free = 0; 1483 1484 end_pfn = min(walk_end, zone_end_pfn(zone)); 1485 pfn = first_init_pfn; 1486 if (pfn < walk_start) 1487 pfn = walk_start; 1488 if (pfn < zone->zone_start_pfn) 1489 pfn = zone->zone_start_pfn; 1490 1491 for (; pfn < end_pfn; pfn++) { 1492 if (!pfn_valid_within(pfn)) 1493 goto free_range; 1494 1495 /* 1496 * Ensure pfn_valid is checked every 1497 * pageblock_nr_pages for memory holes 1498 */ 1499 if ((pfn & (pageblock_nr_pages - 1)) == 0) { 1500 if (!pfn_valid(pfn)) { 1501 page = NULL; 1502 goto free_range; 1503 } 1504 } 1505 1506 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { 1507 page = NULL; 1508 goto free_range; 1509 } 1510 1511 /* Minimise pfn page lookups and scheduler checks */ 1512 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { 1513 page++; 1514 } else { 1515 nr_pages += nr_to_free; 1516 deferred_free_range(free_base_page, 1517 free_base_pfn, nr_to_free); 1518 free_base_page = NULL; 1519 free_base_pfn = nr_to_free = 0; 1520 1521 page = pfn_to_page(pfn); 1522 cond_resched(); 1523 } 1524 1525 if (page->flags) { 1526 VM_BUG_ON(page_zone(page) != zone); 1527 goto free_range; 1528 } 1529 1530 __init_single_page(page, pfn, zid, nid); 1531 if (!free_base_page) { 1532 free_base_page = page; 1533 free_base_pfn = pfn; 1534 nr_to_free = 0; 1535 } 1536 nr_to_free++; 1537 1538 /* Where possible, batch up pages for a single free */ 1539 continue; 1540 free_range: 1541 /* Free the current block of pages to allocator */ 1542 nr_pages += nr_to_free; 1543 deferred_free_range(free_base_page, free_base_pfn, 1544 nr_to_free); 1545 free_base_page = NULL; 1546 free_base_pfn = nr_to_free = 0; 1547 } 1548 /* Free the last block of pages to allocator */ 1549 nr_pages += nr_to_free; 1550 deferred_free_range(free_base_page, free_base_pfn, nr_to_free); 1551 1552 first_init_pfn = max(end_pfn, first_init_pfn); 1553 } 1554 1555 /* Sanity check that the next zone really is unpopulated */ 1556 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); 1557 1558 pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, 1559 jiffies_to_msecs(jiffies - start)); 1560 1561 pgdat_init_report_one_done(); 1562 return 0; 1563 } 1564 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1565 1566 void __init page_alloc_init_late(void) 1567 { 1568 struct zone *zone; 1569 1570 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1571 int nid; 1572 1573 /* There will be num_node_state(N_MEMORY) threads */ 1574 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); 1575 for_each_node_state(nid, N_MEMORY) { 1576 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", 
nid); 1577 } 1578 1579 /* Block until all are initialised */ 1580 wait_for_completion(&pgdat_init_all_done_comp); 1581 1582 /* Reinit limits that are based on free pages after the kernel is up */ 1583 files_maxfiles_init(); 1584 #endif 1585 1586 for_each_populated_zone(zone) 1587 set_zone_contiguous(zone); 1588 } 1589 1590 #ifdef CONFIG_CMA 1591 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 1592 void __init init_cma_reserved_pageblock(struct page *page) 1593 { 1594 unsigned i = pageblock_nr_pages; 1595 struct page *p = page; 1596 1597 do { 1598 __ClearPageReserved(p); 1599 set_page_count(p, 0); 1600 } while (++p, --i); 1601 1602 set_pageblock_migratetype(page, MIGRATE_CMA); 1603 1604 if (pageblock_order >= MAX_ORDER) { 1605 i = pageblock_nr_pages; 1606 p = page; 1607 do { 1608 set_page_refcounted(p); 1609 __free_pages(p, MAX_ORDER - 1); 1610 p += MAX_ORDER_NR_PAGES; 1611 } while (i -= MAX_ORDER_NR_PAGES); 1612 } else { 1613 set_page_refcounted(page); 1614 __free_pages(page, pageblock_order); 1615 } 1616 1617 adjust_managed_page_count(page, pageblock_nr_pages); 1618 } 1619 #endif 1620 1621 /* 1622 * The order of subdivision here is critical for the IO subsystem. 1623 * Please do not alter this order without good reasons and regression 1624 * testing. Specifically, as large blocks of memory are subdivided, 1625 * the order in which smaller blocks are delivered depends on the order 1626 * they're subdivided in this function. This is the primary factor 1627 * influencing the order in which pages are delivered to the IO 1628 * subsystem according to empirical testing, and this is also justified 1629 * by considering the behavior of a buddy system containing a single 1630 * large block of memory acted on by a series of small allocations. 1631 * This behavior is a critical factor in sglist merging's success. 1632 * 1633 * -- nyc 1634 */ 1635 static inline void expand(struct zone *zone, struct page *page, 1636 int low, int high, struct free_area *area, 1637 int migratetype) 1638 { 1639 unsigned long size = 1 << high; 1640 1641 while (high > low) { 1642 area--; 1643 high--; 1644 size >>= 1; 1645 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 1646 1647 /* 1648 * Mark as guard pages (or page), that will allow to 1649 * merge back to allocator when buddy will be freed. 
1650 * Corresponding page table entries will not be touched, 1651 * pages will stay not present in virtual address space 1652 */ 1653 if (set_page_guard(zone, &page[size], high, migratetype)) 1654 continue; 1655 1656 list_add(&page[size].lru, &area->free_list[migratetype]); 1657 area->nr_free++; 1658 set_page_order(&page[size], high); 1659 } 1660 } 1661 1662 static void check_new_page_bad(struct page *page) 1663 { 1664 const char *bad_reason = NULL; 1665 unsigned long bad_flags = 0; 1666 1667 if (unlikely(atomic_read(&page->_mapcount) != -1)) 1668 bad_reason = "nonzero mapcount"; 1669 if (unlikely(page->mapping != NULL)) 1670 bad_reason = "non-NULL mapping"; 1671 if (unlikely(page_ref_count(page) != 0)) 1672 bad_reason = "nonzero _count"; 1673 if (unlikely(page->flags & __PG_HWPOISON)) { 1674 bad_reason = "HWPoisoned (hardware-corrupted)"; 1675 bad_flags = __PG_HWPOISON; 1676 /* Don't complain about hwpoisoned pages */ 1677 page_mapcount_reset(page); /* remove PageBuddy */ 1678 return; 1679 } 1680 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { 1681 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; 1682 bad_flags = PAGE_FLAGS_CHECK_AT_PREP; 1683 } 1684 #ifdef CONFIG_MEMCG 1685 if (unlikely(page->mem_cgroup)) 1686 bad_reason = "page still charged to cgroup"; 1687 #endif 1688 bad_page(page, bad_reason, bad_flags); 1689 } 1690 1691 /* 1692 * This page is about to be returned from the page allocator 1693 */ 1694 static inline int check_new_page(struct page *page) 1695 { 1696 if (likely(page_expected_state(page, 1697 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) 1698 return 0; 1699 1700 check_new_page_bad(page); 1701 return 1; 1702 } 1703 1704 static inline bool free_pages_prezeroed(void) 1705 { 1706 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && 1707 page_poisoning_enabled(); 1708 } 1709 1710 #ifdef CONFIG_DEBUG_VM 1711 static bool check_pcp_refill(struct page *page) 1712 { 1713 return false; 1714 } 1715 1716 static bool check_new_pcp(struct page *page) 1717 { 1718 return check_new_page(page); 1719 } 1720 #else 1721 static bool check_pcp_refill(struct page *page) 1722 { 1723 return check_new_page(page); 1724 } 1725 static bool check_new_pcp(struct page *page) 1726 { 1727 return false; 1728 } 1729 #endif /* CONFIG_DEBUG_VM */ 1730 1731 static bool check_new_pages(struct page *page, unsigned int order) 1732 { 1733 int i; 1734 for (i = 0; i < (1 << order); i++) { 1735 struct page *p = page + i; 1736 1737 if (unlikely(check_new_page(p))) 1738 return true; 1739 } 1740 1741 return false; 1742 } 1743 1744 inline void post_alloc_hook(struct page *page, unsigned int order, 1745 gfp_t gfp_flags) 1746 { 1747 set_page_private(page, 0); 1748 set_page_refcounted(page); 1749 1750 arch_alloc_page(page, order); 1751 kernel_map_pages(page, 1 << order, 1); 1752 kernel_poison_pages(page, 1 << order, 1); 1753 kasan_alloc_pages(page, order); 1754 set_page_owner(page, order, gfp_flags); 1755 } 1756 1757 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 1758 unsigned int alloc_flags) 1759 { 1760 int i; 1761 1762 post_alloc_hook(page, order, gfp_flags); 1763 1764 if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) 1765 for (i = 0; i < (1 << order); i++) 1766 clear_highpage(page + i); 1767 1768 if (order && (gfp_flags & __GFP_COMP)) 1769 prep_compound_page(page, order); 1770 1771 /* 1772 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to 1773 * allocate the page. The expectation is that the caller is taking 1774 * steps that will free more memory. 
The caller should avoid the page 1775 * being used for !PFMEMALLOC purposes. 1776 */ 1777 if (alloc_flags & ALLOC_NO_WATERMARKS) 1778 set_page_pfmemalloc(page); 1779 else 1780 clear_page_pfmemalloc(page); 1781 } 1782 1783 /* 1784 * Go through the free lists for the given migratetype and remove 1785 * the smallest available page from the freelists 1786 */ 1787 static inline 1788 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 1789 int migratetype) 1790 { 1791 unsigned int current_order; 1792 struct free_area *area; 1793 struct page *page; 1794 1795 /* Find a page of the appropriate size in the preferred list */ 1796 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 1797 area = &(zone->free_area[current_order]); 1798 page = list_first_entry_or_null(&area->free_list[migratetype], 1799 struct page, lru); 1800 if (!page) 1801 continue; 1802 list_del(&page->lru); 1803 rmv_page_order(page); 1804 area->nr_free--; 1805 expand(zone, page, order, current_order, area, migratetype); 1806 set_pcppage_migratetype(page, migratetype); 1807 return page; 1808 } 1809 1810 return NULL; 1811 } 1812 1813 1814 /* 1815 * This array describes the order lists are fallen back to when 1816 * the free lists for the desirable migrate type are depleted 1817 */ 1818 static int fallbacks[MIGRATE_TYPES][4] = { 1819 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 1820 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 1821 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, 1822 #ifdef CONFIG_CMA 1823 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ 1824 #endif 1825 #ifdef CONFIG_MEMORY_ISOLATION 1826 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ 1827 #endif 1828 }; 1829 1830 #ifdef CONFIG_CMA 1831 static struct page *__rmqueue_cma_fallback(struct zone *zone, 1832 unsigned int order) 1833 { 1834 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 1835 } 1836 #else 1837 static inline struct page *__rmqueue_cma_fallback(struct zone *zone, 1838 unsigned int order) { return NULL; } 1839 #endif 1840 1841 /* 1842 * Move the free pages in a range to the free lists of the requested type. 1843 * Note that start_page and end_pages are not aligned on a pageblock 1844 * boundary. If alignment is required, use move_freepages_block() 1845 */ 1846 static int move_freepages(struct zone *zone, 1847 struct page *start_page, struct page *end_page, 1848 int migratetype, int *num_movable) 1849 { 1850 struct page *page; 1851 unsigned int order; 1852 int pages_moved = 0; 1853 1854 #ifndef CONFIG_HOLES_IN_ZONE 1855 /* 1856 * page_zone is not safe to call in this context when 1857 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 1858 * anyway as we check zone boundaries in move_freepages_block(). 1859 * Remove at a later date when no bug reports exist related to 1860 * grouping pages by mobility 1861 */ 1862 VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); 1863 #endif 1864 1865 if (num_movable) 1866 *num_movable = 0; 1867 1868 for (page = start_page; page <= end_page;) { 1869 if (!pfn_valid_within(page_to_pfn(page))) { 1870 page++; 1871 continue; 1872 } 1873 1874 /* Make sure we are not inadvertently changing nodes */ 1875 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); 1876 1877 if (!PageBuddy(page)) { 1878 /* 1879 * We assume that pages that could be isolated for 1880 * migration are movable. But we don't actually try 1881 * isolating, as that would be expensive. 
1882 */
1883 if (num_movable &&
1884 (PageLRU(page) || __PageMovable(page)))
1885 (*num_movable)++;
1886
1887 page++;
1888 continue;
1889 }
1890
1891 order = page_order(page);
1892 list_move(&page->lru,
1893 &zone->free_area[order].free_list[migratetype]);
1894 page += 1 << order;
1895 pages_moved += 1 << order;
1896 }
1897
1898 return pages_moved;
1899 }
1900
1901 int move_freepages_block(struct zone *zone, struct page *page,
1902 int migratetype, int *num_movable)
1903 {
1904 unsigned long start_pfn, end_pfn;
1905 struct page *start_page, *end_page;
1906
1907 start_pfn = page_to_pfn(page);
1908 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1909 start_page = pfn_to_page(start_pfn);
1910 end_page = start_page + pageblock_nr_pages - 1;
1911 end_pfn = start_pfn + pageblock_nr_pages - 1;
1912
1913 /* Do not cross zone boundaries */
1914 if (!zone_spans_pfn(zone, start_pfn))
1915 start_page = page;
1916 if (!zone_spans_pfn(zone, end_pfn))
1917 return 0;
1918
1919 return move_freepages(zone, start_page, end_page, migratetype,
1920 num_movable);
1921 }
1922
1923 static void change_pageblock_range(struct page *pageblock_page,
1924 int start_order, int migratetype)
1925 {
1926 int nr_pageblocks = 1 << (start_order - pageblock_order);
1927
1928 while (nr_pageblocks--) {
1929 set_pageblock_migratetype(pageblock_page, migratetype);
1930 pageblock_page += pageblock_nr_pages;
1931 }
1932 }
1933
1934 /*
1935 * When we are falling back to another migratetype during allocation, try to
1936 * steal extra free pages from the same pageblocks to satisfy further
1937 * allocations, instead of polluting multiple pageblocks.
1938 *
1939 * If we are stealing a relatively large buddy page, it is likely there will
1940 * be more free pages in the pageblock, so try to steal them all. For
1941 * reclaimable and unmovable allocations, we steal regardless of page size,
1942 * as fragmentation caused by those allocations polluting movable pageblocks
1943 * is worse than movable allocations stealing from unmovable and reclaimable
1944 * pageblocks.
1945 */
1946 static bool can_steal_fallback(unsigned int order, int start_mt)
1947 {
1948 /*
1949 * This order check is intentional even though a more relaxed check
1950 * follows below. If this condition is met we can steal the whole
1951 * pageblock; the check below does not guarantee that, as it is only
1952 * a heuristic and may change at any time.
1953 */
1954 if (order >= pageblock_order)
1955 return true;
1956
1957 if (order >= pageblock_order / 2 ||
1958 start_mt == MIGRATE_RECLAIMABLE ||
1959 start_mt == MIGRATE_UNMOVABLE ||
1960 page_group_by_mobility_disabled)
1961 return true;
1962
1963 return false;
1964 }
1965
1966 /*
1967 * This function implements the actual steal behaviour. If order is large
1968 * enough, we can steal the whole pageblock. If not, we first move the free
1969 * pages in this pageblock to our migratetype and determine how many
1970 * already-allocated pages in the pageblock have a compatible migratetype.
1971 * If at least half of the pages are free or compatible, we can change the
1972 * migratetype of the pageblock itself, so pages freed in the future will be
1973 * put on the correct free list.
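 *
 * As a rough worked example (assuming the common pageblock_order of 9,
 * i.e. 512 pages per pageblock): if move_freepages_block() moves 200
 * free pages and another 80 allocated pages count as compatible, then
 * 200 + 80 = 280 >= 256 (half a pageblock) and the whole pageblock is
 * claimed for the new migratetype.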
1974 */ 1975 static void steal_suitable_fallback(struct zone *zone, struct page *page, 1976 int start_type, bool whole_block) 1977 { 1978 unsigned int current_order = page_order(page); 1979 struct free_area *area; 1980 int free_pages, movable_pages, alike_pages; 1981 int old_block_type; 1982 1983 old_block_type = get_pageblock_migratetype(page); 1984 1985 /* 1986 * This can happen due to races and we want to prevent broken 1987 * highatomic accounting. 1988 */ 1989 if (is_migrate_highatomic(old_block_type)) 1990 goto single_page; 1991 1992 /* Take ownership for orders >= pageblock_order */ 1993 if (current_order >= pageblock_order) { 1994 change_pageblock_range(page, current_order, start_type); 1995 goto single_page; 1996 } 1997 1998 /* We are not allowed to try stealing from the whole block */ 1999 if (!whole_block) 2000 goto single_page; 2001 2002 free_pages = move_freepages_block(zone, page, start_type, 2003 &movable_pages); 2004 /* 2005 * Determine how many pages are compatible with our allocation. 2006 * For movable allocation, it's the number of movable pages which 2007 * we just obtained. For other types it's a bit more tricky. 2008 */ 2009 if (start_type == MIGRATE_MOVABLE) { 2010 alike_pages = movable_pages; 2011 } else { 2012 /* 2013 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation 2014 * to MOVABLE pageblock, consider all non-movable pages as 2015 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or 2016 * vice versa, be conservative since we can't distinguish the 2017 * exact migratetype of non-movable pages. 2018 */ 2019 if (old_block_type == MIGRATE_MOVABLE) 2020 alike_pages = pageblock_nr_pages 2021 - (free_pages + movable_pages); 2022 else 2023 alike_pages = 0; 2024 } 2025 2026 /* moving whole block can fail due to zone boundary conditions */ 2027 if (!free_pages) 2028 goto single_page; 2029 2030 /* 2031 * If a sufficient number of pages in the block are either free or of 2032 * comparable migratability as our allocation, claim the whole block. 2033 */ 2034 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || 2035 page_group_by_mobility_disabled) 2036 set_pageblock_migratetype(page, start_type); 2037 2038 return; 2039 2040 single_page: 2041 area = &zone->free_area[current_order]; 2042 list_move(&page->lru, &area->free_list[start_type]); 2043 } 2044 2045 /* 2046 * Check whether there is a suitable fallback freepage with requested order. 2047 * If only_stealable is true, this function returns fallback_mt only if 2048 * we can steal other freepages all together. This would help to reduce 2049 * fragmentation due to mixed migratetype pages in one pageblock. 
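 *
 * For example, a MIGRATE_UNMOVABLE request walks
 * fallbacks[MIGRATE_UNMOVABLE], i.e. MIGRATE_RECLAIMABLE and then
 * MIGRATE_MOVABLE, stopping at the MIGRATE_TYPES terminator. The first
 * fallback with a non-empty free list is returned; when only_stealable
 * is true it is returned only if can_steal_fallback() allows stealing.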
2050 */
2051 int find_suitable_fallback(struct free_area *area, unsigned int order,
2052 int migratetype, bool only_stealable, bool *can_steal)
2053 {
2054 int i;
2055 int fallback_mt;
2056
2057 if (area->nr_free == 0)
2058 return -1;
2059
2060 *can_steal = false;
2061 for (i = 0;; i++) {
2062 fallback_mt = fallbacks[migratetype][i];
2063 if (fallback_mt == MIGRATE_TYPES)
2064 break;
2065
2066 if (list_empty(&area->free_list[fallback_mt]))
2067 continue;
2068
2069 if (can_steal_fallback(order, migratetype))
2070 *can_steal = true;
2071
2072 if (!only_stealable)
2073 return fallback_mt;
2074
2075 if (*can_steal)
2076 return fallback_mt;
2077 }
2078
2079 return -1;
2080 }
2081
2082 /*
2083 * Reserve a pageblock for exclusive use of high-order atomic allocations if
2084 * there are no empty page blocks that contain a page with a suitable order
2085 */
2086 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2087 unsigned int alloc_order)
2088 {
2089 int mt;
2090 unsigned long max_managed, flags;
2091
2092 /*
2093 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2094 * Check is race-prone but harmless.
2095 */
2096 max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2097 if (zone->nr_reserved_highatomic >= max_managed)
2098 return;
2099
2100 spin_lock_irqsave(&zone->lock, flags);
2101
2102 /* Recheck the nr_reserved_highatomic limit under the lock */
2103 if (zone->nr_reserved_highatomic >= max_managed)
2104 goto out_unlock;
2105
2106 /* Yoink! */
2107 mt = get_pageblock_migratetype(page);
2108 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
2109 && !is_migrate_cma(mt)) {
2110 zone->nr_reserved_highatomic += pageblock_nr_pages;
2111 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2112 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2113 }
2114
2115 out_unlock:
2116 spin_unlock_irqrestore(&zone->lock, flags);
2117 }
2118
2119 /*
2120 * Used when an allocation is about to fail under memory pressure. This
2121 * potentially hurts the reliability of high-order allocations when under
2122 * intense memory pressure but failed atomic allocations should be easier
2123 * to recover from than an OOM.
2124 *
2125 * If @force is true, try to unreserve a pageblock even if it is the last
2126 * reserved highatomic pageblock.
2127 */
2128 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2129 bool force)
2130 {
2131 struct zonelist *zonelist = ac->zonelist;
2132 unsigned long flags;
2133 struct zoneref *z;
2134 struct zone *zone;
2135 struct page *page;
2136 int order;
2137 bool ret;
2138
2139 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2140 ac->nodemask) {
2141 /*
2142 * Preserve at least one pageblock unless memory pressure
2143 * is really high.
2144 */
2145 if (!force && zone->nr_reserved_highatomic <=
2146 pageblock_nr_pages)
2147 continue;
2148
2149 spin_lock_irqsave(&zone->lock, flags);
2150 for (order = 0; order < MAX_ORDER; order++) {
2151 struct free_area *area = &(zone->free_area[order]);
2152
2153 page = list_first_entry_or_null(
2154 &area->free_list[MIGRATE_HIGHATOMIC],
2155 struct page, lru);
2156 if (!page)
2157 continue;
2158
2159 /*
2160 * In page freeing path, migratetype change is racy so
2161 * we can encounter several free pages in a pageblock
2162 * in this loop although we changed the pageblock type
2163 * from highatomic to ac->migratetype. So we should
2164 * adjust the count once.
2165 */ 2166 if (is_migrate_highatomic_page(page)) { 2167 /* 2168 * It should never happen but changes to 2169 * locking could inadvertently allow a per-cpu 2170 * drain to add pages to MIGRATE_HIGHATOMIC 2171 * while unreserving so be safe and watch for 2172 * underflows. 2173 */ 2174 zone->nr_reserved_highatomic -= min( 2175 pageblock_nr_pages, 2176 zone->nr_reserved_highatomic); 2177 } 2178 2179 /* 2180 * Convert to ac->migratetype and avoid the normal 2181 * pageblock stealing heuristics. Minimally, the caller 2182 * is doing the work and needs the pages. More 2183 * importantly, if the block was always converted to 2184 * MIGRATE_UNMOVABLE or another type then the number 2185 * of pageblocks that cannot be completely freed 2186 * may increase. 2187 */ 2188 set_pageblock_migratetype(page, ac->migratetype); 2189 ret = move_freepages_block(zone, page, ac->migratetype, 2190 NULL); 2191 if (ret) { 2192 spin_unlock_irqrestore(&zone->lock, flags); 2193 return ret; 2194 } 2195 } 2196 spin_unlock_irqrestore(&zone->lock, flags); 2197 } 2198 2199 return false; 2200 } 2201 2202 /* 2203 * Try finding a free buddy page on the fallback list and put it on the free 2204 * list of requested migratetype, possibly along with other pages from the same 2205 * block, depending on fragmentation avoidance heuristics. Returns true if 2206 * fallback was found so that __rmqueue_smallest() can grab it. 2207 */ 2208 static inline bool 2209 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) 2210 { 2211 struct free_area *area; 2212 unsigned int current_order; 2213 struct page *page; 2214 int fallback_mt; 2215 bool can_steal; 2216 2217 /* Find the largest possible block of pages in the other list */ 2218 for (current_order = MAX_ORDER-1; 2219 current_order >= order && current_order <= MAX_ORDER-1; 2220 --current_order) { 2221 area = &(zone->free_area[current_order]); 2222 fallback_mt = find_suitable_fallback(area, current_order, 2223 start_migratetype, false, &can_steal); 2224 if (fallback_mt == -1) 2225 continue; 2226 2227 page = list_first_entry(&area->free_list[fallback_mt], 2228 struct page, lru); 2229 2230 steal_suitable_fallback(zone, page, start_migratetype, 2231 can_steal); 2232 2233 trace_mm_page_alloc_extfrag(page, order, current_order, 2234 start_migratetype, fallback_mt); 2235 2236 return true; 2237 } 2238 2239 return false; 2240 } 2241 2242 /* 2243 * Do the hard work of removing an element from the buddy allocator. 2244 * Call me with the zone->lock already held. 2245 */ 2246 static struct page *__rmqueue(struct zone *zone, unsigned int order, 2247 int migratetype) 2248 { 2249 struct page *page; 2250 2251 retry: 2252 page = __rmqueue_smallest(zone, order, migratetype); 2253 if (unlikely(!page)) { 2254 if (migratetype == MIGRATE_MOVABLE) 2255 page = __rmqueue_cma_fallback(zone, order); 2256 2257 if (!page && __rmqueue_fallback(zone, order, migratetype)) 2258 goto retry; 2259 } 2260 2261 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2262 return page; 2263 } 2264 2265 /* 2266 * Obtain a specified number of elements from the buddy allocator, all under 2267 * a single hold of the lock, for efficiency. Add them to the supplied list. 2268 * Returns the number of new pages which were placed at *list. 
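 *
 * A minimal usage sketch, mirroring how __rmqueue_pcplist() below
 * refills a pcplist ('batch' and 'got' are illustrative local names):
 *
 *	LIST_HEAD(batch);
 *	int got = rmqueue_bulk(zone, 0, pcp->batch, &batch,
 *			       MIGRATE_MOVABLE, false);
 *
 * On return, 'got' order-0 pages are linked on 'batch' via page->lru
 * and NR_FREE_PAGES has already been adjusted under zone->lock.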
2269 */
2270 static int rmqueue_bulk(struct zone *zone, unsigned int order,
2271 unsigned long count, struct list_head *list,
2272 int migratetype, bool cold)
2273 {
2274 int i, alloced = 0;
2275
2276 spin_lock(&zone->lock);
2277 for (i = 0; i < count; ++i) {
2278 struct page *page = __rmqueue(zone, order, migratetype);
2279 if (unlikely(page == NULL))
2280 break;
2281
2282 if (unlikely(check_pcp_refill(page)))
2283 continue;
2284
2285 /*
2286 * Split buddy pages returned by expand() are received here
2287 * in physical page order. The page is added to the caller's
2288 * list and the list head then moves forward. From the caller's
2289 * perspective, the linked list is ordered by page number in
2290 * some conditions. This is useful for IO devices that can
2291 * merge IO requests if the physical pages are ordered
2292 * properly.
2293 */
2294 if (likely(!cold))
2295 list_add(&page->lru, list);
2296 else
2297 list_add_tail(&page->lru, list);
2298 list = &page->lru;
2299 alloced++;
2300 if (is_migrate_cma(get_pcppage_migratetype(page)))
2301 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2302 -(1 << order));
2303 }
2304
2305 /*
2306 * i pages were removed from the buddy list even if some leak due
2307 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
2308 * on i. Do not confuse with 'alloced' which is the number of
2309 * pages added to the pcp list.
2310 */
2311 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2312 spin_unlock(&zone->lock);
2313 return alloced;
2314 }
2315
2316 #ifdef CONFIG_NUMA
2317 /*
2318 * Called from the vmstat counter updater to drain pagesets of this
2319 * currently executing processor on remote nodes after they have
2320 * expired.
2321 *
2322 * Note that this function must be called with the thread pinned to
2323 * a single processor.
2324 */
2325 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2326 {
2327 unsigned long flags;
2328 int to_drain, batch;
2329
2330 local_irq_save(flags);
2331 batch = READ_ONCE(pcp->batch);
2332 to_drain = min(pcp->count, batch);
2333 if (to_drain > 0) {
2334 free_pcppages_bulk(zone, to_drain, pcp);
2335 pcp->count -= to_drain;
2336 }
2337 local_irq_restore(flags);
2338 }
2339 #endif
2340
2341 /*
2342 * Drain pcplists of the indicated processor and zone.
2343 *
2344 * The processor must either be the current processor and the
2345 * thread pinned to the current processor or a processor that
2346 * is not online.
2347 */
2348 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2349 {
2350 unsigned long flags;
2351 struct per_cpu_pageset *pset;
2352 struct per_cpu_pages *pcp;
2353
2354 local_irq_save(flags);
2355 pset = per_cpu_ptr(zone->pageset, cpu);
2356
2357 pcp = &pset->pcp;
2358 if (pcp->count) {
2359 free_pcppages_bulk(zone, pcp->count, pcp);
2360 pcp->count = 0;
2361 }
2362 local_irq_restore(flags);
2363 }
2364
2365 /*
2366 * Drain pcplists of all zones on the indicated processor.
2367 *
2368 * The processor must either be the current processor and the
2369 * thread pinned to the current processor or a processor that
2370 * is not online.
2371 */
2372 static void drain_pages(unsigned int cpu)
2373 {
2374 struct zone *zone;
2375
2376 for_each_populated_zone(zone) {
2377 drain_pages_zone(cpu, zone);
2378 }
2379 }
2380
2381 /*
2382 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2383 *
2384 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
2385 * the single zone's pages.
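 *
 * The pinning requirement is what drain_local_pages_wq() below provides
 * via preempt_disable(): without it the task could migrate between
 * reading smp_processor_id() and doing the drain, racing with the
 * owning CPU's use of its pcplists.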
2386 */
2387 void drain_local_pages(struct zone *zone)
2388 {
2389 int cpu = smp_processor_id();
2390
2391 if (zone)
2392 drain_pages_zone(cpu, zone);
2393 else
2394 drain_pages(cpu);
2395 }
2396
2397 static void drain_local_pages_wq(struct work_struct *work)
2398 {
2399 /*
2400 * drain_all_pages doesn't use proper cpu hotplug protection so
2401 * we can race with cpu offline when the WQ can move this from
2402 * a cpu pinned worker to an unbound one. We can operate on a different
2403 * cpu which is all right but we also have to make sure to not move
2404 * to a different one while draining.
2405 */
2406 preempt_disable();
2407 drain_local_pages(NULL);
2408 preempt_enable();
2409 }
2410
2411 /*
2412 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2413 *
2414 * When zone parameter is non-NULL, spill just the single zone's pages.
2415 *
2416 * Note that this can be extremely slow as the draining happens in a workqueue.
2417 */
2418 void drain_all_pages(struct zone *zone)
2419 {
2420 int cpu;
2421
2422 /*
2423 * Allocate in the BSS so we won't require allocation in
2424 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2425 */
2426 static cpumask_t cpus_with_pcps;
2427
2428 /*
2429 * Make sure nobody triggers this path before mm_percpu_wq is fully
2430 * initialized.
2431 */
2432 if (WARN_ON_ONCE(!mm_percpu_wq))
2433 return;
2434
2435 /* Workqueues cannot recurse */
2436 if (current->flags & PF_WQ_WORKER)
2437 return;
2438
2439 /*
2440 * Do not drain if one is already in progress unless it's specific to
2441 * a zone. Such callers are primarily CMA and memory hotplug and need
2442 * the drain to be complete when the call returns.
2443 */
2444 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2445 if (!zone)
2446 return;
2447 mutex_lock(&pcpu_drain_mutex);
2448 }
2449
2450 /*
2451 * We don't care about racing with CPU hotplug event
2452 * as offline notification will cause the notified
2453 * cpu to drain that CPU pcps and on_each_cpu_mask
2454 * disables preemption as part of its processing
2455 */
2456 for_each_online_cpu(cpu) {
2457 struct per_cpu_pageset *pcp;
2458 struct zone *z;
2459 bool has_pcps = false;
2460
2461 if (zone) {
2462 pcp = per_cpu_ptr(zone->pageset, cpu);
2463 if (pcp->pcp.count)
2464 has_pcps = true;
2465 } else {
2466 for_each_populated_zone(z) {
2467 pcp = per_cpu_ptr(z->pageset, cpu);
2468 if (pcp->pcp.count) {
2469 has_pcps = true;
2470 break;
2471 }
2472 }
2473 }
2474
2475 if (has_pcps)
2476 cpumask_set_cpu(cpu, &cpus_with_pcps);
2477 else
2478 cpumask_clear_cpu(cpu, &cpus_with_pcps);
2479 }
2480
2481 for_each_cpu(cpu, &cpus_with_pcps) {
2482 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2483 INIT_WORK(work, drain_local_pages_wq);
2484 queue_work_on(cpu, mm_percpu_wq, work);
2485 }
2486 for_each_cpu(cpu, &cpus_with_pcps)
2487 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
2488
2489 mutex_unlock(&pcpu_drain_mutex);
2490 }
2491
2492 #ifdef CONFIG_HIBERNATION
2493
2494 void mark_free_pages(struct zone *zone)
2495 {
2496 unsigned long pfn, max_zone_pfn;
2497 unsigned long flags;
2498 unsigned int order, t;
2499 struct page *page;
2500
2501 if (zone_is_empty(zone))
2502 return;
2503
2504 spin_lock_irqsave(&zone->lock, flags);
2505
2506 max_zone_pfn = zone_end_pfn(zone);
2507 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2508 if (pfn_valid(pfn)) {
2509 page = pfn_to_page(pfn);
2510
2511 if (page_zone(page) != zone)
2512 continue;
2513
2514 if (!swsusp_page_is_forbidden(page))
2515 swsusp_unset_page_free(page);
2516 }
2517
2518
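	/*
	 * Second pass: walk every free list and flag each page of every
	 * free buddy block so the hibernation code knows it need not be
	 * saved in the image.
	 */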
for_each_migratetype_order(order, t) { 2519 list_for_each_entry(page, 2520 &zone->free_area[order].free_list[t], lru) { 2521 unsigned long i; 2522 2523 pfn = page_to_pfn(page); 2524 for (i = 0; i < (1UL << order); i++) 2525 swsusp_set_page_free(pfn_to_page(pfn + i)); 2526 } 2527 } 2528 spin_unlock_irqrestore(&zone->lock, flags); 2529 } 2530 #endif /* CONFIG_PM */ 2531 2532 /* 2533 * Free a 0-order page 2534 * cold == true ? free a cold page : free a hot page 2535 */ 2536 void free_hot_cold_page(struct page *page, bool cold) 2537 { 2538 struct zone *zone = page_zone(page); 2539 struct per_cpu_pages *pcp; 2540 unsigned long flags; 2541 unsigned long pfn = page_to_pfn(page); 2542 int migratetype; 2543 2544 if (!free_pcp_prepare(page)) 2545 return; 2546 2547 migratetype = get_pfnblock_migratetype(page, pfn); 2548 set_pcppage_migratetype(page, migratetype); 2549 local_irq_save(flags); 2550 __count_vm_event(PGFREE); 2551 2552 /* 2553 * We only track unmovable, reclaimable and movable on pcp lists. 2554 * Free ISOLATE pages back to the allocator because they are being 2555 * offlined but treat HIGHATOMIC as movable pages so we can get those 2556 * areas back if necessary. Otherwise, we may have to free 2557 * excessively into the page allocator 2558 */ 2559 if (migratetype >= MIGRATE_PCPTYPES) { 2560 if (unlikely(is_migrate_isolate(migratetype))) { 2561 free_one_page(zone, page, pfn, 0, migratetype); 2562 goto out; 2563 } 2564 migratetype = MIGRATE_MOVABLE; 2565 } 2566 2567 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2568 if (!cold) 2569 list_add(&page->lru, &pcp->lists[migratetype]); 2570 else 2571 list_add_tail(&page->lru, &pcp->lists[migratetype]); 2572 pcp->count++; 2573 if (pcp->count >= pcp->high) { 2574 unsigned long batch = READ_ONCE(pcp->batch); 2575 free_pcppages_bulk(zone, batch, pcp); 2576 pcp->count -= batch; 2577 } 2578 2579 out: 2580 local_irq_restore(flags); 2581 } 2582 2583 /* 2584 * Free a list of 0-order pages 2585 */ 2586 void free_hot_cold_page_list(struct list_head *list, bool cold) 2587 { 2588 struct page *page, *next; 2589 2590 list_for_each_entry_safe(page, next, list, lru) { 2591 trace_mm_page_free_batched(page, cold); 2592 free_hot_cold_page(page, cold); 2593 } 2594 } 2595 2596 /* 2597 * split_page takes a non-compound higher-order page, and splits it into 2598 * n (1<<order) sub-pages: page[0..n] 2599 * Each sub-page must be freed individually. 2600 * 2601 * Note: this is probably too low level an operation for use in drivers. 2602 * Please consult with lkml before using this in your driver. 2603 */ 2604 void split_page(struct page *page, unsigned int order) 2605 { 2606 int i; 2607 2608 VM_BUG_ON_PAGE(PageCompound(page), page); 2609 VM_BUG_ON_PAGE(!page_count(page), page); 2610 2611 #ifdef CONFIG_KMEMCHECK 2612 /* 2613 * Split shadow pages too, because free(page[0]) would 2614 * otherwise free the whole shadow. 2615 */ 2616 if (kmemcheck_page_is_tracked(page)) 2617 split_page(virt_to_page(page[0].shadow), order); 2618 #endif 2619 2620 for (i = 1; i < (1 << order); i++) 2621 set_page_refcounted(page + i); 2622 split_page_owner(page, order); 2623 } 2624 EXPORT_SYMBOL_GPL(split_page); 2625 2626 int __isolate_free_page(struct page *page, unsigned int order) 2627 { 2628 unsigned long watermark; 2629 struct zone *zone; 2630 int mt; 2631 2632 BUG_ON(!PageBuddy(page)); 2633 2634 zone = page_zone(page); 2635 mt = get_pageblock_migratetype(page); 2636 2637 if (!is_migrate_isolate(mt)) { 2638 /* 2639 * Obey watermarks as if the page was being allocated. 
We can 2640 * emulate a high-order watermark check with a raised order-0 2641 * watermark, because we already know our high-order page 2642 * exists. 2643 */ 2644 watermark = min_wmark_pages(zone) + (1UL << order); 2645 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 2646 return 0; 2647 2648 __mod_zone_freepage_state(zone, -(1UL << order), mt); 2649 } 2650 2651 /* Remove page from free list */ 2652 list_del(&page->lru); 2653 zone->free_area[order].nr_free--; 2654 rmv_page_order(page); 2655 2656 /* 2657 * Set the pageblock if the isolated page is at least half of a 2658 * pageblock 2659 */ 2660 if (order >= pageblock_order - 1) { 2661 struct page *endpage = page + (1 << order) - 1; 2662 for (; page < endpage; page += pageblock_nr_pages) { 2663 int mt = get_pageblock_migratetype(page); 2664 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) 2665 && !is_migrate_highatomic(mt)) 2666 set_pageblock_migratetype(page, 2667 MIGRATE_MOVABLE); 2668 } 2669 } 2670 2671 2672 return 1UL << order; 2673 } 2674 2675 /* 2676 * Update NUMA hit/miss statistics 2677 * 2678 * Must be called with interrupts disabled. 2679 */ 2680 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) 2681 { 2682 #ifdef CONFIG_NUMA 2683 enum zone_stat_item local_stat = NUMA_LOCAL; 2684 2685 if (z->node != numa_node_id()) 2686 local_stat = NUMA_OTHER; 2687 2688 if (z->node == preferred_zone->node) 2689 __inc_zone_state(z, NUMA_HIT); 2690 else { 2691 __inc_zone_state(z, NUMA_MISS); 2692 __inc_zone_state(preferred_zone, NUMA_FOREIGN); 2693 } 2694 __inc_zone_state(z, local_stat); 2695 #endif 2696 } 2697 2698 /* Remove page from the per-cpu list, caller must protect the list */ 2699 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 2700 bool cold, struct per_cpu_pages *pcp, 2701 struct list_head *list) 2702 { 2703 struct page *page; 2704 2705 do { 2706 if (list_empty(list)) { 2707 pcp->count += rmqueue_bulk(zone, 0, 2708 pcp->batch, list, 2709 migratetype, cold); 2710 if (unlikely(list_empty(list))) 2711 return NULL; 2712 } 2713 2714 if (cold) 2715 page = list_last_entry(list, struct page, lru); 2716 else 2717 page = list_first_entry(list, struct page, lru); 2718 2719 list_del(&page->lru); 2720 pcp->count--; 2721 } while (check_new_pcp(page)); 2722 2723 return page; 2724 } 2725 2726 /* Lock and remove page from the per-cpu list */ 2727 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 2728 struct zone *zone, unsigned int order, 2729 gfp_t gfp_flags, int migratetype) 2730 { 2731 struct per_cpu_pages *pcp; 2732 struct list_head *list; 2733 bool cold = ((gfp_flags & __GFP_COLD) != 0); 2734 struct page *page; 2735 unsigned long flags; 2736 2737 local_irq_save(flags); 2738 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2739 list = &pcp->lists[migratetype]; 2740 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); 2741 if (page) { 2742 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2743 zone_statistics(preferred_zone, zone); 2744 } 2745 local_irq_restore(flags); 2746 return page; 2747 } 2748 2749 /* 2750 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 
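 *
 * Order-0 requests are served from the per-cpu lists via
 * rmqueue_pcplist() without taking zone->lock (only local interrupts
 * are disabled); larger orders fall back to the buddy free lists under
 * zone->lock, trying the MIGRATE_HIGHATOMIC reserve first when
 * ALLOC_HARDER is set.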
2751 */ 2752 static inline 2753 struct page *rmqueue(struct zone *preferred_zone, 2754 struct zone *zone, unsigned int order, 2755 gfp_t gfp_flags, unsigned int alloc_flags, 2756 int migratetype) 2757 { 2758 unsigned long flags; 2759 struct page *page; 2760 2761 if (likely(order == 0)) { 2762 page = rmqueue_pcplist(preferred_zone, zone, order, 2763 gfp_flags, migratetype); 2764 goto out; 2765 } 2766 2767 /* 2768 * We most definitely don't want callers attempting to 2769 * allocate greater than order-1 page units with __GFP_NOFAIL. 2770 */ 2771 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); 2772 spin_lock_irqsave(&zone->lock, flags); 2773 2774 do { 2775 page = NULL; 2776 if (alloc_flags & ALLOC_HARDER) { 2777 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2778 if (page) 2779 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2780 } 2781 if (!page) 2782 page = __rmqueue(zone, order, migratetype); 2783 } while (page && check_new_pages(page, order)); 2784 spin_unlock(&zone->lock); 2785 if (!page) 2786 goto failed; 2787 __mod_zone_freepage_state(zone, -(1 << order), 2788 get_pcppage_migratetype(page)); 2789 2790 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2791 zone_statistics(preferred_zone, zone); 2792 local_irq_restore(flags); 2793 2794 out: 2795 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 2796 return page; 2797 2798 failed: 2799 local_irq_restore(flags); 2800 return NULL; 2801 } 2802 2803 #ifdef CONFIG_FAIL_PAGE_ALLOC 2804 2805 static struct { 2806 struct fault_attr attr; 2807 2808 bool ignore_gfp_highmem; 2809 bool ignore_gfp_reclaim; 2810 u32 min_order; 2811 } fail_page_alloc = { 2812 .attr = FAULT_ATTR_INITIALIZER, 2813 .ignore_gfp_reclaim = true, 2814 .ignore_gfp_highmem = true, 2815 .min_order = 1, 2816 }; 2817 2818 static int __init setup_fail_page_alloc(char *str) 2819 { 2820 return setup_fault_attr(&fail_page_alloc.attr, str); 2821 } 2822 __setup("fail_page_alloc=", setup_fail_page_alloc); 2823 2824 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2825 { 2826 if (order < fail_page_alloc.min_order) 2827 return false; 2828 if (gfp_mask & __GFP_NOFAIL) 2829 return false; 2830 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 2831 return false; 2832 if (fail_page_alloc.ignore_gfp_reclaim && 2833 (gfp_mask & __GFP_DIRECT_RECLAIM)) 2834 return false; 2835 2836 return should_fail(&fail_page_alloc.attr, 1 << order); 2837 } 2838 2839 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 2840 2841 static int __init fail_page_alloc_debugfs(void) 2842 { 2843 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 2844 struct dentry *dir; 2845 2846 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 2847 &fail_page_alloc.attr); 2848 if (IS_ERR(dir)) 2849 return PTR_ERR(dir); 2850 2851 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 2852 &fail_page_alloc.ignore_gfp_reclaim)) 2853 goto fail; 2854 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 2855 &fail_page_alloc.ignore_gfp_highmem)) 2856 goto fail; 2857 if (!debugfs_create_u32("min-order", mode, dir, 2858 &fail_page_alloc.min_order)) 2859 goto fail; 2860 2861 return 0; 2862 fail: 2863 debugfs_remove_recursive(dir); 2864 2865 return -ENOMEM; 2866 } 2867 2868 late_initcall(fail_page_alloc_debugfs); 2869 2870 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 2871 2872 #else /* CONFIG_FAIL_PAGE_ALLOC */ 2873 2874 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2875 { 2876 return false; 2877 } 2878 2879 #endif /* 
CONFIG_FAIL_PAGE_ALLOC */
2880
2881 /*
2882 * Return true if free base pages are above 'mark'. For high-order checks it
2883 * will return true if the order-0 watermark is reached and there is at least
2884 * one free page of a suitable size. Checking now avoids taking the zone lock
2885 * to check in the allocation paths if no pages are free.
2886 */
2887 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2888 int classzone_idx, unsigned int alloc_flags,
2889 long free_pages)
2890 {
2891 long min = mark;
2892 int o;
2893 const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
2894
2895 /* free_pages may go negative - that's OK */
2896 free_pages -= (1 << order) - 1;
2897
2898 if (alloc_flags & ALLOC_HIGH)
2899 min -= min / 2;
2900
2901 /*
2902 * If the caller does not have rights to ALLOC_HARDER then subtract
2903 * the high-atomic reserves. This will over-estimate the size of the
2904 * atomic reserve but it avoids a search.
2905 */
2906 if (likely(!alloc_harder))
2907 free_pages -= z->nr_reserved_highatomic;
2908 else
2909 min -= min / 4;
2910
2911 #ifdef CONFIG_CMA
2912 /* If allocation can't use CMA areas don't use free CMA pages */
2913 if (!(alloc_flags & ALLOC_CMA))
2914 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
2915 #endif
2916
2917 /*
2918 * Check watermarks for an order-0 allocation request. If these
2919 * are not met, then a high-order request also cannot go ahead
2920 * even if a suitable page happened to be free.
2921 */
2922 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
2923 return false;
2924
2925 /* If this is an order-0 request then the watermark is fine */
2926 if (!order)
2927 return true;
2928
2929 /* For a high-order request, check at least one suitable page is free */
2930 for (o = order; o < MAX_ORDER; o++) {
2931 struct free_area *area = &z->free_area[o];
2932 int mt;
2933
2934 if (!area->nr_free)
2935 continue;
2936
2937 if (alloc_harder)
2938 return true;
2939
2940 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
2941 if (!list_empty(&area->free_list[mt]))
2942 return true;
2943 }
2944
2945 #ifdef CONFIG_CMA
2946 if ((alloc_flags & ALLOC_CMA) &&
2947 !list_empty(&area->free_list[MIGRATE_CMA])) {
2948 return true;
2949 }
2950 #endif
2951 }
2952 return false;
2953 }
2954
2955 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2956 int classzone_idx, unsigned int alloc_flags)
2957 {
2958 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
2959 zone_page_state(z, NR_FREE_PAGES));
2960 }
2961
2962 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
2963 unsigned long mark, int classzone_idx, unsigned int alloc_flags)
2964 {
2965 long free_pages = zone_page_state(z, NR_FREE_PAGES);
2966 long cma_pages = 0;
2967
2968 #ifdef CONFIG_CMA
2969 /* If allocation can't use CMA areas don't use free CMA pages */
2970 if (!(alloc_flags & ALLOC_CMA))
2971 cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
2972 #endif
2973
2974 /*
2975 * Fast check for order-0 only. If this fails then the reserves
2976 * need to be calculated. There is a corner case where the check
2977 * passes but only the high-order atomic reserves are free. If
2978 * the caller is !atomic then it'll uselessly search the free
2979 * list. That corner case is then slower but it is harmless.
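 *
 * As a numeric sketch with made-up values: free_pages = 10000,
 * cma_pages = 1000, mark = 8000, lowmem_reserve = 500 gives
 * 10000 - 1000 > 8000 + 500, so an order-0 request passes here
 * without the full __zone_watermark_ok() walk.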
2980 */ 2981 if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) 2982 return true; 2983 2984 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 2985 free_pages); 2986 } 2987 2988 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 2989 unsigned long mark, int classzone_idx) 2990 { 2991 long free_pages = zone_page_state(z, NR_FREE_PAGES); 2992 2993 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 2994 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 2995 2996 return __zone_watermark_ok(z, order, mark, classzone_idx, 0, 2997 free_pages); 2998 } 2999 3000 #ifdef CONFIG_NUMA 3001 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3002 { 3003 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3004 RECLAIM_DISTANCE; 3005 } 3006 #else /* CONFIG_NUMA */ 3007 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3008 { 3009 return true; 3010 } 3011 #endif /* CONFIG_NUMA */ 3012 3013 /* 3014 * get_page_from_freelist goes through the zonelist trying to allocate 3015 * a page. 3016 */ 3017 static struct page * 3018 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 3019 const struct alloc_context *ac) 3020 { 3021 struct zoneref *z = ac->preferred_zoneref; 3022 struct zone *zone; 3023 struct pglist_data *last_pgdat_dirty_limit = NULL; 3024 3025 /* 3026 * Scan zonelist, looking for a zone with enough free. 3027 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 3028 */ 3029 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3030 ac->nodemask) { 3031 struct page *page; 3032 unsigned long mark; 3033 3034 if (cpusets_enabled() && 3035 (alloc_flags & ALLOC_CPUSET) && 3036 !__cpuset_zone_allowed(zone, gfp_mask)) 3037 continue; 3038 /* 3039 * When allocating a page cache page for writing, we 3040 * want to get it from a node that is within its dirty 3041 * limit, such that no single node holds more than its 3042 * proportional share of globally allowed dirty pages. 3043 * The dirty limits take into account the node's 3044 * lowmem reserves and high watermark so that kswapd 3045 * should be able to balance it without having to 3046 * write pages from its LRU list. 3047 * 3048 * XXX: For now, allow allocations to potentially 3049 * exceed the per-node dirty limit in the slowpath 3050 * (spread_dirty_pages unset) before going into reclaim, 3051 * which is important when on a NUMA setup the allowed 3052 * nodes are together not big enough to reach the 3053 * global limit. The proper fix for these situations 3054 * will require awareness of nodes in the 3055 * dirty-throttling and the flusher threads. 
3056 */ 3057 if (ac->spread_dirty_pages) { 3058 if (last_pgdat_dirty_limit == zone->zone_pgdat) 3059 continue; 3060 3061 if (!node_dirty_ok(zone->zone_pgdat)) { 3062 last_pgdat_dirty_limit = zone->zone_pgdat; 3063 continue; 3064 } 3065 } 3066 3067 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 3068 if (!zone_watermark_fast(zone, order, mark, 3069 ac_classzone_idx(ac), alloc_flags)) { 3070 int ret; 3071 3072 /* Checked here to keep the fast path fast */ 3073 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 3074 if (alloc_flags & ALLOC_NO_WATERMARKS) 3075 goto try_this_zone; 3076 3077 if (node_reclaim_mode == 0 || 3078 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) 3079 continue; 3080 3081 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); 3082 switch (ret) { 3083 case NODE_RECLAIM_NOSCAN: 3084 /* did not scan */ 3085 continue; 3086 case NODE_RECLAIM_FULL: 3087 /* scanned but unreclaimable */ 3088 continue; 3089 default: 3090 /* did we reclaim enough */ 3091 if (zone_watermark_ok(zone, order, mark, 3092 ac_classzone_idx(ac), alloc_flags)) 3093 goto try_this_zone; 3094 3095 continue; 3096 } 3097 } 3098 3099 try_this_zone: 3100 page = rmqueue(ac->preferred_zoneref->zone, zone, order, 3101 gfp_mask, alloc_flags, ac->migratetype); 3102 if (page) { 3103 prep_new_page(page, order, gfp_mask, alloc_flags); 3104 3105 /* 3106 * If this is a high-order atomic allocation then check 3107 * if the pageblock should be reserved for the future 3108 */ 3109 if (unlikely(order && (alloc_flags & ALLOC_HARDER))) 3110 reserve_highatomic_pageblock(page, zone, order); 3111 3112 return page; 3113 } 3114 } 3115 3116 return NULL; 3117 } 3118 3119 /* 3120 * Large machines with many possible nodes should not always dump per-node 3121 * meminfo in irq context. 3122 */ 3123 static inline bool should_suppress_show_mem(void) 3124 { 3125 bool ret = false; 3126 3127 #if NODES_SHIFT > 8 3128 ret = in_interrupt(); 3129 #endif 3130 return ret; 3131 } 3132 3133 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 3134 { 3135 unsigned int filter = SHOW_MEM_FILTER_NODES; 3136 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); 3137 3138 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) 3139 return; 3140 3141 /* 3142 * This documents exceptions given to allocations in certain 3143 * contexts that are allowed to allocate outside current's set 3144 * of allowed nodes. 3145 */ 3146 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3147 if (test_thread_flag(TIF_MEMDIE) || 3148 (current->flags & (PF_MEMALLOC | PF_EXITING))) 3149 filter &= ~SHOW_MEM_FILTER_NODES; 3150 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3151 filter &= ~SHOW_MEM_FILTER_NODES; 3152 3153 show_mem(filter, nodemask); 3154 } 3155 3156 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
3157 { 3158 struct va_format vaf; 3159 va_list args; 3160 static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, 3161 DEFAULT_RATELIMIT_BURST); 3162 3163 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 3164 return; 3165 3166 pr_warn("%s: ", current->comm); 3167 3168 va_start(args, fmt); 3169 vaf.fmt = fmt; 3170 vaf.va = &args; 3171 pr_cont("%pV", &vaf); 3172 va_end(args); 3173 3174 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); 3175 if (nodemask) 3176 pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); 3177 else 3178 pr_cont("(null)\n"); 3179 3180 cpuset_print_current_mems_allowed(); 3181 3182 dump_stack(); 3183 warn_alloc_show_mem(gfp_mask, nodemask); 3184 } 3185 3186 static inline struct page * 3187 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, 3188 unsigned int alloc_flags, 3189 const struct alloc_context *ac) 3190 { 3191 struct page *page; 3192 3193 page = get_page_from_freelist(gfp_mask, order, 3194 alloc_flags|ALLOC_CPUSET, ac); 3195 /* 3196 * fallback to ignore cpuset restriction if our nodes 3197 * are depleted 3198 */ 3199 if (!page) 3200 page = get_page_from_freelist(gfp_mask, order, 3201 alloc_flags, ac); 3202 3203 return page; 3204 } 3205 3206 static inline struct page * 3207 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 3208 const struct alloc_context *ac, unsigned long *did_some_progress) 3209 { 3210 struct oom_control oc = { 3211 .zonelist = ac->zonelist, 3212 .nodemask = ac->nodemask, 3213 .memcg = NULL, 3214 .gfp_mask = gfp_mask, 3215 .order = order, 3216 }; 3217 struct page *page; 3218 3219 *did_some_progress = 0; 3220 3221 /* 3222 * Acquire the oom lock. If that fails, somebody else is 3223 * making progress for us. 3224 */ 3225 if (!mutex_trylock(&oom_lock)) { 3226 *did_some_progress = 1; 3227 schedule_timeout_uninterruptible(1); 3228 return NULL; 3229 } 3230 3231 /* 3232 * Go through the zonelist yet one more time, keep very high watermark 3233 * here, this is only to catch a parallel oom killing, we must fail if 3234 * we're still under heavy pressure. 3235 */ 3236 page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, 3237 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 3238 if (page) 3239 goto out; 3240 3241 /* Coredumps can quickly deplete all memory reserves */ 3242 if (current->flags & PF_DUMPCORE) 3243 goto out; 3244 /* The OOM killer will not help higher order allocs */ 3245 if (order > PAGE_ALLOC_COSTLY_ORDER) 3246 goto out; 3247 /* The OOM killer does not needlessly kill tasks for lowmem */ 3248 if (ac->high_zoneidx < ZONE_NORMAL) 3249 goto out; 3250 if (pm_suspended_storage()) 3251 goto out; 3252 /* 3253 * XXX: GFP_NOFS allocations should rather fail than rely on 3254 * other request to make a forward progress. 3255 * We are in an unfortunate situation where out_of_memory cannot 3256 * do much for this context but let's try it to at least get 3257 * access to memory reserved if the current task is killed (see 3258 * out_of_memory). Once filesystems are ready to handle allocation 3259 * failures more gracefully we should just bail out here. 
3260 */
3261
3262 /* The OOM killer may not free memory on a specific node */
3263 if (gfp_mask & __GFP_THISNODE)
3264 goto out;
3265
3266 /* Exhausted what can be done so it's blamo time */
3267 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3268 *did_some_progress = 1;
3269
3270 /*
3271 * Help non-failing allocations by giving them access to memory
3272 * reserves
3273 */
3274 if (gfp_mask & __GFP_NOFAIL)
3275 page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3276 ALLOC_NO_WATERMARKS, ac);
3277 }
3278 out:
3279 mutex_unlock(&oom_lock);
3280 return page;
3281 }
3282
3283 /*
3284 * Maximum number of compaction retries with progress before the OOM
3285 * killer is considered the only way to move forward.
3286 */
3287 #define MAX_COMPACT_RETRIES 16
3288
3289 #ifdef CONFIG_COMPACTION
3290 /* Try memory compaction for high-order allocations before reclaim */
3291 static struct page *
3292 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3293 unsigned int alloc_flags, const struct alloc_context *ac,
3294 enum compact_priority prio, enum compact_result *compact_result)
3295 {
3296 struct page *page;
3297 unsigned int noreclaim_flag;
3298
3299 if (!order)
3300 return NULL;
3301
3302 noreclaim_flag = memalloc_noreclaim_save();
3303 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3304 prio);
3305 memalloc_noreclaim_restore(noreclaim_flag);
3306
3307 if (*compact_result <= COMPACT_INACTIVE)
3308 return NULL;
3309
3310 /*
3311 * At least in one zone compaction wasn't deferred or skipped, so let's
3312 * count a compaction stall
3313 */
3314 count_vm_event(COMPACTSTALL);
3315
3316 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3317
3318 if (page) {
3319 struct zone *zone = page_zone(page);
3320
3321 zone->compact_blockskip_flush = false;
3322 compaction_defer_reset(zone, order, true);
3323 count_vm_event(COMPACTSUCCESS);
3324 return page;
3325 }
3326
3327 /*
3328 * It's bad if a compaction run occurs and fails. The most likely reason
3329 * is that pages exist, but not enough to satisfy watermarks.
3330 */
3331 count_vm_event(COMPACTFAIL);
3332
3333 cond_resched();
3334
3335 return NULL;
3336 }
3337
3338 static inline bool
3339 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3340 enum compact_result compact_result,
3341 enum compact_priority *compact_priority,
3342 int *compaction_retries)
3343 {
3344 int max_retries = MAX_COMPACT_RETRIES;
3345 int min_priority;
3346 bool ret = false;
3347 int retries = *compaction_retries;
3348 enum compact_priority priority = *compact_priority;
3349
3350 if (!order)
3351 return false;
3352
3353 if (compaction_made_progress(compact_result))
3354 (*compaction_retries)++;
3355
3356 /*
3357 * compaction considers all the zones as desperately out of memory
3358 * so it doesn't really make much sense to retry except when the
3359 * failure could be caused by insufficient priority
3360 */
3361 if (compaction_failed(compact_result))
3362 goto check_priority;
3363
3364 /*
3365 * make sure the compaction wasn't deferred or didn't bail out early
3366 * due to lock contention before we declare that we should give up.
3367 * But do not retry if the given zonelist is not suitable for
3368 * compaction.
3369 */ 3370 if (compaction_withdrawn(compact_result)) { 3371 ret = compaction_zonelist_suitable(ac, order, alloc_flags); 3372 goto out; 3373 } 3374 3375 /* 3376 * !costly requests are much more important than __GFP_REPEAT 3377 * costly ones because they are de facto nofail and invoke OOM 3378 * killer to move on while costly can fail and users are ready 3379 * to cope with that. 1/4 retries is rather arbitrary but we 3380 * would need much more detailed feedback from compaction to 3381 * make a better decision. 3382 */ 3383 if (order > PAGE_ALLOC_COSTLY_ORDER) 3384 max_retries /= 4; 3385 if (*compaction_retries <= max_retries) { 3386 ret = true; 3387 goto out; 3388 } 3389 3390 /* 3391 * Make sure there are attempts at the highest priority if we exhausted 3392 * all retries or failed at the lower priorities. 3393 */ 3394 check_priority: 3395 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 3396 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 3397 3398 if (*compact_priority > min_priority) { 3399 (*compact_priority)--; 3400 *compaction_retries = 0; 3401 ret = true; 3402 } 3403 out: 3404 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); 3405 return ret; 3406 } 3407 #else 3408 static inline struct page * 3409 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3410 unsigned int alloc_flags, const struct alloc_context *ac, 3411 enum compact_priority prio, enum compact_result *compact_result) 3412 { 3413 *compact_result = COMPACT_SKIPPED; 3414 return NULL; 3415 } 3416 3417 static inline bool 3418 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 3419 enum compact_result compact_result, 3420 enum compact_priority *compact_priority, 3421 int *compaction_retries) 3422 { 3423 struct zone *zone; 3424 struct zoneref *z; 3425 3426 if (!order || order > PAGE_ALLOC_COSTLY_ORDER) 3427 return false; 3428 3429 /* 3430 * There are setups with compaction disabled which would prefer to loop 3431 * inside the allocator rather than hit the oom killer prematurely. 3432 * Let's give them a good hope and keep retrying while the order-0 3433 * watermarks are OK. 
3434 */
3435 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3436 ac->nodemask) {
3437 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3438 ac_classzone_idx(ac), alloc_flags))
3439 return true;
3440 }
3441 return false;
3442 }
3443 #endif /* CONFIG_COMPACTION */
3444
3445 /* Perform direct synchronous page reclaim */
3446 static int
3447 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3448 const struct alloc_context *ac)
3449 {
3450 struct reclaim_state reclaim_state;
3451 int progress;
3452 unsigned int noreclaim_flag;
3453
3454 cond_resched();
3455
3456 /* We now go into synchronous reclaim */
3457 cpuset_memory_pressure_bump();
3458 noreclaim_flag = memalloc_noreclaim_save();
3459 lockdep_set_current_reclaim_state(gfp_mask);
3460 reclaim_state.reclaimed_slab = 0;
3461 current->reclaim_state = &reclaim_state;
3462
3463 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3464 ac->nodemask);
3465
3466 current->reclaim_state = NULL;
3467 lockdep_clear_current_reclaim_state();
3468 memalloc_noreclaim_restore(noreclaim_flag);
3469
3470 cond_resched();
3471
3472 return progress;
3473 }
3474
3475 /* The really slow allocator path where we enter direct reclaim */
3476 static inline struct page *
3477 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3478 unsigned int alloc_flags, const struct alloc_context *ac,
3479 unsigned long *did_some_progress)
3480 {
3481 struct page *page = NULL;
3482 bool drained = false;
3483
3484 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
3485 if (unlikely(!(*did_some_progress)))
3486 return NULL;
3487
3488 retry:
3489 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3490
3491 /*
3492 * If an allocation failed after direct reclaim, it could be because
3493 * pages are pinned on the per-cpu lists or in high alloc reserves.
3494 * Shrink them and try again
3495 */
3496 if (!page && !drained) {
3497 unreserve_highatomic_pageblock(ac, false);
3498 drain_all_pages(NULL);
3499 drained = true;
3500 goto retry;
3501 }
3502
3503 return page;
3504 }
3505
3506 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
3507 {
3508 struct zoneref *z;
3509 struct zone *zone;
3510 pg_data_t *last_pgdat = NULL;
3511
3512 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3513 ac->high_zoneidx, ac->nodemask) {
3514 if (last_pgdat != zone->zone_pgdat)
3515 wakeup_kswapd(zone, order, ac->high_zoneidx);
3516 last_pgdat = zone->zone_pgdat;
3517 }
3518 }
3519
3520 static inline unsigned int
3521 gfp_to_alloc_flags(gfp_t gfp_mask)
3522 {
3523 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
3524
3525 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
3526 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
3527
3528 /*
3529 * The caller may dip into page reserves a bit more if the caller
3530 * cannot run direct reclaim, or if the caller has realtime scheduling
3531 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
3532 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
3533 */
3534 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
3535
3536 if (gfp_mask & __GFP_ATOMIC) {
3537 /*
3538 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
3539 * if it can't schedule.
3540 */
3541 if (!(gfp_mask & __GFP_NOMEMALLOC))
3542 alloc_flags |= ALLOC_HARDER;
3543 /*
3544 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
3545 * comment for __cpuset_node_allowed().
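 *
 * For example, a plain GFP_ATOMIC request (which sets __GFP_HIGH and
 * __GFP_ATOMIC) therefore ends up with ALLOC_WMARK_MIN | ALLOC_HIGH |
 * ALLOC_HARDER and with ALLOC_CPUSET cleared.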
3546 */
3547 alloc_flags &= ~ALLOC_CPUSET;
3548 } else if (unlikely(rt_task(current)) && !in_interrupt())
3549 alloc_flags |= ALLOC_HARDER;
3550
3551 #ifdef CONFIG_CMA
3552 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3553 alloc_flags |= ALLOC_CMA;
3554 #endif
3555 return alloc_flags;
3556 }
3557
3558 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3559 {
3560 if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3561 return false;
3562
3563 if (gfp_mask & __GFP_MEMALLOC)
3564 return true;
3565 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3566 return true;
3567 if (!in_interrupt() &&
3568 ((current->flags & PF_MEMALLOC) ||
3569 unlikely(test_thread_flag(TIF_MEMDIE))))
3570 return true;
3571
3572 return false;
3573 }
3574
3575 /*
3576 * Checks whether it makes sense to retry the reclaim to make forward progress
3577 * for the given allocation request.
3578 *
3579 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
3580 * without success, or when we couldn't even meet the watermark if we
3581 * reclaimed all remaining pages on the LRU lists.
3582 *
3583 * Returns true if a retry is viable or false to enter the oom path.
3584 */
3585 static inline bool
3586 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3587 struct alloc_context *ac, int alloc_flags,
3588 bool did_some_progress, int *no_progress_loops)
3589 {
3590 struct zone *zone;
3591 struct zoneref *z;
3592
3593 /*
3594 * Costly allocations might have made progress but this doesn't mean
3595 * their order will become available due to high fragmentation so
3596 * always increment the no progress counter for them
3597 */
3598 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
3599 *no_progress_loops = 0;
3600 else
3601 (*no_progress_loops)++;
3602
3603 /*
3604 * Make sure we converge to OOM if we cannot make any progress
3605 * several times in a row.
3606 */
3607 if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
3608 /* Before OOM, exhaust highatomic_reserve */
3609 return unreserve_highatomic_pageblock(ac, true);
3610 }
3611
3612 /*
3613 * Keep reclaiming pages while there is a chance this will lead
3614 * somewhere. If none of the target zones can satisfy our allocation
3615 * request even if all reclaimable pages are considered then we are
3616 * screwed and have to go OOM.
3617 */
3618 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3619 ac->nodemask) {
3620 unsigned long available;
3621 unsigned long reclaimable;
3622 unsigned long min_wmark = min_wmark_pages(zone);
3623 bool wmark;
3624
3625 available = reclaimable = zone_reclaimable_pages(zone);
3626 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
3627
3628 /*
3629 * Would the allocation succeed if we reclaimed all
3630 * reclaimable pages?
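 *
 * i.e. compare the min watermark against an optimistic estimate of
 * what could be freed: the current free pages plus everything that is
 * still reclaimable in this zone. If even that estimate fails the
 * check, retrying reclaim for this zone cannot help.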
3631 */
3632 wmark = __zone_watermark_ok(zone, order, min_wmark,
3633 ac_classzone_idx(ac), alloc_flags, available);
3634 trace_reclaim_retry_zone(z, order, reclaimable,
3635 available, min_wmark, *no_progress_loops, wmark);
3636 if (wmark) {
3637 /*
3638 * If we didn't make any progress and have a lot of
3639 * dirty + writeback pages then we should wait for
3640 * an IO to complete to slow down the reclaim and
3641 * prevent a premature OOM
3642 */
3643 if (!did_some_progress) {
3644 unsigned long write_pending;
3645
3646 write_pending = zone_page_state_snapshot(zone,
3647 NR_ZONE_WRITE_PENDING);
3648
3649 if (2 * write_pending > reclaimable) {
3650 congestion_wait(BLK_RW_ASYNC, HZ/10);
3651 return true;
3652 }
3653 }
3654
3655 /*
3656 * Memory allocation/reclaim might be called from a WQ
3657 * context and the current implementation of the WQ
3658 * concurrency control doesn't recognize that
3659 * a particular WQ is congested if the worker thread is
3660 * looping without ever sleeping. Therefore we have to
3661 * do a short sleep here rather than calling
3662 * cond_resched().
3663 */
3664 if (current->flags & PF_WQ_WORKER)
3665 schedule_timeout_uninterruptible(1);
3666 else
3667 cond_resched();
3668
3669 return true;
3670 }
3671 }
3672
3673 return false;
3674 }
3675
3676 static inline struct page *
3677 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3678 struct alloc_context *ac)
3679 {
3680 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
3681 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
3682 struct page *page = NULL;
3683 unsigned int alloc_flags;
3684 unsigned long did_some_progress;
3685 enum compact_priority compact_priority;
3686 enum compact_result compact_result;
3687 int compaction_retries;
3688 int no_progress_loops;
3689 unsigned long alloc_start = jiffies;
3690 unsigned int stall_timeout = 10 * HZ;
3691 unsigned int cpuset_mems_cookie;
3692
3693 /*
3694 * In the slowpath, we sanity check order to avoid ever trying to
3695 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
3696 * be using allocators in order of preference for an area that is
3697 * too large.
3698 */
3699 if (order >= MAX_ORDER) {
3700 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
3701 return NULL;
3702 }
3703
3704 /*
3705 * We also sanity check to catch abuse of atomic reserves being used by
3706 * callers that are not in atomic context.
3707 */
3708 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
3709 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
3710 gfp_mask &= ~__GFP_ATOMIC;
3711
3712 retry_cpuset:
3713 compaction_retries = 0;
3714 no_progress_loops = 0;
3715 compact_priority = DEF_COMPACT_PRIORITY;
3716 cpuset_mems_cookie = read_mems_allowed_begin();
3717
3718 /*
3719 * The fast path uses conservative alloc_flags to succeed only until
3720 * kswapd needs to be woken up, and to avoid the cost of setting up
3721 * alloc_flags precisely. So we do that now.
3722 */
3723 alloc_flags = gfp_to_alloc_flags(gfp_mask);
3724
3725 /*
3726 * We need to recalculate the starting point for the zonelist iterator
3727 * because we might have used a different nodemask in the fast path, or
3728 * there was a cpuset modification and we are retrying - otherwise we
3729 * could end up iterating over non-eligible zones endlessly.
3730 */ 3731 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3732 ac->high_zoneidx, ac->nodemask); 3733 if (!ac->preferred_zoneref->zone) 3734 goto nopage; 3735 3736 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3737 wake_all_kswapds(order, ac); 3738 3739 /* 3740 * The adjusted alloc_flags might result in immediate success, so try 3741 * that first 3742 */ 3743 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3744 if (page) 3745 goto got_pg; 3746 3747 /* 3748 * For costly allocations, try direct compaction first, as it's likely 3749 * that we have enough base pages and don't need to reclaim. For non- 3750 * movable high-order allocations, do that as well, as compaction will 3751 * try prevent permanent fragmentation by migrating from blocks of the 3752 * same migratetype. 3753 * Don't try this for allocations that are allowed to ignore 3754 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 3755 */ 3756 if (can_direct_reclaim && 3757 (costly_order || 3758 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) 3759 && !gfp_pfmemalloc_allowed(gfp_mask)) { 3760 page = __alloc_pages_direct_compact(gfp_mask, order, 3761 alloc_flags, ac, 3762 INIT_COMPACT_PRIORITY, 3763 &compact_result); 3764 if (page) 3765 goto got_pg; 3766 3767 /* 3768 * Checks for costly allocations with __GFP_NORETRY, which 3769 * includes THP page fault allocations 3770 */ 3771 if (costly_order && (gfp_mask & __GFP_NORETRY)) { 3772 /* 3773 * If compaction is deferred for high-order allocations, 3774 * it is because sync compaction recently failed. If 3775 * this is the case and the caller requested a THP 3776 * allocation, we do not want to heavily disrupt the 3777 * system, so we fail the allocation instead of entering 3778 * direct reclaim. 3779 */ 3780 if (compact_result == COMPACT_DEFERRED) 3781 goto nopage; 3782 3783 /* 3784 * Looks like reclaim/compaction is worth trying, but 3785 * sync compaction could be very expensive, so keep 3786 * using async compaction. 3787 */ 3788 compact_priority = INIT_COMPACT_PRIORITY; 3789 } 3790 } 3791 3792 retry: 3793 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 3794 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3795 wake_all_kswapds(order, ac); 3796 3797 if (gfp_pfmemalloc_allowed(gfp_mask)) 3798 alloc_flags = ALLOC_NO_WATERMARKS; 3799 3800 /* 3801 * Reset the zonelist iterators if memory policies can be ignored. 3802 * These allocations are high priority and system rather than user 3803 * orientated. 
3804 */ 3805 if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { 3806 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); 3807 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3808 ac->high_zoneidx, ac->nodemask); 3809 } 3810 3811 /* Attempt with potentially adjusted zonelist and alloc_flags */ 3812 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3813 if (page) 3814 goto got_pg; 3815 3816 /* Caller is not willing to reclaim, we can't balance anything */ 3817 if (!can_direct_reclaim) 3818 goto nopage; 3819 3820 /* Make sure we know about allocations which stall for too long */ 3821 if (time_after(jiffies, alloc_start + stall_timeout)) { 3822 warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, 3823 "page allocation stalls for %ums, order:%u", 3824 jiffies_to_msecs(jiffies-alloc_start), order); 3825 stall_timeout += 10 * HZ; 3826 } 3827 3828 /* Avoid recursion of direct reclaim */ 3829 if (current->flags & PF_MEMALLOC) 3830 goto nopage; 3831 3832 /* Try direct reclaim and then allocating */ 3833 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 3834 &did_some_progress); 3835 if (page) 3836 goto got_pg; 3837 3838 /* Try direct compaction and then allocating */ 3839 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 3840 compact_priority, &compact_result); 3841 if (page) 3842 goto got_pg; 3843 3844 /* Do not loop if specifically requested */ 3845 if (gfp_mask & __GFP_NORETRY) 3846 goto nopage; 3847 3848 /* 3849 * Do not retry costly high order allocations unless they are 3850 * __GFP_REPEAT 3851 */ 3852 if (costly_order && !(gfp_mask & __GFP_REPEAT)) 3853 goto nopage; 3854 3855 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 3856 did_some_progress > 0, &no_progress_loops)) 3857 goto retry; 3858 3859 /* 3860 * It doesn't make any sense to retry for the compaction if the order-0 3861 * reclaim is not able to make any progress because the current 3862 * implementation of the compaction depends on the sufficient amount 3863 * of free memory (see __compaction_suitable) 3864 */ 3865 if (did_some_progress > 0 && 3866 should_compact_retry(ac, order, alloc_flags, 3867 compact_result, &compact_priority, 3868 &compaction_retries)) 3869 goto retry; 3870 3871 /* 3872 * It's possible we raced with cpuset update so the OOM would be 3873 * premature (see below the nopage: label for full explanation). 3874 */ 3875 if (read_mems_allowed_retry(cpuset_mems_cookie)) 3876 goto retry_cpuset; 3877 3878 /* Reclaim has failed us, start killing things */ 3879 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 3880 if (page) 3881 goto got_pg; 3882 3883 /* Avoid allocations with no watermarks from looping endlessly */ 3884 if (test_thread_flag(TIF_MEMDIE) && 3885 (alloc_flags == ALLOC_NO_WATERMARKS || 3886 (gfp_mask & __GFP_NOMEMALLOC))) 3887 goto nopage; 3888 3889 /* Retry as long as the OOM killer is making progress */ 3890 if (did_some_progress) { 3891 no_progress_loops = 0; 3892 goto retry; 3893 } 3894 3895 nopage: 3896 /* 3897 * When updating a task's mems_allowed or mempolicy nodemask, it is 3898 * possible to race with parallel threads in such a way that our 3899 * allocation can fail while the mask is being updated. If we are about 3900 * to fail, check if the cpuset changed during allocation and if so, 3901 * retry. 
3902 */ 3903 if (read_mems_allowed_retry(cpuset_mems_cookie)) 3904 goto retry_cpuset; 3905 3906 /* 3907 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure 3908 * we always retry 3909 */ 3910 if (gfp_mask & __GFP_NOFAIL) { 3911 /* 3912 * All existing users of the __GFP_NOFAIL are blockable, so warn 3913 * of any new users that actually require GFP_NOWAIT 3914 */ 3915 if (WARN_ON_ONCE(!can_direct_reclaim)) 3916 goto fail; 3917 3918 /* 3919 * PF_MEMALLOC request from this context is rather bizarre 3920 * because we cannot reclaim anything and only can loop waiting 3921 * for somebody to do a work for us 3922 */ 3923 WARN_ON_ONCE(current->flags & PF_MEMALLOC); 3924 3925 /* 3926 * non failing costly orders are a hard requirement which we 3927 * are not prepared for much so let's warn about these users 3928 * so that we can identify them and convert them to something 3929 * else. 3930 */ 3931 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); 3932 3933 /* 3934 * Help non-failing allocations by giving them access to memory 3935 * reserves but do not use ALLOC_NO_WATERMARKS because this 3936 * could deplete whole memory reserves which would just make 3937 * the situation worse 3938 */ 3939 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); 3940 if (page) 3941 goto got_pg; 3942 3943 cond_resched(); 3944 goto retry; 3945 } 3946 fail: 3947 warn_alloc(gfp_mask, ac->nodemask, 3948 "page allocation failure: order:%u", order); 3949 got_pg: 3950 return page; 3951 } 3952 3953 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, 3954 struct zonelist *zonelist, nodemask_t *nodemask, 3955 struct alloc_context *ac, gfp_t *alloc_mask, 3956 unsigned int *alloc_flags) 3957 { 3958 ac->high_zoneidx = gfp_zone(gfp_mask); 3959 ac->zonelist = zonelist; 3960 ac->nodemask = nodemask; 3961 ac->migratetype = gfpflags_to_migratetype(gfp_mask); 3962 3963 if (cpusets_enabled()) { 3964 *alloc_mask |= __GFP_HARDWALL; 3965 if (!ac->nodemask) 3966 ac->nodemask = &cpuset_current_mems_allowed; 3967 else 3968 *alloc_flags |= ALLOC_CPUSET; 3969 } 3970 3971 lockdep_trace_alloc(gfp_mask); 3972 3973 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 3974 3975 if (should_fail_alloc_page(gfp_mask, order)) 3976 return false; 3977 3978 if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) 3979 *alloc_flags |= ALLOC_CMA; 3980 3981 return true; 3982 } 3983 3984 /* Determine whether to spread dirty pages and what the first usable zone */ 3985 static inline void finalise_ac(gfp_t gfp_mask, 3986 unsigned int order, struct alloc_context *ac) 3987 { 3988 /* Dirty zone balancing only done in the fast path */ 3989 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); 3990 3991 /* 3992 * The preferred zone is used for statistics but crucially it is 3993 * also used as the starting point for the zonelist iterator. It 3994 * may get reset for allocations that ignore memory policies. 3995 */ 3996 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3997 ac->high_zoneidx, ac->nodemask); 3998 } 3999 4000 /* 4001 * This is the 'heart' of the zoned buddy allocator. 
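 *
 * Most callers reach this through wrappers such as alloc_pages(),
 * __get_free_pages() or the slab allocator rather than calling it
 * directly. A minimal usage sketch via the wrappers (illustrative only):
 *
 *     struct page *page = alloc_pages(GFP_KERNEL, 0);
 *     if (page) {
 *             void *addr = page_address(page);
 *             ...
 *             __free_pages(page, 0);
 *     }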
4002 */ 4003 struct page * 4004 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 4005 struct zonelist *zonelist, nodemask_t *nodemask) 4006 { 4007 struct page *page; 4008 unsigned int alloc_flags = ALLOC_WMARK_LOW; 4009 gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ 4010 struct alloc_context ac = { }; 4011 4012 gfp_mask &= gfp_allowed_mask; 4013 if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) 4014 return NULL; 4015 4016 finalise_ac(gfp_mask, order, &ac); 4017 4018 /* First allocation attempt */ 4019 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 4020 if (likely(page)) 4021 goto out; 4022 4023 /* 4024 * Apply scoped allocation constraints. This is mainly about GFP_NOFS 4025 * resp. GFP_NOIO which has to be inherited for all allocation requests 4026 * from a particular context which has been marked by 4027 * memalloc_no{fs,io}_{save,restore}. 4028 */ 4029 alloc_mask = current_gfp_context(gfp_mask); 4030 ac.spread_dirty_pages = false; 4031 4032 /* 4033 * Restore the original nodemask if it was potentially replaced with 4034 * &cpuset_current_mems_allowed to optimize the fast-path attempt. 4035 */ 4036 if (unlikely(ac.nodemask != nodemask)) 4037 ac.nodemask = nodemask; 4038 4039 page = __alloc_pages_slowpath(alloc_mask, order, &ac); 4040 4041 out: 4042 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && 4043 unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { 4044 __free_pages(page, order); 4045 page = NULL; 4046 } 4047 4048 if (kmemcheck_enabled && page) 4049 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 4050 4051 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 4052 4053 return page; 4054 } 4055 EXPORT_SYMBOL(__alloc_pages_nodemask); 4056 4057 /* 4058 * Common helper functions. 4059 */ 4060 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 4061 { 4062 struct page *page; 4063 4064 /* 4065 * __get_free_pages() returns a 32-bit address, which cannot represent 4066 * a highmem page 4067 */ 4068 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 4069 4070 page = alloc_pages(gfp_mask, order); 4071 if (!page) 4072 return 0; 4073 return (unsigned long) page_address(page); 4074 } 4075 EXPORT_SYMBOL(__get_free_pages); 4076 4077 unsigned long get_zeroed_page(gfp_t gfp_mask) 4078 { 4079 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 4080 } 4081 EXPORT_SYMBOL(get_zeroed_page); 4082 4083 void __free_pages(struct page *page, unsigned int order) 4084 { 4085 if (put_page_testzero(page)) { 4086 if (order == 0) 4087 free_hot_cold_page(page, false); 4088 else 4089 __free_pages_ok(page, order); 4090 } 4091 } 4092 4093 EXPORT_SYMBOL(__free_pages); 4094 4095 void free_pages(unsigned long addr, unsigned int order) 4096 { 4097 if (addr != 0) { 4098 VM_BUG_ON(!virt_addr_valid((void *)addr)); 4099 __free_pages(virt_to_page((void *)addr), order); 4100 } 4101 } 4102 4103 EXPORT_SYMBOL(free_pages); 4104 4105 /* 4106 * Page Fragment: 4107 * An arbitrary-length arbitrary-offset area of memory which resides 4108 * within a 0 or higher order page. Multiple fragments within that page 4109 * are individually refcounted, in the page's reference counter. 4110 * 4111 * The page_frag functions below provide a simple allocation framework for 4112 * page fragments. This is used by the network stack and network device 4113 * drivers to provide a backing region of memory for use as either an 4114 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. 
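 *
 * A minimal usage sketch (illustrative only; real users typically embed
 * the cache in a per-cpu or per-device structure, as the networking code
 * does):
 *
 *     struct page_frag_cache nc = {};
 *     void *buf = page_frag_alloc(&nc, 256, GFP_ATOMIC);
 *     if (buf) {
 *             ...
 *             page_frag_free(buf);
 *     }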
4115 */ 4116 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, 4117 gfp_t gfp_mask) 4118 { 4119 struct page *page = NULL; 4120 gfp_t gfp = gfp_mask; 4121 4122 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4123 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 4124 __GFP_NOMEMALLOC; 4125 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 4126 PAGE_FRAG_CACHE_MAX_ORDER); 4127 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; 4128 #endif 4129 if (unlikely(!page)) 4130 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 4131 4132 nc->va = page ? page_address(page) : NULL; 4133 4134 return page; 4135 } 4136 4137 void __page_frag_cache_drain(struct page *page, unsigned int count) 4138 { 4139 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 4140 4141 if (page_ref_sub_and_test(page, count)) { 4142 unsigned int order = compound_order(page); 4143 4144 if (order == 0) 4145 free_hot_cold_page(page, false); 4146 else 4147 __free_pages_ok(page, order); 4148 } 4149 } 4150 EXPORT_SYMBOL(__page_frag_cache_drain); 4151 4152 void *page_frag_alloc(struct page_frag_cache *nc, 4153 unsigned int fragsz, gfp_t gfp_mask) 4154 { 4155 unsigned int size = PAGE_SIZE; 4156 struct page *page; 4157 int offset; 4158 4159 if (unlikely(!nc->va)) { 4160 refill: 4161 page = __page_frag_cache_refill(nc, gfp_mask); 4162 if (!page) 4163 return NULL; 4164 4165 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4166 /* if size can vary use size else just use PAGE_SIZE */ 4167 size = nc->size; 4168 #endif 4169 /* Even if we own the page, we do not use atomic_set(). 4170 * This would break get_page_unless_zero() users. 4171 */ 4172 page_ref_add(page, size - 1); 4173 4174 /* reset page count bias and offset to start of new frag */ 4175 nc->pfmemalloc = page_is_pfmemalloc(page); 4176 nc->pagecnt_bias = size; 4177 nc->offset = size; 4178 } 4179 4180 offset = nc->offset - fragsz; 4181 if (unlikely(offset < 0)) { 4182 page = virt_to_page(nc->va); 4183 4184 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) 4185 goto refill; 4186 4187 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4188 /* if size can vary use size else just use PAGE_SIZE */ 4189 size = nc->size; 4190 #endif 4191 /* OK, page count is 0, we can safely set it */ 4192 set_page_count(page, size); 4193 4194 /* reset page count bias and offset to start of new frag */ 4195 nc->pagecnt_bias = size; 4196 offset = size - fragsz; 4197 } 4198 4199 nc->pagecnt_bias--; 4200 nc->offset = offset; 4201 4202 return nc->va + offset; 4203 } 4204 EXPORT_SYMBOL(page_frag_alloc); 4205 4206 /* 4207 * Frees a page fragment allocated out of either a compound or order 0 page. 4208 */ 4209 void page_frag_free(void *addr) 4210 { 4211 struct page *page = virt_to_head_page(addr); 4212 4213 if (unlikely(put_page_testzero(page))) 4214 __free_pages_ok(page, compound_order(page)); 4215 } 4216 EXPORT_SYMBOL(page_frag_free); 4217 4218 static void *make_alloc_exact(unsigned long addr, unsigned int order, 4219 size_t size) 4220 { 4221 if (addr) { 4222 unsigned long alloc_end = addr + (PAGE_SIZE << order); 4223 unsigned long used = addr + PAGE_ALIGN(size); 4224 4225 split_page(virt_to_page((void *)addr), order); 4226 while (used < alloc_end) { 4227 free_page(used); 4228 used += PAGE_SIZE; 4229 } 4230 } 4231 return (void *)addr; 4232 } 4233 4234 /** 4235 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 
4236 * @size: the number of bytes to allocate 4237 * @gfp_mask: GFP flags for the allocation 4238 * 4239 * This function is similar to alloc_pages(), except that it allocates the 4240 * minimum number of pages to satisfy the request. alloc_pages() can only 4241 * allocate memory in power-of-two pages. 4242 * 4243 * This function is also limited by MAX_ORDER. 4244 * 4245 * Memory allocated by this function must be released by free_pages_exact(). 4246 */ 4247 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 4248 { 4249 unsigned int order = get_order(size); 4250 unsigned long addr; 4251 4252 addr = __get_free_pages(gfp_mask, order); 4253 return make_alloc_exact(addr, order, size); 4254 } 4255 EXPORT_SYMBOL(alloc_pages_exact); 4256 4257 /** 4258 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 4259 * pages on a node. 4260 * @nid: the preferred node ID where memory should be allocated 4261 * @size: the number of bytes to allocate 4262 * @gfp_mask: GFP flags for the allocation 4263 * 4264 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 4265 * back. 4266 */ 4267 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 4268 { 4269 unsigned int order = get_order(size); 4270 struct page *p = alloc_pages_node(nid, gfp_mask, order); 4271 if (!p) 4272 return NULL; 4273 return make_alloc_exact((unsigned long)page_address(p), order, size); 4274 } 4275 4276 /** 4277 * free_pages_exact - release memory allocated via alloc_pages_exact() 4278 * @virt: the value returned by alloc_pages_exact. 4279 * @size: size of allocation, same value as passed to alloc_pages_exact(). 4280 * 4281 * Release the memory allocated by a previous call to alloc_pages_exact. 4282 */ 4283 void free_pages_exact(void *virt, size_t size) 4284 { 4285 unsigned long addr = (unsigned long)virt; 4286 unsigned long end = addr + PAGE_ALIGN(size); 4287 4288 while (addr < end) { 4289 free_page(addr); 4290 addr += PAGE_SIZE; 4291 } 4292 } 4293 EXPORT_SYMBOL(free_pages_exact); 4294 4295 /** 4296 * nr_free_zone_pages - count number of pages beyond high watermark 4297 * @offset: The zone index of the highest zone 4298 * 4299 * nr_free_zone_pages() counts the number of counts pages which are beyond the 4300 * high watermark within all zones at or below a given zone index. For each 4301 * zone, the number of pages is calculated as: 4302 * 4303 * nr_free_zone_pages = managed_pages - high_pages 4304 */ 4305 static unsigned long nr_free_zone_pages(int offset) 4306 { 4307 struct zoneref *z; 4308 struct zone *zone; 4309 4310 /* Just pick one node, since fallback list is circular */ 4311 unsigned long sum = 0; 4312 4313 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 4314 4315 for_each_zone_zonelist(zone, z, zonelist, offset) { 4316 unsigned long size = zone->managed_pages; 4317 unsigned long high = high_wmark_pages(zone); 4318 if (size > high) 4319 sum += size - high; 4320 } 4321 4322 return sum; 4323 } 4324 4325 /** 4326 * nr_free_buffer_pages - count number of pages beyond high watermark 4327 * 4328 * nr_free_buffer_pages() counts the number of pages which are beyond the high 4329 * watermark within ZONE_DMA and ZONE_NORMAL. 
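 *
 * For example (illustrative numbers), a zone with managed_pages = 262144
 * and a high watermark of 4096 pages contributes 262144 - 4096 = 258048
 * pages to the returned total.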
4330 */ 4331 unsigned long nr_free_buffer_pages(void) 4332 { 4333 return nr_free_zone_pages(gfp_zone(GFP_USER)); 4334 } 4335 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 4336 4337 /** 4338 * nr_free_pagecache_pages - count number of pages beyond high watermark 4339 * 4340 * nr_free_pagecache_pages() counts the number of pages which are beyond the 4341 * high watermark within all zones. 4342 */ 4343 unsigned long nr_free_pagecache_pages(void) 4344 { 4345 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 4346 } 4347 4348 static inline void show_node(struct zone *zone) 4349 { 4350 if (IS_ENABLED(CONFIG_NUMA)) 4351 printk("Node %d ", zone_to_nid(zone)); 4352 } 4353 4354 long si_mem_available(void) 4355 { 4356 long available; 4357 unsigned long pagecache; 4358 unsigned long wmark_low = 0; 4359 unsigned long pages[NR_LRU_LISTS]; 4360 struct zone *zone; 4361 int lru; 4362 4363 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 4364 pages[lru] = global_node_page_state(NR_LRU_BASE + lru); 4365 4366 for_each_zone(zone) 4367 wmark_low += zone->watermark[WMARK_LOW]; 4368 4369 /* 4370 * Estimate the amount of memory available for userspace allocations, 4371 * without causing swapping. 4372 */ 4373 available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; 4374 4375 /* 4376 * Not all the page cache can be freed, otherwise the system will 4377 * start swapping. Assume at least half of the page cache, or the 4378 * low watermark worth of cache, needs to stay. 4379 */ 4380 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; 4381 pagecache -= min(pagecache / 2, wmark_low); 4382 available += pagecache; 4383 4384 /* 4385 * Part of the reclaimable slab consists of items that are in use, 4386 * and cannot be freed. Cap this estimate at the low watermark. 
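 *
 * Putting the pieces together, the estimate computed by this function is
 * roughly:
 *
 *   available = NR_FREE_PAGES - totalreserve_pages
 *             + pagecache - min(pagecache / 2, wmark_low)
 *             + NR_SLAB_RECLAIMABLE - min(NR_SLAB_RECLAIMABLE / 2, wmark_low)
 *
 * where pagecache = active_file + inactive_file and wmark_low is the sum
 * of all zones' low watermarks.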
4387 */ 4388 available += global_page_state(NR_SLAB_RECLAIMABLE) - 4389 min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); 4390 4391 if (available < 0) 4392 available = 0; 4393 return available; 4394 } 4395 EXPORT_SYMBOL_GPL(si_mem_available); 4396 4397 void si_meminfo(struct sysinfo *val) 4398 { 4399 val->totalram = totalram_pages; 4400 val->sharedram = global_node_page_state(NR_SHMEM); 4401 val->freeram = global_page_state(NR_FREE_PAGES); 4402 val->bufferram = nr_blockdev_pages(); 4403 val->totalhigh = totalhigh_pages; 4404 val->freehigh = nr_free_highpages(); 4405 val->mem_unit = PAGE_SIZE; 4406 } 4407 4408 EXPORT_SYMBOL(si_meminfo); 4409 4410 #ifdef CONFIG_NUMA 4411 void si_meminfo_node(struct sysinfo *val, int nid) 4412 { 4413 int zone_type; /* needs to be signed */ 4414 unsigned long managed_pages = 0; 4415 unsigned long managed_highpages = 0; 4416 unsigned long free_highpages = 0; 4417 pg_data_t *pgdat = NODE_DATA(nid); 4418 4419 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 4420 managed_pages += pgdat->node_zones[zone_type].managed_pages; 4421 val->totalram = managed_pages; 4422 val->sharedram = node_page_state(pgdat, NR_SHMEM); 4423 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); 4424 #ifdef CONFIG_HIGHMEM 4425 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 4426 struct zone *zone = &pgdat->node_zones[zone_type]; 4427 4428 if (is_highmem(zone)) { 4429 managed_highpages += zone->managed_pages; 4430 free_highpages += zone_page_state(zone, NR_FREE_PAGES); 4431 } 4432 } 4433 val->totalhigh = managed_highpages; 4434 val->freehigh = free_highpages; 4435 #else 4436 val->totalhigh = managed_highpages; 4437 val->freehigh = free_highpages; 4438 #endif 4439 val->mem_unit = PAGE_SIZE; 4440 } 4441 #endif 4442 4443 /* 4444 * Determine whether the node should be displayed or not, depending on whether 4445 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 4446 */ 4447 static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) 4448 { 4449 if (!(flags & SHOW_MEM_FILTER_NODES)) 4450 return false; 4451 4452 /* 4453 * no node mask - aka implicit memory numa policy. Do not bother with 4454 * the synchronization - read_mems_allowed_begin - because we do not 4455 * have to be precise here. 4456 */ 4457 if (!nodemask) 4458 nodemask = &cpuset_current_mems_allowed; 4459 4460 return !node_isset(nid, *nodemask); 4461 } 4462 4463 #define K(x) ((x) << (PAGE_SHIFT-10)) 4464 4465 static void show_migration_types(unsigned char type) 4466 { 4467 static const char types[MIGRATE_TYPES] = { 4468 [MIGRATE_UNMOVABLE] = 'U', 4469 [MIGRATE_MOVABLE] = 'M', 4470 [MIGRATE_RECLAIMABLE] = 'E', 4471 [MIGRATE_HIGHATOMIC] = 'H', 4472 #ifdef CONFIG_CMA 4473 [MIGRATE_CMA] = 'C', 4474 #endif 4475 #ifdef CONFIG_MEMORY_ISOLATION 4476 [MIGRATE_ISOLATE] = 'I', 4477 #endif 4478 }; 4479 char tmp[MIGRATE_TYPES + 1]; 4480 char *p = tmp; 4481 int i; 4482 4483 for (i = 0; i < MIGRATE_TYPES; i++) { 4484 if (type & (1 << i)) 4485 *p++ = types[i]; 4486 } 4487 4488 *p = '\0'; 4489 printk(KERN_CONT "(%s) ", tmp); 4490 } 4491 4492 /* 4493 * Show free area list (used inside shift_scroll-lock stuff) 4494 * We also calculate the percentage fragmentation. We do this by counting the 4495 * memory on each free list with the exception of the first item on the list. 4496 * 4497 * Bits in @filter: 4498 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's 4499 * cpuset. 
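 *
 * Callers normally reach this via the show_mem()/warn_alloc() path on
 * allocation failure (the exact call chain may vary); passing
 * SHOW_MEM_FILTER_NODES together with the failing allocation's nodemask
 * keeps the dump restricted to nodes the task is actually allowed to use.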
4500 */ 4501 void show_free_areas(unsigned int filter, nodemask_t *nodemask) 4502 { 4503 unsigned long free_pcp = 0; 4504 int cpu; 4505 struct zone *zone; 4506 pg_data_t *pgdat; 4507 4508 for_each_populated_zone(zone) { 4509 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 4510 continue; 4511 4512 for_each_online_cpu(cpu) 4513 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 4514 } 4515 4516 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 4517 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 4518 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" 4519 " slab_reclaimable:%lu slab_unreclaimable:%lu\n" 4520 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 4521 " free:%lu free_pcp:%lu free_cma:%lu\n", 4522 global_node_page_state(NR_ACTIVE_ANON), 4523 global_node_page_state(NR_INACTIVE_ANON), 4524 global_node_page_state(NR_ISOLATED_ANON), 4525 global_node_page_state(NR_ACTIVE_FILE), 4526 global_node_page_state(NR_INACTIVE_FILE), 4527 global_node_page_state(NR_ISOLATED_FILE), 4528 global_node_page_state(NR_UNEVICTABLE), 4529 global_node_page_state(NR_FILE_DIRTY), 4530 global_node_page_state(NR_WRITEBACK), 4531 global_node_page_state(NR_UNSTABLE_NFS), 4532 global_page_state(NR_SLAB_RECLAIMABLE), 4533 global_page_state(NR_SLAB_UNRECLAIMABLE), 4534 global_node_page_state(NR_FILE_MAPPED), 4535 global_node_page_state(NR_SHMEM), 4536 global_page_state(NR_PAGETABLE), 4537 global_page_state(NR_BOUNCE), 4538 global_page_state(NR_FREE_PAGES), 4539 free_pcp, 4540 global_page_state(NR_FREE_CMA_PAGES)); 4541 4542 for_each_online_pgdat(pgdat) { 4543 if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) 4544 continue; 4545 4546 printk("Node %d" 4547 " active_anon:%lukB" 4548 " inactive_anon:%lukB" 4549 " active_file:%lukB" 4550 " inactive_file:%lukB" 4551 " unevictable:%lukB" 4552 " isolated(anon):%lukB" 4553 " isolated(file):%lukB" 4554 " mapped:%lukB" 4555 " dirty:%lukB" 4556 " writeback:%lukB" 4557 " shmem:%lukB" 4558 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4559 " shmem_thp: %lukB" 4560 " shmem_pmdmapped: %lukB" 4561 " anon_thp: %lukB" 4562 #endif 4563 " writeback_tmp:%lukB" 4564 " unstable:%lukB" 4565 " all_unreclaimable? %s" 4566 "\n", 4567 pgdat->node_id, 4568 K(node_page_state(pgdat, NR_ACTIVE_ANON)), 4569 K(node_page_state(pgdat, NR_INACTIVE_ANON)), 4570 K(node_page_state(pgdat, NR_ACTIVE_FILE)), 4571 K(node_page_state(pgdat, NR_INACTIVE_FILE)), 4572 K(node_page_state(pgdat, NR_UNEVICTABLE)), 4573 K(node_page_state(pgdat, NR_ISOLATED_ANON)), 4574 K(node_page_state(pgdat, NR_ISOLATED_FILE)), 4575 K(node_page_state(pgdat, NR_FILE_MAPPED)), 4576 K(node_page_state(pgdat, NR_FILE_DIRTY)), 4577 K(node_page_state(pgdat, NR_WRITEBACK)), 4578 K(node_page_state(pgdat, NR_SHMEM)), 4579 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4580 K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), 4581 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) 4582 * HPAGE_PMD_NR), 4583 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), 4584 #endif 4585 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), 4586 K(node_page_state(pgdat, NR_UNSTABLE_NFS)), 4587 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 
4588 "yes" : "no"); 4589 } 4590 4591 for_each_populated_zone(zone) { 4592 int i; 4593 4594 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 4595 continue; 4596 4597 free_pcp = 0; 4598 for_each_online_cpu(cpu) 4599 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 4600 4601 show_node(zone); 4602 printk(KERN_CONT 4603 "%s" 4604 " free:%lukB" 4605 " min:%lukB" 4606 " low:%lukB" 4607 " high:%lukB" 4608 " active_anon:%lukB" 4609 " inactive_anon:%lukB" 4610 " active_file:%lukB" 4611 " inactive_file:%lukB" 4612 " unevictable:%lukB" 4613 " writepending:%lukB" 4614 " present:%lukB" 4615 " managed:%lukB" 4616 " mlocked:%lukB" 4617 " slab_reclaimable:%lukB" 4618 " slab_unreclaimable:%lukB" 4619 " kernel_stack:%lukB" 4620 " pagetables:%lukB" 4621 " bounce:%lukB" 4622 " free_pcp:%lukB" 4623 " local_pcp:%ukB" 4624 " free_cma:%lukB" 4625 "\n", 4626 zone->name, 4627 K(zone_page_state(zone, NR_FREE_PAGES)), 4628 K(min_wmark_pages(zone)), 4629 K(low_wmark_pages(zone)), 4630 K(high_wmark_pages(zone)), 4631 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), 4632 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), 4633 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), 4634 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), 4635 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), 4636 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), 4637 K(zone->present_pages), 4638 K(zone->managed_pages), 4639 K(zone_page_state(zone, NR_MLOCK)), 4640 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 4641 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 4642 zone_page_state(zone, NR_KERNEL_STACK_KB), 4643 K(zone_page_state(zone, NR_PAGETABLE)), 4644 K(zone_page_state(zone, NR_BOUNCE)), 4645 K(free_pcp), 4646 K(this_cpu_read(zone->pageset->pcp.count)), 4647 K(zone_page_state(zone, NR_FREE_CMA_PAGES))); 4648 printk("lowmem_reserve[]:"); 4649 for (i = 0; i < MAX_NR_ZONES; i++) 4650 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); 4651 printk(KERN_CONT "\n"); 4652 } 4653 4654 for_each_populated_zone(zone) { 4655 unsigned int order; 4656 unsigned long nr[MAX_ORDER], flags, total = 0; 4657 unsigned char types[MAX_ORDER]; 4658 4659 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 4660 continue; 4661 show_node(zone); 4662 printk(KERN_CONT "%s: ", zone->name); 4663 4664 spin_lock_irqsave(&zone->lock, flags); 4665 for (order = 0; order < MAX_ORDER; order++) { 4666 struct free_area *area = &zone->free_area[order]; 4667 int type; 4668 4669 nr[order] = area->nr_free; 4670 total += nr[order] << order; 4671 4672 types[order] = 0; 4673 for (type = 0; type < MIGRATE_TYPES; type++) { 4674 if (!list_empty(&area->free_list[type])) 4675 types[order] |= 1 << type; 4676 } 4677 } 4678 spin_unlock_irqrestore(&zone->lock, flags); 4679 for (order = 0; order < MAX_ORDER; order++) { 4680 printk(KERN_CONT "%lu*%lukB ", 4681 nr[order], K(1UL) << order); 4682 if (nr[order]) 4683 show_migration_types(types[order]); 4684 } 4685 printk(KERN_CONT "= %lukB\n", K(total)); 4686 } 4687 4688 hugetlb_show_meminfo(); 4689 4690 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); 4691 4692 show_swap_cache_info(); 4693 } 4694 4695 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 4696 { 4697 zoneref->zone = zone; 4698 zoneref->zone_idx = zone_idx(zone); 4699 } 4700 4701 /* 4702 * Builds allocation fallback zone lists. 4703 * 4704 * Add all populated zones of a node to the zonelist. 
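 *
 * For example, on a node with populated DMA, DMA32 and Normal zones the
 * zonerefs are added highest zone first, i.e. Normal, DMA32, DMA, because
 * the loop below walks zone_type downwards from MAX_NR_ZONES - 1.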
4705 */ 4706 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 4707 int nr_zones) 4708 { 4709 struct zone *zone; 4710 enum zone_type zone_type = MAX_NR_ZONES; 4711 4712 do { 4713 zone_type--; 4714 zone = pgdat->node_zones + zone_type; 4715 if (managed_zone(zone)) { 4716 zoneref_set_zone(zone, 4717 &zonelist->_zonerefs[nr_zones++]); 4718 check_highest_zone(zone_type); 4719 } 4720 } while (zone_type); 4721 4722 return nr_zones; 4723 } 4724 4725 4726 /* 4727 * zonelist_order: 4728 * 0 = automatic detection of better ordering. 4729 * 1 = order by ([node] distance, -zonetype) 4730 * 2 = order by (-zonetype, [node] distance) 4731 * 4732 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 4733 * the same zonelist. So only NUMA can configure this param. 4734 */ 4735 #define ZONELIST_ORDER_DEFAULT 0 4736 #define ZONELIST_ORDER_NODE 1 4737 #define ZONELIST_ORDER_ZONE 2 4738 4739 /* zonelist order in the kernel. 4740 * set_zonelist_order() will set this to NODE or ZONE. 4741 */ 4742 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 4743 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 4744 4745 4746 #ifdef CONFIG_NUMA 4747 /* The value user specified ....changed by config */ 4748 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 4749 /* string for sysctl */ 4750 #define NUMA_ZONELIST_ORDER_LEN 16 4751 char numa_zonelist_order[16] = "default"; 4752 4753 /* 4754 * interface for configure zonelist ordering. 4755 * command line option "numa_zonelist_order" 4756 * = "[dD]efault - default, automatic configuration. 4757 * = "[nN]ode - order by node locality, then by zone within node 4758 * = "[zZ]one - order by zone, then by locality within zone 4759 */ 4760 4761 static int __parse_numa_zonelist_order(char *s) 4762 { 4763 if (*s == 'd' || *s == 'D') { 4764 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 4765 } else if (*s == 'n' || *s == 'N') { 4766 user_zonelist_order = ZONELIST_ORDER_NODE; 4767 } else if (*s == 'z' || *s == 'Z') { 4768 user_zonelist_order = ZONELIST_ORDER_ZONE; 4769 } else { 4770 pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); 4771 return -EINVAL; 4772 } 4773 return 0; 4774 } 4775 4776 static __init int setup_numa_zonelist_order(char *s) 4777 { 4778 int ret; 4779 4780 if (!s) 4781 return 0; 4782 4783 ret = __parse_numa_zonelist_order(s); 4784 if (ret == 0) 4785 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 4786 4787 return ret; 4788 } 4789 early_param("numa_zonelist_order", setup_numa_zonelist_order); 4790 4791 /* 4792 * sysctl handler for numa_zonelist_order 4793 */ 4794 int numa_zonelist_order_handler(struct ctl_table *table, int write, 4795 void __user *buffer, size_t *length, 4796 loff_t *ppos) 4797 { 4798 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 4799 int ret; 4800 static DEFINE_MUTEX(zl_order_mutex); 4801 4802 mutex_lock(&zl_order_mutex); 4803 if (write) { 4804 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 4805 ret = -EINVAL; 4806 goto out; 4807 } 4808 strcpy(saved_string, (char *)table->data); 4809 } 4810 ret = proc_dostring(table, write, buffer, length, ppos); 4811 if (ret) 4812 goto out; 4813 if (write) { 4814 int oldval = user_zonelist_order; 4815 4816 ret = __parse_numa_zonelist_order((char *)table->data); 4817 if (ret) { 4818 /* 4819 * bogus value. 
restore saved string 4820 */ 4821 strncpy((char *)table->data, saved_string, 4822 NUMA_ZONELIST_ORDER_LEN); 4823 user_zonelist_order = oldval; 4824 } else if (oldval != user_zonelist_order) { 4825 mutex_lock(&zonelists_mutex); 4826 build_all_zonelists(NULL, NULL); 4827 mutex_unlock(&zonelists_mutex); 4828 } 4829 } 4830 out: 4831 mutex_unlock(&zl_order_mutex); 4832 return ret; 4833 } 4834 4835 4836 #define MAX_NODE_LOAD (nr_online_nodes) 4837 static int node_load[MAX_NUMNODES]; 4838 4839 /** 4840 * find_next_best_node - find the next node that should appear in a given node's fallback list 4841 * @node: node whose fallback list we're appending 4842 * @used_node_mask: nodemask_t of already used nodes 4843 * 4844 * We use a number of factors to determine which is the next node that should 4845 * appear on a given node's fallback list. The node should not have appeared 4846 * already in @node's fallback list, and it should be the next closest node 4847 * according to the distance array (which contains arbitrary distance values 4848 * from each node to each node in the system), and should also prefer nodes 4849 * with no CPUs, since presumably they'll have very little allocation pressure 4850 * on them otherwise. 4851 * It returns -1 if no node is found. 4852 */ 4853 static int find_next_best_node(int node, nodemask_t *used_node_mask) 4854 { 4855 int n, val; 4856 int min_val = INT_MAX; 4857 int best_node = NUMA_NO_NODE; 4858 const struct cpumask *tmp = cpumask_of_node(0); 4859 4860 /* Use the local node if we haven't already */ 4861 if (!node_isset(node, *used_node_mask)) { 4862 node_set(node, *used_node_mask); 4863 return node; 4864 } 4865 4866 for_each_node_state(n, N_MEMORY) { 4867 4868 /* Don't want a node to appear more than once */ 4869 if (node_isset(n, *used_node_mask)) 4870 continue; 4871 4872 /* Use the distance array to find the distance */ 4873 val = node_distance(node, n); 4874 4875 /* Penalize nodes under us ("prefer the next node") */ 4876 val += (n < node); 4877 4878 /* Give preference to headless and unused nodes */ 4879 tmp = cpumask_of_node(n); 4880 if (!cpumask_empty(tmp)) 4881 val += PENALTY_FOR_NODE_WITH_CPUS; 4882 4883 /* Slight preference for less loaded node */ 4884 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 4885 val += node_load[n]; 4886 4887 if (val < min_val) { 4888 min_val = val; 4889 best_node = n; 4890 } 4891 } 4892 4893 if (best_node >= 0) 4894 node_set(best_node, *used_node_mask); 4895 4896 return best_node; 4897 } 4898 4899 4900 /* 4901 * Build zonelists ordered by node and zones within node. 4902 * This results in maximum locality--normal zone overflows into local 4903 * DMA zone, if any--but risks exhausting DMA zone. 
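 *
 * Illustrative example for a two-node machine (assuming both nodes have all
 * three zones populated): with node ordering, node 0's fallback list is
 * N0-Normal, N0-DMA32, N0-DMA, N1-Normal, N1-DMA32, N1-DMA; with zone
 * ordering (see build_zonelists_in_zone_order() below) it becomes
 * N0-Normal, N1-Normal, N0-DMA32, N1-DMA32, N0-DMA, N1-DMA.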
4904 */ 4905 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 4906 { 4907 int j; 4908 struct zonelist *zonelist; 4909 4910 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; 4911 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 4912 ; 4913 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 4914 zonelist->_zonerefs[j].zone = NULL; 4915 zonelist->_zonerefs[j].zone_idx = 0; 4916 } 4917 4918 /* 4919 * Build gfp_thisnode zonelists 4920 */ 4921 static void build_thisnode_zonelists(pg_data_t *pgdat) 4922 { 4923 int j; 4924 struct zonelist *zonelist; 4925 4926 zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; 4927 j = build_zonelists_node(pgdat, zonelist, 0); 4928 zonelist->_zonerefs[j].zone = NULL; 4929 zonelist->_zonerefs[j].zone_idx = 0; 4930 } 4931 4932 /* 4933 * Build zonelists ordered by zone and nodes within zones. 4934 * This results in conserving DMA zone[s] until all Normal memory is 4935 * exhausted, but results in overflowing to a remote node while memory 4936 * may still exist in the local DMA zone. 4937 */ 4938 static int node_order[MAX_NUMNODES]; 4939 4940 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 4941 { 4942 int pos, j, node; 4943 int zone_type; /* needs to be signed */ 4944 struct zone *z; 4945 struct zonelist *zonelist; 4946 4947 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; 4948 pos = 0; 4949 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 4950 for (j = 0; j < nr_nodes; j++) { 4951 node = node_order[j]; 4952 z = &NODE_DATA(node)->node_zones[zone_type]; 4953 if (managed_zone(z)) { 4954 zoneref_set_zone(z, 4955 &zonelist->_zonerefs[pos++]); 4956 check_highest_zone(zone_type); 4957 } 4958 } 4959 } 4960 zonelist->_zonerefs[pos].zone = NULL; 4961 zonelist->_zonerefs[pos].zone_idx = 0; 4962 } 4963 4964 #if defined(CONFIG_64BIT) 4965 /* 4966 * Devices that require DMA32/DMA are relatively rare and do not justify a 4967 * penalty to every machine in case the specialised case applies. Default 4968 * to Node-ordering on 64-bit NUMA machines. 4969 */ 4970 static int default_zonelist_order(void) 4971 { 4972 return ZONELIST_ORDER_NODE; 4973 } 4974 #else 4975 /* 4976 * On 32-bit, the Normal zone needs to be preserved for allocations accessible 4977 * by the kernel. If processes running on node 0 deplete the low memory zone 4978 * then reclaim will occur more frequently, increasing stalls and potentially 4979 * making it easier to OOM if a large percentage of the zone is under writeback 4980 * or dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. 4981 * Hence, default to zone ordering on 32-bit.
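 *
 * Either default can be overridden with the numa_zonelist_order= boot
 * parameter or the numa_zonelist_order sysctl handled above, e.g.
 * numa_zonelist_order=N to force node ordering.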
4982 */ 4983 static int default_zonelist_order(void) 4984 { 4985 return ZONELIST_ORDER_ZONE; 4986 } 4987 #endif /* CONFIG_64BIT */ 4988 4989 static void set_zonelist_order(void) 4990 { 4991 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 4992 current_zonelist_order = default_zonelist_order(); 4993 else 4994 current_zonelist_order = user_zonelist_order; 4995 } 4996 4997 static void build_zonelists(pg_data_t *pgdat) 4998 { 4999 int i, node, load; 5000 nodemask_t used_mask; 5001 int local_node, prev_node; 5002 struct zonelist *zonelist; 5003 unsigned int order = current_zonelist_order; 5004 5005 /* initialize zonelists */ 5006 for (i = 0; i < MAX_ZONELISTS; i++) { 5007 zonelist = pgdat->node_zonelists + i; 5008 zonelist->_zonerefs[0].zone = NULL; 5009 zonelist->_zonerefs[0].zone_idx = 0; 5010 } 5011 5012 /* NUMA-aware ordering of nodes */ 5013 local_node = pgdat->node_id; 5014 load = nr_online_nodes; 5015 prev_node = local_node; 5016 nodes_clear(used_mask); 5017 5018 memset(node_order, 0, sizeof(node_order)); 5019 i = 0; 5020 5021 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 5022 /* 5023 * We don't want to pressure a particular node. 5024 * So adding penalty to the first node in same 5025 * distance group to make it round-robin. 5026 */ 5027 if (node_distance(local_node, node) != 5028 node_distance(local_node, prev_node)) 5029 node_load[node] = load; 5030 5031 prev_node = node; 5032 load--; 5033 if (order == ZONELIST_ORDER_NODE) 5034 build_zonelists_in_node_order(pgdat, node); 5035 else 5036 node_order[i++] = node; /* remember order */ 5037 } 5038 5039 if (order == ZONELIST_ORDER_ZONE) { 5040 /* calculate node order -- i.e., DMA last! */ 5041 build_zonelists_in_zone_order(pgdat, i); 5042 } 5043 5044 build_thisnode_zonelists(pgdat); 5045 } 5046 5047 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5048 /* 5049 * Return node id of node used for "local" allocations. 5050 * I.e., first node id of first zone in arg node's generic zonelist. 5051 * Used for initializing percpu 'numa_mem', which is used primarily 5052 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 5053 */ 5054 int local_memory_node(int node) 5055 { 5056 struct zoneref *z; 5057 5058 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 5059 gfp_zone(GFP_KERNEL), 5060 NULL); 5061 return z->zone->node; 5062 } 5063 #endif 5064 5065 static void setup_min_unmapped_ratio(void); 5066 static void setup_min_slab_ratio(void); 5067 #else /* CONFIG_NUMA */ 5068 5069 static void set_zonelist_order(void) 5070 { 5071 current_zonelist_order = ZONELIST_ORDER_ZONE; 5072 } 5073 5074 static void build_zonelists(pg_data_t *pgdat) 5075 { 5076 int node, local_node; 5077 enum zone_type j; 5078 struct zonelist *zonelist; 5079 5080 local_node = pgdat->node_id; 5081 5082 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; 5083 j = build_zonelists_node(pgdat, zonelist, 0); 5084 5085 /* 5086 * Now we build the zonelist so that it contains the zones 5087 * of all the other nodes. 
5088 * We don't want to pressure a particular node, so when 5089 * building the zones for node N, we make sure that the 5090 * zones coming right after the local ones are those from 5091 * node N+1 (modulo N) 5092 */ 5093 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 5094 if (!node_online(node)) 5095 continue; 5096 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5097 } 5098 for (node = 0; node < local_node; node++) { 5099 if (!node_online(node)) 5100 continue; 5101 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5102 } 5103 5104 zonelist->_zonerefs[j].zone = NULL; 5105 zonelist->_zonerefs[j].zone_idx = 0; 5106 } 5107 5108 #endif /* CONFIG_NUMA */ 5109 5110 /* 5111 * Boot pageset table. One per cpu which is going to be used for all 5112 * zones and all nodes. The parameters will be set in such a way 5113 * that an item put on a list will immediately be handed over to 5114 * the buddy list. This is safe since pageset manipulation is done 5115 * with interrupts disabled. 5116 * 5117 * The boot_pagesets must be kept even after bootup is complete for 5118 * unused processors and/or zones. They do play a role for bootstrapping 5119 * hotplugged processors. 5120 * 5121 * zoneinfo_show() and maybe other functions do 5122 * not check if the processor is online before following the pageset pointer. 5123 * Other parts of the kernel may not check if the zone is available. 5124 */ 5125 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 5126 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 5127 static void setup_zone_pageset(struct zone *zone); 5128 5129 /* 5130 * Global mutex to protect against size modification of zonelists 5131 * as well as to serialize pageset setup for the new populated zone. 5132 */ 5133 DEFINE_MUTEX(zonelists_mutex); 5134 5135 /* return values int ....just for stop_machine() */ 5136 static int __build_all_zonelists(void *data) 5137 { 5138 int nid; 5139 int cpu; 5140 pg_data_t *self = data; 5141 5142 #ifdef CONFIG_NUMA 5143 memset(node_load, 0, sizeof(node_load)); 5144 #endif 5145 5146 if (self && !node_online(self->node_id)) { 5147 build_zonelists(self); 5148 } 5149 5150 for_each_online_node(nid) { 5151 pg_data_t *pgdat = NODE_DATA(nid); 5152 5153 build_zonelists(pgdat); 5154 } 5155 5156 /* 5157 * Initialize the boot_pagesets that are going to be used 5158 * for bootstrapping processors. The real pagesets for 5159 * each zone will be allocated later when the per cpu 5160 * allocator is available. 5161 * 5162 * boot_pagesets are used also for bootstrapping offline 5163 * cpus if the system is already booted because the pagesets 5164 * are needed to initialize allocators on a specific cpu too. 5165 * F.e. the percpu allocator needs the page allocator which 5166 * needs the percpu allocator in order to allocate its pagesets 5167 * (a chicken-egg dilemma). 5168 */ 5169 for_each_possible_cpu(cpu) { 5170 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 5171 5172 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5173 /* 5174 * We now know the "local memory node" for each node-- 5175 * i.e., the node of the first zone in the generic zonelist. 5176 * Set up numa_mem percpu variable for on-line cpus. During 5177 * boot, only the boot cpu should be on-line; we'll init the 5178 * secondary cpus' numa_mem as they come on-line. During 5179 * node/memory hotplug, we'll fixup all on-line cpus. 
5180 */ 5181 if (cpu_online(cpu)) 5182 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 5183 #endif 5184 } 5185 5186 return 0; 5187 } 5188 5189 static noinline void __init 5190 build_all_zonelists_init(void) 5191 { 5192 __build_all_zonelists(NULL); 5193 mminit_verify_zonelist(); 5194 cpuset_init_current_mems_allowed(); 5195 } 5196 5197 /* 5198 * Called with zonelists_mutex held always 5199 * unless system_state == SYSTEM_BOOTING. 5200 * 5201 * __ref due to (1) call of __meminit annotated setup_zone_pageset 5202 * [we're only called with non-NULL zone through __meminit paths] and 5203 * (2) call of __init annotated helper build_all_zonelists_init 5204 * [protected by SYSTEM_BOOTING]. 5205 */ 5206 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 5207 { 5208 set_zonelist_order(); 5209 5210 if (system_state == SYSTEM_BOOTING) { 5211 build_all_zonelists_init(); 5212 } else { 5213 #ifdef CONFIG_MEMORY_HOTPLUG 5214 if (zone) 5215 setup_zone_pageset(zone); 5216 #endif 5217 /* we have to stop all cpus to guarantee there is no user 5218 of zonelist */ 5219 stop_machine(__build_all_zonelists, pgdat, NULL); 5220 /* cpuset refresh routine should be here */ 5221 } 5222 vm_total_pages = nr_free_pagecache_pages(); 5223 /* 5224 * Disable grouping by mobility if the number of pages in the 5225 * system is too low to allow the mechanism to work. It would be 5226 * more accurate, but expensive to check per-zone. This check is 5227 * made on memory-hotadd so a system can start with mobility 5228 * disabled and enable it later 5229 */ 5230 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 5231 page_group_by_mobility_disabled = 1; 5232 else 5233 page_group_by_mobility_disabled = 0; 5234 5235 pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", 5236 nr_online_nodes, 5237 zonelist_order_name[current_zonelist_order], 5238 page_group_by_mobility_disabled ? "off" : "on", 5239 vm_total_pages); 5240 #ifdef CONFIG_NUMA 5241 pr_info("Policy zone: %s\n", zone_names[policy_zone]); 5242 #endif 5243 } 5244 5245 /* 5246 * Initially all pages are reserved - free ones are freed 5247 * up by free_all_bootmem() once the early boot process is 5248 * done. Non-atomic initialization, single-pass. 5249 */ 5250 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 5251 unsigned long start_pfn, enum memmap_context context) 5252 { 5253 struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); 5254 unsigned long end_pfn = start_pfn + size; 5255 pg_data_t *pgdat = NODE_DATA(nid); 5256 unsigned long pfn; 5257 unsigned long nr_initialised = 0; 5258 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5259 struct memblock_region *r = NULL, *tmp; 5260 #endif 5261 5262 if (highest_memmap_pfn < end_pfn - 1) 5263 highest_memmap_pfn = end_pfn - 1; 5264 5265 /* 5266 * Honor reservation requested by the driver for this ZONE_DEVICE 5267 * memory 5268 */ 5269 if (altmap && start_pfn == altmap->base_pfn) 5270 start_pfn += altmap->reserve; 5271 5272 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 5273 /* 5274 * There can be holes in boot-time mem_map[]s handed to this 5275 * function. They do not exist on hotplugged memory. 5276 */ 5277 if (context != MEMMAP_EARLY) 5278 goto not_early; 5279 5280 if (!early_pfn_valid(pfn)) { 5281 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5282 /* 5283 * Skip to the pfn preceding the next valid one (or 5284 * end_pfn), such that we hit a valid pfn (or end_pfn) 5285 * on our next iteration of the loop. 
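 * For example, if memblock_next_valid_pfn() returns 0x2000, pfn is set
 * to 0x1fff here so that the enclosing loop's pfn++ lands exactly on
 * 0x2000.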
5286 */ 5287 pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; 5288 #endif 5289 continue; 5290 } 5291 if (!early_pfn_in_nid(pfn, nid)) 5292 continue; 5293 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) 5294 break; 5295 5296 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5297 /* 5298 * Check given memblock attribute by firmware which can affect 5299 * kernel memory layout. If zone==ZONE_MOVABLE but memory is 5300 * mirrored, it's an overlapped memmap init. skip it. 5301 */ 5302 if (mirrored_kernelcore && zone == ZONE_MOVABLE) { 5303 if (!r || pfn >= memblock_region_memory_end_pfn(r)) { 5304 for_each_memblock(memory, tmp) 5305 if (pfn < memblock_region_memory_end_pfn(tmp)) 5306 break; 5307 r = tmp; 5308 } 5309 if (pfn >= memblock_region_memory_base_pfn(r) && 5310 memblock_is_mirror(r)) { 5311 /* already initialized as NORMAL */ 5312 pfn = memblock_region_memory_end_pfn(r); 5313 continue; 5314 } 5315 } 5316 #endif 5317 5318 not_early: 5319 /* 5320 * Mark the block movable so that blocks are reserved for 5321 * movable at startup. This will force kernel allocations 5322 * to reserve their blocks rather than leaking throughout 5323 * the address space during boot when many long-lived 5324 * kernel allocations are made. 5325 * 5326 * bitmap is created for zone's valid pfn range. but memmap 5327 * can be created for invalid pages (for alignment) 5328 * check here not to call set_pageblock_migratetype() against 5329 * pfn out of zone. 5330 */ 5331 if (!(pfn & (pageblock_nr_pages - 1))) { 5332 struct page *page = pfn_to_page(pfn); 5333 5334 __init_single_page(page, pfn, zone, nid); 5335 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 5336 } else { 5337 __init_single_pfn(pfn, zone, nid); 5338 } 5339 } 5340 } 5341 5342 static void __meminit zone_init_free_lists(struct zone *zone) 5343 { 5344 unsigned int order, t; 5345 for_each_migratetype_order(order, t) { 5346 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 5347 zone->free_area[order].nr_free = 0; 5348 } 5349 } 5350 5351 #ifndef __HAVE_ARCH_MEMMAP_INIT 5352 #define memmap_init(size, nid, zone, start_pfn) \ 5353 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 5354 #endif 5355 5356 static int zone_batchsize(struct zone *zone) 5357 { 5358 #ifdef CONFIG_MMU 5359 int batch; 5360 5361 /* 5362 * The per-cpu-pages pools are set to around 1000th of the 5363 * size of the zone. But no more than 1/2 of a meg. 5364 * 5365 * OK, so we don't know how big the cache is. So guess. 5366 */ 5367 batch = zone->managed_pages / 1024; 5368 if (batch * PAGE_SIZE > 512 * 1024) 5369 batch = (512 * 1024) / PAGE_SIZE; 5370 batch /= 4; /* We effectively *= 4 below */ 5371 if (batch < 1) 5372 batch = 1; 5373 5374 /* 5375 * Clamp the batch to a 2^n - 1 value. Having a power 5376 * of 2 value was found to be more likely to have 5377 * suboptimal cache aliasing properties in some cases. 5378 * 5379 * For example if 2 tasks are alternately allocating 5380 * batches of pages, one task can end up with a lot 5381 * of pages of one half of the possible page colors 5382 * and the other with pages of the other colors. 5383 */ 5384 batch = rounddown_pow_of_two(batch + batch/2) - 1; 5385 5386 return batch; 5387 5388 #else 5389 /* The deferral and batching of frees should be suppressed under NOMMU 5390 * conditions. 5391 * 5392 * The problem is that NOMMU needs to be able to allocate large chunks 5393 * of contiguous memory as there's no hardware page translation to 5394 * assemble apparent contiguous memory from discontiguous pages. 
5395 * 5396 * Queueing large contiguous runs of pages for batching, however, 5397 * causes the pages to actually be freed in smaller chunks. As there 5398 * can be a significant delay between the individual batches being 5399 * recycled, this leads to the once large chunks of space being 5400 * fragmented and becoming unavailable for high-order allocations. 5401 */ 5402 return 0; 5403 #endif 5404 } 5405 5406 /* 5407 * pcp->high and pcp->batch values are related and dependent on one another: 5408 * ->batch must never be higher than ->high. 5409 * The following function updates them in a safe manner without read side 5410 * locking. 5411 * 5412 * Any new users of pcp->batch and pcp->high should ensure they can cope with 5413 * those fields changing asynchronously (according to the above rule). 5414 * 5415 * mutex_is_locked(&pcp_batch_high_lock) is required when calling this function 5416 * outside of boot time (or some other assurance that no concurrent updaters 5417 * exist). 5418 */ 5419 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 5420 unsigned long batch) 5421 { 5422 /* start with a fail-safe value for batch */ 5423 pcp->batch = 1; 5424 smp_wmb(); 5425 5426 /* Update high, then batch, in order */ 5427 pcp->high = high; 5428 smp_wmb(); 5429 5430 pcp->batch = batch; 5431 } 5432 5433 /* a companion to pageset_set_high() */ 5434 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 5435 { 5436 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 5437 } 5438 5439 static void pageset_init(struct per_cpu_pageset *p) 5440 { 5441 struct per_cpu_pages *pcp; 5442 int migratetype; 5443 5444 memset(p, 0, sizeof(*p)); 5445 5446 pcp = &p->pcp; 5447 pcp->count = 0; 5448 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 5449 INIT_LIST_HEAD(&pcp->lists[migratetype]); 5450 } 5451 5452 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 5453 { 5454 pageset_init(p); 5455 pageset_set_batch(p, batch); 5456 } 5457 5458 /* 5459 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 5460 * to the value high for the pageset p. 5461 */ 5462 static void pageset_set_high(struct per_cpu_pageset *p, 5463 unsigned long high) 5464 { 5465 unsigned long batch = max(1UL, high / 4); 5466 if ((high / 4) > (PAGE_SHIFT * 8)) 5467 batch = PAGE_SHIFT * 8; 5468 5469 pageset_update(&p->pcp, high, batch); 5470 } 5471 5472 static void pageset_set_high_and_batch(struct zone *zone, 5473 struct per_cpu_pageset *pcp) 5474 { 5475 if (percpu_pagelist_fraction) 5476 pageset_set_high(pcp, 5477 (zone->managed_pages / 5478 percpu_pagelist_fraction)); 5479 else 5480 pageset_set_batch(pcp, zone_batchsize(zone)); 5481 } 5482 5483 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 5484 { 5485 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 5486 5487 pageset_init(pcp); 5488 pageset_set_high_and_batch(zone, pcp); 5489 } 5490 5491 static void __meminit setup_zone_pageset(struct zone *zone) 5492 { 5493 int cpu; 5494 zone->pageset = alloc_percpu(struct per_cpu_pageset); 5495 for_each_possible_cpu(cpu) 5496 zone_pageset_init(zone, cpu); 5497 } 5498 5499 /* 5500 * Allocate per cpu pagesets and initialize them. 5501 * Before this call only boot pagesets were available.
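 *
 * As a worked example (illustrative numbers): a zone with ~1GiB of managed
 * memory (262144 4KiB pages) gets zone_batchsize() = 262144/1024 = 256,
 * clamped to 512KiB worth of pages (128), divided by 4 to 32 and rounded
 * down to 2^n - 1, i.e. batch = 31; pageset_set_batch() then gives each
 * per-cpu list high = 6 * 31 = 186 and batch = 31.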
5502 */ 5503 void __init setup_per_cpu_pageset(void) 5504 { 5505 struct pglist_data *pgdat; 5506 struct zone *zone; 5507 5508 for_each_populated_zone(zone) 5509 setup_zone_pageset(zone); 5510 5511 for_each_online_pgdat(pgdat) 5512 pgdat->per_cpu_nodestats = 5513 alloc_percpu(struct per_cpu_nodestat); 5514 } 5515 5516 static __meminit void zone_pcp_init(struct zone *zone) 5517 { 5518 /* 5519 * per cpu subsystem is not up at this point. The following code 5520 * relies on the ability of the linker to provide the 5521 * offset of a (static) per cpu variable into the per cpu area. 5522 */ 5523 zone->pageset = &boot_pageset; 5524 5525 if (populated_zone(zone)) 5526 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 5527 zone->name, zone->present_pages, 5528 zone_batchsize(zone)); 5529 } 5530 5531 int __meminit init_currently_empty_zone(struct zone *zone, 5532 unsigned long zone_start_pfn, 5533 unsigned long size) 5534 { 5535 struct pglist_data *pgdat = zone->zone_pgdat; 5536 5537 pgdat->nr_zones = zone_idx(zone) + 1; 5538 5539 zone->zone_start_pfn = zone_start_pfn; 5540 5541 mminit_dprintk(MMINIT_TRACE, "memmap_init", 5542 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 5543 pgdat->node_id, 5544 (unsigned long)zone_idx(zone), 5545 zone_start_pfn, (zone_start_pfn + size)); 5546 5547 zone_init_free_lists(zone); 5548 zone->initialized = 1; 5549 5550 return 0; 5551 } 5552 5553 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5554 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 5555 5556 /* 5557 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 5558 */ 5559 int __meminit __early_pfn_to_nid(unsigned long pfn, 5560 struct mminit_pfnnid_cache *state) 5561 { 5562 unsigned long start_pfn, end_pfn; 5563 int nid; 5564 5565 if (state->last_start <= pfn && pfn < state->last_end) 5566 return state->last_nid; 5567 5568 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 5569 if (nid != -1) { 5570 state->last_start = start_pfn; 5571 state->last_end = end_pfn; 5572 state->last_nid = nid; 5573 } 5574 5575 return nid; 5576 } 5577 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 5578 5579 /** 5580 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range 5581 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 5582 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid 5583 * 5584 * If an architecture guarantees that all ranges registered contain no holes 5585 * and may be freed, this this function may be used instead of calling 5586 * memblock_free_early_nid() manually. 5587 */ 5588 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 5589 { 5590 unsigned long start_pfn, end_pfn; 5591 int i, this_nid; 5592 5593 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 5594 start_pfn = min(start_pfn, max_low_pfn); 5595 end_pfn = min(end_pfn, max_low_pfn); 5596 5597 if (start_pfn < end_pfn) 5598 memblock_free_early_nid(PFN_PHYS(start_pfn), 5599 (end_pfn - start_pfn) << PAGE_SHIFT, 5600 this_nid); 5601 } 5602 } 5603 5604 /** 5605 * sparse_memory_present_with_active_regions - Call memory_present for each active range 5606 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 5607 * 5608 * If an architecture guarantees that all ranges registered contain no holes and may 5609 * be freed, this function may be used instead of calling memory_present() manually. 
5610 */ 5611 void __init sparse_memory_present_with_active_regions(int nid) 5612 { 5613 unsigned long start_pfn, end_pfn; 5614 int i, this_nid; 5615 5616 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 5617 memory_present(this_nid, start_pfn, end_pfn); 5618 } 5619 5620 /** 5621 * get_pfn_range_for_nid - Return the start and end page frames for a node 5622 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 5623 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 5624 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 5625 * 5626 * It returns the start and end page frame of a node based on information 5627 * provided by memblock_set_node(). If called for a node 5628 * with no available memory, a warning is printed and the start and end 5629 * PFNs will be 0. 5630 */ 5631 void __meminit get_pfn_range_for_nid(unsigned int nid, 5632 unsigned long *start_pfn, unsigned long *end_pfn) 5633 { 5634 unsigned long this_start_pfn, this_end_pfn; 5635 int i; 5636 5637 *start_pfn = -1UL; 5638 *end_pfn = 0; 5639 5640 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 5641 *start_pfn = min(*start_pfn, this_start_pfn); 5642 *end_pfn = max(*end_pfn, this_end_pfn); 5643 } 5644 5645 if (*start_pfn == -1UL) 5646 *start_pfn = 0; 5647 } 5648 5649 /* 5650 * This finds a zone that can be used for ZONE_MOVABLE pages. The 5651 * assumption is made that zones within a node are ordered in monotonic 5652 * increasing memory addresses so that the "highest" populated zone is used 5653 */ 5654 static void __init find_usable_zone_for_movable(void) 5655 { 5656 int zone_index; 5657 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 5658 if (zone_index == ZONE_MOVABLE) 5659 continue; 5660 5661 if (arch_zone_highest_possible_pfn[zone_index] > 5662 arch_zone_lowest_possible_pfn[zone_index]) 5663 break; 5664 } 5665 5666 VM_BUG_ON(zone_index == -1); 5667 movable_zone = zone_index; 5668 } 5669 5670 /* 5671 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 5672 * because it is sized independent of architecture. Unlike the other zones, 5673 * the starting point for ZONE_MOVABLE is not fixed. It may be different 5674 * in each node depending on the size of each node and how evenly kernelcore 5675 * is distributed. This helper function adjusts the zone ranges 5676 * provided by the architecture for a given node by using the end of the 5677 * highest usable zone for ZONE_MOVABLE. 
This preserves the assumption that 5678 * zones within a node are in order of monotonically increasing memory addresses 5679 */ 5680 static void __meminit adjust_zone_range_for_zone_movable(int nid, 5681 unsigned long zone_type, 5682 unsigned long node_start_pfn, 5683 unsigned long node_end_pfn, 5684 unsigned long *zone_start_pfn, 5685 unsigned long *zone_end_pfn) 5686 { 5687 /* Only adjust if ZONE_MOVABLE is on this node */ 5688 if (zone_movable_pfn[nid]) { 5689 /* Size ZONE_MOVABLE */ 5690 if (zone_type == ZONE_MOVABLE) { 5691 *zone_start_pfn = zone_movable_pfn[nid]; 5692 *zone_end_pfn = min(node_end_pfn, 5693 arch_zone_highest_possible_pfn[movable_zone]); 5694 5695 /* Adjust for ZONE_MOVABLE starting within this range */ 5696 } else if (!mirrored_kernelcore && 5697 *zone_start_pfn < zone_movable_pfn[nid] && 5698 *zone_end_pfn > zone_movable_pfn[nid]) { 5699 *zone_end_pfn = zone_movable_pfn[nid]; 5700 5701 /* Check if this whole range is within ZONE_MOVABLE */ 5702 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 5703 *zone_start_pfn = *zone_end_pfn; 5704 } 5705 } 5706 5707 /* 5708 * Return the number of pages a zone spans in a node, including holes 5709 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 5710 */ 5711 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 5712 unsigned long zone_type, 5713 unsigned long node_start_pfn, 5714 unsigned long node_end_pfn, 5715 unsigned long *zone_start_pfn, 5716 unsigned long *zone_end_pfn, 5717 unsigned long *ignored) 5718 { 5719 /* When hotadding a new node from cpu_up(), the node should be empty */ 5720 if (!node_start_pfn && !node_end_pfn) 5721 return 0; 5722 5723 /* Get the start and end of the zone */ 5724 *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 5725 *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 5726 adjust_zone_range_for_zone_movable(nid, zone_type, 5727 node_start_pfn, node_end_pfn, 5728 zone_start_pfn, zone_end_pfn); 5729 5730 /* Check that this node has pages within the zone's required range */ 5731 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) 5732 return 0; 5733 5734 /* Move the zone boundaries inside the node if necessary */ 5735 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); 5736 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); 5737 5738 /* Return the spanned pages */ 5739 return *zone_end_pfn - *zone_start_pfn; 5740 } 5741 5742 /* 5743 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 5744 * then all holes in the requested range will be accounted for. 5745 */ 5746 unsigned long __meminit __absent_pages_in_range(int nid, 5747 unsigned long range_start_pfn, 5748 unsigned long range_end_pfn) 5749 { 5750 unsigned long nr_absent = range_end_pfn - range_start_pfn; 5751 unsigned long start_pfn, end_pfn; 5752 int i; 5753 5754 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5755 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 5756 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 5757 nr_absent -= end_pfn - start_pfn; 5758 } 5759 return nr_absent; 5760 } 5761 5762 /** 5763 * absent_pages_in_range - Return number of page frames in holes within a range 5764 * @start_pfn: The start PFN to start searching for holes 5765 * @end_pfn: The end PFN to stop searching for holes 5766 * 5767 * It returns the number of page frames in memory holes within a range.
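 *
 * Illustrative example with a hypothetical layout: if the registered
 * memblock regions cover PFNs [0x0000, 0x8000) and [0xa000, 0x10000),
 * then absent_pages_in_range(0, 0x10000) returns 0x2000, the size of
 * the hole between the two regions.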
5768 */ 5769 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 5770 unsigned long end_pfn) 5771 { 5772 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 5773 } 5774 5775 /* Return the number of page frames in holes in a zone on a node */ 5776 static unsigned long __meminit zone_absent_pages_in_node(int nid, 5777 unsigned long zone_type, 5778 unsigned long node_start_pfn, 5779 unsigned long node_end_pfn, 5780 unsigned long *ignored) 5781 { 5782 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 5783 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 5784 unsigned long zone_start_pfn, zone_end_pfn; 5785 unsigned long nr_absent; 5786 5787 /* When hotadd a new node from cpu_up(), the node should be empty */ 5788 if (!node_start_pfn && !node_end_pfn) 5789 return 0; 5790 5791 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 5792 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 5793 5794 adjust_zone_range_for_zone_movable(nid, zone_type, 5795 node_start_pfn, node_end_pfn, 5796 &zone_start_pfn, &zone_end_pfn); 5797 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 5798 5799 /* 5800 * ZONE_MOVABLE handling. 5801 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages 5802 * and vice versa. 5803 */ 5804 if (mirrored_kernelcore && zone_movable_pfn[nid]) { 5805 unsigned long start_pfn, end_pfn; 5806 struct memblock_region *r; 5807 5808 for_each_memblock(memory, r) { 5809 start_pfn = clamp(memblock_region_memory_base_pfn(r), 5810 zone_start_pfn, zone_end_pfn); 5811 end_pfn = clamp(memblock_region_memory_end_pfn(r), 5812 zone_start_pfn, zone_end_pfn); 5813 5814 if (zone_type == ZONE_MOVABLE && 5815 memblock_is_mirror(r)) 5816 nr_absent += end_pfn - start_pfn; 5817 5818 if (zone_type == ZONE_NORMAL && 5819 !memblock_is_mirror(r)) 5820 nr_absent += end_pfn - start_pfn; 5821 } 5822 } 5823 5824 return nr_absent; 5825 } 5826 5827 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5828 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 5829 unsigned long zone_type, 5830 unsigned long node_start_pfn, 5831 unsigned long node_end_pfn, 5832 unsigned long *zone_start_pfn, 5833 unsigned long *zone_end_pfn, 5834 unsigned long *zones_size) 5835 { 5836 unsigned int zone; 5837 5838 *zone_start_pfn = node_start_pfn; 5839 for (zone = 0; zone < zone_type; zone++) 5840 *zone_start_pfn += zones_size[zone]; 5841 5842 *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; 5843 5844 return zones_size[zone_type]; 5845 } 5846 5847 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 5848 unsigned long zone_type, 5849 unsigned long node_start_pfn, 5850 unsigned long node_end_pfn, 5851 unsigned long *zholes_size) 5852 { 5853 if (!zholes_size) 5854 return 0; 5855 5856 return zholes_size[zone_type]; 5857 } 5858 5859 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5860 5861 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 5862 unsigned long node_start_pfn, 5863 unsigned long node_end_pfn, 5864 unsigned long *zones_size, 5865 unsigned long *zholes_size) 5866 { 5867 unsigned long realtotalpages = 0, totalpages = 0; 5868 enum zone_type i; 5869 5870 for (i = 0; i < MAX_NR_ZONES; i++) { 5871 struct zone *zone = pgdat->node_zones + i; 5872 unsigned long zone_start_pfn, zone_end_pfn; 5873 unsigned long size, real_size; 5874 5875 size = zone_spanned_pages_in_node(pgdat->node_id, i, 5876 node_start_pfn, 5877 node_end_pfn, 5878 &zone_start_pfn, 5879 &zone_end_pfn, 5880 
zones_size); 5881 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, 5882 node_start_pfn, node_end_pfn, 5883 zholes_size); 5884 if (size) 5885 zone->zone_start_pfn = zone_start_pfn; 5886 else 5887 zone->zone_start_pfn = 0; 5888 zone->spanned_pages = size; 5889 zone->present_pages = real_size; 5890 5891 totalpages += size; 5892 realtotalpages += real_size; 5893 } 5894 5895 pgdat->node_spanned_pages = totalpages; 5896 pgdat->node_present_pages = realtotalpages; 5897 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 5898 realtotalpages); 5899 } 5900 5901 #ifndef CONFIG_SPARSEMEM 5902 /* 5903 * Calculate the size of the zone->blockflags rounded to an unsigned long 5904 * Start by making sure zonesize is a multiple of pageblock_order by rounding 5905 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 5906 * round what is now in bits to nearest long in bits, then return it in 5907 * bytes. 5908 */ 5909 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 5910 { 5911 unsigned long usemapsize; 5912 5913 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 5914 usemapsize = roundup(zonesize, pageblock_nr_pages); 5915 usemapsize = usemapsize >> pageblock_order; 5916 usemapsize *= NR_PAGEBLOCK_BITS; 5917 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 5918 5919 return usemapsize / 8; 5920 } 5921 5922 static void __init setup_usemap(struct pglist_data *pgdat, 5923 struct zone *zone, 5924 unsigned long zone_start_pfn, 5925 unsigned long zonesize) 5926 { 5927 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 5928 zone->pageblock_flags = NULL; 5929 if (usemapsize) 5930 zone->pageblock_flags = 5931 memblock_virt_alloc_node_nopanic(usemapsize, 5932 pgdat->node_id); 5933 } 5934 #else 5935 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 5936 unsigned long zone_start_pfn, unsigned long zonesize) {} 5937 #endif /* CONFIG_SPARSEMEM */ 5938 5939 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 5940 5941 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 5942 void __paginginit set_pageblock_order(void) 5943 { 5944 unsigned int order; 5945 5946 /* Check that pageblock_nr_pages has not already been setup */ 5947 if (pageblock_order) 5948 return; 5949 5950 if (HPAGE_SHIFT > PAGE_SHIFT) 5951 order = HUGETLB_PAGE_ORDER; 5952 else 5953 order = MAX_ORDER - 1; 5954 5955 /* 5956 * Assume the largest contiguous order of interest is a huge page. 5957 * This value may be variable depending on boot parameters on IA64 and 5958 * powerpc. 5959 */ 5960 pageblock_order = order; 5961 } 5962 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 5963 5964 /* 5965 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 5966 * is unused as pageblock_order is set at compile-time. See 5967 * include/linux/pageblock-flags.h for the values of pageblock_order based on 5968 * the kernel config 5969 */ 5970 void __paginginit set_pageblock_order(void) 5971 { 5972 } 5973 5974 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 5975 5976 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 5977 unsigned long present_pages) 5978 { 5979 unsigned long pages = spanned_pages; 5980 5981 /* 5982 * Provide a more accurate estimation if there are holes within 5983 * the zone and SPARSEMEM is in use. 
If there are holes within the 5984 * zone, each populated memory region may cost us one or two extra 5985 * memmap pages due to alignment because memmap pages for each 5986 * populated regions may not be naturally aligned on page boundary. 5987 * So the (present_pages >> 4) heuristic is a tradeoff for that. 5988 */ 5989 if (spanned_pages > present_pages + (present_pages >> 4) && 5990 IS_ENABLED(CONFIG_SPARSEMEM)) 5991 pages = present_pages; 5992 5993 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 5994 } 5995 5996 /* 5997 * Set up the zone data structures: 5998 * - mark all pages reserved 5999 * - mark all memory queues empty 6000 * - clear the memory bitmaps 6001 * 6002 * NOTE: pgdat should get zeroed by caller. 6003 */ 6004 static void __paginginit free_area_init_core(struct pglist_data *pgdat) 6005 { 6006 enum zone_type j; 6007 int nid = pgdat->node_id; 6008 int ret; 6009 6010 pgdat_resize_init(pgdat); 6011 #ifdef CONFIG_NUMA_BALANCING 6012 spin_lock_init(&pgdat->numabalancing_migrate_lock); 6013 pgdat->numabalancing_migrate_nr_pages = 0; 6014 pgdat->numabalancing_migrate_next_window = jiffies; 6015 #endif 6016 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6017 spin_lock_init(&pgdat->split_queue_lock); 6018 INIT_LIST_HEAD(&pgdat->split_queue); 6019 pgdat->split_queue_len = 0; 6020 #endif 6021 init_waitqueue_head(&pgdat->kswapd_wait); 6022 init_waitqueue_head(&pgdat->pfmemalloc_wait); 6023 #ifdef CONFIG_COMPACTION 6024 init_waitqueue_head(&pgdat->kcompactd_wait); 6025 #endif 6026 pgdat_page_ext_init(pgdat); 6027 spin_lock_init(&pgdat->lru_lock); 6028 lruvec_init(node_lruvec(pgdat)); 6029 6030 for (j = 0; j < MAX_NR_ZONES; j++) { 6031 struct zone *zone = pgdat->node_zones + j; 6032 unsigned long size, realsize, freesize, memmap_pages; 6033 unsigned long zone_start_pfn = zone->zone_start_pfn; 6034 6035 size = zone->spanned_pages; 6036 realsize = freesize = zone->present_pages; 6037 6038 /* 6039 * Adjust freesize so that it accounts for how much memory 6040 * is used by this zone for memmap. This affects the watermark 6041 * and per-cpu initialisations 6042 */ 6043 memmap_pages = calc_memmap_size(size, realsize); 6044 if (!is_highmem_idx(j)) { 6045 if (freesize >= memmap_pages) { 6046 freesize -= memmap_pages; 6047 if (memmap_pages) 6048 printk(KERN_DEBUG 6049 " %s zone: %lu pages used for memmap\n", 6050 zone_names[j], memmap_pages); 6051 } else 6052 pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", 6053 zone_names[j], memmap_pages, freesize); 6054 } 6055 6056 /* Account for reserved pages */ 6057 if (j == 0 && freesize > dma_reserve) { 6058 freesize -= dma_reserve; 6059 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 6060 zone_names[0], dma_reserve); 6061 } 6062 6063 if (!is_highmem_idx(j)) 6064 nr_kernel_pages += freesize; 6065 /* Charge for highmem memmap if there are enough kernel pages */ 6066 else if (nr_kernel_pages > memmap_pages * 2) 6067 nr_kernel_pages -= memmap_pages; 6068 nr_all_pages += freesize; 6069 6070 /* 6071 * Set an approximate value for lowmem here, it will be adjusted 6072 * when the bootmem allocator frees pages into the buddy system. 6073 * And all highmem pages will be managed by the buddy system. 6074 */ 6075 zone->managed_pages = is_highmem_idx(j) ? 
realsize : freesize; 6076 #ifdef CONFIG_NUMA 6077 zone->node = nid; 6078 #endif 6079 zone->name = zone_names[j]; 6080 zone->zone_pgdat = pgdat; 6081 spin_lock_init(&zone->lock); 6082 zone_seqlock_init(zone); 6083 zone_pcp_init(zone); 6084 6085 if (!size) 6086 continue; 6087 6088 set_pageblock_order(); 6089 setup_usemap(pgdat, zone, zone_start_pfn, size); 6090 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 6091 BUG_ON(ret); 6092 memmap_init(size, nid, j, zone_start_pfn); 6093 } 6094 } 6095 6096 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) 6097 { 6098 unsigned long __maybe_unused start = 0; 6099 unsigned long __maybe_unused offset = 0; 6100 6101 /* Skip empty nodes */ 6102 if (!pgdat->node_spanned_pages) 6103 return; 6104 6105 #ifdef CONFIG_FLAT_NODE_MEM_MAP 6106 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 6107 offset = pgdat->node_start_pfn - start; 6108 /* ia64 gets its own node_mem_map, before this, without bootmem */ 6109 if (!pgdat->node_mem_map) { 6110 unsigned long size, end; 6111 struct page *map; 6112 6113 /* 6114 * The zone's endpoints aren't required to be MAX_ORDER 6115 * aligned but the node_mem_map endpoints must be in order 6116 * for the buddy allocator to function correctly. 6117 */ 6118 end = pgdat_end_pfn(pgdat); 6119 end = ALIGN(end, MAX_ORDER_NR_PAGES); 6120 size = (end - start) * sizeof(struct page); 6121 map = alloc_remap(pgdat->node_id, size); 6122 if (!map) 6123 map = memblock_virt_alloc_node_nopanic(size, 6124 pgdat->node_id); 6125 pgdat->node_mem_map = map + offset; 6126 } 6127 #ifndef CONFIG_NEED_MULTIPLE_NODES 6128 /* 6129 * With no DISCONTIG, the global mem_map is just set as node 0's 6130 */ 6131 if (pgdat == NODE_DATA(0)) { 6132 mem_map = NODE_DATA(0)->node_mem_map; 6133 #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) 6134 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 6135 mem_map -= offset; 6136 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6137 } 6138 #endif 6139 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 6140 } 6141 6142 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 6143 unsigned long node_start_pfn, unsigned long *zholes_size) 6144 { 6145 pg_data_t *pgdat = NODE_DATA(nid); 6146 unsigned long start_pfn = 0; 6147 unsigned long end_pfn = 0; 6148 6149 /* pg_data_t should be reset to zero when it's allocated */ 6150 WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); 6151 6152 pgdat->node_id = nid; 6153 pgdat->node_start_pfn = node_start_pfn; 6154 pgdat->per_cpu_nodestats = NULL; 6155 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6156 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 6157 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 6158 (u64)start_pfn << PAGE_SHIFT, 6159 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 6160 #else 6161 start_pfn = node_start_pfn; 6162 #endif 6163 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 6164 zones_size, zholes_size); 6165 6166 alloc_node_mem_map(pgdat); 6167 #ifdef CONFIG_FLAT_NODE_MEM_MAP 6168 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 6169 nid, (unsigned long)pgdat, 6170 (unsigned long)pgdat->node_mem_map); 6171 #endif 6172 6173 reset_deferred_meminit(pgdat); 6174 free_area_init_core(pgdat); 6175 } 6176 6177 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6178 6179 #if MAX_NUMNODES > 1 6180 /* 6181 * Figure out the number of possible node ids. 
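 *
 * For example (hypothetical possible map): if node_possible_map has bits
 * 0, 1 and 3 set, find_last_bit() below returns 3 and nr_node_ids becomes
 * 4 - the id space is sized by the highest possible node id, not by the
 * number of possible nodes.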
6182 */ 6183 void __init setup_nr_node_ids(void) 6184 { 6185 unsigned int highest; 6186 6187 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); 6188 nr_node_ids = highest + 1; 6189 } 6190 #endif 6191 6192 /** 6193 * node_map_pfn_alignment - determine the maximum internode alignment 6194 * 6195 * This function should be called after node map is populated and sorted. 6196 * It calculates the maximum power of two alignment which can distinguish 6197 * all the nodes. 6198 * 6199 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 6200 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 6201 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 6202 * shifted, 1GiB is enough and this function will indicate so. 6203 * 6204 * This is used to test whether pfn -> nid mapping of the chosen memory 6205 * model has fine enough granularity to avoid incorrect mapping for the 6206 * populated node map. 6207 * 6208 * Returns the determined alignment in pfn's. 0 if there is no alignment 6209 * requirement (single node). 6210 */ 6211 unsigned long __init node_map_pfn_alignment(void) 6212 { 6213 unsigned long accl_mask = 0, last_end = 0; 6214 unsigned long start, end, mask; 6215 int last_nid = -1; 6216 int i, nid; 6217 6218 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 6219 if (!start || last_nid < 0 || last_nid == nid) { 6220 last_nid = nid; 6221 last_end = end; 6222 continue; 6223 } 6224 6225 /* 6226 * Start with a mask granular enough to pin-point to the 6227 * start pfn and tick off bits one-by-one until it becomes 6228 * too coarse to separate the current node from the last. 6229 */ 6230 mask = ~((1 << __ffs(start)) - 1); 6231 while (mask && last_end <= (start & (mask << 1))) 6232 mask <<= 1; 6233 6234 /* accumulate all internode masks */ 6235 accl_mask |= mask; 6236 } 6237 6238 /* convert mask to number of pages */ 6239 return ~accl_mask + 1; 6240 } 6241 6242 /* Find the lowest pfn for a node */ 6243 static unsigned long __init find_min_pfn_for_node(int nid) 6244 { 6245 unsigned long min_pfn = ULONG_MAX; 6246 unsigned long start_pfn; 6247 int i; 6248 6249 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 6250 min_pfn = min(min_pfn, start_pfn); 6251 6252 if (min_pfn == ULONG_MAX) { 6253 pr_warn("Could not find start_pfn for node %d\n", nid); 6254 return 0; 6255 } 6256 6257 return min_pfn; 6258 } 6259 6260 /** 6261 * find_min_pfn_with_active_regions - Find the minimum PFN registered 6262 * 6263 * It returns the minimum PFN based on information provided via 6264 * memblock_set_node(). 6265 */ 6266 unsigned long __init find_min_pfn_with_active_regions(void) 6267 { 6268 return find_min_pfn_for_node(MAX_NUMNODES); 6269 } 6270 6271 /* 6272 * early_calculate_totalpages() 6273 * Sum pages in active regions for movable zone. 6274 * Populate N_MEMORY for calculating usable_nodes. 6275 */ 6276 static unsigned long __init early_calculate_totalpages(void) 6277 { 6278 unsigned long totalpages = 0; 6279 unsigned long start_pfn, end_pfn; 6280 int i, nid; 6281 6282 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 6283 unsigned long pages = end_pfn - start_pfn; 6284 6285 totalpages += pages; 6286 if (pages) 6287 node_set_state(nid, N_MEMORY); 6288 } 6289 return totalpages; 6290 } 6291 6292 /* 6293 * Find the PFN the Movable zone begins in each node. Kernel memory 6294 * is spread evenly between nodes as long as the nodes have enough 6295 * memory. 
When they don't, some nodes will have more kernelcore than 6296 * others 6297 */ 6298 static void __init find_zone_movable_pfns_for_nodes(void) 6299 { 6300 int i, nid; 6301 unsigned long usable_startpfn; 6302 unsigned long kernelcore_node, kernelcore_remaining; 6303 /* save the state before borrow the nodemask */ 6304 nodemask_t saved_node_state = node_states[N_MEMORY]; 6305 unsigned long totalpages = early_calculate_totalpages(); 6306 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 6307 struct memblock_region *r; 6308 6309 /* Need to find movable_zone earlier when movable_node is specified. */ 6310 find_usable_zone_for_movable(); 6311 6312 /* 6313 * If movable_node is specified, ignore kernelcore and movablecore 6314 * options. 6315 */ 6316 if (movable_node_is_enabled()) { 6317 for_each_memblock(memory, r) { 6318 if (!memblock_is_hotpluggable(r)) 6319 continue; 6320 6321 nid = r->nid; 6322 6323 usable_startpfn = PFN_DOWN(r->base); 6324 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 6325 min(usable_startpfn, zone_movable_pfn[nid]) : 6326 usable_startpfn; 6327 } 6328 6329 goto out2; 6330 } 6331 6332 /* 6333 * If kernelcore=mirror is specified, ignore movablecore option 6334 */ 6335 if (mirrored_kernelcore) { 6336 bool mem_below_4gb_not_mirrored = false; 6337 6338 for_each_memblock(memory, r) { 6339 if (memblock_is_mirror(r)) 6340 continue; 6341 6342 nid = r->nid; 6343 6344 usable_startpfn = memblock_region_memory_base_pfn(r); 6345 6346 if (usable_startpfn < 0x100000) { 6347 mem_below_4gb_not_mirrored = true; 6348 continue; 6349 } 6350 6351 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 6352 min(usable_startpfn, zone_movable_pfn[nid]) : 6353 usable_startpfn; 6354 } 6355 6356 if (mem_below_4gb_not_mirrored) 6357 pr_warn("This configuration results in unmirrored kernel memory."); 6358 6359 goto out2; 6360 } 6361 6362 /* 6363 * If movablecore=nn[KMG] was specified, calculate what size of 6364 * kernelcore that corresponds so that memory usable for 6365 * any allocation type is evenly spread. If both kernelcore 6366 * and movablecore are specified, then the value of kernelcore 6367 * will be used for required_kernelcore if it's greater than 6368 * what movablecore would have allowed. 6369 */ 6370 if (required_movablecore) { 6371 unsigned long corepages; 6372 6373 /* 6374 * Round-up so that ZONE_MOVABLE is at least as large as what 6375 * was requested by the user 6376 */ 6377 required_movablecore = 6378 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 6379 required_movablecore = min(totalpages, required_movablecore); 6380 corepages = totalpages - required_movablecore; 6381 6382 required_kernelcore = max(required_kernelcore, corepages); 6383 } 6384 6385 /* 6386 * If kernelcore was not specified or kernelcore size is larger 6387 * than totalpages, there is no ZONE_MOVABLE. 
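 *
 * Illustrative example with made-up sizes: with totalpages worth 4G and
 * only movablecore=1G given on the command line, required_movablecore is
 * rounded up to MAX_ORDER_NR_PAGES and required_kernelcore becomes the
 * remaining ~3G, which is then spread across the nodes below.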
6388 */ 6389 if (!required_kernelcore || required_kernelcore >= totalpages) 6390 goto out; 6391 6392 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 6393 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 6394 6395 restart: 6396 /* Spread kernelcore memory as evenly as possible throughout nodes */ 6397 kernelcore_node = required_kernelcore / usable_nodes; 6398 for_each_node_state(nid, N_MEMORY) { 6399 unsigned long start_pfn, end_pfn; 6400 6401 /* 6402 * Recalculate kernelcore_node if the division per node 6403 * now exceeds what is necessary to satisfy the requested 6404 * amount of memory for the kernel 6405 */ 6406 if (required_kernelcore < kernelcore_node) 6407 kernelcore_node = required_kernelcore / usable_nodes; 6408 6409 /* 6410 * As the map is walked, we track how much memory is usable 6411 * by the kernel using kernelcore_remaining. When it is 6412 * 0, the rest of the node is usable by ZONE_MOVABLE 6413 */ 6414 kernelcore_remaining = kernelcore_node; 6415 6416 /* Go through each range of PFNs within this node */ 6417 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 6418 unsigned long size_pages; 6419 6420 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 6421 if (start_pfn >= end_pfn) 6422 continue; 6423 6424 /* Account for what is only usable for kernelcore */ 6425 if (start_pfn < usable_startpfn) { 6426 unsigned long kernel_pages; 6427 kernel_pages = min(end_pfn, usable_startpfn) 6428 - start_pfn; 6429 6430 kernelcore_remaining -= min(kernel_pages, 6431 kernelcore_remaining); 6432 required_kernelcore -= min(kernel_pages, 6433 required_kernelcore); 6434 6435 /* Continue if range is now fully accounted */ 6436 if (end_pfn <= usable_startpfn) { 6437 6438 /* 6439 * Push zone_movable_pfn to the end so 6440 * that if we have to rebalance 6441 * kernelcore across nodes, we will 6442 * not double account here 6443 */ 6444 zone_movable_pfn[nid] = end_pfn; 6445 continue; 6446 } 6447 start_pfn = usable_startpfn; 6448 } 6449 6450 /* 6451 * The usable PFN range for ZONE_MOVABLE is from 6452 * start_pfn->end_pfn. Calculate size_pages as the 6453 * number of pages used as kernelcore 6454 */ 6455 size_pages = end_pfn - start_pfn; 6456 if (size_pages > kernelcore_remaining) 6457 size_pages = kernelcore_remaining; 6458 zone_movable_pfn[nid] = start_pfn + size_pages; 6459 6460 /* 6461 * Some kernelcore has been met, update counts and 6462 * break if the kernelcore for this node has been 6463 * satisfied 6464 */ 6465 required_kernelcore -= min(required_kernelcore, 6466 size_pages); 6467 kernelcore_remaining -= size_pages; 6468 if (!kernelcore_remaining) 6469 break; 6470 } 6471 } 6472 6473 /* 6474 * If there is still required_kernelcore, we do another pass with one 6475 * less node in the count. This will push zone_movable_pfn[nid] further 6476 * along on the nodes that still have memory until kernelcore is 6477 * satisfied 6478 */ 6479 usable_nodes--; 6480 if (usable_nodes && required_kernelcore > usable_nodes) 6481 goto restart; 6482 6483 out2: 6484 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 6485 for (nid = 0; nid < MAX_NUMNODES; nid++) 6486 zone_movable_pfn[nid] = 6487 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 6488 6489 out: 6490 /* restore the node_state */ 6491 node_states[N_MEMORY] = saved_node_state; 6492 } 6493 6494 /* Any regular or high memory on that node ? 
*/ 6495 static void check_for_memory(pg_data_t *pgdat, int nid) 6496 { 6497 enum zone_type zone_type; 6498 6499 if (N_MEMORY == N_NORMAL_MEMORY) 6500 return; 6501 6502 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 6503 struct zone *zone = &pgdat->node_zones[zone_type]; 6504 if (populated_zone(zone)) { 6505 node_set_state(nid, N_HIGH_MEMORY); 6506 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 6507 zone_type <= ZONE_NORMAL) 6508 node_set_state(nid, N_NORMAL_MEMORY); 6509 break; 6510 } 6511 } 6512 } 6513 6514 /** 6515 * free_area_init_nodes - Initialise all pg_data_t and zone data 6516 * @max_zone_pfn: an array of max PFNs for each zone 6517 * 6518 * This will call free_area_init_node() for each active node in the system. 6519 * Using the page ranges provided by memblock_set_node(), the size of each 6520 * zone in each node and their holes is calculated. If the maximum PFN 6521 * between two adjacent zones match, it is assumed that the zone is empty. 6522 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 6523 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 6524 * starts where the previous one ended. For example, ZONE_DMA32 starts 6525 * at arch_max_dma_pfn. 6526 */ 6527 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 6528 { 6529 unsigned long start_pfn, end_pfn; 6530 int i, nid; 6531 6532 /* Record where the zone boundaries are */ 6533 memset(arch_zone_lowest_possible_pfn, 0, 6534 sizeof(arch_zone_lowest_possible_pfn)); 6535 memset(arch_zone_highest_possible_pfn, 0, 6536 sizeof(arch_zone_highest_possible_pfn)); 6537 6538 start_pfn = find_min_pfn_with_active_regions(); 6539 6540 for (i = 0; i < MAX_NR_ZONES; i++) { 6541 if (i == ZONE_MOVABLE) 6542 continue; 6543 6544 end_pfn = max(max_zone_pfn[i], start_pfn); 6545 arch_zone_lowest_possible_pfn[i] = start_pfn; 6546 arch_zone_highest_possible_pfn[i] = end_pfn; 6547 6548 start_pfn = end_pfn; 6549 } 6550 6551 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 6552 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 6553 find_zone_movable_pfns_for_nodes(); 6554 6555 /* Print out the zone ranges */ 6556 pr_info("Zone ranges:\n"); 6557 for (i = 0; i < MAX_NR_ZONES; i++) { 6558 if (i == ZONE_MOVABLE) 6559 continue; 6560 pr_info(" %-8s ", zone_names[i]); 6561 if (arch_zone_lowest_possible_pfn[i] == 6562 arch_zone_highest_possible_pfn[i]) 6563 pr_cont("empty\n"); 6564 else 6565 pr_cont("[mem %#018Lx-%#018Lx]\n", 6566 (u64)arch_zone_lowest_possible_pfn[i] 6567 << PAGE_SHIFT, 6568 ((u64)arch_zone_highest_possible_pfn[i] 6569 << PAGE_SHIFT) - 1); 6570 } 6571 6572 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 6573 pr_info("Movable zone start for each node\n"); 6574 for (i = 0; i < MAX_NUMNODES; i++) { 6575 if (zone_movable_pfn[i]) 6576 pr_info(" Node %d: %#018Lx\n", i, 6577 (u64)zone_movable_pfn[i] << PAGE_SHIFT); 6578 } 6579 6580 /* Print out the early node map */ 6581 pr_info("Early memory node ranges\n"); 6582 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 6583 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, 6584 (u64)start_pfn << PAGE_SHIFT, 6585 ((u64)end_pfn << PAGE_SHIFT) - 1); 6586 6587 /* Initialise every node */ 6588 mminit_verify_pageflags_layout(); 6589 setup_nr_node_ids(); 6590 for_each_online_node(nid) { 6591 pg_data_t *pgdat = NODE_DATA(nid); 6592 free_area_init_node(nid, NULL, 6593 find_min_pfn_for_node(nid), NULL); 6594 6595 /* Any memory on that node */ 6596 if (pgdat->node_present_pages) 6597 node_set_state(nid, 
N_MEMORY); 6598 check_for_memory(pgdat, nid); 6599 } 6600 } 6601 6602 static int __init cmdline_parse_core(char *p, unsigned long *core) 6603 { 6604 unsigned long long coremem; 6605 if (!p) 6606 return -EINVAL; 6607 6608 coremem = memparse(p, &p); 6609 *core = coremem >> PAGE_SHIFT; 6610 6611 /* Paranoid check that UL is enough for the coremem value */ 6612 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 6613 6614 return 0; 6615 } 6616 6617 /* 6618 * kernelcore=size sets the amount of memory for use for allocations that 6619 * cannot be reclaimed or migrated. 6620 */ 6621 static int __init cmdline_parse_kernelcore(char *p) 6622 { 6623 /* parse kernelcore=mirror */ 6624 if (parse_option_str(p, "mirror")) { 6625 mirrored_kernelcore = true; 6626 return 0; 6627 } 6628 6629 return cmdline_parse_core(p, &required_kernelcore); 6630 } 6631 6632 /* 6633 * movablecore=size sets the amount of memory for use for allocations that 6634 * can be reclaimed or migrated. 6635 */ 6636 static int __init cmdline_parse_movablecore(char *p) 6637 { 6638 return cmdline_parse_core(p, &required_movablecore); 6639 } 6640 6641 early_param("kernelcore", cmdline_parse_kernelcore); 6642 early_param("movablecore", cmdline_parse_movablecore); 6643 6644 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6645 6646 void adjust_managed_page_count(struct page *page, long count) 6647 { 6648 spin_lock(&managed_page_count_lock); 6649 page_zone(page)->managed_pages += count; 6650 totalram_pages += count; 6651 #ifdef CONFIG_HIGHMEM 6652 if (PageHighMem(page)) 6653 totalhigh_pages += count; 6654 #endif 6655 spin_unlock(&managed_page_count_lock); 6656 } 6657 EXPORT_SYMBOL(adjust_managed_page_count); 6658 6659 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 6660 { 6661 void *pos; 6662 unsigned long pages = 0; 6663 6664 start = (void *)PAGE_ALIGN((unsigned long)start); 6665 end = (void *)((unsigned long)end & PAGE_MASK); 6666 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 6667 if ((unsigned int)poison <= 0xFF) 6668 memset(pos, poison, PAGE_SIZE); 6669 free_reserved_page(virt_to_page(pos)); 6670 } 6671 6672 if (pages && s) 6673 pr_info("Freeing %s memory: %ldK\n", 6674 s, pages << (PAGE_SHIFT - 10)); 6675 6676 return pages; 6677 } 6678 EXPORT_SYMBOL(free_reserved_area); 6679 6680 #ifdef CONFIG_HIGHMEM 6681 void free_highmem_page(struct page *page) 6682 { 6683 __free_reserved_page(page); 6684 totalram_pages++; 6685 page_zone(page)->managed_pages++; 6686 totalhigh_pages++; 6687 } 6688 #endif 6689 6690 6691 void __init mem_init_print_info(const char *str) 6692 { 6693 unsigned long physpages, codesize, datasize, rosize, bss_size; 6694 unsigned long init_code_size, init_data_size; 6695 6696 physpages = get_num_physpages(); 6697 codesize = _etext - _stext; 6698 datasize = _edata - _sdata; 6699 rosize = __end_rodata - __start_rodata; 6700 bss_size = __bss_stop - __bss_start; 6701 init_data_size = __init_end - __init_begin; 6702 init_code_size = _einittext - _sinittext; 6703 6704 /* 6705 * Detect special cases and adjust section sizes accordingly: 6706 * 1) .init.* may be embedded into .data sections 6707 * 2) .init.text.* may be out of [__init_begin, __init_end], 6708 * please refer to arch/tile/kernel/vmlinux.lds.S. 6709 * 3) .rodata.* may be embedded into .text or .data sections. 
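 *
 * The adj_init_size() macro below accounts for these cases by subtracting
 * the overlapping piece only when its start symbol really falls inside the
 * enclosing section, e.g. when _sinittext lies within [_stext, _etext] the
 * init code size is deducted from codesize so it is not reported twice.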
6710 */ 6711 #define adj_init_size(start, end, size, pos, adj) \ 6712 do { \ 6713 if (start <= pos && pos < end && size > adj) \ 6714 size -= adj; \ 6715 } while (0) 6716 6717 adj_init_size(__init_begin, __init_end, init_data_size, 6718 _sinittext, init_code_size); 6719 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 6720 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 6721 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 6722 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 6723 6724 #undef adj_init_size 6725 6726 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" 6727 #ifdef CONFIG_HIGHMEM 6728 ", %luK highmem" 6729 #endif 6730 "%s%s)\n", 6731 nr_free_pages() << (PAGE_SHIFT - 10), 6732 physpages << (PAGE_SHIFT - 10), 6733 codesize >> 10, datasize >> 10, rosize >> 10, 6734 (init_data_size + init_code_size) >> 10, bss_size >> 10, 6735 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), 6736 totalcma_pages << (PAGE_SHIFT - 10), 6737 #ifdef CONFIG_HIGHMEM 6738 totalhigh_pages << (PAGE_SHIFT - 10), 6739 #endif 6740 str ? ", " : "", str ? str : ""); 6741 } 6742 6743 /** 6744 * set_dma_reserve - set the specified number of pages reserved in the first zone 6745 * @new_dma_reserve: The number of pages to mark reserved 6746 * 6747 * The per-cpu batchsize and zone watermarks are determined by managed_pages. 6748 * In the DMA zone, a significant percentage may be consumed by kernel image 6749 * and other unfreeable allocations which can skew the watermarks badly. This 6750 * function may optionally be used to account for unfreeable pages in the 6751 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 6752 * smaller per-cpu batchsize. 6753 */ 6754 void __init set_dma_reserve(unsigned long new_dma_reserve) 6755 { 6756 dma_reserve = new_dma_reserve; 6757 } 6758 6759 void __init free_area_init(unsigned long *zones_size) 6760 { 6761 free_area_init_node(0, zones_size, 6762 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 6763 } 6764 6765 static int page_alloc_cpu_dead(unsigned int cpu) 6766 { 6767 6768 lru_add_drain_cpu(cpu); 6769 drain_pages(cpu); 6770 6771 /* 6772 * Spill the event counters of the dead processor 6773 * into the current processors event counters. 6774 * This artificially elevates the count of the current 6775 * processor. 6776 */ 6777 vm_events_fold_cpu(cpu); 6778 6779 /* 6780 * Zero the differential counters of the dead processor 6781 * so that the vm statistics are consistent. 6782 * 6783 * This is only okay since the processor is dead and cannot 6784 * race with what we are doing. 6785 */ 6786 cpu_vm_stats_fold(cpu); 6787 return 0; 6788 } 6789 6790 void __init page_alloc_init(void) 6791 { 6792 int ret; 6793 6794 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, 6795 "mm/page_alloc:dead", NULL, 6796 page_alloc_cpu_dead); 6797 WARN_ON(ret < 0); 6798 } 6799 6800 /* 6801 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 6802 * or min_free_kbytes changes. 
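 *
 * Illustrative example with made-up numbers: a zone whose largest
 * lowmem_reserve[] entry is 32 pages and whose high watermark is 1000
 * pages contributes 1032 pages to the node's totalreserve_pages
 * (clamped to the zone's managed_pages).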
6803 */ 6804 static void calculate_totalreserve_pages(void) 6805 { 6806 struct pglist_data *pgdat; 6807 unsigned long reserve_pages = 0; 6808 enum zone_type i, j; 6809 6810 for_each_online_pgdat(pgdat) { 6811 6812 pgdat->totalreserve_pages = 0; 6813 6814 for (i = 0; i < MAX_NR_ZONES; i++) { 6815 struct zone *zone = pgdat->node_zones + i; 6816 long max = 0; 6817 6818 /* Find valid and maximum lowmem_reserve in the zone */ 6819 for (j = i; j < MAX_NR_ZONES; j++) { 6820 if (zone->lowmem_reserve[j] > max) 6821 max = zone->lowmem_reserve[j]; 6822 } 6823 6824 /* we treat the high watermark as reserved pages. */ 6825 max += high_wmark_pages(zone); 6826 6827 if (max > zone->managed_pages) 6828 max = zone->managed_pages; 6829 6830 pgdat->totalreserve_pages += max; 6831 6832 reserve_pages += max; 6833 } 6834 } 6835 totalreserve_pages = reserve_pages; 6836 } 6837 6838 /* 6839 * setup_per_zone_lowmem_reserve - called whenever 6840 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 6841 * has a correct pages reserved value, so an adequate number of 6842 * pages are left in the zone after a successful __alloc_pages(). 6843 */ 6844 static void setup_per_zone_lowmem_reserve(void) 6845 { 6846 struct pglist_data *pgdat; 6847 enum zone_type j, idx; 6848 6849 for_each_online_pgdat(pgdat) { 6850 for (j = 0; j < MAX_NR_ZONES; j++) { 6851 struct zone *zone = pgdat->node_zones + j; 6852 unsigned long managed_pages = zone->managed_pages; 6853 6854 zone->lowmem_reserve[j] = 0; 6855 6856 idx = j; 6857 while (idx) { 6858 struct zone *lower_zone; 6859 6860 idx--; 6861 6862 if (sysctl_lowmem_reserve_ratio[idx] < 1) 6863 sysctl_lowmem_reserve_ratio[idx] = 1; 6864 6865 lower_zone = pgdat->node_zones + idx; 6866 lower_zone->lowmem_reserve[j] = managed_pages / 6867 sysctl_lowmem_reserve_ratio[idx]; 6868 managed_pages += lower_zone->managed_pages; 6869 } 6870 } 6871 } 6872 6873 /* update totalreserve_pages */ 6874 calculate_totalreserve_pages(); 6875 } 6876 6877 static void __setup_per_zone_wmarks(void) 6878 { 6879 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 6880 unsigned long lowmem_pages = 0; 6881 struct zone *zone; 6882 unsigned long flags; 6883 6884 /* Calculate total number of !ZONE_HIGHMEM pages */ 6885 for_each_zone(zone) { 6886 if (!is_highmem(zone)) 6887 lowmem_pages += zone->managed_pages; 6888 } 6889 6890 for_each_zone(zone) { 6891 u64 tmp; 6892 6893 spin_lock_irqsave(&zone->lock, flags); 6894 tmp = (u64)pages_min * zone->managed_pages; 6895 do_div(tmp, lowmem_pages); 6896 if (is_highmem(zone)) { 6897 /* 6898 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 6899 * need highmem pages, so cap pages_min to a small 6900 * value here. 6901 * 6902 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 6903 * deltas control asynch page reclaim, and so should 6904 * not be capped for highmem. 6905 */ 6906 unsigned long min_pages; 6907 6908 min_pages = zone->managed_pages / 1024; 6909 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 6910 zone->watermark[WMARK_MIN] = min_pages; 6911 } else { 6912 /* 6913 * If it's a lowmem zone, reserve a number of pages 6914 * proportionate to the zone's size. 6915 */ 6916 zone->watermark[WMARK_MIN] = tmp; 6917 } 6918 6919 /* 6920 * Set the kswapd watermarks distance according to the 6921 * scale factor in proportion to available memory, but 6922 * ensure a minimum size on small systems. 
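 *
 * Illustrative example: with the default watermark_scale_factor of 10
 * (i.e. 0.1% of the zone), a lowmem zone where managed_pages / 1000
 * exceeds a quarter of its min watermark ends up with WMARK_LOW =
 * min watermark + managed_pages / 1000 and WMARK_HIGH = min watermark +
 * 2 * that amount.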
6923 */ 6924 tmp = max_t(u64, tmp >> 2, 6925 mult_frac(zone->managed_pages, 6926 watermark_scale_factor, 10000)); 6927 6928 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; 6929 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; 6930 6931 spin_unlock_irqrestore(&zone->lock, flags); 6932 } 6933 6934 /* update totalreserve_pages */ 6935 calculate_totalreserve_pages(); 6936 } 6937 6938 /** 6939 * setup_per_zone_wmarks - called when min_free_kbytes changes 6940 * or when memory is hot-{added|removed} 6941 * 6942 * Ensures that the watermark[min,low,high] values for each zone are set 6943 * correctly with respect to min_free_kbytes. 6944 */ 6945 void setup_per_zone_wmarks(void) 6946 { 6947 mutex_lock(&zonelists_mutex); 6948 __setup_per_zone_wmarks(); 6949 mutex_unlock(&zonelists_mutex); 6950 } 6951 6952 /* 6953 * Initialise min_free_kbytes. 6954 * 6955 * For small machines we want it small (128k min). For large machines 6956 * we want it large (64MB max). But it is not linear, because network 6957 * bandwidth does not increase linearly with machine size. We use 6958 * 6959 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 6960 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 6961 * 6962 * which yields 6963 * 6964 * 16MB: 512k 6965 * 32MB: 724k 6966 * 64MB: 1024k 6967 * 128MB: 1448k 6968 * 256MB: 2048k 6969 * 512MB: 2896k 6970 * 1024MB: 4096k 6971 * 2048MB: 5792k 6972 * 4096MB: 8192k 6973 * 8192MB: 11584k 6974 * 16384MB: 16384k 6975 */ 6976 int __meminit init_per_zone_wmark_min(void) 6977 { 6978 unsigned long lowmem_kbytes; 6979 int new_min_free_kbytes; 6980 6981 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 6982 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 6983 6984 if (new_min_free_kbytes > user_min_free_kbytes) { 6985 min_free_kbytes = new_min_free_kbytes; 6986 if (min_free_kbytes < 128) 6987 min_free_kbytes = 128; 6988 if (min_free_kbytes > 65536) 6989 min_free_kbytes = 65536; 6990 } else { 6991 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 6992 new_min_free_kbytes, user_min_free_kbytes); 6993 } 6994 setup_per_zone_wmarks(); 6995 refresh_zone_stat_thresholds(); 6996 setup_per_zone_lowmem_reserve(); 6997 6998 #ifdef CONFIG_NUMA 6999 setup_min_unmapped_ratio(); 7000 setup_min_slab_ratio(); 7001 #endif 7002 7003 return 0; 7004 } 7005 core_initcall(init_per_zone_wmark_min) 7006 7007 /* 7008 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 7009 * that we can call two helper functions whenever min_free_kbytes 7010 * changes. 
7011 */ 7012 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, 7013 void __user *buffer, size_t *length, loff_t *ppos) 7014 { 7015 int rc; 7016 7017 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7018 if (rc) 7019 return rc; 7020 7021 if (write) { 7022 user_min_free_kbytes = min_free_kbytes; 7023 setup_per_zone_wmarks(); 7024 } 7025 return 0; 7026 } 7027 7028 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, 7029 void __user *buffer, size_t *length, loff_t *ppos) 7030 { 7031 int rc; 7032 7033 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7034 if (rc) 7035 return rc; 7036 7037 if (write) 7038 setup_per_zone_wmarks(); 7039 7040 return 0; 7041 } 7042 7043 #ifdef CONFIG_NUMA 7044 static void setup_min_unmapped_ratio(void) 7045 { 7046 pg_data_t *pgdat; 7047 struct zone *zone; 7048 7049 for_each_online_pgdat(pgdat) 7050 pgdat->min_unmapped_pages = 0; 7051 7052 for_each_zone(zone) 7053 zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * 7054 sysctl_min_unmapped_ratio) / 100; 7055 } 7056 7057 7058 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, 7059 void __user *buffer, size_t *length, loff_t *ppos) 7060 { 7061 int rc; 7062 7063 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7064 if (rc) 7065 return rc; 7066 7067 setup_min_unmapped_ratio(); 7068 7069 return 0; 7070 } 7071 7072 static void setup_min_slab_ratio(void) 7073 { 7074 pg_data_t *pgdat; 7075 struct zone *zone; 7076 7077 for_each_online_pgdat(pgdat) 7078 pgdat->min_slab_pages = 0; 7079 7080 for_each_zone(zone) 7081 zone->zone_pgdat->min_slab_pages += (zone->managed_pages * 7082 sysctl_min_slab_ratio) / 100; 7083 } 7084 7085 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 7086 void __user *buffer, size_t *length, loff_t *ppos) 7087 { 7088 int rc; 7089 7090 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7091 if (rc) 7092 return rc; 7093 7094 setup_min_slab_ratio(); 7095 7096 return 0; 7097 } 7098 #endif 7099 7100 /* 7101 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 7102 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 7103 * whenever sysctl_lowmem_reserve_ratio changes. 7104 * 7105 * The reserve ratio obviously has absolutely no relation with the 7106 * minimum watermarks. The lowmem reserve ratio can only make sense 7107 * if in function of the boot time zone sizes. 7108 */ 7109 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, 7110 void __user *buffer, size_t *length, loff_t *ppos) 7111 { 7112 proc_dointvec_minmax(table, write, buffer, length, ppos); 7113 setup_per_zone_lowmem_reserve(); 7114 return 0; 7115 } 7116 7117 /* 7118 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 7119 * cpu. It is the fraction of total pages in each zone that a hot per cpu 7120 * pagelist can have before it gets flushed back to buddy allocator. 
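 *
 * Worked example with made-up numbers: writing 8 (the smallest non-zero
 * value the handler below accepts) for a zone with 1,000,000 managed
 * pages sets pcp->high to 125,000 pages per CPU, and pcp->batch then
 * follows from pageset_set_high(): high / 4, capped at PAGE_SHIFT * 8.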
7121 */ 7122 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, 7123 void __user *buffer, size_t *length, loff_t *ppos) 7124 { 7125 struct zone *zone; 7126 int old_percpu_pagelist_fraction; 7127 int ret; 7128 7129 mutex_lock(&pcp_batch_high_lock); 7130 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 7131 7132 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 7133 if (!write || ret < 0) 7134 goto out; 7135 7136 /* Sanity checking to avoid pcp imbalance */ 7137 if (percpu_pagelist_fraction && 7138 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 7139 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 7140 ret = -EINVAL; 7141 goto out; 7142 } 7143 7144 /* No change? */ 7145 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 7146 goto out; 7147 7148 for_each_populated_zone(zone) { 7149 unsigned int cpu; 7150 7151 for_each_possible_cpu(cpu) 7152 pageset_set_high_and_batch(zone, 7153 per_cpu_ptr(zone->pageset, cpu)); 7154 } 7155 out: 7156 mutex_unlock(&pcp_batch_high_lock); 7157 return ret; 7158 } 7159 7160 #ifdef CONFIG_NUMA 7161 int hashdist = HASHDIST_DEFAULT; 7162 7163 static int __init set_hashdist(char *str) 7164 { 7165 if (!str) 7166 return 0; 7167 hashdist = simple_strtoul(str, &str, 0); 7168 return 1; 7169 } 7170 __setup("hashdist=", set_hashdist); 7171 #endif 7172 7173 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES 7174 /* 7175 * Returns the number of pages that arch has reserved but 7176 * is not known to alloc_large_system_hash(). 7177 */ 7178 static unsigned long __init arch_reserved_kernel_pages(void) 7179 { 7180 return 0; 7181 } 7182 #endif 7183 7184 /* 7185 * allocate a large system hash table from bootmem 7186 * - it is assumed that the hash table must contain an exact power-of-2 7187 * quantity of entries 7188 * - limit is the number of hash buckets, not the total allocation size 7189 */ 7190 void *__init alloc_large_system_hash(const char *tablename, 7191 unsigned long bucketsize, 7192 unsigned long numentries, 7193 int scale, 7194 int flags, 7195 unsigned int *_hash_shift, 7196 unsigned int *_hash_mask, 7197 unsigned long low_limit, 7198 unsigned long high_limit) 7199 { 7200 unsigned long long max = high_limit; 7201 unsigned long log2qty, size; 7202 void *table = NULL; 7203 7204 /* allow the kernel cmdline to have a say */ 7205 if (!numentries) { 7206 /* round applicable memory size up to nearest megabyte */ 7207 numentries = nr_kernel_pages; 7208 numentries -= arch_reserved_kernel_pages(); 7209 7210 /* It isn't necessary when PAGE_SIZE >= 1MB */ 7211 if (PAGE_SHIFT < 20) 7212 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 7213 7214 /* limit to 1 bucket per 2^scale bytes of low memory */ 7215 if (scale > PAGE_SHIFT) 7216 numentries >>= (scale - PAGE_SHIFT); 7217 else 7218 numentries <<= (PAGE_SHIFT - scale); 7219 7220 /* Make sure we've got at least a 0-order allocation.. 
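 * (i.e. when the buckets are small, round numentries up so the table
 * spans at least one page; with HASH_SMALL the floor is taken from the
 * caller-provided *_hash_shift instead.)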
*/ 7221 if (unlikely(flags & HASH_SMALL)) { 7222 /* Makes no sense without HASH_EARLY */ 7223 WARN_ON(!(flags & HASH_EARLY)); 7224 if (!(numentries >> *_hash_shift)) { 7225 numentries = 1UL << *_hash_shift; 7226 BUG_ON(!numentries); 7227 } 7228 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 7229 numentries = PAGE_SIZE / bucketsize; 7230 } 7231 numentries = roundup_pow_of_two(numentries); 7232 7233 /* limit allocation size to 1/16 total memory by default */ 7234 if (max == 0) { 7235 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 7236 do_div(max, bucketsize); 7237 } 7238 max = min(max, 0x80000000ULL); 7239 7240 if (numentries < low_limit) 7241 numentries = low_limit; 7242 if (numentries > max) 7243 numentries = max; 7244 7245 log2qty = ilog2(numentries); 7246 7247 do { 7248 size = bucketsize << log2qty; 7249 if (flags & HASH_EARLY) 7250 table = memblock_virt_alloc_nopanic(size, 0); 7251 else if (hashdist) 7252 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 7253 else { 7254 /* 7255 * If bucketsize is not a power-of-two, we may free 7256 * some pages at the end of hash table which 7257 * alloc_pages_exact() automatically does 7258 */ 7259 if (get_order(size) < MAX_ORDER) { 7260 table = alloc_pages_exact(size, GFP_ATOMIC); 7261 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 7262 } 7263 } 7264 } while (!table && size > PAGE_SIZE && --log2qty); 7265 7266 if (!table) 7267 panic("Failed to allocate %s hash table\n", tablename); 7268 7269 pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", 7270 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); 7271 7272 if (_hash_shift) 7273 *_hash_shift = log2qty; 7274 if (_hash_mask) 7275 *_hash_mask = (1 << log2qty) - 1; 7276 7277 return table; 7278 } 7279 7280 /* 7281 * This function checks whether pageblock includes unmovable pages or not. 7282 * If @count is not zero, it is okay to include fewer than @count unmovable pages 7283 * 7284 * PageLRU check without isolation or lru_lock could race so that 7285 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable 7286 * check without lock_page also may miss some movable non-lru pages at 7287 * race condition. So you can't expect this function to be exact. 7288 */ 7289 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7290 bool skip_hwpoisoned_pages) 7291 { 7292 unsigned long pfn, iter, found; 7293 int mt; 7294 7295 /* 7296 * For avoiding noise data, lru_add_drain_all() should be called 7297 * If ZONE_MOVABLE, the zone never contains unmovable pages 7298 */ 7299 if (zone_idx(zone) == ZONE_MOVABLE) 7300 return false; 7301 mt = get_pageblock_migratetype(page); 7302 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 7303 return false; 7304 7305 pfn = page_to_pfn(page); 7306 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 7307 unsigned long check = pfn + iter; 7308 7309 if (!pfn_valid_within(check)) 7310 continue; 7311 7312 page = pfn_to_page(check); 7313 7314 /* 7315 * Hugepages are not in LRU lists, but they're movable. 7316 * We need not scan over tail pages because we don't 7317 * handle each tail page individually in migration. 7318 */ 7319 if (PageHuge(page)) { 7320 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 7321 continue; 7322 } 7323 7324 /* 7325 * We can't use page_count without pinning a page 7326 * because another CPU can free compound page. 7327 * This check already skips compound tails of THP 7328 * because their page->_refcount is zero at all times.
7329 */ 7330 if (!page_ref_count(page)) { 7331 if (PageBuddy(page)) 7332 iter += (1 << page_order(page)) - 1; 7333 continue; 7334 } 7335 7336 /* 7337 * The HWPoisoned page may be not in buddy system, and 7338 * page_count() is not 0. 7339 */ 7340 if (skip_hwpoisoned_pages && PageHWPoison(page)) 7341 continue; 7342 7343 if (__PageMovable(page)) 7344 continue; 7345 7346 if (!PageLRU(page)) 7347 found++; 7348 /* 7349 * If there are RECLAIMABLE pages, we need to check 7350 * it. But now, memory offline itself doesn't call 7351 * shrink_node_slabs() and it still to be fixed. 7352 */ 7353 /* 7354 * If the page is not RAM, page_count()should be 0. 7355 * we don't need more check. This is an _used_ not-movable page. 7356 * 7357 * The problematic thing here is PG_reserved pages. PG_reserved 7358 * is set to both of a memory hole page and a _used_ kernel 7359 * page at boot. 7360 */ 7361 if (found > count) 7362 return true; 7363 } 7364 return false; 7365 } 7366 7367 bool is_pageblock_removable_nolock(struct page *page) 7368 { 7369 struct zone *zone; 7370 unsigned long pfn; 7371 7372 /* 7373 * We have to be careful here because we are iterating over memory 7374 * sections which are not zone aware so we might end up outside of 7375 * the zone but still within the section. 7376 * We have to take care about the node as well. If the node is offline 7377 * its NODE_DATA will be NULL - see page_zone. 7378 */ 7379 if (!node_online(page_to_nid(page))) 7380 return false; 7381 7382 zone = page_zone(page); 7383 pfn = page_to_pfn(page); 7384 if (!zone_spans_pfn(zone, pfn)) 7385 return false; 7386 7387 return !has_unmovable_pages(zone, page, 0, true); 7388 } 7389 7390 #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 7391 7392 static unsigned long pfn_max_align_down(unsigned long pfn) 7393 { 7394 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 7395 pageblock_nr_pages) - 1); 7396 } 7397 7398 static unsigned long pfn_max_align_up(unsigned long pfn) 7399 { 7400 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 7401 pageblock_nr_pages)); 7402 } 7403 7404 /* [start, end) must belong to a single zone. */ 7405 static int __alloc_contig_migrate_range(struct compact_control *cc, 7406 unsigned long start, unsigned long end) 7407 { 7408 /* This function is based on compact_zone() from compaction.c. */ 7409 unsigned long nr_reclaimed; 7410 unsigned long pfn = start; 7411 unsigned int tries = 0; 7412 int ret = 0; 7413 7414 migrate_prep(); 7415 7416 while (pfn < end || !list_empty(&cc->migratepages)) { 7417 if (fatal_signal_pending(current)) { 7418 ret = -EINTR; 7419 break; 7420 } 7421 7422 if (list_empty(&cc->migratepages)) { 7423 cc->nr_migratepages = 0; 7424 pfn = isolate_migratepages_range(cc, pfn, end); 7425 if (!pfn) { 7426 ret = -EINTR; 7427 break; 7428 } 7429 tries = 0; 7430 } else if (++tries == 5) { 7431 ret = ret < 0 ? 
/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
					unsigned long start, unsigned long end)
{
	/* This function is based on compact_zone() from compaction.c. */
	unsigned long nr_reclaimed;
	unsigned long pfn = start;
	unsigned int tries = 0;
	int ret = 0;

	migrate_prep();

	while (pfn < end || !list_empty(&cc->migratepages)) {
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (list_empty(&cc->migratepages)) {
			cc->nr_migratepages = 0;
			pfn = isolate_migratepages_range(cc, pfn, end);
			if (!pfn) {
				ret = -EINTR;
				break;
			}
			tries = 0;
		} else if (++tries == 5) {
			ret = ret < 0 ? ret : -EBUSY;
			break;
		}

		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
							&cc->migratepages);
		cc->nr_migratepages -= nr_reclaimed;

		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
				    NULL, 0, cc->mode, MR_CMA);
	}
	if (ret < 0) {
		putback_movable_pages(&cc->migratepages);
		return ret;
	}
	return 0;
}

/**
 * alloc_contig_range() -- tries to allocate a given range of pages
 * @start:	start PFN to allocate
 * @end:	one-past-the-last PFN to allocate
 * @migratetype:	migratetype of the underlying pageblocks (either
 *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
 *			in the range must have the same migratetype and it
 *			must be either of the two.
 * @gfp_mask:	GFP mask to use during compaction
 *
 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
 * aligned.  However, it is the caller's responsibility to guarantee that
 * we are the only thread that changes the migrate type of the pageblocks
 * the pages fall in.
 *
 * The PFN range must belong to a single zone.
 *
 * Returns zero on success or a negative error code.  On success, all
 * pages whose PFNs are in [start, end) are allocated for the caller and
 * need to be freed with free_contig_range().
 */
int alloc_contig_range(unsigned long start, unsigned long end,
		       unsigned migratetype, gfp_t gfp_mask)
{
	unsigned long outer_start, outer_end;
	unsigned int order;
	int ret = 0;

	struct compact_control cc = {
		.nr_migratepages = 0,
		.order = -1,
		.zone = page_zone(pfn_to_page(start)),
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
		.gfp_mask = current_gfp_context(gfp_mask),
	};
	INIT_LIST_HEAD(&cc.migratepages);

	/*
	 * What we do here is mark all pageblocks in the range as
	 * MIGRATE_ISOLATE.  Because pageblocks and max-order pages may
	 * have different sizes, and due to the way the page allocator
	 * works, we align the range to the bigger of the two so that
	 * the page allocator won't try to merge buddies from
	 * different pageblocks and change MIGRATE_ISOLATE to some
	 * other migration type.
	 *
	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
	 * migrate the pages from the unaligned range (ie. the pages that
	 * we are interested in).  This puts all the pages in the range
	 * back into the page allocator as MIGRATE_ISOLATE.
	 *
	 * When this is done, we take the pages in the range from the page
	 * allocator, removing them from the buddy system.  This way the
	 * page allocator will never consider using them.
	 *
	 * This lets us mark the pageblocks back as
	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
	 * aligned range but not in the unaligned, original range are
	 * put back into the page allocator so that the buddy allocator
	 * can use them.
	 */

	ret = start_isolate_page_range(pfn_max_align_down(start),
				       pfn_max_align_up(end), migratetype,
				       false);
	if (ret)
		return ret;

	/*
	 * In case of -EBUSY, we'd like to know which page causes the
	 * problem.  So, just fall through.  We will check it in
	 * test_pages_isolated().
	 */
	ret = __alloc_contig_migrate_range(&cc, start, end);
	if (ret && ret != -EBUSY)
		goto done;

	/*
	 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
	 * aligned blocks that are marked as MIGRATE_ISOLATE.
	 * What's more, all pages in [start, end) are free in the page
	 * allocator.  What we are going to do is to allocate all pages from
	 * [start, end) (that is, remove them from the page allocator).
	 *
	 * The only problem is that pages at the beginning and at the
	 * end of the interesting range may not be aligned with pages that
	 * the page allocator holds, ie. they can be part of higher-order
	 * pages.  Because of this, we reserve the bigger range and
	 * once this is done free the pages we are not interested in.
	 *
	 * We don't have to hold zone->lock here because the pages are
	 * isolated, thus they won't get removed from buddy.
	 */

	lru_add_drain_all();
	drain_all_pages(cc.zone);

	order = 0;
	outer_start = start;
	while (!PageBuddy(pfn_to_page(outer_start))) {
		if (++order >= MAX_ORDER) {
			outer_start = start;
			break;
		}
		outer_start &= ~0UL << order;
	}

	if (outer_start != start) {
		order = page_order(pfn_to_page(outer_start));

		/*
		 * The outer_start page could be a small-order buddy page
		 * that doesn't include the start page.  Adjust outer_start
		 * in this case to report the failed page properly
		 * on the tracepoint in test_pages_isolated().
		 */
		if (outer_start + (1UL << order) <= start)
			outer_start = start;
	}

	/* Make sure the range is really isolated. */
	if (test_pages_isolated(outer_start, end, false)) {
		pr_info("%s: [%lx, %lx) PFNs busy\n",
			__func__, outer_start, end);
		ret = -EBUSY;
		goto done;
	}

	/* Grab isolated pages from freelists. */
	outer_end = isolate_freepages_range(&cc, outer_start, end);
	if (!outer_end) {
		ret = -EBUSY;
		goto done;
	}

	/* Free head and tail (if any) */
	if (start != outer_start)
		free_contig_range(outer_start, start - outer_start);
	if (end != outer_end)
		free_contig_range(end, outer_end - end);

done:
	undo_isolate_page_range(pfn_max_align_down(start),
				pfn_max_align_up(end), migratetype);
	return ret;
}

void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
	unsigned int count = 0;

	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		count += page_count(page) != 1;
		__free_page(page);
	}
	WARN(count != 0, "%d pages are still in use!\n", count);
}
#endif

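/*
 * A minimal usage sketch for alloc_contig_range()/free_contig_range(),
 * kept under #if 0 because it is illustrative only: the helper names are
 * hypothetical, the pageblocks in the range are assumed to already be
 * MIGRATE_MOVABLE, and the -EBUSY retry logic a real caller (e.g. CMA)
 * would have is deliberately omitted.
 */
#if 0
static struct page *example_grab_contig(unsigned long start_pfn,
					unsigned long nr_pages)
{
	int ret;

	/* Claim [start_pfn, start_pfn + nr_pages) from the page allocator. */
	ret = alloc_contig_range(start_pfn, start_pfn + nr_pages,
				 MIGRATE_MOVABLE, GFP_KERNEL);
	if (ret)
		return NULL;

	/* On success every PFN in the range now belongs to the caller. */
	return pfn_to_page(start_pfn);
}

static void example_release_contig(unsigned long start_pfn,
				   unsigned long nr_pages)
{
	/* Pairs with a successful example_grab_contig() above. */
	free_contig_range(start_pfn, nr_pages);
}
#endif
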
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * The zone indicated has a new number of managed_pages; batch sizes and
 * percpu page high values need to be recalculated.
 */
void __meminit zone_pcp_update(struct zone *zone)
{
	unsigned cpu;
	mutex_lock(&pcp_batch_high_lock);
	for_each_possible_cpu(cpu)
		pageset_set_high_and_batch(zone,
				per_cpu_ptr(zone->pageset, cpu));
	mutex_unlock(&pcp_batch_high_lock);
}
#endif

void zone_pcp_reset(struct zone *zone)
{
	unsigned long flags;
	int cpu;
	struct per_cpu_pageset *pset;

	/* avoid races with drain_pages() */
	local_irq_save(flags);
	if (zone->pageset != &boot_pageset) {
		for_each_online_cpu(cpu) {
			pset = per_cpu_ptr(zone->pageset, cpu);
			drain_zonestat(zone, pset);
		}
		free_percpu(zone->pageset);
		zone->pageset = &boot_pageset;
	}
	local_irq_restore(flags);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be in a single zone and isolated
 * before calling this.
 */
void
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	struct page *page;
	struct zone *zone;
	unsigned int order, i;
	unsigned long pfn;
	unsigned long flags;
	/* find the first valid pfn */
	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		if (pfn_valid(pfn))
			break;
	if (pfn == end_pfn)
		return;
	zone = page_zone(pfn_to_page(pfn));
	spin_lock_irqsave(&zone->lock, flags);
	pfn = start_pfn;
	while (pfn < end_pfn) {
		if (!pfn_valid(pfn)) {
			pfn++;
			continue;
		}
		page = pfn_to_page(pfn);
		/*
		 * An HWPoisoned page may not be in the buddy system,
		 * and its page_count() is not 0.
		 */
		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
			pfn++;
			SetPageReserved(page);
			continue;
		}

		BUG_ON(page_count(page));
		BUG_ON(!PageBuddy(page));
		order = page_order(page);
#ifdef CONFIG_DEBUG_VM
		pr_info("remove from free list %lx %d %lx\n",
			pfn, 1 << order, end_pfn);
#endif
		list_del(&page->lru);
		rmv_page_order(page);
		zone->free_area[order].nr_free--;
		for (i = 0; i < (1 << order); i++)
			SetPageReserved((page+i));
		pfn += (1 << order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif

bool is_free_buddy_page(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long flags;
	unsigned int order;

	spin_lock_irqsave(&zone->lock, flags);
	for (order = 0; order < MAX_ORDER; order++) {
		struct page *page_head = page - (pfn & ((1 << order) - 1));

		if (PageBuddy(page_head) && page_order(page_head) >= order)
			break;
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	return order < MAX_ORDER;
}
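
/*
 * Worked example for is_free_buddy_page() above (illustrative only, with a
 * made-up pfn): suppose the page of interest has pfn 0x12345.  At order 3
 * the candidate head is the page at pfn 0x12345 & ~0x7 == 0x12340; if that
 * page has PageBuddy set with page_order() >= 3, the original page lies
 * inside a free buddy block of at least 8 pages and the loop stops.  If no
 * order below MAX_ORDER matches, the function returns false.
 */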