/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>

#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION	(8)

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
int _node_numa_mem_[MAX_NUMNODES];
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
	[N_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

/* Protect totalram_pages and zone->managed_pages */
static DEFINE_SPINLOCK(managed_page_count_lock);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;

int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. a page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
	return page->index;
}

static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to
 * temporarily change gfp_allowed_mask in order to avoid using I/O during
 * memory allocations while devices are suspended. To avoid races with the
 * suspend/hibernate code, they should always be called with pm_mutex held
 * (gfp_allowed_mask also should only be modified with pm_mutex held, unless
 * the suspend/hibernate code is guaranteed not to run in parallel with that
 * modification).
 */
static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		return false;
	return true;
}
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,
};

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};

compound_page_dtor * const compound_page_dtors[] = {
	NULL,
	free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	free_transhuge_page,
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;

static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;

#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
	pgdat->first_deferred_pfn = ULONG_MAX;
}

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn)
		return true;

	return false;
}

static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
{
	if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}

/*
 * Returns false when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	/* Always populate low zones for address-constrained allocations */
	if (zone_end < pgdat_end_pfn(pgdat))
		return true;

	/* Initialise at least 2G of the highest zone */
	(*nr_initialised)++;
	if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		pgdat->first_deferred_pfn = pfn;
		return false;
	}

	return true;
}
#else
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
}

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
{
	return false;
}

static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	return true;
}
#endif


void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason,
		unsigned long bad_flags)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/* Don't complain about poisoned pages */
	if (PageHWPoison(page)) {
		page_mapcount_reset(page); /* remove PageBuddy */
		return;
	}

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	dump_page_badflags(page, reason, bad_flags);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits are a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset into the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;
		set_page_count(p, 0);
		p->mapping = TAIL_MAPPING;
		set_compound_head(p, page);
	}
	atomic_set(compound_mapcount_ptr(page), -1);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly;
bool _debug_guardpage_enabled __read_mostly;

static int __init early_debug_pagealloc(char *buf)
{
	if (!buf)
		return -EINVAL;

	if (strcmp(buf, "on") == 0)
		_debug_pagealloc_enabled = true;

	return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);

static bool need_debug_guardpage(void)
{
	/* If we don't use debug_pagealloc, we don't need guard page */
	if (!debug_pagealloc_enabled())
		return false;

	return true;
}

static void init_debug_guardpage(void)
{
	if (!debug_pagealloc_enabled())
		return;

	_debug_guardpage_enabled = true;
}

struct page_ext_operations debug_guardpage_ops = {
	.need = need_debug_guardpage,
	.init = init_debug_guardpage,
};

static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
		printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);

static inline void set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	struct page_ext *page_ext;

	if (!debug_guardpage_enabled())
		return;

	page_ext = lookup_page_ext(page);
	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

	INIT_LIST_HEAD(&page->lru);
	set_page_private(page, order);
	/* Guard pages are not available for any usage */
	__mod_zone_freepage_state(zone, -(1 << order), migratetype);
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	struct page_ext *page_ext;

	if (!debug_guardpage_enabled())
		return;

	page_ext = lookup_page_ext(page);
	__clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
struct page_ext_operations debug_guardpage_ops = { NULL, };
static inline void set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
#endif

static inline void set_page_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}
/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set ->_mapcount
 * to PAGE_BUDDY_MAPCOUNT_VALUE.
 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
 * serialized by zone->lock.
 *
 * For recording a page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
							unsigned int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)))
		return 0;

	if (page_is_guard(buddy) && page_order(buddy) == order) {
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		/*
		 * zone check is done late to avoid uselessly
		 * calculating zone/node ids for pages that could
		 * never merge.
		 */
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of contiguous
 * free pages of length (1 << order) and marked with _mapcount
 * PAGE_BUDDY_MAPCOUNT_VALUE. A page's order is recorded in the
 * page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
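/*
 * Illustrative example (added note, not from the original source): with
 * __find_buddy_index(page_idx, order) computing page_idx ^ (1 << order)
 * (as defined in mm/internal.h), an order-1 block at page_idx 8 has its
 * buddy at index 8 ^ 2 = 10. If that buddy is free and also of order 1,
 * the two blocks merge into an order-2 block starting at
 * combined_idx = 10 & 8 = 8, and the search repeats one order higher, as
 * done by the merging loop in __free_one_page() below.
 */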
static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype)
{
	unsigned long page_idx;
	unsigned long combined_idx;
	unsigned long uninitialized_var(buddy_idx);
	struct page *buddy;
	unsigned int max_order = MAX_ORDER;

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (is_migrate_isolate(migratetype)) {
		/*
		 * We restrict max order of merging to prevent merge
		 * between freepages on isolate pageblock and normal
		 * pageblock. Without this, pageblock isolation
		 * could cause incorrect freepage accounting.
		 */
		max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
	} else {
		__mod_zone_freepage_state(zone, 1 << order, migratetype);
	}

	page_idx = pfn & ((1 << max_order) - 1);

	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

	while (order < max_order - 1) {
		buddy_idx = __find_buddy_index(page_idx, order);
		buddy = page + (buddy_idx - page_idx);
		if (!page_is_buddy(page, buddy, order))
			break;
		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy)) {
			clear_page_guard(zone, buddy, order, migratetype);
		} else {
			list_del(&buddy->lru);
			zone->free_area[order].nr_free--;
			rmv_page_order(buddy);
		}
		combined_idx = buddy_idx & page_idx;
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	set_page_order(page, order);

	/*
	 * If this is not the largest possible page, check if the buddy
	 * of the next-highest order is free. If it is, it's possible
	 * that pages are being freed that will coalesce soon. In case
	 * that is happening, add the free page to the tail of the list
	 * so it's less likely to be used soon and more likely to be merged
	 * as a higher order page.
	 */
	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
		struct page *higher_page, *higher_buddy;
		combined_idx = buddy_idx & page_idx;
		higher_page = page + (combined_idx - page_idx);
		buddy_idx = __find_buddy_index(combined_idx, order + 1);
		higher_buddy = higher_page + (buddy_idx - combined_idx);
		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
			list_add_tail(&page->lru,
				&zone->free_area[order].free_list[migratetype]);
			goto out;
		}
	}

	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
	zone->free_area[order].nr_free++;
}

static inline int free_pages_check(struct page *page)
{
	const char *bad_reason = NULL;
	unsigned long bad_flags = 0;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(atomic_read(&page->_count) != 0))
		bad_reason = "nonzero _count";
	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->mem_cgroup))
		bad_reason = "page still charged to cgroup";
#endif
	if (unlikely(bad_reason)) {
		bad_page(page, bad_reason, bad_flags);
		return 1;
	}
	page_cpupid_reset_last(page);
	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	return 0;
}

/*
 * Frees a number of pages from the PCP lists.
 * Assumes all pages on the list are in the same zone, and of the same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp)
{
	int migratetype = 0;
	int batch_free = 0;
	int to_free = count;
	unsigned long nr_scanned;

	spin_lock(&zone->lock);
	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
	if (nr_scanned)
		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);

	while (to_free) {
		struct page *page;
		struct list_head *list;

		/*
		 * Remove pages from lists in a round-robin fashion. A
		 * batch_free count is maintained that is incremented when an
		 * empty list is encountered.  This is so more pages are freed
		 * off fuller lists instead of spinning excessively around
		 * empty lists.
		 */
		do {
			batch_free++;
			if (++migratetype == MIGRATE_PCPTYPES)
				migratetype = 0;
			list = &pcp->lists[migratetype];
		} while (list_empty(list));

		/* This is the only non-empty list. Free them all. */
		if (batch_free == MIGRATE_PCPTYPES)
			batch_free = to_free;

		do {
			int mt;	/* migratetype of the to-be-freed page */

			page = list_last_entry(list, struct page, lru);
			/* must delete as __free_one_page list manipulates */
			list_del(&page->lru);

			mt = get_pcppage_migratetype(page);
			/* MIGRATE_ISOLATE page should not go to pcplists */
			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
			/* Pageblock could have been isolated meanwhile */
			if (unlikely(has_isolate_pageblock(zone)))
				mt = get_pageblock_migratetype(page);

			__free_one_page(page, page_to_pfn(page), zone, 0, mt);
			trace_mm_page_pcpu_drain(page, 0, mt);
		} while (--to_free && --batch_free && !list_empty(list));
	}
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype)
{
	unsigned long nr_scanned;
	spin_lock(&zone->lock);
	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
	if (nr_scanned)
		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);

	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype);
	spin_unlock(&zone->lock);
}
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
	int ret = 1;

	/*
	 * We rely on page->lru.next never having bit 0 set, unless the page
	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
	 */
	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);

	if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
		ret = 0;
		goto out;
	}
	switch (page - head_page) {
	case 1:
		/* the first tail page: ->mapping is compound_mapcount() */
		if (unlikely(compound_mapcount(page))) {
			bad_page(page, "nonzero compound_mapcount", 0);
			goto out;
		}
		break;
	case 2:
		/*
		 * the second tail page: ->mapping is
		 * page_deferred_list().next -- ignore value.
		 */
		break;
	default:
		if (page->mapping != TAIL_MAPPING) {
			bad_page(page, "corrupted mapping in tail page", 0);
			goto out;
		}
		break;
	}
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set", 0);
		goto out;
	}
	if (unlikely(compound_head(page) != head_page)) {
		bad_page(page, "compound_head not consistent", 0);
		goto out;
	}
	ret = 0;
out:
	page->mapping = NULL;
	clear_compound_head(page);
	return ret;
}

static void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	set_page_links(page, zone, nid, pfn);
	init_page_count(page);
	page_mapcount_reset(page);
	page_cpupid_reset_last(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}

static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
					int nid)
{
	return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void init_reserved_page(unsigned long pfn)
{
	pg_data_t *pgdat;
	int nid, zid;

	if (!early_page_uninitialised(pfn))
		return;

	nid = early_pfn_to_nid(pfn);
	pgdat = NODE_DATA(nid);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
			break;
	}
	__init_single_pfn(pfn, zid, nid);
}
#else
static inline void init_reserved_page(unsigned long pfn)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
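/*
 * Illustrative example (added note, not from the original source), assuming
 * 4 KiB pages: for a bootmem range [0x1800, 0x3800), reserve_bootmem_region()
 * below computes start_pfn = PFN_DOWN(0x1800) = 1 and
 * end_pfn = PFN_UP(0x3800) = 4, so the struct pages for pfns 1..3 are marked
 * PageReserved; the rounding guarantees every page touched by the range is
 * covered.
 */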
/*
 * Initialised pages do not have PageReserved set. This function is
 * called for each range allocated by the bootmem allocator and
 * marks the pages PageReserved. The remaining valid pages are later
 * sent to the buddy page allocator.
 */
void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
{
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long end_pfn = PFN_UP(end);

	for (; start_pfn < end_pfn; start_pfn++) {
		if (pfn_valid(start_pfn)) {
			struct page *page = pfn_to_page(start_pfn);

			init_reserved_page(start_pfn);

			/* Avoid false-positive PageTail() */
			INIT_LIST_HEAD(&page->lru);

			SetPageReserved(page);
		}
	}
}

static bool free_pages_prepare(struct page *page, unsigned int order)
{
	bool compound = PageCompound(page);
	int i, bad = 0;

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

	trace_mm_page_free(page, order);
	kmemcheck_free_shadow(page, order);
	kasan_free_pages(page, order);

	if (PageAnon(page))
		page->mapping = NULL;
	bad += free_pages_check(page);
	for (i = 1; i < (1 << order); i++) {
		if (compound)
			bad += free_tail_pages_check(page, page + i);
		bad += free_pages_check(page + i);
	}
	if (bad)
		return false;

	reset_page_owner(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}
	arch_free_page(page, order);
	kernel_map_pages(page, 1 << order, 0);

	return true;
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);

	if (!free_pages_prepare(page, order))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	free_one_page(page_zone(page), page, pfn, order, migratetype);
	local_irq_restore(flags);
}

static void __init __free_pages_boot_core(struct page *page,
					unsigned long pfn, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	struct page *p = page;
	unsigned int loop;

	prefetchw(p);
	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
		prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}
	__ClearPageReserved(p);
	set_page_count(p, 0);

	page_zone(page)->managed_pages += nr_pages;
	set_page_refcounted(page);
	__free_pages(page, order);
}

#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)

static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	spin_lock(&early_pfn_lock);
	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = 0;
	spin_unlock(&early_pfn_lock);

	return nid;
}
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
					struct mminit_pfnnid_cache *state)
{
	int nid;

	nid = __early_pfn_to_nid(pfn, state);
	if (nid >= 0 && nid != node)
		return false;
	return true;
}

/* Only safe to use early in boot when initialisation is single-threaded */
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
	return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
}

#else

static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
	return true;
}
static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
					struct mminit_pfnnid_cache *state)
{
	return true;
}
#endif


void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
							unsigned int order)
{
	if (early_page_uninitialised(pfn))
		return;
	return __free_pages_boot_core(page, pfn, order);
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(struct page *page,
					unsigned long pfn, int nr_pages)
{
	int i;

	if (!page)
		return;

	/* Free a large naturally-aligned chunk if possible */
	if (nr_pages == MAX_ORDER_NR_PAGES &&
	    (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		__free_pages_boot_core(page, pfn, MAX_ORDER-1);
		return;
	}

	for (i = 0; i < nr_pages; i++, page++, pfn++)
		__free_pages_boot_core(page, pfn, 0);
}

/* Completion tracking for deferred_init_memmap() threads */
static atomic_t pgdat_init_n_undone __initdata;
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);

static inline void __init pgdat_init_report_one_done(void)
{
	if (atomic_dec_and_test(&pgdat_init_n_undone))
		complete(&pgdat_init_all_done_comp);
}

/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
	pg_data_t *pgdat = data;
	int nid = pgdat->node_id;
	struct mminit_pfnnid_cache nid_init_state = { };
	unsigned long start = jiffies;
	unsigned long nr_pages = 0;
	unsigned long walk_start, walk_end;
	int i, zid;
	struct zone *zone;
	unsigned long first_init_pfn = pgdat->first_deferred_pfn;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	if (first_init_pfn == ULONG_MAX) {
		pgdat_init_report_one_done();
		return 0;
	}

	/* Bind memory initialisation thread to a local node if possible */
	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(current, cpumask);

	/* Sanity check boundaries */
	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
	pgdat->first_deferred_pfn = ULONG_MAX;

	/* Only the highest zone is deferred so find it */
	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		zone = pgdat->node_zones + zid;
		if (first_init_pfn < zone_end_pfn(zone))
			break;
	}

	for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
		unsigned long pfn, end_pfn;
		struct page *page = NULL;
		struct page *free_base_page = NULL;
		unsigned long free_base_pfn = 0;
		int nr_to_free = 0;

		end_pfn = min(walk_end, zone_end_pfn(zone));
		pfn = first_init_pfn;
		if (pfn < walk_start)
			pfn = walk_start;
		if (pfn < zone->zone_start_pfn)
			pfn = zone->zone_start_pfn;

		for (; pfn < end_pfn; pfn++) {
			if (!pfn_valid_within(pfn))
				goto free_range;

			/*
			 * Ensure pfn_valid is checked every
			 * MAX_ORDER_NR_PAGES for memory holes
			 */
			if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
				if (!pfn_valid(pfn)) {
					page = NULL;
					goto free_range;
				}
			}
			if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
				page = NULL;
				goto free_range;
			}

			/* Minimise pfn page lookups and scheduler checks */
			if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
				page++;
			} else {
				nr_pages += nr_to_free;
				deferred_free_range(free_base_page,
						free_base_pfn, nr_to_free);
				free_base_page = NULL;
				free_base_pfn = nr_to_free = 0;

				page = pfn_to_page(pfn);
				cond_resched();
			}

			if (page->flags) {
				VM_BUG_ON(page_zone(page) != zone);
				goto free_range;
			}

			__init_single_page(page, pfn, zid, nid);
			if (!free_base_page) {
				free_base_page = page;
				free_base_pfn = pfn;
				nr_to_free = 0;
			}
			nr_to_free++;

			/* Where possible, batch up pages for a single free */
			continue;
free_range:
			/* Free the current block of pages to allocator */
			nr_pages += nr_to_free;
			deferred_free_range(free_base_page, free_base_pfn,
						nr_to_free);
			free_base_page = NULL;
			free_base_pfn = nr_to_free = 0;
		}

		first_init_pfn = max(end_pfn, first_init_pfn);
	}

	/* Sanity check that the next zone really is unpopulated */
	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));

	pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
					jiffies_to_msecs(jiffies - start));

	pgdat_init_report_one_done();
	return 0;
}

void __init page_alloc_init_late(void)
{
	int nid;

	/* There will be num_node_state(N_MEMORY) threads */
	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
	for_each_node_state(nid, N_MEMORY) {
		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
	}

	/* Block until all are initialised */
	wait_for_completion(&pgdat_init_all_done_comp);

	/* Reinit limits that are based on free pages after the kernel is up */
	files_maxfiles_init();
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
	unsigned i = pageblock_nr_pages;
	struct page *p = page;

	do {
		__ClearPageReserved(p);
		set_page_count(p, 0);
	} while (++p, --i);

	set_pageblock_migratetype(page, MIGRATE_CMA);

	if (pageblock_order >= MAX_ORDER) {
		i = pageblock_nr_pages;
		p = page;
		do {
			set_page_refcounted(p);
			__free_pages(p, MAX_ORDER - 1);
			p += MAX_ORDER_NR_PAGES;
		} while (i -= MAX_ORDER_NR_PAGES);
	} else {
		set_page_refcounted(page);
		__free_pages(page, pageblock_order);
	}

	adjust_managed_page_count(page, pageblock_nr_pages);
}
#endif

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
			debug_guardpage_enabled() &&
			high < debug_guardpage_minorder()) {
			/*
			 * Mark as a guard page (or pages); this allows the
			 * block to merge back into the allocator when its
			 * buddy is freed. The corresponding page table
			 * entries are not touched; the pages stay not
			 * present in the virtual address space.
			 */
			set_page_guard(zone, &page[size], high, migratetype);
			continue;
		}
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}
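/*
 * Illustrative example (added note, not from the original source): expand()
 * above, called with low = 0 and high = 3, repeatedly splits off the upper
 * half of the block: it returns an order-2 block at page[4], an order-1
 * block at page[2] and an order-0 block at page[1] to the free lists, and
 * leaves page[0] as the order-0 page handed back to the caller.
 */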
/*
 * This page is about to be returned from the page allocator
 */
static inline int check_new_page(struct page *page)
{
	const char *bad_reason = NULL;
	unsigned long bad_flags = 0;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(atomic_read(&page->_count) != 0))
		bad_reason = "nonzero _count";
	if (unlikely(page->flags & __PG_HWPOISON)) {
		bad_reason = "HWPoisoned (hardware-corrupted)";
		bad_flags = __PG_HWPOISON;
	}
	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
		bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->mem_cgroup))
		bad_reason = "page still charged to cgroup";
#endif
	if (unlikely(bad_reason)) {
		bad_page(page, bad_reason, bad_flags);
		return 1;
	}
	return 0;
}

static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
								int alloc_flags)
{
	int i;

	for (i = 0; i < (1 << order); i++) {
		struct page *p = page + i;
		if (unlikely(check_new_page(p)))
			return 1;
	}

	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	kernel_map_pages(page, 1 << order, 1);
	kasan_alloc_pages(page, order);

	if (gfp_flags & __GFP_ZERO)
		for (i = 0; i < (1 << order); i++)
			clear_highpage(page + i);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	set_page_owner(page, order, gfp_flags);

	/*
	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
	 * allocate the page. The expectation is that the caller is taking
	 * steps that will free more memory. The caller should avoid the page
	 * being used for !PFMEMALLOC purposes.
	 */
	if (alloc_flags & ALLOC_NO_WATERMARKS)
		set_page_pfmemalloc(page);
	else
		clear_page_pfmemalloc(page);

	return 0;
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = list_first_entry_or_null(&area->free_list[migratetype],
							struct page, lru);
		if (!page)
			continue;
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);
		set_pcppage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}


/*
 * This array describes the order in which free lists are fallen back to
 * when the free lists for the desired migratetype are depleted.
 */
static int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
	[MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	[MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
#endif
};

#ifdef CONFIG_CMA
static struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order)
{
	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order) { return NULL; }
#endif

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_page are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 */
int move_freepages(struct zone *zone,
			  struct page *start_page, struct page *end_page,
			  int migratetype)
{
	struct page *page;
	unsigned int order;
	int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
	/*
	 * page_zone is not safe to call in this context when
	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
	 * anyway as we check zone boundaries in move_freepages_block().
	 * Remove at a later date when no bug reports exist related to
	 * grouping pages by mobility.
	 */
	VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

	for (page = start_page; page <= end_page;) {
		/* Make sure we are not inadvertently changing nodes */
		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);

		if (!pfn_valid_within(page_to_pfn(page))) {
			page++;
			continue;
		}

		if (!PageBuddy(page)) {
			page++;
			continue;
		}

		order = page_order(page);
		list_move(&page->lru,
			  &zone->free_area[order].free_list[migratetype]);
		page += 1 << order;
		pages_moved += 1 << order;
	}

	return pages_moved;
}

int move_freepages_block(struct zone *zone, struct page *page,
				int migratetype)
{
	unsigned long start_pfn, end_pfn;
	struct page *start_page, *end_page;

	start_pfn = page_to_pfn(page);
	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
	start_page = pfn_to_page(start_pfn);
	end_page = start_page + pageblock_nr_pages - 1;
	end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* Do not cross zone boundaries */
	if (!zone_spans_pfn(zone, start_pfn))
		start_page = page;
	if (!zone_spans_pfn(zone, end_pfn))
		return 0;

	return move_freepages(zone, start_page, end_page, migratetype);
}

static void change_pageblock_range(struct page *pageblock_page,
					int start_order, int migratetype)
{
	int nr_pageblocks = 1 << (start_order - pageblock_order);

	while (nr_pageblocks--) {
		set_pageblock_migratetype(pageblock_page, migratetype);
		pageblock_page += pageblock_nr_pages;
	}
}

/*
 * When we are falling back to another migratetype during allocation, try to
 * steal extra free pages from the same pageblocks to satisfy further
 * allocations, instead of polluting multiple pageblocks.
 *
 * If we are stealing a relatively large buddy page, it is likely there will
 * be more free pages in the pageblock, so try to steal them all. For
 * reclaimable and unmovable allocations, we steal regardless of page size,
 * as fragmentation caused by those allocations polluting movable pageblocks
 * is worse than movable allocations stealing from unmovable and reclaimable
 * pageblocks.
 */
static bool can_steal_fallback(unsigned int order, int start_mt)
{
	/*
	 * Leaving this order check here is intentional, even though the next
	 * check uses a more relaxed order. The reason is that we can steal
	 * the whole pageblock if this condition is met, while the check below
	 * does not guarantee it and is just a heuristic, so it could be
	 * changed at any time.
	 */
	if (order >= pageblock_order)
		return true;

	if (order >= pageblock_order / 2 ||
		start_mt == MIGRATE_RECLAIMABLE ||
		start_mt == MIGRATE_UNMOVABLE ||
		page_group_by_mobility_disabled)
		return true;

	return false;
}
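/*
 * Illustrative example (added note, not from the original source), assuming
 * pageblock_order == 9: an order-5 MIGRATE_UNMOVABLE request that has to
 * fall back tries MIGRATE_RECLAIMABLE first and then MIGRATE_MOVABLE (see
 * fallbacks[] above). Because the request is unmovable (and also because
 * 5 >= 9 / 2), can_steal_fallback() returns true, so
 * steal_suitable_fallback() below moves the block's free pages and claims
 * the whole pageblock for MIGRATE_UNMOVABLE if at least half of it,
 * 1 << (pageblock_order - 1) = 256 pages, was free.
 */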
/*
 * This function implements actual steal behaviour. If order is large enough,
 * we can steal the whole pageblock. If not, we first move the freepages in
 * this pageblock and check whether half of the pages are moved or not. If
 * half of the pages are moved, we can change the migratetype of the
 * pageblock and permanently use its pages as the requested migratetype in
 * the future.
 */
static void steal_suitable_fallback(struct zone *zone, struct page *page,
							  int start_type)
{
	unsigned int current_order = page_order(page);
	int pages;

	/* Take ownership for orders >= pageblock_order */
	if (current_order >= pageblock_order) {
		change_pageblock_range(page, current_order, start_type);
		return;
	}

	pages = move_freepages_block(zone, page, start_type);

	/* Claim the whole block if over half of it is free */
	if (pages >= (1 << (pageblock_order-1)) ||
			page_group_by_mobility_disabled)
		set_pageblock_migratetype(page, start_type);
}

/*
 * Check whether there is a suitable fallback freepage with requested order.
 * If only_stealable is true, this function returns fallback_mt only if
 * we can steal other freepages altogether. This would help to reduce
 * fragmentation due to mixed migratetype pages in one pageblock.
 */
int find_suitable_fallback(struct free_area *area, unsigned int order,
			int migratetype, bool only_stealable, bool *can_steal)
{
	int i;
	int fallback_mt;

	if (area->nr_free == 0)
		return -1;

	*can_steal = false;
	for (i = 0;; i++) {
		fallback_mt = fallbacks[migratetype][i];
		if (fallback_mt == MIGRATE_TYPES)
			break;

		if (list_empty(&area->free_list[fallback_mt]))
			continue;

		if (can_steal_fallback(order, migratetype))
			*can_steal = true;

		if (!only_stealable)
			return fallback_mt;

		if (*can_steal)
			return fallback_mt;
	}

	return -1;
}

/*
 * Reserve a pageblock for exclusive use of high-order atomic allocations if
 * there are no empty page blocks that contain a page with a suitable order
 */
static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
				unsigned int alloc_order)
{
	int mt;
	unsigned long max_managed, flags;

	/*
	 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
	 * Check is race-prone but harmless.
	 */
	max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
	if (zone->nr_reserved_highatomic >= max_managed)
		return;

	spin_lock_irqsave(&zone->lock, flags);

	/* Recheck the nr_reserved_highatomic limit under the lock */
	if (zone->nr_reserved_highatomic >= max_managed)
		goto out_unlock;

	/* Yoink! */
	mt = get_pageblock_migratetype(page);
	if (mt != MIGRATE_HIGHATOMIC &&
			!is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
		zone->nr_reserved_highatomic += pageblock_nr_pages;
		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
	}

out_unlock:
	spin_unlock_irqrestore(&zone->lock, flags);
}
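/*
 * Illustrative arithmetic (added note, not from the original source): for a
 * zone with 1,000,000 managed pages and pageblock_nr_pages == 512,
 * max_managed above works out to 1,000,000 / 100 + 512 = 10,512 pages, i.e.
 * at most about 20 pageblocks can be reserved as MIGRATE_HIGHATOMIC.
 */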
/*
 * Used when an allocation is about to fail under memory pressure. This
 * potentially hurts the reliability of high-order allocations when under
 * intense memory pressure but failed atomic allocations should be easier
 * to recover from than an OOM.
 */
static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
{
	struct zonelist *zonelist = ac->zonelist;
	unsigned long flags;
	struct zoneref *z;
	struct zone *zone;
	struct page *page;
	int order;

	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
								ac->nodemask) {
		/* Preserve at least one pageblock */
		if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			struct free_area *area = &(zone->free_area[order]);

			page = list_first_entry_or_null(
					&area->free_list[MIGRATE_HIGHATOMIC],
					struct page, lru);
			if (!page)
				continue;

			/*
			 * It should never happen but changes to locking could
			 * inadvertently allow a per-cpu drain to add pages
			 * to MIGRATE_HIGHATOMIC while unreserving so be safe
			 * and watch for underflows.
			 */
			zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
				zone->nr_reserved_highatomic);

			/*
			 * Convert to ac->migratetype and avoid the normal
			 * pageblock stealing heuristics. Minimally, the caller
			 * is doing the work and needs the pages. More
			 * importantly, if the block was always converted to
			 * MIGRATE_UNMOVABLE or another type then the number
			 * of pageblocks that cannot be completely freed
			 * may increase.
			 */
			set_pageblock_migratetype(page, ac->migratetype);
			move_freepages_block(zone, page, ac->migratetype);
			spin_unlock_irqrestore(&zone->lock, flags);
			return;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}

/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
	struct free_area *area;
	unsigned int current_order;
	struct page *page;
	int fallback_mt;
	bool can_steal;

	/* Find the largest possible block of pages in the other list */
	for (current_order = MAX_ORDER-1;
				current_order >= order && current_order <= MAX_ORDER-1;
				--current_order) {
		area = &(zone->free_area[current_order]);
		fallback_mt = find_suitable_fallback(area, current_order,
				start_migratetype, false, &can_steal);
		if (fallback_mt == -1)
			continue;

		page = list_first_entry(&area->free_list[fallback_mt],
						struct page, lru);
		if (can_steal)
			steal_suitable_fallback(zone, page, start_migratetype);

		/* Remove the page from the freelists */
		area->nr_free--;
		list_del(&page->lru);
		rmv_page_order(page);

		expand(zone, page, order, current_order, area,
					start_migratetype);
		/*
		 * The pcppage_migratetype may differ from the pageblock's
		 * migratetype depending on the decisions in
		 * find_suitable_fallback(). This is OK as long as it does not
		 * differ for MIGRATE_CMA pageblocks. Those can be used as
		 * fallback only via the special __rmqueue_cma_fallback()
		 * function.
		 */
		set_pcppage_migratetype(page, start_migratetype);

		trace_mm_page_alloc_extfrag(page, order, current_order,
			start_migratetype, fallback_mt);

		return page;
	}

	return NULL;
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
				int migratetype)
{
	struct page *page;

	page = __rmqueue_smallest(zone, order, migratetype);
	if (unlikely(!page)) {
		if (migratetype == MIGRATE_MOVABLE)
			page = __rmqueue_cma_fallback(zone, order);

		if (!page)
			page = __rmqueue_fallback(zone, order, migratetype);
	}

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, bool cold)
{
	int i;

	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype);
		if (unlikely(page == NULL))
			break;

		/*
		 * Split buddy pages returned by expand() are received here
		 * in physical page order. The page is added to the caller's
		 * list and the list head then moves forward. From the
		 * caller's perspective, the linked list is ordered by page
		 * number under some conditions. This is useful for IO devices
		 * that can merge IO requests if the physical pages are
		 * ordered properly.
		 */
		if (likely(!cold))
			list_add(&page->lru, list);
		else
			list_add_tail(&page->lru, list);
		list = &page->lru;
		if (is_migrate_cma(get_pcppage_migratetype(page)))
			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
					      -(1 << order));
	}
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
	spin_unlock(&zone->lock);
	return i;
}

#ifdef CONFIG_NUMA
/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 *
 * Note that this function must be called with the thread pinned to
 * a single processor.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	int to_drain, batch;

	local_irq_save(flags);
	batch = READ_ONCE(pcp->batch);
	to_drain = min(pcp->count, batch);
	if (to_drain > 0) {
		free_pcppages_bulk(zone, to_drain, pcp);
		pcp->count -= to_drain;
	}
	local_irq_restore(flags);
}
#endif

/*
 * Drain pcplists of the indicated processor and zone.
 *
 * The processor must either be the current processor and the
 * thread pinned to the current processor or a processor that
 * is not online.
 */
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
	unsigned long flags;
	struct per_cpu_pageset *pset;
	struct per_cpu_pages *pcp;

	local_irq_save(flags);
	pset = per_cpu_ptr(zone->pageset, cpu);

	pcp = &pset->pcp;
	if (pcp->count) {
		free_pcppages_bulk(zone, pcp->count, pcp);
		pcp->count = 0;
	}
	local_irq_restore(flags);
}

/*
 * Drain pcplists of all zones on the indicated processor.
 *
 * The processor must either be the current processor and the
 * thread pinned to the current processor or a processor that
 * is not online.
 */
1922 */ 1923 static void drain_pages(unsigned int cpu) 1924 { 1925 struct zone *zone; 1926 1927 for_each_populated_zone(zone) { 1928 drain_pages_zone(cpu, zone); 1929 } 1930 } 1931 1932 /* 1933 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1934 * 1935 * The CPU has to be pinned. When zone parameter is non-NULL, spill just 1936 * the single zone's pages. 1937 */ 1938 void drain_local_pages(struct zone *zone) 1939 { 1940 int cpu = smp_processor_id(); 1941 1942 if (zone) 1943 drain_pages_zone(cpu, zone); 1944 else 1945 drain_pages(cpu); 1946 } 1947 1948 /* 1949 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1950 * 1951 * When zone parameter is non-NULL, spill just the single zone's pages. 1952 * 1953 * Note that this code is protected against sending an IPI to an offline 1954 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1955 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1956 * nothing keeps CPUs from showing up after we populated the cpumask and 1957 * before the call to on_each_cpu_mask(). 1958 */ 1959 void drain_all_pages(struct zone *zone) 1960 { 1961 int cpu; 1962 1963 /* 1964 * Allocate in the BSS so we wont require allocation in 1965 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1966 */ 1967 static cpumask_t cpus_with_pcps; 1968 1969 /* 1970 * We don't care about racing with CPU hotplug event 1971 * as offline notification will cause the notified 1972 * cpu to drain that CPU pcps and on_each_cpu_mask 1973 * disables preemption as part of its processing 1974 */ 1975 for_each_online_cpu(cpu) { 1976 struct per_cpu_pageset *pcp; 1977 struct zone *z; 1978 bool has_pcps = false; 1979 1980 if (zone) { 1981 pcp = per_cpu_ptr(zone->pageset, cpu); 1982 if (pcp->pcp.count) 1983 has_pcps = true; 1984 } else { 1985 for_each_populated_zone(z) { 1986 pcp = per_cpu_ptr(z->pageset, cpu); 1987 if (pcp->pcp.count) { 1988 has_pcps = true; 1989 break; 1990 } 1991 } 1992 } 1993 1994 if (has_pcps) 1995 cpumask_set_cpu(cpu, &cpus_with_pcps); 1996 else 1997 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1998 } 1999 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, 2000 zone, 1); 2001 } 2002 2003 #ifdef CONFIG_HIBERNATION 2004 2005 void mark_free_pages(struct zone *zone) 2006 { 2007 unsigned long pfn, max_zone_pfn; 2008 unsigned long flags; 2009 unsigned int order, t; 2010 struct page *page; 2011 2012 if (zone_is_empty(zone)) 2013 return; 2014 2015 spin_lock_irqsave(&zone->lock, flags); 2016 2017 max_zone_pfn = zone_end_pfn(zone); 2018 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 2019 if (pfn_valid(pfn)) { 2020 page = pfn_to_page(pfn); 2021 if (!swsusp_page_is_forbidden(page)) 2022 swsusp_unset_page_free(page); 2023 } 2024 2025 for_each_migratetype_order(order, t) { 2026 list_for_each_entry(page, 2027 &zone->free_area[order].free_list[t], lru) { 2028 unsigned long i; 2029 2030 pfn = page_to_pfn(page); 2031 for (i = 0; i < (1UL << order); i++) 2032 swsusp_set_page_free(pfn_to_page(pfn + i)); 2033 } 2034 } 2035 spin_unlock_irqrestore(&zone->lock, flags); 2036 } 2037 #endif /* CONFIG_PM */ 2038 2039 /* 2040 * Free a 0-order page 2041 * cold == true ? 
free a cold page : free a hot page 2042 */ 2043 void free_hot_cold_page(struct page *page, bool cold) 2044 { 2045 struct zone *zone = page_zone(page); 2046 struct per_cpu_pages *pcp; 2047 unsigned long flags; 2048 unsigned long pfn = page_to_pfn(page); 2049 int migratetype; 2050 2051 if (!free_pages_prepare(page, 0)) 2052 return; 2053 2054 migratetype = get_pfnblock_migratetype(page, pfn); 2055 set_pcppage_migratetype(page, migratetype); 2056 local_irq_save(flags); 2057 __count_vm_event(PGFREE); 2058 2059 /* 2060 * We only track unmovable, reclaimable and movable on pcp lists. 2061 * Free ISOLATE pages back to the allocator because they are being 2062 * offlined but treat RESERVE as movable pages so we can get those 2063 * areas back if necessary. Otherwise, we may have to free 2064 * excessively into the page allocator 2065 */ 2066 if (migratetype >= MIGRATE_PCPTYPES) { 2067 if (unlikely(is_migrate_isolate(migratetype))) { 2068 free_one_page(zone, page, pfn, 0, migratetype); 2069 goto out; 2070 } 2071 migratetype = MIGRATE_MOVABLE; 2072 } 2073 2074 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2075 if (!cold) 2076 list_add(&page->lru, &pcp->lists[migratetype]); 2077 else 2078 list_add_tail(&page->lru, &pcp->lists[migratetype]); 2079 pcp->count++; 2080 if (pcp->count >= pcp->high) { 2081 unsigned long batch = READ_ONCE(pcp->batch); 2082 free_pcppages_bulk(zone, batch, pcp); 2083 pcp->count -= batch; 2084 } 2085 2086 out: 2087 local_irq_restore(flags); 2088 } 2089 2090 /* 2091 * Free a list of 0-order pages 2092 */ 2093 void free_hot_cold_page_list(struct list_head *list, bool cold) 2094 { 2095 struct page *page, *next; 2096 2097 list_for_each_entry_safe(page, next, list, lru) { 2098 trace_mm_page_free_batched(page, cold); 2099 free_hot_cold_page(page, cold); 2100 } 2101 } 2102 2103 /* 2104 * split_page takes a non-compound higher-order page, and splits it into 2105 * n (1<<order) sub-pages: page[0..n] 2106 * Each sub-page must be freed individually. 2107 * 2108 * Note: this is probably too low level an operation for use in drivers. 2109 * Please consult with lkml before using this in your driver. 2110 */ 2111 void split_page(struct page *page, unsigned int order) 2112 { 2113 int i; 2114 gfp_t gfp_mask; 2115 2116 VM_BUG_ON_PAGE(PageCompound(page), page); 2117 VM_BUG_ON_PAGE(!page_count(page), page); 2118 2119 #ifdef CONFIG_KMEMCHECK 2120 /* 2121 * Split shadow pages too, because free(page[0]) would 2122 * otherwise free the whole shadow. 
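 * The shadow is split with the same order so that each sub-page keeps
 * its own, matching shadow page.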
2123 */ 2124 if (kmemcheck_page_is_tracked(page)) 2125 split_page(virt_to_page(page[0].shadow), order); 2126 #endif 2127 2128 gfp_mask = get_page_owner_gfp(page); 2129 set_page_owner(page, 0, gfp_mask); 2130 for (i = 1; i < (1 << order); i++) { 2131 set_page_refcounted(page + i); 2132 set_page_owner(page + i, 0, gfp_mask); 2133 } 2134 } 2135 EXPORT_SYMBOL_GPL(split_page); 2136 2137 int __isolate_free_page(struct page *page, unsigned int order) 2138 { 2139 unsigned long watermark; 2140 struct zone *zone; 2141 int mt; 2142 2143 BUG_ON(!PageBuddy(page)); 2144 2145 zone = page_zone(page); 2146 mt = get_pageblock_migratetype(page); 2147 2148 if (!is_migrate_isolate(mt)) { 2149 /* Obey watermarks as if the page was being allocated */ 2150 watermark = low_wmark_pages(zone) + (1 << order); 2151 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 2152 return 0; 2153 2154 __mod_zone_freepage_state(zone, -(1UL << order), mt); 2155 } 2156 2157 /* Remove page from free list */ 2158 list_del(&page->lru); 2159 zone->free_area[order].nr_free--; 2160 rmv_page_order(page); 2161 2162 set_page_owner(page, order, __GFP_MOVABLE); 2163 2164 /* Set the pageblock if the isolated page is at least a pageblock */ 2165 if (order >= pageblock_order - 1) { 2166 struct page *endpage = page + (1 << order) - 1; 2167 for (; page < endpage; page += pageblock_nr_pages) { 2168 int mt = get_pageblock_migratetype(page); 2169 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 2170 set_pageblock_migratetype(page, 2171 MIGRATE_MOVABLE); 2172 } 2173 } 2174 2175 2176 return 1UL << order; 2177 } 2178 2179 /* 2180 * Similar to split_page except the page is already free. As this is only 2181 * being used for migration, the migratetype of the block also changes. 2182 * As this is called with interrupts disabled, the caller is responsible 2183 * for calling arch_alloc_page() and kernel_map_page() after interrupts 2184 * are enabled. 2185 * 2186 * Note: this is probably too low level an operation for use in drivers. 2187 * Please consult with lkml before using this in your driver. 2188 */ 2189 int split_free_page(struct page *page) 2190 { 2191 unsigned int order; 2192 int nr_pages; 2193 2194 order = page_order(page); 2195 2196 nr_pages = __isolate_free_page(page, order); 2197 if (!nr_pages) 2198 return 0; 2199 2200 /* Split into individual pages */ 2201 set_page_refcounted(page); 2202 split_page(page, order); 2203 return nr_pages; 2204 } 2205 2206 /* 2207 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 2208 */ 2209 static inline 2210 struct page *buffered_rmqueue(struct zone *preferred_zone, 2211 struct zone *zone, unsigned int order, 2212 gfp_t gfp_flags, int alloc_flags, int migratetype) 2213 { 2214 unsigned long flags; 2215 struct page *page; 2216 bool cold = ((gfp_flags & __GFP_COLD) != 0); 2217 2218 if (likely(order == 0)) { 2219 struct per_cpu_pages *pcp; 2220 struct list_head *list; 2221 2222 local_irq_save(flags); 2223 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2224 list = &pcp->lists[migratetype]; 2225 if (list_empty(list)) { 2226 pcp->count += rmqueue_bulk(zone, 0, 2227 pcp->batch, list, 2228 migratetype, cold); 2229 if (unlikely(list_empty(list))) 2230 goto failed; 2231 } 2232 2233 if (cold) 2234 page = list_last_entry(list, struct page, lru); 2235 else 2236 page = list_first_entry(list, struct page, lru); 2237 2238 list_del(&page->lru); 2239 pcp->count--; 2240 } else { 2241 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 2242 /* 2243 * __GFP_NOFAIL is not to be used in new code. 
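 * Higher-order __GFP_NOFAIL requests are especially dangerous: a large
 * enough contiguous block may simply never become free, so the
 * allocator could be forced to loop indefinitely.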
2244 * 2245 * All __GFP_NOFAIL callers should be fixed so that they 2246 * properly detect and handle allocation failures. 2247 * 2248 * We most definitely don't want callers attempting to 2249 * allocate greater than order-1 page units with 2250 * __GFP_NOFAIL. 2251 */ 2252 WARN_ON_ONCE(order > 1); 2253 } 2254 spin_lock_irqsave(&zone->lock, flags); 2255 2256 page = NULL; 2257 if (alloc_flags & ALLOC_HARDER) { 2258 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2259 if (page) 2260 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2261 } 2262 if (!page) 2263 page = __rmqueue(zone, order, migratetype); 2264 spin_unlock(&zone->lock); 2265 if (!page) 2266 goto failed; 2267 __mod_zone_freepage_state(zone, -(1 << order), 2268 get_pcppage_migratetype(page)); 2269 } 2270 2271 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 2272 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && 2273 !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) 2274 set_bit(ZONE_FAIR_DEPLETED, &zone->flags); 2275 2276 __count_zone_vm_events(PGALLOC, zone, 1 << order); 2277 zone_statistics(preferred_zone, zone, gfp_flags); 2278 local_irq_restore(flags); 2279 2280 VM_BUG_ON_PAGE(bad_range(zone, page), page); 2281 return page; 2282 2283 failed: 2284 local_irq_restore(flags); 2285 return NULL; 2286 } 2287 2288 #ifdef CONFIG_FAIL_PAGE_ALLOC 2289 2290 static struct { 2291 struct fault_attr attr; 2292 2293 bool ignore_gfp_highmem; 2294 bool ignore_gfp_reclaim; 2295 u32 min_order; 2296 } fail_page_alloc = { 2297 .attr = FAULT_ATTR_INITIALIZER, 2298 .ignore_gfp_reclaim = true, 2299 .ignore_gfp_highmem = true, 2300 .min_order = 1, 2301 }; 2302 2303 static int __init setup_fail_page_alloc(char *str) 2304 { 2305 return setup_fault_attr(&fail_page_alloc.attr, str); 2306 } 2307 __setup("fail_page_alloc=", setup_fail_page_alloc); 2308 2309 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2310 { 2311 if (order < fail_page_alloc.min_order) 2312 return false; 2313 if (gfp_mask & __GFP_NOFAIL) 2314 return false; 2315 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 2316 return false; 2317 if (fail_page_alloc.ignore_gfp_reclaim && 2318 (gfp_mask & __GFP_DIRECT_RECLAIM)) 2319 return false; 2320 2321 return should_fail(&fail_page_alloc.attr, 1 << order); 2322 } 2323 2324 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 2325 2326 static int __init fail_page_alloc_debugfs(void) 2327 { 2328 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 2329 struct dentry *dir; 2330 2331 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 2332 &fail_page_alloc.attr); 2333 if (IS_ERR(dir)) 2334 return PTR_ERR(dir); 2335 2336 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 2337 &fail_page_alloc.ignore_gfp_reclaim)) 2338 goto fail; 2339 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 2340 &fail_page_alloc.ignore_gfp_highmem)) 2341 goto fail; 2342 if (!debugfs_create_u32("min-order", mode, dir, 2343 &fail_page_alloc.min_order)) 2344 goto fail; 2345 2346 return 0; 2347 fail: 2348 debugfs_remove_recursive(dir); 2349 2350 return -ENOMEM; 2351 } 2352 2353 late_initcall(fail_page_alloc_debugfs); 2354 2355 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 2356 2357 #else /* CONFIG_FAIL_PAGE_ALLOC */ 2358 2359 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2360 { 2361 return false; 2362 } 2363 2364 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 2365 2366 /* 2367 * Return true if free base pages are above 'mark'. 
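 * (Illustration: a GFP_ATOMIC caller gets both ALLOC_HIGH and
 * ALLOC_HARDER, so a mark of 1024 pages is reduced to 512 and then to
 * 384 before the classzone's lowmem_reserve is added to the cutoff.)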
For high-order checks it 2368 * will return true of the order-0 watermark is reached and there is at least 2369 * one free page of a suitable size. Checking now avoids taking the zone lock 2370 * to check in the allocation paths if no pages are free. 2371 */ 2372 static bool __zone_watermark_ok(struct zone *z, unsigned int order, 2373 unsigned long mark, int classzone_idx, int alloc_flags, 2374 long free_pages) 2375 { 2376 long min = mark; 2377 int o; 2378 const int alloc_harder = (alloc_flags & ALLOC_HARDER); 2379 2380 /* free_pages may go negative - that's OK */ 2381 free_pages -= (1 << order) - 1; 2382 2383 if (alloc_flags & ALLOC_HIGH) 2384 min -= min / 2; 2385 2386 /* 2387 * If the caller does not have rights to ALLOC_HARDER then subtract 2388 * the high-atomic reserves. This will over-estimate the size of the 2389 * atomic reserve but it avoids a search. 2390 */ 2391 if (likely(!alloc_harder)) 2392 free_pages -= z->nr_reserved_highatomic; 2393 else 2394 min -= min / 4; 2395 2396 #ifdef CONFIG_CMA 2397 /* If allocation can't use CMA areas don't use free CMA pages */ 2398 if (!(alloc_flags & ALLOC_CMA)) 2399 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); 2400 #endif 2401 2402 /* 2403 * Check watermarks for an order-0 allocation request. If these 2404 * are not met, then a high-order request also cannot go ahead 2405 * even if a suitable page happened to be free. 2406 */ 2407 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 2408 return false; 2409 2410 /* If this is an order-0 request then the watermark is fine */ 2411 if (!order) 2412 return true; 2413 2414 /* For a high-order request, check at least one suitable page is free */ 2415 for (o = order; o < MAX_ORDER; o++) { 2416 struct free_area *area = &z->free_area[o]; 2417 int mt; 2418 2419 if (!area->nr_free) 2420 continue; 2421 2422 if (alloc_harder) 2423 return true; 2424 2425 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 2426 if (!list_empty(&area->free_list[mt])) 2427 return true; 2428 } 2429 2430 #ifdef CONFIG_CMA 2431 if ((alloc_flags & ALLOC_CMA) && 2432 !list_empty(&area->free_list[MIGRATE_CMA])) { 2433 return true; 2434 } 2435 #endif 2436 } 2437 return false; 2438 } 2439 2440 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 2441 int classzone_idx, int alloc_flags) 2442 { 2443 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 2444 zone_page_state(z, NR_FREE_PAGES)); 2445 } 2446 2447 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 2448 unsigned long mark, int classzone_idx) 2449 { 2450 long free_pages = zone_page_state(z, NR_FREE_PAGES); 2451 2452 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 2453 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 2454 2455 return __zone_watermark_ok(z, order, mark, classzone_idx, 0, 2456 free_pages); 2457 } 2458 2459 #ifdef CONFIG_NUMA 2460 static bool zone_local(struct zone *local_zone, struct zone *zone) 2461 { 2462 return local_zone->node == zone->node; 2463 } 2464 2465 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 2466 { 2467 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < 2468 RECLAIM_DISTANCE; 2469 } 2470 #else /* CONFIG_NUMA */ 2471 static bool zone_local(struct zone *local_zone, struct zone *zone) 2472 { 2473 return true; 2474 } 2475 2476 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 2477 { 2478 return true; 2479 } 2480 #endif /* CONFIG_NUMA */ 2481 2482 static void reset_alloc_batches(struct zone 
*preferred_zone) 2483 { 2484 struct zone *zone = preferred_zone->zone_pgdat->node_zones; 2485 2486 do { 2487 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2488 high_wmark_pages(zone) - low_wmark_pages(zone) - 2489 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2490 clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); 2491 } while (zone++ != preferred_zone); 2492 } 2493 2494 /* 2495 * get_page_from_freelist goes through the zonelist trying to allocate 2496 * a page. 2497 */ 2498 static struct page * 2499 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 2500 const struct alloc_context *ac) 2501 { 2502 struct zonelist *zonelist = ac->zonelist; 2503 struct zoneref *z; 2504 struct page *page = NULL; 2505 struct zone *zone; 2506 int nr_fair_skipped = 0; 2507 bool zonelist_rescan; 2508 2509 zonelist_scan: 2510 zonelist_rescan = false; 2511 2512 /* 2513 * Scan zonelist, looking for a zone with enough free. 2514 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 2515 */ 2516 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, 2517 ac->nodemask) { 2518 unsigned long mark; 2519 2520 if (cpusets_enabled() && 2521 (alloc_flags & ALLOC_CPUSET) && 2522 !cpuset_zone_allowed(zone, gfp_mask)) 2523 continue; 2524 /* 2525 * Distribute pages in proportion to the individual 2526 * zone size to ensure fair page aging. The zone a 2527 * page was allocated in should have no effect on the 2528 * time the page has in memory before being reclaimed. 2529 */ 2530 if (alloc_flags & ALLOC_FAIR) { 2531 if (!zone_local(ac->preferred_zone, zone)) 2532 break; 2533 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { 2534 nr_fair_skipped++; 2535 continue; 2536 } 2537 } 2538 /* 2539 * When allocating a page cache page for writing, we 2540 * want to get it from a zone that is within its dirty 2541 * limit, such that no single zone holds more than its 2542 * proportional share of globally allowed dirty pages. 2543 * The dirty limits take into account the zone's 2544 * lowmem reserves and high watermark so that kswapd 2545 * should be able to balance it without having to 2546 * write pages from its LRU list. 2547 * 2548 * This may look like it could increase pressure on 2549 * lower zones by failing allocations in higher zones 2550 * before they are full. But the pages that do spill 2551 * over are limited as the lower zones are protected 2552 * by this very same mechanism. It should not become 2553 * a practical burden to them. 2554 * 2555 * XXX: For now, allow allocations to potentially 2556 * exceed the per-zone dirty limit in the slowpath 2557 * (spread_dirty_pages unset) before going into reclaim, 2558 * which is important when on a NUMA setup the allowed 2559 * zones are together not big enough to reach the 2560 * global limit. The proper fix for these situations 2561 * will require awareness of zones in the 2562 * dirty-throttling and the flusher threads. 
2563 */ 2564 if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) 2565 continue; 2566 2567 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 2568 if (!zone_watermark_ok(zone, order, mark, 2569 ac->classzone_idx, alloc_flags)) { 2570 int ret; 2571 2572 /* Checked here to keep the fast path fast */ 2573 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 2574 if (alloc_flags & ALLOC_NO_WATERMARKS) 2575 goto try_this_zone; 2576 2577 if (zone_reclaim_mode == 0 || 2578 !zone_allows_reclaim(ac->preferred_zone, zone)) 2579 continue; 2580 2581 ret = zone_reclaim(zone, gfp_mask, order); 2582 switch (ret) { 2583 case ZONE_RECLAIM_NOSCAN: 2584 /* did not scan */ 2585 continue; 2586 case ZONE_RECLAIM_FULL: 2587 /* scanned but unreclaimable */ 2588 continue; 2589 default: 2590 /* did we reclaim enough */ 2591 if (zone_watermark_ok(zone, order, mark, 2592 ac->classzone_idx, alloc_flags)) 2593 goto try_this_zone; 2594 2595 continue; 2596 } 2597 } 2598 2599 try_this_zone: 2600 page = buffered_rmqueue(ac->preferred_zone, zone, order, 2601 gfp_mask, alloc_flags, ac->migratetype); 2602 if (page) { 2603 if (prep_new_page(page, order, gfp_mask, alloc_flags)) 2604 goto try_this_zone; 2605 2606 /* 2607 * If this is a high-order atomic allocation then check 2608 * if the pageblock should be reserved for the future 2609 */ 2610 if (unlikely(order && (alloc_flags & ALLOC_HARDER))) 2611 reserve_highatomic_pageblock(page, zone, order); 2612 2613 return page; 2614 } 2615 } 2616 2617 /* 2618 * The first pass makes sure allocations are spread fairly within the 2619 * local node. However, the local node might have free pages left 2620 * after the fairness batches are exhausted, and remote zones haven't 2621 * even been considered yet. Try once more without fairness, and 2622 * include remote zones now, before entering the slowpath and waking 2623 * kswapd: prefer spilling to a remote zone over swapping locally. 2624 */ 2625 if (alloc_flags & ALLOC_FAIR) { 2626 alloc_flags &= ~ALLOC_FAIR; 2627 if (nr_fair_skipped) { 2628 zonelist_rescan = true; 2629 reset_alloc_batches(ac->preferred_zone); 2630 } 2631 if (nr_online_nodes > 1) 2632 zonelist_rescan = true; 2633 } 2634 2635 if (zonelist_rescan) 2636 goto zonelist_scan; 2637 2638 return NULL; 2639 } 2640 2641 /* 2642 * Large machines with many possible nodes should not always dump per-node 2643 * meminfo in irq context. 2644 */ 2645 static inline bool should_suppress_show_mem(void) 2646 { 2647 bool ret = false; 2648 2649 #if NODES_SHIFT > 8 2650 ret = in_interrupt(); 2651 #endif 2652 return ret; 2653 } 2654 2655 static DEFINE_RATELIMIT_STATE(nopage_rs, 2656 DEFAULT_RATELIMIT_INTERVAL, 2657 DEFAULT_RATELIMIT_BURST); 2658 2659 void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) 2660 { 2661 unsigned int filter = SHOW_MEM_FILTER_NODES; 2662 2663 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2664 debug_guardpage_minorder() > 0) 2665 return; 2666 2667 /* 2668 * This documents exceptions given to allocations in certain 2669 * contexts that are allowed to allocate outside current's set 2670 * of allowed nodes. 
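 * In particular, the node filter is dropped for OOM victims (TIF_MEMDIE)
 * and PF_MEMALLOC/PF_EXITING tasks (unless __GFP_NOMEMALLOC is set), and
 * for allocations from interrupt context or without __GFP_DIRECT_RECLAIM,
 * since those may legitimately fall back to nodes outside the allowed set.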
2671 */ 2672 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2673 if (test_thread_flag(TIF_MEMDIE) || 2674 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2675 filter &= ~SHOW_MEM_FILTER_NODES; 2676 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 2677 filter &= ~SHOW_MEM_FILTER_NODES; 2678 2679 if (fmt) { 2680 struct va_format vaf; 2681 va_list args; 2682 2683 va_start(args, fmt); 2684 2685 vaf.fmt = fmt; 2686 vaf.va = &args; 2687 2688 pr_warn("%pV", &vaf); 2689 2690 va_end(args); 2691 } 2692 2693 pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", 2694 current->comm, order, gfp_mask); 2695 2696 dump_stack(); 2697 if (!should_suppress_show_mem()) 2698 show_mem(filter); 2699 } 2700 2701 static inline struct page * 2702 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2703 const struct alloc_context *ac, unsigned long *did_some_progress) 2704 { 2705 struct oom_control oc = { 2706 .zonelist = ac->zonelist, 2707 .nodemask = ac->nodemask, 2708 .gfp_mask = gfp_mask, 2709 .order = order, 2710 }; 2711 struct page *page; 2712 2713 *did_some_progress = 0; 2714 2715 /* 2716 * Acquire the oom lock. If that fails, somebody else is 2717 * making progress for us. 2718 */ 2719 if (!mutex_trylock(&oom_lock)) { 2720 *did_some_progress = 1; 2721 schedule_timeout_uninterruptible(1); 2722 return NULL; 2723 } 2724 2725 /* 2726 * Go through the zonelist yet one more time, keep very high watermark 2727 * here, this is only to catch a parallel oom killing, we must fail if 2728 * we're still under heavy pressure. 2729 */ 2730 page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, 2731 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 2732 if (page) 2733 goto out; 2734 2735 if (!(gfp_mask & __GFP_NOFAIL)) { 2736 /* Coredumps can quickly deplete all memory reserves */ 2737 if (current->flags & PF_DUMPCORE) 2738 goto out; 2739 /* The OOM killer will not help higher order allocs */ 2740 if (order > PAGE_ALLOC_COSTLY_ORDER) 2741 goto out; 2742 /* The OOM killer does not needlessly kill tasks for lowmem */ 2743 if (ac->high_zoneidx < ZONE_NORMAL) 2744 goto out; 2745 /* The OOM killer does not compensate for IO-less reclaim */ 2746 if (!(gfp_mask & __GFP_FS)) { 2747 /* 2748 * XXX: Page reclaim didn't yield anything, 2749 * and the OOM killer can't be invoked, but 2750 * keep looping as per tradition. 
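 * Setting *did_some_progress makes __alloc_pages_slowpath() retry
 * (see its "Retry as long as the OOM killer is making progress"
 * check) instead of failing the allocation outright.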
2751 */ 2752 *did_some_progress = 1; 2753 goto out; 2754 } 2755 if (pm_suspended_storage()) 2756 goto out; 2757 /* The OOM killer may not free memory on a specific node */ 2758 if (gfp_mask & __GFP_THISNODE) 2759 goto out; 2760 } 2761 /* Exhausted what can be done so it's blamo time */ 2762 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { 2763 *did_some_progress = 1; 2764 2765 if (gfp_mask & __GFP_NOFAIL) { 2766 page = get_page_from_freelist(gfp_mask, order, 2767 ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); 2768 /* 2769 * fallback to ignore cpuset restriction if our nodes 2770 * are depleted 2771 */ 2772 if (!page) 2773 page = get_page_from_freelist(gfp_mask, order, 2774 ALLOC_NO_WATERMARKS, ac); 2775 } 2776 } 2777 out: 2778 mutex_unlock(&oom_lock); 2779 return page; 2780 } 2781 2782 #ifdef CONFIG_COMPACTION 2783 /* Try memory compaction for high-order allocations before reclaim */ 2784 static struct page * 2785 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2786 int alloc_flags, const struct alloc_context *ac, 2787 enum migrate_mode mode, int *contended_compaction, 2788 bool *deferred_compaction) 2789 { 2790 unsigned long compact_result; 2791 struct page *page; 2792 2793 if (!order) 2794 return NULL; 2795 2796 current->flags |= PF_MEMALLOC; 2797 compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 2798 mode, contended_compaction); 2799 current->flags &= ~PF_MEMALLOC; 2800 2801 switch (compact_result) { 2802 case COMPACT_DEFERRED: 2803 *deferred_compaction = true; 2804 /* fall-through */ 2805 case COMPACT_SKIPPED: 2806 return NULL; 2807 default: 2808 break; 2809 } 2810 2811 /* 2812 * At least in one zone compaction wasn't deferred or skipped, so let's 2813 * count a compaction stall 2814 */ 2815 count_vm_event(COMPACTSTALL); 2816 2817 page = get_page_from_freelist(gfp_mask, order, 2818 alloc_flags & ~ALLOC_NO_WATERMARKS, ac); 2819 2820 if (page) { 2821 struct zone *zone = page_zone(page); 2822 2823 zone->compact_blockskip_flush = false; 2824 compaction_defer_reset(zone, order, true); 2825 count_vm_event(COMPACTSUCCESS); 2826 return page; 2827 } 2828 2829 /* 2830 * It's bad if compaction run occurs and fails. The most likely reason 2831 * is that pages exist, but not enough to satisfy watermarks. 
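 * The failure is accounted as COMPACTFAIL so it shows up in vmstat,
 * and the slowpath then falls back to direct reclaim.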
2832 */ 2833 count_vm_event(COMPACTFAIL); 2834 2835 cond_resched(); 2836 2837 return NULL; 2838 } 2839 #else 2840 static inline struct page * 2841 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2842 int alloc_flags, const struct alloc_context *ac, 2843 enum migrate_mode mode, int *contended_compaction, 2844 bool *deferred_compaction) 2845 { 2846 return NULL; 2847 } 2848 #endif /* CONFIG_COMPACTION */ 2849 2850 /* Perform direct synchronous page reclaim */ 2851 static int 2852 __perform_reclaim(gfp_t gfp_mask, unsigned int order, 2853 const struct alloc_context *ac) 2854 { 2855 struct reclaim_state reclaim_state; 2856 int progress; 2857 2858 cond_resched(); 2859 2860 /* We now go into synchronous reclaim */ 2861 cpuset_memory_pressure_bump(); 2862 current->flags |= PF_MEMALLOC; 2863 lockdep_set_current_reclaim_state(gfp_mask); 2864 reclaim_state.reclaimed_slab = 0; 2865 current->reclaim_state = &reclaim_state; 2866 2867 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, 2868 ac->nodemask); 2869 2870 current->reclaim_state = NULL; 2871 lockdep_clear_current_reclaim_state(); 2872 current->flags &= ~PF_MEMALLOC; 2873 2874 cond_resched(); 2875 2876 return progress; 2877 } 2878 2879 /* The really slow allocator path where we enter direct reclaim */ 2880 static inline struct page * 2881 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2882 int alloc_flags, const struct alloc_context *ac, 2883 unsigned long *did_some_progress) 2884 { 2885 struct page *page = NULL; 2886 bool drained = false; 2887 2888 *did_some_progress = __perform_reclaim(gfp_mask, order, ac); 2889 if (unlikely(!(*did_some_progress))) 2890 return NULL; 2891 2892 retry: 2893 page = get_page_from_freelist(gfp_mask, order, 2894 alloc_flags & ~ALLOC_NO_WATERMARKS, ac); 2895 2896 /* 2897 * If an allocation failed after direct reclaim, it could be because 2898 * pages are pinned on the per-cpu lists or in high alloc reserves. 2899 * Shrink them them and try again 2900 */ 2901 if (!page && !drained) { 2902 unreserve_highatomic_pageblock(ac); 2903 drain_all_pages(NULL); 2904 drained = true; 2905 goto retry; 2906 } 2907 2908 return page; 2909 } 2910 2911 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) 2912 { 2913 struct zoneref *z; 2914 struct zone *zone; 2915 2916 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 2917 ac->high_zoneidx, ac->nodemask) 2918 wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); 2919 } 2920 2921 static inline int 2922 gfp_to_alloc_flags(gfp_t gfp_mask) 2923 { 2924 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2925 2926 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2927 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2928 2929 /* 2930 * The caller may dip into page reserves a bit more if the caller 2931 * cannot run direct reclaim, or if the caller has realtime scheduling 2932 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2933 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). 2934 */ 2935 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2936 2937 if (gfp_mask & __GFP_ATOMIC) { 2938 /* 2939 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2940 * if it can't schedule. 2941 */ 2942 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2943 alloc_flags |= ALLOC_HARDER; 2944 /* 2945 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2946 * comment for __cpuset_node_allowed(). 
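 * As an illustration of the resulting flags for a typical non-realtime
 * task (derived from the checks in this function, not exhaustive):
 *
 *	GFP_KERNEL  -> ALLOC_WMARK_MIN | ALLOC_CPUSET
 *	GFP_ATOMIC  -> ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER
 *		       (with ALLOC_CPUSET cleared here)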
2947 */ 2948 alloc_flags &= ~ALLOC_CPUSET; 2949 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2950 alloc_flags |= ALLOC_HARDER; 2951 2952 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2953 if (gfp_mask & __GFP_MEMALLOC) 2954 alloc_flags |= ALLOC_NO_WATERMARKS; 2955 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2956 alloc_flags |= ALLOC_NO_WATERMARKS; 2957 else if (!in_interrupt() && 2958 ((current->flags & PF_MEMALLOC) || 2959 unlikely(test_thread_flag(TIF_MEMDIE)))) 2960 alloc_flags |= ALLOC_NO_WATERMARKS; 2961 } 2962 #ifdef CONFIG_CMA 2963 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2964 alloc_flags |= ALLOC_CMA; 2965 #endif 2966 return alloc_flags; 2967 } 2968 2969 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2970 { 2971 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2972 } 2973 2974 static inline bool is_thp_gfp_mask(gfp_t gfp_mask) 2975 { 2976 return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; 2977 } 2978 2979 static inline struct page * 2980 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2981 struct alloc_context *ac) 2982 { 2983 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; 2984 struct page *page = NULL; 2985 int alloc_flags; 2986 unsigned long pages_reclaimed = 0; 2987 unsigned long did_some_progress; 2988 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2989 bool deferred_compaction = false; 2990 int contended_compaction = COMPACT_CONTENDED_NONE; 2991 2992 /* 2993 * In the slowpath, we sanity check order to avoid ever trying to 2994 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2995 * be using allocators in order of preference for an area that is 2996 * too large. 2997 */ 2998 if (order >= MAX_ORDER) { 2999 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 3000 return NULL; 3001 } 3002 3003 /* 3004 * We also sanity check to catch abuse of atomic reserves being used by 3005 * callers that are not in atomic context. 3006 */ 3007 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == 3008 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) 3009 gfp_mask &= ~__GFP_ATOMIC; 3010 3011 /* 3012 * If this allocation cannot block and it is for a specific node, then 3013 * fail early. There's no need to wakeup kswapd or retry for a 3014 * speculative node-specific allocation. 3015 */ 3016 if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) 3017 goto nopage; 3018 3019 retry: 3020 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3021 wake_all_kswapds(order, ac); 3022 3023 /* 3024 * OK, we're below the kswapd watermark and have kicked background 3025 * reclaim. Now things get more complex, so set up alloc_flags according 3026 * to how we want to proceed. 3027 */ 3028 alloc_flags = gfp_to_alloc_flags(gfp_mask); 3029 3030 /* 3031 * Find the true preferred zone if the allocation is unconstrained by 3032 * cpusets. 3033 */ 3034 if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { 3035 struct zoneref *preferred_zoneref; 3036 preferred_zoneref = first_zones_zonelist(ac->zonelist, 3037 ac->high_zoneidx, NULL, &ac->preferred_zone); 3038 ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); 3039 } 3040 3041 /* This is the last chance, in general, before the goto nopage. 
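 * (One more watermark-obeying attempt, now with the slowpath flags from
 * gfp_to_alloc_flags(), e.g. ALLOC_WMARK_MIN rather than the fast
 * path's ALLOC_WMARK_LOW.)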
*/ 3042 page = get_page_from_freelist(gfp_mask, order, 3043 alloc_flags & ~ALLOC_NO_WATERMARKS, ac); 3044 if (page) 3045 goto got_pg; 3046 3047 /* Allocate without watermarks if the context allows */ 3048 if (alloc_flags & ALLOC_NO_WATERMARKS) { 3049 /* 3050 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 3051 * the allocation is high priority and these type of 3052 * allocations are system rather than user orientated 3053 */ 3054 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); 3055 page = get_page_from_freelist(gfp_mask, order, 3056 ALLOC_NO_WATERMARKS, ac); 3057 if (page) 3058 goto got_pg; 3059 } 3060 3061 /* Caller is not willing to reclaim, we can't balance anything */ 3062 if (!can_direct_reclaim) { 3063 /* 3064 * All existing users of the __GFP_NOFAIL are blockable, so warn 3065 * of any new users that actually allow this type of allocation 3066 * to fail. 3067 */ 3068 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); 3069 goto nopage; 3070 } 3071 3072 /* Avoid recursion of direct reclaim */ 3073 if (current->flags & PF_MEMALLOC) { 3074 /* 3075 * __GFP_NOFAIL request from this context is rather bizarre 3076 * because we cannot reclaim anything and only can loop waiting 3077 * for somebody to do a work for us. 3078 */ 3079 if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { 3080 cond_resched(); 3081 goto retry; 3082 } 3083 goto nopage; 3084 } 3085 3086 /* Avoid allocations with no watermarks from looping endlessly */ 3087 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 3088 goto nopage; 3089 3090 /* 3091 * Try direct compaction. The first pass is asynchronous. Subsequent 3092 * attempts after direct reclaim are synchronous 3093 */ 3094 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 3095 migration_mode, 3096 &contended_compaction, 3097 &deferred_compaction); 3098 if (page) 3099 goto got_pg; 3100 3101 /* Checks for THP-specific high-order allocations */ 3102 if (is_thp_gfp_mask(gfp_mask)) { 3103 /* 3104 * If compaction is deferred for high-order allocations, it is 3105 * because sync compaction recently failed. If this is the case 3106 * and the caller requested a THP allocation, we do not want 3107 * to heavily disrupt the system, so we fail the allocation 3108 * instead of entering direct reclaim. 3109 */ 3110 if (deferred_compaction) 3111 goto nopage; 3112 3113 /* 3114 * In all zones where compaction was attempted (and not 3115 * deferred or skipped), lock contention has been detected. 3116 * For THP allocation we do not want to disrupt the others 3117 * so we fallback to base pages instead. 3118 */ 3119 if (contended_compaction == COMPACT_CONTENDED_LOCK) 3120 goto nopage; 3121 3122 /* 3123 * If compaction was aborted due to need_resched(), we do not 3124 * want to further increase allocation latency, unless it is 3125 * khugepaged trying to collapse. 3126 */ 3127 if (contended_compaction == COMPACT_CONTENDED_SCHED 3128 && !(current->flags & PF_KTHREAD)) 3129 goto nopage; 3130 } 3131 3132 /* 3133 * It can become very expensive to allocate transparent hugepages at 3134 * fault, so use asynchronous memory compaction for THP unless it is 3135 * khugepaged trying to collapse. 
3136 */ 3137 if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD)) 3138 migration_mode = MIGRATE_SYNC_LIGHT; 3139 3140 /* Try direct reclaim and then allocating */ 3141 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 3142 &did_some_progress); 3143 if (page) 3144 goto got_pg; 3145 3146 /* Do not loop if specifically requested */ 3147 if (gfp_mask & __GFP_NORETRY) 3148 goto noretry; 3149 3150 /* Keep reclaiming pages as long as there is reasonable progress */ 3151 pages_reclaimed += did_some_progress; 3152 if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || 3153 ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { 3154 /* Wait for some write requests to complete then retry */ 3155 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); 3156 goto retry; 3157 } 3158 3159 /* Reclaim has failed us, start killing things */ 3160 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 3161 if (page) 3162 goto got_pg; 3163 3164 /* Retry as long as the OOM killer is making progress */ 3165 if (did_some_progress) 3166 goto retry; 3167 3168 noretry: 3169 /* 3170 * High-order allocations do not necessarily loop after 3171 * direct reclaim and reclaim/compaction depends on compaction 3172 * being called after reclaim so call directly if necessary 3173 */ 3174 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, 3175 ac, migration_mode, 3176 &contended_compaction, 3177 &deferred_compaction); 3178 if (page) 3179 goto got_pg; 3180 nopage: 3181 warn_alloc_failed(gfp_mask, order, NULL); 3182 got_pg: 3183 return page; 3184 } 3185 3186 /* 3187 * This is the 'heart' of the zoned buddy allocator. 3188 */ 3189 struct page * 3190 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 3191 struct zonelist *zonelist, nodemask_t *nodemask) 3192 { 3193 struct zoneref *preferred_zoneref; 3194 struct page *page = NULL; 3195 unsigned int cpuset_mems_cookie; 3196 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 3197 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ 3198 struct alloc_context ac = { 3199 .high_zoneidx = gfp_zone(gfp_mask), 3200 .nodemask = nodemask, 3201 .migratetype = gfpflags_to_migratetype(gfp_mask), 3202 }; 3203 3204 gfp_mask &= gfp_allowed_mask; 3205 3206 lockdep_trace_alloc(gfp_mask); 3207 3208 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 3209 3210 if (should_fail_alloc_page(gfp_mask, order)) 3211 return NULL; 3212 3213 /* 3214 * Check the zones suitable for the gfp_mask contain at least one 3215 * valid zone. It's possible to have an empty zonelist as a result 3216 * of __GFP_THISNODE and a memoryless node 3217 */ 3218 if (unlikely(!zonelist->_zonerefs->zone)) 3219 return NULL; 3220 3221 if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) 3222 alloc_flags |= ALLOC_CMA; 3223 3224 retry_cpuset: 3225 cpuset_mems_cookie = read_mems_allowed_begin(); 3226 3227 /* We set it here, as __alloc_pages_slowpath might have changed it */ 3228 ac.zonelist = zonelist; 3229 3230 /* Dirty zone balancing only done in the fast path */ 3231 ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); 3232 3233 /* The preferred zone is used for statistics later */ 3234 preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, 3235 ac.nodemask ? 
: &cpuset_current_mems_allowed, 3236 &ac.preferred_zone); 3237 if (!ac.preferred_zone) 3238 goto out; 3239 ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); 3240 3241 /* First allocation attempt */ 3242 alloc_mask = gfp_mask|__GFP_HARDWALL; 3243 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 3244 if (unlikely(!page)) { 3245 /* 3246 * Runtime PM, block IO and its error handling path 3247 * can deadlock because I/O on the device might not 3248 * complete. 3249 */ 3250 alloc_mask = memalloc_noio_flags(gfp_mask); 3251 ac.spread_dirty_pages = false; 3252 3253 page = __alloc_pages_slowpath(alloc_mask, order, &ac); 3254 } 3255 3256 if (kmemcheck_enabled && page) 3257 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 3258 3259 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 3260 3261 out: 3262 /* 3263 * When updating a task's mems_allowed, it is possible to race with 3264 * parallel threads in such a way that an allocation can fail while 3265 * the mask is being updated. If a page allocation is about to fail, 3266 * check if the cpuset changed during allocation and if so, retry. 3267 */ 3268 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 3269 goto retry_cpuset; 3270 3271 return page; 3272 } 3273 EXPORT_SYMBOL(__alloc_pages_nodemask); 3274 3275 /* 3276 * Common helper functions. 3277 */ 3278 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 3279 { 3280 struct page *page; 3281 3282 /* 3283 * __get_free_pages() returns a 32-bit address, which cannot represent 3284 * a highmem page 3285 */ 3286 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 3287 3288 page = alloc_pages(gfp_mask, order); 3289 if (!page) 3290 return 0; 3291 return (unsigned long) page_address(page); 3292 } 3293 EXPORT_SYMBOL(__get_free_pages); 3294 3295 unsigned long get_zeroed_page(gfp_t gfp_mask) 3296 { 3297 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 3298 } 3299 EXPORT_SYMBOL(get_zeroed_page); 3300 3301 void __free_pages(struct page *page, unsigned int order) 3302 { 3303 if (put_page_testzero(page)) { 3304 if (order == 0) 3305 free_hot_cold_page(page, false); 3306 else 3307 __free_pages_ok(page, order); 3308 } 3309 } 3310 3311 EXPORT_SYMBOL(__free_pages); 3312 3313 void free_pages(unsigned long addr, unsigned int order) 3314 { 3315 if (addr != 0) { 3316 VM_BUG_ON(!virt_addr_valid((void *)addr)); 3317 __free_pages(virt_to_page((void *)addr), order); 3318 } 3319 } 3320 3321 EXPORT_SYMBOL(free_pages); 3322 3323 /* 3324 * Page Fragment: 3325 * An arbitrary-length arbitrary-offset area of memory which resides 3326 * within a 0 or higher order page. Multiple fragments within that page 3327 * are individually refcounted, in the page's reference counter. 3328 * 3329 * The page_frag functions below provide a simple allocation framework for 3330 * page fragments. This is used by the network stack and network device 3331 * drivers to provide a backing region of memory for use as either an 3332 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. 3333 */ 3334 static struct page *__page_frag_refill(struct page_frag_cache *nc, 3335 gfp_t gfp_mask) 3336 { 3337 struct page *page = NULL; 3338 gfp_t gfp = gfp_mask; 3339 3340 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 3341 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 3342 __GFP_NOMEMALLOC; 3343 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 3344 PAGE_FRAG_CACHE_MAX_ORDER); 3345 nc->size = page ? 
PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; 3346 #endif 3347 if (unlikely(!page)) 3348 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 3349 3350 nc->va = page ? page_address(page) : NULL; 3351 3352 return page; 3353 } 3354 3355 void *__alloc_page_frag(struct page_frag_cache *nc, 3356 unsigned int fragsz, gfp_t gfp_mask) 3357 { 3358 unsigned int size = PAGE_SIZE; 3359 struct page *page; 3360 int offset; 3361 3362 if (unlikely(!nc->va)) { 3363 refill: 3364 page = __page_frag_refill(nc, gfp_mask); 3365 if (!page) 3366 return NULL; 3367 3368 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 3369 /* if size can vary use size else just use PAGE_SIZE */ 3370 size = nc->size; 3371 #endif 3372 /* Even if we own the page, we do not use atomic_set(). 3373 * This would break get_page_unless_zero() users. 3374 */ 3375 atomic_add(size - 1, &page->_count); 3376 3377 /* reset page count bias and offset to start of new frag */ 3378 nc->pfmemalloc = page_is_pfmemalloc(page); 3379 nc->pagecnt_bias = size; 3380 nc->offset = size; 3381 } 3382 3383 offset = nc->offset - fragsz; 3384 if (unlikely(offset < 0)) { 3385 page = virt_to_page(nc->va); 3386 3387 if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) 3388 goto refill; 3389 3390 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 3391 /* if size can vary use size else just use PAGE_SIZE */ 3392 size = nc->size; 3393 #endif 3394 /* OK, page count is 0, we can safely set it */ 3395 atomic_set(&page->_count, size); 3396 3397 /* reset page count bias and offset to start of new frag */ 3398 nc->pagecnt_bias = size; 3399 offset = size - fragsz; 3400 } 3401 3402 nc->pagecnt_bias--; 3403 nc->offset = offset; 3404 3405 return nc->va + offset; 3406 } 3407 EXPORT_SYMBOL(__alloc_page_frag); 3408 3409 /* 3410 * Frees a page fragment allocated out of either a compound or order 0 page. 3411 */ 3412 void __free_page_frag(void *addr) 3413 { 3414 struct page *page = virt_to_head_page(addr); 3415 3416 if (unlikely(put_page_testzero(page))) 3417 __free_pages_ok(page, compound_order(page)); 3418 } 3419 EXPORT_SYMBOL(__free_page_frag); 3420 3421 /* 3422 * alloc_kmem_pages charges newly allocated pages to the kmem resource counter 3423 * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is 3424 * equivalent to alloc_pages. 3425 * 3426 * It should be used when the caller would like to use kmalloc, but since the 3427 * allocation is large, it has to fall back to the page allocator. 3428 */ 3429 struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) 3430 { 3431 struct page *page; 3432 3433 page = alloc_pages(gfp_mask, order); 3434 if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { 3435 __free_pages(page, order); 3436 page = NULL; 3437 } 3438 return page; 3439 } 3440 3441 struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) 3442 { 3443 struct page *page; 3444 3445 page = alloc_pages_node(nid, gfp_mask, order); 3446 if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { 3447 __free_pages(page, order); 3448 page = NULL; 3449 } 3450 return page; 3451 } 3452 3453 /* 3454 * __free_kmem_pages and free_kmem_pages will free pages allocated with 3455 * alloc_kmem_pages. 
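 * Illustrative pairing (hypothetical caller, not part of this file):
 *
 *	page = alloc_kmem_pages(GFP_KERNEL | __GFP_ACCOUNT, 2);
 *	if (page) {
 *		...
 *		__free_kmem_pages(page, 2);
 *	}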
3456 */ 3457 void __free_kmem_pages(struct page *page, unsigned int order) 3458 { 3459 memcg_kmem_uncharge(page, order); 3460 __free_pages(page, order); 3461 } 3462 3463 void free_kmem_pages(unsigned long addr, unsigned int order) 3464 { 3465 if (addr != 0) { 3466 VM_BUG_ON(!virt_addr_valid((void *)addr)); 3467 __free_kmem_pages(virt_to_page((void *)addr), order); 3468 } 3469 } 3470 3471 static void *make_alloc_exact(unsigned long addr, unsigned int order, 3472 size_t size) 3473 { 3474 if (addr) { 3475 unsigned long alloc_end = addr + (PAGE_SIZE << order); 3476 unsigned long used = addr + PAGE_ALIGN(size); 3477 3478 split_page(virt_to_page((void *)addr), order); 3479 while (used < alloc_end) { 3480 free_page(used); 3481 used += PAGE_SIZE; 3482 } 3483 } 3484 return (void *)addr; 3485 } 3486 3487 /** 3488 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 3489 * @size: the number of bytes to allocate 3490 * @gfp_mask: GFP flags for the allocation 3491 * 3492 * This function is similar to alloc_pages(), except that it allocates the 3493 * minimum number of pages to satisfy the request. alloc_pages() can only 3494 * allocate memory in power-of-two pages. 3495 * 3496 * This function is also limited by MAX_ORDER. 3497 * 3498 * Memory allocated by this function must be released by free_pages_exact(). 3499 */ 3500 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 3501 { 3502 unsigned int order = get_order(size); 3503 unsigned long addr; 3504 3505 addr = __get_free_pages(gfp_mask, order); 3506 return make_alloc_exact(addr, order, size); 3507 } 3508 EXPORT_SYMBOL(alloc_pages_exact); 3509 3510 /** 3511 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 3512 * pages on a node. 3513 * @nid: the preferred node ID where memory should be allocated 3514 * @size: the number of bytes to allocate 3515 * @gfp_mask: GFP flags for the allocation 3516 * 3517 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 3518 * back. 3519 */ 3520 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 3521 { 3522 unsigned int order = get_order(size); 3523 struct page *p = alloc_pages_node(nid, gfp_mask, order); 3524 if (!p) 3525 return NULL; 3526 return make_alloc_exact((unsigned long)page_address(p), order, size); 3527 } 3528 3529 /** 3530 * free_pages_exact - release memory allocated via alloc_pages_exact() 3531 * @virt: the value returned by alloc_pages_exact. 3532 * @size: size of allocation, same value as passed to alloc_pages_exact(). 3533 * 3534 * Release the memory allocated by a previous call to alloc_pages_exact. 3535 */ 3536 void free_pages_exact(void *virt, size_t size) 3537 { 3538 unsigned long addr = (unsigned long)virt; 3539 unsigned long end = addr + PAGE_ALIGN(size); 3540 3541 while (addr < end) { 3542 free_page(addr); 3543 addr += PAGE_SIZE; 3544 } 3545 } 3546 EXPORT_SYMBOL(free_pages_exact); 3547 3548 /** 3549 * nr_free_zone_pages - count number of pages beyond high watermark 3550 * @offset: The zone index of the highest zone 3551 * 3552 * nr_free_zone_pages() counts the number of counts pages which are beyond the 3553 * high watermark within all zones at or below a given zone index. 
For each 3554 * zone, the number of pages is calculated as: 3555 * managed_pages - high_pages 3556 */ 3557 static unsigned long nr_free_zone_pages(int offset) 3558 { 3559 struct zoneref *z; 3560 struct zone *zone; 3561 3562 /* Just pick one node, since fallback list is circular */ 3563 unsigned long sum = 0; 3564 3565 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 3566 3567 for_each_zone_zonelist(zone, z, zonelist, offset) { 3568 unsigned long size = zone->managed_pages; 3569 unsigned long high = high_wmark_pages(zone); 3570 if (size > high) 3571 sum += size - high; 3572 } 3573 3574 return sum; 3575 } 3576 3577 /** 3578 * nr_free_buffer_pages - count number of pages beyond high watermark 3579 * 3580 * nr_free_buffer_pages() counts the number of pages which are beyond the high 3581 * watermark within ZONE_DMA and ZONE_NORMAL. 3582 */ 3583 unsigned long nr_free_buffer_pages(void) 3584 { 3585 return nr_free_zone_pages(gfp_zone(GFP_USER)); 3586 } 3587 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 3588 3589 /** 3590 * nr_free_pagecache_pages - count number of pages beyond high watermark 3591 * 3592 * nr_free_pagecache_pages() counts the number of pages which are beyond the 3593 * high watermark within all zones. 3594 */ 3595 unsigned long nr_free_pagecache_pages(void) 3596 { 3597 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 3598 } 3599 3600 static inline void show_node(struct zone *zone) 3601 { 3602 if (IS_ENABLED(CONFIG_NUMA)) 3603 printk("Node %d ", zone_to_nid(zone)); 3604 } 3605 3606 void si_meminfo(struct sysinfo *val) 3607 { 3608 val->totalram = totalram_pages; 3609 val->sharedram = global_page_state(NR_SHMEM); 3610 val->freeram = global_page_state(NR_FREE_PAGES); 3611 val->bufferram = nr_blockdev_pages(); 3612 val->totalhigh = totalhigh_pages; 3613 val->freehigh = nr_free_highpages(); 3614 val->mem_unit = PAGE_SIZE; 3615 } 3616 3617 EXPORT_SYMBOL(si_meminfo); 3618 3619 #ifdef CONFIG_NUMA 3620 void si_meminfo_node(struct sysinfo *val, int nid) 3621 { 3622 int zone_type; /* needs to be signed */ 3623 unsigned long managed_pages = 0; 3624 pg_data_t *pgdat = NODE_DATA(nid); 3625 3626 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3627 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3628 val->totalram = managed_pages; 3629 val->sharedram = node_page_state(nid, NR_SHMEM); 3630 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3631 #ifdef CONFIG_HIGHMEM 3632 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3633 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3634 NR_FREE_PAGES); 3635 #else 3636 val->totalhigh = 0; 3637 val->freehigh = 0; 3638 #endif 3639 val->mem_unit = PAGE_SIZE; 3640 } 3641 #endif 3642 3643 /* 3644 * Determine whether the node should be displayed or not, depending on whether 3645 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 
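 * Returns true when the node should be skipped, i.e. when filtering was
 * requested and the node is not in the current cpuset's mems_allowed.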
3646 */ 3647 bool skip_free_areas_node(unsigned int flags, int nid) 3648 { 3649 bool ret = false; 3650 unsigned int cpuset_mems_cookie; 3651 3652 if (!(flags & SHOW_MEM_FILTER_NODES)) 3653 goto out; 3654 3655 do { 3656 cpuset_mems_cookie = read_mems_allowed_begin(); 3657 ret = !node_isset(nid, cpuset_current_mems_allowed); 3658 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3659 out: 3660 return ret; 3661 } 3662 3663 #define K(x) ((x) << (PAGE_SHIFT-10)) 3664 3665 static void show_migration_types(unsigned char type) 3666 { 3667 static const char types[MIGRATE_TYPES] = { 3668 [MIGRATE_UNMOVABLE] = 'U', 3669 [MIGRATE_MOVABLE] = 'M', 3670 [MIGRATE_RECLAIMABLE] = 'E', 3671 [MIGRATE_HIGHATOMIC] = 'H', 3672 #ifdef CONFIG_CMA 3673 [MIGRATE_CMA] = 'C', 3674 #endif 3675 #ifdef CONFIG_MEMORY_ISOLATION 3676 [MIGRATE_ISOLATE] = 'I', 3677 #endif 3678 }; 3679 char tmp[MIGRATE_TYPES + 1]; 3680 char *p = tmp; 3681 int i; 3682 3683 for (i = 0; i < MIGRATE_TYPES; i++) { 3684 if (type & (1 << i)) 3685 *p++ = types[i]; 3686 } 3687 3688 *p = '\0'; 3689 printk("(%s) ", tmp); 3690 } 3691 3692 /* 3693 * Show free area list (used inside shift_scroll-lock stuff) 3694 * We also calculate the percentage fragmentation. We do this by counting the 3695 * memory on each free list with the exception of the first item on the list. 3696 * 3697 * Bits in @filter: 3698 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's 3699 * cpuset. 3700 */ 3701 void show_free_areas(unsigned int filter) 3702 { 3703 unsigned long free_pcp = 0; 3704 int cpu; 3705 struct zone *zone; 3706 3707 for_each_populated_zone(zone) { 3708 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3709 continue; 3710 3711 for_each_online_cpu(cpu) 3712 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 3713 } 3714 3715 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3716 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3717 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" 3718 " slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3719 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3720 " free:%lu free_pcp:%lu free_cma:%lu\n", 3721 global_page_state(NR_ACTIVE_ANON), 3722 global_page_state(NR_INACTIVE_ANON), 3723 global_page_state(NR_ISOLATED_ANON), 3724 global_page_state(NR_ACTIVE_FILE), 3725 global_page_state(NR_INACTIVE_FILE), 3726 global_page_state(NR_ISOLATED_FILE), 3727 global_page_state(NR_UNEVICTABLE), 3728 global_page_state(NR_FILE_DIRTY), 3729 global_page_state(NR_WRITEBACK), 3730 global_page_state(NR_UNSTABLE_NFS), 3731 global_page_state(NR_SLAB_RECLAIMABLE), 3732 global_page_state(NR_SLAB_UNRECLAIMABLE), 3733 global_page_state(NR_FILE_MAPPED), 3734 global_page_state(NR_SHMEM), 3735 global_page_state(NR_PAGETABLE), 3736 global_page_state(NR_BOUNCE), 3737 global_page_state(NR_FREE_PAGES), 3738 free_pcp, 3739 global_page_state(NR_FREE_CMA_PAGES)); 3740 3741 for_each_populated_zone(zone) { 3742 int i; 3743 3744 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3745 continue; 3746 3747 free_pcp = 0; 3748 for_each_online_cpu(cpu) 3749 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 3750 3751 show_node(zone); 3752 printk("%s" 3753 " free:%lukB" 3754 " min:%lukB" 3755 " low:%lukB" 3756 " high:%lukB" 3757 " active_anon:%lukB" 3758 " inactive_anon:%lukB" 3759 " active_file:%lukB" 3760 " inactive_file:%lukB" 3761 " unevictable:%lukB" 3762 " isolated(anon):%lukB" 3763 " isolated(file):%lukB" 3764 " present:%lukB" 3765 " managed:%lukB" 3766 " mlocked:%lukB" 3767 " dirty:%lukB" 3768 " 
writeback:%lukB" 3769 " mapped:%lukB" 3770 " shmem:%lukB" 3771 " slab_reclaimable:%lukB" 3772 " slab_unreclaimable:%lukB" 3773 " kernel_stack:%lukB" 3774 " pagetables:%lukB" 3775 " unstable:%lukB" 3776 " bounce:%lukB" 3777 " free_pcp:%lukB" 3778 " local_pcp:%ukB" 3779 " free_cma:%lukB" 3780 " writeback_tmp:%lukB" 3781 " pages_scanned:%lu" 3782 " all_unreclaimable? %s" 3783 "\n", 3784 zone->name, 3785 K(zone_page_state(zone, NR_FREE_PAGES)), 3786 K(min_wmark_pages(zone)), 3787 K(low_wmark_pages(zone)), 3788 K(high_wmark_pages(zone)), 3789 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3790 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3791 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3792 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3793 K(zone_page_state(zone, NR_UNEVICTABLE)), 3794 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3795 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3796 K(zone->present_pages), 3797 K(zone->managed_pages), 3798 K(zone_page_state(zone, NR_MLOCK)), 3799 K(zone_page_state(zone, NR_FILE_DIRTY)), 3800 K(zone_page_state(zone, NR_WRITEBACK)), 3801 K(zone_page_state(zone, NR_FILE_MAPPED)), 3802 K(zone_page_state(zone, NR_SHMEM)), 3803 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3804 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3805 zone_page_state(zone, NR_KERNEL_STACK) * 3806 THREAD_SIZE / 1024, 3807 K(zone_page_state(zone, NR_PAGETABLE)), 3808 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3809 K(zone_page_state(zone, NR_BOUNCE)), 3810 K(free_pcp), 3811 K(this_cpu_read(zone->pageset->pcp.count)), 3812 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3813 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3814 K(zone_page_state(zone, NR_PAGES_SCANNED)), 3815 (!zone_reclaimable(zone) ? "yes" : "no") 3816 ); 3817 printk("lowmem_reserve[]:"); 3818 for (i = 0; i < MAX_NR_ZONES; i++) 3819 printk(" %ld", zone->lowmem_reserve[i]); 3820 printk("\n"); 3821 } 3822 3823 for_each_populated_zone(zone) { 3824 unsigned int order; 3825 unsigned long nr[MAX_ORDER], flags, total = 0; 3826 unsigned char types[MAX_ORDER]; 3827 3828 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3829 continue; 3830 show_node(zone); 3831 printk("%s: ", zone->name); 3832 3833 spin_lock_irqsave(&zone->lock, flags); 3834 for (order = 0; order < MAX_ORDER; order++) { 3835 struct free_area *area = &zone->free_area[order]; 3836 int type; 3837 3838 nr[order] = area->nr_free; 3839 total += nr[order] << order; 3840 3841 types[order] = 0; 3842 for (type = 0; type < MIGRATE_TYPES; type++) { 3843 if (!list_empty(&area->free_list[type])) 3844 types[order] |= 1 << type; 3845 } 3846 } 3847 spin_unlock_irqrestore(&zone->lock, flags); 3848 for (order = 0; order < MAX_ORDER; order++) { 3849 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3850 if (nr[order]) 3851 show_migration_types(types[order]); 3852 } 3853 printk("= %lukB\n", K(total)); 3854 } 3855 3856 hugetlb_show_meminfo(); 3857 3858 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3859 3860 show_swap_cache_info(); 3861 } 3862 3863 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3864 { 3865 zoneref->zone = zone; 3866 zoneref->zone_idx = zone_idx(zone); 3867 } 3868 3869 /* 3870 * Builds allocation fallback zone lists. 3871 * 3872 * Add all populated zones of a node to the zonelist. 
3873 */ 3874 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3875 int nr_zones) 3876 { 3877 struct zone *zone; 3878 enum zone_type zone_type = MAX_NR_ZONES; 3879 3880 do { 3881 zone_type--; 3882 zone = pgdat->node_zones + zone_type; 3883 if (populated_zone(zone)) { 3884 zoneref_set_zone(zone, 3885 &zonelist->_zonerefs[nr_zones++]); 3886 check_highest_zone(zone_type); 3887 } 3888 } while (zone_type); 3889 3890 return nr_zones; 3891 } 3892 3893 3894 /* 3895 * zonelist_order: 3896 * 0 = automatic detection of better ordering. 3897 * 1 = order by ([node] distance, -zonetype) 3898 * 2 = order by (-zonetype, [node] distance) 3899 * 3900 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3901 * the same zonelist. So only NUMA can configure this param. 3902 */ 3903 #define ZONELIST_ORDER_DEFAULT 0 3904 #define ZONELIST_ORDER_NODE 1 3905 #define ZONELIST_ORDER_ZONE 2 3906 3907 /* zonelist order in the kernel. 3908 * set_zonelist_order() will set this to NODE or ZONE. 3909 */ 3910 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3911 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3912 3913 3914 #ifdef CONFIG_NUMA 3915 /* The value user specified ....changed by config */ 3916 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3917 /* string for sysctl */ 3918 #define NUMA_ZONELIST_ORDER_LEN 16 3919 char numa_zonelist_order[16] = "default"; 3920 3921 /* 3922 * interface for configure zonelist ordering. 3923 * command line option "numa_zonelist_order" 3924 * = "[dD]efault - default, automatic configuration. 3925 * = "[nN]ode - order by node locality, then by zone within node 3926 * = "[zZ]one - order by zone, then by locality within zone 3927 */ 3928 3929 static int __parse_numa_zonelist_order(char *s) 3930 { 3931 if (*s == 'd' || *s == 'D') { 3932 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3933 } else if (*s == 'n' || *s == 'N') { 3934 user_zonelist_order = ZONELIST_ORDER_NODE; 3935 } else if (*s == 'z' || *s == 'Z') { 3936 user_zonelist_order = ZONELIST_ORDER_ZONE; 3937 } else { 3938 printk(KERN_WARNING 3939 "Ignoring invalid numa_zonelist_order value: " 3940 "%s\n", s); 3941 return -EINVAL; 3942 } 3943 return 0; 3944 } 3945 3946 static __init int setup_numa_zonelist_order(char *s) 3947 { 3948 int ret; 3949 3950 if (!s) 3951 return 0; 3952 3953 ret = __parse_numa_zonelist_order(s); 3954 if (ret == 0) 3955 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3956 3957 return ret; 3958 } 3959 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3960 3961 /* 3962 * sysctl handler for numa_zonelist_order 3963 */ 3964 int numa_zonelist_order_handler(struct ctl_table *table, int write, 3965 void __user *buffer, size_t *length, 3966 loff_t *ppos) 3967 { 3968 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3969 int ret; 3970 static DEFINE_MUTEX(zl_order_mutex); 3971 3972 mutex_lock(&zl_order_mutex); 3973 if (write) { 3974 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3975 ret = -EINVAL; 3976 goto out; 3977 } 3978 strcpy(saved_string, (char *)table->data); 3979 } 3980 ret = proc_dostring(table, write, buffer, length, ppos); 3981 if (ret) 3982 goto out; 3983 if (write) { 3984 int oldval = user_zonelist_order; 3985 3986 ret = __parse_numa_zonelist_order((char *)table->data); 3987 if (ret) { 3988 /* 3989 * bogus value. 
restore saved string 3990 */ 3991 strncpy((char *)table->data, saved_string, 3992 NUMA_ZONELIST_ORDER_LEN); 3993 user_zonelist_order = oldval; 3994 } else if (oldval != user_zonelist_order) { 3995 mutex_lock(&zonelists_mutex); 3996 build_all_zonelists(NULL, NULL); 3997 mutex_unlock(&zonelists_mutex); 3998 } 3999 } 4000 out: 4001 mutex_unlock(&zl_order_mutex); 4002 return ret; 4003 } 4004 4005 4006 #define MAX_NODE_LOAD (nr_online_nodes) 4007 static int node_load[MAX_NUMNODES]; 4008 4009 /** 4010 * find_next_best_node - find the next node that should appear in a given node's fallback list 4011 * @node: node whose fallback list we're appending 4012 * @used_node_mask: nodemask_t of already used nodes 4013 * 4014 * We use a number of factors to determine which is the next node that should 4015 * appear on a given node's fallback list. The node should not have appeared 4016 * already in @node's fallback list, and it should be the next closest node 4017 * according to the distance array (which contains arbitrary distance values 4018 * from each node to each node in the system), and should also prefer nodes 4019 * with no CPUs, since presumably they'll have very little allocation pressure 4020 * on them otherwise. 4021 * It returns -1 if no node is found. 4022 */ 4023 static int find_next_best_node(int node, nodemask_t *used_node_mask) 4024 { 4025 int n, val; 4026 int min_val = INT_MAX; 4027 int best_node = NUMA_NO_NODE; 4028 const struct cpumask *tmp = cpumask_of_node(0); 4029 4030 /* Use the local node if we haven't already */ 4031 if (!node_isset(node, *used_node_mask)) { 4032 node_set(node, *used_node_mask); 4033 return node; 4034 } 4035 4036 for_each_node_state(n, N_MEMORY) { 4037 4038 /* Don't want a node to appear more than once */ 4039 if (node_isset(n, *used_node_mask)) 4040 continue; 4041 4042 /* Use the distance array to find the distance */ 4043 val = node_distance(node, n); 4044 4045 /* Penalize nodes under us ("prefer the next node") */ 4046 val += (n < node); 4047 4048 /* Give preference to headless and unused nodes */ 4049 tmp = cpumask_of_node(n); 4050 if (!cpumask_empty(tmp)) 4051 val += PENALTY_FOR_NODE_WITH_CPUS; 4052 4053 /* Slight preference for less loaded node */ 4054 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 4055 val += node_load[n]; 4056 4057 if (val < min_val) { 4058 min_val = val; 4059 best_node = n; 4060 } 4061 } 4062 4063 if (best_node >= 0) 4064 node_set(best_node, *used_node_mask); 4065 4066 return best_node; 4067 } 4068 4069 4070 /* 4071 * Build zonelists ordered by node and zones within node. 4072 * This results in maximum locality--normal zone overflows into local 4073 * DMA zone, if any--but risks exhausting DMA zone. 4074 */ 4075 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 4076 { 4077 int j; 4078 struct zonelist *zonelist; 4079 4080 zonelist = &pgdat->node_zonelists[0]; 4081 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 4082 ; 4083 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 4084 zonelist->_zonerefs[j].zone = NULL; 4085 zonelist->_zonerefs[j].zone_idx = 0; 4086 } 4087 4088 /* 4089 * Build gfp_thisnode zonelists 4090 */ 4091 static void build_thisnode_zonelists(pg_data_t *pgdat) 4092 { 4093 int j; 4094 struct zonelist *zonelist; 4095 4096 zonelist = &pgdat->node_zonelists[1]; 4097 j = build_zonelists_node(pgdat, zonelist, 0); 4098 zonelist->_zonerefs[j].zone = NULL; 4099 zonelist->_zonerefs[j].zone_idx = 0; 4100 } 4101 4102 /* 4103 * Build zonelists ordered by zone and nodes within zones. 
4104 * This results in conserving DMA zone[s] until all Normal memory is 4105 * exhausted, but risks overflowing to a remote node while memory 4106 * may still exist in the local DMA zone. 4107 */ 4108 static int node_order[MAX_NUMNODES]; 4109 4110 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 4111 { 4112 int pos, j, node; 4113 int zone_type; /* needs to be signed */ 4114 struct zone *z; 4115 struct zonelist *zonelist; 4116 4117 zonelist = &pgdat->node_zonelists[0]; 4118 pos = 0; 4119 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 4120 for (j = 0; j < nr_nodes; j++) { 4121 node = node_order[j]; 4122 z = &NODE_DATA(node)->node_zones[zone_type]; 4123 if (populated_zone(z)) { 4124 zoneref_set_zone(z, 4125 &zonelist->_zonerefs[pos++]); 4126 check_highest_zone(zone_type); 4127 } 4128 } 4129 } 4130 zonelist->_zonerefs[pos].zone = NULL; 4131 zonelist->_zonerefs[pos].zone_idx = 0; 4132 } 4133 4134 #if defined(CONFIG_64BIT) 4135 /* 4136 * Devices that require DMA32/DMA are relatively rare and do not justify a 4137 * penalty to every machine in case the specialised case applies. Default 4138 * to Node-ordering on 64-bit NUMA machines. 4139 */ 4140 static int default_zonelist_order(void) 4141 { 4142 return ZONELIST_ORDER_NODE; 4143 } 4144 #else 4145 /* 4146 * On 32-bit, the Normal zone needs to be preserved for allocations accessible 4147 * by the kernel. If processes running on node 0 deplete the low memory zone 4148 * then reclaim will occur more frequently, increasing stalls, and the system 4149 * is more likely to OOM if a large percentage of the zone is under writeback or 4150 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. 4151 * Hence, default to zone ordering on 32-bit. 4152 */ 4153 static int default_zonelist_order(void) 4154 { 4155 return ZONELIST_ORDER_ZONE; 4156 } 4157 #endif /* CONFIG_64BIT */ 4158 4159 static void set_zonelist_order(void) 4160 { 4161 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 4162 current_zonelist_order = default_zonelist_order(); 4163 else 4164 current_zonelist_order = user_zonelist_order; 4165 } 4166 4167 static void build_zonelists(pg_data_t *pgdat) 4168 { 4169 int i, node, load; 4170 nodemask_t used_mask; 4171 int local_node, prev_node; 4172 struct zonelist *zonelist; 4173 unsigned int order = current_zonelist_order; 4174 4175 /* initialize zonelists */ 4176 for (i = 0; i < MAX_ZONELISTS; i++) { 4177 zonelist = pgdat->node_zonelists + i; 4178 zonelist->_zonerefs[0].zone = NULL; 4179 zonelist->_zonerefs[0].zone_idx = 0; 4180 } 4181 4182 /* NUMA-aware ordering of nodes */ 4183 local_node = pgdat->node_id; 4184 load = nr_online_nodes; 4185 prev_node = local_node; 4186 nodes_clear(used_mask); 4187 4188 memset(node_order, 0, sizeof(node_order)); 4189 i = 0; 4190 4191 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 4192 /* 4193 * We don't want to pressure a particular node. 4194 * So we add a penalty to the first node in the same 4195 * distance group to make the ordering round-robin. 4196 */ 4197 if (node_distance(local_node, node) != 4198 node_distance(local_node, prev_node)) 4199 node_load[node] = load; 4200 4201 prev_node = node; 4202 load--; 4203 if (order == ZONELIST_ORDER_NODE) 4204 build_zonelists_in_node_order(pgdat, node); 4205 else 4206 node_order[i++] = node; /* remember order */ 4207 } 4208 4209 if (order == ZONELIST_ORDER_ZONE) { 4210 /* calculate node order -- i.e., DMA last! 
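 * E.g. with node_order = { 0, 1 } and Normal and DMA populated on both
 * nodes, build_zonelists_in_zone_order() emits Normal(0), Normal(1),
 * DMA(0), DMA(1), so remote Normal memory is preferred over local DMA.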
*/ 4211 build_zonelists_in_zone_order(pgdat, i); 4212 } 4213 4214 build_thisnode_zonelists(pgdat); 4215 } 4216 4217 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 4218 /* 4219 * Return node id of node used for "local" allocations. 4220 * I.e., first node id of first zone in arg node's generic zonelist. 4221 * Used for initializing percpu 'numa_mem', which is used primarily 4222 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 4223 */ 4224 int local_memory_node(int node) 4225 { 4226 struct zone *zone; 4227 4228 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 4229 gfp_zone(GFP_KERNEL), 4230 NULL, 4231 &zone); 4232 return zone->node; 4233 } 4234 #endif 4235 4236 #else /* CONFIG_NUMA */ 4237 4238 static void set_zonelist_order(void) 4239 { 4240 current_zonelist_order = ZONELIST_ORDER_ZONE; 4241 } 4242 4243 static void build_zonelists(pg_data_t *pgdat) 4244 { 4245 int node, local_node; 4246 enum zone_type j; 4247 struct zonelist *zonelist; 4248 4249 local_node = pgdat->node_id; 4250 4251 zonelist = &pgdat->node_zonelists[0]; 4252 j = build_zonelists_node(pgdat, zonelist, 0); 4253 4254 /* 4255 * Now we build the zonelist so that it contains the zones 4256 * of all the other nodes. 4257 * We don't want to pressure a particular node, so when 4258 * building the zones for node N, we make sure that the 4259 * zones coming right after the local ones are those from 4260 * node N+1 (modulo N) 4261 */ 4262 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 4263 if (!node_online(node)) 4264 continue; 4265 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 4266 } 4267 for (node = 0; node < local_node; node++) { 4268 if (!node_online(node)) 4269 continue; 4270 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 4271 } 4272 4273 zonelist->_zonerefs[j].zone = NULL; 4274 zonelist->_zonerefs[j].zone_idx = 0; 4275 } 4276 4277 #endif /* CONFIG_NUMA */ 4278 4279 /* 4280 * Boot pageset table. One per cpu which is going to be used for all 4281 * zones and all nodes. The parameters will be set in such a way 4282 * that an item put on a list will immediately be handed over to 4283 * the buddy list. This is safe since pageset manipulation is done 4284 * with interrupts disabled. 4285 * 4286 * The boot_pagesets must be kept even after bootup is complete for 4287 * unused processors and/or zones. They do play a role for bootstrapping 4288 * hotplugged processors. 4289 * 4290 * zoneinfo_show() and maybe other functions do 4291 * not check if the processor is online before following the pageset pointer. 4292 * Other parts of the kernel may not check if the zone is available. 4293 */ 4294 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 4295 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 4296 static void setup_zone_pageset(struct zone *zone); 4297 4298 /* 4299 * Global mutex to protect against size modification of zonelists 4300 * as well as to serialize pageset setup for the new populated zone. 
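 * The numa_zonelist_order sysctl handler above, for instance, takes this
 * mutex around its build_all_zonelists(NULL, NULL) call when the ordering
 * is changed at run time.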
4301 */ 4302 DEFINE_MUTEX(zonelists_mutex); 4303 4304 /* return values int ....just for stop_machine() */ 4305 static int __build_all_zonelists(void *data) 4306 { 4307 int nid; 4308 int cpu; 4309 pg_data_t *self = data; 4310 4311 #ifdef CONFIG_NUMA 4312 memset(node_load, 0, sizeof(node_load)); 4313 #endif 4314 4315 if (self && !node_online(self->node_id)) { 4316 build_zonelists(self); 4317 } 4318 4319 for_each_online_node(nid) { 4320 pg_data_t *pgdat = NODE_DATA(nid); 4321 4322 build_zonelists(pgdat); 4323 } 4324 4325 /* 4326 * Initialize the boot_pagesets that are going to be used 4327 * for bootstrapping processors. The real pagesets for 4328 * each zone will be allocated later when the per cpu 4329 * allocator is available. 4330 * 4331 * boot_pagesets are used also for bootstrapping offline 4332 * cpus if the system is already booted because the pagesets 4333 * are needed to initialize allocators on a specific cpu too. 4334 * F.e. the percpu allocator needs the page allocator which 4335 * needs the percpu allocator in order to allocate its pagesets 4336 * (a chicken-egg dilemma). 4337 */ 4338 for_each_possible_cpu(cpu) { 4339 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 4340 4341 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 4342 /* 4343 * We now know the "local memory node" for each node-- 4344 * i.e., the node of the first zone in the generic zonelist. 4345 * Set up numa_mem percpu variable for on-line cpus. During 4346 * boot, only the boot cpu should be on-line; we'll init the 4347 * secondary cpus' numa_mem as they come on-line. During 4348 * node/memory hotplug, we'll fixup all on-line cpus. 4349 */ 4350 if (cpu_online(cpu)) 4351 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 4352 #endif 4353 } 4354 4355 return 0; 4356 } 4357 4358 static noinline void __init 4359 build_all_zonelists_init(void) 4360 { 4361 __build_all_zonelists(NULL); 4362 mminit_verify_zonelist(); 4363 cpuset_init_current_mems_allowed(); 4364 } 4365 4366 /* 4367 * Called with zonelists_mutex held always 4368 * unless system_state == SYSTEM_BOOTING. 4369 * 4370 * __ref due to (1) call of __meminit annotated setup_zone_pageset 4371 * [we're only called with non-NULL zone through __meminit paths] and 4372 * (2) call of __init annotated helper build_all_zonelists_init 4373 * [protected by SYSTEM_BOOTING]. 4374 */ 4375 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 4376 { 4377 set_zonelist_order(); 4378 4379 if (system_state == SYSTEM_BOOTING) { 4380 build_all_zonelists_init(); 4381 } else { 4382 #ifdef CONFIG_MEMORY_HOTPLUG 4383 if (zone) 4384 setup_zone_pageset(zone); 4385 #endif 4386 /* we have to stop all cpus to guarantee there is no user 4387 of zonelist */ 4388 stop_machine(__build_all_zonelists, pgdat, NULL); 4389 /* cpuset refresh routine should be here */ 4390 } 4391 vm_total_pages = nr_free_pagecache_pages(); 4392 /* 4393 * Disable grouping by mobility if the number of pages in the 4394 * system is too low to allow the mechanism to work. It would be 4395 * more accurate, but expensive to check per-zone. This check is 4396 * made on memory-hotadd so a system can start with mobility 4397 * disabled and enable it later 4398 */ 4399 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 4400 page_group_by_mobility_disabled = 1; 4401 else 4402 page_group_by_mobility_disabled = 0; 4403 4404 pr_info("Built %i zonelists in %s order, mobility grouping %s. 
" 4405 "Total pages: %ld\n", 4406 nr_online_nodes, 4407 zonelist_order_name[current_zonelist_order], 4408 page_group_by_mobility_disabled ? "off" : "on", 4409 vm_total_pages); 4410 #ifdef CONFIG_NUMA 4411 pr_info("Policy zone: %s\n", zone_names[policy_zone]); 4412 #endif 4413 } 4414 4415 /* 4416 * Helper functions to size the waitqueue hash table. 4417 * Essentially these want to choose hash table sizes sufficiently 4418 * large so that collisions trying to wait on pages are rare. 4419 * But in fact, the number of active page waitqueues on typical 4420 * systems is ridiculously low, less than 200. So this is even 4421 * conservative, even though it seems large. 4422 * 4423 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 4424 * waitqueues, i.e. the size of the waitq table given the number of pages. 4425 */ 4426 #define PAGES_PER_WAITQUEUE 256 4427 4428 #ifndef CONFIG_MEMORY_HOTPLUG 4429 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 4430 { 4431 unsigned long size = 1; 4432 4433 pages /= PAGES_PER_WAITQUEUE; 4434 4435 while (size < pages) 4436 size <<= 1; 4437 4438 /* 4439 * Once we have dozens or even hundreds of threads sleeping 4440 * on IO we've got bigger problems than wait queue collision. 4441 * Limit the size of the wait table to a reasonable size. 4442 */ 4443 size = min(size, 4096UL); 4444 4445 return max(size, 4UL); 4446 } 4447 #else 4448 /* 4449 * A zone's size might be changed by hot-add, so it is not possible to determine 4450 * a suitable size for its wait_table. So we use the maximum size now. 4451 * 4452 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 4453 * 4454 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 4455 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 4456 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 4457 * 4458 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 4459 * or more by the traditional way. (See above). It equals: 4460 * 4461 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 4462 * ia64(16K page size) : = ( 8G + 4M)byte. 4463 * powerpc (64K page size) : = (32G +16M)byte. 4464 */ 4465 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 4466 { 4467 return 4096UL; 4468 } 4469 #endif 4470 4471 /* 4472 * This is an integer logarithm so that shifts can be used later 4473 * to extract the more random high bits from the multiplicative 4474 * hash function before the remainder is taken. 4475 */ 4476 static inline unsigned long wait_table_bits(unsigned long size) 4477 { 4478 return ffz(~size); 4479 } 4480 4481 /* 4482 * Initially all pages are reserved - free ones are freed 4483 * up by free_all_bootmem() once the early boot process is 4484 * done. Non-atomic initialization, single-pass. 
4485 */ 4486 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4487 unsigned long start_pfn, enum memmap_context context) 4488 { 4489 struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); 4490 unsigned long end_pfn = start_pfn + size; 4491 pg_data_t *pgdat = NODE_DATA(nid); 4492 unsigned long pfn; 4493 unsigned long nr_initialised = 0; 4494 4495 if (highest_memmap_pfn < end_pfn - 1) 4496 highest_memmap_pfn = end_pfn - 1; 4497 4498 /* 4499 * Honor reservation requested by the driver for this ZONE_DEVICE 4500 * memory 4501 */ 4502 if (altmap && start_pfn == altmap->base_pfn) 4503 start_pfn += altmap->reserve; 4504 4505 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4506 /* 4507 * There can be holes in boot-time mem_map[]s 4508 * handed to this function. They do not 4509 * exist on hotplugged memory. 4510 */ 4511 if (context == MEMMAP_EARLY) { 4512 if (!early_pfn_valid(pfn)) 4513 continue; 4514 if (!early_pfn_in_nid(pfn, nid)) 4515 continue; 4516 if (!update_defer_init(pgdat, pfn, end_pfn, 4517 &nr_initialised)) 4518 break; 4519 } 4520 4521 /* 4522 * Mark the block movable so that blocks are reserved for 4523 * movable at startup. This will force kernel allocations 4524 * to reserve their blocks rather than leaking throughout 4525 * the address space during boot when many long-lived 4526 * kernel allocations are made. 4527 * 4528 * bitmap is created for zone's valid pfn range. but memmap 4529 * can be created for invalid pages (for alignment) 4530 * check here not to call set_pageblock_migratetype() against 4531 * pfn out of zone. 4532 */ 4533 if (!(pfn & (pageblock_nr_pages - 1))) { 4534 struct page *page = pfn_to_page(pfn); 4535 4536 __init_single_page(page, pfn, zone, nid); 4537 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4538 } else { 4539 __init_single_pfn(pfn, zone, nid); 4540 } 4541 } 4542 } 4543 4544 static void __meminit zone_init_free_lists(struct zone *zone) 4545 { 4546 unsigned int order, t; 4547 for_each_migratetype_order(order, t) { 4548 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4549 zone->free_area[order].nr_free = 0; 4550 } 4551 } 4552 4553 #ifndef __HAVE_ARCH_MEMMAP_INIT 4554 #define memmap_init(size, nid, zone, start_pfn) \ 4555 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4556 #endif 4557 4558 static int zone_batchsize(struct zone *zone) 4559 { 4560 #ifdef CONFIG_MMU 4561 int batch; 4562 4563 /* 4564 * The per-cpu-pages pools are set to around 1000th of the 4565 * size of the zone. But no more than 1/2 of a meg. 4566 * 4567 * OK, so we don't know how big the cache is. So guess. 4568 */ 4569 batch = zone->managed_pages / 1024; 4570 if (batch * PAGE_SIZE > 512 * 1024) 4571 batch = (512 * 1024) / PAGE_SIZE; 4572 batch /= 4; /* We effectively *= 4 below */ 4573 if (batch < 1) 4574 batch = 1; 4575 4576 /* 4577 * Clamp the batch to a 2^n - 1 value. Having a power 4578 * of 2 value was found to be more likely to have 4579 * suboptimal cache aliasing properties in some cases. 4580 * 4581 * For example if 2 tasks are alternately allocating 4582 * batches of pages, one task can end up with a lot 4583 * of pages of one half of the possible page colors 4584 * and the other with pages of the other colors. 4585 */ 4586 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4587 4588 return batch; 4589 4590 #else 4591 /* The deferral and batching of frees should be suppressed under NOMMU 4592 * conditions. 
4593 * 4594 * The problem is that NOMMU needs to be able to allocate large chunks 4595 * of contiguous memory as there's no hardware page translation to 4596 * assemble apparent contiguous memory from discontiguous pages. 4597 * 4598 * Queueing large contiguous runs of pages for batching, however, 4599 * causes the pages to actually be freed in smaller chunks. As there 4600 * can be a significant delay between the individual batches being 4601 * recycled, this leads to the once large chunks of space being 4602 * fragmented and becoming unavailable for high-order allocations. 4603 */ 4604 return 0; 4605 #endif 4606 } 4607 4608 /* 4609 * pcp->high and pcp->batch values are related and dependent on one another: 4610 * ->batch must never be higher than ->high. 4611 * The following function updates them in a safe manner without read side 4612 * locking. 4613 * 4614 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4615 * those fields changing asynchronously (according to the above rule). 4616 * 4617 * mutex_is_locked(&pcp_batch_high_lock) is required when calling this function 4618 * outside of boot time (or some other assurance that no concurrent updaters 4619 * exist). 4620 */ 4621 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4622 unsigned long batch) 4623 { 4624 /* start with a fail safe value for batch */ 4625 pcp->batch = 1; 4626 smp_wmb(); 4627 4628 /* Update high, then batch, in order */ 4629 pcp->high = high; 4630 smp_wmb(); 4631 4632 pcp->batch = batch; 4633 } 4634 4635 /* a companion to pageset_set_high() */ 4636 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4637 { 4638 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4639 } 4640 4641 static void pageset_init(struct per_cpu_pageset *p) 4642 { 4643 struct per_cpu_pages *pcp; 4644 int migratetype; 4645 4646 memset(p, 0, sizeof(*p)); 4647 4648 pcp = &p->pcp; 4649 pcp->count = 0; 4650 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4651 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4652 } 4653 4654 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4655 { 4656 pageset_init(p); 4657 pageset_set_batch(p, batch); 4658 } 4659 4660 /* 4661 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4662 * to the value high for the pageset p. 4663 */ 4664 static void pageset_set_high(struct per_cpu_pageset *p, 4665 unsigned long high) 4666 { 4667 unsigned long batch = max(1UL, high / 4); 4668 if ((high / 4) > (PAGE_SHIFT * 8)) 4669 batch = PAGE_SHIFT * 8; 4670 4671 pageset_update(&p->pcp, high, batch); 4672 } 4673 4674 static void pageset_set_high_and_batch(struct zone *zone, 4675 struct per_cpu_pageset *pcp) 4676 { 4677 if (percpu_pagelist_fraction) 4678 pageset_set_high(pcp, 4679 (zone->managed_pages / 4680 percpu_pagelist_fraction)); 4681 else 4682 pageset_set_batch(pcp, zone_batchsize(zone)); 4683 } 4684 4685 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4686 { 4687 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4688 4689 pageset_init(pcp); 4690 pageset_set_high_and_batch(zone, pcp); 4691 } 4692 4693 static void __meminit setup_zone_pageset(struct zone *zone) 4694 { 4695 int cpu; 4696 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4697 for_each_possible_cpu(cpu) 4698 zone_pageset_init(zone, cpu); 4699 } 4700 4701 /* 4702 * Allocate per cpu pagesets and initialize them. 4703 * Before this call only boot pagesets were available. 
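 * For illustration, assuming 4 KiB pages and percpu_pagelist_fraction
 * left at 0: a zone with 262144 managed pages (1 GiB) gets
 * zone_batchsize() = 262144 / 1024 = 256, clamped to 512 KiB worth (128),
 * divided by 4 (32) and rounded down to 2^n - 1, i.e. ->batch = 31 and,
 * via pageset_set_batch(), ->high = 6 * 31 = 186 pages per cpu.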
4704 */ 4705 void __init setup_per_cpu_pageset(void) 4706 { 4707 struct zone *zone; 4708 4709 for_each_populated_zone(zone) 4710 setup_zone_pageset(zone); 4711 } 4712 4713 static noinline __init_refok 4714 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4715 { 4716 int i; 4717 size_t alloc_size; 4718 4719 /* 4720 * The per-page waitqueue mechanism uses hashed waitqueues 4721 * per zone. 4722 */ 4723 zone->wait_table_hash_nr_entries = 4724 wait_table_hash_nr_entries(zone_size_pages); 4725 zone->wait_table_bits = 4726 wait_table_bits(zone->wait_table_hash_nr_entries); 4727 alloc_size = zone->wait_table_hash_nr_entries 4728 * sizeof(wait_queue_head_t); 4729 4730 if (!slab_is_available()) { 4731 zone->wait_table = (wait_queue_head_t *) 4732 memblock_virt_alloc_node_nopanic( 4733 alloc_size, zone->zone_pgdat->node_id); 4734 } else { 4735 /* 4736 * This case means that a zone whose size was 0 gets new memory 4737 * via memory hot-add. 4738 * But it may be the case that a new node was hot-added. In 4739 * this case vmalloc() will not be able to use this new node's 4740 * memory - this wait_table must be initialized to use this new 4741 * node itself as well. 4742 * To use this new node's memory, further consideration will be 4743 * necessary. 4744 */ 4745 zone->wait_table = vmalloc(alloc_size); 4746 } 4747 if (!zone->wait_table) 4748 return -ENOMEM; 4749 4750 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4751 init_waitqueue_head(zone->wait_table + i); 4752 4753 return 0; 4754 } 4755 4756 static __meminit void zone_pcp_init(struct zone *zone) 4757 { 4758 /* 4759 * per cpu subsystem is not up at this point. The following code 4760 * relies on the ability of the linker to provide the 4761 * offset of a (static) per cpu variable into the per cpu area. 4762 */ 4763 zone->pageset = &boot_pageset; 4764 4765 if (populated_zone(zone)) 4766 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4767 zone->name, zone->present_pages, 4768 zone_batchsize(zone)); 4769 } 4770 4771 int __meminit init_currently_empty_zone(struct zone *zone, 4772 unsigned long zone_start_pfn, 4773 unsigned long size) 4774 { 4775 struct pglist_data *pgdat = zone->zone_pgdat; 4776 int ret; 4777 ret = zone_wait_table_init(zone, size); 4778 if (ret) 4779 return ret; 4780 pgdat->nr_zones = zone_idx(zone) + 1; 4781 4782 zone->zone_start_pfn = zone_start_pfn; 4783 4784 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4785 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4786 pgdat->node_id, 4787 (unsigned long)zone_idx(zone), 4788 zone_start_pfn, (zone_start_pfn + size)); 4789 4790 zone_init_free_lists(zone); 4791 4792 return 0; 4793 } 4794 4795 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4796 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4797 4798 /* 4799 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4800 */ 4801 int __meminit __early_pfn_to_nid(unsigned long pfn, 4802 struct mminit_pfnnid_cache *state) 4803 { 4804 unsigned long start_pfn, end_pfn; 4805 int nid; 4806 4807 if (state->last_start <= pfn && pfn < state->last_end) 4808 return state->last_nid; 4809 4810 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4811 if (nid != -1) { 4812 state->last_start = start_pfn; 4813 state->last_end = end_pfn; 4814 state->last_nid = nid; 4815 } 4816 4817 return nid; 4818 } 4819 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4820 4821 /** 4822 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range 4823 * @nid: The node to free memory on. 
If MAX_NUMNODES, all nodes are freed. 4824 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid 4825 * 4826 * If an architecture guarantees that all ranges registered contain no holes 4827 * and may be freed, this this function may be used instead of calling 4828 * memblock_free_early_nid() manually. 4829 */ 4830 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4831 { 4832 unsigned long start_pfn, end_pfn; 4833 int i, this_nid; 4834 4835 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4836 start_pfn = min(start_pfn, max_low_pfn); 4837 end_pfn = min(end_pfn, max_low_pfn); 4838 4839 if (start_pfn < end_pfn) 4840 memblock_free_early_nid(PFN_PHYS(start_pfn), 4841 (end_pfn - start_pfn) << PAGE_SHIFT, 4842 this_nid); 4843 } 4844 } 4845 4846 /** 4847 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4848 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4849 * 4850 * If an architecture guarantees that all ranges registered contain no holes and may 4851 * be freed, this function may be used instead of calling memory_present() manually. 4852 */ 4853 void __init sparse_memory_present_with_active_regions(int nid) 4854 { 4855 unsigned long start_pfn, end_pfn; 4856 int i, this_nid; 4857 4858 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4859 memory_present(this_nid, start_pfn, end_pfn); 4860 } 4861 4862 /** 4863 * get_pfn_range_for_nid - Return the start and end page frames for a node 4864 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4865 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4866 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4867 * 4868 * It returns the start and end page frame of a node based on information 4869 * provided by memblock_set_node(). If called for a node 4870 * with no available memory, a warning is printed and the start and end 4871 * PFNs will be 0. 4872 */ 4873 void __meminit get_pfn_range_for_nid(unsigned int nid, 4874 unsigned long *start_pfn, unsigned long *end_pfn) 4875 { 4876 unsigned long this_start_pfn, this_end_pfn; 4877 int i; 4878 4879 *start_pfn = -1UL; 4880 *end_pfn = 0; 4881 4882 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4883 *start_pfn = min(*start_pfn, this_start_pfn); 4884 *end_pfn = max(*end_pfn, this_end_pfn); 4885 } 4886 4887 if (*start_pfn == -1UL) 4888 *start_pfn = 0; 4889 } 4890 4891 /* 4892 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4893 * assumption is made that zones within a node are ordered in monotonic 4894 * increasing memory addresses so that the "highest" populated zone is used 4895 */ 4896 static void __init find_usable_zone_for_movable(void) 4897 { 4898 int zone_index; 4899 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4900 if (zone_index == ZONE_MOVABLE) 4901 continue; 4902 4903 if (arch_zone_highest_possible_pfn[zone_index] > 4904 arch_zone_lowest_possible_pfn[zone_index]) 4905 break; 4906 } 4907 4908 VM_BUG_ON(zone_index == -1); 4909 movable_zone = zone_index; 4910 } 4911 4912 /* 4913 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4914 * because it is sized independent of architecture. Unlike the other zones, 4915 * the starting point for ZONE_MOVABLE is not fixed. 
It may be different 4916 * in each node depending on the size of each node and how evenly kernelcore 4917 * is distributed. This helper function adjusts the zone ranges 4918 * provided by the architecture for a given node by using the end of the 4919 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4920 * zones within a node are in order of monotonically increasing memory addresses. 4921 */ 4922 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4923 unsigned long zone_type, 4924 unsigned long node_start_pfn, 4925 unsigned long node_end_pfn, 4926 unsigned long *zone_start_pfn, 4927 unsigned long *zone_end_pfn) 4928 { 4929 /* Only adjust if ZONE_MOVABLE is on this node */ 4930 if (zone_movable_pfn[nid]) { 4931 /* Size ZONE_MOVABLE */ 4932 if (zone_type == ZONE_MOVABLE) { 4933 *zone_start_pfn = zone_movable_pfn[nid]; 4934 *zone_end_pfn = min(node_end_pfn, 4935 arch_zone_highest_possible_pfn[movable_zone]); 4936 4937 /* Adjust for ZONE_MOVABLE starting within this range */ 4938 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4939 *zone_end_pfn > zone_movable_pfn[nid]) { 4940 *zone_end_pfn = zone_movable_pfn[nid]; 4941 4942 /* Check if this whole range is within ZONE_MOVABLE */ 4943 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4944 *zone_start_pfn = *zone_end_pfn; 4945 } 4946 } 4947 4948 /* 4949 * Return the number of pages a zone spans in a node, including holes 4950 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4951 */ 4952 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4953 unsigned long zone_type, 4954 unsigned long node_start_pfn, 4955 unsigned long node_end_pfn, 4956 unsigned long *ignored) 4957 { 4958 unsigned long zone_start_pfn, zone_end_pfn; 4959 4960 /* When hot-adding a new node from cpu_up(), the node should be empty */ 4961 if (!node_start_pfn && !node_end_pfn) 4962 return 0; 4963 4964 /* Get the start and end of the zone */ 4965 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4966 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4967 adjust_zone_range_for_zone_movable(nid, zone_type, 4968 node_start_pfn, node_end_pfn, 4969 &zone_start_pfn, &zone_end_pfn); 4970 4971 /* Check that this node has pages within the zone's required range */ 4972 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4973 return 0; 4974 4975 /* Move the zone boundaries inside the node if necessary */ 4976 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4977 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4978 4979 /* Return the spanned pages */ 4980 return zone_end_pfn - zone_start_pfn; 4981 } 4982 4983 /* 4984 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4985 * then all holes in the requested range will be accounted for. 
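 * For example, for a requested range of PFNs [0, 1000) with a single
 * registered region [100, 900) on the node, nr_absent starts at 1000,
 * the clamped region removes 800, and 200 pages of holes are reported.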
4986 */ 4987 unsigned long __meminit __absent_pages_in_range(int nid, 4988 unsigned long range_start_pfn, 4989 unsigned long range_end_pfn) 4990 { 4991 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4992 unsigned long start_pfn, end_pfn; 4993 int i; 4994 4995 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4996 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4997 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4998 nr_absent -= end_pfn - start_pfn; 4999 } 5000 return nr_absent; 5001 } 5002 5003 /** 5004 * absent_pages_in_range - Return number of page frames in holes within a range 5005 * @start_pfn: The start PFN to start searching for holes 5006 * @end_pfn: The end PFN to stop searching for holes 5007 * 5008 * It returns the number of pages frames in memory holes within a range. 5009 */ 5010 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 5011 unsigned long end_pfn) 5012 { 5013 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 5014 } 5015 5016 /* Return the number of page frames in holes in a zone on a node */ 5017 static unsigned long __meminit zone_absent_pages_in_node(int nid, 5018 unsigned long zone_type, 5019 unsigned long node_start_pfn, 5020 unsigned long node_end_pfn, 5021 unsigned long *ignored) 5022 { 5023 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 5024 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 5025 unsigned long zone_start_pfn, zone_end_pfn; 5026 5027 /* When hotadd a new node from cpu_up(), the node should be empty */ 5028 if (!node_start_pfn && !node_end_pfn) 5029 return 0; 5030 5031 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 5032 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 5033 5034 adjust_zone_range_for_zone_movable(nid, zone_type, 5035 node_start_pfn, node_end_pfn, 5036 &zone_start_pfn, &zone_end_pfn); 5037 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 5038 } 5039 5040 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5041 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 5042 unsigned long zone_type, 5043 unsigned long node_start_pfn, 5044 unsigned long node_end_pfn, 5045 unsigned long *zones_size) 5046 { 5047 return zones_size[zone_type]; 5048 } 5049 5050 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 5051 unsigned long zone_type, 5052 unsigned long node_start_pfn, 5053 unsigned long node_end_pfn, 5054 unsigned long *zholes_size) 5055 { 5056 if (!zholes_size) 5057 return 0; 5058 5059 return zholes_size[zone_type]; 5060 } 5061 5062 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5063 5064 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 5065 unsigned long node_start_pfn, 5066 unsigned long node_end_pfn, 5067 unsigned long *zones_size, 5068 unsigned long *zholes_size) 5069 { 5070 unsigned long realtotalpages = 0, totalpages = 0; 5071 enum zone_type i; 5072 5073 for (i = 0; i < MAX_NR_ZONES; i++) { 5074 struct zone *zone = pgdat->node_zones + i; 5075 unsigned long size, real_size; 5076 5077 size = zone_spanned_pages_in_node(pgdat->node_id, i, 5078 node_start_pfn, 5079 node_end_pfn, 5080 zones_size); 5081 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, 5082 node_start_pfn, node_end_pfn, 5083 zholes_size); 5084 zone->spanned_pages = size; 5085 zone->present_pages = real_size; 5086 5087 totalpages += size; 5088 realtotalpages += real_size; 5089 } 5090 5091 pgdat->node_spanned_pages = totalpages; 5092 
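	/* spanned counts pages including holes; present has the holes subtracted */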
pgdat->node_present_pages = realtotalpages; 5093 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 5094 realtotalpages); 5095 } 5096 5097 #ifndef CONFIG_SPARSEMEM 5098 /* 5099 * Calculate the size of the zone->blockflags rounded to an unsigned long 5100 * Start by making sure zonesize is a multiple of pageblock_order by rounding 5101 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 5102 * round what is now in bits to nearest long in bits, then return it in 5103 * bytes. 5104 */ 5105 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 5106 { 5107 unsigned long usemapsize; 5108 5109 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 5110 usemapsize = roundup(zonesize, pageblock_nr_pages); 5111 usemapsize = usemapsize >> pageblock_order; 5112 usemapsize *= NR_PAGEBLOCK_BITS; 5113 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 5114 5115 return usemapsize / 8; 5116 } 5117 5118 static void __init setup_usemap(struct pglist_data *pgdat, 5119 struct zone *zone, 5120 unsigned long zone_start_pfn, 5121 unsigned long zonesize) 5122 { 5123 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 5124 zone->pageblock_flags = NULL; 5125 if (usemapsize) 5126 zone->pageblock_flags = 5127 memblock_virt_alloc_node_nopanic(usemapsize, 5128 pgdat->node_id); 5129 } 5130 #else 5131 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 5132 unsigned long zone_start_pfn, unsigned long zonesize) {} 5133 #endif /* CONFIG_SPARSEMEM */ 5134 5135 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 5136 5137 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 5138 void __paginginit set_pageblock_order(void) 5139 { 5140 unsigned int order; 5141 5142 /* Check that pageblock_nr_pages has not already been setup */ 5143 if (pageblock_order) 5144 return; 5145 5146 if (HPAGE_SHIFT > PAGE_SHIFT) 5147 order = HUGETLB_PAGE_ORDER; 5148 else 5149 order = MAX_ORDER - 1; 5150 5151 /* 5152 * Assume the largest contiguous order of interest is a huge page. 5153 * This value may be variable depending on boot parameters on IA64 and 5154 * powerpc. 5155 */ 5156 pageblock_order = order; 5157 } 5158 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 5159 5160 /* 5161 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 5162 * is unused as pageblock_order is set at compile-time. See 5163 * include/linux/pageblock-flags.h for the values of pageblock_order based on 5164 * the kernel config 5165 */ 5166 void __paginginit set_pageblock_order(void) 5167 { 5168 } 5169 5170 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 5171 5172 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 5173 unsigned long present_pages) 5174 { 5175 unsigned long pages = spanned_pages; 5176 5177 /* 5178 * Provide a more accurate estimation if there are holes within 5179 * the zone and SPARSEMEM is in use. If there are holes within the 5180 * zone, each populated memory region may cost us one or two extra 5181 * memmap pages due to alignment because memmap pages for each 5182 * populated regions may not naturally algined on page boundary. 5183 * So the (present_pages >> 4) heuristic is a tradeoff for that. 
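 * As a rough sketch, assuming a 64-byte struct page and 4 KiB pages: a
 * zone spanning 262144 pages needs 262144 * 64 bytes = 16 MiB of memmap,
 * so the function returns 4096 pages once that size is PAGE_ALIGNed and
 * converted back into pages.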
5184 */ 5185 if (spanned_pages > present_pages + (present_pages >> 4) && 5186 IS_ENABLED(CONFIG_SPARSEMEM)) 5187 pages = present_pages; 5188 5189 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 5190 } 5191 5192 /* 5193 * Set up the zone data structures: 5194 * - mark all pages reserved 5195 * - mark all memory queues empty 5196 * - clear the memory bitmaps 5197 * 5198 * NOTE: pgdat should get zeroed by caller. 5199 */ 5200 static void __paginginit free_area_init_core(struct pglist_data *pgdat) 5201 { 5202 enum zone_type j; 5203 int nid = pgdat->node_id; 5204 unsigned long zone_start_pfn = pgdat->node_start_pfn; 5205 int ret; 5206 5207 pgdat_resize_init(pgdat); 5208 #ifdef CONFIG_NUMA_BALANCING 5209 spin_lock_init(&pgdat->numabalancing_migrate_lock); 5210 pgdat->numabalancing_migrate_nr_pages = 0; 5211 pgdat->numabalancing_migrate_next_window = jiffies; 5212 #endif 5213 init_waitqueue_head(&pgdat->kswapd_wait); 5214 init_waitqueue_head(&pgdat->pfmemalloc_wait); 5215 pgdat_page_ext_init(pgdat); 5216 5217 for (j = 0; j < MAX_NR_ZONES; j++) { 5218 struct zone *zone = pgdat->node_zones + j; 5219 unsigned long size, realsize, freesize, memmap_pages; 5220 5221 size = zone->spanned_pages; 5222 realsize = freesize = zone->present_pages; 5223 5224 /* 5225 * Adjust freesize so that it accounts for how much memory 5226 * is used by this zone for memmap. This affects the watermark 5227 * and per-cpu initialisations 5228 */ 5229 memmap_pages = calc_memmap_size(size, realsize); 5230 if (!is_highmem_idx(j)) { 5231 if (freesize >= memmap_pages) { 5232 freesize -= memmap_pages; 5233 if (memmap_pages) 5234 printk(KERN_DEBUG 5235 " %s zone: %lu pages used for memmap\n", 5236 zone_names[j], memmap_pages); 5237 } else 5238 printk(KERN_WARNING 5239 " %s zone: %lu pages exceeds freesize %lu\n", 5240 zone_names[j], memmap_pages, freesize); 5241 } 5242 5243 /* Account for reserved pages */ 5244 if (j == 0 && freesize > dma_reserve) { 5245 freesize -= dma_reserve; 5246 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 5247 zone_names[0], dma_reserve); 5248 } 5249 5250 if (!is_highmem_idx(j)) 5251 nr_kernel_pages += freesize; 5252 /* Charge for highmem memmap if there are enough kernel pages */ 5253 else if (nr_kernel_pages > memmap_pages * 2) 5254 nr_kernel_pages -= memmap_pages; 5255 nr_all_pages += freesize; 5256 5257 /* 5258 * Set an approximate value for lowmem here, it will be adjusted 5259 * when the bootmem allocator frees pages into the buddy system. 5260 * And all highmem pages will be managed by the buddy system. 5261 */ 5262 zone->managed_pages = is_highmem_idx(j) ? 
realsize : freesize; 5263 #ifdef CONFIG_NUMA 5264 zone->node = nid; 5265 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 5266 / 100; 5267 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 5268 #endif 5269 zone->name = zone_names[j]; 5270 spin_lock_init(&zone->lock); 5271 spin_lock_init(&zone->lru_lock); 5272 zone_seqlock_init(zone); 5273 zone->zone_pgdat = pgdat; 5274 zone_pcp_init(zone); 5275 5276 /* For bootup, initialized properly in watermark setup */ 5277 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 5278 5279 lruvec_init(&zone->lruvec); 5280 if (!size) 5281 continue; 5282 5283 set_pageblock_order(); 5284 setup_usemap(pgdat, zone, zone_start_pfn, size); 5285 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 5286 BUG_ON(ret); 5287 memmap_init(size, nid, j, zone_start_pfn); 5288 zone_start_pfn += size; 5289 } 5290 } 5291 5292 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 5293 { 5294 unsigned long __maybe_unused start = 0; 5295 unsigned long __maybe_unused offset = 0; 5296 5297 /* Skip empty nodes */ 5298 if (!pgdat->node_spanned_pages) 5299 return; 5300 5301 #ifdef CONFIG_FLAT_NODE_MEM_MAP 5302 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 5303 offset = pgdat->node_start_pfn - start; 5304 /* ia64 gets its own node_mem_map, before this, without bootmem */ 5305 if (!pgdat->node_mem_map) { 5306 unsigned long size, end; 5307 struct page *map; 5308 5309 /* 5310 * The zone's endpoints aren't required to be MAX_ORDER 5311 * aligned but the node_mem_map endpoints must be in order 5312 * for the buddy allocator to function correctly. 5313 */ 5314 end = pgdat_end_pfn(pgdat); 5315 end = ALIGN(end, MAX_ORDER_NR_PAGES); 5316 size = (end - start) * sizeof(struct page); 5317 map = alloc_remap(pgdat->node_id, size); 5318 if (!map) 5319 map = memblock_virt_alloc_node_nopanic(size, 5320 pgdat->node_id); 5321 pgdat->node_mem_map = map + offset; 5322 } 5323 #ifndef CONFIG_NEED_MULTIPLE_NODES 5324 /* 5325 * With no DISCONTIG, the global mem_map is just set as node 0's 5326 */ 5327 if (pgdat == NODE_DATA(0)) { 5328 mem_map = NODE_DATA(0)->node_mem_map; 5329 #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) 5330 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 5331 mem_map -= offset; 5332 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5333 } 5334 #endif 5335 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 5336 } 5337 5338 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 5339 unsigned long node_start_pfn, unsigned long *zholes_size) 5340 { 5341 pg_data_t *pgdat = NODE_DATA(nid); 5342 unsigned long start_pfn = 0; 5343 unsigned long end_pfn = 0; 5344 5345 /* pg_data_t should be reset to zero when it's allocated */ 5346 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 5347 5348 reset_deferred_meminit(pgdat); 5349 pgdat->node_id = nid; 5350 pgdat->node_start_pfn = node_start_pfn; 5351 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5352 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5353 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 5354 (u64)start_pfn << PAGE_SHIFT, 5355 end_pfn ? 
((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 5356 #endif 5357 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5358 zones_size, zholes_size); 5359 5360 alloc_node_mem_map(pgdat); 5361 #ifdef CONFIG_FLAT_NODE_MEM_MAP 5362 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 5363 nid, (unsigned long)pgdat, 5364 (unsigned long)pgdat->node_mem_map); 5365 #endif 5366 5367 free_area_init_core(pgdat); 5368 } 5369 5370 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5371 5372 #if MAX_NUMNODES > 1 5373 /* 5374 * Figure out the number of possible node ids. 5375 */ 5376 void __init setup_nr_node_ids(void) 5377 { 5378 unsigned int highest; 5379 5380 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); 5381 nr_node_ids = highest + 1; 5382 } 5383 #endif 5384 5385 /** 5386 * node_map_pfn_alignment - determine the maximum internode alignment 5387 * 5388 * This function should be called after node map is populated and sorted. 5389 * It calculates the maximum power of two alignment which can distinguish 5390 * all the nodes. 5391 * 5392 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 5393 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 5394 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 5395 * shifted, 1GiB is enough and this function will indicate so. 5396 * 5397 * This is used to test whether pfn -> nid mapping of the chosen memory 5398 * model has fine enough granularity to avoid incorrect mapping for the 5399 * populated node map. 5400 * 5401 * Returns the determined alignment in pfn's. 0 if there is no alignment 5402 * requirement (single node). 5403 */ 5404 unsigned long __init node_map_pfn_alignment(void) 5405 { 5406 unsigned long accl_mask = 0, last_end = 0; 5407 unsigned long start, end, mask; 5408 int last_nid = -1; 5409 int i, nid; 5410 5411 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 5412 if (!start || last_nid < 0 || last_nid == nid) { 5413 last_nid = nid; 5414 last_end = end; 5415 continue; 5416 } 5417 5418 /* 5419 * Start with a mask granular enough to pin-point to the 5420 * start pfn and tick off bits one-by-one until it becomes 5421 * too coarse to separate the current node from the last. 5422 */ 5423 mask = ~((1 << __ffs(start)) - 1); 5424 while (mask && last_end <= (start & (mask << 1))) 5425 mask <<= 1; 5426 5427 /* accumulate all internode masks */ 5428 accl_mask |= mask; 5429 } 5430 5431 /* convert mask to number of pages */ 5432 return ~accl_mask + 1; 5433 } 5434 5435 /* Find the lowest pfn for a node */ 5436 static unsigned long __init find_min_pfn_for_node(int nid) 5437 { 5438 unsigned long min_pfn = ULONG_MAX; 5439 unsigned long start_pfn; 5440 int i; 5441 5442 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5443 min_pfn = min(min_pfn, start_pfn); 5444 5445 if (min_pfn == ULONG_MAX) { 5446 printk(KERN_WARNING 5447 "Could not find start_pfn for node %d\n", nid); 5448 return 0; 5449 } 5450 5451 return min_pfn; 5452 } 5453 5454 /** 5455 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5456 * 5457 * It returns the minimum PFN based on information provided via 5458 * memblock_set_node(). 5459 */ 5460 unsigned long __init find_min_pfn_with_active_regions(void) 5461 { 5462 return find_min_pfn_for_node(MAX_NUMNODES); 5463 } 5464 5465 /* 5466 * early_calculate_totalpages() 5467 * Sum pages in active regions for movable zone. 5468 * Populate N_MEMORY for calculating usable_nodes. 
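 * A node with no registered memory is not marked N_MEMORY here, and the
 * kernelcore spreading below only walks nodes in the N_MEMORY state.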
5469 */ 5470 static unsigned long __init early_calculate_totalpages(void) 5471 { 5472 unsigned long totalpages = 0; 5473 unsigned long start_pfn, end_pfn; 5474 int i, nid; 5475 5476 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5477 unsigned long pages = end_pfn - start_pfn; 5478 5479 totalpages += pages; 5480 if (pages) 5481 node_set_state(nid, N_MEMORY); 5482 } 5483 return totalpages; 5484 } 5485 5486 /* 5487 * Find the PFN the Movable zone begins in each node. Kernel memory 5488 * is spread evenly between nodes as long as the nodes have enough 5489 * memory. When they don't, some nodes will have more kernelcore than 5490 * others 5491 */ 5492 static void __init find_zone_movable_pfns_for_nodes(void) 5493 { 5494 int i, nid; 5495 unsigned long usable_startpfn; 5496 unsigned long kernelcore_node, kernelcore_remaining; 5497 /* save the state before borrow the nodemask */ 5498 nodemask_t saved_node_state = node_states[N_MEMORY]; 5499 unsigned long totalpages = early_calculate_totalpages(); 5500 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5501 struct memblock_region *r; 5502 5503 /* Need to find movable_zone earlier when movable_node is specified. */ 5504 find_usable_zone_for_movable(); 5505 5506 /* 5507 * If movable_node is specified, ignore kernelcore and movablecore 5508 * options. 5509 */ 5510 if (movable_node_is_enabled()) { 5511 for_each_memblock(memory, r) { 5512 if (!memblock_is_hotpluggable(r)) 5513 continue; 5514 5515 nid = r->nid; 5516 5517 usable_startpfn = PFN_DOWN(r->base); 5518 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 5519 min(usable_startpfn, zone_movable_pfn[nid]) : 5520 usable_startpfn; 5521 } 5522 5523 goto out2; 5524 } 5525 5526 /* 5527 * If movablecore=nn[KMG] was specified, calculate what size of 5528 * kernelcore that corresponds so that memory usable for 5529 * any allocation type is evenly spread. If both kernelcore 5530 * and movablecore are specified, then the value of kernelcore 5531 * will be used for required_kernelcore if it's greater than 5532 * what movablecore would have allowed. 5533 */ 5534 if (required_movablecore) { 5535 unsigned long corepages; 5536 5537 /* 5538 * Round-up so that ZONE_MOVABLE is at least as large as what 5539 * was requested by the user 5540 */ 5541 required_movablecore = 5542 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5543 required_movablecore = min(totalpages, required_movablecore); 5544 corepages = totalpages - required_movablecore; 5545 5546 required_kernelcore = max(required_kernelcore, corepages); 5547 } 5548 5549 /* 5550 * If kernelcore was not specified or kernelcore size is larger 5551 * than totalpages, there is no ZONE_MOVABLE. 
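 * Otherwise required_kernelcore is spread evenly over the N_MEMORY nodes
 * below; e.g. kernelcore=2G on a machine with two 4G nodes gives each
 * node roughly 1G of kernelcore, and the remainder of each node becomes
 * ZONE_MOVABLE (its start rounded up to MAX_ORDER_NR_PAGES at out2).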
5552 */ 5553 if (!required_kernelcore || required_kernelcore >= totalpages) 5554 goto out; 5555 5556 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5557 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5558 5559 restart: 5560 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5561 kernelcore_node = required_kernelcore / usable_nodes; 5562 for_each_node_state(nid, N_MEMORY) { 5563 unsigned long start_pfn, end_pfn; 5564 5565 /* 5566 * Recalculate kernelcore_node if the division per node 5567 * now exceeds what is necessary to satisfy the requested 5568 * amount of memory for the kernel 5569 */ 5570 if (required_kernelcore < kernelcore_node) 5571 kernelcore_node = required_kernelcore / usable_nodes; 5572 5573 /* 5574 * As the map is walked, we track how much memory is usable 5575 * by the kernel using kernelcore_remaining. When it is 5576 * 0, the rest of the node is usable by ZONE_MOVABLE 5577 */ 5578 kernelcore_remaining = kernelcore_node; 5579 5580 /* Go through each range of PFNs within this node */ 5581 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5582 unsigned long size_pages; 5583 5584 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5585 if (start_pfn >= end_pfn) 5586 continue; 5587 5588 /* Account for what is only usable for kernelcore */ 5589 if (start_pfn < usable_startpfn) { 5590 unsigned long kernel_pages; 5591 kernel_pages = min(end_pfn, usable_startpfn) 5592 - start_pfn; 5593 5594 kernelcore_remaining -= min(kernel_pages, 5595 kernelcore_remaining); 5596 required_kernelcore -= min(kernel_pages, 5597 required_kernelcore); 5598 5599 /* Continue if range is now fully accounted */ 5600 if (end_pfn <= usable_startpfn) { 5601 5602 /* 5603 * Push zone_movable_pfn to the end so 5604 * that if we have to rebalance 5605 * kernelcore across nodes, we will 5606 * not double account here 5607 */ 5608 zone_movable_pfn[nid] = end_pfn; 5609 continue; 5610 } 5611 start_pfn = usable_startpfn; 5612 } 5613 5614 /* 5615 * The usable PFN range for ZONE_MOVABLE is from 5616 * start_pfn->end_pfn. Calculate size_pages as the 5617 * number of pages used as kernelcore 5618 */ 5619 size_pages = end_pfn - start_pfn; 5620 if (size_pages > kernelcore_remaining) 5621 size_pages = kernelcore_remaining; 5622 zone_movable_pfn[nid] = start_pfn + size_pages; 5623 5624 /* 5625 * Some kernelcore has been met, update counts and 5626 * break if the kernelcore for this node has been 5627 * satisfied 5628 */ 5629 required_kernelcore -= min(required_kernelcore, 5630 size_pages); 5631 kernelcore_remaining -= size_pages; 5632 if (!kernelcore_remaining) 5633 break; 5634 } 5635 } 5636 5637 /* 5638 * If there is still required_kernelcore, we do another pass with one 5639 * less node in the count. This will push zone_movable_pfn[nid] further 5640 * along on the nodes that still have memory until kernelcore is 5641 * satisfied 5642 */ 5643 usable_nodes--; 5644 if (usable_nodes && required_kernelcore > usable_nodes) 5645 goto restart; 5646 5647 out2: 5648 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5649 for (nid = 0; nid < MAX_NUMNODES; nid++) 5650 zone_movable_pfn[nid] = 5651 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5652 5653 out: 5654 /* restore the node_state */ 5655 node_states[N_MEMORY] = saved_node_state; 5656 } 5657 5658 /* Any regular or high memory on that node ? 
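 * Used from free_area_init_nodes() below to set N_HIGH_MEMORY and, where
 * that state is distinct from N_NORMAL_MEMORY, N_NORMAL_MEMORY as well,
 * based on which zones ended up populated.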
*/ 5659 static void check_for_memory(pg_data_t *pgdat, int nid) 5660 { 5661 enum zone_type zone_type; 5662 5663 if (N_MEMORY == N_NORMAL_MEMORY) 5664 return; 5665 5666 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5667 struct zone *zone = &pgdat->node_zones[zone_type]; 5668 if (populated_zone(zone)) { 5669 node_set_state(nid, N_HIGH_MEMORY); 5670 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5671 zone_type <= ZONE_NORMAL) 5672 node_set_state(nid, N_NORMAL_MEMORY); 5673 break; 5674 } 5675 } 5676 } 5677 5678 /** 5679 * free_area_init_nodes - Initialise all pg_data_t and zone data 5680 * @max_zone_pfn: an array of max PFNs for each zone 5681 * 5682 * This will call free_area_init_node() for each active node in the system. 5683 * Using the page ranges provided by memblock_set_node(), the size of each 5684 * zone in each node and their holes is calculated. If the maximum PFN 5685 * between two adjacent zones match, it is assumed that the zone is empty. 5686 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5687 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5688 * starts where the previous one ended. For example, ZONE_DMA32 starts 5689 * at arch_max_dma_pfn. 5690 */ 5691 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5692 { 5693 unsigned long start_pfn, end_pfn; 5694 int i, nid; 5695 5696 /* Record where the zone boundaries are */ 5697 memset(arch_zone_lowest_possible_pfn, 0, 5698 sizeof(arch_zone_lowest_possible_pfn)); 5699 memset(arch_zone_highest_possible_pfn, 0, 5700 sizeof(arch_zone_highest_possible_pfn)); 5701 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5702 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5703 for (i = 1; i < MAX_NR_ZONES; i++) { 5704 if (i == ZONE_MOVABLE) 5705 continue; 5706 arch_zone_lowest_possible_pfn[i] = 5707 arch_zone_highest_possible_pfn[i-1]; 5708 arch_zone_highest_possible_pfn[i] = 5709 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5710 } 5711 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5712 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5713 5714 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5715 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5716 find_zone_movable_pfns_for_nodes(); 5717 5718 /* Print out the zone ranges */ 5719 pr_info("Zone ranges:\n"); 5720 for (i = 0; i < MAX_NR_ZONES; i++) { 5721 if (i == ZONE_MOVABLE) 5722 continue; 5723 pr_info(" %-8s ", zone_names[i]); 5724 if (arch_zone_lowest_possible_pfn[i] == 5725 arch_zone_highest_possible_pfn[i]) 5726 pr_cont("empty\n"); 5727 else 5728 pr_cont("[mem %#018Lx-%#018Lx]\n", 5729 (u64)arch_zone_lowest_possible_pfn[i] 5730 << PAGE_SHIFT, 5731 ((u64)arch_zone_highest_possible_pfn[i] 5732 << PAGE_SHIFT) - 1); 5733 } 5734 5735 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5736 pr_info("Movable zone start for each node\n"); 5737 for (i = 0; i < MAX_NUMNODES; i++) { 5738 if (zone_movable_pfn[i]) 5739 pr_info(" Node %d: %#018Lx\n", i, 5740 (u64)zone_movable_pfn[i] << PAGE_SHIFT); 5741 } 5742 5743 /* Print out the early node map */ 5744 pr_info("Early memory node ranges\n"); 5745 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5746 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, 5747 (u64)start_pfn << PAGE_SHIFT, 5748 ((u64)end_pfn << PAGE_SHIFT) - 1); 5749 5750 /* Initialise every node */ 5751 mminit_verify_pageflags_layout(); 5752 setup_nr_node_ids(); 5753 for_each_online_node(nid) { 5754 pg_data_t *pgdat = NODE_DATA(nid); 
5755 free_area_init_node(nid, NULL, 5756 find_min_pfn_for_node(nid), NULL); 5757 5758 /* Any memory on that node */ 5759 if (pgdat->node_present_pages) 5760 node_set_state(nid, N_MEMORY); 5761 check_for_memory(pgdat, nid); 5762 } 5763 } 5764 5765 static int __init cmdline_parse_core(char *p, unsigned long *core) 5766 { 5767 unsigned long long coremem; 5768 if (!p) 5769 return -EINVAL; 5770 5771 coremem = memparse(p, &p); 5772 *core = coremem >> PAGE_SHIFT; 5773 5774 /* Paranoid check that UL is enough for the coremem value */ 5775 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5776 5777 return 0; 5778 } 5779 5780 /* 5781 * kernelcore=size sets the amount of memory for use for allocations that 5782 * cannot be reclaimed or migrated. 5783 */ 5784 static int __init cmdline_parse_kernelcore(char *p) 5785 { 5786 return cmdline_parse_core(p, &required_kernelcore); 5787 } 5788 5789 /* 5790 * movablecore=size sets the amount of memory for use for allocations that 5791 * can be reclaimed or migrated. 5792 */ 5793 static int __init cmdline_parse_movablecore(char *p) 5794 { 5795 return cmdline_parse_core(p, &required_movablecore); 5796 } 5797 5798 early_param("kernelcore", cmdline_parse_kernelcore); 5799 early_param("movablecore", cmdline_parse_movablecore); 5800 5801 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5802 5803 void adjust_managed_page_count(struct page *page, long count) 5804 { 5805 spin_lock(&managed_page_count_lock); 5806 page_zone(page)->managed_pages += count; 5807 totalram_pages += count; 5808 #ifdef CONFIG_HIGHMEM 5809 if (PageHighMem(page)) 5810 totalhigh_pages += count; 5811 #endif 5812 spin_unlock(&managed_page_count_lock); 5813 } 5814 EXPORT_SYMBOL(adjust_managed_page_count); 5815 5816 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5817 { 5818 void *pos; 5819 unsigned long pages = 0; 5820 5821 start = (void *)PAGE_ALIGN((unsigned long)start); 5822 end = (void *)((unsigned long)end & PAGE_MASK); 5823 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5824 if ((unsigned int)poison <= 0xFF) 5825 memset(pos, poison, PAGE_SIZE); 5826 free_reserved_page(virt_to_page(pos)); 5827 } 5828 5829 if (pages && s) 5830 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5831 s, pages << (PAGE_SHIFT - 10), start, end); 5832 5833 return pages; 5834 } 5835 EXPORT_SYMBOL(free_reserved_area); 5836 5837 #ifdef CONFIG_HIGHMEM 5838 void free_highmem_page(struct page *page) 5839 { 5840 __free_reserved_page(page); 5841 totalram_pages++; 5842 page_zone(page)->managed_pages++; 5843 totalhigh_pages++; 5844 } 5845 #endif 5846 5847 5848 void __init mem_init_print_info(const char *str) 5849 { 5850 unsigned long physpages, codesize, datasize, rosize, bss_size; 5851 unsigned long init_code_size, init_data_size; 5852 5853 physpages = get_num_physpages(); 5854 codesize = _etext - _stext; 5855 datasize = _edata - _sdata; 5856 rosize = __end_rodata - __start_rodata; 5857 bss_size = __bss_stop - __bss_start; 5858 init_data_size = __init_end - __init_begin; 5859 init_code_size = _einittext - _sinittext; 5860 5861 /* 5862 * Detect special cases and adjust section sizes accordingly: 5863 * 1) .init.* may be embedded into .data sections 5864 * 2) .init.text.* may be out of [__init_begin, __init_end], 5865 * please refer to arch/tile/kernel/vmlinux.lds.S. 5866 * 3) .rodata.* may be embedded into .text or .data sections. 
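 *
 * Editor's illustration (not part of the original source): the first
 * adj_init_size() call below boils down to
 *
 *	if (__init_begin <= _sinittext && _sinittext < __init_end &&
 *	    init_data_size > init_code_size)
 *		init_data_size -= init_code_size;
 *
 * i.e. when .init.text is linked inside [__init_begin, __init_end), its size
 * is subtracted from the init data figure so it is not reported twice. The
 * remaining calls handle .init and .rodata being embedded in .text or .data
 * in the same way.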
5867 */ 5868 #define adj_init_size(start, end, size, pos, adj) \ 5869 do { \ 5870 if (start <= pos && pos < end && size > adj) \ 5871 size -= adj; \ 5872 } while (0) 5873 5874 adj_init_size(__init_begin, __init_end, init_data_size, 5875 _sinittext, init_code_size); 5876 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5877 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5878 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5879 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5880 5881 #undef adj_init_size 5882 5883 pr_info("Memory: %luK/%luK available " 5884 "(%luK kernel code, %luK rwdata, %luK rodata, " 5885 "%luK init, %luK bss, %luK reserved, %luK cma-reserved" 5886 #ifdef CONFIG_HIGHMEM 5887 ", %luK highmem" 5888 #endif 5889 "%s%s)\n", 5890 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5891 codesize >> 10, datasize >> 10, rosize >> 10, 5892 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5893 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), 5894 totalcma_pages << (PAGE_SHIFT-10), 5895 #ifdef CONFIG_HIGHMEM 5896 totalhigh_pages << (PAGE_SHIFT-10), 5897 #endif 5898 str ? ", " : "", str ? str : ""); 5899 } 5900 5901 /** 5902 * set_dma_reserve - set the specified number of pages reserved in the first zone 5903 * @new_dma_reserve: The number of pages to mark reserved 5904 * 5905 * The per-cpu batchsize and zone watermarks are determined by managed_pages. 5906 * In the DMA zone, a significant percentage may be consumed by kernel image 5907 * and other unfreeable allocations which can skew the watermarks badly. This 5908 * function may optionally be used to account for unfreeable pages in the 5909 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5910 * smaller per-cpu batchsize. 5911 */ 5912 void __init set_dma_reserve(unsigned long new_dma_reserve) 5913 { 5914 dma_reserve = new_dma_reserve; 5915 } 5916 5917 void __init free_area_init(unsigned long *zones_size) 5918 { 5919 free_area_init_node(0, zones_size, 5920 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5921 } 5922 5923 static int page_alloc_cpu_notify(struct notifier_block *self, 5924 unsigned long action, void *hcpu) 5925 { 5926 int cpu = (unsigned long)hcpu; 5927 5928 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5929 lru_add_drain_cpu(cpu); 5930 drain_pages(cpu); 5931 5932 /* 5933 * Spill the event counters of the dead processor 5934 * into the current processors event counters. 5935 * This artificially elevates the count of the current 5936 * processor. 5937 */ 5938 vm_events_fold_cpu(cpu); 5939 5940 /* 5941 * Zero the differential counters of the dead processor 5942 * so that the vm statistics are consistent. 5943 * 5944 * This is only okay since the processor is dead and cannot 5945 * race with what we are doing. 5946 */ 5947 cpu_vm_stats_fold(cpu); 5948 } 5949 return NOTIFY_OK; 5950 } 5951 5952 void __init page_alloc_init(void) 5953 { 5954 hotcpu_notifier(page_alloc_cpu_notify, 0); 5955 } 5956 5957 /* 5958 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5959 * or min_free_kbytes changes. 
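 *
 * Editor's illustration (made-up numbers, not from the original source): a
 * zone with managed_pages = 100000, lowmem_reserve[] = { 0, 256, 1024 } and
 * a high watermark of 2000 pages contributes min(1024 + 2000, 100000) = 3024
 * pages to totalreserve_pages.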
5960 */ 5961 static void calculate_totalreserve_pages(void) 5962 { 5963 struct pglist_data *pgdat; 5964 unsigned long reserve_pages = 0; 5965 enum zone_type i, j; 5966 5967 for_each_online_pgdat(pgdat) { 5968 for (i = 0; i < MAX_NR_ZONES; i++) { 5969 struct zone *zone = pgdat->node_zones + i; 5970 long max = 0; 5971 5972 /* Find valid and maximum lowmem_reserve in the zone */ 5973 for (j = i; j < MAX_NR_ZONES; j++) { 5974 if (zone->lowmem_reserve[j] > max) 5975 max = zone->lowmem_reserve[j]; 5976 } 5977 5978 /* we treat the high watermark as reserved pages. */ 5979 max += high_wmark_pages(zone); 5980 5981 if (max > zone->managed_pages) 5982 max = zone->managed_pages; 5983 5984 zone->totalreserve_pages = max; 5985 5986 reserve_pages += max; 5987 } 5988 } 5989 totalreserve_pages = reserve_pages; 5990 } 5991 5992 /* 5993 * setup_per_zone_lowmem_reserve - called whenever 5994 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5995 * has a correct pages reserved value, so an adequate number of 5996 * pages are left in the zone after a successful __alloc_pages(). 5997 */ 5998 static void setup_per_zone_lowmem_reserve(void) 5999 { 6000 struct pglist_data *pgdat; 6001 enum zone_type j, idx; 6002 6003 for_each_online_pgdat(pgdat) { 6004 for (j = 0; j < MAX_NR_ZONES; j++) { 6005 struct zone *zone = pgdat->node_zones + j; 6006 unsigned long managed_pages = zone->managed_pages; 6007 6008 zone->lowmem_reserve[j] = 0; 6009 6010 idx = j; 6011 while (idx) { 6012 struct zone *lower_zone; 6013 6014 idx--; 6015 6016 if (sysctl_lowmem_reserve_ratio[idx] < 1) 6017 sysctl_lowmem_reserve_ratio[idx] = 1; 6018 6019 lower_zone = pgdat->node_zones + idx; 6020 lower_zone->lowmem_reserve[j] = managed_pages / 6021 sysctl_lowmem_reserve_ratio[idx]; 6022 managed_pages += lower_zone->managed_pages; 6023 } 6024 } 6025 } 6026 6027 /* update totalreserve_pages */ 6028 calculate_totalreserve_pages(); 6029 } 6030 6031 static void __setup_per_zone_wmarks(void) 6032 { 6033 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 6034 unsigned long lowmem_pages = 0; 6035 struct zone *zone; 6036 unsigned long flags; 6037 6038 /* Calculate total number of !ZONE_HIGHMEM pages */ 6039 for_each_zone(zone) { 6040 if (!is_highmem(zone)) 6041 lowmem_pages += zone->managed_pages; 6042 } 6043 6044 for_each_zone(zone) { 6045 u64 tmp; 6046 6047 spin_lock_irqsave(&zone->lock, flags); 6048 tmp = (u64)pages_min * zone->managed_pages; 6049 do_div(tmp, lowmem_pages); 6050 if (is_highmem(zone)) { 6051 /* 6052 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 6053 * need highmem pages, so cap pages_min to a small 6054 * value here. 6055 * 6056 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 6057 * deltas control asynch page reclaim, and so should 6058 * not be capped for highmem. 6059 */ 6060 unsigned long min_pages; 6061 6062 min_pages = zone->managed_pages / 1024; 6063 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 6064 zone->watermark[WMARK_MIN] = min_pages; 6065 } else { 6066 /* 6067 * If it's a lowmem zone, reserve a number of pages 6068 * proportionate to the zone's size. 
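 *
 * Editor's illustration (made-up numbers): with min_free_kbytes = 65536 and
 * 4KiB pages, pages_min = 65536 >> 2 = 16384. A lowmem zone holding half of
 * all lowmem gets tmp = 16384 / 2 = 8192, so the assignments below yield
 * WMARK_MIN = 8192, WMARK_LOW = 8192 + 2048 = 10240 and
 * WMARK_HIGH = 8192 + 4096 = 12288 pages.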
6069 */ 6070 zone->watermark[WMARK_MIN] = tmp; 6071 } 6072 6073 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 6074 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 6075 6076 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 6077 high_wmark_pages(zone) - low_wmark_pages(zone) - 6078 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 6079 6080 spin_unlock_irqrestore(&zone->lock, flags); 6081 } 6082 6083 /* update totalreserve_pages */ 6084 calculate_totalreserve_pages(); 6085 } 6086 6087 /** 6088 * setup_per_zone_wmarks - called when min_free_kbytes changes 6089 * or when memory is hot-{added|removed} 6090 * 6091 * Ensures that the watermark[min,low,high] values for each zone are set 6092 * correctly with respect to min_free_kbytes. 6093 */ 6094 void setup_per_zone_wmarks(void) 6095 { 6096 mutex_lock(&zonelists_mutex); 6097 __setup_per_zone_wmarks(); 6098 mutex_unlock(&zonelists_mutex); 6099 } 6100 6101 /* 6102 * The inactive anon list should be small enough that the VM never has to 6103 * do too much work, but large enough that each inactive page has a chance 6104 * to be referenced again before it is swapped out. 6105 * 6106 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 6107 * INACTIVE_ANON pages on this zone's LRU, maintained by the 6108 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 6109 * the anonymous pages are kept on the inactive list. 6110 * 6111 * total target max 6112 * memory ratio inactive anon 6113 * ------------------------------------- 6114 * 10MB 1 5MB 6115 * 100MB 1 50MB 6116 * 1GB 3 250MB 6117 * 10GB 10 0.9GB 6118 * 100GB 31 3GB 6119 * 1TB 101 10GB 6120 * 10TB 320 32GB 6121 */ 6122 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 6123 { 6124 unsigned int gb, ratio; 6125 6126 /* Zone size in gigabytes */ 6127 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 6128 if (gb) 6129 ratio = int_sqrt(10 * gb); 6130 else 6131 ratio = 1; 6132 6133 zone->inactive_ratio = ratio; 6134 } 6135 6136 static void __meminit setup_per_zone_inactive_ratio(void) 6137 { 6138 struct zone *zone; 6139 6140 for_each_zone(zone) 6141 calculate_zone_inactive_ratio(zone); 6142 } 6143 6144 /* 6145 * Initialise min_free_kbytes. 6146 * 6147 * For small machines we want it small (128k min). For large machines 6148 * we want it large (64MB max). But it is not linear, because network 6149 * bandwidth does not increase linearly with machine size. 
We use 6150 * 6151 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 6152 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 6153 * 6154 * which yields 6155 * 6156 * 16MB: 512k 6157 * 32MB: 724k 6158 * 64MB: 1024k 6159 * 128MB: 1448k 6160 * 256MB: 2048k 6161 * 512MB: 2896k 6162 * 1024MB: 4096k 6163 * 2048MB: 5792k 6164 * 4096MB: 8192k 6165 * 8192MB: 11584k 6166 * 16384MB: 16384k 6167 */ 6168 int __meminit init_per_zone_wmark_min(void) 6169 { 6170 unsigned long lowmem_kbytes; 6171 int new_min_free_kbytes; 6172 6173 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 6174 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 6175 6176 if (new_min_free_kbytes > user_min_free_kbytes) { 6177 min_free_kbytes = new_min_free_kbytes; 6178 if (min_free_kbytes < 128) 6179 min_free_kbytes = 128; 6180 if (min_free_kbytes > 65536) 6181 min_free_kbytes = 65536; 6182 } else { 6183 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 6184 new_min_free_kbytes, user_min_free_kbytes); 6185 } 6186 setup_per_zone_wmarks(); 6187 refresh_zone_stat_thresholds(); 6188 setup_per_zone_lowmem_reserve(); 6189 setup_per_zone_inactive_ratio(); 6190 return 0; 6191 } 6192 module_init(init_per_zone_wmark_min) 6193 6194 /* 6195 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 6196 * that we can call two helper functions whenever min_free_kbytes 6197 * changes. 6198 */ 6199 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, 6200 void __user *buffer, size_t *length, loff_t *ppos) 6201 { 6202 int rc; 6203 6204 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 6205 if (rc) 6206 return rc; 6207 6208 if (write) { 6209 user_min_free_kbytes = min_free_kbytes; 6210 setup_per_zone_wmarks(); 6211 } 6212 return 0; 6213 } 6214 6215 #ifdef CONFIG_NUMA 6216 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, 6217 void __user *buffer, size_t *length, loff_t *ppos) 6218 { 6219 struct zone *zone; 6220 int rc; 6221 6222 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 6223 if (rc) 6224 return rc; 6225 6226 for_each_zone(zone) 6227 zone->min_unmapped_pages = (zone->managed_pages * 6228 sysctl_min_unmapped_ratio) / 100; 6229 return 0; 6230 } 6231 6232 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 6233 void __user *buffer, size_t *length, loff_t *ppos) 6234 { 6235 struct zone *zone; 6236 int rc; 6237 6238 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 6239 if (rc) 6240 return rc; 6241 6242 for_each_zone(zone) 6243 zone->min_slab_pages = (zone->managed_pages * 6244 sysctl_min_slab_ratio) / 100; 6245 return 0; 6246 } 6247 #endif 6248 6249 /* 6250 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 6251 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 6252 * whenever sysctl_lowmem_reserve_ratio changes. 6253 * 6254 * The reserve ratio obviously has absolutely no relation with the 6255 * minimum watermarks. The lowmem reserve ratio can only make sense 6256 * if in function of the boot time zone sizes. 6257 */ 6258 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, 6259 void __user *buffer, size_t *length, loff_t *ppos) 6260 { 6261 proc_dointvec_minmax(table, write, buffer, length, ppos); 6262 setup_per_zone_lowmem_reserve(); 6263 return 0; 6264 } 6265 6266 /* 6267 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 6268 * cpu. 
It is the fraction of total pages in each zone that a hot per cpu 6269 * pagelist can have before it gets flushed back to buddy allocator. 6270 */ 6271 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, 6272 void __user *buffer, size_t *length, loff_t *ppos) 6273 { 6274 struct zone *zone; 6275 int old_percpu_pagelist_fraction; 6276 int ret; 6277 6278 mutex_lock(&pcp_batch_high_lock); 6279 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 6280 6281 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 6282 if (!write || ret < 0) 6283 goto out; 6284 6285 /* Sanity checking to avoid pcp imbalance */ 6286 if (percpu_pagelist_fraction && 6287 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 6288 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 6289 ret = -EINVAL; 6290 goto out; 6291 } 6292 6293 /* No change? */ 6294 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 6295 goto out; 6296 6297 for_each_populated_zone(zone) { 6298 unsigned int cpu; 6299 6300 for_each_possible_cpu(cpu) 6301 pageset_set_high_and_batch(zone, 6302 per_cpu_ptr(zone->pageset, cpu)); 6303 } 6304 out: 6305 mutex_unlock(&pcp_batch_high_lock); 6306 return ret; 6307 } 6308 6309 #ifdef CONFIG_NUMA 6310 int hashdist = HASHDIST_DEFAULT; 6311 6312 static int __init set_hashdist(char *str) 6313 { 6314 if (!str) 6315 return 0; 6316 hashdist = simple_strtoul(str, &str, 0); 6317 return 1; 6318 } 6319 __setup("hashdist=", set_hashdist); 6320 #endif 6321 6322 /* 6323 * allocate a large system hash table from bootmem 6324 * - it is assumed that the hash table must contain an exact power-of-2 6325 * quantity of entries 6326 * - limit is the number of hash buckets, not the total allocation size 6327 */ 6328 void *__init alloc_large_system_hash(const char *tablename, 6329 unsigned long bucketsize, 6330 unsigned long numentries, 6331 int scale, 6332 int flags, 6333 unsigned int *_hash_shift, 6334 unsigned int *_hash_mask, 6335 unsigned long low_limit, 6336 unsigned long high_limit) 6337 { 6338 unsigned long long max = high_limit; 6339 unsigned long log2qty, size; 6340 void *table = NULL; 6341 6342 /* allow the kernel cmdline to have a say */ 6343 if (!numentries) { 6344 /* round applicable memory size up to nearest megabyte */ 6345 numentries = nr_kernel_pages; 6346 6347 /* It isn't necessary when PAGE_SIZE >= 1MB */ 6348 if (PAGE_SHIFT < 20) 6349 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 6350 6351 /* limit to 1 bucket per 2^scale bytes of low memory */ 6352 if (scale > PAGE_SHIFT) 6353 numentries >>= (scale - PAGE_SHIFT); 6354 else 6355 numentries <<= (PAGE_SHIFT - scale); 6356 6357 /* Make sure we've got at least a 0-order allocation.. 
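 *
 * Editor's illustration of the scaling above (made-up numbers, 4KiB pages):
 * with nr_kernel_pages = 1048576 (4GB) and scale = 14, numentries becomes
 * 1048576 >> (14 - 12) = 262144, i.e. one bucket per 2^14 = 16KB of low
 * memory, before the rounding and clamping below are applied.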
*/ 6358 if (unlikely(flags & HASH_SMALL)) { 6359 /* Makes no sense without HASH_EARLY */ 6360 WARN_ON(!(flags & HASH_EARLY)); 6361 if (!(numentries >> *_hash_shift)) { 6362 numentries = 1UL << *_hash_shift; 6363 BUG_ON(!numentries); 6364 } 6365 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 6366 numentries = PAGE_SIZE / bucketsize; 6367 } 6368 numentries = roundup_pow_of_two(numentries); 6369 6370 /* limit allocation size to 1/16 total memory by default */ 6371 if (max == 0) { 6372 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 6373 do_div(max, bucketsize); 6374 } 6375 max = min(max, 0x80000000ULL); 6376 6377 if (numentries < low_limit) 6378 numentries = low_limit; 6379 if (numentries > max) 6380 numentries = max; 6381 6382 log2qty = ilog2(numentries); 6383 6384 do { 6385 size = bucketsize << log2qty; 6386 if (flags & HASH_EARLY) 6387 table = memblock_virt_alloc_nopanic(size, 0); 6388 else if (hashdist) 6389 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 6390 else { 6391 /* 6392 * If bucketsize is not a power-of-two, we may free 6393 * some pages at the end of hash table which 6394 * alloc_pages_exact() automatically does 6395 */ 6396 if (get_order(size) < MAX_ORDER) { 6397 table = alloc_pages_exact(size, GFP_ATOMIC); 6398 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 6399 } 6400 } 6401 } while (!table && size > PAGE_SIZE && --log2qty); 6402 6403 if (!table) 6404 panic("Failed to allocate %s hash table\n", tablename); 6405 6406 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 6407 tablename, 6408 (1UL << log2qty), 6409 ilog2(size) - PAGE_SHIFT, 6410 size); 6411 6412 if (_hash_shift) 6413 *_hash_shift = log2qty; 6414 if (_hash_mask) 6415 *_hash_mask = (1 << log2qty) - 1; 6416 6417 return table; 6418 } 6419 6420 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 6421 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 6422 unsigned long pfn) 6423 { 6424 #ifdef CONFIG_SPARSEMEM 6425 return __pfn_to_section(pfn)->pageblock_flags; 6426 #else 6427 return zone->pageblock_flags; 6428 #endif /* CONFIG_SPARSEMEM */ 6429 } 6430 6431 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 6432 { 6433 #ifdef CONFIG_SPARSEMEM 6434 pfn &= (PAGES_PER_SECTION-1); 6435 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6436 #else 6437 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 6438 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6439 #endif /* CONFIG_SPARSEMEM */ 6440 } 6441 6442 /** 6443 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 6444 * @page: The page within the block of interest 6445 * @pfn: The target page frame number 6446 * @end_bitidx: The last bit of interest to retrieve 6447 * @mask: mask of bits that the caller is interested in 6448 * 6449 * Return: pageblock_bits flags 6450 */ 6451 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 6452 unsigned long end_bitidx, 6453 unsigned long mask) 6454 { 6455 struct zone *zone; 6456 unsigned long *bitmap; 6457 unsigned long bitidx, word_bitidx; 6458 unsigned long word; 6459 6460 zone = page_zone(page); 6461 bitmap = get_pageblock_bitmap(zone, pfn); 6462 bitidx = pfn_to_bitidx(zone, pfn); 6463 word_bitidx = bitidx / BITS_PER_LONG; 6464 bitidx &= (BITS_PER_LONG-1); 6465 6466 word = bitmap[word_bitidx]; 6467 bitidx += end_bitidx; 6468 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6469 } 6470 6471 /** 6472 * set_pfnblock_flags_mask 
- Set the requested group of flags for a pageblock_nr_pages block of pages 6473 * @page: The page within the block of interest 6474 * @flags: The flags to set 6475 * @pfn: The target page frame number 6476 * @end_bitidx: The last bit of interest 6477 * @mask: mask of bits that the caller is interested in 6478 */ 6479 void set_pfnblock_flags_mask(struct page *page, unsigned long flags, 6480 unsigned long pfn, 6481 unsigned long end_bitidx, 6482 unsigned long mask) 6483 { 6484 struct zone *zone; 6485 unsigned long *bitmap; 6486 unsigned long bitidx, word_bitidx; 6487 unsigned long old_word, word; 6488 6489 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6490 6491 zone = page_zone(page); 6492 bitmap = get_pageblock_bitmap(zone, pfn); 6493 bitidx = pfn_to_bitidx(zone, pfn); 6494 word_bitidx = bitidx / BITS_PER_LONG; 6495 bitidx &= (BITS_PER_LONG-1); 6496 6497 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); 6498 6499 bitidx += end_bitidx; 6500 mask <<= (BITS_PER_LONG - bitidx - 1); 6501 flags <<= (BITS_PER_LONG - bitidx - 1); 6502 6503 word = READ_ONCE(bitmap[word_bitidx]); 6504 for (;;) { 6505 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6506 if (word == old_word) 6507 break; 6508 word = old_word; 6509 } 6510 } 6511 6512 /* 6513 * This function checks whether the pageblock includes unmovable pages or not. 6514 * If @count is not zero, it is okay to have up to @count unmovable pages. 6515 6516 * PageLRU check without isolation or lru_lock could race so that a 6517 * MIGRATE_MOVABLE block might include unmovable pages. This means you can't 6518 * expect this function to be exact. 6519 */ 6520 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6521 bool skip_hwpoisoned_pages) 6522 { 6523 unsigned long pfn, iter, found; 6524 int mt; 6525 6526 /* 6527 * To avoid noisy results, lru_add_drain_all() should be called beforehand. 6528 * A ZONE_MOVABLE zone never contains unmovable pages. 6529 */ 6530 if (zone_idx(zone) == ZONE_MOVABLE) 6531 return false; 6532 mt = get_pageblock_migratetype(page); 6533 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6534 return false; 6535 6536 pfn = page_to_pfn(page); 6537 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6538 unsigned long check = pfn + iter; 6539 6540 if (!pfn_valid_within(check)) 6541 continue; 6542 6543 page = pfn_to_page(check); 6544 6545 /* 6546 * Hugepages are not in LRU lists, but they're movable. 6547 * We need not scan over tail pages because we don't 6548 * handle each tail page individually in migration. 6549 */ 6550 if (PageHuge(page)) { 6551 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6552 continue; 6553 } 6554 6555 /* 6556 * We can't use page_count() without pinning the page 6557 * because another CPU can free the compound page. 6558 * This check already skips compound tails of THP 6559 * because their page->_count is zero at all times. 6560 */ 6561 if (!atomic_read(&page->_count)) { 6562 if (PageBuddy(page)) 6563 iter += (1 << page_order(page)) - 1; 6564 continue; 6565 } 6566 6567 /* 6568 * The HWPoisoned page may not be in the buddy system, and 6569 * its page_count() is not 0. 6570 */ 6571 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6572 continue; 6573 6574 if (!PageLRU(page)) 6575 found++; 6576 /* 6577 * If there are RECLAIMABLE pages, we need to check 6578 * them. But for now, memory offline itself doesn't call 6579 * shrink_node_slabs(), and this still needs to be fixed. 6580 */ 6581 /* 6582 * If the page is not RAM, page_count() should be 0. 6583 * We don't need any further checks.
This is an _used_ not-movable page. 6584 * 6585 * The problematic thing here is PG_reserved pages. PG_reserved 6586 * is set to both of a memory hole page and a _used_ kernel 6587 * page at boot. 6588 */ 6589 if (found > count) 6590 return true; 6591 } 6592 return false; 6593 } 6594 6595 bool is_pageblock_removable_nolock(struct page *page) 6596 { 6597 struct zone *zone; 6598 unsigned long pfn; 6599 6600 /* 6601 * We have to be careful here because we are iterating over memory 6602 * sections which are not zone aware so we might end up outside of 6603 * the zone but still within the section. 6604 * We have to take care about the node as well. If the node is offline 6605 * its NODE_DATA will be NULL - see page_zone. 6606 */ 6607 if (!node_online(page_to_nid(page))) 6608 return false; 6609 6610 zone = page_zone(page); 6611 pfn = page_to_pfn(page); 6612 if (!zone_spans_pfn(zone, pfn)) 6613 return false; 6614 6615 return !has_unmovable_pages(zone, page, 0, true); 6616 } 6617 6618 #ifdef CONFIG_CMA 6619 6620 static unsigned long pfn_max_align_down(unsigned long pfn) 6621 { 6622 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6623 pageblock_nr_pages) - 1); 6624 } 6625 6626 static unsigned long pfn_max_align_up(unsigned long pfn) 6627 { 6628 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6629 pageblock_nr_pages)); 6630 } 6631 6632 /* [start, end) must belong to a single zone. */ 6633 static int __alloc_contig_migrate_range(struct compact_control *cc, 6634 unsigned long start, unsigned long end) 6635 { 6636 /* This function is based on compact_zone() from compaction.c. */ 6637 unsigned long nr_reclaimed; 6638 unsigned long pfn = start; 6639 unsigned int tries = 0; 6640 int ret = 0; 6641 6642 migrate_prep(); 6643 6644 while (pfn < end || !list_empty(&cc->migratepages)) { 6645 if (fatal_signal_pending(current)) { 6646 ret = -EINTR; 6647 break; 6648 } 6649 6650 if (list_empty(&cc->migratepages)) { 6651 cc->nr_migratepages = 0; 6652 pfn = isolate_migratepages_range(cc, pfn, end); 6653 if (!pfn) { 6654 ret = -EINTR; 6655 break; 6656 } 6657 tries = 0; 6658 } else if (++tries == 5) { 6659 ret = ret < 0 ? ret : -EBUSY; 6660 break; 6661 } 6662 6663 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6664 &cc->migratepages); 6665 cc->nr_migratepages -= nr_reclaimed; 6666 6667 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6668 NULL, 0, cc->mode, MR_CMA); 6669 } 6670 if (ret < 0) { 6671 putback_movable_pages(&cc->migratepages); 6672 return ret; 6673 } 6674 return 0; 6675 } 6676 6677 /** 6678 * alloc_contig_range() -- tries to allocate given range of pages 6679 * @start: start PFN to allocate 6680 * @end: one-past-the-last PFN to allocate 6681 * @migratetype: migratetype of the underlaying pageblocks (either 6682 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6683 * in range must have the same migratetype and it must 6684 * be either of the two. 6685 * 6686 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6687 * aligned, however it's the caller's responsibility to guarantee that 6688 * we are the only thread that changes migrate type of pageblocks the 6689 * pages fall in. 6690 * 6691 * The PFN range must belong to a single zone. 6692 * 6693 * Returns zero on success or negative error code. On success all 6694 * pages which PFN is in [start, end) are allocated for the caller and 6695 * need to be freed with free_contig_range(). 
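 *
 * Editor's sketch of a typical call (illustrative only; it assumes the
 * pageblocks in the range were set up as MIGRATE_CMA, as the CMA core does):
 *
 *	ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA);
 *	if (!ret) {
 *		... use pages pfn .. pfn + nr_pages - 1 ...
 *		free_contig_range(pfn, nr_pages);
 *	}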
6696 */ 6697 int alloc_contig_range(unsigned long start, unsigned long end, 6698 unsigned migratetype) 6699 { 6700 unsigned long outer_start, outer_end; 6701 unsigned int order; 6702 int ret = 0; 6703 6704 struct compact_control cc = { 6705 .nr_migratepages = 0, 6706 .order = -1, 6707 .zone = page_zone(pfn_to_page(start)), 6708 .mode = MIGRATE_SYNC, 6709 .ignore_skip_hint = true, 6710 }; 6711 INIT_LIST_HEAD(&cc.migratepages); 6712 6713 /* 6714 * What we do here is we mark all pageblocks in range as 6715 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6716 * have different sizes, and due to the way page allocator 6717 * work, we align the range to biggest of the two pages so 6718 * that page allocator won't try to merge buddies from 6719 * different pageblocks and change MIGRATE_ISOLATE to some 6720 * other migration type. 6721 * 6722 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6723 * migrate the pages from an unaligned range (ie. pages that 6724 * we are interested in). This will put all the pages in 6725 * range back to page allocator as MIGRATE_ISOLATE. 6726 * 6727 * When this is done, we take the pages in range from page 6728 * allocator removing them from the buddy system. This way 6729 * page allocator will never consider using them. 6730 * 6731 * This lets us mark the pageblocks back as 6732 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6733 * aligned range but not in the unaligned, original range are 6734 * put back to page allocator so that buddy can use them. 6735 */ 6736 6737 ret = start_isolate_page_range(pfn_max_align_down(start), 6738 pfn_max_align_up(end), migratetype, 6739 false); 6740 if (ret) 6741 return ret; 6742 6743 /* 6744 * In case of -EBUSY, we'd like to know which page causes problem. 6745 * So, just fall through. We will check it in test_pages_isolated(). 6746 */ 6747 ret = __alloc_contig_migrate_range(&cc, start, end); 6748 if (ret && ret != -EBUSY) 6749 goto done; 6750 6751 /* 6752 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 6753 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6754 * more, all pages in [start, end) are free in page allocator. 6755 * What we are going to do is to allocate all pages from 6756 * [start, end) (that is remove them from page allocator). 6757 * 6758 * The only problem is that pages at the beginning and at the 6759 * end of interesting range may be not aligned with pages that 6760 * page allocator holds, ie. they can be part of higher order 6761 * pages. Because of this, we reserve the bigger range and 6762 * once this is done free the pages we are not interested in. 6763 * 6764 * We don't have to hold zone->lock here because the pages are 6765 * isolated thus they won't get removed from buddy. 6766 */ 6767 6768 lru_add_drain_all(); 6769 drain_all_pages(cc.zone); 6770 6771 order = 0; 6772 outer_start = start; 6773 while (!PageBuddy(pfn_to_page(outer_start))) { 6774 if (++order >= MAX_ORDER) { 6775 outer_start = start; 6776 break; 6777 } 6778 outer_start &= ~0UL << order; 6779 } 6780 6781 if (outer_start != start) { 6782 order = page_order(pfn_to_page(outer_start)); 6783 6784 /* 6785 * outer_start page could be small order buddy page and 6786 * it doesn't include start page. Adjust outer_start 6787 * in this case to report failed page properly 6788 * on tracepoint in test_pages_isolated() 6789 */ 6790 if (outer_start + (1UL << order) <= start) 6791 outer_start = start; 6792 } 6793 6794 /* Make sure the range is really isolated. 
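 *
 * Editor's illustration of the outer_start walk above (made-up PFNs): for
 * start = 0x1234 the loop keeps clearing low bits until it finds a page the
 * buddy allocator owns, e.g. outer_start = 0x1200 at order 6. If that free
 * page really is order 6 it spans [0x1200, 0x1240) and covers start, so
 * outer_start is kept; if it were only order 4 it would end at
 * 0x1210 <= start and outer_start is reset to start.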
*/ 6795 if (test_pages_isolated(outer_start, end, false)) { 6796 pr_info("%s: [%lx, %lx) PFNs busy\n", 6797 __func__, outer_start, end); 6798 ret = -EBUSY; 6799 goto done; 6800 } 6801 6802 /* Grab isolated pages from freelists. */ 6803 outer_end = isolate_freepages_range(&cc, outer_start, end); 6804 if (!outer_end) { 6805 ret = -EBUSY; 6806 goto done; 6807 } 6808 6809 /* Free head and tail (if any) */ 6810 if (start != outer_start) 6811 free_contig_range(outer_start, start - outer_start); 6812 if (end != outer_end) 6813 free_contig_range(end, outer_end - end); 6814 6815 done: 6816 undo_isolate_page_range(pfn_max_align_down(start), 6817 pfn_max_align_up(end), migratetype); 6818 return ret; 6819 } 6820 6821 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6822 { 6823 unsigned int count = 0; 6824 6825 for (; nr_pages--; pfn++) { 6826 struct page *page = pfn_to_page(pfn); 6827 6828 count += page_count(page) != 1; 6829 __free_page(page); 6830 } 6831 WARN(count != 0, "%d pages are still in use!\n", count); 6832 } 6833 #endif 6834 6835 #ifdef CONFIG_MEMORY_HOTPLUG 6836 /* 6837 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6838 * page high values need to be recalulated. 6839 */ 6840 void __meminit zone_pcp_update(struct zone *zone) 6841 { 6842 unsigned cpu; 6843 mutex_lock(&pcp_batch_high_lock); 6844 for_each_possible_cpu(cpu) 6845 pageset_set_high_and_batch(zone, 6846 per_cpu_ptr(zone->pageset, cpu)); 6847 mutex_unlock(&pcp_batch_high_lock); 6848 } 6849 #endif 6850 6851 void zone_pcp_reset(struct zone *zone) 6852 { 6853 unsigned long flags; 6854 int cpu; 6855 struct per_cpu_pageset *pset; 6856 6857 /* avoid races with drain_pages() */ 6858 local_irq_save(flags); 6859 if (zone->pageset != &boot_pageset) { 6860 for_each_online_cpu(cpu) { 6861 pset = per_cpu_ptr(zone->pageset, cpu); 6862 drain_zonestat(zone, pset); 6863 } 6864 free_percpu(zone->pageset); 6865 zone->pageset = &boot_pageset; 6866 } 6867 local_irq_restore(flags); 6868 } 6869 6870 #ifdef CONFIG_MEMORY_HOTREMOVE 6871 /* 6872 * All pages in the range must be isolated before calling this. 6873 */ 6874 void 6875 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6876 { 6877 struct page *page; 6878 struct zone *zone; 6879 unsigned int order, i; 6880 unsigned long pfn; 6881 unsigned long flags; 6882 /* find the first valid pfn */ 6883 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6884 if (pfn_valid(pfn)) 6885 break; 6886 if (pfn == end_pfn) 6887 return; 6888 zone = page_zone(pfn_to_page(pfn)); 6889 spin_lock_irqsave(&zone->lock, flags); 6890 pfn = start_pfn; 6891 while (pfn < end_pfn) { 6892 if (!pfn_valid(pfn)) { 6893 pfn++; 6894 continue; 6895 } 6896 page = pfn_to_page(pfn); 6897 /* 6898 * The HWPoisoned page may be not in buddy system, and 6899 * page_count() is not 0. 
6900 */ 6901 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6902 pfn++; 6903 SetPageReserved(page); 6904 continue; 6905 } 6906 6907 BUG_ON(page_count(page)); 6908 BUG_ON(!PageBuddy(page)); 6909 order = page_order(page); 6910 #ifdef CONFIG_DEBUG_VM 6911 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6912 pfn, 1 << order, end_pfn); 6913 #endif 6914 list_del(&page->lru); 6915 rmv_page_order(page); 6916 zone->free_area[order].nr_free--; 6917 for (i = 0; i < (1 << order); i++) 6918 SetPageReserved((page+i)); 6919 pfn += (1 << order); 6920 } 6921 spin_unlock_irqrestore(&zone->lock, flags); 6922 } 6923 #endif 6924 6925 #ifdef CONFIG_MEMORY_FAILURE 6926 bool is_free_buddy_page(struct page *page) 6927 { 6928 struct zone *zone = page_zone(page); 6929 unsigned long pfn = page_to_pfn(page); 6930 unsigned long flags; 6931 unsigned int order; 6932 6933 spin_lock_irqsave(&zone->lock, flags); 6934 for (order = 0; order < MAX_ORDER; order++) { 6935 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6936 6937 if (PageBuddy(page_head) && page_order(page_head) >= order) 6938 break; 6939 } 6940 spin_unlock_irqrestore(&zone->lock, flags); 6941 6942 return order < MAX_ORDER; 6943 } 6944 #endif 6945
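/*
 * Editor's appendix (illustrative only, not built with the kernel): a tiny
 * userspace model of the pageblock flag packing used by
 * get_pfnblock_flags_mask() and set_pfnblock_flags_mask() above. The
 * constants mirror NR_PAGEBLOCK_BITS == 4; the pageblock order of 9 is a
 * hypothetical value, and the cmpxchg() loop is replaced by a plain
 * read-modify-write since the model has no concurrency.
 */
#if 0 /* editor's sketch, excluded from the build */
#include <stdio.h>

#define MODEL_NR_PAGEBLOCK_BITS	4UL
#define MODEL_PAGEBLOCK_ORDER	9UL
#define MODEL_BITS_PER_LONG	(8 * sizeof(unsigned long))

/* bit index of the first flag bit for the pageblock containing @pfn */
static unsigned long model_pfn_to_bitidx(unsigned long pfn)
{
	return (pfn >> MODEL_PAGEBLOCK_ORDER) * MODEL_NR_PAGEBLOCK_BITS;
}

/* read the @mask-wide field ending at @end_bitidx for @pfn's block */
static unsigned long model_get_flags(unsigned long *bitmap, unsigned long pfn,
				     unsigned long end_bitidx, unsigned long mask)
{
	unsigned long bitidx = model_pfn_to_bitidx(pfn);
	unsigned long word_bitidx = bitidx / MODEL_BITS_PER_LONG;
	unsigned long word = bitmap[word_bitidx];

	bitidx &= (MODEL_BITS_PER_LONG - 1);
	bitidx += end_bitidx;
	return (word >> (MODEL_BITS_PER_LONG - bitidx - 1)) & mask;
}

/* write @flags (under @mask) into the field ending at @end_bitidx */
static void model_set_flags(unsigned long *bitmap, unsigned long flags,
			    unsigned long pfn, unsigned long end_bitidx,
			    unsigned long mask)
{
	unsigned long bitidx = model_pfn_to_bitidx(pfn);
	unsigned long word_bitidx = bitidx / MODEL_BITS_PER_LONG;

	bitidx &= (MODEL_BITS_PER_LONG - 1);
	bitidx += end_bitidx;
	mask <<= (MODEL_BITS_PER_LONG - bitidx - 1);
	flags <<= (MODEL_BITS_PER_LONG - bitidx - 1);
	bitmap[word_bitidx] = (bitmap[word_bitidx] & ~mask) | flags;
}

int main(void)
{
	unsigned long bitmap[4] = { 0 };

	/* store value 2 in the 3-bit migratetype field of pfn 1000's block */
	model_set_flags(bitmap, 2, 1000, 2, (1UL << 3) - 1);
	printf("flags: %lu\n", model_get_flags(bitmap, 1000, 2, (1UL << 3) - 1));
	return 0;
}
#endif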