/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
long nr_swap_pages;
int percpu_pagelist_fraction;

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,
};

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
};

int min_free_kbytes = 1024;

unsigned long __meminitdata nr_kernel_pages;
unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;

#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
/*
 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
 * ranges of memory (RAM) that may be registered with add_active_range().
 * Ranges passed to add_active_range() will be merged if possible
 * so the number of times add_active_range() can be called is
 * related to the number of nodes and the number of holes
 */
#ifdef CONFIG_MAX_ACTIVE_REGIONS
  /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
  #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
#else
  #if MAX_NUMNODES >= 32
    /* If there can be many nodes, allow up to 50 holes per node */
    #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
  #else
    /* By default, allow up to 256 distinct regions */
    #define MAX_ACTIVE_REGIONS 256
  #endif
#endif

static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
static int __meminitdata nr_nodemap_entries;
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
EXPORT_SYMBOL(nr_node_ids);
#endif

int page_group_by_mobility_disabled __read_mostly;

static void set_pageblock_migratetype(struct page *page, int migratetype)
{
	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);

	do {
		seq = zone_span_seqbegin(zone);
		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
			ret = 1;
		else if (pfn < zone->zone_start_pfn)
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page)
{
	printk(KERN_EMERG "Bad page state in process '%s'\n"
		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
		KERN_EMERG "Backtrace:\n",
		current->comm, page, (int)(2*sizeof(unsigned long)),
		(unsigned long)page->flags, page->mapping,
		page_mapcount(page), page_count(page));
	dump_stack();
	page->flags &= ~(1 << PG_lru |
			1 << PG_private |
			1 << PG_locked |
			1 << PG_active |
			1 << PG_dirty |
			1 << PG_reclaim |
			1 << PG_slab |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_buddy );
	set_page_count(page, 0);
	reset_page_mapcount(page);
	page->mapping = NULL;
	add_taint(TAINT_BAD_PAGE);
}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page".
 *
 * The remaining PAGE_SIZE pages are called "tail pages".
 *
 * All pages have PG_compound set.  All pages have their ->private pointing at
 * the head page (even the head page has this).
 *
 * The first tail page's ->lru.next holds the address of the compound page's
 * put_page() function.  Its ->lru.prev holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

static void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

static void prep_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, free_compound_page);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		__SetPageTail(p);
		p->first_page = page;
	}
}

static void destroy_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	if (unlikely(compound_order(page) != order))
		bad_page(page);

	if (unlikely(!PageHead(page)))
		bad_page(page);
	__ClearPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		if (unlikely(!PageTail(p) |
				(p->first_page != page)))
			bad_page(page);
		__ClearPageTail(p);
	}
}

static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
	int i;

	VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
	/*
	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
	 * and __GFP_HIGHMEM from hard or soft interrupt context.
	 */
	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
	for (i = 0; i < (1 << order); i++)
		clear_highpage(page + i);
}

static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (B1) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
	unsigned long buddy_idx = page_idx ^ (1 << order);

	return page + (buddy_idx - page_idx);
}

static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return (page_idx & ~(1 << order));
}

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we use PG_buddy.
 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
								int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)))
		return 0;

	if (page_zone_id(page) != page_zone_id(buddy))
		return 0;

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		BUG_ON(page_count(buddy) != 0);
		return 1;
	}
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PG_buddy. Page's
 * order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */
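
/*
 * Illustrative walk-through of the merge loop below (for example only):
 * freeing an order-1 block at index 8 first checks its order-1 buddy at
 * index 8 ^ 2 = 10.  If that block is free and of the same order, the two
 * are combined at index 8 & ~2 = 8 as an order-2 block, whose order-2
 * buddy is 8 ^ 4 = 12, and so on until a buddy is busy or MAX_ORDER-1
 * is reached.
 */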

static inline void __free_one_page(struct page *page,
		struct zone *zone, unsigned int order)
{
	unsigned long page_idx;
	int order_size = 1 << order;
	int migratetype = get_pageblock_migratetype(page);

	if (unlikely(PageCompound(page)))
		destroy_compound_page(page, order);

	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

	VM_BUG_ON(page_idx & (order_size - 1));
	VM_BUG_ON(bad_range(zone, page));

	__mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
	while (order < MAX_ORDER-1) {
		unsigned long combined_idx;
		struct page *buddy;

		buddy = __page_find_buddy(page, page_idx, order);
		if (!page_is_buddy(page, buddy, order))
			break;		/* Move the buddy up one level. */

		list_del(&buddy->lru);
		zone->free_area[order].nr_free--;
		rmv_page_order(buddy);
		combined_idx = __find_combined_index(page_idx, order);
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	set_page_order(page, order);
	list_add(&page->lru,
		&zone->free_area[order].free_list[migratetype]);
	zone->free_area[order].nr_free++;
}

static inline int free_pages_check(struct page *page)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL) |
		(page_count(page) != 0) |
		(page->flags & (
			1 << PG_lru |
			1 << PG_private |
			1 << PG_locked |
			1 << PG_active |
			1 << PG_slab |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);
	if (PageDirty(page))
		__ClearPageDirty(page);
	/*
	 * For now, we report if PG_reserved was found set, but do not
	 * clear it, and do not free the page.  But we shall soon need
	 * to do more, for when the ZERO_PAGE count wraps negative.
	 */
	return PageReserved(page);
}

/*
 * Frees a list of pages.
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
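/*
 * Callers of free_pages_bulk() and free_one_page() disable interrupts
 * before calling, which is why a plain spin_lock() on zone->lock is
 * sufficient here.
 */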
static void free_pages_bulk(struct zone *zone, int count,
					struct list_head *list, int order)
{
	spin_lock(&zone->lock);
	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
	zone->pages_scanned = 0;
	while (count--) {
		struct page *page;

		VM_BUG_ON(list_empty(list));
		page = list_entry(list->prev, struct page, lru);
		/* have to delete it as __free_one_page list manipulates */
		list_del(&page->lru);
		__free_one_page(page, zone, order);
	}
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone, struct page *page, int order)
{
	spin_lock(&zone->lock);
	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
	zone->pages_scanned = 0;
	__free_one_page(page, zone, order);
	spin_unlock(&zone->lock);
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int i;
	int reserved = 0;

	for (i = 0 ; i < (1 << order) ; ++i)
		reserved += free_pages_check(page + i);
	if (reserved)
		return;

	if (!PageHighMem(page))
		debug_check_no_locks_freed(page_address(page), PAGE_SIZE<<order);
	arch_free_page(page, order);
	kernel_map_pages(page, 1 << order, 0);

	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	free_one_page(page_zone(page), page, order);
	local_irq_restore(flags);
}

/*
 * permit the bootmem allocator to evade page validation on high-order frees
 */
void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
{
	if (order == 0) {
		__ClearPageReserved(page);
		set_page_count(page, 0);
		set_page_refcounted(page);
		__free_page(page);
	} else {
		int loop;

		prefetchw(page);
		for (loop = 0; loop < BITS_PER_LONG; loop++) {
			struct page *p = &page[loop];

			if (loop + 1 < BITS_PER_LONG)
				prefetchw(p + 1);
			__ClearPageReserved(p);
			set_page_count(p, 0);
		}

		set_page_refcounted(page);
		__free_pages(page, order);
	}
}


/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 */
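/*
 * For example: serving an order-0 request from an order-3 free block,
 * expand() walks high (3) down to low (0), returning the upper halves of
 * size 4, 2 and 1 pages to the corresponding free lists and handing out
 * the remaining single page.
 */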
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON(bad_range(zone, &page[size]));
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}

/*
 * This page is about to be returned from the page allocator
 */
static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL) |
		(page_count(page) != 0) |
		(page->flags & (
			1 << PG_lru |
			1 << PG_private |
			1 << PG_locked |
			1 << PG_active |
			1 << PG_dirty |
			1 << PG_slab |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);

	/*
	 * For now, we report if PG_reserved was found set, but do not
	 * clear it, and do not allocate the page: as a safety net.
	 */
	if (PageReserved(page))
		return 1;

	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
			1 << PG_referenced | 1 << PG_arch_1 |
			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	kernel_map_pages(page, 1 << order, 1);

	if (gfp_flags & __GFP_ZERO)
		prep_zero_page(page, order, gfp_flags);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	return 0;
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		if (list_empty(&area->free_list[migratetype]))
			continue;

		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		__mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
		expand(zone, page, order, current_order, area, migratetype);
		return page;
	}

	return NULL;
}


/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 */
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
};

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_page are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 */
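/*
 * Both move_freepages() and move_freepages_block() manipulate the buddy
 * free lists directly, so callers are expected to hold zone->lock.
 */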
int move_freepages(struct zone *zone,
			struct page *start_page, struct page *end_page,
			int migratetype)
{
	struct page *page;
	unsigned long order;
	int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
	/*
	 * page_zone is not safe to call in this context when
	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
	 * anyway as we check zone boundaries in move_freepages_block().
	 * Remove at a later date when no bug reports exist related to
	 * grouping pages by mobility
	 */
	BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

	for (page = start_page; page <= end_page;) {
		if (!pfn_valid_within(page_to_pfn(page))) {
			page++;
			continue;
		}

		if (!PageBuddy(page)) {
			page++;
			continue;
		}

		order = page_order(page);
		list_del(&page->lru);
		list_add(&page->lru,
			&zone->free_area[order].free_list[migratetype]);
		page += 1 << order;
		pages_moved += 1 << order;
	}

	return pages_moved;
}

int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
{
	unsigned long start_pfn, end_pfn;
	struct page *start_page, *end_page;

	start_pfn = page_to_pfn(page);
	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
	start_page = pfn_to_page(start_pfn);
	end_page = start_page + pageblock_nr_pages - 1;
	end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* Do not cross zone boundaries */
	if (start_pfn < zone->zone_start_pfn)
		start_page = page;
	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
		return 0;

	return move_freepages(zone, start_page, end_page, migratetype);
}

/* Return the page with the lowest PFN in the list */
static struct page *min_page(struct list_head *list)
{
	unsigned long min_pfn = -1UL;
	struct page *min_page = NULL, *page;

	list_for_each_entry(page, list, lru) {
		unsigned long pfn = page_to_pfn(page);
		if (pfn < min_pfn) {
			min_pfn = pfn;
			min_page = page;
		}
	}

	return min_page;
}

/* Remove an element from the buddy allocator from the fallback list */
static struct page *__rmqueue_fallback(struct zone *zone, int order,
						int start_migratetype)
{
	struct free_area *area;
	int current_order;
	struct page *page;
	int migratetype, i;

	/* Find the largest possible block of pages in the other list */
	for (current_order = MAX_ORDER-1; current_order >= order;
						--current_order) {
		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
			migratetype = fallbacks[start_migratetype][i];

			/* MIGRATE_RESERVE handled later if necessary */
			if (migratetype == MIGRATE_RESERVE)
				continue;

			area = &(zone->free_area[current_order]);
			if (list_empty(&area->free_list[migratetype]))
				continue;

			/* Bias kernel allocations towards low pfns */
			page = list_entry(area->free_list[migratetype].next,
					struct page, lru);
			if (unlikely(start_migratetype != MIGRATE_MOVABLE))
				page = min_page(&area->free_list[migratetype]);
			area->nr_free--;

			/*
			 * If breaking a large block of pages, move all free
			 * pages to the preferred allocation list. If falling
			 * back for a reclaimable kernel allocation, be more
			 * aggressive about taking ownership of free pages
			 */
			if (unlikely(current_order >= (pageblock_order >> 1)) ||
					start_migratetype == MIGRATE_RECLAIMABLE) {
				unsigned long pages;
				pages = move_freepages_block(zone, page,
								start_migratetype);

				/* Claim the whole block if over half of it is free */
				if (pages >= (1 << (pageblock_order-1)))
					set_pageblock_migratetype(page,
								start_migratetype);

				migratetype = start_migratetype;
			}

			/* Remove the page from the freelists */
			list_del(&page->lru);
			rmv_page_order(page);
			__mod_zone_page_state(zone, NR_FREE_PAGES,
							-(1UL << order));

			if (current_order == pageblock_order)
				set_pageblock_migratetype(page,
							start_migratetype);

			expand(zone, page, order, current_order, area, migratetype);
			return page;
		}
	}

	/* Use MIGRATE_RESERVE rather than fail an allocation */
	return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct page *page;

	page = __rmqueue_smallest(zone, order, migratetype);

	if (unlikely(!page))
		page = __rmqueue_fallback(zone, order, migratetype);

	return page;
}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype)
{
	int i;

	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype);
		if (unlikely(page == NULL))
			break;
		list_add(&page->lru, list);
		set_page_private(page, migratetype);
	}
	spin_unlock(&zone->lock);
	return i;
}

#ifdef CONFIG_NUMA
/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 *
 * Note that this function must be called with the thread pinned to
 * a single processor.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	int to_drain;

	local_irq_save(flags);
	if (pcp->count >= pcp->batch)
		to_drain = pcp->batch;
	else
		to_drain = pcp->count;
	free_pages_bulk(zone, to_drain, &pcp->list, 0);
	pcp->count -= to_drain;
	local_irq_restore(flags);
}
#endif

static void __drain_pages(unsigned int cpu)
{
	unsigned long flags;
	struct zone *zone;
	int i;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset;

		if (!populated_zone(zone))
			continue;

		pset = zone_pcp(zone, cpu);
		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp;

			pcp = &pset->pcp[i];
			local_irq_save(flags);
			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
			pcp->count = 0;
			local_irq_restore(flags);
		}
	}
}

#ifdef CONFIG_HIBERNATION

void mark_free_pages(struct zone *zone)
{
	unsigned long pfn, max_zone_pfn;
	unsigned long flags;
	int order, t;
	struct list_head *curr;

	if (!zone->spanned_pages)
		return;

	spin_lock_irqsave(&zone->lock, flags);

	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
		if (pfn_valid(pfn)) {
			struct page *page = pfn_to_page(pfn);

			if (!swsusp_page_is_forbidden(page))
				swsusp_unset_page_free(page);
		}

	for_each_migratetype_order(order, t) {
		list_for_each(curr, &zone->free_area[order].free_list[t]) {
			unsigned long i;

			pfn = page_to_pfn(list_entry(curr, struct page, lru));
			for (i = 0; i < (1UL << order); i++)
				swsusp_set_page_free(pfn_to_page(pfn + i));
		}
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif /* CONFIG_HIBERNATION */

/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 */
void drain_local_pages(void)
{
	unsigned long flags;

	local_irq_save(flags);
	__drain_pages(smp_processor_id());
	local_irq_restore(flags);
}

void smp_drain_local_pages(void *arg)
{
	drain_local_pages();
}

/*
 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
 */
void drain_all_local_pages(void)
{
	unsigned long flags;

	local_irq_save(flags);
	__drain_pages(smp_processor_id());
	local_irq_restore(flags);

	smp_call_function(smp_drain_local_pages, NULL, 0, 1);
}

/*
 * Free a 0-order page
 */
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *pcp;
	unsigned long flags;

	if (PageAnon(page))
		page->mapping = NULL;
	if (free_pages_check(page))
		return;

	if (!PageHighMem(page))
		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
	arch_free_page(page, 0);
	kernel_map_pages(page, 1, 0);

	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
	local_irq_save(flags);
	__count_vm_event(PGFREE);
	list_add(&page->lru, &pcp->list);
	set_page_private(page, get_pageblock_migratetype(page));
	pcp->count++;
	if (pcp->count >= pcp->high) {
		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
		pcp->count -= pcp->batch;
	}
	local_irq_restore(flags);
	put_cpu();
}

void fastcall free_hot_page(struct page *page)
{
	free_hot_cold_page(page, 0);
}

void fastcall free_cold_page(struct page *page)
{
	free_hot_cold_page(page, 1);
}

/*
 * split_page takes a non-compound higher-order page, and splits it into
 * n (1<<order) sub-pages: page[0..n]
 * Each sub-page must be freed individually.
 *
 * Note: this is probably too low level an operation for use in drivers.
 * Please consult with lkml before using this in your driver.
 */
void split_page(struct page *page, unsigned int order)
{
	int i;

	VM_BUG_ON(PageCompound(page));
	VM_BUG_ON(!page_count(page));
	for (i = 1; i < (1 << order); i++)
		set_page_refcounted(page + i);
}

/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
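/*
 * Summary of the fast path below: order-0 requests are served from the
 * per-cpu hot/cold page lists (refilled via rmqueue_bulk() when empty),
 * while higher-order requests go straight to the buddy lists under
 * zone->lock.
 */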
static struct page *buffered_rmqueue(struct zonelist *zonelist,
			struct zone *zone, int order, gfp_t gfp_flags)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);
	int cpu;
	int migratetype = allocflags_to_migratetype(gfp_flags);

again:
	cpu = get_cpu();
	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;

		pcp = &zone_pcp(zone, cpu)->pcp[cold];
		local_irq_save(flags);
		if (!pcp->count) {
			pcp->count = rmqueue_bulk(zone, 0,
					pcp->batch, &pcp->list, migratetype);
			if (unlikely(!pcp->count))
				goto failed;
		}

		/* Find a page of the appropriate migrate type */
		list_for_each_entry(page, &pcp->list, lru)
			if (page_private(page) == migratetype)
				break;

		/* Allocate more to the pcp list if necessary */
		if (unlikely(&page->lru == &pcp->list)) {
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, &pcp->list, migratetype);
			page = list_entry(pcp->list.next, struct page, lru);
		}

		list_del(&page->lru);
		pcp->count--;
	} else {
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order, migratetype);
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
	}

	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(zonelist, zone);
	local_irq_restore(flags);
	put_cpu();

	VM_BUG_ON(bad_range(zone, page));
	if (prep_new_page(page, order, gfp_flags))
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	put_cpu();
	return NULL;
}

#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
#define ALLOC_HARDER		0x10 /* try to alloc harder */
#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET		0x40 /* check for correct cpuset */

#ifdef CONFIG_FAIL_PAGE_ALLOC

static struct fail_page_alloc_attr {
	struct fault_attr attr;

	u32 ignore_gfp_highmem;
	u32 ignore_gfp_wait;
	u32 min_order;

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

	struct dentry *ignore_gfp_highmem_file;
	struct dentry *ignore_gfp_wait_file;
	struct dentry *min_order_file;

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

} fail_page_alloc = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_gfp_wait = 1,
	.ignore_gfp_highmem = 1,
	.min_order = 1,
};

static int __init setup_fail_page_alloc(char *str)
{
	return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
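/*
 * The "fail_page_alloc=" boot parameter is parsed by the generic
 * setup_fault_attr() helper; see Documentation/fault-injection/ for the
 * accepted format and for the debugfs knobs created below.
 */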

static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	if (order < fail_page_alloc.min_order)
		return 0;
	if (gfp_mask & __GFP_NOFAIL)
		return 0;
	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
		return 0;
	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
		return 0;

	return should_fail(&fail_page_alloc.attr, 1 << order);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_page_alloc_debugfs(void)
{
	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;
	int err;

	err = init_fault_attr_dentries(&fail_page_alloc.attr,
				       "fail_page_alloc");
	if (err)
		return err;
	dir = fail_page_alloc.attr.dentries.dir;

	fail_page_alloc.ignore_gfp_wait_file =
		debugfs_create_bool("ignore-gfp-wait", mode, dir,
				      &fail_page_alloc.ignore_gfp_wait);

	fail_page_alloc.ignore_gfp_highmem_file =
		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
				      &fail_page_alloc.ignore_gfp_highmem);
	fail_page_alloc.min_order_file =
		debugfs_create_u32("min-order", mode, dir,
				   &fail_page_alloc.min_order);

	if (!fail_page_alloc.ignore_gfp_wait_file ||
			!fail_page_alloc.ignore_gfp_highmem_file ||
			!fail_page_alloc.min_order_file) {
		err = -ENOMEM;
		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
		debugfs_remove(fail_page_alloc.min_order_file);
		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
	}

	return err;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#else /* CONFIG_FAIL_PAGE_ALLOC */

static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return 0;
}

#endif /* CONFIG_FAIL_PAGE_ALLOC */

/*
 * Return 1 if free pages are above 'mark'. This takes into account the order
 * of the allocation.
 */
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int alloc_flags)
{
	/* free_pages may go negative - that's OK */
	long min = mark;
	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
	int o;

	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;

	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return 0;
	for (o = 0; o < order; o++) {
		/* At the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;

		/* Require fewer higher order pages to be free */
		min >>= 1;

		if (free_pages <= min)
			return 0;
	}
	return 1;
}

#ifdef CONFIG_NUMA
/*
 * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
 * skip over zones that are not allowed by the cpuset, or that have
 * been recently (in last second) found to be nearly full.  See further
 * comments in mmzone.h.  Reduces cache footprint of zonelist scans
 * that have to skip over a lot of full or unallowed zones.
 *
 * If the zonelist cache is present in the passed in zonelist, then
 * returns a pointer to the allowed node mask (either the current
 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
 *
 * If the zonelist cache is not available for this zonelist, does
 * nothing and returns NULL.
 *
 * If the fullzones BITMAP in the zonelist cache is stale (more than
 * a second since last zap'd) then we zap it out (clear its bits.)
 *
 * We hold off even calling zlc_setup, until after we've checked the
 * first zone in the zonelist, on the theory that most allocations will
 * be satisfied from that first zone, so best to examine that zone as
 * quickly as we can.
 */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	nodemask_t *allowednodes;	/* zonelist_cache approximation */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)
		return NULL;

	if (jiffies - zlc->last_full_zap > 1 * HZ) {
		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
		zlc->last_full_zap = jiffies;
	}

	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
					&cpuset_current_mems_allowed :
					&node_states[N_HIGH_MEMORY];
	return allowednodes;
}

/*
 * Given 'z' scanning a zonelist, run a couple of quick checks to see
 * if it is worth looking at further for free memory:
 *  1) Check that the zone isn't thought to be full (doesn't have its
 *     bit set in the zonelist_cache fullzones BITMAP).
 *  2) Check that the zones node (obtained from the zonelist_cache
 *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
 * Return true (non-zero) if zone is worth looking at further, or
 * else return false (zero) if it is not.
 *
 * This check -ignores- the distinction between various watermarks,
 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
 * found to be full for any variation of these watermarks, it will
 * be considered full for up to one second by all requests, unless
 * we are so low on memory on all allowed nodes that we are forced
 * into the second scan of the zonelist.
 *
 * In the second scan we ignore this zonelist cache and exactly
 * apply the watermarks to all zones, even though it is slower to do so.
 * We are low on memory in the second scan, and should leave no stone
 * unturned looking for a free page.
 */
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
						nodemask_t *allowednodes)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	int i;				/* index of *z in zonelist zones */
	int n;				/* node that zone *z is on */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)
		return 1;

	i = z - zonelist->zones;
	n = zlc->z_to_n[i];

	/* This zone is worth trying if it is allowed but not full */
	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
}

/*
 * Given 'z' scanning a zonelist, set the corresponding bit in
 * zlc->fullzones, so that subsequent attempts to allocate a page
 * from that zone don't waste time re-examining it.
 */
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	int i;				/* index of *z in zonelist zones */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)
		return;

	i = z - zonelist->zones;

	set_bit(i, zlc->fullzones);
}

#else	/* CONFIG_NUMA */

static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
	return NULL;
}

static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
				nodemask_t *allowednodes)
{
	return 1;
}

static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
{
}
#endif	/* CONFIG_NUMA */

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
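/*
 * In short: each candidate zone is skipped if the zonelist cache says it
 * is full or its node is not allowed, is checked against the requested
 * watermark (trying zone_reclaim() first if zone_reclaim_mode is set),
 * and is marked full in the cache when an allocation from it fails.
 */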
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist, int alloc_flags)
{
	struct zone **z;
	struct page *page = NULL;
	int classzone_idx = zone_idx(zonelist->zones[0]);
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
	enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */

zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	z = zonelist->zones;

	do {
		/*
		 * In NUMA, this could be a policy zonelist which contains
		 * zones that may not be allowed by the current gfp_mask.
		 * Check the zone is allowed by the current flags
		 */
		if (unlikely(alloc_should_filter_zonelist(zonelist))) {
			if (highest_zoneidx == -1)
				highest_zoneidx = gfp_zone(gfp_mask);
			if (zone_idx(*z) > highest_zoneidx)
				continue;
		}

		if (NUMA_BUILD && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;
		zone = *z;
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))
				goto try_next_zone;

		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
			unsigned long mark;
			if (alloc_flags & ALLOC_WMARK_MIN)
				mark = zone->pages_min;
			else if (alloc_flags & ALLOC_WMARK_LOW)
				mark = zone->pages_low;
			else
				mark = zone->pages_high;
			if (!zone_watermark_ok(zone, order, mark,
				    classzone_idx, alloc_flags)) {
				if (!zone_reclaim_mode ||
				    !zone_reclaim(zone, gfp_mask, order))
					goto this_zone_full;
			}
		}

		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
		if (page)
			break;
this_zone_full:
		if (NUMA_BUILD)
			zlc_mark_zone_full(zonelist, z);
try_next_zone:
		if (NUMA_BUILD && !did_zlc_setup) {
			/* we do zlc_setup after the first zone is tried */
			allowednodes = zlc_setup(zonelist, alloc_flags);
			zlc_active = 1;
			did_zlc_setup = 1;
		}
	} while (*(++z) != NULL);

	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;
	}
	return page;
}

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
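/*
 * Typical usage (illustration only): callers normally go through the
 * alloc_pages()/__get_free_pages() wrappers rather than calling
 * __alloc_pages() directly, e.g.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *	if (page)
 *		__free_pages(page, 2);
 */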
struct page * fastcall
__alloc_pages(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	const gfp_t wait = gfp_mask & __GFP_WAIT;
	struct zone **z;
	struct page *page;
	struct reclaim_state reclaim_state;
	struct task_struct *p = current;
	int do_retry;
	int alloc_flags;
	int did_some_progress;

	might_sleep_if(wait);

	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

restart:
	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */

	if (unlikely(*z == NULL)) {
		/*
		 * Happens if we have an empty zonelist as a result of
		 * GFP_THISNODE being used on a memoryless node
		 */
		return NULL;
	}

	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
	if (page)
		goto got_pg;

	/*
	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
	 * using a larger set of nodes after it has established that the
	 * allowed per node queues are empty and that nodes are
	 * over allocated.
	 */
	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
		goto nopage;

	for (z = zonelist->zones; *z; z++)
		wakeup_kswapd(*z, order);

	/*
	 * OK, we're below the kswapd watermark and have kicked background
	 * reclaim. Now things get more complex, so set up alloc_flags according
	 * to how we want to proceed.
	 *
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has realtime scheduling
	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
	 */
	alloc_flags = ALLOC_WMARK_MIN;
	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
		alloc_flags |= ALLOC_HARDER;
	if (gfp_mask & __GFP_HIGH)
		alloc_flags |= ALLOC_HIGH;
	if (wait)
		alloc_flags |= ALLOC_CPUSET;

	/*
	 * Go through the zonelist again. Let __GFP_HIGH and allocations
	 * coming from realtime tasks go deeper into reserves.
	 *
	 * This is the last chance, in general, before the goto nopage.
	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
	if (page)
		goto got_pg;

	/* This allocation should allow future memory freeing. */

rebalance:
	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
			&& !in_interrupt()) {
		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
			/* go through the zonelist yet again, ignoring mins */
			page = get_page_from_freelist(gfp_mask, order,
				zonelist, ALLOC_NO_WATERMARKS);
			if (page)
				goto got_pg;
			if (gfp_mask & __GFP_NOFAIL) {
				congestion_wait(WRITE, HZ/50);
				goto nofail_alloc;
			}
		}
		goto nopage;
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

	cond_resched();

	/* We now go into synchronous reclaim */
	cpuset_memory_pressure_bump();
	p->flags |= PF_MEMALLOC;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);

	p->reclaim_state = NULL;
	p->flags &= ~PF_MEMALLOC;

	cond_resched();

	if (order != 0)
		drain_all_local_pages();

	if (likely(did_some_progress)) {
		page = get_page_from_freelist(gfp_mask, order,
						zonelist, alloc_flags);
		if (page)
			goto got_pg;
	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
		if (!try_set_zone_oom(zonelist)) {
			schedule_timeout_uninterruptible(1);
			goto restart;
		}

		/*
		 * Go through the zonelist yet one more time, keep
		 * very high watermark here, this is only to catch
		 * a parallel oom killing, we must fail if we're still
		 * under heavy pressure.
		 */
		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
		if (page) {
			clear_zonelist_oom(zonelist);
			goto got_pg;
		}

		/* The OOM killer will not help higher order allocs so fail */
		if (order > PAGE_ALLOC_COSTLY_ORDER) {
			clear_zonelist_oom(zonelist);
			goto nopage;
		}

		out_of_memory(zonelist, gfp_mask, order);
		clear_zonelist_oom(zonelist);
		goto restart;
	}

	/*
	 * Don't let big-order allocations loop unless the caller explicitly
	 * requests that.  Wait for some write requests to complete then retry.
	 *
	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
	 * <= 3, but that may not be true in other implementations.
	 */
	do_retry = 0;
	if (!(gfp_mask & __GFP_NORETRY)) {
		if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
						(gfp_mask & __GFP_REPEAT))
			do_retry = 1;
		if (gfp_mask & __GFP_NOFAIL)
			do_retry = 1;
	}
	if (do_retry) {
		congestion_wait(WRITE, HZ/50);
		goto rebalance;
	}

nopage:
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
		printk(KERN_WARNING "%s: page allocation failure."
			" order:%d, mode:0x%x\n",
			p->comm, order, gfp_mask);
		dump_stack();
		show_mem();
	}
got_pg:
	return page;
}

EXPORT_SYMBOL(__alloc_pages);

/*
 * Common helper functions.
 */
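/*
 * For example (illustration only): __get_free_pages() is the variant that
 * returns a kernel virtual address instead of a struct page:
 *
 *	unsigned long addr = __get_free_pages(GFP_KERNEL, 0);
 *	if (addr)
 *		free_pages(addr, 0);
 */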
fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;
	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

EXPORT_SYMBOL(__get_free_pages);

fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
{
	struct page *page;

	/*
	 * get_zeroed_page() returns a 32-bit address, which cannot represent
	 * a highmem page
	 */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
	if (page)
		return (unsigned long) page_address(page);
	return 0;
}

EXPORT_SYMBOL(get_zeroed_page);

void __pagevec_free(struct pagevec *pvec)
{
	int i = pagevec_count(pvec);

	while (--i >= 0)
		free_hot_cold_page(pvec->pages[i], pvec->cold);
}

fastcall void __free_pages(struct page *page, unsigned int order)
{
	if (put_page_testzero(page)) {
		if (order == 0)
			free_hot_page(page);
		else
			__free_pages_ok(page, order);
	}
}

EXPORT_SYMBOL(__free_pages);

fastcall void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0) {
		VM_BUG_ON(!virt_addr_valid((void *)addr));
		__free_pages(virt_to_page((void *)addr), order);
	}
}

EXPORT_SYMBOL(free_pages);

static unsigned int nr_free_zone_pages(int offset)
{
	/* Just pick one node, since fallback list is circular */
	pg_data_t *pgdat = NODE_DATA(numa_node_id());
	unsigned int sum = 0;

	struct zonelist *zonelist = pgdat->node_zonelists + offset;
	struct zone **zonep = zonelist->zones;
	struct zone *zone;

	for (zone = *zonep++; zone; zone = *zonep++) {
		unsigned long size = zone->present_pages;
		unsigned long high = zone->pages_high;
		if (size > high)
			sum += size - high;
	}

	return sum;
}

/*
 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
 */
unsigned int nr_free_buffer_pages(void)
{
	return nr_free_zone_pages(gfp_zone(GFP_USER));
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);

/*
 * Amount of free RAM allocatable within all zones
 */
unsigned int nr_free_pagecache_pages(void)
{
	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
}

static inline void show_node(struct zone *zone)
{
	if (NUMA_BUILD)
		printk("Node %d ", zone_to_nid(zone));
}

void si_meminfo(struct sysinfo *val)
{
	val->totalram = totalram_pages;
	val->sharedram = 0;
	val->freeram = global_page_state(NR_FREE_PAGES);
	val->bufferram = nr_blockdev_pages();
	val->totalhigh = totalhigh_pages;
	val->freehigh = nr_free_highpages();
	val->mem_unit = PAGE_SIZE;
}

EXPORT_SYMBOL(si_meminfo);

#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	val->totalram = pgdat->node_present_pages;
	val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
			NR_FREE_PAGES);
#else
	val->totalhigh = 0;
	val->freehigh = 0;
#endif
	val->mem_unit = PAGE_SIZE;
}
#endif

#define K(x) ((x) << (PAGE_SHIFT-10))
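/* K(x): pages -> kilobytes; PAGE_SHIFT - 10 is log2(PAGE_SIZE / 1024). */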

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas(void)
{
	int cpu;
	struct zone *zone;

	for_each_zone(zone) {
		if (!populated_zone(zone))
			continue;

		show_node(zone);
		printk("%s per-cpu:\n", zone->name);

		for_each_online_cpu(cpu) {
			struct per_cpu_pageset *pageset;

			pageset = zone_pcp(zone, cpu);

			printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d   "
			       "Cold: hi:%5d, btch:%4d usd:%4d\n",
			       cpu, pageset->pcp[0].high,
			       pageset->pcp[0].batch, pageset->pcp[0].count,
			       pageset->pcp[1].high, pageset->pcp[1].batch,
			       pageset->pcp[1].count);
		}
	}

	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
		" free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
		global_page_state(NR_ACTIVE),
		global_page_state(NR_INACTIVE),
		global_page_state(NR_FILE_DIRTY),
		global_page_state(NR_WRITEBACK),
		global_page_state(NR_UNSTABLE_NFS),
		global_page_state(NR_FREE_PAGES),
		global_page_state(NR_SLAB_RECLAIMABLE) +
			global_page_state(NR_SLAB_UNRECLAIMABLE),
		global_page_state(NR_FILE_MAPPED),
		global_page_state(NR_PAGETABLE),
		global_page_state(NR_BOUNCE));

	for_each_zone(zone) {
		int i;

		if (!populated_zone(zone))
			continue;

		show_node(zone);
		printk("%s"
			" free:%lukB"
			" min:%lukB"
			" low:%lukB"
			" high:%lukB"
			" active:%lukB"
			" inactive:%lukB"
			" present:%lukB"
			" pages_scanned:%lu"
			" all_unreclaimable? %s"
			"\n",
			zone->name,
			K(zone_page_state(zone, NR_FREE_PAGES)),
			K(zone->pages_min),
			K(zone->pages_low),
			K(zone->pages_high),
			K(zone_page_state(zone, NR_ACTIVE)),
			K(zone_page_state(zone, NR_INACTIVE)),
			K(zone->present_pages),
			zone->pages_scanned,
			(zone_is_all_unreclaimable(zone) ? "yes" : "no")
			);
		printk("lowmem_reserve[]:");
		for (i = 0; i < MAX_NR_ZONES; i++)
			printk(" %lu", zone->lowmem_reserve[i]);
		printk("\n");
	}

	for_each_zone(zone) {
		unsigned long nr[MAX_ORDER], flags, order, total = 0;

		if (!populated_zone(zone))
			continue;

		show_node(zone);
		printk("%s: ", zone->name);

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			nr[order] = zone->free_area[order].nr_free;
			total += nr[order] << order;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++)
			printk("%lu*%lukB ", nr[order], K(1UL) << order);
		printk("= %lukB\n", K(total));
	}

	show_swap_cache_info();
}

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
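/*
 * For example (illustration only): on a node with DMA, Normal and HighMem
 * populated, building for zone_type == ZONE_HIGHMEM appends the zones in
 * the order HighMem, Normal, DMA, since the loop below walks from
 * zone_type downwards.
 */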
1899 */ 1900 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 1901 int nr_zones, enum zone_type zone_type) 1902 { 1903 struct zone *zone; 1904 1905 BUG_ON(zone_type >= MAX_NR_ZONES); 1906 zone_type++; 1907 1908 do { 1909 zone_type--; 1910 zone = pgdat->node_zones + zone_type; 1911 if (populated_zone(zone)) { 1912 zonelist->zones[nr_zones++] = zone; 1913 check_highest_zone(zone_type); 1914 } 1915 1916 } while (zone_type); 1917 return nr_zones; 1918 } 1919 1920 1921 /* 1922 * zonelist_order: 1923 * 0 = automatic detection of better ordering. 1924 * 1 = order by ([node] distance, -zonetype) 1925 * 2 = order by (-zonetype, [node] distance) 1926 * 1927 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 1928 * the same zonelist. So only NUMA can configure this param. 1929 */ 1930 #define ZONELIST_ORDER_DEFAULT 0 1931 #define ZONELIST_ORDER_NODE 1 1932 #define ZONELIST_ORDER_ZONE 2 1933 1934 /* zonelist order in the kernel. 1935 * set_zonelist_order() will set this to NODE or ZONE. 1936 */ 1937 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 1938 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 1939 1940 1941 #ifdef CONFIG_NUMA 1942 /* The value user specified ....changed by config */ 1943 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 1944 /* string for sysctl */ 1945 #define NUMA_ZONELIST_ORDER_LEN 16 1946 char numa_zonelist_order[16] = "default"; 1947 1948 /* 1949 * interface for configure zonelist ordering. 1950 * command line option "numa_zonelist_order" 1951 * = "[dD]efault - default, automatic configuration. 1952 * = "[nN]ode - order by node locality, then by zone within node 1953 * = "[zZ]one - order by zone, then by locality within zone 1954 */ 1955 1956 static int __parse_numa_zonelist_order(char *s) 1957 { 1958 if (*s == 'd' || *s == 'D') { 1959 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 1960 } else if (*s == 'n' || *s == 'N') { 1961 user_zonelist_order = ZONELIST_ORDER_NODE; 1962 } else if (*s == 'z' || *s == 'Z') { 1963 user_zonelist_order = ZONELIST_ORDER_ZONE; 1964 } else { 1965 printk(KERN_WARNING 1966 "Ignoring invalid numa_zonelist_order value: " 1967 "%s\n", s); 1968 return -EINVAL; 1969 } 1970 return 0; 1971 } 1972 1973 static __init int setup_numa_zonelist_order(char *s) 1974 { 1975 if (s) 1976 return __parse_numa_zonelist_order(s); 1977 return 0; 1978 } 1979 early_param("numa_zonelist_order", setup_numa_zonelist_order); 1980 1981 /* 1982 * sysctl handler for numa_zonelist_order 1983 */ 1984 int numa_zonelist_order_handler(ctl_table *table, int write, 1985 struct file *file, void __user *buffer, size_t *length, 1986 loff_t *ppos) 1987 { 1988 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 1989 int ret; 1990 1991 if (write) 1992 strncpy(saved_string, (char*)table->data, 1993 NUMA_ZONELIST_ORDER_LEN); 1994 ret = proc_dostring(table, write, file, buffer, length, ppos); 1995 if (ret) 1996 return ret; 1997 if (write) { 1998 int oldval = user_zonelist_order; 1999 if (__parse_numa_zonelist_order((char*)table->data)) { 2000 /* 2001 * bogus value. 
restore saved string 2002 */ 2003 strncpy((char*)table->data, saved_string, 2004 NUMA_ZONELIST_ORDER_LEN); 2005 user_zonelist_order = oldval; 2006 } else if (oldval != user_zonelist_order) 2007 build_all_zonelists(); 2008 } 2009 return 0; 2010 } 2011 2012 2013 #define MAX_NODE_LOAD (num_online_nodes()) 2014 static int node_load[MAX_NUMNODES]; 2015 2016 /** 2017 * find_next_best_node - find the next node that should appear in a given node's fallback list 2018 * @node: node whose fallback list we're appending 2019 * @used_node_mask: nodemask_t of already used nodes 2020 * 2021 * We use a number of factors to determine which is the next node that should 2022 * appear on a given node's fallback list. The node should not have appeared 2023 * already in @node's fallback list, and it should be the next closest node 2024 * according to the distance array (which contains arbitrary distance values 2025 * from each node to each node in the system), and should also prefer nodes 2026 * with no CPUs, since presumably they'll have very little allocation pressure 2027 * on them otherwise. 2028 * It returns -1 if no node is found. 2029 */ 2030 static int find_next_best_node(int node, nodemask_t *used_node_mask) 2031 { 2032 int n, val; 2033 int min_val = INT_MAX; 2034 int best_node = -1; 2035 2036 /* Use the local node if we haven't already */ 2037 if (!node_isset(node, *used_node_mask)) { 2038 node_set(node, *used_node_mask); 2039 return node; 2040 } 2041 2042 for_each_node_state(n, N_HIGH_MEMORY) { 2043 cpumask_t tmp; 2044 2045 /* Don't want a node to appear more than once */ 2046 if (node_isset(n, *used_node_mask)) 2047 continue; 2048 2049 /* Use the distance array to find the distance */ 2050 val = node_distance(node, n); 2051 2052 /* Penalize nodes under us ("prefer the next node") */ 2053 val += (n < node); 2054 2055 /* Give preference to headless and unused nodes */ 2056 tmp = node_to_cpumask(n); 2057 if (!cpus_empty(tmp)) 2058 val += PENALTY_FOR_NODE_WITH_CPUS; 2059 2060 /* Slight preference for less loaded node */ 2061 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 2062 val += node_load[n]; 2063 2064 if (val < min_val) { 2065 min_val = val; 2066 best_node = n; 2067 } 2068 } 2069 2070 if (best_node >= 0) 2071 node_set(best_node, *used_node_mask); 2072 2073 return best_node; 2074 } 2075 2076 2077 /* 2078 * Build zonelists ordered by node and zones within node. 2079 * This results in maximum locality--normal zone overflows into local 2080 * DMA zone, if any--but risks exhausting DMA zone. 2081 */ 2082 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 2083 { 2084 enum zone_type i; 2085 int j; 2086 struct zonelist *zonelist; 2087 2088 for (i = 0; i < MAX_NR_ZONES; i++) { 2089 zonelist = pgdat->node_zonelists + i; 2090 for (j = 0; zonelist->zones[j] != NULL; j++) 2091 ; 2092 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2093 zonelist->zones[j] = NULL; 2094 } 2095 } 2096 2097 /* 2098 * Build gfp_thisnode zonelists 2099 */ 2100 static void build_thisnode_zonelists(pg_data_t *pgdat) 2101 { 2102 enum zone_type i; 2103 int j; 2104 struct zonelist *zonelist; 2105 2106 for (i = 0; i < MAX_NR_ZONES; i++) { 2107 zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; 2108 j = build_zonelists_node(pgdat, zonelist, 0, i); 2109 zonelist->zones[j] = NULL; 2110 } 2111 } 2112 2113 /* 2114 * Build zonelists ordered by zone and nodes within zones. 
2115 * This results in conserving DMA zone[s] until all Normal memory is 2116 * exhausted, but results in overflowing to remote node while memory 2117 * may still exist in local DMA zone. 2118 */ 2119 static int node_order[MAX_NUMNODES]; 2120 2121 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 2122 { 2123 enum zone_type i; 2124 int pos, j, node; 2125 int zone_type; /* needs to be signed */ 2126 struct zone *z; 2127 struct zonelist *zonelist; 2128 2129 for (i = 0; i < MAX_NR_ZONES; i++) { 2130 zonelist = pgdat->node_zonelists + i; 2131 pos = 0; 2132 for (zone_type = i; zone_type >= 0; zone_type--) { 2133 for (j = 0; j < nr_nodes; j++) { 2134 node = node_order[j]; 2135 z = &NODE_DATA(node)->node_zones[zone_type]; 2136 if (populated_zone(z)) { 2137 zonelist->zones[pos++] = z; 2138 check_highest_zone(zone_type); 2139 } 2140 } 2141 } 2142 zonelist->zones[pos] = NULL; 2143 } 2144 } 2145 2146 static int default_zonelist_order(void) 2147 { 2148 int nid, zone_type; 2149 unsigned long low_kmem_size,total_size; 2150 struct zone *z; 2151 int average_size; 2152 /* 2153 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. 2154 * If they are really small and used heavily, the system can fall 2155 * into OOM very easily. 2156 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 2157 */ 2158 /* Is there ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */ 2159 low_kmem_size = 0; 2160 total_size = 0; 2161 for_each_online_node(nid) { 2162 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 2163 z = &NODE_DATA(nid)->node_zones[zone_type]; 2164 if (populated_zone(z)) { 2165 if (zone_type < ZONE_NORMAL) 2166 low_kmem_size += z->present_pages; 2167 total_size += z->present_pages; 2168 } 2169 } 2170 } 2171 if (!low_kmem_size || /* there is no DMA area. */ 2172 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 2173 return ZONELIST_ORDER_NODE; 2174 /* 2175 * look into each node's config. 2176 * If there is a node whose DMA/DMA32 memory covers a large part of its 2177 * local memory, NODE_ORDER may be suitable.
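 *
 * Worked example (editor's note, illustrative numbers): a node with 768MB
 * of DMA32 out of 1GB of local memory has low_kmem_size at 75% of
 * total_size, above the 70% threshold checked below, so node ordering is
 * chosen, provided the node is also larger than the average node size.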
2178 */ 2179 average_size = total_size / 2180 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 2181 for_each_online_node(nid) { 2182 low_kmem_size = 0; 2183 total_size = 0; 2184 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 2185 z = &NODE_DATA(nid)->node_zones[zone_type]; 2186 if (populated_zone(z)) { 2187 if (zone_type < ZONE_NORMAL) 2188 low_kmem_size += z->present_pages; 2189 total_size += z->present_pages; 2190 } 2191 } 2192 if (low_kmem_size && 2193 total_size > average_size && /* ignore small node */ 2194 low_kmem_size > total_size * 70/100) 2195 return ZONELIST_ORDER_NODE; 2196 } 2197 return ZONELIST_ORDER_ZONE; 2198 } 2199 2200 static void set_zonelist_order(void) 2201 { 2202 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 2203 current_zonelist_order = default_zonelist_order(); 2204 else 2205 current_zonelist_order = user_zonelist_order; 2206 } 2207 2208 static void build_zonelists(pg_data_t *pgdat) 2209 { 2210 int j, node, load; 2211 enum zone_type i; 2212 nodemask_t used_mask; 2213 int local_node, prev_node; 2214 struct zonelist *zonelist; 2215 int order = current_zonelist_order; 2216 2217 /* initialize zonelists */ 2218 for (i = 0; i < MAX_ZONELISTS; i++) { 2219 zonelist = pgdat->node_zonelists + i; 2220 zonelist->zones[0] = NULL; 2221 } 2222 2223 /* NUMA-aware ordering of nodes */ 2224 local_node = pgdat->node_id; 2225 load = num_online_nodes(); 2226 prev_node = local_node; 2227 nodes_clear(used_mask); 2228 2229 memset(node_load, 0, sizeof(node_load)); 2230 memset(node_order, 0, sizeof(node_order)); 2231 j = 0; 2232 2233 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 2234 int distance = node_distance(local_node, node); 2235 2236 /* 2237 * If another node is sufficiently far away then it is better 2238 * to reclaim pages in a zone before going off node. 2239 */ 2240 if (distance > RECLAIM_DISTANCE) 2241 zone_reclaim_mode = 1; 2242 2243 /* 2244 * We don't want to pressure a particular node. 2245 * So adding penalty to the first node in same 2246 * distance group to make it round-robin. 2247 */ 2248 if (distance != node_distance(local_node, prev_node)) 2249 node_load[node] = load; 2250 2251 prev_node = node; 2252 load--; 2253 if (order == ZONELIST_ORDER_NODE) 2254 build_zonelists_in_node_order(pgdat, node); 2255 else 2256 node_order[j++] = node; /* remember order */ 2257 } 2258 2259 if (order == ZONELIST_ORDER_ZONE) { 2260 /* calculate node order -- i.e., DMA last! 
*/ 2261 build_zonelists_in_zone_order(pgdat, j); 2262 } 2263 2264 build_thisnode_zonelists(pgdat); 2265 } 2266 2267 /* Construct the zonelist performance cache - see further mmzone.h */ 2268 static void build_zonelist_cache(pg_data_t *pgdat) 2269 { 2270 int i; 2271 2272 for (i = 0; i < MAX_NR_ZONES; i++) { 2273 struct zonelist *zonelist; 2274 struct zonelist_cache *zlc; 2275 struct zone **z; 2276 2277 zonelist = pgdat->node_zonelists + i; 2278 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 2279 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 2280 for (z = zonelist->zones; *z; z++) 2281 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 2282 } 2283 } 2284 2285 2286 #else /* CONFIG_NUMA */ 2287 2288 static void set_zonelist_order(void) 2289 { 2290 current_zonelist_order = ZONELIST_ORDER_ZONE; 2291 } 2292 2293 static void build_zonelists(pg_data_t *pgdat) 2294 { 2295 int node, local_node; 2296 enum zone_type i,j; 2297 2298 local_node = pgdat->node_id; 2299 for (i = 0; i < MAX_NR_ZONES; i++) { 2300 struct zonelist *zonelist; 2301 2302 zonelist = pgdat->node_zonelists + i; 2303 2304 j = build_zonelists_node(pgdat, zonelist, 0, i); 2305 /* 2306 * Now we build the zonelist so that it contains the zones 2307 * of all the other nodes. 2308 * We don't want to pressure a particular node, so when 2309 * building the zones for node N, we make sure that the 2310 * zones coming right after the local ones are those from 2311 * node N+1 (modulo N) 2312 */ 2313 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 2314 if (!node_online(node)) 2315 continue; 2316 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2317 } 2318 for (node = 0; node < local_node; node++) { 2319 if (!node_online(node)) 2320 continue; 2321 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2322 } 2323 2324 zonelist->zones[j] = NULL; 2325 } 2326 } 2327 2328 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2329 static void build_zonelist_cache(pg_data_t *pgdat) 2330 { 2331 int i; 2332 2333 for (i = 0; i < MAX_NR_ZONES; i++) 2334 pgdat->node_zonelists[i].zlcache_ptr = NULL; 2335 } 2336 2337 #endif /* CONFIG_NUMA */ 2338 2339 /* return values int ....just for stop_machine_run() */ 2340 static int __build_all_zonelists(void *dummy) 2341 { 2342 int nid; 2343 2344 for_each_online_node(nid) { 2345 pg_data_t *pgdat = NODE_DATA(nid); 2346 2347 build_zonelists(pgdat); 2348 build_zonelist_cache(pgdat); 2349 } 2350 return 0; 2351 } 2352 2353 void build_all_zonelists(void) 2354 { 2355 set_zonelist_order(); 2356 2357 if (system_state == SYSTEM_BOOTING) { 2358 __build_all_zonelists(NULL); 2359 cpuset_init_current_mems_allowed(); 2360 } else { 2361 /* we have to stop all cpus to guaranntee there is no user 2362 of zonelist */ 2363 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2364 /* cpuset refresh routine should be here */ 2365 } 2366 vm_total_pages = nr_free_pagecache_pages(); 2367 /* 2368 * Disable grouping by mobility if the number of pages in the 2369 * system is too low to allow the mechanism to work. It would be 2370 * more accurate, but expensive to check per-zone. This check is 2371 * made on memory-hotadd so a system can start with mobility 2372 * disabled and enable it later 2373 */ 2374 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 2375 page_group_by_mobility_disabled = 1; 2376 else 2377 page_group_by_mobility_disabled = 0; 2378 2379 printk("Built %i zonelists in %s order, mobility grouping %s. 
" 2380 "Total pages: %ld\n", 2381 num_online_nodes(), 2382 zonelist_order_name[current_zonelist_order], 2383 page_group_by_mobility_disabled ? "off" : "on", 2384 vm_total_pages); 2385 #ifdef CONFIG_NUMA 2386 printk("Policy zone: %s\n", zone_names[policy_zone]); 2387 #endif 2388 } 2389 2390 /* 2391 * Helper functions to size the waitqueue hash table. 2392 * Essentially these want to choose hash table sizes sufficiently 2393 * large so that collisions trying to wait on pages are rare. 2394 * But in fact, the number of active page waitqueues on typical 2395 * systems is ridiculously low, less than 200. So this is even 2396 * conservative, even though it seems large. 2397 * 2398 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 2399 * waitqueues, i.e. the size of the waitq table given the number of pages. 2400 */ 2401 #define PAGES_PER_WAITQUEUE 256 2402 2403 #ifndef CONFIG_MEMORY_HOTPLUG 2404 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2405 { 2406 unsigned long size = 1; 2407 2408 pages /= PAGES_PER_WAITQUEUE; 2409 2410 while (size < pages) 2411 size <<= 1; 2412 2413 /* 2414 * Once we have dozens or even hundreds of threads sleeping 2415 * on IO we've got bigger problems than wait queue collision. 2416 * Limit the size of the wait table to a reasonable size. 2417 */ 2418 size = min(size, 4096UL); 2419 2420 return max(size, 4UL); 2421 } 2422 #else 2423 /* 2424 * A zone's size might be changed by hot-add, so it is not possible to determine 2425 * a suitable size for its wait_table. So we use the maximum size now. 2426 * 2427 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 2428 * 2429 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 2430 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 2431 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 2432 * 2433 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 2434 * or more by the traditional way. (See above). It equals: 2435 * 2436 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 2437 * ia64(16K page size) : = ( 8G + 4M)byte. 2438 * powerpc (64K page size) : = (32G +16M)byte. 2439 */ 2440 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2441 { 2442 return 4096UL; 2443 } 2444 #endif 2445 2446 /* 2447 * This is an integer logarithm so that shifts can be used later 2448 * to extract the more random high bits from the multiplicative 2449 * hash function before the remainder is taken. 2450 */ 2451 static inline unsigned long wait_table_bits(unsigned long size) 2452 { 2453 return ffz(~size); 2454 } 2455 2456 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 2457 2458 /* 2459 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2460 * of blocks reserved is based on zone->pages_min. The memory within the 2461 * reserve will tend to store contiguous free pages. 
Setting min_free_kbytes 2462 * higher will lead to a bigger reserve which will get freed as contiguous 2463 * blocks as reclaim kicks in 2464 */ 2465 static void setup_zone_migrate_reserve(struct zone *zone) 2466 { 2467 unsigned long start_pfn, pfn, end_pfn; 2468 struct page *page; 2469 unsigned long reserve, block_migratetype; 2470 2471 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2472 start_pfn = zone->zone_start_pfn; 2473 end_pfn = start_pfn + zone->spanned_pages; 2474 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2475 pageblock_order; 2476 2477 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2478 if (!pfn_valid(pfn)) 2479 continue; 2480 page = pfn_to_page(pfn); 2481 2482 /* Blocks with reserved pages will never free, skip them. */ 2483 if (PageReserved(page)) 2484 continue; 2485 2486 block_migratetype = get_pageblock_migratetype(page); 2487 2488 /* If this block is reserved, account for it */ 2489 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 2490 reserve--; 2491 continue; 2492 } 2493 2494 /* Suitable for reserving if this block is movable */ 2495 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 2496 set_pageblock_migratetype(page, MIGRATE_RESERVE); 2497 move_freepages_block(zone, page, MIGRATE_RESERVE); 2498 reserve--; 2499 continue; 2500 } 2501 2502 /* 2503 * If the reserve is met and this is a previous reserved block, 2504 * take it back 2505 */ 2506 if (block_migratetype == MIGRATE_RESERVE) { 2507 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 2508 move_freepages_block(zone, page, MIGRATE_MOVABLE); 2509 } 2510 } 2511 } 2512 2513 /* 2514 * Initially all pages are reserved - free ones are freed 2515 * up by free_all_bootmem() once the early boot process is 2516 * done. Non-atomic initialization, single-pass. 2517 */ 2518 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 2519 unsigned long start_pfn, enum memmap_context context) 2520 { 2521 struct page *page; 2522 unsigned long end_pfn = start_pfn + size; 2523 unsigned long pfn; 2524 2525 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2526 /* 2527 * There can be holes in boot-time mem_map[]s 2528 * handed to this function. They do not 2529 * exist on hotplugged memory. 2530 */ 2531 if (context == MEMMAP_EARLY) { 2532 if (!early_pfn_valid(pfn)) 2533 continue; 2534 if (!early_pfn_in_nid(pfn, nid)) 2535 continue; 2536 } 2537 page = pfn_to_page(pfn); 2538 set_page_links(page, zone, nid, pfn); 2539 init_page_count(page); 2540 reset_page_mapcount(page); 2541 SetPageReserved(page); 2542 2543 /* 2544 * Mark the block movable so that blocks are reserved for 2545 * movable at startup. This will force kernel allocations 2546 * to reserve their blocks rather than leaking throughout 2547 * the address space during boot when many long-lived 2548 * kernel allocations are made. Later some blocks near 2549 * the start are marked MIGRATE_RESERVE by 2550 * setup_zone_migrate_reserve() 2551 */ 2552 if ((pfn & (pageblock_nr_pages-1))) 2553 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 2554 2555 INIT_LIST_HEAD(&page->lru); 2556 #ifdef WANT_PAGE_VIRTUAL 2557 /* The shift won't overflow because ZONE_NORMAL is below 4G. 
*/ 2558 if (!is_highmem_idx(zone)) 2559 set_page_address(page, __va(pfn << PAGE_SHIFT)); 2560 #endif 2561 } 2562 } 2563 2564 static void __meminit zone_init_free_lists(struct pglist_data *pgdat, 2565 struct zone *zone, unsigned long size) 2566 { 2567 int order, t; 2568 for_each_migratetype_order(order, t) { 2569 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 2570 zone->free_area[order].nr_free = 0; 2571 } 2572 } 2573 2574 #ifndef __HAVE_ARCH_MEMMAP_INIT 2575 #define memmap_init(size, nid, zone, start_pfn) \ 2576 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 2577 #endif 2578 2579 static int __devinit zone_batchsize(struct zone *zone) 2580 { 2581 int batch; 2582 2583 /* 2584 * The per-cpu-pages pools are set to around 1000th of the 2585 * size of the zone. But no more than 1/2 of a meg. 2586 * 2587 * OK, so we don't know how big the cache is. So guess. 2588 */ 2589 batch = zone->present_pages / 1024; 2590 if (batch * PAGE_SIZE > 512 * 1024) 2591 batch = (512 * 1024) / PAGE_SIZE; 2592 batch /= 4; /* We effectively *= 4 below */ 2593 if (batch < 1) 2594 batch = 1; 2595 2596 /* 2597 * Clamp the batch to a 2^n - 1 value. Having a power 2598 * of 2 value was found to be more likely to have 2599 * suboptimal cache aliasing properties in some cases. 2600 * 2601 * For example if 2 tasks are alternately allocating 2602 * batches of pages, one task can end up with a lot 2603 * of pages of one half of the possible page colors 2604 * and the other with pages of the other colors. 2605 */ 2606 batch = (1 << (fls(batch + batch/2)-1)) - 1; 2607 2608 return batch; 2609 } 2610 2611 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2612 { 2613 struct per_cpu_pages *pcp; 2614 2615 memset(p, 0, sizeof(*p)); 2616 2617 pcp = &p->pcp[0]; /* hot */ 2618 pcp->count = 0; 2619 pcp->high = 6 * batch; 2620 pcp->batch = max(1UL, 1 * batch); 2621 INIT_LIST_HEAD(&pcp->list); 2622 2623 pcp = &p->pcp[1]; /* cold*/ 2624 pcp->count = 0; 2625 pcp->high = 2 * batch; 2626 pcp->batch = max(1UL, batch/2); 2627 INIT_LIST_HEAD(&pcp->list); 2628 } 2629 2630 /* 2631 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 2632 * to the value high for the pageset p. 2633 */ 2634 2635 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 2636 unsigned long high) 2637 { 2638 struct per_cpu_pages *pcp; 2639 2640 pcp = &p->pcp[0]; /* hot list */ 2641 pcp->high = high; 2642 pcp->batch = max(1UL, high/4); 2643 if ((high/4) > (PAGE_SHIFT * 8)) 2644 pcp->batch = PAGE_SHIFT * 8; 2645 } 2646 2647 2648 #ifdef CONFIG_NUMA 2649 /* 2650 * Boot pageset table. One per cpu which is going to be used for all 2651 * zones and all nodes. The parameters will be set in such a way 2652 * that an item put on a list will immediately be handed over to 2653 * the buddy list. This is safe since pageset manipulation is done 2654 * with interrupts disabled. 2655 * 2656 * Some NUMA counter updates may also be caught by the boot pagesets. 2657 * 2658 * The boot_pagesets must be kept even after bootup is complete for 2659 * unused processors and/or zones. They do play a role for bootstrapping 2660 * hotplugged processors. 2661 * 2662 * zoneinfo_show() and maybe other functions do 2663 * not check if the processor is online before following the pageset pointer. 2664 * Other parts of the kernel may not check if the zone is available. 
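 *
 * Editor's note (illustrative): zone_pcp_init() below calls
 * setup_pageset(&boot_pageset[cpu], 0), so both the hot and cold lists get
 * high == 0 and batch == 1; every page freed to such a pageset is drained
 * straight back to the buddy lists, which is what keeps this safe before
 * the real per-cpu pagesets are allocated.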
2665 */ 2666 static struct per_cpu_pageset boot_pageset[NR_CPUS]; 2667 2668 /* 2669 * Dynamically allocate memory for the 2670 * per cpu pageset array in struct zone. 2671 */ 2672 static int __cpuinit process_zones(int cpu) 2673 { 2674 struct zone *zone, *dzone; 2675 int node = cpu_to_node(cpu); 2676 2677 node_set_state(node, N_CPU); /* this node has a cpu */ 2678 2679 for_each_zone(zone) { 2680 2681 if (!populated_zone(zone)) 2682 continue; 2683 2684 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2685 GFP_KERNEL, node); 2686 if (!zone_pcp(zone, cpu)) 2687 goto bad; 2688 2689 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 2690 2691 if (percpu_pagelist_fraction) 2692 setup_pagelist_highmark(zone_pcp(zone, cpu), 2693 (zone->present_pages / percpu_pagelist_fraction)); 2694 } 2695 2696 return 0; 2697 bad: 2698 for_each_zone(dzone) { 2699 if (!populated_zone(dzone)) 2700 continue; 2701 if (dzone == zone) 2702 break; 2703 kfree(zone_pcp(dzone, cpu)); 2704 zone_pcp(dzone, cpu) = NULL; 2705 } 2706 return -ENOMEM; 2707 } 2708 2709 static inline void free_zone_pagesets(int cpu) 2710 { 2711 struct zone *zone; 2712 2713 for_each_zone(zone) { 2714 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 2715 2716 /* Free per_cpu_pageset if it is slab allocated */ 2717 if (pset != &boot_pageset[cpu]) 2718 kfree(pset); 2719 zone_pcp(zone, cpu) = NULL; 2720 } 2721 } 2722 2723 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 2724 unsigned long action, 2725 void *hcpu) 2726 { 2727 int cpu = (long)hcpu; 2728 int ret = NOTIFY_OK; 2729 2730 switch (action) { 2731 case CPU_UP_PREPARE: 2732 case CPU_UP_PREPARE_FROZEN: 2733 if (process_zones(cpu)) 2734 ret = NOTIFY_BAD; 2735 break; 2736 case CPU_UP_CANCELED: 2737 case CPU_UP_CANCELED_FROZEN: 2738 case CPU_DEAD: 2739 case CPU_DEAD_FROZEN: 2740 free_zone_pagesets(cpu); 2741 break; 2742 default: 2743 break; 2744 } 2745 return ret; 2746 } 2747 2748 static struct notifier_block __cpuinitdata pageset_notifier = 2749 { &pageset_cpuup_callback, NULL, 0 }; 2750 2751 void __init setup_per_cpu_pageset(void) 2752 { 2753 int err; 2754 2755 /* Initialize per_cpu_pageset for cpu 0. 2756 * A cpuup callback will do this for every cpu 2757 * as it comes online 2758 */ 2759 err = process_zones(smp_processor_id()); 2760 BUG_ON(err); 2761 register_cpu_notifier(&pageset_notifier); 2762 } 2763 2764 #endif 2765 2766 static noinline __init_refok 2767 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2768 { 2769 int i; 2770 struct pglist_data *pgdat = zone->zone_pgdat; 2771 size_t alloc_size; 2772 2773 /* 2774 * The per-page waitqueue mechanism uses hashed waitqueues 2775 * per zone. 2776 */ 2777 zone->wait_table_hash_nr_entries = 2778 wait_table_hash_nr_entries(zone_size_pages); 2779 zone->wait_table_bits = 2780 wait_table_bits(zone->wait_table_hash_nr_entries); 2781 alloc_size = zone->wait_table_hash_nr_entries 2782 * sizeof(wait_queue_head_t); 2783 2784 if (system_state == SYSTEM_BOOTING) { 2785 zone->wait_table = (wait_queue_head_t *) 2786 alloc_bootmem_node(pgdat, alloc_size); 2787 } else { 2788 /* 2789 * This case means that a zone whose size was 0 gets new memory 2790 * via memory hot-add. 2791 * But it may be the case that a new node was hot-added. In 2792 * this case vmalloc() will not be able to use this new node's 2793 * memory - this wait_table must be initialized to use this new 2794 * node itself as well. 2795 * To use this new node's memory, further consideration will be 2796 * necessary. 
2797 */ 2798 zone->wait_table = vmalloc(alloc_size); 2799 } 2800 if (!zone->wait_table) 2801 return -ENOMEM; 2802 2803 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 2804 init_waitqueue_head(zone->wait_table + i); 2805 2806 return 0; 2807 } 2808 2809 static __meminit void zone_pcp_init(struct zone *zone) 2810 { 2811 int cpu; 2812 unsigned long batch = zone_batchsize(zone); 2813 2814 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2815 #ifdef CONFIG_NUMA 2816 /* Early boot. Slab allocator not functional yet */ 2817 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 2818 setup_pageset(&boot_pageset[cpu],0); 2819 #else 2820 setup_pageset(zone_pcp(zone,cpu), batch); 2821 #endif 2822 } 2823 if (zone->present_pages) 2824 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2825 zone->name, zone->present_pages, batch); 2826 } 2827 2828 __meminit int init_currently_empty_zone(struct zone *zone, 2829 unsigned long zone_start_pfn, 2830 unsigned long size, 2831 enum memmap_context context) 2832 { 2833 struct pglist_data *pgdat = zone->zone_pgdat; 2834 int ret; 2835 ret = zone_wait_table_init(zone, size); 2836 if (ret) 2837 return ret; 2838 pgdat->nr_zones = zone_idx(zone) + 1; 2839 2840 zone->zone_start_pfn = zone_start_pfn; 2841 2842 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2843 2844 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2845 2846 return 0; 2847 } 2848 2849 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2850 /* 2851 * Basic iterator support. Return the first range of PFNs for a node 2852 * Note: nid == MAX_NUMNODES returns first region regardless of node 2853 */ 2854 static int __meminit first_active_region_index_in_nid(int nid) 2855 { 2856 int i; 2857 2858 for (i = 0; i < nr_nodemap_entries; i++) 2859 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) 2860 return i; 2861 2862 return -1; 2863 } 2864 2865 /* 2866 * Basic iterator support. Return the next active range of PFNs for a node 2867 * Note: nid == MAX_NUMNODES returns next region regardles of node 2868 */ 2869 static int __meminit next_active_region_index_in_nid(int index, int nid) 2870 { 2871 for (index = index + 1; index < nr_nodemap_entries; index++) 2872 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2873 return index; 2874 2875 return -1; 2876 } 2877 2878 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 2879 /* 2880 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 2881 * Architectures may implement their own version but if add_active_range() 2882 * was used and there are no special requirements, this is a convenient 2883 * alternative 2884 */ 2885 int __meminit early_pfn_to_nid(unsigned long pfn) 2886 { 2887 int i; 2888 2889 for (i = 0; i < nr_nodemap_entries; i++) { 2890 unsigned long start_pfn = early_node_map[i].start_pfn; 2891 unsigned long end_pfn = early_node_map[i].end_pfn; 2892 2893 if (start_pfn <= pfn && pfn < end_pfn) 2894 return early_node_map[i].nid; 2895 } 2896 2897 return 0; 2898 } 2899 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 2900 2901 /* Basic iterator support to walk early_node_map[] */ 2902 #define for_each_active_range_index_in_nid(i, nid) \ 2903 for (i = first_active_region_index_in_nid(nid); i != -1; \ 2904 i = next_active_region_index_in_nid(i, nid)) 2905 2906 /** 2907 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 2908 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 
2909 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 2910 * 2911 * If an architecture guarantees that all ranges registered with 2912 * add_active_ranges() contain no holes and may be freed, this 2913 * this function may be used instead of calling free_bootmem() manually. 2914 */ 2915 void __init free_bootmem_with_active_regions(int nid, 2916 unsigned long max_low_pfn) 2917 { 2918 int i; 2919 2920 for_each_active_range_index_in_nid(i, nid) { 2921 unsigned long size_pages = 0; 2922 unsigned long end_pfn = early_node_map[i].end_pfn; 2923 2924 if (early_node_map[i].start_pfn >= max_low_pfn) 2925 continue; 2926 2927 if (end_pfn > max_low_pfn) 2928 end_pfn = max_low_pfn; 2929 2930 size_pages = end_pfn - early_node_map[i].start_pfn; 2931 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 2932 PFN_PHYS(early_node_map[i].start_pfn), 2933 size_pages << PAGE_SHIFT); 2934 } 2935 } 2936 2937 /** 2938 * sparse_memory_present_with_active_regions - Call memory_present for each active range 2939 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 2940 * 2941 * If an architecture guarantees that all ranges registered with 2942 * add_active_ranges() contain no holes and may be freed, this 2943 * function may be used instead of calling memory_present() manually. 2944 */ 2945 void __init sparse_memory_present_with_active_regions(int nid) 2946 { 2947 int i; 2948 2949 for_each_active_range_index_in_nid(i, nid) 2950 memory_present(early_node_map[i].nid, 2951 early_node_map[i].start_pfn, 2952 early_node_map[i].end_pfn); 2953 } 2954 2955 /** 2956 * push_node_boundaries - Push node boundaries to at least the requested boundary 2957 * @nid: The nid of the node to push the boundary for 2958 * @start_pfn: The start pfn of the node 2959 * @end_pfn: The end pfn of the node 2960 * 2961 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd 2962 * time. Specifically, on x86_64, SRAT will report ranges that can potentially 2963 * be hotplugged even though no physical memory exists. This function allows 2964 * an arch to push out the node boundaries so mem_map is allocated that can 2965 * be used later. 
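 *
 * Illustrative example (editor's note, hypothetical numbers): if node 1
 * currently ends at PFN 0x100000 but SRAT reports a hotpluggable range up
 * to PFN 0x180000, the arch can call push_node_boundaries(1, start_pfn,
 * 0x180000) so that get_pfn_range_for_nid() later sees the larger span and
 * mem_map is sized to cover it.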
2966 */ 2967 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2968 void __init push_node_boundaries(unsigned int nid, 2969 unsigned long start_pfn, unsigned long end_pfn) 2970 { 2971 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 2972 nid, start_pfn, end_pfn); 2973 2974 /* Initialise the boundary for this node if necessary */ 2975 if (node_boundary_end_pfn[nid] == 0) 2976 node_boundary_start_pfn[nid] = -1UL; 2977 2978 /* Update the boundaries */ 2979 if (node_boundary_start_pfn[nid] > start_pfn) 2980 node_boundary_start_pfn[nid] = start_pfn; 2981 if (node_boundary_end_pfn[nid] < end_pfn) 2982 node_boundary_end_pfn[nid] = end_pfn; 2983 } 2984 2985 /* If necessary, push the node boundary out for reserve hotadd */ 2986 static void __meminit account_node_boundary(unsigned int nid, 2987 unsigned long *start_pfn, unsigned long *end_pfn) 2988 { 2989 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 2990 nid, *start_pfn, *end_pfn); 2991 2992 /* Return if boundary information has not been provided */ 2993 if (node_boundary_end_pfn[nid] == 0) 2994 return; 2995 2996 /* Check the boundaries and update if necessary */ 2997 if (node_boundary_start_pfn[nid] < *start_pfn) 2998 *start_pfn = node_boundary_start_pfn[nid]; 2999 if (node_boundary_end_pfn[nid] > *end_pfn) 3000 *end_pfn = node_boundary_end_pfn[nid]; 3001 } 3002 #else 3003 void __init push_node_boundaries(unsigned int nid, 3004 unsigned long start_pfn, unsigned long end_pfn) {} 3005 3006 static void __meminit account_node_boundary(unsigned int nid, 3007 unsigned long *start_pfn, unsigned long *end_pfn) {} 3008 #endif 3009 3010 3011 /** 3012 * get_pfn_range_for_nid - Return the start and end page frames for a node 3013 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 3014 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 3015 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 3016 * 3017 * It returns the start and end page frame of a node based on information 3018 * provided by an arch calling add_active_range(). If called for a node 3019 * with no available memory, a warning is printed and the start and end 3020 * PFNs will be 0. 3021 */ 3022 void __meminit get_pfn_range_for_nid(unsigned int nid, 3023 unsigned long *start_pfn, unsigned long *end_pfn) 3024 { 3025 int i; 3026 *start_pfn = -1UL; 3027 *end_pfn = 0; 3028 3029 for_each_active_range_index_in_nid(i, nid) { 3030 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 3031 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 3032 } 3033 3034 if (*start_pfn == -1UL) 3035 *start_pfn = 0; 3036 3037 /* Push the node boundaries out if requested */ 3038 account_node_boundary(nid, start_pfn, end_pfn); 3039 } 3040 3041 /* 3042 * This finds a zone that can be used for ZONE_MOVABLE pages. 
The 3043 * assumption is made that zones within a node are ordered in monotonic 3044 * increasing memory addresses so that the "highest" populated zone is used 3045 */ 3046 void __init find_usable_zone_for_movable(void) 3047 { 3048 int zone_index; 3049 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3050 if (zone_index == ZONE_MOVABLE) 3051 continue; 3052 3053 if (arch_zone_highest_possible_pfn[zone_index] > 3054 arch_zone_lowest_possible_pfn[zone_index]) 3055 break; 3056 } 3057 3058 VM_BUG_ON(zone_index == -1); 3059 movable_zone = zone_index; 3060 } 3061 3062 /* 3063 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3064 * because it is sized independant of architecture. Unlike the other zones, 3065 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3066 * in each node depending on the size of each node and how evenly kernelcore 3067 * is distributed. This helper function adjusts the zone ranges 3068 * provided by the architecture for a given node by using the end of the 3069 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3070 * zones within a node are in order of monotonic increases memory addresses 3071 */ 3072 void __meminit adjust_zone_range_for_zone_movable(int nid, 3073 unsigned long zone_type, 3074 unsigned long node_start_pfn, 3075 unsigned long node_end_pfn, 3076 unsigned long *zone_start_pfn, 3077 unsigned long *zone_end_pfn) 3078 { 3079 /* Only adjust if ZONE_MOVABLE is on this node */ 3080 if (zone_movable_pfn[nid]) { 3081 /* Size ZONE_MOVABLE */ 3082 if (zone_type == ZONE_MOVABLE) { 3083 *zone_start_pfn = zone_movable_pfn[nid]; 3084 *zone_end_pfn = min(node_end_pfn, 3085 arch_zone_highest_possible_pfn[movable_zone]); 3086 3087 /* Adjust for ZONE_MOVABLE starting within this range */ 3088 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 3089 *zone_end_pfn > zone_movable_pfn[nid]) { 3090 *zone_end_pfn = zone_movable_pfn[nid]; 3091 3092 /* Check if this whole range is within ZONE_MOVABLE */ 3093 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 3094 *zone_start_pfn = *zone_end_pfn; 3095 } 3096 } 3097 3098 /* 3099 * Return the number of pages a zone spans in a node, including holes 3100 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 3101 */ 3102 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 3103 unsigned long zone_type, 3104 unsigned long *ignored) 3105 { 3106 unsigned long node_start_pfn, node_end_pfn; 3107 unsigned long zone_start_pfn, zone_end_pfn; 3108 3109 /* Get the start and end of the node and zone */ 3110 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 3111 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 3112 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 3113 adjust_zone_range_for_zone_movable(nid, zone_type, 3114 node_start_pfn, node_end_pfn, 3115 &zone_start_pfn, &zone_end_pfn); 3116 3117 /* Check that this node has pages within the zone's required range */ 3118 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 3119 return 0; 3120 3121 /* Move the zone boundaries inside the node if necessary */ 3122 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 3123 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 3124 3125 /* Return the spanned pages */ 3126 return zone_end_pfn - zone_start_pfn; 3127 } 3128 3129 /* 3130 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3131 * then all holes in the requested range will be accounted for. 
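 *
 * Worked example (editor's note, illustrative PFNs): if a node has two
 * active ranges, [0x100, 0x200) and [0x400, 0x500), a call covering
 * [0x100, 0x500) finds a single hole of 0x200 pages between PFN 0x200 and
 * PFN 0x400.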
3132 */ 3133 unsigned long __meminit __absent_pages_in_range(int nid, 3134 unsigned long range_start_pfn, 3135 unsigned long range_end_pfn) 3136 { 3137 int i = 0; 3138 unsigned long prev_end_pfn = 0, hole_pages = 0; 3139 unsigned long start_pfn; 3140 3141 /* Find the end_pfn of the first active range of pfns in the node */ 3142 i = first_active_region_index_in_nid(nid); 3143 if (i == -1) 3144 return 0; 3145 3146 prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 3147 3148 /* Account for ranges before physical memory on this node */ 3149 if (early_node_map[i].start_pfn > range_start_pfn) 3150 hole_pages = prev_end_pfn - range_start_pfn; 3151 3152 /* Find all holes for the zone within the node */ 3153 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { 3154 3155 /* No need to continue if prev_end_pfn is outside the zone */ 3156 if (prev_end_pfn >= range_end_pfn) 3157 break; 3158 3159 /* Make sure the end of the zone is not within the hole */ 3160 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 3161 prev_end_pfn = max(prev_end_pfn, range_start_pfn); 3162 3163 /* Update the hole size count and move on */ 3164 if (start_pfn > range_start_pfn) { 3165 BUG_ON(prev_end_pfn > start_pfn); 3166 hole_pages += start_pfn - prev_end_pfn; 3167 } 3168 prev_end_pfn = early_node_map[i].end_pfn; 3169 } 3170 3171 /* Account for ranges past physical memory on this node */ 3172 if (range_end_pfn > prev_end_pfn) 3173 hole_pages += range_end_pfn - 3174 max(range_start_pfn, prev_end_pfn); 3175 3176 return hole_pages; 3177 } 3178 3179 /** 3180 * absent_pages_in_range - Return number of page frames in holes within a range 3181 * @start_pfn: The start PFN to start searching for holes 3182 * @end_pfn: The end PFN to stop searching for holes 3183 * 3184 * It returns the number of page frames in memory holes within a range.
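 *
 * Usage sketch (editor's addition, hypothetical variable names):
 *
 *	present = end_pfn - start_pfn -
 *			absent_pages_in_range(start_pfn, end_pfn);
 *
 * is how a caller turns a spanned range into the number of page frames
 * actually backed by memory.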
3185 */ 3186 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 3187 unsigned long end_pfn) 3188 { 3189 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 3190 } 3191 3192 /* Return the number of page frames in holes in a zone on a node */ 3193 static unsigned long __meminit zone_absent_pages_in_node(int nid, 3194 unsigned long zone_type, 3195 unsigned long *ignored) 3196 { 3197 unsigned long node_start_pfn, node_end_pfn; 3198 unsigned long zone_start_pfn, zone_end_pfn; 3199 3200 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 3201 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 3202 node_start_pfn); 3203 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], 3204 node_end_pfn); 3205 3206 adjust_zone_range_for_zone_movable(nid, zone_type, 3207 node_start_pfn, node_end_pfn, 3208 &zone_start_pfn, &zone_end_pfn); 3209 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 3210 } 3211 3212 #else 3213 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 3214 unsigned long zone_type, 3215 unsigned long *zones_size) 3216 { 3217 return zones_size[zone_type]; 3218 } 3219 3220 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 3221 unsigned long zone_type, 3222 unsigned long *zholes_size) 3223 { 3224 if (!zholes_size) 3225 return 0; 3226 3227 return zholes_size[zone_type]; 3228 } 3229 3230 #endif 3231 3232 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 3233 unsigned long *zones_size, unsigned long *zholes_size) 3234 { 3235 unsigned long realtotalpages, totalpages = 0; 3236 enum zone_type i; 3237 3238 for (i = 0; i < MAX_NR_ZONES; i++) 3239 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 3240 zones_size); 3241 pgdat->node_spanned_pages = totalpages; 3242 3243 realtotalpages = totalpages; 3244 for (i = 0; i < MAX_NR_ZONES; i++) 3245 realtotalpages -= 3246 zone_absent_pages_in_node(pgdat->node_id, i, 3247 zholes_size); 3248 pgdat->node_present_pages = realtotalpages; 3249 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 3250 realtotalpages); 3251 } 3252 3253 #ifndef CONFIG_SPARSEMEM 3254 /* 3255 * Calculate the size of the zone->blockflags rounded to an unsigned long 3256 * Start by making sure zonesize is a multiple of pageblock_order by rounding 3257 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 3258 * round what is now in bits to nearest long in bits, then return it in 3259 * bytes. 
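 *
 * Worked example (editor's note, assuming 4K pages and
 * pageblock_order == 10): a 1GB zone spans 262144 pages, i.e. 256
 * pageblocks, so the bitmap needs 256 * NR_PAGEBLOCK_BITS bits, rounded up
 * to a whole number of unsigned longs and returned as bytes.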
3260 */ 3261 static unsigned long __init usemap_size(unsigned long zonesize) 3262 { 3263 unsigned long usemapsize; 3264 3265 usemapsize = roundup(zonesize, pageblock_nr_pages); 3266 usemapsize = usemapsize >> pageblock_order; 3267 usemapsize *= NR_PAGEBLOCK_BITS; 3268 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 3269 3270 return usemapsize / 8; 3271 } 3272 3273 static void __init setup_usemap(struct pglist_data *pgdat, 3274 struct zone *zone, unsigned long zonesize) 3275 { 3276 unsigned long usemapsize = usemap_size(zonesize); 3277 zone->pageblock_flags = NULL; 3278 if (usemapsize) { 3279 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 3280 memset(zone->pageblock_flags, 0, usemapsize); 3281 } 3282 } 3283 #else 3284 static void inline setup_usemap(struct pglist_data *pgdat, 3285 struct zone *zone, unsigned long zonesize) {} 3286 #endif /* CONFIG_SPARSEMEM */ 3287 3288 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 3289 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 3290 static inline void __init set_pageblock_order(unsigned int order) 3291 { 3292 /* Check that pageblock_nr_pages has not already been setup */ 3293 if (pageblock_order) 3294 return; 3295 3296 /* 3297 * Assume the largest contiguous order of interest is a huge page. 3298 * This value may be variable depending on boot parameters on IA64 3299 */ 3300 pageblock_order = order; 3301 } 3302 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 3303 3304 /* Defined this way to avoid accidently referencing HUGETLB_PAGE_ORDER */ 3305 #define set_pageblock_order(x) do {} while (0) 3306 3307 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 3308 3309 /* 3310 * Set up the zone data structures: 3311 * - mark all pages reserved 3312 * - mark all memory queues empty 3313 * - clear the memory bitmaps 3314 */ 3315 static void __meminit free_area_init_core(struct pglist_data *pgdat, 3316 unsigned long *zones_size, unsigned long *zholes_size) 3317 { 3318 enum zone_type j; 3319 int nid = pgdat->node_id; 3320 unsigned long zone_start_pfn = pgdat->node_start_pfn; 3321 int ret; 3322 3323 pgdat_resize_init(pgdat); 3324 pgdat->nr_zones = 0; 3325 init_waitqueue_head(&pgdat->kswapd_wait); 3326 pgdat->kswapd_max_order = 0; 3327 3328 for (j = 0; j < MAX_NR_ZONES; j++) { 3329 struct zone *zone = pgdat->node_zones + j; 3330 unsigned long size, realsize, memmap_pages; 3331 3332 size = zone_spanned_pages_in_node(nid, j, zones_size); 3333 realsize = size - zone_absent_pages_in_node(nid, j, 3334 zholes_size); 3335 3336 /* 3337 * Adjust realsize so that it accounts for how much memory 3338 * is used by this zone for memmap. 
This affects the watermark 3339 * and per-cpu initialisations 3340 */ 3341 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; 3342 if (realsize >= memmap_pages) { 3343 realsize -= memmap_pages; 3344 printk(KERN_DEBUG 3345 " %s zone: %lu pages used for memmap\n", 3346 zone_names[j], memmap_pages); 3347 } else 3348 printk(KERN_WARNING 3349 " %s zone: %lu pages exceeds realsize %lu\n", 3350 zone_names[j], memmap_pages, realsize); 3351 3352 /* Account for reserved pages */ 3353 if (j == 0 && realsize > dma_reserve) { 3354 realsize -= dma_reserve; 3355 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 3356 zone_names[0], dma_reserve); 3357 } 3358 3359 if (!is_highmem_idx(j)) 3360 nr_kernel_pages += realsize; 3361 nr_all_pages += realsize; 3362 3363 zone->spanned_pages = size; 3364 zone->present_pages = realsize; 3365 #ifdef CONFIG_NUMA 3366 zone->node = nid; 3367 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 3368 / 100; 3369 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 3370 #endif 3371 zone->name = zone_names[j]; 3372 spin_lock_init(&zone->lock); 3373 spin_lock_init(&zone->lru_lock); 3374 zone_seqlock_init(zone); 3375 zone->zone_pgdat = pgdat; 3376 3377 zone->prev_priority = DEF_PRIORITY; 3378 3379 zone_pcp_init(zone); 3380 INIT_LIST_HEAD(&zone->active_list); 3381 INIT_LIST_HEAD(&zone->inactive_list); 3382 zone->nr_scan_active = 0; 3383 zone->nr_scan_inactive = 0; 3384 zap_zone_vm_stats(zone); 3385 zone->flags = 0; 3386 if (!size) 3387 continue; 3388 3389 set_pageblock_order(HUGETLB_PAGE_ORDER); 3390 setup_usemap(pgdat, zone, size); 3391 ret = init_currently_empty_zone(zone, zone_start_pfn, 3392 size, MEMMAP_EARLY); 3393 BUG_ON(ret); 3394 zone_start_pfn += size; 3395 } 3396 } 3397 3398 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 3399 { 3400 /* Skip empty nodes */ 3401 if (!pgdat->node_spanned_pages) 3402 return; 3403 3404 #ifdef CONFIG_FLAT_NODE_MEM_MAP 3405 /* ia64 gets its own node_mem_map, before this, without bootmem */ 3406 if (!pgdat->node_mem_map) { 3407 unsigned long size, start, end; 3408 struct page *map; 3409 3410 /* 3411 * The zone's endpoints aren't required to be MAX_ORDER 3412 * aligned but the node_mem_map endpoints must be in order 3413 * for the buddy allocator to function correctly. 
3414 */ 3415 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 3416 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 3417 end = ALIGN(end, MAX_ORDER_NR_PAGES); 3418 size = (end - start) * sizeof(struct page); 3419 map = alloc_remap(pgdat->node_id, size); 3420 if (!map) 3421 map = alloc_bootmem_node(pgdat, size); 3422 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 3423 } 3424 #ifndef CONFIG_NEED_MULTIPLE_NODES 3425 /* 3426 * With no DISCONTIG, the global mem_map is just set as node 0's 3427 */ 3428 if (pgdat == NODE_DATA(0)) { 3429 mem_map = NODE_DATA(0)->node_mem_map; 3430 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3431 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 3432 mem_map -= pgdat->node_start_pfn; 3433 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 3434 } 3435 #endif 3436 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3437 } 3438 3439 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, 3440 unsigned long *zones_size, unsigned long node_start_pfn, 3441 unsigned long *zholes_size) 3442 { 3443 pgdat->node_id = nid; 3444 pgdat->node_start_pfn = node_start_pfn; 3445 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3446 3447 alloc_node_mem_map(pgdat); 3448 3449 free_area_init_core(pgdat, zones_size, zholes_size); 3450 } 3451 3452 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3453 3454 #if MAX_NUMNODES > 1 3455 /* 3456 * Figure out the number of possible node ids. 3457 */ 3458 static void __init setup_nr_node_ids(void) 3459 { 3460 unsigned int node; 3461 unsigned int highest = 0; 3462 3463 for_each_node_mask(node, node_possible_map) 3464 highest = node; 3465 nr_node_ids = highest + 1; 3466 } 3467 #else 3468 static inline void setup_nr_node_ids(void) 3469 { 3470 } 3471 #endif 3472 3473 /** 3474 * add_active_range - Register a range of PFNs backed by physical memory 3475 * @nid: The node ID the range resides on 3476 * @start_pfn: The start PFN of the available physical memory 3477 * @end_pfn: The end PFN of the available physical memory 3478 * 3479 * These ranges are stored in an early_node_map[] and later used by 3480 * free_area_init_nodes() to calculate zone sizes and holes. If the 3481 * range spans a memory hole, it is up to the architecture to ensure 3482 * the memory is not freed by the bootmem allocator. If possible 3483 * the range being registered will be merged with existing ranges. 
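 *
 * Usage sketch (editor's addition; the ranges and zone limits below are
 * hypothetical, arch setup code normally derives them from e820, SRAT or
 * the device tree):
 *
 *	add_active_range(0, 0, 0x9f);
 *	add_active_range(0, 0x100, 0x80000);
 *	max_zone_pfns[ZONE_DMA] = 0x1000;
 *	max_zone_pfns[ZONE_NORMAL] = 0x80000;
 *	free_area_init_nodes(max_zone_pfns);
 *
 * registers two ranges on node 0 (below 640KB and from 1MB to 2GB) and
 * then lets free_area_init_nodes() size every zone from them.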
3484 */ 3485 void __init add_active_range(unsigned int nid, unsigned long start_pfn, 3486 unsigned long end_pfn) 3487 { 3488 int i; 3489 3490 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " 3491 "%d entries of %d used\n", 3492 nid, start_pfn, end_pfn, 3493 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3494 3495 /* Merge with existing active regions if possible */ 3496 for (i = 0; i < nr_nodemap_entries; i++) { 3497 if (early_node_map[i].nid != nid) 3498 continue; 3499 3500 /* Skip if an existing region covers this new one */ 3501 if (start_pfn >= early_node_map[i].start_pfn && 3502 end_pfn <= early_node_map[i].end_pfn) 3503 return; 3504 3505 /* Merge forward if suitable */ 3506 if (start_pfn <= early_node_map[i].end_pfn && 3507 end_pfn > early_node_map[i].end_pfn) { 3508 early_node_map[i].end_pfn = end_pfn; 3509 return; 3510 } 3511 3512 /* Merge backward if suitable */ 3513 if (start_pfn < early_node_map[i].end_pfn && 3514 end_pfn >= early_node_map[i].start_pfn) { 3515 early_node_map[i].start_pfn = start_pfn; 3516 return; 3517 } 3518 } 3519 3520 /* Check that early_node_map is large enough */ 3521 if (i >= MAX_ACTIVE_REGIONS) { 3522 printk(KERN_CRIT "More than %d memory regions, truncating\n", 3523 MAX_ACTIVE_REGIONS); 3524 return; 3525 } 3526 3527 early_node_map[i].nid = nid; 3528 early_node_map[i].start_pfn = start_pfn; 3529 early_node_map[i].end_pfn = end_pfn; 3530 nr_nodemap_entries = i + 1; 3531 } 3532 3533 /** 3534 * shrink_active_range - Shrink an existing registered range of PFNs 3535 * @nid: The node id the range is on that should be shrunk 3536 * @old_end_pfn: The old end PFN of the range 3537 * @new_end_pfn: The new PFN of the range 3538 * 3539 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. 3540 * The map is kept at the end physical page range that has already been 3541 * registered with add_active_range(). This function allows an arch to shrink 3542 * an existing registered range. 3543 */ 3544 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, 3545 unsigned long new_end_pfn) 3546 { 3547 int i; 3548 3549 /* Find the old active region end and shrink */ 3550 for_each_active_range_index_in_nid(i, nid) 3551 if (early_node_map[i].end_pfn == old_end_pfn) { 3552 early_node_map[i].end_pfn = new_end_pfn; 3553 break; 3554 } 3555 } 3556 3557 /** 3558 * remove_all_active_ranges - Remove all currently registered regions 3559 * 3560 * During discovery, it may be found that a table like SRAT is invalid 3561 * and an alternative discovery method must be used. This function removes 3562 * all currently registered regions. 
3563 */ 3564 void __init remove_all_active_ranges(void) 3565 { 3566 memset(early_node_map, 0, sizeof(early_node_map)); 3567 nr_nodemap_entries = 0; 3568 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 3569 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); 3570 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); 3571 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 3572 } 3573 3574 /* Compare two active node_active_regions */ 3575 static int __init cmp_node_active_region(const void *a, const void *b) 3576 { 3577 struct node_active_region *arange = (struct node_active_region *)a; 3578 struct node_active_region *brange = (struct node_active_region *)b; 3579 3580 /* Done this way to avoid overflows */ 3581 if (arange->start_pfn > brange->start_pfn) 3582 return 1; 3583 if (arange->start_pfn < brange->start_pfn) 3584 return -1; 3585 3586 return 0; 3587 } 3588 3589 /* sort the node_map by start_pfn */ 3590 static void __init sort_node_map(void) 3591 { 3592 sort(early_node_map, (size_t)nr_nodemap_entries, 3593 sizeof(struct node_active_region), 3594 cmp_node_active_region, NULL); 3595 } 3596 3597 /* Find the lowest pfn for a node */ 3598 unsigned long __init find_min_pfn_for_node(unsigned long nid) 3599 { 3600 int i; 3601 unsigned long min_pfn = ULONG_MAX; 3602 3603 /* Assuming a sorted map, the first range found has the starting pfn */ 3604 for_each_active_range_index_in_nid(i, nid) 3605 min_pfn = min(min_pfn, early_node_map[i].start_pfn); 3606 3607 if (min_pfn == ULONG_MAX) { 3608 printk(KERN_WARNING 3609 "Could not find start_pfn for node %lu\n", nid); 3610 return 0; 3611 } 3612 3613 return min_pfn; 3614 } 3615 3616 /** 3617 * find_min_pfn_with_active_regions - Find the minimum PFN registered 3618 * 3619 * It returns the minimum PFN based on information provided via 3620 * add_active_range(). 3621 */ 3622 unsigned long __init find_min_pfn_with_active_regions(void) 3623 { 3624 return find_min_pfn_for_node(MAX_NUMNODES); 3625 } 3626 3627 /** 3628 * find_max_pfn_with_active_regions - Find the maximum PFN registered 3629 * 3630 * It returns the maximum PFN based on information provided via 3631 * add_active_range(). 3632 */ 3633 unsigned long __init find_max_pfn_with_active_regions(void) 3634 { 3635 int i; 3636 unsigned long max_pfn = 0; 3637 3638 for (i = 0; i < nr_nodemap_entries; i++) 3639 max_pfn = max(max_pfn, early_node_map[i].end_pfn); 3640 3641 return max_pfn; 3642 } 3643 3644 /* 3645 * early_calculate_totalpages() 3646 * Sum pages in active regions for movable zone. 3647 * Populate N_HIGH_MEMORY for calculating usable_nodes. 3648 */ 3649 static unsigned long __init early_calculate_totalpages(void) 3650 { 3651 int i; 3652 unsigned long totalpages = 0; 3653 3654 for (i = 0; i < nr_nodemap_entries; i++) { 3655 unsigned long pages = early_node_map[i].end_pfn - 3656 early_node_map[i].start_pfn; 3657 totalpages += pages; 3658 if (pages) 3659 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); 3660 } 3661 return totalpages; 3662 } 3663 3664 /* 3665 * Find the PFN the Movable zone begins in each node. Kernel memory 3666 * is spread evenly between nodes as long as the nodes have enough 3667 * memory. 
When they don't, some nodes will have more kernelcore than 3668 * others 3669 */ 3670 void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3671 { 3672 int i, nid; 3673 unsigned long usable_startpfn; 3674 unsigned long kernelcore_node, kernelcore_remaining; 3675 unsigned long totalpages = early_calculate_totalpages(); 3676 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 3677 3678 /* 3679 * If movablecore was specified, calculate what size of 3680 * kernelcore that corresponds so that memory usable for 3681 * any allocation type is evenly spread. If both kernelcore 3682 * and movablecore are specified, then the value of kernelcore 3683 * will be used for required_kernelcore if it's greater than 3684 * what movablecore would have allowed. 3685 */ 3686 if (required_movablecore) { 3687 unsigned long corepages; 3688 3689 /* 3690 * Round-up so that ZONE_MOVABLE is at least as large as what 3691 * was requested by the user 3692 */ 3693 required_movablecore = 3694 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 3695 corepages = totalpages - required_movablecore; 3696 3697 required_kernelcore = max(required_kernelcore, corepages); 3698 } 3699 3700 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 3701 if (!required_kernelcore) 3702 return; 3703 3704 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 3705 find_usable_zone_for_movable(); 3706 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 3707 3708 restart: 3709 /* Spread kernelcore memory as evenly as possible throughout nodes */ 3710 kernelcore_node = required_kernelcore / usable_nodes; 3711 for_each_node_state(nid, N_HIGH_MEMORY) { 3712 /* 3713 * Recalculate kernelcore_node if the division per node 3714 * now exceeds what is necessary to satisfy the requested 3715 * amount of memory for the kernel 3716 */ 3717 if (required_kernelcore < kernelcore_node) 3718 kernelcore_node = required_kernelcore / usable_nodes; 3719 3720 /* 3721 * As the map is walked, we track how much memory is usable 3722 * by the kernel using kernelcore_remaining. When it is 3723 * 0, the rest of the node is usable by ZONE_MOVABLE 3724 */ 3725 kernelcore_remaining = kernelcore_node; 3726 3727 /* Go through each range of PFNs within this node */ 3728 for_each_active_range_index_in_nid(i, nid) { 3729 unsigned long start_pfn, end_pfn; 3730 unsigned long size_pages; 3731 3732 start_pfn = max(early_node_map[i].start_pfn, 3733 zone_movable_pfn[nid]); 3734 end_pfn = early_node_map[i].end_pfn; 3735 if (start_pfn >= end_pfn) 3736 continue; 3737 3738 /* Account for what is only usable for kernelcore */ 3739 if (start_pfn < usable_startpfn) { 3740 unsigned long kernel_pages; 3741 kernel_pages = min(end_pfn, usable_startpfn) 3742 - start_pfn; 3743 3744 kernelcore_remaining -= min(kernel_pages, 3745 kernelcore_remaining); 3746 required_kernelcore -= min(kernel_pages, 3747 required_kernelcore); 3748 3749 /* Continue if range is now fully accounted */ 3750 if (end_pfn <= usable_startpfn) { 3751 3752 /* 3753 * Push zone_movable_pfn to the end so 3754 * that if we have to rebalance 3755 * kernelcore across nodes, we will 3756 * not double account here 3757 */ 3758 zone_movable_pfn[nid] = end_pfn; 3759 continue; 3760 } 3761 start_pfn = usable_startpfn; 3762 } 3763 3764 /* 3765 * The usable PFN range for ZONE_MOVABLE is from 3766 * start_pfn->end_pfn. 
Calculate size_pages as the 3767 * number of pages used as kernelcore 3768 */ 3769 size_pages = end_pfn - start_pfn; 3770 if (size_pages > kernelcore_remaining) 3771 size_pages = kernelcore_remaining; 3772 zone_movable_pfn[nid] = start_pfn + size_pages; 3773 3774 /* 3775 * Some kernelcore has been met, update counts and 3776 * break if the kernelcore for this node has been 3777 * satisfied 3778 */ 3779 required_kernelcore -= min(required_kernelcore, 3780 size_pages); 3781 kernelcore_remaining -= size_pages; 3782 if (!kernelcore_remaining) 3783 break; 3784 } 3785 } 3786 3787 /* 3788 * If there is still required_kernelcore, we do another pass with one 3789 * less node in the count. This will push zone_movable_pfn[nid] further 3790 * along on the nodes that still have memory until kernelcore is 3791 * satisfied 3792 */ 3793 usable_nodes--; 3794 if (usable_nodes && required_kernelcore > usable_nodes) 3795 goto restart; 3796 3797 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 3798 for (nid = 0; nid < MAX_NUMNODES; nid++) 3799 zone_movable_pfn[nid] = 3800 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 3801 } 3802 3803 /* Any regular memory on that node? */ 3804 static void check_for_regular_memory(pg_data_t *pgdat) 3805 { 3806 #ifdef CONFIG_HIGHMEM 3807 enum zone_type zone_type; 3808 3809 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 3810 struct zone *zone = &pgdat->node_zones[zone_type]; 3811 if (zone->present_pages) 3812 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 3813 } 3814 #endif 3815 } 3816 3817 /** 3818 * free_area_init_nodes - Initialise all pg_data_t and zone data 3819 * @max_zone_pfn: an array of max PFNs for each zone 3820 * 3821 * This will call free_area_init_node() for each active node in the system. 3822 * Using the page ranges provided by add_active_range(), the size of each 3823 * zone in each node and their holes is calculated. If the maximum PFNs 3824 * of two adjacent zones match, the higher zone is assumed to be empty. 3825 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 3826 * that ZONE_DMA32 has no pages. It is also assumed that a zone 3827 * starts where the previous one ended. For example, ZONE_DMA32 starts 3828 * at arch_max_dma_pfn.
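 *
 * Worked example (the numbers are illustrative, not taken from this file):
 * a machine with 4GB of RAM and 4K pages might pass
 *
 *	max_zone_pfn[ZONE_DMA]    = 0x1000     (16MB)
 *	max_zone_pfn[ZONE_DMA32]  = 0x100000   (4GB)
 *	max_zone_pfn[ZONE_NORMAL] = 0x100000   (4GB)
 *
 * so ZONE_DMA spans [min_pfn, 0x1000), ZONE_DMA32 spans [0x1000, 0x100000),
 * and ZONE_NORMAL is treated as empty because its maximum PFN equals that
 * of ZONE_DMA32.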
3829 */ 3830 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 3831 { 3832 unsigned long nid; 3833 enum zone_type i; 3834 3835 /* Sort early_node_map as initialisation assumes it is sorted */ 3836 sort_node_map(); 3837 3838 /* Record where the zone boundaries are */ 3839 memset(arch_zone_lowest_possible_pfn, 0, 3840 sizeof(arch_zone_lowest_possible_pfn)); 3841 memset(arch_zone_highest_possible_pfn, 0, 3842 sizeof(arch_zone_highest_possible_pfn)); 3843 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 3844 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 3845 for (i = 1; i < MAX_NR_ZONES; i++) { 3846 if (i == ZONE_MOVABLE) 3847 continue; 3848 arch_zone_lowest_possible_pfn[i] = 3849 arch_zone_highest_possible_pfn[i-1]; 3850 arch_zone_highest_possible_pfn[i] = 3851 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 3852 } 3853 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 3854 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 3855 3856 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 3857 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 3858 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 3859 3860 /* Print out the zone ranges */ 3861 printk("Zone PFN ranges:\n"); 3862 for (i = 0; i < MAX_NR_ZONES; i++) { 3863 if (i == ZONE_MOVABLE) 3864 continue; 3865 printk(" %-8s %8lu -> %8lu\n", 3866 zone_names[i], 3867 arch_zone_lowest_possible_pfn[i], 3868 arch_zone_highest_possible_pfn[i]); 3869 } 3870 3871 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 3872 printk("Movable zone start PFN for each node\n"); 3873 for (i = 0; i < MAX_NUMNODES; i++) { 3874 if (zone_movable_pfn[i]) 3875 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 3876 } 3877 3878 /* Print out the early_node_map[] */ 3879 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 3880 for (i = 0; i < nr_nodemap_entries; i++) 3881 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, 3882 early_node_map[i].start_pfn, 3883 early_node_map[i].end_pfn); 3884 3885 /* Initialise every node */ 3886 setup_nr_node_ids(); 3887 for_each_online_node(nid) { 3888 pg_data_t *pgdat = NODE_DATA(nid); 3889 free_area_init_node(nid, pgdat, NULL, 3890 find_min_pfn_for_node(nid), NULL); 3891 3892 /* Any memory on that node */ 3893 if (pgdat->node_present_pages) 3894 node_set_state(nid, N_HIGH_MEMORY); 3895 check_for_regular_memory(pgdat); 3896 } 3897 } 3898 3899 static int __init cmdline_parse_core(char *p, unsigned long *core) 3900 { 3901 unsigned long long coremem; 3902 if (!p) 3903 return -EINVAL; 3904 3905 coremem = memparse(p, &p); 3906 *core = coremem >> PAGE_SHIFT; 3907 3908 /* Paranoid check that UL is enough for the coremem value */ 3909 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 3910 3911 return 0; 3912 } 3913 3914 /* 3915 * kernelcore=size sets the amount of memory for use for allocations that 3916 * cannot be reclaimed or migrated. 3917 */ 3918 static int __init cmdline_parse_kernelcore(char *p) 3919 { 3920 return cmdline_parse_core(p, &required_kernelcore); 3921 } 3922 3923 /* 3924 * movablecore=size sets the amount of memory for use for allocations that 3925 * can be reclaimed or migrated. 
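 *
 * Illustrative usage (assumed values, not from this file): booting with
 *
 *	kernelcore=512M movablecore=2G
 *
 * lands in cmdline_parse_core(), where memparse() accepts the usual
 * K/M/G suffixes and the result is converted to pages, effectively:
 *
 *	required_movablecore = memparse("2G", &p) >> PAGE_SHIFT;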
3926 */ 3927 static int __init cmdline_parse_movablecore(char *p) 3928 { 3929 return cmdline_parse_core(p, &required_movablecore); 3930 } 3931 3932 early_param("kernelcore", cmdline_parse_kernelcore); 3933 early_param("movablecore", cmdline_parse_movablecore); 3934 3935 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 3936 3937 /** 3938 * set_dma_reserve - set the specified number of pages reserved in the first zone 3939 * @new_dma_reserve: The number of pages to mark reserved 3940 * 3941 * The per-cpu batchsize and zone watermarks are determined by present_pages. 3942 * In the DMA zone, a significant percentage may be consumed by kernel image 3943 * and other unfreeable allocations which can skew the watermarks badly. This 3944 * function may optionally be used to account for unfreeable pages in the 3945 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 3946 * smaller per-cpu batchsize. 3947 */ 3948 void __init set_dma_reserve(unsigned long new_dma_reserve) 3949 { 3950 dma_reserve = new_dma_reserve; 3951 } 3952 3953 #ifndef CONFIG_NEED_MULTIPLE_NODES 3954 static bootmem_data_t contig_bootmem_data; 3955 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 3956 3957 EXPORT_SYMBOL(contig_page_data); 3958 #endif 3959 3960 void __init free_area_init(unsigned long *zones_size) 3961 { 3962 free_area_init_node(0, NODE_DATA(0), zones_size, 3963 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 3964 } 3965 3966 static int page_alloc_cpu_notify(struct notifier_block *self, 3967 unsigned long action, void *hcpu) 3968 { 3969 int cpu = (unsigned long)hcpu; 3970 3971 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3972 local_irq_disable(); 3973 __drain_pages(cpu); 3974 vm_events_fold_cpu(cpu); 3975 local_irq_enable(); 3976 refresh_cpu_vm_stats(cpu); 3977 } 3978 return NOTIFY_OK; 3979 } 3980 3981 void __init page_alloc_init(void) 3982 { 3983 hotcpu_notifier(page_alloc_cpu_notify, 0); 3984 } 3985 3986 /* 3987 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 3988 * or min_free_kbytes changes. 3989 */ 3990 static void calculate_totalreserve_pages(void) 3991 { 3992 struct pglist_data *pgdat; 3993 unsigned long reserve_pages = 0; 3994 enum zone_type i, j; 3995 3996 for_each_online_pgdat(pgdat) { 3997 for (i = 0; i < MAX_NR_ZONES; i++) { 3998 struct zone *zone = pgdat->node_zones + i; 3999 unsigned long max = 0; 4000 4001 /* Find valid and maximum lowmem_reserve in the zone */ 4002 for (j = i; j < MAX_NR_ZONES; j++) { 4003 if (zone->lowmem_reserve[j] > max) 4004 max = zone->lowmem_reserve[j]; 4005 } 4006 4007 /* we treat pages_high as reserved pages. */ 4008 max += zone->pages_high; 4009 4010 if (max > zone->present_pages) 4011 max = zone->present_pages; 4012 reserve_pages += max; 4013 } 4014 } 4015 totalreserve_pages = reserve_pages; 4016 } 4017 4018 /* 4019 * setup_per_zone_lowmem_reserve - called whenever 4020 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 4021 * has a correct pages reserved value, so an adequate number of 4022 * pages are left in the zone after a successful __alloc_pages(). 
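 *
 * Worked example (illustrative numbers): with a lowmem_reserve_ratio of 256
 * for ZONE_DMA and a 784M ZONE_NORMAL (200704 pages with 4K pages), the
 * loop below sets
 *
 *	ZONE_DMA->lowmem_reserve[ZONE_NORMAL] = 200704 / 256 = 784 pages
 *
 * i.e. roughly 3MB of ZONE_DMA is withheld from ZONE_NORMAL allocations.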
4023 */ 4024 static void setup_per_zone_lowmem_reserve(void) 4025 { 4026 struct pglist_data *pgdat; 4027 enum zone_type j, idx; 4028 4029 for_each_online_pgdat(pgdat) { 4030 for (j = 0; j < MAX_NR_ZONES; j++) { 4031 struct zone *zone = pgdat->node_zones + j; 4032 unsigned long present_pages = zone->present_pages; 4033 4034 zone->lowmem_reserve[j] = 0; 4035 4036 idx = j; 4037 while (idx) { 4038 struct zone *lower_zone; 4039 4040 idx--; 4041 4042 if (sysctl_lowmem_reserve_ratio[idx] < 1) 4043 sysctl_lowmem_reserve_ratio[idx] = 1; 4044 4045 lower_zone = pgdat->node_zones + idx; 4046 lower_zone->lowmem_reserve[j] = present_pages / 4047 sysctl_lowmem_reserve_ratio[idx]; 4048 present_pages += lower_zone->present_pages; 4049 } 4050 } 4051 } 4052 4053 /* update totalreserve_pages */ 4054 calculate_totalreserve_pages(); 4055 } 4056 4057 /** 4058 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4059 * 4060 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4061 * with respect to min_free_kbytes. 4062 */ 4063 void setup_per_zone_pages_min(void) 4064 { 4065 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4066 unsigned long lowmem_pages = 0; 4067 struct zone *zone; 4068 unsigned long flags; 4069 4070 /* Calculate total number of !ZONE_HIGHMEM pages */ 4071 for_each_zone(zone) { 4072 if (!is_highmem(zone)) 4073 lowmem_pages += zone->present_pages; 4074 } 4075 4076 for_each_zone(zone) { 4077 u64 tmp; 4078 4079 spin_lock_irqsave(&zone->lru_lock, flags); 4080 tmp = (u64)pages_min * zone->present_pages; 4081 do_div(tmp, lowmem_pages); 4082 if (is_highmem(zone)) { 4083 /* 4084 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 4085 * need highmem pages, so cap pages_min to a small 4086 * value here. 4087 * 4088 * The (pages_high-pages_low) and (pages_low-pages_min) 4089 * deltas control async page reclaim, and so should 4090 * not be capped for highmem. 4091 */ 4092 int min_pages; 4093 4094 min_pages = zone->present_pages / 1024; 4095 if (min_pages < SWAP_CLUSTER_MAX) 4096 min_pages = SWAP_CLUSTER_MAX; 4097 if (min_pages > 128) 4098 min_pages = 128; 4099 zone->pages_min = min_pages; 4100 } else { 4101 /* 4102 * If it's a lowmem zone, reserve a number of pages 4103 * proportional to the zone's size. 4104 */ 4105 zone->pages_min = tmp; 4106 } 4107 4108 zone->pages_low = zone->pages_min + (tmp >> 2); 4109 zone->pages_high = zone->pages_min + (tmp >> 1); 4110 setup_zone_migrate_reserve(zone); 4111 spin_unlock_irqrestore(&zone->lru_lock, flags); 4112 } 4113 4114 /* update totalreserve_pages */ 4115 calculate_totalreserve_pages(); 4116 } 4117 4118 /* 4119 * Initialise min_free_kbytes. 4120 * 4121 * For small machines we want it small (128k min). For large machines 4122 * we want it large (64MB max). But it is not linear, because network 4123 * bandwidth does not increase linearly with machine size.
We use 4124 * 4125 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 4126 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 4127 * 4128 * which yields 4129 * 4130 * 16MB: 512k 4131 * 32MB: 724k 4132 * 64MB: 1024k 4133 * 128MB: 1448k 4134 * 256MB: 2048k 4135 * 512MB: 2896k 4136 * 1024MB: 4096k 4137 * 2048MB: 5792k 4138 * 4096MB: 8192k 4139 * 8192MB: 11584k 4140 * 16384MB: 16384k 4141 */ 4142 static int __init init_per_zone_pages_min(void) 4143 { 4144 unsigned long lowmem_kbytes; 4145 4146 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 4147 4148 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 4149 if (min_free_kbytes < 128) 4150 min_free_kbytes = 128; 4151 if (min_free_kbytes > 65536) 4152 min_free_kbytes = 65536; 4153 setup_per_zone_pages_min(); 4154 setup_per_zone_lowmem_reserve(); 4155 return 0; 4156 } 4157 module_init(init_per_zone_pages_min) 4158 4159 /* 4160 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4161 * that we can call two helper functions whenever min_free_kbytes 4162 * changes. 4163 */ 4164 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 4165 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4166 { 4167 proc_dointvec(table, write, file, buffer, length, ppos); 4168 if (write) 4169 setup_per_zone_pages_min(); 4170 return 0; 4171 } 4172 4173 #ifdef CONFIG_NUMA 4174 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 4175 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4176 { 4177 struct zone *zone; 4178 int rc; 4179 4180 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4181 if (rc) 4182 return rc; 4183 4184 for_each_zone(zone) 4185 zone->min_unmapped_pages = (zone->present_pages * 4186 sysctl_min_unmapped_ratio) / 100; 4187 return 0; 4188 } 4189 4190 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 4191 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4192 { 4193 struct zone *zone; 4194 int rc; 4195 4196 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4197 if (rc) 4198 return rc; 4199 4200 for_each_zone(zone) 4201 zone->min_slab_pages = (zone->present_pages * 4202 sysctl_min_slab_ratio) / 100; 4203 return 0; 4204 } 4205 #endif 4206 4207 /* 4208 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 4209 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 4210 * whenever sysctl_lowmem_reserve_ratio changes. 4211 * 4212 * The reserve ratio obviously has absolutely no relation with the 4213 * pages_min watermarks. The lowmem reserve ratio can only make sense 4214 * if in function of the boot time zone sizes. 4215 */ 4216 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4217 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4218 { 4219 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4220 setup_per_zone_lowmem_reserve(); 4221 return 0; 4222 } 4223 4224 /* 4225 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 4226 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 4227 * can have before it gets flushed back to buddy allocator. 
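 *
 * Worked example (illustrative): writing 8 to this sysctl on a zone with
 * 262144 present pages (1GB with 4K pages) makes the handler below call
 *
 *	setup_pagelist_highmark(zone_pcp(zone, cpu), 262144 / 8);
 *
 * i.e. each per-cpu hot list may hold up to 32768 pages of that zone.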
4228 */ 4229 4230 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 4231 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4232 { 4233 struct zone *zone; 4234 unsigned int cpu; 4235 int ret; 4236 4237 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4238 if (!write || (ret == -EINVAL)) 4239 return ret; 4240 for_each_zone(zone) { 4241 for_each_online_cpu(cpu) { 4242 unsigned long high; 4243 high = zone->present_pages / percpu_pagelist_fraction; 4244 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4245 } 4246 } 4247 return 0; 4248 } 4249 4250 int hashdist = HASHDIST_DEFAULT; 4251 4252 #ifdef CONFIG_NUMA 4253 static int __init set_hashdist(char *str) 4254 { 4255 if (!str) 4256 return 0; 4257 hashdist = simple_strtoul(str, &str, 0); 4258 return 1; 4259 } 4260 __setup("hashdist=", set_hashdist); 4261 #endif 4262 4263 /* 4264 * allocate a large system hash table from bootmem 4265 * - it is assumed that the hash table must contain an exact power-of-2 4266 * quantity of entries 4267 * - limit is the number of hash buckets, not the total allocation size 4268 */ 4269 void *__init alloc_large_system_hash(const char *tablename, 4270 unsigned long bucketsize, 4271 unsigned long numentries, 4272 int scale, 4273 int flags, 4274 unsigned int *_hash_shift, 4275 unsigned int *_hash_mask, 4276 unsigned long limit) 4277 { 4278 unsigned long long max = limit; 4279 unsigned long log2qty, size; 4280 void *table = NULL; 4281 4282 /* allow the kernel cmdline to have a say */ 4283 if (!numentries) { 4284 /* round applicable memory size up to nearest megabyte */ 4285 numentries = nr_kernel_pages; 4286 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 4287 numentries >>= 20 - PAGE_SHIFT; 4288 numentries <<= 20 - PAGE_SHIFT; 4289 4290 /* limit to 1 bucket per 2^scale bytes of low memory */ 4291 if (scale > PAGE_SHIFT) 4292 numentries >>= (scale - PAGE_SHIFT); 4293 else 4294 numentries <<= (PAGE_SHIFT - scale); 4295 4296 /* Make sure we've got at least a 0-order allocation.. */ 4297 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 4298 numentries = PAGE_SIZE / bucketsize; 4299 } 4300 numentries = roundup_pow_of_two(numentries); 4301 4302 /* limit allocation size to 1/16 total memory by default */ 4303 if (max == 0) { 4304 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 4305 do_div(max, bucketsize); 4306 } 4307 4308 if (numentries > max) 4309 numentries = max; 4310 4311 log2qty = ilog2(numentries); 4312 4313 do { 4314 size = bucketsize << log2qty; 4315 if (flags & HASH_EARLY) 4316 table = alloc_bootmem(size); 4317 else if (hashdist) 4318 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4319 else { 4320 unsigned long order; 4321 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 4322 ; 4323 table = (void*) __get_free_pages(GFP_ATOMIC, order); 4324 /* 4325 * If bucketsize is not a power-of-two, we may free 4326 * some pages at the end of hash table. 
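 *
 * Worked example (illustrative numbers): with bucketsize = 24 and
 * log2qty = 9, size = 24 << 9 = 12288 bytes, which only needs three
 * pages but comes from an order-2 (four page) allocation; the
 * split_page() below lets the unused fourth page be handed back
 * with free_page().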
4327 */ 4328 if (table) { 4329 unsigned long alloc_end = (unsigned long)table + 4330 (PAGE_SIZE << order); 4331 unsigned long used = (unsigned long)table + 4332 PAGE_ALIGN(size); 4333 split_page(virt_to_page(table), order); 4334 while (used < alloc_end) { 4335 free_page(used); 4336 used += PAGE_SIZE; 4337 } 4338 } 4339 } 4340 } while (!table && size > PAGE_SIZE && --log2qty); 4341 4342 if (!table) 4343 panic("Failed to allocate %s hash table\n", tablename); 4344 4345 printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", 4346 tablename, 4347 (1U << log2qty), 4348 ilog2(size) - PAGE_SHIFT, 4349 size); 4350 4351 if (_hash_shift) 4352 *_hash_shift = log2qty; 4353 if (_hash_mask) 4354 *_hash_mask = (1 << log2qty) - 1; 4355 4356 return table; 4357 } 4358 4359 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 4360 struct page *pfn_to_page(unsigned long pfn) 4361 { 4362 return __pfn_to_page(pfn); 4363 } 4364 unsigned long page_to_pfn(struct page *page) 4365 { 4366 return __page_to_pfn(page); 4367 } 4368 EXPORT_SYMBOL(pfn_to_page); 4369 EXPORT_SYMBOL(page_to_pfn); 4370 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 4371 4372 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 4373 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 4374 unsigned long pfn) 4375 { 4376 #ifdef CONFIG_SPARSEMEM 4377 return __pfn_to_section(pfn)->pageblock_flags; 4378 #else 4379 return zone->pageblock_flags; 4380 #endif /* CONFIG_SPARSEMEM */ 4381 } 4382 4383 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 4384 { 4385 #ifdef CONFIG_SPARSEMEM 4386 pfn &= (PAGES_PER_SECTION-1); 4387 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 4388 #else 4389 pfn = pfn - zone->zone_start_pfn; 4390 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 4391 #endif /* CONFIG_SPARSEMEM */ 4392 } 4393 4394 /** 4395 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 4396 * @page: The page within the block of interest 4397 * @start_bitidx: The first bit of interest to retrieve 4398 * @end_bitidx: The last bit of interest 4399 * returns pageblock_bits flags 4400 */ 4401 unsigned long get_pageblock_flags_group(struct page *page, 4402 int start_bitidx, int end_bitidx) 4403 { 4404 struct zone *zone; 4405 unsigned long *bitmap; 4406 unsigned long pfn, bitidx; 4407 unsigned long flags = 0; 4408 unsigned long value = 1; 4409 4410 zone = page_zone(page); 4411 pfn = page_to_pfn(page); 4412 bitmap = get_pageblock_bitmap(zone, pfn); 4413 bitidx = pfn_to_bitidx(zone, pfn); 4414 4415 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 4416 if (test_bit(bitidx + start_bitidx, bitmap)) 4417 flags |= value; 4418 4419 return flags; 4420 } 4421 4422 /** 4423 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 4424 * @page: The page within the block of interest 4425 * @start_bitidx: The first bit of interest 4426 * @end_bitidx: The last bit of interest 4427 * @flags: The flags to set 4428 */ 4429 void set_pageblock_flags_group(struct page *page, unsigned long flags, 4430 int start_bitidx, int end_bitidx) 4431 { 4432 struct zone *zone; 4433 unsigned long *bitmap; 4434 unsigned long pfn, bitidx; 4435 unsigned long value = 1; 4436 4437 zone = page_zone(page); 4438 pfn = page_to_pfn(page); 4439 bitmap = get_pageblock_bitmap(zone, pfn); 4440 bitidx = pfn_to_bitidx(zone, pfn); 4441 4442 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 4443 if (flags & value) 4444 
__set_bit(bitidx + start_bitidx, bitmap); 4445 else 4446 __clear_bit(bitidx + start_bitidx, bitmap); 4447 } 4448 4449 /* 4450 * These are designed as helper functions; please see page_isolation.c too. 4451 * They set/clear a pageblock's migrate type to/from ISOLATE. 4452 * The page allocator never allocates memory from an ISOLATE pageblock. 4453 */ 4454 4455 int set_migratetype_isolate(struct page *page) 4456 { 4457 struct zone *zone; 4458 unsigned long flags; 4459 int ret = -EBUSY; 4460 4461 zone = page_zone(page); 4462 spin_lock_irqsave(&zone->lock, flags); 4463 /* 4464 * In the future, more migrate types will be able to be isolation targets. 4465 */ 4466 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 4467 goto out; 4468 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 4469 move_freepages_block(zone, page, MIGRATE_ISOLATE); 4470 ret = 0; 4471 out: 4472 spin_unlock_irqrestore(&zone->lock, flags); 4473 if (!ret) 4474 drain_all_local_pages(); 4475 return ret; 4476 } 4477 4478 void unset_migratetype_isolate(struct page *page) 4479 { 4480 struct zone *zone; 4481 unsigned long flags; 4482 zone = page_zone(page); 4483 spin_lock_irqsave(&zone->lock, flags); 4484 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 4485 goto out; 4486 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4487 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4488 out: 4489 spin_unlock_irqrestore(&zone->lock, flags); 4490 } 4491 4492 #ifdef CONFIG_MEMORY_HOTREMOVE 4493 /* 4494 * All pages in the range must be isolated before calling this. 4495 */ 4496 void 4497 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 4498 { 4499 struct page *page; 4500 struct zone *zone; 4501 int order, i; 4502 unsigned long pfn; 4503 unsigned long flags; 4504 /* find the first valid pfn */ 4505 for (pfn = start_pfn; pfn < end_pfn; pfn++) 4506 if (pfn_valid(pfn)) 4507 break; 4508 if (pfn == end_pfn) 4509 return; 4510 zone = page_zone(pfn_to_page(pfn)); 4511 spin_lock_irqsave(&zone->lock, flags); 4512 pfn = start_pfn; 4513 while (pfn < end_pfn) { 4514 if (!pfn_valid(pfn)) { 4515 pfn++; 4516 continue; 4517 } 4518 page = pfn_to_page(pfn); 4519 BUG_ON(page_count(page)); 4520 BUG_ON(!PageBuddy(page)); 4521 order = page_order(page); 4522 #ifdef CONFIG_DEBUG_VM 4523 printk(KERN_INFO "remove from free list %lx %d %lx\n", 4524 pfn, 1 << order, end_pfn); 4525 #endif 4526 list_del(&page->lru); 4527 rmv_page_order(page); 4528 zone->free_area[order].nr_free--; 4529 __mod_zone_page_state(zone, NR_FREE_PAGES, 4530 - (1UL << order)); 4531 for (i = 0; i < (1 << order); i++) 4532 SetPageReserved((page+i)); 4533 pfn += (1 << order); 4534 } 4535 spin_unlock_irqrestore(&zone->lock, flags); 4536 } 4537 #endif 4538
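
/*
 * Illustrative caller of alloc_large_system_hash(), compiled out. The names
 * (example_hash_*, EXAMPLE_HASH_SCALE) are made up for this sketch and are
 * not part of the kernel; real users such as the inode and dentry caches
 * follow the same pattern: size the table by low memory, let "scale" pick
 * one bucket per 2^scale bytes, and receive the shift/mask used for hashing.
 */
#if 0
static struct hlist_head *example_hash_table;
static unsigned int example_hash_shift;
static unsigned int example_hash_mask;
#define EXAMPLE_HASH_SCALE 14	/* one bucket per 16KB of low memory */

static void __init example_hash_init(void)
{
	example_hash_table = alloc_large_system_hash("Example cache",
					sizeof(struct hlist_head),
					0,		/* size from memory */
					EXAMPLE_HASH_SCALE,
					HASH_EARLY,	/* allocate from bootmem */
					&example_hash_shift,
					&example_hash_mask,
					0);		/* default limit */
}
#endif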