/*
 * linux/mm/memory_hotplug.c
 *
 * Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

/* The same as the cpu_hotplug lock, but for memory hotplug. */
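/*
 * A brief sketch of the scheme: readers (get_online_mems()) bump
 * ->refcount under ->lock; the single writer (mem_hotplug_begin())
 * records itself in ->active_writer and sleeps until ->refcount drains
 * to zero. Reads issued from the writer's own task are no-ops, so the
 * writer may safely re-enter read-side paths.
 */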
static struct {
	struct task_struct *active_writer;
	struct mutex lock; /* Synchronizes accesses to refcount, */
	/*
	 * Also blocks the new readers during
	 * an ongoing mem hotplug operation.
	 */
	int refcount;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} mem_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
	.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	.dep_map = {.name = "mem_hotplug.lock" },
#endif
};

/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
#define memhp_lock_acquire_read()	lock_map_acquire_read(&mem_hotplug.dep_map)
#define memhp_lock_acquire()		lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release()		lock_map_release(&mem_hotplug.dep_map)

void get_online_mems(void)
{
	might_sleep();
	if (mem_hotplug.active_writer == current)
		return;
	memhp_lock_acquire_read();
	mutex_lock(&mem_hotplug.lock);
	mem_hotplug.refcount++;
	mutex_unlock(&mem_hotplug.lock);
}

void put_online_mems(void)
{
	if (mem_hotplug.active_writer == current)
		return;
	mutex_lock(&mem_hotplug.lock);

	if (WARN_ON(!mem_hotplug.refcount))
		mem_hotplug.refcount++; /* try to fix things up */

	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
		wake_up_process(mem_hotplug.active_writer);
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();
}

static void mem_hotplug_begin(void)
{
	mem_hotplug.active_writer = current;

	memhp_lock_acquire();
	for (;;) {
		mutex_lock(&mem_hotplug.lock);
		if (likely(!mem_hotplug.refcount))
			break;
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&mem_hotplug.lock);
		schedule();
	}
}

static void mem_hotplug_done(void)
{
	mem_hotplug.active_writer = NULL;
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();
}
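
/*
 * Note: register_memory_resource() below returns NULL (after a debug
 * message) when the range collides with an already registered
 * resource; add_memory() maps that to -EEXIST.
 */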
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}
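
/*
 * The register_page_bootmem_info_*() helpers below use
 * get_page_bootmem() to pin every bootmem-allocated page that backs a
 * node's metadata (memmap, usemap, pgdat), so that hot-remove can
 * recognize such pages and release them via put_page_bootmem().
 */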
#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
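
/*
 * Register bootmem info for a whole node: the pgdat itself, each
 * initialized zone's wait table, and the memmap/usemap of every valid
 * section that resides on this node.
 */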
void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone_is_initialized(zone)) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN. To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other node.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone_end_pfn(zone);
	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Be consistent with free_area_init_core():
		 * if spanned_pages == 0, keep zone_start_pfn == 0 too.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages,
						 MEMMAP_HOTPLUG);
	return 0;
}
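
/*
 * Move [start_pfn, end_pfn) from the bottom of @z2 into the adjacent
 * lower zone @z1. move_pfn_range_right() below is the mirror
 * operation; both are used by online_pages() to retype a range as
 * ONLINE_KERNEL or ONLINE_MOVABLE.
 */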
static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must be included in/overlap @z2 */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (!zone_is_empty(z1))
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must be included in/overlap @z1 */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (!zone_is_empty(z2))
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}
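
/*
 * Hot-add one memory section: allocate its memmap via the sparse
 * layer, grow the zone/pgdat spans, and register the sysfs memory
 * block for it.
 */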
static int __meminit __add_section(int nid, struct zone *zone,
				   unsigned long phys_start_pfn)
{
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory. It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
		      unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* align the hot-added range to sections when initializing the mem_map */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * -EEXIST is finally dealt with by the ioresource collision
		 * check; see add_memory() => register_memory_resource().
		 * A warning is printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}
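
/*
 * Shrink the span of @zone after [start_pfn, end_pfn) goes away.
 * Three cases: the range sat at the start of the zone (advance
 * zone_start_pfn), at the end (trim spanned_pages), or in the middle
 * (leave the span alone unless no valid section remains at all).
 */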
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
	unsigned long zone_end_pfn = z;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we need
		 * to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the zone, it only creates a hole in the zone. So in this case,
	 * we need not change the zone. But the zone may consist of nothing
	 * but holes now, so check whether any valid section remains.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}
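
/* The node-level counterpart of shrink_zone_span(). */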
static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
	unsigned long pgdat_end_pfn = p;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we need
		 * to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, it only creates a hole in the pgdat. So in this case,
	 * we need not change the pgdat. But the pgdat may consist of nothing
	 * but holes now, so check whether any valid section remains.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
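
/*
 * Tear down one section: drop its sysfs memory block, shrink the
 * zone/pgdat spans, and free the section's memmap via the sparse
 * layer.
 */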
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		   unsigned long nr_pages)
{
	unsigned long i;
	int sections_to_remove;
	resource_size_t start, size;
	int ret = 0;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	start = phys_start_pfn << PAGE_SHIFT;
	size = nr_pages * PAGE_SIZE;
	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret) {
		resource_size_t endres = start + size - 1;

		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
			&start, &endres, ret);
	}

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);
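
/*
 * Swap the generic handler back in. This only succeeds if @callback is
 * the callback currently installed, e.g. a ballooning driver
 * unregistering itself.
 */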
int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			      void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE is set, we permit onlining of a node which
 * doesn't have normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which states of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or a movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * so set zone_last to ZONE_NORMAL.
	 *
	 * If we have neither HIGHMEM nor a movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, so set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last didn't have memory before onlining, we
	 * will need to set the node to node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have a movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * so set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have a movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * so set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node didn't have memory before onlining, we will need to
	 * set the node to node_states[N_MEMORY] after the memory
	 * is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}
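
/*
 * Online [pfn, pfn + nr_pages) according to @online_type: fire the
 * MEM_GOING_ONLINE notifier, hand each reserved page to the
 * online_page_callback, rebuild zonelists if the zone just became
 * populated, and recompute watermarks before announcing MEM_ONLINE.
 */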
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	mem_hotplug_begin();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	ret = -EINVAL;
	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone))
		goto out;

	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
			goto out;
	}
	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
			goto out;
	}

	/* The previous code may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = pfn_to_nid(pfn);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		goto out;
	}
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after onlining.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			<< PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		goto out;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
out:
	mem_hotplug_done();
	return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages. */
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing a not-yet-initialized zonelist, build it here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/**
 * try_online_node - online a node if offlined
 *
 * called by cpu_up() to online a node without onlined memory.
 */
int try_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	if (node_online(nid))
		return 0;

	mem_hotplug_begin();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	}

out:
	mem_hotplug_done();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	u64 start_pfn = start >> PAGE_SHIFT;
	u64 nr_pages = size >> PAGE_SHIFT;

	/* The memory range must be aligned to sections */
	if ((start_pfn & ~PAGE_SECTION_MASK) ||
	    (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
		pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
		       (unsigned long long)start,
		       (unsigned long long)size);
		return -EINVAL;
	}

	return 0;
}
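
/*
 * Hot-add a section-aligned physical range to node @nid: claim the
 * iomem resource, allocate a pgdat if the node is new, call the arch
 * hook to create the mappings, mark the node online, and record the
 * range in the firmware memmap. The pages still require a subsequent
 * online_pages() before they become usable.
 */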
115931168481SAl Viro /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
116031168481SAl Viro int __ref add_memory(int nid, u64 start, u64 size)
1161bc02af93SYasunori Goto {
11629af3c2deSYasunori Goto pg_data_t *pgdat = NULL;
1163a1e565aaSTang Chen bool new_pgdat;
1164a1e565aaSTang Chen bool new_node;
1165ebd15302SKAMEZAWA Hiroyuki struct resource *res;
1166bc02af93SYasunori Goto int ret;
1167bc02af93SYasunori Goto
116827356f54SToshi Kani ret = check_hotplug_memory_range(start, size);
116927356f54SToshi Kani if (ret)
117027356f54SToshi Kani return ret;
117127356f54SToshi Kani
1172ebd15302SKAMEZAWA Hiroyuki res = register_memory_resource(start, size);
11736ad696d2SAndi Kleen ret = -EEXIST;
1174ebd15302SKAMEZAWA Hiroyuki if (!res)
1175ac13c462SNathan Zimmer return ret;
1176ebd15302SKAMEZAWA Hiroyuki
1177a1e565aaSTang Chen { /* Stupid hack to suppress address-never-null warning */
1178a1e565aaSTang Chen void *p = NODE_DATA(nid);
1179a1e565aaSTang Chen new_pgdat = !p;
1180a1e565aaSTang Chen }
1181ac13c462SNathan Zimmer
1182*bfc8c901SVladimir Davydov mem_hotplug_begin();
1183ac13c462SNathan Zimmer
1184a1e565aaSTang Chen new_node = !node_online(nid);
1185a1e565aaSTang Chen if (new_node) {
11869af3c2deSYasunori Goto pgdat = hotadd_new_pgdat(nid, start);
11876ad696d2SAndi Kleen ret = -ENOMEM;
11889af3c2deSYasunori Goto if (!pgdat)
118941b9e2d7SWen Congyang goto error;
11909af3c2deSYasunori Goto }
11919af3c2deSYasunori Goto
1192bc02af93SYasunori Goto /* call arch's memory hotadd */
1193bc02af93SYasunori Goto ret = arch_add_memory(nid, start, size);
1194bc02af93SYasunori Goto
11959af3c2deSYasunori Goto if (ret < 0)
11969af3c2deSYasunori Goto goto error;
11979af3c2deSYasunori Goto
11980fc44159SYasunori Goto /* We online the node here; we can't roll back from here. */
11999af3c2deSYasunori Goto node_set_online(nid);
12009af3c2deSYasunori Goto
1201a1e565aaSTang Chen if (new_node) {
12020fc44159SYasunori Goto ret = register_one_node(nid);
12030fc44159SYasunori Goto /*
12040fc44159SYasunori Goto * If the sysfs file of the new node can't be created, cpus on
12050fc44159SYasunori Goto * the node can't be hot-added. There is no way to roll back now.
12060fc44159SYasunori Goto * So, check by BUG_ON() to catch it reluctantly.
12070fc44159SYasunori Goto */
12080fc44159SYasunori Goto BUG_ON(ret);
12090fc44159SYasunori Goto }
12100fc44159SYasunori Goto
1211d96ae530Sakpm@linux-foundation.org /* create new memmap entry */
1212d96ae530Sakpm@linux-foundation.org firmware_map_add_hotplug(start, start + size, "System RAM");
1213d96ae530Sakpm@linux-foundation.org
12146ad696d2SAndi Kleen goto out;
12156ad696d2SAndi Kleen
12169af3c2deSYasunori Goto error:
12179af3c2deSYasunori Goto /* rollback pgdat allocation and others */
12189af3c2deSYasunori Goto if (new_pgdat)
12199af3c2deSYasunori Goto rollback_node_hotadd(nid, pgdat);
1220ebd15302SKAMEZAWA Hiroyuki release_memory_resource(res);
12219af3c2deSYasunori Goto
12226ad696d2SAndi Kleen out:
1223*bfc8c901SVladimir Davydov mem_hotplug_done();
1224bc02af93SYasunori Goto return ret;
1225bc02af93SYasunori Goto }
1226bc02af93SYasunori Goto EXPORT_SYMBOL_GPL(add_memory);
12270c0e6195SKAMEZAWA Hiroyuki
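/*
 * Editor's note -- a minimal usage sketch, not upstream code: roughly how
 * a hotplug driver (e.g. the ACPI memory-device driver) calls add_memory().
 * The function name and parameters below are hypothetical; the caller is
 * expected to hold the device hotplug lock.
 */
#if 0
static int example_hot_add(int nid, u64 start, u64 size)
{
	int ret;

	lock_device_hotplug();
	ret = add_memory(nid, start, size);	/* register resource + sections */
	unlock_device_hotplug();
	/* Memory is added in the offline state; userspace onlines it later. */
	return ret;
}
#endif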
12280c0e6195SKAMEZAWA Hiroyuki #ifdef CONFIG_MEMORY_HOTREMOVE
12290c0e6195SKAMEZAWA Hiroyuki /*
12305c755e9fSBadari Pulavarty * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
12315c755e9fSBadari Pulavarty * set and the size of the free page is given by page_order(). Using this,
12325c755e9fSBadari Pulavarty * the function determines if the pageblock contains only free pages.
12335c755e9fSBadari Pulavarty * Due to buddy constraints, a free page at least the size of a pageblock will
12345c755e9fSBadari Pulavarty * be located at the start of the pageblock.
12355c755e9fSBadari Pulavarty */
12365c755e9fSBadari Pulavarty static inline int pageblock_free(struct page *page)
12375c755e9fSBadari Pulavarty {
12385c755e9fSBadari Pulavarty return PageBuddy(page) && page_order(page) >= pageblock_order;
12395c755e9fSBadari Pulavarty }
12405c755e9fSBadari Pulavarty
12415c755e9fSBadari Pulavarty /* Return the start of the next active pageblock after a given page */
12425c755e9fSBadari Pulavarty static struct page *next_active_pageblock(struct page *page)
12435c755e9fSBadari Pulavarty {
12445c755e9fSBadari Pulavarty /* Ensure the starting page is pageblock-aligned */
12455c755e9fSBadari Pulavarty BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
12465c755e9fSBadari Pulavarty
12475c755e9fSBadari Pulavarty /* If the entire pageblock is free, move to the end of the free page */
12480dcc48c1SKAMEZAWA Hiroyuki if (pageblock_free(page)) {
12490dcc48c1SKAMEZAWA Hiroyuki int order;
12500dcc48c1SKAMEZAWA Hiroyuki /* Be careful: we don't have locks, page_order can be changed. */
12510dcc48c1SKAMEZAWA Hiroyuki order = page_order(page);
12520dcc48c1SKAMEZAWA Hiroyuki if ((order < MAX_ORDER) && (order >= pageblock_order))
12530dcc48c1SKAMEZAWA Hiroyuki return page + (1 << order);
12540dcc48c1SKAMEZAWA Hiroyuki }
12555c755e9fSBadari Pulavarty
12560dcc48c1SKAMEZAWA Hiroyuki return page + pageblock_nr_pages;
12575c755e9fSBadari Pulavarty }
12585c755e9fSBadari Pulavarty
12595c755e9fSBadari Pulavarty /* Checks if this range of memory is likely to be hot-removable. */
12605c755e9fSBadari Pulavarty int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
12615c755e9fSBadari Pulavarty {
12625c755e9fSBadari Pulavarty struct page *page = pfn_to_page(start_pfn);
12635c755e9fSBadari Pulavarty struct page *end_page = page + nr_pages;
12645c755e9fSBadari Pulavarty
12655c755e9fSBadari Pulavarty /* Check the starting page of each pageblock within the range */
12665c755e9fSBadari Pulavarty for (; page < end_page; page = next_active_pageblock(page)) {
126749ac8255SKAMEZAWA Hiroyuki if (!is_pageblock_removable_nolock(page))
12685c755e9fSBadari Pulavarty return 0;
126949ac8255SKAMEZAWA Hiroyuki cond_resched();
12705c755e9fSBadari Pulavarty }
12715c755e9fSBadari Pulavarty
12725c755e9fSBadari Pulavarty /* All pageblocks in the memory block are likely to be hot-removable */
12735c755e9fSBadari Pulavarty return 1;
12745c755e9fSBadari Pulavarty }
12755c755e9fSBadari Pulavarty
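/*
 * Editor's note -- a worked example under assumed defaults, not upstream
 * code: on x86_64 with hugetlb, pageblock_order is typically 9, so
 * pageblock_nr_pages == 512 (2 MiB).  If the page at pfn 0x10000 starts a
 * free buddy page of order 10, next_active_pageblock() returns
 * page + (1 << 10), skipping two whole pageblocks at once; otherwise it
 * steps by exactly one pageblock (512 pfns).
 */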
12765c755e9fSBadari Pulavarty /*
12770c0e6195SKAMEZAWA Hiroyuki * Confirm that all pages in a range [start, end) belong to the same zone.
12780c0e6195SKAMEZAWA Hiroyuki */
12790c0e6195SKAMEZAWA Hiroyuki static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
12800c0e6195SKAMEZAWA Hiroyuki {
12810c0e6195SKAMEZAWA Hiroyuki unsigned long pfn;
12820c0e6195SKAMEZAWA Hiroyuki struct zone *zone = NULL;
12830c0e6195SKAMEZAWA Hiroyuki struct page *page;
12840c0e6195SKAMEZAWA Hiroyuki int i;
12850c0e6195SKAMEZAWA Hiroyuki for (pfn = start_pfn;
12860c0e6195SKAMEZAWA Hiroyuki pfn < end_pfn;
12870c0e6195SKAMEZAWA Hiroyuki pfn += MAX_ORDER_NR_PAGES) {
12880c0e6195SKAMEZAWA Hiroyuki i = 0;
12890c0e6195SKAMEZAWA Hiroyuki /* This is just a CONFIG_HOLES_IN_ZONE check. */
12900c0e6195SKAMEZAWA Hiroyuki while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
12910c0e6195SKAMEZAWA Hiroyuki i++;
12920c0e6195SKAMEZAWA Hiroyuki if (i == MAX_ORDER_NR_PAGES)
12930c0e6195SKAMEZAWA Hiroyuki continue;
12940c0e6195SKAMEZAWA Hiroyuki page = pfn_to_page(pfn + i);
12950c0e6195SKAMEZAWA Hiroyuki if (zone && page_zone(page) != zone)
12960c0e6195SKAMEZAWA Hiroyuki return 0;
12970c0e6195SKAMEZAWA Hiroyuki zone = page_zone(page);
12980c0e6195SKAMEZAWA Hiroyuki }
12990c0e6195SKAMEZAWA Hiroyuki return 1;
13000c0e6195SKAMEZAWA Hiroyuki }
13010c0e6195SKAMEZAWA Hiroyuki
13020c0e6195SKAMEZAWA Hiroyuki /*
1303c8721bbbSNaoya Horiguchi * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
1304c8721bbbSNaoya Horiguchi * and hugepages). We scan by pfn because it's much easier than scanning over
1305c8721bbbSNaoya Horiguchi * a linked list. This function returns the pfn of the first movable page
1306c8721bbbSNaoya Horiguchi * found, otherwise 0.
13070c0e6195SKAMEZAWA Hiroyuki */
1308c8721bbbSNaoya Horiguchi static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
13090c0e6195SKAMEZAWA Hiroyuki {
13100c0e6195SKAMEZAWA Hiroyuki unsigned long pfn;
13110c0e6195SKAMEZAWA Hiroyuki struct page *page;
13120c0e6195SKAMEZAWA Hiroyuki for (pfn = start; pfn < end; pfn++) {
13130c0e6195SKAMEZAWA Hiroyuki if (pfn_valid(pfn)) {
13140c0e6195SKAMEZAWA Hiroyuki page = pfn_to_page(pfn);
13150c0e6195SKAMEZAWA Hiroyuki if (PageLRU(page))
13160c0e6195SKAMEZAWA Hiroyuki return pfn;
1317c8721bbbSNaoya Horiguchi if (PageHuge(page)) {
1318c8721bbbSNaoya Horiguchi if (is_hugepage_active(page))
1319c8721bbbSNaoya Horiguchi return pfn;
1320c8721bbbSNaoya Horiguchi else
1321c8721bbbSNaoya Horiguchi pfn = round_up(pfn + 1,
1322c8721bbbSNaoya Horiguchi 1 << compound_order(page)) - 1;
1323c8721bbbSNaoya Horiguchi }
13240c0e6195SKAMEZAWA Hiroyuki }
13250c0e6195SKAMEZAWA Hiroyuki }
13260c0e6195SKAMEZAWA Hiroyuki return 0;
13270c0e6195SKAMEZAWA Hiroyuki }
13280c0e6195SKAMEZAWA Hiroyuki
13290c0e6195SKAMEZAWA Hiroyuki #define NR_OFFLINE_AT_ONCE_PAGES (256)
13300c0e6195SKAMEZAWA Hiroyuki static int
13310c0e6195SKAMEZAWA Hiroyuki do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
13320c0e6195SKAMEZAWA Hiroyuki {
13330c0e6195SKAMEZAWA Hiroyuki unsigned long pfn;
13340c0e6195SKAMEZAWA Hiroyuki struct page *page;
13350c0e6195SKAMEZAWA Hiroyuki int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
13360c0e6195SKAMEZAWA Hiroyuki int not_managed = 0;
13370c0e6195SKAMEZAWA Hiroyuki int ret = 0;
13380c0e6195SKAMEZAWA Hiroyuki LIST_HEAD(source);
13390c0e6195SKAMEZAWA Hiroyuki
13400c0e6195SKAMEZAWA Hiroyuki for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
13410c0e6195SKAMEZAWA Hiroyuki if (!pfn_valid(pfn))
13420c0e6195SKAMEZAWA Hiroyuki continue;
13430c0e6195SKAMEZAWA Hiroyuki page = pfn_to_page(pfn);
1344c8721bbbSNaoya Horiguchi
1345c8721bbbSNaoya Horiguchi if (PageHuge(page)) {
1346c8721bbbSNaoya Horiguchi struct page *head = compound_head(page);
1347c8721bbbSNaoya Horiguchi pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1348c8721bbbSNaoya Horiguchi if (compound_order(head) > PFN_SECTION_SHIFT) {
1349c8721bbbSNaoya Horiguchi ret = -EBUSY;
1350c8721bbbSNaoya Horiguchi break;
1351c8721bbbSNaoya Horiguchi }
1352c8721bbbSNaoya Horiguchi if (isolate_huge_page(page, &source))
1353c8721bbbSNaoya Horiguchi move_pages -= 1 << compound_order(head);
1354c8721bbbSNaoya Horiguchi continue;
1355c8721bbbSNaoya Horiguchi }
1356c8721bbbSNaoya Horiguchi
1357700c2a46SKonstantin Khlebnikov if (!get_page_unless_zero(page))
13580c0e6195SKAMEZAWA Hiroyuki continue;
13590c0e6195SKAMEZAWA Hiroyuki /*
13600c0e6195SKAMEZAWA Hiroyuki * We can skip free pages, and we can only deal with pages on
13610c0e6195SKAMEZAWA Hiroyuki * the LRU.
13620c0e6195SKAMEZAWA Hiroyuki */
136362695a84SNick Piggin ret = isolate_lru_page(page);
13640c0e6195SKAMEZAWA Hiroyuki if (!ret) { /* Success */
1365700c2a46SKonstantin Khlebnikov put_page(page);
136662695a84SNick Piggin list_add_tail(&page->lru, &source);
13670c0e6195SKAMEZAWA Hiroyuki move_pages--;
13686d9c285aSKOSAKI Motohiro inc_zone_page_state(page, NR_ISOLATED_ANON +
13696d9c285aSKOSAKI Motohiro page_is_file_cache(page));
13706d9c285aSKOSAKI Motohiro
13710c0e6195SKAMEZAWA Hiroyuki } else {
13720c0e6195SKAMEZAWA Hiroyuki #ifdef CONFIG_DEBUG_VM
1373718a3821SWu Fengguang printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
1374718a3821SWu Fengguang pfn);
1375f0b791a3SDave Hansen dump_page(page, "failed to remove from LRU");
13760c0e6195SKAMEZAWA Hiroyuki #endif
1377700c2a46SKonstantin Khlebnikov put_page(page);
137825985edcSLucas De Marchi /* Because we don't hold the big zone->lock, we should
1379809c4449SBob Liu check this again here. */
1380809c4449SBob Liu if (page_count(page)) {
1381809c4449SBob Liu not_managed++;
1382f3ab2636SBob Liu ret = -EBUSY;
1383809c4449SBob Liu break;
1384809c4449SBob Liu }
13850c0e6195SKAMEZAWA Hiroyuki }
13860c0e6195SKAMEZAWA Hiroyuki }
1387f3ab2636SBob Liu if (!list_empty(&source)) {
13880c0e6195SKAMEZAWA Hiroyuki if (not_managed) {
1389c8721bbbSNaoya Horiguchi putback_movable_pages(&source);
13900c0e6195SKAMEZAWA Hiroyuki goto out;
13910c0e6195SKAMEZAWA Hiroyuki }
139274c08f98SMinchan Kim
139374c08f98SMinchan Kim /*
139474c08f98SMinchan Kim * alloc_migrate_target should be improved!
139574c08f98SMinchan Kim * migrate_pages returns # of failed pages.
139674c08f98SMinchan Kim */
139774c08f98SMinchan Kim ret = migrate_pages(&source, alloc_migrate_target, 0,
13989c620e2bSHugh Dickins MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1399cf608ac1SMinchan Kim if (ret)
1400c8721bbbSNaoya Horiguchi putback_movable_pages(&source);
1401f3ab2636SBob Liu }
14020c0e6195SKAMEZAWA Hiroyuki out:
14030c0e6195SKAMEZAWA Hiroyuki return ret;
14040c0e6195SKAMEZAWA Hiroyuki }
14050c0e6195SKAMEZAWA Hiroyuki
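/*
 * Editor's note -- an illustrative sketch, not upstream code: the allocator
 * callback passed to migrate_pages() above has the new_page_t shape of this
 * kernel generation.  alloc_migrate_target() (mm/page_isolation.c) is the
 * real implementation; "example_target" below is hypothetical.
 */
#if 0
static struct page *example_target(struct page *page, unsigned long private,
				   int **resultp)
{
	/* Allocate a destination page outside the range being offlined. */
	return alloc_page(GFP_HIGHUSER_MOVABLE);
}
#endif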
14060c0e6195SKAMEZAWA Hiroyuki /*
14070c0e6195SKAMEZAWA Hiroyuki * remove from free_area[] and mark all pages as Reserved.
14080c0e6195SKAMEZAWA Hiroyuki */
14090c0e6195SKAMEZAWA Hiroyuki static int
14100c0e6195SKAMEZAWA Hiroyuki offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
14110c0e6195SKAMEZAWA Hiroyuki void *data)
14120c0e6195SKAMEZAWA Hiroyuki {
14130c0e6195SKAMEZAWA Hiroyuki __offline_isolated_pages(start, start + nr_pages);
14140c0e6195SKAMEZAWA Hiroyuki return 0;
14150c0e6195SKAMEZAWA Hiroyuki }
14160c0e6195SKAMEZAWA Hiroyuki
14170c0e6195SKAMEZAWA Hiroyuki static void
14180c0e6195SKAMEZAWA Hiroyuki offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
14190c0e6195SKAMEZAWA Hiroyuki {
1420908eedc6SKAMEZAWA Hiroyuki walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
14210c0e6195SKAMEZAWA Hiroyuki offline_isolated_pages_cb);
14220c0e6195SKAMEZAWA Hiroyuki }
14230c0e6195SKAMEZAWA Hiroyuki
14240c0e6195SKAMEZAWA Hiroyuki /*
14250c0e6195SKAMEZAWA Hiroyuki * Check that all pages in the range, recorded as a memory resource, are isolated.
14260c0e6195SKAMEZAWA Hiroyuki */
14270c0e6195SKAMEZAWA Hiroyuki static int
14280c0e6195SKAMEZAWA Hiroyuki check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
14290c0e6195SKAMEZAWA Hiroyuki void *data)
14300c0e6195SKAMEZAWA Hiroyuki {
14310c0e6195SKAMEZAWA Hiroyuki int ret;
14320c0e6195SKAMEZAWA Hiroyuki long offlined = *(long *)data;
1433b023f468SWen Congyang ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
14340c0e6195SKAMEZAWA Hiroyuki offlined = nr_pages;
14350c0e6195SKAMEZAWA Hiroyuki if (!ret)
14360c0e6195SKAMEZAWA Hiroyuki *(long *)data += offlined;
14370c0e6195SKAMEZAWA Hiroyuki return ret;
14380c0e6195SKAMEZAWA Hiroyuki }
14390c0e6195SKAMEZAWA Hiroyuki
14400c0e6195SKAMEZAWA Hiroyuki static long
14410c0e6195SKAMEZAWA Hiroyuki check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
14420c0e6195SKAMEZAWA Hiroyuki {
14430c0e6195SKAMEZAWA Hiroyuki long offlined = 0;
14440c0e6195SKAMEZAWA Hiroyuki int ret;
14450c0e6195SKAMEZAWA Hiroyuki
1446908eedc6SKAMEZAWA Hiroyuki ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
14470c0e6195SKAMEZAWA Hiroyuki check_pages_isolated_cb);
14480c0e6195SKAMEZAWA Hiroyuki if (ret < 0)
14490c0e6195SKAMEZAWA Hiroyuki offlined = (long)ret;
14500c0e6195SKAMEZAWA Hiroyuki return offlined;
14510c0e6195SKAMEZAWA Hiroyuki }
14520c0e6195SKAMEZAWA Hiroyuki
145309285af7SLai Jiangshan #ifdef CONFIG_MOVABLE_NODE
145479a4dcefSTang Chen /*
145579a4dcefSTang Chen * When CONFIG_MOVABLE_NODE is set, we permit offlining of a node which
145679a4dcefSTang Chen * doesn't have normal memory.
145779a4dcefSTang Chen */
145809285af7SLai Jiangshan static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
145909285af7SLai Jiangshan {
146009285af7SLai Jiangshan return true;
146109285af7SLai Jiangshan }
146279a4dcefSTang Chen #else /* CONFIG_MOVABLE_NODE */
146374d42d8fSLai Jiangshan /* ensure the node has NORMAL memory if it is still online */
146474d42d8fSLai Jiangshan static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
146574d42d8fSLai Jiangshan {
146674d42d8fSLai Jiangshan struct pglist_data *pgdat = zone->zone_pgdat;
146774d42d8fSLai Jiangshan unsigned long present_pages = 0;
146874d42d8fSLai Jiangshan enum zone_type zt;
146974d42d8fSLai Jiangshan
147074d42d8fSLai Jiangshan for (zt = 0; zt <= ZONE_NORMAL; zt++)
147174d42d8fSLai Jiangshan present_pages += pgdat->node_zones[zt].present_pages;
147274d42d8fSLai Jiangshan
147374d42d8fSLai Jiangshan if (present_pages > nr_pages)
147474d42d8fSLai Jiangshan return true;
147574d42d8fSLai Jiangshan
147674d42d8fSLai Jiangshan present_pages = 0;
147774d42d8fSLai Jiangshan for (; zt <= ZONE_MOVABLE; zt++)
147874d42d8fSLai Jiangshan present_pages += pgdat->node_zones[zt].present_pages;
147974d42d8fSLai Jiangshan
148074d42d8fSLai Jiangshan /*
148174d42d8fSLai Jiangshan * we can't offline the last normal memory until all
148274d42d8fSLai Jiangshan * higher memory is offlined.
148374d42d8fSLai Jiangshan */
148474d42d8fSLai Jiangshan return present_pages == 0;
148574d42d8fSLai Jiangshan }
148679a4dcefSTang Chen #endif /* CONFIG_MOVABLE_NODE */
148774d42d8fSLai Jiangshan
1488c5320926STang Chen static int __init cmdline_parse_movable_node(char *p)
1489c5320926STang Chen {
1490c5320926STang Chen #ifdef CONFIG_MOVABLE_NODE
1491c5320926STang Chen /*
1492c5320926STang Chen * Memory used by the kernel cannot be hot-removed because Linux
1493c5320926STang Chen * cannot migrate the kernel pages. When memory hotplug is
1494c5320926STang Chen * enabled, we should prevent memblock from allocating memory
1495c5320926STang Chen * for the kernel.
1496c5320926STang Chen *
1497c5320926STang Chen * ACPI SRAT records all hotpluggable memory ranges. But before
1498c5320926STang Chen * SRAT is parsed, we don't know about it.
1499c5320926STang Chen *
1500c5320926STang Chen * The kernel image is loaded into memory at a very early time. We
1501c5320926STang Chen * cannot prevent this anyway. So on NUMA systems, we set any
1502c5320926STang Chen * node the kernel resides in as un-hotpluggable.
1503c5320926STang Chen *
1504c5320926STang Chen * Since on modern servers one node could have double-digit
1505c5320926STang Chen * gigabytes of memory, we can assume the memory around the kernel
1506c5320926STang Chen * image is also un-hotpluggable. So before SRAT is parsed, just
1507c5320926STang Chen * allocate memory near the kernel image to try our best to keep
1508c5320926STang Chen * the kernel away from hotpluggable memory.
1509c5320926STang Chen */
1510c5320926STang Chen memblock_set_bottom_up(true);
151155ac590cSTang Chen movable_node_enabled = true;
1512c5320926STang Chen #else
1513c5320926STang Chen pr_warn("movable_node option not supported\n");
1514c5320926STang Chen #endif
1515c5320926STang Chen return 0;
1516c5320926STang Chen }
1517c5320926STang Chen early_param("movable_node", cmdline_parse_movable_node);
1518c5320926STang Chen
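/*
 * Editor's note -- a usage example, for illustration: "movable_node" is
 * passed on the kernel command line, e.g.
 *
 *	linux /boot/vmlinuz root=/dev/sda1 movable_node
 *
 * so that hotpluggable ranges reported by the SRAT end up in ZONE_MOVABLE
 * and remain removable.
 */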
1519d9713679SLai Jiangshan /* check which state of node_states will be changed when offlining memory */
1520d9713679SLai Jiangshan static void node_states_check_changes_offline(unsigned long nr_pages,
1521d9713679SLai Jiangshan struct zone *zone, struct memory_notify *arg)
1522d9713679SLai Jiangshan {
1523d9713679SLai Jiangshan struct pglist_data *pgdat = zone->zone_pgdat;
1524d9713679SLai Jiangshan unsigned long present_pages = 0;
1525d9713679SLai Jiangshan enum zone_type zt, zone_last = ZONE_NORMAL;
1526d9713679SLai Jiangshan
1527d9713679SLai Jiangshan /*
15286715ddf9SLai Jiangshan * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
15296715ddf9SLai Jiangshan * contains nodes which have zones of 0...ZONE_NORMAL,
15306715ddf9SLai Jiangshan * set zone_last to ZONE_NORMAL.
1531d9713679SLai Jiangshan *
15326715ddf9SLai Jiangshan * If we don't have HIGHMEM nor movable node,
15336715ddf9SLai Jiangshan * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
15346715ddf9SLai Jiangshan * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1535d9713679SLai Jiangshan */
15366715ddf9SLai Jiangshan if (N_MEMORY == N_NORMAL_MEMORY)
1537d9713679SLai Jiangshan zone_last = ZONE_MOVABLE;
1538d9713679SLai Jiangshan
1539d9713679SLai Jiangshan /*
1540d9713679SLai Jiangshan * check whether node_states[N_NORMAL_MEMORY] will be changed.
1541d9713679SLai Jiangshan * If the memory to be offlined is in a zone of 0...zone_last,
1542d9713679SLai Jiangshan * and it is the last present memory, 0...zone_last will
1543d9713679SLai Jiangshan * become empty after offlining, thus we can determine that we will
1544d9713679SLai Jiangshan * need to clear the node from node_states[N_NORMAL_MEMORY].
1545d9713679SLai Jiangshan */
1546d9713679SLai Jiangshan for (zt = 0; zt <= zone_last; zt++)
1547d9713679SLai Jiangshan present_pages += pgdat->node_zones[zt].present_pages;
1548d9713679SLai Jiangshan if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1549d9713679SLai Jiangshan arg->status_change_nid_normal = zone_to_nid(zone);
1550d9713679SLai Jiangshan else
1551d9713679SLai Jiangshan arg->status_change_nid_normal = -1;
15526715ddf9SLai Jiangshan
15536715ddf9SLai Jiangshan #ifdef CONFIG_HIGHMEM
15546715ddf9SLai Jiangshan /*
15556715ddf9SLai Jiangshan * If we have movable node, node_states[N_HIGH_MEMORY]
15566715ddf9SLai Jiangshan * contains nodes which have zones of 0...ZONE_HIGHMEM,
15576715ddf9SLai Jiangshan * set zone_last to ZONE_HIGHMEM.
15586715ddf9SLai Jiangshan *
15596715ddf9SLai Jiangshan * If we don't have movable node, node_states[N_NORMAL_MEMORY]
15606715ddf9SLai Jiangshan * contains nodes which have zones of 0...ZONE_MOVABLE,
15616715ddf9SLai Jiangshan * set zone_last to ZONE_MOVABLE.
15626715ddf9SLai Jiangshan */
15636715ddf9SLai Jiangshan zone_last = ZONE_HIGHMEM;
15646715ddf9SLai Jiangshan if (N_MEMORY == N_HIGH_MEMORY)
15656715ddf9SLai Jiangshan zone_last = ZONE_MOVABLE;
15666715ddf9SLai Jiangshan
15676715ddf9SLai Jiangshan for (; zt <= zone_last; zt++)
15686715ddf9SLai Jiangshan present_pages += pgdat->node_zones[zt].present_pages;
15696715ddf9SLai Jiangshan if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
15706715ddf9SLai Jiangshan arg->status_change_nid_high = zone_to_nid(zone);
15716715ddf9SLai Jiangshan else
15726715ddf9SLai Jiangshan arg->status_change_nid_high = -1;
15736715ddf9SLai Jiangshan #else
15746715ddf9SLai Jiangshan arg->status_change_nid_high = arg->status_change_nid_normal;
15756715ddf9SLai Jiangshan #endif
15766715ddf9SLai Jiangshan
1577d9713679SLai Jiangshan /*
1578d9713679SLai Jiangshan * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1579d9713679SLai Jiangshan */
1580d9713679SLai Jiangshan zone_last = ZONE_MOVABLE;
1581d9713679SLai Jiangshan
1582d9713679SLai Jiangshan /*
1583d9713679SLai Jiangshan * check whether node_states[N_HIGH_MEMORY] will be changed.
1584d9713679SLai Jiangshan * If we try to offline the last present @nr_pages from the node,
1585d9713679SLai Jiangshan * we can determine that we will need to clear the node from
1586d9713679SLai Jiangshan * node_states[N_HIGH_MEMORY].
1587d9713679SLai Jiangshan */
1588d9713679SLai Jiangshan for (; zt <= zone_last; zt++)
1589d9713679SLai Jiangshan present_pages += pgdat->node_zones[zt].present_pages;
1590d9713679SLai Jiangshan if (nr_pages >= present_pages)
1591d9713679SLai Jiangshan arg->status_change_nid = zone_to_nid(zone);
1592d9713679SLai Jiangshan else
1593d9713679SLai Jiangshan arg->status_change_nid = -1;
1594d9713679SLai Jiangshan }
1595d9713679SLai Jiangshan
1596d9713679SLai Jiangshan static void node_states_clear_node(int node, struct memory_notify *arg)
1597d9713679SLai Jiangshan {
1598d9713679SLai Jiangshan if (arg->status_change_nid_normal >= 0)
1599d9713679SLai Jiangshan node_clear_state(node, N_NORMAL_MEMORY);
1600d9713679SLai Jiangshan
16016715ddf9SLai Jiangshan if ((N_MEMORY != N_NORMAL_MEMORY) &&
16026715ddf9SLai Jiangshan (arg->status_change_nid_high >= 0))
1603d9713679SLai Jiangshan node_clear_state(node, N_HIGH_MEMORY);
16046715ddf9SLai Jiangshan
16056715ddf9SLai Jiangshan if ((N_MEMORY != N_HIGH_MEMORY) &&
16066715ddf9SLai Jiangshan (arg->status_change_nid >= 0))
16076715ddf9SLai Jiangshan node_clear_state(node, N_MEMORY);
1608d9713679SLai Jiangshan }
1609d9713679SLai Jiangshan
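/*
 * Editor's note -- a condensed sketch of the sequence implemented by
 * __offline_pages() below; illustrative only, derived from the code
 * that follows:
 *
 *	start_isolate_page_range()        isolate pageblocks (MIGRATE_ISOLATE)
 *	memory_notify(MEM_GOING_OFFLINE)  let subsystems veto the operation
 *	scan_movable_pages() +            loop, migrating LRU/huge pages away
 *	do_migrate_range()                until the range is empty
 *	dissolve_free_huge_pages()        keep hugetlbfs counters consistent
 *	check_pages_isolated()            verify nothing slipped back in
 *	offline_isolated_pages()          pull pages out of the allocator
 *	memory_notify(MEM_OFFLINE)        done; no rollback past this point
 */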
16260c0e6195SKAMEZAWA Hiroyuki we assume this for now. .*/ 16270c0e6195SKAMEZAWA Hiroyuki if (!test_pages_in_a_zone(start_pfn, end_pfn)) 16280c0e6195SKAMEZAWA Hiroyuki return -EINVAL; 16297b78d335SYasunori Goto 1630*bfc8c901SVladimir Davydov mem_hotplug_begin(); 16316ad696d2SAndi Kleen 16327b78d335SYasunori Goto zone = page_zone(pfn_to_page(start_pfn)); 16337b78d335SYasunori Goto node = zone_to_nid(zone); 16347b78d335SYasunori Goto nr_pages = end_pfn - start_pfn; 16357b78d335SYasunori Goto 163674d42d8fSLai Jiangshan ret = -EINVAL; 163774d42d8fSLai Jiangshan if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 163874d42d8fSLai Jiangshan goto out; 163974d42d8fSLai Jiangshan 16400c0e6195SKAMEZAWA Hiroyuki /* set above range as isolated */ 1641b023f468SWen Congyang ret = start_isolate_page_range(start_pfn, end_pfn, 1642b023f468SWen Congyang MIGRATE_MOVABLE, true); 16430c0e6195SKAMEZAWA Hiroyuki if (ret) 16446ad696d2SAndi Kleen goto out; 16457b78d335SYasunori Goto 16467b78d335SYasunori Goto arg.start_pfn = start_pfn; 16477b78d335SYasunori Goto arg.nr_pages = nr_pages; 1648d9713679SLai Jiangshan node_states_check_changes_offline(nr_pages, zone, &arg); 16497b78d335SYasunori Goto 16507b78d335SYasunori Goto ret = memory_notify(MEM_GOING_OFFLINE, &arg); 16517b78d335SYasunori Goto ret = notifier_to_errno(ret); 16527b78d335SYasunori Goto if (ret) 16537b78d335SYasunori Goto goto failed_removal; 16547b78d335SYasunori Goto 16550c0e6195SKAMEZAWA Hiroyuki pfn = start_pfn; 16560c0e6195SKAMEZAWA Hiroyuki expire = jiffies + timeout; 16570c0e6195SKAMEZAWA Hiroyuki drain = 0; 16580c0e6195SKAMEZAWA Hiroyuki retry_max = 5; 16590c0e6195SKAMEZAWA Hiroyuki repeat: 16600c0e6195SKAMEZAWA Hiroyuki /* start memory hot removal */ 16610c0e6195SKAMEZAWA Hiroyuki ret = -EAGAIN; 16620c0e6195SKAMEZAWA Hiroyuki if (time_after(jiffies, expire)) 16630c0e6195SKAMEZAWA Hiroyuki goto failed_removal; 16640c0e6195SKAMEZAWA Hiroyuki ret = -EINTR; 16650c0e6195SKAMEZAWA Hiroyuki if (signal_pending(current)) 16660c0e6195SKAMEZAWA Hiroyuki goto failed_removal; 16670c0e6195SKAMEZAWA Hiroyuki ret = 0; 16680c0e6195SKAMEZAWA Hiroyuki if (drain) { 16690c0e6195SKAMEZAWA Hiroyuki lru_add_drain_all(); 16700c0e6195SKAMEZAWA Hiroyuki cond_resched(); 16719f8f2172SChristoph Lameter drain_all_pages(); 16720c0e6195SKAMEZAWA Hiroyuki } 16730c0e6195SKAMEZAWA Hiroyuki 1674c8721bbbSNaoya Horiguchi pfn = scan_movable_pages(start_pfn, end_pfn); 1675c8721bbbSNaoya Horiguchi if (pfn) { /* We have movable pages */ 16760c0e6195SKAMEZAWA Hiroyuki ret = do_migrate_range(pfn, end_pfn); 16770c0e6195SKAMEZAWA Hiroyuki if (!ret) { 16780c0e6195SKAMEZAWA Hiroyuki drain = 1; 16790c0e6195SKAMEZAWA Hiroyuki goto repeat; 16800c0e6195SKAMEZAWA Hiroyuki } else { 16810c0e6195SKAMEZAWA Hiroyuki if (ret < 0) 16820c0e6195SKAMEZAWA Hiroyuki if (--retry_max == 0) 16830c0e6195SKAMEZAWA Hiroyuki goto failed_removal; 16840c0e6195SKAMEZAWA Hiroyuki yield(); 16850c0e6195SKAMEZAWA Hiroyuki drain = 1; 16860c0e6195SKAMEZAWA Hiroyuki goto repeat; 16870c0e6195SKAMEZAWA Hiroyuki } 16880c0e6195SKAMEZAWA Hiroyuki } 1689b3834be5SAdam Buchbinder /* drain all zone's lru pagevec, this is asynchronous... */ 16900c0e6195SKAMEZAWA Hiroyuki lru_add_drain_all(); 16910c0e6195SKAMEZAWA Hiroyuki yield(); 1692b3834be5SAdam Buchbinder /* drain pcp pages, this is synchronous. 
16939f8f2172SChristoph Lameter drain_all_pages();
1694c8721bbbSNaoya Horiguchi /*
1695c8721bbbSNaoya Horiguchi * actually dissolve free hugepages in the memory block before offlining,
1696c8721bbbSNaoya Horiguchi * in order to keep hugetlbfs's object counting consistent.
1697c8721bbbSNaoya Horiguchi */
1698c8721bbbSNaoya Horiguchi dissolve_free_huge_pages(start_pfn, end_pfn);
16990c0e6195SKAMEZAWA Hiroyuki /* check again */
17000c0e6195SKAMEZAWA Hiroyuki offlined_pages = check_pages_isolated(start_pfn, end_pfn);
17010c0e6195SKAMEZAWA Hiroyuki if (offlined_pages < 0) {
17020c0e6195SKAMEZAWA Hiroyuki ret = -EBUSY;
17030c0e6195SKAMEZAWA Hiroyuki goto failed_removal;
17040c0e6195SKAMEZAWA Hiroyuki }
17050c0e6195SKAMEZAWA Hiroyuki printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
1706b3834be5SAdam Buchbinder /* OK, all of our target range is isolated.
17070c0e6195SKAMEZAWA Hiroyuki We cannot do rollback at this point. */
17080c0e6195SKAMEZAWA Hiroyuki offline_isolated_pages(start_pfn, end_pfn);
1709dbc0e4ceSKAMEZAWA Hiroyuki /* reset pagetype flags and make the migrate type MOVABLE */
17100815f3d8SMichal Nazarewicz undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
17110c0e6195SKAMEZAWA Hiroyuki /* removal success */
17123dcc0571SJiang Liu adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
17130c0e6195SKAMEZAWA Hiroyuki zone->present_pages -= offlined_pages;
1714d702909fSCody P Schafer
1715d702909fSCody P Schafer pgdat_resize_lock(zone->zone_pgdat, &flags);
17160c0e6195SKAMEZAWA Hiroyuki zone->zone_pgdat->node_present_pages -= offlined_pages;
1717d702909fSCody P Schafer pgdat_resize_unlock(zone->zone_pgdat, &flags);
17187b78d335SYasunori Goto
17191b79acc9SKOSAKI Motohiro init_per_zone_wmark_min();
17201b79acc9SKOSAKI Motohiro
17211e8537baSXishi Qiu if (!populated_zone(zone)) {
1722340175b7SJiang Liu zone_pcp_reset(zone);
17231e8537baSXishi Qiu mutex_lock(&zonelists_mutex);
17241e8537baSXishi Qiu build_all_zonelists(NULL, NULL);
17251e8537baSXishi Qiu mutex_unlock(&zonelists_mutex);
17261e8537baSXishi Qiu } else
17271e8537baSXishi Qiu zone_pcp_update(zone);
1728340175b7SJiang Liu
1729d9713679SLai Jiangshan node_states_clear_node(node, &arg);
1730d9713679SLai Jiangshan if (arg.status_change_nid >= 0)
17318fe23e05SDavid Rientjes kswapd_stop(node);
1732bce7394aSMinchan Kim
17330c0e6195SKAMEZAWA Hiroyuki vm_total_pages = nr_free_pagecache_pages();
17340c0e6195SKAMEZAWA Hiroyuki writeback_set_ratelimit();
17357b78d335SYasunori Goto
17367b78d335SYasunori Goto memory_notify(MEM_OFFLINE, &arg);
1737*bfc8c901SVladimir Davydov mem_hotplug_done();
17380c0e6195SKAMEZAWA Hiroyuki return 0;
17390c0e6195SKAMEZAWA Hiroyuki
17400c0e6195SKAMEZAWA Hiroyuki failed_removal:
1741a62e2f4fSBjorn Helgaas printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
1742a62e2f4fSBjorn Helgaas (unsigned long long) start_pfn << PAGE_SHIFT,
1743a62e2f4fSBjorn Helgaas ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
17447b78d335SYasunori Goto memory_notify(MEM_CANCEL_OFFLINE, &arg);
17450c0e6195SKAMEZAWA Hiroyuki /* pushback to free area */
17460815f3d8SMichal Nazarewicz undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
17477b78d335SYasunori Goto
17486ad696d2SAndi Kleen out:
1749*bfc8c901SVladimir Davydov mem_hotplug_done();
17500c0e6195SKAMEZAWA Hiroyuki return ret;
17510c0e6195SKAMEZAWA Hiroyuki }
175271088785SBadari Pulavarty
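/*
 * Editor's note -- how this path is normally reached, for illustration:
 * userspace writes to a memory block's sysfs state file, e.g.
 *
 *	echo offline > /sys/devices/system/memory/memory32/state
 *
 * (the block number is just an example), which ends up calling
 * offline_pages() below with a 120-second timeout.
 */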
1753a16cee10SWen Congyang int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1754a16cee10SWen Congyang {
1755a16cee10SWen Congyang return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1756a16cee10SWen Congyang }
1757e2ff3940SRafael J. Wysocki #endif /* CONFIG_MEMORY_HOTREMOVE */
1758a16cee10SWen Congyang
1759bbc76be6SWen Congyang /**
1760bbc76be6SWen Congyang * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1761bbc76be6SWen Congyang * @start_pfn: start pfn of the memory range
1762e05c4bbfSToshi Kani * @end_pfn: end pfn of the memory range
1763bbc76be6SWen Congyang * @arg: argument passed to func
1764bbc76be6SWen Congyang * @func: callback for each memory section walked
1765bbc76be6SWen Congyang *
1766bbc76be6SWen Congyang * This function walks through all present mem sections in the range
1767bbc76be6SWen Congyang * [start_pfn, end_pfn) and calls func on each mem section.
1768bbc76be6SWen Congyang *
1769bbc76be6SWen Congyang * Returns the return value of func.
1770bbc76be6SWen Congyang */
1771e2ff3940SRafael J. Wysocki int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1772bbc76be6SWen Congyang void *arg, int (*func)(struct memory_block *, void *))
177371088785SBadari Pulavarty {
1774e90bdb7fSWen Congyang struct memory_block *mem = NULL;
1775e90bdb7fSWen Congyang struct mem_section *section;
1776e90bdb7fSWen Congyang unsigned long pfn, section_nr;
1777e90bdb7fSWen Congyang int ret;
177871088785SBadari Pulavarty
1779e90bdb7fSWen Congyang for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1780e90bdb7fSWen Congyang section_nr = pfn_to_section_nr(pfn);
1781e90bdb7fSWen Congyang if (!present_section_nr(section_nr))
1782e90bdb7fSWen Congyang continue;
1783e90bdb7fSWen Congyang
1784e90bdb7fSWen Congyang section = __nr_to_section(section_nr);
1785e90bdb7fSWen Congyang /* same memblock? */
1786e90bdb7fSWen Congyang if (mem)
1787e90bdb7fSWen Congyang if ((section_nr >= mem->start_section_nr) &&
1788e90bdb7fSWen Congyang (section_nr <= mem->end_section_nr))
1789e90bdb7fSWen Congyang continue;
1790e90bdb7fSWen Congyang
1791e90bdb7fSWen Congyang mem = find_memory_block_hinted(section, mem);
1792e90bdb7fSWen Congyang if (!mem)
1793e90bdb7fSWen Congyang continue;
1794e90bdb7fSWen Congyang
1795bbc76be6SWen Congyang ret = func(mem, arg);
1796e90bdb7fSWen Congyang if (ret) {
1797e90bdb7fSWen Congyang kobject_put(&mem->dev.kobj);
1798e90bdb7fSWen Congyang return ret;
1799e90bdb7fSWen Congyang }
1800e90bdb7fSWen Congyang }
1801e90bdb7fSWen Congyang
1802e90bdb7fSWen Congyang if (mem)
1803e90bdb7fSWen Congyang kobject_put(&mem->dev.kobj);
1804e90bdb7fSWen Congyang
1805bbc76be6SWen Congyang return 0;
1806bbc76be6SWen Congyang }
1807bbc76be6SWen Congyang
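/*
 * Editor's note -- a minimal caller sketch for walk_memory_range(), not
 * upstream code; "count_blocks_cb" and "count_memory_blocks" are
 * hypothetical names.  remove_memory() below uses the same pattern with
 * check_memblock_offlined_cb().
 */
#if 0
static int count_blocks_cb(struct memory_block *mem, void *arg)
{
	(*(int *)arg)++;
	return 0;		/* a non-zero return would abort the walk */
}

static int count_memory_blocks(u64 start, u64 size)
{
	int nr = 0;

	walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
			  &nr, count_blocks_cb);
	return nr;
}
#endif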
1808e2ff3940SRafael J. Wysocki #ifdef CONFIG_MEMORY_HOTREMOVE
1809d6de9d53SXishi Qiu static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
1810bbc76be6SWen Congyang {
1811bbc76be6SWen Congyang int ret = !is_memblock_offlined(mem);
1812bbc76be6SWen Congyang
1813349daa0fSRandy Dunlap if (unlikely(ret)) {
1814349daa0fSRandy Dunlap phys_addr_t beginpa, endpa;
1815349daa0fSRandy Dunlap
1816349daa0fSRandy Dunlap beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1817349daa0fSRandy Dunlap endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1818bbc76be6SWen Congyang pr_warn("removing memory fails because memory "
1819349daa0fSRandy Dunlap "[%pa-%pa] is onlined\n",
1820349daa0fSRandy Dunlap &beginpa, &endpa);
1821349daa0fSRandy Dunlap }
1822bbc76be6SWen Congyang
1823bbc76be6SWen Congyang return ret;
1824bbc76be6SWen Congyang }
1825bbc76be6SWen Congyang
18260f1cfe9dSToshi Kani static int check_cpu_on_node(pg_data_t *pgdat)
182760a5a19eSTang Chen {
182860a5a19eSTang Chen int cpu;
182960a5a19eSTang Chen
183060a5a19eSTang Chen for_each_present_cpu(cpu) {
183160a5a19eSTang Chen if (cpu_to_node(cpu) == pgdat->node_id)
183260a5a19eSTang Chen /*
183360a5a19eSTang Chen * the cpu on this node isn't removed, and we can't
183460a5a19eSTang Chen * offline this node.
183560a5a19eSTang Chen */
183660a5a19eSTang Chen return -EBUSY;
183760a5a19eSTang Chen }
183860a5a19eSTang Chen
183960a5a19eSTang Chen return 0;
184060a5a19eSTang Chen }
184160a5a19eSTang Chen
18420f1cfe9dSToshi Kani static void unmap_cpu_on_node(pg_data_t *pgdat)
1843e13fe869SWen Congyang {
1844e13fe869SWen Congyang #ifdef CONFIG_ACPI_NUMA
1845e13fe869SWen Congyang int cpu;
1846e13fe869SWen Congyang
1847e13fe869SWen Congyang for_each_possible_cpu(cpu)
1848e13fe869SWen Congyang if (cpu_to_node(cpu) == pgdat->node_id)
1849e13fe869SWen Congyang numa_clear_node(cpu);
1850e13fe869SWen Congyang #endif
1851e13fe869SWen Congyang }
1852e13fe869SWen Congyang
18530f1cfe9dSToshi Kani static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1854e13fe869SWen Congyang {
18550f1cfe9dSToshi Kani int ret;
1856e13fe869SWen Congyang
18570f1cfe9dSToshi Kani ret = check_cpu_on_node(pgdat);
1858e13fe869SWen Congyang if (ret)
1859e13fe869SWen Congyang return ret;
1860e13fe869SWen Congyang
1861e13fe869SWen Congyang /*
1862e13fe869SWen Congyang * the node will be offlined when we come here, so we can clear
1863e13fe869SWen Congyang * the cpu_to_node() now.
1864e13fe869SWen Congyang */
1865e13fe869SWen Congyang
18660f1cfe9dSToshi Kani unmap_cpu_on_node(pgdat);
1867e13fe869SWen Congyang return 0;
1868e13fe869SWen Congyang }
1869e13fe869SWen Congyang
18700f1cfe9dSToshi Kani /**
18710f1cfe9dSToshi Kani * try_offline_node
18720f1cfe9dSToshi Kani *
18730f1cfe9dSToshi Kani * Offline a node if all memory sections and cpus of the node are removed.
18740f1cfe9dSToshi Kani *
18750f1cfe9dSToshi Kani * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
18760f1cfe9dSToshi Kani * and online/offline operations before this call.
18770f1cfe9dSToshi Kani */
187890b30cdcSWen Congyang void try_offline_node(int nid)
187960a5a19eSTang Chen {
1880d822b86aSWen Congyang pg_data_t *pgdat = NODE_DATA(nid);
1881d822b86aSWen Congyang unsigned long start_pfn = pgdat->node_start_pfn;
1882d822b86aSWen Congyang unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
188360a5a19eSTang Chen unsigned long pfn;
1884d822b86aSWen Congyang struct page *pgdat_page = virt_to_page(pgdat);
1885d822b86aSWen Congyang int i;
188660a5a19eSTang Chen
188760a5a19eSTang Chen for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
188860a5a19eSTang Chen unsigned long section_nr = pfn_to_section_nr(pfn);
188960a5a19eSTang Chen
189060a5a19eSTang Chen if (!present_section_nr(section_nr))
189160a5a19eSTang Chen continue;
189260a5a19eSTang Chen
189360a5a19eSTang Chen if (pfn_to_nid(pfn) != nid)
189460a5a19eSTang Chen continue;
189560a5a19eSTang Chen
189660a5a19eSTang Chen /*
189760a5a19eSTang Chen * some memory sections of this node are not removed, and we
189860a5a19eSTang Chen * can't offline the node now.
189960a5a19eSTang Chen */
190060a5a19eSTang Chen return;
190160a5a19eSTang Chen }
190260a5a19eSTang Chen
19030f1cfe9dSToshi Kani if (check_and_unmap_cpu_on_node(pgdat))
190460a5a19eSTang Chen return;
190560a5a19eSTang Chen
190660a5a19eSTang Chen /*
190760a5a19eSTang Chen * all memory/cpus of this node are removed; we can offline the
190860a5a19eSTang Chen * node now.
190960a5a19eSTang Chen */
191060a5a19eSTang Chen node_set_offline(nid);
191160a5a19eSTang Chen unregister_one_node(nid);
1912d822b86aSWen Congyang
1913d822b86aSWen Congyang if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
1914d822b86aSWen Congyang /* node data is allocated from boot memory */
1915d822b86aSWen Congyang return;
1916d822b86aSWen Congyang
1917d822b86aSWen Congyang /* free waittable in each zone */
1918d822b86aSWen Congyang for (i = 0; i < MAX_NR_ZONES; i++) {
1919d822b86aSWen Congyang struct zone *zone = pgdat->node_zones + i;
1920d822b86aSWen Congyang
1921ca4b3f30SJianguo Wu /*
1922ca4b3f30SJianguo Wu * wait_table may be allocated from boot memory;
1923ca4b3f30SJianguo Wu * here, only free it if it was allocated by vmalloc.
1924ca4b3f30SJianguo Wu */
1925ca4b3f30SJianguo Wu if (is_vmalloc_addr(zone->wait_table))
1926d822b86aSWen Congyang vfree(zone->wait_table);
1927d822b86aSWen Congyang }
1928d822b86aSWen Congyang
1929d822b86aSWen Congyang /*
1930d822b86aSWen Congyang * Since there is no way to guarantee the address of pgdat/zone is not
1931d822b86aSWen Congyang * on the stack of any kernel threads or used by other kernel objects
1932d822b86aSWen Congyang * without reference counting or another synchronizing method, do not
1933d822b86aSWen Congyang * reset node_data and free pgdat here. Just reset it to 0 and reuse
1934d822b86aSWen Congyang * the memory when the node is online again.
1935d822b86aSWen Congyang */
1936d822b86aSWen Congyang memset(pgdat, 0, sizeof(*pgdat));
193760a5a19eSTang Chen }
193890b30cdcSWen Congyang EXPORT_SYMBOL(try_offline_node);
193960a5a19eSTang Chen
19400f1cfe9dSToshi Kani /**
19410f1cfe9dSToshi Kani * remove_memory
19420f1cfe9dSToshi Kani *
19430f1cfe9dSToshi Kani * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
19440f1cfe9dSToshi Kani * and online/offline operations before this call, as required by
19450f1cfe9dSToshi Kani * try_offline_node().
19460f1cfe9dSToshi Kani */
1947242831ebSRafael J. Wysocki void __ref remove_memory(int nid, u64 start, u64 size)
1948bbc76be6SWen Congyang {
1949242831ebSRafael J. Wysocki int ret;
1950993c1aadSWen Congyang
195127356f54SToshi Kani BUG_ON(check_hotplug_memory_range(start, size));
195227356f54SToshi Kani
1953*bfc8c901SVladimir Davydov mem_hotplug_begin();
19546677e3eaSYasuaki Ishimatsu
19556677e3eaSYasuaki Ishimatsu /*
1956242831ebSRafael J. Wysocki * All memory blocks must be offlined before removing memory. Check
1957242831ebSRafael J. Wysocki * whether all memory blocks in question are offline and trigger a BUG()
1958242831ebSRafael J. Wysocki * if this is not the case.
19596677e3eaSYasuaki Ishimatsu */
1960242831ebSRafael J. Wysocki ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1961d6de9d53SXishi Qiu check_memblock_offlined_cb);
1962*bfc8c901SVladimir Davydov if (ret)
1963242831ebSRafael J. Wysocki BUG();
19646677e3eaSYasuaki Ishimatsu
196546c66c4bSYasuaki Ishimatsu /* remove memmap entry */
196646c66c4bSYasuaki Ishimatsu firmware_map_remove(start, start + size, "System RAM");
196746c66c4bSYasuaki Ishimatsu
196824d335caSWen Congyang arch_remove_memory(start, size);
196924d335caSWen Congyang
197060a5a19eSTang Chen try_offline_node(nid);
197160a5a19eSTang Chen
1972*bfc8c901SVladimir Davydov mem_hotplug_done();
197371088785SBadari Pulavarty }
197471088785SBadari Pulavarty EXPORT_SYMBOL_GPL(remove_memory);
1975aba6efc4SRafael J. Wysocki #endif /* CONFIG_MEMORY_HOTREMOVE */
1976
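/*
 * Editor's note -- a usage sketch following the locking rule documented in
 * the remove_memory() kernel-doc above; not upstream code, and the function
 * name below is hypothetical.
 */
#if 0
static void example_remove(int nid, u64 start, u64 size)
{
	lock_device_hotplug();
	/* All blocks in [start, start + size) must already be offline. */
	remove_memory(nid, start, size);
	unlock_device_hotplug();
}
#endif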