xref: /openbmc/linux/mm/memory_hotplug.c (revision dea54fba)
1 /*
2  *  linux/mm/memory_hotplug.c
3  *
4  *  Copyright (C)
5  */
6 
7 #include <linux/stddef.h>
8 #include <linux/mm.h>
9 #include <linux/sched/signal.h>
10 #include <linux/swap.h>
11 #include <linux/interrupt.h>
12 #include <linux/pagemap.h>
13 #include <linux/compiler.h>
14 #include <linux/export.h>
15 #include <linux/pagevec.h>
16 #include <linux/writeback.h>
17 #include <linux/slab.h>
18 #include <linux/sysctl.h>
19 #include <linux/cpu.h>
20 #include <linux/memory.h>
21 #include <linux/memremap.h>
22 #include <linux/memory_hotplug.h>
23 #include <linux/highmem.h>
24 #include <linux/vmalloc.h>
25 #include <linux/ioport.h>
26 #include <linux/delay.h>
27 #include <linux/migrate.h>
28 #include <linux/page-isolation.h>
29 #include <linux/pfn.h>
30 #include <linux/suspend.h>
31 #include <linux/mm_inline.h>
32 #include <linux/firmware-map.h>
33 #include <linux/stop_machine.h>
34 #include <linux/hugetlb.h>
35 #include <linux/memblock.h>
36 #include <linux/bootmem.h>
37 #include <linux/compaction.h>
38 
39 #include <asm/tlbflush.h>
40 
41 #include "internal.h"
42 
43 /*
44  * online_page_callback contains a pointer to the current page onlining
45  * function. Initially it is generic_online_page(). If required, it can be
46  * changed by calling set_online_page_callback() to register a callback and
47  * restore_online_page_callback() to restore the generic callback.
48  */
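/*
 * Illustrative registration sketch (my_online_page is a hypothetical driver
 * callback, not defined in this file):
 *
 *	if (!set_online_page_callback(my_online_page))
 *		...;	// pages are now onlined via my_online_page()
 *	...
 *	restore_online_page_callback(my_online_page);
 */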
49 
50 static void generic_online_page(struct page *page);
51 
52 static online_page_callback_t online_page_callback = generic_online_page;
53 static DEFINE_MUTEX(online_page_callback_lock);
54 
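/*
 * mem_hotplug_lock serializes memory hotplug: readers use
 * get_online_mems()/put_online_mems() below, while writers take it for
 * write in mem_hotplug_begin()/mem_hotplug_done().
 */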
55 DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
56 
57 void get_online_mems(void)
58 {
59 	percpu_down_read(&mem_hotplug_lock);
60 }
61 
62 void put_online_mems(void)
63 {
64 	percpu_up_read(&mem_hotplug_lock);
65 }
66 
67 bool movable_node_enabled = false;
68 
69 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
70 bool memhp_auto_online;
71 #else
72 bool memhp_auto_online = true;
73 #endif
74 EXPORT_SYMBOL_GPL(memhp_auto_online);
75 
76 static int __init setup_memhp_default_state(char *str)
77 {
78 	if (!strcmp(str, "online"))
79 		memhp_auto_online = true;
80 	else if (!strcmp(str, "offline"))
81 		memhp_auto_online = false;
82 
83 	return 1;
84 }
85 __setup("memhp_default_state=", setup_memhp_default_state);
86 
87 void mem_hotplug_begin(void)
88 {
89 	cpus_read_lock();
90 	percpu_down_write(&mem_hotplug_lock);
91 }
92 
93 void mem_hotplug_done(void)
94 {
95 	percpu_up_write(&mem_hotplug_lock);
96 	cpus_read_unlock();
97 }
98 
99 /* add this memory to iomem resource */
100 static struct resource *register_memory_resource(u64 start, u64 size)
101 {
102 	struct resource *res;
103 	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
104 	if (!res)
105 		return ERR_PTR(-ENOMEM);
106 
107 	res->name = "System RAM";
108 	res->start = start;
109 	res->end = start + size - 1;
110 	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
111 	if (request_resource(&iomem_resource, res) < 0) {
112 		pr_debug("System RAM resource %pR cannot be added\n", res);
113 		kfree(res);
114 		return ERR_PTR(-EEXIST);
115 	}
116 	return res;
117 }
118 
119 static void release_memory_resource(struct resource *res)
120 {
121 	if (!res)
122 		return;
123 	release_resource(res);
124 	kfree(res);
125 	return;
126 }
127 
128 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
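/*
 * get_page_bootmem() tags a bootmem page (via page->freelist and
 * page_private()) as holding memory-hotplug metadata of the given type and
 * takes a reference on it; put_page_bootmem() below drops the reference and
 * frees the page once the last user is gone.
 */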
129 void get_page_bootmem(unsigned long info,  struct page *page,
130 		      unsigned long type)
131 {
132 	page->freelist = (void *)type;
133 	SetPagePrivate(page);
134 	set_page_private(page, info);
135 	page_ref_inc(page);
136 }
137 
138 void put_page_bootmem(struct page *page)
139 {
140 	unsigned long type;
141 
142 	type = (unsigned long) page->freelist;
143 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
144 	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
145 
146 	if (page_ref_dec_return(page) == 1) {
147 		page->freelist = NULL;
148 		ClearPagePrivate(page);
149 		set_page_private(page, 0);
150 		INIT_LIST_HEAD(&page->lru);
151 		free_reserved_page(page);
152 	}
153 }
154 
155 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
156 #ifndef CONFIG_SPARSEMEM_VMEMMAP
157 static void register_page_bootmem_info_section(unsigned long start_pfn)
158 {
159 	unsigned long *usemap, mapsize, section_nr, i;
160 	struct mem_section *ms;
161 	struct page *page, *memmap;
162 
163 	section_nr = pfn_to_section_nr(start_pfn);
164 	ms = __nr_to_section(section_nr);
165 
166 	/* Get section's memmap address */
167 	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
168 
169 	/*
170 	 * Get page for the memmap's phys address
171 	 * XXX: need more consideration for sparse_vmemmap...
172 	 */
173 	page = virt_to_page(memmap);
174 	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
175 	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
176 
177 	/* remember memmap's page */
178 	for (i = 0; i < mapsize; i++, page++)
179 		get_page_bootmem(section_nr, page, SECTION_INFO);
180 
181 	usemap = __nr_to_section(section_nr)->pageblock_flags;
182 	page = virt_to_page(usemap);
183 
184 	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
185 
186 	for (i = 0; i < mapsize; i++, page++)
187 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
188 
189 }
190 #else /* CONFIG_SPARSEMEM_VMEMMAP */
191 static void register_page_bootmem_info_section(unsigned long start_pfn)
192 {
193 	unsigned long *usemap, mapsize, section_nr, i;
194 	struct mem_section *ms;
195 	struct page *page, *memmap;
196 
197 	if (!pfn_valid(start_pfn))
198 		return;
199 
200 	section_nr = pfn_to_section_nr(start_pfn);
201 	ms = __nr_to_section(section_nr);
202 
203 	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
204 
205 	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
206 
207 	usemap = __nr_to_section(section_nr)->pageblock_flags;
208 	page = virt_to_page(usemap);
209 
210 	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
211 
212 	for (i = 0; i < mapsize; i++, page++)
213 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
214 }
215 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
216 
217 void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
218 {
219 	unsigned long i, pfn, end_pfn, nr_pages;
220 	int node = pgdat->node_id;
221 	struct page *page;
222 
223 	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
224 	page = virt_to_page(pgdat);
225 
226 	for (i = 0; i < nr_pages; i++, page++)
227 		get_page_bootmem(node, page, NODE_INFO);
228 
229 	pfn = pgdat->node_start_pfn;
230 	end_pfn = pgdat_end_pfn(pgdat);
231 
232 	/* register section info */
233 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
234 		/*
235 		 * Some platforms can assign the same pfn to multiple nodes - on
236 		 * node0 as well as nodeN.  To avoid registering a pfn against
237 		 * multiple nodes we check that this pfn does not already
238 		 * reside in some other node.
239 		 */
240 		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
241 			register_page_bootmem_info_section(pfn);
242 	}
243 }
244 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
245 
246 static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
247 		bool want_memblock)
248 {
249 	int ret;
250 	int i;
251 
252 	if (pfn_valid(phys_start_pfn))
253 		return -EEXIST;
254 
255 	ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
256 	if (ret < 0)
257 		return ret;
258 
259 	/*
260 	 * Make all the pages reserved so that nobody will stumble over a
261 	 * half-initialized state.
262 	 * FIXME: We also have to associate the page with a node because
263 	 * pfn_to_node relies on the page having the proper node.
264 	 */
265 	for (i = 0; i < PAGES_PER_SECTION; i++) {
266 		unsigned long pfn = phys_start_pfn + i;
267 		struct page *page;
268 		if (!pfn_valid(pfn))
269 			continue;
270 
271 		page = pfn_to_page(pfn);
272 		set_page_node(page, nid);
273 		SetPageReserved(page);
274 	}
275 
276 	if (!want_memblock)
277 		return 0;
278 
279 	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
280 }
281 
282 /*
283  * Reasonably generic function for adding memory.  It is
284  * expected that archs that support memory hotplug will
285  * call this function after deciding the zone to which to
286  * add the new pages.
287  */
288 int __ref __add_pages(int nid, unsigned long phys_start_pfn,
289 			unsigned long nr_pages, bool want_memblock)
290 {
291 	unsigned long i;
292 	int err = 0;
293 	int start_sec, end_sec;
294 	struct vmem_altmap *altmap;
295 
296 	/* during mem_map initialization, align the hot-added range to a section */
297 	start_sec = pfn_to_section_nr(phys_start_pfn);
298 	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
299 
300 	altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
301 	if (altmap) {
302 		/*
303 		 * Validate altmap is within bounds of the total request
304 		 */
305 		if (altmap->base_pfn != phys_start_pfn
306 				|| vmem_altmap_offset(altmap) > nr_pages) {
307 			pr_warn_once("memory add fail, invalid altmap\n");
308 			err = -EINVAL;
309 			goto out;
310 		}
311 		altmap->alloc = 0;
312 	}
313 
314 	for (i = start_sec; i <= end_sec; i++) {
315 		err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
316 
317 		/*
318 		 * EEXIST is finally dealt with by the ioresource collision
319 		 * check; see add_memory() => register_memory_resource().
320 		 * A warning will be printed if there is a collision.
321 		 */
322 		if (err && (err != -EEXIST))
323 			break;
324 		err = 0;
325 	}
326 	vmemmap_populate_print_last();
327 out:
328 	return err;
329 }
330 EXPORT_SYMBOL_GPL(__add_pages);
331 
332 #ifdef CONFIG_MEMORY_HOTREMOVE
333 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */
334 static int find_smallest_section_pfn(int nid, struct zone *zone,
335 				     unsigned long start_pfn,
336 				     unsigned long end_pfn)
337 {
338 	struct mem_section *ms;
339 
340 	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
341 		ms = __pfn_to_section(start_pfn);
342 
343 		if (unlikely(!valid_section(ms)))
344 			continue;
345 
346 		if (unlikely(pfn_to_nid(start_pfn) != nid))
347 			continue;
348 
349 		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
350 			continue;
351 
352 		return start_pfn;
353 	}
354 
355 	return 0;
356 }
357 
358 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */
359 static int find_biggest_section_pfn(int nid, struct zone *zone,
360 				    unsigned long start_pfn,
361 				    unsigned long end_pfn)
362 {
363 	struct mem_section *ms;
364 	unsigned long pfn;
365 
366 	/* pfn is the end pfn of a memory section. */
367 	pfn = end_pfn - 1;
368 	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
369 		ms = __pfn_to_section(pfn);
370 
371 		if (unlikely(!valid_section(ms)))
372 			continue;
373 
374 		if (unlikely(pfn_to_nid(pfn) != nid))
375 			continue;
376 
377 		if (zone && zone != page_zone(pfn_to_page(pfn)))
378 			continue;
379 
380 		return pfn;
381 	}
382 
383 	return 0;
384 }
385 
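/*
 * Shrink the zone span after the section [start_pfn, end_pfn) has been
 * removed. If no valid section remains, the zone span is reset to empty.
 */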
386 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
387 			     unsigned long end_pfn)
388 {
389 	unsigned long zone_start_pfn = zone->zone_start_pfn;
390 	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
391 	unsigned long zone_end_pfn = z;
392 	unsigned long pfn;
393 	struct mem_section *ms;
394 	int nid = zone_to_nid(zone);
395 
396 	zone_span_writelock(zone);
397 	if (zone_start_pfn == start_pfn) {
398 		/*
399 		 * If the section is the smallest section in the zone, we need
400 		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
401 		 * In this case, find the second smallest valid mem_section
402 		 * and shrink the zone to start there.
403 		 */
404 		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
405 						zone_end_pfn);
406 		if (pfn) {
407 			zone->zone_start_pfn = pfn;
408 			zone->spanned_pages = zone_end_pfn - pfn;
409 		}
410 	} else if (zone_end_pfn == end_pfn) {
411 		/*
412 		 * If the section is the biggest section in the zone, we need
413 		 * to shrink zone->spanned_pages.
414 		 * In this case, find the second biggest valid mem_section
415 		 * and shrink the zone to end there.
416 		 */
417 		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
418 					       start_pfn);
419 		if (pfn)
420 			zone->spanned_pages = pfn - zone_start_pfn + 1;
421 	}
422 
423 	/*
424 	 * The section is neither the biggest nor the smallest mem_section in
425 	 * the zone; it only creates a hole in the zone, so we need not change
426 	 * the zone's span. But the zone may now consist of nothing but holes,
427 	 * so check whether any valid section remains.
428 	 */
429 	pfn = zone_start_pfn;
430 	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
431 		ms = __pfn_to_section(pfn);
432 
433 		if (unlikely(!valid_section(ms)))
434 			continue;
435 
436 		if (page_zone(pfn_to_page(pfn)) != zone)
437 			continue;
438 
439 		 /* Skip the section that is being removed */
440 		if (start_pfn == pfn)
441 			continue;
442 
443 		/* If we find valid section, we have nothing to do */
444 		zone_span_writeunlock(zone);
445 		return;
446 	}
447 
448 	/* The zone has no valid section */
449 	zone->zone_start_pfn = 0;
450 	zone->spanned_pages = 0;
451 	zone_span_writeunlock(zone);
452 }
453 
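/* Same as shrink_zone_span(), but operating on the node (pgdat) span. */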
454 static void shrink_pgdat_span(struct pglist_data *pgdat,
455 			      unsigned long start_pfn, unsigned long end_pfn)
456 {
457 	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
458 	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
459 	unsigned long pgdat_end_pfn = p;
460 	unsigned long pfn;
461 	struct mem_section *ms;
462 	int nid = pgdat->node_id;
463 
464 	if (pgdat_start_pfn == start_pfn) {
465 		/*
466 		 * If the section is the smallest section in the pgdat, we need
467 		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
468 		 * In this case, find the second smallest valid mem_section
469 		 * and shrink the pgdat to start there.
470 		 */
471 		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
472 						pgdat_end_pfn);
473 		if (pfn) {
474 			pgdat->node_start_pfn = pfn;
475 			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
476 		}
477 	} else if (pgdat_end_pfn == end_pfn) {
478 		/*
479 		 * If the section is the biggest section in the pgdat, we need
480 		 * to shrink pgdat->node_spanned_pages.
481 		 * In this case, find the second biggest valid mem_section
482 		 * and shrink the pgdat to end there.
483 		 */
484 		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
485 					       start_pfn);
486 		if (pfn)
487 			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
488 	}
489 
490 	/*
491 	 * If the section is neither the biggest nor the smallest mem_section
492 	 * in the pgdat, it only creates a hole in the pgdat, so we need not
493 	 * change the pgdat's span.
494 	 * But the pgdat may now consist of nothing but holes, so check
495 	 * whether any valid section remains.
496 	 */
497 	pfn = pgdat_start_pfn;
498 	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
499 		ms = __pfn_to_section(pfn);
500 
501 		if (unlikely(!valid_section(ms)))
502 			continue;
503 
504 		if (pfn_to_nid(pfn) != nid)
505 			continue;
506 
507 		 /* Skip the section that is being removed */
508 		if (start_pfn == pfn)
509 			continue;
510 
511 		/* If we find valid section, we have nothing to do */
512 		return;
513 	}
514 
515 	/* The pgdat has no valid section */
516 	pgdat->node_start_pfn = 0;
517 	pgdat->node_spanned_pages = 0;
518 }
519 
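/*
 * Shrink both the zone and the node spans to account for the removal of a
 * single memory section starting at start_pfn; both spans are adjusted under
 * the pgdat resize lock.
 */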
520 static void __remove_zone(struct zone *zone, unsigned long start_pfn)
521 {
522 	struct pglist_data *pgdat = zone->zone_pgdat;
523 	int nr_pages = PAGES_PER_SECTION;
524 	unsigned long flags;
525 
526 	pgdat_resize_lock(zone->zone_pgdat, &flags);
527 	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
528 	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
529 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
530 }
531 
532 static int __remove_section(struct zone *zone, struct mem_section *ms,
533 		unsigned long map_offset)
534 {
535 	unsigned long start_pfn;
536 	int scn_nr;
537 	int ret = -EINVAL;
538 
539 	if (!valid_section(ms))
540 		return ret;
541 
542 	ret = unregister_memory_section(ms);
543 	if (ret)
544 		return ret;
545 
546 	scn_nr = __section_nr(ms);
547 	start_pfn = section_nr_to_pfn(scn_nr);
548 	__remove_zone(zone, start_pfn);
549 
550 	sparse_remove_one_section(zone, ms, map_offset);
551 	return 0;
552 }
553 
554 /**
555  * __remove_pages() - remove sections of pages from a zone
556  * @zone: zone from which pages need to be removed
557  * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
558  * @nr_pages: number of pages to remove (must be multiple of section size)
559  *
560  * Generic helper function to remove section mappings and sysfs entries
561  * for the section of the memory we are removing. Caller needs to make
562  * sure that pages are marked reserved and zones are adjusted properly by
563  * calling offline_pages().
564  */
565 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
566 		 unsigned long nr_pages)
567 {
568 	unsigned long i;
569 	unsigned long map_offset = 0;
570 	int sections_to_remove, ret = 0;
571 
572 	/* In the ZONE_DEVICE case device driver owns the memory region */
573 	/* In the ZONE_DEVICE case the device driver owns the memory region */
574 		struct page *page = pfn_to_page(phys_start_pfn);
575 		struct vmem_altmap *altmap;
576 
577 		altmap = to_vmem_altmap((unsigned long) page);
578 		if (altmap)
579 			map_offset = vmem_altmap_offset(altmap);
580 	} else {
581 		resource_size_t start, size;
582 
583 		start = phys_start_pfn << PAGE_SHIFT;
584 		size = nr_pages * PAGE_SIZE;
585 
586 		ret = release_mem_region_adjustable(&iomem_resource, start,
587 					size);
588 		if (ret) {
589 			resource_size_t endres = start + size - 1;
590 
591 			pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
592 					&start, &endres, ret);
593 		}
594 	}
595 
596 	clear_zone_contiguous(zone);
597 
598 	/*
599 	 * We can only remove entire sections
600 	 */
601 	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
602 	BUG_ON(nr_pages % PAGES_PER_SECTION);
603 
604 	sections_to_remove = nr_pages / PAGES_PER_SECTION;
605 	for (i = 0; i < sections_to_remove; i++) {
606 		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
607 
608 		ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
609 		map_offset = 0;
610 		if (ret)
611 			break;
612 	}
613 
614 	set_zone_contiguous(zone);
615 
616 	return ret;
617 }
618 #endif /* CONFIG_MEMORY_HOTREMOVE */
619 
620 int set_online_page_callback(online_page_callback_t callback)
621 {
622 	int rc = -EINVAL;
623 
624 	get_online_mems();
625 	mutex_lock(&online_page_callback_lock);
626 
627 	if (online_page_callback == generic_online_page) {
628 		online_page_callback = callback;
629 		rc = 0;
630 	}
631 
632 	mutex_unlock(&online_page_callback_lock);
633 	put_online_mems();
634 
635 	return rc;
636 }
637 EXPORT_SYMBOL_GPL(set_online_page_callback);
638 
639 int restore_online_page_callback(online_page_callback_t callback)
640 {
641 	int rc = -EINVAL;
642 
643 	get_online_mems();
644 	mutex_lock(&online_page_callback_lock);
645 
646 	if (online_page_callback == callback) {
647 		online_page_callback = generic_online_page;
648 		rc = 0;
649 	}
650 
651 	mutex_unlock(&online_page_callback_lock);
652 	put_online_mems();
653 
654 	return rc;
655 }
656 EXPORT_SYMBOL_GPL(restore_online_page_callback);
657 
658 void __online_page_set_limits(struct page *page)
659 {
660 }
661 EXPORT_SYMBOL_GPL(__online_page_set_limits);
662 
663 void __online_page_increment_counters(struct page *page)
664 {
665 	adjust_managed_page_count(page, 1);
666 }
667 EXPORT_SYMBOL_GPL(__online_page_increment_counters);
668 
669 void __online_page_free(struct page *page)
670 {
671 	__free_reserved_page(page);
672 }
673 EXPORT_SYMBOL_GPL(__online_page_free);
674 
675 static void generic_online_page(struct page *page)
676 {
677 	__online_page_set_limits(page);
678 	__online_page_increment_counters(page);
679 	__online_page_free(page);
680 }
681 
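/*
 * walk_system_ram_range() callback: online every page in the range through
 * the registered online_page_callback and accumulate the count in *arg.
 */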
682 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
683 			void *arg)
684 {
685 	unsigned long i;
686 	unsigned long onlined_pages = *(unsigned long *)arg;
687 	struct page *page;
688 
689 	if (PageReserved(pfn_to_page(start_pfn)))
690 		for (i = 0; i < nr_pages; i++) {
691 			page = pfn_to_page(start_pfn + i);
692 			(*online_page_callback)(page);
693 			onlined_pages++;
694 		}
695 
696 	online_mem_sections(start_pfn, start_pfn + nr_pages);
697 
698 	*(unsigned long *)arg = onlined_pages;
699 	return 0;
700 }
701 
702 /* check which states in node_states will be changed when memory is onlined */
703 static void node_states_check_changes_online(unsigned long nr_pages,
704 	struct zone *zone, struct memory_notify *arg)
705 {
706 	int nid = zone_to_nid(zone);
707 	enum zone_type zone_last = ZONE_NORMAL;
708 
709 	/*
710 	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
711 	 * contains nodes which have zones of 0...ZONE_NORMAL,
712 	 * set zone_last to ZONE_NORMAL.
713 	 *
714 	 * If we don't have HIGHMEM nor movable node,
715 	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
716 	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
717 	 */
718 	if (N_MEMORY == N_NORMAL_MEMORY)
719 		zone_last = ZONE_MOVABLE;
720 
721 	/*
722 	 * If the memory to be onlined is in a zone of 0...zone_last, and
723 	 * the zones of 0...zone_last had no memory before onlining, we will
724 	 * need to set the node in node_states[N_NORMAL_MEMORY] after
725 	 * the memory is onlined.
726 	 */
727 	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
728 		arg->status_change_nid_normal = nid;
729 	else
730 		arg->status_change_nid_normal = -1;
731 
732 #ifdef CONFIG_HIGHMEM
733 	/*
734 	 * If we have movable node, node_states[N_HIGH_MEMORY]
735 	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
736 	 * set zone_last to ZONE_HIGHMEM.
737 	 *
738 	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
739 	 * contains nodes which have zones of 0...ZONE_MOVABLE,
740 	 * set zone_last to ZONE_MOVABLE.
741 	 */
742 	zone_last = ZONE_HIGHMEM;
743 	if (N_MEMORY == N_HIGH_MEMORY)
744 		zone_last = ZONE_MOVABLE;
745 
746 	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
747 		arg->status_change_nid_high = nid;
748 	else
749 		arg->status_change_nid_high = -1;
750 #else
751 	arg->status_change_nid_high = arg->status_change_nid_normal;
752 #endif
753 
754 	/*
755 	 * If the node had no memory before onlining, we will need to
756 	 * set the node in node_states[N_MEMORY] after the memory
757 	 * is onlined.
758 	 */
759 	if (!node_state(nid, N_MEMORY))
760 		arg->status_change_nid = nid;
761 	else
762 		arg->status_change_nid = -1;
763 }
764 
765 static void node_states_set_node(int node, struct memory_notify *arg)
766 {
767 	if (arg->status_change_nid_normal >= 0)
768 		node_set_state(node, N_NORMAL_MEMORY);
769 
770 	if (arg->status_change_nid_high >= 0)
771 		node_set_state(node, N_HIGH_MEMORY);
772 
773 	node_set_state(node, N_MEMORY);
774 }
775 
776 bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
777 {
778 	struct pglist_data *pgdat = NODE_DATA(nid);
779 	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
780 	struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
781 
782 	/*
783 	 * TODO: there shouldn't be any inherent reason to have ZONE_NORMAL
784 	 * physically before ZONE_MOVABLE. All we need is that they do not
785 	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE,
786 	 * though, so let's stick with that for simplicity for now.
787 	 * TODO: make sure we do not overlap with ZONE_DEVICE
788 	 */
789 	if (online_type == MMOP_ONLINE_KERNEL) {
790 		if (zone_is_empty(movable_zone))
791 			return true;
792 		return movable_zone->zone_start_pfn >= pfn + nr_pages;
793 	} else if (online_type == MMOP_ONLINE_MOVABLE) {
794 		return zone_end_pfn(default_zone) <= pfn;
795 	}
796 
797 	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
798 	return online_type == MMOP_ONLINE_KEEP;
799 }
800 
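/*
 * Grow the zone span so that it also covers [start_pfn, start_pfn + nr_pages);
 * called with the zone span seqlock held for writing.
 */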
801 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
802 		unsigned long nr_pages)
803 {
804 	unsigned long old_end_pfn = zone_end_pfn(zone);
805 
806 	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
807 		zone->zone_start_pfn = start_pfn;
808 
809 	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
810 }
811 
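/*
 * Grow the node span so that it also covers [start_pfn, start_pfn + nr_pages);
 * called with the pgdat resize lock held.
 */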
812 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
813                                      unsigned long nr_pages)
814 {
815 	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
816 
817 	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
818 		pgdat->node_start_pfn = start_pfn;
819 
820 	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
821 }
822 
823 void __ref move_pfn_range_to_zone(struct zone *zone,
824 		unsigned long start_pfn, unsigned long nr_pages)
825 {
826 	struct pglist_data *pgdat = zone->zone_pgdat;
827 	int nid = pgdat->node_id;
828 	unsigned long flags;
829 
830 	if (zone_is_empty(zone))
831 		init_currently_empty_zone(zone, start_pfn, nr_pages);
832 
833 	clear_zone_contiguous(zone);
834 
835 	/* TODO: the pgdat resize lock is irqsave while the zone lock is not; it has been like that historically */
836 	pgdat_resize_lock(pgdat, &flags);
837 	zone_span_writelock(zone);
838 	resize_zone_range(zone, start_pfn, nr_pages);
839 	zone_span_writeunlock(zone);
840 	resize_pgdat_range(pgdat, start_pfn, nr_pages);
841 	pgdat_resize_unlock(pgdat, &flags);
842 
843 	/*
844 	 * TODO: at this point we have a visible range of pages which are not
845 	 * yet properly associated with their zone. Not nice, but
846 	 * set_pfnblock_flags_mask expects the zone to span the pfn range.
847 	 * All pages in the range are reserved so nobody should touch them.
848 	 */
849 	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
850 
851 	set_zone_contiguous(zone);
852 }
853 
854 /*
855  * Returns a default kernel memory zone for the given pfn range.
856  * If no kernel zone covers this pfn range it will automatically go
857  * to the ZONE_NORMAL.
858  */
859 struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
860 		unsigned long nr_pages)
861 {
862 	struct pglist_data *pgdat = NODE_DATA(nid);
863 	int zid;
864 
865 	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
866 		struct zone *zone = &pgdat->node_zones[zid];
867 
868 		if (zone_intersects(zone, start_pfn, nr_pages))
869 			return zone;
870 	}
871 
872 	return &pgdat->node_zones[ZONE_NORMAL];
873 }
874 
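/*
 * Return true if the range should go to ZONE_MOVABLE: either it cannot be
 * onlined into a kernel zone at all, or movable_node is enabled and the
 * range does not intersect the default kernel zone.
 */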
875 static inline bool movable_pfn_range(int nid, struct zone *default_zone,
876 		unsigned long start_pfn, unsigned long nr_pages)
877 {
878 	if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
879 				MMOP_ONLINE_KERNEL))
880 		return true;
881 
882 	if (!movable_node_is_enabled())
883 		return false;
884 
885 	return !zone_intersects(default_zone, start_pfn, nr_pages);
886 }
887 
888 /*
889  * Associates the given pfn range with the given node and the zone appropriate
890  * for the given online type.
891  */
892 static struct zone * __meminit move_pfn_range(int online_type, int nid,
893 		unsigned long start_pfn, unsigned long nr_pages)
894 {
895 	struct pglist_data *pgdat = NODE_DATA(nid);
896 	struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
897 
898 	if (online_type == MMOP_ONLINE_KEEP) {
899 		struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
900 		/*
901 		 * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but uses the
902 		 * movable zone if that is not possible (e.g. we are within
903 		 * or past the existing movable zone). The movable_node option
904 		 * overrides this and defaults to the movable zone.
905 		 */
906 		if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
907 			zone = movable_zone;
908 	} else if (online_type == MMOP_ONLINE_MOVABLE) {
909 		zone = &pgdat->node_zones[ZONE_MOVABLE];
910 	}
911 
912 	move_pfn_range_to_zone(zone, start_pfn, nr_pages);
913 	return zone;
914 }
915 
916 /* Must be protected by mem_hotplug_begin() */
917 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
918 {
919 	unsigned long flags;
920 	unsigned long onlined_pages = 0;
921 	struct zone *zone;
922 	int need_zonelists_rebuild = 0;
923 	int nid;
924 	int ret;
925 	struct memory_notify arg;
926 
927 	nid = pfn_to_nid(pfn);
928 	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
929 		return -EINVAL;
930 
931 	/* associate pfn range with the zone */
932 	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
933 
934 	arg.start_pfn = pfn;
935 	arg.nr_pages = nr_pages;
936 	node_states_check_changes_online(nr_pages, zone, &arg);
937 
938 	ret = memory_notify(MEM_GOING_ONLINE, &arg);
939 	ret = notifier_to_errno(ret);
940 	if (ret)
941 		goto failed_addition;
942 
943 	/*
944 	 * If this zone is not populated, then it is not in the zonelist.
945 	 * This means the page allocator ignores this zone.
946 	 * So, the zonelist must be updated after onlining.
947 	 */
948 	mutex_lock(&zonelists_mutex);
949 	if (!populated_zone(zone)) {
950 		need_zonelists_rebuild = 1;
951 		build_all_zonelists(NULL, zone);
952 	}
953 
954 	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
955 		online_pages_range);
956 	if (ret) {
957 		if (need_zonelists_rebuild)
958 			zone_pcp_reset(zone);
959 		mutex_unlock(&zonelists_mutex);
960 		goto failed_addition;
961 	}
962 
963 	zone->present_pages += onlined_pages;
964 
965 	pgdat_resize_lock(zone->zone_pgdat, &flags);
966 	zone->zone_pgdat->node_present_pages += onlined_pages;
967 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
968 
969 	if (onlined_pages) {
970 		node_states_set_node(nid, &arg);
971 		if (need_zonelists_rebuild)
972 			build_all_zonelists(NULL, NULL);
973 		else
974 			zone_pcp_update(zone);
975 	}
976 
977 	mutex_unlock(&zonelists_mutex);
978 
979 	init_per_zone_wmark_min();
980 
981 	if (onlined_pages) {
982 		kswapd_run(nid);
983 		kcompactd_run(nid);
984 	}
985 
986 	vm_total_pages = nr_free_pagecache_pages();
987 
988 	writeback_set_ratelimit();
989 
990 	if (onlined_pages)
991 		memory_notify(MEM_ONLINE, &arg);
992 	return 0;
993 
994 failed_addition:
995 	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
996 		 (unsigned long long) pfn << PAGE_SHIFT,
997 		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
998 	memory_notify(MEM_CANCEL_ONLINE, &arg);
999 	return ret;
1000 }
1001 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
1002 
1003 static void reset_node_present_pages(pg_data_t *pgdat)
1004 {
1005 	struct zone *z;
1006 
1007 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
1008 		z->present_pages = 0;
1009 
1010 	pgdat->node_present_pages = 0;
1011 }
1012 
1013 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1014 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1015 {
1016 	struct pglist_data *pgdat;
1017 	unsigned long zones_size[MAX_NR_ZONES] = {0};
1018 	unsigned long zholes_size[MAX_NR_ZONES] = {0};
1019 	unsigned long start_pfn = PFN_DOWN(start);
1020 
1021 	pgdat = NODE_DATA(nid);
1022 	if (!pgdat) {
1023 		pgdat = arch_alloc_nodedata(nid);
1024 		if (!pgdat)
1025 			return NULL;
1026 
1027 		arch_refresh_nodedata(nid, pgdat);
1028 	} else {
1029 		/*
1030 		 * Reset the nr_zones, order and classzone_idx before reuse.
1031 		 * Note that kswapd will init kswapd_classzone_idx properly
1032 		 * when it starts in the near future.
1033 		 */
1034 		pgdat->nr_zones = 0;
1035 		pgdat->kswapd_order = 0;
1036 		pgdat->kswapd_classzone_idx = 0;
1037 	}
1038 
1039 	/* we can use NODE_DATA(nid) from here */
1040 
1041 	/* Init the node's zones as empty zones; we don't have any present pages. */
1042 	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
1043 	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
1044 
1045 	/*
1046 	 * The node we allocated has no zone fallback lists. To avoid
1047 	 * accessing an uninitialized zonelist, build it here.
1048 	 */
1049 	mutex_lock(&zonelists_mutex);
1050 	build_all_zonelists(pgdat, NULL);
1051 	mutex_unlock(&zonelists_mutex);
1052 
1053 	/*
1054 	 * zone->managed_pages is set to an approximate value in
1055 	 * free_area_init_core(), which would cause
1056 	 * /sys/devices/system/node/nodeX/meminfo to report wrong data.
1057 	 * So reset it to 0 before any memory is onlined.
1058 	 */
1059 	reset_node_managed_pages(pgdat);
1060 
1061 	/*
1062 	 * When memory is hot-added, all the memory is in offline state. So
1063 	 * clear all zones' present_pages because they will be updated in
1064 	 * online_pages() and offline_pages().
1065 	 */
1066 	reset_node_present_pages(pgdat);
1067 
1068 	return pgdat;
1069 }
1070 
1071 static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
1072 {
1073 	arch_refresh_nodedata(nid, NULL);
1074 	free_percpu(pgdat->per_cpu_nodestats);
1075 	arch_free_nodedata(pgdat);
1076 	return;
1077 }
1078 
1079 
1080 /**
1081  * try_online_node - online a node if offlined
1082  *
1083  * called by cpu_up() to online a node without onlined memory.
1084  */
1085 int try_online_node(int nid)
1086 {
1087 	pg_data_t	*pgdat;
1088 	int	ret;
1089 
1090 	if (node_online(nid))
1091 		return 0;
1092 
1093 	mem_hotplug_begin();
1094 	pgdat = hotadd_new_pgdat(nid, 0);
1095 	if (!pgdat) {
1096 		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
1097 		ret = -ENOMEM;
1098 		goto out;
1099 	}
1100 	node_set_online(nid);
1101 	ret = register_one_node(nid);
1102 	BUG_ON(ret);
1103 
1104 	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
1105 		mutex_lock(&zonelists_mutex);
1106 		build_all_zonelists(NULL, NULL);
1107 		mutex_unlock(&zonelists_mutex);
1108 	}
1109 
1110 out:
1111 	mem_hotplug_done();
1112 	return ret;
1113 }
1114 
1115 static int check_hotplug_memory_range(u64 start, u64 size)
1116 {
1117 	u64 start_pfn = PFN_DOWN(start);
1118 	u64 nr_pages = size >> PAGE_SHIFT;
1119 
1120 	/* Memory range must be aligned with section */
1121 	if ((start_pfn & ~PAGE_SECTION_MASK) ||
1122 	    (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
1123 		pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
1124 				(unsigned long long)start,
1125 				(unsigned long long)size);
1126 		return -EINVAL;
1127 	}
1128 
1129 	return 0;
1130 }
1131 
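/* walk_memory_range() callback used to online a memory block via the driver core. */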
1132 static int online_memory_block(struct memory_block *mem, void *arg)
1133 {
1134 	return device_online(&mem->dev);
1135 }
1136 
1137 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1138 int __ref add_memory_resource(int nid, struct resource *res, bool online)
1139 {
1140 	u64 start, size;
1141 	pg_data_t *pgdat = NULL;
1142 	bool new_pgdat;
1143 	bool new_node;
1144 	int ret;
1145 
1146 	start = res->start;
1147 	size = resource_size(res);
1148 
1149 	ret = check_hotplug_memory_range(start, size);
1150 	if (ret)
1151 		return ret;
1152 
1153 	{	/* Stupid hack to suppress address-never-null warning */
1154 		void *p = NODE_DATA(nid);
1155 		new_pgdat = !p;
1156 	}
1157 
1158 	mem_hotplug_begin();
1159 
1160 	/*
1161 	 * Add new range to memblock so that when hotadd_new_pgdat() is called
1162 	 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
1163 	 * this new range and calculate total pages correctly.  The range will
1164 	 * be removed at hot-remove time.
1165 	 */
1166 	memblock_add_node(start, size, nid);
1167 
1168 	new_node = !node_online(nid);
1169 	if (new_node) {
1170 		pgdat = hotadd_new_pgdat(nid, start);
1171 		ret = -ENOMEM;
1172 		if (!pgdat)
1173 			goto error;
1174 	}
1175 
1176 	/* call arch's memory hotadd */
1177 	ret = arch_add_memory(nid, start, size, true);
1178 
1179 	if (ret < 0)
1180 		goto error;
1181 
1182 	/* we online node here. we can't roll back from here. */
1183 	node_set_online(nid);
1184 
1185 	if (new_node) {
1186 		unsigned long start_pfn = start >> PAGE_SHIFT;
1187 		unsigned long nr_pages = size >> PAGE_SHIFT;
1188 
1189 		ret = __register_one_node(nid);
1190 		if (ret)
1191 			goto register_fail;
1192 
1193 		/*
1194 		 * Link memory sections under this node. This is already
1195 		 * done when creating a memory section in register_new_memory,
1196 		 * but that depends on the node being registered, so offline
1197 		 * nodes have to go through register_node.
1198 		 * TODO: clean up this mess.
1199 		 */
1200 		ret = link_mem_sections(nid, start_pfn, nr_pages);
1201 register_fail:
1202 		/*
1203 		 * If the sysfs file of the new node can't be created, CPUs on
1204 		 * the node can't be hot-added. There is no way to roll back
1205 		 * now, so catch it with BUG_ON(), reluctantly.
1206 		 */
1207 		BUG_ON(ret);
1208 	}
1209 
1210 	/* create new memmap entry */
1211 	firmware_map_add_hotplug(start, start + size, "System RAM");
1212 
1213 	/* online pages if requested */
1214 	if (online)
1215 		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1216 				  NULL, online_memory_block);
1217 
1218 	goto out;
1219 
1220 error:
1221 	/* rollback pgdat allocation and others */
1222 	if (new_pgdat && pgdat)
1223 		rollback_node_hotadd(nid, pgdat);
1224 	memblock_remove(start, size);
1225 
1226 out:
1227 	mem_hotplug_done();
1228 	return ret;
1229 }
1230 EXPORT_SYMBOL_GPL(add_memory_resource);
1231 
1232 int __ref add_memory(int nid, u64 start, u64 size)
1233 {
1234 	struct resource *res;
1235 	int ret;
1236 
1237 	res = register_memory_resource(start, size);
1238 	if (IS_ERR(res))
1239 		return PTR_ERR(res);
1240 
1241 	ret = add_memory_resource(nid, res, memhp_auto_online);
1242 	if (ret < 0)
1243 		release_memory_resource(res);
1244 	return ret;
1245 }
1246 EXPORT_SYMBOL_GPL(add_memory);
1247 
1248 #ifdef CONFIG_MEMORY_HOTREMOVE
1249 /*
1250  * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
1251  * set and the size of the free page is given by page_order(). Using this,
1252  * the function determines if the pageblock contains only free pages.
1253  * Due to buddy constraints, a free page at least the size of a pageblock
1254  * will be located at the start of the pageblock.
1255  */
1256 static inline int pageblock_free(struct page *page)
1257 {
1258 	return PageBuddy(page) && page_order(page) >= pageblock_order;
1259 }
1260 
1261 /* Return the start of the next active pageblock after a given page */
1262 static struct page *next_active_pageblock(struct page *page)
1263 {
1264 	/* Ensure the starting page is pageblock-aligned */
1265 	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
1266 
1267 	/* If the entire pageblock is free, move to the end of free page */
1268 	if (pageblock_free(page)) {
1269 		int order;
1270 		/* Be careful: we don't hold any locks, so page_order can change under us. */
1271 		order = page_order(page);
1272 		if ((order < MAX_ORDER) && (order >= pageblock_order))
1273 			return page + (1 << order);
1274 	}
1275 
1276 	return page + pageblock_nr_pages;
1277 }
1278 
1279 /* Checks if this range of memory is likely to be hot-removable. */
1280 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
1281 {
1282 	struct page *page = pfn_to_page(start_pfn);
1283 	struct page *end_page = page + nr_pages;
1284 
1285 	/* Check the starting page of each pageblock within the range */
1286 	for (; page < end_page; page = next_active_pageblock(page)) {
1287 		if (!is_pageblock_removable_nolock(page))
1288 			return false;
1289 		cond_resched();
1290 	}
1291 
1292 	/* All pageblocks in the memory block are likely to be hot-removable */
1293 	return true;
1294 }
1295 
1296 /*
1297  * Confirm all pages in a range [start, end) belong to the same zone.
1298  * When true, return its valid [start, end).
1299  */
1300 int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
1301 			 unsigned long *valid_start, unsigned long *valid_end)
1302 {
1303 	unsigned long pfn, sec_end_pfn;
1304 	unsigned long start, end;
1305 	struct zone *zone = NULL;
1306 	struct page *page;
1307 	int i;
1308 	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
1309 	     pfn < end_pfn;
1310 	     pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
1311 		/* Make sure the memory section is present first */
1312 		if (!present_section_nr(pfn_to_section_nr(pfn)))
1313 			continue;
1314 		for (; pfn < sec_end_pfn && pfn < end_pfn;
1315 		     pfn += MAX_ORDER_NR_PAGES) {
1316 			i = 0;
1317 			/* This is just a CONFIG_HOLES_IN_ZONE check.*/
1318 			while ((i < MAX_ORDER_NR_PAGES) &&
1319 				!pfn_valid_within(pfn + i))
1320 				i++;
1321 			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
1322 				continue;
1323 			page = pfn_to_page(pfn + i);
1324 			if (zone && page_zone(page) != zone)
1325 				return 0;
1326 			if (!zone)
1327 				start = pfn + i;
1328 			zone = page_zone(page);
1329 			end = pfn + MAX_ORDER_NR_PAGES;
1330 		}
1331 	}
1332 
1333 	if (zone) {
1334 		*valid_start = start;
1335 		*valid_end = min(end, end_pfn);
1336 		return 1;
1337 	} else {
1338 		return 0;
1339 	}
1340 }
1341 
1342 /*
1343  * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
1344  * non-lru movable pages and hugepages). We scan by pfn because it's much
1345  * easier than scanning over a linked list. This function returns the pfn
1346  * of the first movable page found, or 0 if none is found.
1347  */
1348 static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1349 {
1350 	unsigned long pfn;
1351 	struct page *page;
1352 	for (pfn = start; pfn < end; pfn++) {
1353 		if (pfn_valid(pfn)) {
1354 			page = pfn_to_page(pfn);
1355 			if (PageLRU(page))
1356 				return pfn;
1357 			if (__PageMovable(page))
1358 				return pfn;
1359 			if (PageHuge(page)) {
1360 				if (page_huge_active(page))
1361 					return pfn;
1362 				else
1363 					pfn = round_up(pfn + 1,
1364 						1 << compound_order(page)) - 1;
1365 			}
1366 		}
1367 	}
1368 	return 0;
1369 }
1370 
1371 static struct page *new_node_page(struct page *page, unsigned long private,
1372 		int **result)
1373 {
1374 	int nid = page_to_nid(page);
1375 	nodemask_t nmask = node_states[N_MEMORY];
1376 
1377 	/*
1378 	 * try to allocate from a different node but reuse this node if there
1379 	 * are no other online nodes to be used (e.g. we are offlining a part
1380 	 * of the only existing node)
1381 	 */
1382 	node_clear(nid, nmask);
1383 	if (nodes_empty(nmask))
1384 		node_set(nid, nmask);
1385 
1386 	return new_page_nodemask(page, nid, &nmask);
1387 }
1388 
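/*
 * Isolate and migrate the movable pages (LRU, non-lru movable and hugetlb
 * pages) in [start_pfn, end_pfn); at most NR_OFFLINE_AT_ONCE_PAGES base pages
 * are moved per call, and new_node_page() above picks the target node.
 */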
1389 #define NR_OFFLINE_AT_ONCE_PAGES	(256)
1390 static int
1391 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1392 {
1393 	unsigned long pfn;
1394 	struct page *page;
1395 	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
1396 	int not_managed = 0;
1397 	int ret = 0;
1398 	LIST_HEAD(source);
1399 
1400 	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
1401 		if (!pfn_valid(pfn))
1402 			continue;
1403 		page = pfn_to_page(pfn);
1404 
1405 		if (PageHuge(page)) {
1406 			struct page *head = compound_head(page);
1407 			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1408 			if (compound_order(head) > PFN_SECTION_SHIFT) {
1409 				ret = -EBUSY;
1410 				break;
1411 			}
1412 			if (isolate_huge_page(page, &source))
1413 				move_pages -= 1 << compound_order(head);
1414 			continue;
1415 		}
1416 
1417 		if (!get_page_unless_zero(page))
1418 			continue;
1419 		/*
1420 		 * We can skip free pages. We can deal with LRU pages and
1421 		 * non-lru movable pages.
1422 		 */
1423 		if (PageLRU(page))
1424 			ret = isolate_lru_page(page);
1425 		else
1426 			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1427 		if (!ret) { /* Success */
1428 			put_page(page);
1429 			list_add_tail(&page->lru, &source);
1430 			move_pages--;
1431 			if (!__PageMovable(page))
1432 				inc_node_page_state(page, NR_ISOLATED_ANON +
1433 						    page_is_file_cache(page));
1434 
1435 		} else {
1436 #ifdef CONFIG_DEBUG_VM
1437 			pr_alert("failed to isolate pfn %lx\n", pfn);
1438 			dump_page(page, "isolation failed");
1439 #endif
1440 			put_page(page);
1441 			/* Because we don't hold the big zone->lock, we should
1442 			   check the page count again here. */
1443 			if (page_count(page)) {
1444 				not_managed++;
1445 				ret = -EBUSY;
1446 				break;
1447 			}
1448 		}
1449 	}
1450 	if (!list_empty(&source)) {
1451 		if (not_managed) {
1452 			putback_movable_pages(&source);
1453 			goto out;
1454 		}
1455 
1456 		/* Allocate a new page from the nearest neighbor node */
1457 		ret = migrate_pages(&source, new_node_page, NULL, 0,
1458 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1459 		if (ret)
1460 			putback_movable_pages(&source);
1461 	}
1462 out:
1463 	return ret;
1464 }
1465 
1466 /*
1467  * remove from free_area[] and mark all as Reserved.
1468  */
1469 static int
1470 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
1471 			void *data)
1472 {
1473 	__offline_isolated_pages(start, start + nr_pages);
1474 	return 0;
1475 }
1476 
1477 static void
1478 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
1479 {
1480 	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
1481 				offline_isolated_pages_cb);
1482 }
1483 
1484 /*
1485  * Check that all pages in the range, recorded as a memory resource, are isolated.
1486  */
1487 static int
1488 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
1489 			void *data)
1490 {
1491 	int ret;
1492 	long offlined = *(long *)data;
1493 	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
1494 	offlined = nr_pages;
1495 	if (!ret)
1496 		*(long *)data += offlined;
1497 	return ret;
1498 }
1499 
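/*
 * Return the number of isolated pages counted in [start_pfn, end_pfn), or a
 * negative error code if some pages in the range are not isolated.
 */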
1500 static long
1501 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
1502 {
1503 	long offlined = 0;
1504 	int ret;
1505 
1506 	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
1507 			check_pages_isolated_cb);
1508 	if (ret < 0)
1509 		offlined = (long)ret;
1510 	return offlined;
1511 }
1512 
1513 static int __init cmdline_parse_movable_node(char *p)
1514 {
1515 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1516 	movable_node_enabled = true;
1517 #else
1518 	pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
1519 #endif
1520 	return 0;
1521 }
1522 early_param("movable_node", cmdline_parse_movable_node);
1523 
1524 /* check which states in node_states will be changed when memory is offlined */
1525 static void node_states_check_changes_offline(unsigned long nr_pages,
1526 		struct zone *zone, struct memory_notify *arg)
1527 {
1528 	struct pglist_data *pgdat = zone->zone_pgdat;
1529 	unsigned long present_pages = 0;
1530 	enum zone_type zt, zone_last = ZONE_NORMAL;
1531 
1532 	/*
1533 	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1534 	 * contains nodes which have zones of 0...ZONE_NORMAL,
1535 	 * set zone_last to ZONE_NORMAL.
1536 	 *
1537 	 * If we don't have HIGHMEM nor movable node,
1538 	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1539 	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1540 	 */
1541 	if (N_MEMORY == N_NORMAL_MEMORY)
1542 		zone_last = ZONE_MOVABLE;
1543 
1544 	/*
1545 	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
1546 	 * If the memory to be offlined is in a zone of 0...zone_last,
1547 	 * and it is the last present memory there, 0...zone_last will
1548 	 * become empty after offlining, so we can determine that the node
1549 	 * will need to be cleared from node_states[N_NORMAL_MEMORY].
1550 	 */
1551 	for (zt = 0; zt <= zone_last; zt++)
1552 		present_pages += pgdat->node_zones[zt].present_pages;
1553 	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1554 		arg->status_change_nid_normal = zone_to_nid(zone);
1555 	else
1556 		arg->status_change_nid_normal = -1;
1557 
1558 #ifdef CONFIG_HIGHMEM
1559 	/*
1560 	 * If we have movable node, node_states[N_HIGH_MEMORY]
1561 	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
1562 	 * set zone_last to ZONE_HIGHMEM.
1563 	 *
1564 	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1565 	 * contains nodes which have zones of 0...ZONE_MOVABLE,
1566 	 * set zone_last to ZONE_MOVABLE.
1567 	 */
1568 	zone_last = ZONE_HIGHMEM;
1569 	if (N_MEMORY == N_HIGH_MEMORY)
1570 		zone_last = ZONE_MOVABLE;
1571 
1572 	for (; zt <= zone_last; zt++)
1573 		present_pages += pgdat->node_zones[zt].present_pages;
1574 	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1575 		arg->status_change_nid_high = zone_to_nid(zone);
1576 	else
1577 		arg->status_change_nid_high = -1;
1578 #else
1579 	arg->status_change_nid_high = arg->status_change_nid_normal;
1580 #endif
1581 
1582 	/*
1583 	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1584 	 */
1585 	zone_last = ZONE_MOVABLE;
1586 
1587 	/*
1588 	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
1589 	 * If we try to offline the last present @nr_pages from the node,
1590 	 * we can determine that the node will need to be cleared from
1591 	 * node_states[N_HIGH_MEMORY].
1592 	 */
1593 	for (; zt <= zone_last; zt++)
1594 		present_pages += pgdat->node_zones[zt].present_pages;
1595 	if (nr_pages >= present_pages)
1596 		arg->status_change_nid = zone_to_nid(zone);
1597 	else
1598 		arg->status_change_nid = -1;
1599 }
1600 
1601 static void node_states_clear_node(int node, struct memory_notify *arg)
1602 {
1603 	if (arg->status_change_nid_normal >= 0)
1604 		node_clear_state(node, N_NORMAL_MEMORY);
1605 
1606 	if ((N_MEMORY != N_NORMAL_MEMORY) &&
1607 	    (arg->status_change_nid_high >= 0))
1608 		node_clear_state(node, N_HIGH_MEMORY);
1609 
1610 	if ((N_MEMORY != N_HIGH_MEMORY) &&
1611 	    (arg->status_change_nid >= 0))
1612 		node_clear_state(node, N_MEMORY);
1613 }
1614 
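/*
 * Offline the pages in [start_pfn, end_pfn): isolate the range, migrate the
 * movable pages away, then remove the offlined pages from the allocator and
 * fix up zone/node accounting. Gives up with an error after @timeout jiffies.
 */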
1615 static int __ref __offline_pages(unsigned long start_pfn,
1616 		  unsigned long end_pfn, unsigned long timeout)
1617 {
1618 	unsigned long pfn, nr_pages, expire;
1619 	long offlined_pages;
1620 	int ret, drain, retry_max, node;
1621 	unsigned long flags;
1622 	unsigned long valid_start, valid_end;
1623 	struct zone *zone;
1624 	struct memory_notify arg;
1625 
1626 	/* at least, alignment against pageblock is necessary */
1627 	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1628 		return -EINVAL;
1629 	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
1630 		return -EINVAL;
1631 	/* This makes hotplug much easier... and more readable.
1632 	   We assume this for now. */
1633 	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
1634 		return -EINVAL;
1635 
1636 	zone = page_zone(pfn_to_page(valid_start));
1637 	node = zone_to_nid(zone);
1638 	nr_pages = end_pfn - start_pfn;
1639 
1640 	/* set above range as isolated */
1641 	ret = start_isolate_page_range(start_pfn, end_pfn,
1642 				       MIGRATE_MOVABLE, true);
1643 	if (ret)
1644 		return ret;
1645 
1646 	arg.start_pfn = start_pfn;
1647 	arg.nr_pages = nr_pages;
1648 	node_states_check_changes_offline(nr_pages, zone, &arg);
1649 
1650 	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
1651 	ret = notifier_to_errno(ret);
1652 	if (ret)
1653 		goto failed_removal;
1654 
1655 	pfn = start_pfn;
1656 	expire = jiffies + timeout;
1657 	drain = 0;
1658 	retry_max = 5;
1659 repeat:
1660 	/* start memory hot removal */
1661 	ret = -EAGAIN;
1662 	if (time_after(jiffies, expire))
1663 		goto failed_removal;
1664 	ret = -EINTR;
1665 	if (signal_pending(current))
1666 		goto failed_removal;
1667 	ret = 0;
1668 	if (drain) {
1669 		lru_add_drain_all_cpuslocked();
1670 		cond_resched();
1671 		drain_all_pages(zone);
1672 	}
1673 
1674 	pfn = scan_movable_pages(start_pfn, end_pfn);
1675 	if (pfn) { /* We have movable pages */
1676 		ret = do_migrate_range(pfn, end_pfn);
1677 		if (!ret) {
1678 			drain = 1;
1679 			goto repeat;
1680 		} else {
1681 			if (ret < 0)
1682 				if (--retry_max == 0)
1683 					goto failed_removal;
1684 			yield();
1685 			drain = 1;
1686 			goto repeat;
1687 		}
1688 	}
1689 	/* drain all zones' lru pagevecs; this is asynchronous... */
1690 	lru_add_drain_all_cpuslocked();
1691 	yield();
1692 	/* drain pcp pages, this is synchronous. */
1693 	drain_all_pages(zone);
1694 	/*
1695 	 * Dissolve free hugepages in the memory block before actually
1696 	 * offlining, in order to keep hugetlbfs's object counting consistent.
1697 	 */
1698 	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1699 	if (ret)
1700 		goto failed_removal;
1701 	/* check again */
1702 	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1703 	if (offlined_pages < 0) {
1704 		ret = -EBUSY;
1705 		goto failed_removal;
1706 	}
1707 	pr_info("Offlined Pages %ld\n", offlined_pages);
1708 	/* OK, all of our target range is isolated.
1709 	   We cannot roll back from this point on. */
1710 	offline_isolated_pages(start_pfn, end_pfn);
1711 	/* reset pagetype flags and make the migratetype MOVABLE */
1712 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1713 	/* removal success */
1714 	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1715 	zone->present_pages -= offlined_pages;
1716 
1717 	pgdat_resize_lock(zone->zone_pgdat, &flags);
1718 	zone->zone_pgdat->node_present_pages -= offlined_pages;
1719 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
1720 
1721 	init_per_zone_wmark_min();
1722 
1723 	if (!populated_zone(zone)) {
1724 		zone_pcp_reset(zone);
1725 		mutex_lock(&zonelists_mutex);
1726 		build_all_zonelists(NULL, NULL);
1727 		mutex_unlock(&zonelists_mutex);
1728 	} else
1729 		zone_pcp_update(zone);
1730 
1731 	node_states_clear_node(node, &arg);
1732 	if (arg.status_change_nid >= 0) {
1733 		kswapd_stop(node);
1734 		kcompactd_stop(node);
1735 	}
1736 
1737 	vm_total_pages = nr_free_pagecache_pages();
1738 	writeback_set_ratelimit();
1739 
1740 	memory_notify(MEM_OFFLINE, &arg);
1741 	return 0;
1742 
1743 failed_removal:
1744 	pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
1745 		 (unsigned long long) start_pfn << PAGE_SHIFT,
1746 		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
1747 	memory_notify(MEM_CANCEL_OFFLINE, &arg);
1748 	/* pushback to free area */
1749 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1750 	return ret;
1751 }
1752 
1753 /* Must be protected by mem_hotplug_begin() */
1754 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1755 {
1756 	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1757 }
1758 #endif /* CONFIG_MEMORY_HOTREMOVE */
1759 
1760 /**
1761  * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1762  * @start_pfn: start pfn of the memory range
1763  * @end_pfn: end pfn of the memory range
1764  * @arg: argument passed to func
1765  * @func: callback for each memory section walked
1766  *
1767  * This function walks through all present mem sections in range
1768  * [start_pfn, end_pfn) and call func on each mem section.
1769  *
1770  * Returns the return value of func.
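 *
 * Example (illustrative; mirrors the call in add_memory_resource()):
 *
 *	walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
 *			  NULL, online_memory_block);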
1771  */
1772 int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1773 		void *arg, int (*func)(struct memory_block *, void *))
1774 {
1775 	struct memory_block *mem = NULL;
1776 	struct mem_section *section;
1777 	unsigned long pfn, section_nr;
1778 	int ret;
1779 
1780 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1781 		section_nr = pfn_to_section_nr(pfn);
1782 		if (!present_section_nr(section_nr))
1783 			continue;
1784 
1785 		section = __nr_to_section(section_nr);
1786 		/* same memblock? */
1787 		if (mem)
1788 			if ((section_nr >= mem->start_section_nr) &&
1789 			    (section_nr <= mem->end_section_nr))
1790 				continue;
1791 
1792 		mem = find_memory_block_hinted(section, mem);
1793 		if (!mem)
1794 			continue;
1795 
1796 		ret = func(mem, arg);
1797 		if (ret) {
1798 			kobject_put(&mem->dev.kobj);
1799 			return ret;
1800 		}
1801 	}
1802 
1803 	if (mem)
1804 		kobject_put(&mem->dev.kobj);
1805 
1806 	return 0;
1807 }
1808 
1809 #ifdef CONFIG_MEMORY_HOTREMOVE
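/*
 * walk_memory_range() callback: return non-zero (and print a warning) if the
 * memory block is still online; remove_memory() uses this to verify that all
 * blocks in the range have been offlined.
 */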
1810 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
1811 {
1812 	int ret = !is_memblock_offlined(mem);
1813 
1814 	if (unlikely(ret)) {
1815 		phys_addr_t beginpa, endpa;
1816 
1817 		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1818 		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1819 		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
1820 			&beginpa, &endpa);
1821 	}
1822 
1823 	return ret;
1824 }
1825 
1826 static int check_cpu_on_node(pg_data_t *pgdat)
1827 {
1828 	int cpu;
1829 
1830 	for_each_present_cpu(cpu) {
1831 		if (cpu_to_node(cpu) == pgdat->node_id)
1832 			/*
1833 			 * A CPU on this node hasn't been removed, so we
1834 			 * can't offline this node.
1835 			 */
1836 			return -EBUSY;
1837 	}
1838 
1839 	return 0;
1840 }
1841 
1842 static void unmap_cpu_on_node(pg_data_t *pgdat)
1843 {
1844 #ifdef CONFIG_ACPI_NUMA
1845 	int cpu;
1846 
1847 	for_each_possible_cpu(cpu)
1848 		if (cpu_to_node(cpu) == pgdat->node_id)
1849 			numa_clear_node(cpu);
1850 #endif
1851 }
1852 
1853 static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1854 {
1855 	int ret;
1856 
1857 	ret = check_cpu_on_node(pgdat);
1858 	if (ret)
1859 		return ret;
1860 
1861 	/*
1862 	 * The node will be offlined when we get here, so we can clear
1863 	 * the cpu_to_node() mapping now.
1864 	 */
1865 
1866 	unmap_cpu_on_node(pgdat);
1867 	return 0;
1868 }
1869 
1870 /**
1871  * try_offline_node
1872  *
1873  * Offline a node if all memory sections and cpus of the node are removed.
1874  *
1875  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1876  * and online/offline operations before this call.
1877  */
1878 void try_offline_node(int nid)
1879 {
1880 	pg_data_t *pgdat = NODE_DATA(nid);
1881 	unsigned long start_pfn = pgdat->node_start_pfn;
1882 	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1883 	unsigned long pfn;
1884 
1885 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1886 		unsigned long section_nr = pfn_to_section_nr(pfn);
1887 
1888 		if (!present_section_nr(section_nr))
1889 			continue;
1890 
1891 		if (pfn_to_nid(pfn) != nid)
1892 			continue;
1893 
1894 		/*
1895 		 * Some memory sections of this node have not been removed,
1896 		 * so we can't offline the node now.
1897 		 */
1898 		return;
1899 	}
1900 
1901 	if (check_and_unmap_cpu_on_node(pgdat))
1902 		return;
1903 
1904 	/*
1905 	 * All memory and CPUs of this node have been removed; we can
1906 	 * offline the node now.
1907 	 */
1908 	node_set_offline(nid);
1909 	unregister_one_node(nid);
1910 }
1911 EXPORT_SYMBOL(try_offline_node);
1912 
1913 /**
1914  * remove_memory
1915  *
1916  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1917  * and online/offline operations before this call, as required by
1918  * try_offline_node().
1919  */
1920 void __ref remove_memory(int nid, u64 start, u64 size)
1921 {
1922 	int ret;
1923 
1924 	BUG_ON(check_hotplug_memory_range(start, size));
1925 
1926 	mem_hotplug_begin();
1927 
1928 	/*
1929 	 * All memory blocks must be offlined before removing memory.  Check
1930 	 * whether all memory blocks in question are offline and trigger a BUG()
1931 	 * if this is not the case.
1932 	 */
1933 	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1934 				check_memblock_offlined_cb);
1935 	if (ret)
1936 		BUG();
1937 
1938 	/* remove memmap entry */
1939 	firmware_map_remove(start, start + size, "System RAM");
1940 	memblock_free(start, size);
1941 	memblock_remove(start, size);
1942 
1943 	arch_remove_memory(start, size);
1944 
1945 	try_offline_node(nid);
1946 
1947 	mem_hotplug_done();
1948 }
1949 EXPORT_SYMBOL_GPL(remove_memory);
1950 #endif /* CONFIG_MEMORY_HOTREMOVE */
1951