xref: /openbmc/linux/mm/memory_hotplug.c (revision d15e5926)
1 /*
2  *  linux/mm/memory_hotplug.c
3  *
4  *  Copyright (C)
5  */
6 
7 #include <linux/stddef.h>
8 #include <linux/mm.h>
9 #include <linux/sched/signal.h>
10 #include <linux/swap.h>
11 #include <linux/interrupt.h>
12 #include <linux/pagemap.h>
13 #include <linux/compiler.h>
14 #include <linux/export.h>
15 #include <linux/pagevec.h>
16 #include <linux/writeback.h>
17 #include <linux/slab.h>
18 #include <linux/sysctl.h>
19 #include <linux/cpu.h>
20 #include <linux/memory.h>
21 #include <linux/memremap.h>
22 #include <linux/memory_hotplug.h>
23 #include <linux/highmem.h>
24 #include <linux/vmalloc.h>
25 #include <linux/ioport.h>
26 #include <linux/delay.h>
27 #include <linux/migrate.h>
28 #include <linux/page-isolation.h>
29 #include <linux/pfn.h>
30 #include <linux/suspend.h>
31 #include <linux/mm_inline.h>
32 #include <linux/firmware-map.h>
33 #include <linux/stop_machine.h>
34 #include <linux/hugetlb.h>
35 #include <linux/memblock.h>
36 #include <linux/compaction.h>
37 
38 #include <asm/tlbflush.h>
39 
40 #include "internal.h"
41 
42 /*
43  * online_page_callback contains pointer to current page onlining function.
44  * Initially it is generic_online_page(). If it is required it could be
45  * changed by calling set_online_page_callback() for callback registration
46  * and restore_online_page_callback() for generic callback restore.
47  */
48 
49 static void generic_online_page(struct page *page);
50 
51 static online_page_callback_t online_page_callback = generic_online_page;
52 static DEFINE_MUTEX(online_page_callback_lock);
53 
54 DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
55 
56 void get_online_mems(void)
57 {
58 	percpu_down_read(&mem_hotplug_lock);
59 }
60 
61 void put_online_mems(void)
62 {
63 	percpu_up_read(&mem_hotplug_lock);
64 }
65 
66 bool movable_node_enabled = false;
67 
68 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
69 bool memhp_auto_online;
70 #else
71 bool memhp_auto_online = true;
72 #endif
73 EXPORT_SYMBOL_GPL(memhp_auto_online);
74 
75 static int __init setup_memhp_default_state(char *str)
76 {
77 	if (!strcmp(str, "online"))
78 		memhp_auto_online = true;
79 	else if (!strcmp(str, "offline"))
80 		memhp_auto_online = false;
81 
82 	return 1;
83 }
84 __setup("memhp_default_state=", setup_memhp_default_state);
85 
86 void mem_hotplug_begin(void)
87 {
88 	cpus_read_lock();
89 	percpu_down_write(&mem_hotplug_lock);
90 }
91 
92 void mem_hotplug_done(void)
93 {
94 	percpu_up_write(&mem_hotplug_lock);
95 	cpus_read_unlock();
96 }
97 
98 /* add this memory to iomem resource */
99 static struct resource *register_memory_resource(u64 start, u64 size)
100 {
101 	struct resource *res, *conflict;
102 	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
103 	if (!res)
104 		return ERR_PTR(-ENOMEM);
105 
106 	res->name = "System RAM";
107 	res->start = start;
108 	res->end = start + size - 1;
109 	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
110 	conflict =  request_resource_conflict(&iomem_resource, res);
111 	if (conflict) {
112 		if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
113 			pr_debug("Device unaddressable memory block "
114 				 "memory hotplug at %#010llx !\n",
115 				 (unsigned long long)start);
116 		}
117 		pr_debug("System RAM resource %pR cannot be added\n", res);
118 		kfree(res);
119 		return ERR_PTR(-EEXIST);
120 	}
121 	return res;
122 }
123 
/*
 * Release and free a resource obtained from register_memory_resource().
 * A NULL @res is silently ignored.
 */
static void release_memory_resource(struct resource *res)
{
	if (res) {
		release_resource(res);
		kfree(res);
	}
}
132 
133 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
134 void get_page_bootmem(unsigned long info,  struct page *page,
135 		      unsigned long type)
136 {
137 	page->freelist = (void *)type;
138 	SetPagePrivate(page);
139 	set_page_private(page, info);
140 	page_ref_inc(page);
141 }
142 
143 void put_page_bootmem(struct page *page)
144 {
145 	unsigned long type;
146 
147 	type = (unsigned long) page->freelist;
148 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
149 	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
150 
151 	if (page_ref_dec_return(page) == 1) {
152 		page->freelist = NULL;
153 		ClearPagePrivate(page);
154 		set_page_private(page, 0);
155 		INIT_LIST_HEAD(&page->lru);
156 		free_reserved_page(page);
157 	}
158 }
159 
160 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
161 #ifndef CONFIG_SPARSEMEM_VMEMMAP
162 static void register_page_bootmem_info_section(unsigned long start_pfn)
163 {
164 	unsigned long *usemap, mapsize, section_nr, i;
165 	struct mem_section *ms;
166 	struct page *page, *memmap;
167 
168 	section_nr = pfn_to_section_nr(start_pfn);
169 	ms = __nr_to_section(section_nr);
170 
171 	/* Get section's memmap address */
172 	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
173 
174 	/*
175 	 * Get page for the memmap's phys address
176 	 * XXX: need more consideration for sparse_vmemmap...
177 	 */
178 	page = virt_to_page(memmap);
179 	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
180 	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
181 
182 	/* remember memmap's page */
183 	for (i = 0; i < mapsize; i++, page++)
184 		get_page_bootmem(section_nr, page, SECTION_INFO);
185 
186 	usemap = ms->pageblock_flags;
187 	page = virt_to_page(usemap);
188 
189 	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
190 
191 	for (i = 0; i < mapsize; i++, page++)
192 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
193 
194 }
195 #else /* CONFIG_SPARSEMEM_VMEMMAP */
196 static void register_page_bootmem_info_section(unsigned long start_pfn)
197 {
198 	unsigned long *usemap, mapsize, section_nr, i;
199 	struct mem_section *ms;
200 	struct page *page, *memmap;
201 
202 	section_nr = pfn_to_section_nr(start_pfn);
203 	ms = __nr_to_section(section_nr);
204 
205 	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
206 
207 	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
208 
209 	usemap = ms->pageblock_flags;
210 	page = virt_to_page(usemap);
211 
212 	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
213 
214 	for (i = 0; i < mapsize; i++, page++)
215 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
216 }
217 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
218 
219 void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
220 {
221 	unsigned long i, pfn, end_pfn, nr_pages;
222 	int node = pgdat->node_id;
223 	struct page *page;
224 
225 	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
226 	page = virt_to_page(pgdat);
227 
228 	for (i = 0; i < nr_pages; i++, page++)
229 		get_page_bootmem(node, page, NODE_INFO);
230 
231 	pfn = pgdat->node_start_pfn;
232 	end_pfn = pgdat_end_pfn(pgdat);
233 
234 	/* register section info */
235 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
236 		/*
237 		 * Some platforms can assign the same pfn to multiple nodes - on
238 		 * node0 as well as nodeN.  To avoid registering a pfn against
239 		 * multiple nodes we check that this pfn does not already
240 		 * reside in some other nodes.
241 		 */
242 		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
243 			register_page_bootmem_info_section(pfn);
244 	}
245 }
246 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
247 
248 static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
249 		struct vmem_altmap *altmap, bool want_memblock)
250 {
251 	int ret;
252 
253 	if (pfn_valid(phys_start_pfn))
254 		return -EEXIST;
255 
256 	ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap);
257 	if (ret < 0)
258 		return ret;
259 
260 	if (!want_memblock)
261 		return 0;
262 
263 	return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
264 }
265 
266 /*
267  * Reasonably generic function for adding memory.  It is
268  * expected that archs that support memory hotplug will
269  * call this function after deciding the zone to which to
270  * add the new pages.
271  */
272 int __ref __add_pages(int nid, unsigned long phys_start_pfn,
273 		unsigned long nr_pages, struct vmem_altmap *altmap,
274 		bool want_memblock)
275 {
276 	unsigned long i;
277 	int err = 0;
278 	int start_sec, end_sec;
279 
280 	/* during initialize mem_map, align hot-added range to section */
281 	start_sec = pfn_to_section_nr(phys_start_pfn);
282 	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
283 
284 	if (altmap) {
285 		/*
286 		 * Validate altmap is within bounds of the total request
287 		 */
288 		if (altmap->base_pfn != phys_start_pfn
289 				|| vmem_altmap_offset(altmap) > nr_pages) {
290 			pr_warn_once("memory add fail, invalid altmap\n");
291 			err = -EINVAL;
292 			goto out;
293 		}
294 		altmap->alloc = 0;
295 	}
296 
297 	for (i = start_sec; i <= end_sec; i++) {
298 		err = __add_section(nid, section_nr_to_pfn(i), altmap,
299 				want_memblock);
300 
301 		/*
302 		 * EEXIST is finally dealt with by ioresource collision
303 		 * check. see add_memory() => register_memory_resource()
304 		 * Warning will be printed if there is collision.
305 		 */
306 		if (err && (err != -EEXIST))
307 			break;
308 		err = 0;
309 		cond_resched();
310 	}
311 	vmemmap_populate_print_last();
312 out:
313 	return err;
314 }
315 
316 #ifdef CONFIG_MEMORY_HOTREMOVE
317 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */
318 static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
319 				     unsigned long start_pfn,
320 				     unsigned long end_pfn)
321 {
322 	struct mem_section *ms;
323 
324 	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
325 		ms = __pfn_to_section(start_pfn);
326 
327 		if (unlikely(!valid_section(ms)))
328 			continue;
329 
330 		if (unlikely(pfn_to_nid(start_pfn) != nid))
331 			continue;
332 
333 		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
334 			continue;
335 
336 		return start_pfn;
337 	}
338 
339 	return 0;
340 }
341 
342 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */
343 static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
344 				    unsigned long start_pfn,
345 				    unsigned long end_pfn)
346 {
347 	struct mem_section *ms;
348 	unsigned long pfn;
349 
350 	/* pfn is the end pfn of a memory section. */
351 	pfn = end_pfn - 1;
352 	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
353 		ms = __pfn_to_section(pfn);
354 
355 		if (unlikely(!valid_section(ms)))
356 			continue;
357 
358 		if (unlikely(pfn_to_nid(pfn) != nid))
359 			continue;
360 
361 		if (zone && zone != page_zone(pfn_to_page(pfn)))
362 			continue;
363 
364 		return pfn;
365 	}
366 
367 	return 0;
368 }
369 
370 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
371 			     unsigned long end_pfn)
372 {
373 	unsigned long zone_start_pfn = zone->zone_start_pfn;
374 	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
375 	unsigned long zone_end_pfn = z;
376 	unsigned long pfn;
377 	struct mem_section *ms;
378 	int nid = zone_to_nid(zone);
379 
380 	zone_span_writelock(zone);
381 	if (zone_start_pfn == start_pfn) {
382 		/*
383 		 * If the section is smallest section in the zone, it need
384 		 * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
385 		 * In this case, we find second smallest valid mem_section
386 		 * for shrinking zone.
387 		 */
388 		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
389 						zone_end_pfn);
390 		if (pfn) {
391 			zone->zone_start_pfn = pfn;
392 			zone->spanned_pages = zone_end_pfn - pfn;
393 		}
394 	} else if (zone_end_pfn == end_pfn) {
395 		/*
396 		 * If the section is biggest section in the zone, it need
397 		 * shrink zone->spanned_pages.
398 		 * In this case, we find second biggest valid mem_section for
399 		 * shrinking zone.
400 		 */
401 		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
402 					       start_pfn);
403 		if (pfn)
404 			zone->spanned_pages = pfn - zone_start_pfn + 1;
405 	}
406 
407 	/*
408 	 * The section is not biggest or smallest mem_section in the zone, it
409 	 * only creates a hole in the zone. So in this case, we need not
410 	 * change the zone. But perhaps, the zone has only hole data. Thus
411 	 * it check the zone has only hole or not.
412 	 */
413 	pfn = zone_start_pfn;
414 	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
415 		ms = __pfn_to_section(pfn);
416 
417 		if (unlikely(!valid_section(ms)))
418 			continue;
419 
420 		if (page_zone(pfn_to_page(pfn)) != zone)
421 			continue;
422 
423 		 /* If the section is current section, it continues the loop */
424 		if (start_pfn == pfn)
425 			continue;
426 
427 		/* If we find valid section, we have nothing to do */
428 		zone_span_writeunlock(zone);
429 		return;
430 	}
431 
432 	/* The zone has no valid section */
433 	zone->zone_start_pfn = 0;
434 	zone->spanned_pages = 0;
435 	zone_span_writeunlock(zone);
436 }
437 
438 static void shrink_pgdat_span(struct pglist_data *pgdat,
439 			      unsigned long start_pfn, unsigned long end_pfn)
440 {
441 	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
442 	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
443 	unsigned long pgdat_end_pfn = p;
444 	unsigned long pfn;
445 	struct mem_section *ms;
446 	int nid = pgdat->node_id;
447 
448 	if (pgdat_start_pfn == start_pfn) {
449 		/*
450 		 * If the section is smallest section in the pgdat, it need
451 		 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
452 		 * In this case, we find second smallest valid mem_section
453 		 * for shrinking zone.
454 		 */
455 		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
456 						pgdat_end_pfn);
457 		if (pfn) {
458 			pgdat->node_start_pfn = pfn;
459 			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
460 		}
461 	} else if (pgdat_end_pfn == end_pfn) {
462 		/*
463 		 * If the section is biggest section in the pgdat, it need
464 		 * shrink pgdat->node_spanned_pages.
465 		 * In this case, we find second biggest valid mem_section for
466 		 * shrinking zone.
467 		 */
468 		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
469 					       start_pfn);
470 		if (pfn)
471 			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
472 	}
473 
474 	/*
475 	 * If the section is not biggest or smallest mem_section in the pgdat,
476 	 * it only creates a hole in the pgdat. So in this case, we need not
477 	 * change the pgdat.
478 	 * But perhaps, the pgdat has only hole data. Thus it check the pgdat
479 	 * has only hole or not.
480 	 */
481 	pfn = pgdat_start_pfn;
482 	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
483 		ms = __pfn_to_section(pfn);
484 
485 		if (unlikely(!valid_section(ms)))
486 			continue;
487 
488 		if (pfn_to_nid(pfn) != nid)
489 			continue;
490 
491 		 /* If the section is current section, it continues the loop */
492 		if (start_pfn == pfn)
493 			continue;
494 
495 		/* If we find valid section, we have nothing to do */
496 		return;
497 	}
498 
499 	/* The pgdat has no valid section */
500 	pgdat->node_start_pfn = 0;
501 	pgdat->node_spanned_pages = 0;
502 }
503 
504 static void __remove_zone(struct zone *zone, unsigned long start_pfn)
505 {
506 	struct pglist_data *pgdat = zone->zone_pgdat;
507 	int nr_pages = PAGES_PER_SECTION;
508 	unsigned long flags;
509 
510 	pgdat_resize_lock(zone->zone_pgdat, &flags);
511 	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
512 	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
513 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
514 }
515 
516 static int __remove_section(struct zone *zone, struct mem_section *ms,
517 		unsigned long map_offset, struct vmem_altmap *altmap)
518 {
519 	unsigned long start_pfn;
520 	int scn_nr;
521 	int ret = -EINVAL;
522 
523 	if (!valid_section(ms))
524 		return ret;
525 
526 	ret = unregister_memory_section(ms);
527 	if (ret)
528 		return ret;
529 
530 	scn_nr = __section_nr(ms);
531 	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
532 	__remove_zone(zone, start_pfn);
533 
534 	sparse_remove_one_section(zone, ms, map_offset, altmap);
535 	return 0;
536 }
537 
538 /**
539  * __remove_pages() - remove sections of pages from a zone
540  * @zone: zone from which pages need to be removed
541  * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
542  * @nr_pages: number of pages to remove (must be multiple of section size)
543  * @altmap: alternative device page map or %NULL if default memmap is used
544  *
545  * Generic helper function to remove section mappings and sysfs entries
546  * for the section of the memory we are removing. Caller needs to make
547  * sure that pages are marked reserved and zones are adjust properly by
548  * calling offline_pages().
549  */
550 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
551 		 unsigned long nr_pages, struct vmem_altmap *altmap)
552 {
553 	unsigned long i;
554 	unsigned long map_offset = 0;
555 	int sections_to_remove, ret = 0;
556 
557 	/* In the ZONE_DEVICE case device driver owns the memory region */
558 	if (is_dev_zone(zone)) {
559 		if (altmap)
560 			map_offset = vmem_altmap_offset(altmap);
561 	} else {
562 		resource_size_t start, size;
563 
564 		start = phys_start_pfn << PAGE_SHIFT;
565 		size = nr_pages * PAGE_SIZE;
566 
567 		ret = release_mem_region_adjustable(&iomem_resource, start,
568 					size);
569 		if (ret) {
570 			resource_size_t endres = start + size - 1;
571 
572 			pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
573 					&start, &endres, ret);
574 		}
575 	}
576 
577 	clear_zone_contiguous(zone);
578 
579 	/*
580 	 * We can only remove entire sections
581 	 */
582 	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
583 	BUG_ON(nr_pages % PAGES_PER_SECTION);
584 
585 	sections_to_remove = nr_pages / PAGES_PER_SECTION;
586 	for (i = 0; i < sections_to_remove; i++) {
587 		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
588 
589 		ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
590 				altmap);
591 		map_offset = 0;
592 		if (ret)
593 			break;
594 	}
595 
596 	set_zone_contiguous(zone);
597 
598 	return ret;
599 }
600 #endif /* CONFIG_MEMORY_HOTREMOVE */
601 
602 int set_online_page_callback(online_page_callback_t callback)
603 {
604 	int rc = -EINVAL;
605 
606 	get_online_mems();
607 	mutex_lock(&online_page_callback_lock);
608 
609 	if (online_page_callback == generic_online_page) {
610 		online_page_callback = callback;
611 		rc = 0;
612 	}
613 
614 	mutex_unlock(&online_page_callback_lock);
615 	put_online_mems();
616 
617 	return rc;
618 }
619 EXPORT_SYMBOL_GPL(set_online_page_callback);
620 
621 int restore_online_page_callback(online_page_callback_t callback)
622 {
623 	int rc = -EINVAL;
624 
625 	get_online_mems();
626 	mutex_lock(&online_page_callback_lock);
627 
628 	if (online_page_callback == callback) {
629 		online_page_callback = generic_online_page;
630 		rc = 0;
631 	}
632 
633 	mutex_unlock(&online_page_callback_lock);
634 	put_online_mems();
635 
636 	return rc;
637 }
638 EXPORT_SYMBOL_GPL(restore_online_page_callback);
639 
/*
 * First step of onlining a page.  Currently does nothing; it is still
 * exported and called by generic_online_page().
 */
void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);
644 
/* Second step of onlining a page: adjust the managed page count by one. */
void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);
650 
/* Last step of onlining a page: pass it to __free_reserved_page(). */
void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);
656 
/*
 * Default page onlining routine: runs the three exported onlining steps
 * in order (set limits, bump counters, free the page).
 */
static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}
663 
664 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
665 			void *arg)
666 {
667 	unsigned long i;
668 	unsigned long onlined_pages = *(unsigned long *)arg;
669 	struct page *page;
670 
671 	if (PageReserved(pfn_to_page(start_pfn)))
672 		for (i = 0; i < nr_pages; i++) {
673 			page = pfn_to_page(start_pfn + i);
674 			(*online_page_callback)(page);
675 			onlined_pages++;
676 		}
677 
678 	online_mem_sections(start_pfn, start_pfn + nr_pages);
679 
680 	*(unsigned long *)arg = onlined_pages;
681 	return 0;
682 }
683 
684 /* check which state of node_states will be changed when online memory */
685 static void node_states_check_changes_online(unsigned long nr_pages,
686 	struct zone *zone, struct memory_notify *arg)
687 {
688 	int nid = zone_to_nid(zone);
689 
690 	arg->status_change_nid = -1;
691 	arg->status_change_nid_normal = -1;
692 	arg->status_change_nid_high = -1;
693 
694 	if (!node_state(nid, N_MEMORY))
695 		arg->status_change_nid = nid;
696 	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
697 		arg->status_change_nid_normal = nid;
698 #ifdef CONFIG_HIGHMEM
699 	if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY))
700 		arg->status_change_nid_high = nid;
701 #endif
702 }
703 
704 static void node_states_set_node(int node, struct memory_notify *arg)
705 {
706 	if (arg->status_change_nid_normal >= 0)
707 		node_set_state(node, N_NORMAL_MEMORY);
708 
709 	if (arg->status_change_nid_high >= 0)
710 		node_set_state(node, N_HIGH_MEMORY);
711 
712 	if (arg->status_change_nid >= 0)
713 		node_set_state(node, N_MEMORY);
714 }
715 
716 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
717 		unsigned long nr_pages)
718 {
719 	unsigned long old_end_pfn = zone_end_pfn(zone);
720 
721 	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
722 		zone->zone_start_pfn = start_pfn;
723 
724 	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
725 }
726 
727 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
728                                      unsigned long nr_pages)
729 {
730 	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
731 
732 	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
733 		pgdat->node_start_pfn = start_pfn;
734 
735 	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
736 }
737 
738 void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
739 		unsigned long nr_pages, struct vmem_altmap *altmap)
740 {
741 	struct pglist_data *pgdat = zone->zone_pgdat;
742 	int nid = pgdat->node_id;
743 	unsigned long flags;
744 
745 	if (zone_is_empty(zone))
746 		init_currently_empty_zone(zone, start_pfn, nr_pages);
747 
748 	clear_zone_contiguous(zone);
749 
750 	/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
751 	pgdat_resize_lock(pgdat, &flags);
752 	zone_span_writelock(zone);
753 	resize_zone_range(zone, start_pfn, nr_pages);
754 	zone_span_writeunlock(zone);
755 	resize_pgdat_range(pgdat, start_pfn, nr_pages);
756 	pgdat_resize_unlock(pgdat, &flags);
757 
758 	/*
759 	 * TODO now we have a visible range of pages which are not associated
760 	 * with their zone properly. Not nice but set_pfnblock_flags_mask
761 	 * expects the zone spans the pfn range. All the pages in the range
762 	 * are reserved so nobody should be touching them so we should be safe
763 	 */
764 	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
765 			MEMMAP_HOTPLUG, altmap);
766 
767 	set_zone_contiguous(zone);
768 }
769 
770 /*
771  * Returns a default kernel memory zone for the given pfn range.
772  * If no kernel zone covers this pfn range it will automatically go
773  * to the ZONE_NORMAL.
774  */
775 static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
776 		unsigned long nr_pages)
777 {
778 	struct pglist_data *pgdat = NODE_DATA(nid);
779 	int zid;
780 
781 	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
782 		struct zone *zone = &pgdat->node_zones[zid];
783 
784 		if (zone_intersects(zone, start_pfn, nr_pages))
785 			return zone;
786 	}
787 
788 	return &pgdat->node_zones[ZONE_NORMAL];
789 }
790 
791 static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
792 		unsigned long nr_pages)
793 {
794 	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
795 			nr_pages);
796 	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
797 	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
798 	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
799 
800 	/*
801 	 * We inherit the existing zone in a simple case where zones do not
802 	 * overlap in the given range
803 	 */
804 	if (in_kernel ^ in_movable)
805 		return (in_kernel) ? kernel_zone : movable_zone;
806 
807 	/*
808 	 * If the range doesn't belong to any zone or two zones overlap in the
809 	 * given range then we use movable zone only if movable_node is
810 	 * enabled because we always online to a kernel zone by default.
811 	 */
812 	return movable_node_enabled ? movable_zone : kernel_zone;
813 }
814 
815 struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
816 		unsigned long nr_pages)
817 {
818 	if (online_type == MMOP_ONLINE_KERNEL)
819 		return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
820 
821 	if (online_type == MMOP_ONLINE_MOVABLE)
822 		return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
823 
824 	return default_zone_for_pfn(nid, start_pfn, nr_pages);
825 }
826 
827 /*
828  * Associates the given pfn range with the given node and the zone appropriate
829  * for the given online type.
830  */
831 static struct zone * __meminit move_pfn_range(int online_type, int nid,
832 		unsigned long start_pfn, unsigned long nr_pages)
833 {
834 	struct zone *zone;
835 
836 	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
837 	move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
838 	return zone;
839 }
840 
841 /* Must be protected by mem_hotplug_begin() or a device_lock */
842 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
843 {
844 	unsigned long flags;
845 	unsigned long onlined_pages = 0;
846 	struct zone *zone;
847 	int need_zonelists_rebuild = 0;
848 	int nid;
849 	int ret;
850 	struct memory_notify arg;
851 	struct memory_block *mem;
852 
853 	/*
854 	 * We can't use pfn_to_nid() because nid might be stored in struct page
855 	 * which is not yet initialized. Instead, we find nid from memory block.
856 	 */
857 	mem = find_memory_block(__pfn_to_section(pfn));
858 	nid = mem->nid;
859 
860 	/* associate pfn range with the zone */
861 	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
862 
863 	arg.start_pfn = pfn;
864 	arg.nr_pages = nr_pages;
865 	node_states_check_changes_online(nr_pages, zone, &arg);
866 
867 	ret = memory_notify(MEM_GOING_ONLINE, &arg);
868 	ret = notifier_to_errno(ret);
869 	if (ret)
870 		goto failed_addition;
871 
872 	/*
873 	 * If this zone is not populated, then it is not in zonelist.
874 	 * This means the page allocator ignores this zone.
875 	 * So, zonelist must be updated after online.
876 	 */
877 	if (!populated_zone(zone)) {
878 		need_zonelists_rebuild = 1;
879 		setup_zone_pageset(zone);
880 	}
881 
882 	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
883 		online_pages_range);
884 	if (ret) {
885 		if (need_zonelists_rebuild)
886 			zone_pcp_reset(zone);
887 		goto failed_addition;
888 	}
889 
890 	zone->present_pages += onlined_pages;
891 
892 	pgdat_resize_lock(zone->zone_pgdat, &flags);
893 	zone->zone_pgdat->node_present_pages += onlined_pages;
894 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
895 
896 	if (onlined_pages) {
897 		node_states_set_node(nid, &arg);
898 		if (need_zonelists_rebuild)
899 			build_all_zonelists(NULL);
900 		else
901 			zone_pcp_update(zone);
902 	}
903 
904 	init_per_zone_wmark_min();
905 
906 	if (onlined_pages) {
907 		kswapd_run(nid);
908 		kcompactd_run(nid);
909 	}
910 
911 	vm_total_pages = nr_free_pagecache_pages();
912 
913 	writeback_set_ratelimit();
914 
915 	if (onlined_pages)
916 		memory_notify(MEM_ONLINE, &arg);
917 	return 0;
918 
919 failed_addition:
920 	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
921 		 (unsigned long long) pfn << PAGE_SHIFT,
922 		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
923 	memory_notify(MEM_CANCEL_ONLINE, &arg);
924 	return ret;
925 }
926 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
927 
928 static void reset_node_present_pages(pg_data_t *pgdat)
929 {
930 	struct zone *z;
931 
932 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
933 		z->present_pages = 0;
934 
935 	pgdat->node_present_pages = 0;
936 }
937 
938 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
939 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
940 {
941 	struct pglist_data *pgdat;
942 	unsigned long start_pfn = PFN_DOWN(start);
943 
944 	pgdat = NODE_DATA(nid);
945 	if (!pgdat) {
946 		pgdat = arch_alloc_nodedata(nid);
947 		if (!pgdat)
948 			return NULL;
949 
950 		arch_refresh_nodedata(nid, pgdat);
951 	} else {
952 		/*
953 		 * Reset the nr_zones, order and classzone_idx before reuse.
954 		 * Note that kswapd will init kswapd_classzone_idx properly
955 		 * when it starts in the near future.
956 		 */
957 		pgdat->nr_zones = 0;
958 		pgdat->kswapd_order = 0;
959 		pgdat->kswapd_classzone_idx = 0;
960 	}
961 
962 	/* we can use NODE_DATA(nid) from here */
963 
964 	pgdat->node_id = nid;
965 	pgdat->node_start_pfn = start_pfn;
966 
967 	/* init node's zones as empty zones, we don't have any present pages.*/
968 	free_area_init_core_hotplug(nid);
969 	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
970 
971 	/*
972 	 * The node we allocated has no zone fallback lists. For avoiding
973 	 * to access not-initialized zonelist, build here.
974 	 */
975 	build_all_zonelists(pgdat);
976 
977 	/*
978 	 * When memory is hot-added, all the memory is in offline state. So
979 	 * clear all zones' present_pages because they will be updated in
980 	 * online_pages() and offline_pages().
981 	 */
982 	reset_node_managed_pages(pgdat);
983 	reset_node_present_pages(pgdat);
984 
985 	return pgdat;
986 }
987 
988 static void rollback_node_hotadd(int nid)
989 {
990 	pg_data_t *pgdat = NODE_DATA(nid);
991 
992 	arch_refresh_nodedata(nid, NULL);
993 	free_percpu(pgdat->per_cpu_nodestats);
994 	arch_free_nodedata(pgdat);
995 	return;
996 }
997 
998 
999 /**
1000  * try_online_node - online a node if offlined
1001  * @nid: the node ID
1002  * @start: start addr of the node
1003  * @set_node_online: Whether we want to online the node
1004  * called by cpu_up() to online a node without onlined memory.
1005  *
1006  * Returns:
1007  * 1 -> a new node has been allocated
1008  * 0 -> the node is already online
1009  * -ENOMEM -> the node could not be allocated
1010  */
1011 static int __try_online_node(int nid, u64 start, bool set_node_online)
1012 {
1013 	pg_data_t *pgdat;
1014 	int ret = 1;
1015 
1016 	if (node_online(nid))
1017 		return 0;
1018 
1019 	pgdat = hotadd_new_pgdat(nid, start);
1020 	if (!pgdat) {
1021 		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
1022 		ret = -ENOMEM;
1023 		goto out;
1024 	}
1025 
1026 	if (set_node_online) {
1027 		node_set_online(nid);
1028 		ret = register_one_node(nid);
1029 		BUG_ON(ret);
1030 	}
1031 out:
1032 	return ret;
1033 }
1034 
1035 /*
1036  * Users of this function always want to online/register the node
1037  */
1038 int try_online_node(int nid)
1039 {
1040 	int ret;
1041 
1042 	mem_hotplug_begin();
1043 	ret =  __try_online_node(nid, 0, true);
1044 	mem_hotplug_done();
1045 	return ret;
1046 }
1047 
1048 static int check_hotplug_memory_range(u64 start, u64 size)
1049 {
1050 	unsigned long block_sz = memory_block_size_bytes();
1051 	u64 block_nr_pages = block_sz >> PAGE_SHIFT;
1052 	u64 nr_pages = size >> PAGE_SHIFT;
1053 	u64 start_pfn = PFN_DOWN(start);
1054 
1055 	/* memory range must be block size aligned */
1056 	if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
1057 	    !IS_ALIGNED(nr_pages, block_nr_pages)) {
1058 		pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
1059 		       block_sz, start, size);
1060 		return -EINVAL;
1061 	}
1062 
1063 	return 0;
1064 }
1065 
1066 static int online_memory_block(struct memory_block *mem, void *arg)
1067 {
1068 	return device_online(&mem->dev);
1069 }
1070 
1071 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1072 int __ref add_memory_resource(int nid, struct resource *res, bool online)
1073 {
1074 	u64 start, size;
1075 	bool new_node = false;
1076 	int ret;
1077 
1078 	start = res->start;
1079 	size = resource_size(res);
1080 
1081 	ret = check_hotplug_memory_range(start, size);
1082 	if (ret)
1083 		return ret;
1084 
1085 	mem_hotplug_begin();
1086 
1087 	/*
1088 	 * Add new range to memblock so that when hotadd_new_pgdat() is called
1089 	 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
1090 	 * this new range and calculate total pages correctly.  The range will
1091 	 * be removed at hot-remove time.
1092 	 */
1093 	memblock_add_node(start, size, nid);
1094 
1095 	ret = __try_online_node(nid, start, false);
1096 	if (ret < 0)
1097 		goto error;
1098 	new_node = ret;
1099 
1100 	/* call arch's memory hotadd */
1101 	ret = arch_add_memory(nid, start, size, NULL, true);
1102 	if (ret < 0)
1103 		goto error;
1104 
1105 	if (new_node) {
1106 		/* If sysfs file of new node can't be created, cpu on the node
1107 		 * can't be hot-added. There is no rollback way now.
1108 		 * So, check by BUG_ON() to catch it reluctantly..
1109 		 * We online node here. We can't roll back from here.
1110 		 */
1111 		node_set_online(nid);
1112 		ret = __register_one_node(nid);
1113 		BUG_ON(ret);
1114 	}
1115 
1116 	/* link memory sections under this node.*/
1117 	ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
1118 	BUG_ON(ret);
1119 
1120 	/* create new memmap entry */
1121 	firmware_map_add_hotplug(start, start + size, "System RAM");
1122 
1123 	/* online pages if requested */
1124 	if (online)
1125 		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1126 				  NULL, online_memory_block);
1127 
1128 	goto out;
1129 
1130 error:
1131 	/* rollback pgdat allocation and others */
1132 	if (new_node)
1133 		rollback_node_hotadd(nid);
1134 	memblock_remove(start, size);
1135 
1136 out:
1137 	mem_hotplug_done();
1138 	return ret;
1139 }
1140 EXPORT_SYMBOL_GPL(add_memory_resource);
1141 
1142 int __ref add_memory(int nid, u64 start, u64 size)
1143 {
1144 	struct resource *res;
1145 	int ret;
1146 
1147 	res = register_memory_resource(start, size);
1148 	if (IS_ERR(res))
1149 		return PTR_ERR(res);
1150 
1151 	ret = add_memory_resource(nid, res, memhp_auto_online);
1152 	if (ret < 0)
1153 		release_memory_resource(res);
1154 	return ret;
1155 }
1156 EXPORT_SYMBOL_GPL(add_memory);
1157 
1158 #ifdef CONFIG_MEMORY_HOTREMOVE
1159 /*
1160  * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
1161  * set and the size of the free page is given by page_order(). Using this,
1162  * the function determines if the pageblock contains only free pages.
1163  * Due to buddy contraints, a free page at least the size of a pageblock will
1164  * be located at the start of the pageblock
1165  */
1166 static inline int pageblock_free(struct page *page)
1167 {
1168 	return PageBuddy(page) && page_order(page) >= pageblock_order;
1169 }
1170 
1171 /* Return the start of the next active pageblock after a given page */
1172 static struct page *next_active_pageblock(struct page *page)
1173 {
1174 	/* Ensure the starting page is pageblock-aligned */
1175 	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
1176 
1177 	/* If the entire pageblock is free, move to the end of free page */
1178 	if (pageblock_free(page)) {
1179 		int order;
1180 		/* be careful. we don't have locks, page_order can be changed.*/
1181 		order = page_order(page);
1182 		if ((order < MAX_ORDER) && (order >= pageblock_order))
1183 			return page + (1 << order);
1184 	}
1185 
1186 	return page + pageblock_nr_pages;
1187 }
1188 
1189 static bool is_pageblock_removable_nolock(struct page *page)
1190 {
1191 	struct zone *zone;
1192 	unsigned long pfn;
1193 
1194 	/*
1195 	 * We have to be careful here because we are iterating over memory
1196 	 * sections which are not zone aware so we might end up outside of
1197 	 * the zone but still within the section.
1198 	 * We have to take care about the node as well. If the node is offline
1199 	 * its NODE_DATA will be NULL - see page_zone.
1200 	 */
1201 	if (!node_online(page_to_nid(page)))
1202 		return false;
1203 
1204 	zone = page_zone(page);
1205 	pfn = page_to_pfn(page);
1206 	if (!zone_spans_pfn(zone, pfn))
1207 		return false;
1208 
1209 	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
1210 }
1211 
1212 /* Checks if this range of memory is likely to be hot-removable. */
1213 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
1214 {
1215 	struct page *page = pfn_to_page(start_pfn);
1216 	struct page *end_page = page + nr_pages;
1217 
1218 	/* Check the starting page of each pageblock within the range */
1219 	for (; page < end_page; page = next_active_pageblock(page)) {
1220 		if (!is_pageblock_removable_nolock(page))
1221 			return false;
1222 		cond_resched();
1223 	}
1224 
1225 	/* All pageblocks in the memory block are likely to be hot-removable */
1226 	return true;
1227 }
1228 
1229 /*
1230  * Confirm all pages in a range [start, end) belong to the same zone.
1231  * When true, return its valid [start, end).
1232  */
1233 int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
1234 			 unsigned long *valid_start, unsigned long *valid_end)
1235 {
1236 	unsigned long pfn, sec_end_pfn;
1237 	unsigned long start, end;
1238 	struct zone *zone = NULL;
1239 	struct page *page;
1240 	int i;
1241 	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
1242 	     pfn < end_pfn;
1243 	     pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
1244 		/* Make sure the memory section is present first */
1245 		if (!present_section_nr(pfn_to_section_nr(pfn)))
1246 			continue;
1247 		for (; pfn < sec_end_pfn && pfn < end_pfn;
1248 		     pfn += MAX_ORDER_NR_PAGES) {
1249 			i = 0;
1250 			/* This is just a CONFIG_HOLES_IN_ZONE check.*/
1251 			while ((i < MAX_ORDER_NR_PAGES) &&
1252 				!pfn_valid_within(pfn + i))
1253 				i++;
1254 			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
1255 				continue;
1256 			page = pfn_to_page(pfn + i);
1257 			if (zone && page_zone(page) != zone)
1258 				return 0;
1259 			if (!zone)
1260 				start = pfn + i;
1261 			zone = page_zone(page);
1262 			end = pfn + MAX_ORDER_NR_PAGES;
1263 		}
1264 	}
1265 
1266 	if (zone) {
1267 		*valid_start = start;
1268 		*valid_end = min(end, end_pfn);
1269 		return 1;
1270 	} else {
1271 		return 0;
1272 	}
1273 }
1274 
1275 /*
1276  * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
1277  * non-lru movable pages and hugepages). We scan pfn because it's much
1278  * easier than scanning over linked list. This function returns the pfn
1279  * of the first found movable page if it's found, otherwise 0.
1280  */
1281 static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1282 {
1283 	unsigned long pfn;
1284 	struct page *page;
1285 	for (pfn = start; pfn < end; pfn++) {
1286 		if (pfn_valid(pfn)) {
1287 			page = pfn_to_page(pfn);
1288 			if (PageLRU(page))
1289 				return pfn;
1290 			if (__PageMovable(page))
1291 				return pfn;
1292 			if (PageHuge(page)) {
1293 				if (hugepage_migration_supported(page_hstate(page)) &&
1294 				    page_huge_active(page))
1295 					return pfn;
1296 				else
1297 					pfn = round_up(pfn + 1,
1298 						1 << compound_order(page)) - 1;
1299 			}
1300 		}
1301 	}
1302 	return 0;
1303 }
1304 
1305 static struct page *new_node_page(struct page *page, unsigned long private)
1306 {
1307 	int nid = page_to_nid(page);
1308 	nodemask_t nmask = node_states[N_MEMORY];
1309 
1310 	/*
1311 	 * try to allocate from a different node but reuse this node if there
1312 	 * are no other online nodes to be used (e.g. we are offlining a part
1313 	 * of the only existing node)
1314 	 */
1315 	node_clear(nid, nmask);
1316 	if (nodes_empty(nmask))
1317 		node_set(nid, nmask);
1318 
1319 	return new_page_nodemask(page, nid, &nmask);
1320 }
1321 
1322 #define NR_OFFLINE_AT_ONCE_PAGES	(256)
1323 static int
1324 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1325 {
1326 	unsigned long pfn;
1327 	struct page *page;
1328 	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
1329 	int not_managed = 0;
1330 	int ret = 0;
1331 	LIST_HEAD(source);
1332 
1333 	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
1334 		if (!pfn_valid(pfn))
1335 			continue;
1336 		page = pfn_to_page(pfn);
1337 
1338 		if (PageHuge(page)) {
1339 			struct page *head = compound_head(page);
1340 			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1341 			if (compound_order(head) > PFN_SECTION_SHIFT) {
1342 				ret = -EBUSY;
1343 				break;
1344 			}
1345 			if (isolate_huge_page(page, &source))
1346 				move_pages -= 1 << compound_order(head);
1347 			continue;
1348 		} else if (PageTransHuge(page))
1349 			pfn = page_to_pfn(compound_head(page))
1350 				+ hpage_nr_pages(page) - 1;
1351 
1352 		if (!get_page_unless_zero(page))
1353 			continue;
1354 		/*
1355 		 * We can skip free pages. And we can deal with pages on
1356 		 * LRU and non-lru movable pages.
1357 		 */
1358 		if (PageLRU(page))
1359 			ret = isolate_lru_page(page);
1360 		else
1361 			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1362 		if (!ret) { /* Success */
1363 			put_page(page);
1364 			list_add_tail(&page->lru, &source);
1365 			move_pages--;
1366 			if (!__PageMovable(page))
1367 				inc_node_page_state(page, NR_ISOLATED_ANON +
1368 						    page_is_file_cache(page));
1369 
1370 		} else {
1371 #ifdef CONFIG_DEBUG_VM
1372 			pr_alert("failed to isolate pfn %lx\n", pfn);
1373 			dump_page(page, "isolation failed");
1374 #endif
1375 			put_page(page);
1376 			/* Because we don't have big zone->lock. we should
1377 			   check this again here. */
1378 			if (page_count(page)) {
1379 				not_managed++;
1380 				ret = -EBUSY;
1381 				break;
1382 			}
1383 		}
1384 	}
1385 	if (!list_empty(&source)) {
1386 		if (not_managed) {
1387 			putback_movable_pages(&source);
1388 			goto out;
1389 		}
1390 
1391 		/* Allocate a new page from the nearest neighbor node */
1392 		ret = migrate_pages(&source, new_node_page, NULL, 0,
1393 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1394 		if (ret)
1395 			putback_movable_pages(&source);
1396 	}
1397 out:
1398 	return ret;
1399 }
1400 
/*
 * walk_system_ram_range() callback used by offline_isolated_pages():
 * remove from free_area[] and mark all as Reserved.
 * @data is not used.  Always returns 0.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	unsigned long end = start + nr_pages;

	__offline_isolated_pages(start, end);
	return 0;
}
1411 
1412 static void
1413 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
1414 {
1415 	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
1416 				offline_isolated_pages_cb);
1417 }
1418 
1419 /*
1420  * Check all pages in range, recoreded as memory resource, are isolated.
1421  */
1422 static int
1423 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
1424 			void *data)
1425 {
1426 	int ret;
1427 	long offlined = *(long *)data;
1428 	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
1429 	offlined = nr_pages;
1430 	if (!ret)
1431 		*(long *)data += offlined;
1432 	return ret;
1433 }
1434 
1435 static long
1436 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
1437 {
1438 	long offlined = 0;
1439 	int ret;
1440 
1441 	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
1442 			check_pages_isolated_cb);
1443 	if (ret < 0)
1444 		offlined = (long)ret;
1445 	return offlined;
1446 }
1447 
1448 static int __init cmdline_parse_movable_node(char *p)
1449 {
1450 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1451 	movable_node_enabled = true;
1452 #else
1453 	pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
1454 #endif
1455 	return 0;
1456 }
1457 early_param("movable_node", cmdline_parse_movable_node);
1458 
1459 /* check which state of node_states will be changed when offline memory */
1460 static void node_states_check_changes_offline(unsigned long nr_pages,
1461 		struct zone *zone, struct memory_notify *arg)
1462 {
1463 	struct pglist_data *pgdat = zone->zone_pgdat;
1464 	unsigned long present_pages = 0;
1465 	enum zone_type zt;
1466 
1467 	arg->status_change_nid = -1;
1468 	arg->status_change_nid_normal = -1;
1469 	arg->status_change_nid_high = -1;
1470 
1471 	/*
1472 	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
1473 	 * If the memory to be offline is within the range
1474 	 * [0..ZONE_NORMAL], and it is the last present memory there,
1475 	 * the zones in that range will become empty after the offlining,
1476 	 * thus we can determine that we need to clear the node from
1477 	 * node_states[N_NORMAL_MEMORY].
1478 	 */
1479 	for (zt = 0; zt <= ZONE_NORMAL; zt++)
1480 		present_pages += pgdat->node_zones[zt].present_pages;
1481 	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
1482 		arg->status_change_nid_normal = zone_to_nid(zone);
1483 
1484 #ifdef CONFIG_HIGHMEM
1485 	/*
1486 	 * node_states[N_HIGH_MEMORY] contains nodes which
1487 	 * have normal memory or high memory.
1488 	 * Here we add the present_pages belonging to ZONE_HIGHMEM.
1489 	 * If the zone is within the range of [0..ZONE_HIGHMEM), and
1490 	 * we determine that the zones in that range become empty,
1491 	 * we need to clear the node for N_HIGH_MEMORY.
1492 	 */
1493 	present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1494 	if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
1495 		arg->status_change_nid_high = zone_to_nid(zone);
1496 #endif
1497 
1498 	/*
1499 	 * We have accounted the pages from [0..ZONE_NORMAL), and
1500 	 * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
1501 	 * as well.
1502 	 * Here we count the possible pages from ZONE_MOVABLE.
1503 	 * If after having accounted all the pages, we see that the nr_pages
1504 	 * to be offlined is over or equal to the accounted pages,
1505 	 * we know that the node will become empty, and so, we can clear
1506 	 * it for N_MEMORY as well.
1507 	 */
1508 	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
1509 
1510 	if (nr_pages >= present_pages)
1511 		arg->status_change_nid = zone_to_nid(zone);
1512 }
1513 
1514 static void node_states_clear_node(int node, struct memory_notify *arg)
1515 {
1516 	if (arg->status_change_nid_normal >= 0)
1517 		node_clear_state(node, N_NORMAL_MEMORY);
1518 
1519 	if (arg->status_change_nid_high >= 0)
1520 		node_clear_state(node, N_HIGH_MEMORY);
1521 
1522 	if (arg->status_change_nid >= 0)
1523 		node_clear_state(node, N_MEMORY);
1524 }
1525 
1526 static int __ref __offline_pages(unsigned long start_pfn,
1527 		  unsigned long end_pfn)
1528 {
1529 	unsigned long pfn, nr_pages;
1530 	long offlined_pages;
1531 	int ret, node;
1532 	unsigned long flags;
1533 	unsigned long valid_start, valid_end;
1534 	struct zone *zone;
1535 	struct memory_notify arg;
1536 
1537 	/* at least, alignment against pageblock is necessary */
1538 	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1539 		return -EINVAL;
1540 	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
1541 		return -EINVAL;
1542 	/* This makes hotplug much easier...and readable.
1543 	   we assume this for now. .*/
1544 	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
1545 		return -EINVAL;
1546 
1547 	zone = page_zone(pfn_to_page(valid_start));
1548 	node = zone_to_nid(zone);
1549 	nr_pages = end_pfn - start_pfn;
1550 
1551 	/* set above range as isolated */
1552 	ret = start_isolate_page_range(start_pfn, end_pfn,
1553 				       MIGRATE_MOVABLE, true);
1554 	if (ret)
1555 		return ret;
1556 
1557 	arg.start_pfn = start_pfn;
1558 	arg.nr_pages = nr_pages;
1559 	node_states_check_changes_offline(nr_pages, zone, &arg);
1560 
1561 	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
1562 	ret = notifier_to_errno(ret);
1563 	if (ret)
1564 		goto failed_removal;
1565 
1566 	pfn = start_pfn;
1567 repeat:
1568 	/* start memory hot removal */
1569 	ret = -EINTR;
1570 	if (signal_pending(current))
1571 		goto failed_removal;
1572 
1573 	cond_resched();
1574 	lru_add_drain_all();
1575 	drain_all_pages(zone);
1576 
1577 	pfn = scan_movable_pages(start_pfn, end_pfn);
1578 	if (pfn) { /* We have movable pages */
1579 		ret = do_migrate_range(pfn, end_pfn);
1580 		goto repeat;
1581 	}
1582 
1583 	/*
1584 	 * dissolve free hugepages in the memory block before doing offlining
1585 	 * actually in order to make hugetlbfs's object counting consistent.
1586 	 */
1587 	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1588 	if (ret)
1589 		goto failed_removal;
1590 	/* check again */
1591 	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1592 	if (offlined_pages < 0)
1593 		goto repeat;
1594 	pr_info("Offlined Pages %ld\n", offlined_pages);
1595 	/* Ok, all of our target is isolated.
1596 	   We cannot do rollback at this point. */
1597 	offline_isolated_pages(start_pfn, end_pfn);
1598 	/* reset pagetype flags and makes migrate type to be MOVABLE */
1599 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1600 	/* removal success */
1601 	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1602 	zone->present_pages -= offlined_pages;
1603 
1604 	pgdat_resize_lock(zone->zone_pgdat, &flags);
1605 	zone->zone_pgdat->node_present_pages -= offlined_pages;
1606 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
1607 
1608 	init_per_zone_wmark_min();
1609 
1610 	if (!populated_zone(zone)) {
1611 		zone_pcp_reset(zone);
1612 		build_all_zonelists(NULL);
1613 	} else
1614 		zone_pcp_update(zone);
1615 
1616 	node_states_clear_node(node, &arg);
1617 	if (arg.status_change_nid >= 0) {
1618 		kswapd_stop(node);
1619 		kcompactd_stop(node);
1620 	}
1621 
1622 	vm_total_pages = nr_free_pagecache_pages();
1623 	writeback_set_ratelimit();
1624 
1625 	memory_notify(MEM_OFFLINE, &arg);
1626 	return 0;
1627 
1628 failed_removal:
1629 	pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
1630 		 (unsigned long long) start_pfn << PAGE_SHIFT,
1631 		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
1632 	memory_notify(MEM_CANCEL_OFFLINE, &arg);
1633 	/* pushback to free area */
1634 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1635 	return ret;
1636 }
1637 
/*
 * Offline @nr_pages pages starting at @start_pfn; see __offline_pages()
 * for the return values.
 * Must be protected by mem_hotplug_begin() or a device_lock
 */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long end_pfn = start_pfn + nr_pages;

	return __offline_pages(start_pfn, end_pfn);
}
1643 #endif /* CONFIG_MEMORY_HOTREMOVE */
1644 
1645 /**
1646  * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1647  * @start_pfn: start pfn of the memory range
1648  * @end_pfn: end pfn of the memory range
1649  * @arg: argument passed to func
1650  * @func: callback for each memory section walked
1651  *
1652  * This function walks through all present mem sections in range
1653  * [start_pfn, end_pfn) and call func on each mem section.
1654  *
1655  * Returns the return value of func.
1656  */
1657 int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1658 		void *arg, int (*func)(struct memory_block *, void *))
1659 {
1660 	struct memory_block *mem = NULL;
1661 	struct mem_section *section;
1662 	unsigned long pfn, section_nr;
1663 	int ret;
1664 
1665 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1666 		section_nr = pfn_to_section_nr(pfn);
1667 		if (!present_section_nr(section_nr))
1668 			continue;
1669 
1670 		section = __nr_to_section(section_nr);
1671 		/* same memblock? */
1672 		if (mem)
1673 			if ((section_nr >= mem->start_section_nr) &&
1674 			    (section_nr <= mem->end_section_nr))
1675 				continue;
1676 
1677 		mem = find_memory_block_hinted(section, mem);
1678 		if (!mem)
1679 			continue;
1680 
1681 		ret = func(mem, arg);
1682 		if (ret) {
1683 			kobject_put(&mem->dev.kobj);
1684 			return ret;
1685 		}
1686 	}
1687 
1688 	if (mem)
1689 		kobject_put(&mem->dev.kobj);
1690 
1691 	return 0;
1692 }
1693 
1694 #ifdef CONFIG_MEMORY_HOTREMOVE
1695 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
1696 {
1697 	int ret = !is_memblock_offlined(mem);
1698 
1699 	if (unlikely(ret)) {
1700 		phys_addr_t beginpa, endpa;
1701 
1702 		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1703 		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1704 		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
1705 			&beginpa, &endpa);
1706 	}
1707 
1708 	return ret;
1709 }
1710 
1711 static int check_cpu_on_node(pg_data_t *pgdat)
1712 {
1713 	int cpu;
1714 
1715 	for_each_present_cpu(cpu) {
1716 		if (cpu_to_node(cpu) == pgdat->node_id)
1717 			/*
1718 			 * the cpu on this node isn't removed, and we can't
1719 			 * offline this node.
1720 			 */
1721 			return -EBUSY;
1722 	}
1723 
1724 	return 0;
1725 }
1726 
1727 static void unmap_cpu_on_node(pg_data_t *pgdat)
1728 {
1729 #ifdef CONFIG_ACPI_NUMA
1730 	int cpu;
1731 
1732 	for_each_possible_cpu(cpu)
1733 		if (cpu_to_node(cpu) == pgdat->node_id)
1734 			numa_clear_node(cpu);
1735 #endif
1736 }
1737 
1738 static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1739 {
1740 	int ret;
1741 
1742 	ret = check_cpu_on_node(pgdat);
1743 	if (ret)
1744 		return ret;
1745 
1746 	/*
1747 	 * the node will be offlined when we come here, so we can clear
1748 	 * the cpu_to_node() now.
1749 	 */
1750 
1751 	unmap_cpu_on_node(pgdat);
1752 	return 0;
1753 }
1754 
1755 /**
1756  * try_offline_node
1757  * @nid: the node ID
1758  *
1759  * Offline a node if all memory sections and cpus of the node are removed.
1760  *
1761  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1762  * and online/offline operations before this call.
1763  */
1764 void try_offline_node(int nid)
1765 {
1766 	pg_data_t *pgdat = NODE_DATA(nid);
1767 	unsigned long start_pfn = pgdat->node_start_pfn;
1768 	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1769 	unsigned long pfn;
1770 
1771 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1772 		unsigned long section_nr = pfn_to_section_nr(pfn);
1773 
1774 		if (!present_section_nr(section_nr))
1775 			continue;
1776 
1777 		if (pfn_to_nid(pfn) != nid)
1778 			continue;
1779 
1780 		/*
1781 		 * some memory sections of this node are not removed, and we
1782 		 * can't offline node now.
1783 		 */
1784 		return;
1785 	}
1786 
1787 	if (check_and_unmap_cpu_on_node(pgdat))
1788 		return;
1789 
1790 	/*
1791 	 * all memory/cpu of this node are removed, we can offline this
1792 	 * node now.
1793 	 */
1794 	node_set_offline(nid);
1795 	unregister_one_node(nid);
1796 }
1797 EXPORT_SYMBOL(try_offline_node);
1798 
1799 /**
1800  * remove_memory
1801  * @nid: the node ID
1802  * @start: physical address of the region to remove
1803  * @size: size of the region to remove
1804  *
1805  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1806  * and online/offline operations before this call, as required by
1807  * try_offline_node().
1808  */
1809 void __ref __remove_memory(int nid, u64 start, u64 size)
1810 {
1811 	int ret;
1812 
1813 	BUG_ON(check_hotplug_memory_range(start, size));
1814 
1815 	mem_hotplug_begin();
1816 
1817 	/*
1818 	 * All memory blocks must be offlined before removing memory.  Check
1819 	 * whether all memory blocks in question are offline and trigger a BUG()
1820 	 * if this is not the case.
1821 	 */
1822 	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1823 				check_memblock_offlined_cb);
1824 	if (ret)
1825 		BUG();
1826 
1827 	/* remove memmap entry */
1828 	firmware_map_remove(start, start + size, "System RAM");
1829 	memblock_free(start, size);
1830 	memblock_remove(start, size);
1831 
1832 	arch_remove_memory(start, size, NULL);
1833 
1834 	try_offline_node(nid);
1835 
1836 	mem_hotplug_done();
1837 }
1838 
1839 void remove_memory(int nid, u64 start, u64 size)
1840 {
1841 	lock_device_hotplug();
1842 	__remove_memory(nid, start, size);
1843 	unlock_device_hotplug();
1844 }
1845 EXPORT_SYMBOL_GPL(remove_memory);
1846 #endif /* CONFIG_MEMORY_HOTREMOVE */
1847