xref: /openbmc/linux/mm/compaction.c (revision 7587eb18)
1 /*
2  * linux/mm/compaction.c
3  *
4  * Memory compaction for the reduction of external fragmentation. Note that
5  * this heavily depends upon page migration to do all the real heavy
6  * lifting
7  *
8  * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
9  */
10 #include <linux/cpu.h>
11 #include <linux/swap.h>
12 #include <linux/migrate.h>
13 #include <linux/compaction.h>
14 #include <linux/mm_inline.h>
15 #include <linux/backing-dev.h>
16 #include <linux/sysctl.h>
17 #include <linux/sysfs.h>
18 #include <linux/balloon_compaction.h>
19 #include <linux/page-isolation.h>
20 #include <linux/kasan.h>
21 #include <linux/kthread.h>
22 #include <linux/freezer.h>
23 #include "internal.h"
24 
/*
 * Compaction event accounting helpers. When CONFIG_COMPACTION is enabled
 * they forward to the generic count_vm_event()/count_vm_events() helpers;
 * otherwise they expand to empty statements so callers need no #ifdefs.
 */
#ifdef CONFIG_COMPACTION
/* Forward a single occurrence of @item to count_vm_event(). */
static inline void count_compact_event(enum vm_event_item item)
{
	count_vm_event(item);
}

/* Forward @delta occurrences of @item to count_vm_events(). */
static inline void count_compact_events(enum vm_event_item item, long delta)
{
	count_vm_events(item, delta);
}
#else
/* Without CONFIG_COMPACTION the arguments are discarded unevaluated. */
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif
39 
40 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
41 
42 #define CREATE_TRACE_POINTS
43 #include <trace/events/compaction.h>
44 
/*
 * PFN block helpers.
 *
 * block_start_pfn() rounds @pfn down to a multiple of 1 << @order.
 * block_end_pfn() aligns @pfn + 1 to a multiple of 1 << @order, i.e. it
 * yields the one-past-the-end PFN of the order-aligned block holding @pfn.
 * The pageblock_*() variants apply the same with pageblock_order.
 */
#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))
#define pageblock_start_pfn(pfn)	block_start_pfn(pfn, pageblock_order)
#define pageblock_end_pfn(pfn)		block_end_pfn(pfn, pageblock_order)
49 
50 static unsigned long release_freepages(struct list_head *freelist)
51 {
52 	struct page *page, *next;
53 	unsigned long high_pfn = 0;
54 
55 	list_for_each_entry_safe(page, next, freelist, lru) {
56 		unsigned long pfn = page_to_pfn(page);
57 		list_del(&page->lru);
58 		__free_page(page);
59 		if (pfn > high_pfn)
60 			high_pfn = pfn;
61 	}
62 
63 	return high_pfn;
64 }
65 
66 static void map_pages(struct list_head *list)
67 {
68 	struct page *page;
69 
70 	list_for_each_entry(page, list, lru) {
71 		arch_alloc_page(page, 0);
72 		kernel_map_pages(page, 1, 1);
73 		kasan_alloc_pages(page, 0);
74 	}
75 }
76 
77 static inline bool migrate_async_suitable(int migratetype)
78 {
79 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
80 }
81 
82 #ifdef CONFIG_COMPACTION
83 
/*
 * Upper bound for zone->compact_defer_shift. The deferral limit is
 * 1 << compact_defer_shift, so compaction is never skipped more than
 * 1 << 6 = 64 times in a row.
 */
#define COMPACT_MAX_DEFER_SHIFT 6
86 
87 /*
88  * Compaction is deferred when compaction fails to result in a page
89  * allocation success. 1 << compact_defer_limit compactions are skipped up
90  * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
91  */
92 void defer_compaction(struct zone *zone, int order)
93 {
94 	zone->compact_considered = 0;
95 	zone->compact_defer_shift++;
96 
97 	if (order < zone->compact_order_failed)
98 		zone->compact_order_failed = order;
99 
100 	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
101 		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
102 
103 	trace_mm_compaction_defer_compaction(zone, order);
104 }
105 
106 /* Returns true if compaction should be skipped this time */
107 bool compaction_deferred(struct zone *zone, int order)
108 {
109 	unsigned long defer_limit = 1UL << zone->compact_defer_shift;
110 
111 	if (order < zone->compact_order_failed)
112 		return false;
113 
114 	/* Avoid possible overflow */
115 	if (++zone->compact_considered > defer_limit)
116 		zone->compact_considered = defer_limit;
117 
118 	if (zone->compact_considered >= defer_limit)
119 		return false;
120 
121 	trace_mm_compaction_deferred(zone, order);
122 
123 	return true;
124 }
125 
126 /*
127  * Update defer tracking counters after successful compaction of given order,
128  * which means an allocation either succeeded (alloc_success == true) or is
129  * expected to succeed.
130  */
131 void compaction_defer_reset(struct zone *zone, int order,
132 		bool alloc_success)
133 {
134 	if (alloc_success) {
135 		zone->compact_considered = 0;
136 		zone->compact_defer_shift = 0;
137 	}
138 	if (order >= zone->compact_order_failed)
139 		zone->compact_order_failed = order + 1;
140 
141 	trace_mm_compaction_defer_reset(zone, order);
142 }
143 
144 /* Returns true if restarting compaction after many failures */
145 bool compaction_restarting(struct zone *zone, int order)
146 {
147 	if (order < zone->compact_order_failed)
148 		return false;
149 
150 	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
151 		zone->compact_considered >= 1UL << zone->compact_defer_shift;
152 }
153 
154 /* Returns true if the pageblock should be scanned for pages to isolate. */
155 static inline bool isolation_suitable(struct compact_control *cc,
156 					struct page *page)
157 {
158 	if (cc->ignore_skip_hint)
159 		return true;
160 
161 	return !get_pageblock_skip(page);
162 }
163 
164 static void reset_cached_positions(struct zone *zone)
165 {
166 	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
167 	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
168 	zone->compact_cached_free_pfn =
169 				pageblock_start_pfn(zone_end_pfn(zone) - 1);
170 }
171 
172 /*
173  * This function is called to clear all cached information on pageblocks that
174  * should be skipped for page isolation when the migrate and free page scanner
175  * meet.
176  */
177 static void __reset_isolation_suitable(struct zone *zone)
178 {
179 	unsigned long start_pfn = zone->zone_start_pfn;
180 	unsigned long end_pfn = zone_end_pfn(zone);
181 	unsigned long pfn;
182 
183 	zone->compact_blockskip_flush = false;
184 
185 	/* Walk the zone and mark every pageblock as suitable for isolation */
186 	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
187 		struct page *page;
188 
189 		cond_resched();
190 
191 		if (!pfn_valid(pfn))
192 			continue;
193 
194 		page = pfn_to_page(pfn);
195 		if (zone != page_zone(page))
196 			continue;
197 
198 		clear_pageblock_skip(page);
199 	}
200 
201 	reset_cached_positions(zone);
202 }
203 
204 void reset_isolation_suitable(pg_data_t *pgdat)
205 {
206 	int zoneid;
207 
208 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
209 		struct zone *zone = &pgdat->node_zones[zoneid];
210 		if (!populated_zone(zone))
211 			continue;
212 
213 		/* Only flush if a full compaction finished recently */
214 		if (zone->compact_blockskip_flush)
215 			__reset_isolation_suitable(zone);
216 	}
217 }
218 
219 /*
220  * If no pages were isolated then mark this pageblock to be skipped in the
221  * future. The information is later cleared by __reset_isolation_suitable().
222  */
223 static void update_pageblock_skip(struct compact_control *cc,
224 			struct page *page, unsigned long nr_isolated,
225 			bool migrate_scanner)
226 {
227 	struct zone *zone = cc->zone;
228 	unsigned long pfn;
229 
230 	if (cc->ignore_skip_hint)
231 		return;
232 
233 	if (!page)
234 		return;
235 
236 	if (nr_isolated)
237 		return;
238 
239 	set_pageblock_skip(page);
240 
241 	pfn = page_to_pfn(page);
242 
243 	/* Update where async and sync compaction should restart */
244 	if (migrate_scanner) {
245 		if (pfn > zone->compact_cached_migrate_pfn[0])
246 			zone->compact_cached_migrate_pfn[0] = pfn;
247 		if (cc->mode != MIGRATE_ASYNC &&
248 		    pfn > zone->compact_cached_migrate_pfn[1])
249 			zone->compact_cached_migrate_pfn[1] = pfn;
250 	} else {
251 		if (pfn < zone->compact_cached_free_pfn)
252 			zone->compact_cached_free_pfn = pfn;
253 	}
254 }
255 #else
/*
 * Variant used when CONFIG_COMPACTION is not set: no skip hints are
 * consulted, so every pageblock is reported as suitable.
 */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}
261 
/*
 * Variant used when CONFIG_COMPACTION is not set: intentionally empty,
 * no skip hints or cached scanner positions are maintained.
 */
static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
}
267 #endif /* CONFIG_COMPACTION */
268 
269 /*
270  * Compaction requires the taking of some coarse locks that are potentially
271  * very heavily contended. For async compaction, back out if the lock cannot
272  * be taken immediately. For sync compaction, spin on the lock if needed.
273  *
274  * Returns true if the lock is held
275  * Returns false if the lock is not held and compaction should abort
276  */
277 static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
278 						struct compact_control *cc)
279 {
280 	if (cc->mode == MIGRATE_ASYNC) {
281 		if (!spin_trylock_irqsave(lock, *flags)) {
282 			cc->contended = COMPACT_CONTENDED_LOCK;
283 			return false;
284 		}
285 	} else {
286 		spin_lock_irqsave(lock, *flags);
287 	}
288 
289 	return true;
290 }
291 
292 /*
293  * Compaction requires the taking of some coarse locks that are potentially
294  * very heavily contended. The lock should be periodically unlocked to avoid
295  * having disabled IRQs for a long time, even when there is nobody waiting on
296  * the lock. It might also be that allowing the IRQs will result in
297  * need_resched() becoming true. If scheduling is needed, async compaction
298  * aborts. Sync compaction schedules.
299  * Either compaction type will also abort if a fatal signal is pending.
300  * In either case if the lock was locked, it is dropped and not regained.
301  *
302  * Returns true if compaction should abort due to fatal signal pending, or
303  *		async compaction due to need_resched()
304  * Returns false when compaction can continue (sync compaction might have
305  *		scheduled)
306  */
307 static bool compact_unlock_should_abort(spinlock_t *lock,
308 		unsigned long flags, bool *locked, struct compact_control *cc)
309 {
310 	if (*locked) {
311 		spin_unlock_irqrestore(lock, flags);
312 		*locked = false;
313 	}
314 
315 	if (fatal_signal_pending(current)) {
316 		cc->contended = COMPACT_CONTENDED_SCHED;
317 		return true;
318 	}
319 
320 	if (need_resched()) {
321 		if (cc->mode == MIGRATE_ASYNC) {
322 			cc->contended = COMPACT_CONTENDED_SCHED;
323 			return true;
324 		}
325 		cond_resched();
326 	}
327 
328 	return false;
329 }
330 
331 /*
332  * Aside from avoiding lock contention, compaction also periodically checks
333  * need_resched() and either schedules in sync compaction or aborts async
334  * compaction. This is similar to what compact_unlock_should_abort() does, but
335  * is used where no lock is concerned.
336  *
337  * Returns false when no scheduling was needed, or sync compaction scheduled.
338  * Returns true when async compaction should abort.
339  */
340 static inline bool compact_should_abort(struct compact_control *cc)
341 {
342 	/* async compaction aborts if contended */
343 	if (need_resched()) {
344 		if (cc->mode == MIGRATE_ASYNC) {
345 			cc->contended = COMPACT_CONTENDED_SCHED;
346 			return true;
347 		}
348 
349 		cond_resched();
350 	}
351 
352 	return false;
353 }
354 
/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 *
 * @cc:		compaction control; cc->zone->lock is taken while isolating,
 *		cc->nr_freepages is increased by the number of isolated pages
 *		and cc->nr_migratepages is read to decide when enough pages
 *		have been gathered (non-strict mode only).
 * @start_pfn:	in: first PFN to scan; out: PFN where the scan stopped
 *		(never more than @end_pfn).
 * @end_pfn:	one-past-the-last PFN to scan.
 * @freelist:	list that receives the isolated order-0 pages.
 * @strict:	stop at the first PFN that cannot be isolated.
 *
 * Returns the number of pages isolated; in strict mode 0 is returned when
 * the scan stopped before reaching @end_pfn.
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long *start_pfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				bool strict)
{
	int nr_scanned = 0, total_isolated = 0;
	struct page *cursor, *valid_page = NULL;
	unsigned long flags = 0;
	bool locked = false;
	unsigned long blockpfn = *start_pfn;

	/* cursor tracks the struct page of blockpfn throughout the loop */
	cursor = pfn_to_page(blockpfn);

	/* Isolate free pages. */
	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
		int isolated, i;
		struct page *page = cursor;

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort if fatal signal
		 * pending or async compaction detects need_resched()
		 */
		if (!(blockpfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(&cc->zone->lock, flags,
								&locked, cc))
			break;

		nr_scanned++;
		if (!pfn_valid_within(blockpfn))
			goto isolate_fail;

		/* Remember the first valid page for the skip-hint update */
		if (!valid_page)
			valid_page = page;

		/*
		 * For compound pages such as THP and hugetlbfs, we can save
		 * potentially a lot of iterations if we skip them at once.
		 * The check is racy, but we can consider only valid values
		 * and the only danger is skipping too much.
		 */
		if (PageCompound(page)) {
			unsigned int comp_order = compound_order(page);

			if (likely(comp_order < MAX_ORDER)) {
				blockpfn += (1UL << comp_order) - 1;
				cursor += (1UL << comp_order) - 1;
			}

			goto isolate_fail;
		}

		if (!PageBuddy(page))
			goto isolate_fail;

		/*
		 * If we already hold the lock, it was taken in an earlier
		 * iteration and the PageBuddy() check above was made under
		 * it, so no recheck is needed.
		 */
		if (!locked) {
			/*
			 * The zone lock must be held to isolate freepages.
			 * Unfortunately this is a very coarse lock and can be
			 * heavily contended if there are parallel allocations
			 * or parallel compactions. For async compaction do not
			 * spin on the lock and we acquire the lock as late as
			 * possible.
			 */
			locked = compact_trylock_irqsave(&cc->zone->lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck this is a buddy page under lock */
			if (!PageBuddy(page))
				goto isolate_fail;
		}

		/* Found a free page, break it into order-0 pages */
		isolated = split_free_page(page);
		total_isolated += isolated;
		for (i = 0; i < isolated; i++) {
			list_add(&page->lru, freelist);
			page++;
		}

		/* If a page was split, advance to the end of it */
		if (isolated) {
			cc->nr_freepages += isolated;
			if (!strict &&
				cc->nr_migratepages <= cc->nr_freepages) {
				/*
				 * Enough free pages gathered. The loop
				 * increment is skipped by the break, so step
				 * over the whole split page here.
				 */
				blockpfn += isolated;
				break;
			}

			/* The loop increment supplies the final +1 */
			blockpfn += isolated - 1;
			cursor += isolated - 1;
			continue;
		}

isolate_fail:
		if (strict)
			break;
		else
			continue;

	}

	/*
	 * There is a tiny chance that we have read bogus compound_order(),
	 * so be careful to not go outside of the pageblock.
	 */
	if (unlikely(blockpfn > end_pfn))
		blockpfn = end_pfn;

	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
					nr_scanned, total_isolated);

	/* Record how far we have got within the block */
	*start_pfn = blockpfn;

	/*
	 * If strict isolation is requested by CMA then check that all the
	 * pages requested were isolated. If there were any failures, 0 is
	 * returned and CMA will fail.
	 */
	if (strict && blockpfn < end_pfn)
		total_isolated = 0;

	if (locked)
		spin_unlock_irqrestore(&cc->zone->lock, flags);

	/* Update the pageblock-skip if the whole pageblock was scanned */
	if (blockpfn == end_pfn)
		update_pageblock_skip(cc, valid_page, total_isolated, false);

	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
	if (total_isolated)
		count_compact_events(COMPACTISOLATED, total_isolated);
	return total_isolated;
}
505 
506 /**
507  * isolate_freepages_range() - isolate free pages.
508  * @start_pfn: The first PFN to start isolating.
509  * @end_pfn:   The one-past-last PFN.
510  *
511  * Non-free pages, invalid PFNs, or zone boundaries within the
512  * [start_pfn, end_pfn) range are considered errors, cause function to
513  * undo its actions and return zero.
514  *
515  * Otherwise, function returns one-past-the-last PFN of isolated page
516  * (which may be greater then end_pfn if end fell in a middle of
517  * a free page).
518  */
519 unsigned long
520 isolate_freepages_range(struct compact_control *cc,
521 			unsigned long start_pfn, unsigned long end_pfn)
522 {
523 	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
524 	LIST_HEAD(freelist);
525 
526 	pfn = start_pfn;
527 	block_start_pfn = pageblock_start_pfn(pfn);
528 	if (block_start_pfn < cc->zone->zone_start_pfn)
529 		block_start_pfn = cc->zone->zone_start_pfn;
530 	block_end_pfn = pageblock_end_pfn(pfn);
531 
532 	for (; pfn < end_pfn; pfn += isolated,
533 				block_start_pfn = block_end_pfn,
534 				block_end_pfn += pageblock_nr_pages) {
535 		/* Protect pfn from changing by isolate_freepages_block */
536 		unsigned long isolate_start_pfn = pfn;
537 
538 		block_end_pfn = min(block_end_pfn, end_pfn);
539 
540 		/*
541 		 * pfn could pass the block_end_pfn if isolated freepage
542 		 * is more than pageblock order. In this case, we adjust
543 		 * scanning range to right one.
544 		 */
545 		if (pfn >= block_end_pfn) {
546 			block_start_pfn = pageblock_start_pfn(pfn);
547 			block_end_pfn = pageblock_end_pfn(pfn);
548 			block_end_pfn = min(block_end_pfn, end_pfn);
549 		}
550 
551 		if (!pageblock_pfn_to_page(block_start_pfn,
552 					block_end_pfn, cc->zone))
553 			break;
554 
555 		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
556 						block_end_pfn, &freelist, true);
557 
558 		/*
559 		 * In strict mode, isolate_freepages_block() returns 0 if
560 		 * there are any holes in the block (ie. invalid PFNs or
561 		 * non-free pages).
562 		 */
563 		if (!isolated)
564 			break;
565 
566 		/*
567 		 * If we managed to isolate pages, it is always (1 << n) *
568 		 * pageblock_nr_pages for some non-negative n.  (Max order
569 		 * page may span two pageblocks).
570 		 */
571 	}
572 
573 	/* split_free_page does not map the pages */
574 	map_pages(&freelist);
575 
576 	if (pfn < end_pfn) {
577 		/* Loop terminated early, cleanup. */
578 		release_freepages(&freelist);
579 		return 0;
580 	}
581 
582 	/* We don't use freelists for anything. */
583 	return pfn;
584 }
585 
586 /* Update the number of anon and file isolated pages in the zone */
587 static void acct_isolated(struct zone *zone, struct compact_control *cc)
588 {
589 	struct page *page;
590 	unsigned int count[2] = { 0, };
591 
592 	if (list_empty(&cc->migratepages))
593 		return;
594 
595 	list_for_each_entry(page, &cc->migratepages, lru)
596 		count[!!page_is_file_cache(page)]++;
597 
598 	mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
599 	mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
600 }
601 
602 /* Similar to reclaim, but different enough that they don't share logic */
603 static bool too_many_isolated(struct zone *zone)
604 {
605 	unsigned long active, inactive, isolated;
606 
607 	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
608 					zone_page_state(zone, NR_INACTIVE_ANON);
609 	active = zone_page_state(zone, NR_ACTIVE_FILE) +
610 					zone_page_state(zone, NR_ACTIVE_ANON);
611 	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
612 					zone_page_state(zone, NR_ISOLATED_ANON);
613 
614 	return isolated > (inactive + active) / 2;
615 }
616 
/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 * Returns zero if there is a fatal signal pending while waiting for the
 * number of isolated pages to drop, or if async compaction has to back off
 * (too many pages isolated, or rescheduling is needed); otherwise PFN of the
 * first page that was not scanned (which may be both less, equal to or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t isolate_mode)
{
	struct zone *zone = cc->zone;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	struct lruvec *lruvec;
	unsigned long flags = 0;
	bool locked = false;
	struct page *page = NULL, *valid_page = NULL;
	unsigned long start_pfn = low_pfn;
	bool skip_on_failure = false;
	unsigned long next_skip_pfn = 0;

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		/* async migration should just abort */
		if (cc->mode == MIGRATE_ASYNC)
			return 0;

		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	if (compact_should_abort(cc))
		return 0;

	/*
	 * Async direct compaction works in cc->order aligned blocks: once
	 * any page in such a block fails to isolate, the rest of the block
	 * is skipped (see isolate_fail below).
	 */
	if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
		skip_on_failure = true;
		next_skip_pfn = block_end_pfn(low_pfn, cc->order);
	}

	/* Time to isolate some pages for migration */
	for (; low_pfn < end_pfn; low_pfn++) {
		bool is_lru;

		if (skip_on_failure && low_pfn >= next_skip_pfn) {
			/*
			 * We have isolated all migration candidates in the
			 * previous order-aligned block, and did not skip it due
			 * to failure. We should migrate the pages now and
			 * hopefully succeed compaction.
			 */
			if (nr_isolated)
				break;

			/*
			 * We failed to isolate in the previous order-aligned
			 * block. Set the new boundary to the end of the
			 * current block. Note we can't simply increase
			 * next_skip_pfn by 1 << order, as low_pfn might have
			 * been incremented by a higher number due to skipping
			 * a compound or a high-order buddy page in the
			 * previous loop iteration.
			 */
			next_skip_pfn = block_end_pfn(low_pfn, cc->order);
		}

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort async compaction
		 * if contended.
		 */
		if (!(low_pfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(&zone->lru_lock, flags,
								&locked, cc))
			break;

		if (!pfn_valid_within(low_pfn))
			goto isolate_fail;
		nr_scanned++;

		page = pfn_to_page(low_pfn);

		/* Remember the first valid page for the skip-hint update */
		if (!valid_page)
			valid_page = page;

		/*
		 * Skip if free. We read page order here without zone lock
		 * which is generally unsafe, but the race window is small and
		 * the worst thing that can happen is that we skip some
		 * potential isolation targets.
		 */
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			/*
			 * Without lock, we cannot be sure that what we got is
			 * a valid page order. Consider only values in the
			 * valid order range to prevent low_pfn overflow.
			 */
			if (freepage_order > 0 && freepage_order < MAX_ORDER)
				low_pfn += (1UL << freepage_order) - 1;
			continue;
		}

		/*
		 * Check may be lockless but that's ok as we recheck later.
		 * It's possible to migrate LRU pages and balloon pages
		 * Skip any other type of page
		 */
		is_lru = PageLRU(page);
		if (!is_lru) {
			if (unlikely(balloon_page_movable(page))) {
				if (balloon_page_isolate(page)) {
					/* Successfully isolated */
					goto isolate_success;
				}
			}
		}

		/*
		 * Regardless of being on LRU, compound pages such as THP and
		 * hugetlbfs are not to be compacted. We can potentially save
		 * a lot of iterations if we skip them at once. The check is
		 * racy, but we can consider only valid values and the only
		 * danger is skipping too much.
		 */
		if (PageCompound(page)) {
			unsigned int comp_order = compound_order(page);

			if (likely(comp_order < MAX_ORDER))
				low_pfn += (1UL << comp_order) - 1;

			goto isolate_fail;
		}

		if (!is_lru)
			goto isolate_fail;

		/*
		 * Migration will fail if an anonymous page is pinned in memory,
		 * so avoid taking lru_lock and isolating it unnecessarily in an
		 * admittedly racy check.
		 */
		if (!page_mapping(page) &&
		    page_count(page) > page_mapcount(page))
			goto isolate_fail;

		/* If we already hold the lock, we can skip some rechecking */
		if (!locked) {
			locked = compact_trylock_irqsave(&zone->lru_lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck PageLRU and PageCompound under lock */
			if (!PageLRU(page))
				goto isolate_fail;

			/*
			 * Page become compound since the non-locked check,
			 * and it's on LRU. It can only be a THP so the order
			 * is safe to read and it's 0 for tail pages.
			 */
			if (unlikely(PageCompound(page))) {
				low_pfn += (1UL << compound_order(page)) - 1;
				goto isolate_fail;
			}
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);

		/* Try isolate the page */
		if (__isolate_lru_page(page, isolate_mode) != 0)
			goto isolate_fail;

		VM_BUG_ON_PAGE(PageCompound(page), page);

		/* Successfully isolated */
		del_page_from_lru_list(page, lruvec, page_lru(page));

isolate_success:
		list_add(&page->lru, &cc->migratepages);
		cc->nr_migratepages++;
		nr_isolated++;

		/*
		 * Record where we could have freed pages by migration and not
		 * yet flushed them to buddy allocator.
		 * - this is the lowest page that was isolated and likely be
		 * then freed by migration.
		 */
		if (!cc->last_migrated_pfn)
			cc->last_migrated_pfn = low_pfn;

		/* Avoid isolating too much */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
			/* break skips the loop increment, so step past it */
			++low_pfn;
			break;
		}

		continue;
isolate_fail:
		if (!skip_on_failure)
			continue;

		/*
		 * We have isolated some pages, but then failed. Release them
		 * instead of migrating, as we cannot form the cc->order buddy
		 * page anyway.
		 */
		if (nr_isolated) {
			if (locked) {
				spin_unlock_irqrestore(&zone->lru_lock,	flags);
				locked = false;
			}
			acct_isolated(zone, cc);
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			cc->last_migrated_pfn = 0;
			nr_isolated = 0;
		}

		if (low_pfn < next_skip_pfn) {
			/* The loop increment moves us onto next_skip_pfn */
			low_pfn = next_skip_pfn - 1;
			/*
			 * The check near the loop beginning would have updated
			 * next_skip_pfn too, but this is a bit simpler.
			 */
			next_skip_pfn += 1UL << cc->order;
		}
	}

	/*
	 * The PageBuddy() check could have potentially brought us outside
	 * the range to be scanned.
	 */
	if (unlikely(low_pfn > end_pfn))
		low_pfn = end_pfn;

	if (locked)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	/*
	 * Update the pageblock-skip information and cached scanner pfn,
	 * if the whole pageblock was scanned without isolating any page.
	 */
	if (low_pfn == end_pfn)
		update_pageblock_skip(cc, valid_page, nr_isolated, true);

	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
						nr_scanned, nr_isolated);

	count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
	if (nr_isolated)
		count_compact_events(COMPACTISOLATED, nr_isolated);

	return low_pfn;
}
891 
892 /**
893  * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
894  * @cc:        Compaction control structure.
895  * @start_pfn: The first PFN to start isolating.
896  * @end_pfn:   The one-past-last PFN.
897  *
898  * Returns zero if isolation fails fatally due to e.g. pending signal.
899  * Otherwise, function returns one-past-the-last PFN of isolated page
900  * (which may be greater than end_pfn if end fell in a middle of a THP page).
901  */
902 unsigned long
903 isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
904 							unsigned long end_pfn)
905 {
906 	unsigned long pfn, block_start_pfn, block_end_pfn;
907 
908 	/* Scan block by block. First and last block may be incomplete */
909 	pfn = start_pfn;
910 	block_start_pfn = pageblock_start_pfn(pfn);
911 	if (block_start_pfn < cc->zone->zone_start_pfn)
912 		block_start_pfn = cc->zone->zone_start_pfn;
913 	block_end_pfn = pageblock_end_pfn(pfn);
914 
915 	for (; pfn < end_pfn; pfn = block_end_pfn,
916 				block_start_pfn = block_end_pfn,
917 				block_end_pfn += pageblock_nr_pages) {
918 
919 		block_end_pfn = min(block_end_pfn, end_pfn);
920 
921 		if (!pageblock_pfn_to_page(block_start_pfn,
922 					block_end_pfn, cc->zone))
923 			continue;
924 
925 		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
926 							ISOLATE_UNEVICTABLE);
927 
928 		if (!pfn)
929 			break;
930 
931 		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
932 			break;
933 	}
934 	acct_isolated(cc->zone, cc);
935 
936 	return pfn;
937 }
938 
939 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
940 #ifdef CONFIG_COMPACTION
941 
942 /* Returns true if the page is within a block suitable for migration to */
943 static bool suitable_migration_target(struct page *page)
944 {
945 	/* If the page is a large free page, then disallow migration */
946 	if (PageBuddy(page)) {
947 		/*
948 		 * We are checking page_order without zone->lock taken. But
949 		 * the only small danger is that we skip a potentially suitable
950 		 * pageblock, so it's not worth to check order for valid range.
951 		 */
952 		if (page_order_unsafe(page) >= pageblock_order)
953 			return false;
954 	}
955 
956 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
957 	if (migrate_async_suitable(get_pageblock_migratetype(page)))
958 		return true;
959 
960 	/* Otherwise skip the block */
961 	return false;
962 }
963 
964 /*
965  * Test whether the free scanner has reached the same or lower pageblock than
966  * the migration scanner, and compaction should thus terminate.
967  */
968 static inline bool compact_scanners_met(struct compact_control *cc)
969 {
970 	return (cc->free_pfn >> pageblock_order)
971 		<= (cc->migrate_pfn >> pageblock_order);
972 }
973 
974 /*
975  * Based on information in the current compact_control, find blocks
976  * suitable for isolating free pages from and then isolate them.
977  */
978 static void isolate_freepages(struct compact_control *cc)
979 {
980 	struct zone *zone = cc->zone;
981 	struct page *page;
982 	unsigned long block_start_pfn;	/* start of current pageblock */
983 	unsigned long isolate_start_pfn; /* exact pfn we start at */
984 	unsigned long block_end_pfn;	/* end of current pageblock */
985 	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
986 	struct list_head *freelist = &cc->freepages;
987 
988 	/*
989 	 * Initialise the free scanner. The starting point is where we last
990 	 * successfully isolated from, zone-cached value, or the end of the
991 	 * zone when isolating for the first time. For looping we also need
992 	 * this pfn aligned down to the pageblock boundary, because we do
993 	 * block_start_pfn -= pageblock_nr_pages in the for loop.
994 	 * For ending point, take care when isolating in last pageblock of a
995 	 * a zone which ends in the middle of a pageblock.
996 	 * The low boundary is the end of the pageblock the migration scanner
997 	 * is using.
998 	 */
999 	isolate_start_pfn = cc->free_pfn;
1000 	block_start_pfn = pageblock_start_pfn(cc->free_pfn);
1001 	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
1002 						zone_end_pfn(zone));
1003 	low_pfn = pageblock_end_pfn(cc->migrate_pfn);
1004 
1005 	/*
1006 	 * Isolate free pages until enough are available to migrate the
1007 	 * pages on cc->migratepages. We stop searching if the migrate
1008 	 * and free page scanners meet or enough free pages are isolated.
1009 	 */
1010 	for (; block_start_pfn >= low_pfn;
1011 				block_end_pfn = block_start_pfn,
1012 				block_start_pfn -= pageblock_nr_pages,
1013 				isolate_start_pfn = block_start_pfn) {
1014 
1015 		/*
1016 		 * This can iterate a massively long zone without finding any
1017 		 * suitable migration targets, so periodically check if we need
1018 		 * to schedule, or even abort async compaction.
1019 		 */
1020 		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
1021 						&& compact_should_abort(cc))
1022 			break;
1023 
1024 		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1025 									zone);
1026 		if (!page)
1027 			continue;
1028 
1029 		/* Check the block is suitable for migration */
1030 		if (!suitable_migration_target(page))
1031 			continue;
1032 
1033 		/* If isolation recently failed, do not retry */
1034 		if (!isolation_suitable(cc, page))
1035 			continue;
1036 
1037 		/* Found a block suitable for isolating free pages from. */
1038 		isolate_freepages_block(cc, &isolate_start_pfn,
1039 					block_end_pfn, freelist, false);
1040 
1041 		/*
1042 		 * If we isolated enough freepages, or aborted due to async
1043 		 * compaction being contended, terminate the loop.
1044 		 * Remember where the free scanner should restart next time,
1045 		 * which is where isolate_freepages_block() left off.
1046 		 * But if it scanned the whole pageblock, isolate_start_pfn
1047 		 * now points at block_end_pfn, which is the start of the next
1048 		 * pageblock.
1049 		 * In that case we will however want to restart at the start
1050 		 * of the previous pageblock.
1051 		 */
1052 		if ((cc->nr_freepages >= cc->nr_migratepages)
1053 							|| cc->contended) {
1054 			if (isolate_start_pfn >= block_end_pfn)
1055 				isolate_start_pfn =
1056 					block_start_pfn - pageblock_nr_pages;
1057 			break;
1058 		} else {
1059 			/*
1060 			 * isolate_freepages_block() should not terminate
1061 			 * prematurely unless contended, or isolated enough
1062 			 */
1063 			VM_BUG_ON(isolate_start_pfn < block_end_pfn);
1064 		}
1065 	}
1066 
1067 	/* split_free_page does not map the pages */
1068 	map_pages(freelist);
1069 
1070 	/*
1071 	 * Record where the free scanner will restart next time. Either we
1072 	 * broke from the loop and set isolate_start_pfn based on the last
1073 	 * call to isolate_freepages_block(), or we met the migration scanner
1074 	 * and the loop terminated due to isolate_start_pfn < low_pfn
1075 	 */
1076 	cc->free_pfn = isolate_start_pfn;
1077 }
1078 
1079 /*
1080  * This is a migrate-callback that "allocates" freepages by taking pages
1081  * from the isolated freelists in the block we are migrating to.
1082  */
1083 static struct page *compaction_alloc(struct page *migratepage,
1084 					unsigned long data,
1085 					int **result)
1086 {
1087 	struct compact_control *cc = (struct compact_control *)data;
1088 	struct page *freepage;
1089 
1090 	/*
1091 	 * Isolate free pages if necessary, and if we are not aborting due to
1092 	 * contention.
1093 	 */
1094 	if (list_empty(&cc->freepages)) {
1095 		if (!cc->contended)
1096 			isolate_freepages(cc);
1097 
1098 		if (list_empty(&cc->freepages))
1099 			return NULL;
1100 	}
1101 
1102 	freepage = list_entry(cc->freepages.next, struct page, lru);
1103 	list_del(&freepage->lru);
1104 	cc->nr_freepages--;
1105 
1106 	return freepage;
1107 }
1108 
1109 /*
1110  * This is a migrate-callback that "frees" freepages back to the isolated
1111  * freelist.  All pages on the freelist are from the same zone, so there is no
1112  * special handling needed for NUMA.
1113  */
1114 static void compaction_free(struct page *page, unsigned long data)
1115 {
1116 	struct compact_control *cc = (struct compact_control *)data;
1117 
1118 	list_add(&page->lru, &cc->freepages);
1119 	cc->nr_freepages++;
1120 }
1121 
/* Result codes returned by isolate_migratepages() */
typedef enum {
	/* Stop the compaction run immediately */
	ISOLATE_ABORT,
	/* Nothing was isolated; the caller keeps scanning */
	ISOLATE_NONE,
	/* Pages were isolated and are ready to be migrated */
	ISOLATE_SUCCESS,
} isolate_migrate_t;
1128 
/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages.
 *
 * A non-zero value makes isolate_migratepages() add ISOLATE_UNEVICTABLE to
 * the isolation mode it passes down; zero leaves that flag out. The initial
 * value is 1.
 */
int sysctl_compact_unevictable_allowed __read_mostly = 1;
1134 
1135 /*
1136  * Isolate all pages that can be migrated from the first suitable block,
1137  * starting at the block pointed to by the migrate scanner pfn within
1138  * compact_control.
1139  */
1140 static isolate_migrate_t isolate_migratepages(struct zone *zone,
1141 					struct compact_control *cc)
1142 {
1143 	unsigned long block_start_pfn;
1144 	unsigned long block_end_pfn;
1145 	unsigned long low_pfn;
1146 	struct page *page;
1147 	const isolate_mode_t isolate_mode =
1148 		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1149 		(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1150 
1151 	/*
1152 	 * Start at where we last stopped, or beginning of the zone as
1153 	 * initialized by compact_zone()
1154 	 */
1155 	low_pfn = cc->migrate_pfn;
1156 	block_start_pfn = pageblock_start_pfn(low_pfn);
1157 	if (block_start_pfn < zone->zone_start_pfn)
1158 		block_start_pfn = zone->zone_start_pfn;
1159 
1160 	/* Only scan within a pageblock boundary */
1161 	block_end_pfn = pageblock_end_pfn(low_pfn);
1162 
1163 	/*
1164 	 * Iterate over whole pageblocks until we find the first suitable.
1165 	 * Do not cross the free scanner.
1166 	 */
1167 	for (; block_end_pfn <= cc->free_pfn;
1168 			low_pfn = block_end_pfn,
1169 			block_start_pfn = block_end_pfn,
1170 			block_end_pfn += pageblock_nr_pages) {
1171 
1172 		/*
1173 		 * This can potentially iterate a massively long zone with
1174 		 * many pageblocks unsuitable, so periodically check if we
1175 		 * need to schedule, or even abort async compaction.
1176 		 */
1177 		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
1178 						&& compact_should_abort(cc))
1179 			break;
1180 
1181 		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1182 									zone);
1183 		if (!page)
1184 			continue;
1185 
1186 		/* If isolation recently failed, do not retry */
1187 		if (!isolation_suitable(cc, page))
1188 			continue;
1189 
1190 		/*
1191 		 * For async compaction, also only scan in MOVABLE blocks.
1192 		 * Async compaction is optimistic to see if the minimum amount
1193 		 * of work satisfies the allocation.
1194 		 */
1195 		if (cc->mode == MIGRATE_ASYNC &&
1196 		    !migrate_async_suitable(get_pageblock_migratetype(page)))
1197 			continue;
1198 
1199 		/* Perform the isolation */
1200 		low_pfn = isolate_migratepages_block(cc, low_pfn,
1201 						block_end_pfn, isolate_mode);
1202 
1203 		if (!low_pfn || cc->contended) {
1204 			acct_isolated(zone, cc);
1205 			return ISOLATE_ABORT;
1206 		}
1207 
1208 		/*
1209 		 * Either we isolated something and proceed with migration. Or
1210 		 * we failed and compact_zone should decide if we should
1211 		 * continue or not.
1212 		 */
1213 		break;
1214 	}
1215 
1216 	acct_isolated(zone, cc);
1217 	/* Record where migration scanner will be restarted. */
1218 	cc->migrate_pfn = low_pfn;
1219 
1220 	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1221 }
1222 
/*
 * Compaction requested through /proc/sys/vm/compact_memory is identified by
 * the special order value -1.
 */
static inline bool is_via_compact_memory(int order)
{
	const int compact_memory_order = -1;

	return order == compact_memory_order;
}
1231 
1232 static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
1233 			    const int migratetype)
1234 {
1235 	unsigned int order;
1236 	unsigned long watermark;
1237 
1238 	if (cc->contended || fatal_signal_pending(current))
1239 		return COMPACT_CONTENDED;
1240 
1241 	/* Compaction run completes if the migrate and free scanner meet */
1242 	if (compact_scanners_met(cc)) {
1243 		/* Let the next compaction start anew. */
1244 		reset_cached_positions(zone);
1245 
1246 		/*
1247 		 * Mark that the PG_migrate_skip information should be cleared
1248 		 * by kswapd when it goes to sleep. kcompactd does not set the
1249 		 * flag itself as the decision to be clear should be directly
1250 		 * based on an allocation request.
1251 		 */
1252 		if (cc->direct_compaction)
1253 			zone->compact_blockskip_flush = true;
1254 
1255 		if (cc->whole_zone)
1256 			return COMPACT_COMPLETE;
1257 		else
1258 			return COMPACT_PARTIAL_SKIPPED;
1259 	}
1260 
1261 	if (is_via_compact_memory(cc->order))
1262 		return COMPACT_CONTINUE;
1263 
1264 	/* Compaction run is not finished if the watermark is not met */
1265 	watermark = low_wmark_pages(zone);
1266 
1267 	if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
1268 							cc->alloc_flags))
1269 		return COMPACT_CONTINUE;
1270 
1271 	/* Direct compactor: Is a suitable page free? */
1272 	for (order = cc->order; order < MAX_ORDER; order++) {
1273 		struct free_area *area = &zone->free_area[order];
1274 		bool can_steal;
1275 
1276 		/* Job done if page is free of the right migratetype */
1277 		if (!list_empty(&area->free_list[migratetype]))
1278 			return COMPACT_PARTIAL;
1279 
1280 #ifdef CONFIG_CMA
1281 		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
1282 		if (migratetype == MIGRATE_MOVABLE &&
1283 			!list_empty(&area->free_list[MIGRATE_CMA]))
1284 			return COMPACT_PARTIAL;
1285 #endif
1286 		/*
1287 		 * Job done if allocation would steal freepages from
1288 		 * other migratetype buddy lists.
1289 		 */
1290 		if (find_suitable_fallback(area, order, migratetype,
1291 						true, &can_steal) != -1)
1292 			return COMPACT_PARTIAL;
1293 	}
1294 
1295 	return COMPACT_NO_SUITABLE_PAGE;
1296 }
1297 
1298 static enum compact_result compact_finished(struct zone *zone,
1299 			struct compact_control *cc,
1300 			const int migratetype)
1301 {
1302 	int ret;
1303 
1304 	ret = __compact_finished(zone, cc, migratetype);
1305 	trace_mm_compaction_finished(zone, cc->order, ret);
1306 	if (ret == COMPACT_NO_SUITABLE_PAGE)
1307 		ret = COMPACT_CONTINUE;
1308 
1309 	return ret;
1310 }
1311 
1312 /*
1313  * compaction_suitable: Is this suitable to run compaction on this zone now?
1314  * Returns
1315  *   COMPACT_SKIPPED  - If there are too few free pages for compaction
1316  *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
1317  *   COMPACT_CONTINUE - If compaction should run now
1318  */
1319 static enum compact_result __compaction_suitable(struct zone *zone, int order,
1320 					unsigned int alloc_flags,
1321 					int classzone_idx,
1322 					unsigned long wmark_target)
1323 {
1324 	int fragindex;
1325 	unsigned long watermark;
1326 
1327 	if (is_via_compact_memory(order))
1328 		return COMPACT_CONTINUE;
1329 
1330 	watermark = low_wmark_pages(zone);
1331 	/*
1332 	 * If watermarks for high-order allocation are already met, there
1333 	 * should be no need for compaction at all.
1334 	 */
1335 	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
1336 								alloc_flags))
1337 		return COMPACT_PARTIAL;
1338 
1339 	/*
1340 	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
1341 	 * This is because during migration, copies of pages need to be
1342 	 * allocated and for a short time, the footprint is higher
1343 	 */
1344 	watermark += (2UL << order);
1345 	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
1346 				 alloc_flags, wmark_target))
1347 		return COMPACT_SKIPPED;
1348 
1349 	/*
1350 	 * fragmentation index determines if allocation failures are due to
1351 	 * low memory or external fragmentation
1352 	 *
1353 	 * index of -1000 would imply allocations might succeed depending on
1354 	 * watermarks, but we already failed the high-order watermark check
1355 	 * index towards 0 implies failure is due to lack of memory
1356 	 * index towards 1000 implies failure is due to fragmentation
1357 	 *
1358 	 * Only compact if a failure would be due to fragmentation.
1359 	 */
1360 	fragindex = fragmentation_index(zone, order);
1361 	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1362 		return COMPACT_NOT_SUITABLE_ZONE;
1363 
1364 	return COMPACT_CONTINUE;
1365 }
1366 
1367 enum compact_result compaction_suitable(struct zone *zone, int order,
1368 					unsigned int alloc_flags,
1369 					int classzone_idx)
1370 {
1371 	enum compact_result ret;
1372 
1373 	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
1374 				    zone_page_state(zone, NR_FREE_PAGES));
1375 	trace_mm_compaction_suitable(zone, order, ret);
1376 	if (ret == COMPACT_NOT_SUITABLE_ZONE)
1377 		ret = COMPACT_SKIPPED;
1378 
1379 	return ret;
1380 }
1381 
1382 bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
1383 		int alloc_flags)
1384 {
1385 	struct zone *zone;
1386 	struct zoneref *z;
1387 
1388 	/*
1389 	 * Make sure at least one zone would pass __compaction_suitable if we continue
1390 	 * retrying the reclaim.
1391 	 */
1392 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1393 					ac->nodemask) {
1394 		unsigned long available;
1395 		enum compact_result compact_result;
1396 
1397 		/*
1398 		 * Do not consider all the reclaimable memory because we do not
1399 		 * want to trash just for a single high order allocation which
1400 		 * is even not guaranteed to appear even if __compaction_suitable
1401 		 * is happy about the watermark check.
1402 		 */
1403 		available = zone_reclaimable_pages(zone) / order;
1404 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
1405 		compact_result = __compaction_suitable(zone, order, alloc_flags,
1406 				ac_classzone_idx(ac), available);
1407 		if (compact_result != COMPACT_SKIPPED &&
1408 				compact_result != COMPACT_NOT_SUITABLE_ZONE)
1409 			return true;
1410 	}
1411 
1412 	return false;
1413 }
1414 
1415 static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
1416 {
1417 	enum compact_result ret;
1418 	unsigned long start_pfn = zone->zone_start_pfn;
1419 	unsigned long end_pfn = zone_end_pfn(zone);
1420 	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1421 	const bool sync = cc->mode != MIGRATE_ASYNC;
1422 
1423 	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
1424 							cc->classzone_idx);
1425 	/* Compaction is likely to fail */
1426 	if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
1427 		return ret;
1428 
1429 	/* huh, compaction_suitable is returning something unexpected */
1430 	VM_BUG_ON(ret != COMPACT_CONTINUE);
1431 
1432 	/*
1433 	 * Clear pageblock skip if there were failures recently and compaction
1434 	 * is about to be retried after being deferred.
1435 	 */
1436 	if (compaction_restarting(zone, cc->order))
1437 		__reset_isolation_suitable(zone);
1438 
1439 	/*
1440 	 * Setup to move all movable pages to the end of the zone. Used cached
1441 	 * information on where the scanners should start but check that it
1442 	 * is initialised by ensuring the values are within zone boundaries.
1443 	 */
1444 	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
1445 	cc->free_pfn = zone->compact_cached_free_pfn;
1446 	if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
1447 		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
1448 		zone->compact_cached_free_pfn = cc->free_pfn;
1449 	}
1450 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
1451 		cc->migrate_pfn = start_pfn;
1452 		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1453 		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1454 	}
1455 
1456 	if (cc->migrate_pfn == start_pfn)
1457 		cc->whole_zone = true;
1458 
1459 	cc->last_migrated_pfn = 0;
1460 
1461 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
1462 				cc->free_pfn, end_pfn, sync);
1463 
1464 	migrate_prep_local();
1465 
1466 	while ((ret = compact_finished(zone, cc, migratetype)) ==
1467 						COMPACT_CONTINUE) {
1468 		int err;
1469 
1470 		switch (isolate_migratepages(zone, cc)) {
1471 		case ISOLATE_ABORT:
1472 			ret = COMPACT_CONTENDED;
1473 			putback_movable_pages(&cc->migratepages);
1474 			cc->nr_migratepages = 0;
1475 			goto out;
1476 		case ISOLATE_NONE:
1477 			/*
1478 			 * We haven't isolated and migrated anything, but
1479 			 * there might still be unflushed migrations from
1480 			 * previous cc->order aligned block.
1481 			 */
1482 			goto check_drain;
1483 		case ISOLATE_SUCCESS:
1484 			;
1485 		}
1486 
1487 		err = migrate_pages(&cc->migratepages, compaction_alloc,
1488 				compaction_free, (unsigned long)cc, cc->mode,
1489 				MR_COMPACTION);
1490 
1491 		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
1492 							&cc->migratepages);
1493 
1494 		/* All pages were either migrated or will be released */
1495 		cc->nr_migratepages = 0;
1496 		if (err) {
1497 			putback_movable_pages(&cc->migratepages);
1498 			/*
1499 			 * migrate_pages() may return -ENOMEM when scanners meet
1500 			 * and we want compact_finished() to detect it
1501 			 */
1502 			if (err == -ENOMEM && !compact_scanners_met(cc)) {
1503 				ret = COMPACT_CONTENDED;
1504 				goto out;
1505 			}
1506 			/*
1507 			 * We failed to migrate at least one page in the current
1508 			 * order-aligned block, so skip the rest of it.
1509 			 */
1510 			if (cc->direct_compaction &&
1511 						(cc->mode == MIGRATE_ASYNC)) {
1512 				cc->migrate_pfn = block_end_pfn(
1513 						cc->migrate_pfn - 1, cc->order);
1514 				/* Draining pcplists is useless in this case */
1515 				cc->last_migrated_pfn = 0;
1516 
1517 			}
1518 		}
1519 
1520 check_drain:
1521 		/*
1522 		 * Has the migration scanner moved away from the previous
1523 		 * cc->order aligned block where we migrated from? If yes,
1524 		 * flush the pages that were freed, so that they can merge and
1525 		 * compact_finished() can detect immediately if allocation
1526 		 * would succeed.
1527 		 */
1528 		if (cc->order > 0 && cc->last_migrated_pfn) {
1529 			int cpu;
1530 			unsigned long current_block_start =
1531 				block_start_pfn(cc->migrate_pfn, cc->order);
1532 
1533 			if (cc->last_migrated_pfn < current_block_start) {
1534 				cpu = get_cpu();
1535 				lru_add_drain_cpu(cpu);
1536 				drain_local_pages(zone);
1537 				put_cpu();
1538 				/* No more flushing until we migrate again */
1539 				cc->last_migrated_pfn = 0;
1540 			}
1541 		}
1542 
1543 	}
1544 
1545 out:
1546 	/*
1547 	 * Release free pages and update where the free scanner should restart,
1548 	 * so we don't leave any returned pages behind in the next attempt.
1549 	 */
1550 	if (cc->nr_freepages > 0) {
1551 		unsigned long free_pfn = release_freepages(&cc->freepages);
1552 
1553 		cc->nr_freepages = 0;
1554 		VM_BUG_ON(free_pfn == 0);
1555 		/* The cached pfn is always the first in a pageblock */
1556 		free_pfn = pageblock_start_pfn(free_pfn);
1557 		/*
1558 		 * Only go back, not forward. The cached pfn might have been
1559 		 * already reset to zone end in compact_finished()
1560 		 */
1561 		if (free_pfn > zone->compact_cached_free_pfn)
1562 			zone->compact_cached_free_pfn = free_pfn;
1563 	}
1564 
1565 	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
1566 				cc->free_pfn, end_pfn, sync, ret);
1567 
1568 	if (ret == COMPACT_CONTENDED)
1569 		ret = COMPACT_PARTIAL;
1570 
1571 	return ret;
1572 }
1573 
1574 static enum compact_result compact_zone_order(struct zone *zone, int order,
1575 		gfp_t gfp_mask, enum migrate_mode mode, int *contended,
1576 		unsigned int alloc_flags, int classzone_idx)
1577 {
1578 	enum compact_result ret;
1579 	struct compact_control cc = {
1580 		.nr_freepages = 0,
1581 		.nr_migratepages = 0,
1582 		.order = order,
1583 		.gfp_mask = gfp_mask,
1584 		.zone = zone,
1585 		.mode = mode,
1586 		.alloc_flags = alloc_flags,
1587 		.classzone_idx = classzone_idx,
1588 		.direct_compaction = true,
1589 	};
1590 	INIT_LIST_HEAD(&cc.freepages);
1591 	INIT_LIST_HEAD(&cc.migratepages);
1592 
1593 	ret = compact_zone(zone, &cc);
1594 
1595 	VM_BUG_ON(!list_empty(&cc.freepages));
1596 	VM_BUG_ON(!list_empty(&cc.migratepages));
1597 
1598 	*contended = cc.contended;
1599 	return ret;
1600 }
1601 
/*
 * Fragmentation index threshold, initially 500. __compaction_suitable()
 * reports COMPACT_NOT_SUITABLE_ZONE when the zone's fragmentation index lies
 * between 0 and this value, inclusive.
 */
int sysctl_extfrag_threshold = 500;
1603 
1604 /**
1605  * try_to_compact_pages - Direct compact to satisfy a high-order allocation
1606  * @gfp_mask: The GFP mask of the current allocation
1607  * @order: The order of the current allocation
1608  * @alloc_flags: The allocation flags of the current allocation
1609  * @ac: The context of current allocation
1610  * @mode: The migration mode for async, sync light, or sync migration
1611  * @contended: Return value that determines if compaction was aborted due to
1612  *	       need_resched() or lock contention
1613  *
1614  * This is the main entry point for direct page compaction.
1615  */
1616 enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1617 		unsigned int alloc_flags, const struct alloc_context *ac,
1618 		enum migrate_mode mode, int *contended)
1619 {
1620 	int may_enter_fs = gfp_mask & __GFP_FS;
1621 	int may_perform_io = gfp_mask & __GFP_IO;
1622 	struct zoneref *z;
1623 	struct zone *zone;
1624 	enum compact_result rc = COMPACT_SKIPPED;
1625 	int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
1626 
1627 	*contended = COMPACT_CONTENDED_NONE;
1628 
1629 	/* Check if the GFP flags allow compaction */
1630 	if (!order || !may_enter_fs || !may_perform_io)
1631 		return COMPACT_SKIPPED;
1632 
1633 	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
1634 
1635 	/* Compact each zone in the list */
1636 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1637 								ac->nodemask) {
1638 		enum compact_result status;
1639 		int zone_contended;
1640 
1641 		if (compaction_deferred(zone, order)) {
1642 			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
1643 			continue;
1644 		}
1645 
1646 		status = compact_zone_order(zone, order, gfp_mask, mode,
1647 				&zone_contended, alloc_flags,
1648 				ac_classzone_idx(ac));
1649 		rc = max(status, rc);
1650 		/*
1651 		 * It takes at least one zone that wasn't lock contended
1652 		 * to clear all_zones_contended.
1653 		 */
1654 		all_zones_contended &= zone_contended;
1655 
1656 		/* If a normal allocation would succeed, stop compacting */
1657 		if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
1658 					ac_classzone_idx(ac), alloc_flags)) {
1659 			/*
1660 			 * We think the allocation will succeed in this zone,
1661 			 * but it is not certain, hence the false. The caller
1662 			 * will repeat this with true if allocation indeed
1663 			 * succeeds in this zone.
1664 			 */
1665 			compaction_defer_reset(zone, order, false);
1666 			/*
1667 			 * It is possible that async compaction aborted due to
1668 			 * need_resched() and the watermarks were ok thanks to
1669 			 * somebody else freeing memory. The allocation can
1670 			 * however still fail so we better signal the
1671 			 * need_resched() contention anyway (this will not
1672 			 * prevent the allocation attempt).
1673 			 */
1674 			if (zone_contended == COMPACT_CONTENDED_SCHED)
1675 				*contended = COMPACT_CONTENDED_SCHED;
1676 
1677 			goto break_loop;
1678 		}
1679 
1680 		if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE ||
1681 					status == COMPACT_PARTIAL_SKIPPED)) {
1682 			/*
1683 			 * We think that allocation won't succeed in this zone
1684 			 * so we defer compaction there. If it ends up
1685 			 * succeeding after all, it will be reset.
1686 			 */
1687 			defer_compaction(zone, order);
1688 		}
1689 
1690 		/*
1691 		 * We might have stopped compacting due to need_resched() in
1692 		 * async compaction, or due to a fatal signal detected. In that
1693 		 * case do not try further zones and signal need_resched()
1694 		 * contention.
1695 		 */
1696 		if ((zone_contended == COMPACT_CONTENDED_SCHED)
1697 					|| fatal_signal_pending(current)) {
1698 			*contended = COMPACT_CONTENDED_SCHED;
1699 			goto break_loop;
1700 		}
1701 
1702 		continue;
1703 break_loop:
1704 		/*
1705 		 * We might not have tried all the zones, so  be conservative
1706 		 * and assume they are not all lock contended.
1707 		 */
1708 		all_zones_contended = 0;
1709 		break;
1710 	}
1711 
1712 	/*
1713 	 * If at least one zone wasn't deferred or skipped, we report if all
1714 	 * zones that were tried were lock contended.
1715 	 */
1716 	if (rc > COMPACT_INACTIVE && all_zones_contended)
1717 		*contended = COMPACT_CONTENDED_LOCK;
1718 
1719 	return rc;
1720 }
1721 
1722 
1723 /* Compact all zones within a node */
1724 static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1725 {
1726 	int zoneid;
1727 	struct zone *zone;
1728 
1729 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
1730 
1731 		zone = &pgdat->node_zones[zoneid];
1732 		if (!populated_zone(zone))
1733 			continue;
1734 
1735 		cc->nr_freepages = 0;
1736 		cc->nr_migratepages = 0;
1737 		cc->zone = zone;
1738 		INIT_LIST_HEAD(&cc->freepages);
1739 		INIT_LIST_HEAD(&cc->migratepages);
1740 
1741 		/*
1742 		 * When called via /proc/sys/vm/compact_memory
1743 		 * this makes sure we compact the whole zone regardless of
1744 		 * cached scanner positions.
1745 		 */
1746 		if (is_via_compact_memory(cc->order))
1747 			__reset_isolation_suitable(zone);
1748 
1749 		if (is_via_compact_memory(cc->order) ||
1750 				!compaction_deferred(zone, cc->order))
1751 			compact_zone(zone, cc);
1752 
1753 		VM_BUG_ON(!list_empty(&cc->freepages));
1754 		VM_BUG_ON(!list_empty(&cc->migratepages));
1755 
1756 		if (is_via_compact_memory(cc->order))
1757 			continue;
1758 
1759 		if (zone_watermark_ok(zone, cc->order,
1760 				low_wmark_pages(zone), 0, 0))
1761 			compaction_defer_reset(zone, cc->order, false);
1762 	}
1763 }
1764 
1765 void compact_pgdat(pg_data_t *pgdat, int order)
1766 {
1767 	struct compact_control cc = {
1768 		.order = order,
1769 		.mode = MIGRATE_ASYNC,
1770 	};
1771 
1772 	if (!order)
1773 		return;
1774 
1775 	__compact_pgdat(pgdat, &cc);
1776 }
1777 
1778 static void compact_node(int nid)
1779 {
1780 	struct compact_control cc = {
1781 		.order = -1,
1782 		.mode = MIGRATE_SYNC,
1783 		.ignore_skip_hint = true,
1784 	};
1785 
1786 	__compact_pgdat(NODE_DATA(nid), &cc);
1787 }
1788 
1789 /* Compact all nodes in the system */
1790 static void compact_nodes(void)
1791 {
1792 	int nid;
1793 
1794 	/* Flush pending updates to the LRU lists */
1795 	lru_add_drain_all();
1796 
1797 	for_each_online_node(nid)
1798 		compact_node(nid);
1799 }
1800 
/*
 * The written value is actually unused, all memory is compacted.
 * sysctl_compaction_handler() reacts to any write by compacting every online
 * node and never reads this variable.
 */
int sysctl_compact_memory;
1803 
1804 /*
1805  * This is the entry point for compacting all nodes via
1806  * /proc/sys/vm/compact_memory
1807  */
1808 int sysctl_compaction_handler(struct ctl_table *table, int write,
1809 			void __user *buffer, size_t *length, loff_t *ppos)
1810 {
1811 	if (write)
1812 		compact_nodes();
1813 
1814 	return 0;
1815 }
1816 
1817 int sysctl_extfrag_handler(struct ctl_table *table, int write,
1818 			void __user *buffer, size_t *length, loff_t *ppos)
1819 {
1820 	proc_dointvec_minmax(table, write, buffer, length, ppos);
1821 
1822 	return 0;
1823 }
1824 
1825 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
1826 static ssize_t sysfs_compact_node(struct device *dev,
1827 			struct device_attribute *attr,
1828 			const char *buf, size_t count)
1829 {
1830 	int nid = dev->id;
1831 
1832 	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
1833 		/* Flush pending updates to the LRU lists */
1834 		lru_add_drain_all();
1835 
1836 		compact_node(nid);
1837 	}
1838 
1839 	return count;
1840 }
1841 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
1842 
1843 int compaction_register_node(struct node *node)
1844 {
1845 	return device_create_file(&node->dev, &dev_attr_compact);
1846 }
1847 
1848 void compaction_unregister_node(struct node *node)
1849 {
1850 	return device_remove_file(&node->dev, &dev_attr_compact);
1851 }
1852 #endif /* CONFIG_SYSFS && CONFIG_NUMA */
1853 
1854 static inline bool kcompactd_work_requested(pg_data_t *pgdat)
1855 {
1856 	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
1857 }
1858 
1859 static bool kcompactd_node_suitable(pg_data_t *pgdat)
1860 {
1861 	int zoneid;
1862 	struct zone *zone;
1863 	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
1864 
1865 	for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
1866 		zone = &pgdat->node_zones[zoneid];
1867 
1868 		if (!populated_zone(zone))
1869 			continue;
1870 
1871 		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
1872 					classzone_idx) == COMPACT_CONTINUE)
1873 			return true;
1874 	}
1875 
1876 	return false;
1877 }
1878 
1879 static void kcompactd_do_work(pg_data_t *pgdat)
1880 {
1881 	/*
1882 	 * With no special task, compact all zones so that a page of requested
1883 	 * order is allocatable.
1884 	 */
1885 	int zoneid;
1886 	struct zone *zone;
1887 	struct compact_control cc = {
1888 		.order = pgdat->kcompactd_max_order,
1889 		.classzone_idx = pgdat->kcompactd_classzone_idx,
1890 		.mode = MIGRATE_SYNC_LIGHT,
1891 		.ignore_skip_hint = true,
1892 
1893 	};
1894 	bool success = false;
1895 
1896 	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
1897 							cc.classzone_idx);
1898 	count_vm_event(KCOMPACTD_WAKE);
1899 
1900 	for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
1901 		int status;
1902 
1903 		zone = &pgdat->node_zones[zoneid];
1904 		if (!populated_zone(zone))
1905 			continue;
1906 
1907 		if (compaction_deferred(zone, cc.order))
1908 			continue;
1909 
1910 		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
1911 							COMPACT_CONTINUE)
1912 			continue;
1913 
1914 		cc.nr_freepages = 0;
1915 		cc.nr_migratepages = 0;
1916 		cc.zone = zone;
1917 		INIT_LIST_HEAD(&cc.freepages);
1918 		INIT_LIST_HEAD(&cc.migratepages);
1919 
1920 		if (kthread_should_stop())
1921 			return;
1922 		status = compact_zone(zone, &cc);
1923 
1924 		if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
1925 						cc.classzone_idx, 0)) {
1926 			success = true;
1927 			compaction_defer_reset(zone, cc.order, false);
1928 		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
1929 			/*
1930 			 * We use sync migration mode here, so we defer like
1931 			 * sync direct compaction does.
1932 			 */
1933 			defer_compaction(zone, cc.order);
1934 		}
1935 
1936 		VM_BUG_ON(!list_empty(&cc.freepages));
1937 		VM_BUG_ON(!list_empty(&cc.migratepages));
1938 	}
1939 
1940 	/*
1941 	 * Regardless of success, we are done until woken up next. But remember
1942 	 * the requested order/classzone_idx in case it was higher/tighter than
1943 	 * our current ones
1944 	 */
1945 	if (pgdat->kcompactd_max_order <= cc.order)
1946 		pgdat->kcompactd_max_order = 0;
1947 	if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
1948 		pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1949 }
1950 
1951 void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
1952 {
1953 	if (!order)
1954 		return;
1955 
1956 	if (pgdat->kcompactd_max_order < order)
1957 		pgdat->kcompactd_max_order = order;
1958 
1959 	if (pgdat->kcompactd_classzone_idx > classzone_idx)
1960 		pgdat->kcompactd_classzone_idx = classzone_idx;
1961 
1962 	if (!waitqueue_active(&pgdat->kcompactd_wait))
1963 		return;
1964 
1965 	if (!kcompactd_node_suitable(pgdat))
1966 		return;
1967 
1968 	trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
1969 							classzone_idx);
1970 	wake_up_interruptible(&pgdat->kcompactd_wait);
1971 }
1972 
1973 /*
1974  * The background compaction daemon, started as a kernel thread
1975  * from the init process.
1976  */
1977 static int kcompactd(void *p)
1978 {
1979 	pg_data_t *pgdat = (pg_data_t*)p;
1980 	struct task_struct *tsk = current;
1981 
1982 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1983 
1984 	if (!cpumask_empty(cpumask))
1985 		set_cpus_allowed_ptr(tsk, cpumask);
1986 
1987 	set_freezable();
1988 
1989 	pgdat->kcompactd_max_order = 0;
1990 	pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1991 
1992 	while (!kthread_should_stop()) {
1993 		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
1994 		wait_event_freezable(pgdat->kcompactd_wait,
1995 				kcompactd_work_requested(pgdat));
1996 
1997 		kcompactd_do_work(pgdat);
1998 	}
1999 
2000 	return 0;
2001 }
2002 
2003 /*
2004  * This kcompactd start function will be called by init and node-hot-add.
2005  * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
2006  */
2007 int kcompactd_run(int nid)
2008 {
2009 	pg_data_t *pgdat = NODE_DATA(nid);
2010 	int ret = 0;
2011 
2012 	if (pgdat->kcompactd)
2013 		return 0;
2014 
2015 	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
2016 	if (IS_ERR(pgdat->kcompactd)) {
2017 		pr_err("Failed to start kcompactd on node %d\n", nid);
2018 		ret = PTR_ERR(pgdat->kcompactd);
2019 		pgdat->kcompactd = NULL;
2020 	}
2021 	return ret;
2022 }
2023 
2024 /*
2025  * Called by memory hotplug when all memory in a node is offlined. Caller must
2026  * hold mem_hotplug_begin/end().
2027  */
2028 void kcompactd_stop(int nid)
2029 {
2030 	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
2031 
2032 	if (kcompactd) {
2033 		kthread_stop(kcompactd);
2034 		NODE_DATA(nid)->kcompactd = NULL;
2035 	}
2036 }
2037 
2038 /*
2039  * It's optimal to keep kcompactd on the same CPUs as their memory, but
2040  * not required for correctness. So if the last cpu in a node goes
2041  * away, we get changed to run anywhere: as the first one comes back,
2042  * restore their cpu bindings.
2043  */
2044 static int cpu_callback(struct notifier_block *nfb, unsigned long action,
2045 			void *hcpu)
2046 {
2047 	int nid;
2048 
2049 	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
2050 		for_each_node_state(nid, N_MEMORY) {
2051 			pg_data_t *pgdat = NODE_DATA(nid);
2052 			const struct cpumask *mask;
2053 
2054 			mask = cpumask_of_node(pgdat->node_id);
2055 
2056 			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2057 				/* One of our CPUs online: restore mask */
2058 				set_cpus_allowed_ptr(pgdat->kcompactd, mask);
2059 		}
2060 	}
2061 	return NOTIFY_OK;
2062 }
2063 
2064 static int __init kcompactd_init(void)
2065 {
2066 	int nid;
2067 
2068 	for_each_node_state(nid, N_MEMORY)
2069 		kcompactd_run(nid);
2070 	hotcpu_notifier(cpu_callback, 0);
2071 	return 0;
2072 }
2073 subsys_initcall(kcompactd_init)
2074 
2075 #endif /* CONFIG_COMPACTION */
2076