xref: /openbmc/linux/mm/compaction.c (revision 05bcf503)
1 /*
2  * linux/mm/compaction.c
3  *
4  * Memory compaction for the reduction of external fragmentation. Note that
5  * this heavily depends upon page migration to do all the real heavy
6  * lifting.
7  *
8  * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
9  */
10 #include <linux/swap.h>
11 #include <linux/migrate.h>
12 #include <linux/compaction.h>
13 #include <linux/mm_inline.h>
14 #include <linux/backing-dev.h>
15 #include <linux/sysctl.h>
16 #include <linux/sysfs.h>
17 #include "internal.h"
18 
19 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
20 
21 #define CREATE_TRACE_POINTS
22 #include <trace/events/compaction.h>
23 
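/* Return isolated free pages to the buddy allocator and count how many were freed. */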
24 static unsigned long release_freepages(struct list_head *freelist)
25 {
26 	struct page *page, *next;
27 	unsigned long count = 0;
28 
29 	list_for_each_entry_safe(page, next, freelist, lru) {
30 		list_del(&page->lru);
31 		__free_page(page);
32 		count++;
33 	}
34 
35 	return count;
36 }
37 
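/* split_free_page() does not map the pages it returns; do that here for each isolated page. */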
38 static void map_pages(struct list_head *list)
39 {
40 	struct page *page;
41 
42 	list_for_each_entry(page, list, lru) {
43 		arch_alloc_page(page, 0);
44 		kernel_map_pages(page, 1, 1);
45 	}
46 }
47 
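/* Pageblock migratetypes that async compaction is willing to work within. */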
48 static inline bool migrate_async_suitable(int migratetype)
49 {
50 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
51 }
52 
53 #ifdef CONFIG_COMPACTION
54 /* Returns true if the pageblock should be scanned for pages to isolate. */
55 static inline bool isolation_suitable(struct compact_control *cc,
56 					struct page *page)
57 {
58 	if (cc->ignore_skip_hint)
59 		return true;
60 
61 	return !get_pageblock_skip(page);
62 }
63 
64 /*
65  * This function is called to clear all cached information on pageblocks that
66  * should be skipped for page isolation when the migrate and free page scanner
67  * meet.
68  */
69 static void __reset_isolation_suitable(struct zone *zone)
70 {
71 	unsigned long start_pfn = zone->zone_start_pfn;
72 	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
73 	unsigned long pfn;
74 
75 	zone->compact_cached_migrate_pfn = start_pfn;
76 	zone->compact_cached_free_pfn = end_pfn;
77 	zone->compact_blockskip_flush = false;
78 
79 	/* Walk the zone and mark every pageblock as suitable for isolation */
80 	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
81 		struct page *page;
82 
83 		cond_resched();
84 
85 		if (!pfn_valid(pfn))
86 			continue;
87 
88 		page = pfn_to_page(pfn);
89 		if (zone != page_zone(page))
90 			continue;
91 
92 		clear_pageblock_skip(page);
93 	}
94 }
95 
96 void reset_isolation_suitable(pg_data_t *pgdat)
97 {
98 	int zoneid;
99 
100 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
101 		struct zone *zone = &pgdat->node_zones[zoneid];
102 		if (!populated_zone(zone))
103 			continue;
104 
105 		/* Only flush if a full compaction finished recently */
106 		if (zone->compact_blockskip_flush)
107 			__reset_isolation_suitable(zone);
108 	}
109 }
110 
111 /*
112  * If no pages were isolated then mark this pageblock to be skipped in the
113  * future. The information is later cleared by __reset_isolation_suitable().
114  */
115 static void update_pageblock_skip(struct compact_control *cc,
116 			struct page *page, unsigned long nr_isolated,
117 			bool migrate_scanner)
118 {
119 	struct zone *zone = cc->zone;
120 	if (!page)
121 		return;
122 
123 	if (!nr_isolated) {
124 		unsigned long pfn = page_to_pfn(page);
125 		set_pageblock_skip(page);
126 
127 		/* Update where compaction should restart */
128 		if (migrate_scanner) {
129 			if (!cc->finished_update_migrate &&
130 			    pfn > zone->compact_cached_migrate_pfn)
131 				zone->compact_cached_migrate_pfn = pfn;
132 		} else {
133 			if (!cc->finished_update_free &&
134 			    pfn < zone->compact_cached_free_pfn)
135 				zone->compact_cached_free_pfn = pfn;
136 		}
137 	}
138 }
139 #else
140 static inline bool isolation_suitable(struct compact_control *cc,
141 					struct page *page)
142 {
143 	return true;
144 }
145 
146 static void update_pageblock_skip(struct compact_control *cc,
147 			struct page *page, unsigned long nr_isolated,
148 			bool migrate_scanner)
149 {
150 }
151 #endif /* CONFIG_COMPACTION */
152 
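/* True if the lock should be dropped: a reschedule is due or the lock is contended. */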
153 static inline bool should_release_lock(spinlock_t *lock)
154 {
155 	return need_resched() || spin_is_contended(lock);
156 }
157 
158 /*
159  * Compaction requires the taking of some coarse locks that are potentially
160  * very heavily contended. Check if the process needs to be scheduled or
161  * if the lock is contended. For async compaction, back out in the event
162  * if contention is severe. For sync compaction, schedule.
163  *
164  * Returns true if the lock is held.
165  * Returns false if the lock is released and compaction should abort
166  */
167 static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
168 				      bool locked, struct compact_control *cc)
169 {
170 	if (should_release_lock(lock)) {
171 		if (locked) {
172 			spin_unlock_irqrestore(lock, *flags);
173 			locked = false;
174 		}
175 
176 		/* async aborts if taking too long or contended */
177 		if (!cc->sync) {
178 			cc->contended = true;
179 			return false;
180 		}
181 
182 		cond_resched();
183 	}
184 
185 	if (!locked)
186 		spin_lock_irqsave(lock, *flags);
187 	return true;
188 }
189 
190 static inline bool compact_trylock_irqsave(spinlock_t *lock,
191 			unsigned long *flags, struct compact_control *cc)
192 {
193 	return compact_checklock_irqsave(lock, flags, false, cc);
194 }
195 
196 /* Returns true if the page is within a block suitable for migration to */
197 static bool suitable_migration_target(struct page *page)
198 {
199 	int migratetype = get_pageblock_migratetype(page);
200 
201 	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
202 	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
203 		return false;
204 
205 	/* If the page is a large free page, then allow migration */
206 	if (PageBuddy(page) && page_order(page) >= pageblock_order)
207 		return true;
208 
209 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
210 	if (migrate_async_suitable(migratetype))
211 		return true;
212 
213 	/* Otherwise skip the block */
214 	return false;
215 }
216 
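/* Try to capture a free page of cc->order directly from the zone's free lists. */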
217 static void compact_capture_page(struct compact_control *cc)
218 {
219 	unsigned long flags;
220 	int mtype, mtype_low, mtype_high;
221 
222 	if (!cc->page || *cc->page)
223 		return;
224 
225 	/*
226 	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
227 	 * regardless of the migratetype of the freelist it is captured from.
228 	 * This is fine because the order for a high-order MIGRATE_MOVABLE
229 	 * allocation is typically at least a pageblock size and overall
230 	 * fragmentation is not impaired. Other allocation types must
231 	 * capture pages from their own migratelist because otherwise they
232 	 * could pollute other pageblocks like MIGRATE_MOVABLE with
233 	 * difficult-to-move pages, making fragmentation worse overall.
234 	 */
235 	if (cc->migratetype == MIGRATE_MOVABLE) {
236 		mtype_low = 0;
237 		mtype_high = MIGRATE_PCPTYPES;
238 	} else {
239 		mtype_low = cc->migratetype;
240 		mtype_high = cc->migratetype + 1;
241 	}
242 
243 	/* Speculatively examine the free lists without zone lock */
244 	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
245 		int order;
246 		for (order = cc->order; order < MAX_ORDER; order++) {
247 			struct page *page;
248 			struct free_area *area;
249 			area = &(cc->zone->free_area[order]);
250 			if (list_empty(&area->free_list[mtype]))
251 				continue;
252 
253 			/* Take the lock and attempt capture of the page */
254 			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
255 				return;
256 			if (!list_empty(&area->free_list[mtype])) {
257 				page = list_entry(area->free_list[mtype].next,
258 							struct page, lru);
259 				if (capture_free_page(page, cc->order, mtype)) {
260 					spin_unlock_irqrestore(&cc->zone->lock,
261 									flags);
262 					*cc->page = page;
263 					return;
264 				}
265 			}
266 			spin_unlock_irqrestore(&cc->zone->lock, flags);
267 		}
268 	}
269 }
270 
271 /*
272  * Isolate free pages onto a private freelist; the zone->lock is taken within
273  * this function. If @strict is true, abort and return 0 on any invalid PFNs or non-free
274  * pages inside of the pageblock (even though it may still end up isolating
275  * some pages).
276  */
277 static unsigned long isolate_freepages_block(struct compact_control *cc,
278 				unsigned long blockpfn,
279 				unsigned long end_pfn,
280 				struct list_head *freelist,
281 				bool strict)
282 {
283 	int nr_scanned = 0, total_isolated = 0;
284 	struct page *cursor, *valid_page = NULL;
285 	unsigned long nr_strict_required = end_pfn - blockpfn;
286 	unsigned long flags;
287 	bool locked = false;
288 
289 	cursor = pfn_to_page(blockpfn);
290 
291 	/* Isolate free pages. */
292 	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
293 		int isolated, i;
294 		struct page *page = cursor;
295 
296 		nr_scanned++;
297 		if (!pfn_valid_within(blockpfn))
298 			continue;
299 		if (!valid_page)
300 			valid_page = page;
301 		if (!PageBuddy(page))
302 			continue;
303 
304 		/*
305 		 * The zone lock must be held to isolate freepages.
306 		 * Unfortunately this is a very coarse lock and can be
307 		 * heavily contended if there are parallel allocations
308 		 * or parallel compactions. For async compaction, do not
309 		 * spin on the lock; in either case the lock is acquired as
310 		 * late as possible.
311 		 */
312 		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
313 								locked, cc);
314 		if (!locked)
315 			break;
316 
317 		/* Recheck this is a suitable migration target under lock */
318 		if (!strict && !suitable_migration_target(page))
319 			break;
320 
321 		/* Recheck this is a buddy page under lock */
322 		if (!PageBuddy(page))
323 			continue;
324 
325 		/* Found a free page, break it into order-0 pages */
326 		isolated = split_free_page(page);
327 		if (!isolated && strict)
328 			break;
329 		total_isolated += isolated;
330 		for (i = 0; i < isolated; i++) {
331 			list_add(&page->lru, freelist);
332 			page++;
333 		}
334 
335 		/* If a page was split, advance to the end of it */
336 		if (isolated) {
337 			blockpfn += isolated - 1;
338 			cursor += isolated - 1;
339 		}
340 	}
341 
342 	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
343 
344 	/*
345 	 * If strict isolation is requested by CMA then check that all the
346 	 * pages requested were isolated. If there were any failures, 0 is
347 	 * returned and CMA will fail.
348 	 */
349 	if (strict && nr_strict_required > total_isolated)
350 		total_isolated = 0;
351 
352 	if (locked)
353 		spin_unlock_irqrestore(&cc->zone->lock, flags);
354 
355 	/* Update the pageblock-skip if the whole pageblock was scanned */
356 	if (blockpfn == end_pfn)
357 		update_pageblock_skip(cc, valid_page, total_isolated, false);
358 
359 	return total_isolated;
360 }
361 
362 /**
363  * isolate_freepages_range() - isolate free pages.
364  * @start_pfn: The first PFN to start isolating.
365  * @end_pfn:   The one-past-last PFN.
366  *
367  * Non-free pages, invalid PFNs, or zone boundaries within the
368  * [start_pfn, end_pfn) range are considered errors and cause the function
369  * to undo its actions and return zero.
370  *
371  * Otherwise, the function returns the one-past-the-last PFN of the isolated
372  * pages (which may be greater than end_pfn if the end fell in the middle of
373  * a free page).
374  */
375 unsigned long
376 isolate_freepages_range(struct compact_control *cc,
377 			unsigned long start_pfn, unsigned long end_pfn)
378 {
379 	unsigned long isolated, pfn, block_end_pfn;
380 	LIST_HEAD(freelist);
381 
382 	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
383 		if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
384 			break;
385 
386 		/*
387 		 * On subsequent iterations ALIGN() is actually not needed,
388 		 * but we keep it so as not to complicate the code.
389 		 */
390 		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
391 		block_end_pfn = min(block_end_pfn, end_pfn);
392 
393 		isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
394 						   &freelist, true);
395 
396 		/*
397 		 * In strict mode, isolate_freepages_block() returns 0 if
398 		 * there are any holes in the block (i.e. invalid PFNs or
399 		 * non-free pages).
400 		 */
401 		if (!isolated)
402 			break;
403 
404 		/*
405 		 * If we managed to isolate pages, it is always (1 << n) *
406 		 * pageblock_nr_pages for some non-negative n.  (Max order
407 		 * page may span two pageblocks).
408 		 */
409 	}
410 
411 	/* split_free_page does not map the pages */
412 	map_pages(&freelist);
413 
414 	if (pfn < end_pfn) {
415 		/* Loop terminated early, cleanup. */
416 		release_freepages(&freelist);
417 		return 0;
418 	}
419 
420 	/* The local freelist is not used further; the caller works on the PFN range. */
421 	return pfn;
422 }
423 
424 /* Update the number of anon and file isolated pages in the zone */
425 static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
426 {
427 	struct page *page;
428 	unsigned int count[2] = { 0, };
429 
430 	list_for_each_entry(page, &cc->migratepages, lru)
431 		count[!!page_is_file_cache(page)]++;
432 
433 	/* If locked we can use the interrupt unsafe versions */
434 	if (locked) {
435 		__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
436 		__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
437 	} else {
438 		mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
439 		mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
440 	}
441 }
442 
443 /* Similar to reclaim, but different enough that they don't share logic */
444 static bool too_many_isolated(struct zone *zone)
445 {
446 	unsigned long active, inactive, isolated;
447 
448 	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
449 					zone_page_state(zone, NR_INACTIVE_ANON);
450 	active = zone_page_state(zone, NR_ACTIVE_FILE) +
451 					zone_page_state(zone, NR_ACTIVE_ANON);
452 	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
453 					zone_page_state(zone, NR_ISOLATED_ANON);
454 
455 	return isolated > (inactive + active) / 2;
456 }
457 
458 /**
459  * isolate_migratepages_range() - isolate all migrate-able pages in range.
460  * @zone:	Zone pages are in.
461  * @cc:		Compaction control structure.
462  * @low_pfn:	The first PFN of the range.
463  * @end_pfn:	The one-past-the-last PFN of the range.
464  * @unevictable: true if unevictable pages may be isolated
465  *
466  * Isolate all pages that can be migrated from the range specified by
467  * [low_pfn, end_pfn). Returns zero if there is a fatal signal
468  * pending, otherwise the PFN of the first page that was not scanned
469  * (which may be less than, equal to, or greater than end_pfn).
470  *
471  * Assumes that cc->migratepages is empty and cc->nr_migratepages is
472  * zero.
473  *
474  * Apart from cc->migratepages and cc->nr_migratepages this function
475  * does not modify any cc's fields, in particular it does not modify
476  * (or read for that matter) cc->migrate_pfn.
477  */
478 unsigned long
479 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
480 		unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
481 {
482 	unsigned long last_pageblock_nr = 0, pageblock_nr;
483 	unsigned long nr_scanned = 0, nr_isolated = 0;
484 	struct list_head *migratelist = &cc->migratepages;
485 	isolate_mode_t mode = 0;
486 	struct lruvec *lruvec;
487 	unsigned long flags;
488 	bool locked = false;
489 	struct page *page = NULL, *valid_page = NULL;
490 
491 	/*
492 	 * Ensure that there are not too many pages isolated from the LRU
493 	 * list by either parallel reclaimers or compaction. If there are,
494 	 * delay for some time until fewer pages are isolated
495 	 */
496 	while (unlikely(too_many_isolated(zone))) {
497 		/* async migration should just abort */
498 		if (!cc->sync)
499 			return 0;
500 
501 		congestion_wait(BLK_RW_ASYNC, HZ/10);
502 
503 		if (fatal_signal_pending(current))
504 			return 0;
505 	}
506 
507 	/* Time to isolate some pages for migration */
508 	cond_resched();
509 	for (; low_pfn < end_pfn; low_pfn++) {
510 		/* give a chance to irqs before checking need_resched() */
511 		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
512 			if (should_release_lock(&zone->lru_lock)) {
513 				spin_unlock_irqrestore(&zone->lru_lock, flags);
514 				locked = false;
515 			}
516 		}
517 
518 		/*
519 		 * migrate_pfn does not necessarily start aligned to a
520 		 * pageblock. Ensure that pfn_valid is called when moving
521 		 * into a new MAX_ORDER_NR_PAGES range in case of large
522 		 * memory holes within the zone
523 		 */
524 		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
525 			if (!pfn_valid(low_pfn)) {
526 				low_pfn += MAX_ORDER_NR_PAGES - 1;
527 				continue;
528 			}
529 		}
530 
531 		if (!pfn_valid_within(low_pfn))
532 			continue;
533 		nr_scanned++;
534 
535 		/*
536 		 * Get the page and ensure the page is within the same zone.
537 		 * See the comment in isolate_freepages about overlapping
538 		 * nodes. It is deliberate that the new zone lock is not taken
539 		 * as memory compaction should not move pages between nodes.
540 		 */
541 		page = pfn_to_page(low_pfn);
542 		if (page_zone(page) != zone)
543 			continue;
544 
545 		if (!valid_page)
546 			valid_page = page;
547 
548 		/* If isolation recently failed, do not retry */
549 		pageblock_nr = low_pfn >> pageblock_order;
550 		if (!isolation_suitable(cc, page))
551 			goto next_pageblock;
552 
553 		/* Skip if free */
554 		if (PageBuddy(page))
555 			continue;
556 
557 		/*
558 		 * For async migration, also only scan in MOVABLE blocks. Async
559 		 * migration is optimistic: it checks whether the minimum amount of
560 		 * work satisfies the allocation.
561 		 */
562 		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
563 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
564 			cc->finished_update_migrate = true;
565 			goto next_pageblock;
566 		}
567 
568 		/* Check may be lockless but that's ok as we recheck later */
569 		if (!PageLRU(page))
570 			continue;
571 
572 		/*
573 		 * PageLRU is set. lru_lock normally excludes isolation
574 		 * splitting and collapsing (collapsing has already happened
575 		 * if PageLRU is set) but the lock is not necessarily taken
576 		 * here and it is wasteful to take it just to check transhuge.
577 		 * Check TransHuge without lock and skip the whole pageblock if
578 		 * it's either a transhuge or hugetlbfs page, as calling
579 		 * compound_order() without preventing THP from splitting the
580 		 * page underneath us may return surprising results.
581 		 */
582 		if (PageTransHuge(page)) {
583 			if (!locked)
584 				goto next_pageblock;
585 			low_pfn += (1 << compound_order(page)) - 1;
586 			continue;
587 		}
588 
589 		/* Check if it is ok to still hold the lock */
590 		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
591 								locked, cc);
592 		if (!locked || fatal_signal_pending(current))
593 			break;
594 
595 		/* Recheck PageLRU and PageTransHuge under lock */
596 		if (!PageLRU(page))
597 			continue;
598 		if (PageTransHuge(page)) {
599 			low_pfn += (1 << compound_order(page)) - 1;
600 			continue;
601 		}
602 
603 		if (!cc->sync)
604 			mode |= ISOLATE_ASYNC_MIGRATE;
605 
606 		if (unevictable)
607 			mode |= ISOLATE_UNEVICTABLE;
608 
609 		lruvec = mem_cgroup_page_lruvec(page, zone);
610 
611 		/* Try isolate the page */
612 		if (__isolate_lru_page(page, mode) != 0)
613 			continue;
614 
615 		VM_BUG_ON(PageTransCompound(page));
616 
617 		/* Successfully isolated */
618 		cc->finished_update_migrate = true;
619 		del_page_from_lru_list(page, lruvec, page_lru(page));
620 		list_add(&page->lru, migratelist);
621 		cc->nr_migratepages++;
622 		nr_isolated++;
623 
624 		/* Avoid isolating too much */
625 		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
626 			++low_pfn;
627 			break;
628 		}
629 
630 		continue;
631 
632 next_pageblock:
633 		low_pfn += pageblock_nr_pages;
634 		low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
635 		last_pageblock_nr = pageblock_nr;
636 	}
637 
638 	acct_isolated(zone, locked, cc);
639 
640 	if (locked)
641 		spin_unlock_irqrestore(&zone->lru_lock, flags);
642 
643 	/* Update the pageblock-skip if the whole pageblock was scanned */
644 	if (low_pfn == end_pfn)
645 		update_pageblock_skip(cc, valid_page, nr_isolated, true);
646 
647 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
648 
649 	return low_pfn;
650 }
651 
652 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
653 #ifdef CONFIG_COMPACTION
654 /*
655  * Based on information in the current compact_control, find blocks
656  * suitable for isolating free pages from and then isolate them.
657  */
658 static void isolate_freepages(struct zone *zone,
659 				struct compact_control *cc)
660 {
661 	struct page *page;
662 	unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
663 	int nr_freepages = cc->nr_freepages;
664 	struct list_head *freelist = &cc->freepages;
665 
666 	/*
667 	 * Initialise the free scanner. The starting point is where we last
668 	 * scanned from (or the end of the zone if starting). The low point
669 	 * is the end of the pageblock the migration scanner is using.
670 	 */
671 	pfn = cc->free_pfn;
672 	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
673 
674 	/*
675 	 * Take care that if the migration scanner is at the end of the zone
676 	 * that the free scanner does not accidentally move to the next zone
677 	 * in the next isolation cycle.
678 	 */
679 	high_pfn = min(low_pfn, pfn);
680 
681 	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
682 
683 	/*
684 	 * Isolate free pages until enough are available to migrate the
685 	 * pages on cc->migratepages. We stop searching if the migrate
686 	 * and free page scanners meet or enough free pages are isolated.
687 	 */
688 	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
689 					pfn -= pageblock_nr_pages) {
690 		unsigned long isolated;
691 
692 		if (!pfn_valid(pfn))
693 			continue;
694 
695 		/*
696 		 * Check for overlapping nodes/zones. It's possible on some
697 		 * configurations to have a setup like
698 		 * node0 node1 node0
699 		 * i.e. it's possible that all pages within a zone's range of
700 		 * pages do not belong to a single zone.
701 		 */
702 		page = pfn_to_page(pfn);
703 		if (page_zone(page) != zone)
704 			continue;
705 
706 		/* Check the block is suitable for migration */
707 		if (!suitable_migration_target(page))
708 			continue;
709 
710 		/* If isolation recently failed, do not retry */
711 		if (!isolation_suitable(cc, page))
712 			continue;
713 
714 		/* Found a block suitable for isolating free pages from */
715 		isolated = 0;
716 		end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
717 		isolated = isolate_freepages_block(cc, pfn, end_pfn,
718 						   freelist, false);
719 		nr_freepages += isolated;
720 
721 		/*
722 		 * Record the highest PFN we isolated pages from. When next
723 		 * looking for free pages, the search will restart here as
724 		 * page migration may have returned some pages to the allocator
725 		 */
726 		if (isolated) {
727 			cc->finished_update_free = true;
728 			high_pfn = max(high_pfn, pfn);
729 		}
730 	}
731 
732 	/* split_free_page does not map the pages */
733 	map_pages(freelist);
734 
735 	cc->free_pfn = high_pfn;
736 	cc->nr_freepages = nr_freepages;
737 }
738 
739 /*
740  * This is a migrate-callback that "allocates" freepages by taking pages
741  * from the isolated freelists in the block we are migrating to.
742  */
743 static struct page *compaction_alloc(struct page *migratepage,
744 					unsigned long data,
745 					int **result)
746 {
747 	struct compact_control *cc = (struct compact_control *)data;
748 	struct page *freepage;
749 
750 	/* Isolate free pages if necessary */
751 	if (list_empty(&cc->freepages)) {
752 		isolate_freepages(cc->zone, cc);
753 
754 		if (list_empty(&cc->freepages))
755 			return NULL;
756 	}
757 
758 	freepage = list_entry(cc->freepages.next, struct page, lru);
759 	list_del(&freepage->lru);
760 	cc->nr_freepages--;
761 
762 	return freepage;
763 }
764 
765 /*
766  * We cannot control nr_migratepages and nr_freepages fully when migration is
767  * running as migrate_pages() has no knowledge of compact_control. When
768  * migration is complete, we count the number of pages on the lists by hand.
769  */
770 static void update_nr_listpages(struct compact_control *cc)
771 {
772 	int nr_migratepages = 0;
773 	int nr_freepages = 0;
774 	struct page *page;
775 
776 	list_for_each_entry(page, &cc->migratepages, lru)
777 		nr_migratepages++;
778 	list_for_each_entry(page, &cc->freepages, lru)
779 		nr_freepages++;
780 
781 	cc->nr_migratepages = nr_migratepages;
782 	cc->nr_freepages = nr_freepages;
783 }
784 
785 /* possible outcome of isolate_migratepages */
786 typedef enum {
787 	ISOLATE_ABORT,		/* Abort compaction now */
788 	ISOLATE_NONE,		/* No pages isolated, continue scanning */
789 	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
790 } isolate_migrate_t;
791 
792 /*
793  * Isolate all pages that can be migrated from the block pointed to by
794  * the migrate scanner within compact_control.
795  */
796 static isolate_migrate_t isolate_migratepages(struct zone *zone,
797 					struct compact_control *cc)
798 {
799 	unsigned long low_pfn, end_pfn;
800 
801 	/* Do not scan outside zone boundaries */
802 	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
803 
804 	/* Only scan within a pageblock boundary */
805 	end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
806 
807 	/* Do not cross the free scanner or scan within a memory hole */
808 	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
809 		cc->migrate_pfn = end_pfn;
810 		return ISOLATE_NONE;
811 	}
812 
813 	/* Perform the isolation */
814 	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
815 	if (!low_pfn || cc->contended)
816 		return ISOLATE_ABORT;
817 
818 	cc->migrate_pfn = low_pfn;
819 
820 	return ISOLATE_SUCCESS;
821 }
822 
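/* Decide whether this compaction run should stop, and with what result. */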
823 static int compact_finished(struct zone *zone,
824 			    struct compact_control *cc)
825 {
826 	unsigned long watermark;
827 
828 	if (fatal_signal_pending(current))
829 		return COMPACT_PARTIAL;
830 
831 	/* Compaction run completes if the migrate and free scanner meet */
832 	if (cc->free_pfn <= cc->migrate_pfn) {
833 		/*
834 		 * Mark that the PG_migrate_skip information should be cleared
835 		 * by kswapd when it goes to sleep. kswapd does not set the
836 		 * flag itself, as the decision to clear it should be based
837 		 * directly on an allocation request.
838 		 */
839 		if (!current_is_kswapd())
840 			zone->compact_blockskip_flush = true;
841 
842 		return COMPACT_COMPLETE;
843 	}
844 
845 	/*
846 	 * order == -1 is expected when compacting via
847 	 * /proc/sys/vm/compact_memory
848 	 */
849 	if (cc->order == -1)
850 		return COMPACT_CONTINUE;
851 
852 	/* Compaction run is not finished if the watermark is not met */
853 	watermark = low_wmark_pages(zone);
854 	watermark += (1 << cc->order);
855 
856 	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
857 		return COMPACT_CONTINUE;
858 
859 	/* Direct compactor: Is a suitable page free? */
860 	if (cc->page) {
861 		/* Was a suitable page captured? */
862 		if (*cc->page)
863 			return COMPACT_PARTIAL;
864 	} else {
865 		unsigned int order;
866 		for (order = cc->order; order < MAX_ORDER; order++) {
867 			struct free_area *area = &zone->free_area[order];
868 			/* Job done if page is free of the right migratetype */
869 			if (!list_empty(&area->free_list[cc->migratetype]))
870 				return COMPACT_PARTIAL;
871 
872 			/* Job done if allocation would set block type */
873 			if (order >= pageblock_order && area->nr_free)
874 				return COMPACT_PARTIAL;
875 		}
876 	}
877 
878 	return COMPACT_CONTINUE;
879 }
880 
881 /*
882  * compaction_suitable: Is this suitable to run compaction on this zone now?
883  * Returns
884  *   COMPACT_SKIPPED  - If there are too few free pages for compaction
885  *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
886  *   COMPACT_CONTINUE - If compaction should run now
887  */
888 unsigned long compaction_suitable(struct zone *zone, int order)
889 {
890 	int fragindex;
891 	unsigned long watermark;
892 
893 	/*
894 	 * order == -1 is expected when compacting via
895 	 * /proc/sys/vm/compact_memory
896 	 */
897 	if (order == -1)
898 		return COMPACT_CONTINUE;
899 
900 	/*
901 	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
902 	 * This is because during migration, copies of pages need to be
903 	 * allocated and for a short time, the footprint is higher
904 	 */
905 	watermark = low_wmark_pages(zone) + (2UL << order);
906 	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
907 		return COMPACT_SKIPPED;
908 
909 	/*
910 	 * fragmentation index determines if allocation failures are due to
911 	 * low memory or external fragmentation
912 	 *
913 	 * index of -1000 implies allocations might succeed depending on
914 	 * watermarks
915 	 * index towards 0 implies failure is due to lack of memory
916 	 * index towards 1000 implies failure is due to fragmentation
917 	 *
918 	 * Only compact if a failure would be due to fragmentation.
919 	 */
920 	fragindex = fragmentation_index(zone, order);
921 	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
922 		return COMPACT_SKIPPED;
923 
924 	if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
925 	    0, 0))
926 		return COMPACT_PARTIAL;
927 
928 	return COMPACT_CONTINUE;
929 }
930 
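/* Run a single compaction pass over @zone as described by @cc. */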
931 static int compact_zone(struct zone *zone, struct compact_control *cc)
932 {
933 	int ret;
934 	unsigned long start_pfn = zone->zone_start_pfn;
935 	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
936 
937 	ret = compaction_suitable(zone, cc->order);
938 	switch (ret) {
939 	case COMPACT_PARTIAL:
940 	case COMPACT_SKIPPED:
941 		/* Compaction is likely to fail */
942 		return ret;
943 	case COMPACT_CONTINUE:
944 		/* Fall through to compaction */
945 		;
946 	}
947 
948 	/*
949 	 * Setup to move all movable pages to the end of the zone. Use cached
950 	 * information on where the scanners should start but check that it
951 	 * is initialised by ensuring the values are within zone boundaries.
952 	 */
953 	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
954 	cc->free_pfn = zone->compact_cached_free_pfn;
955 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
956 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
957 		zone->compact_cached_free_pfn = cc->free_pfn;
958 	}
959 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
960 		cc->migrate_pfn = start_pfn;
961 		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
962 	}
963 
964 	/*
965 	 * Clear pageblock skip if there were failures recently and compaction
966 	 * is about to be retried after being deferred. kswapd does not do
967 	 * this reset as it'll reset the cached information when going to sleep.
968 	 */
969 	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
970 		__reset_isolation_suitable(zone);
971 
972 	migrate_prep_local();
973 
974 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
975 		unsigned long nr_migrate, nr_remaining;
976 		int err;
977 
978 		switch (isolate_migratepages(zone, cc)) {
979 		case ISOLATE_ABORT:
980 			ret = COMPACT_PARTIAL;
981 			putback_lru_pages(&cc->migratepages);
982 			cc->nr_migratepages = 0;
983 			goto out;
984 		case ISOLATE_NONE:
985 			continue;
986 		case ISOLATE_SUCCESS:
987 			;
988 		}
989 
990 		nr_migrate = cc->nr_migratepages;
991 		err = migrate_pages(&cc->migratepages, compaction_alloc,
992 				(unsigned long)cc, false,
993 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
994 		update_nr_listpages(cc);
995 		nr_remaining = cc->nr_migratepages;
996 
997 		count_vm_event(COMPACTBLOCKS);
998 		count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
999 		if (nr_remaining)
1000 			count_vm_events(COMPACTPAGEFAILED, nr_remaining);
1001 		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
1002 						nr_remaining);
1003 
1004 		/* Release LRU pages not migrated */
1005 		if (err) {
1006 			putback_lru_pages(&cc->migratepages);
1007 			cc->nr_migratepages = 0;
1008 			if (err == -ENOMEM) {
1009 				ret = COMPACT_PARTIAL;
1010 				goto out;
1011 			}
1012 		}
1013 
1014 		/* Capture a page now if it is a suitable size */
1015 		compact_capture_page(cc);
1016 	}
1017 
1018 out:
1019 	/* Release free pages and check accounting */
1020 	cc->nr_freepages -= release_freepages(&cc->freepages);
1021 	VM_BUG_ON(cc->nr_freepages != 0);
1022 
1023 	return ret;
1024 }
1025 
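/* Build a compact_control for one zone and run compact_zone() on it. */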
1026 static unsigned long compact_zone_order(struct zone *zone,
1027 				 int order, gfp_t gfp_mask,
1028 				 bool sync, bool *contended,
1029 				 struct page **page)
1030 {
1031 	unsigned long ret;
1032 	struct compact_control cc = {
1033 		.nr_freepages = 0,
1034 		.nr_migratepages = 0,
1035 		.order = order,
1036 		.migratetype = allocflags_to_migratetype(gfp_mask),
1037 		.zone = zone,
1038 		.sync = sync,
1039 		.page = page,
1040 	};
1041 	INIT_LIST_HEAD(&cc.freepages);
1042 	INIT_LIST_HEAD(&cc.migratepages);
1043 
1044 	ret = compact_zone(zone, &cc);
1045 
1046 	VM_BUG_ON(!list_empty(&cc.freepages));
1047 	VM_BUG_ON(!list_empty(&cc.migratepages));
1048 
1049 	*contended = cc.contended;
1050 	return ret;
1051 }
1052 
1053 int sysctl_extfrag_threshold = 500;
1054 
1055 /**
1056  * try_to_compact_pages - Direct compact to satisfy a high-order allocation
1057  * @zonelist: The zonelist used for the current allocation
1058  * @order: The order of the current allocation
1059  * @gfp_mask: The GFP mask of the current allocation
1060  * @nodemask: The allowed nodes to allocate from
1061  * @sync: Whether migration is synchronous or not
1062  * @contended: Return value that is true if compaction was aborted due to lock contention
1063  * @page: Optionally capture a free page of the requested order during compaction
1064  *
1065  * This is the main entry point for direct page compaction.
1066  */
1067 unsigned long try_to_compact_pages(struct zonelist *zonelist,
1068 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
1069 			bool sync, bool *contended, struct page **page)
1070 {
1071 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1072 	int may_enter_fs = gfp_mask & __GFP_FS;
1073 	int may_perform_io = gfp_mask & __GFP_IO;
1074 	struct zoneref *z;
1075 	struct zone *zone;
1076 	int rc = COMPACT_SKIPPED;
1077 	int alloc_flags = 0;
1078 
1079 	/* Check if the GFP flags allow compaction */
1080 	if (!order || !may_enter_fs || !may_perform_io)
1081 		return rc;
1082 
1083 	count_vm_event(COMPACTSTALL);
1084 
1085 #ifdef CONFIG_CMA
1086 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1087 		alloc_flags |= ALLOC_CMA;
1088 #endif
1089 	/* Compact each zone in the list */
1090 	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1091 								nodemask) {
1092 		int status;
1093 
1094 		status = compact_zone_order(zone, order, gfp_mask, sync,
1095 						contended, page);
1096 		rc = max(status, rc);
1097 
1098 		/* If a normal allocation would succeed, stop compacting */
1099 		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
1100 				      alloc_flags))
1101 			break;
1102 	}
1103 
1104 	return rc;
1105 }
1106 
1107 
1108 /* Compact all zones within a node */
1109 static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1110 {
1111 	int zoneid;
1112 	struct zone *zone;
1113 
1114 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
1115 
1116 		zone = &pgdat->node_zones[zoneid];
1117 		if (!populated_zone(zone))
1118 			continue;
1119 
1120 		cc->nr_freepages = 0;
1121 		cc->nr_migratepages = 0;
1122 		cc->zone = zone;
1123 		INIT_LIST_HEAD(&cc->freepages);
1124 		INIT_LIST_HEAD(&cc->migratepages);
1125 
1126 		if (cc->order == -1 || !compaction_deferred(zone, cc->order))
1127 			compact_zone(zone, cc);
1128 
1129 		if (cc->order > 0) {
1130 			int ok = zone_watermark_ok(zone, cc->order,
1131 						low_wmark_pages(zone), 0, 0);
1132 			if (ok && cc->order >= zone->compact_order_failed)
1133 				zone->compact_order_failed = cc->order + 1;
1134 			/* Currently async compaction is never deferred. */
1135 			else if (!ok && cc->sync)
1136 				defer_compaction(zone, cc->order);
1137 		}
1138 
1139 		VM_BUG_ON(!list_empty(&cc->freepages));
1140 		VM_BUG_ON(!list_empty(&cc->migratepages));
1141 	}
1142 
1143 	return 0;
1144 }
1145 
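/* Compact a node's zones for the given order; used by kswapd. */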
1146 int compact_pgdat(pg_data_t *pgdat, int order)
1147 {
1148 	struct compact_control cc = {
1149 		.order = order,
1150 		.sync = false,
1151 		.page = NULL,
1152 	};
1153 
1154 	return __compact_pgdat(pgdat, &cc);
1155 }
1156 
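/* Fully compact a node (order == -1); used by the proc and sysfs triggers. */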
1157 static int compact_node(int nid)
1158 {
1159 	struct compact_control cc = {
1160 		.order = -1,
1161 		.sync = true,
1162 		.page = NULL,
1163 	};
1164 
1165 	return __compact_pgdat(NODE_DATA(nid), &cc);
1166 }
1167 
1168 /* Compact all nodes in the system */
1169 static int compact_nodes(void)
1170 {
1171 	int nid;
1172 
1173 	/* Flush pending updates to the LRU lists */
1174 	lru_add_drain_all();
1175 
1176 	for_each_online_node(nid)
1177 		compact_node(nid);
1178 
1179 	return COMPACT_COMPLETE;
1180 }
1181 
1182 /* The written value is actually unused, all memory is compacted */
1183 int sysctl_compact_memory;
1184 
1185 /* This is the entry point for compacting all nodes via /proc/sys/vm */
1186 int sysctl_compaction_handler(struct ctl_table *table, int write,
1187 			void __user *buffer, size_t *length, loff_t *ppos)
1188 {
1189 	if (write)
1190 		return compact_nodes();
1191 
1192 	return 0;
1193 }
1194 
1195 int sysctl_extfrag_handler(struct ctl_table *table, int write,
1196 			void __user *buffer, size_t *length, loff_t *ppos)
1197 {
1198 	proc_dointvec_minmax(table, write, buffer, length, ppos);
1199 
1200 	return 0;
1201 }
1202 
1203 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
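/* sysfs handler: writing a node's 'compact' attribute compacts that node. */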
1204 ssize_t sysfs_compact_node(struct device *dev,
1205 			struct device_attribute *attr,
1206 			const char *buf, size_t count)
1207 {
1208 	int nid = dev->id;
1209 
1210 	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
1211 		/* Flush pending updates to the LRU lists */
1212 		lru_add_drain_all();
1213 
1214 		compact_node(nid);
1215 	}
1216 
1217 	return count;
1218 }
1219 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
1220 
1221 int compaction_register_node(struct node *node)
1222 {
1223 	return device_create_file(&node->dev, &dev_attr_compact);
1224 }
1225 
1226 void compaction_unregister_node(struct node *node)
1227 {
1228 	return device_remove_file(&node->dev, &dev_attr_compact);
1229 }
1230 #endif /* CONFIG_SYSFS && CONFIG_NUMA */
1231 
1232 #endif /* CONFIG_COMPACTION */
1233