xref: /openbmc/linux/mm/vmstat.c (revision b78412b8)
1 /*
2  *  linux/mm/vmstat.c
3  *
4  *  Manages VM statistics
5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6  *
7  *  zoned VM statistics
8  *  Copyright (C) 2006 Silicon Graphics, Inc.,
9  *		Christoph Lameter <christoph@lameter.com>
10  *  Copyright (C) 2008-2014 Christoph Lameter
11  */
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/err.h>
15 #include <linux/module.h>
16 #include <linux/slab.h>
17 #include <linux/cpu.h>
18 #include <linux/cpumask.h>
19 #include <linux/vmstat.h>
20 #include <linux/proc_fs.h>
21 #include <linux/seq_file.h>
22 #include <linux/debugfs.h>
23 #include <linux/sched.h>
24 #include <linux/math64.h>
25 #include <linux/writeback.h>
26 #include <linux/compaction.h>
27 #include <linux/mm_inline.h>
28 #include <linux/page_ext.h>
29 #include <linux/page_owner.h>
30 
31 #include "internal.h"
32 
33 #define NUMA_STATS_THRESHOLD (U16_MAX - 2)
34 
35 #ifdef CONFIG_VM_EVENT_COUNTERS
36 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
37 EXPORT_PER_CPU_SYMBOL(vm_event_states);
38 
39 static void sum_vm_events(unsigned long *ret)
40 {
41 	int cpu;
42 	int i;
43 
44 	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
45 
46 	for_each_online_cpu(cpu) {
47 		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
48 
49 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
50 			ret[i] += this->event[i];
51 	}
52 }
53 
54 /*
55  * Accumulate the vm event counters across all CPUs.
56  * The result is unavoidably approximate - it can change
57  * during and after execution of this function.
58  */
59 void all_vm_events(unsigned long *ret)
60 {
61 	get_online_cpus();
62 	sum_vm_events(ret);
63 	put_online_cpus();
64 }
65 EXPORT_SYMBOL_GPL(all_vm_events);
66 
67 /*
68  * Fold the foreign cpu events into our own.
69  *
70  * This adds to the events on one processor
71  * but keeps the global counts constant.
72  */
73 void vm_events_fold_cpu(int cpu)
74 {
75 	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
76 	int i;
77 
78 	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
79 		count_vm_events(i, fold_state->event[i]);
80 		fold_state->event[i] = 0;
81 	}
82 }
83 
84 #endif /* CONFIG_VM_EVENT_COUNTERS */
85 
86 /*
87  * Manage combined zone based / global counters
88  *
89  * vm_stat contains the global counters
90  */
91 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
92 atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
93 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
94 EXPORT_SYMBOL(vm_zone_stat);
95 EXPORT_SYMBOL(vm_numa_stat);
96 EXPORT_SYMBOL(vm_node_stat);
97 
98 #ifdef CONFIG_SMP
99 
100 int calculate_pressure_threshold(struct zone *zone)
101 {
102 	int threshold;
103 	int watermark_distance;
104 
105 	/*
106 	 * As vmstats are not up to date, there is drift between the estimated
107 	 * and real values. For high thresholds and a high number of CPUs, it
108 	 * is possible for the min watermark to be breached while the estimated
109 	 * value looks fine. The pressure threshold is a reduced value such
110 	 * that even the maximum amount of drift will not accidentally breach
111 	 * the min watermark
112 	 */
113 	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
114 	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
115 
116 	/*
117 	 * Maximum threshold is 125
118 	 */
119 	threshold = min(125, threshold);
120 
121 	return threshold;
122 }
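/*
 * Worked example (illustrative numbers, not from any particular config):
 * a zone with min_wmark = 800 pages and low_wmark = 1000 pages has a
 * watermark distance of 200 pages.  With 8 online CPUs the pressure
 * threshold becomes max(1, 200 / 8) = 25, well under the 125 cap, so at
 * most 8 * 25 = 200 pages of drift can accumulate - never more than the
 * gap between the low and min watermarks.
 */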
123 
124 int calculate_normal_threshold(struct zone *zone)
125 {
126 	int threshold;
127 	int mem;	/* memory in 128 MB units */
128 
129 	/*
130 	 * The threshold scales with the number of processors and the amount
131 	 * of memory per zone. More memory means that we can defer updates for
132 	 * longer, more processors could lead to more contention.
133 	 * fls() is used to have a cheap way of logarithmic scaling.
134 	 *
135 	 * Some sample thresholds:
136 	 *
137 	 * Threshold	Processors	(fls)	Zonesize	1+fls(mem)
138 	 * ------------------------------------------------------------------
139 	 * 8		1		1	0.9-1 GB	4
140 	 * 16		2		2	0.9-1 GB	4
141 	 * 20 		2		2	1-2 GB		5
142 	 * 24		2		2	2-4 GB		6
143 	 * 28		2		2	4-8 GB		7
144 	 * 32		2		2	8-16 GB		8
145 	 * 4		2		2	<128M		1
146 	 * 30		4		3	2-4 GB		5
147 	 * 48		4		3	8-16 GB		8
148 	 * 32		8		4	1-2 GB		4
149 	 * 32		8		4	0.9-1GB		4
150 	 * 10		16		5	<128M		1
151 	 * 40		16		5	900M		4
152 	 * 70		64		7	2-4 GB		5
153 	 * 84		64		7	4-8 GB		6
154 	 * 108		512		9	4-8 GB		6
155 	 * 125		1024		10	8-16 GB		8
156 	 * 125		1024		10	16-32 GB	9
157 	 */
158 
159 	mem = zone->managed_pages >> (27 - PAGE_SHIFT);
160 
161 	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
162 
163 	/*
164 	 * Maximum threshold is 125
165 	 */
166 	threshold = min(125, threshold);
167 
168 	return threshold;
169 }
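/*
 * Worked example (illustrative): with 2 online CPUs and roughly 0.9GB of
 * managed memory, mem = managed_pages >> (27 - PAGE_SHIFT) is about 7
 * (128MB units), so threshold = 2 * fls(2) * (1 + fls(7)) = 2 * 2 * 4 = 16,
 * matching the second row of the table above.  The 125 cap keeps the
 * per-cpu s8 differentials comfortably within range on large machines.
 */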
170 
171 /*
172  * Refresh the thresholds for each zone.
173  */
174 void refresh_zone_stat_thresholds(void)
175 {
176 	struct pglist_data *pgdat;
177 	struct zone *zone;
178 	int cpu;
179 	int threshold;
180 
181 	/* Zero current pgdat thresholds */
182 	for_each_online_pgdat(pgdat) {
183 		for_each_online_cpu(cpu) {
184 			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
185 		}
186 	}
187 
188 	for_each_populated_zone(zone) {
189 		struct pglist_data *pgdat = zone->zone_pgdat;
190 		unsigned long max_drift, tolerate_drift;
191 
192 		threshold = calculate_normal_threshold(zone);
193 
194 		for_each_online_cpu(cpu) {
195 			int pgdat_threshold;
196 
197 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
198 							= threshold;
199 
200 			/* Base nodestat threshold on the largest populated zone. */
201 			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
202 			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
203 				= max(threshold, pgdat_threshold);
204 		}
205 
206 		/*
207 		 * Only set percpu_drift_mark if there is a danger that
208 		 * NR_FREE_PAGES reports the low watermark is ok when in fact
209 		 * the min watermark could be breached by an allocation
210 		 */
211 		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
212 		max_drift = num_online_cpus() * threshold;
213 		if (max_drift > tolerate_drift)
214 			zone->percpu_drift_mark = high_wmark_pages(zone) +
215 					max_drift;
216 	}
217 }
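/*
 * Illustrative example: with a threshold of 125 and 64 online CPUs,
 * max_drift is 64 * 125 = 8000 pages.  If the gap between the low and
 * min watermarks is only 1000 pages, NR_FREE_PAGES could still look
 * healthy while the real value has breached min, so percpu_drift_mark
 * is set to high_wmark_pages(zone) + 8000 and callers such as
 * zone_watermark_ok_safe() fall back to the more expensive
 * zone_page_state_snapshot() below that mark.
 */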
218 
219 void set_pgdat_percpu_threshold(pg_data_t *pgdat,
220 				int (*calculate_pressure)(struct zone *))
221 {
222 	struct zone *zone;
223 	int cpu;
224 	int threshold;
225 	int i;
226 
227 	for (i = 0; i < pgdat->nr_zones; i++) {
228 		zone = &pgdat->node_zones[i];
229 		if (!zone->percpu_drift_mark)
230 			continue;
231 
232 		threshold = (*calculate_pressure)(zone);
233 		for_each_online_cpu(cpu)
234 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
235 							= threshold;
236 	}
237 }
238 
239 /*
240  * For use when we know that interrupts are disabled,
241  * or when we know that preemption is disabled and that
242  * particular counter cannot be updated from interrupt context.
243  */
244 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
245 			   long delta)
246 {
247 	struct per_cpu_pageset __percpu *pcp = zone->pageset;
248 	s8 __percpu *p = pcp->vm_stat_diff + item;
249 	long x;
250 	long t;
251 
252 	x = delta + __this_cpu_read(*p);
253 
254 	t = __this_cpu_read(pcp->stat_threshold);
255 
256 	if (unlikely(x > t || x < -t)) {
257 		zone_page_state_add(x, zone, item);
258 		x = 0;
259 	}
260 	__this_cpu_write(*p, x);
261 }
262 EXPORT_SYMBOL(__mod_zone_page_state);
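/*
 * Illustrative usage sketch (not taken from a specific call site): a
 * caller that already runs with interrupts disabled, e.g. while holding
 * zone->lock via spin_lock_irqsave(), may account freed pages directly:
 *
 *	spin_lock_irqsave(&zone->lock, flags);
 *	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 *	spin_unlock_irqrestore(&zone->lock, flags);
 *
 * Callers that cannot guarantee this must use mod_zone_page_state(),
 * which performs the necessary serialization itself.
 */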
263 
264 void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
265 				long delta)
266 {
267 	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
268 	s8 __percpu *p = pcp->vm_node_stat_diff + item;
269 	long x;
270 	long t;
271 
272 	x = delta + __this_cpu_read(*p);
273 
274 	t = __this_cpu_read(pcp->stat_threshold);
275 
276 	if (unlikely(x > t || x < -t)) {
277 		node_page_state_add(x, pgdat, item);
278 		x = 0;
279 	}
280 	__this_cpu_write(*p, x);
281 }
282 EXPORT_SYMBOL(__mod_node_page_state);
283 
284 /*
285  * Optimized increment and decrement functions.
286  *
287  * These are only for a single page and therefore can take a struct page *
288  * argument instead of struct zone *. This allows the inclusion of the code
289  * generated for page_zone(page) into the optimized functions.
290  *
291  * No overflow check is necessary and therefore the differential can be
292  * incremented or decremented in place which may allow the compilers to
293  * generate better code.
294  * The increment or decrement is known and therefore one boundary check can
295  * be omitted.
296  *
297  * NOTE: These functions are very performance sensitive. Change only
298  * with care.
299  *
300  * Some processors have inc/dec instructions that are atomic vs an interrupt.
301  * However, the code must first determine the differential location in a zone
302  * based on the processor number and then inc/dec the counter. There is no
303  * guarantee without disabling preemption that the processor will not change
304  * in between and therefore the atomicity vs. interrupt cannot be exploited
305  * in a useful way here.
306  */
307 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
308 {
309 	struct per_cpu_pageset __percpu *pcp = zone->pageset;
310 	s8 __percpu *p = pcp->vm_stat_diff + item;
311 	s8 v, t;
312 
313 	v = __this_cpu_inc_return(*p);
314 	t = __this_cpu_read(pcp->stat_threshold);
315 	if (unlikely(v > t)) {
316 		s8 overstep = t >> 1;
317 
318 		zone_page_state_add(v + overstep, zone, item);
319 		__this_cpu_write(*p, -overstep);
320 	}
321 }
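/*
 * Overstep example (illustrative): with a stat_threshold of 32 the
 * per-cpu differential may reach 33 before folding.  At that point
 * 33 + 16 = 49 is added to the zone counter and the differential is
 * reset to -16, so roughly one and a half thresholds worth of further
 * increments can be batched before the global counter is touched again.
 */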
322 
323 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
324 {
325 	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
326 	s8 __percpu *p = pcp->vm_node_stat_diff + item;
327 	s8 v, t;
328 
329 	v = __this_cpu_inc_return(*p);
330 	t = __this_cpu_read(pcp->stat_threshold);
331 	if (unlikely(v > t)) {
332 		s8 overstep = t >> 1;
333 
334 		node_page_state_add(v + overstep, pgdat, item);
335 		__this_cpu_write(*p, -overstep);
336 	}
337 }
338 
339 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
340 {
341 	__inc_zone_state(page_zone(page), item);
342 }
343 EXPORT_SYMBOL(__inc_zone_page_state);
344 
345 void __inc_node_page_state(struct page *page, enum node_stat_item item)
346 {
347 	__inc_node_state(page_pgdat(page), item);
348 }
349 EXPORT_SYMBOL(__inc_node_page_state);
350 
351 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
352 {
353 	struct per_cpu_pageset __percpu *pcp = zone->pageset;
354 	s8 __percpu *p = pcp->vm_stat_diff + item;
355 	s8 v, t;
356 
357 	v = __this_cpu_dec_return(*p);
358 	t = __this_cpu_read(pcp->stat_threshold);
359 	if (unlikely(v < -t)) {
360 		s8 overstep = t >> 1;
361 
362 		zone_page_state_add(v - overstep, zone, item);
363 		__this_cpu_write(*p, overstep);
364 	}
365 }
366 
367 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
368 {
369 	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
370 	s8 __percpu *p = pcp->vm_node_stat_diff + item;
371 	s8 v, t;
372 
373 	v = __this_cpu_dec_return(*p);
374 	t = __this_cpu_read(pcp->stat_threshold);
375 	if (unlikely(v < -t)) {
376 		s8 overstep = t >> 1;
377 
378 		node_page_state_add(v - overstep, pgdat, item);
379 		__this_cpu_write(*p, overstep);
380 	}
381 }
382 
383 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
384 {
385 	__dec_zone_state(page_zone(page), item);
386 }
387 EXPORT_SYMBOL(__dec_zone_page_state);
388 
389 void __dec_node_page_state(struct page *page, enum node_stat_item item)
390 {
391 	__dec_node_state(page_pgdat(page), item);
392 }
393 EXPORT_SYMBOL(__dec_node_page_state);
394 
395 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
396 /*
397  * If we have cmpxchg_local support then we do not need to incur the overhead
398  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
399  *
400  * mod_state() modifies the zone counter state through atomic per cpu
401  * operations.
402  *
403  * Overstep mode specifies how overstep should be handled:
404  *     0       No overstepping
405  *     1       Overstepping half of threshold
406  *     -1      Overstepping minus half of threshold
407  */
408 static inline void mod_zone_state(struct zone *zone,
409        enum zone_stat_item item, long delta, int overstep_mode)
410 {
411 	struct per_cpu_pageset __percpu *pcp = zone->pageset;
412 	s8 __percpu *p = pcp->vm_stat_diff + item;
413 	long o, n, t, z;
414 
415 	do {
416 		z = 0;  /* overflow to zone counters */
417 
418 		/*
419 		 * The fetching of the stat_threshold is racy. We may apply
420 		 * a counter threshold to the wrong cpu if we get
421 		 * rescheduled while executing here. However, the next
422 		 * counter update will apply the threshold again and
423 		 * therefore bring the counter under the threshold again.
424 		 *
425 		 * Most of the time the thresholds are the same anyway
426 		 * for all cpus in a zone.
427 		 */
428 		t = this_cpu_read(pcp->stat_threshold);
429 
430 		o = this_cpu_read(*p);
431 		n = delta + o;
432 
433 		if (n > t || n < -t) {
434 			int os = overstep_mode * (t >> 1);
435 
436 			/* Overflow must be added to zone counters */
437 			z = n + os;
438 			n = -os;
439 		}
440 	} while (this_cpu_cmpxchg(*p, o, n) != o);
441 
442 	if (z)
443 		zone_page_state_add(z, zone, item);
444 }
445 
446 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
447 			 long delta)
448 {
449 	mod_zone_state(zone, item, delta, 0);
450 }
451 EXPORT_SYMBOL(mod_zone_page_state);
452 
453 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
454 {
455 	mod_zone_state(page_zone(page), item, 1, 1);
456 }
457 EXPORT_SYMBOL(inc_zone_page_state);
458 
459 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
460 {
461 	mod_zone_state(page_zone(page), item, -1, -1);
462 }
463 EXPORT_SYMBOL(dec_zone_page_state);
464 
465 static inline void mod_node_state(struct pglist_data *pgdat,
466        enum node_stat_item item, int delta, int overstep_mode)
467 {
468 	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
469 	s8 __percpu *p = pcp->vm_node_stat_diff + item;
470 	long o, n, t, z;
471 
472 	do {
473 		z = 0;  /* overflow to node counters */
474 
475 		/*
476 		 * The fetching of the stat_threshold is racy. We may apply
477 		 * a counter threshold to the wrong cpu if we get
478 		 * rescheduled while executing here. However, the next
479 		 * counter update will apply the threshold again and
480 		 * therefore bring the counter under the threshold again.
481 		 *
482 		 * Most of the time the thresholds are the same anyway
483 		 * for all cpus in a node.
484 		 */
485 		t = this_cpu_read(pcp->stat_threshold);
486 
487 		o = this_cpu_read(*p);
488 		n = delta + o;
489 
490 		if (n > t || n < -t) {
491 			int os = overstep_mode * (t >> 1);
492 
493 			/* Overflow must be added to node counters */
494 			z = n + os;
495 			n = -os;
496 		}
497 	} while (this_cpu_cmpxchg(*p, o, n) != o);
498 
499 	if (z)
500 		node_page_state_add(z, pgdat, item);
501 }
502 
503 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
504 					long delta)
505 {
506 	mod_node_state(pgdat, item, delta, 0);
507 }
508 EXPORT_SYMBOL(mod_node_page_state);
509 
510 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
511 {
512 	mod_node_state(pgdat, item, 1, 1);
513 }
514 
515 void inc_node_page_state(struct page *page, enum node_stat_item item)
516 {
517 	mod_node_state(page_pgdat(page), item, 1, 1);
518 }
519 EXPORT_SYMBOL(inc_node_page_state);
520 
521 void dec_node_page_state(struct page *page, enum node_stat_item item)
522 {
523 	mod_node_state(page_pgdat(page), item, -1, -1);
524 }
525 EXPORT_SYMBOL(dec_node_page_state);
526 #else
527 /*
528  * Use interrupt disable to serialize counter updates
529  */
530 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
531 			 long delta)
532 {
533 	unsigned long flags;
534 
535 	local_irq_save(flags);
536 	__mod_zone_page_state(zone, item, delta);
537 	local_irq_restore(flags);
538 }
539 EXPORT_SYMBOL(mod_zone_page_state);
540 
541 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
542 {
543 	unsigned long flags;
544 	struct zone *zone;
545 
546 	zone = page_zone(page);
547 	local_irq_save(flags);
548 	__inc_zone_state(zone, item);
549 	local_irq_restore(flags);
550 }
551 EXPORT_SYMBOL(inc_zone_page_state);
552 
553 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
554 {
555 	unsigned long flags;
556 
557 	local_irq_save(flags);
558 	__dec_zone_page_state(page, item);
559 	local_irq_restore(flags);
560 }
561 EXPORT_SYMBOL(dec_zone_page_state);
562 
563 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
564 {
565 	unsigned long flags;
566 
567 	local_irq_save(flags);
568 	__inc_node_state(pgdat, item);
569 	local_irq_restore(flags);
570 }
571 EXPORT_SYMBOL(inc_node_state);
572 
573 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
574 					long delta)
575 {
576 	unsigned long flags;
577 
578 	local_irq_save(flags);
579 	__mod_node_page_state(pgdat, item, delta);
580 	local_irq_restore(flags);
581 }
582 EXPORT_SYMBOL(mod_node_page_state);
583 
584 void inc_node_page_state(struct page *page, enum node_stat_item item)
585 {
586 	unsigned long flags;
587 	struct pglist_data *pgdat;
588 
589 	pgdat = page_pgdat(page);
590 	local_irq_save(flags);
591 	__inc_node_state(pgdat, item);
592 	local_irq_restore(flags);
593 }
594 EXPORT_SYMBOL(inc_node_page_state);
595 
596 void dec_node_page_state(struct page *page, enum node_stat_item item)
597 {
598 	unsigned long flags;
599 
600 	local_irq_save(flags);
601 	__dec_node_page_state(page, item);
602 	local_irq_restore(flags);
603 }
604 EXPORT_SYMBOL(dec_node_page_state);
605 #endif
606 
607 /*
608  * Fold a differential into the global counters.
609  * Returns the number of counters updated.
610  */
611 #ifdef CONFIG_NUMA
612 static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
613 {
614 	int i;
615 	int changes = 0;
616 
617 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
618 		if (zone_diff[i]) {
619 			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
620 			changes++;
621 	}
622 
623 	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
624 		if (numa_diff[i]) {
625 			atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
626 			changes++;
627 	}
628 
629 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
630 		if (node_diff[i]) {
631 			atomic_long_add(node_diff[i], &vm_node_stat[i]);
632 			changes++;
633 	}
634 	return changes;
635 }
636 #else
637 static int fold_diff(int *zone_diff, int *node_diff)
638 {
639 	int i;
640 	int changes = 0;
641 
642 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
643 		if (zone_diff[i]) {
644 			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
645 			changes++;
646 	}
647 
648 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
649 		if (node_diff[i]) {
650 			atomic_long_add(node_diff[i], &vm_node_stat[i]);
651 			changes++;
652 	}
653 	return changes;
654 }
655 #endif /* CONFIG_NUMA */
656 
657 /*
658  * Update the zone counters for the current cpu.
659  *
660  * Note that refresh_cpu_vm_stats strives to only access
661  * node local memory. The per cpu pagesets on remote zones are placed
662  * in the memory local to the processor using that pageset. So the
663  * loop over all zones will access a series of cachelines local to
664  * the processor.
665  *
666  * The call to zone_page_state_add updates the cachelines with the
667  * statistics in the remote zone struct as well as the global cachelines
668  * with the global counters. These could cause remote node cache line
669  * bouncing and will have to be only done when necessary.
670  *
671  * The function returns the number of global counters updated.
672  */
673 static int refresh_cpu_vm_stats(bool do_pagesets)
674 {
675 	struct pglist_data *pgdat;
676 	struct zone *zone;
677 	int i;
678 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
679 #ifdef CONFIG_NUMA
680 	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
681 #endif
682 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
683 	int changes = 0;
684 
685 	for_each_populated_zone(zone) {
686 		struct per_cpu_pageset __percpu *p = zone->pageset;
687 
688 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
689 			int v;
690 
691 			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
692 			if (v) {
693 
694 				atomic_long_add(v, &zone->vm_stat[i]);
695 				global_zone_diff[i] += v;
696 #ifdef CONFIG_NUMA
697 				/* 3 seconds idle till flush */
698 				__this_cpu_write(p->expire, 3);
699 #endif
700 			}
701 		}
702 #ifdef CONFIG_NUMA
703 		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
704 			int v;
705 
706 			v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
707 			if (v) {
708 
709 				atomic_long_add(v, &zone->vm_numa_stat[i]);
710 				global_numa_diff[i] += v;
711 				__this_cpu_write(p->expire, 3);
712 			}
713 		}
714 
715 		if (do_pagesets) {
716 			cond_resched();
717 			/*
718 			 * Deal with draining the remote pageset of this
719 			 * processor
720 			 *
721 			 * Check if there are pages remaining in this pageset;
722 			 * if not, there is nothing to expire.
723 			 */
724 			if (!__this_cpu_read(p->expire) ||
725 			       !__this_cpu_read(p->pcp.count))
726 				continue;
727 
728 			/*
729 			 * We never drain zones local to this processor.
730 			 */
731 			if (zone_to_nid(zone) == numa_node_id()) {
732 				__this_cpu_write(p->expire, 0);
733 				continue;
734 			}
735 
736 			if (__this_cpu_dec_return(p->expire))
737 				continue;
738 
739 			if (__this_cpu_read(p->pcp.count)) {
740 				drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
741 				changes++;
742 			}
743 		}
744 #endif
745 	}
746 
747 	for_each_online_pgdat(pgdat) {
748 		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
749 
750 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
751 			int v;
752 
753 			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
754 			if (v) {
755 				atomic_long_add(v, &pgdat->vm_stat[i]);
756 				global_node_diff[i] += v;
757 			}
758 		}
759 	}
760 
761 #ifdef CONFIG_NUMA
762 	changes += fold_diff(global_zone_diff, global_numa_diff,
763 			     global_node_diff);
764 #else
765 	changes += fold_diff(global_zone_diff, global_node_diff);
766 #endif
767 	return changes;
768 }
769 
770 /*
771  * Fold the data for an offline cpu into the global array.
772  * There cannot be any access by the offline cpu and therefore
773  * synchronization is simplified.
774  */
775 void cpu_vm_stats_fold(int cpu)
776 {
777 	struct pglist_data *pgdat;
778 	struct zone *zone;
779 	int i;
780 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
781 #ifdef CONFIG_NUMA
782 	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
783 #endif
784 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
785 
786 	for_each_populated_zone(zone) {
787 		struct per_cpu_pageset *p;
788 
789 		p = per_cpu_ptr(zone->pageset, cpu);
790 
791 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
792 			if (p->vm_stat_diff[i]) {
793 				int v;
794 
795 				v = p->vm_stat_diff[i];
796 				p->vm_stat_diff[i] = 0;
797 				atomic_long_add(v, &zone->vm_stat[i]);
798 				global_zone_diff[i] += v;
799 			}
800 
801 #ifdef CONFIG_NUMA
802 		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
803 			if (p->vm_numa_stat_diff[i]) {
804 				int v;
805 
806 				v = p->vm_numa_stat_diff[i];
807 				p->vm_numa_stat_diff[i] = 0;
808 				atomic_long_add(v, &zone->vm_numa_stat[i]);
809 				global_numa_diff[i] += v;
810 			}
811 #endif
812 	}
813 
814 	for_each_online_pgdat(pgdat) {
815 		struct per_cpu_nodestat *p;
816 
817 		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
818 
819 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
820 			if (p->vm_node_stat_diff[i]) {
821 				int v;
822 
823 				v = p->vm_node_stat_diff[i];
824 				p->vm_node_stat_diff[i] = 0;
825 				atomic_long_add(v, &pgdat->vm_stat[i]);
826 				global_node_diff[i] += v;
827 			}
828 	}
829 
830 #ifdef CONFIG_NUMA
831 	fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
832 #else
833 	fold_diff(global_zone_diff, global_node_diff);
834 #endif
835 }
836 
837 /*
838  * This is only called if !populated_zone(zone), which implies no other users of
839  * pset->vm_stat_diff[] exist.
840  */
841 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
842 {
843 	int i;
844 
845 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
846 		if (pset->vm_stat_diff[i]) {
847 			int v = pset->vm_stat_diff[i];
848 			pset->vm_stat_diff[i] = 0;
849 			atomic_long_add(v, &zone->vm_stat[i]);
850 			atomic_long_add(v, &vm_zone_stat[i]);
851 		}
852 
853 #ifdef CONFIG_NUMA
854 	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
855 		if (pset->vm_numa_stat_diff[i]) {
856 			int v = pset->vm_numa_stat_diff[i];
857 
858 			pset->vm_numa_stat_diff[i] = 0;
859 			atomic_long_add(v, &zone->vm_numa_stat[i]);
860 			atomic_long_add(v, &vm_numa_stat[i]);
861 		}
862 #endif
863 }
864 #endif
865 
866 #ifdef CONFIG_NUMA
867 void __inc_numa_state(struct zone *zone,
868 				 enum numa_stat_item item)
869 {
870 	struct per_cpu_pageset __percpu *pcp = zone->pageset;
871 	u16 __percpu *p = pcp->vm_numa_stat_diff + item;
872 	u16 v;
873 
874 	v = __this_cpu_inc_return(*p);
875 
876 	if (unlikely(v > NUMA_STATS_THRESHOLD)) {
877 		zone_numa_state_add(v, zone, item);
878 		__this_cpu_write(*p, 0);
879 	}
880 }
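/*
 * Unlike the zone and node counters above, the NUMA event differentials
 * are u16 and only ever incremented, so a per-cpu counter absorbs
 * roughly 64K events (NUMA_STATS_THRESHOLD is U16_MAX - 2) before a
 * single zone_numa_state_add() folds them into the global counter and
 * the differential restarts from zero.
 */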
881 
882 /*
883  * Determine the per node value of a stat item. This function
884  * is called frequently in a NUMA machine, so try to be as
885  * frugal as possible.
886  */
887 unsigned long sum_zone_node_page_state(int node,
888 				 enum zone_stat_item item)
889 {
890 	struct zone *zones = NODE_DATA(node)->node_zones;
891 	int i;
892 	unsigned long count = 0;
893 
894 	for (i = 0; i < MAX_NR_ZONES; i++)
895 		count += zone_page_state(zones + i, item);
896 
897 	return count;
898 }
899 
900 /*
901  * Determine the per node value of a numa stat item. To avoid deviation,
902  * the per cpu stat number in vm_numa_stat_diff[] is also included.
903  */
904 unsigned long sum_zone_numa_state(int node,
905 				 enum numa_stat_item item)
906 {
907 	struct zone *zones = NODE_DATA(node)->node_zones;
908 	int i;
909 	unsigned long count = 0;
910 
911 	for (i = 0; i < MAX_NR_ZONES; i++)
912 		count += zone_numa_state_snapshot(zones + i, item);
913 
914 	return count;
915 }
916 
917 /*
918  * Determine the per node value of a stat item.
919  */
920 unsigned long node_page_state(struct pglist_data *pgdat,
921 				enum node_stat_item item)
922 {
923 	long x = atomic_long_read(&pgdat->vm_stat[item]);
924 #ifdef CONFIG_SMP
925 	if (x < 0)
926 		x = 0;
927 #endif
928 	return x;
929 }
930 #endif
931 
932 #ifdef CONFIG_COMPACTION
933 
934 struct contig_page_info {
935 	unsigned long free_pages;
936 	unsigned long free_blocks_total;
937 	unsigned long free_blocks_suitable;
938 };
939 
940 /*
941  * Calculate the number of free pages in a zone, how many contiguous
942  * pages are free and how many are large enough to satisfy an allocation of
943  * the target size. Note that this function makes no attempt to estimate
944  * how many suitable free blocks there *might* be if MOVABLE pages were
945  * migrated. Calculating that is possible, but expensive and can be
946  * figured out from userspace
947  */
948 static void fill_contig_page_info(struct zone *zone,
949 				unsigned int suitable_order,
950 				struct contig_page_info *info)
951 {
952 	unsigned int order;
953 
954 	info->free_pages = 0;
955 	info->free_blocks_total = 0;
956 	info->free_blocks_suitable = 0;
957 
958 	for (order = 0; order < MAX_ORDER; order++) {
959 		unsigned long blocks;
960 
961 		/* Count number of free blocks */
962 		blocks = zone->free_area[order].nr_free;
963 		info->free_blocks_total += blocks;
964 
965 		/* Count free base pages */
966 		info->free_pages += blocks << order;
967 
968 		/* Count the suitable free blocks */
969 		if (order >= suitable_order)
970 			info->free_blocks_suitable += blocks <<
971 						(order - suitable_order);
972 	}
973 }
974 
975 /*
976  * A fragmentation index only makes sense if an allocation of a requested
977  * size would fail. If that is true, the fragmentation index indicates
978  * whether external fragmentation or a lack of memory was the problem.
979  * The value can be used to determine if page reclaim or compaction
980  * should be used
981  */
982 static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
983 {
984 	unsigned long requested = 1UL << order;
985 
986 	if (WARN_ON_ONCE(order >= MAX_ORDER))
987 		return 0;
988 
989 	if (!info->free_blocks_total)
990 		return 0;
991 
992 	/* Fragmentation index only makes sense when a request would fail */
993 	if (info->free_blocks_suitable)
994 		return -1000;
995 
996 	/*
997 	 * Index is between 0 and 1 so return within 3 decimal places
998 	 *
999 	 * 0 => allocation would fail due to lack of memory
1000 	 * 1 => allocation would fail due to fragmentation
1001 	 */
1002 	return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
1003 }
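/*
 * Worked example (illustrative values): for an order-3 request
 * (requested = 8 pages) with free_pages = 1000, free_blocks_total = 500
 * and no suitable blocks, the index is
 * 1000 - (1000 + 1000 * 1000 / 8) / 500 = 1000 - 252 = 748, i.e. 0.748,
 * pointing at external fragmentation rather than a shortage of memory.
 */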
1004 
1005 /* Same as __fragmentation index but allocs contig_page_info on stack */
1006 int fragmentation_index(struct zone *zone, unsigned int order)
1007 {
1008 	struct contig_page_info info;
1009 
1010 	fill_contig_page_info(zone, order, &info);
1011 	return __fragmentation_index(order, &info);
1012 }
1013 #endif
1014 
1015 #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
1016 #ifdef CONFIG_ZONE_DMA
1017 #define TEXT_FOR_DMA(xx) xx "_dma",
1018 #else
1019 #define TEXT_FOR_DMA(xx)
1020 #endif
1021 
1022 #ifdef CONFIG_ZONE_DMA32
1023 #define TEXT_FOR_DMA32(xx) xx "_dma32",
1024 #else
1025 #define TEXT_FOR_DMA32(xx)
1026 #endif
1027 
1028 #ifdef CONFIG_HIGHMEM
1029 #define TEXT_FOR_HIGHMEM(xx) xx "_high",
1030 #else
1031 #define TEXT_FOR_HIGHMEM(xx)
1032 #endif
1033 
1034 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1035 					TEXT_FOR_HIGHMEM(xx) xx "_movable",
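/*
 * For example, if CONFIG_ZONE_DMA, CONFIG_ZONE_DMA32 and CONFIG_HIGHMEM
 * were all enabled, TEXTS_FOR_ZONES("pgalloc") would expand to
 * "pgalloc_dma", "pgalloc_dma32", "pgalloc_normal", "pgalloc_high",
 * "pgalloc_movable", - one name per zone, in zone order.
 */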
1036 
1037 const char * const vmstat_text[] = {
1038 	/* enum zone_stat_item counters */
1039 	"nr_free_pages",
1040 	"nr_zone_inactive_anon",
1041 	"nr_zone_active_anon",
1042 	"nr_zone_inactive_file",
1043 	"nr_zone_active_file",
1044 	"nr_zone_unevictable",
1045 	"nr_zone_write_pending",
1046 	"nr_mlock",
1047 	"nr_page_table_pages",
1048 	"nr_kernel_stack",
1049 	"nr_bounce",
1050 #if IS_ENABLED(CONFIG_ZSMALLOC)
1051 	"nr_zspages",
1052 #endif
1053 	"nr_free_cma",
1054 
1055 	/* enum numa_stat_item counters */
1056 #ifdef CONFIG_NUMA
1057 	"numa_hit",
1058 	"numa_miss",
1059 	"numa_foreign",
1060 	"numa_interleave",
1061 	"numa_local",
1062 	"numa_other",
1063 #endif
1064 
1065 	/* Node-based counters */
1066 	"nr_inactive_anon",
1067 	"nr_active_anon",
1068 	"nr_inactive_file",
1069 	"nr_active_file",
1070 	"nr_unevictable",
1071 	"nr_slab_reclaimable",
1072 	"nr_slab_unreclaimable",
1073 	"nr_isolated_anon",
1074 	"nr_isolated_file",
1075 	"workingset_refault",
1076 	"workingset_activate",
1077 	"workingset_nodereclaim",
1078 	"nr_anon_pages",
1079 	"nr_mapped",
1080 	"nr_file_pages",
1081 	"nr_dirty",
1082 	"nr_writeback",
1083 	"nr_writeback_temp",
1084 	"nr_shmem",
1085 	"nr_shmem_hugepages",
1086 	"nr_shmem_pmdmapped",
1087 	"nr_anon_transparent_hugepages",
1088 	"nr_unstable",
1089 	"nr_vmscan_write",
1090 	"nr_vmscan_immediate_reclaim",
1091 	"nr_dirtied",
1092 	"nr_written",
1093 
1094 	/* enum writeback_stat_item counters */
1095 	"nr_dirty_threshold",
1096 	"nr_dirty_background_threshold",
1097 
1098 #ifdef CONFIG_VM_EVENT_COUNTERS
1099 	/* enum vm_event_item counters */
1100 	"pgpgin",
1101 	"pgpgout",
1102 	"pswpin",
1103 	"pswpout",
1104 
1105 	TEXTS_FOR_ZONES("pgalloc")
1106 	TEXTS_FOR_ZONES("allocstall")
1107 	TEXTS_FOR_ZONES("pgskip")
1108 
1109 	"pgfree",
1110 	"pgactivate",
1111 	"pgdeactivate",
1112 	"pglazyfree",
1113 
1114 	"pgfault",
1115 	"pgmajfault",
1116 	"pglazyfreed",
1117 
1118 	"pgrefill",
1119 	"pgsteal_kswapd",
1120 	"pgsteal_direct",
1121 	"pgscan_kswapd",
1122 	"pgscan_direct",
1123 	"pgscan_direct_throttle",
1124 
1125 #ifdef CONFIG_NUMA
1126 	"zone_reclaim_failed",
1127 #endif
1128 	"pginodesteal",
1129 	"slabs_scanned",
1130 	"kswapd_inodesteal",
1131 	"kswapd_low_wmark_hit_quickly",
1132 	"kswapd_high_wmark_hit_quickly",
1133 	"pageoutrun",
1134 
1135 	"pgrotated",
1136 
1137 	"drop_pagecache",
1138 	"drop_slab",
1139 	"oom_kill",
1140 
1141 #ifdef CONFIG_NUMA_BALANCING
1142 	"numa_pte_updates",
1143 	"numa_huge_pte_updates",
1144 	"numa_hint_faults",
1145 	"numa_hint_faults_local",
1146 	"numa_pages_migrated",
1147 #endif
1148 #ifdef CONFIG_MIGRATION
1149 	"pgmigrate_success",
1150 	"pgmigrate_fail",
1151 #endif
1152 #ifdef CONFIG_COMPACTION
1153 	"compact_migrate_scanned",
1154 	"compact_free_scanned",
1155 	"compact_isolated",
1156 	"compact_stall",
1157 	"compact_fail",
1158 	"compact_success",
1159 	"compact_daemon_wake",
1160 	"compact_daemon_migrate_scanned",
1161 	"compact_daemon_free_scanned",
1162 #endif
1163 
1164 #ifdef CONFIG_HUGETLB_PAGE
1165 	"htlb_buddy_alloc_success",
1166 	"htlb_buddy_alloc_fail",
1167 #endif
1168 	"unevictable_pgs_culled",
1169 	"unevictable_pgs_scanned",
1170 	"unevictable_pgs_rescued",
1171 	"unevictable_pgs_mlocked",
1172 	"unevictable_pgs_munlocked",
1173 	"unevictable_pgs_cleared",
1174 	"unevictable_pgs_stranded",
1175 
1176 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1177 	"thp_fault_alloc",
1178 	"thp_fault_fallback",
1179 	"thp_collapse_alloc",
1180 	"thp_collapse_alloc_failed",
1181 	"thp_file_alloc",
1182 	"thp_file_mapped",
1183 	"thp_split_page",
1184 	"thp_split_page_failed",
1185 	"thp_deferred_split_page",
1186 	"thp_split_pmd",
1187 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1188 	"thp_split_pud",
1189 #endif
1190 	"thp_zero_page_alloc",
1191 	"thp_zero_page_alloc_failed",
1192 	"thp_swpout",
1193 	"thp_swpout_fallback",
1194 #endif
1195 #ifdef CONFIG_MEMORY_BALLOON
1196 	"balloon_inflate",
1197 	"balloon_deflate",
1198 #ifdef CONFIG_BALLOON_COMPACTION
1199 	"balloon_migrate",
1200 #endif
1201 #endif /* CONFIG_MEMORY_BALLOON */
1202 #ifdef CONFIG_DEBUG_TLBFLUSH
1203 #ifdef CONFIG_SMP
1204 	"nr_tlb_remote_flush",
1205 	"nr_tlb_remote_flush_received",
1206 #endif /* CONFIG_SMP */
1207 	"nr_tlb_local_flush_all",
1208 	"nr_tlb_local_flush_one",
1209 #endif /* CONFIG_DEBUG_TLBFLUSH */
1210 
1211 #ifdef CONFIG_DEBUG_VM_VMACACHE
1212 	"vmacache_find_calls",
1213 	"vmacache_find_hits",
1214 	"vmacache_full_flushes",
1215 #endif
1216 #ifdef CONFIG_SWAP
1217 	"swap_ra",
1218 	"swap_ra_hit",
1219 #endif
1220 #endif /* CONFIG_VM_EVENT_COUNTERS */
1221 };
1222 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
1223 
1224 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1225      defined(CONFIG_PROC_FS)
1226 static void *frag_start(struct seq_file *m, loff_t *pos)
1227 {
1228 	pg_data_t *pgdat;
1229 	loff_t node = *pos;
1230 
1231 	for (pgdat = first_online_pgdat();
1232 	     pgdat && node;
1233 	     pgdat = next_online_pgdat(pgdat))
1234 		--node;
1235 
1236 	return pgdat;
1237 }
1238 
1239 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1240 {
1241 	pg_data_t *pgdat = (pg_data_t *)arg;
1242 
1243 	(*pos)++;
1244 	return next_online_pgdat(pgdat);
1245 }
1246 
1247 static void frag_stop(struct seq_file *m, void *arg)
1248 {
1249 }
1250 
1251 /*
1252  * Walk zones in a node and print using a callback.
1253  * If @assert_populated is true, only use callback for zones that are populated.
1254  */
1255 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
1256 		bool assert_populated, bool nolock,
1257 		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1258 {
1259 	struct zone *zone;
1260 	struct zone *node_zones = pgdat->node_zones;
1261 	unsigned long flags;
1262 
1263 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1264 		if (assert_populated && !populated_zone(zone))
1265 			continue;
1266 
1267 		if (!nolock)
1268 			spin_lock_irqsave(&zone->lock, flags);
1269 		print(m, pgdat, zone);
1270 		if (!nolock)
1271 			spin_unlock_irqrestore(&zone->lock, flags);
1272 	}
1273 }
1274 #endif
1275 
1276 #ifdef CONFIG_PROC_FS
1277 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1278 						struct zone *zone)
1279 {
1280 	int order;
1281 
1282 	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1283 	for (order = 0; order < MAX_ORDER; ++order)
1284 		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
1285 	seq_putc(m, '\n');
1286 }
1287 
1288 /*
1289  * This walks the free areas for each zone.
1290  */
1291 static int frag_show(struct seq_file *m, void *arg)
1292 {
1293 	pg_data_t *pgdat = (pg_data_t *)arg;
1294 	walk_zones_in_node(m, pgdat, true, false, frag_show_print);
1295 	return 0;
1296 }
1297 
1298 static void pagetypeinfo_showfree_print(struct seq_file *m,
1299 					pg_data_t *pgdat, struct zone *zone)
1300 {
1301 	int order, mtype;
1302 
1303 	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1304 		seq_printf(m, "Node %4d, zone %8s, type %12s ",
1305 					pgdat->node_id,
1306 					zone->name,
1307 					migratetype_names[mtype]);
1308 		for (order = 0; order < MAX_ORDER; ++order) {
1309 			unsigned long freecount = 0;
1310 			struct free_area *area;
1311 			struct list_head *curr;
1312 
1313 			area = &(zone->free_area[order]);
1314 
1315 			list_for_each(curr, &area->free_list[mtype])
1316 				freecount++;
1317 			seq_printf(m, "%6lu ", freecount);
1318 		}
1319 		seq_putc(m, '\n');
1320 	}
1321 }
1322 
1323 /* Print out the free pages at each order for each migratetype */
1324 static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
1325 {
1326 	int order;
1327 	pg_data_t *pgdat = (pg_data_t *)arg;
1328 
1329 	/* Print header */
1330 	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1331 	for (order = 0; order < MAX_ORDER; ++order)
1332 		seq_printf(m, "%6d ", order);
1333 	seq_putc(m, '\n');
1334 
1335 	walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
1336 
1337 	return 0;
1338 }
1339 
1340 static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1341 					pg_data_t *pgdat, struct zone *zone)
1342 {
1343 	int mtype;
1344 	unsigned long pfn;
1345 	unsigned long start_pfn = zone->zone_start_pfn;
1346 	unsigned long end_pfn = zone_end_pfn(zone);
1347 	unsigned long count[MIGRATE_TYPES] = { 0, };
1348 
1349 	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1350 		struct page *page;
1351 
1352 		page = pfn_to_online_page(pfn);
1353 		if (!page)
1354 			continue;
1355 
1356 		/* Watch for unexpected holes punched in the memmap */
1357 		if (!memmap_valid_within(pfn, page, zone))
1358 			continue;
1359 
1360 		if (page_zone(page) != zone)
1361 			continue;
1362 
1363 		mtype = get_pageblock_migratetype(page);
1364 
1365 		if (mtype < MIGRATE_TYPES)
1366 			count[mtype]++;
1367 	}
1368 
1369 	/* Print counts */
1370 	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1371 	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1372 		seq_printf(m, "%12lu ", count[mtype]);
1373 	seq_putc(m, '\n');
1374 }
1375 
1376 /* Print out the number of pageblocks for each migratetype */
1377 static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1378 {
1379 	int mtype;
1380 	pg_data_t *pgdat = (pg_data_t *)arg;
1381 
1382 	seq_printf(m, "\n%-23s", "Number of blocks type ");
1383 	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1384 		seq_printf(m, "%12s ", migratetype_names[mtype]);
1385 	seq_putc(m, '\n');
1386 	walk_zones_in_node(m, pgdat, true, false,
1387 		pagetypeinfo_showblockcount_print);
1388 
1389 	return 0;
1390 }
1391 
1392 /*
1393  * Print out the number of pageblocks for each migratetype that contain pages
1394  * of other types. This gives an indication of how well fallbacks are being
1395  * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1396  * to determine what is going on
1397  */
1398 static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1399 {
1400 #ifdef CONFIG_PAGE_OWNER
1401 	int mtype;
1402 
1403 	if (!static_branch_unlikely(&page_owner_inited))
1404 		return;
1405 
1406 	drain_all_pages(NULL);
1407 
1408 	seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1409 	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1410 		seq_printf(m, "%12s ", migratetype_names[mtype]);
1411 	seq_putc(m, '\n');
1412 
1413 	walk_zones_in_node(m, pgdat, true, true,
1414 		pagetypeinfo_showmixedcount_print);
1415 #endif /* CONFIG_PAGE_OWNER */
1416 }
1417 
1418 /*
1419  * This prints out statistics in relation to grouping pages by mobility.
1420  * It is expensive to collect so do not constantly read the file.
1421  */
1422 static int pagetypeinfo_show(struct seq_file *m, void *arg)
1423 {
1424 	pg_data_t *pgdat = (pg_data_t *)arg;
1425 
1426 	/* check memoryless node */
1427 	if (!node_state(pgdat->node_id, N_MEMORY))
1428 		return 0;
1429 
1430 	seq_printf(m, "Page block order: %d\n", pageblock_order);
1431 	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
1432 	seq_putc(m, '\n');
1433 	pagetypeinfo_showfree(m, pgdat);
1434 	pagetypeinfo_showblockcount(m, pgdat);
1435 	pagetypeinfo_showmixedcount(m, pgdat);
1436 
1437 	return 0;
1438 }
1439 
1440 static const struct seq_operations fragmentation_op = {
1441 	.start	= frag_start,
1442 	.next	= frag_next,
1443 	.stop	= frag_stop,
1444 	.show	= frag_show,
1445 };
1446 
1447 static int fragmentation_open(struct inode *inode, struct file *file)
1448 {
1449 	return seq_open(file, &fragmentation_op);
1450 }
1451 
1452 static const struct file_operations buddyinfo_file_operations = {
1453 	.open		= fragmentation_open,
1454 	.read		= seq_read,
1455 	.llseek		= seq_lseek,
1456 	.release	= seq_release,
1457 };
1458 
1459 static const struct seq_operations pagetypeinfo_op = {
1460 	.start	= frag_start,
1461 	.next	= frag_next,
1462 	.stop	= frag_stop,
1463 	.show	= pagetypeinfo_show,
1464 };
1465 
1466 static int pagetypeinfo_open(struct inode *inode, struct file *file)
1467 {
1468 	return seq_open(file, &pagetypeinfo_op);
1469 }
1470 
1471 static const struct file_operations pagetypeinfo_file_operations = {
1472 	.open		= pagetypeinfo_open,
1473 	.read		= seq_read,
1474 	.llseek		= seq_lseek,
1475 	.release	= seq_release,
1476 };
1477 
1478 static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1479 {
1480 	int zid;
1481 
1482 	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1483 		struct zone *compare = &pgdat->node_zones[zid];
1484 
1485 		if (populated_zone(compare))
1486 			return zone == compare;
1487 	}
1488 
1489 	return false;
1490 }
1491 
1492 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1493 							struct zone *zone)
1494 {
1495 	int i;
1496 	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1497 	if (is_zone_first_populated(pgdat, zone)) {
1498 		seq_printf(m, "\n  per-node stats");
1499 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1500 			seq_printf(m, "\n      %-12s %lu",
1501 				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
1502 				NR_VM_NUMA_STAT_ITEMS],
1503 				node_page_state(pgdat, i));
1504 		}
1505 	}
1506 	seq_printf(m,
1507 		   "\n  pages free     %lu"
1508 		   "\n        min      %lu"
1509 		   "\n        low      %lu"
1510 		   "\n        high     %lu"
1511 		   "\n        spanned  %lu"
1512 		   "\n        present  %lu"
1513 		   "\n        managed  %lu",
1514 		   zone_page_state(zone, NR_FREE_PAGES),
1515 		   min_wmark_pages(zone),
1516 		   low_wmark_pages(zone),
1517 		   high_wmark_pages(zone),
1518 		   zone->spanned_pages,
1519 		   zone->present_pages,
1520 		   zone->managed_pages);
1521 
1522 	seq_printf(m,
1523 		   "\n        protection: (%ld",
1524 		   zone->lowmem_reserve[0]);
1525 	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1526 		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1527 	seq_putc(m, ')');
1528 
1529 	/* If unpopulated, no other information is useful */
1530 	if (!populated_zone(zone)) {
1531 		seq_putc(m, '\n');
1532 		return;
1533 	}
1534 
1535 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1536 		seq_printf(m, "\n      %-12s %lu", vmstat_text[i],
1537 				zone_page_state(zone, i));
1538 
1539 #ifdef CONFIG_NUMA
1540 	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
1541 		seq_printf(m, "\n      %-12s %lu",
1542 				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
1543 				zone_numa_state_snapshot(zone, i));
1544 #endif
1545 
1546 	seq_printf(m, "\n  pagesets");
1547 	for_each_online_cpu(i) {
1548 		struct per_cpu_pageset *pageset;
1549 
1550 		pageset = per_cpu_ptr(zone->pageset, i);
1551 		seq_printf(m,
1552 			   "\n    cpu: %i"
1553 			   "\n              count: %i"
1554 			   "\n              high:  %i"
1555 			   "\n              batch: %i",
1556 			   i,
1557 			   pageset->pcp.count,
1558 			   pageset->pcp.high,
1559 			   pageset->pcp.batch);
1560 #ifdef CONFIG_SMP
1561 		seq_printf(m, "\n  vm stats threshold: %d",
1562 				pageset->stat_threshold);
1563 #endif
1564 	}
1565 	seq_printf(m,
1566 		   "\n  node_unreclaimable:  %u"
1567 		   "\n  start_pfn:           %lu"
1568 		   "\n  node_inactive_ratio: %u",
1569 		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1570 		   zone->zone_start_pfn,
1571 		   zone->zone_pgdat->inactive_ratio);
1572 	seq_putc(m, '\n');
1573 }
1574 
1575 /*
1576  * Output information about zones in @pgdat.  All zones are printed regardless
1577  * of whether they are populated or not: lowmem_reserve_ratio operates on the
1578  * set of all zones and userspace would not be aware of such zones if they are
1579  * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1580  */
1581 static int zoneinfo_show(struct seq_file *m, void *arg)
1582 {
1583 	pg_data_t *pgdat = (pg_data_t *)arg;
1584 	walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
1585 	return 0;
1586 }
1587 
1588 static const struct seq_operations zoneinfo_op = {
1589 	.start	= frag_start, /* iterate over all zones. The same as in
1590 			       * fragmentation. */
1591 	.next	= frag_next,
1592 	.stop	= frag_stop,
1593 	.show	= zoneinfo_show,
1594 };
1595 
1596 static int zoneinfo_open(struct inode *inode, struct file *file)
1597 {
1598 	return seq_open(file, &zoneinfo_op);
1599 }
1600 
1601 static const struct file_operations zoneinfo_file_operations = {
1602 	.open		= zoneinfo_open,
1603 	.read		= seq_read,
1604 	.llseek		= seq_lseek,
1605 	.release	= seq_release,
1606 };
1607 
1608 enum writeback_stat_item {
1609 	NR_DIRTY_THRESHOLD,
1610 	NR_DIRTY_BG_THRESHOLD,
1611 	NR_VM_WRITEBACK_STAT_ITEMS,
1612 };
1613 
1614 static void *vmstat_start(struct seq_file *m, loff_t *pos)
1615 {
1616 	unsigned long *v;
1617 	int i, stat_items_size;
1618 
1619 	if (*pos >= ARRAY_SIZE(vmstat_text))
1620 		return NULL;
1621 	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
1622 			  NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) +
1623 			  NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
1624 			  NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
1625 
1626 #ifdef CONFIG_VM_EVENT_COUNTERS
1627 	stat_items_size += sizeof(struct vm_event_state);
1628 #endif
1629 
1630 	v = kmalloc(stat_items_size, GFP_KERNEL);
1631 	m->private = v;
1632 	if (!v)
1633 		return ERR_PTR(-ENOMEM);
1634 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1635 		v[i] = global_zone_page_state(i);
1636 	v += NR_VM_ZONE_STAT_ITEMS;
1637 
1638 #ifdef CONFIG_NUMA
1639 	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
1640 		v[i] = global_numa_state(i);
1641 	v += NR_VM_NUMA_STAT_ITEMS;
1642 #endif
1643 
1644 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
1645 		v[i] = global_node_page_state(i);
1646 	v += NR_VM_NODE_STAT_ITEMS;
1647 
1648 	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1649 			    v + NR_DIRTY_THRESHOLD);
1650 	v += NR_VM_WRITEBACK_STAT_ITEMS;
1651 
1652 #ifdef CONFIG_VM_EVENT_COUNTERS
1653 	all_vm_events(v);
1654 	v[PGPGIN] /= 2;		/* sectors -> kbytes */
1655 	v[PGPGOUT] /= 2;
1656 #endif
1657 	return (unsigned long *)m->private + *pos;
1658 }
1659 
1660 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1661 {
1662 	(*pos)++;
1663 	if (*pos >= ARRAY_SIZE(vmstat_text))
1664 		return NULL;
1665 	return (unsigned long *)m->private + *pos;
1666 }
1667 
1668 static int vmstat_show(struct seq_file *m, void *arg)
1669 {
1670 	unsigned long *l = arg;
1671 	unsigned long off = l - (unsigned long *)m->private;
1672 
1673 	seq_puts(m, vmstat_text[off]);
1674 	seq_put_decimal_ull(m, " ", *l);
1675 	seq_putc(m, '\n');
1676 	return 0;
1677 }
1678 
1679 static void vmstat_stop(struct seq_file *m, void *arg)
1680 {
1681 	kfree(m->private);
1682 	m->private = NULL;
1683 }
1684 
1685 static const struct seq_operations vmstat_op = {
1686 	.start	= vmstat_start,
1687 	.next	= vmstat_next,
1688 	.stop	= vmstat_stop,
1689 	.show	= vmstat_show,
1690 };
1691 
1692 static int vmstat_open(struct inode *inode, struct file *file)
1693 {
1694 	return seq_open(file, &vmstat_op);
1695 }
1696 
1697 static const struct file_operations vmstat_file_operations = {
1698 	.open		= vmstat_open,
1699 	.read		= seq_read,
1700 	.llseek		= seq_lseek,
1701 	.release	= seq_release,
1702 };
1703 #endif /* CONFIG_PROC_FS */
1704 
1705 #ifdef CONFIG_SMP
1706 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1707 int sysctl_stat_interval __read_mostly = HZ;
1708 
1709 #ifdef CONFIG_PROC_FS
1710 static void refresh_vm_stats(struct work_struct *work)
1711 {
1712 	refresh_cpu_vm_stats(true);
1713 }
1714 
1715 int vmstat_refresh(struct ctl_table *table, int write,
1716 		   void __user *buffer, size_t *lenp, loff_t *ppos)
1717 {
1718 	long val;
1719 	int err;
1720 	int i;
1721 
1722 	/*
1723 	 * The regular update, every sysctl_stat_interval, may come later
1724 	 * than expected: leaving a significant amount in per_cpu buckets.
1725 	 * This is particularly misleading when checking a quantity of HUGE
1726 	 * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
1727 	 * which can equally be echo'ed to or cat'ted from (by root),
1728 	 * can be used to update the stats just before reading them.
1729 	 *
1730 	 * Oh, and since global_zone_page_state() etc. are so careful to hide
1731 	 * transiently negative values, report an error here if any of
1732 	 * the stats is negative, so we know to go looking for imbalance.
1733 	 */
1734 	err = schedule_on_each_cpu(refresh_vm_stats);
1735 	if (err)
1736 		return err;
1737 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1738 		val = atomic_long_read(&vm_zone_stat[i]);
1739 		if (val < 0) {
1740 			pr_warn("%s: %s %ld\n",
1741 				__func__, vmstat_text[i], val);
1742 			err = -EINVAL;
1743 		}
1744 	}
1745 #ifdef CONFIG_NUMA
1746 	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
1747 		val = atomic_long_read(&vm_numa_stat[i]);
1748 		if (val < 0) {
1749 			pr_warn("%s: %s %ld\n",
1750 				__func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val);
1751 			err = -EINVAL;
1752 		}
1753 	}
1754 #endif
1755 	if (err)
1756 		return err;
1757 	if (write)
1758 		*ppos += *lenp;
1759 	else
1760 		*lenp = 0;
1761 	return 0;
1762 }
1763 #endif /* CONFIG_PROC_FS */
1764 
1765 static void vmstat_update(struct work_struct *w)
1766 {
1767 	if (refresh_cpu_vm_stats(true)) {
1768 		/*
1769 		 * Counters were updated so we expect more updates
1770 		 * to occur in the future. Keep on running the
1771 		 * update worker thread.
1772 		 */
1773 		queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
1774 				this_cpu_ptr(&vmstat_work),
1775 				round_jiffies_relative(sysctl_stat_interval));
1776 	}
1777 }
1778 
1784 /*
1785  * Check if the diffs for a certain cpu indicate that
1786  * an update is needed.
1787  */
1788 static bool need_update(int cpu)
1789 {
1790 	struct zone *zone;
1791 
1792 	for_each_populated_zone(zone) {
1793 		struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
1794 
1795 		BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
1796 #ifdef CONFIG_NUMA
1797 		BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
1798 #endif
1799 
1800 		/*
1801 		 * The fast way of checking if there are any vmstat diffs.
1802 		 * This works because the diffs are byte sized items.
1803 		 */
1804 		if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
1805 			return true;
1806 #ifdef CONFIG_NUMA
1807 		if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
1808 			return true;
1809 #endif
1810 	}
1811 	return false;
1812 }
1813 
1814 /*
1815  * Switch off vmstat processing and then fold all the remaining differentials
1816  * until the diffs stay at zero. The function is used by NOHZ and can only be
1817  * invoked when tick processing is not active.
1818  */
1819 void quiet_vmstat(void)
1820 {
1821 	if (system_state != SYSTEM_RUNNING)
1822 		return;
1823 
1824 	if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
1825 		return;
1826 
1827 	if (!need_update(smp_processor_id()))
1828 		return;
1829 
1830 	/*
1831 	 * Just refresh counters and do not care about the pending delayed
1832 	 * vmstat_update. It doesn't fire often enough to matter and canceling
1833 	 * it would be too expensive from this path.
1834 	 * vmstat_shepherd will take care about that for us.
1835 	 */
1836 	refresh_cpu_vm_stats(false);
1837 }
1838 
1839 /*
1840  * Shepherd worker thread that checks the
1841  * differentials of processors that have their worker
1842  * threads for vm statistics updates disabled because of
1843  * inactivity.
1844  */
1845 static void vmstat_shepherd(struct work_struct *w);
1846 
1847 static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
1848 
1849 static void vmstat_shepherd(struct work_struct *w)
1850 {
1851 	int cpu;
1852 
1853 	get_online_cpus();
1854 	/* Check processors whose vmstat worker threads have been disabled */
1855 	for_each_online_cpu(cpu) {
1856 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
1857 
1858 		if (!delayed_work_pending(dw) && need_update(cpu))
1859 			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
1860 	}
1861 	put_online_cpus();
1862 
1863 	schedule_delayed_work(&shepherd,
1864 		round_jiffies_relative(sysctl_stat_interval));
1865 }
1866 
1867 static void __init start_shepherd_timer(void)
1868 {
1869 	int cpu;
1870 
1871 	for_each_possible_cpu(cpu)
1872 		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
1873 			vmstat_update);
1874 
1875 	schedule_delayed_work(&shepherd,
1876 		round_jiffies_relative(sysctl_stat_interval));
1877 }
1878 
1879 static void __init init_cpu_node_state(void)
1880 {
1881 	int node;
1882 
1883 	for_each_online_node(node) {
1884 		if (cpumask_weight(cpumask_of_node(node)) > 0)
1885 			node_set_state(node, N_CPU);
1886 	}
1887 }
1888 
1889 static int vmstat_cpu_online(unsigned int cpu)
1890 {
1891 	refresh_zone_stat_thresholds();
1892 	node_set_state(cpu_to_node(cpu), N_CPU);
1893 	return 0;
1894 }
1895 
1896 static int vmstat_cpu_down_prep(unsigned int cpu)
1897 {
1898 	cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1899 	return 0;
1900 }
1901 
1902 static int vmstat_cpu_dead(unsigned int cpu)
1903 {
1904 	const struct cpumask *node_cpus;
1905 	int node;
1906 
1907 	node = cpu_to_node(cpu);
1908 
1909 	refresh_zone_stat_thresholds();
1910 	node_cpus = cpumask_of_node(node);
1911 	if (cpumask_weight(node_cpus) > 0)
1912 		return 0;
1913 
1914 	node_clear_state(node, N_CPU);
1915 	return 0;
1916 }
1917 
1918 #endif
1919 
1920 struct workqueue_struct *mm_percpu_wq;
1921 
1922 void __init init_mm_internals(void)
1923 {
1924 	int ret __maybe_unused;
1925 
1926 	mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
1927 
1928 #ifdef CONFIG_SMP
1929 	ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
1930 					NULL, vmstat_cpu_dead);
1931 	if (ret < 0)
1932 		pr_err("vmstat: failed to register 'dead' hotplug state\n");
1933 
1934 	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
1935 					vmstat_cpu_online,
1936 					vmstat_cpu_down_prep);
1937 	if (ret < 0)
1938 		pr_err("vmstat: failed to register 'online' hotplug state\n");
1939 
1940 	get_online_cpus();
1941 	init_cpu_node_state();
1942 	put_online_cpus();
1943 
1944 	start_shepherd_timer();
1945 #endif
1946 #ifdef CONFIG_PROC_FS
1947 	proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
1948 	proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
1949 	proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
1950 	proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
1951 #endif
1952 }
1953 
1954 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1955 
1956 /*
1957  * Return an index indicating how much of the available free memory is
1958  * unusable for an allocation of the requested size.
1959  */
1960 static int unusable_free_index(unsigned int order,
1961 				struct contig_page_info *info)
1962 {
1963 	/* No free memory is interpreted as all free memory is unusable */
1964 	if (info->free_pages == 0)
1965 		return 1000;
1966 
1967 	/*
1968 	 * Index should be a value between 0 and 1. Return a value to 3
1969 	 * decimal places.
1970 	 *
1971 	 * 0 => no fragmentation
1972 	 * 1 => high fragmentation
1973 	 */
1974 	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
1975 
1976 }
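/*
 * Worked example (illustrative values): with free_pages = 1000 and 200
 * free blocks suitable for an order-2 request, the usable portion is
 * 200 << 2 = 800 pages, so the index is (1000 - 800) * 1000 / 1000 = 200,
 * printed as 0.200: a fifth of the free memory cannot serve the request.
 */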
1977 
1978 static void unusable_show_print(struct seq_file *m,
1979 					pg_data_t *pgdat, struct zone *zone)
1980 {
1981 	unsigned int order;
1982 	int index;
1983 	struct contig_page_info info;
1984 
1985 	seq_printf(m, "Node %d, zone %8s ",
1986 				pgdat->node_id,
1987 				zone->name);
1988 	for (order = 0; order < MAX_ORDER; ++order) {
1989 		fill_contig_page_info(zone, order, &info);
1990 		index = unusable_free_index(order, &info);
1991 		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1992 	}
1993 
1994 	seq_putc(m, '\n');
1995 }
1996 
1997 /*
1998  * Display unusable free space index
1999  *
2000  * The unusable free space index measures how much of the available free
2001  * memory cannot be used to satisfy an allocation of a given size and is a
2002  * value between 0 and 1. The higher the value, the more of free memory is
2003  * unusable and by implication, the worse the external fragmentation is. This
2004  * can be expressed as a percentage by multiplying by 100.
2005  */
2006 static int unusable_show(struct seq_file *m, void *arg)
2007 {
2008 	pg_data_t *pgdat = (pg_data_t *)arg;
2009 
2010 	/* check memoryless node */
2011 	if (!node_state(pgdat->node_id, N_MEMORY))
2012 		return 0;
2013 
2014 	walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
2015 
2016 	return 0;
2017 }
2018 
2019 static const struct seq_operations unusable_op = {
2020 	.start	= frag_start,
2021 	.next	= frag_next,
2022 	.stop	= frag_stop,
2023 	.show	= unusable_show,
2024 };
2025 
2026 static int unusable_open(struct inode *inode, struct file *file)
2027 {
2028 	return seq_open(file, &unusable_op);
2029 }
2030 
2031 static const struct file_operations unusable_file_ops = {
2032 	.open		= unusable_open,
2033 	.read		= seq_read,
2034 	.llseek		= seq_lseek,
2035 	.release	= seq_release,
2036 };
2037 
2038 static void extfrag_show_print(struct seq_file *m,
2039 					pg_data_t *pgdat, struct zone *zone)
2040 {
2041 	unsigned int order;
2042 	int index;
2043 
2044 	/* Alloc on stack as interrupts are disabled for zone walk */
2045 	struct contig_page_info info;
2046 
2047 	seq_printf(m, "Node %d, zone %8s ",
2048 				pgdat->node_id,
2049 				zone->name);
2050 	for (order = 0; order < MAX_ORDER; ++order) {
2051 		fill_contig_page_info(zone, order, &info);
2052 		index = __fragmentation_index(order, &info);
2053 		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2054 	}
2055 
2056 	seq_putc(m, '\n');
2057 }
2058 
2059 /*
2060  * Display fragmentation index for orders that allocations would fail for
2061  */
2062 static int extfrag_show(struct seq_file *m, void *arg)
2063 {
2064 	pg_data_t *pgdat = (pg_data_t *)arg;
2065 
2066 	walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
2067 
2068 	return 0;
2069 }
2070 
2071 static const struct seq_operations extfrag_op = {
2072 	.start	= frag_start,
2073 	.next	= frag_next,
2074 	.stop	= frag_stop,
2075 	.show	= extfrag_show,
2076 };
2077 
2078 static int extfrag_open(struct inode *inode, struct file *file)
2079 {
2080 	return seq_open(file, &extfrag_op);
2081 }
2082 
2083 static const struct file_operations extfrag_file_ops = {
2084 	.open		= extfrag_open,
2085 	.read		= seq_read,
2086 	.llseek		= seq_lseek,
2087 	.release	= seq_release,
2088 };
2089 
2090 static int __init extfrag_debug_init(void)
2091 {
2092 	struct dentry *extfrag_debug_root;
2093 
2094 	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
2095 	if (!extfrag_debug_root)
2096 		return -ENOMEM;
2097 
2098 	if (!debugfs_create_file("unusable_index", 0444,
2099 			extfrag_debug_root, NULL, &unusable_file_ops))
2100 		goto fail;
2101 
2102 	if (!debugfs_create_file("extfrag_index", 0444,
2103 			extfrag_debug_root, NULL, &extfrag_file_ops))
2104 		goto fail;
2105 
2106 	return 0;
2107 fail:
2108 	debugfs_remove_recursive(extfrag_debug_root);
2109 	return -ENOMEM;
2110 }
2111 
2112 module_init(extfrag_debug_init);
2113 #endif
2114