xref: /openbmc/linux/mm/vmstat.c (revision 90cb380f9ceb811059340d06ff5fd0c0e93ecbe1)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  linux/mm/vmstat.c
4   *
5   *  Manages VM statistics
6   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
7   *
8   *  zoned VM statistics
9   *  Copyright (C) 2006 Silicon Graphics, Inc.,
10   *		Christoph Lameter <christoph@lameter.com>
11   *  Copyright (C) 2008-2014 Christoph Lameter
12   */
13  #include <linux/fs.h>
14  #include <linux/mm.h>
15  #include <linux/err.h>
16  #include <linux/module.h>
17  #include <linux/slab.h>
18  #include <linux/cpu.h>
19  #include <linux/cpumask.h>
20  #include <linux/vmstat.h>
21  #include <linux/proc_fs.h>
22  #include <linux/seq_file.h>
23  #include <linux/debugfs.h>
24  #include <linux/sched.h>
25  #include <linux/math64.h>
26  #include <linux/writeback.h>
27  #include <linux/compaction.h>
28  #include <linux/mm_inline.h>
29  #include <linux/page_ext.h>
30  #include <linux/page_owner.h>
31  #include <linux/migrate.h>
32  
33  #include "internal.h"
34  
35  #ifdef CONFIG_NUMA
36  int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
37  
38  /* zero numa counters within a zone */
39  static void zero_zone_numa_counters(struct zone *zone)
40  {
41  	int item, cpu;
42  
43  	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
44  		atomic_long_set(&zone->vm_numa_event[item], 0);
45  		for_each_online_cpu(cpu) {
46  			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
47  						= 0;
48  		}
49  	}
50  }
51  
52  /* zero numa counters of all the populated zones */
53  static void zero_zones_numa_counters(void)
54  {
55  	struct zone *zone;
56  
57  	for_each_populated_zone(zone)
58  		zero_zone_numa_counters(zone);
59  }
60  
61  /* zero global numa counters */
62  static void zero_global_numa_counters(void)
63  {
64  	int item;
65  
66  	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
67  		atomic_long_set(&vm_numa_event[item], 0);
68  }
69  
70  static void invalid_numa_statistics(void)
71  {
72  	zero_zones_numa_counters();
73  	zero_global_numa_counters();
74  }
75  
76  static DEFINE_MUTEX(vm_numa_stat_lock);
77  
78  int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
79  		void *buffer, size_t *length, loff_t *ppos)
80  {
81  	int ret, oldval;
82  
83  	mutex_lock(&vm_numa_stat_lock);
84  	if (write)
85  		oldval = sysctl_vm_numa_stat;
86  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
87  	if (ret || !write)
88  		goto out;
89  
90  	if (oldval == sysctl_vm_numa_stat)
91  		goto out;
92  	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
93  		static_branch_enable(&vm_numa_stat_key);
94  		pr_info("enable numa statistics\n");
95  	} else {
96  		static_branch_disable(&vm_numa_stat_key);
97  		invalid_numa_statistics();
98  		pr_info("disable numa statistics, and clear numa counters\n");
99  	}
100  
101  out:
102  	mutex_unlock(&vm_numa_stat_lock);
103  	return ret;
104  }
105  #endif
106  
107  #ifdef CONFIG_VM_EVENT_COUNTERS
108  DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
109  EXPORT_PER_CPU_SYMBOL(vm_event_states);
110  
111  static void sum_vm_events(unsigned long *ret)
112  {
113  	int cpu;
114  	int i;
115  
116  	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
117  
118  	for_each_online_cpu(cpu) {
119  		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
120  
121  		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
122  			ret[i] += this->event[i];
123  	}
124  }
125  
126  /*
127   * Accumulate the vm event counters across all CPUs.
128   * The result is unavoidably approximate - it can change
129   * during and after execution of this function.
130   */
131  void all_vm_events(unsigned long *ret)
132  {
133  	cpus_read_lock();
134  	sum_vm_events(ret);
135  	cpus_read_unlock();
136  }
137  EXPORT_SYMBOL_GPL(all_vm_events);
138  
139  /*
140   * Fold the foreign cpu events into our own.
141   *
142   * This adds the foreign cpu's events to this processor's counters
143   * but keeps the global counts constant.
144   */
145  void vm_events_fold_cpu(int cpu)
146  {
147  	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
148  	int i;
149  
150  	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
151  		count_vm_events(i, fold_state->event[i]);
152  		fold_state->event[i] = 0;
153  	}
154  }
155  
156  #endif /* CONFIG_VM_EVENT_COUNTERS */
157  
158  /*
159   * Manage combined zone based / global counters
160   *
161   * vm_stat contains the global counters
162   */
163  atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
164  atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
165  atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
166  EXPORT_SYMBOL(vm_zone_stat);
167  EXPORT_SYMBOL(vm_node_stat);
168  
169  #ifdef CONFIG_NUMA
170  static void fold_vm_zone_numa_events(struct zone *zone)
171  {
172  	unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
173  	int cpu;
174  	enum numa_stat_item item;
175  
176  	for_each_online_cpu(cpu) {
177  		struct per_cpu_zonestat *pzstats;
178  
179  		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
180  		for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
181  			zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
182  	}
183  
184  	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
185  		zone_numa_event_add(zone_numa_events[item], zone, item);
186  }
187  
188  void fold_vm_numa_events(void)
189  {
190  	struct zone *zone;
191  
192  	for_each_populated_zone(zone)
193  		fold_vm_zone_numa_events(zone);
194  }
195  #endif
196  
197  #ifdef CONFIG_SMP
198  
199  int calculate_pressure_threshold(struct zone *zone)
200  {
201  	int threshold;
202  	int watermark_distance;
203  
204  	/*
205  	 * As vmstats are not up to date, there is drift between the estimated
206  	 * and real values. For high thresholds and a high number of CPUs, it
207  	 * is possible for the min watermark to be breached while the estimated
208  	 * value looks fine. The pressure threshold is a reduced value such
209  	 * that even the maximum amount of drift will not accidentally breach
210  	 * the min watermark
211  	 */
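	/*
	 * For example, with a low..min watermark gap of 1024 pages and 64
	 * online CPUs the threshold becomes max(1, 1024 / 64) = 16, so even
	 * if every CPU holds a full, unflushed per-cpu diff the total error
	 * (64 * 16 = 1024 pages) still fits inside the gap and cannot hide a
	 * breach of the min watermark.
	 */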
212  	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
213  	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
214  
215  	/*
216  	 * Maximum threshold is 125
217  	 */
218  	threshold = min(125, threshold);
219  
220  	return threshold;
221  }
222  
223  int calculate_normal_threshold(struct zone *zone)
224  {
225  	int threshold;
226  	int mem;	/* memory in 128 MB units */
227  
228  	/*
229  	 * The threshold scales with the number of processors and the amount
230  	 * of memory per zone. More memory means that we can defer updates for
231  	 * longer, more processors could lead to more contention.
232  	 * fls() is used to have a cheap way of logarithmic scaling.
233  	 *
234  	 * Some sample thresholds:
235  	 *
236  	 * Threshold	Processors	(fls)	Zonesize	fls(mem)+1
237  	 * ------------------------------------------------------------------
238  	 * 8		1		1	0.9-1 GB	4
239  	 * 16		2		2	0.9-1 GB	4
240  	 * 20 		2		2	1-2 GB		5
241  	 * 24		2		2	2-4 GB		6
242  	 * 28		2		2	4-8 GB		7
243  	 * 32		2		2	8-16 GB		8
244  	 * 4		2		2	<128M		1
245  	 * 30		4		3	2-4 GB		5
246  	 * 48		4		3	8-16 GB		8
247  	 * 32		8		4	1-2 GB		4
248  	 * 32		8		4	0.9-1GB		4
249  	 * 10		16		5	<128M		1
250  	 * 40		16		5	900M		4
251  	 * 70		64		7	2-4 GB		5
252  	 * 84		64		7	4-8 GB		6
253  	 * 108		512		9	4-8 GB		6
254  	 * 125		1024		10	8-16 GB		8
255  	 * 125		1024		10	16-32 GB	9
256  	 */
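	/*
	 * Worked example: a 1 GiB zone is 8 units of 128 MB, so fls(mem) = 4;
	 * with 4 online CPUs fls(4) = 3 and the threshold is
	 * 2 * 3 * (1 + 4) = 30, before the cap of 125 below is applied.
	 */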
257  
258  	mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
259  
260  	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
261  
262  	/*
263  	 * Maximum threshold is 125
264  	 */
265  	threshold = min(125, threshold);
266  
267  	return threshold;
268  }
269  
270  /*
271   * Refresh the thresholds for each zone.
272   */
273  void refresh_zone_stat_thresholds(void)
274  {
275  	struct pglist_data *pgdat;
276  	struct zone *zone;
277  	int cpu;
278  	int threshold;
279  
280  	/* Zero current pgdat thresholds */
281  	for_each_online_pgdat(pgdat) {
282  		for_each_online_cpu(cpu) {
283  			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
284  		}
285  	}
286  
287  	for_each_populated_zone(zone) {
288  		struct pglist_data *pgdat = zone->zone_pgdat;
289  		unsigned long max_drift, tolerate_drift;
290  
291  		threshold = calculate_normal_threshold(zone);
292  
293  		for_each_online_cpu(cpu) {
294  			int pgdat_threshold;
295  
296  			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
297  							= threshold;
298  
299  			/* Base nodestat threshold on the largest populated zone. */
300  			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
301  			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
302  				= max(threshold, pgdat_threshold);
303  		}
304  
305  		/*
306  		 * Only set percpu_drift_mark if there is a danger that
307  		 * NR_FREE_PAGES reports the low watermark is ok when in fact
308  		 * the min watermark could be breached by an allocation
309  		 */
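		/*
		 * For example, a threshold of 32 on a 64-CPU machine allows
		 * up to 64 * 32 = 2048 free pages to sit unaccounted in the
		 * per-cpu diffs. If the low..min gap is smaller than that,
		 * percpu_drift_mark is set below so that watermark checks can
		 * fall back to a precise, per-cpu-summed view of
		 * NR_FREE_PAGES once free memory drops near the mark.
		 */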
310  		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
311  		max_drift = num_online_cpus() * threshold;
312  		if (max_drift > tolerate_drift)
313  			zone->percpu_drift_mark = high_wmark_pages(zone) +
314  					max_drift;
315  	}
316  }
317  
318  void set_pgdat_percpu_threshold(pg_data_t *pgdat,
319  				int (*calculate_pressure)(struct zone *))
320  {
321  	struct zone *zone;
322  	int cpu;
323  	int threshold;
324  	int i;
325  
326  	for (i = 0; i < pgdat->nr_zones; i++) {
327  		zone = &pgdat->node_zones[i];
328  		if (!zone->percpu_drift_mark)
329  			continue;
330  
331  		threshold = (*calculate_pressure)(zone);
332  		for_each_online_cpu(cpu)
333  			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
334  							= threshold;
335  	}
336  }
337  
338  /*
339   * For use when we know that interrupts are disabled,
340   * or when we know that preemption is disabled and that
341   * particular counter cannot be updated from interrupt context.
342   */
343  void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
344  			   long delta)
345  {
346  	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
347  	s8 __percpu *p = pcp->vm_stat_diff + item;
348  	long x;
349  	long t;
350  
351  	/*
352  	 * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
353  	 * atomicity is provided by IRQs being disabled -- either explicitly
354  	 * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
355  	 * CPU migrations and preemption potentially corrupts a counter so
356  	 * disable preemption.
357  	 */
358  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
359  		preempt_disable();
360  
361  	x = delta + __this_cpu_read(*p);
362  
363  	t = __this_cpu_read(pcp->stat_threshold);
364  
365  	if (unlikely(abs(x) > t)) {
366  		zone_page_state_add(x, zone, item);
367  		x = 0;
368  	}
369  	__this_cpu_write(*p, x);
370  
371  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
372  		preempt_enable();
373  }
374  EXPORT_SYMBOL(__mod_zone_page_state);
375  
376  void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
377  				long delta)
378  {
379  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
380  	s8 __percpu *p = pcp->vm_node_stat_diff + item;
381  	long x;
382  	long t;
383  
384  	if (vmstat_item_in_bytes(item)) {
385  		/*
386  		 * Only cgroups use subpage accounting right now; at
387  		 * the global level, these items still change in
388  		 * multiples of whole pages. Store them as pages
389  		 * internally to keep the per-cpu counters compact.
390  		 */
391  		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
392  		delta >>= PAGE_SHIFT;
393  	}
394  
395  	/* See __mod_zone_page_state */
396  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
397  		preempt_disable();
398  
399  	x = delta + __this_cpu_read(*p);
400  
401  	t = __this_cpu_read(pcp->stat_threshold);
402  
403  	if (unlikely(abs(x) > t)) {
404  		node_page_state_add(x, pgdat, item);
405  		x = 0;
406  	}
407  	__this_cpu_write(*p, x);
408  
409  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
410  		preempt_enable();
411  }
412  EXPORT_SYMBOL(__mod_node_page_state);
413  
414  /*
415   * Optimized increment and decrement functions.
416   *
417   * These are only for a single page and therefore can take a struct page *
418   * argument instead of struct zone *. This allows the inclusion of the code
419   * generated for page_zone(page) into the optimized functions.
420   *
421   * No overflow check is necessary and therefore the differential can be
422   * incremented or decremented in place which may allow the compilers to
423   * generate better code.
424   * The increment or decrement is known and therefore one boundary check can
425   * be omitted.
426   *
427   * NOTE: These functions are very performance sensitive. Change only
428   * with care.
429   *
430   * Some processors have inc/dec instructions that are atomic vs an interrupt.
431   * However, the code must first determine the differential location in a zone
432   * based on the processor number and then inc/dec the counter. There is no
433   * guarantee without disabling preemption that the processor will not change
434   * in between and therefore the atomicity vs. interrupt cannot be exploited
435   * in a useful way here.
436   */
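/*
 * Example of the overstep handling below: with a stat_threshold of 32, the
 * increment that takes the per-cpu diff past 32 folds (diff + 16) into the
 * zone/node counter and leaves the diff at -16, so roughly half a threshold
 * of headroom remains in either direction before the next fold.
 */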
437  void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
438  {
439  	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
440  	s8 __percpu *p = pcp->vm_stat_diff + item;
441  	s8 v, t;
442  
443  	/* See __mod_node_page_state */
444  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
445  		preempt_disable();
446  
447  	v = __this_cpu_inc_return(*p);
448  	t = __this_cpu_read(pcp->stat_threshold);
449  	if (unlikely(v > t)) {
450  		s8 overstep = t >> 1;
451  
452  		zone_page_state_add(v + overstep, zone, item);
453  		__this_cpu_write(*p, -overstep);
454  	}
455  
456  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
457  		preempt_enable();
458  }
459  
460  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
461  {
462  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
463  	s8 __percpu *p = pcp->vm_node_stat_diff + item;
464  	s8 v, t;
465  
466  	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
467  
468  	/* See __mod_node_page_state */
469  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
470  		preempt_disable();
471  
472  	v = __this_cpu_inc_return(*p);
473  	t = __this_cpu_read(pcp->stat_threshold);
474  	if (unlikely(v > t)) {
475  		s8 overstep = t >> 1;
476  
477  		node_page_state_add(v + overstep, pgdat, item);
478  		__this_cpu_write(*p, -overstep);
479  	}
480  
481  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
482  		preempt_enable();
483  }
484  
485  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
486  {
487  	__inc_zone_state(page_zone(page), item);
488  }
489  EXPORT_SYMBOL(__inc_zone_page_state);
490  
491  void __inc_node_page_state(struct page *page, enum node_stat_item item)
492  {
493  	__inc_node_state(page_pgdat(page), item);
494  }
495  EXPORT_SYMBOL(__inc_node_page_state);
496  
497  void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
498  {
499  	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
500  	s8 __percpu *p = pcp->vm_stat_diff + item;
501  	s8 v, t;
502  
503  	/* See __mod_node_page_state */
504  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
505  		preempt_disable();
506  
507  	v = __this_cpu_dec_return(*p);
508  	t = __this_cpu_read(pcp->stat_threshold);
509  	if (unlikely(v < -t)) {
510  		s8 overstep = t >> 1;
511  
512  		zone_page_state_add(v - overstep, zone, item);
513  		__this_cpu_write(*p, overstep);
514  	}
515  
516  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
517  		preempt_enable();
518  }
519  
520  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
521  {
522  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
523  	s8 __percpu *p = pcp->vm_node_stat_diff + item;
524  	s8 v, t;
525  
526  	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
527  
528  	/* See __mod_node_page_state */
529  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
530  		preempt_disable();
531  
532  	v = __this_cpu_dec_return(*p);
533  	t = __this_cpu_read(pcp->stat_threshold);
534  	if (unlikely(v < -t)) {
535  		s8 overstep = t >> 1;
536  
537  		node_page_state_add(v - overstep, pgdat, item);
538  		__this_cpu_write(*p, overstep);
539  	}
540  
541  	if (IS_ENABLED(CONFIG_PREEMPT_RT))
542  		preempt_enable();
543  }
544  
545  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
546  {
547  	__dec_zone_state(page_zone(page), item);
548  }
549  EXPORT_SYMBOL(__dec_zone_page_state);
550  
551  void __dec_node_page_state(struct page *page, enum node_stat_item item)
552  {
553  	__dec_node_state(page_pgdat(page), item);
554  }
555  EXPORT_SYMBOL(__dec_node_page_state);
556  
557  #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
558  /*
559   * If we have cmpxchg_local support then we do not need to incur the overhead
560   * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
561   *
562   * mod_state() modifies the zone counter state through atomic per cpu
563   * operations.
564   *
565   * Overstep mode specifies how overstep should be handled:
566   *     0       No overstepping
567   *     1       Overstepping half of threshold
568   *     -1      Overstepping minus half of threshold
569   */
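/*
 * The cmpxchg loops below retry the whole read-modify-write if the value
 * about to be replaced changed under us -- e.g. another update slipped in,
 * or the task migrated to a different CPU between this_cpu_read() and
 * this_cpu_cmpxchg() -- so no IRQ disabling is needed on architectures that
 * provide a cheap local cmpxchg.
 */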
570  static inline void mod_zone_state(struct zone *zone,
571         enum zone_stat_item item, long delta, int overstep_mode)
572  {
573  	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
574  	s8 __percpu *p = pcp->vm_stat_diff + item;
575  	long o, n, t, z;
576  
577  	do {
578  		z = 0;  /* overflow to zone counters */
579  
580  		/*
581  		 * The fetching of the stat_threshold is racy. We may apply
582  		 * a counter threshold to the wrong cpu if we get
583  		 * rescheduled while executing here. However, the next
584  		 * counter update will apply the threshold again and
585  		 * therefore bring the counter under the threshold again.
586  		 *
587  		 * Most of the time the thresholds are the same anyway
588  		 * for all cpus in a zone.
589  		 */
590  		t = this_cpu_read(pcp->stat_threshold);
591  
592  		o = this_cpu_read(*p);
593  		n = delta + o;
594  
595  		if (abs(n) > t) {
596  			int os = overstep_mode * (t >> 1);
597  
598  			/* Overflow must be added to zone counters */
599  			z = n + os;
600  			n = -os;
601  		}
602  	} while (this_cpu_cmpxchg(*p, o, n) != o);
603  
604  	if (z)
605  		zone_page_state_add(z, zone, item);
606  }
607  
608  void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
609  			 long delta)
610  {
611  	mod_zone_state(zone, item, delta, 0);
612  }
613  EXPORT_SYMBOL(mod_zone_page_state);
614  
615  void inc_zone_page_state(struct page *page, enum zone_stat_item item)
616  {
617  	mod_zone_state(page_zone(page), item, 1, 1);
618  }
619  EXPORT_SYMBOL(inc_zone_page_state);
620  
621  void dec_zone_page_state(struct page *page, enum zone_stat_item item)
622  {
623  	mod_zone_state(page_zone(page), item, -1, -1);
624  }
625  EXPORT_SYMBOL(dec_zone_page_state);
626  
627  static inline void mod_node_state(struct pglist_data *pgdat,
628         enum node_stat_item item, int delta, int overstep_mode)
629  {
630  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
631  	s8 __percpu *p = pcp->vm_node_stat_diff + item;
632  	long o, n, t, z;
633  
634  	if (vmstat_item_in_bytes(item)) {
635  		/*
636  		 * Only cgroups use subpage accounting right now; at
637  		 * the global level, these items still change in
638  		 * multiples of whole pages. Store them as pages
639  		 * internally to keep the per-cpu counters compact.
640  		 */
641  		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
642  		delta >>= PAGE_SHIFT;
643  	}
644  
645  	do {
646  		z = 0;  /* overflow to node counters */
647  
648  		/*
649  		 * The fetching of the stat_threshold is racy. We may apply
650  		 * a counter threshold to the wrong cpu if we get
651  		 * rescheduled while executing here. However, the next
652  		 * counter update will apply the threshold again and
653  		 * therefore bring the counter under the threshold again.
654  		 *
655  		 * Most of the time the thresholds are the same anyway
656  		 * for all cpus in a node.
657  		 */
658  		t = this_cpu_read(pcp->stat_threshold);
659  
660  		o = this_cpu_read(*p);
661  		n = delta + o;
662  
663  		if (abs(n) > t) {
664  			int os = overstep_mode * (t >> 1);
665  
666  			/* Overflow must be added to node counters */
667  			z = n + os;
668  			n = -os;
669  		}
670  	} while (this_cpu_cmpxchg(*p, o, n) != o);
671  
672  	if (z)
673  		node_page_state_add(z, pgdat, item);
674  }
675  
676  void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
677  					long delta)
678  {
679  	mod_node_state(pgdat, item, delta, 0);
680  }
681  EXPORT_SYMBOL(mod_node_page_state);
682  
683  void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
684  {
685  	mod_node_state(pgdat, item, 1, 1);
686  }
687  
688  void inc_node_page_state(struct page *page, enum node_stat_item item)
689  {
690  	mod_node_state(page_pgdat(page), item, 1, 1);
691  }
692  EXPORT_SYMBOL(inc_node_page_state);
693  
694  void dec_node_page_state(struct page *page, enum node_stat_item item)
695  {
696  	mod_node_state(page_pgdat(page), item, -1, -1);
697  }
698  EXPORT_SYMBOL(dec_node_page_state);
699  #else
700  /*
701   * Use interrupt disable to serialize counter updates
702   */
703  void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
704  			 long delta)
705  {
706  	unsigned long flags;
707  
708  	local_irq_save(flags);
709  	__mod_zone_page_state(zone, item, delta);
710  	local_irq_restore(flags);
711  }
712  EXPORT_SYMBOL(mod_zone_page_state);
713  
714  void inc_zone_page_state(struct page *page, enum zone_stat_item item)
715  {
716  	unsigned long flags;
717  	struct zone *zone;
718  
719  	zone = page_zone(page);
720  	local_irq_save(flags);
721  	__inc_zone_state(zone, item);
722  	local_irq_restore(flags);
723  }
724  EXPORT_SYMBOL(inc_zone_page_state);
725  
726  void dec_zone_page_state(struct page *page, enum zone_stat_item item)
727  {
728  	unsigned long flags;
729  
730  	local_irq_save(flags);
731  	__dec_zone_page_state(page, item);
732  	local_irq_restore(flags);
733  }
734  EXPORT_SYMBOL(dec_zone_page_state);
735  
736  void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
737  {
738  	unsigned long flags;
739  
740  	local_irq_save(flags);
741  	__inc_node_state(pgdat, item);
742  	local_irq_restore(flags);
743  }
744  EXPORT_SYMBOL(inc_node_state);
745  
746  void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
747  					long delta)
748  {
749  	unsigned long flags;
750  
751  	local_irq_save(flags);
752  	__mod_node_page_state(pgdat, item, delta);
753  	local_irq_restore(flags);
754  }
755  EXPORT_SYMBOL(mod_node_page_state);
756  
757  void inc_node_page_state(struct page *page, enum node_stat_item item)
758  {
759  	unsigned long flags;
760  	struct pglist_data *pgdat;
761  
762  	pgdat = page_pgdat(page);
763  	local_irq_save(flags);
764  	__inc_node_state(pgdat, item);
765  	local_irq_restore(flags);
766  }
767  EXPORT_SYMBOL(inc_node_page_state);
768  
769  void dec_node_page_state(struct page *page, enum node_stat_item item)
770  {
771  	unsigned long flags;
772  
773  	local_irq_save(flags);
774  	__dec_node_page_state(page, item);
775  	local_irq_restore(flags);
776  }
777  EXPORT_SYMBOL(dec_node_page_state);
778  #endif
779  
780  /*
781   * Fold a differential into the global counters.
782   * Returns the number of counters updated.
783   */
784  static int fold_diff(int *zone_diff, int *node_diff)
785  {
786  	int i;
787  	int changes = 0;
788  
789  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
790  		if (zone_diff[i]) {
791  			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
792  			changes++;
793  	}
794  
795  	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
796  		if (node_diff[i]) {
797  			atomic_long_add(node_diff[i], &vm_node_stat[i]);
798  			changes++;
799  	}
800  	return changes;
801  }
802  
803  /*
804   * Update the zone counters for the current cpu.
805   *
806   * Note that refresh_cpu_vm_stats strives to only access
807   * node local memory. The per cpu pagesets on remote zones are placed
808   * in the memory local to the processor using that pageset. So the
809   * loop over all zones will access a series of cachelines local to
810   * the processor.
811   *
812   * The call to zone_page_state_add updates the cachelines with the
813   * statistics in the remote zone struct as well as the global cachelines
814   * with the global counters. These could cause remote node cache line
815   * bouncing and will have to be only done when necessary.
816   *
817   * The function returns the number of global counters updated.
818   */
819  static int refresh_cpu_vm_stats(bool do_pagesets)
820  {
821  	struct pglist_data *pgdat;
822  	struct zone *zone;
823  	int i;
824  	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
825  	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
826  	int changes = 0;
827  
828  	for_each_populated_zone(zone) {
829  		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
830  #ifdef CONFIG_NUMA
831  		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
832  #endif
833  
834  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
835  			int v;
836  
837  			v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
838  			if (v) {
839  
840  				atomic_long_add(v, &zone->vm_stat[i]);
841  				global_zone_diff[i] += v;
842  #ifdef CONFIG_NUMA
843  				/* 3 seconds idle till flush */
844  				__this_cpu_write(pcp->expire, 3);
845  #endif
846  			}
847  		}
848  #ifdef CONFIG_NUMA
849  
850  		if (do_pagesets) {
851  			cond_resched();
852  			/*
853  			 * Deal with draining the remote pageset of this
854  			 * processor
855  			 *
856  			 * Check if there are pages remaining in this pageset;
857  			 * if not then there is nothing to expire.
858  			 */
859  			if (!__this_cpu_read(pcp->expire) ||
860  			       !__this_cpu_read(pcp->count))
861  				continue;
862  
863  			/*
864  			 * We never drain zones local to this processor.
865  			 */
866  			if (zone_to_nid(zone) == numa_node_id()) {
867  				__this_cpu_write(pcp->expire, 0);
868  				continue;
869  			}
870  
871  			if (__this_cpu_dec_return(pcp->expire))
872  				continue;
873  
874  			if (__this_cpu_read(pcp->count)) {
875  				drain_zone_pages(zone, this_cpu_ptr(pcp));
876  				changes++;
877  			}
878  		}
879  #endif
880  	}
881  
882  	for_each_online_pgdat(pgdat) {
883  		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
884  
885  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
886  			int v;
887  
888  			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
889  			if (v) {
890  				atomic_long_add(v, &pgdat->vm_stat[i]);
891  				global_node_diff[i] += v;
892  			}
893  		}
894  	}
895  
896  	changes += fold_diff(global_zone_diff, global_node_diff);
897  	return changes;
898  }
899  
900  /*
901   * Fold the data for an offline cpu into the global array.
902   * There cannot be any access by the offline cpu and therefore
903   * synchronization is simplified.
904   */
905  void cpu_vm_stats_fold(int cpu)
906  {
907  	struct pglist_data *pgdat;
908  	struct zone *zone;
909  	int i;
910  	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
911  	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
912  
913  	for_each_populated_zone(zone) {
914  		struct per_cpu_zonestat *pzstats;
915  
916  		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
917  
918  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
919  			if (pzstats->vm_stat_diff[i]) {
920  				int v;
921  
922  				v = pzstats->vm_stat_diff[i];
923  				pzstats->vm_stat_diff[i] = 0;
924  				atomic_long_add(v, &zone->vm_stat[i]);
925  				global_zone_diff[i] += v;
926  			}
927  		}
928  #ifdef CONFIG_NUMA
929  		for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
930  			if (pzstats->vm_numa_event[i]) {
931  				unsigned long v;
932  
933  				v = pzstats->vm_numa_event[i];
934  				pzstats->vm_numa_event[i] = 0;
935  				zone_numa_event_add(v, zone, i);
936  			}
937  		}
938  #endif
939  	}
940  
941  	for_each_online_pgdat(pgdat) {
942  		struct per_cpu_nodestat *p;
943  
944  		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
945  
946  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
947  			if (p->vm_node_stat_diff[i]) {
948  				int v;
949  
950  				v = p->vm_node_stat_diff[i];
951  				p->vm_node_stat_diff[i] = 0;
952  				atomic_long_add(v, &pgdat->vm_stat[i]);
953  				global_node_diff[i] += v;
954  			}
955  	}
956  
957  	fold_diff(global_zone_diff, global_node_diff);
958  }
959  
960  /*
961   * This is only called if !populated_zone(zone), which implies no other users of
962   * pzstats->vm_stat_diff[] exist.
963   */
964  void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
965  {
966  	unsigned long v;
967  	int i;
968  
969  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
970  		if (pzstats->vm_stat_diff[i]) {
971  			v = pzstats->vm_stat_diff[i];
972  			pzstats->vm_stat_diff[i] = 0;
973  			zone_page_state_add(v, zone, i);
974  		}
975  	}
976  
977  #ifdef CONFIG_NUMA
978  	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
979  		if (pzstats->vm_numa_event[i]) {
980  			v = pzstats->vm_numa_event[i];
981  			pzstats->vm_numa_event[i] = 0;
982  			zone_numa_event_add(v, zone, i);
983  		}
984  	}
985  #endif
986  }
987  #endif
988  
989  #ifdef CONFIG_NUMA
990  /*
991   * Determine the per node value of a stat item. This function
992   * is called frequently in a NUMA machine, so try to be as
993   * frugal as possible.
994   */
995  unsigned long sum_zone_node_page_state(int node,
996  				 enum zone_stat_item item)
997  {
998  	struct zone *zones = NODE_DATA(node)->node_zones;
999  	int i;
1000  	unsigned long count = 0;
1001  
1002  	for (i = 0; i < MAX_NR_ZONES; i++)
1003  		count += zone_page_state(zones + i, item);
1004  
1005  	return count;
1006  }
1007  
1008  /* Determine the per node value of a numa stat item. */
1009  unsigned long sum_zone_numa_event_state(int node,
1010  				 enum numa_stat_item item)
1011  {
1012  	struct zone *zones = NODE_DATA(node)->node_zones;
1013  	unsigned long count = 0;
1014  	int i;
1015  
1016  	for (i = 0; i < MAX_NR_ZONES; i++)
1017  		count += zone_numa_event_state(zones + i, item);
1018  
1019  	return count;
1020  }
1021  
1022  /*
1023   * Determine the per node value of a stat item.
1024   */
1025  unsigned long node_page_state_pages(struct pglist_data *pgdat,
1026  				    enum node_stat_item item)
1027  {
1028  	long x = atomic_long_read(&pgdat->vm_stat[item]);
1029  #ifdef CONFIG_SMP
1030  	if (x < 0)
1031  		x = 0;
1032  #endif
1033  	return x;
1034  }
1035  
1036  unsigned long node_page_state(struct pglist_data *pgdat,
1037  			      enum node_stat_item item)
1038  {
1039  	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1040  
1041  	return node_page_state_pages(pgdat, item);
1042  }
1043  #endif
1044  
1045  #ifdef CONFIG_COMPACTION
1046  
1047  struct contig_page_info {
1048  	unsigned long free_pages;
1049  	unsigned long free_blocks_total;
1050  	unsigned long free_blocks_suitable;
1051  };
1052  
1053  /*
1054   * Calculate the number of free pages in a zone, how many contiguous
1055   * pages are free and how many are large enough to satisfy an allocation of
1056   * the target size. Note that this function makes no attempt to estimate
1057   * how many suitable free blocks there *might* be if MOVABLE pages were
1058   * migrated. Calculating that is possible, but expensive and can be
1059   * figured out from userspace
1060   */
1061  static void fill_contig_page_info(struct zone *zone,
1062  				unsigned int suitable_order,
1063  				struct contig_page_info *info)
1064  {
1065  	unsigned int order;
1066  
1067  	info->free_pages = 0;
1068  	info->free_blocks_total = 0;
1069  	info->free_blocks_suitable = 0;
1070  
1071  	for (order = 0; order < MAX_ORDER; order++) {
1072  		unsigned long blocks;
1073  
1074  		/*
1075  		 * Count number of free blocks.
1076  		 *
1077  		 * Access to nr_free is lockless as nr_free is used only for
1078  		 * diagnostic purposes. Use data_race to avoid KCSAN warning.
1079  		 */
1080  		blocks = data_race(zone->free_area[order].nr_free);
1081  		info->free_blocks_total += blocks;
1082  
1083  		/* Count free base pages */
1084  		info->free_pages += blocks << order;
1085  
1086  		/* Count the suitable free blocks */
1087  		if (order >= suitable_order)
1088  			info->free_blocks_suitable += blocks <<
1089  						(order - suitable_order);
1090  	}
1091  }
1092  
1093  /*
1094   * A fragmentation index only makes sense if an allocation of a requested
1095   * size would fail. If that is true, the fragmentation index indicates
1096   * whether external fragmentation or a lack of memory was the problem.
1097   * The value can be used to determine if page reclaim or compaction
1098   * should be used
1099   */
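/*
 * Example: an order-4 request needs 16 contiguous pages. If the zone has 16
 * free pages but they are all isolated order-0 blocks, free_blocks_total is
 * 16 and free_blocks_suitable is 0, so the index is
 * 1000 - (1000 + 16 * 1000 / 16) / 16 = 875, i.e. ~0.875: the failure would
 * be mostly due to fragmentation rather than a lack of free memory.
 */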
1100  static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
1101  {
1102  	unsigned long requested = 1UL << order;
1103  
1104  	if (WARN_ON_ONCE(order >= MAX_ORDER))
1105  		return 0;
1106  
1107  	if (!info->free_blocks_total)
1108  		return 0;
1109  
1110  	/* Fragmentation index only makes sense when a request would fail */
1111  	if (info->free_blocks_suitable)
1112  		return -1000;
1113  
1114  	/*
1115  	 * Index is between 0 and 1 so return within 3 decimal places
1116  	 *
1117  	 * 0 => allocation would fail due to lack of memory
1118  	 * 1 => allocation would fail due to fragmentation
1119  	 */
1120  	return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
1121  }
1122  
1123  /*
1124   * Calculates external fragmentation within a zone wrt the given order.
1125   * It is defined as the percentage of pages found in blocks of size
1126   * less than 1 << order. It returns values in range [0, 100].
1127   */
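/*
 * Example: with order = 3 (blocks of 8 pages), 100 free pages of which 10
 * blocks are order-3 or larger, the result is
 * (100 - 10 * 8) * 100 / 100 = 20, i.e. 20% of the free pages sit in blocks
 * too small to satisfy an order-3 request.
 */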
1128  unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
1129  {
1130  	struct contig_page_info info;
1131  
1132  	fill_contig_page_info(zone, order, &info);
1133  	if (info.free_pages == 0)
1134  		return 0;
1135  
1136  	return div_u64((info.free_pages -
1137  			(info.free_blocks_suitable << order)) * 100,
1138  			info.free_pages);
1139  }
1140  
1141  /* Same as __fragmentation_index() but allocates contig_page_info on the stack */
1142  int fragmentation_index(struct zone *zone, unsigned int order)
1143  {
1144  	struct contig_page_info info;
1145  
1146  	fill_contig_page_info(zone, order, &info);
1147  	return __fragmentation_index(order, &info);
1148  }
1149  #endif
1150  
1151  #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1152      defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
1153  #ifdef CONFIG_ZONE_DMA
1154  #define TEXT_FOR_DMA(xx) xx "_dma",
1155  #else
1156  #define TEXT_FOR_DMA(xx)
1157  #endif
1158  
1159  #ifdef CONFIG_ZONE_DMA32
1160  #define TEXT_FOR_DMA32(xx) xx "_dma32",
1161  #else
1162  #define TEXT_FOR_DMA32(xx)
1163  #endif
1164  
1165  #ifdef CONFIG_HIGHMEM
1166  #define TEXT_FOR_HIGHMEM(xx) xx "_high",
1167  #else
1168  #define TEXT_FOR_HIGHMEM(xx)
1169  #endif
1170  
1171  #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1172  					TEXT_FOR_HIGHMEM(xx) xx "_movable",
1173  
1174  const char * const vmstat_text[] = {
1175  	/* enum zone_stat_item counters */
1176  	"nr_free_pages",
1177  	"nr_zone_inactive_anon",
1178  	"nr_zone_active_anon",
1179  	"nr_zone_inactive_file",
1180  	"nr_zone_active_file",
1181  	"nr_zone_unevictable",
1182  	"nr_zone_write_pending",
1183  	"nr_mlock",
1184  	"nr_bounce",
1185  #if IS_ENABLED(CONFIG_ZSMALLOC)
1186  	"nr_zspages",
1187  #endif
1188  	"nr_free_cma",
1189  
1190  	/* enum numa_stat_item counters */
1191  #ifdef CONFIG_NUMA
1192  	"numa_hit",
1193  	"numa_miss",
1194  	"numa_foreign",
1195  	"numa_interleave",
1196  	"numa_local",
1197  	"numa_other",
1198  #endif
1199  
1200  	/* enum node_stat_item counters */
1201  	"nr_inactive_anon",
1202  	"nr_active_anon",
1203  	"nr_inactive_file",
1204  	"nr_active_file",
1205  	"nr_unevictable",
1206  	"nr_slab_reclaimable",
1207  	"nr_slab_unreclaimable",
1208  	"nr_isolated_anon",
1209  	"nr_isolated_file",
1210  	"workingset_nodes",
1211  	"workingset_refault_anon",
1212  	"workingset_refault_file",
1213  	"workingset_activate_anon",
1214  	"workingset_activate_file",
1215  	"workingset_restore_anon",
1216  	"workingset_restore_file",
1217  	"workingset_nodereclaim",
1218  	"nr_anon_pages",
1219  	"nr_mapped",
1220  	"nr_file_pages",
1221  	"nr_dirty",
1222  	"nr_writeback",
1223  	"nr_writeback_temp",
1224  	"nr_shmem",
1225  	"nr_shmem_hugepages",
1226  	"nr_shmem_pmdmapped",
1227  	"nr_file_hugepages",
1228  	"nr_file_pmdmapped",
1229  	"nr_anon_transparent_hugepages",
1230  	"nr_vmscan_write",
1231  	"nr_vmscan_immediate_reclaim",
1232  	"nr_dirtied",
1233  	"nr_written",
1234  	"nr_throttled_written",
1235  	"nr_kernel_misc_reclaimable",
1236  	"nr_foll_pin_acquired",
1237  	"nr_foll_pin_released",
1238  	"nr_kernel_stack",
1239  #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1240  	"nr_shadow_call_stack",
1241  #endif
1242  	"nr_page_table_pages",
1243  #ifdef CONFIG_SWAP
1244  	"nr_swapcached",
1245  #endif
1246  #ifdef CONFIG_NUMA_BALANCING
1247  	"pgpromote_success",
1248  #endif
1249  
1250  	/* enum writeback_stat_item counters */
1251  	"nr_dirty_threshold",
1252  	"nr_dirty_background_threshold",
1253  
1254  #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
1255  	/* enum vm_event_item counters */
1256  	"pgpgin",
1257  	"pgpgout",
1258  	"pswpin",
1259  	"pswpout",
1260  
1261  	TEXTS_FOR_ZONES("pgalloc")
1262  	TEXTS_FOR_ZONES("allocstall")
1263  	TEXTS_FOR_ZONES("pgskip")
1264  
1265  	"pgfree",
1266  	"pgactivate",
1267  	"pgdeactivate",
1268  	"pglazyfree",
1269  
1270  	"pgfault",
1271  	"pgmajfault",
1272  	"pglazyfreed",
1273  
1274  	"pgrefill",
1275  	"pgreuse",
1276  	"pgsteal_kswapd",
1277  	"pgsteal_direct",
1278  	"pgdemote_kswapd",
1279  	"pgdemote_direct",
1280  	"pgscan_kswapd",
1281  	"pgscan_direct",
1282  	"pgscan_direct_throttle",
1283  	"pgscan_anon",
1284  	"pgscan_file",
1285  	"pgsteal_anon",
1286  	"pgsteal_file",
1287  
1288  #ifdef CONFIG_NUMA
1289  	"zone_reclaim_failed",
1290  #endif
1291  	"pginodesteal",
1292  	"slabs_scanned",
1293  	"kswapd_inodesteal",
1294  	"kswapd_low_wmark_hit_quickly",
1295  	"kswapd_high_wmark_hit_quickly",
1296  	"pageoutrun",
1297  
1298  	"pgrotated",
1299  
1300  	"drop_pagecache",
1301  	"drop_slab",
1302  	"oom_kill",
1303  
1304  #ifdef CONFIG_NUMA_BALANCING
1305  	"numa_pte_updates",
1306  	"numa_huge_pte_updates",
1307  	"numa_hint_faults",
1308  	"numa_hint_faults_local",
1309  	"numa_pages_migrated",
1310  #endif
1311  #ifdef CONFIG_MIGRATION
1312  	"pgmigrate_success",
1313  	"pgmigrate_fail",
1314  	"thp_migration_success",
1315  	"thp_migration_fail",
1316  	"thp_migration_split",
1317  #endif
1318  #ifdef CONFIG_COMPACTION
1319  	"compact_migrate_scanned",
1320  	"compact_free_scanned",
1321  	"compact_isolated",
1322  	"compact_stall",
1323  	"compact_fail",
1324  	"compact_success",
1325  	"compact_daemon_wake",
1326  	"compact_daemon_migrate_scanned",
1327  	"compact_daemon_free_scanned",
1328  #endif
1329  
1330  #ifdef CONFIG_HUGETLB_PAGE
1331  	"htlb_buddy_alloc_success",
1332  	"htlb_buddy_alloc_fail",
1333  #endif
1334  #ifdef CONFIG_CMA
1335  	"cma_alloc_success",
1336  	"cma_alloc_fail",
1337  #endif
1338  	"unevictable_pgs_culled",
1339  	"unevictable_pgs_scanned",
1340  	"unevictable_pgs_rescued",
1341  	"unevictable_pgs_mlocked",
1342  	"unevictable_pgs_munlocked",
1343  	"unevictable_pgs_cleared",
1344  	"unevictable_pgs_stranded",
1345  
1346  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1347  	"thp_fault_alloc",
1348  	"thp_fault_fallback",
1349  	"thp_fault_fallback_charge",
1350  	"thp_collapse_alloc",
1351  	"thp_collapse_alloc_failed",
1352  	"thp_file_alloc",
1353  	"thp_file_fallback",
1354  	"thp_file_fallback_charge",
1355  	"thp_file_mapped",
1356  	"thp_split_page",
1357  	"thp_split_page_failed",
1358  	"thp_deferred_split_page",
1359  	"thp_split_pmd",
1360  	"thp_scan_exceed_none_pte",
1361  	"thp_scan_exceed_swap_pte",
1362  	"thp_scan_exceed_share_pte",
1363  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1364  	"thp_split_pud",
1365  #endif
1366  	"thp_zero_page_alloc",
1367  	"thp_zero_page_alloc_failed",
1368  	"thp_swpout",
1369  	"thp_swpout_fallback",
1370  #endif
1371  #ifdef CONFIG_MEMORY_BALLOON
1372  	"balloon_inflate",
1373  	"balloon_deflate",
1374  #ifdef CONFIG_BALLOON_COMPACTION
1375  	"balloon_migrate",
1376  #endif
1377  #endif /* CONFIG_MEMORY_BALLOON */
1378  #ifdef CONFIG_DEBUG_TLBFLUSH
1379  	"nr_tlb_remote_flush",
1380  	"nr_tlb_remote_flush_received",
1381  	"nr_tlb_local_flush_all",
1382  	"nr_tlb_local_flush_one",
1383  #endif /* CONFIG_DEBUG_TLBFLUSH */
1384  
1385  #ifdef CONFIG_DEBUG_VM_VMACACHE
1386  	"vmacache_find_calls",
1387  	"vmacache_find_hits",
1388  #endif
1389  #ifdef CONFIG_SWAP
1390  	"swap_ra",
1391  	"swap_ra_hit",
1392  #ifdef CONFIG_KSM
1393  	"ksm_swpin_copy",
1394  #endif
1395  #endif
1396  #ifdef CONFIG_KSM
1397  	"cow_ksm",
1398  #endif
1399  #ifdef CONFIG_ZSWAP
1400  	"zswpin",
1401  	"zswpout",
1402  #endif
1403  #ifdef CONFIG_X86
1404  	"direct_map_level2_splits",
1405  	"direct_map_level3_splits",
1406  #endif
1407  #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
1408  };
1409  #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
1410  
1411  #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1412       defined(CONFIG_PROC_FS)
1413  static void *frag_start(struct seq_file *m, loff_t *pos)
1414  {
1415  	pg_data_t *pgdat;
1416  	loff_t node = *pos;
1417  
1418  	for (pgdat = first_online_pgdat();
1419  	     pgdat && node;
1420  	     pgdat = next_online_pgdat(pgdat))
1421  		--node;
1422  
1423  	return pgdat;
1424  }
1425  
1426  static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1427  {
1428  	pg_data_t *pgdat = (pg_data_t *)arg;
1429  
1430  	(*pos)++;
1431  	return next_online_pgdat(pgdat);
1432  }
1433  
1434  static void frag_stop(struct seq_file *m, void *arg)
1435  {
1436  }
1437  
1438  /*
1439   * Walk zones in a node and print using a callback.
1440   * If @assert_populated is true, only use callback for zones that are populated.
1441   */
1442  static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
1443  		bool assert_populated, bool nolock,
1444  		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1445  {
1446  	struct zone *zone;
1447  	struct zone *node_zones = pgdat->node_zones;
1448  	unsigned long flags;
1449  
1450  	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1451  		if (assert_populated && !populated_zone(zone))
1452  			continue;
1453  
1454  		if (!nolock)
1455  			spin_lock_irqsave(&zone->lock, flags);
1456  		print(m, pgdat, zone);
1457  		if (!nolock)
1458  			spin_unlock_irqrestore(&zone->lock, flags);
1459  	}
1460  }
1461  #endif
1462  
1463  #ifdef CONFIG_PROC_FS
1464  static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1465  						struct zone *zone)
1466  {
1467  	int order;
1468  
1469  	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1470  	for (order = 0; order < MAX_ORDER; ++order)
1471  		/*
1472  		 * Access to nr_free is lockless as nr_free is used only for
1473  		 * printing purposes. Use data_race to avoid KCSAN warning.
1474  		 */
1475  		seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
1476  	seq_putc(m, '\n');
1477  }
1478  
1479  /*
1480   * This walks the free areas for each zone.
1481   */
1482  static int frag_show(struct seq_file *m, void *arg)
1483  {
1484  	pg_data_t *pgdat = (pg_data_t *)arg;
1485  	walk_zones_in_node(m, pgdat, true, false, frag_show_print);
1486  	return 0;
1487  }
1488  
1489  static void pagetypeinfo_showfree_print(struct seq_file *m,
1490  					pg_data_t *pgdat, struct zone *zone)
1491  {
1492  	int order, mtype;
1493  
1494  	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1495  		seq_printf(m, "Node %4d, zone %8s, type %12s ",
1496  					pgdat->node_id,
1497  					zone->name,
1498  					migratetype_names[mtype]);
1499  		for (order = 0; order < MAX_ORDER; ++order) {
1500  			unsigned long freecount = 0;
1501  			struct free_area *area;
1502  			struct list_head *curr;
1503  			bool overflow = false;
1504  
1505  			area = &(zone->free_area[order]);
1506  
1507  			list_for_each(curr, &area->free_list[mtype]) {
1508  				/*
1509  				 * Cap the free_list iteration because it might
1510  				 * be really large and we are under a spinlock
1511  				 * so a long time spent here could trigger a
1512  				 * hard lockup detector. Anyway this is a
1513  				 * debugging tool so knowing there is a handful
1514  				 * of pages of this order should be more than
1515  				 * sufficient.
1516  				 */
1517  				if (++freecount >= 100000) {
1518  					overflow = true;
1519  					break;
1520  				}
1521  			}
1522  			seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1523  			spin_unlock_irq(&zone->lock);
1524  			cond_resched();
1525  			spin_lock_irq(&zone->lock);
1526  		}
1527  		seq_putc(m, '\n');
1528  	}
1529  }
1530  
1531  /* Print out the free pages at each order for each migratetype */
1532  static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
1533  {
1534  	int order;
1535  	pg_data_t *pgdat = (pg_data_t *)arg;
1536  
1537  	/* Print header */
1538  	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1539  	for (order = 0; order < MAX_ORDER; ++order)
1540  		seq_printf(m, "%6d ", order);
1541  	seq_putc(m, '\n');
1542  
1543  	walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
1544  }
1545  
1546  static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1547  					pg_data_t *pgdat, struct zone *zone)
1548  {
1549  	int mtype;
1550  	unsigned long pfn;
1551  	unsigned long start_pfn = zone->zone_start_pfn;
1552  	unsigned long end_pfn = zone_end_pfn(zone);
1553  	unsigned long count[MIGRATE_TYPES] = { 0, };
1554  
1555  	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1556  		struct page *page;
1557  
1558  		page = pfn_to_online_page(pfn);
1559  		if (!page)
1560  			continue;
1561  
1562  		if (page_zone(page) != zone)
1563  			continue;
1564  
1565  		mtype = get_pageblock_migratetype(page);
1566  
1567  		if (mtype < MIGRATE_TYPES)
1568  			count[mtype]++;
1569  	}
1570  
1571  	/* Print counts */
1572  	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1573  	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1574  		seq_printf(m, "%12lu ", count[mtype]);
1575  	seq_putc(m, '\n');
1576  }
1577  
1578  /* Print out the number of pageblocks for each migratetype */
1579  static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1580  {
1581  	int mtype;
1582  	pg_data_t *pgdat = (pg_data_t *)arg;
1583  
1584  	seq_printf(m, "\n%-23s", "Number of blocks type ");
1585  	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1586  		seq_printf(m, "%12s ", migratetype_names[mtype]);
1587  	seq_putc(m, '\n');
1588  	walk_zones_in_node(m, pgdat, true, false,
1589  		pagetypeinfo_showblockcount_print);
1590  }
1591  
1592  /*
1593   * Print out the number of pageblocks for each migratetype that contain pages
1594   * of other types. This gives an indication of how well fallbacks are being
1595   * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1596   * to determine what is going on
1597   */
1598  static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1599  {
1600  #ifdef CONFIG_PAGE_OWNER
1601  	int mtype;
1602  
1603  	if (!static_branch_unlikely(&page_owner_inited))
1604  		return;
1605  
1606  	drain_all_pages(NULL);
1607  
1608  	seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1609  	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1610  		seq_printf(m, "%12s ", migratetype_names[mtype]);
1611  	seq_putc(m, '\n');
1612  
1613  	walk_zones_in_node(m, pgdat, true, true,
1614  		pagetypeinfo_showmixedcount_print);
1615  #endif /* CONFIG_PAGE_OWNER */
1616  }
1617  
1618  /*
1619   * This prints out statistics in relation to grouping pages by mobility.
1620   * It is expensive to collect so do not constantly read the file.
1621   */
1622  static int pagetypeinfo_show(struct seq_file *m, void *arg)
1623  {
1624  	pg_data_t *pgdat = (pg_data_t *)arg;
1625  
1626  	/* check memoryless node */
1627  	if (!node_state(pgdat->node_id, N_MEMORY))
1628  		return 0;
1629  
1630  	seq_printf(m, "Page block order: %d\n", pageblock_order);
1631  	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
1632  	seq_putc(m, '\n');
1633  	pagetypeinfo_showfree(m, pgdat);
1634  	pagetypeinfo_showblockcount(m, pgdat);
1635  	pagetypeinfo_showmixedcount(m, pgdat);
1636  
1637  	return 0;
1638  }
1639  
1640  static const struct seq_operations fragmentation_op = {
1641  	.start	= frag_start,
1642  	.next	= frag_next,
1643  	.stop	= frag_stop,
1644  	.show	= frag_show,
1645  };
1646  
1647  static const struct seq_operations pagetypeinfo_op = {
1648  	.start	= frag_start,
1649  	.next	= frag_next,
1650  	.stop	= frag_stop,
1651  	.show	= pagetypeinfo_show,
1652  };
1653  
1654  static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1655  {
1656  	int zid;
1657  
1658  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1659  		struct zone *compare = &pgdat->node_zones[zid];
1660  
1661  		if (populated_zone(compare))
1662  			return zone == compare;
1663  	}
1664  
1665  	return false;
1666  }
1667  
1668  static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1669  							struct zone *zone)
1670  {
1671  	int i;
1672  	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1673  	if (is_zone_first_populated(pgdat, zone)) {
1674  		seq_printf(m, "\n  per-node stats");
1675  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1676  			unsigned long pages = node_page_state_pages(pgdat, i);
1677  
1678  			if (vmstat_item_print_in_thp(i))
1679  				pages /= HPAGE_PMD_NR;
1680  			seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
1681  				   pages);
1682  		}
1683  	}
1684  	seq_printf(m,
1685  		   "\n  pages free     %lu"
1686  		   "\n        boost    %lu"
1687  		   "\n        min      %lu"
1688  		   "\n        low      %lu"
1689  		   "\n        high     %lu"
1690  		   "\n        spanned  %lu"
1691  		   "\n        present  %lu"
1692  		   "\n        managed  %lu"
1693  		   "\n        cma      %lu",
1694  		   zone_page_state(zone, NR_FREE_PAGES),
1695  		   zone->watermark_boost,
1696  		   min_wmark_pages(zone),
1697  		   low_wmark_pages(zone),
1698  		   high_wmark_pages(zone),
1699  		   zone->spanned_pages,
1700  		   zone->present_pages,
1701  		   zone_managed_pages(zone),
1702  		   zone_cma_pages(zone));
1703  
1704  	seq_printf(m,
1705  		   "\n        protection: (%ld",
1706  		   zone->lowmem_reserve[0]);
1707  	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1708  		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1709  	seq_putc(m, ')');
1710  
1711  	/* If unpopulated, no other information is useful */
1712  	if (!populated_zone(zone)) {
1713  		seq_putc(m, '\n');
1714  		return;
1715  	}
1716  
1717  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1718  		seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
1719  			   zone_page_state(zone, i));
1720  
1721  #ifdef CONFIG_NUMA
1722  	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1723  		seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
1724  			   zone_numa_event_state(zone, i));
1725  #endif
1726  
1727  	seq_printf(m, "\n  pagesets");
1728  	for_each_online_cpu(i) {
1729  		struct per_cpu_pages *pcp;
1730  		struct per_cpu_zonestat __maybe_unused *pzstats;
1731  
1732  		pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
1733  		seq_printf(m,
1734  			   "\n    cpu: %i"
1735  			   "\n              count: %i"
1736  			   "\n              high:  %i"
1737  			   "\n              batch: %i",
1738  			   i,
1739  			   pcp->count,
1740  			   pcp->high,
1741  			   pcp->batch);
1742  #ifdef CONFIG_SMP
1743  		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
1744  		seq_printf(m, "\n  vm stats threshold: %d",
1745  				pzstats->stat_threshold);
1746  #endif
1747  	}
1748  	seq_printf(m,
1749  		   "\n  node_unreclaimable:  %u"
1750  		   "\n  start_pfn:           %lu",
1751  		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1752  		   zone->zone_start_pfn);
1753  	seq_putc(m, '\n');
1754  }
1755  
1756  /*
1757   * Output information about zones in @pgdat.  All zones are printed regardless
1758   * of whether they are populated or not: lowmem_reserve_ratio operates on the
1759   * set of all zones and userspace would not be aware of such zones if they are
1760   * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1761   */
1762  static int zoneinfo_show(struct seq_file *m, void *arg)
1763  {
1764  	pg_data_t *pgdat = (pg_data_t *)arg;
1765  	walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
1766  	return 0;
1767  }
1768  
1769  static const struct seq_operations zoneinfo_op = {
1770  	.start	= frag_start, /* iterate over all zones. The same as in
1771  			       * fragmentation. */
1772  	.next	= frag_next,
1773  	.stop	= frag_stop,
1774  	.show	= zoneinfo_show,
1775  };
1776  
1777  #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
1778  			 NR_VM_NUMA_EVENT_ITEMS + \
1779  			 NR_VM_NODE_STAT_ITEMS + \
1780  			 NR_VM_WRITEBACK_STAT_ITEMS + \
1781  			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1782  			  NR_VM_EVENT_ITEMS : 0))
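/*
 * The blocks copied into the private array by vmstat_start() below must be
 * laid out in the same order as the corresponding names in vmstat_text[]
 * above: zone stats, then (on NUMA) numa event counters, node stats, the
 * writeback thresholds and finally the vm event counters.
 */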
1783  
1784  static void *vmstat_start(struct seq_file *m, loff_t *pos)
1785  {
1786  	unsigned long *v;
1787  	int i;
1788  
1789  	if (*pos >= NR_VMSTAT_ITEMS)
1790  		return NULL;
1791  
1792  	BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
1793  	fold_vm_numa_events();
1794  	v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
1795  	m->private = v;
1796  	if (!v)
1797  		return ERR_PTR(-ENOMEM);
1798  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1799  		v[i] = global_zone_page_state(i);
1800  	v += NR_VM_ZONE_STAT_ITEMS;
1801  
1802  #ifdef CONFIG_NUMA
1803  	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1804  		v[i] = global_numa_event_state(i);
1805  	v += NR_VM_NUMA_EVENT_ITEMS;
1806  #endif
1807  
1808  	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1809  		v[i] = global_node_page_state_pages(i);
1810  		if (vmstat_item_print_in_thp(i))
1811  			v[i] /= HPAGE_PMD_NR;
1812  	}
1813  	v += NR_VM_NODE_STAT_ITEMS;
1814  
1815  	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1816  			    v + NR_DIRTY_THRESHOLD);
1817  	v += NR_VM_WRITEBACK_STAT_ITEMS;
1818  
1819  #ifdef CONFIG_VM_EVENT_COUNTERS
1820  	all_vm_events(v);
1821  	v[PGPGIN] /= 2;		/* sectors -> kbytes */
1822  	v[PGPGOUT] /= 2;
1823  #endif
1824  	return (unsigned long *)m->private + *pos;
1825  }
1826  
1827  static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1828  {
1829  	(*pos)++;
1830  	if (*pos >= NR_VMSTAT_ITEMS)
1831  		return NULL;
1832  	return (unsigned long *)m->private + *pos;
1833  }
1834  
1835  static int vmstat_show(struct seq_file *m, void *arg)
1836  {
1837  	unsigned long *l = arg;
1838  	unsigned long off = l - (unsigned long *)m->private;
1839  
1840  	seq_puts(m, vmstat_text[off]);
1841  	seq_put_decimal_ull(m, " ", *l);
1842  	seq_putc(m, '\n');
1843  
1844  	if (off == NR_VMSTAT_ITEMS - 1) {
1845  		/*
1846  		 * We've come to the end - add any deprecated counters to avoid
1847  		 * breaking userspace which might depend on them being present.
1848  		 */
1849  		seq_puts(m, "nr_unstable 0\n");
1850  	}
1851  	return 0;
1852  }
1853  
1854  static void vmstat_stop(struct seq_file *m, void *arg)
1855  {
1856  	kfree(m->private);
1857  	m->private = NULL;
1858  }
1859  
1860  static const struct seq_operations vmstat_op = {
1861  	.start	= vmstat_start,
1862  	.next	= vmstat_next,
1863  	.stop	= vmstat_stop,
1864  	.show	= vmstat_show,
1865  };
1866  #endif /* CONFIG_PROC_FS */
1867  
1868  #ifdef CONFIG_SMP
1869  static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1870  int sysctl_stat_interval __read_mostly = HZ;
1871  
1872  #ifdef CONFIG_PROC_FS
1873  static void refresh_vm_stats(struct work_struct *work)
1874  {
1875  	refresh_cpu_vm_stats(true);
1876  }
1877  
1878  int vmstat_refresh(struct ctl_table *table, int write,
1879  		   void *buffer, size_t *lenp, loff_t *ppos)
1880  {
1881  	long val;
1882  	int err;
1883  	int i;
1884  
1885  	/*
1886  	 * The regular update, every sysctl_stat_interval, may come later
1887  	 * than expected: leaving a significant amount in per_cpu buckets.
1888  	 * than expected, leaving a significant amount in the per_cpu buckets.
1889  	 * This is particularly misleading when checking a quantity of HUGE
1890  	 * pages immediately after running a test.  /proc/sys/vm/stat_refresh,
1891  	 * can be used to update the stats just before reading them.
1892  	 *
1893  	 * Oh, and since global_zone_page_state() etc. are so careful to hide
1894  	 * transiently negative values, warn here if any of the stats
1895  	 * are negative, so we know to go looking for imbalance.
1896  	 */
1897  	err = schedule_on_each_cpu(refresh_vm_stats);
1898  	if (err)
1899  		return err;
1900  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1901  		/*
1902  		 * Skip checking stats known to go negative occasionally.
1903  		 */
1904  		switch (i) {
1905  		case NR_ZONE_WRITE_PENDING:
1906  		case NR_FREE_CMA_PAGES:
1907  			continue;
1908  		}
1909  		val = atomic_long_read(&vm_zone_stat[i]);
1910  		if (val < 0) {
1911  			pr_warn("%s: %s %ld\n",
1912  				__func__, zone_stat_name(i), val);
1913  		}
1914  	}
1915  	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1916  		/*
1917  		 * Skip checking stats known to go negative occasionally.
1918  		 */
1919  		switch (i) {
1920  		case NR_WRITEBACK:
1921  			continue;
1922  		}
1923  		val = atomic_long_read(&vm_node_stat[i]);
1924  		if (val < 0) {
1925  			pr_warn("%s: %s %ld\n",
1926  				__func__, node_stat_name(i), val);
1927  		}
1928  	}
1929  	if (write)
1930  		*ppos += *lenp;
1931  	else
1932  		*lenp = 0;
1933  	return 0;
1934  }
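
/*
 * Illustrative use of the knob handled above (a sketch; run as root,
 * counter name is an example):
 *
 *	# echo 1 > /proc/sys/vm/stat_refresh
 *	# grep nr_anon_transparent_hugepages /proc/vmstat
 *
 * The write folds the per-cpu deltas into the global counters before
 * they are read, so short-lived tests see up-to-date values; any
 * negative global counter found on the way is reported via pr_warn()
 * as above.
 */
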
1935  #endif /* CONFIG_PROC_FS */
1936  
1937  static void vmstat_update(struct work_struct *w)
1938  {
1939  	if (refresh_cpu_vm_stats(true)) {
1940  		/*
1941  		 * Counters were updated, so we expect more updates
1942  		 * to occur in the future. Keep the update worker
1943  		 * thread running.
1944  		 */
1945  		queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
1946  				this_cpu_ptr(&vmstat_work),
1947  				round_jiffies_relative(sysctl_stat_interval));
1948  	}
1949  }
1950  
1951  /*
1952   * Check if the diffs for a certain cpu indicate that
1953   * an update is needed.
1954   */
1955  static bool need_update(int cpu)
1956  {
1957  	pg_data_t *last_pgdat = NULL;
1958  	struct zone *zone;
1959  
1960  	for_each_populated_zone(zone) {
1961  		struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
1962  		struct per_cpu_nodestat *n;
1963  
1964  		/*
1965  		 * The fast way of checking if there are any vmstat diffs.
1966  		 */
1967  		if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
1968  			return true;
1969  
1970  		if (last_pgdat == zone->zone_pgdat)
1971  			continue;
1972  		last_pgdat = zone->zone_pgdat;
1973  		n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
1974  		if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
1975  			return true;
1976  	}
1977  	return false;
1978  }
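
/*
 * Note: memchr_inv(buf, 0, size) used above returns NULL only when
 * every byte of buf is zero, so a non-NULL result means at least one
 * per-cpu diff is outstanding. For a hypothetical diff array of
 * {0, 0, 3, 0} it returns a pointer to the third byte and
 * need_update() reports true.
 */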
1979  
1980  /*
1981   * Switch off vmstat processing and then fold all the remaining differentials
1982   * until the diffs stay at zero. The function is used by NOHZ and can only be
1983   * invoked when tick processing is not active.
1984   */
1985  void quiet_vmstat(void)
1986  {
1987  	if (system_state != SYSTEM_RUNNING)
1988  		return;
1989  
1990  	if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
1991  		return;
1992  
1993  	if (!need_update(smp_processor_id()))
1994  		return;
1995  
1996  	/*
1997  	 * Just refresh the counters and do not care about the pending delayed
1998  	 * vmstat_update. It does not fire often enough to matter, and
1999  	 * canceling it would be too expensive from this path.
2000  	 * vmstat_shepherd will take care of that for us.
2001  	 */
2002  	refresh_cpu_vm_stats(false);
2003  }
2004  
2005  /*
2006   * Shepherd worker thread that checks the
2007   * Shepherd worker thread that checks the differentials of
2008   * processors whose per-cpu vmstat update workers have been
2009   * disabled because of inactivity, and re-queues the work
2010   * where updates are pending.
2011  static void vmstat_shepherd(struct work_struct *w);
2012  
2013  static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
2014  
2015  static void vmstat_shepherd(struct work_struct *w)
2016  {
2017  	int cpu;
2018  
2019  	cpus_read_lock();
2020  	/* Check processors whose vmstat worker threads have been disabled */
2021  	for_each_online_cpu(cpu) {
2022  		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
2023  
2024  		if (!delayed_work_pending(dw) && need_update(cpu))
2025  			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
2026  
2027  		cond_resched();
2028  	}
2029  	cpus_read_unlock();
2030  
2031  	schedule_delayed_work(&shepherd,
2032  		round_jiffies_relative(sysctl_stat_interval));
2033  }
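
/*
 * The interval used above is sysctl_stat_interval (default HZ, i.e.
 * one second). It can be tuned from userspace, e.g. (a sketch,
 * assuming the usual procfs layout):
 *
 *	# echo 10 > /proc/sys/vm/stat_interval
 *
 * which makes both the shepherd and any re-armed per-cpu vmstat_update
 * work run every 10 seconds.
 */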
2034  
2035  static void __init start_shepherd_timer(void)
2036  {
2037  	int cpu;
2038  
2039  	for_each_possible_cpu(cpu)
2040  		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
2041  			vmstat_update);
2042  
2043  	schedule_delayed_work(&shepherd,
2044  		round_jiffies_relative(sysctl_stat_interval));
2045  }
2046  
2047  static void __init init_cpu_node_state(void)
2048  {
2049  	int node;
2050  
2051  	for_each_online_node(node) {
2052  		if (!cpumask_empty(cpumask_of_node(node)))
2053  			node_set_state(node, N_CPU);
2054  	}
2055  }
2056  
2057  static int vmstat_cpu_online(unsigned int cpu)
2058  {
2059  	refresh_zone_stat_thresholds();
2060  
2061  	if (!node_state(cpu_to_node(cpu), N_CPU)) {
2062  		node_set_state(cpu_to_node(cpu), N_CPU);
2063  		set_migration_target_nodes();
2064  	}
2065  
2066  	return 0;
2067  }
2068  
2069  static int vmstat_cpu_down_prep(unsigned int cpu)
2070  {
2071  	cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2072  	return 0;
2073  }
2074  
2075  static int vmstat_cpu_dead(unsigned int cpu)
2076  {
2077  	const struct cpumask *node_cpus;
2078  	int node;
2079  
2080  	node = cpu_to_node(cpu);
2081  
2082  	refresh_zone_stat_thresholds();
2083  	node_cpus = cpumask_of_node(node);
2084  	if (!cpumask_empty(node_cpus))
2085  		return 0;
2086  
2087  	node_clear_state(node, N_CPU);
2088  	set_migration_target_nodes();
2089  
2090  	return 0;
2091  }
2092  
2093  #endif
2094  
2095  struct workqueue_struct *mm_percpu_wq;
2096  
2097  void __init init_mm_internals(void)
2098  {
2099  	int ret __maybe_unused;
2100  
2101  	mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
2102  
2103  #ifdef CONFIG_SMP
2104  	ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2105  					NULL, vmstat_cpu_dead);
2106  	if (ret < 0)
2107  		pr_err("vmstat: failed to register 'dead' hotplug state\n");
2108  
2109  	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2110  					vmstat_cpu_online,
2111  					vmstat_cpu_down_prep);
2112  	if (ret < 0)
2113  		pr_err("vmstat: failed to register 'online' hotplug state\n");
2114  
2115  	cpus_read_lock();
2116  	init_cpu_node_state();
2117  	cpus_read_unlock();
2118  
2119  	start_shepherd_timer();
2120  #endif
2121  	migrate_on_reclaim_init();
2122  #ifdef CONFIG_PROC_FS
2123  	proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
2124  	proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
2125  	proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2126  	proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
2127  #endif
2128  }
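
/*
 * Quick reference for the proc files registered above (descriptive
 * only):
 *
 *	/proc/buddyinfo		free block counts per order	(0444)
 *	/proc/pagetypeinfo	free lists per migrate type	(0400, root only)
 *	/proc/vmstat		global counters			(0444)
 *	/proc/zoneinfo		per-zone details		(0444)
 */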
2129  
2130  #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
2131  
2132  /*
2133   * Return an index indicating how much of the available free memory is
2134   * unusable for an allocation of the requested size.
2135   */
2136  static int unusable_free_index(unsigned int order,
2137  				struct contig_page_info *info)
2138  {
2139  	/* No free memory is interpreted as all free memory is unusable */
2140  	/* No free memory is interpreted as all free memory being unusable */
2141  		return 1000;
2142  
2143  	/*
2144  	 * The index is a value between 0 and 1, returned to 3 decimal
2145  	 * places, i.e. scaled by 1000.
2146  	 *
2147  	 * 0 => no fragmentation
2148  	 * 1 => high fragmentation
2149  	 */
2150  	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2151  
2152  }
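
/*
 * Worked example with hypothetical numbers: for free_pages = 1000,
 * order = 2 and free_blocks_suitable = 200, the suitable blocks cover
 * 200 << 2 = 800 pages, so
 *
 *	index = (1000 - 800) * 1000 / 1000 = 200
 *
 * which unusable_show_print() below renders as "0.200", i.e. 20% of
 * the free memory is unusable for an order-2 allocation.
 */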
2153  
2154  static void unusable_show_print(struct seq_file *m,
2155  					pg_data_t *pgdat, struct zone *zone)
2156  {
2157  	unsigned int order;
2158  	int index;
2159  	struct contig_page_info info;
2160  
2161  	seq_printf(m, "Node %d, zone %8s ",
2162  				pgdat->node_id,
2163  				zone->name);
2164  	for (order = 0; order < MAX_ORDER; ++order) {
2165  		fill_contig_page_info(zone, order, &info);
2166  		index = unusable_free_index(order, &info);
2167  		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2168  	}
2169  
2170  	seq_putc(m, '\n');
2171  }
2172  
2173  /*
2174   * Display unusable free space index
2175   *
2176   * The unusable free space index measures how much of the available free
2177   * memory cannot be used to satisfy an allocation of a given size and is a
2178   * value between 0 and 1. The higher the value, the more of the free memory
2179   * is unusable and, by implication, the worse the external fragmentation is.
2180   * This can be expressed as a percentage by multiplying by 100.
2181   */
2182  static int unusable_show(struct seq_file *m, void *arg)
2183  {
2184  	pg_data_t *pgdat = (pg_data_t *)arg;
2185  
2186  	/* check memoryless node */
2187  	if (!node_state(pgdat->node_id, N_MEMORY))
2188  		return 0;
2189  
2190  	walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
2191  
2192  	return 0;
2193  }
2194  
2195  static const struct seq_operations unusable_sops = {
2196  	.start	= frag_start,
2197  	.next	= frag_next,
2198  	.stop	= frag_stop,
2199  	.show	= unusable_show,
2200  };
2201  
2202  DEFINE_SEQ_ATTRIBUTE(unusable);
2203  
2204  static void extfrag_show_print(struct seq_file *m,
2205  					pg_data_t *pgdat, struct zone *zone)
2206  {
2207  	unsigned int order;
2208  	int index;
2209  
2210  	/* Alloc on stack as interrupts are disabled for zone walk */
2211  	struct contig_page_info info;
2212  
2213  	seq_printf(m, "Node %d, zone %8s ",
2214  				pgdat->node_id,
2215  				zone->name);
2216  	for (order = 0; order < MAX_ORDER; ++order) {
2217  		fill_contig_page_info(zone, order, &info);
2218  		index = __fragmentation_index(order, &info);
2219  		seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
2220  	}
2221  
2222  	seq_putc(m, '\n');
2223  }
2224  
2225  /*
2226   * Display the fragmentation index for the orders at which allocations would fail
2227   */
2228  static int extfrag_show(struct seq_file *m, void *arg)
2229  {
2230  	pg_data_t *pgdat = (pg_data_t *)arg;
2231  
2232  	walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
2233  
2234  	return 0;
2235  }
2236  
2237  static const struct seq_operations extfrag_sops = {
2238  	.start	= frag_start,
2239  	.next	= frag_next,
2240  	.stop	= frag_stop,
2241  	.show	= extfrag_show,
2242  };
2243  
2244  DEFINE_SEQ_ATTRIBUTE(extfrag);
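
/*
 * Interpreting extfrag_index output (based on __fragmentation_index(),
 * defined earlier in this file): an entry of -1.000 means an allocation
 * of that order would currently succeed, so no index applies; otherwise
 * values towards 0.000 suggest failure due to lack of memory, while
 * values towards 1.000 suggest failure due to external fragmentation.
 */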
2245  
2246  static int __init extfrag_debug_init(void)
2247  {
2248  	struct dentry *extfrag_debug_root;
2249  
2250  	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
2251  
2252  	debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
2253  			    &unusable_fops);
2254  
2255  	debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
2256  			    &extfrag_fops);
2257  
2258  	return 0;
2259  }
2260  
2261  module_init(extfrag_debug_init);
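
/*
 * Illustrative usage (assuming debugfs is mounted at /sys/kernel/debug):
 *
 *	# cat /sys/kernel/debug/extfrag/unusable_index
 *	# cat /sys/kernel/debug/extfrag/extfrag_index
 *
 * Both files are created above with mode 0444 under the "extfrag"
 * directory; each prints one "Node N, zone NAME" line per zone with one
 * value per allocation order, formatted by the *_show_print() helpers
 * above.
 */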
2262  #endif
2263