/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);

/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark.
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
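
/*
 * Worked example (illustrative numbers only, not used anywhere): for a
 * hypothetical zone where low_wmark_pages() - min_wmark_pages() is 1024
 * pages and 16 CPUs are online, calculate_pressure_threshold() yields
 *
 *	watermark_distance = 1024;
 *	threshold = max(1, 1024 / 16);	that is 64, well under the 125 cap
 *
 * so even if every CPU carried its full 64-page differential, the total
 * drift of 16 * 64 = 1024 pages could not take the zone below the min
 * watermark unnoticed.
 */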

int calculate_normal_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, while more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone->managed_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}

/*
 * Refresh the thresholds for each zone.
 */
void refresh_zone_stat_thresholds(void)
{
	struct zone *zone;
	int cpu;
	int threshold;

	for_each_populated_zone(zone) {
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_normal_threshold(zone);

		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}

void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *))
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = (*calculate_pressure)(zone);
		for_each_possible_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;
	}
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x;
	long t;

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(x > t || x < -t)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
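
/*
 * Worked example (illustrative, assumed threshold): with a per-cpu
 * stat_threshold of 32, a caller that repeatedly does
 *
 *	__mod_zone_page_state(zone, NR_FILE_PAGES, 8);
 *
 * accumulates 8, 16, 24, 32 in this CPU's vm_stat_diff[] without touching
 * the shared atomics; the fifth call sees x = 40 > t, folds all 40 pages
 * into the zone and global counters via zone_page_state_add() and resets
 * the local differential to 0.
 */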

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v + overstep, zone, item);
		__this_cpu_write(*p, -overstep);
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < -t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v - overstep, zone, item);
		__this_cpu_write(*p, overstep);
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);
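
/*
 * Overstep sketch (illustrative, assumed threshold of 32): once this CPU's
 * differential for an item sits at 32, the next
 *
 *	__inc_zone_page_state(page, NR_FILE_MAPPED);
 *
 * sees v = 33 > t, adds v + overstep = 33 + 16 = 49 to the zone counter and
 * writes -16 back to the per-cpu differential. The deliberate overshoot
 * means roughly another 48 increments can be absorbed locally before the
 * shared counter has to be touched again.
 */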

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *	0	No overstepping
 *	1	Overstepping half of threshold
 *	-1	Overstepping minus half of threshold
 */
static inline void mod_state(struct zone *zone,
	enum zone_stat_item item, int delta, int overstep_mode)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long o, n, t, z;

	do {
		z = 0;	/* overflow to zone counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyway
		 * for all cpus in a zone.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (n > t || n < -t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to zone counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		zone_page_state_add(z, zone, item);
}

void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	mod_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	mod_state(zone, item, 1, 1);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);
#else
/*
 * Use interrupt disable to serialize counter updates
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
#endif
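
/*
 * Usage sketch (illustrative, hypothetical caller): from a context that may
 * be preemptible and have interrupts enabled, the non-underscored helpers
 * above are the safe choice, e.g.
 *
 *	inc_zone_page_state(page, NR_MLOCK);
 *
 * On CONFIG_HAVE_CMPXCHG_LOCAL kernels this becomes a this_cpu_cmpxchg()
 * retry loop in mod_state(); otherwise it falls back to bracketing the
 * interrupt-safe __inc_zone_state() with local_irq_save/restore. Callers
 * that already run with interrupts disabled (for instance under a zone
 * lock taken with spin_lock_irqsave) may use the __ variants directly.
 */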

/*
 * Update the zone counters for one cpu.
 *
 * The cpu specified must be either the current cpu or a processor that
 * is not online. If it is the current cpu then the execution thread must
 * be pinned to the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing, so it is only done when necessary.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_pageset *p;

		p = per_cpu_ptr(zone->pageset, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				unsigned long flags;
				int v;

				local_irq_save(flags);
				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
				atomic_long_add(v, &zone->vm_stat[i]);
				global_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				p->expire = 3;
#endif
			}
		cond_resched();
#ifdef CONFIG_NUMA
		/*
		 * Deal with draining the remote pageset of this
		 * processor
		 *
		 * Check if there are pages remaining in this pageset;
		 * if not then there is nothing to expire.
		 */
		if (!p->expire || !p->pcp.count)
			continue;

		/*
		 * We never drain zones local to this processor.
		 */
		if (zone_to_nid(zone) == numa_node_id()) {
			p->expire = 0;
			continue;
		}

		p->expire--;
		if (p->expire)
			continue;

		if (p->pcp.count)
			drain_zone_pages(zone, &p->pcp);
#endif
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (global_diff[i])
			atomic_long_add(global_diff[i], &vm_stat[i]);
}

/*
 * This is only called if !populated_zone(zone), which implies no other
 * users of pset->vm_stat_diff[] exist.
 */
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
{
	int i;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (pset->vm_stat_diff[i]) {
			int v = pset->vm_stat_diff[i];
			pset->vm_stat_diff[i] = 0;
			atomic_long_add(v, &zone->vm_stat[i]);
			atomic_long_add(v, &vm_stat[i]);
		}
}
#endif

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z	    = the zone from which the allocation occurred.
 *
 * Must be called with interrupts disabled.
 *
 * When __GFP_OTHER_NODE is set assume the node of the preferred
 * zone is the local node. This is useful for daemons that allocate
 * memory on behalf of other processes.
 */
void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
{
	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
		__inc_zone_state(z, NUMA_HIT);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
	}
	if (z->node == ((flags & __GFP_OTHER_NODE) ?
			preferred_zone->node : numa_node_id()))
		__inc_zone_state(z, NUMA_LOCAL);
	else
		__inc_zone_state(z, NUMA_OTHER);
}
#endif
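
/*
 * Illustrative example (assumed two-node system, not part of the build):
 * a task running on node 0 asks for memory with node 0's ZONE_NORMAL as
 * the preferred zone, but the page is actually taken from node 1. With
 * interrupts disabled the allocator accounts this roughly as
 *
 *	zone_statistics(preferred_zone, z, gfp_flags);
 *
 * which increments NUMA_MISS on node 1's zone and NUMA_FOREIGN on node 0's
 * preferred zone, plus NUMA_OTHER on node 1 because the page did not come
 * from the requesting CPU's local node. A same-node allocation would
 * instead bump NUMA_HIT and NUMA_LOCAL.
 */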

#ifdef CONFIG_COMPACTION

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace.
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order < MAX_ORDER; order++) {
		unsigned long blocks;

		/* Count number of free blocks */
		blocks = zone->free_area[order].nr_free;
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used.
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
}

/* Same as __fragmentation_index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif
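
/*
 * Worked example (illustrative numbers only): suppose an order-3 request
 * (8 pages) cannot be satisfied and fill_contig_page_info() reported
 *
 *	info.free_pages		  = 1000;
 *	info.free_blocks_total	  = 500;
 *	info.free_blocks_suitable = 0;
 *
 * Then __fragmentation_index(3, &info) computes
 *
 *	1000 - (1000 + 1000 * 1000 / 8) / 500 = 1000 - 252 = 748,
 *
 * i.e. an index of 0.748, pointing at fragmentation rather than a plain
 * shortage of memory, so compaction is the more promising remedy. Had
 * free_blocks_suitable been non-zero the function would have returned
 * -1000, meaning the request should not fail at all.
 */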

#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/* Walk all the zones in a node and print using a callback */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}
#endif

#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",
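
/*
 * Expansion sketch (illustrative config): with CONFIG_ZONE_DMA and
 * CONFIG_HIGHMEM enabled but CONFIG_ZONE_DMA32 disabled,
 *
 *	TEXTS_FOR_ZONES("pgalloc")
 *
 * expands to the string literals
 *
 *	"pgalloc_dma", "pgalloc_normal", "pgalloc_high", "pgalloc_movable",
 *
 * so vmstat_text below stays in sync with the per-zone event counters
 * without listing every zone name by hand.
 */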

const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_mlock",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_kernel_stack",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",
	"nr_vmscan_immediate_reclaim",
	"nr_writeback_temp",
	"nr_isolated_anon",
	"nr_isolated_file",
	"nr_shmem",
	"nr_dirtied",
	"nr_written",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif
	"nr_anon_transparent_hugepages",
	"nr_free_cma",
	"nr_dirty_threshold",
	"nr_dirty_background_threshold",

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal_kswapd")
	TEXTS_FOR_ZONES("pgsteal_direct")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")
	"pgscan_direct_throttle",

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"pageoutrun",
	"allocstall",

	"pgrotated",

#ifdef CONFIG_NUMA_BALANCING
	"numa_pte_updates",
	"numa_hint_faults",
	"numa_hint_faults_local",
	"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
	"pgmigrate_success",
	"pgmigrate_fail",
#endif
#ifdef CONFIG_COMPACTION
	"compact_migrate_scanned",
	"compact_free_scanned",
	"compact_isolated",
	"compact_stall",
	"compact_fail",
	"compact_success",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"thp_fault_alloc",
	"thp_fault_fallback",
	"thp_collapse_alloc",
	"thp_collapse_alloc_failed",
	"thp_split",
	"thp_zero_page_alloc",
	"thp_zero_page_alloc_failed",
#endif

#endif /* CONFIG_VM_EVENT_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */


#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, frag_show_print);
	return 0;
}

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype])
				freecount++;
			seq_printf(m, "%6lu ", freecount);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);

	return 0;
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);

		/* Watch for unexpected holes punched in the memmap */
		if (!memmap_valid_within(pfn, page, zone))
			continue;

		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);

	return 0;
}
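
/*
 * Output sketch (hypothetical values, layout only): together the two
 * helpers above produce /proc/pagetypeinfo sections of roughly this shape:
 *
 *	Free pages count per migrate type at order       0      1      2 ...
 *	Node    0, zone   Normal, type    Unmovable     12      5      3 ...
 *	Node    0, zone   Normal, type      Movable    801    203     54 ...
 *
 *	Number of blocks type     Unmovable  Reclaimable      Movable ...
 *	Node 0, zone   Normal            14            2          938 ...
 *
 * The free-list counts are taken under zone->lock by walk_zones_in_node(),
 * while the block counts come from sampling one page per pageblock
 * (pageblock_nr_pages apart) and reading its migratetype bits.
 */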

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);

	return 0;
}

static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static int fragmentation_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &fragmentation_op);
}

static const struct file_operations fragmentation_file_operations = {
	.open		= fragmentation_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};

static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &pagetypeinfo_op);
}

static const struct file_operations pagetypeinfo_file_ops = {
	.open		= pagetypeinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	seq_printf(m,
		   "\n pages free %lu"
		   "\n min %lu"
		   "\n low %lu"
		   "\n high %lu"
		   "\n scanned %lu"
		   "\n spanned %lu"
		   "\n present %lu"
		   "\n managed %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->pages_scanned,
		   zone->spanned_pages,
		   zone->present_pages,
		   zone->managed_pages);

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n %-12s %lu", vmstat_text[i],
				zone_page_state(zone, i));

	seq_printf(m,
		   "\n protection: (%lu",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
	seq_printf(m,
		   ")"
		   "\n pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = per_cpu_ptr(zone->pageset, i);
		seq_printf(m,
			   "\n cpu: %i"
			   "\n count: %i"
			   "\n high: %i"
			   "\n batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n all_unreclaimable: %u"
		   "\n start_pfn: %lu"
		   "\n inactive_ratio: %u",
		   zone->all_unreclaimable,
		   zone->zone_start_pfn,
		   zone->inactive_ratio);
	seq_putc(m, '\n');
}
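
/*
 * Output sketch (hypothetical numbers, for orientation only): each zone
 * printed by zoneinfo_show_print() appears in /proc/zoneinfo roughly as
 *
 *	Node 0, zone   Normal
 *	 pages free 24300
 *	 min 2040
 *	 ...
 *	 nr_free_pages 24300
 *	 ...
 *	 protection: (0, 0, 0, 0)
 *	 pagesets
 *	 cpu: 0
 *	 count: 163
 *	 high: 186
 *	 batch: 31
 *	 vm stats threshold: 24
 *	 all_unreclaimable: 0
 *	 start_pfn: 1048576
 *	 inactive_ratio: 3
 *
 * with one per-zone stat line for every vmstat_text[] entry up to
 * NR_VM_ZONE_STAT_ITEMS.
 */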

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
	return 0;
}

static const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

static int zoneinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &zoneinfo_op);
}

static const struct file_operations proc_zoneinfo_file_operations = {
	.open		= zoneinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

enum writeback_stat_item {
	NR_DIRTY_THRESHOLD,
	NR_DIRTY_BG_THRESHOLD,
	NR_VM_WRITEBACK_STAT_ITEMS,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
	int i, stat_items_size;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
			  NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);

#ifdef CONFIG_VM_EVENT_COUNTERS
	stat_items_size += sizeof(struct vm_event_state);
#endif

	v = kmalloc(stat_items_size, GFP_KERNEL);
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
	v += NR_VM_ZONE_STAT_ITEMS;

	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
			    v + NR_DIRTY_THRESHOLD);
	v += NR_VM_WRITEBACK_STAT_ITEMS;

#ifdef CONFIG_VM_EVENT_COUNTERS
	all_vm_events(v);
	v[PGPGIN] /= 2;		/* sectors -> kbytes */
	v[PGPGOUT] /= 2;
#endif
	return (unsigned long *)m->private + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

static const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

static int vmstat_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &vmstat_op);
}

static const struct file_operations proc_vmstat_file_operations = {
	.open		= vmstat_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

static void vmstat_update(struct work_struct *w)
{
	refresh_cpu_vm_stats(smp_processor_id());
	schedule_delayed_work(&__get_cpu_var(vmstat_work),
		round_jiffies_relative(sysctl_stat_interval));
}

static void start_cpu_timer(int cpu)
{
	struct delayed_work *work = &per_cpu(vmstat_work, cpu);

	INIT_DEFERRABLE_WORK(work, vmstat_update);
	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
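
/*
 * Timing sketch (illustrative): with the default sysctl_stat_interval of
 * HZ, every online CPU runs vmstat_update() roughly once per second on a
 * deferrable, per-cpu delayed work item, folding its vm_stat_diff[] deltas
 * into the zone and global counters. round_jiffies_relative() batches the
 * wakeups so idle CPUs are not woken at arbitrary times. Assuming the
 * usual sysctl plumbing for stat_interval, the cadence can be relaxed from
 * userspace with something like
 *
 *	sysctl vm.stat_interval=10
 *
 * at the cost of /proc/vmstat and friends lagging further behind.
 */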

/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		refresh_zone_stat_thresholds();
		start_cpu_timer(cpu);
		node_set_state(cpu_to_node(cpu), N_CPU);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
		per_cpu(vmstat_work, cpu).work.func = NULL;
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };
#endif

static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
	int cpu;

	register_cpu_notifier(&vmstat_notifier);

	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
	return 0;
}
module_init(setup_vmstat)

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
#include <linux/debugfs.h>


/*
 * Return an index indicating how much of the available free memory is
 * unusable for an allocation of the requested size.
 */
static int unusable_free_index(unsigned int order,
				struct contig_page_info *info)
{
	/* No free memory is interpreted as all free memory is unusable */
	if (info->free_pages == 0)
		return 1000;

	/*
	 * Index should be a value between 0 and 1. Return a value to 3
	 * decimal places.
	 *
	 * 0 => no fragmentation
	 * 1 => high fragmentation
	 */
	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
}
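
/*
 * Worked example (illustrative numbers): for an order-4 request (16 pages)
 * against a zone where fill_contig_page_info() found
 *
 *	info.free_pages           = 1000;
 *	info.free_blocks_suitable = 10;
 *
 * unusable_free_index(4, &info) returns
 *
 *	(1000 - (10 << 4)) * 1000 / 1000 = 840,
 *
 * which the debugfs file below prints as 0.840: 84% of the free memory in
 * that zone cannot back a contiguous 16-page allocation.
 */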

static void unusable_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = unusable_free_index(order, &info);
		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}

/*
 * Display unusable free space index
 *
 * The unusable free space index measures how much of the available free
 * memory cannot be used to satisfy an allocation of a given size and is a
 * value between 0 and 1. The higher the value, the more of the available
 * free memory is unusable and, by implication, the worse the external
 * fragmentation is. This can be expressed as a percentage by multiplying
 * by 100.
 */
static int unusable_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	walk_zones_in_node(m, pgdat, unusable_show_print);

	return 0;
}

static const struct seq_operations unusable_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= unusable_show,
};

static int unusable_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &unusable_op);
}

static const struct file_operations unusable_file_ops = {
	.open		= unusable_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static void extfrag_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;

	/* Alloc on stack as interrupts are disabled for zone walk */
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = __fragmentation_index(order, &info);
		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}

/*
 * Display the fragmentation index for orders at which allocations would fail
 */
static int extfrag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	walk_zones_in_node(m, pgdat, extfrag_show_print);

	return 0;
}

static const struct seq_operations extfrag_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= extfrag_show,
};

static int extfrag_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &extfrag_op);
}

static const struct file_operations extfrag_file_ops = {
	.open		= extfrag_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init extfrag_debug_init(void)
{
	struct dentry *extfrag_debug_root;

	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
	if (!extfrag_debug_root)
		return -ENOMEM;

	if (!debugfs_create_file("unusable_index", 0444,
			extfrag_debug_root, NULL, &unusable_file_ops))
		goto fail;

	if (!debugfs_create_file("extfrag_index", 0444,
			extfrag_debug_root, NULL, &extfrag_file_ops))
		goto fail;

	return 0;
fail:
	debugfs_remove_recursive(extfrag_debug_root);
	return -ENOMEM;
}

module_init(extfrag_debug_init);
#endif