/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* for remembering the boot option */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		(0)
#endif


/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAPOUT,	/* # of pages, swapped out */
	MEM_CGROUP_STAT_DATA,		/* end of data requires synchronization */
	MEM_CGROUP_ON_MOVE,		/* someone is moving account between groups */
	MEM_CGROUP_STAT_NSTATS,
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_COUNT,	/* # of pages paged in/out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};
/*
 * The per-memcg event counter is incremented at every pagein/pageout. With
 * THP, it is incremented by the number of pages. This counter is used to
 * trigger some periodic events; this is straightforward and better than
 * using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET (128)
#define SOFTLIMIT_EVENTS_TARGET (1024)
#define NUMAINFO_EVENTS_TARGET	(1024)

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
						/* use container_of	   */
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);

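/*
 * Worked example for the threshold structures above (numbers are assumed,
 * for illustration only): with thresholds of 4M, 8M and 16M stored sorted
 * in primary->entries[], a usage of 10M leaves ->current_threshold at
 * index 1 (the 8M entry), i.e. the last threshold below the current usage;
 * crossing 16M would move it to index 2.
 */
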
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;
	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from.
	 */
	int last_scanned_child;
	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;

	bool		oom_lock;
	atomic_t	under_oom;

	atomic_t	refcnt;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;
};

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON,
			&mc.to->move_charge_at_immigrate);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE,
			&mc.to->move_charge_at_immigrate);
}

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

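/*
 * Worked example of the MEMFILE_* encoding above (illustrative only): a
 * control file set up with
 *
 *	.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
 *
 * yields MEMFILE_TYPE(cft->private) == _MEMSWAP and
 * MEMFILE_ATTR(cft->private) == RES_LIMIT, so one read/write handler can
 * recover both which res_counter (memory vs mem+swap) and which attribute
 * the file refers to from a single integer.
 */
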
/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
#define MEM_CGROUP_RECLAIM_SOFT	(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(struct mem_cgroup *mem);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
{
	return &mem->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(mem, mz, mctz);
	spin_unlock(&mctz->lock);
}


static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; mem; mem = parent_mem_cgroup(mem)) {
		mz = mem_cgroup_zoneinfo(mem, nid, zid);
		excess = res_counter_soft_limit_excess(&mem->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mem, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node_state(node, N_POSSIBLE) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(mem, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(mem, mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
		!css_tryget(&mz->mem->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

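/*
 * Summary sketch of how the helpers above tie together (no new behaviour):
 * memcg_check_events() calls mem_cgroup_update_tree(), which keeps each
 * per-node/zone RB-tree ordered by usage_in_excess. Soft limit reclaim asks
 * mem_cgroup_largest_soft_limit_node() for the rightmost (worst) offender,
 * reclaims from it, and is expected to insert it back with its new excess,
 * as noted in __mem_cgroup_largest_soft_limit_node().
 */
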
/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use thresholds and do periodic
 * synchronization to implement "quick" reads. There is a trade-off between
 * reading cost and precision of the value. We may get a chance to implement
 * such a periodic synchronization of the counters in memcg's counters, too.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact
 * value because of that accounting. Even if we provided a quick-and-fuzzy
 * read, we would still have to visit all online cpus and compute the sum.
 * So, for now, unnecessary synchronization is not implemented (it is only
 * implemented for cpu hotplug).
 *
 * If there are kernel-internal users which can make use of a not-exact
 * value, and reading all cpu values becomes a performance bottleneck in
 * some common workload, thresholds and synchronization as in vmstat[]
 * should be implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *mem,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(mem->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&mem->pcp_counter_lock);
	val += mem->nocpu_base.count[idx];
	spin_unlock(&mem->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}

void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
{
	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
}

void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
{
	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	for_each_online_cpu(cpu)
		val += per_cpu(mem->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&mem->pcp_counter_lock);
	val += mem->nocpu_base.events[idx];
	spin_unlock(&mem->pcp_counter_lock);
#endif
	return val;
}

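/*
 * Usage sketch for the readers above (illustrative): the total number of
 * pages this memcg has swapped out, for example, is
 *
 *	long nr = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
 *
 * i.e. the sum over all online cpus plus nocpu_base, where values from
 * offlined cpus are parked (see the comment at mem_cgroup_read_stat()).
 */
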
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 bool file, int nr_pages)
{
	preempt_disable();

	if (file)
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
	else
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);

	preempt_enable();
}

unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
			     unsigned int lru_mask)
{
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	unsigned long ret = 0;

	mz = mem_cgroup_zoneinfo(mem, nid, zid);

	for_each_lru(l) {
		if (BIT(l) & lru_mask)
			ret += MEM_CGROUP_ZSTAT(mz, l);
	}
	return ret;
}

static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
			int nid, unsigned int lru_mask)
{
	u64 total = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);

	return total;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
			unsigned int lru_mask)
{
	int nid;
	u64 total = 0;

	for_each_node_state(nid, N_HIGH_MEMORY)
		total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
	return total;
}

static bool __memcg_event_check(struct mem_cgroup *mem, int target)
{
	unsigned long val, next;

	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
	next = this_cpu_read(mem->stat->targets[target]);
	/* from time_after() in jiffies.h */
	return ((long)next - (long)val < 0);
}

static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
{
	unsigned long val, next;

	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);

	switch (target) {
	case MEM_CGROUP_TARGET_THRESH:
		next = val + THRESHOLDS_EVENTS_TARGET;
		break;
	case MEM_CGROUP_TARGET_SOFTLIMIT:
		next = val + SOFTLIMIT_EVENTS_TARGET;
		break;
	case MEM_CGROUP_TARGET_NUMAINFO:
		next = val + NUMAINFO_EVENTS_TARGET;
		break;
	default:
		return;
	}

	this_cpu_write(mem->stat->targets[target], next);
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
		mem_cgroup_threshold(mem);
		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
		if (unlikely(__memcg_event_check(mem,
			     MEM_CGROUP_TARGET_SOFTLIMIT))) {
			mem_cgroup_update_tree(mem, page);
			__mem_cgroup_target_update(mem,
						   MEM_CGROUP_TARGET_SOFTLIMIT);
		}
#if MAX_NUMNODES > 1
		if (unlikely(__memcg_event_check(mem,
			     MEM_CGROUP_TARGET_NUMAINFO))) {
			atomic_inc(&mem->numainfo_events);
			__mem_cgroup_target_update(mem,
						   MEM_CGROUP_TARGET_NUMAINFO);
		}
#endif
	}
}

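/*
 * Illustrative walk-through of the event-target machinery above (a sketch,
 * not additional behaviour): EVENTS_COUNT grows by one per page charged or
 * uncharged on this cpu. __memcg_event_check() returns true once the counter
 * has passed the per-cpu target, using a signed difference so the comparison
 * survives wrap-around, just like time_after(). memcg_check_events() then
 * does the threshold work and __mem_cgroup_target_update() re-arms the
 * target at "counter + THRESHOLDS_EVENTS_TARGET", so the check fires roughly
 * every 128 page events per cpu (every 1024 for the softlimit and numainfo
 * targets).
 */
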
static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *mem = NULL;

	if (!mm)
		return NULL;
	/*
	 * Because we have no locks, mm->owner may be being moved to another
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem))
			break;
	} while (!css_tryget(&mem->css));
	rcu_read_unlock();
	return mem;
}

/* The caller has to guarantee "mem" exists before calling this */
static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
{
	struct cgroup_subsys_state *css;
	int found;

	if (!mem) /* ROOT cgroup has the smallest ID */
		return root_mem_cgroup; /*css_put/get against root is ignored*/
	if (!mem->use_hierarchy) {
		if (css_tryget(&mem->css))
			return mem;
		return NULL;
	}
	rcu_read_lock();
	/*
	 * searching a memory cgroup which has the smallest ID under given
	 * ROOT cgroup. (ID >= 1)
	 */
	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
	if (css && css_tryget(css))
		mem = container_of(css, struct mem_cgroup, css);
	else
		mem = NULL;
	rcu_read_unlock();
	return mem;
}

static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
					struct mem_cgroup *root,
					bool cond)
{
	int nextid = css_id(&iter->css) + 1;
	int found;
	int hierarchy_used;
	struct cgroup_subsys_state *css;

	hierarchy_used = iter->use_hierarchy;

	css_put(&iter->css);
	/* If no ROOT, walk all, ignore hierarchy */
	if (!cond || (root && !hierarchy_used))
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	do {
		iter = NULL;
		rcu_read_lock();

		css = css_get_next(&mem_cgroup_subsys, nextid,
				&root->css, &found);
		if (css && css_tryget(css))
			iter = container_of(css, struct mem_cgroup, css);
		rcu_read_unlock();
		/* If css is NULL, no more cgroups will be found */
		nextid = found + 1;
	} while (css && !iter);

	return iter;
}
/*
 * for_each_mem_cgroup_tree() is for visiting all cgroups under a tree. Please
 * be careful: "break"ing out of the loop is not allowed because we hold a
 * reference count. Instead, set "cond" to false and "continue" to exit the
 * loop.
 */
#define for_each_mem_cgroup_tree_cond(iter, root, cond)	\
	for (iter = mem_cgroup_start_loop(root);\
	     iter != NULL;\
	     iter = mem_cgroup_get_next(iter, root, cond))

#define for_each_mem_cgroup_tree(iter, root) \
	for_each_mem_cgroup_tree_cond(iter, root, true)

#define for_each_mem_cgroup_all(iter) \
	for_each_mem_cgroup_tree_cond(iter, NULL, true)

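/*
 * Sketch of how the "cond" variant above is meant to be used (the predicate
 * name below is a stand-in, not a real function): every step of the walk
 * holds a css reference that mem_cgroup_get_next() drops on the next
 * iteration, so a bare "break" would leak that reference. Instead:
 *
 *	bool cond = true;
 *
 *	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
 *		if (stop_condition(iter))
 *			cond = false;	(current pass finishes, walk ends)
 *	}
 *
 * mem_cgroup_oom_lock() later in this file follows exactly this pattern.
 */
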
static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
	return (mem == root_mem_cgroup);
}

void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
	struct mem_cgroup *mem;

	if (!mm)
		return;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem))
		goto out;

	switch (idx) {
	case PGMAJFAULT:
		mem_cgroup_pgmajfault(mem, 1);
		break;
	case PGFAULT:
		mem_cgroup_pgfault(mem, 1);
		break;
	default:
		BUG();
	}
out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(mem_cgroup_count_vm_event);

/*
 * The following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by the global LRU routines independently of memcg.
 * What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache: it is added to the LRU before charge.
 * If the PCG_USED bit is not set, the page_cgroup is not added to this
 * private LRU.
 * When moving account, the page is not on the LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (!TestClearPageCgroupAcctLRU(pc))
		return;
	VM_BUG_ON(!pc->mem_cgroup);
	/*
	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
	 * removed from global LRU.
	 */
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	/* huge page split is done under lru_lock. so, we have no races. */
	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	VM_BUG_ON(list_empty(&pc->lru));
	list_del_init(&pc->lru);
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim. If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void mem_cgroup_rotate_reclaimable_page(struct page *page)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;
	enum lru_list lru = page_lru(page);

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/* unused or root page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
	smp_rmb();
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	list_move_tail(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/* unused or root page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
	smp_rmb();
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	VM_BUG_ON(PageCgroupAcctLRU(pc));
	if (!PageCgroupUsed(pc))
		return;
	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
	smp_rmb();
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	/* huge page split is done under lru_lock. so, we have no races. */
	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
	SetPageCgroupAcctLRU(pc);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
 * while it's linked to the lru because the page may be reused after it's fully
 * uncharged. To handle that, unlink page_cgroup from the LRU when we charge
 * it again. This is done under lock_page and with the expectation that
 * zone->lru_lock is never held.
 */
static void mem_cgroup_lru_del_before_commit(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	/*
	 * Doing this check without taking ->lru_lock seems wrong but this
	 * is safe. Because if page_cgroup's USED bit is unset, the page
	 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
	 * set, the commit after this will fail, anyway.
	 * This all charge/uncharge is done under some mutual exclusion.
	 * So, we don't need to take care of changes in the USED bit.
	 */
	if (likely(!PageLRU(page)))
		return;

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
	 * is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	/* taking care of that the page is added to LRU while we commit it */
	if (likely(!PageLRU(page)))
		return;
	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to LRU but page_cgroup isn't */
	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}


void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

/*
 * Checks whether given mem is same or in the root_mem's
 * hierarchy subtree
 */
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
		struct mem_cgroup *mem)
{
	if (root_mem != mem) {
		return (root_mem->use_hierarchy &&
			css_is_ancestor(&mem->css, &root_mem->css));
	}

	return true;
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;

	p = find_lock_task_mm(task);
	if (!p)
		return 0;
	curr = try_get_mem_cgroup_from_mm(p->mm);
	task_unlock(p);
	if (!curr)
		return 0;
	/*
	 * We should check use_hierarchy of "mem" not "curr". Checking
	 * use_hierarchy of "curr" here would make this function return true
	 * whenever hierarchy is enabled in "curr" and "curr" is a child of
	 * "mem" in the *cgroup* hierarchy (even if use_hierarchy is disabled
	 * in "mem").
	 */
	ret = mem_cgroup_same_or_subtree(mem, curr);
	css_put(&curr->css);
	return ret;
}

static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
	active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}

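/*
 * Worked example for calc_inactive_ratio() above (sizes assumed): with 4GB
 * of anon pages on the LRUs, gb = 4 and inactive_ratio = int_sqrt(40) = 6,
 * so the inactive anon list is considered low only while
 * inactive * 6 < active. Below 1GB, gb is 0 and the ratio falls back to 1,
 * i.e. a plain inactive < active comparison.
 */
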
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;

	inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
	active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));

	return (active > inactive);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	if (!PageCgroupUsed(pc))
		return NULL;
	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
	smp_rmb();
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	return &mz->reclaim_stat;
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = zone_to_nid(z);
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * file + active;
	int ret;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		if (unlikely(!PageCgroupUsed(pc)))
			continue;

		page = lookup_cgroup_page(pc);

		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		ret = __isolate_lru_page(page, mode, file);
		switch (ret) {
		case 0:
			list_move(&page->lru, dst);
			mem_cgroup_del_lru(page);
			nr_taken += hpage_nr_pages(page);
			break;
		case -EBUSY:
			/* we don't affect global LRU but rotate in our LRU */
			mem_cgroup_rotate_lru_list(page, page_lru(page));
			break;
		default:
			break;
		}
	}

	*scanned = scan;

	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
				      0, 0, 0, mode);

	return nr_taken;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @mem: the memory cgroup
 *
 * Returns the maximum amount of memory @mem can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
{
	unsigned long long margin;

	margin = res_counter_margin(&mem->res);
	if (do_swap_account)
		margin = min(margin, res_counter_margin(&mem->memsw));
	return margin >> PAGE_SHIFT;
}

int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	return memcg->swappiness;
}

static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
	int cpu;

	get_online_cpus();
	spin_lock(&mem->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
	spin_unlock(&mem->pcp_counter_lock);
	put_online_cpus();

	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *mem)
{
	int cpu;

	if (!mem)
		return;
	get_online_cpus();
	spin_lock(&mem->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
	spin_unlock(&mem->pcp_counter_lock);
	put_online_cpus();
}
/*
 * 2 routines for checking whether "mem" is under move_account() or not.
 *
 * mem_cgroup_stealed() - checking whether a cgroup is mc.from or not. This
 *			  is used for avoiding a race in accounting. If true,
 *			  pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
 *			  under the hierarchy of moving cgroups. This is for
 *			  waiting at high memory pressure caused by "move".
 */

static bool mem_cgroup_stealed(struct mem_cgroup *mem)
{
	VM_BUG_ON(!rcu_read_lock_held());
	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}

static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take the spinlock
	 * instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_same_or_subtree(mem, from)
		|| mem_cgroup_same_or_subtree(mem, to);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(mem)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

/**
 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	/*
	 * Need a buffer in BSS, can't rely on allocations. The code relies
	 * on the assumption that OOM is serialized for the memory controller.
	 * If this assumption is broken, revisit this code.
	 */
	static char memcg_name[PATH_MAX];
	int ret;

	if (!memcg || !p)
		return;


	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		/*
		 * Unfortunately, we are unable to convert to a useful name,
		 * but we'll still print out the usage information
		 */
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	printk(KERN_INFO "Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	/*
	 * Continues from above, so we don't need a KERN_ level
	 */
	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:

	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
		"failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}

/*
 * This function returns the number of memcgs under the hierarchy tree.
 * Returns 1 (self count) if there are no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		num++;
	return num;
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;
	u64 memsw;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
	limit += total_swap_pages << PAGE_SHIFT;

	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
	/*
	 * If memsw is finite and limits the amount of swap space available
	 * to this memcg, return that limit.
	 */
	return min(limit, memsw);
}

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @mem and use
 * that to reclaim free pages from.
 */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
	struct mem_cgroup *ret = NULL;
	struct cgroup_subsys_state *css;
	int nextid, found;

	if (!root_mem->use_hierarchy) {
		css_get(&root_mem->css);
		ret = root_mem;
	}

	while (!ret) {
		rcu_read_lock();
		nextid = root_mem->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
				   &found);
		if (css && css_tryget(css))
			ret = container_of(css, struct mem_cgroup, css);

		rcu_read_unlock();
		/* Updates scanning parameter */
		if (!css) {
			/* this means start scan from ID:1 */
			root_mem->last_scanned_child = 0;
		} else
			root_mem->last_scanned_child = found;
	}

	return ret;
}

/**
 * test_mem_cgroup_node_reclaimable
 * @mem: the target memcg
 * @nid: the node ID to be checked.
 * @noswap : specify true here if the user wants file-only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
		int nid, bool noswap)
{
	if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
		return true;
	if (noswap || !total_swap_pages)
		return false;
	if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
		return true;
	return false;

}
#if MAX_NUMNODES > 1

/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist. So update the list loosely once per 10 secs.
 *
 */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&mem->numainfo_events))
		return;
	if (atomic_inc_return(&mem->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	mem->scan_nodes = node_states[N_HIGH_MEMORY];

	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {

		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
			node_clear(nid, mem->scan_nodes);
	}

	atomic_set(&mem->numainfo_events, 0);
	atomic_set(&mem->numainfo_updating, 0);
}

/*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is O.K. Considering
 * memory reclaim from the current node, there are pros and cons.
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used. So, it may make the LRU bad. And if several threads
 * hit limits, they will see contention on one node. But freeing from a remote
 * node means more costs for memory reclaim because of memory latency.
 *
 * Now, we use round-robin. A better algorithm is welcomed.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
	int node;

	mem_cgroup_may_update_nodemask(mem);
	node = mem->last_scanned_node;

	node = next_node(node, mem->scan_nodes);
	if (node == MAX_NUMNODES)
		node = first_node(mem->scan_nodes);
	/*
	 * We call this when we hit the limit, not when pages are added to the
	 * LRU. No LRU may hold pages because all pages are UNEVICTABLE or
	 * the memcg is too small and all pages are not on the LRU. In that
	 * case, we use the current node.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	mem->last_scanned_node = node;
	return node;
}

/*
 * Check all nodes whether they contain reclaimable pages or not.
 * For a quick scan, we make use of scan_nodes. This will allow us to skip
 * unused nodes. But scan_nodes is lazily updated and may not contain
 * enough new information. We need to do a double check.
 */
bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
{
	int nid;

	/*
	 * quick check...making use of scan_nodes.
	 * We can skip unused nodes.
	 */
	if (!nodes_empty(mem->scan_nodes)) {
		for (nid = first_node(mem->scan_nodes);
		     nid < MAX_NUMNODES;
		     nid = next_node(nid, mem->scan_nodes)) {

			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
				return true;
		}
	}
	/*
	 * Check rest of nodes.
	 */
	for_each_node_state(nid, N_HIGH_MEMORY) {
		if (node_isset(nid, mem->scan_nodes))
			continue;
		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
			return true;
	}
	return false;
}

#else
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
	return 0;
}

bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
{
	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
}
#endif

/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, this returns immediately, to avoid freeing too much.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options,
						unsigned long *total_scanned)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess;
	unsigned long nr_scanned;

	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;

	/* If memsw_is_minimum==1, swap-out is of no use. */
	if (!check_soft && !shrink && root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			/*
			 * We are not draining per cpu cached charges during
			 * soft limit reclaim because global reclaim doesn't
			 * care about charges. It tries to free some memory and
			 * charges will not give any.
			 */
			if (!check_soft && loop >= 1)
				drain_all_stock_async(root_mem);
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not excessive, so we don't
				 * reclaim too much, nor so little that we
				 * keep coming back to reclaim from this
				 * cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_reclaimable(victim, noswap)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft) {
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, zone, &nr_scanned);
			*total_scanned += nr_scanned;
		} else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap);
		css_put(&victim->css);
		/*
		 * When shrinking usage, we can't check whether we should stop
		 * here or reclaim more. That depends on the caller.
		 * last_scanned_child will work well enough for keeping
		 * fairness under the tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (!res_counter_soft_limit_excess(&root_mem->res))
				return total;
		} else if (mem_cgroup_margin(root_mem))
			return total;
	}
	return total;
}

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 * Has to be called with memcg_oom_lock
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter, *failed = NULL;
	bool cond = true;

	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			cond = false;
		} else
			iter->oom_lock = true;
	}

	if (!failed)
		return true;

	/*
	 * OK, we failed to lock the whole subtree so we have to clean up
	 * what we set up to the failing subtree
	 */
	cond = true;
	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
		if (iter == failed) {
			cond = false;
			continue;
		}
		iter->oom_lock = false;
	}
	return false;
}

/*
 * Has to be called with memcg_oom_lock
 */
static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		iter->oom_lock = false;
	return 0;
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		atomic_inc(&iter->under_oom);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	for_each_mem_cgroup_tree(iter, mem)
		atomic_add_unless(&iter->under_oom, -1, 0);
}

static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *mem;
	wait_queue_t	wait;
};

static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
			  *oom_wait_mem;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_mem = oom_wait_info->mem;

	/*
	 * Both of oom_wait_info->mem and wake_mem are stable under us.
	 * Then we can use css_is_ancestor without taking care of RCU.
	 */
	if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
			&& !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
	/* for filtering, pass "mem" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
}

static void memcg_oom_recover(struct mem_cgroup *mem)
{
	if (mem && atomic_read(&mem->under_oom))
		memcg_wakeup_oom(mem);
}

/*
 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
 */
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
	struct oom_wait_info owait;
	bool locked, need_to_kill;

	owait.mem = mem;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);
	need_to_kill = true;
	mem_cgroup_mark_under_oom(mem);

	/* At first, try to OOM lock hierarchy under mem.*/
	spin_lock(&memcg_oom_lock);
	locked = mem_cgroup_oom_lock(mem);
	/*
	 * Even if signal_pending(), we can't quit charge() loop without
	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
	 * under OOM is always welcomed, use TASK_KILLABLE here.
	 */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	if (!locked || mem->oom_kill_disable)
		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(mem);
	spin_unlock(&memcg_oom_lock);

	if (need_to_kill) {
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(mem, mask);
	} else {
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	spin_lock(&memcg_oom_lock);
	if (locked)
		mem_cgroup_oom_unlock(mem);
	memcg_wakeup_oom(mem);
	spin_unlock(&memcg_oom_lock);

	mem_cgroup_unmark_under_oom(mem);

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
		return false;
	/* Give chance to dying process */
	schedule_timeout(1);
	return true;
}

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 *
 * Notes: Race condition
 *
 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
 * it tends to be costly. Considering some conditions, we don't need
 * to do so _always_.
 *
 * Considering "charge", lock_page_cgroup() is not required because all
 * file-stat operations happen after a page is attached to the radix-tree.
There 1913 * are no race with "charge". 1914 * 1915 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 1916 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 1917 * if there are race with "uncharge". Statistics itself is properly handled 1918 * by flags. 1919 * 1920 * Considering "move", this is an only case we see a race. To make the race 1921 * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are 1922 * possibility of race condition. If there is, we take a lock. 1923 */ 1924 1925 void mem_cgroup_update_page_stat(struct page *page, 1926 enum mem_cgroup_page_stat_item idx, int val) 1927 { 1928 struct mem_cgroup *mem; 1929 struct page_cgroup *pc = lookup_page_cgroup(page); 1930 bool need_unlock = false; 1931 unsigned long uninitialized_var(flags); 1932 1933 if (unlikely(!pc)) 1934 return; 1935 1936 rcu_read_lock(); 1937 mem = pc->mem_cgroup; 1938 if (unlikely(!mem || !PageCgroupUsed(pc))) 1939 goto out; 1940 /* pc->mem_cgroup is unstable ? */ 1941 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 1942 /* take a lock against to access pc->mem_cgroup */ 1943 move_lock_page_cgroup(pc, &flags); 1944 need_unlock = true; 1945 mem = pc->mem_cgroup; 1946 if (!mem || !PageCgroupUsed(pc)) 1947 goto out; 1948 } 1949 1950 switch (idx) { 1951 case MEMCG_NR_FILE_MAPPED: 1952 if (val > 0) 1953 SetPageCgroupFileMapped(pc); 1954 else if (!page_mapped(page)) 1955 ClearPageCgroupFileMapped(pc); 1956 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1957 break; 1958 default: 1959 BUG(); 1960 } 1961 1962 this_cpu_add(mem->stat->count[idx], val); 1963 1964 out: 1965 if (unlikely(need_unlock)) 1966 move_unlock_page_cgroup(pc, &flags); 1967 rcu_read_unlock(); 1968 return; 1969 } 1970 EXPORT_SYMBOL(mem_cgroup_update_page_stat); 1971 1972 /* 1973 * size of first charge trial. "32" comes from vmscan.c's magic value. 1974 * TODO: maybe necessary to use big numbers in big irons. 1975 */ 1976 #define CHARGE_BATCH 32U 1977 struct memcg_stock_pcp { 1978 struct mem_cgroup *cached; /* this never be root cgroup */ 1979 unsigned int nr_pages; 1980 struct work_struct work; 1981 unsigned long flags; 1982 #define FLUSHING_CACHED_CHARGE (0) 1983 }; 1984 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1985 static DEFINE_MUTEX(percpu_charge_mutex); 1986 1987 /* 1988 * Try to consume stocked charge on this cpu. If success, one page is consumed 1989 * from local stock and true is returned. If the stock is 0 or charges from a 1990 * cgroup which is not current target, returns false. This stock will be 1991 * refilled. 1992 */ 1993 static bool consume_stock(struct mem_cgroup *mem) 1994 { 1995 struct memcg_stock_pcp *stock; 1996 bool ret = true; 1997 1998 stock = &get_cpu_var(memcg_stock); 1999 if (mem == stock->cached && stock->nr_pages) 2000 stock->nr_pages--; 2001 else /* need to call res_counter_charge */ 2002 ret = false; 2003 put_cpu_var(memcg_stock); 2004 return ret; 2005 } 2006 2007 /* 2008 * Returns stocks cached in percpu to res_counter and reset cached information. 
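 *
 * A rough sketch of the per-cpu stock lifecycle (illustrative only; the
 * authoritative flow is consume_stock(), refill_stock() and drain_stock()
 * themselves):
 *
 *	charge:	if (consume_stock(mem)) use one cached page;
 *		else res_counter_charge() a CHARGE_BATCH worth of pages and
 *		refill_stock() with the pages not used by this charge
 *	drain:	drain_stock() returns whatever is still cached to the
 *		res_counter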
2009 */ 2010 static void drain_stock(struct memcg_stock_pcp *stock) 2011 { 2012 struct mem_cgroup *old = stock->cached; 2013 2014 if (stock->nr_pages) { 2015 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2016 2017 res_counter_uncharge(&old->res, bytes); 2018 if (do_swap_account) 2019 res_counter_uncharge(&old->memsw, bytes); 2020 stock->nr_pages = 0; 2021 } 2022 stock->cached = NULL; 2023 } 2024 2025 /* 2026 * This must be called under preempt disabled or must be called by 2027 * a thread which is pinned to local cpu. 2028 */ 2029 static void drain_local_stock(struct work_struct *dummy) 2030 { 2031 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2032 drain_stock(stock); 2033 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2034 } 2035 2036 /* 2037 * Cache charges(val) which is from res_counter, to local per_cpu area. 2038 * This will be consumed by consume_stock() function, later. 2039 */ 2040 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 2041 { 2042 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2043 2044 if (stock->cached != mem) { /* reset if necessary */ 2045 drain_stock(stock); 2046 stock->cached = mem; 2047 } 2048 stock->nr_pages += nr_pages; 2049 put_cpu_var(memcg_stock); 2050 } 2051 2052 /* 2053 * Drains all per-CPU charge caches for given root_mem resp. subtree 2054 * of the hierarchy under it. sync flag says whether we should block 2055 * until the work is done. 2056 */ 2057 static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) 2058 { 2059 int cpu, curcpu; 2060 2061 /* Notify other cpus that system-wide "drain" is running */ 2062 get_online_cpus(); 2063 curcpu = get_cpu(); 2064 for_each_online_cpu(cpu) { 2065 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2066 struct mem_cgroup *mem; 2067 2068 mem = stock->cached; 2069 if (!mem || !stock->nr_pages) 2070 continue; 2071 if (!mem_cgroup_same_or_subtree(root_mem, mem)) 2072 continue; 2073 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2074 if (cpu == curcpu) 2075 drain_local_stock(&stock->work); 2076 else 2077 schedule_work_on(cpu, &stock->work); 2078 } 2079 } 2080 put_cpu(); 2081 2082 if (!sync) 2083 goto out; 2084 2085 for_each_online_cpu(cpu) { 2086 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2087 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2088 flush_work(&stock->work); 2089 } 2090 out: 2091 put_online_cpus(); 2092 } 2093 2094 /* 2095 * Tries to drain stocked charges in other cpus. This function is asynchronous 2096 * and just put a work per cpu for draining localy on each cpu. Caller can 2097 * expects some charges will be back to res_counter later but cannot wait for 2098 * it. 2099 */ 2100 static void drain_all_stock_async(struct mem_cgroup *root_mem) 2101 { 2102 /* 2103 * If someone calls draining, avoid adding more kworker runs. 2104 */ 2105 if (!mutex_trylock(&percpu_charge_mutex)) 2106 return; 2107 drain_all_stock(root_mem, false); 2108 mutex_unlock(&percpu_charge_mutex); 2109 } 2110 2111 /* This is a synchronous drain interface. */ 2112 static void drain_all_stock_sync(struct mem_cgroup *root_mem) 2113 { 2114 /* called when force_empty is called */ 2115 mutex_lock(&percpu_charge_mutex); 2116 drain_all_stock(root_mem, true); 2117 mutex_unlock(&percpu_charge_mutex); 2118 } 2119 2120 /* 2121 * This function drains percpu counter value from DEAD cpu and 2122 * move it to local cpu. Note that this function can be preempted. 
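 * The drained values are accumulated into mem->nocpu_base; the statistics
 * readers are assumed to fold nocpu_base back into their totals (a hedged
 * note -- see the nocpu_base users elsewhere in this file).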
2123 */ 2124 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 2125 { 2126 int i; 2127 2128 spin_lock(&mem->pcp_counter_lock); 2129 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2130 long x = per_cpu(mem->stat->count[i], cpu); 2131 2132 per_cpu(mem->stat->count[i], cpu) = 0; 2133 mem->nocpu_base.count[i] += x; 2134 } 2135 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2136 unsigned long x = per_cpu(mem->stat->events[i], cpu); 2137 2138 per_cpu(mem->stat->events[i], cpu) = 0; 2139 mem->nocpu_base.events[i] += x; 2140 } 2141 /* need to clear the ON_MOVE value; it works as a kind of lock. */ 2142 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 2143 spin_unlock(&mem->pcp_counter_lock); 2144 } 2145 2146 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 2147 { 2148 int idx = MEM_CGROUP_ON_MOVE; 2149 2150 spin_lock(&mem->pcp_counter_lock); 2151 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 2152 spin_unlock(&mem->pcp_counter_lock); 2153 } 2154 2155 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2156 unsigned long action, 2157 void *hcpu) 2158 { 2159 int cpu = (unsigned long)hcpu; 2160 struct memcg_stock_pcp *stock; 2161 struct mem_cgroup *iter; 2162 2163 if (action == CPU_ONLINE) { 2164 for_each_mem_cgroup_all(iter) 2165 synchronize_mem_cgroup_on_move(iter, cpu); 2166 return NOTIFY_OK; 2167 } 2168 2169 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2170 return NOTIFY_OK; 2171 2172 for_each_mem_cgroup_all(iter) 2173 mem_cgroup_drain_pcp_counter(iter, cpu); 2174 2175 stock = &per_cpu(memcg_stock, cpu); 2176 drain_stock(stock); 2177 return NOTIFY_OK; 2178 } 2179 2180 2181 /* See __mem_cgroup_try_charge() for details */ 2182 enum { 2183 CHARGE_OK, /* success */ 2184 CHARGE_RETRY, /* need to retry, but retrying is fine */ 2185 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2186 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough resources */ 2187 CHARGE_OOM_DIE, /* the current task is killed because of OOM */ 2188 }; 2189 2190 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2191 unsigned int nr_pages, bool oom_check) 2192 { 2193 unsigned long csize = nr_pages * PAGE_SIZE; 2194 struct mem_cgroup *mem_over_limit; 2195 struct res_counter *fail_res; 2196 unsigned long flags = 0; 2197 int ret; 2198 2199 ret = res_counter_charge(&mem->res, csize, &fail_res); 2200 2201 if (likely(!ret)) { 2202 if (!do_swap_account) 2203 return CHARGE_OK; 2204 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 2205 if (likely(!ret)) 2206 return CHARGE_OK; 2207 2208 res_counter_uncharge(&mem->res, csize); 2209 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2210 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2211 } else 2212 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2213 /* 2214 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 2215 * of regular pages (CHARGE_BATCH), or a single regular page (1). 2216 * 2217 * Never reclaim on behalf of optional batching, retry with a 2218 * single page instead. 2219 */ 2220 if (nr_pages == CHARGE_BATCH) 2221 return CHARGE_RETRY; 2222 2223 if (!(gfp_mask & __GFP_WAIT)) 2224 return CHARGE_WOULDBLOCK; 2225 2226 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2227 gfp_mask, flags, NULL); 2228 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2229 return CHARGE_RETRY; 2230 /* 2231 * Even though the limit is exceeded at this point, reclaim 2232 * may have been able to free some pages.
Retry the charge 2233 * before killing the task. 2234 * 2235 * Only for regular pages, though: huge pages are rather 2236 * unlikely to succeed so close to the limit, and we fall back 2237 * to regular pages anyway in case of failure. 2238 */ 2239 if (nr_pages == 1 && ret) 2240 return CHARGE_RETRY; 2241 2242 /* 2243 * At task move, charge accounts can be doubly counted. So, it's 2244 * better to wait until the end of task_move if something is going on. 2245 */ 2246 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2247 return CHARGE_RETRY; 2248 2249 /* If we don't need to call oom-killer at el, return immediately */ 2250 if (!oom_check) 2251 return CHARGE_NOMEM; 2252 /* check OOM */ 2253 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2254 return CHARGE_OOM_DIE; 2255 2256 return CHARGE_RETRY; 2257 } 2258 2259 /* 2260 * Unlike exported interface, "oom" parameter is added. if oom==true, 2261 * oom-killer can be invoked. 2262 */ 2263 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2264 gfp_t gfp_mask, 2265 unsigned int nr_pages, 2266 struct mem_cgroup **memcg, 2267 bool oom) 2268 { 2269 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2270 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2271 struct mem_cgroup *mem = NULL; 2272 int ret; 2273 2274 /* 2275 * Unlike gloval-vm's OOM-kill, we're not in memory shortage 2276 * in system level. So, allow to go ahead dying process in addition to 2277 * MEMDIE process. 2278 */ 2279 if (unlikely(test_thread_flag(TIF_MEMDIE) 2280 || fatal_signal_pending(current))) 2281 goto bypass; 2282 2283 /* 2284 * We always charge the cgroup the mm_struct belongs to. 2285 * The mm_struct's mem_cgroup changes on task migration if the 2286 * thread group leader migrates. It's possible that mm is not 2287 * set, if so charge the init_mm (happens for pagecache usage). 2288 */ 2289 if (!*memcg && !mm) 2290 goto bypass; 2291 again: 2292 if (*memcg) { /* css should be a valid one */ 2293 mem = *memcg; 2294 VM_BUG_ON(css_is_removed(&mem->css)); 2295 if (mem_cgroup_is_root(mem)) 2296 goto done; 2297 if (nr_pages == 1 && consume_stock(mem)) 2298 goto done; 2299 css_get(&mem->css); 2300 } else { 2301 struct task_struct *p; 2302 2303 rcu_read_lock(); 2304 p = rcu_dereference(mm->owner); 2305 /* 2306 * Because we don't have task_lock(), "p" can exit. 2307 * In that case, "mem" can point to root or p can be NULL with 2308 * race with swapoff. Then, we have small risk of mis-accouning. 2309 * But such kind of mis-account by race always happens because 2310 * we don't have cgroup_mutex(). It's overkill and we allo that 2311 * small race, here. 2312 * (*) swapoff at el will charge against mm-struct not against 2313 * task-struct. So, mm->owner can be NULL. 2314 */ 2315 mem = mem_cgroup_from_task(p); 2316 if (!mem || mem_cgroup_is_root(mem)) { 2317 rcu_read_unlock(); 2318 goto done; 2319 } 2320 if (nr_pages == 1 && consume_stock(mem)) { 2321 /* 2322 * It seems dagerous to access memcg without css_get(). 2323 * But considering how consume_stok works, it's not 2324 * necessary. If consume_stock success, some charges 2325 * from this memcg are cached on this cpu. So, we 2326 * don't need to call css_get()/css_tryget() before 2327 * calling consume_stock(). 2328 */ 2329 rcu_read_unlock(); 2330 goto done; 2331 } 2332 /* after here, we may be blocked. 
we need to take a css refcount. */ 2333 if (!css_tryget(&mem->css)) { 2334 rcu_read_unlock(); 2335 goto again; 2336 } 2337 rcu_read_unlock(); 2338 } 2339 2340 do { 2341 bool oom_check; 2342 2343 /* If killed, bypass charge */ 2344 if (fatal_signal_pending(current)) { 2345 css_put(&mem->css); 2346 goto bypass; 2347 } 2348 2349 oom_check = false; 2350 if (oom && !nr_oom_retries) { 2351 oom_check = true; 2352 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2353 } 2354 2355 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2356 switch (ret) { 2357 case CHARGE_OK: 2358 break; 2359 case CHARGE_RETRY: /* not in OOM situation but retry */ 2360 batch = nr_pages; 2361 css_put(&mem->css); 2362 mem = NULL; 2363 goto again; 2364 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2365 css_put(&mem->css); 2366 goto nomem; 2367 case CHARGE_NOMEM: /* OOM routine works */ 2368 if (!oom) { 2369 css_put(&mem->css); 2370 goto nomem; 2371 } 2372 /* If oom, we never return -ENOMEM */ 2373 nr_oom_retries--; 2374 break; 2375 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2376 css_put(&mem->css); 2377 goto bypass; 2378 } 2379 } while (ret != CHARGE_OK); 2380 2381 if (batch > nr_pages) 2382 refill_stock(mem, batch - nr_pages); 2383 css_put(&mem->css); 2384 done: 2385 *memcg = mem; 2386 return 0; 2387 nomem: 2388 *memcg = NULL; 2389 return -ENOMEM; 2390 bypass: 2391 *memcg = NULL; 2392 return 0; 2393 } 2394 2395 /* 2396 * Sometimes we have to undo a charge we got by try_charge(). 2397 * This function does that: it uncharges the res_counter charges taken 2398 * by try_charge(). 2399 */ 2400 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2401 unsigned int nr_pages) 2402 { 2403 if (!mem_cgroup_is_root(mem)) { 2404 unsigned long bytes = nr_pages * PAGE_SIZE; 2405 2406 res_counter_uncharge(&mem->res, bytes); 2407 if (do_swap_account) 2408 res_counter_uncharge(&mem->memsw, bytes); 2409 } 2410 } 2411 2412 /* 2413 * A helper function to get a mem_cgroup from an ID. Must be called under 2414 * rcu_read_lock(). The caller must check css_is_removed() or similar if 2415 * that is a concern (dropping a refcount from swap can happen against an 2416 * already removed memcg.)
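 *
 * Typical lookup pattern, as a sketch (it mirrors try_get_mem_cgroup_from_page()
 * below rather than introducing a new API):
 *
 *	rcu_read_lock();
 *	mem = mem_cgroup_lookup(id);
 *	if (mem && !css_tryget(&mem->css))
 *		mem = NULL;
 *	rcu_read_unlock();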
2417 */ 2418 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2419 { 2420 struct cgroup_subsys_state *css; 2421 2422 /* ID 0 is unused ID */ 2423 if (!id) 2424 return NULL; 2425 css = css_lookup(&mem_cgroup_subsys, id); 2426 if (!css) 2427 return NULL; 2428 return container_of(css, struct mem_cgroup, css); 2429 } 2430 2431 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2432 { 2433 struct mem_cgroup *mem = NULL; 2434 struct page_cgroup *pc; 2435 unsigned short id; 2436 swp_entry_t ent; 2437 2438 VM_BUG_ON(!PageLocked(page)); 2439 2440 pc = lookup_page_cgroup(page); 2441 lock_page_cgroup(pc); 2442 if (PageCgroupUsed(pc)) { 2443 mem = pc->mem_cgroup; 2444 if (mem && !css_tryget(&mem->css)) 2445 mem = NULL; 2446 } else if (PageSwapCache(page)) { 2447 ent.val = page_private(page); 2448 id = lookup_swap_cgroup(ent); 2449 rcu_read_lock(); 2450 mem = mem_cgroup_lookup(id); 2451 if (mem && !css_tryget(&mem->css)) 2452 mem = NULL; 2453 rcu_read_unlock(); 2454 } 2455 unlock_page_cgroup(pc); 2456 return mem; 2457 } 2458 2459 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2460 struct page *page, 2461 unsigned int nr_pages, 2462 struct page_cgroup *pc, 2463 enum charge_type ctype) 2464 { 2465 lock_page_cgroup(pc); 2466 if (unlikely(PageCgroupUsed(pc))) { 2467 unlock_page_cgroup(pc); 2468 __mem_cgroup_cancel_charge(mem, nr_pages); 2469 return; 2470 } 2471 /* 2472 * we don't need page_cgroup_lock about tail pages, becase they are not 2473 * accessed by any other context at this point. 2474 */ 2475 pc->mem_cgroup = mem; 2476 /* 2477 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2478 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2479 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2480 * before USED bit, we need memory barrier here. 2481 * See mem_cgroup_add_lru_list(), etc. 2482 */ 2483 smp_wmb(); 2484 switch (ctype) { 2485 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2486 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2487 SetPageCgroupCache(pc); 2488 SetPageCgroupUsed(pc); 2489 break; 2490 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2491 ClearPageCgroupCache(pc); 2492 SetPageCgroupUsed(pc); 2493 break; 2494 default: 2495 break; 2496 } 2497 2498 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2499 unlock_page_cgroup(pc); 2500 /* 2501 * "charge_statistics" updated event counter. Then, check it. 2502 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2503 * if they exceeds softlimit. 2504 */ 2505 memcg_check_events(mem, page); 2506 } 2507 2508 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2509 2510 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2511 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2512 /* 2513 * Because tail pages are not marked as "used", set it. We're under 2514 * zone->lru_lock, 'splitting on pmd' and compund_lock. 2515 */ 2516 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2517 { 2518 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2519 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2520 unsigned long flags; 2521 2522 if (mem_cgroup_disabled()) 2523 return; 2524 /* 2525 * We have no races with charge/uncharge but will have races with 2526 * page state accounting. 
2527 */ 2528 move_lock_page_cgroup(head_pc, &flags); 2529 2530 tail_pc->mem_cgroup = head_pc->mem_cgroup; 2531 smp_wmb(); /* see __commit_charge() */ 2532 if (PageCgroupAcctLRU(head_pc)) { 2533 enum lru_list lru; 2534 struct mem_cgroup_per_zone *mz; 2535 2536 /* 2537 * LRU flags cannot be copied because we need to add tail 2538 *.page to LRU by generic call and our hook will be called. 2539 * We hold lru_lock, then, reduce counter directly. 2540 */ 2541 lru = page_lru(head); 2542 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); 2543 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 2544 } 2545 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2546 move_unlock_page_cgroup(head_pc, &flags); 2547 } 2548 #endif 2549 2550 /** 2551 * mem_cgroup_move_account - move account of the page 2552 * @page: the page 2553 * @nr_pages: number of regular pages (>1 for huge pages) 2554 * @pc: page_cgroup of the page. 2555 * @from: mem_cgroup which the page is moved from. 2556 * @to: mem_cgroup which the page is moved to. @from != @to. 2557 * @uncharge: whether we should call uncharge and css_put against @from. 2558 * 2559 * The caller must confirm following. 2560 * - page is not on LRU (isolate_page() is useful.) 2561 * - compound_lock is held when nr_pages > 1 2562 * 2563 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2564 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is 2565 * true, this function does "uncharge" from old cgroup, but it doesn't if 2566 * @uncharge is false, so a caller should do "uncharge". 2567 */ 2568 static int mem_cgroup_move_account(struct page *page, 2569 unsigned int nr_pages, 2570 struct page_cgroup *pc, 2571 struct mem_cgroup *from, 2572 struct mem_cgroup *to, 2573 bool uncharge) 2574 { 2575 unsigned long flags; 2576 int ret; 2577 2578 VM_BUG_ON(from == to); 2579 VM_BUG_ON(PageLRU(page)); 2580 /* 2581 * The page is isolated from LRU. So, collapse function 2582 * will not handle this page. But page splitting can happen. 2583 * Do this check under compound_page_lock(). The caller should 2584 * hold it. 2585 */ 2586 ret = -EBUSY; 2587 if (nr_pages > 1 && !PageTransHuge(page)) 2588 goto out; 2589 2590 lock_page_cgroup(pc); 2591 2592 ret = -EINVAL; 2593 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2594 goto unlock; 2595 2596 move_lock_page_cgroup(pc, &flags); 2597 2598 if (PageCgroupFileMapped(pc)) { 2599 /* Update mapped_file data for mem_cgroup */ 2600 preempt_disable(); 2601 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2602 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2603 preempt_enable(); 2604 } 2605 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2606 if (uncharge) 2607 /* This is not "cancel", but cancel_charge does all we need. */ 2608 __mem_cgroup_cancel_charge(from, nr_pages); 2609 2610 /* caller should have done css_get */ 2611 pc->mem_cgroup = to; 2612 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2613 /* 2614 * We charges against "to" which may not have any tasks. Then, "to" 2615 * can be under rmdir(). But in current implementation, caller of 2616 * this function is just force_empty() and move charge, so it's 2617 * guaranteed that "to" is never removed. So, we don't check rmdir 2618 * status here. 
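 *
 * For a caller-side sketch of the expected sequence, see
 * mem_cgroup_move_parent() below: pin the page with get_page_unless_zero(),
 * isolate_lru_page(), charge the target via __mem_cgroup_try_charge(), take
 * the compound lock for huge pages, then call mem_cgroup_move_account().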
2619 */ 2620 move_unlock_page_cgroup(pc, &flags); 2621 ret = 0; 2622 unlock: 2623 unlock_page_cgroup(pc); 2624 /* 2625 * check events 2626 */ 2627 memcg_check_events(to, page); 2628 memcg_check_events(from, page); 2629 out: 2630 return ret; 2631 } 2632 2633 /* 2634 * move charges to its parent. 2635 */ 2636 2637 static int mem_cgroup_move_parent(struct page *page, 2638 struct page_cgroup *pc, 2639 struct mem_cgroup *child, 2640 gfp_t gfp_mask) 2641 { 2642 struct cgroup *cg = child->css.cgroup; 2643 struct cgroup *pcg = cg->parent; 2644 struct mem_cgroup *parent; 2645 unsigned int nr_pages; 2646 unsigned long uninitialized_var(flags); 2647 int ret; 2648 2649 /* Is ROOT ? */ 2650 if (!pcg) 2651 return -EINVAL; 2652 2653 ret = -EBUSY; 2654 if (!get_page_unless_zero(page)) 2655 goto out; 2656 if (isolate_lru_page(page)) 2657 goto put; 2658 2659 nr_pages = hpage_nr_pages(page); 2660 2661 parent = mem_cgroup_from_cont(pcg); 2662 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2663 if (ret || !parent) 2664 goto put_back; 2665 2666 if (nr_pages > 1) 2667 flags = compound_lock_irqsave(page); 2668 2669 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2670 if (ret) 2671 __mem_cgroup_cancel_charge(parent, nr_pages); 2672 2673 if (nr_pages > 1) 2674 compound_unlock_irqrestore(page, flags); 2675 put_back: 2676 putback_lru_page(page); 2677 put: 2678 put_page(page); 2679 out: 2680 return ret; 2681 } 2682 2683 /* 2684 * Charge the memory controller for page usage. 2685 * Return 2686 * 0 if the charge was successful 2687 * < 0 if the cgroup is over its limit 2688 */ 2689 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2690 gfp_t gfp_mask, enum charge_type ctype) 2691 { 2692 struct mem_cgroup *mem = NULL; 2693 unsigned int nr_pages = 1; 2694 struct page_cgroup *pc; 2695 bool oom = true; 2696 int ret; 2697 2698 if (PageTransHuge(page)) { 2699 nr_pages <<= compound_order(page); 2700 VM_BUG_ON(!PageTransHuge(page)); 2701 /* 2702 * Never OOM-kill a process for a huge page. The 2703 * fault handler will fall back to regular pages. 2704 */ 2705 oom = false; 2706 } 2707 2708 pc = lookup_page_cgroup(page); 2709 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ 2710 2711 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); 2712 if (ret || !mem) 2713 return ret; 2714 2715 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); 2716 return 0; 2717 } 2718 2719 int mem_cgroup_newpage_charge(struct page *page, 2720 struct mm_struct *mm, gfp_t gfp_mask) 2721 { 2722 if (mem_cgroup_disabled()) 2723 return 0; 2724 /* 2725 * If already mapped, we don't have to account. 2726 * If page cache, page->mapping has address_space. 2727 * But page->mapping may have out-of-use anon_vma pointer, 2728 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 2729 * is NULL. 
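 * In other words, a page that has page->mapping set and is !PageAnon() is
 * file cache; it is accounted through mem_cgroup_cache_charge() instead.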
2730 */ 2731 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2732 return 0; 2733 if (unlikely(!mm)) 2734 mm = &init_mm; 2735 return mem_cgroup_charge_common(page, mm, gfp_mask, 2736 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2737 } 2738 2739 static void 2740 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2741 enum charge_type ctype); 2742 2743 static void 2744 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, 2745 enum charge_type ctype) 2746 { 2747 struct page_cgroup *pc = lookup_page_cgroup(page); 2748 /* 2749 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page 2750 * is already on LRU. It means the page may on some other page_cgroup's 2751 * LRU. Take care of it. 2752 */ 2753 mem_cgroup_lru_del_before_commit(page); 2754 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 2755 mem_cgroup_lru_add_after_commit(page); 2756 return; 2757 } 2758 2759 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2760 gfp_t gfp_mask) 2761 { 2762 struct mem_cgroup *mem = NULL; 2763 int ret; 2764 2765 if (mem_cgroup_disabled()) 2766 return 0; 2767 if (PageCompound(page)) 2768 return 0; 2769 2770 if (unlikely(!mm)) 2771 mm = &init_mm; 2772 2773 if (page_is_file_cache(page)) { 2774 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); 2775 if (ret || !mem) 2776 return ret; 2777 2778 /* 2779 * FUSE reuses pages without going through the final 2780 * put that would remove them from the LRU list, make 2781 * sure that they get relinked properly. 2782 */ 2783 __mem_cgroup_commit_charge_lrucare(page, mem, 2784 MEM_CGROUP_CHARGE_TYPE_CACHE); 2785 return ret; 2786 } 2787 /* shmem */ 2788 if (PageSwapCache(page)) { 2789 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2790 if (!ret) 2791 __mem_cgroup_commit_charge_swapin(page, mem, 2792 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2793 } else 2794 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2795 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2796 2797 return ret; 2798 } 2799 2800 /* 2801 * While swap-in, try_charge -> commit or cancel, the page is locked. 2802 * And when try_charge() successfully returns, one refcnt to memcg without 2803 * struct page_cgroup is acquired. This refcnt will be consumed by 2804 * "commit()" or removed by "cancel()" 2805 */ 2806 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2807 struct page *page, 2808 gfp_t mask, struct mem_cgroup **ptr) 2809 { 2810 struct mem_cgroup *mem; 2811 int ret; 2812 2813 *ptr = NULL; 2814 2815 if (mem_cgroup_disabled()) 2816 return 0; 2817 2818 if (!do_swap_account) 2819 goto charge_cur_mm; 2820 /* 2821 * A racing thread's fault, or swapoff, may have already updated 2822 * the pte, and even removed page from swap cache: in those cases 2823 * do_swap_page()'s pte_same() test will fail; but there's also a 2824 * KSM case which does need to charge the page. 
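 *
 * Caller protocol, as a rough sketch (the swap-in fault path is the assumed
 * caller; commit/cancel are the functions defined below):
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, mask, &ptr))
 *		fail;				(charge failed, nothing to undo)
 *	...map the page...
 *	if (mapping succeeded)
 *		mem_cgroup_commit_charge_swapin(page, ptr);
 *	else
 *		mem_cgroup_cancel_charge_swapin(ptr);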
2825 */ 2826 if (!PageSwapCache(page)) 2827 goto charge_cur_mm; 2828 mem = try_get_mem_cgroup_from_page(page); 2829 if (!mem) 2830 goto charge_cur_mm; 2831 *ptr = mem; 2832 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2833 css_put(&mem->css); 2834 return ret; 2835 charge_cur_mm: 2836 if (unlikely(!mm)) 2837 mm = &init_mm; 2838 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2839 } 2840 2841 static void 2842 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2843 enum charge_type ctype) 2844 { 2845 if (mem_cgroup_disabled()) 2846 return; 2847 if (!ptr) 2848 return; 2849 cgroup_exclude_rmdir(&ptr->css); 2850 2851 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2852 /* 2853 * Now swap is on-memory. This means this page may be 2854 * counted both as mem and swap....double count. 2855 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2856 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2857 * may call delete_from_swap_cache() before reach here. 2858 */ 2859 if (do_swap_account && PageSwapCache(page)) { 2860 swp_entry_t ent = {.val = page_private(page)}; 2861 unsigned short id; 2862 struct mem_cgroup *memcg; 2863 2864 id = swap_cgroup_record(ent, 0); 2865 rcu_read_lock(); 2866 memcg = mem_cgroup_lookup(id); 2867 if (memcg) { 2868 /* 2869 * This recorded memcg can be obsolete one. So, avoid 2870 * calling css_tryget 2871 */ 2872 if (!mem_cgroup_is_root(memcg)) 2873 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2874 mem_cgroup_swap_statistics(memcg, false); 2875 mem_cgroup_put(memcg); 2876 } 2877 rcu_read_unlock(); 2878 } 2879 /* 2880 * At swapin, we may charge account against cgroup which has no tasks. 2881 * So, rmdir()->pre_destroy() can be called while we do this charge. 2882 * In that case, we need to call pre_destroy() again. check it here. 2883 */ 2884 cgroup_release_and_wakeup_rmdir(&ptr->css); 2885 } 2886 2887 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2888 { 2889 __mem_cgroup_commit_charge_swapin(page, ptr, 2890 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2891 } 2892 2893 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2894 { 2895 if (mem_cgroup_disabled()) 2896 return; 2897 if (!mem) 2898 return; 2899 __mem_cgroup_cancel_charge(mem, 1); 2900 } 2901 2902 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 2903 unsigned int nr_pages, 2904 const enum charge_type ctype) 2905 { 2906 struct memcg_batch_info *batch = NULL; 2907 bool uncharge_memsw = true; 2908 2909 /* If swapout, usage of swap doesn't decrease */ 2910 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2911 uncharge_memsw = false; 2912 2913 batch = ¤t->memcg_batch; 2914 /* 2915 * In usual, we do css_get() when we remember memcg pointer. 2916 * But in this case, we keep res->usage until end of a series of 2917 * uncharges. Then, it's ok to ignore memcg's refcnt. 2918 */ 2919 if (!batch->memcg) 2920 batch->memcg = mem; 2921 /* 2922 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2923 * In those cases, all pages freed continuously can be expected to be in 2924 * the same cgroup and we have chance to coalesce uncharges. 2925 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2926 * because we want to do uncharge as soon as possible. 2927 */ 2928 2929 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2930 goto direct_uncharge; 2931 2932 if (nr_pages > 1) 2933 goto direct_uncharge; 2934 2935 /* 2936 * In typical case, batch->memcg == mem. 
This means we can 2937 * merge a series of uncharges to an uncharge of res_counter. 2938 * If not, we uncharge res_counter ony by one. 2939 */ 2940 if (batch->memcg != mem) 2941 goto direct_uncharge; 2942 /* remember freed charge and uncharge it later */ 2943 batch->nr_pages++; 2944 if (uncharge_memsw) 2945 batch->memsw_nr_pages++; 2946 return; 2947 direct_uncharge: 2948 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 2949 if (uncharge_memsw) 2950 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 2951 if (unlikely(batch->memcg != mem)) 2952 memcg_oom_recover(mem); 2953 return; 2954 } 2955 2956 /* 2957 * uncharge if !page_mapped(page) 2958 */ 2959 static struct mem_cgroup * 2960 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2961 { 2962 struct mem_cgroup *mem = NULL; 2963 unsigned int nr_pages = 1; 2964 struct page_cgroup *pc; 2965 2966 if (mem_cgroup_disabled()) 2967 return NULL; 2968 2969 if (PageSwapCache(page)) 2970 return NULL; 2971 2972 if (PageTransHuge(page)) { 2973 nr_pages <<= compound_order(page); 2974 VM_BUG_ON(!PageTransHuge(page)); 2975 } 2976 /* 2977 * Check if our page_cgroup is valid 2978 */ 2979 pc = lookup_page_cgroup(page); 2980 if (unlikely(!pc || !PageCgroupUsed(pc))) 2981 return NULL; 2982 2983 lock_page_cgroup(pc); 2984 2985 mem = pc->mem_cgroup; 2986 2987 if (!PageCgroupUsed(pc)) 2988 goto unlock_out; 2989 2990 switch (ctype) { 2991 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2992 case MEM_CGROUP_CHARGE_TYPE_DROP: 2993 /* See mem_cgroup_prepare_migration() */ 2994 if (page_mapped(page) || PageCgroupMigration(pc)) 2995 goto unlock_out; 2996 break; 2997 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2998 if (!PageAnon(page)) { /* Shared memory */ 2999 if (page->mapping && !page_is_file_cache(page)) 3000 goto unlock_out; 3001 } else if (page_mapped(page)) /* Anon */ 3002 goto unlock_out; 3003 break; 3004 default: 3005 break; 3006 } 3007 3008 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 3009 3010 ClearPageCgroupUsed(pc); 3011 /* 3012 * pc->mem_cgroup is not cleared here. It will be accessed when it's 3013 * freed from LRU. This is safe because uncharged page is expected not 3014 * to be reused (freed soon). Exception is SwapCache, it's handled by 3015 * special functions. 3016 */ 3017 3018 unlock_page_cgroup(pc); 3019 /* 3020 * even after unlock, we have mem->res.usage here and this memcg 3021 * will never be freed. 3022 */ 3023 memcg_check_events(mem, page); 3024 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3025 mem_cgroup_swap_statistics(mem, true); 3026 mem_cgroup_get(mem); 3027 } 3028 if (!mem_cgroup_is_root(mem)) 3029 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 3030 3031 return mem; 3032 3033 unlock_out: 3034 unlock_page_cgroup(pc); 3035 return NULL; 3036 } 3037 3038 void mem_cgroup_uncharge_page(struct page *page) 3039 { 3040 /* early check. */ 3041 if (page_mapped(page)) 3042 return; 3043 if (page->mapping && !PageAnon(page)) 3044 return; 3045 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3046 } 3047 3048 void mem_cgroup_uncharge_cache_page(struct page *page) 3049 { 3050 VM_BUG_ON(page_mapped(page)); 3051 VM_BUG_ON(page->mapping); 3052 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3053 } 3054 3055 /* 3056 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 3057 * In that cases, pages are freed continuously and we can expect pages 3058 * are in the same memcg. 
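 * A minimal caller sketch (illustrative; the real callers sit in the unmap
 * and truncate paths named above):
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being released:
 *		mem_cgroup_uncharge_page(page);	(or mem_cgroup_uncharge_cache_page())
 *	mem_cgroup_uncharge_end();
 *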
All these calls itself limits the number of 3059 * pages freed at once, then uncharge_start/end() is called properly. 3060 * This may be called prural(2) times in a context, 3061 */ 3062 3063 void mem_cgroup_uncharge_start(void) 3064 { 3065 current->memcg_batch.do_batch++; 3066 /* We can do nest. */ 3067 if (current->memcg_batch.do_batch == 1) { 3068 current->memcg_batch.memcg = NULL; 3069 current->memcg_batch.nr_pages = 0; 3070 current->memcg_batch.memsw_nr_pages = 0; 3071 } 3072 } 3073 3074 void mem_cgroup_uncharge_end(void) 3075 { 3076 struct memcg_batch_info *batch = ¤t->memcg_batch; 3077 3078 if (!batch->do_batch) 3079 return; 3080 3081 batch->do_batch--; 3082 if (batch->do_batch) /* If stacked, do nothing. */ 3083 return; 3084 3085 if (!batch->memcg) 3086 return; 3087 /* 3088 * This "batch->memcg" is valid without any css_get/put etc... 3089 * bacause we hide charges behind us. 3090 */ 3091 if (batch->nr_pages) 3092 res_counter_uncharge(&batch->memcg->res, 3093 batch->nr_pages * PAGE_SIZE); 3094 if (batch->memsw_nr_pages) 3095 res_counter_uncharge(&batch->memcg->memsw, 3096 batch->memsw_nr_pages * PAGE_SIZE); 3097 memcg_oom_recover(batch->memcg); 3098 /* forget this pointer (for sanity check) */ 3099 batch->memcg = NULL; 3100 } 3101 3102 #ifdef CONFIG_SWAP 3103 /* 3104 * called after __delete_from_swap_cache() and drop "page" account. 3105 * memcg information is recorded to swap_cgroup of "ent" 3106 */ 3107 void 3108 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 3109 { 3110 struct mem_cgroup *memcg; 3111 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 3112 3113 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3114 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3115 3116 memcg = __mem_cgroup_uncharge_common(page, ctype); 3117 3118 /* 3119 * record memcg information, if swapout && memcg != NULL, 3120 * mem_cgroup_get() was called in uncharge(). 3121 */ 3122 if (do_swap_account && swapout && memcg) 3123 swap_cgroup_record(ent, css_id(&memcg->css)); 3124 } 3125 #endif 3126 3127 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3128 /* 3129 * called from swap_entry_free(). remove record in swap_cgroup and 3130 * uncharge "memsw" account. 3131 */ 3132 void mem_cgroup_uncharge_swap(swp_entry_t ent) 3133 { 3134 struct mem_cgroup *memcg; 3135 unsigned short id; 3136 3137 if (!do_swap_account) 3138 return; 3139 3140 id = swap_cgroup_record(ent, 0); 3141 rcu_read_lock(); 3142 memcg = mem_cgroup_lookup(id); 3143 if (memcg) { 3144 /* 3145 * We uncharge this because swap is freed. 3146 * This memcg can be obsolete one. We avoid calling css_tryget 3147 */ 3148 if (!mem_cgroup_is_root(memcg)) 3149 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3150 mem_cgroup_swap_statistics(memcg, false); 3151 mem_cgroup_put(memcg); 3152 } 3153 rcu_read_unlock(); 3154 } 3155 3156 /** 3157 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3158 * @entry: swap entry to be moved 3159 * @from: mem_cgroup which the entry is moved from 3160 * @to: mem_cgroup which the entry is moved to 3161 * @need_fixup: whether we should fixup res_counters and refcounts. 3162 * 3163 * It succeeds only when the swap_cgroup's record for this entry is the same 3164 * as the mem_cgroup's id of @from. 3165 * 3166 * Returns 0 on success, -EINVAL on failure. 3167 * 3168 * The caller must have charged to @to, IOW, called res_counter_charge() about 3169 * both res and memsw, and called css_get(). 
3170 */ 3171 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3172 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3173 { 3174 unsigned short old_id, new_id; 3175 3176 old_id = css_id(&from->css); 3177 new_id = css_id(&to->css); 3178 3179 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3180 mem_cgroup_swap_statistics(from, false); 3181 mem_cgroup_swap_statistics(to, true); 3182 /* 3183 * This function is only called from task migration context now. 3184 * It postpones res_counter and refcount handling till the end 3185 * of task migration(mem_cgroup_clear_mc()) for performance 3186 * improvement. But we cannot postpone mem_cgroup_get(to) 3187 * because if the process that has been moved to @to does 3188 * swap-in, the refcount of @to might be decreased to 0. 3189 */ 3190 mem_cgroup_get(to); 3191 if (need_fixup) { 3192 if (!mem_cgroup_is_root(from)) 3193 res_counter_uncharge(&from->memsw, PAGE_SIZE); 3194 mem_cgroup_put(from); 3195 /* 3196 * we charged both to->res and to->memsw, so we should 3197 * uncharge to->res. 3198 */ 3199 if (!mem_cgroup_is_root(to)) 3200 res_counter_uncharge(&to->res, PAGE_SIZE); 3201 } 3202 return 0; 3203 } 3204 return -EINVAL; 3205 } 3206 #else 3207 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3208 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3209 { 3210 return -EINVAL; 3211 } 3212 #endif 3213 3214 /* 3215 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3216 * page belongs to. 3217 */ 3218 int mem_cgroup_prepare_migration(struct page *page, 3219 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3220 { 3221 struct mem_cgroup *mem = NULL; 3222 struct page_cgroup *pc; 3223 enum charge_type ctype; 3224 int ret = 0; 3225 3226 *ptr = NULL; 3227 3228 VM_BUG_ON(PageTransHuge(page)); 3229 if (mem_cgroup_disabled()) 3230 return 0; 3231 3232 pc = lookup_page_cgroup(page); 3233 lock_page_cgroup(pc); 3234 if (PageCgroupUsed(pc)) { 3235 mem = pc->mem_cgroup; 3236 css_get(&mem->css); 3237 /* 3238 * At migrating an anonymous page, its mapcount goes down 3239 * to 0 and uncharge() will be called. But, even if it's fully 3240 * unmapped, migration may fail and this page has to be 3241 * charged again. We set MIGRATION flag here and delay uncharge 3242 * until end_migration() is called 3243 * 3244 * Corner Case Thinking 3245 * A) 3246 * When the old page was mapped as Anon and it's unmap-and-freed 3247 * while migration was ongoing. 3248 * If unmap finds the old page, uncharge() of it will be delayed 3249 * until end_migration(). If unmap finds a new page, it's 3250 * uncharged when it make mapcount to be 1->0. If unmap code 3251 * finds swap_migration_entry, the new page will not be mapped 3252 * and end_migration() will find it(mapcount==0). 3253 * 3254 * B) 3255 * When the old page was mapped but migraion fails, the kernel 3256 * remaps it. A charge for it is kept by MIGRATION flag even 3257 * if mapcount goes down to 0. We can do remap successfully 3258 * without charging it again. 3259 * 3260 * C) 3261 * The "old" page is under lock_page() until the end of 3262 * migration, so, the old page itself will not be swapped-out. 3263 * If the new page is swapped out before end_migraton, our 3264 * hook to usual swap-out path will catch the event. 3265 */ 3266 if (PageAnon(page)) 3267 SetPageCgroupMigration(pc); 3268 } 3269 unlock_page_cgroup(pc); 3270 /* 3271 * If the page is not charged at this point, 3272 * we return here. 
3273 */ 3274 if (!mem) 3275 return 0; 3276 3277 *ptr = mem; 3278 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3279 css_put(&mem->css); /* drop extra refcnt */ 3280 if (ret || *ptr == NULL) { 3281 if (PageAnon(page)) { 3282 lock_page_cgroup(pc); 3283 ClearPageCgroupMigration(pc); 3284 unlock_page_cgroup(pc); 3285 /* 3286 * The old page may be fully unmapped while we kept it. 3287 */ 3288 mem_cgroup_uncharge_page(page); 3289 } 3290 return -ENOMEM; 3291 } 3292 /* 3293 * We charge the new page before it is used/mapped, so even if unlock_page() 3294 * is called before end_migration(), we can catch all events on this new 3295 * page. If the new page is migrated but never remapped, its mapcount 3296 * ends up at 0 and we uncharge it in end_migration(). 3297 */ 3298 pc = lookup_page_cgroup(newpage); 3299 if (PageAnon(page)) 3300 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3301 else if (page_is_file_cache(page)) 3302 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3303 else 3304 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3305 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 3306 return ret; 3307 } 3308 3309 /* remove the redundant charge if migration failed */ 3310 void mem_cgroup_end_migration(struct mem_cgroup *mem, 3311 struct page *oldpage, struct page *newpage, bool migration_ok) 3312 { 3313 struct page *used, *unused; 3314 struct page_cgroup *pc; 3315 3316 if (!mem) 3317 return; 3318 /* blocks rmdir() */ 3319 cgroup_exclude_rmdir(&mem->css); 3320 if (!migration_ok) { 3321 used = oldpage; 3322 unused = newpage; 3323 } else { 3324 used = newpage; 3325 unused = oldpage; 3326 } 3327 /* 3328 * We disallowed uncharging pages under migration because the page's 3329 * mapcount temporarily drops to zero. 3330 * Clear the flag and check whether the page should remain charged. 3331 */ 3332 pc = lookup_page_cgroup(oldpage); 3333 lock_page_cgroup(pc); 3334 ClearPageCgroupMigration(pc); 3335 unlock_page_cgroup(pc); 3336 3337 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3338 3339 /* 3340 * If the page is file cache, the radix-tree replacement is atomic 3341 * and we can skip this check. When it was an anon page, its mapcount 3342 * went down to 0, but because we added the MIGRATION flag it is not 3343 * uncharged yet. There are several cases, but the page->mapcount check 3344 * and the USED bit check in mem_cgroup_uncharge_page() do enough 3345 * checking. (See mem_cgroup_prepare_migration() as well.) 3346 */ 3347 if (PageAnon(used)) 3348 mem_cgroup_uncharge_page(used); 3349 /* 3350 * At migration, we may charge against a cgroup which has no 3351 * tasks. 3352 * So, rmdir()->pre_destroy() can be called while we hold this charge. 3353 * In that case, we need to call pre_destroy() again; check it here.
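 *
 * Overall migration accounting protocol, as a sketch (the page migration
 * code is the assumed caller; the two functions are the ones defined in
 * this file):
 *
 *	mem_cgroup_prepare_migration(page, newpage, &mem, gfp_mask);
 *	...migrate the page...
 *	mem_cgroup_end_migration(mem, page, newpage, migration_ok);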
3354 */ 3355 cgroup_release_and_wakeup_rmdir(&mem->css); 3356 } 3357 3358 #ifdef CONFIG_DEBUG_VM 3359 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3360 { 3361 struct page_cgroup *pc; 3362 3363 pc = lookup_page_cgroup(page); 3364 if (likely(pc) && PageCgroupUsed(pc)) 3365 return pc; 3366 return NULL; 3367 } 3368 3369 bool mem_cgroup_bad_page_check(struct page *page) 3370 { 3371 if (mem_cgroup_disabled()) 3372 return false; 3373 3374 return lookup_page_cgroup_used(page) != NULL; 3375 } 3376 3377 void mem_cgroup_print_bad_page(struct page *page) 3378 { 3379 struct page_cgroup *pc; 3380 3381 pc = lookup_page_cgroup_used(page); 3382 if (pc) { 3383 int ret = -1; 3384 char *path; 3385 3386 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", 3387 pc, pc->flags, pc->mem_cgroup); 3388 3389 path = kmalloc(PATH_MAX, GFP_KERNEL); 3390 if (path) { 3391 rcu_read_lock(); 3392 ret = cgroup_path(pc->mem_cgroup->css.cgroup, 3393 path, PATH_MAX); 3394 rcu_read_unlock(); 3395 } 3396 3397 printk(KERN_CONT "(%s)\n", 3398 (ret < 0) ? "cannot get the path" : path); 3399 kfree(path); 3400 } 3401 } 3402 #endif 3403 3404 static DEFINE_MUTEX(set_limit_mutex); 3405 3406 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3407 unsigned long long val) 3408 { 3409 int retry_count; 3410 u64 memswlimit, memlimit; 3411 int ret = 0; 3412 int children = mem_cgroup_count_children(memcg); 3413 u64 curusage, oldusage; 3414 int enlarge; 3415 3416 /* 3417 * For keeping hierarchical_reclaim simple, how long we should retry 3418 * is depends on callers. We set our retry-count to be function 3419 * of # of children which we should visit in this loop. 3420 */ 3421 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3422 3423 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3424 3425 enlarge = 0; 3426 while (retry_count) { 3427 if (signal_pending(current)) { 3428 ret = -EINTR; 3429 break; 3430 } 3431 /* 3432 * Rather than hide all in some function, I do this in 3433 * open coded manner. You see what this really does. 3434 * We have to guarantee mem->res.limit < mem->memsw.limit. 3435 */ 3436 mutex_lock(&set_limit_mutex); 3437 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3438 if (memswlimit < val) { 3439 ret = -EINVAL; 3440 mutex_unlock(&set_limit_mutex); 3441 break; 3442 } 3443 3444 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3445 if (memlimit < val) 3446 enlarge = 1; 3447 3448 ret = res_counter_set_limit(&memcg->res, val); 3449 if (!ret) { 3450 if (memswlimit == val) 3451 memcg->memsw_is_minimum = true; 3452 else 3453 memcg->memsw_is_minimum = false; 3454 } 3455 mutex_unlock(&set_limit_mutex); 3456 3457 if (!ret) 3458 break; 3459 3460 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3461 MEM_CGROUP_RECLAIM_SHRINK, 3462 NULL); 3463 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3464 /* Usage is reduced ? 
*/ 3465 if (curusage >= oldusage) 3466 retry_count--; 3467 else 3468 oldusage = curusage; 3469 } 3470 if (!ret && enlarge) 3471 memcg_oom_recover(memcg); 3472 3473 return ret; 3474 } 3475 3476 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3477 unsigned long long val) 3478 { 3479 int retry_count; 3480 u64 memlimit, memswlimit, oldusage, curusage; 3481 int children = mem_cgroup_count_children(memcg); 3482 int ret = -EBUSY; 3483 int enlarge = 0; 3484 3485 /* see mem_cgroup_resize_res_limit */ 3486 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3487 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3488 while (retry_count) { 3489 if (signal_pending(current)) { 3490 ret = -EINTR; 3491 break; 3492 } 3493 /* 3494 * Rather than hide all in some function, I do this in 3495 * open coded manner. You see what this really does. 3496 * We have to guarantee mem->res.limit < mem->memsw.limit. 3497 */ 3498 mutex_lock(&set_limit_mutex); 3499 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3500 if (memlimit > val) { 3501 ret = -EINVAL; 3502 mutex_unlock(&set_limit_mutex); 3503 break; 3504 } 3505 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3506 if (memswlimit < val) 3507 enlarge = 1; 3508 ret = res_counter_set_limit(&memcg->memsw, val); 3509 if (!ret) { 3510 if (memlimit == val) 3511 memcg->memsw_is_minimum = true; 3512 else 3513 memcg->memsw_is_minimum = false; 3514 } 3515 mutex_unlock(&set_limit_mutex); 3516 3517 if (!ret) 3518 break; 3519 3520 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3521 MEM_CGROUP_RECLAIM_NOSWAP | 3522 MEM_CGROUP_RECLAIM_SHRINK, 3523 NULL); 3524 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3525 /* Usage is reduced ? */ 3526 if (curusage >= oldusage) 3527 retry_count--; 3528 else 3529 oldusage = curusage; 3530 } 3531 if (!ret && enlarge) 3532 memcg_oom_recover(memcg); 3533 return ret; 3534 } 3535 3536 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3537 gfp_t gfp_mask, 3538 unsigned long *total_scanned) 3539 { 3540 unsigned long nr_reclaimed = 0; 3541 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3542 unsigned long reclaimed; 3543 int loop = 0; 3544 struct mem_cgroup_tree_per_zone *mctz; 3545 unsigned long long excess; 3546 unsigned long nr_scanned; 3547 3548 if (order > 0) 3549 return 0; 3550 3551 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3552 /* 3553 * This loop can run a while, specially if mem_cgroup's continuously 3554 * keep exceeding their soft limit and putting the system under 3555 * pressure 3556 */ 3557 do { 3558 if (next_mz) 3559 mz = next_mz; 3560 else 3561 mz = mem_cgroup_largest_soft_limit_node(mctz); 3562 if (!mz) 3563 break; 3564 3565 nr_scanned = 0; 3566 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3567 gfp_mask, 3568 MEM_CGROUP_RECLAIM_SOFT, 3569 &nr_scanned); 3570 nr_reclaimed += reclaimed; 3571 *total_scanned += nr_scanned; 3572 spin_lock(&mctz->lock); 3573 3574 /* 3575 * If we failed to reclaim anything from this memory cgroup 3576 * it is time to move on to the next cgroup 3577 */ 3578 next_mz = NULL; 3579 if (!reclaimed) { 3580 do { 3581 /* 3582 * Loop until we find yet another one. 3583 * 3584 * By the time we get the soft_limit lock 3585 * again, someone might have aded the 3586 * group back on the RB tree. Iterate to 3587 * make sure we get a different mem. 
3588 * mem_cgroup_largest_soft_limit_node returns 3589 * NULL if no other cgroup is present on 3590 * the tree 3591 */ 3592 next_mz = 3593 __mem_cgroup_largest_soft_limit_node(mctz); 3594 if (next_mz == mz) 3595 css_put(&next_mz->mem->css); 3596 else /* next_mz == NULL or other memcg */ 3597 break; 3598 } while (1); 3599 } 3600 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3601 excess = res_counter_soft_limit_excess(&mz->mem->res); 3602 /* 3603 * One school of thought says that we should not add 3604 * back the node to the tree if reclaim returns 0. 3605 * But our reclaim could return 0, simply because due 3606 * to priority we are exposing a smaller subset of 3607 * memory to reclaim from. Consider this as a longer 3608 * term TODO. 3609 */ 3610 /* If excess == 0, no tree ops */ 3611 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3612 spin_unlock(&mctz->lock); 3613 css_put(&mz->mem->css); 3614 loop++; 3615 /* 3616 * Could not reclaim anything and there are no more 3617 * mem cgroups to try or we seem to be looping without 3618 * reclaiming anything. 3619 */ 3620 if (!nr_reclaimed && 3621 (next_mz == NULL || 3622 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3623 break; 3624 } while (!nr_reclaimed); 3625 if (next_mz) 3626 css_put(&next_mz->mem->css); 3627 return nr_reclaimed; 3628 } 3629 3630 /* 3631 * This routine traverse page_cgroup in given list and drop them all. 3632 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3633 */ 3634 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3635 int node, int zid, enum lru_list lru) 3636 { 3637 struct zone *zone; 3638 struct mem_cgroup_per_zone *mz; 3639 struct page_cgroup *pc, *busy; 3640 unsigned long flags, loop; 3641 struct list_head *list; 3642 int ret = 0; 3643 3644 zone = &NODE_DATA(node)->node_zones[zid]; 3645 mz = mem_cgroup_zoneinfo(mem, node, zid); 3646 list = &mz->lists[lru]; 3647 3648 loop = MEM_CGROUP_ZSTAT(mz, lru); 3649 /* give some margin against EBUSY etc...*/ 3650 loop += 256; 3651 busy = NULL; 3652 while (loop--) { 3653 struct page *page; 3654 3655 ret = 0; 3656 spin_lock_irqsave(&zone->lru_lock, flags); 3657 if (list_empty(list)) { 3658 spin_unlock_irqrestore(&zone->lru_lock, flags); 3659 break; 3660 } 3661 pc = list_entry(list->prev, struct page_cgroup, lru); 3662 if (busy == pc) { 3663 list_move(&pc->lru, list); 3664 busy = NULL; 3665 spin_unlock_irqrestore(&zone->lru_lock, flags); 3666 continue; 3667 } 3668 spin_unlock_irqrestore(&zone->lru_lock, flags); 3669 3670 page = lookup_cgroup_page(pc); 3671 3672 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); 3673 if (ret == -ENOMEM) 3674 break; 3675 3676 if (ret == -EBUSY || ret == -EINVAL) { 3677 /* found lock contention or "pc" is obsolete. */ 3678 busy = pc; 3679 cond_resched(); 3680 } else 3681 busy = NULL; 3682 } 3683 3684 if (!ret && !list_empty(list)) 3685 return -EBUSY; 3686 return ret; 3687 } 3688 3689 /* 3690 * make mem_cgroup's charge to be 0 if there is no task. 3691 * This enables deleting this mem_cgroup. 3692 */ 3693 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3694 { 3695 int ret; 3696 int node, zid, shrink; 3697 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3698 struct cgroup *cgrp = mem->css.cgroup; 3699 3700 css_get(&mem->css); 3701 3702 shrink = 0; 3703 /* should free all ? 
*/ 3704 if (free_all) 3705 goto try_to_free; 3706 move_account: 3707 do { 3708 ret = -EBUSY; 3709 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3710 goto out; 3711 ret = -EINTR; 3712 if (signal_pending(current)) 3713 goto out; 3714 /* This is for making all *used* pages to be on LRU. */ 3715 lru_add_drain_all(); 3716 drain_all_stock_sync(mem); 3717 ret = 0; 3718 mem_cgroup_start_move(mem); 3719 for_each_node_state(node, N_HIGH_MEMORY) { 3720 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3721 enum lru_list l; 3722 for_each_lru(l) { 3723 ret = mem_cgroup_force_empty_list(mem, 3724 node, zid, l); 3725 if (ret) 3726 break; 3727 } 3728 } 3729 if (ret) 3730 break; 3731 } 3732 mem_cgroup_end_move(mem); 3733 memcg_oom_recover(mem); 3734 /* it seems parent cgroup doesn't have enough mem */ 3735 if (ret == -ENOMEM) 3736 goto try_to_free; 3737 cond_resched(); 3738 /* "ret" should also be checked to ensure all lists are empty. */ 3739 } while (mem->res.usage > 0 || ret); 3740 out: 3741 css_put(&mem->css); 3742 return ret; 3743 3744 try_to_free: 3745 /* returns EBUSY if there is a task or if we come here twice. */ 3746 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3747 ret = -EBUSY; 3748 goto out; 3749 } 3750 /* we call try-to-free pages for make this cgroup empty */ 3751 lru_add_drain_all(); 3752 /* try to free all pages in this cgroup */ 3753 shrink = 1; 3754 while (nr_retries && mem->res.usage > 0) { 3755 int progress; 3756 3757 if (signal_pending(current)) { 3758 ret = -EINTR; 3759 goto out; 3760 } 3761 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3762 false); 3763 if (!progress) { 3764 nr_retries--; 3765 /* maybe some writeback is necessary */ 3766 congestion_wait(BLK_RW_ASYNC, HZ/10); 3767 } 3768 3769 } 3770 lru_add_drain(); 3771 /* try move_account...there may be some *locked* pages. */ 3772 goto move_account; 3773 } 3774 3775 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3776 { 3777 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3778 } 3779 3780 3781 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3782 { 3783 return mem_cgroup_from_cont(cont)->use_hierarchy; 3784 } 3785 3786 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3787 u64 val) 3788 { 3789 int retval = 0; 3790 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3791 struct cgroup *parent = cont->parent; 3792 struct mem_cgroup *parent_mem = NULL; 3793 3794 if (parent) 3795 parent_mem = mem_cgroup_from_cont(parent); 3796 3797 cgroup_lock(); 3798 /* 3799 * If parent's use_hierarchy is set, we can't make any modifications 3800 * in the child subtrees. If it is unset, then the change can 3801 * occur, provided the current cgroup has no children. 3802 * 3803 * For the root cgroup, parent_mem is NULL, we allow value to be 3804 * set if there are no children. 
3805 */ 3806 if ((!parent_mem || !parent_mem->use_hierarchy) && 3807 (val == 1 || val == 0)) { 3808 if (list_empty(&cont->children)) 3809 mem->use_hierarchy = val; 3810 else 3811 retval = -EBUSY; 3812 } else 3813 retval = -EINVAL; 3814 cgroup_unlock(); 3815 3816 return retval; 3817 } 3818 3819 3820 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, 3821 enum mem_cgroup_stat_index idx) 3822 { 3823 struct mem_cgroup *iter; 3824 long val = 0; 3825 3826 /* Per-cpu values can be negative, use a signed accumulator */ 3827 for_each_mem_cgroup_tree(iter, mem) 3828 val += mem_cgroup_read_stat(iter, idx); 3829 3830 if (val < 0) /* race ? */ 3831 val = 0; 3832 return val; 3833 } 3834 3835 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3836 { 3837 u64 val; 3838 3839 if (!mem_cgroup_is_root(mem)) { 3840 if (!swap) 3841 return res_counter_read_u64(&mem->res, RES_USAGE); 3842 else 3843 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3844 } 3845 3846 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); 3847 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); 3848 3849 if (swap) 3850 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3851 3852 return val << PAGE_SHIFT; 3853 } 3854 3855 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3856 { 3857 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3858 u64 val; 3859 int type, name; 3860 3861 type = MEMFILE_TYPE(cft->private); 3862 name = MEMFILE_ATTR(cft->private); 3863 switch (type) { 3864 case _MEM: 3865 if (name == RES_USAGE) 3866 val = mem_cgroup_usage(mem, false); 3867 else 3868 val = res_counter_read_u64(&mem->res, name); 3869 break; 3870 case _MEMSWAP: 3871 if (name == RES_USAGE) 3872 val = mem_cgroup_usage(mem, true); 3873 else 3874 val = res_counter_read_u64(&mem->memsw, name); 3875 break; 3876 default: 3877 BUG(); 3878 break; 3879 } 3880 return val; 3881 } 3882 /* 3883 * The user of this function is... 3884 * RES_LIMIT. 3885 */ 3886 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3887 const char *buffer) 3888 { 3889 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3890 int type, name; 3891 unsigned long long val; 3892 int ret; 3893 3894 type = MEMFILE_TYPE(cft->private); 3895 name = MEMFILE_ATTR(cft->private); 3896 switch (name) { 3897 case RES_LIMIT: 3898 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3899 ret = -EINVAL; 3900 break; 3901 } 3902 /* This function does all necessary parse...reuse it */ 3903 ret = res_counter_memparse_write_strategy(buffer, &val); 3904 if (ret) 3905 break; 3906 if (type == _MEM) 3907 ret = mem_cgroup_resize_limit(memcg, val); 3908 else 3909 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3910 break; 3911 case RES_SOFT_LIMIT: 3912 ret = res_counter_memparse_write_strategy(buffer, &val); 3913 if (ret) 3914 break; 3915 /* 3916 * For memsw, soft limits are hard to implement in terms 3917 * of semantics, for now, we support soft limits for 3918 * control without swap 3919 */ 3920 if (type == _MEM) 3921 ret = res_counter_set_soft_limit(&memcg->res, val); 3922 else 3923 ret = -EINVAL; 3924 break; 3925 default: 3926 ret = -EINVAL; /* should be BUG() ? 
*/ 3927 break; 3928 } 3929 return ret; 3930 } 3931 3932 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3933 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3934 { 3935 struct cgroup *cgroup; 3936 unsigned long long min_limit, min_memsw_limit, tmp; 3937 3938 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3939 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3940 cgroup = memcg->css.cgroup; 3941 if (!memcg->use_hierarchy) 3942 goto out; 3943 3944 while (cgroup->parent) { 3945 cgroup = cgroup->parent; 3946 memcg = mem_cgroup_from_cont(cgroup); 3947 if (!memcg->use_hierarchy) 3948 break; 3949 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3950 min_limit = min(min_limit, tmp); 3951 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3952 min_memsw_limit = min(min_memsw_limit, tmp); 3953 } 3954 out: 3955 *mem_limit = min_limit; 3956 *memsw_limit = min_memsw_limit; 3957 return; 3958 } 3959 3960 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3961 { 3962 struct mem_cgroup *mem; 3963 int type, name; 3964 3965 mem = mem_cgroup_from_cont(cont); 3966 type = MEMFILE_TYPE(event); 3967 name = MEMFILE_ATTR(event); 3968 switch (name) { 3969 case RES_MAX_USAGE: 3970 if (type == _MEM) 3971 res_counter_reset_max(&mem->res); 3972 else 3973 res_counter_reset_max(&mem->memsw); 3974 break; 3975 case RES_FAILCNT: 3976 if (type == _MEM) 3977 res_counter_reset_failcnt(&mem->res); 3978 else 3979 res_counter_reset_failcnt(&mem->memsw); 3980 break; 3981 } 3982 3983 return 0; 3984 } 3985 3986 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3987 struct cftype *cft) 3988 { 3989 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3990 } 3991 3992 #ifdef CONFIG_MMU 3993 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3994 struct cftype *cft, u64 val) 3995 { 3996 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3997 3998 if (val >= (1 << NR_MOVE_TYPE)) 3999 return -EINVAL; 4000 /* 4001 * We check this value several times in both in can_attach() and 4002 * attach(), so we need cgroup lock to prevent this value from being 4003 * inconsistent. 
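 * "val" is a bitmask of charge types to move: bit 0 selects anonymous
 * pages (tested via move_anon()) and bit 1 selects file/shmem pages
 * (tested via move_file()), so with the current NR_MOVE_TYPE only the
 * values 0-3 are accepted; e.g. writing 3 to
 * memory.move_charge_at_immigrate enables both.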
4004 */ 4005 cgroup_lock(); 4006 mem->move_charge_at_immigrate = val; 4007 cgroup_unlock(); 4008 4009 return 0; 4010 } 4011 #else 4012 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4013 struct cftype *cft, u64 val) 4014 { 4015 return -ENOSYS; 4016 } 4017 #endif 4018 4019 4020 /* For read statistics */ 4021 enum { 4022 MCS_CACHE, 4023 MCS_RSS, 4024 MCS_FILE_MAPPED, 4025 MCS_PGPGIN, 4026 MCS_PGPGOUT, 4027 MCS_SWAP, 4028 MCS_PGFAULT, 4029 MCS_PGMAJFAULT, 4030 MCS_INACTIVE_ANON, 4031 MCS_ACTIVE_ANON, 4032 MCS_INACTIVE_FILE, 4033 MCS_ACTIVE_FILE, 4034 MCS_UNEVICTABLE, 4035 NR_MCS_STAT, 4036 }; 4037 4038 struct mcs_total_stat { 4039 s64 stat[NR_MCS_STAT]; 4040 }; 4041 4042 struct { 4043 char *local_name; 4044 char *total_name; 4045 } memcg_stat_strings[NR_MCS_STAT] = { 4046 {"cache", "total_cache"}, 4047 {"rss", "total_rss"}, 4048 {"mapped_file", "total_mapped_file"}, 4049 {"pgpgin", "total_pgpgin"}, 4050 {"pgpgout", "total_pgpgout"}, 4051 {"swap", "total_swap"}, 4052 {"pgfault", "total_pgfault"}, 4053 {"pgmajfault", "total_pgmajfault"}, 4054 {"inactive_anon", "total_inactive_anon"}, 4055 {"active_anon", "total_active_anon"}, 4056 {"inactive_file", "total_inactive_file"}, 4057 {"active_file", "total_active_file"}, 4058 {"unevictable", "total_unevictable"} 4059 }; 4060 4061 4062 static void 4063 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4064 { 4065 s64 val; 4066 4067 /* per cpu stat */ 4068 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 4069 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4070 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 4071 s->stat[MCS_RSS] += val * PAGE_SIZE; 4072 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 4073 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4074 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); 4075 s->stat[MCS_PGPGIN] += val; 4076 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); 4077 s->stat[MCS_PGPGOUT] += val; 4078 if (do_swap_account) { 4079 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4080 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4081 } 4082 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); 4083 s->stat[MCS_PGFAULT] += val; 4084 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); 4085 s->stat[MCS_PGMAJFAULT] += val; 4086 4087 /* per zone stat */ 4088 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); 4089 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4090 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); 4091 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4092 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); 4093 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4094 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); 4095 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4096 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); 4097 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4098 } 4099 4100 static void 4101 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4102 { 4103 struct mem_cgroup *iter; 4104 4105 for_each_mem_cgroup_tree(iter, mem) 4106 mem_cgroup_get_local_stat(iter, s); 4107 } 4108 4109 #ifdef CONFIG_NUMA 4110 static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4111 { 4112 int nid; 4113 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4114 unsigned long node_nr; 4115 struct cgroup *cont = m->private; 4116 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4117 4118 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); 4119 
seq_printf(m, "total=%lu", total_nr); 4120 for_each_node_state(nid, N_HIGH_MEMORY) { 4121 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); 4122 seq_printf(m, " N%d=%lu", nid, node_nr); 4123 } 4124 seq_putc(m, '\n'); 4125 4126 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); 4127 seq_printf(m, "file=%lu", file_nr); 4128 for_each_node_state(nid, N_HIGH_MEMORY) { 4129 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4130 LRU_ALL_FILE); 4131 seq_printf(m, " N%d=%lu", nid, node_nr); 4132 } 4133 seq_putc(m, '\n'); 4134 4135 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); 4136 seq_printf(m, "anon=%lu", anon_nr); 4137 for_each_node_state(nid, N_HIGH_MEMORY) { 4138 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4139 LRU_ALL_ANON); 4140 seq_printf(m, " N%d=%lu", nid, node_nr); 4141 } 4142 seq_putc(m, '\n'); 4143 4144 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); 4145 seq_printf(m, "unevictable=%lu", unevictable_nr); 4146 for_each_node_state(nid, N_HIGH_MEMORY) { 4147 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4148 BIT(LRU_UNEVICTABLE)); 4149 seq_printf(m, " N%d=%lu", nid, node_nr); 4150 } 4151 seq_putc(m, '\n'); 4152 return 0; 4153 } 4154 #endif /* CONFIG_NUMA */ 4155 4156 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4157 struct cgroup_map_cb *cb) 4158 { 4159 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4160 struct mcs_total_stat mystat; 4161 int i; 4162 4163 memset(&mystat, 0, sizeof(mystat)); 4164 mem_cgroup_get_local_stat(mem_cont, &mystat); 4165 4166 4167 for (i = 0; i < NR_MCS_STAT; i++) { 4168 if (i == MCS_SWAP && !do_swap_account) 4169 continue; 4170 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4171 } 4172 4173 /* Hierarchical information */ 4174 { 4175 unsigned long long limit, memsw_limit; 4176 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 4177 cb->fill(cb, "hierarchical_memory_limit", limit); 4178 if (do_swap_account) 4179 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4180 } 4181 4182 memset(&mystat, 0, sizeof(mystat)); 4183 mem_cgroup_get_total_stat(mem_cont, &mystat); 4184 for (i = 0; i < NR_MCS_STAT; i++) { 4185 if (i == MCS_SWAP && !do_swap_account) 4186 continue; 4187 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4188 } 4189 4190 #ifdef CONFIG_DEBUG_VM 4191 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 4192 4193 { 4194 int nid, zid; 4195 struct mem_cgroup_per_zone *mz; 4196 unsigned long recent_rotated[2] = {0, 0}; 4197 unsigned long recent_scanned[2] = {0, 0}; 4198 4199 for_each_online_node(nid) 4200 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4201 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 4202 4203 recent_rotated[0] += 4204 mz->reclaim_stat.recent_rotated[0]; 4205 recent_rotated[1] += 4206 mz->reclaim_stat.recent_rotated[1]; 4207 recent_scanned[0] += 4208 mz->reclaim_stat.recent_scanned[0]; 4209 recent_scanned[1] += 4210 mz->reclaim_stat.recent_scanned[1]; 4211 } 4212 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4213 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4214 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4215 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4216 } 4217 #endif 4218 4219 return 0; 4220 } 4221 4222 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 4223 { 4224 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4225 4226 return mem_cgroup_swappiness(memcg); 4227 } 4228 4229 static 
int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4230 u64 val) 4231 { 4232 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4233 struct mem_cgroup *parent; 4234 4235 if (val > 100) 4236 return -EINVAL; 4237 4238 if (cgrp->parent == NULL) 4239 return -EINVAL; 4240 4241 parent = mem_cgroup_from_cont(cgrp->parent); 4242 4243 cgroup_lock(); 4244 4245 /* If under hierarchy, only empty-root can set this value */ 4246 if ((parent->use_hierarchy) || 4247 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4248 cgroup_unlock(); 4249 return -EINVAL; 4250 } 4251 4252 memcg->swappiness = val; 4253 4254 cgroup_unlock(); 4255 4256 return 0; 4257 } 4258 4259 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4260 { 4261 struct mem_cgroup_threshold_ary *t; 4262 u64 usage; 4263 int i; 4264 4265 rcu_read_lock(); 4266 if (!swap) 4267 t = rcu_dereference(memcg->thresholds.primary); 4268 else 4269 t = rcu_dereference(memcg->memsw_thresholds.primary); 4270 4271 if (!t) 4272 goto unlock; 4273 4274 usage = mem_cgroup_usage(memcg, swap); 4275 4276 /* 4277 * current_threshold points to threshold just below usage. 4278 * If it's not true, a threshold was crossed after last 4279 * call of __mem_cgroup_threshold(). 4280 */ 4281 i = t->current_threshold; 4282 4283 /* 4284 * Iterate backward over array of thresholds starting from 4285 * current_threshold and check if a threshold is crossed. 4286 * If none of thresholds below usage is crossed, we read 4287 * only one element of the array here. 4288 */ 4289 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4290 eventfd_signal(t->entries[i].eventfd, 1); 4291 4292 /* i = current_threshold + 1 */ 4293 i++; 4294 4295 /* 4296 * Iterate forward over array of thresholds starting from 4297 * current_threshold+1 and check if a threshold is crossed. 4298 * If none of thresholds above usage is crossed, we read 4299 * only one element of the array here. 
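 * Example: with thresholds {4M, 8M, 16M} and current_threshold at the 4M
 * entry, a usage that has grown to 9M makes this forward walk signal the
 * 8M eventfd, stop at 16M and leave current_threshold at the 8M entry;
 * had usage instead dropped below 4M, the backward walk above would have
 * signalled 4M and current_threshold would end up at -1.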
4300 */ 4301 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4302 eventfd_signal(t->entries[i].eventfd, 1); 4303 4304 /* Update current_threshold */ 4305 t->current_threshold = i - 1; 4306 unlock: 4307 rcu_read_unlock(); 4308 } 4309 4310 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4311 { 4312 while (memcg) { 4313 __mem_cgroup_threshold(memcg, false); 4314 if (do_swap_account) 4315 __mem_cgroup_threshold(memcg, true); 4316 4317 memcg = parent_mem_cgroup(memcg); 4318 } 4319 } 4320 4321 static int compare_thresholds(const void *a, const void *b) 4322 { 4323 const struct mem_cgroup_threshold *_a = a; 4324 const struct mem_cgroup_threshold *_b = b; 4325 4326 return _a->threshold - _b->threshold; 4327 } 4328 4329 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4330 { 4331 struct mem_cgroup_eventfd_list *ev; 4332 4333 list_for_each_entry(ev, &mem->oom_notify, list) 4334 eventfd_signal(ev->eventfd, 1); 4335 return 0; 4336 } 4337 4338 static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4339 { 4340 struct mem_cgroup *iter; 4341 4342 for_each_mem_cgroup_tree(iter, mem) 4343 mem_cgroup_oom_notify_cb(iter); 4344 } 4345 4346 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4347 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4348 { 4349 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4350 struct mem_cgroup_thresholds *thresholds; 4351 struct mem_cgroup_threshold_ary *new; 4352 int type = MEMFILE_TYPE(cft->private); 4353 u64 threshold, usage; 4354 int i, size, ret; 4355 4356 ret = res_counter_memparse_write_strategy(args, &threshold); 4357 if (ret) 4358 return ret; 4359 4360 mutex_lock(&memcg->thresholds_lock); 4361 4362 if (type == _MEM) 4363 thresholds = &memcg->thresholds; 4364 else if (type == _MEMSWAP) 4365 thresholds = &memcg->memsw_thresholds; 4366 else 4367 BUG(); 4368 4369 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4370 4371 /* Check if a threshold crossed before adding a new one */ 4372 if (thresholds->primary) 4373 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4374 4375 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4376 4377 /* Allocate memory for new array of thresholds */ 4378 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4379 GFP_KERNEL); 4380 if (!new) { 4381 ret = -ENOMEM; 4382 goto unlock; 4383 } 4384 new->size = size; 4385 4386 /* Copy thresholds (if any) to new array */ 4387 if (thresholds->primary) { 4388 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4389 sizeof(struct mem_cgroup_threshold)); 4390 } 4391 4392 /* Add new threshold */ 4393 new->entries[size - 1].eventfd = eventfd; 4394 new->entries[size - 1].threshold = threshold; 4395 4396 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4397 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4398 compare_thresholds, NULL); 4399 4400 /* Find current threshold */ 4401 new->current_threshold = -1; 4402 for (i = 0; i < size; i++) { 4403 if (new->entries[i].threshold < usage) { 4404 /* 4405 * new->current_threshold will not be used until 4406 * rcu_assign_pointer(), so it's safe to increment 4407 * it here. 
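 * Example: for sorted thresholds {4M, 8M, 16M} and a usage of 10M this
 * loop leaves current_threshold at index 1, i.e. the 8M entry just
 * below the usage, which is what __mem_cgroup_threshold() expects to
 * find on the next crossing check.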
4408 */ 4409 ++new->current_threshold; 4410 } 4411 } 4412 4413 /* Free old spare buffer and save old primary buffer as spare */ 4414 kfree(thresholds->spare); 4415 thresholds->spare = thresholds->primary; 4416 4417 rcu_assign_pointer(thresholds->primary, new); 4418 4419 /* To be sure that nobody uses thresholds */ 4420 synchronize_rcu(); 4421 4422 unlock: 4423 mutex_unlock(&memcg->thresholds_lock); 4424 4425 return ret; 4426 } 4427 4428 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4429 struct cftype *cft, struct eventfd_ctx *eventfd) 4430 { 4431 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4432 struct mem_cgroup_thresholds *thresholds; 4433 struct mem_cgroup_threshold_ary *new; 4434 int type = MEMFILE_TYPE(cft->private); 4435 u64 usage; 4436 int i, j, size; 4437 4438 mutex_lock(&memcg->thresholds_lock); 4439 if (type == _MEM) 4440 thresholds = &memcg->thresholds; 4441 else if (type == _MEMSWAP) 4442 thresholds = &memcg->memsw_thresholds; 4443 else 4444 BUG(); 4445 4446 /* 4447 * Something went wrong if we trying to unregister a threshold 4448 * if we don't have thresholds 4449 */ 4450 BUG_ON(!thresholds); 4451 4452 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4453 4454 /* Check if a threshold crossed before removing */ 4455 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4456 4457 /* Calculate new number of threshold */ 4458 size = 0; 4459 for (i = 0; i < thresholds->primary->size; i++) { 4460 if (thresholds->primary->entries[i].eventfd != eventfd) 4461 size++; 4462 } 4463 4464 new = thresholds->spare; 4465 4466 /* Set thresholds array to NULL if we don't have thresholds */ 4467 if (!size) { 4468 kfree(new); 4469 new = NULL; 4470 goto swap_buffers; 4471 } 4472 4473 new->size = size; 4474 4475 /* Copy thresholds and find current threshold */ 4476 new->current_threshold = -1; 4477 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4478 if (thresholds->primary->entries[i].eventfd == eventfd) 4479 continue; 4480 4481 new->entries[j] = thresholds->primary->entries[i]; 4482 if (new->entries[j].threshold < usage) { 4483 /* 4484 * new->current_threshold will not be used 4485 * until rcu_assign_pointer(), so it's safe to increment 4486 * it here. 4487 */ 4488 ++new->current_threshold; 4489 } 4490 j++; 4491 } 4492 4493 swap_buffers: 4494 /* Swap primary and spare array */ 4495 thresholds->spare = thresholds->primary; 4496 rcu_assign_pointer(thresholds->primary, new); 4497 4498 /* To be sure that nobody uses thresholds */ 4499 synchronize_rcu(); 4500 4501 mutex_unlock(&memcg->thresholds_lock); 4502 } 4503 4504 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4505 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4506 { 4507 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4508 struct mem_cgroup_eventfd_list *event; 4509 int type = MEMFILE_TYPE(cft->private); 4510 4511 BUG_ON(type != _OOM_TYPE); 4512 event = kmalloc(sizeof(*event), GFP_KERNEL); 4513 if (!event) 4514 return -ENOMEM; 4515 4516 spin_lock(&memcg_oom_lock); 4517 4518 event->eventfd = eventfd; 4519 list_add(&event->list, &memcg->oom_notify); 4520 4521 /* already in OOM ? 
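 * If the group is already under OOM, signal the new listener right away
 * so that it does not miss the condition that is in progress.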
*/ 4522 if (atomic_read(&memcg->under_oom)) 4523 eventfd_signal(eventfd, 1); 4524 spin_unlock(&memcg_oom_lock); 4525 4526 return 0; 4527 } 4528 4529 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4530 struct cftype *cft, struct eventfd_ctx *eventfd) 4531 { 4532 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4533 struct mem_cgroup_eventfd_list *ev, *tmp; 4534 int type = MEMFILE_TYPE(cft->private); 4535 4536 BUG_ON(type != _OOM_TYPE); 4537 4538 spin_lock(&memcg_oom_lock); 4539 4540 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4541 if (ev->eventfd == eventfd) { 4542 list_del(&ev->list); 4543 kfree(ev); 4544 } 4545 } 4546 4547 spin_unlock(&memcg_oom_lock); 4548 } 4549 4550 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4551 struct cftype *cft, struct cgroup_map_cb *cb) 4552 { 4553 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4554 4555 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4556 4557 if (atomic_read(&mem->under_oom)) 4558 cb->fill(cb, "under_oom", 1); 4559 else 4560 cb->fill(cb, "under_oom", 0); 4561 return 0; 4562 } 4563 4564 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4565 struct cftype *cft, u64 val) 4566 { 4567 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4568 struct mem_cgroup *parent; 4569 4570 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4571 if (!cgrp->parent || !((val == 0) || (val == 1))) 4572 return -EINVAL; 4573 4574 parent = mem_cgroup_from_cont(cgrp->parent); 4575 4576 cgroup_lock(); 4577 /* oom-kill-disable is a flag for subhierarchy. */ 4578 if ((parent->use_hierarchy) || 4579 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4580 cgroup_unlock(); 4581 return -EINVAL; 4582 } 4583 mem->oom_kill_disable = val; 4584 if (!val) 4585 memcg_oom_recover(mem); 4586 cgroup_unlock(); 4587 return 0; 4588 } 4589 4590 #ifdef CONFIG_NUMA 4591 static const struct file_operations mem_control_numa_stat_file_operations = { 4592 .read = seq_read, 4593 .llseek = seq_lseek, 4594 .release = single_release, 4595 }; 4596 4597 static int mem_control_numa_stat_open(struct inode *unused, struct file *file) 4598 { 4599 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4600 4601 file->f_op = &mem_control_numa_stat_file_operations; 4602 return single_open(file, mem_control_numa_stat_show, cont); 4603 } 4604 #endif /* CONFIG_NUMA */ 4605 4606 static struct cftype mem_cgroup_files[] = { 4607 { 4608 .name = "usage_in_bytes", 4609 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4610 .read_u64 = mem_cgroup_read, 4611 .register_event = mem_cgroup_usage_register_event, 4612 .unregister_event = mem_cgroup_usage_unregister_event, 4613 }, 4614 { 4615 .name = "max_usage_in_bytes", 4616 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4617 .trigger = mem_cgroup_reset, 4618 .read_u64 = mem_cgroup_read, 4619 }, 4620 { 4621 .name = "limit_in_bytes", 4622 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4623 .write_string = mem_cgroup_write, 4624 .read_u64 = mem_cgroup_read, 4625 }, 4626 { 4627 .name = "soft_limit_in_bytes", 4628 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4629 .write_string = mem_cgroup_write, 4630 .read_u64 = mem_cgroup_read, 4631 }, 4632 { 4633 .name = "failcnt", 4634 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4635 .trigger = mem_cgroup_reset, 4636 .read_u64 = mem_cgroup_read, 4637 }, 4638 { 4639 .name = "stat", 4640 .read_map = mem_control_stat_show, 4641 }, 4642 { 4643 .name = "force_empty", 4644 .trigger = mem_cgroup_force_empty_write, 4645 }, 4646 { 4647 .name = 
"use_hierarchy", 4648 .write_u64 = mem_cgroup_hierarchy_write, 4649 .read_u64 = mem_cgroup_hierarchy_read, 4650 }, 4651 { 4652 .name = "swappiness", 4653 .read_u64 = mem_cgroup_swappiness_read, 4654 .write_u64 = mem_cgroup_swappiness_write, 4655 }, 4656 { 4657 .name = "move_charge_at_immigrate", 4658 .read_u64 = mem_cgroup_move_charge_read, 4659 .write_u64 = mem_cgroup_move_charge_write, 4660 }, 4661 { 4662 .name = "oom_control", 4663 .read_map = mem_cgroup_oom_control_read, 4664 .write_u64 = mem_cgroup_oom_control_write, 4665 .register_event = mem_cgroup_oom_register_event, 4666 .unregister_event = mem_cgroup_oom_unregister_event, 4667 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4668 }, 4669 #ifdef CONFIG_NUMA 4670 { 4671 .name = "numa_stat", 4672 .open = mem_control_numa_stat_open, 4673 .mode = S_IRUGO, 4674 }, 4675 #endif 4676 }; 4677 4678 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4679 static struct cftype memsw_cgroup_files[] = { 4680 { 4681 .name = "memsw.usage_in_bytes", 4682 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4683 .read_u64 = mem_cgroup_read, 4684 .register_event = mem_cgroup_usage_register_event, 4685 .unregister_event = mem_cgroup_usage_unregister_event, 4686 }, 4687 { 4688 .name = "memsw.max_usage_in_bytes", 4689 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4690 .trigger = mem_cgroup_reset, 4691 .read_u64 = mem_cgroup_read, 4692 }, 4693 { 4694 .name = "memsw.limit_in_bytes", 4695 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4696 .write_string = mem_cgroup_write, 4697 .read_u64 = mem_cgroup_read, 4698 }, 4699 { 4700 .name = "memsw.failcnt", 4701 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4702 .trigger = mem_cgroup_reset, 4703 .read_u64 = mem_cgroup_read, 4704 }, 4705 }; 4706 4707 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4708 { 4709 if (!do_swap_account) 4710 return 0; 4711 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4712 ARRAY_SIZE(memsw_cgroup_files)); 4713 }; 4714 #else 4715 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4716 { 4717 return 0; 4718 } 4719 #endif 4720 4721 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4722 { 4723 struct mem_cgroup_per_node *pn; 4724 struct mem_cgroup_per_zone *mz; 4725 enum lru_list l; 4726 int zone, tmp = node; 4727 /* 4728 * This routine is called against possible nodes. 4729 * But it's BUG to call kmalloc() against offline node. 4730 * 4731 * TODO: this routine can waste much memory for nodes which will 4732 * never be onlined. It's better to use memory hotplug callback 4733 * function. 
4734 */ 4735 if (!node_state(node, N_NORMAL_MEMORY)) 4736 tmp = -1; 4737 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4738 if (!pn) 4739 return 1; 4740 4741 mem->info.nodeinfo[node] = pn; 4742 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4743 mz = &pn->zoneinfo[zone]; 4744 for_each_lru(l) 4745 INIT_LIST_HEAD(&mz->lists[l]); 4746 mz->usage_in_excess = 0; 4747 mz->on_tree = false; 4748 mz->mem = mem; 4749 } 4750 return 0; 4751 } 4752 4753 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4754 { 4755 kfree(mem->info.nodeinfo[node]); 4756 } 4757 4758 static struct mem_cgroup *mem_cgroup_alloc(void) 4759 { 4760 struct mem_cgroup *mem; 4761 int size = sizeof(struct mem_cgroup); 4762 4763 /* Can be very big if MAX_NUMNODES is very big */ 4764 if (size < PAGE_SIZE) 4765 mem = kzalloc(size, GFP_KERNEL); 4766 else 4767 mem = vzalloc(size); 4768 4769 if (!mem) 4770 return NULL; 4771 4772 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4773 if (!mem->stat) 4774 goto out_free; 4775 spin_lock_init(&mem->pcp_counter_lock); 4776 return mem; 4777 4778 out_free: 4779 if (size < PAGE_SIZE) 4780 kfree(mem); 4781 else 4782 vfree(mem); 4783 return NULL; 4784 } 4785 4786 /* 4787 * At destroying mem_cgroup, references from swap_cgroup can remain. 4788 * (scanning all at force_empty is too costly...) 4789 * 4790 * Instead of clearing all references at force_empty, we remember 4791 * the number of reference from swap_cgroup and free mem_cgroup when 4792 * it goes down to 0. 4793 * 4794 * Removal of cgroup itself succeeds regardless of refs from swap. 4795 */ 4796 4797 static void __mem_cgroup_free(struct mem_cgroup *mem) 4798 { 4799 int node; 4800 4801 mem_cgroup_remove_from_trees(mem); 4802 free_css_id(&mem_cgroup_subsys, &mem->css); 4803 4804 for_each_node_state(node, N_POSSIBLE) 4805 free_mem_cgroup_per_zone_info(mem, node); 4806 4807 free_percpu(mem->stat); 4808 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4809 kfree(mem); 4810 else 4811 vfree(mem); 4812 } 4813 4814 static void mem_cgroup_get(struct mem_cgroup *mem) 4815 { 4816 atomic_inc(&mem->refcnt); 4817 } 4818 4819 static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4820 { 4821 if (atomic_sub_and_test(count, &mem->refcnt)) { 4822 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4823 __mem_cgroup_free(mem); 4824 if (parent) 4825 mem_cgroup_put(parent); 4826 } 4827 } 4828 4829 static void mem_cgroup_put(struct mem_cgroup *mem) 4830 { 4831 __mem_cgroup_put(mem, 1); 4832 } 4833 4834 /* 4835 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
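 * Returns NULL for the root cgroup, and also for a child created with
 * use_hierarchy disabled, because mem_cgroup_create() only chains
 * res.parent to the parent's res_counter when the parent uses hierarchy.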
4836 */ 4837 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4838 { 4839 if (!mem->res.parent) 4840 return NULL; 4841 return mem_cgroup_from_res_counter(mem->res.parent, res); 4842 } 4843 4844 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4845 static void __init enable_swap_cgroup(void) 4846 { 4847 if (!mem_cgroup_disabled() && really_do_swap_account) 4848 do_swap_account = 1; 4849 } 4850 #else 4851 static void __init enable_swap_cgroup(void) 4852 { 4853 } 4854 #endif 4855 4856 static int mem_cgroup_soft_limit_tree_init(void) 4857 { 4858 struct mem_cgroup_tree_per_node *rtpn; 4859 struct mem_cgroup_tree_per_zone *rtpz; 4860 int tmp, node, zone; 4861 4862 for_each_node_state(node, N_POSSIBLE) { 4863 tmp = node; 4864 if (!node_state(node, N_NORMAL_MEMORY)) 4865 tmp = -1; 4866 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4867 if (!rtpn) 4868 return 1; 4869 4870 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4871 4872 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4873 rtpz = &rtpn->rb_tree_per_zone[zone]; 4874 rtpz->rb_root = RB_ROOT; 4875 spin_lock_init(&rtpz->lock); 4876 } 4877 } 4878 return 0; 4879 } 4880 4881 static struct cgroup_subsys_state * __ref 4882 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4883 { 4884 struct mem_cgroup *mem, *parent; 4885 long error = -ENOMEM; 4886 int node; 4887 4888 mem = mem_cgroup_alloc(); 4889 if (!mem) 4890 return ERR_PTR(error); 4891 4892 for_each_node_state(node, N_POSSIBLE) 4893 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4894 goto free_out; 4895 4896 /* root ? */ 4897 if (cont->parent == NULL) { 4898 int cpu; 4899 enable_swap_cgroup(); 4900 parent = NULL; 4901 root_mem_cgroup = mem; 4902 if (mem_cgroup_soft_limit_tree_init()) 4903 goto free_out; 4904 for_each_possible_cpu(cpu) { 4905 struct memcg_stock_pcp *stock = 4906 &per_cpu(memcg_stock, cpu); 4907 INIT_WORK(&stock->work, drain_local_stock); 4908 } 4909 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4910 } else { 4911 parent = mem_cgroup_from_cont(cont->parent); 4912 mem->use_hierarchy = parent->use_hierarchy; 4913 mem->oom_kill_disable = parent->oom_kill_disable; 4914 } 4915 4916 if (parent && parent->use_hierarchy) { 4917 res_counter_init(&mem->res, &parent->res); 4918 res_counter_init(&mem->memsw, &parent->memsw); 4919 /* 4920 * We increment refcnt of the parent to ensure that we can 4921 * safely access it on res_counter_charge/uncharge. 4922 * This refcnt will be decremented when freeing this 4923 * mem_cgroup(see mem_cgroup_put). 
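 * (the reference is dropped again from __mem_cgroup_put() once this
 * memcg's own refcount reaches zero and it is freed)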
4924 */ 4925 mem_cgroup_get(parent); 4926 } else { 4927 res_counter_init(&mem->res, NULL); 4928 res_counter_init(&mem->memsw, NULL); 4929 } 4930 mem->last_scanned_child = 0; 4931 mem->last_scanned_node = MAX_NUMNODES; 4932 INIT_LIST_HEAD(&mem->oom_notify); 4933 4934 if (parent) 4935 mem->swappiness = mem_cgroup_swappiness(parent); 4936 atomic_set(&mem->refcnt, 1); 4937 mem->move_charge_at_immigrate = 0; 4938 mutex_init(&mem->thresholds_lock); 4939 return &mem->css; 4940 free_out: 4941 __mem_cgroup_free(mem); 4942 root_mem_cgroup = NULL; 4943 return ERR_PTR(error); 4944 } 4945 4946 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4947 struct cgroup *cont) 4948 { 4949 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4950 4951 return mem_cgroup_force_empty(mem, false); 4952 } 4953 4954 static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4955 struct cgroup *cont) 4956 { 4957 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4958 4959 mem_cgroup_put(mem); 4960 } 4961 4962 static int mem_cgroup_populate(struct cgroup_subsys *ss, 4963 struct cgroup *cont) 4964 { 4965 int ret; 4966 4967 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 4968 ARRAY_SIZE(mem_cgroup_files)); 4969 4970 if (!ret) 4971 ret = register_memsw_files(cont, ss); 4972 return ret; 4973 } 4974 4975 #ifdef CONFIG_MMU 4976 /* Handlers for move charge at task migration. */ 4977 #define PRECHARGE_COUNT_AT_ONCE 256 4978 static int mem_cgroup_do_precharge(unsigned long count) 4979 { 4980 int ret = 0; 4981 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4982 struct mem_cgroup *mem = mc.to; 4983 4984 if (mem_cgroup_is_root(mem)) { 4985 mc.precharge += count; 4986 /* we don't need css_get for root */ 4987 return ret; 4988 } 4989 /* try to charge at once */ 4990 if (count > 1) { 4991 struct res_counter *dummy; 4992 /* 4993 * "mem" cannot be under rmdir() because we've already checked 4994 * by cgroup_lock_live_cgroup() that it is not removed and we 4995 * are still under the same cgroup_mutex. So we can postpone 4996 * css_get(). 4997 */ 4998 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 4999 goto one_by_one; 5000 if (do_swap_account && res_counter_charge(&mem->memsw, 5001 PAGE_SIZE * count, &dummy)) { 5002 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 5003 goto one_by_one; 5004 } 5005 mc.precharge += count; 5006 return ret; 5007 } 5008 one_by_one: 5009 /* fall back to one by one charge */ 5010 while (count--) { 5011 if (signal_pending(current)) { 5012 ret = -EINTR; 5013 break; 5014 } 5015 if (!batch_count--) { 5016 batch_count = PRECHARGE_COUNT_AT_ONCE; 5017 cond_resched(); 5018 } 5019 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 5020 if (ret || !mem) 5021 /* mem_cgroup_clear_mc() will do uncharge later */ 5022 return -ENOMEM; 5023 mc.precharge++; 5024 } 5025 return ret; 5026 } 5027 5028 /** 5029 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5030 * @vma: the vma the pte to be checked belongs 5031 * @addr: the address corresponding to the pte to be checked 5032 * @ptent: the pte to be checked 5033 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5034 * 5035 * Returns 5036 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5037 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5038 * move charge. if @target is not NULL, the page is stored in target->page 5039 * with extra refcnt got(Callers should handle it). 
5040 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5041 * target for charge migration. if @target is not NULL, the entry is stored 5042 * in target->ent. 5043 * 5044 * Called with pte lock held. 5045 */ 5046 union mc_target { 5047 struct page *page; 5048 swp_entry_t ent; 5049 }; 5050 5051 enum mc_target_type { 5052 MC_TARGET_NONE, /* not used */ 5053 MC_TARGET_PAGE, 5054 MC_TARGET_SWAP, 5055 }; 5056 5057 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5058 unsigned long addr, pte_t ptent) 5059 { 5060 struct page *page = vm_normal_page(vma, addr, ptent); 5061 5062 if (!page || !page_mapped(page)) 5063 return NULL; 5064 if (PageAnon(page)) { 5065 /* we don't move shared anon */ 5066 if (!move_anon() || page_mapcount(page) > 2) 5067 return NULL; 5068 } else if (!move_file()) 5069 /* we ignore mapcount for file pages */ 5070 return NULL; 5071 if (!get_page_unless_zero(page)) 5072 return NULL; 5073 5074 return page; 5075 } 5076 5077 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5078 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5079 { 5080 int usage_count; 5081 struct page *page = NULL; 5082 swp_entry_t ent = pte_to_swp_entry(ptent); 5083 5084 if (!move_anon() || non_swap_entry(ent)) 5085 return NULL; 5086 usage_count = mem_cgroup_count_swap_user(ent, &page); 5087 if (usage_count > 1) { /* we don't move shared anon */ 5088 if (page) 5089 put_page(page); 5090 return NULL; 5091 } 5092 if (do_swap_account) 5093 entry->val = ent.val; 5094 5095 return page; 5096 } 5097 5098 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5099 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5100 { 5101 struct page *page = NULL; 5102 struct inode *inode; 5103 struct address_space *mapping; 5104 pgoff_t pgoff; 5105 5106 if (!vma->vm_file) /* anonymous vma */ 5107 return NULL; 5108 if (!move_file()) 5109 return NULL; 5110 5111 inode = vma->vm_file->f_path.dentry->d_inode; 5112 mapping = vma->vm_file->f_mapping; 5113 if (pte_none(ptent)) 5114 pgoff = linear_page_index(vma, addr); 5115 else /* pte_file(ptent) is true */ 5116 pgoff = pte_to_pgoff(ptent); 5117 5118 /* page is moved even if it's not RSS of this task(page-faulted). */ 5119 page = find_get_page(mapping, pgoff); 5120 5121 #ifdef CONFIG_SWAP 5122 /* shmem/tmpfs may report page out on swap: account for that too. */ 5123 if (radix_tree_exceptional_entry(page)) { 5124 swp_entry_t swap = radix_to_swp_entry(page); 5125 if (do_swap_account) 5126 *entry = swap; 5127 page = find_get_page(&swapper_space, swap.val); 5128 } 5129 #endif 5130 return page; 5131 } 5132 5133 static int is_target_pte_for_mc(struct vm_area_struct *vma, 5134 unsigned long addr, pte_t ptent, union mc_target *target) 5135 { 5136 struct page *page = NULL; 5137 struct page_cgroup *pc; 5138 int ret = 0; 5139 swp_entry_t ent = { .val = 0 }; 5140 5141 if (pte_present(ptent)) 5142 page = mc_handle_present_pte(vma, addr, ptent); 5143 else if (is_swap_pte(ptent)) 5144 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5145 else if (pte_none(ptent) || pte_file(ptent)) 5146 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5147 5148 if (!page && !ent.val) 5149 return 0; 5150 if (page) { 5151 pc = lookup_page_cgroup(page); 5152 /* 5153 * Do only loose check w/o page_cgroup lock. 5154 * mem_cgroup_move_account() checks the pc is valid or not under 5155 * the lock. 
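 * If the page was uncharged or moved to another cgroup in the meantime,
 * that locked re-check simply fails and the caller skips the page.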
5156 */ 5157 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5158 ret = MC_TARGET_PAGE; 5159 if (target) 5160 target->page = page; 5161 } 5162 if (!ret || !target) 5163 put_page(page); 5164 } 5165 /* There is a swap entry and a page doesn't exist or isn't charged */ 5166 if (ent.val && !ret && 5167 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 5168 ret = MC_TARGET_SWAP; 5169 if (target) 5170 target->ent = ent; 5171 } 5172 return ret; 5173 } 5174 5175 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5176 unsigned long addr, unsigned long end, 5177 struct mm_walk *walk) 5178 { 5179 struct vm_area_struct *vma = walk->private; 5180 pte_t *pte; 5181 spinlock_t *ptl; 5182 5183 split_huge_page_pmd(walk->mm, pmd); 5184 5185 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5186 for (; addr != end; pte++, addr += PAGE_SIZE) 5187 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5188 mc.precharge++; /* increment precharge temporarily */ 5189 pte_unmap_unlock(pte - 1, ptl); 5190 cond_resched(); 5191 5192 return 0; 5193 } 5194 5195 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5196 { 5197 unsigned long precharge; 5198 struct vm_area_struct *vma; 5199 5200 down_read(&mm->mmap_sem); 5201 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5202 struct mm_walk mem_cgroup_count_precharge_walk = { 5203 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5204 .mm = mm, 5205 .private = vma, 5206 }; 5207 if (is_vm_hugetlb_page(vma)) 5208 continue; 5209 walk_page_range(vma->vm_start, vma->vm_end, 5210 &mem_cgroup_count_precharge_walk); 5211 } 5212 up_read(&mm->mmap_sem); 5213 5214 precharge = mc.precharge; 5215 mc.precharge = 0; 5216 5217 return precharge; 5218 } 5219 5220 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5221 { 5222 unsigned long precharge = mem_cgroup_count_precharge(mm); 5223 5224 VM_BUG_ON(mc.moving_task); 5225 mc.moving_task = current; 5226 return mem_cgroup_do_precharge(precharge); 5227 } 5228 5229 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5230 static void __mem_cgroup_clear_mc(void) 5231 { 5232 struct mem_cgroup *from = mc.from; 5233 struct mem_cgroup *to = mc.to; 5234 5235 /* we must uncharge all the leftover precharges from mc.to */ 5236 if (mc.precharge) { 5237 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 5238 mc.precharge = 0; 5239 } 5240 /* 5241 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5242 * we must uncharge here. 5243 */ 5244 if (mc.moved_charge) { 5245 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 5246 mc.moved_charge = 0; 5247 } 5248 /* we must fixup refcnts and charges */ 5249 if (mc.moved_swap) { 5250 /* uncharge swap account from the old cgroup */ 5251 if (!mem_cgroup_is_root(mc.from)) 5252 res_counter_uncharge(&mc.from->memsw, 5253 PAGE_SIZE * mc.moved_swap); 5254 __mem_cgroup_put(mc.from, mc.moved_swap); 5255 5256 if (!mem_cgroup_is_root(mc.to)) { 5257 /* 5258 * we charged both to->res and to->memsw, so we should 5259 * uncharge to->res. 5260 */ 5261 res_counter_uncharge(&mc.to->res, 5262 PAGE_SIZE * mc.moved_swap); 5263 } 5264 /* we've already done mem_cgroup_get(mc.to) */ 5265 mc.moved_swap = 0; 5266 } 5267 memcg_oom_recover(from); 5268 memcg_oom_recover(to); 5269 wake_up_all(&mc.waitq); 5270 } 5271 5272 static void mem_cgroup_clear_mc(void) 5273 { 5274 struct mem_cgroup *from = mc.from; 5275 5276 /* 5277 * we must clear moving_task before waking up waiters at the end of 5278 * task migration. 
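 * (__mem_cgroup_clear_mc() ends with wake_up_all(&mc.waitq), so the
 * flag has to be cleared before that call is made)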
5279 */ 5280 mc.moving_task = NULL; 5281 __mem_cgroup_clear_mc(); 5282 spin_lock(&mc.lock); 5283 mc.from = NULL; 5284 mc.to = NULL; 5285 spin_unlock(&mc.lock); 5286 mem_cgroup_end_move(from); 5287 } 5288 5289 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5290 struct cgroup *cgroup, 5291 struct task_struct *p) 5292 { 5293 int ret = 0; 5294 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5295 5296 if (mem->move_charge_at_immigrate) { 5297 struct mm_struct *mm; 5298 struct mem_cgroup *from = mem_cgroup_from_task(p); 5299 5300 VM_BUG_ON(from == mem); 5301 5302 mm = get_task_mm(p); 5303 if (!mm) 5304 return 0; 5305 /* We move charges only when we move a owner of the mm */ 5306 if (mm->owner == p) { 5307 VM_BUG_ON(mc.from); 5308 VM_BUG_ON(mc.to); 5309 VM_BUG_ON(mc.precharge); 5310 VM_BUG_ON(mc.moved_charge); 5311 VM_BUG_ON(mc.moved_swap); 5312 mem_cgroup_start_move(from); 5313 spin_lock(&mc.lock); 5314 mc.from = from; 5315 mc.to = mem; 5316 spin_unlock(&mc.lock); 5317 /* We set mc.moving_task later */ 5318 5319 ret = mem_cgroup_precharge_mc(mm); 5320 if (ret) 5321 mem_cgroup_clear_mc(); 5322 } 5323 mmput(mm); 5324 } 5325 return ret; 5326 } 5327 5328 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5329 struct cgroup *cgroup, 5330 struct task_struct *p) 5331 { 5332 mem_cgroup_clear_mc(); 5333 } 5334 5335 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5336 unsigned long addr, unsigned long end, 5337 struct mm_walk *walk) 5338 { 5339 int ret = 0; 5340 struct vm_area_struct *vma = walk->private; 5341 pte_t *pte; 5342 spinlock_t *ptl; 5343 5344 split_huge_page_pmd(walk->mm, pmd); 5345 retry: 5346 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5347 for (; addr != end; addr += PAGE_SIZE) { 5348 pte_t ptent = *(pte++); 5349 union mc_target target; 5350 int type; 5351 struct page *page; 5352 struct page_cgroup *pc; 5353 swp_entry_t ent; 5354 5355 if (!mc.precharge) 5356 break; 5357 5358 type = is_target_pte_for_mc(vma, addr, ptent, &target); 5359 switch (type) { 5360 case MC_TARGET_PAGE: 5361 page = target.page; 5362 if (isolate_lru_page(page)) 5363 goto put; 5364 pc = lookup_page_cgroup(page); 5365 if (!mem_cgroup_move_account(page, 1, pc, 5366 mc.from, mc.to, false)) { 5367 mc.precharge--; 5368 /* we uncharge from mc.from later. */ 5369 mc.moved_charge++; 5370 } 5371 putback_lru_page(page); 5372 put: /* is_target_pte_for_mc() gets the page */ 5373 put_page(page); 5374 break; 5375 case MC_TARGET_SWAP: 5376 ent = target.ent; 5377 if (!mem_cgroup_move_swap_account(ent, 5378 mc.from, mc.to, false)) { 5379 mc.precharge--; 5380 /* we fixup refcnts and charges later. */ 5381 mc.moved_swap++; 5382 } 5383 break; 5384 default: 5385 break; 5386 } 5387 } 5388 pte_unmap_unlock(pte - 1, ptl); 5389 cond_resched(); 5390 5391 if (addr != end) { 5392 /* 5393 * We have consumed all precharges we got in can_attach(). 5394 * We try charge one by one, but don't do any additional 5395 * charges to mc.to if we have failed in charge once in attach() 5396 * phase. 5397 */ 5398 ret = mem_cgroup_do_precharge(1); 5399 if (!ret) 5400 goto retry; 5401 } 5402 5403 return ret; 5404 } 5405 5406 static void mem_cgroup_move_charge(struct mm_struct *mm) 5407 { 5408 struct vm_area_struct *vma; 5409 5410 lru_add_drain_all(); 5411 retry: 5412 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5413 /* 5414 * Someone who are holding the mmap_sem might be waiting in 5415 * waitq. So we cancel all extra charges, wake up all waiters, 5416 * and retry. 
Because we cancel precharges, we might not be able 5417 * to move enough charges, but moving charge is a best-effort 5418 * feature anyway, so it wouldn't be a big problem. 5419 */ 5420 __mem_cgroup_clear_mc(); 5421 cond_resched(); 5422 goto retry; 5423 } 5424 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5425 int ret; 5426 struct mm_walk mem_cgroup_move_charge_walk = { 5427 .pmd_entry = mem_cgroup_move_charge_pte_range, 5428 .mm = mm, 5429 .private = vma, 5430 }; 5431 if (is_vm_hugetlb_page(vma)) 5432 continue; 5433 ret = walk_page_range(vma->vm_start, vma->vm_end, 5434 &mem_cgroup_move_charge_walk); 5435 if (ret) 5436 /* 5437 * means we have consumed all precharges and failed in 5438 * doing additional charge. Just abandon here. 5439 */ 5440 break; 5441 } 5442 up_read(&mm->mmap_sem); 5443 } 5444 5445 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5446 struct cgroup *cont, 5447 struct cgroup *old_cont, 5448 struct task_struct *p) 5449 { 5450 struct mm_struct *mm = get_task_mm(p); 5451 5452 if (mm) { 5453 if (mc.to) 5454 mem_cgroup_move_charge(mm); 5455 put_swap_token(mm); 5456 mmput(mm); 5457 } 5458 if (mc.to) 5459 mem_cgroup_clear_mc(); 5460 } 5461 #else /* !CONFIG_MMU */ 5462 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5463 struct cgroup *cgroup, 5464 struct task_struct *p) 5465 { 5466 return 0; 5467 } 5468 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5469 struct cgroup *cgroup, 5470 struct task_struct *p) 5471 { 5472 } 5473 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5474 struct cgroup *cont, 5475 struct cgroup *old_cont, 5476 struct task_struct *p) 5477 { 5478 } 5479 #endif 5480 5481 struct cgroup_subsys mem_cgroup_subsys = { 5482 .name = "memory", 5483 .subsys_id = mem_cgroup_subsys_id, 5484 .create = mem_cgroup_create, 5485 .pre_destroy = mem_cgroup_pre_destroy, 5486 .destroy = mem_cgroup_destroy, 5487 .populate = mem_cgroup_populate, 5488 .can_attach = mem_cgroup_can_attach, 5489 .cancel_attach = mem_cgroup_cancel_attach, 5490 .attach = mem_cgroup_move_task, 5491 .early_init = 0, 5492 .use_id = 1, 5493 }; 5494 5495 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5496 static int __init enable_swap_account(char *s) 5497 { 5498 /* consider enabled if no parameter or 1 is given */ 5499 if (!strcmp(s, "1")) 5500 really_do_swap_account = 1; 5501 else if (!strcmp(s, "0")) 5502 really_do_swap_account = 0; 5503 return 1; 5504 } 5505 __setup("swapaccount=", enable_swap_account); 5506 5507 #endif 5508
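/*
 * Illustrative userspace sketch (not part of this file): the eventfd-based
 * threshold interface implemented above by mem_cgroup_usage_register_event()
 * is driven from userspace by writing
 * "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to the group's
 * cgroup.event_control file.  A minimal client could look like the snippet
 * below; the cgroup v1 mount point and the group name are assumptions, and
 * the program needs permission to write the control file.
 *
 *	#include <sys/eventfd.h>
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		const char *grp = "/sys/fs/cgroup/memory/mygroup";
 *		char buf[128];
 *		uint64_t ticks;
 *		int efd, ufd, cfd;
 *
 *		efd = eventfd(0, 0);
 *		snprintf(buf, sizeof(buf), "%s/memory.usage_in_bytes", grp);
 *		ufd = open(buf, O_RDONLY);
 *		snprintf(buf, sizeof(buf), "%s/cgroup.event_control", grp);
 *		cfd = open(buf, O_WRONLY);
 *		if (efd < 0 || ufd < 0 || cfd < 0)
 *			return 1;
 *
 *		// register: "<event_fd> <usage_fd> <threshold in bytes>"
 *		snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 64ULL << 20);
 *		if (write(cfd, buf, strlen(buf)) < 0)
 *			return 1;
 *
 *		// blocks until usage crosses 64M in either direction
 *		if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
 *			printf("threshold crossed %llu time(s)\n",
 *			       (unsigned long long)ticks);
 *		return 0;
 *	}
 *
 * The same event_control protocol, aimed at the memory.oom_control file
 * instead, reaches mem_cgroup_oom_register_event() above and delivers OOM
 * notifications through the same eventfd mechanism.
 */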