1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 */ 23 24 #include <linux/res_counter.h> 25 #include <linux/memcontrol.h> 26 #include <linux/cgroup.h> 27 #include <linux/mm.h> 28 #include <linux/hugetlb.h> 29 #include <linux/pagemap.h> 30 #include <linux/smp.h> 31 #include <linux/page-flags.h> 32 #include <linux/backing-dev.h> 33 #include <linux/bit_spinlock.h> 34 #include <linux/rcupdate.h> 35 #include <linux/limits.h> 36 #include <linux/export.h> 37 #include <linux/mutex.h> 38 #include <linux/rbtree.h> 39 #include <linux/slab.h> 40 #include <linux/swap.h> 41 #include <linux/swapops.h> 42 #include <linux/spinlock.h> 43 #include <linux/eventfd.h> 44 #include <linux/sort.h> 45 #include <linux/fs.h> 46 #include <linux/seq_file.h> 47 #include <linux/vmalloc.h> 48 #include <linux/mm_inline.h> 49 #include <linux/page_cgroup.h> 50 #include <linux/cpu.h> 51 #include <linux/oom.h> 52 #include "internal.h" 53 #include <net/sock.h> 54 #include <net/tcp_memcontrol.h> 55 56 #include <asm/uaccess.h> 57 58 #include <trace/events/vmscan.h> 59 60 struct cgroup_subsys mem_cgroup_subsys __read_mostly; 61 #define MEM_CGROUP_RECLAIM_RETRIES 5 62 struct mem_cgroup *root_mem_cgroup __read_mostly; 63 64 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 65 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 66 int do_swap_account __read_mostly; 67 68 /* for remember boot option*/ 69 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 70 static int really_do_swap_account __initdata = 1; 71 #else 72 static int really_do_swap_account __initdata = 0; 73 #endif 74 75 #else 76 #define do_swap_account (0) 77 #endif 78 79 80 /* 81 * Statistics for memory cgroup. 82 */ 83 enum mem_cgroup_stat_index { 84 /* 85 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 86 */ 87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 92 MEM_CGROUP_STAT_NSTATS, 93 }; 94 95 enum mem_cgroup_events_index { 96 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 97 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 98 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ 99 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 100 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 101 MEM_CGROUP_EVENTS_NSTATS, 102 }; 103 /* 104 * Per memcg event counter is incremented at every pagein/pageout. With THP, 105 * it will be incremated by the number of pages. This counter is used for 106 * for trigger some periodic events. This is straightforward and better 107 * than using jiffies etc. to handle periodic memcg event. 108 */ 109 enum mem_cgroup_events_target { 110 MEM_CGROUP_TARGET_THRESH, 111 MEM_CGROUP_TARGET_SOFTLIMIT, 112 MEM_CGROUP_TARGET_NUMAINFO, 113 MEM_CGROUP_NTARGETS, 114 }; 115 #define THRESHOLDS_EVENTS_TARGET (128) 116 #define SOFTLIMIT_EVENTS_TARGET (1024) 117 #define NUMAINFO_EVENTS_TARGET (1024) 118 119 struct mem_cgroup_stat_cpu { 120 long count[MEM_CGROUP_STAT_NSTATS]; 121 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 122 unsigned long targets[MEM_CGROUP_NTARGETS]; 123 }; 124 125 struct mem_cgroup_reclaim_iter { 126 /* css_id of the last scanned hierarchy member */ 127 int position; 128 /* scan generation, increased every round-trip */ 129 unsigned int generation; 130 }; 131 132 /* 133 * per-zone information in memory controller. 134 */ 135 struct mem_cgroup_per_zone { 136 struct lruvec lruvec; 137 unsigned long lru_size[NR_LRU_LISTS]; 138 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 140 141 struct zone_reclaim_stat reclaim_stat; 142 struct rb_node tree_node; /* RB tree node */ 143 unsigned long long usage_in_excess;/* Set to the value by which */ 144 /* the soft limit is exceeded*/ 145 bool on_tree; 146 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 147 /* use container_of */ 148 }; 149 150 struct mem_cgroup_per_node { 151 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 152 }; 153 154 struct mem_cgroup_lru_info { 155 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 156 }; 157 158 /* 159 * Cgroups above their limits are maintained in a RB-Tree, independent of 160 * their hierarchy representation 161 */ 162 163 struct mem_cgroup_tree_per_zone { 164 struct rb_root rb_root; 165 spinlock_t lock; 166 }; 167 168 struct mem_cgroup_tree_per_node { 169 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 170 }; 171 172 struct mem_cgroup_tree { 173 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 174 }; 175 176 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 177 178 struct mem_cgroup_threshold { 179 struct eventfd_ctx *eventfd; 180 u64 threshold; 181 }; 182 183 /* For threshold */ 184 struct mem_cgroup_threshold_ary { 185 /* An array index points to threshold just below usage. */ 186 int current_threshold; 187 /* Size of entries[] */ 188 unsigned int size; 189 /* Array of thresholds */ 190 struct mem_cgroup_threshold entries[0]; 191 }; 192 193 struct mem_cgroup_thresholds { 194 /* Primary thresholds array */ 195 struct mem_cgroup_threshold_ary *primary; 196 /* 197 * Spare threshold array. 198 * This is needed to make mem_cgroup_unregister_event() "never fail". 199 * It must be able to store at least primary->size - 1 entries. 200 */ 201 struct mem_cgroup_threshold_ary *spare; 202 }; 203 204 /* for OOM */ 205 struct mem_cgroup_eventfd_list { 206 struct list_head list; 207 struct eventfd_ctx *eventfd; 208 }; 209 210 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 211 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 212 213 /* 214 * The memory controller data structure. The memory controller controls both 215 * page cache and RSS per cgroup. We would eventually like to provide 216 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 217 * to help the administrator determine what knobs to tune. 218 * 219 * TODO: Add a water mark for the memory controller. Reclaim will begin when 220 * we hit the water mark. May be even add a low water mark, such that 221 * no reclaim occurs from a cgroup at it's low water mark, this is 222 * a feature that will be implemented much later in the future. 223 */ 224 struct mem_cgroup { 225 struct cgroup_subsys_state css; 226 /* 227 * the counter to account for memory usage 228 */ 229 struct res_counter res; 230 231 union { 232 /* 233 * the counter to account for mem+swap usage. 234 */ 235 struct res_counter memsw; 236 237 /* 238 * rcu_freeing is used only when freeing struct mem_cgroup, 239 * so put it into a union to avoid wasting more memory. 240 * It must be disjoint from the css field. It could be 241 * in a union with the res field, but res plays a much 242 * larger part in mem_cgroup life than memsw, and might 243 * be of interest, even at time of free, when debugging. 244 * So share rcu_head with the less interesting memsw. 245 */ 246 struct rcu_head rcu_freeing; 247 /* 248 * But when using vfree(), that cannot be done at 249 * interrupt time, so we must then queue the work. 250 */ 251 struct work_struct work_freeing; 252 }; 253 254 /* 255 * Per cgroup active and inactive list, similar to the 256 * per zone LRU lists. 257 */ 258 struct mem_cgroup_lru_info info; 259 int last_scanned_node; 260 #if MAX_NUMNODES > 1 261 nodemask_t scan_nodes; 262 atomic_t numainfo_events; 263 atomic_t numainfo_updating; 264 #endif 265 /* 266 * Should the accounting and control be hierarchical, per subtree? 267 */ 268 bool use_hierarchy; 269 270 bool oom_lock; 271 atomic_t under_oom; 272 273 atomic_t refcnt; 274 275 int swappiness; 276 /* OOM-Killer disable */ 277 int oom_kill_disable; 278 279 /* set when res.limit == memsw.limit */ 280 bool memsw_is_minimum; 281 282 /* protect arrays of thresholds */ 283 struct mutex thresholds_lock; 284 285 /* thresholds for memory usage. RCU-protected */ 286 struct mem_cgroup_thresholds thresholds; 287 288 /* thresholds for mem+swap usage. RCU-protected */ 289 struct mem_cgroup_thresholds memsw_thresholds; 290 291 /* For oom notifier event fd */ 292 struct list_head oom_notify; 293 294 /* 295 * Should we move charges of a task when a task is moved into this 296 * mem_cgroup ? And what type of charges should we move ? 297 */ 298 unsigned long move_charge_at_immigrate; 299 /* 300 * set > 0 if pages under this cgroup are moving to other cgroup. 301 */ 302 atomic_t moving_account; 303 /* taken only while moving_account > 0 */ 304 spinlock_t move_lock; 305 /* 306 * percpu counter. 307 */ 308 struct mem_cgroup_stat_cpu *stat; 309 /* 310 * used when a cpu is offlined or other synchronizations 311 * See mem_cgroup_read_stat(). 312 */ 313 struct mem_cgroup_stat_cpu nocpu_base; 314 spinlock_t pcp_counter_lock; 315 316 #ifdef CONFIG_INET 317 struct tcp_memcontrol tcp_mem; 318 #endif 319 }; 320 321 /* Stuffs for move charges at task migration. */ 322 /* 323 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 324 * left-shifted bitmap of these types. 325 */ 326 enum move_type { 327 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 328 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 329 NR_MOVE_TYPE, 330 }; 331 332 /* "mc" and its members are protected by cgroup_mutex */ 333 static struct move_charge_struct { 334 spinlock_t lock; /* for from, to */ 335 struct mem_cgroup *from; 336 struct mem_cgroup *to; 337 unsigned long precharge; 338 unsigned long moved_charge; 339 unsigned long moved_swap; 340 struct task_struct *moving_task; /* a task moving charges */ 341 wait_queue_head_t waitq; /* a waitq for other context */ 342 } mc = { 343 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 344 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 345 }; 346 347 static bool move_anon(void) 348 { 349 return test_bit(MOVE_CHARGE_TYPE_ANON, 350 &mc.to->move_charge_at_immigrate); 351 } 352 353 static bool move_file(void) 354 { 355 return test_bit(MOVE_CHARGE_TYPE_FILE, 356 &mc.to->move_charge_at_immigrate); 357 } 358 359 /* 360 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 361 * limit reclaim to prevent infinite loops, if they ever occur. 362 */ 363 #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 364 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 365 366 enum charge_type { 367 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 368 MEM_CGROUP_CHARGE_TYPE_MAPPED, 369 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 370 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 371 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 372 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 373 NR_CHARGE_TYPE, 374 }; 375 376 /* for encoding cft->private value on file */ 377 #define _MEM (0) 378 #define _MEMSWAP (1) 379 #define _OOM_TYPE (2) 380 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 381 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 382 #define MEMFILE_ATTR(val) ((val) & 0xffff) 383 /* Used for OOM nofiier */ 384 #define OOM_CONTROL (0) 385 386 /* 387 * Reclaim flags for mem_cgroup_hierarchical_reclaim 388 */ 389 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 390 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 391 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 392 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 393 394 static void mem_cgroup_get(struct mem_cgroup *memcg); 395 static void mem_cgroup_put(struct mem_cgroup *memcg); 396 397 /* Writing them here to avoid exposing memcg's inner layout */ 398 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 399 #include <net/sock.h> 400 #include <net/ip.h> 401 402 static bool mem_cgroup_is_root(struct mem_cgroup *memcg); 403 void sock_update_memcg(struct sock *sk) 404 { 405 if (mem_cgroup_sockets_enabled) { 406 struct mem_cgroup *memcg; 407 408 BUG_ON(!sk->sk_prot->proto_cgroup); 409 410 /* Socket cloning can throw us here with sk_cgrp already 411 * filled. It won't however, necessarily happen from 412 * process context. So the test for root memcg given 413 * the current task's memcg won't help us in this case. 414 * 415 * Respecting the original socket's memcg is a better 416 * decision in this case. 417 */ 418 if (sk->sk_cgrp) { 419 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 420 mem_cgroup_get(sk->sk_cgrp->memcg); 421 return; 422 } 423 424 rcu_read_lock(); 425 memcg = mem_cgroup_from_task(current); 426 if (!mem_cgroup_is_root(memcg)) { 427 mem_cgroup_get(memcg); 428 sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); 429 } 430 rcu_read_unlock(); 431 } 432 } 433 EXPORT_SYMBOL(sock_update_memcg); 434 435 void sock_release_memcg(struct sock *sk) 436 { 437 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 438 struct mem_cgroup *memcg; 439 WARN_ON(!sk->sk_cgrp->memcg); 440 memcg = sk->sk_cgrp->memcg; 441 mem_cgroup_put(memcg); 442 } 443 } 444 445 #ifdef CONFIG_INET 446 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 447 { 448 if (!memcg || mem_cgroup_is_root(memcg)) 449 return NULL; 450 451 return &memcg->tcp_mem.cg_proto; 452 } 453 EXPORT_SYMBOL(tcp_proto_cgroup); 454 #endif /* CONFIG_INET */ 455 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 456 457 static void drain_all_stock_async(struct mem_cgroup *memcg); 458 459 static struct mem_cgroup_per_zone * 460 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 461 { 462 return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; 463 } 464 465 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 466 { 467 return &memcg->css; 468 } 469 470 static struct mem_cgroup_per_zone * 471 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) 472 { 473 int nid = page_to_nid(page); 474 int zid = page_zonenum(page); 475 476 return mem_cgroup_zoneinfo(memcg, nid, zid); 477 } 478 479 static struct mem_cgroup_tree_per_zone * 480 soft_limit_tree_node_zone(int nid, int zid) 481 { 482 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 483 } 484 485 static struct mem_cgroup_tree_per_zone * 486 soft_limit_tree_from_page(struct page *page) 487 { 488 int nid = page_to_nid(page); 489 int zid = page_zonenum(page); 490 491 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 492 } 493 494 static void 495 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, 496 struct mem_cgroup_per_zone *mz, 497 struct mem_cgroup_tree_per_zone *mctz, 498 unsigned long long new_usage_in_excess) 499 { 500 struct rb_node **p = &mctz->rb_root.rb_node; 501 struct rb_node *parent = NULL; 502 struct mem_cgroup_per_zone *mz_node; 503 504 if (mz->on_tree) 505 return; 506 507 mz->usage_in_excess = new_usage_in_excess; 508 if (!mz->usage_in_excess) 509 return; 510 while (*p) { 511 parent = *p; 512 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 513 tree_node); 514 if (mz->usage_in_excess < mz_node->usage_in_excess) 515 p = &(*p)->rb_left; 516 /* 517 * We can't avoid mem cgroups that are over their soft 518 * limit by the same amount 519 */ 520 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 521 p = &(*p)->rb_right; 522 } 523 rb_link_node(&mz->tree_node, parent, p); 524 rb_insert_color(&mz->tree_node, &mctz->rb_root); 525 mz->on_tree = true; 526 } 527 528 static void 529 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 530 struct mem_cgroup_per_zone *mz, 531 struct mem_cgroup_tree_per_zone *mctz) 532 { 533 if (!mz->on_tree) 534 return; 535 rb_erase(&mz->tree_node, &mctz->rb_root); 536 mz->on_tree = false; 537 } 538 539 static void 540 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 541 struct mem_cgroup_per_zone *mz, 542 struct mem_cgroup_tree_per_zone *mctz) 543 { 544 spin_lock(&mctz->lock); 545 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 546 spin_unlock(&mctz->lock); 547 } 548 549 550 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 551 { 552 unsigned long long excess; 553 struct mem_cgroup_per_zone *mz; 554 struct mem_cgroup_tree_per_zone *mctz; 555 int nid = page_to_nid(page); 556 int zid = page_zonenum(page); 557 mctz = soft_limit_tree_from_page(page); 558 559 /* 560 * Necessary to update all ancestors when hierarchy is used. 561 * because their event counter is not touched. 562 */ 563 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 564 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 565 excess = res_counter_soft_limit_excess(&memcg->res); 566 /* 567 * We have to update the tree if mz is on RB-tree or 568 * mem is over its softlimit. 569 */ 570 if (excess || mz->on_tree) { 571 spin_lock(&mctz->lock); 572 /* if on-tree, remove it */ 573 if (mz->on_tree) 574 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 575 /* 576 * Insert again. mz->usage_in_excess will be updated. 577 * If excess is 0, no tree ops. 578 */ 579 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); 580 spin_unlock(&mctz->lock); 581 } 582 } 583 } 584 585 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 586 { 587 int node, zone; 588 struct mem_cgroup_per_zone *mz; 589 struct mem_cgroup_tree_per_zone *mctz; 590 591 for_each_node(node) { 592 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 593 mz = mem_cgroup_zoneinfo(memcg, node, zone); 594 mctz = soft_limit_tree_node_zone(node, zone); 595 mem_cgroup_remove_exceeded(memcg, mz, mctz); 596 } 597 } 598 } 599 600 static struct mem_cgroup_per_zone * 601 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 602 { 603 struct rb_node *rightmost = NULL; 604 struct mem_cgroup_per_zone *mz; 605 606 retry: 607 mz = NULL; 608 rightmost = rb_last(&mctz->rb_root); 609 if (!rightmost) 610 goto done; /* Nothing to reclaim from */ 611 612 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 613 /* 614 * Remove the node now but someone else can add it back, 615 * we will to add it back at the end of reclaim to its correct 616 * position in the tree. 617 */ 618 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 619 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 620 !css_tryget(&mz->memcg->css)) 621 goto retry; 622 done: 623 return mz; 624 } 625 626 static struct mem_cgroup_per_zone * 627 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 628 { 629 struct mem_cgroup_per_zone *mz; 630 631 spin_lock(&mctz->lock); 632 mz = __mem_cgroup_largest_soft_limit_node(mctz); 633 spin_unlock(&mctz->lock); 634 return mz; 635 } 636 637 /* 638 * Implementation Note: reading percpu statistics for memcg. 639 * 640 * Both of vmstat[] and percpu_counter has threshold and do periodic 641 * synchronization to implement "quick" read. There are trade-off between 642 * reading cost and precision of value. Then, we may have a chance to implement 643 * a periodic synchronizion of counter in memcg's counter. 644 * 645 * But this _read() function is used for user interface now. The user accounts 646 * memory usage by memory cgroup and he _always_ requires exact value because 647 * he accounts memory. Even if we provide quick-and-fuzzy read, we always 648 * have to visit all online cpus and make sum. So, for now, unnecessary 649 * synchronization is not implemented. (just implemented for cpu hotplug) 650 * 651 * If there are kernel internal actions which can make use of some not-exact 652 * value, and reading all cpu value can be performance bottleneck in some 653 * common workload, threashold and synchonization as vmstat[] should be 654 * implemented. 655 */ 656 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 657 enum mem_cgroup_stat_index idx) 658 { 659 long val = 0; 660 int cpu; 661 662 get_online_cpus(); 663 for_each_online_cpu(cpu) 664 val += per_cpu(memcg->stat->count[idx], cpu); 665 #ifdef CONFIG_HOTPLUG_CPU 666 spin_lock(&memcg->pcp_counter_lock); 667 val += memcg->nocpu_base.count[idx]; 668 spin_unlock(&memcg->pcp_counter_lock); 669 #endif 670 put_online_cpus(); 671 return val; 672 } 673 674 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 675 bool charge) 676 { 677 int val = (charge) ? 1 : -1; 678 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 679 } 680 681 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 682 enum mem_cgroup_events_index idx) 683 { 684 unsigned long val = 0; 685 int cpu; 686 687 for_each_online_cpu(cpu) 688 val += per_cpu(memcg->stat->events[idx], cpu); 689 #ifdef CONFIG_HOTPLUG_CPU 690 spin_lock(&memcg->pcp_counter_lock); 691 val += memcg->nocpu_base.events[idx]; 692 spin_unlock(&memcg->pcp_counter_lock); 693 #endif 694 return val; 695 } 696 697 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 698 bool anon, int nr_pages) 699 { 700 preempt_disable(); 701 702 /* 703 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 704 * counted as CACHE even if it's on ANON LRU. 705 */ 706 if (anon) 707 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 708 nr_pages); 709 else 710 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 711 nr_pages); 712 713 /* pagein of a big page is an event. So, ignore page size */ 714 if (nr_pages > 0) 715 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 716 else { 717 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 718 nr_pages = -nr_pages; /* for event */ 719 } 720 721 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 722 723 preempt_enable(); 724 } 725 726 unsigned long 727 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 728 unsigned int lru_mask) 729 { 730 struct mem_cgroup_per_zone *mz; 731 enum lru_list lru; 732 unsigned long ret = 0; 733 734 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 735 736 for_each_lru(lru) { 737 if (BIT(lru) & lru_mask) 738 ret += mz->lru_size[lru]; 739 } 740 return ret; 741 } 742 743 static unsigned long 744 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 745 int nid, unsigned int lru_mask) 746 { 747 u64 total = 0; 748 int zid; 749 750 for (zid = 0; zid < MAX_NR_ZONES; zid++) 751 total += mem_cgroup_zone_nr_lru_pages(memcg, 752 nid, zid, lru_mask); 753 754 return total; 755 } 756 757 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 758 unsigned int lru_mask) 759 { 760 int nid; 761 u64 total = 0; 762 763 for_each_node_state(nid, N_HIGH_MEMORY) 764 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 765 return total; 766 } 767 768 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 769 enum mem_cgroup_events_target target) 770 { 771 unsigned long val, next; 772 773 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 774 next = __this_cpu_read(memcg->stat->targets[target]); 775 /* from time_after() in jiffies.h */ 776 if ((long)next - (long)val < 0) { 777 switch (target) { 778 case MEM_CGROUP_TARGET_THRESH: 779 next = val + THRESHOLDS_EVENTS_TARGET; 780 break; 781 case MEM_CGROUP_TARGET_SOFTLIMIT: 782 next = val + SOFTLIMIT_EVENTS_TARGET; 783 break; 784 case MEM_CGROUP_TARGET_NUMAINFO: 785 next = val + NUMAINFO_EVENTS_TARGET; 786 break; 787 default: 788 break; 789 } 790 __this_cpu_write(memcg->stat->targets[target], next); 791 return true; 792 } 793 return false; 794 } 795 796 /* 797 * Check events in order. 798 * 799 */ 800 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 801 { 802 preempt_disable(); 803 /* threshold event is triggered in finer grain than soft limit */ 804 if (unlikely(mem_cgroup_event_ratelimit(memcg, 805 MEM_CGROUP_TARGET_THRESH))) { 806 bool do_softlimit; 807 bool do_numainfo __maybe_unused; 808 809 do_softlimit = mem_cgroup_event_ratelimit(memcg, 810 MEM_CGROUP_TARGET_SOFTLIMIT); 811 #if MAX_NUMNODES > 1 812 do_numainfo = mem_cgroup_event_ratelimit(memcg, 813 MEM_CGROUP_TARGET_NUMAINFO); 814 #endif 815 preempt_enable(); 816 817 mem_cgroup_threshold(memcg); 818 if (unlikely(do_softlimit)) 819 mem_cgroup_update_tree(memcg, page); 820 #if MAX_NUMNODES > 1 821 if (unlikely(do_numainfo)) 822 atomic_inc(&memcg->numainfo_events); 823 #endif 824 } else 825 preempt_enable(); 826 } 827 828 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 829 { 830 return container_of(cgroup_subsys_state(cont, 831 mem_cgroup_subsys_id), struct mem_cgroup, 832 css); 833 } 834 835 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 836 { 837 /* 838 * mm_update_next_owner() may clear mm->owner to NULL 839 * if it races with swapoff, page migration, etc. 840 * So this can be called with p == NULL. 841 */ 842 if (unlikely(!p)) 843 return NULL; 844 845 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 846 struct mem_cgroup, css); 847 } 848 849 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 850 { 851 struct mem_cgroup *memcg = NULL; 852 853 if (!mm) 854 return NULL; 855 /* 856 * Because we have no locks, mm->owner's may be being moved to other 857 * cgroup. We use css_tryget() here even if this looks 858 * pessimistic (rather than adding locks here). 859 */ 860 rcu_read_lock(); 861 do { 862 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 863 if (unlikely(!memcg)) 864 break; 865 } while (!css_tryget(&memcg->css)); 866 rcu_read_unlock(); 867 return memcg; 868 } 869 870 /** 871 * mem_cgroup_iter - iterate over memory cgroup hierarchy 872 * @root: hierarchy root 873 * @prev: previously returned memcg, NULL on first invocation 874 * @reclaim: cookie for shared reclaim walks, NULL for full walks 875 * 876 * Returns references to children of the hierarchy below @root, or 877 * @root itself, or %NULL after a full round-trip. 878 * 879 * Caller must pass the return value in @prev on subsequent 880 * invocations for reference counting, or use mem_cgroup_iter_break() 881 * to cancel a hierarchy walk before the round-trip is complete. 882 * 883 * Reclaimers can specify a zone and a priority level in @reclaim to 884 * divide up the memcgs in the hierarchy among all concurrent 885 * reclaimers operating on the same zone and priority. 886 */ 887 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 888 struct mem_cgroup *prev, 889 struct mem_cgroup_reclaim_cookie *reclaim) 890 { 891 struct mem_cgroup *memcg = NULL; 892 int id = 0; 893 894 if (mem_cgroup_disabled()) 895 return NULL; 896 897 if (!root) 898 root = root_mem_cgroup; 899 900 if (prev && !reclaim) 901 id = css_id(&prev->css); 902 903 if (prev && prev != root) 904 css_put(&prev->css); 905 906 if (!root->use_hierarchy && root != root_mem_cgroup) { 907 if (prev) 908 return NULL; 909 return root; 910 } 911 912 while (!memcg) { 913 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 914 struct cgroup_subsys_state *css; 915 916 if (reclaim) { 917 int nid = zone_to_nid(reclaim->zone); 918 int zid = zone_idx(reclaim->zone); 919 struct mem_cgroup_per_zone *mz; 920 921 mz = mem_cgroup_zoneinfo(root, nid, zid); 922 iter = &mz->reclaim_iter[reclaim->priority]; 923 if (prev && reclaim->generation != iter->generation) 924 return NULL; 925 id = iter->position; 926 } 927 928 rcu_read_lock(); 929 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 930 if (css) { 931 if (css == &root->css || css_tryget(css)) 932 memcg = container_of(css, 933 struct mem_cgroup, css); 934 } else 935 id = 0; 936 rcu_read_unlock(); 937 938 if (reclaim) { 939 iter->position = id; 940 if (!css) 941 iter->generation++; 942 else if (!prev && memcg) 943 reclaim->generation = iter->generation; 944 } 945 946 if (prev && !css) 947 return NULL; 948 } 949 return memcg; 950 } 951 952 /** 953 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 954 * @root: hierarchy root 955 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 956 */ 957 void mem_cgroup_iter_break(struct mem_cgroup *root, 958 struct mem_cgroup *prev) 959 { 960 if (!root) 961 root = root_mem_cgroup; 962 if (prev && prev != root) 963 css_put(&prev->css); 964 } 965 966 /* 967 * Iteration constructs for visiting all cgroups (under a tree). If 968 * loops are exited prematurely (break), mem_cgroup_iter_break() must 969 * be used for reference counting. 970 */ 971 #define for_each_mem_cgroup_tree(iter, root) \ 972 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 973 iter != NULL; \ 974 iter = mem_cgroup_iter(root, iter, NULL)) 975 976 #define for_each_mem_cgroup(iter) \ 977 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 978 iter != NULL; \ 979 iter = mem_cgroup_iter(NULL, iter, NULL)) 980 981 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 982 { 983 return (memcg == root_mem_cgroup); 984 } 985 986 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 987 { 988 struct mem_cgroup *memcg; 989 990 if (!mm) 991 return; 992 993 rcu_read_lock(); 994 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 995 if (unlikely(!memcg)) 996 goto out; 997 998 switch (idx) { 999 case PGFAULT: 1000 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1001 break; 1002 case PGMAJFAULT: 1003 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1004 break; 1005 default: 1006 BUG(); 1007 } 1008 out: 1009 rcu_read_unlock(); 1010 } 1011 EXPORT_SYMBOL(mem_cgroup_count_vm_event); 1012 1013 /** 1014 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1015 * @zone: zone of the wanted lruvec 1016 * @mem: memcg of the wanted lruvec 1017 * 1018 * Returns the lru list vector holding pages for the given @zone and 1019 * @mem. This can be the global zone lruvec, if the memory controller 1020 * is disabled. 1021 */ 1022 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1023 struct mem_cgroup *memcg) 1024 { 1025 struct mem_cgroup_per_zone *mz; 1026 1027 if (mem_cgroup_disabled()) 1028 return &zone->lruvec; 1029 1030 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); 1031 return &mz->lruvec; 1032 } 1033 1034 /* 1035 * Following LRU functions are allowed to be used without PCG_LOCK. 1036 * Operations are called by routine of global LRU independently from memcg. 1037 * What we have to take care of here is validness of pc->mem_cgroup. 1038 * 1039 * Changes to pc->mem_cgroup happens when 1040 * 1. charge 1041 * 2. moving account 1042 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 1043 * It is added to LRU before charge. 1044 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 1045 * When moving account, the page is not on LRU. It's isolated. 1046 */ 1047 1048 /** 1049 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec 1050 * @zone: zone of the page 1051 * @page: the page 1052 * @lru: current lru 1053 * 1054 * This function accounts for @page being added to @lru, and returns 1055 * the lruvec for the given @zone and the memcg @page is charged to. 1056 * 1057 * The callsite is then responsible for physically linking the page to 1058 * the returned lruvec->lists[@lru]. 1059 */ 1060 struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, 1061 enum lru_list lru) 1062 { 1063 struct mem_cgroup_per_zone *mz; 1064 struct mem_cgroup *memcg; 1065 struct page_cgroup *pc; 1066 1067 if (mem_cgroup_disabled()) 1068 return &zone->lruvec; 1069 1070 pc = lookup_page_cgroup(page); 1071 memcg = pc->mem_cgroup; 1072 1073 /* 1074 * Surreptitiously switch any uncharged page to root: 1075 * an uncharged page off lru does nothing to secure 1076 * its former mem_cgroup from sudden removal. 1077 * 1078 * Our caller holds lru_lock, and PageCgroupUsed is updated 1079 * under page_cgroup lock: between them, they make all uses 1080 * of pc->mem_cgroup safe. 1081 */ 1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1083 pc->mem_cgroup = memcg = root_mem_cgroup; 1084 1085 mz = page_cgroup_zoneinfo(memcg, page); 1086 /* compound_order() is stabilized through lru_lock */ 1087 mz->lru_size[lru] += 1 << compound_order(page); 1088 return &mz->lruvec; 1089 } 1090 1091 /** 1092 * mem_cgroup_lru_del_list - account for removing an lru page 1093 * @page: the page 1094 * @lru: target lru 1095 * 1096 * This function accounts for @page being removed from @lru. 1097 * 1098 * The callsite is then responsible for physically unlinking 1099 * @page->lru. 1100 */ 1101 void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) 1102 { 1103 struct mem_cgroup_per_zone *mz; 1104 struct mem_cgroup *memcg; 1105 struct page_cgroup *pc; 1106 1107 if (mem_cgroup_disabled()) 1108 return; 1109 1110 pc = lookup_page_cgroup(page); 1111 memcg = pc->mem_cgroup; 1112 VM_BUG_ON(!memcg); 1113 mz = page_cgroup_zoneinfo(memcg, page); 1114 /* huge page split is done under lru_lock. so, we have no races. */ 1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); 1116 mz->lru_size[lru] -= 1 << compound_order(page); 1117 } 1118 1119 void mem_cgroup_lru_del(struct page *page) 1120 { 1121 mem_cgroup_lru_del_list(page, page_lru(page)); 1122 } 1123 1124 /** 1125 * mem_cgroup_lru_move_lists - account for moving a page between lrus 1126 * @zone: zone of the page 1127 * @page: the page 1128 * @from: current lru 1129 * @to: target lru 1130 * 1131 * This function accounts for @page being moved between the lrus @from 1132 * and @to, and returns the lruvec for the given @zone and the memcg 1133 * @page is charged to. 1134 * 1135 * The callsite is then responsible for physically relinking 1136 * @page->lru to the returned lruvec->lists[@to]. 1137 */ 1138 struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, 1139 struct page *page, 1140 enum lru_list from, 1141 enum lru_list to) 1142 { 1143 /* XXX: Optimize this, especially for @from == @to */ 1144 mem_cgroup_lru_del_list(page, from); 1145 return mem_cgroup_lru_add_list(zone, page, to); 1146 } 1147 1148 /* 1149 * Checks whether given mem is same or in the root_mem_cgroup's 1150 * hierarchy subtree 1151 */ 1152 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1153 struct mem_cgroup *memcg) 1154 { 1155 if (root_memcg != memcg) { 1156 return (root_memcg->use_hierarchy && 1157 css_is_ancestor(&memcg->css, &root_memcg->css)); 1158 } 1159 1160 return true; 1161 } 1162 1163 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1164 { 1165 int ret; 1166 struct mem_cgroup *curr = NULL; 1167 struct task_struct *p; 1168 1169 p = find_lock_task_mm(task); 1170 if (p) { 1171 curr = try_get_mem_cgroup_from_mm(p->mm); 1172 task_unlock(p); 1173 } else { 1174 /* 1175 * All threads may have already detached their mm's, but the oom 1176 * killer still needs to detect if they have already been oom 1177 * killed to prevent needlessly killing additional tasks. 1178 */ 1179 task_lock(task); 1180 curr = mem_cgroup_from_task(task); 1181 if (curr) 1182 css_get(&curr->css); 1183 task_unlock(task); 1184 } 1185 if (!curr) 1186 return 0; 1187 /* 1188 * We should check use_hierarchy of "memcg" not "curr". Because checking 1189 * use_hierarchy of "curr" here make this function true if hierarchy is 1190 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* 1191 * hierarchy(even if use_hierarchy is disabled in "memcg"). 1192 */ 1193 ret = mem_cgroup_same_or_subtree(memcg, curr); 1194 css_put(&curr->css); 1195 return ret; 1196 } 1197 1198 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) 1199 { 1200 unsigned long inactive_ratio; 1201 int nid = zone_to_nid(zone); 1202 int zid = zone_idx(zone); 1203 unsigned long inactive; 1204 unsigned long active; 1205 unsigned long gb; 1206 1207 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1208 BIT(LRU_INACTIVE_ANON)); 1209 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1210 BIT(LRU_ACTIVE_ANON)); 1211 1212 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1213 if (gb) 1214 inactive_ratio = int_sqrt(10 * gb); 1215 else 1216 inactive_ratio = 1; 1217 1218 return inactive * inactive_ratio < active; 1219 } 1220 1221 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) 1222 { 1223 unsigned long active; 1224 unsigned long inactive; 1225 int zid = zone_idx(zone); 1226 int nid = zone_to_nid(zone); 1227 1228 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1229 BIT(LRU_INACTIVE_FILE)); 1230 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1231 BIT(LRU_ACTIVE_FILE)); 1232 1233 return (active > inactive); 1234 } 1235 1236 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1237 struct zone *zone) 1238 { 1239 int nid = zone_to_nid(zone); 1240 int zid = zone_idx(zone); 1241 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1242 1243 return &mz->reclaim_stat; 1244 } 1245 1246 struct zone_reclaim_stat * 1247 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1248 { 1249 struct page_cgroup *pc; 1250 struct mem_cgroup_per_zone *mz; 1251 1252 if (mem_cgroup_disabled()) 1253 return NULL; 1254 1255 pc = lookup_page_cgroup(page); 1256 if (!PageCgroupUsed(pc)) 1257 return NULL; 1258 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1259 smp_rmb(); 1260 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1261 return &mz->reclaim_stat; 1262 } 1263 1264 #define mem_cgroup_from_res_counter(counter, member) \ 1265 container_of(counter, struct mem_cgroup, member) 1266 1267 /** 1268 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1269 * @mem: the memory cgroup 1270 * 1271 * Returns the maximum amount of memory @mem can be charged with, in 1272 * pages. 1273 */ 1274 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1275 { 1276 unsigned long long margin; 1277 1278 margin = res_counter_margin(&memcg->res); 1279 if (do_swap_account) 1280 margin = min(margin, res_counter_margin(&memcg->memsw)); 1281 return margin >> PAGE_SHIFT; 1282 } 1283 1284 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1285 { 1286 struct cgroup *cgrp = memcg->css.cgroup; 1287 1288 /* root ? */ 1289 if (cgrp->parent == NULL) 1290 return vm_swappiness; 1291 1292 return memcg->swappiness; 1293 } 1294 1295 /* 1296 * memcg->moving_account is used for checking possibility that some thread is 1297 * calling move_account(). When a thread on CPU-A starts moving pages under 1298 * a memcg, other threads should check memcg->moving_account under 1299 * rcu_read_lock(), like this: 1300 * 1301 * CPU-A CPU-B 1302 * rcu_read_lock() 1303 * memcg->moving_account+1 if (memcg->mocing_account) 1304 * take heavy locks. 1305 * synchronize_rcu() update something. 1306 * rcu_read_unlock() 1307 * start move here. 1308 */ 1309 1310 /* for quick checking without looking up memcg */ 1311 atomic_t memcg_moving __read_mostly; 1312 1313 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1314 { 1315 atomic_inc(&memcg_moving); 1316 atomic_inc(&memcg->moving_account); 1317 synchronize_rcu(); 1318 } 1319 1320 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1321 { 1322 /* 1323 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1324 * We check NULL in callee rather than caller. 1325 */ 1326 if (memcg) { 1327 atomic_dec(&memcg_moving); 1328 atomic_dec(&memcg->moving_account); 1329 } 1330 } 1331 1332 /* 1333 * 2 routines for checking "mem" is under move_account() or not. 1334 * 1335 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This 1336 * is used for avoiding races in accounting. If true, 1337 * pc->mem_cgroup may be overwritten. 1338 * 1339 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1340 * under hierarchy of moving cgroups. This is for 1341 * waiting at hith-memory prressure caused by "move". 1342 */ 1343 1344 static bool mem_cgroup_stolen(struct mem_cgroup *memcg) 1345 { 1346 VM_BUG_ON(!rcu_read_lock_held()); 1347 return atomic_read(&memcg->moving_account) > 0; 1348 } 1349 1350 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1351 { 1352 struct mem_cgroup *from; 1353 struct mem_cgroup *to; 1354 bool ret = false; 1355 /* 1356 * Unlike task_move routines, we access mc.to, mc.from not under 1357 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1358 */ 1359 spin_lock(&mc.lock); 1360 from = mc.from; 1361 to = mc.to; 1362 if (!from) 1363 goto unlock; 1364 1365 ret = mem_cgroup_same_or_subtree(memcg, from) 1366 || mem_cgroup_same_or_subtree(memcg, to); 1367 unlock: 1368 spin_unlock(&mc.lock); 1369 return ret; 1370 } 1371 1372 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1373 { 1374 if (mc.moving_task && current != mc.moving_task) { 1375 if (mem_cgroup_under_move(memcg)) { 1376 DEFINE_WAIT(wait); 1377 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1378 /* moving charge context might have finished. */ 1379 if (mc.moving_task) 1380 schedule(); 1381 finish_wait(&mc.waitq, &wait); 1382 return true; 1383 } 1384 } 1385 return false; 1386 } 1387 1388 /* 1389 * Take this lock when 1390 * - a code tries to modify page's memcg while it's USED. 1391 * - a code tries to modify page state accounting in a memcg. 1392 * see mem_cgroup_stolen(), too. 1393 */ 1394 static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1395 unsigned long *flags) 1396 { 1397 spin_lock_irqsave(&memcg->move_lock, *flags); 1398 } 1399 1400 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1401 unsigned long *flags) 1402 { 1403 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1404 } 1405 1406 /** 1407 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1408 * @memcg: The memory cgroup that went over limit 1409 * @p: Task that is going to be killed 1410 * 1411 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1412 * enabled 1413 */ 1414 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1415 { 1416 struct cgroup *task_cgrp; 1417 struct cgroup *mem_cgrp; 1418 /* 1419 * Need a buffer in BSS, can't rely on allocations. The code relies 1420 * on the assumption that OOM is serialized for memory controller. 1421 * If this assumption is broken, revisit this code. 1422 */ 1423 static char memcg_name[PATH_MAX]; 1424 int ret; 1425 1426 if (!memcg || !p) 1427 return; 1428 1429 rcu_read_lock(); 1430 1431 mem_cgrp = memcg->css.cgroup; 1432 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1433 1434 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1435 if (ret < 0) { 1436 /* 1437 * Unfortunately, we are unable to convert to a useful name 1438 * But we'll still print out the usage information 1439 */ 1440 rcu_read_unlock(); 1441 goto done; 1442 } 1443 rcu_read_unlock(); 1444 1445 printk(KERN_INFO "Task in %s killed", memcg_name); 1446 1447 rcu_read_lock(); 1448 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1449 if (ret < 0) { 1450 rcu_read_unlock(); 1451 goto done; 1452 } 1453 rcu_read_unlock(); 1454 1455 /* 1456 * Continues from above, so we don't need an KERN_ level 1457 */ 1458 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1459 done: 1460 1461 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1462 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1463 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1464 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1465 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1466 "failcnt %llu\n", 1467 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1468 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1469 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1470 } 1471 1472 /* 1473 * This function returns the number of memcg under hierarchy tree. Returns 1474 * 1(self count) if no children. 1475 */ 1476 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1477 { 1478 int num = 0; 1479 struct mem_cgroup *iter; 1480 1481 for_each_mem_cgroup_tree(iter, memcg) 1482 num++; 1483 return num; 1484 } 1485 1486 /* 1487 * Return the memory (and swap, if configured) limit for a memcg. 1488 */ 1489 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1490 { 1491 u64 limit; 1492 u64 memsw; 1493 1494 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1495 limit += total_swap_pages << PAGE_SHIFT; 1496 1497 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1498 /* 1499 * If memsw is finite and limits the amount of swap space available 1500 * to this memcg, return that limit. 1501 */ 1502 return min(limit, memsw); 1503 } 1504 1505 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1506 gfp_t gfp_mask, 1507 unsigned long flags) 1508 { 1509 unsigned long total = 0; 1510 bool noswap = false; 1511 int loop; 1512 1513 if (flags & MEM_CGROUP_RECLAIM_NOSWAP) 1514 noswap = true; 1515 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) 1516 noswap = true; 1517 1518 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { 1519 if (loop) 1520 drain_all_stock_async(memcg); 1521 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); 1522 /* 1523 * Allow limit shrinkers, which are triggered directly 1524 * by userspace, to catch signals and stop reclaim 1525 * after minimal progress, regardless of the margin. 1526 */ 1527 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) 1528 break; 1529 if (mem_cgroup_margin(memcg)) 1530 break; 1531 /* 1532 * If nothing was reclaimed after two attempts, there 1533 * may be no reclaimable pages in this hierarchy. 1534 */ 1535 if (loop && !total) 1536 break; 1537 } 1538 return total; 1539 } 1540 1541 /** 1542 * test_mem_cgroup_node_reclaimable 1543 * @mem: the target memcg 1544 * @nid: the node ID to be checked. 1545 * @noswap : specify true here if the user wants flle only information. 1546 * 1547 * This function returns whether the specified memcg contains any 1548 * reclaimable pages on a node. Returns true if there are any reclaimable 1549 * pages in the node. 1550 */ 1551 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1552 int nid, bool noswap) 1553 { 1554 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1555 return true; 1556 if (noswap || !total_swap_pages) 1557 return false; 1558 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1559 return true; 1560 return false; 1561 1562 } 1563 #if MAX_NUMNODES > 1 1564 1565 /* 1566 * Always updating the nodemask is not very good - even if we have an empty 1567 * list or the wrong list here, we can start from some node and traverse all 1568 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1569 * 1570 */ 1571 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1572 { 1573 int nid; 1574 /* 1575 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1576 * pagein/pageout changes since the last update. 1577 */ 1578 if (!atomic_read(&memcg->numainfo_events)) 1579 return; 1580 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1581 return; 1582 1583 /* make a nodemask where this memcg uses memory from */ 1584 memcg->scan_nodes = node_states[N_HIGH_MEMORY]; 1585 1586 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1587 1588 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1589 node_clear(nid, memcg->scan_nodes); 1590 } 1591 1592 atomic_set(&memcg->numainfo_events, 0); 1593 atomic_set(&memcg->numainfo_updating, 0); 1594 } 1595 1596 /* 1597 * Selecting a node where we start reclaim from. Because what we need is just 1598 * reducing usage counter, start from anywhere is O,K. Considering 1599 * memory reclaim from current node, there are pros. and cons. 1600 * 1601 * Freeing memory from current node means freeing memory from a node which 1602 * we'll use or we've used. So, it may make LRU bad. And if several threads 1603 * hit limits, it will see a contention on a node. But freeing from remote 1604 * node means more costs for memory reclaim because of memory latency. 1605 * 1606 * Now, we use round-robin. Better algorithm is welcomed. 1607 */ 1608 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1609 { 1610 int node; 1611 1612 mem_cgroup_may_update_nodemask(memcg); 1613 node = memcg->last_scanned_node; 1614 1615 node = next_node(node, memcg->scan_nodes); 1616 if (node == MAX_NUMNODES) 1617 node = first_node(memcg->scan_nodes); 1618 /* 1619 * We call this when we hit limit, not when pages are added to LRU. 1620 * No LRU may hold pages because all pages are UNEVICTABLE or 1621 * memcg is too small and all pages are not on LRU. In that case, 1622 * we use curret node. 1623 */ 1624 if (unlikely(node == MAX_NUMNODES)) 1625 node = numa_node_id(); 1626 1627 memcg->last_scanned_node = node; 1628 return node; 1629 } 1630 1631 /* 1632 * Check all nodes whether it contains reclaimable pages or not. 1633 * For quick scan, we make use of scan_nodes. This will allow us to skip 1634 * unused nodes. But scan_nodes is lazily updated and may not cotain 1635 * enough new information. We need to do double check. 1636 */ 1637 bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1638 { 1639 int nid; 1640 1641 /* 1642 * quick check...making use of scan_node. 1643 * We can skip unused nodes. 1644 */ 1645 if (!nodes_empty(memcg->scan_nodes)) { 1646 for (nid = first_node(memcg->scan_nodes); 1647 nid < MAX_NUMNODES; 1648 nid = next_node(nid, memcg->scan_nodes)) { 1649 1650 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1651 return true; 1652 } 1653 } 1654 /* 1655 * Check rest of nodes. 1656 */ 1657 for_each_node_state(nid, N_HIGH_MEMORY) { 1658 if (node_isset(nid, memcg->scan_nodes)) 1659 continue; 1660 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1661 return true; 1662 } 1663 return false; 1664 } 1665 1666 #else 1667 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1668 { 1669 return 0; 1670 } 1671 1672 bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1673 { 1674 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1675 } 1676 #endif 1677 1678 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1679 struct zone *zone, 1680 gfp_t gfp_mask, 1681 unsigned long *total_scanned) 1682 { 1683 struct mem_cgroup *victim = NULL; 1684 int total = 0; 1685 int loop = 0; 1686 unsigned long excess; 1687 unsigned long nr_scanned; 1688 struct mem_cgroup_reclaim_cookie reclaim = { 1689 .zone = zone, 1690 .priority = 0, 1691 }; 1692 1693 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1694 1695 while (1) { 1696 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1697 if (!victim) { 1698 loop++; 1699 if (loop >= 2) { 1700 /* 1701 * If we have not been able to reclaim 1702 * anything, it might because there are 1703 * no reclaimable pages under this hierarchy 1704 */ 1705 if (!total) 1706 break; 1707 /* 1708 * We want to do more targeted reclaim. 1709 * excess >> 2 is not to excessive so as to 1710 * reclaim too much, nor too less that we keep 1711 * coming back to reclaim from this cgroup 1712 */ 1713 if (total >= (excess >> 2) || 1714 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1715 break; 1716 } 1717 continue; 1718 } 1719 if (!mem_cgroup_reclaimable(victim, false)) 1720 continue; 1721 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1722 zone, &nr_scanned); 1723 *total_scanned += nr_scanned; 1724 if (!res_counter_soft_limit_excess(&root_memcg->res)) 1725 break; 1726 } 1727 mem_cgroup_iter_break(root_memcg, victim); 1728 return total; 1729 } 1730 1731 /* 1732 * Check OOM-Killer is already running under our hierarchy. 1733 * If someone is running, return false. 1734 * Has to be called with memcg_oom_lock 1735 */ 1736 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 1737 { 1738 struct mem_cgroup *iter, *failed = NULL; 1739 1740 for_each_mem_cgroup_tree(iter, memcg) { 1741 if (iter->oom_lock) { 1742 /* 1743 * this subtree of our hierarchy is already locked 1744 * so we cannot give a lock. 1745 */ 1746 failed = iter; 1747 mem_cgroup_iter_break(memcg, iter); 1748 break; 1749 } else 1750 iter->oom_lock = true; 1751 } 1752 1753 if (!failed) 1754 return true; 1755 1756 /* 1757 * OK, we failed to lock the whole subtree so we have to clean up 1758 * what we set up to the failing subtree 1759 */ 1760 for_each_mem_cgroup_tree(iter, memcg) { 1761 if (iter == failed) { 1762 mem_cgroup_iter_break(memcg, iter); 1763 break; 1764 } 1765 iter->oom_lock = false; 1766 } 1767 return false; 1768 } 1769 1770 /* 1771 * Has to be called with memcg_oom_lock 1772 */ 1773 static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1774 { 1775 struct mem_cgroup *iter; 1776 1777 for_each_mem_cgroup_tree(iter, memcg) 1778 iter->oom_lock = false; 1779 return 0; 1780 } 1781 1782 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1783 { 1784 struct mem_cgroup *iter; 1785 1786 for_each_mem_cgroup_tree(iter, memcg) 1787 atomic_inc(&iter->under_oom); 1788 } 1789 1790 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1791 { 1792 struct mem_cgroup *iter; 1793 1794 /* 1795 * When a new child is created while the hierarchy is under oom, 1796 * mem_cgroup_oom_lock() may not be called. We have to use 1797 * atomic_add_unless() here. 1798 */ 1799 for_each_mem_cgroup_tree(iter, memcg) 1800 atomic_add_unless(&iter->under_oom, -1, 0); 1801 } 1802 1803 static DEFINE_SPINLOCK(memcg_oom_lock); 1804 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1805 1806 struct oom_wait_info { 1807 struct mem_cgroup *memcg; 1808 wait_queue_t wait; 1809 }; 1810 1811 static int memcg_oom_wake_function(wait_queue_t *wait, 1812 unsigned mode, int sync, void *arg) 1813 { 1814 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1815 struct mem_cgroup *oom_wait_memcg; 1816 struct oom_wait_info *oom_wait_info; 1817 1818 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1819 oom_wait_memcg = oom_wait_info->memcg; 1820 1821 /* 1822 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 1823 * Then we can use css_is_ancestor without taking care of RCU. 1824 */ 1825 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 1826 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 1827 return 0; 1828 return autoremove_wake_function(wait, mode, sync, arg); 1829 } 1830 1831 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 1832 { 1833 /* for filtering, pass "memcg" as argument. */ 1834 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1835 } 1836 1837 static void memcg_oom_recover(struct mem_cgroup *memcg) 1838 { 1839 if (memcg && atomic_read(&memcg->under_oom)) 1840 memcg_wakeup_oom(memcg); 1841 } 1842 1843 /* 1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1845 */ 1846 bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1847 { 1848 struct oom_wait_info owait; 1849 bool locked, need_to_kill; 1850 1851 owait.memcg = memcg; 1852 owait.wait.flags = 0; 1853 owait.wait.func = memcg_oom_wake_function; 1854 owait.wait.private = current; 1855 INIT_LIST_HEAD(&owait.wait.task_list); 1856 need_to_kill = true; 1857 mem_cgroup_mark_under_oom(memcg); 1858 1859 /* At first, try to OOM lock hierarchy under memcg.*/ 1860 spin_lock(&memcg_oom_lock); 1861 locked = mem_cgroup_oom_lock(memcg); 1862 /* 1863 * Even if signal_pending(), we can't quit charge() loop without 1864 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1865 * under OOM is always welcomed, use TASK_KILLABLE here. 1866 */ 1867 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1868 if (!locked || memcg->oom_kill_disable) 1869 need_to_kill = false; 1870 if (locked) 1871 mem_cgroup_oom_notify(memcg); 1872 spin_unlock(&memcg_oom_lock); 1873 1874 if (need_to_kill) { 1875 finish_wait(&memcg_oom_waitq, &owait.wait); 1876 mem_cgroup_out_of_memory(memcg, mask, order); 1877 } else { 1878 schedule(); 1879 finish_wait(&memcg_oom_waitq, &owait.wait); 1880 } 1881 spin_lock(&memcg_oom_lock); 1882 if (locked) 1883 mem_cgroup_oom_unlock(memcg); 1884 memcg_wakeup_oom(memcg); 1885 spin_unlock(&memcg_oom_lock); 1886 1887 mem_cgroup_unmark_under_oom(memcg); 1888 1889 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1890 return false; 1891 /* Give chance to dying process */ 1892 schedule_timeout_uninterruptible(1); 1893 return true; 1894 } 1895 1896 /* 1897 * Currently used to update mapped file statistics, but the routine can be 1898 * generalized to update other statistics as well. 1899 * 1900 * Notes: Race condition 1901 * 1902 * We usually use page_cgroup_lock() for accessing page_cgroup member but 1903 * it tends to be costly. But considering some conditions, we doesn't need 1904 * to do so _always_. 1905 * 1906 * Considering "charge", lock_page_cgroup() is not required because all 1907 * file-stat operations happen after a page is attached to radix-tree. There 1908 * are no race with "charge". 1909 * 1910 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 1911 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 1912 * if there are race with "uncharge". Statistics itself is properly handled 1913 * by flags. 1914 * 1915 * Considering "move", this is an only case we see a race. To make the race 1916 * small, we check mm->moving_account and detect there are possibility of race 1917 * If there is, we take a lock. 1918 */ 1919 1920 void __mem_cgroup_begin_update_page_stat(struct page *page, 1921 bool *locked, unsigned long *flags) 1922 { 1923 struct mem_cgroup *memcg; 1924 struct page_cgroup *pc; 1925 1926 pc = lookup_page_cgroup(page); 1927 again: 1928 memcg = pc->mem_cgroup; 1929 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1930 return; 1931 /* 1932 * If this memory cgroup is not under account moving, we don't 1933 * need to take move_lock_page_cgroup(). Because we already hold 1934 * rcu_read_lock(), any calls to move_account will be delayed until 1935 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1936 */ 1937 if (!mem_cgroup_stolen(memcg)) 1938 return; 1939 1940 move_lock_mem_cgroup(memcg, flags); 1941 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 1942 move_unlock_mem_cgroup(memcg, flags); 1943 goto again; 1944 } 1945 *locked = true; 1946 } 1947 1948 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 1949 { 1950 struct page_cgroup *pc = lookup_page_cgroup(page); 1951 1952 /* 1953 * It's guaranteed that pc->mem_cgroup never changes while 1954 * lock is held because a routine modifies pc->mem_cgroup 1955 * should take move_lock_page_cgroup(). 1956 */ 1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1958 } 1959 1960 void mem_cgroup_update_page_stat(struct page *page, 1961 enum mem_cgroup_page_stat_item idx, int val) 1962 { 1963 struct mem_cgroup *memcg; 1964 struct page_cgroup *pc = lookup_page_cgroup(page); 1965 unsigned long uninitialized_var(flags); 1966 1967 if (mem_cgroup_disabled()) 1968 return; 1969 1970 memcg = pc->mem_cgroup; 1971 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1972 return; 1973 1974 switch (idx) { 1975 case MEMCG_NR_FILE_MAPPED: 1976 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1977 break; 1978 default: 1979 BUG(); 1980 } 1981 1982 this_cpu_add(memcg->stat->count[idx], val); 1983 } 1984 1985 /* 1986 * size of first charge trial. "32" comes from vmscan.c's magic value. 1987 * TODO: maybe necessary to use big numbers in big irons. 1988 */ 1989 #define CHARGE_BATCH 32U 1990 struct memcg_stock_pcp { 1991 struct mem_cgroup *cached; /* this never be root cgroup */ 1992 unsigned int nr_pages; 1993 struct work_struct work; 1994 unsigned long flags; 1995 #define FLUSHING_CACHED_CHARGE (0) 1996 }; 1997 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1998 static DEFINE_MUTEX(percpu_charge_mutex); 1999 2000 /* 2001 * Try to consume stocked charge on this cpu. If success, one page is consumed 2002 * from local stock and true is returned. If the stock is 0 or charges from a 2003 * cgroup which is not current target, returns false. This stock will be 2004 * refilled. 2005 */ 2006 static bool consume_stock(struct mem_cgroup *memcg) 2007 { 2008 struct memcg_stock_pcp *stock; 2009 bool ret = true; 2010 2011 stock = &get_cpu_var(memcg_stock); 2012 if (memcg == stock->cached && stock->nr_pages) 2013 stock->nr_pages--; 2014 else /* need to call res_counter_charge */ 2015 ret = false; 2016 put_cpu_var(memcg_stock); 2017 return ret; 2018 } 2019 2020 /* 2021 * Returns stocks cached in percpu to res_counter and reset cached information. 2022 */ 2023 static void drain_stock(struct memcg_stock_pcp *stock) 2024 { 2025 struct mem_cgroup *old = stock->cached; 2026 2027 if (stock->nr_pages) { 2028 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2029 2030 res_counter_uncharge(&old->res, bytes); 2031 if (do_swap_account) 2032 res_counter_uncharge(&old->memsw, bytes); 2033 stock->nr_pages = 0; 2034 } 2035 stock->cached = NULL; 2036 } 2037 2038 /* 2039 * This must be called under preempt disabled or must be called by 2040 * a thread which is pinned to local cpu. 2041 */ 2042 static void drain_local_stock(struct work_struct *dummy) 2043 { 2044 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2045 drain_stock(stock); 2046 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2047 } 2048 2049 /* 2050 * Cache charges(val) which is from res_counter, to local per_cpu area. 2051 * This will be consumed by consume_stock() function, later. 2052 */ 2053 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2054 { 2055 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2056 2057 if (stock->cached != memcg) { /* reset if necessary */ 2058 drain_stock(stock); 2059 stock->cached = memcg; 2060 } 2061 stock->nr_pages += nr_pages; 2062 put_cpu_var(memcg_stock); 2063 } 2064 2065 /* 2066 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2067 * of the hierarchy under it. sync flag says whether we should block 2068 * until the work is done. 2069 */ 2070 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2071 { 2072 int cpu, curcpu; 2073 2074 /* Notify other cpus that system-wide "drain" is running */ 2075 get_online_cpus(); 2076 curcpu = get_cpu(); 2077 for_each_online_cpu(cpu) { 2078 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2079 struct mem_cgroup *memcg; 2080 2081 memcg = stock->cached; 2082 if (!memcg || !stock->nr_pages) 2083 continue; 2084 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2085 continue; 2086 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2087 if (cpu == curcpu) 2088 drain_local_stock(&stock->work); 2089 else 2090 schedule_work_on(cpu, &stock->work); 2091 } 2092 } 2093 put_cpu(); 2094 2095 if (!sync) 2096 goto out; 2097 2098 for_each_online_cpu(cpu) { 2099 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2100 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2101 flush_work(&stock->work); 2102 } 2103 out: 2104 put_online_cpus(); 2105 } 2106 2107 /* 2108 * Tries to drain stocked charges in other cpus. This function is asynchronous 2109 * and just put a work per cpu for draining localy on each cpu. Caller can 2110 * expects some charges will be back to res_counter later but cannot wait for 2111 * it. 2112 */ 2113 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2114 { 2115 /* 2116 * If someone calls draining, avoid adding more kworker runs. 2117 */ 2118 if (!mutex_trylock(&percpu_charge_mutex)) 2119 return; 2120 drain_all_stock(root_memcg, false); 2121 mutex_unlock(&percpu_charge_mutex); 2122 } 2123 2124 /* This is a synchronous drain interface. */ 2125 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2126 { 2127 /* called when force_empty is called */ 2128 mutex_lock(&percpu_charge_mutex); 2129 drain_all_stock(root_memcg, true); 2130 mutex_unlock(&percpu_charge_mutex); 2131 } 2132 2133 /* 2134 * This function drains percpu counter value from DEAD cpu and 2135 * move it to local cpu. Note that this function can be preempted. 2136 */ 2137 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2138 { 2139 int i; 2140 2141 spin_lock(&memcg->pcp_counter_lock); 2142 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2143 long x = per_cpu(memcg->stat->count[i], cpu); 2144 2145 per_cpu(memcg->stat->count[i], cpu) = 0; 2146 memcg->nocpu_base.count[i] += x; 2147 } 2148 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2149 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2150 2151 per_cpu(memcg->stat->events[i], cpu) = 0; 2152 memcg->nocpu_base.events[i] += x; 2153 } 2154 spin_unlock(&memcg->pcp_counter_lock); 2155 } 2156 2157 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2158 unsigned long action, 2159 void *hcpu) 2160 { 2161 int cpu = (unsigned long)hcpu; 2162 struct memcg_stock_pcp *stock; 2163 struct mem_cgroup *iter; 2164 2165 if (action == CPU_ONLINE) 2166 return NOTIFY_OK; 2167 2168 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2169 return NOTIFY_OK; 2170 2171 for_each_mem_cgroup(iter) 2172 mem_cgroup_drain_pcp_counter(iter, cpu); 2173 2174 stock = &per_cpu(memcg_stock, cpu); 2175 drain_stock(stock); 2176 return NOTIFY_OK; 2177 } 2178 2179 2180 /* See __mem_cgroup_try_charge() for details */ 2181 enum { 2182 CHARGE_OK, /* success */ 2183 CHARGE_RETRY, /* need to retry but retry is not bad */ 2184 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2185 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2186 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2187 }; 2188 2189 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2190 unsigned int nr_pages, bool oom_check) 2191 { 2192 unsigned long csize = nr_pages * PAGE_SIZE; 2193 struct mem_cgroup *mem_over_limit; 2194 struct res_counter *fail_res; 2195 unsigned long flags = 0; 2196 int ret; 2197 2198 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2199 2200 if (likely(!ret)) { 2201 if (!do_swap_account) 2202 return CHARGE_OK; 2203 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2204 if (likely(!ret)) 2205 return CHARGE_OK; 2206 2207 res_counter_uncharge(&memcg->res, csize); 2208 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2209 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2210 } else 2211 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2212 /* 2213 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 2214 * of regular pages (CHARGE_BATCH), or a single regular page (1). 2215 * 2216 * Never reclaim on behalf of optional batching, retry with a 2217 * single page instead. 2218 */ 2219 if (nr_pages == CHARGE_BATCH) 2220 return CHARGE_RETRY; 2221 2222 if (!(gfp_mask & __GFP_WAIT)) 2223 return CHARGE_WOULDBLOCK; 2224 2225 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2226 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2227 return CHARGE_RETRY; 2228 /* 2229 * Even though the limit is exceeded at this point, reclaim 2230 * may have been able to free some pages. Retry the charge 2231 * before killing the task. 2232 * 2233 * Only for regular pages, though: huge pages are rather 2234 * unlikely to succeed so close to the limit, and we fall back 2235 * to regular pages anyway in case of failure. 2236 */ 2237 if (nr_pages == 1 && ret) 2238 return CHARGE_RETRY; 2239 2240 /* 2241 * At task move, charge accounts can be doubly counted. So, it's 2242 * better to wait until the end of task_move if something is going on. 2243 */ 2244 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2245 return CHARGE_RETRY; 2246 2247 /* If we don't need to call oom-killer at el, return immediately */ 2248 if (!oom_check) 2249 return CHARGE_NOMEM; 2250 /* check OOM */ 2251 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) 2252 return CHARGE_OOM_DIE; 2253 2254 return CHARGE_RETRY; 2255 } 2256 2257 /* 2258 * __mem_cgroup_try_charge() does 2259 * 1. detect memcg to be charged against from passed *mm and *ptr, 2260 * 2. update res_counter 2261 * 3. call memory reclaim if necessary. 2262 * 2263 * In some special case, if the task is fatal, fatal_signal_pending() or 2264 * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup 2265 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon 2266 * as possible without any hazards. 2: all pages should have a valid 2267 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg 2268 * pointer, that is treated as a charge to root_mem_cgroup. 2269 * 2270 * So __mem_cgroup_try_charge() will return 2271 * 0 ... on success, filling *ptr with a valid memcg pointer. 2272 * -ENOMEM ... charge failure because of resource limits. 2273 * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. 2274 * 2275 * Unlike the exported interface, an "oom" parameter is added. if oom==true, 2276 * the oom-killer can be invoked. 2277 */ 2278 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2279 gfp_t gfp_mask, 2280 unsigned int nr_pages, 2281 struct mem_cgroup **ptr, 2282 bool oom) 2283 { 2284 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2285 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2286 struct mem_cgroup *memcg = NULL; 2287 int ret; 2288 2289 /* 2290 * Unlike gloval-vm's OOM-kill, we're not in memory shortage 2291 * in system level. So, allow to go ahead dying process in addition to 2292 * MEMDIE process. 2293 */ 2294 if (unlikely(test_thread_flag(TIF_MEMDIE) 2295 || fatal_signal_pending(current))) 2296 goto bypass; 2297 2298 /* 2299 * We always charge the cgroup the mm_struct belongs to. 2300 * The mm_struct's mem_cgroup changes on task migration if the 2301 * thread group leader migrates. It's possible that mm is not 2302 * set, if so charge the init_mm (happens for pagecache usage). 2303 */ 2304 if (!*ptr && !mm) 2305 *ptr = root_mem_cgroup; 2306 again: 2307 if (*ptr) { /* css should be a valid one */ 2308 memcg = *ptr; 2309 VM_BUG_ON(css_is_removed(&memcg->css)); 2310 if (mem_cgroup_is_root(memcg)) 2311 goto done; 2312 if (nr_pages == 1 && consume_stock(memcg)) 2313 goto done; 2314 css_get(&memcg->css); 2315 } else { 2316 struct task_struct *p; 2317 2318 rcu_read_lock(); 2319 p = rcu_dereference(mm->owner); 2320 /* 2321 * Because we don't have task_lock(), "p" can exit. 2322 * In that case, "memcg" can point to root or p can be NULL with 2323 * race with swapoff. Then, we have small risk of mis-accouning. 2324 * But such kind of mis-account by race always happens because 2325 * we don't have cgroup_mutex(). It's overkill and we allo that 2326 * small race, here. 2327 * (*) swapoff at el will charge against mm-struct not against 2328 * task-struct. So, mm->owner can be NULL. 2329 */ 2330 memcg = mem_cgroup_from_task(p); 2331 if (!memcg) 2332 memcg = root_mem_cgroup; 2333 if (mem_cgroup_is_root(memcg)) { 2334 rcu_read_unlock(); 2335 goto done; 2336 } 2337 if (nr_pages == 1 && consume_stock(memcg)) { 2338 /* 2339 * It seems dagerous to access memcg without css_get(). 2340 * But considering how consume_stok works, it's not 2341 * necessary. If consume_stock success, some charges 2342 * from this memcg are cached on this cpu. So, we 2343 * don't need to call css_get()/css_tryget() before 2344 * calling consume_stock(). 2345 */ 2346 rcu_read_unlock(); 2347 goto done; 2348 } 2349 /* after here, we may be blocked. we need to get refcnt */ 2350 if (!css_tryget(&memcg->css)) { 2351 rcu_read_unlock(); 2352 goto again; 2353 } 2354 rcu_read_unlock(); 2355 } 2356 2357 do { 2358 bool oom_check; 2359 2360 /* If killed, bypass charge */ 2361 if (fatal_signal_pending(current)) { 2362 css_put(&memcg->css); 2363 goto bypass; 2364 } 2365 2366 oom_check = false; 2367 if (oom && !nr_oom_retries) { 2368 oom_check = true; 2369 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2370 } 2371 2372 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); 2373 switch (ret) { 2374 case CHARGE_OK: 2375 break; 2376 case CHARGE_RETRY: /* not in OOM situation but retry */ 2377 batch = nr_pages; 2378 css_put(&memcg->css); 2379 memcg = NULL; 2380 goto again; 2381 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2382 css_put(&memcg->css); 2383 goto nomem; 2384 case CHARGE_NOMEM: /* OOM routine works */ 2385 if (!oom) { 2386 css_put(&memcg->css); 2387 goto nomem; 2388 } 2389 /* If oom, we never return -ENOMEM */ 2390 nr_oom_retries--; 2391 break; 2392 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2393 css_put(&memcg->css); 2394 goto bypass; 2395 } 2396 } while (ret != CHARGE_OK); 2397 2398 if (batch > nr_pages) 2399 refill_stock(memcg, batch - nr_pages); 2400 css_put(&memcg->css); 2401 done: 2402 *ptr = memcg; 2403 return 0; 2404 nomem: 2405 *ptr = NULL; 2406 return -ENOMEM; 2407 bypass: 2408 *ptr = root_mem_cgroup; 2409 return -EINTR; 2410 } 2411 2412 /* 2413 * Somemtimes we have to undo a charge we got by try_charge(). 2414 * This function is for that and do uncharge, put css's refcnt. 2415 * gotten by try_charge(). 2416 */ 2417 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2418 unsigned int nr_pages) 2419 { 2420 if (!mem_cgroup_is_root(memcg)) { 2421 unsigned long bytes = nr_pages * PAGE_SIZE; 2422 2423 res_counter_uncharge(&memcg->res, bytes); 2424 if (do_swap_account) 2425 res_counter_uncharge(&memcg->memsw, bytes); 2426 } 2427 } 2428 2429 /* 2430 * A helper function to get mem_cgroup from ID. must be called under 2431 * rcu_read_lock(). The caller must check css_is_removed() or some if 2432 * it's concern. (dropping refcnt from swap can be called against removed 2433 * memcg.) 2434 */ 2435 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2436 { 2437 struct cgroup_subsys_state *css; 2438 2439 /* ID 0 is unused ID */ 2440 if (!id) 2441 return NULL; 2442 css = css_lookup(&mem_cgroup_subsys, id); 2443 if (!css) 2444 return NULL; 2445 return container_of(css, struct mem_cgroup, css); 2446 } 2447 2448 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2449 { 2450 struct mem_cgroup *memcg = NULL; 2451 struct page_cgroup *pc; 2452 unsigned short id; 2453 swp_entry_t ent; 2454 2455 VM_BUG_ON(!PageLocked(page)); 2456 2457 pc = lookup_page_cgroup(page); 2458 lock_page_cgroup(pc); 2459 if (PageCgroupUsed(pc)) { 2460 memcg = pc->mem_cgroup; 2461 if (memcg && !css_tryget(&memcg->css)) 2462 memcg = NULL; 2463 } else if (PageSwapCache(page)) { 2464 ent.val = page_private(page); 2465 id = lookup_swap_cgroup_id(ent); 2466 rcu_read_lock(); 2467 memcg = mem_cgroup_lookup(id); 2468 if (memcg && !css_tryget(&memcg->css)) 2469 memcg = NULL; 2470 rcu_read_unlock(); 2471 } 2472 unlock_page_cgroup(pc); 2473 return memcg; 2474 } 2475 2476 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2477 struct page *page, 2478 unsigned int nr_pages, 2479 enum charge_type ctype, 2480 bool lrucare) 2481 { 2482 struct page_cgroup *pc = lookup_page_cgroup(page); 2483 struct zone *uninitialized_var(zone); 2484 bool was_on_lru = false; 2485 bool anon; 2486 2487 lock_page_cgroup(pc); 2488 if (unlikely(PageCgroupUsed(pc))) { 2489 unlock_page_cgroup(pc); 2490 __mem_cgroup_cancel_charge(memcg, nr_pages); 2491 return; 2492 } 2493 /* 2494 * we don't need page_cgroup_lock about tail pages, becase they are not 2495 * accessed by any other context at this point. 2496 */ 2497 2498 /* 2499 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2500 * may already be on some other mem_cgroup's LRU. Take care of it. 2501 */ 2502 if (lrucare) { 2503 zone = page_zone(page); 2504 spin_lock_irq(&zone->lru_lock); 2505 if (PageLRU(page)) { 2506 ClearPageLRU(page); 2507 del_page_from_lru_list(zone, page, page_lru(page)); 2508 was_on_lru = true; 2509 } 2510 } 2511 2512 pc->mem_cgroup = memcg; 2513 /* 2514 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2515 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2516 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2517 * before USED bit, we need memory barrier here. 2518 * See mem_cgroup_add_lru_list(), etc. 2519 */ 2520 smp_wmb(); 2521 SetPageCgroupUsed(pc); 2522 2523 if (lrucare) { 2524 if (was_on_lru) { 2525 VM_BUG_ON(PageLRU(page)); 2526 SetPageLRU(page); 2527 add_page_to_lru_list(zone, page, page_lru(page)); 2528 } 2529 spin_unlock_irq(&zone->lru_lock); 2530 } 2531 2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2533 anon = true; 2534 else 2535 anon = false; 2536 2537 mem_cgroup_charge_statistics(memcg, anon, nr_pages); 2538 unlock_page_cgroup(pc); 2539 2540 /* 2541 * "charge_statistics" updated event counter. Then, check it. 2542 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2543 * if they exceeds softlimit. 2544 */ 2545 memcg_check_events(memcg, page); 2546 } 2547 2548 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2549 2550 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) 2551 /* 2552 * Because tail pages are not marked as "used", set it. We're under 2553 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2554 * charge/uncharge will be never happen and move_account() is done under 2555 * compound_lock(), so we don't have to take care of races. 2556 */ 2557 void mem_cgroup_split_huge_fixup(struct page *head) 2558 { 2559 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2560 struct page_cgroup *pc; 2561 int i; 2562 2563 if (mem_cgroup_disabled()) 2564 return; 2565 for (i = 1; i < HPAGE_PMD_NR; i++) { 2566 pc = head_pc + i; 2567 pc->mem_cgroup = head_pc->mem_cgroup; 2568 smp_wmb();/* see __commit_charge() */ 2569 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2570 } 2571 } 2572 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2573 2574 /** 2575 * mem_cgroup_move_account - move account of the page 2576 * @page: the page 2577 * @nr_pages: number of regular pages (>1 for huge pages) 2578 * @pc: page_cgroup of the page. 2579 * @from: mem_cgroup which the page is moved from. 2580 * @to: mem_cgroup which the page is moved to. @from != @to. 2581 * @uncharge: whether we should call uncharge and css_put against @from. 2582 * 2583 * The caller must confirm following. 2584 * - page is not on LRU (isolate_page() is useful.) 2585 * - compound_lock is held when nr_pages > 1 2586 * 2587 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2588 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is 2589 * true, this function does "uncharge" from old cgroup, but it doesn't if 2590 * @uncharge is false, so a caller should do "uncharge". 2591 */ 2592 static int mem_cgroup_move_account(struct page *page, 2593 unsigned int nr_pages, 2594 struct page_cgroup *pc, 2595 struct mem_cgroup *from, 2596 struct mem_cgroup *to, 2597 bool uncharge) 2598 { 2599 unsigned long flags; 2600 int ret; 2601 bool anon = PageAnon(page); 2602 2603 VM_BUG_ON(from == to); 2604 VM_BUG_ON(PageLRU(page)); 2605 /* 2606 * The page is isolated from LRU. So, collapse function 2607 * will not handle this page. But page splitting can happen. 2608 * Do this check under compound_page_lock(). The caller should 2609 * hold it. 2610 */ 2611 ret = -EBUSY; 2612 if (nr_pages > 1 && !PageTransHuge(page)) 2613 goto out; 2614 2615 lock_page_cgroup(pc); 2616 2617 ret = -EINVAL; 2618 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2619 goto unlock; 2620 2621 move_lock_mem_cgroup(from, &flags); 2622 2623 if (!anon && page_mapped(page)) { 2624 /* Update mapped_file data for mem_cgroup */ 2625 preempt_disable(); 2626 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2627 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2628 preempt_enable(); 2629 } 2630 mem_cgroup_charge_statistics(from, anon, -nr_pages); 2631 if (uncharge) 2632 /* This is not "cancel", but cancel_charge does all we need. */ 2633 __mem_cgroup_cancel_charge(from, nr_pages); 2634 2635 /* caller should have done css_get */ 2636 pc->mem_cgroup = to; 2637 mem_cgroup_charge_statistics(to, anon, nr_pages); 2638 /* 2639 * We charges against "to" which may not have any tasks. Then, "to" 2640 * can be under rmdir(). But in current implementation, caller of 2641 * this function is just force_empty() and move charge, so it's 2642 * guaranteed that "to" is never removed. So, we don't check rmdir 2643 * status here. 2644 */ 2645 move_unlock_mem_cgroup(from, &flags); 2646 ret = 0; 2647 unlock: 2648 unlock_page_cgroup(pc); 2649 /* 2650 * check events 2651 */ 2652 memcg_check_events(to, page); 2653 memcg_check_events(from, page); 2654 out: 2655 return ret; 2656 } 2657 2658 /* 2659 * move charges to its parent. 2660 */ 2661 2662 static int mem_cgroup_move_parent(struct page *page, 2663 struct page_cgroup *pc, 2664 struct mem_cgroup *child, 2665 gfp_t gfp_mask) 2666 { 2667 struct cgroup *cg = child->css.cgroup; 2668 struct cgroup *pcg = cg->parent; 2669 struct mem_cgroup *parent; 2670 unsigned int nr_pages; 2671 unsigned long uninitialized_var(flags); 2672 int ret; 2673 2674 /* Is ROOT ? */ 2675 if (!pcg) 2676 return -EINVAL; 2677 2678 ret = -EBUSY; 2679 if (!get_page_unless_zero(page)) 2680 goto out; 2681 if (isolate_lru_page(page)) 2682 goto put; 2683 2684 nr_pages = hpage_nr_pages(page); 2685 2686 parent = mem_cgroup_from_cont(pcg); 2687 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2688 if (ret) 2689 goto put_back; 2690 2691 if (nr_pages > 1) 2692 flags = compound_lock_irqsave(page); 2693 2694 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2695 if (ret) 2696 __mem_cgroup_cancel_charge(parent, nr_pages); 2697 2698 if (nr_pages > 1) 2699 compound_unlock_irqrestore(page, flags); 2700 put_back: 2701 putback_lru_page(page); 2702 put: 2703 put_page(page); 2704 out: 2705 return ret; 2706 } 2707 2708 /* 2709 * Charge the memory controller for page usage. 2710 * Return 2711 * 0 if the charge was successful 2712 * < 0 if the cgroup is over its limit 2713 */ 2714 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2715 gfp_t gfp_mask, enum charge_type ctype) 2716 { 2717 struct mem_cgroup *memcg = NULL; 2718 unsigned int nr_pages = 1; 2719 bool oom = true; 2720 int ret; 2721 2722 if (PageTransHuge(page)) { 2723 nr_pages <<= compound_order(page); 2724 VM_BUG_ON(!PageTransHuge(page)); 2725 /* 2726 * Never OOM-kill a process for a huge page. The 2727 * fault handler will fall back to regular pages. 2728 */ 2729 oom = false; 2730 } 2731 2732 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2733 if (ret == -ENOMEM) 2734 return ret; 2735 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); 2736 return 0; 2737 } 2738 2739 int mem_cgroup_newpage_charge(struct page *page, 2740 struct mm_struct *mm, gfp_t gfp_mask) 2741 { 2742 if (mem_cgroup_disabled()) 2743 return 0; 2744 VM_BUG_ON(page_mapped(page)); 2745 VM_BUG_ON(page->mapping && !PageAnon(page)); 2746 VM_BUG_ON(!mm); 2747 return mem_cgroup_charge_common(page, mm, gfp_mask, 2748 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2749 } 2750 2751 static void 2752 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2753 enum charge_type ctype); 2754 2755 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2756 gfp_t gfp_mask) 2757 { 2758 struct mem_cgroup *memcg = NULL; 2759 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 2760 int ret; 2761 2762 if (mem_cgroup_disabled()) 2763 return 0; 2764 if (PageCompound(page)) 2765 return 0; 2766 2767 if (unlikely(!mm)) 2768 mm = &init_mm; 2769 if (!page_is_file_cache(page)) 2770 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2771 2772 if (!PageSwapCache(page)) 2773 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); 2774 else { /* page is swapcache/shmem */ 2775 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); 2776 if (!ret) 2777 __mem_cgroup_commit_charge_swapin(page, memcg, type); 2778 } 2779 return ret; 2780 } 2781 2782 /* 2783 * While swap-in, try_charge -> commit or cancel, the page is locked. 2784 * And when try_charge() successfully returns, one refcnt to memcg without 2785 * struct page_cgroup is acquired. This refcnt will be consumed by 2786 * "commit()" or removed by "cancel()" 2787 */ 2788 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2789 struct page *page, 2790 gfp_t mask, struct mem_cgroup **memcgp) 2791 { 2792 struct mem_cgroup *memcg; 2793 int ret; 2794 2795 *memcgp = NULL; 2796 2797 if (mem_cgroup_disabled()) 2798 return 0; 2799 2800 if (!do_swap_account) 2801 goto charge_cur_mm; 2802 /* 2803 * A racing thread's fault, or swapoff, may have already updated 2804 * the pte, and even removed page from swap cache: in those cases 2805 * do_swap_page()'s pte_same() test will fail; but there's also a 2806 * KSM case which does need to charge the page. 2807 */ 2808 if (!PageSwapCache(page)) 2809 goto charge_cur_mm; 2810 memcg = try_get_mem_cgroup_from_page(page); 2811 if (!memcg) 2812 goto charge_cur_mm; 2813 *memcgp = memcg; 2814 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); 2815 css_put(&memcg->css); 2816 if (ret == -EINTR) 2817 ret = 0; 2818 return ret; 2819 charge_cur_mm: 2820 if (unlikely(!mm)) 2821 mm = &init_mm; 2822 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2823 if (ret == -EINTR) 2824 ret = 0; 2825 return ret; 2826 } 2827 2828 static void 2829 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2830 enum charge_type ctype) 2831 { 2832 if (mem_cgroup_disabled()) 2833 return; 2834 if (!memcg) 2835 return; 2836 cgroup_exclude_rmdir(&memcg->css); 2837 2838 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 2839 /* 2840 * Now swap is on-memory. This means this page may be 2841 * counted both as mem and swap....double count. 2842 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2843 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2844 * may call delete_from_swap_cache() before reach here. 2845 */ 2846 if (do_swap_account && PageSwapCache(page)) { 2847 swp_entry_t ent = {.val = page_private(page)}; 2848 struct mem_cgroup *swap_memcg; 2849 unsigned short id; 2850 2851 id = swap_cgroup_record(ent, 0); 2852 rcu_read_lock(); 2853 swap_memcg = mem_cgroup_lookup(id); 2854 if (swap_memcg) { 2855 /* 2856 * This recorded memcg can be obsolete one. So, avoid 2857 * calling css_tryget 2858 */ 2859 if (!mem_cgroup_is_root(swap_memcg)) 2860 res_counter_uncharge(&swap_memcg->memsw, 2861 PAGE_SIZE); 2862 mem_cgroup_swap_statistics(swap_memcg, false); 2863 mem_cgroup_put(swap_memcg); 2864 } 2865 rcu_read_unlock(); 2866 } 2867 /* 2868 * At swapin, we may charge account against cgroup which has no tasks. 2869 * So, rmdir()->pre_destroy() can be called while we do this charge. 2870 * In that case, we need to call pre_destroy() again. check it here. 2871 */ 2872 cgroup_release_and_wakeup_rmdir(&memcg->css); 2873 } 2874 2875 void mem_cgroup_commit_charge_swapin(struct page *page, 2876 struct mem_cgroup *memcg) 2877 { 2878 __mem_cgroup_commit_charge_swapin(page, memcg, 2879 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2880 } 2881 2882 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2883 { 2884 if (mem_cgroup_disabled()) 2885 return; 2886 if (!memcg) 2887 return; 2888 __mem_cgroup_cancel_charge(memcg, 1); 2889 } 2890 2891 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2892 unsigned int nr_pages, 2893 const enum charge_type ctype) 2894 { 2895 struct memcg_batch_info *batch = NULL; 2896 bool uncharge_memsw = true; 2897 2898 /* If swapout, usage of swap doesn't decrease */ 2899 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2900 uncharge_memsw = false; 2901 2902 batch = ¤t->memcg_batch; 2903 /* 2904 * In usual, we do css_get() when we remember memcg pointer. 2905 * But in this case, we keep res->usage until end of a series of 2906 * uncharges. Then, it's ok to ignore memcg's refcnt. 2907 */ 2908 if (!batch->memcg) 2909 batch->memcg = memcg; 2910 /* 2911 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2912 * In those cases, all pages freed continuously can be expected to be in 2913 * the same cgroup and we have chance to coalesce uncharges. 2914 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2915 * because we want to do uncharge as soon as possible. 2916 */ 2917 2918 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2919 goto direct_uncharge; 2920 2921 if (nr_pages > 1) 2922 goto direct_uncharge; 2923 2924 /* 2925 * In typical case, batch->memcg == mem. This means we can 2926 * merge a series of uncharges to an uncharge of res_counter. 2927 * If not, we uncharge res_counter ony by one. 2928 */ 2929 if (batch->memcg != memcg) 2930 goto direct_uncharge; 2931 /* remember freed charge and uncharge it later */ 2932 batch->nr_pages++; 2933 if (uncharge_memsw) 2934 batch->memsw_nr_pages++; 2935 return; 2936 direct_uncharge: 2937 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); 2938 if (uncharge_memsw) 2939 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 2940 if (unlikely(batch->memcg != memcg)) 2941 memcg_oom_recover(memcg); 2942 } 2943 2944 /* 2945 * uncharge if !page_mapped(page) 2946 */ 2947 static struct mem_cgroup * 2948 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2949 { 2950 struct mem_cgroup *memcg = NULL; 2951 unsigned int nr_pages = 1; 2952 struct page_cgroup *pc; 2953 bool anon; 2954 2955 if (mem_cgroup_disabled()) 2956 return NULL; 2957 2958 if (PageSwapCache(page)) 2959 return NULL; 2960 2961 if (PageTransHuge(page)) { 2962 nr_pages <<= compound_order(page); 2963 VM_BUG_ON(!PageTransHuge(page)); 2964 } 2965 /* 2966 * Check if our page_cgroup is valid 2967 */ 2968 pc = lookup_page_cgroup(page); 2969 if (unlikely(!PageCgroupUsed(pc))) 2970 return NULL; 2971 2972 lock_page_cgroup(pc); 2973 2974 memcg = pc->mem_cgroup; 2975 2976 if (!PageCgroupUsed(pc)) 2977 goto unlock_out; 2978 2979 anon = PageAnon(page); 2980 2981 switch (ctype) { 2982 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2983 /* 2984 * Generally PageAnon tells if it's the anon statistics to be 2985 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 2986 * used before page reached the stage of being marked PageAnon. 2987 */ 2988 anon = true; 2989 /* fallthrough */ 2990 case MEM_CGROUP_CHARGE_TYPE_DROP: 2991 /* See mem_cgroup_prepare_migration() */ 2992 if (page_mapped(page) || PageCgroupMigration(pc)) 2993 goto unlock_out; 2994 break; 2995 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2996 if (!PageAnon(page)) { /* Shared memory */ 2997 if (page->mapping && !page_is_file_cache(page)) 2998 goto unlock_out; 2999 } else if (page_mapped(page)) /* Anon */ 3000 goto unlock_out; 3001 break; 3002 default: 3003 break; 3004 } 3005 3006 mem_cgroup_charge_statistics(memcg, anon, -nr_pages); 3007 3008 ClearPageCgroupUsed(pc); 3009 /* 3010 * pc->mem_cgroup is not cleared here. It will be accessed when it's 3011 * freed from LRU. This is safe because uncharged page is expected not 3012 * to be reused (freed soon). Exception is SwapCache, it's handled by 3013 * special functions. 3014 */ 3015 3016 unlock_page_cgroup(pc); 3017 /* 3018 * even after unlock, we have memcg->res.usage here and this memcg 3019 * will never be freed. 3020 */ 3021 memcg_check_events(memcg, page); 3022 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3023 mem_cgroup_swap_statistics(memcg, true); 3024 mem_cgroup_get(memcg); 3025 } 3026 if (!mem_cgroup_is_root(memcg)) 3027 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3028 3029 return memcg; 3030 3031 unlock_out: 3032 unlock_page_cgroup(pc); 3033 return NULL; 3034 } 3035 3036 void mem_cgroup_uncharge_page(struct page *page) 3037 { 3038 /* early check. */ 3039 if (page_mapped(page)) 3040 return; 3041 VM_BUG_ON(page->mapping && !PageAnon(page)); 3042 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3043 } 3044 3045 void mem_cgroup_uncharge_cache_page(struct page *page) 3046 { 3047 VM_BUG_ON(page_mapped(page)); 3048 VM_BUG_ON(page->mapping); 3049 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3050 } 3051 3052 /* 3053 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 3054 * In that cases, pages are freed continuously and we can expect pages 3055 * are in the same memcg. All these calls itself limits the number of 3056 * pages freed at once, then uncharge_start/end() is called properly. 3057 * This may be called prural(2) times in a context, 3058 */ 3059 3060 void mem_cgroup_uncharge_start(void) 3061 { 3062 current->memcg_batch.do_batch++; 3063 /* We can do nest. */ 3064 if (current->memcg_batch.do_batch == 1) { 3065 current->memcg_batch.memcg = NULL; 3066 current->memcg_batch.nr_pages = 0; 3067 current->memcg_batch.memsw_nr_pages = 0; 3068 } 3069 } 3070 3071 void mem_cgroup_uncharge_end(void) 3072 { 3073 struct memcg_batch_info *batch = ¤t->memcg_batch; 3074 3075 if (!batch->do_batch) 3076 return; 3077 3078 batch->do_batch--; 3079 if (batch->do_batch) /* If stacked, do nothing. */ 3080 return; 3081 3082 if (!batch->memcg) 3083 return; 3084 /* 3085 * This "batch->memcg" is valid without any css_get/put etc... 3086 * bacause we hide charges behind us. 3087 */ 3088 if (batch->nr_pages) 3089 res_counter_uncharge(&batch->memcg->res, 3090 batch->nr_pages * PAGE_SIZE); 3091 if (batch->memsw_nr_pages) 3092 res_counter_uncharge(&batch->memcg->memsw, 3093 batch->memsw_nr_pages * PAGE_SIZE); 3094 memcg_oom_recover(batch->memcg); 3095 /* forget this pointer (for sanity check) */ 3096 batch->memcg = NULL; 3097 } 3098 3099 #ifdef CONFIG_SWAP 3100 /* 3101 * called after __delete_from_swap_cache() and drop "page" account. 3102 * memcg information is recorded to swap_cgroup of "ent" 3103 */ 3104 void 3105 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 3106 { 3107 struct mem_cgroup *memcg; 3108 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 3109 3110 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3111 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3112 3113 memcg = __mem_cgroup_uncharge_common(page, ctype); 3114 3115 /* 3116 * record memcg information, if swapout && memcg != NULL, 3117 * mem_cgroup_get() was called in uncharge(). 3118 */ 3119 if (do_swap_account && swapout && memcg) 3120 swap_cgroup_record(ent, css_id(&memcg->css)); 3121 } 3122 #endif 3123 3124 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3125 /* 3126 * called from swap_entry_free(). remove record in swap_cgroup and 3127 * uncharge "memsw" account. 3128 */ 3129 void mem_cgroup_uncharge_swap(swp_entry_t ent) 3130 { 3131 struct mem_cgroup *memcg; 3132 unsigned short id; 3133 3134 if (!do_swap_account) 3135 return; 3136 3137 id = swap_cgroup_record(ent, 0); 3138 rcu_read_lock(); 3139 memcg = mem_cgroup_lookup(id); 3140 if (memcg) { 3141 /* 3142 * We uncharge this because swap is freed. 3143 * This memcg can be obsolete one. We avoid calling css_tryget 3144 */ 3145 if (!mem_cgroup_is_root(memcg)) 3146 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3147 mem_cgroup_swap_statistics(memcg, false); 3148 mem_cgroup_put(memcg); 3149 } 3150 rcu_read_unlock(); 3151 } 3152 3153 /** 3154 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3155 * @entry: swap entry to be moved 3156 * @from: mem_cgroup which the entry is moved from 3157 * @to: mem_cgroup which the entry is moved to 3158 * @need_fixup: whether we should fixup res_counters and refcounts. 3159 * 3160 * It succeeds only when the swap_cgroup's record for this entry is the same 3161 * as the mem_cgroup's id of @from. 3162 * 3163 * Returns 0 on success, -EINVAL on failure. 3164 * 3165 * The caller must have charged to @to, IOW, called res_counter_charge() about 3166 * both res and memsw, and called css_get(). 3167 */ 3168 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3169 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3170 { 3171 unsigned short old_id, new_id; 3172 3173 old_id = css_id(&from->css); 3174 new_id = css_id(&to->css); 3175 3176 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3177 mem_cgroup_swap_statistics(from, false); 3178 mem_cgroup_swap_statistics(to, true); 3179 /* 3180 * This function is only called from task migration context now. 3181 * It postpones res_counter and refcount handling till the end 3182 * of task migration(mem_cgroup_clear_mc()) for performance 3183 * improvement. But we cannot postpone mem_cgroup_get(to) 3184 * because if the process that has been moved to @to does 3185 * swap-in, the refcount of @to might be decreased to 0. 3186 */ 3187 mem_cgroup_get(to); 3188 if (need_fixup) { 3189 if (!mem_cgroup_is_root(from)) 3190 res_counter_uncharge(&from->memsw, PAGE_SIZE); 3191 mem_cgroup_put(from); 3192 /* 3193 * we charged both to->res and to->memsw, so we should 3194 * uncharge to->res. 3195 */ 3196 if (!mem_cgroup_is_root(to)) 3197 res_counter_uncharge(&to->res, PAGE_SIZE); 3198 } 3199 return 0; 3200 } 3201 return -EINVAL; 3202 } 3203 #else 3204 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3205 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3206 { 3207 return -EINVAL; 3208 } 3209 #endif 3210 3211 /* 3212 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3213 * page belongs to. 3214 */ 3215 int mem_cgroup_prepare_migration(struct page *page, 3216 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3217 { 3218 struct mem_cgroup *memcg = NULL; 3219 struct page_cgroup *pc; 3220 enum charge_type ctype; 3221 int ret = 0; 3222 3223 *memcgp = NULL; 3224 3225 VM_BUG_ON(PageTransHuge(page)); 3226 if (mem_cgroup_disabled()) 3227 return 0; 3228 3229 pc = lookup_page_cgroup(page); 3230 lock_page_cgroup(pc); 3231 if (PageCgroupUsed(pc)) { 3232 memcg = pc->mem_cgroup; 3233 css_get(&memcg->css); 3234 /* 3235 * At migrating an anonymous page, its mapcount goes down 3236 * to 0 and uncharge() will be called. But, even if it's fully 3237 * unmapped, migration may fail and this page has to be 3238 * charged again. We set MIGRATION flag here and delay uncharge 3239 * until end_migration() is called 3240 * 3241 * Corner Case Thinking 3242 * A) 3243 * When the old page was mapped as Anon and it's unmap-and-freed 3244 * while migration was ongoing. 3245 * If unmap finds the old page, uncharge() of it will be delayed 3246 * until end_migration(). If unmap finds a new page, it's 3247 * uncharged when it make mapcount to be 1->0. If unmap code 3248 * finds swap_migration_entry, the new page will not be mapped 3249 * and end_migration() will find it(mapcount==0). 3250 * 3251 * B) 3252 * When the old page was mapped but migraion fails, the kernel 3253 * remaps it. A charge for it is kept by MIGRATION flag even 3254 * if mapcount goes down to 0. We can do remap successfully 3255 * without charging it again. 3256 * 3257 * C) 3258 * The "old" page is under lock_page() until the end of 3259 * migration, so, the old page itself will not be swapped-out. 3260 * If the new page is swapped out before end_migraton, our 3261 * hook to usual swap-out path will catch the event. 3262 */ 3263 if (PageAnon(page)) 3264 SetPageCgroupMigration(pc); 3265 } 3266 unlock_page_cgroup(pc); 3267 /* 3268 * If the page is not charged at this point, 3269 * we return here. 3270 */ 3271 if (!memcg) 3272 return 0; 3273 3274 *memcgp = memcg; 3275 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); 3276 css_put(&memcg->css);/* drop extra refcnt */ 3277 if (ret) { 3278 if (PageAnon(page)) { 3279 lock_page_cgroup(pc); 3280 ClearPageCgroupMigration(pc); 3281 unlock_page_cgroup(pc); 3282 /* 3283 * The old page may be fully unmapped while we kept it. 3284 */ 3285 mem_cgroup_uncharge_page(page); 3286 } 3287 /* we'll need to revisit this error code (we have -EINTR) */ 3288 return -ENOMEM; 3289 } 3290 /* 3291 * We charge new page before it's used/mapped. So, even if unlock_page() 3292 * is called before end_migration, we can catch all events on this new 3293 * page. In the case new page is migrated but not remapped, new page's 3294 * mapcount will be finally 0 and we call uncharge in end_migration(). 3295 */ 3296 if (PageAnon(page)) 3297 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3298 else if (page_is_file_cache(page)) 3299 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3300 else 3301 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3302 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 3303 return ret; 3304 } 3305 3306 /* remove redundant charge if migration failed*/ 3307 void mem_cgroup_end_migration(struct mem_cgroup *memcg, 3308 struct page *oldpage, struct page *newpage, bool migration_ok) 3309 { 3310 struct page *used, *unused; 3311 struct page_cgroup *pc; 3312 bool anon; 3313 3314 if (!memcg) 3315 return; 3316 /* blocks rmdir() */ 3317 cgroup_exclude_rmdir(&memcg->css); 3318 if (!migration_ok) { 3319 used = oldpage; 3320 unused = newpage; 3321 } else { 3322 used = newpage; 3323 unused = oldpage; 3324 } 3325 /* 3326 * We disallowed uncharge of pages under migration because mapcount 3327 * of the page goes down to zero, temporarly. 3328 * Clear the flag and check the page should be charged. 3329 */ 3330 pc = lookup_page_cgroup(oldpage); 3331 lock_page_cgroup(pc); 3332 ClearPageCgroupMigration(pc); 3333 unlock_page_cgroup(pc); 3334 anon = PageAnon(used); 3335 __mem_cgroup_uncharge_common(unused, 3336 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED 3337 : MEM_CGROUP_CHARGE_TYPE_CACHE); 3338 3339 /* 3340 * If a page is a file cache, radix-tree replacement is very atomic 3341 * and we can skip this check. When it was an Anon page, its mapcount 3342 * goes down to 0. But because we added MIGRATION flage, it's not 3343 * uncharged yet. There are several case but page->mapcount check 3344 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3345 * check. (see prepare_charge() also) 3346 */ 3347 if (anon) 3348 mem_cgroup_uncharge_page(used); 3349 /* 3350 * At migration, we may charge account against cgroup which has no 3351 * tasks. 3352 * So, rmdir()->pre_destroy() can be called while we do this charge. 3353 * In that case, we need to call pre_destroy() again. check it here. 3354 */ 3355 cgroup_release_and_wakeup_rmdir(&memcg->css); 3356 } 3357 3358 /* 3359 * At replace page cache, newpage is not under any memcg but it's on 3360 * LRU. So, this function doesn't touch res_counter but handles LRU 3361 * in correct way. Both pages are locked so we cannot race with uncharge. 3362 */ 3363 void mem_cgroup_replace_page_cache(struct page *oldpage, 3364 struct page *newpage) 3365 { 3366 struct mem_cgroup *memcg; 3367 struct page_cgroup *pc; 3368 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3369 3370 if (mem_cgroup_disabled()) 3371 return; 3372 3373 pc = lookup_page_cgroup(oldpage); 3374 /* fix accounting on old pages */ 3375 lock_page_cgroup(pc); 3376 memcg = pc->mem_cgroup; 3377 mem_cgroup_charge_statistics(memcg, false, -1); 3378 ClearPageCgroupUsed(pc); 3379 unlock_page_cgroup(pc); 3380 3381 if (PageSwapBacked(oldpage)) 3382 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3383 3384 /* 3385 * Even if newpage->mapping was NULL before starting replacement, 3386 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3387 * LRU while we overwrite pc->mem_cgroup. 3388 */ 3389 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); 3390 } 3391 3392 #ifdef CONFIG_DEBUG_VM 3393 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3394 { 3395 struct page_cgroup *pc; 3396 3397 pc = lookup_page_cgroup(page); 3398 /* 3399 * Can be NULL while feeding pages into the page allocator for 3400 * the first time, i.e. during boot or memory hotplug; 3401 * or when mem_cgroup_disabled(). 3402 */ 3403 if (likely(pc) && PageCgroupUsed(pc)) 3404 return pc; 3405 return NULL; 3406 } 3407 3408 bool mem_cgroup_bad_page_check(struct page *page) 3409 { 3410 if (mem_cgroup_disabled()) 3411 return false; 3412 3413 return lookup_page_cgroup_used(page) != NULL; 3414 } 3415 3416 void mem_cgroup_print_bad_page(struct page *page) 3417 { 3418 struct page_cgroup *pc; 3419 3420 pc = lookup_page_cgroup_used(page); 3421 if (pc) { 3422 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 3423 pc, pc->flags, pc->mem_cgroup); 3424 } 3425 } 3426 #endif 3427 3428 static DEFINE_MUTEX(set_limit_mutex); 3429 3430 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3431 unsigned long long val) 3432 { 3433 int retry_count; 3434 u64 memswlimit, memlimit; 3435 int ret = 0; 3436 int children = mem_cgroup_count_children(memcg); 3437 u64 curusage, oldusage; 3438 int enlarge; 3439 3440 /* 3441 * For keeping hierarchical_reclaim simple, how long we should retry 3442 * is depends on callers. We set our retry-count to be function 3443 * of # of children which we should visit in this loop. 3444 */ 3445 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3446 3447 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3448 3449 enlarge = 0; 3450 while (retry_count) { 3451 if (signal_pending(current)) { 3452 ret = -EINTR; 3453 break; 3454 } 3455 /* 3456 * Rather than hide all in some function, I do this in 3457 * open coded manner. You see what this really does. 3458 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3459 */ 3460 mutex_lock(&set_limit_mutex); 3461 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3462 if (memswlimit < val) { 3463 ret = -EINVAL; 3464 mutex_unlock(&set_limit_mutex); 3465 break; 3466 } 3467 3468 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3469 if (memlimit < val) 3470 enlarge = 1; 3471 3472 ret = res_counter_set_limit(&memcg->res, val); 3473 if (!ret) { 3474 if (memswlimit == val) 3475 memcg->memsw_is_minimum = true; 3476 else 3477 memcg->memsw_is_minimum = false; 3478 } 3479 mutex_unlock(&set_limit_mutex); 3480 3481 if (!ret) 3482 break; 3483 3484 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3485 MEM_CGROUP_RECLAIM_SHRINK); 3486 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3487 /* Usage is reduced ? */ 3488 if (curusage >= oldusage) 3489 retry_count--; 3490 else 3491 oldusage = curusage; 3492 } 3493 if (!ret && enlarge) 3494 memcg_oom_recover(memcg); 3495 3496 return ret; 3497 } 3498 3499 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3500 unsigned long long val) 3501 { 3502 int retry_count; 3503 u64 memlimit, memswlimit, oldusage, curusage; 3504 int children = mem_cgroup_count_children(memcg); 3505 int ret = -EBUSY; 3506 int enlarge = 0; 3507 3508 /* see mem_cgroup_resize_res_limit */ 3509 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3510 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3511 while (retry_count) { 3512 if (signal_pending(current)) { 3513 ret = -EINTR; 3514 break; 3515 } 3516 /* 3517 * Rather than hide all in some function, I do this in 3518 * open coded manner. You see what this really does. 3519 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3520 */ 3521 mutex_lock(&set_limit_mutex); 3522 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3523 if (memlimit > val) { 3524 ret = -EINVAL; 3525 mutex_unlock(&set_limit_mutex); 3526 break; 3527 } 3528 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3529 if (memswlimit < val) 3530 enlarge = 1; 3531 ret = res_counter_set_limit(&memcg->memsw, val); 3532 if (!ret) { 3533 if (memlimit == val) 3534 memcg->memsw_is_minimum = true; 3535 else 3536 memcg->memsw_is_minimum = false; 3537 } 3538 mutex_unlock(&set_limit_mutex); 3539 3540 if (!ret) 3541 break; 3542 3543 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3544 MEM_CGROUP_RECLAIM_NOSWAP | 3545 MEM_CGROUP_RECLAIM_SHRINK); 3546 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3547 /* Usage is reduced ? */ 3548 if (curusage >= oldusage) 3549 retry_count--; 3550 else 3551 oldusage = curusage; 3552 } 3553 if (!ret && enlarge) 3554 memcg_oom_recover(memcg); 3555 return ret; 3556 } 3557 3558 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3559 gfp_t gfp_mask, 3560 unsigned long *total_scanned) 3561 { 3562 unsigned long nr_reclaimed = 0; 3563 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3564 unsigned long reclaimed; 3565 int loop = 0; 3566 struct mem_cgroup_tree_per_zone *mctz; 3567 unsigned long long excess; 3568 unsigned long nr_scanned; 3569 3570 if (order > 0) 3571 return 0; 3572 3573 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3574 /* 3575 * This loop can run a while, specially if mem_cgroup's continuously 3576 * keep exceeding their soft limit and putting the system under 3577 * pressure 3578 */ 3579 do { 3580 if (next_mz) 3581 mz = next_mz; 3582 else 3583 mz = mem_cgroup_largest_soft_limit_node(mctz); 3584 if (!mz) 3585 break; 3586 3587 nr_scanned = 0; 3588 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 3589 gfp_mask, &nr_scanned); 3590 nr_reclaimed += reclaimed; 3591 *total_scanned += nr_scanned; 3592 spin_lock(&mctz->lock); 3593 3594 /* 3595 * If we failed to reclaim anything from this memory cgroup 3596 * it is time to move on to the next cgroup 3597 */ 3598 next_mz = NULL; 3599 if (!reclaimed) { 3600 do { 3601 /* 3602 * Loop until we find yet another one. 3603 * 3604 * By the time we get the soft_limit lock 3605 * again, someone might have aded the 3606 * group back on the RB tree. Iterate to 3607 * make sure we get a different mem. 3608 * mem_cgroup_largest_soft_limit_node returns 3609 * NULL if no other cgroup is present on 3610 * the tree 3611 */ 3612 next_mz = 3613 __mem_cgroup_largest_soft_limit_node(mctz); 3614 if (next_mz == mz) 3615 css_put(&next_mz->memcg->css); 3616 else /* next_mz == NULL or other memcg */ 3617 break; 3618 } while (1); 3619 } 3620 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 3621 excess = res_counter_soft_limit_excess(&mz->memcg->res); 3622 /* 3623 * One school of thought says that we should not add 3624 * back the node to the tree if reclaim returns 0. 3625 * But our reclaim could return 0, simply because due 3626 * to priority we are exposing a smaller subset of 3627 * memory to reclaim from. Consider this as a longer 3628 * term TODO. 3629 */ 3630 /* If excess == 0, no tree ops */ 3631 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); 3632 spin_unlock(&mctz->lock); 3633 css_put(&mz->memcg->css); 3634 loop++; 3635 /* 3636 * Could not reclaim anything and there are no more 3637 * mem cgroups to try or we seem to be looping without 3638 * reclaiming anything. 3639 */ 3640 if (!nr_reclaimed && 3641 (next_mz == NULL || 3642 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3643 break; 3644 } while (!nr_reclaimed); 3645 if (next_mz) 3646 css_put(&next_mz->memcg->css); 3647 return nr_reclaimed; 3648 } 3649 3650 /* 3651 * This routine traverse page_cgroup in given list and drop them all. 3652 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3653 */ 3654 static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3655 int node, int zid, enum lru_list lru) 3656 { 3657 struct mem_cgroup_per_zone *mz; 3658 unsigned long flags, loop; 3659 struct list_head *list; 3660 struct page *busy; 3661 struct zone *zone; 3662 int ret = 0; 3663 3664 zone = &NODE_DATA(node)->node_zones[zid]; 3665 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3666 list = &mz->lruvec.lists[lru]; 3667 3668 loop = mz->lru_size[lru]; 3669 /* give some margin against EBUSY etc...*/ 3670 loop += 256; 3671 busy = NULL; 3672 while (loop--) { 3673 struct page_cgroup *pc; 3674 struct page *page; 3675 3676 ret = 0; 3677 spin_lock_irqsave(&zone->lru_lock, flags); 3678 if (list_empty(list)) { 3679 spin_unlock_irqrestore(&zone->lru_lock, flags); 3680 break; 3681 } 3682 page = list_entry(list->prev, struct page, lru); 3683 if (busy == page) { 3684 list_move(&page->lru, list); 3685 busy = NULL; 3686 spin_unlock_irqrestore(&zone->lru_lock, flags); 3687 continue; 3688 } 3689 spin_unlock_irqrestore(&zone->lru_lock, flags); 3690 3691 pc = lookup_page_cgroup(page); 3692 3693 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3694 if (ret == -ENOMEM || ret == -EINTR) 3695 break; 3696 3697 if (ret == -EBUSY || ret == -EINVAL) { 3698 /* found lock contention or "pc" is obsolete. */ 3699 busy = page; 3700 cond_resched(); 3701 } else 3702 busy = NULL; 3703 } 3704 3705 if (!ret && !list_empty(list)) 3706 return -EBUSY; 3707 return ret; 3708 } 3709 3710 /* 3711 * make mem_cgroup's charge to be 0 if there is no task. 3712 * This enables deleting this mem_cgroup. 3713 */ 3714 static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) 3715 { 3716 int ret; 3717 int node, zid, shrink; 3718 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3719 struct cgroup *cgrp = memcg->css.cgroup; 3720 3721 css_get(&memcg->css); 3722 3723 shrink = 0; 3724 /* should free all ? */ 3725 if (free_all) 3726 goto try_to_free; 3727 move_account: 3728 do { 3729 ret = -EBUSY; 3730 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3731 goto out; 3732 ret = -EINTR; 3733 if (signal_pending(current)) 3734 goto out; 3735 /* This is for making all *used* pages to be on LRU. */ 3736 lru_add_drain_all(); 3737 drain_all_stock_sync(memcg); 3738 ret = 0; 3739 mem_cgroup_start_move(memcg); 3740 for_each_node_state(node, N_HIGH_MEMORY) { 3741 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3742 enum lru_list lru; 3743 for_each_lru(lru) { 3744 ret = mem_cgroup_force_empty_list(memcg, 3745 node, zid, lru); 3746 if (ret) 3747 break; 3748 } 3749 } 3750 if (ret) 3751 break; 3752 } 3753 mem_cgroup_end_move(memcg); 3754 memcg_oom_recover(memcg); 3755 /* it seems parent cgroup doesn't have enough mem */ 3756 if (ret == -ENOMEM) 3757 goto try_to_free; 3758 cond_resched(); 3759 /* "ret" should also be checked to ensure all lists are empty. */ 3760 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); 3761 out: 3762 css_put(&memcg->css); 3763 return ret; 3764 3765 try_to_free: 3766 /* returns EBUSY if there is a task or if we come here twice. */ 3767 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3768 ret = -EBUSY; 3769 goto out; 3770 } 3771 /* we call try-to-free pages for make this cgroup empty */ 3772 lru_add_drain_all(); 3773 /* try to free all pages in this cgroup */ 3774 shrink = 1; 3775 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 3776 int progress; 3777 3778 if (signal_pending(current)) { 3779 ret = -EINTR; 3780 goto out; 3781 } 3782 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 3783 false); 3784 if (!progress) { 3785 nr_retries--; 3786 /* maybe some writeback is necessary */ 3787 congestion_wait(BLK_RW_ASYNC, HZ/10); 3788 } 3789 3790 } 3791 lru_add_drain(); 3792 /* try move_account...there may be some *locked* pages. */ 3793 goto move_account; 3794 } 3795 3796 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3797 { 3798 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3799 } 3800 3801 3802 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3803 { 3804 return mem_cgroup_from_cont(cont)->use_hierarchy; 3805 } 3806 3807 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3808 u64 val) 3809 { 3810 int retval = 0; 3811 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3812 struct cgroup *parent = cont->parent; 3813 struct mem_cgroup *parent_memcg = NULL; 3814 3815 if (parent) 3816 parent_memcg = mem_cgroup_from_cont(parent); 3817 3818 cgroup_lock(); 3819 /* 3820 * If parent's use_hierarchy is set, we can't make any modifications 3821 * in the child subtrees. If it is unset, then the change can 3822 * occur, provided the current cgroup has no children. 3823 * 3824 * For the root cgroup, parent_mem is NULL, we allow value to be 3825 * set if there are no children. 3826 */ 3827 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3828 (val == 1 || val == 0)) { 3829 if (list_empty(&cont->children)) 3830 memcg->use_hierarchy = val; 3831 else 3832 retval = -EBUSY; 3833 } else 3834 retval = -EINVAL; 3835 cgroup_unlock(); 3836 3837 return retval; 3838 } 3839 3840 3841 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 3842 enum mem_cgroup_stat_index idx) 3843 { 3844 struct mem_cgroup *iter; 3845 long val = 0; 3846 3847 /* Per-cpu values can be negative, use a signed accumulator */ 3848 for_each_mem_cgroup_tree(iter, memcg) 3849 val += mem_cgroup_read_stat(iter, idx); 3850 3851 if (val < 0) /* race ? */ 3852 val = 0; 3853 return val; 3854 } 3855 3856 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3857 { 3858 u64 val; 3859 3860 if (!mem_cgroup_is_root(memcg)) { 3861 if (!swap) 3862 return res_counter_read_u64(&memcg->res, RES_USAGE); 3863 else 3864 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 3865 } 3866 3867 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 3868 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3869 3870 if (swap) 3871 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3872 3873 return val << PAGE_SHIFT; 3874 } 3875 3876 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3877 { 3878 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3879 u64 val; 3880 int type, name; 3881 3882 type = MEMFILE_TYPE(cft->private); 3883 name = MEMFILE_ATTR(cft->private); 3884 switch (type) { 3885 case _MEM: 3886 if (name == RES_USAGE) 3887 val = mem_cgroup_usage(memcg, false); 3888 else 3889 val = res_counter_read_u64(&memcg->res, name); 3890 break; 3891 case _MEMSWAP: 3892 if (name == RES_USAGE) 3893 val = mem_cgroup_usage(memcg, true); 3894 else 3895 val = res_counter_read_u64(&memcg->memsw, name); 3896 break; 3897 default: 3898 BUG(); 3899 } 3900 return val; 3901 } 3902 /* 3903 * The user of this function is... 3904 * RES_LIMIT. 3905 */ 3906 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3907 const char *buffer) 3908 { 3909 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3910 int type, name; 3911 unsigned long long val; 3912 int ret; 3913 3914 type = MEMFILE_TYPE(cft->private); 3915 name = MEMFILE_ATTR(cft->private); 3916 switch (name) { 3917 case RES_LIMIT: 3918 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3919 ret = -EINVAL; 3920 break; 3921 } 3922 /* This function does all necessary parse...reuse it */ 3923 ret = res_counter_memparse_write_strategy(buffer, &val); 3924 if (ret) 3925 break; 3926 if (type == _MEM) 3927 ret = mem_cgroup_resize_limit(memcg, val); 3928 else 3929 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3930 break; 3931 case RES_SOFT_LIMIT: 3932 ret = res_counter_memparse_write_strategy(buffer, &val); 3933 if (ret) 3934 break; 3935 /* 3936 * For memsw, soft limits are hard to implement in terms 3937 * of semantics, for now, we support soft limits for 3938 * control without swap 3939 */ 3940 if (type == _MEM) 3941 ret = res_counter_set_soft_limit(&memcg->res, val); 3942 else 3943 ret = -EINVAL; 3944 break; 3945 default: 3946 ret = -EINVAL; /* should be BUG() ? */ 3947 break; 3948 } 3949 return ret; 3950 } 3951 3952 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3953 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3954 { 3955 struct cgroup *cgroup; 3956 unsigned long long min_limit, min_memsw_limit, tmp; 3957 3958 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3959 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3960 cgroup = memcg->css.cgroup; 3961 if (!memcg->use_hierarchy) 3962 goto out; 3963 3964 while (cgroup->parent) { 3965 cgroup = cgroup->parent; 3966 memcg = mem_cgroup_from_cont(cgroup); 3967 if (!memcg->use_hierarchy) 3968 break; 3969 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3970 min_limit = min(min_limit, tmp); 3971 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3972 min_memsw_limit = min(min_memsw_limit, tmp); 3973 } 3974 out: 3975 *mem_limit = min_limit; 3976 *memsw_limit = min_memsw_limit; 3977 } 3978 3979 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3980 { 3981 struct mem_cgroup *memcg; 3982 int type, name; 3983 3984 memcg = mem_cgroup_from_cont(cont); 3985 type = MEMFILE_TYPE(event); 3986 name = MEMFILE_ATTR(event); 3987 switch (name) { 3988 case RES_MAX_USAGE: 3989 if (type == _MEM) 3990 res_counter_reset_max(&memcg->res); 3991 else 3992 res_counter_reset_max(&memcg->memsw); 3993 break; 3994 case RES_FAILCNT: 3995 if (type == _MEM) 3996 res_counter_reset_failcnt(&memcg->res); 3997 else 3998 res_counter_reset_failcnt(&memcg->memsw); 3999 break; 4000 } 4001 4002 return 0; 4003 } 4004 4005 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 4006 struct cftype *cft) 4007 { 4008 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 4009 } 4010 4011 #ifdef CONFIG_MMU 4012 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4013 struct cftype *cft, u64 val) 4014 { 4015 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4016 4017 if (val >= (1 << NR_MOVE_TYPE)) 4018 return -EINVAL; 4019 /* 4020 * We check this value several times in both in can_attach() and 4021 * attach(), so we need cgroup lock to prevent this value from being 4022 * inconsistent. 4023 */ 4024 cgroup_lock(); 4025 memcg->move_charge_at_immigrate = val; 4026 cgroup_unlock(); 4027 4028 return 0; 4029 } 4030 #else 4031 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4032 struct cftype *cft, u64 val) 4033 { 4034 return -ENOSYS; 4035 } 4036 #endif 4037 4038 4039 /* For read statistics */ 4040 enum { 4041 MCS_CACHE, 4042 MCS_RSS, 4043 MCS_FILE_MAPPED, 4044 MCS_PGPGIN, 4045 MCS_PGPGOUT, 4046 MCS_SWAP, 4047 MCS_PGFAULT, 4048 MCS_PGMAJFAULT, 4049 MCS_INACTIVE_ANON, 4050 MCS_ACTIVE_ANON, 4051 MCS_INACTIVE_FILE, 4052 MCS_ACTIVE_FILE, 4053 MCS_UNEVICTABLE, 4054 NR_MCS_STAT, 4055 }; 4056 4057 struct mcs_total_stat { 4058 s64 stat[NR_MCS_STAT]; 4059 }; 4060 4061 struct { 4062 char *local_name; 4063 char *total_name; 4064 } memcg_stat_strings[NR_MCS_STAT] = { 4065 {"cache", "total_cache"}, 4066 {"rss", "total_rss"}, 4067 {"mapped_file", "total_mapped_file"}, 4068 {"pgpgin", "total_pgpgin"}, 4069 {"pgpgout", "total_pgpgout"}, 4070 {"swap", "total_swap"}, 4071 {"pgfault", "total_pgfault"}, 4072 {"pgmajfault", "total_pgmajfault"}, 4073 {"inactive_anon", "total_inactive_anon"}, 4074 {"active_anon", "total_active_anon"}, 4075 {"inactive_file", "total_inactive_file"}, 4076 {"active_file", "total_active_file"}, 4077 {"unevictable", "total_unevictable"} 4078 }; 4079 4080 4081 static void 4082 mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) 4083 { 4084 s64 val; 4085 4086 /* per cpu stat */ 4087 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); 4088 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4089 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); 4090 s->stat[MCS_RSS] += val * PAGE_SIZE; 4091 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 4092 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4093 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); 4094 s->stat[MCS_PGPGIN] += val; 4095 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); 4096 s->stat[MCS_PGPGOUT] += val; 4097 if (do_swap_account) { 4098 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 4099 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4100 } 4101 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); 4102 s->stat[MCS_PGFAULT] += val; 4103 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); 4104 s->stat[MCS_PGMAJFAULT] += val; 4105 4106 /* per zone stat */ 4107 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); 4108 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4109 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); 4110 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4111 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); 4112 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4113 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); 4114 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4115 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 4116 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4117 } 4118 4119 static void 4120 mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) 4121 { 4122 struct mem_cgroup *iter; 4123 4124 for_each_mem_cgroup_tree(iter, memcg) 4125 mem_cgroup_get_local_stat(iter, s); 4126 } 4127 4128 #ifdef CONFIG_NUMA 4129 static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4130 { 4131 int nid; 4132 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4133 unsigned long node_nr; 4134 struct cgroup *cont = m->private; 4135 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4136 4137 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 4138 seq_printf(m, "total=%lu", total_nr); 4139 for_each_node_state(nid, N_HIGH_MEMORY) { 4140 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 4141 seq_printf(m, " N%d=%lu", nid, node_nr); 4142 } 4143 seq_putc(m, '\n'); 4144 4145 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 4146 seq_printf(m, "file=%lu", file_nr); 4147 for_each_node_state(nid, N_HIGH_MEMORY) { 4148 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4149 LRU_ALL_FILE); 4150 seq_printf(m, " N%d=%lu", nid, node_nr); 4151 } 4152 seq_putc(m, '\n'); 4153 4154 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 4155 seq_printf(m, "anon=%lu", anon_nr); 4156 for_each_node_state(nid, N_HIGH_MEMORY) { 4157 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4158 LRU_ALL_ANON); 4159 seq_printf(m, " N%d=%lu", nid, node_nr); 4160 } 4161 seq_putc(m, '\n'); 4162 4163 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 4164 seq_printf(m, "unevictable=%lu", unevictable_nr); 4165 for_each_node_state(nid, N_HIGH_MEMORY) { 4166 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4167 BIT(LRU_UNEVICTABLE)); 4168 seq_printf(m, " N%d=%lu", nid, node_nr); 4169 } 4170 seq_putc(m, '\n'); 4171 return 0; 4172 } 4173 #endif /* CONFIG_NUMA */ 4174 4175 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4176 struct cgroup_map_cb *cb) 4177 { 4178 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4179 struct mcs_total_stat mystat; 4180 int i; 4181 4182 memset(&mystat, 0, sizeof(mystat)); 4183 mem_cgroup_get_local_stat(memcg, &mystat); 4184 4185 4186 for (i = 0; i < NR_MCS_STAT; i++) { 4187 if (i == MCS_SWAP && !do_swap_account) 4188 continue; 4189 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4190 } 4191 4192 /* Hierarchical information */ 4193 { 4194 unsigned long long limit, memsw_limit; 4195 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4196 cb->fill(cb, "hierarchical_memory_limit", limit); 4197 if (do_swap_account) 4198 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4199 } 4200 4201 memset(&mystat, 0, sizeof(mystat)); 4202 mem_cgroup_get_total_stat(memcg, &mystat); 4203 for (i = 0; i < NR_MCS_STAT; i++) { 4204 if (i == MCS_SWAP && !do_swap_account) 4205 continue; 4206 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4207 } 4208 4209 #ifdef CONFIG_DEBUG_VM 4210 { 4211 int nid, zid; 4212 struct mem_cgroup_per_zone *mz; 4213 unsigned long recent_rotated[2] = {0, 0}; 4214 unsigned long recent_scanned[2] = {0, 0}; 4215 4216 for_each_online_node(nid) 4217 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4218 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 4219 4220 recent_rotated[0] += 4221 mz->reclaim_stat.recent_rotated[0]; 4222 recent_rotated[1] += 4223 mz->reclaim_stat.recent_rotated[1]; 4224 recent_scanned[0] += 4225 mz->reclaim_stat.recent_scanned[0]; 4226 recent_scanned[1] += 4227 mz->reclaim_stat.recent_scanned[1]; 4228 } 4229 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4230 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4231 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4232 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4233 } 4234 #endif 4235 4236 return 0; 4237 } 4238 4239 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 4240 { 4241 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4242 4243 return mem_cgroup_swappiness(memcg); 4244 } 4245 4246 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4247 u64 val) 4248 { 4249 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4250 struct mem_cgroup *parent; 4251 4252 if (val > 100) 4253 return -EINVAL; 4254 4255 if (cgrp->parent == NULL) 4256 return -EINVAL; 4257 4258 parent = mem_cgroup_from_cont(cgrp->parent); 4259 4260 cgroup_lock(); 4261 4262 /* If under hierarchy, only empty-root can set this value */ 4263 if ((parent->use_hierarchy) || 4264 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4265 cgroup_unlock(); 4266 return -EINVAL; 4267 } 4268 4269 memcg->swappiness = val; 4270 4271 cgroup_unlock(); 4272 4273 return 0; 4274 } 4275 4276 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4277 { 4278 struct mem_cgroup_threshold_ary *t; 4279 u64 usage; 4280 int i; 4281 4282 rcu_read_lock(); 4283 if (!swap) 4284 t = rcu_dereference(memcg->thresholds.primary); 4285 else 4286 t = rcu_dereference(memcg->memsw_thresholds.primary); 4287 4288 if (!t) 4289 goto unlock; 4290 4291 usage = mem_cgroup_usage(memcg, swap); 4292 4293 /* 4294 * current_threshold points to threshold just below usage. 4295 * If it's not true, a threshold was crossed after last 4296 * call of __mem_cgroup_threshold(). 4297 */ 4298 i = t->current_threshold; 4299 4300 /* 4301 * Iterate backward over array of thresholds starting from 4302 * current_threshold and check if a threshold is crossed. 4303 * If none of thresholds below usage is crossed, we read 4304 * only one element of the array here. 4305 */ 4306 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4307 eventfd_signal(t->entries[i].eventfd, 1); 4308 4309 /* i = current_threshold + 1 */ 4310 i++; 4311 4312 /* 4313 * Iterate forward over array of thresholds starting from 4314 * current_threshold+1 and check if a threshold is crossed. 4315 * If none of thresholds above usage is crossed, we read 4316 * only one element of the array here. 4317 */ 4318 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4319 eventfd_signal(t->entries[i].eventfd, 1); 4320 4321 /* Update current_threshold */ 4322 t->current_threshold = i - 1; 4323 unlock: 4324 rcu_read_unlock(); 4325 } 4326 4327 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4328 { 4329 while (memcg) { 4330 __mem_cgroup_threshold(memcg, false); 4331 if (do_swap_account) 4332 __mem_cgroup_threshold(memcg, true); 4333 4334 memcg = parent_mem_cgroup(memcg); 4335 } 4336 } 4337 4338 static int compare_thresholds(const void *a, const void *b) 4339 { 4340 const struct mem_cgroup_threshold *_a = a; 4341 const struct mem_cgroup_threshold *_b = b; 4342 4343 return _a->threshold - _b->threshold; 4344 } 4345 4346 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4347 { 4348 struct mem_cgroup_eventfd_list *ev; 4349 4350 list_for_each_entry(ev, &memcg->oom_notify, list) 4351 eventfd_signal(ev->eventfd, 1); 4352 return 0; 4353 } 4354 4355 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4356 { 4357 struct mem_cgroup *iter; 4358 4359 for_each_mem_cgroup_tree(iter, memcg) 4360 mem_cgroup_oom_notify_cb(iter); 4361 } 4362 4363 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4364 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4365 { 4366 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4367 struct mem_cgroup_thresholds *thresholds; 4368 struct mem_cgroup_threshold_ary *new; 4369 int type = MEMFILE_TYPE(cft->private); 4370 u64 threshold, usage; 4371 int i, size, ret; 4372 4373 ret = res_counter_memparse_write_strategy(args, &threshold); 4374 if (ret) 4375 return ret; 4376 4377 mutex_lock(&memcg->thresholds_lock); 4378 4379 if (type == _MEM) 4380 thresholds = &memcg->thresholds; 4381 else if (type == _MEMSWAP) 4382 thresholds = &memcg->memsw_thresholds; 4383 else 4384 BUG(); 4385 4386 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4387 4388 /* Check if a threshold crossed before adding a new one */ 4389 if (thresholds->primary) 4390 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4391 4392 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4393 4394 /* Allocate memory for new array of thresholds */ 4395 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4396 GFP_KERNEL); 4397 if (!new) { 4398 ret = -ENOMEM; 4399 goto unlock; 4400 } 4401 new->size = size; 4402 4403 /* Copy thresholds (if any) to new array */ 4404 if (thresholds->primary) { 4405 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4406 sizeof(struct mem_cgroup_threshold)); 4407 } 4408 4409 /* Add new threshold */ 4410 new->entries[size - 1].eventfd = eventfd; 4411 new->entries[size - 1].threshold = threshold; 4412 4413 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4414 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4415 compare_thresholds, NULL); 4416 4417 /* Find current threshold */ 4418 new->current_threshold = -1; 4419 for (i = 0; i < size; i++) { 4420 if (new->entries[i].threshold < usage) { 4421 /* 4422 * new->current_threshold will not be used until 4423 * rcu_assign_pointer(), so it's safe to increment 4424 * it here. 4425 */ 4426 ++new->current_threshold; 4427 } 4428 } 4429 4430 /* Free old spare buffer and save old primary buffer as spare */ 4431 kfree(thresholds->spare); 4432 thresholds->spare = thresholds->primary; 4433 4434 rcu_assign_pointer(thresholds->primary, new); 4435 4436 /* To be sure that nobody uses thresholds */ 4437 synchronize_rcu(); 4438 4439 unlock: 4440 mutex_unlock(&memcg->thresholds_lock); 4441 4442 return ret; 4443 } 4444 4445 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4446 struct cftype *cft, struct eventfd_ctx *eventfd) 4447 { 4448 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4449 struct mem_cgroup_thresholds *thresholds; 4450 struct mem_cgroup_threshold_ary *new; 4451 int type = MEMFILE_TYPE(cft->private); 4452 u64 usage; 4453 int i, j, size; 4454 4455 mutex_lock(&memcg->thresholds_lock); 4456 if (type == _MEM) 4457 thresholds = &memcg->thresholds; 4458 else if (type == _MEMSWAP) 4459 thresholds = &memcg->memsw_thresholds; 4460 else 4461 BUG(); 4462 4463 if (!thresholds->primary) 4464 goto unlock; 4465 4466 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4467 4468 /* Check if a threshold crossed before removing */ 4469 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4470 4471 /* Calculate new number of threshold */ 4472 size = 0; 4473 for (i = 0; i < thresholds->primary->size; i++) { 4474 if (thresholds->primary->entries[i].eventfd != eventfd) 4475 size++; 4476 } 4477 4478 new = thresholds->spare; 4479 4480 /* Set thresholds array to NULL if we don't have thresholds */ 4481 if (!size) { 4482 kfree(new); 4483 new = NULL; 4484 goto swap_buffers; 4485 } 4486 4487 new->size = size; 4488 4489 /* Copy thresholds and find current threshold */ 4490 new->current_threshold = -1; 4491 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4492 if (thresholds->primary->entries[i].eventfd == eventfd) 4493 continue; 4494 4495 new->entries[j] = thresholds->primary->entries[i]; 4496 if (new->entries[j].threshold < usage) { 4497 /* 4498 * new->current_threshold will not be used 4499 * until rcu_assign_pointer(), so it's safe to increment 4500 * it here. 4501 */ 4502 ++new->current_threshold; 4503 } 4504 j++; 4505 } 4506 4507 swap_buffers: 4508 /* Swap primary and spare array */ 4509 thresholds->spare = thresholds->primary; 4510 /* If all events are unregistered, free the spare array */ 4511 if (!new) { 4512 kfree(thresholds->spare); 4513 thresholds->spare = NULL; 4514 } 4515 4516 rcu_assign_pointer(thresholds->primary, new); 4517 4518 /* To be sure that nobody uses thresholds */ 4519 synchronize_rcu(); 4520 unlock: 4521 mutex_unlock(&memcg->thresholds_lock); 4522 } 4523 4524 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4525 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4526 { 4527 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4528 struct mem_cgroup_eventfd_list *event; 4529 int type = MEMFILE_TYPE(cft->private); 4530 4531 BUG_ON(type != _OOM_TYPE); 4532 event = kmalloc(sizeof(*event), GFP_KERNEL); 4533 if (!event) 4534 return -ENOMEM; 4535 4536 spin_lock(&memcg_oom_lock); 4537 4538 event->eventfd = eventfd; 4539 list_add(&event->list, &memcg->oom_notify); 4540 4541 /* already in OOM ? */ 4542 if (atomic_read(&memcg->under_oom)) 4543 eventfd_signal(eventfd, 1); 4544 spin_unlock(&memcg_oom_lock); 4545 4546 return 0; 4547 } 4548 4549 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4550 struct cftype *cft, struct eventfd_ctx *eventfd) 4551 { 4552 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4553 struct mem_cgroup_eventfd_list *ev, *tmp; 4554 int type = MEMFILE_TYPE(cft->private); 4555 4556 BUG_ON(type != _OOM_TYPE); 4557 4558 spin_lock(&memcg_oom_lock); 4559 4560 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4561 if (ev->eventfd == eventfd) { 4562 list_del(&ev->list); 4563 kfree(ev); 4564 } 4565 } 4566 4567 spin_unlock(&memcg_oom_lock); 4568 } 4569 4570 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4571 struct cftype *cft, struct cgroup_map_cb *cb) 4572 { 4573 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4574 4575 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 4576 4577 if (atomic_read(&memcg->under_oom)) 4578 cb->fill(cb, "under_oom", 1); 4579 else 4580 cb->fill(cb, "under_oom", 0); 4581 return 0; 4582 } 4583 4584 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4585 struct cftype *cft, u64 val) 4586 { 4587 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4588 struct mem_cgroup *parent; 4589 4590 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4591 if (!cgrp->parent || !((val == 0) || (val == 1))) 4592 return -EINVAL; 4593 4594 parent = mem_cgroup_from_cont(cgrp->parent); 4595 4596 cgroup_lock(); 4597 /* oom-kill-disable is a flag for subhierarchy. */ 4598 if ((parent->use_hierarchy) || 4599 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4600 cgroup_unlock(); 4601 return -EINVAL; 4602 } 4603 memcg->oom_kill_disable = val; 4604 if (!val) 4605 memcg_oom_recover(memcg); 4606 cgroup_unlock(); 4607 return 0; 4608 } 4609 4610 #ifdef CONFIG_NUMA 4611 static const struct file_operations mem_control_numa_stat_file_operations = { 4612 .read = seq_read, 4613 .llseek = seq_lseek, 4614 .release = single_release, 4615 }; 4616 4617 static int mem_control_numa_stat_open(struct inode *unused, struct file *file) 4618 { 4619 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4620 4621 file->f_op = &mem_control_numa_stat_file_operations; 4622 return single_open(file, mem_control_numa_stat_show, cont); 4623 } 4624 #endif /* CONFIG_NUMA */ 4625 4626 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4627 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4628 { 4629 /* 4630 * Part of this would be better living in a separate allocation 4631 * function, leaving us with just the cgroup tree population work. 4632 * We, however, depend on state such as network's proto_list that 4633 * is only initialized after cgroup creation. I found the less 4634 * cumbersome way to deal with it to defer it all to populate time 4635 */ 4636 return mem_cgroup_sockets_init(cont, ss); 4637 }; 4638 4639 static void kmem_cgroup_destroy(struct cgroup *cont) 4640 { 4641 mem_cgroup_sockets_destroy(cont); 4642 } 4643 #else 4644 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4645 { 4646 return 0; 4647 } 4648 4649 static void kmem_cgroup_destroy(struct cgroup *cont) 4650 { 4651 } 4652 #endif 4653 4654 static struct cftype mem_cgroup_files[] = { 4655 { 4656 .name = "usage_in_bytes", 4657 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4658 .read_u64 = mem_cgroup_read, 4659 .register_event = mem_cgroup_usage_register_event, 4660 .unregister_event = mem_cgroup_usage_unregister_event, 4661 }, 4662 { 4663 .name = "max_usage_in_bytes", 4664 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4665 .trigger = mem_cgroup_reset, 4666 .read_u64 = mem_cgroup_read, 4667 }, 4668 { 4669 .name = "limit_in_bytes", 4670 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4671 .write_string = mem_cgroup_write, 4672 .read_u64 = mem_cgroup_read, 4673 }, 4674 { 4675 .name = "soft_limit_in_bytes", 4676 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4677 .write_string = mem_cgroup_write, 4678 .read_u64 = mem_cgroup_read, 4679 }, 4680 { 4681 .name = "failcnt", 4682 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4683 .trigger = mem_cgroup_reset, 4684 .read_u64 = mem_cgroup_read, 4685 }, 4686 { 4687 .name = "stat", 4688 .read_map = mem_control_stat_show, 4689 }, 4690 { 4691 .name = "force_empty", 4692 .trigger = mem_cgroup_force_empty_write, 4693 }, 4694 { 4695 .name = "use_hierarchy", 4696 .write_u64 = mem_cgroup_hierarchy_write, 4697 .read_u64 = mem_cgroup_hierarchy_read, 4698 }, 4699 { 4700 .name = "swappiness", 4701 .read_u64 = mem_cgroup_swappiness_read, 4702 .write_u64 = mem_cgroup_swappiness_write, 4703 }, 4704 { 4705 .name = "move_charge_at_immigrate", 4706 .read_u64 = mem_cgroup_move_charge_read, 4707 .write_u64 = mem_cgroup_move_charge_write, 4708 }, 4709 { 4710 .name = "oom_control", 4711 .read_map = mem_cgroup_oom_control_read, 4712 .write_u64 = mem_cgroup_oom_control_write, 4713 .register_event = mem_cgroup_oom_register_event, 4714 .unregister_event = mem_cgroup_oom_unregister_event, 4715 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4716 }, 4717 #ifdef CONFIG_NUMA 4718 { 4719 .name = "numa_stat", 4720 .open = mem_control_numa_stat_open, 4721 .mode = S_IRUGO, 4722 }, 4723 #endif 4724 }; 4725 4726 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4727 static struct cftype memsw_cgroup_files[] = { 4728 { 4729 .name = "memsw.usage_in_bytes", 4730 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4731 .read_u64 = mem_cgroup_read, 4732 .register_event = mem_cgroup_usage_register_event, 4733 .unregister_event = mem_cgroup_usage_unregister_event, 4734 }, 4735 { 4736 .name = "memsw.max_usage_in_bytes", 4737 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4738 .trigger = mem_cgroup_reset, 4739 .read_u64 = mem_cgroup_read, 4740 }, 4741 { 4742 .name = "memsw.limit_in_bytes", 4743 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4744 .write_string = mem_cgroup_write, 4745 .read_u64 = mem_cgroup_read, 4746 }, 4747 { 4748 .name = "memsw.failcnt", 4749 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4750 .trigger = mem_cgroup_reset, 4751 .read_u64 = mem_cgroup_read, 4752 }, 4753 }; 4754 4755 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4756 { 4757 if (!do_swap_account) 4758 return 0; 4759 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4760 ARRAY_SIZE(memsw_cgroup_files)); 4761 }; 4762 #else 4763 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4764 { 4765 return 0; 4766 } 4767 #endif 4768 4769 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4770 { 4771 struct mem_cgroup_per_node *pn; 4772 struct mem_cgroup_per_zone *mz; 4773 enum lru_list lru; 4774 int zone, tmp = node; 4775 /* 4776 * This routine is called against possible nodes. 4777 * But it's BUG to call kmalloc() against offline node. 4778 * 4779 * TODO: this routine can waste much memory for nodes which will 4780 * never be onlined. It's better to use memory hotplug callback 4781 * function. 4782 */ 4783 if (!node_state(node, N_NORMAL_MEMORY)) 4784 tmp = -1; 4785 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4786 if (!pn) 4787 return 1; 4788 4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4790 mz = &pn->zoneinfo[zone]; 4791 for_each_lru(lru) 4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]); 4793 mz->usage_in_excess = 0; 4794 mz->on_tree = false; 4795 mz->memcg = memcg; 4796 } 4797 memcg->info.nodeinfo[node] = pn; 4798 return 0; 4799 } 4800 4801 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4802 { 4803 kfree(memcg->info.nodeinfo[node]); 4804 } 4805 4806 static struct mem_cgroup *mem_cgroup_alloc(void) 4807 { 4808 struct mem_cgroup *memcg; 4809 int size = sizeof(struct mem_cgroup); 4810 4811 /* Can be very big if MAX_NUMNODES is very big */ 4812 if (size < PAGE_SIZE) 4813 memcg = kzalloc(size, GFP_KERNEL); 4814 else 4815 memcg = vzalloc(size); 4816 4817 if (!memcg) 4818 return NULL; 4819 4820 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4821 if (!memcg->stat) 4822 goto out_free; 4823 spin_lock_init(&memcg->pcp_counter_lock); 4824 return memcg; 4825 4826 out_free: 4827 if (size < PAGE_SIZE) 4828 kfree(memcg); 4829 else 4830 vfree(memcg); 4831 return NULL; 4832 } 4833 4834 /* 4835 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, 4836 * but in process context. The work_freeing structure is overlaid 4837 * on the rcu_freeing structure, which itself is overlaid on memsw. 4838 */ 4839 static void vfree_work(struct work_struct *work) 4840 { 4841 struct mem_cgroup *memcg; 4842 4843 memcg = container_of(work, struct mem_cgroup, work_freeing); 4844 vfree(memcg); 4845 } 4846 static void vfree_rcu(struct rcu_head *rcu_head) 4847 { 4848 struct mem_cgroup *memcg; 4849 4850 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); 4851 INIT_WORK(&memcg->work_freeing, vfree_work); 4852 schedule_work(&memcg->work_freeing); 4853 } 4854 4855 /* 4856 * At destroying mem_cgroup, references from swap_cgroup can remain. 4857 * (scanning all at force_empty is too costly...) 4858 * 4859 * Instead of clearing all references at force_empty, we remember 4860 * the number of reference from swap_cgroup and free mem_cgroup when 4861 * it goes down to 0. 4862 * 4863 * Removal of cgroup itself succeeds regardless of refs from swap. 4864 */ 4865 4866 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4867 { 4868 int node; 4869 4870 mem_cgroup_remove_from_trees(memcg); 4871 free_css_id(&mem_cgroup_subsys, &memcg->css); 4872 4873 for_each_node(node) 4874 free_mem_cgroup_per_zone_info(memcg, node); 4875 4876 free_percpu(memcg->stat); 4877 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4878 kfree_rcu(memcg, rcu_freeing); 4879 else 4880 call_rcu(&memcg->rcu_freeing, vfree_rcu); 4881 } 4882 4883 static void mem_cgroup_get(struct mem_cgroup *memcg) 4884 { 4885 atomic_inc(&memcg->refcnt); 4886 } 4887 4888 static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) 4889 { 4890 if (atomic_sub_and_test(count, &memcg->refcnt)) { 4891 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 4892 __mem_cgroup_free(memcg); 4893 if (parent) 4894 mem_cgroup_put(parent); 4895 } 4896 } 4897 4898 static void mem_cgroup_put(struct mem_cgroup *memcg) 4899 { 4900 __mem_cgroup_put(memcg, 1); 4901 } 4902 4903 /* 4904 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4905 */ 4906 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4907 { 4908 if (!memcg->res.parent) 4909 return NULL; 4910 return mem_cgroup_from_res_counter(memcg->res.parent, res); 4911 } 4912 EXPORT_SYMBOL(parent_mem_cgroup); 4913 4914 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4915 static void __init enable_swap_cgroup(void) 4916 { 4917 if (!mem_cgroup_disabled() && really_do_swap_account) 4918 do_swap_account = 1; 4919 } 4920 #else 4921 static void __init enable_swap_cgroup(void) 4922 { 4923 } 4924 #endif 4925 4926 static int mem_cgroup_soft_limit_tree_init(void) 4927 { 4928 struct mem_cgroup_tree_per_node *rtpn; 4929 struct mem_cgroup_tree_per_zone *rtpz; 4930 int tmp, node, zone; 4931 4932 for_each_node(node) { 4933 tmp = node; 4934 if (!node_state(node, N_NORMAL_MEMORY)) 4935 tmp = -1; 4936 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4937 if (!rtpn) 4938 goto err_cleanup; 4939 4940 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4941 4942 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4943 rtpz = &rtpn->rb_tree_per_zone[zone]; 4944 rtpz->rb_root = RB_ROOT; 4945 spin_lock_init(&rtpz->lock); 4946 } 4947 } 4948 return 0; 4949 4950 err_cleanup: 4951 for_each_node(node) { 4952 if (!soft_limit_tree.rb_tree_per_node[node]) 4953 break; 4954 kfree(soft_limit_tree.rb_tree_per_node[node]); 4955 soft_limit_tree.rb_tree_per_node[node] = NULL; 4956 } 4957 return 1; 4958 4959 } 4960 4961 static struct cgroup_subsys_state * __ref 4962 mem_cgroup_create(struct cgroup *cont) 4963 { 4964 struct mem_cgroup *memcg, *parent; 4965 long error = -ENOMEM; 4966 int node; 4967 4968 memcg = mem_cgroup_alloc(); 4969 if (!memcg) 4970 return ERR_PTR(error); 4971 4972 for_each_node(node) 4973 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4974 goto free_out; 4975 4976 /* root ? */ 4977 if (cont->parent == NULL) { 4978 int cpu; 4979 enable_swap_cgroup(); 4980 parent = NULL; 4981 if (mem_cgroup_soft_limit_tree_init()) 4982 goto free_out; 4983 root_mem_cgroup = memcg; 4984 for_each_possible_cpu(cpu) { 4985 struct memcg_stock_pcp *stock = 4986 &per_cpu(memcg_stock, cpu); 4987 INIT_WORK(&stock->work, drain_local_stock); 4988 } 4989 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4990 } else { 4991 parent = mem_cgroup_from_cont(cont->parent); 4992 memcg->use_hierarchy = parent->use_hierarchy; 4993 memcg->oom_kill_disable = parent->oom_kill_disable; 4994 } 4995 4996 if (parent && parent->use_hierarchy) { 4997 res_counter_init(&memcg->res, &parent->res); 4998 res_counter_init(&memcg->memsw, &parent->memsw); 4999 /* 5000 * We increment refcnt of the parent to ensure that we can 5001 * safely access it on res_counter_charge/uncharge. 5002 * This refcnt will be decremented when freeing this 5003 * mem_cgroup(see mem_cgroup_put). 5004 */ 5005 mem_cgroup_get(parent); 5006 } else { 5007 res_counter_init(&memcg->res, NULL); 5008 res_counter_init(&memcg->memsw, NULL); 5009 } 5010 memcg->last_scanned_node = MAX_NUMNODES; 5011 INIT_LIST_HEAD(&memcg->oom_notify); 5012 5013 if (parent) 5014 memcg->swappiness = mem_cgroup_swappiness(parent); 5015 atomic_set(&memcg->refcnt, 1); 5016 memcg->move_charge_at_immigrate = 0; 5017 mutex_init(&memcg->thresholds_lock); 5018 spin_lock_init(&memcg->move_lock); 5019 return &memcg->css; 5020 free_out: 5021 __mem_cgroup_free(memcg); 5022 return ERR_PTR(error); 5023 } 5024 5025 static int mem_cgroup_pre_destroy(struct cgroup *cont) 5026 { 5027 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5028 5029 return mem_cgroup_force_empty(memcg, false); 5030 } 5031 5032 static void mem_cgroup_destroy(struct cgroup *cont) 5033 { 5034 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5035 5036 kmem_cgroup_destroy(cont); 5037 5038 mem_cgroup_put(memcg); 5039 } 5040 5041 static int mem_cgroup_populate(struct cgroup_subsys *ss, 5042 struct cgroup *cont) 5043 { 5044 int ret; 5045 5046 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 5047 ARRAY_SIZE(mem_cgroup_files)); 5048 5049 if (!ret) 5050 ret = register_memsw_files(cont, ss); 5051 5052 if (!ret) 5053 ret = register_kmem_files(cont, ss); 5054 5055 return ret; 5056 } 5057 5058 #ifdef CONFIG_MMU 5059 /* Handlers for move charge at task migration. */ 5060 #define PRECHARGE_COUNT_AT_ONCE 256 5061 static int mem_cgroup_do_precharge(unsigned long count) 5062 { 5063 int ret = 0; 5064 int batch_count = PRECHARGE_COUNT_AT_ONCE; 5065 struct mem_cgroup *memcg = mc.to; 5066 5067 if (mem_cgroup_is_root(memcg)) { 5068 mc.precharge += count; 5069 /* we don't need css_get for root */ 5070 return ret; 5071 } 5072 /* try to charge at once */ 5073 if (count > 1) { 5074 struct res_counter *dummy; 5075 /* 5076 * "memcg" cannot be under rmdir() because we've already checked 5077 * by cgroup_lock_live_cgroup() that it is not removed and we 5078 * are still under the same cgroup_mutex. So we can postpone 5079 * css_get(). 5080 */ 5081 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) 5082 goto one_by_one; 5083 if (do_swap_account && res_counter_charge(&memcg->memsw, 5084 PAGE_SIZE * count, &dummy)) { 5085 res_counter_uncharge(&memcg->res, PAGE_SIZE * count); 5086 goto one_by_one; 5087 } 5088 mc.precharge += count; 5089 return ret; 5090 } 5091 one_by_one: 5092 /* fall back to one by one charge */ 5093 while (count--) { 5094 if (signal_pending(current)) { 5095 ret = -EINTR; 5096 break; 5097 } 5098 if (!batch_count--) { 5099 batch_count = PRECHARGE_COUNT_AT_ONCE; 5100 cond_resched(); 5101 } 5102 ret = __mem_cgroup_try_charge(NULL, 5103 GFP_KERNEL, 1, &memcg, false); 5104 if (ret) 5105 /* mem_cgroup_clear_mc() will do uncharge later */ 5106 return ret; 5107 mc.precharge++; 5108 } 5109 return ret; 5110 } 5111 5112 /** 5113 * get_mctgt_type - get target type of moving charge 5114 * @vma: the vma the pte to be checked belongs 5115 * @addr: the address corresponding to the pte to be checked 5116 * @ptent: the pte to be checked 5117 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5118 * 5119 * Returns 5120 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5121 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5122 * move charge. if @target is not NULL, the page is stored in target->page 5123 * with extra refcnt got(Callers should handle it). 5124 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5125 * target for charge migration. if @target is not NULL, the entry is stored 5126 * in target->ent. 5127 * 5128 * Called with pte lock held. 5129 */ 5130 union mc_target { 5131 struct page *page; 5132 swp_entry_t ent; 5133 }; 5134 5135 enum mc_target_type { 5136 MC_TARGET_NONE = 0, 5137 MC_TARGET_PAGE, 5138 MC_TARGET_SWAP, 5139 }; 5140 5141 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5142 unsigned long addr, pte_t ptent) 5143 { 5144 struct page *page = vm_normal_page(vma, addr, ptent); 5145 5146 if (!page || !page_mapped(page)) 5147 return NULL; 5148 if (PageAnon(page)) { 5149 /* we don't move shared anon */ 5150 if (!move_anon() || page_mapcount(page) > 2) 5151 return NULL; 5152 } else if (!move_file()) 5153 /* we ignore mapcount for file pages */ 5154 return NULL; 5155 if (!get_page_unless_zero(page)) 5156 return NULL; 5157 5158 return page; 5159 } 5160 5161 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5162 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5163 { 5164 int usage_count; 5165 struct page *page = NULL; 5166 swp_entry_t ent = pte_to_swp_entry(ptent); 5167 5168 if (!move_anon() || non_swap_entry(ent)) 5169 return NULL; 5170 usage_count = mem_cgroup_count_swap_user(ent, &page); 5171 if (usage_count > 1) { /* we don't move shared anon */ 5172 if (page) 5173 put_page(page); 5174 return NULL; 5175 } 5176 if (do_swap_account) 5177 entry->val = ent.val; 5178 5179 return page; 5180 } 5181 5182 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5183 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5184 { 5185 struct page *page = NULL; 5186 struct inode *inode; 5187 struct address_space *mapping; 5188 pgoff_t pgoff; 5189 5190 if (!vma->vm_file) /* anonymous vma */ 5191 return NULL; 5192 if (!move_file()) 5193 return NULL; 5194 5195 inode = vma->vm_file->f_path.dentry->d_inode; 5196 mapping = vma->vm_file->f_mapping; 5197 if (pte_none(ptent)) 5198 pgoff = linear_page_index(vma, addr); 5199 else /* pte_file(ptent) is true */ 5200 pgoff = pte_to_pgoff(ptent); 5201 5202 /* page is moved even if it's not RSS of this task(page-faulted). */ 5203 page = find_get_page(mapping, pgoff); 5204 5205 #ifdef CONFIG_SWAP 5206 /* shmem/tmpfs may report page out on swap: account for that too. */ 5207 if (radix_tree_exceptional_entry(page)) { 5208 swp_entry_t swap = radix_to_swp_entry(page); 5209 if (do_swap_account) 5210 *entry = swap; 5211 page = find_get_page(&swapper_space, swap.val); 5212 } 5213 #endif 5214 return page; 5215 } 5216 5217 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5218 unsigned long addr, pte_t ptent, union mc_target *target) 5219 { 5220 struct page *page = NULL; 5221 struct page_cgroup *pc; 5222 enum mc_target_type ret = MC_TARGET_NONE; 5223 swp_entry_t ent = { .val = 0 }; 5224 5225 if (pte_present(ptent)) 5226 page = mc_handle_present_pte(vma, addr, ptent); 5227 else if (is_swap_pte(ptent)) 5228 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5229 else if (pte_none(ptent) || pte_file(ptent)) 5230 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5231 5232 if (!page && !ent.val) 5233 return ret; 5234 if (page) { 5235 pc = lookup_page_cgroup(page); 5236 /* 5237 * Do only loose check w/o page_cgroup lock. 5238 * mem_cgroup_move_account() checks the pc is valid or not under 5239 * the lock. 5240 */ 5241 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5242 ret = MC_TARGET_PAGE; 5243 if (target) 5244 target->page = page; 5245 } 5246 if (!ret || !target) 5247 put_page(page); 5248 } 5249 /* There is a swap entry and a page doesn't exist or isn't charged */ 5250 if (ent.val && !ret && 5251 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { 5252 ret = MC_TARGET_SWAP; 5253 if (target) 5254 target->ent = ent; 5255 } 5256 return ret; 5257 } 5258 5259 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5260 /* 5261 * We don't consider swapping or file mapped pages because THP does not 5262 * support them for now. 5263 * Caller should make sure that pmd_trans_huge(pmd) is true. 5264 */ 5265 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5266 unsigned long addr, pmd_t pmd, union mc_target *target) 5267 { 5268 struct page *page = NULL; 5269 struct page_cgroup *pc; 5270 enum mc_target_type ret = MC_TARGET_NONE; 5271 5272 page = pmd_page(pmd); 5273 VM_BUG_ON(!page || !PageHead(page)); 5274 if (!move_anon()) 5275 return ret; 5276 pc = lookup_page_cgroup(page); 5277 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5278 ret = MC_TARGET_PAGE; 5279 if (target) { 5280 get_page(page); 5281 target->page = page; 5282 } 5283 } 5284 return ret; 5285 } 5286 #else 5287 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5288 unsigned long addr, pmd_t pmd, union mc_target *target) 5289 { 5290 return MC_TARGET_NONE; 5291 } 5292 #endif 5293 5294 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5295 unsigned long addr, unsigned long end, 5296 struct mm_walk *walk) 5297 { 5298 struct vm_area_struct *vma = walk->private; 5299 pte_t *pte; 5300 spinlock_t *ptl; 5301 5302 if (pmd_trans_huge_lock(pmd, vma) == 1) { 5303 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5304 mc.precharge += HPAGE_PMD_NR; 5305 spin_unlock(&vma->vm_mm->page_table_lock); 5306 return 0; 5307 } 5308 5309 if (pmd_trans_unstable(pmd)) 5310 return 0; 5311 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5312 for (; addr != end; pte++, addr += PAGE_SIZE) 5313 if (get_mctgt_type(vma, addr, *pte, NULL)) 5314 mc.precharge++; /* increment precharge temporarily */ 5315 pte_unmap_unlock(pte - 1, ptl); 5316 cond_resched(); 5317 5318 return 0; 5319 } 5320 5321 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5322 { 5323 unsigned long precharge; 5324 struct vm_area_struct *vma; 5325 5326 down_read(&mm->mmap_sem); 5327 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5328 struct mm_walk mem_cgroup_count_precharge_walk = { 5329 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5330 .mm = mm, 5331 .private = vma, 5332 }; 5333 if (is_vm_hugetlb_page(vma)) 5334 continue; 5335 walk_page_range(vma->vm_start, vma->vm_end, 5336 &mem_cgroup_count_precharge_walk); 5337 } 5338 up_read(&mm->mmap_sem); 5339 5340 precharge = mc.precharge; 5341 mc.precharge = 0; 5342 5343 return precharge; 5344 } 5345 5346 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5347 { 5348 unsigned long precharge = mem_cgroup_count_precharge(mm); 5349 5350 VM_BUG_ON(mc.moving_task); 5351 mc.moving_task = current; 5352 return mem_cgroup_do_precharge(precharge); 5353 } 5354 5355 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5356 static void __mem_cgroup_clear_mc(void) 5357 { 5358 struct mem_cgroup *from = mc.from; 5359 struct mem_cgroup *to = mc.to; 5360 5361 /* we must uncharge all the leftover precharges from mc.to */ 5362 if (mc.precharge) { 5363 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 5364 mc.precharge = 0; 5365 } 5366 /* 5367 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5368 * we must uncharge here. 5369 */ 5370 if (mc.moved_charge) { 5371 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 5372 mc.moved_charge = 0; 5373 } 5374 /* we must fixup refcnts and charges */ 5375 if (mc.moved_swap) { 5376 /* uncharge swap account from the old cgroup */ 5377 if (!mem_cgroup_is_root(mc.from)) 5378 res_counter_uncharge(&mc.from->memsw, 5379 PAGE_SIZE * mc.moved_swap); 5380 __mem_cgroup_put(mc.from, mc.moved_swap); 5381 5382 if (!mem_cgroup_is_root(mc.to)) { 5383 /* 5384 * we charged both to->res and to->memsw, so we should 5385 * uncharge to->res. 5386 */ 5387 res_counter_uncharge(&mc.to->res, 5388 PAGE_SIZE * mc.moved_swap); 5389 } 5390 /* we've already done mem_cgroup_get(mc.to) */ 5391 mc.moved_swap = 0; 5392 } 5393 memcg_oom_recover(from); 5394 memcg_oom_recover(to); 5395 wake_up_all(&mc.waitq); 5396 } 5397 5398 static void mem_cgroup_clear_mc(void) 5399 { 5400 struct mem_cgroup *from = mc.from; 5401 5402 /* 5403 * we must clear moving_task before waking up waiters at the end of 5404 * task migration. 5405 */ 5406 mc.moving_task = NULL; 5407 __mem_cgroup_clear_mc(); 5408 spin_lock(&mc.lock); 5409 mc.from = NULL; 5410 mc.to = NULL; 5411 spin_unlock(&mc.lock); 5412 mem_cgroup_end_move(from); 5413 } 5414 5415 static int mem_cgroup_can_attach(struct cgroup *cgroup, 5416 struct cgroup_taskset *tset) 5417 { 5418 struct task_struct *p = cgroup_taskset_first(tset); 5419 int ret = 0; 5420 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 5421 5422 if (memcg->move_charge_at_immigrate) { 5423 struct mm_struct *mm; 5424 struct mem_cgroup *from = mem_cgroup_from_task(p); 5425 5426 VM_BUG_ON(from == memcg); 5427 5428 mm = get_task_mm(p); 5429 if (!mm) 5430 return 0; 5431 /* We move charges only when we move a owner of the mm */ 5432 if (mm->owner == p) { 5433 VM_BUG_ON(mc.from); 5434 VM_BUG_ON(mc.to); 5435 VM_BUG_ON(mc.precharge); 5436 VM_BUG_ON(mc.moved_charge); 5437 VM_BUG_ON(mc.moved_swap); 5438 mem_cgroup_start_move(from); 5439 spin_lock(&mc.lock); 5440 mc.from = from; 5441 mc.to = memcg; 5442 spin_unlock(&mc.lock); 5443 /* We set mc.moving_task later */ 5444 5445 ret = mem_cgroup_precharge_mc(mm); 5446 if (ret) 5447 mem_cgroup_clear_mc(); 5448 } 5449 mmput(mm); 5450 } 5451 return ret; 5452 } 5453 5454 static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 5455 struct cgroup_taskset *tset) 5456 { 5457 mem_cgroup_clear_mc(); 5458 } 5459 5460 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5461 unsigned long addr, unsigned long end, 5462 struct mm_walk *walk) 5463 { 5464 int ret = 0; 5465 struct vm_area_struct *vma = walk->private; 5466 pte_t *pte; 5467 spinlock_t *ptl; 5468 enum mc_target_type target_type; 5469 union mc_target target; 5470 struct page *page; 5471 struct page_cgroup *pc; 5472 5473 /* 5474 * We don't take compound_lock() here but no race with splitting thp 5475 * happens because: 5476 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 5477 * under splitting, which means there's no concurrent thp split, 5478 * - if another thread runs into split_huge_page() just after we 5479 * entered this if-block, the thread must wait for page table lock 5480 * to be unlocked in __split_huge_page_splitting(), where the main 5481 * part of thp split is not executed yet. 5482 */ 5483 if (pmd_trans_huge_lock(pmd, vma) == 1) { 5484 if (!mc.precharge) { 5485 spin_unlock(&vma->vm_mm->page_table_lock); 5486 return 0; 5487 } 5488 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5489 if (target_type == MC_TARGET_PAGE) { 5490 page = target.page; 5491 if (!isolate_lru_page(page)) { 5492 pc = lookup_page_cgroup(page); 5493 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5494 pc, mc.from, mc.to, 5495 false)) { 5496 mc.precharge -= HPAGE_PMD_NR; 5497 mc.moved_charge += HPAGE_PMD_NR; 5498 } 5499 putback_lru_page(page); 5500 } 5501 put_page(page); 5502 } 5503 spin_unlock(&vma->vm_mm->page_table_lock); 5504 return 0; 5505 } 5506 5507 if (pmd_trans_unstable(pmd)) 5508 return 0; 5509 retry: 5510 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5511 for (; addr != end; addr += PAGE_SIZE) { 5512 pte_t ptent = *(pte++); 5513 swp_entry_t ent; 5514 5515 if (!mc.precharge) 5516 break; 5517 5518 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5519 case MC_TARGET_PAGE: 5520 page = target.page; 5521 if (isolate_lru_page(page)) 5522 goto put; 5523 pc = lookup_page_cgroup(page); 5524 if (!mem_cgroup_move_account(page, 1, pc, 5525 mc.from, mc.to, false)) { 5526 mc.precharge--; 5527 /* we uncharge from mc.from later. */ 5528 mc.moved_charge++; 5529 } 5530 putback_lru_page(page); 5531 put: /* get_mctgt_type() gets the page */ 5532 put_page(page); 5533 break; 5534 case MC_TARGET_SWAP: 5535 ent = target.ent; 5536 if (!mem_cgroup_move_swap_account(ent, 5537 mc.from, mc.to, false)) { 5538 mc.precharge--; 5539 /* we fixup refcnts and charges later. */ 5540 mc.moved_swap++; 5541 } 5542 break; 5543 default: 5544 break; 5545 } 5546 } 5547 pte_unmap_unlock(pte - 1, ptl); 5548 cond_resched(); 5549 5550 if (addr != end) { 5551 /* 5552 * We have consumed all precharges we got in can_attach(). 5553 * We try charge one by one, but don't do any additional 5554 * charges to mc.to if we have failed in charge once in attach() 5555 * phase. 5556 */ 5557 ret = mem_cgroup_do_precharge(1); 5558 if (!ret) 5559 goto retry; 5560 } 5561 5562 return ret; 5563 } 5564 5565 static void mem_cgroup_move_charge(struct mm_struct *mm) 5566 { 5567 struct vm_area_struct *vma; 5568 5569 lru_add_drain_all(); 5570 retry: 5571 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5572 /* 5573 * Someone who are holding the mmap_sem might be waiting in 5574 * waitq. So we cancel all extra charges, wake up all waiters, 5575 * and retry. Because we cancel precharges, we might not be able 5576 * to move enough charges, but moving charge is a best-effort 5577 * feature anyway, so it wouldn't be a big problem. 5578 */ 5579 __mem_cgroup_clear_mc(); 5580 cond_resched(); 5581 goto retry; 5582 } 5583 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5584 int ret; 5585 struct mm_walk mem_cgroup_move_charge_walk = { 5586 .pmd_entry = mem_cgroup_move_charge_pte_range, 5587 .mm = mm, 5588 .private = vma, 5589 }; 5590 if (is_vm_hugetlb_page(vma)) 5591 continue; 5592 ret = walk_page_range(vma->vm_start, vma->vm_end, 5593 &mem_cgroup_move_charge_walk); 5594 if (ret) 5595 /* 5596 * means we have consumed all precharges and failed in 5597 * doing additional charge. Just abandon here. 5598 */ 5599 break; 5600 } 5601 up_read(&mm->mmap_sem); 5602 } 5603 5604 static void mem_cgroup_move_task(struct cgroup *cont, 5605 struct cgroup_taskset *tset) 5606 { 5607 struct task_struct *p = cgroup_taskset_first(tset); 5608 struct mm_struct *mm = get_task_mm(p); 5609 5610 if (mm) { 5611 if (mc.to) 5612 mem_cgroup_move_charge(mm); 5613 put_swap_token(mm); 5614 mmput(mm); 5615 } 5616 if (mc.to) 5617 mem_cgroup_clear_mc(); 5618 } 5619 #else /* !CONFIG_MMU */ 5620 static int mem_cgroup_can_attach(struct cgroup *cgroup, 5621 struct cgroup_taskset *tset) 5622 { 5623 return 0; 5624 } 5625 static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 5626 struct cgroup_taskset *tset) 5627 { 5628 } 5629 static void mem_cgroup_move_task(struct cgroup *cont, 5630 struct cgroup_taskset *tset) 5631 { 5632 } 5633 #endif 5634 5635 struct cgroup_subsys mem_cgroup_subsys = { 5636 .name = "memory", 5637 .subsys_id = mem_cgroup_subsys_id, 5638 .create = mem_cgroup_create, 5639 .pre_destroy = mem_cgroup_pre_destroy, 5640 .destroy = mem_cgroup_destroy, 5641 .populate = mem_cgroup_populate, 5642 .can_attach = mem_cgroup_can_attach, 5643 .cancel_attach = mem_cgroup_cancel_attach, 5644 .attach = mem_cgroup_move_task, 5645 .early_init = 0, 5646 .use_id = 1, 5647 }; 5648 5649 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5650 static int __init enable_swap_account(char *s) 5651 { 5652 /* consider enabled if no parameter or 1 is given */ 5653 if (!strcmp(s, "1")) 5654 really_do_swap_account = 1; 5655 else if (!strcmp(s, "0")) 5656 really_do_swap_account = 0; 5657 return 1; 5658 } 5659 __setup("swapaccount=", enable_swap_account); 5660 5661 #endif 5662