1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 */ 23 24 #include <linux/res_counter.h> 25 #include <linux/memcontrol.h> 26 #include <linux/cgroup.h> 27 #include <linux/mm.h> 28 #include <linux/hugetlb.h> 29 #include <linux/pagemap.h> 30 #include <linux/smp.h> 31 #include <linux/page-flags.h> 32 #include <linux/backing-dev.h> 33 #include <linux/bit_spinlock.h> 34 #include <linux/rcupdate.h> 35 #include <linux/limits.h> 36 #include <linux/export.h> 37 #include <linux/mutex.h> 38 #include <linux/rbtree.h> 39 #include <linux/slab.h> 40 #include <linux/swap.h> 41 #include <linux/swapops.h> 42 #include <linux/spinlock.h> 43 #include <linux/eventfd.h> 44 #include <linux/sort.h> 45 #include <linux/fs.h> 46 #include <linux/seq_file.h> 47 #include <linux/vmalloc.h> 48 #include <linux/mm_inline.h> 49 #include <linux/page_cgroup.h> 50 #include <linux/cpu.h> 51 #include <linux/oom.h> 52 #include "internal.h" 53 #include <net/sock.h> 54 #include <net/tcp_memcontrol.h> 55 56 #include <asm/uaccess.h> 57 58 #include <trace/events/vmscan.h> 59 60 struct cgroup_subsys mem_cgroup_subsys __read_mostly; 61 #define MEM_CGROUP_RECLAIM_RETRIES 5 62 struct mem_cgroup *root_mem_cgroup __read_mostly; 63 64 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 65 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 66 int do_swap_account __read_mostly; 67 68 /* for remember boot option*/ 69 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 70 static int really_do_swap_account __initdata = 1; 71 #else 72 static int really_do_swap_account __initdata = 0; 73 #endif 74 75 #else 76 #define do_swap_account (0) 77 #endif 78 79 80 /* 81 * Statistics for memory cgroup. 82 */ 83 enum mem_cgroup_stat_index { 84 /* 85 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 86 */ 87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 92 MEM_CGROUP_STAT_NSTATS, 93 }; 94 95 enum mem_cgroup_events_index { 96 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 97 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 98 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ 99 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 100 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 101 MEM_CGROUP_EVENTS_NSTATS, 102 }; 103 /* 104 * Per memcg event counter is incremented at every pagein/pageout. With THP, 105 * it will be incremated by the number of pages. This counter is used for 106 * for trigger some periodic events. This is straightforward and better 107 * than using jiffies etc. 
to handle periodic memcg event. 108 */ 109 enum mem_cgroup_events_target { 110 MEM_CGROUP_TARGET_THRESH, 111 MEM_CGROUP_TARGET_SOFTLIMIT, 112 MEM_CGROUP_TARGET_NUMAINFO, 113 MEM_CGROUP_NTARGETS, 114 }; 115 #define THRESHOLDS_EVENTS_TARGET (128) 116 #define SOFTLIMIT_EVENTS_TARGET (1024) 117 #define NUMAINFO_EVENTS_TARGET (1024) 118 119 struct mem_cgroup_stat_cpu { 120 long count[MEM_CGROUP_STAT_NSTATS]; 121 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 122 unsigned long targets[MEM_CGROUP_NTARGETS]; 123 }; 124 125 struct mem_cgroup_reclaim_iter { 126 /* css_id of the last scanned hierarchy member */ 127 int position; 128 /* scan generation, increased every round-trip */ 129 unsigned int generation; 130 }; 131 132 /* 133 * per-zone information in memory controller. 134 */ 135 struct mem_cgroup_per_zone { 136 struct lruvec lruvec; 137 unsigned long lru_size[NR_LRU_LISTS]; 138 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 140 141 struct zone_reclaim_stat reclaim_stat; 142 struct rb_node tree_node; /* RB tree node */ 143 unsigned long long usage_in_excess;/* Set to the value by which */ 144 /* the soft limit is exceeded*/ 145 bool on_tree; 146 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 147 /* use container_of */ 148 }; 149 150 struct mem_cgroup_per_node { 151 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 152 }; 153 154 struct mem_cgroup_lru_info { 155 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 156 }; 157 158 /* 159 * Cgroups above their limits are maintained in a RB-Tree, independent of 160 * their hierarchy representation 161 */ 162 163 struct mem_cgroup_tree_per_zone { 164 struct rb_root rb_root; 165 spinlock_t lock; 166 }; 167 168 struct mem_cgroup_tree_per_node { 169 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 170 }; 171 172 struct mem_cgroup_tree { 173 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 174 }; 175 176 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 177 178 struct mem_cgroup_threshold { 179 struct eventfd_ctx *eventfd; 180 u64 threshold; 181 }; 182 183 /* For threshold */ 184 struct mem_cgroup_threshold_ary { 185 /* An array index points to threshold just below usage. */ 186 int current_threshold; 187 /* Size of entries[] */ 188 unsigned int size; 189 /* Array of thresholds */ 190 struct mem_cgroup_threshold entries[0]; 191 }; 192 193 struct mem_cgroup_thresholds { 194 /* Primary thresholds array */ 195 struct mem_cgroup_threshold_ary *primary; 196 /* 197 * Spare threshold array. 198 * This is needed to make mem_cgroup_unregister_event() "never fail". 199 * It must be able to store at least primary->size - 1 entries. 200 */ 201 struct mem_cgroup_threshold_ary *spare; 202 }; 203 204 /* for OOM */ 205 struct mem_cgroup_eventfd_list { 206 struct list_head list; 207 struct eventfd_ctx *eventfd; 208 }; 209 210 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 211 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 212 213 /* 214 * The memory controller data structure. The memory controller controls both 215 * page cache and RSS per cgroup. We would eventually like to provide 216 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 217 * to help the administrator determine what knobs to tune. 218 * 219 * TODO: Add a water mark for the memory controller. Reclaim will begin when 220 * we hit the water mark. 
Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	union {
		/*
		 * the counter to account for mem+swap usage.
		 */
		struct res_counter memsw;

		/*
		 * rcu_freeing is used only when freeing struct mem_cgroup,
		 * so put it into a union to avoid wasting more memory.
		 * It must be disjoint from the css field.  It could be
		 * in a union with the res field, but res plays a much
		 * larger part in mem_cgroup life than memsw, and might
		 * be of interest, even at time of free, when debugging.
		 * So share rcu_head with the less interesting memsw.
		 */
		struct rcu_head rcu_freeing;
		/*
		 * But when using vfree(), that cannot be done at
		 * interrupt time, so we must then queue the work.
		 */
		struct work_struct work_freeing;
	};

	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;
	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;

	bool		oom_lock;
	atomic_t	under_oom;

	atomic_t	refcnt;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup? And what type of charges should we move?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu *stat;
	/*
	 * used when a cpu is offlined or in other synchronizations.
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

#ifdef CONFIG_INET
	struct tcp_memcontrol tcp_mem;
#endif
};

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
325 */ 326 enum move_type { 327 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 328 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 329 NR_MOVE_TYPE, 330 }; 331 332 /* "mc" and its members are protected by cgroup_mutex */ 333 static struct move_charge_struct { 334 spinlock_t lock; /* for from, to */ 335 struct mem_cgroup *from; 336 struct mem_cgroup *to; 337 unsigned long precharge; 338 unsigned long moved_charge; 339 unsigned long moved_swap; 340 struct task_struct *moving_task; /* a task moving charges */ 341 wait_queue_head_t waitq; /* a waitq for other context */ 342 } mc = { 343 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 344 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 345 }; 346 347 static bool move_anon(void) 348 { 349 return test_bit(MOVE_CHARGE_TYPE_ANON, 350 &mc.to->move_charge_at_immigrate); 351 } 352 353 static bool move_file(void) 354 { 355 return test_bit(MOVE_CHARGE_TYPE_FILE, 356 &mc.to->move_charge_at_immigrate); 357 } 358 359 /* 360 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 361 * limit reclaim to prevent infinite loops, if they ever occur. 362 */ 363 #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 364 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 365 366 enum charge_type { 367 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 368 MEM_CGROUP_CHARGE_TYPE_MAPPED, 369 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 370 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 371 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 372 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 373 NR_CHARGE_TYPE, 374 }; 375 376 /* for encoding cft->private value on file */ 377 #define _MEM (0) 378 #define _MEMSWAP (1) 379 #define _OOM_TYPE (2) 380 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 381 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 382 #define MEMFILE_ATTR(val) ((val) & 0xffff) 383 /* Used for OOM nofiier */ 384 #define OOM_CONTROL (0) 385 386 /* 387 * Reclaim flags for mem_cgroup_hierarchical_reclaim 388 */ 389 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 390 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 391 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 392 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 393 394 static void mem_cgroup_get(struct mem_cgroup *memcg); 395 static void mem_cgroup_put(struct mem_cgroup *memcg); 396 397 /* Writing them here to avoid exposing memcg's inner layout */ 398 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 399 #include <net/sock.h> 400 #include <net/ip.h> 401 402 static bool mem_cgroup_is_root(struct mem_cgroup *memcg); 403 void sock_update_memcg(struct sock *sk) 404 { 405 if (mem_cgroup_sockets_enabled) { 406 struct mem_cgroup *memcg; 407 408 BUG_ON(!sk->sk_prot->proto_cgroup); 409 410 /* Socket cloning can throw us here with sk_cgrp already 411 * filled. It won't however, necessarily happen from 412 * process context. So the test for root memcg given 413 * the current task's memcg won't help us in this case. 414 * 415 * Respecting the original socket's memcg is a better 416 * decision in this case. 
417 */ 418 if (sk->sk_cgrp) { 419 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 420 mem_cgroup_get(sk->sk_cgrp->memcg); 421 return; 422 } 423 424 rcu_read_lock(); 425 memcg = mem_cgroup_from_task(current); 426 if (!mem_cgroup_is_root(memcg)) { 427 mem_cgroup_get(memcg); 428 sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); 429 } 430 rcu_read_unlock(); 431 } 432 } 433 EXPORT_SYMBOL(sock_update_memcg); 434 435 void sock_release_memcg(struct sock *sk) 436 { 437 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 438 struct mem_cgroup *memcg; 439 WARN_ON(!sk->sk_cgrp->memcg); 440 memcg = sk->sk_cgrp->memcg; 441 mem_cgroup_put(memcg); 442 } 443 } 444 445 #ifdef CONFIG_INET 446 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 447 { 448 if (!memcg || mem_cgroup_is_root(memcg)) 449 return NULL; 450 451 return &memcg->tcp_mem.cg_proto; 452 } 453 EXPORT_SYMBOL(tcp_proto_cgroup); 454 #endif /* CONFIG_INET */ 455 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 456 457 static void drain_all_stock_async(struct mem_cgroup *memcg); 458 459 static struct mem_cgroup_per_zone * 460 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 461 { 462 return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; 463 } 464 465 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 466 { 467 return &memcg->css; 468 } 469 470 static struct mem_cgroup_per_zone * 471 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) 472 { 473 int nid = page_to_nid(page); 474 int zid = page_zonenum(page); 475 476 return mem_cgroup_zoneinfo(memcg, nid, zid); 477 } 478 479 static struct mem_cgroup_tree_per_zone * 480 soft_limit_tree_node_zone(int nid, int zid) 481 { 482 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 483 } 484 485 static struct mem_cgroup_tree_per_zone * 486 soft_limit_tree_from_page(struct page *page) 487 { 488 int nid = page_to_nid(page); 489 int zid = page_zonenum(page); 490 491 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 492 } 493 494 static void 495 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, 496 struct mem_cgroup_per_zone *mz, 497 struct mem_cgroup_tree_per_zone *mctz, 498 unsigned long long new_usage_in_excess) 499 { 500 struct rb_node **p = &mctz->rb_root.rb_node; 501 struct rb_node *parent = NULL; 502 struct mem_cgroup_per_zone *mz_node; 503 504 if (mz->on_tree) 505 return; 506 507 mz->usage_in_excess = new_usage_in_excess; 508 if (!mz->usage_in_excess) 509 return; 510 while (*p) { 511 parent = *p; 512 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 513 tree_node); 514 if (mz->usage_in_excess < mz_node->usage_in_excess) 515 p = &(*p)->rb_left; 516 /* 517 * We can't avoid mem cgroups that are over their soft 518 * limit by the same amount 519 */ 520 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 521 p = &(*p)->rb_right; 522 } 523 rb_link_node(&mz->tree_node, parent, p); 524 rb_insert_color(&mz->tree_node, &mctz->rb_root); 525 mz->on_tree = true; 526 } 527 528 static void 529 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 530 struct mem_cgroup_per_zone *mz, 531 struct mem_cgroup_tree_per_zone *mctz) 532 { 533 if (!mz->on_tree) 534 return; 535 rb_erase(&mz->tree_node, &mctz->rb_root); 536 mz->on_tree = false; 537 } 538 539 static void 540 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 541 struct mem_cgroup_per_zone *mz, 542 struct mem_cgroup_tree_per_zone *mctz) 543 { 544 spin_lock(&mctz->lock); 545 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 546 spin_unlock(&mctz->lock); 547 } 548 549 550 
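/*
 * Soft limit bookkeeping: memcgs whose usage exceeds their soft limit are
 * kept in per-node, per-zone RB-trees (soft_limit_tree above), ordered by
 * usage_in_excess so that the rightmost node is always the largest
 * offender.  mem_cgroup_update_tree() below re-inserts a memcg and its
 * ancestors at their current excess whenever the periodic event counter
 * fires (see memcg_check_events()), and the largest offender can then be
 * retrieved again with mem_cgroup_largest_soft_limit_node().
 */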
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
		excess = res_counter_soft_limit_excess(&memcg->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node(node) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(memcg, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(memcg, mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
		!css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use thresholds and do periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * reading cost and precision of the value. We may get a chance to implement
 * such a periodic synchronization for memcg's counters as well.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact
 * value for that accounting. Even if we provided a quick-and-fuzzy read, we
 * would still have to visit all online cpus and sum up the values. So, for
 * now, the unnecessary synchronization is not implemented (it is only done
 * for cpu hotplug).
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, thresholds and synchronization as in vmstat[] should be
 * implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 bool anon, int nr_pages)
{
	preempt_disable();

	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	/* pagein of a big page is an event.
So, ignore page size */ 714 if (nr_pages > 0) 715 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 716 else { 717 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 718 nr_pages = -nr_pages; /* for event */ 719 } 720 721 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 722 723 preempt_enable(); 724 } 725 726 unsigned long 727 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 728 unsigned int lru_mask) 729 { 730 struct mem_cgroup_per_zone *mz; 731 enum lru_list lru; 732 unsigned long ret = 0; 733 734 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 735 736 for_each_lru(lru) { 737 if (BIT(lru) & lru_mask) 738 ret += mz->lru_size[lru]; 739 } 740 return ret; 741 } 742 743 static unsigned long 744 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 745 int nid, unsigned int lru_mask) 746 { 747 u64 total = 0; 748 int zid; 749 750 for (zid = 0; zid < MAX_NR_ZONES; zid++) 751 total += mem_cgroup_zone_nr_lru_pages(memcg, 752 nid, zid, lru_mask); 753 754 return total; 755 } 756 757 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 758 unsigned int lru_mask) 759 { 760 int nid; 761 u64 total = 0; 762 763 for_each_node_state(nid, N_HIGH_MEMORY) 764 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 765 return total; 766 } 767 768 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 769 enum mem_cgroup_events_target target) 770 { 771 unsigned long val, next; 772 773 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 774 next = __this_cpu_read(memcg->stat->targets[target]); 775 /* from time_after() in jiffies.h */ 776 if ((long)next - (long)val < 0) { 777 switch (target) { 778 case MEM_CGROUP_TARGET_THRESH: 779 next = val + THRESHOLDS_EVENTS_TARGET; 780 break; 781 case MEM_CGROUP_TARGET_SOFTLIMIT: 782 next = val + SOFTLIMIT_EVENTS_TARGET; 783 break; 784 case MEM_CGROUP_TARGET_NUMAINFO: 785 next = val + NUMAINFO_EVENTS_TARGET; 786 break; 787 default: 788 break; 789 } 790 __this_cpu_write(memcg->stat->targets[target], next); 791 return true; 792 } 793 return false; 794 } 795 796 /* 797 * Check events in order. 798 * 799 */ 800 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 801 { 802 preempt_disable(); 803 /* threshold event is triggered in finer grain than soft limit */ 804 if (unlikely(mem_cgroup_event_ratelimit(memcg, 805 MEM_CGROUP_TARGET_THRESH))) { 806 bool do_softlimit; 807 bool do_numainfo __maybe_unused; 808 809 do_softlimit = mem_cgroup_event_ratelimit(memcg, 810 MEM_CGROUP_TARGET_SOFTLIMIT); 811 #if MAX_NUMNODES > 1 812 do_numainfo = mem_cgroup_event_ratelimit(memcg, 813 MEM_CGROUP_TARGET_NUMAINFO); 814 #endif 815 preempt_enable(); 816 817 mem_cgroup_threshold(memcg); 818 if (unlikely(do_softlimit)) 819 mem_cgroup_update_tree(memcg, page); 820 #if MAX_NUMNODES > 1 821 if (unlikely(do_numainfo)) 822 atomic_inc(&memcg->numainfo_events); 823 #endif 824 } else 825 preempt_enable(); 826 } 827 828 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 829 { 830 return container_of(cgroup_subsys_state(cont, 831 mem_cgroup_subsys_id), struct mem_cgroup, 832 css); 833 } 834 835 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 836 { 837 /* 838 * mm_update_next_owner() may clear mm->owner to NULL 839 * if it races with swapoff, page migration, etc. 840 * So this can be called with p == NULL. 
841 */ 842 if (unlikely(!p)) 843 return NULL; 844 845 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 846 struct mem_cgroup, css); 847 } 848 849 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 850 { 851 struct mem_cgroup *memcg = NULL; 852 853 if (!mm) 854 return NULL; 855 /* 856 * Because we have no locks, mm->owner's may be being moved to other 857 * cgroup. We use css_tryget() here even if this looks 858 * pessimistic (rather than adding locks here). 859 */ 860 rcu_read_lock(); 861 do { 862 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 863 if (unlikely(!memcg)) 864 break; 865 } while (!css_tryget(&memcg->css)); 866 rcu_read_unlock(); 867 return memcg; 868 } 869 870 /** 871 * mem_cgroup_iter - iterate over memory cgroup hierarchy 872 * @root: hierarchy root 873 * @prev: previously returned memcg, NULL on first invocation 874 * @reclaim: cookie for shared reclaim walks, NULL for full walks 875 * 876 * Returns references to children of the hierarchy below @root, or 877 * @root itself, or %NULL after a full round-trip. 878 * 879 * Caller must pass the return value in @prev on subsequent 880 * invocations for reference counting, or use mem_cgroup_iter_break() 881 * to cancel a hierarchy walk before the round-trip is complete. 882 * 883 * Reclaimers can specify a zone and a priority level in @reclaim to 884 * divide up the memcgs in the hierarchy among all concurrent 885 * reclaimers operating on the same zone and priority. 886 */ 887 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 888 struct mem_cgroup *prev, 889 struct mem_cgroup_reclaim_cookie *reclaim) 890 { 891 struct mem_cgroup *memcg = NULL; 892 int id = 0; 893 894 if (mem_cgroup_disabled()) 895 return NULL; 896 897 if (!root) 898 root = root_mem_cgroup; 899 900 if (prev && !reclaim) 901 id = css_id(&prev->css); 902 903 if (prev && prev != root) 904 css_put(&prev->css); 905 906 if (!root->use_hierarchy && root != root_mem_cgroup) { 907 if (prev) 908 return NULL; 909 return root; 910 } 911 912 while (!memcg) { 913 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 914 struct cgroup_subsys_state *css; 915 916 if (reclaim) { 917 int nid = zone_to_nid(reclaim->zone); 918 int zid = zone_idx(reclaim->zone); 919 struct mem_cgroup_per_zone *mz; 920 921 mz = mem_cgroup_zoneinfo(root, nid, zid); 922 iter = &mz->reclaim_iter[reclaim->priority]; 923 if (prev && reclaim->generation != iter->generation) 924 return NULL; 925 id = iter->position; 926 } 927 928 rcu_read_lock(); 929 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 930 if (css) { 931 if (css == &root->css || css_tryget(css)) 932 memcg = container_of(css, 933 struct mem_cgroup, css); 934 } else 935 id = 0; 936 rcu_read_unlock(); 937 938 if (reclaim) { 939 iter->position = id; 940 if (!css) 941 iter->generation++; 942 else if (!prev && memcg) 943 reclaim->generation = iter->generation; 944 } 945 946 if (prev && !css) 947 return NULL; 948 } 949 return memcg; 950 } 951 952 /** 953 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 954 * @root: hierarchy root 955 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 956 */ 957 void mem_cgroup_iter_break(struct mem_cgroup *root, 958 struct mem_cgroup *prev) 959 { 960 if (!root) 961 root = root_mem_cgroup; 962 if (prev && prev != root) 963 css_put(&prev->css); 964 } 965 966 /* 967 * Iteration constructs for visiting all cgroups (under a tree). 
If 968 * loops are exited prematurely (break), mem_cgroup_iter_break() must 969 * be used for reference counting. 970 */ 971 #define for_each_mem_cgroup_tree(iter, root) \ 972 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 973 iter != NULL; \ 974 iter = mem_cgroup_iter(root, iter, NULL)) 975 976 #define for_each_mem_cgroup(iter) \ 977 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 978 iter != NULL; \ 979 iter = mem_cgroup_iter(NULL, iter, NULL)) 980 981 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 982 { 983 return (memcg == root_mem_cgroup); 984 } 985 986 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 987 { 988 struct mem_cgroup *memcg; 989 990 if (!mm) 991 return; 992 993 rcu_read_lock(); 994 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 995 if (unlikely(!memcg)) 996 goto out; 997 998 switch (idx) { 999 case PGFAULT: 1000 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1001 break; 1002 case PGMAJFAULT: 1003 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1004 break; 1005 default: 1006 BUG(); 1007 } 1008 out: 1009 rcu_read_unlock(); 1010 } 1011 EXPORT_SYMBOL(mem_cgroup_count_vm_event); 1012 1013 /** 1014 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1015 * @zone: zone of the wanted lruvec 1016 * @mem: memcg of the wanted lruvec 1017 * 1018 * Returns the lru list vector holding pages for the given @zone and 1019 * @mem. This can be the global zone lruvec, if the memory controller 1020 * is disabled. 1021 */ 1022 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1023 struct mem_cgroup *memcg) 1024 { 1025 struct mem_cgroup_per_zone *mz; 1026 1027 if (mem_cgroup_disabled()) 1028 return &zone->lruvec; 1029 1030 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); 1031 return &mz->lruvec; 1032 } 1033 1034 /* 1035 * Following LRU functions are allowed to be used without PCG_LOCK. 1036 * Operations are called by routine of global LRU independently from memcg. 1037 * What we have to take care of here is validness of pc->mem_cgroup. 1038 * 1039 * Changes to pc->mem_cgroup happens when 1040 * 1. charge 1041 * 2. moving account 1042 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 1043 * It is added to LRU before charge. 1044 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 1045 * When moving account, the page is not on LRU. It's isolated. 1046 */ 1047 1048 /** 1049 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec 1050 * @zone: zone of the page 1051 * @page: the page 1052 * @lru: current lru 1053 * 1054 * This function accounts for @page being added to @lru, and returns 1055 * the lruvec for the given @zone and the memcg @page is charged to. 1056 * 1057 * The callsite is then responsible for physically linking the page to 1058 * the returned lruvec->lists[@lru]. 1059 */ 1060 struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, 1061 enum lru_list lru) 1062 { 1063 struct mem_cgroup_per_zone *mz; 1064 struct mem_cgroup *memcg; 1065 struct page_cgroup *pc; 1066 1067 if (mem_cgroup_disabled()) 1068 return &zone->lruvec; 1069 1070 pc = lookup_page_cgroup(page); 1071 memcg = pc->mem_cgroup; 1072 1073 /* 1074 * Surreptitiously switch any uncharged page to root: 1075 * an uncharged page off lru does nothing to secure 1076 * its former mem_cgroup from sudden removal. 
1077 * 1078 * Our caller holds lru_lock, and PageCgroupUsed is updated 1079 * under page_cgroup lock: between them, they make all uses 1080 * of pc->mem_cgroup safe. 1081 */ 1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1083 pc->mem_cgroup = memcg = root_mem_cgroup; 1084 1085 mz = page_cgroup_zoneinfo(memcg, page); 1086 /* compound_order() is stabilized through lru_lock */ 1087 mz->lru_size[lru] += 1 << compound_order(page); 1088 return &mz->lruvec; 1089 } 1090 1091 /** 1092 * mem_cgroup_lru_del_list - account for removing an lru page 1093 * @page: the page 1094 * @lru: target lru 1095 * 1096 * This function accounts for @page being removed from @lru. 1097 * 1098 * The callsite is then responsible for physically unlinking 1099 * @page->lru. 1100 */ 1101 void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) 1102 { 1103 struct mem_cgroup_per_zone *mz; 1104 struct mem_cgroup *memcg; 1105 struct page_cgroup *pc; 1106 1107 if (mem_cgroup_disabled()) 1108 return; 1109 1110 pc = lookup_page_cgroup(page); 1111 memcg = pc->mem_cgroup; 1112 VM_BUG_ON(!memcg); 1113 mz = page_cgroup_zoneinfo(memcg, page); 1114 /* huge page split is done under lru_lock. so, we have no races. */ 1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); 1116 mz->lru_size[lru] -= 1 << compound_order(page); 1117 } 1118 1119 void mem_cgroup_lru_del(struct page *page) 1120 { 1121 mem_cgroup_lru_del_list(page, page_lru(page)); 1122 } 1123 1124 /** 1125 * mem_cgroup_lru_move_lists - account for moving a page between lrus 1126 * @zone: zone of the page 1127 * @page: the page 1128 * @from: current lru 1129 * @to: target lru 1130 * 1131 * This function accounts for @page being moved between the lrus @from 1132 * and @to, and returns the lruvec for the given @zone and the memcg 1133 * @page is charged to. 1134 * 1135 * The callsite is then responsible for physically relinking 1136 * @page->lru to the returned lruvec->lists[@to]. 1137 */ 1138 struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, 1139 struct page *page, 1140 enum lru_list from, 1141 enum lru_list to) 1142 { 1143 /* XXX: Optimize this, especially for @from == @to */ 1144 mem_cgroup_lru_del_list(page, from); 1145 return mem_cgroup_lru_add_list(zone, page, to); 1146 } 1147 1148 /* 1149 * Checks whether given mem is same or in the root_mem_cgroup's 1150 * hierarchy subtree 1151 */ 1152 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1153 struct mem_cgroup *memcg) 1154 { 1155 if (root_memcg != memcg) { 1156 return (root_memcg->use_hierarchy && 1157 css_is_ancestor(&memcg->css, &root_memcg->css)); 1158 } 1159 1160 return true; 1161 } 1162 1163 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1164 { 1165 int ret; 1166 struct mem_cgroup *curr = NULL; 1167 struct task_struct *p; 1168 1169 p = find_lock_task_mm(task); 1170 if (p) { 1171 curr = try_get_mem_cgroup_from_mm(p->mm); 1172 task_unlock(p); 1173 } else { 1174 /* 1175 * All threads may have already detached their mm's, but the oom 1176 * killer still needs to detect if they have already been oom 1177 * killed to prevent needlessly killing additional tasks. 1178 */ 1179 task_lock(task); 1180 curr = mem_cgroup_from_task(task); 1181 if (curr) 1182 css_get(&curr->css); 1183 task_unlock(task); 1184 } 1185 if (!curr) 1186 return 0; 1187 /* 1188 * We should check use_hierarchy of "memcg" not "curr". 
Because checking 1189 * use_hierarchy of "curr" here make this function true if hierarchy is 1190 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* 1191 * hierarchy(even if use_hierarchy is disabled in "memcg"). 1192 */ 1193 ret = mem_cgroup_same_or_subtree(memcg, curr); 1194 css_put(&curr->css); 1195 return ret; 1196 } 1197 1198 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) 1199 { 1200 unsigned long inactive_ratio; 1201 int nid = zone_to_nid(zone); 1202 int zid = zone_idx(zone); 1203 unsigned long inactive; 1204 unsigned long active; 1205 unsigned long gb; 1206 1207 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1208 BIT(LRU_INACTIVE_ANON)); 1209 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1210 BIT(LRU_ACTIVE_ANON)); 1211 1212 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1213 if (gb) 1214 inactive_ratio = int_sqrt(10 * gb); 1215 else 1216 inactive_ratio = 1; 1217 1218 return inactive * inactive_ratio < active; 1219 } 1220 1221 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) 1222 { 1223 unsigned long active; 1224 unsigned long inactive; 1225 int zid = zone_idx(zone); 1226 int nid = zone_to_nid(zone); 1227 1228 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1229 BIT(LRU_INACTIVE_FILE)); 1230 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1231 BIT(LRU_ACTIVE_FILE)); 1232 1233 return (active > inactive); 1234 } 1235 1236 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1237 struct zone *zone) 1238 { 1239 int nid = zone_to_nid(zone); 1240 int zid = zone_idx(zone); 1241 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1242 1243 return &mz->reclaim_stat; 1244 } 1245 1246 struct zone_reclaim_stat * 1247 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1248 { 1249 struct page_cgroup *pc; 1250 struct mem_cgroup_per_zone *mz; 1251 1252 if (mem_cgroup_disabled()) 1253 return NULL; 1254 1255 pc = lookup_page_cgroup(page); 1256 if (!PageCgroupUsed(pc)) 1257 return NULL; 1258 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1259 smp_rmb(); 1260 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1261 return &mz->reclaim_stat; 1262 } 1263 1264 #define mem_cgroup_from_res_counter(counter, member) \ 1265 container_of(counter, struct mem_cgroup, member) 1266 1267 /** 1268 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1269 * @mem: the memory cgroup 1270 * 1271 * Returns the maximum amount of memory @mem can be charged with, in 1272 * pages. 1273 */ 1274 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1275 { 1276 unsigned long long margin; 1277 1278 margin = res_counter_margin(&memcg->res); 1279 if (do_swap_account) 1280 margin = min(margin, res_counter_margin(&memcg->memsw)); 1281 return margin >> PAGE_SHIFT; 1282 } 1283 1284 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1285 { 1286 struct cgroup *cgrp = memcg->css.cgroup; 1287 1288 /* root ? */ 1289 if (cgrp->parent == NULL) 1290 return vm_swappiness; 1291 1292 return memcg->swappiness; 1293 } 1294 1295 /* 1296 * memcg->moving_account is used for checking possibility that some thread is 1297 * calling move_account(). When a thread on CPU-A starts moving pages under 1298 * a memcg, other threads should check memcg->moving_account under 1299 * rcu_read_lock(), like this: 1300 * 1301 * CPU-A CPU-B 1302 * rcu_read_lock() 1303 * memcg->moving_account+1 if (memcg->mocing_account) 1304 * take heavy locks. 
1305 * synchronize_rcu() update something. 1306 * rcu_read_unlock() 1307 * start move here. 1308 */ 1309 1310 /* for quick checking without looking up memcg */ 1311 atomic_t memcg_moving __read_mostly; 1312 1313 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1314 { 1315 atomic_inc(&memcg_moving); 1316 atomic_inc(&memcg->moving_account); 1317 synchronize_rcu(); 1318 } 1319 1320 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1321 { 1322 /* 1323 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1324 * We check NULL in callee rather than caller. 1325 */ 1326 if (memcg) { 1327 atomic_dec(&memcg_moving); 1328 atomic_dec(&memcg->moving_account); 1329 } 1330 } 1331 1332 /* 1333 * 2 routines for checking "mem" is under move_account() or not. 1334 * 1335 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This 1336 * is used for avoiding races in accounting. If true, 1337 * pc->mem_cgroup may be overwritten. 1338 * 1339 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1340 * under hierarchy of moving cgroups. This is for 1341 * waiting at hith-memory prressure caused by "move". 1342 */ 1343 1344 static bool mem_cgroup_stolen(struct mem_cgroup *memcg) 1345 { 1346 VM_BUG_ON(!rcu_read_lock_held()); 1347 return atomic_read(&memcg->moving_account) > 0; 1348 } 1349 1350 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1351 { 1352 struct mem_cgroup *from; 1353 struct mem_cgroup *to; 1354 bool ret = false; 1355 /* 1356 * Unlike task_move routines, we access mc.to, mc.from not under 1357 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1358 */ 1359 spin_lock(&mc.lock); 1360 from = mc.from; 1361 to = mc.to; 1362 if (!from) 1363 goto unlock; 1364 1365 ret = mem_cgroup_same_or_subtree(memcg, from) 1366 || mem_cgroup_same_or_subtree(memcg, to); 1367 unlock: 1368 spin_unlock(&mc.lock); 1369 return ret; 1370 } 1371 1372 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1373 { 1374 if (mc.moving_task && current != mc.moving_task) { 1375 if (mem_cgroup_under_move(memcg)) { 1376 DEFINE_WAIT(wait); 1377 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1378 /* moving charge context might have finished. */ 1379 if (mc.moving_task) 1380 schedule(); 1381 finish_wait(&mc.waitq, &wait); 1382 return true; 1383 } 1384 } 1385 return false; 1386 } 1387 1388 /* 1389 * Take this lock when 1390 * - a code tries to modify page's memcg while it's USED. 1391 * - a code tries to modify page state accounting in a memcg. 1392 * see mem_cgroup_stolen(), too. 1393 */ 1394 static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1395 unsigned long *flags) 1396 { 1397 spin_lock_irqsave(&memcg->move_lock, *flags); 1398 } 1399 1400 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1401 unsigned long *flags) 1402 { 1403 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1404 } 1405 1406 /** 1407 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1408 * @memcg: The memory cgroup that went over limit 1409 * @p: Task that is going to be killed 1410 * 1411 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1412 * enabled 1413 */ 1414 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1415 { 1416 struct cgroup *task_cgrp; 1417 struct cgroup *mem_cgrp; 1418 /* 1419 * Need a buffer in BSS, can't rely on allocations. The code relies 1420 * on the assumption that OOM is serialized for memory controller. 
1421 * If this assumption is broken, revisit this code. 1422 */ 1423 static char memcg_name[PATH_MAX]; 1424 int ret; 1425 1426 if (!memcg || !p) 1427 return; 1428 1429 rcu_read_lock(); 1430 1431 mem_cgrp = memcg->css.cgroup; 1432 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1433 1434 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1435 if (ret < 0) { 1436 /* 1437 * Unfortunately, we are unable to convert to a useful name 1438 * But we'll still print out the usage information 1439 */ 1440 rcu_read_unlock(); 1441 goto done; 1442 } 1443 rcu_read_unlock(); 1444 1445 printk(KERN_INFO "Task in %s killed", memcg_name); 1446 1447 rcu_read_lock(); 1448 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1449 if (ret < 0) { 1450 rcu_read_unlock(); 1451 goto done; 1452 } 1453 rcu_read_unlock(); 1454 1455 /* 1456 * Continues from above, so we don't need an KERN_ level 1457 */ 1458 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1459 done: 1460 1461 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1462 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1463 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1464 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1465 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1466 "failcnt %llu\n", 1467 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1468 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1469 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1470 } 1471 1472 /* 1473 * This function returns the number of memcg under hierarchy tree. Returns 1474 * 1(self count) if no children. 1475 */ 1476 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1477 { 1478 int num = 0; 1479 struct mem_cgroup *iter; 1480 1481 for_each_mem_cgroup_tree(iter, memcg) 1482 num++; 1483 return num; 1484 } 1485 1486 /* 1487 * Return the memory (and swap, if configured) limit for a memcg. 1488 */ 1489 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1490 { 1491 u64 limit; 1492 u64 memsw; 1493 1494 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1495 limit += total_swap_pages << PAGE_SHIFT; 1496 1497 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1498 /* 1499 * If memsw is finite and limits the amount of swap space available 1500 * to this memcg, return that limit. 1501 */ 1502 return min(limit, memsw); 1503 } 1504 1505 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1506 gfp_t gfp_mask, 1507 unsigned long flags) 1508 { 1509 unsigned long total = 0; 1510 bool noswap = false; 1511 int loop; 1512 1513 if (flags & MEM_CGROUP_RECLAIM_NOSWAP) 1514 noswap = true; 1515 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) 1516 noswap = true; 1517 1518 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { 1519 if (loop) 1520 drain_all_stock_async(memcg); 1521 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); 1522 /* 1523 * Allow limit shrinkers, which are triggered directly 1524 * by userspace, to catch signals and stop reclaim 1525 * after minimal progress, regardless of the margin. 1526 */ 1527 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) 1528 break; 1529 if (mem_cgroup_margin(memcg)) 1530 break; 1531 /* 1532 * If nothing was reclaimed after two attempts, there 1533 * may be no reclaimable pages in this hierarchy. 1534 */ 1535 if (loop && !total) 1536 break; 1537 } 1538 return total; 1539 } 1540 1541 /** 1542 * test_mem_cgroup_node_reclaimable 1543 * @mem: the target memcg 1544 * @nid: the node ID to be checked. 
 * @noswap: specify true here if the user wants file only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
		int nid, bool noswap)
{
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
		return true;
	if (noswap || !total_swap_pages)
		return false;
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
		return true;
	return false;

}
#if MAX_NUMNODES > 1

/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist. So update the list loosely once per 10 secs.
 *
 */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	memcg->scan_nodes = node_states[N_HIGH_MEMORY];

	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {

		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
			node_clear(nid, memcg->scan_nodes);
	}

	atomic_set(&memcg->numainfo_events, 0);
	atomic_set(&memcg->numainfo_updating, 0);
}

/*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is O.K. Considering
 * memory reclaim from the current node, there are pros and cons.
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used. So, it may make LRU bad. And if several threads
 * hit limits, it will see a contention on a node. But freeing from a remote
 * node means more costs for memory reclaim because of memory latency.
 *
 * Now, we use round-robin. A better algorithm is welcome.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

	node = next_node(node, memcg->scan_nodes);
	if (node == MAX_NUMNODES)
		node = first_node(memcg->scan_nodes);
	/*
	 * We call this when we hit the limit, not when pages are added to the
	 * LRU. No LRU may hold pages because all pages are UNEVICTABLE or
	 * the memcg is too small and all pages are not on the LRU. In that
	 * case, we use the current node.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	memcg->last_scanned_node = node;
	return node;
}

/*
 * Check all nodes whether they contain reclaimable pages or not.
 * For a quick scan, we make use of scan_nodes. This will allow us to skip
 * unused nodes. But scan_nodes is lazily updated and may not contain
 * enough new information. We need to double check.
 */
bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	int nid;

	/*
	 * quick check...making use of scan_nodes.
	 * We can skip unused nodes.
1644 */ 1645 if (!nodes_empty(memcg->scan_nodes)) { 1646 for (nid = first_node(memcg->scan_nodes); 1647 nid < MAX_NUMNODES; 1648 nid = next_node(nid, memcg->scan_nodes)) { 1649 1650 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1651 return true; 1652 } 1653 } 1654 /* 1655 * Check rest of nodes. 1656 */ 1657 for_each_node_state(nid, N_HIGH_MEMORY) { 1658 if (node_isset(nid, memcg->scan_nodes)) 1659 continue; 1660 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1661 return true; 1662 } 1663 return false; 1664 } 1665 1666 #else 1667 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1668 { 1669 return 0; 1670 } 1671 1672 bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1673 { 1674 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1675 } 1676 #endif 1677 1678 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1679 struct zone *zone, 1680 gfp_t gfp_mask, 1681 unsigned long *total_scanned) 1682 { 1683 struct mem_cgroup *victim = NULL; 1684 int total = 0; 1685 int loop = 0; 1686 unsigned long excess; 1687 unsigned long nr_scanned; 1688 struct mem_cgroup_reclaim_cookie reclaim = { 1689 .zone = zone, 1690 .priority = 0, 1691 }; 1692 1693 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1694 1695 while (1) { 1696 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1697 if (!victim) { 1698 loop++; 1699 if (loop >= 2) { 1700 /* 1701 * If we have not been able to reclaim 1702 * anything, it might because there are 1703 * no reclaimable pages under this hierarchy 1704 */ 1705 if (!total) 1706 break; 1707 /* 1708 * We want to do more targeted reclaim. 1709 * excess >> 2 is not to excessive so as to 1710 * reclaim too much, nor too less that we keep 1711 * coming back to reclaim from this cgroup 1712 */ 1713 if (total >= (excess >> 2) || 1714 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1715 break; 1716 } 1717 continue; 1718 } 1719 if (!mem_cgroup_reclaimable(victim, false)) 1720 continue; 1721 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1722 zone, &nr_scanned); 1723 *total_scanned += nr_scanned; 1724 if (!res_counter_soft_limit_excess(&root_memcg->res)) 1725 break; 1726 } 1727 mem_cgroup_iter_break(root_memcg, victim); 1728 return total; 1729 } 1730 1731 /* 1732 * Check OOM-Killer is already running under our hierarchy. 1733 * If someone is running, return false. 1734 * Has to be called with memcg_oom_lock 1735 */ 1736 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 1737 { 1738 struct mem_cgroup *iter, *failed = NULL; 1739 1740 for_each_mem_cgroup_tree(iter, memcg) { 1741 if (iter->oom_lock) { 1742 /* 1743 * this subtree of our hierarchy is already locked 1744 * so we cannot give a lock. 
1745 */ 1746 failed = iter; 1747 mem_cgroup_iter_break(memcg, iter); 1748 break; 1749 } else 1750 iter->oom_lock = true; 1751 } 1752 1753 if (!failed) 1754 return true; 1755 1756 /* 1757 * OK, we failed to lock the whole subtree so we have to clean up 1758 * what we set up to the failing subtree 1759 */ 1760 for_each_mem_cgroup_tree(iter, memcg) { 1761 if (iter == failed) { 1762 mem_cgroup_iter_break(memcg, iter); 1763 break; 1764 } 1765 iter->oom_lock = false; 1766 } 1767 return false; 1768 } 1769 1770 /* 1771 * Has to be called with memcg_oom_lock 1772 */ 1773 static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1774 { 1775 struct mem_cgroup *iter; 1776 1777 for_each_mem_cgroup_tree(iter, memcg) 1778 iter->oom_lock = false; 1779 return 0; 1780 } 1781 1782 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1783 { 1784 struct mem_cgroup *iter; 1785 1786 for_each_mem_cgroup_tree(iter, memcg) 1787 atomic_inc(&iter->under_oom); 1788 } 1789 1790 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1791 { 1792 struct mem_cgroup *iter; 1793 1794 /* 1795 * When a new child is created while the hierarchy is under oom, 1796 * mem_cgroup_oom_lock() may not be called. We have to use 1797 * atomic_add_unless() here. 1798 */ 1799 for_each_mem_cgroup_tree(iter, memcg) 1800 atomic_add_unless(&iter->under_oom, -1, 0); 1801 } 1802 1803 static DEFINE_SPINLOCK(memcg_oom_lock); 1804 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1805 1806 struct oom_wait_info { 1807 struct mem_cgroup *memcg; 1808 wait_queue_t wait; 1809 }; 1810 1811 static int memcg_oom_wake_function(wait_queue_t *wait, 1812 unsigned mode, int sync, void *arg) 1813 { 1814 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1815 struct mem_cgroup *oom_wait_memcg; 1816 struct oom_wait_info *oom_wait_info; 1817 1818 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1819 oom_wait_memcg = oom_wait_info->memcg; 1820 1821 /* 1822 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 1823 * Then we can use css_is_ancestor without taking care of RCU. 1824 */ 1825 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 1826 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 1827 return 0; 1828 return autoremove_wake_function(wait, mode, sync, arg); 1829 } 1830 1831 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 1832 { 1833 /* for filtering, pass "memcg" as argument. */ 1834 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1835 } 1836 1837 static void memcg_oom_recover(struct mem_cgroup *memcg) 1838 { 1839 if (memcg && atomic_read(&memcg->under_oom)) 1840 memcg_wakeup_oom(memcg); 1841 } 1842 1843 /* 1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1845 */ 1846 bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1847 { 1848 struct oom_wait_info owait; 1849 bool locked, need_to_kill; 1850 1851 owait.memcg = memcg; 1852 owait.wait.flags = 0; 1853 owait.wait.func = memcg_oom_wake_function; 1854 owait.wait.private = current; 1855 INIT_LIST_HEAD(&owait.wait.task_list); 1856 need_to_kill = true; 1857 mem_cgroup_mark_under_oom(memcg); 1858 1859 /* At first, try to OOM lock hierarchy under memcg.*/ 1860 spin_lock(&memcg_oom_lock); 1861 locked = mem_cgroup_oom_lock(memcg); 1862 /* 1863 * Even if signal_pending(), we can't quit charge() loop without 1864 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1865 * under OOM is always welcomed, use TASK_KILLABLE here. 
 */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	if (!locked || memcg->oom_kill_disable)
		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(memcg);
	spin_unlock(&memcg_oom_lock);

	if (need_to_kill) {
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(memcg, mask, order);
	} else {
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	spin_lock(&memcg_oom_lock);
	if (locked)
		mem_cgroup_oom_unlock(memcg);
	memcg_wakeup_oom(memcg);
	spin_unlock(&memcg_oom_lock);

	mem_cgroup_unmark_under_oom(memcg);

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
		return false;
	/* Give chance to dying process */
	schedule_timeout_uninterruptible(1);
	return true;
}

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 *
 * Notes: Race condition
 *
 * We usually use page_cgroup_lock() for accessing page_cgroup member but
 * it tends to be costly. But considering some conditions, we don't need
 * to do so _always_.
 *
 * Considering "charge", lock_page_cgroup() is not required because all
 * file-stat operations happen after a page is attached to the radix-tree.
 * There is no race with "charge".
 *
 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
 * if there is a race with "uncharge". The statistics themselves are properly
 * handled by flags.
 *
 * Considering "move", this is the only case where we see a race. To keep the
 * race window small, we check memcg->moving_account and detect whether there
 * is a possibility of a race. If there is, we take the lock.
 */

void __mem_cgroup_begin_update_page_stat(struct page *page,
				bool *locked, unsigned long *flags)
{
	struct mem_cgroup *memcg;
	struct page_cgroup *pc;

	pc = lookup_page_cgroup(page);
again:
	memcg = pc->mem_cgroup;
	if (unlikely(!memcg || !PageCgroupUsed(pc)))
		return;
	/*
	 * If this memory cgroup is not under account moving, we don't
	 * need to take move_lock_page_cgroup(). Because we already hold
	 * rcu_read_lock(), any calls to move_account will be delayed until
	 * rcu_read_unlock() if mem_cgroup_stolen() == true.
	 */
	if (!mem_cgroup_stolen(memcg))
		return;

	move_lock_mem_cgroup(memcg, flags);
	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
		move_unlock_mem_cgroup(memcg, flags);
		goto again;
	}
	*locked = true;
}

void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);

	/*
	 * It's guaranteed that pc->mem_cgroup never changes while the
	 * lock is held, because a routine that modifies pc->mem_cgroup
	 * should take move_lock_page_cgroup().
1956 */ 1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1958 } 1959 1960 void mem_cgroup_update_page_stat(struct page *page, 1961 enum mem_cgroup_page_stat_item idx, int val) 1962 { 1963 struct mem_cgroup *memcg; 1964 struct page_cgroup *pc = lookup_page_cgroup(page); 1965 unsigned long uninitialized_var(flags); 1966 1967 if (mem_cgroup_disabled()) 1968 return; 1969 1970 memcg = pc->mem_cgroup; 1971 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1972 return; 1973 1974 switch (idx) { 1975 case MEMCG_NR_FILE_MAPPED: 1976 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1977 break; 1978 default: 1979 BUG(); 1980 } 1981 1982 this_cpu_add(memcg->stat->count[idx], val); 1983 } 1984 1985 /* 1986 * size of first charge trial. "32" comes from vmscan.c's magic value. 1987 * TODO: maybe necessary to use big numbers in big irons. 1988 */ 1989 #define CHARGE_BATCH 32U 1990 struct memcg_stock_pcp { 1991 struct mem_cgroup *cached; /* this never be root cgroup */ 1992 unsigned int nr_pages; 1993 struct work_struct work; 1994 unsigned long flags; 1995 #define FLUSHING_CACHED_CHARGE (0) 1996 }; 1997 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1998 static DEFINE_MUTEX(percpu_charge_mutex); 1999 2000 /* 2001 * Try to consume stocked charge on this cpu. If success, one page is consumed 2002 * from local stock and true is returned. If the stock is 0 or charges from a 2003 * cgroup which is not current target, returns false. This stock will be 2004 * refilled. 2005 */ 2006 static bool consume_stock(struct mem_cgroup *memcg) 2007 { 2008 struct memcg_stock_pcp *stock; 2009 bool ret = true; 2010 2011 stock = &get_cpu_var(memcg_stock); 2012 if (memcg == stock->cached && stock->nr_pages) 2013 stock->nr_pages--; 2014 else /* need to call res_counter_charge */ 2015 ret = false; 2016 put_cpu_var(memcg_stock); 2017 return ret; 2018 } 2019 2020 /* 2021 * Returns stocks cached in percpu to res_counter and reset cached information. 2022 */ 2023 static void drain_stock(struct memcg_stock_pcp *stock) 2024 { 2025 struct mem_cgroup *old = stock->cached; 2026 2027 if (stock->nr_pages) { 2028 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2029 2030 res_counter_uncharge(&old->res, bytes); 2031 if (do_swap_account) 2032 res_counter_uncharge(&old->memsw, bytes); 2033 stock->nr_pages = 0; 2034 } 2035 stock->cached = NULL; 2036 } 2037 2038 /* 2039 * This must be called under preempt disabled or must be called by 2040 * a thread which is pinned to local cpu. 2041 */ 2042 static void drain_local_stock(struct work_struct *dummy) 2043 { 2044 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2045 drain_stock(stock); 2046 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2047 } 2048 2049 /* 2050 * Cache charges(val) which is from res_counter, to local per_cpu area. 2051 * This will be consumed by consume_stock() function, later. 2052 */ 2053 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2054 { 2055 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2056 2057 if (stock->cached != memcg) { /* reset if necessary */ 2058 drain_stock(stock); 2059 stock->cached = memcg; 2060 } 2061 stock->nr_pages += nr_pages; 2062 put_cpu_var(memcg_stock); 2063 } 2064 2065 /* 2066 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2067 * of the hierarchy under it. sync flag says whether we should block 2068 * until the work is done. 
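 *
 * Background, as a worked example with the numbers used above: when a task
 * charges a single page, __mem_cgroup_try_charge() charges CHARGE_BATCH (32)
 * pages against the res_counter at once and parks the 31 spare pages in this
 * cpu's memcg_stock via refill_stock(); later one-page charges on the same
 * cpu for the same memcg are then served by consume_stock() without touching
 * the res_counter. The draining below returns such cached pages to the
 * res_counter.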
2069 */
2070 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2071 {
2072 int cpu, curcpu;
2073
2074 /* Notify other cpus that system-wide "drain" is running */
2075 get_online_cpus();
2076 curcpu = get_cpu();
2077 for_each_online_cpu(cpu) {
2078 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2079 struct mem_cgroup *memcg;
2080
2081 memcg = stock->cached;
2082 if (!memcg || !stock->nr_pages)
2083 continue;
2084 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2085 continue;
2086 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2087 if (cpu == curcpu)
2088 drain_local_stock(&stock->work);
2089 else
2090 schedule_work_on(cpu, &stock->work);
2091 }
2092 }
2093 put_cpu();
2094
2095 if (!sync)
2096 goto out;
2097
2098 for_each_online_cpu(cpu) {
2099 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2100 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2101 flush_work(&stock->work);
2102 }
2103 out:
2104 put_online_cpus();
2105 }
2106
2107 /*
2108 * Tries to drain stocked charges on other cpus. This function is asynchronous
2109 * and just schedules one work item per cpu to drain locally on that cpu.
2110 * Callers can expect some charges to be returned to the res_counter later,
2111 * but cannot wait for that to happen.
2112 */
2113 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2114 {
2115 /*
2116 * If someone is already draining, avoid adding more kworker runs.
2117 */
2118 if (!mutex_trylock(&percpu_charge_mutex))
2119 return;
2120 drain_all_stock(root_memcg, false);
2121 mutex_unlock(&percpu_charge_mutex);
2122 }
2123
2124 /* This is a synchronous drain interface. */
2125 static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2126 {
2127 /* called when force_empty is called */
2128 mutex_lock(&percpu_charge_mutex);
2129 drain_all_stock(root_memcg, true);
2130 mutex_unlock(&percpu_charge_mutex);
2131 }
2132
2133 /*
2134 * This function drains the percpu counter values from a DEAD cpu and
2135 * moves them to the local cpu. Note that this function can be preempted.
2136 */
2137 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2138 {
2139 int i;
2140
2141 spin_lock(&memcg->pcp_counter_lock);
2142 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2143 long x = per_cpu(memcg->stat->count[i], cpu);
2144
2145 per_cpu(memcg->stat->count[i], cpu) = 0;
2146 memcg->nocpu_base.count[i] += x;
2147 }
2148 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2149 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2150
2151 per_cpu(memcg->stat->events[i], cpu) = 0;
2152 memcg->nocpu_base.events[i] += x;
2153 }
2154 spin_unlock(&memcg->pcp_counter_lock);
2155 }
2156
2157 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2158 unsigned long action,
2159 void *hcpu)
2160 {
2161 int cpu = (unsigned long)hcpu;
2162 struct memcg_stock_pcp *stock;
2163 struct mem_cgroup *iter;
2164
2165 if (action == CPU_ONLINE)
2166 return NOTIFY_OK;
2167
2168 if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN))
2169 return NOTIFY_OK;
2170
2171 for_each_mem_cgroup(iter)
2172 mem_cgroup_drain_pcp_counter(iter, cpu);
2173
2174 stock = &per_cpu(memcg_stock, cpu);
2175 drain_stock(stock);
2176 return NOTIFY_OK;
2177 }
2178
2179
2180 /* See __mem_cgroup_try_charge() for details */
2181 enum {
2182 CHARGE_OK, /* success */
2183 CHARGE_RETRY, /* need to retry but retry is not bad */
2184 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
2185 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res.
*/ 2186 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2187 }; 2188 2189 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2190 unsigned int nr_pages, bool oom_check) 2191 { 2192 unsigned long csize = nr_pages * PAGE_SIZE; 2193 struct mem_cgroup *mem_over_limit; 2194 struct res_counter *fail_res; 2195 unsigned long flags = 0; 2196 int ret; 2197 2198 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2199 2200 if (likely(!ret)) { 2201 if (!do_swap_account) 2202 return CHARGE_OK; 2203 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2204 if (likely(!ret)) 2205 return CHARGE_OK; 2206 2207 res_counter_uncharge(&memcg->res, csize); 2208 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2209 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2210 } else 2211 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2212 /* 2213 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 2214 * of regular pages (CHARGE_BATCH), or a single regular page (1). 2215 * 2216 * Never reclaim on behalf of optional batching, retry with a 2217 * single page instead. 2218 */ 2219 if (nr_pages == CHARGE_BATCH) 2220 return CHARGE_RETRY; 2221 2222 if (!(gfp_mask & __GFP_WAIT)) 2223 return CHARGE_WOULDBLOCK; 2224 2225 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2226 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2227 return CHARGE_RETRY; 2228 /* 2229 * Even though the limit is exceeded at this point, reclaim 2230 * may have been able to free some pages. Retry the charge 2231 * before killing the task. 2232 * 2233 * Only for regular pages, though: huge pages are rather 2234 * unlikely to succeed so close to the limit, and we fall back 2235 * to regular pages anyway in case of failure. 2236 */ 2237 if (nr_pages == 1 && ret) 2238 return CHARGE_RETRY; 2239 2240 /* 2241 * At task move, charge accounts can be doubly counted. So, it's 2242 * better to wait until the end of task_move if something is going on. 2243 */ 2244 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2245 return CHARGE_RETRY; 2246 2247 /* If we don't need to call oom-killer at el, return immediately */ 2248 if (!oom_check) 2249 return CHARGE_NOMEM; 2250 /* check OOM */ 2251 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) 2252 return CHARGE_OOM_DIE; 2253 2254 return CHARGE_RETRY; 2255 } 2256 2257 /* 2258 * __mem_cgroup_try_charge() does 2259 * 1. detect memcg to be charged against from passed *mm and *ptr, 2260 * 2. update res_counter 2261 * 3. call memory reclaim if necessary. 2262 * 2263 * In some special case, if the task is fatal, fatal_signal_pending() or 2264 * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup 2265 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon 2266 * as possible without any hazards. 2: all pages should have a valid 2267 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg 2268 * pointer, that is treated as a charge to root_mem_cgroup. 2269 * 2270 * So __mem_cgroup_try_charge() will return 2271 * 0 ... on success, filling *ptr with a valid memcg pointer. 2272 * -ENOMEM ... charge failure because of resource limits. 2273 * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. 2274 * 2275 * Unlike the exported interface, an "oom" parameter is added. if oom==true, 2276 * the oom-killer can be invoked. 
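 *
 * A minimal caller sketch (illustrative only; compare with
 * mem_cgroup_charge_common() below):
 *
 *	struct mem_cgroup *memcg = NULL;
 *	struct page_cgroup *pc = lookup_page_cgroup(page);
 *	int ret;
 *
 *	ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
 *	if (ret == -ENOMEM)
 *		return ret;	- over limit and reclaim/OOM did not help
 *	- here ret is 0 or -EINTR and *memcg is valid (possibly root)
 *	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, false);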
2277 */
2278 static int __mem_cgroup_try_charge(struct mm_struct *mm,
2279 gfp_t gfp_mask,
2280 unsigned int nr_pages,
2281 struct mem_cgroup **ptr,
2282 bool oom)
2283 {
2284 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2285 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2286 struct mem_cgroup *memcg = NULL;
2287 int ret;
2288
2289 /*
2290 * Unlike the global VM's OOM kill, we are not in a system-wide
2291 * memory shortage here. So, let dying processes go ahead, in
2292 * addition to MEMDIE processes.
2293 */
2294 if (unlikely(test_thread_flag(TIF_MEMDIE)
2295 || fatal_signal_pending(current)))
2296 goto bypass;
2297
2298 /*
2299 * We always charge the cgroup the mm_struct belongs to.
2300 * The mm_struct's mem_cgroup changes on task migration if the
2301 * thread group leader migrates. It's possible that mm is not
2302 * set, if so charge the init_mm (happens for pagecache usage).
2303 */
2304 if (!*ptr && !mm)
2305 *ptr = root_mem_cgroup;
2306 again:
2307 if (*ptr) { /* css should be a valid one */
2308 memcg = *ptr;
2309 VM_BUG_ON(css_is_removed(&memcg->css));
2310 if (mem_cgroup_is_root(memcg))
2311 goto done;
2312 if (nr_pages == 1 && consume_stock(memcg))
2313 goto done;
2314 css_get(&memcg->css);
2315 } else {
2316 struct task_struct *p;
2317
2318 rcu_read_lock();
2319 p = rcu_dereference(mm->owner);
2320 /*
2321 * Because we don't have task_lock(), "p" can exit.
2322 * In that case, "memcg" can point to root or p can be NULL with
2323 * a race with swapoff. Then, we have a small risk of mis-accounting.
2324 * But this kind of mis-accounting due to a race can always happen
2325 * because we don't take cgroup_mutex(). Taking it would be overkill,
2326 * so we allow that small race here.
2327 * (*) swapoff et al. will charge against the mm_struct, not against
2328 * the task_struct. So, mm->owner can be NULL.
2329 */
2330 memcg = mem_cgroup_from_task(p);
2331 if (!memcg)
2332 memcg = root_mem_cgroup;
2333 if (mem_cgroup_is_root(memcg)) {
2334 rcu_read_unlock();
2335 goto done;
2336 }
2337 if (nr_pages == 1 && consume_stock(memcg)) {
2338 /*
2339 * It seems dangerous to access memcg without css_get().
2340 * But considering how consume_stock() works, it's not
2341 * necessary. If consume_stock() succeeds, some charges
2342 * from this memcg are cached on this cpu. So, we
2343 * don't need to call css_get()/css_tryget() before
2344 * calling consume_stock().
2345 */
2346 rcu_read_unlock();
2347 goto done;
2348 }
2349 /* after here, we may be blocked.
we need to get refcnt */ 2350 if (!css_tryget(&memcg->css)) { 2351 rcu_read_unlock(); 2352 goto again; 2353 } 2354 rcu_read_unlock(); 2355 } 2356 2357 do { 2358 bool oom_check; 2359 2360 /* If killed, bypass charge */ 2361 if (fatal_signal_pending(current)) { 2362 css_put(&memcg->css); 2363 goto bypass; 2364 } 2365 2366 oom_check = false; 2367 if (oom && !nr_oom_retries) { 2368 oom_check = true; 2369 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2370 } 2371 2372 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); 2373 switch (ret) { 2374 case CHARGE_OK: 2375 break; 2376 case CHARGE_RETRY: /* not in OOM situation but retry */ 2377 batch = nr_pages; 2378 css_put(&memcg->css); 2379 memcg = NULL; 2380 goto again; 2381 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2382 css_put(&memcg->css); 2383 goto nomem; 2384 case CHARGE_NOMEM: /* OOM routine works */ 2385 if (!oom) { 2386 css_put(&memcg->css); 2387 goto nomem; 2388 } 2389 /* If oom, we never return -ENOMEM */ 2390 nr_oom_retries--; 2391 break; 2392 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2393 css_put(&memcg->css); 2394 goto bypass; 2395 } 2396 } while (ret != CHARGE_OK); 2397 2398 if (batch > nr_pages) 2399 refill_stock(memcg, batch - nr_pages); 2400 css_put(&memcg->css); 2401 done: 2402 *ptr = memcg; 2403 return 0; 2404 nomem: 2405 *ptr = NULL; 2406 return -ENOMEM; 2407 bypass: 2408 *ptr = root_mem_cgroup; 2409 return -EINTR; 2410 } 2411 2412 /* 2413 * Somemtimes we have to undo a charge we got by try_charge(). 2414 * This function is for that and do uncharge, put css's refcnt. 2415 * gotten by try_charge(). 2416 */ 2417 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2418 unsigned int nr_pages) 2419 { 2420 if (!mem_cgroup_is_root(memcg)) { 2421 unsigned long bytes = nr_pages * PAGE_SIZE; 2422 2423 res_counter_uncharge(&memcg->res, bytes); 2424 if (do_swap_account) 2425 res_counter_uncharge(&memcg->memsw, bytes); 2426 } 2427 } 2428 2429 /* 2430 * A helper function to get mem_cgroup from ID. must be called under 2431 * rcu_read_lock(). The caller must check css_is_removed() or some if 2432 * it's concern. (dropping refcnt from swap can be called against removed 2433 * memcg.) 
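 *
 * Typical usage, mirroring the swap paths in this file (sketch):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_lookup(id);
 *	if (memcg && !css_tryget(&memcg->css))
 *		memcg = NULL;	- the cgroup is being removed
 *	rcu_read_unlock();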
2434 */ 2435 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2436 { 2437 struct cgroup_subsys_state *css; 2438 2439 /* ID 0 is unused ID */ 2440 if (!id) 2441 return NULL; 2442 css = css_lookup(&mem_cgroup_subsys, id); 2443 if (!css) 2444 return NULL; 2445 return container_of(css, struct mem_cgroup, css); 2446 } 2447 2448 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2449 { 2450 struct mem_cgroup *memcg = NULL; 2451 struct page_cgroup *pc; 2452 unsigned short id; 2453 swp_entry_t ent; 2454 2455 VM_BUG_ON(!PageLocked(page)); 2456 2457 pc = lookup_page_cgroup(page); 2458 lock_page_cgroup(pc); 2459 if (PageCgroupUsed(pc)) { 2460 memcg = pc->mem_cgroup; 2461 if (memcg && !css_tryget(&memcg->css)) 2462 memcg = NULL; 2463 } else if (PageSwapCache(page)) { 2464 ent.val = page_private(page); 2465 id = lookup_swap_cgroup_id(ent); 2466 rcu_read_lock(); 2467 memcg = mem_cgroup_lookup(id); 2468 if (memcg && !css_tryget(&memcg->css)) 2469 memcg = NULL; 2470 rcu_read_unlock(); 2471 } 2472 unlock_page_cgroup(pc); 2473 return memcg; 2474 } 2475 2476 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2477 struct page *page, 2478 unsigned int nr_pages, 2479 struct page_cgroup *pc, 2480 enum charge_type ctype, 2481 bool lrucare) 2482 { 2483 struct zone *uninitialized_var(zone); 2484 bool was_on_lru = false; 2485 bool anon; 2486 2487 lock_page_cgroup(pc); 2488 if (unlikely(PageCgroupUsed(pc))) { 2489 unlock_page_cgroup(pc); 2490 __mem_cgroup_cancel_charge(memcg, nr_pages); 2491 return; 2492 } 2493 /* 2494 * we don't need page_cgroup_lock about tail pages, becase they are not 2495 * accessed by any other context at this point. 2496 */ 2497 2498 /* 2499 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2500 * may already be on some other mem_cgroup's LRU. Take care of it. 2501 */ 2502 if (lrucare) { 2503 zone = page_zone(page); 2504 spin_lock_irq(&zone->lru_lock); 2505 if (PageLRU(page)) { 2506 ClearPageLRU(page); 2507 del_page_from_lru_list(zone, page, page_lru(page)); 2508 was_on_lru = true; 2509 } 2510 } 2511 2512 pc->mem_cgroup = memcg; 2513 /* 2514 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2515 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2516 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2517 * before USED bit, we need memory barrier here. 2518 * See mem_cgroup_add_lru_list(), etc. 2519 */ 2520 smp_wmb(); 2521 SetPageCgroupUsed(pc); 2522 2523 if (lrucare) { 2524 if (was_on_lru) { 2525 VM_BUG_ON(PageLRU(page)); 2526 SetPageLRU(page); 2527 add_page_to_lru_list(zone, page, page_lru(page)); 2528 } 2529 spin_unlock_irq(&zone->lru_lock); 2530 } 2531 2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2533 anon = true; 2534 else 2535 anon = false; 2536 2537 mem_cgroup_charge_statistics(memcg, anon, nr_pages); 2538 unlock_page_cgroup(pc); 2539 2540 /* 2541 * "charge_statistics" updated event counter. Then, check it. 2542 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2543 * if they exceeds softlimit. 2544 */ 2545 memcg_check_events(memcg, page); 2546 } 2547 2548 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2549 2550 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) 2551 /* 2552 * Because tail pages are not marked as "used", set it. We're under 2553 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2554 * charge/uncharge will be never happen and move_account() is done under 2555 * compound_lock(), so we don't have to take care of races. 
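 *
 * In short: after a huge page is split, each of the HPAGE_PMD_NR - 1 tail
 * page_cgroups below simply inherits head_pc->mem_cgroup, while the bits
 * that must not be copied to the tails (PCG_LOCK, PCG_MIGRATION) are
 * masked out via PCGF_NOCOPY_AT_SPLIT.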
2556 */ 2557 void mem_cgroup_split_huge_fixup(struct page *head) 2558 { 2559 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2560 struct page_cgroup *pc; 2561 int i; 2562 2563 if (mem_cgroup_disabled()) 2564 return; 2565 for (i = 1; i < HPAGE_PMD_NR; i++) { 2566 pc = head_pc + i; 2567 pc->mem_cgroup = head_pc->mem_cgroup; 2568 smp_wmb();/* see __commit_charge() */ 2569 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2570 } 2571 } 2572 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2573 2574 /** 2575 * mem_cgroup_move_account - move account of the page 2576 * @page: the page 2577 * @nr_pages: number of regular pages (>1 for huge pages) 2578 * @pc: page_cgroup of the page. 2579 * @from: mem_cgroup which the page is moved from. 2580 * @to: mem_cgroup which the page is moved to. @from != @to. 2581 * @uncharge: whether we should call uncharge and css_put against @from. 2582 * 2583 * The caller must confirm following. 2584 * - page is not on LRU (isolate_page() is useful.) 2585 * - compound_lock is held when nr_pages > 1 2586 * 2587 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2588 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is 2589 * true, this function does "uncharge" from old cgroup, but it doesn't if 2590 * @uncharge is false, so a caller should do "uncharge". 2591 */ 2592 static int mem_cgroup_move_account(struct page *page, 2593 unsigned int nr_pages, 2594 struct page_cgroup *pc, 2595 struct mem_cgroup *from, 2596 struct mem_cgroup *to, 2597 bool uncharge) 2598 { 2599 unsigned long flags; 2600 int ret; 2601 bool anon = PageAnon(page); 2602 2603 VM_BUG_ON(from == to); 2604 VM_BUG_ON(PageLRU(page)); 2605 /* 2606 * The page is isolated from LRU. So, collapse function 2607 * will not handle this page. But page splitting can happen. 2608 * Do this check under compound_page_lock(). The caller should 2609 * hold it. 2610 */ 2611 ret = -EBUSY; 2612 if (nr_pages > 1 && !PageTransHuge(page)) 2613 goto out; 2614 2615 lock_page_cgroup(pc); 2616 2617 ret = -EINVAL; 2618 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2619 goto unlock; 2620 2621 move_lock_mem_cgroup(from, &flags); 2622 2623 if (!anon && page_mapped(page)) { 2624 /* Update mapped_file data for mem_cgroup */ 2625 preempt_disable(); 2626 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2627 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2628 preempt_enable(); 2629 } 2630 mem_cgroup_charge_statistics(from, anon, -nr_pages); 2631 if (uncharge) 2632 /* This is not "cancel", but cancel_charge does all we need. */ 2633 __mem_cgroup_cancel_charge(from, nr_pages); 2634 2635 /* caller should have done css_get */ 2636 pc->mem_cgroup = to; 2637 mem_cgroup_charge_statistics(to, anon, nr_pages); 2638 /* 2639 * We charges against "to" which may not have any tasks. Then, "to" 2640 * can be under rmdir(). But in current implementation, caller of 2641 * this function is just force_empty() and move charge, so it's 2642 * guaranteed that "to" is never removed. So, we don't check rmdir 2643 * status here. 2644 */ 2645 move_unlock_mem_cgroup(from, &flags); 2646 ret = 0; 2647 unlock: 2648 unlock_page_cgroup(pc); 2649 /* 2650 * check events 2651 */ 2652 memcg_check_events(to, page); 2653 memcg_check_events(from, page); 2654 out: 2655 return ret; 2656 } 2657 2658 /* 2659 * move charges to its parent. 
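 *
 * Outline of mem_cgroup_move_parent() below (descriptive summary):
 *	1. pin the page (get_page_unless_zero) and isolate it from the LRU,
 *	2. pre-charge the parent with __mem_cgroup_try_charge(),
 *	3. under compound_lock for huge pages, mem_cgroup_move_account()
 *	   re-points pc->mem_cgroup from the child to the parent and, with
 *	   uncharge == true, drops the child's charge,
 *	4. on failure the parent's pre-charge is cancelled; finally the page
 *	   is put back on the LRU and released.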
2660 */ 2661 2662 static int mem_cgroup_move_parent(struct page *page, 2663 struct page_cgroup *pc, 2664 struct mem_cgroup *child, 2665 gfp_t gfp_mask) 2666 { 2667 struct cgroup *cg = child->css.cgroup; 2668 struct cgroup *pcg = cg->parent; 2669 struct mem_cgroup *parent; 2670 unsigned int nr_pages; 2671 unsigned long uninitialized_var(flags); 2672 int ret; 2673 2674 /* Is ROOT ? */ 2675 if (!pcg) 2676 return -EINVAL; 2677 2678 ret = -EBUSY; 2679 if (!get_page_unless_zero(page)) 2680 goto out; 2681 if (isolate_lru_page(page)) 2682 goto put; 2683 2684 nr_pages = hpage_nr_pages(page); 2685 2686 parent = mem_cgroup_from_cont(pcg); 2687 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2688 if (ret) 2689 goto put_back; 2690 2691 if (nr_pages > 1) 2692 flags = compound_lock_irqsave(page); 2693 2694 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2695 if (ret) 2696 __mem_cgroup_cancel_charge(parent, nr_pages); 2697 2698 if (nr_pages > 1) 2699 compound_unlock_irqrestore(page, flags); 2700 put_back: 2701 putback_lru_page(page); 2702 put: 2703 put_page(page); 2704 out: 2705 return ret; 2706 } 2707 2708 /* 2709 * Charge the memory controller for page usage. 2710 * Return 2711 * 0 if the charge was successful 2712 * < 0 if the cgroup is over its limit 2713 */ 2714 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2715 gfp_t gfp_mask, enum charge_type ctype) 2716 { 2717 struct mem_cgroup *memcg = NULL; 2718 unsigned int nr_pages = 1; 2719 struct page_cgroup *pc; 2720 bool oom = true; 2721 int ret; 2722 2723 if (PageTransHuge(page)) { 2724 nr_pages <<= compound_order(page); 2725 VM_BUG_ON(!PageTransHuge(page)); 2726 /* 2727 * Never OOM-kill a process for a huge page. The 2728 * fault handler will fall back to regular pages. 2729 */ 2730 oom = false; 2731 } 2732 2733 pc = lookup_page_cgroup(page); 2734 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2735 if (ret == -ENOMEM) 2736 return ret; 2737 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); 2738 return 0; 2739 } 2740 2741 int mem_cgroup_newpage_charge(struct page *page, 2742 struct mm_struct *mm, gfp_t gfp_mask) 2743 { 2744 if (mem_cgroup_disabled()) 2745 return 0; 2746 VM_BUG_ON(page_mapped(page)); 2747 VM_BUG_ON(page->mapping && !PageAnon(page)); 2748 VM_BUG_ON(!mm); 2749 return mem_cgroup_charge_common(page, mm, gfp_mask, 2750 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2751 } 2752 2753 static void 2754 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2755 enum charge_type ctype); 2756 2757 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2758 gfp_t gfp_mask) 2759 { 2760 struct mem_cgroup *memcg = NULL; 2761 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 2762 int ret; 2763 2764 if (mem_cgroup_disabled()) 2765 return 0; 2766 if (PageCompound(page)) 2767 return 0; 2768 2769 if (unlikely(!mm)) 2770 mm = &init_mm; 2771 if (!page_is_file_cache(page)) 2772 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2773 2774 if (!PageSwapCache(page)) 2775 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); 2776 else { /* page is swapcache/shmem */ 2777 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); 2778 if (!ret) 2779 __mem_cgroup_commit_charge_swapin(page, memcg, type); 2780 } 2781 return ret; 2782 } 2783 2784 /* 2785 * While swap-in, try_charge -> commit or cancel, the page is locked. 
2786 * And when try_charge() successfully returns, one refcnt to memcg without 2787 * struct page_cgroup is acquired. This refcnt will be consumed by 2788 * "commit()" or removed by "cancel()" 2789 */ 2790 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2791 struct page *page, 2792 gfp_t mask, struct mem_cgroup **memcgp) 2793 { 2794 struct mem_cgroup *memcg; 2795 int ret; 2796 2797 *memcgp = NULL; 2798 2799 if (mem_cgroup_disabled()) 2800 return 0; 2801 2802 if (!do_swap_account) 2803 goto charge_cur_mm; 2804 /* 2805 * A racing thread's fault, or swapoff, may have already updated 2806 * the pte, and even removed page from swap cache: in those cases 2807 * do_swap_page()'s pte_same() test will fail; but there's also a 2808 * KSM case which does need to charge the page. 2809 */ 2810 if (!PageSwapCache(page)) 2811 goto charge_cur_mm; 2812 memcg = try_get_mem_cgroup_from_page(page); 2813 if (!memcg) 2814 goto charge_cur_mm; 2815 *memcgp = memcg; 2816 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); 2817 css_put(&memcg->css); 2818 if (ret == -EINTR) 2819 ret = 0; 2820 return ret; 2821 charge_cur_mm: 2822 if (unlikely(!mm)) 2823 mm = &init_mm; 2824 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2825 if (ret == -EINTR) 2826 ret = 0; 2827 return ret; 2828 } 2829 2830 static void 2831 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2832 enum charge_type ctype) 2833 { 2834 struct page_cgroup *pc; 2835 2836 if (mem_cgroup_disabled()) 2837 return; 2838 if (!memcg) 2839 return; 2840 cgroup_exclude_rmdir(&memcg->css); 2841 2842 pc = lookup_page_cgroup(page); 2843 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true); 2844 /* 2845 * Now swap is on-memory. This means this page may be 2846 * counted both as mem and swap....double count. 2847 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2848 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2849 * may call delete_from_swap_cache() before reach here. 2850 */ 2851 if (do_swap_account && PageSwapCache(page)) { 2852 swp_entry_t ent = {.val = page_private(page)}; 2853 struct mem_cgroup *swap_memcg; 2854 unsigned short id; 2855 2856 id = swap_cgroup_record(ent, 0); 2857 rcu_read_lock(); 2858 swap_memcg = mem_cgroup_lookup(id); 2859 if (swap_memcg) { 2860 /* 2861 * This recorded memcg can be obsolete one. So, avoid 2862 * calling css_tryget 2863 */ 2864 if (!mem_cgroup_is_root(swap_memcg)) 2865 res_counter_uncharge(&swap_memcg->memsw, 2866 PAGE_SIZE); 2867 mem_cgroup_swap_statistics(swap_memcg, false); 2868 mem_cgroup_put(swap_memcg); 2869 } 2870 rcu_read_unlock(); 2871 } 2872 /* 2873 * At swapin, we may charge account against cgroup which has no tasks. 2874 * So, rmdir()->pre_destroy() can be called while we do this charge. 2875 * In that case, we need to call pre_destroy() again. check it here. 
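 *
 * For reference, the commit/cancel helpers around this function pair with
 * mem_cgroup_try_charge_swapin(); a swap-in caller is expected to do roughly
 * the following (illustrative sketch, do_swap_page() being the main user):
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg))
 *		goto out_fail;
 *	...map the page...
 *	mem_cgroup_commit_charge_swapin(page, memcg);	- on success
 *	...or on failure to map...
 *	mem_cgroup_cancel_charge_swapin(memcg);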
2876 */
2877 cgroup_release_and_wakeup_rmdir(&memcg->css);
2878 }
2879
2880 void mem_cgroup_commit_charge_swapin(struct page *page,
2881 struct mem_cgroup *memcg)
2882 {
2883 __mem_cgroup_commit_charge_swapin(page, memcg,
2884 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2885 }
2886
2887 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2888 {
2889 if (mem_cgroup_disabled())
2890 return;
2891 if (!memcg)
2892 return;
2893 __mem_cgroup_cancel_charge(memcg, 1);
2894 }
2895
2896 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
2897 unsigned int nr_pages,
2898 const enum charge_type ctype)
2899 {
2900 struct memcg_batch_info *batch = NULL;
2901 bool uncharge_memsw = true;
2902
2903 /* If swapout, usage of swap doesn't decrease */
2904 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2905 uncharge_memsw = false;
2906
2907 batch = &current->memcg_batch;
2908 /*
2909 * Usually, we do css_get() when we remember a memcg pointer.
2910 * But in this case, we keep res->usage until the end of a series of
2911 * uncharges. Then, it's OK to ignore the memcg's refcnt.
2912 */
2913 if (!batch->memcg)
2914 batch->memcg = memcg;
2915 /*
2916 * do_batch > 0 when unmapping pages or invalidating/truncating an inode.
2917 * In those cases, pages are freed continuously and can be expected to be
2918 * in the same cgroup, so we have a chance to coalesce uncharges.
2919 * But we uncharge one by one if the task was killed by OOM (TIF_MEMDIE)
2920 * because we want the uncharge to happen as soon as possible.
2921 */
2922
2923 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2924 goto direct_uncharge;
2925
2926 if (nr_pages > 1)
2927 goto direct_uncharge;
2928
2929 /*
2930 * In the typical case, batch->memcg == memcg. This means we can
2931 * merge a series of uncharges into a single res_counter uncharge.
2932 * If not, we uncharge the res_counter one by one.
2933 */
2934 if (batch->memcg != memcg)
2935 goto direct_uncharge;
2936 /* remember freed charge and uncharge it later */
2937 batch->nr_pages++;
2938 if (uncharge_memsw)
2939 batch->memsw_nr_pages++;
2940 return;
2941 direct_uncharge:
2942 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
2943 if (uncharge_memsw)
2944 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2945 if (unlikely(batch->memcg != memcg))
2946 memcg_oom_recover(memcg);
2947 }
2948
2949 /*
2950 * uncharge if !page_mapped(page)
2951 */
2952 static struct mem_cgroup *
2953 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2954 {
2955 struct mem_cgroup *memcg = NULL;
2956 unsigned int nr_pages = 1;
2957 struct page_cgroup *pc;
2958 bool anon;
2959
2960 if (mem_cgroup_disabled())
2961 return NULL;
2962
2963 if (PageSwapCache(page))
2964 return NULL;
2965
2966 if (PageTransHuge(page)) {
2967 nr_pages <<= compound_order(page);
2968 VM_BUG_ON(!PageTransHuge(page));
2969 }
2970 /*
2971 * Check if our page_cgroup is valid
2972 */
2973 pc = lookup_page_cgroup(page);
2974 if (unlikely(!PageCgroupUsed(pc)))
2975 return NULL;
2976
2977 lock_page_cgroup(pc);
2978
2979 memcg = pc->mem_cgroup;
2980
2981 if (!PageCgroupUsed(pc))
2982 goto unlock_out;
2983
2984 anon = PageAnon(page);
2985
2986 switch (ctype) {
2987 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2988 /*
2989 * Generally PageAnon tells whether it's the anon statistics to be
2990 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
2991 * used before the page has reached the stage of being marked PageAnon.
2992 */
2993 anon = true;
2994 /* fallthrough */
2995 case MEM_CGROUP_CHARGE_TYPE_DROP:
2996 /* See mem_cgroup_prepare_migration() */
2997 if (page_mapped(page) || PageCgroupMigration(pc))
2998 goto unlock_out;
2999 break;
3000 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3001 if (!PageAnon(page)) { /* Shared memory */
3002 if (page->mapping && !page_is_file_cache(page))
3003 goto unlock_out;
3004 } else if (page_mapped(page)) /* Anon */
3005 goto unlock_out;
3006 break;
3007 default:
3008 break;
3009 }
3010
3011 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
3012
3013 ClearPageCgroupUsed(pc);
3014 /*
3015 * pc->mem_cgroup is not cleared here. It will be accessed when the page
3016 * is freed from the LRU. This is safe because an uncharged page is
3017 * expected not to be reused (it is freed soon). The exception is
3018 * SwapCache, which is handled by special functions.
3019 */
3020
3021 unlock_page_cgroup(pc);
3022 /*
3023 * even after unlock, we have memcg->res.usage here and this memcg
3024 * will never be freed.
3025 */
3026 memcg_check_events(memcg, page);
3027 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3028 mem_cgroup_swap_statistics(memcg, true);
3029 mem_cgroup_get(memcg);
3030 }
3031 if (!mem_cgroup_is_root(memcg))
3032 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3033
3034 return memcg;
3035
3036 unlock_out:
3037 unlock_page_cgroup(pc);
3038 return NULL;
3039 }
3040
3041 void mem_cgroup_uncharge_page(struct page *page)
3042 {
3043 /* early check. */
3044 if (page_mapped(page))
3045 return;
3046 VM_BUG_ON(page->mapping && !PageAnon(page));
3047 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3048 }
3049
3050 void mem_cgroup_uncharge_cache_page(struct page *page)
3051 {
3052 VM_BUG_ON(page_mapped(page));
3053 VM_BUG_ON(page->mapping);
3054 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
3055 }
3056
3057 /*
3058 * uncharge_start/uncharge_end is called from unmap_page_range and from
3059 * inode invalidate/truncate. In those cases, pages are freed continuously
3060 * and we can expect them to be in the same memcg. Each of those callers
3061 * itself limits the number of pages freed at once, so uncharge_start/end()
3062 * is called properly. This may be nested (two) times in one context.
3063 */
3064
3065 void mem_cgroup_uncharge_start(void)
3066 {
3067 current->memcg_batch.do_batch++;
3068 /* We can nest. */
3069 if (current->memcg_batch.do_batch == 1) {
3070 current->memcg_batch.memcg = NULL;
3071 current->memcg_batch.nr_pages = 0;
3072 current->memcg_batch.memsw_nr_pages = 0;
3073 }
3074 }
3075
3076 void mem_cgroup_uncharge_end(void)
3077 {
3078 struct memcg_batch_info *batch = &current->memcg_batch;
3079
3080 if (!batch->do_batch)
3081 return;
3082
3083 batch->do_batch--;
3084 if (batch->do_batch) /* If stacked, do nothing. */
3085 return;
3086
3087 if (!batch->memcg)
3088 return;
3089 /*
3090 * This "batch->memcg" is valid without any css_get/put etc...
3091 * because we hide charges behind us.
3092 */
3093 if (batch->nr_pages)
3094 res_counter_uncharge(&batch->memcg->res,
3095 batch->nr_pages * PAGE_SIZE);
3096 if (batch->memsw_nr_pages)
3097 res_counter_uncharge(&batch->memcg->memsw,
3098 batch->memsw_nr_pages * PAGE_SIZE);
3099 memcg_oom_recover(batch->memcg);
3100 /* forget this pointer (for sanity check) */
3101 batch->memcg = NULL;
3102 }
3103
3104 #ifdef CONFIG_SWAP
3105 /*
3106 * called after __delete_from_swap_cache() and drops the "page" account.
3107 * memcg information is recorded to swap_cgroup of "ent" 3108 */ 3109 void 3110 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 3111 { 3112 struct mem_cgroup *memcg; 3113 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 3114 3115 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3116 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3117 3118 memcg = __mem_cgroup_uncharge_common(page, ctype); 3119 3120 /* 3121 * record memcg information, if swapout && memcg != NULL, 3122 * mem_cgroup_get() was called in uncharge(). 3123 */ 3124 if (do_swap_account && swapout && memcg) 3125 swap_cgroup_record(ent, css_id(&memcg->css)); 3126 } 3127 #endif 3128 3129 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3130 /* 3131 * called from swap_entry_free(). remove record in swap_cgroup and 3132 * uncharge "memsw" account. 3133 */ 3134 void mem_cgroup_uncharge_swap(swp_entry_t ent) 3135 { 3136 struct mem_cgroup *memcg; 3137 unsigned short id; 3138 3139 if (!do_swap_account) 3140 return; 3141 3142 id = swap_cgroup_record(ent, 0); 3143 rcu_read_lock(); 3144 memcg = mem_cgroup_lookup(id); 3145 if (memcg) { 3146 /* 3147 * We uncharge this because swap is freed. 3148 * This memcg can be obsolete one. We avoid calling css_tryget 3149 */ 3150 if (!mem_cgroup_is_root(memcg)) 3151 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3152 mem_cgroup_swap_statistics(memcg, false); 3153 mem_cgroup_put(memcg); 3154 } 3155 rcu_read_unlock(); 3156 } 3157 3158 /** 3159 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3160 * @entry: swap entry to be moved 3161 * @from: mem_cgroup which the entry is moved from 3162 * @to: mem_cgroup which the entry is moved to 3163 * @need_fixup: whether we should fixup res_counters and refcounts. 3164 * 3165 * It succeeds only when the swap_cgroup's record for this entry is the same 3166 * as the mem_cgroup's id of @from. 3167 * 3168 * Returns 0 on success, -EINVAL on failure. 3169 * 3170 * The caller must have charged to @to, IOW, called res_counter_charge() about 3171 * both res and memsw, and called css_get(). 3172 */ 3173 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3174 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3175 { 3176 unsigned short old_id, new_id; 3177 3178 old_id = css_id(&from->css); 3179 new_id = css_id(&to->css); 3180 3181 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3182 mem_cgroup_swap_statistics(from, false); 3183 mem_cgroup_swap_statistics(to, true); 3184 /* 3185 * This function is only called from task migration context now. 3186 * It postpones res_counter and refcount handling till the end 3187 * of task migration(mem_cgroup_clear_mc()) for performance 3188 * improvement. But we cannot postpone mem_cgroup_get(to) 3189 * because if the process that has been moved to @to does 3190 * swap-in, the refcount of @to might be decreased to 0. 3191 */ 3192 mem_cgroup_get(to); 3193 if (need_fixup) { 3194 if (!mem_cgroup_is_root(from)) 3195 res_counter_uncharge(&from->memsw, PAGE_SIZE); 3196 mem_cgroup_put(from); 3197 /* 3198 * we charged both to->res and to->memsw, so we should 3199 * uncharge to->res. 
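 *
 * Bookkeeping example (descriptive): the caller pre-charged @to for both
 * res and memsw, one page each. Once the swap_cgroup record points to @to,
 * only the memsw half is still needed for the swapped-out entry, so with
 * @need_fixup we give back PAGE_SIZE of from->memsw (the entry no longer
 * belongs to @from) and PAGE_SIZE of to->res (charged but unused), leaving
 * exactly one memsw charge on @to for the entry.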
3200 */ 3201 if (!mem_cgroup_is_root(to)) 3202 res_counter_uncharge(&to->res, PAGE_SIZE); 3203 } 3204 return 0; 3205 } 3206 return -EINVAL; 3207 } 3208 #else 3209 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3210 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3211 { 3212 return -EINVAL; 3213 } 3214 #endif 3215 3216 /* 3217 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3218 * page belongs to. 3219 */ 3220 int mem_cgroup_prepare_migration(struct page *page, 3221 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3222 { 3223 struct mem_cgroup *memcg = NULL; 3224 struct page_cgroup *pc; 3225 enum charge_type ctype; 3226 int ret = 0; 3227 3228 *memcgp = NULL; 3229 3230 VM_BUG_ON(PageTransHuge(page)); 3231 if (mem_cgroup_disabled()) 3232 return 0; 3233 3234 pc = lookup_page_cgroup(page); 3235 lock_page_cgroup(pc); 3236 if (PageCgroupUsed(pc)) { 3237 memcg = pc->mem_cgroup; 3238 css_get(&memcg->css); 3239 /* 3240 * At migrating an anonymous page, its mapcount goes down 3241 * to 0 and uncharge() will be called. But, even if it's fully 3242 * unmapped, migration may fail and this page has to be 3243 * charged again. We set MIGRATION flag here and delay uncharge 3244 * until end_migration() is called 3245 * 3246 * Corner Case Thinking 3247 * A) 3248 * When the old page was mapped as Anon and it's unmap-and-freed 3249 * while migration was ongoing. 3250 * If unmap finds the old page, uncharge() of it will be delayed 3251 * until end_migration(). If unmap finds a new page, it's 3252 * uncharged when it make mapcount to be 1->0. If unmap code 3253 * finds swap_migration_entry, the new page will not be mapped 3254 * and end_migration() will find it(mapcount==0). 3255 * 3256 * B) 3257 * When the old page was mapped but migraion fails, the kernel 3258 * remaps it. A charge for it is kept by MIGRATION flag even 3259 * if mapcount goes down to 0. We can do remap successfully 3260 * without charging it again. 3261 * 3262 * C) 3263 * The "old" page is under lock_page() until the end of 3264 * migration, so, the old page itself will not be swapped-out. 3265 * If the new page is swapped out before end_migraton, our 3266 * hook to usual swap-out path will catch the event. 3267 */ 3268 if (PageAnon(page)) 3269 SetPageCgroupMigration(pc); 3270 } 3271 unlock_page_cgroup(pc); 3272 /* 3273 * If the page is not charged at this point, 3274 * we return here. 3275 */ 3276 if (!memcg) 3277 return 0; 3278 3279 *memcgp = memcg; 3280 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); 3281 css_put(&memcg->css);/* drop extra refcnt */ 3282 if (ret) { 3283 if (PageAnon(page)) { 3284 lock_page_cgroup(pc); 3285 ClearPageCgroupMigration(pc); 3286 unlock_page_cgroup(pc); 3287 /* 3288 * The old page may be fully unmapped while we kept it. 3289 */ 3290 mem_cgroup_uncharge_page(page); 3291 } 3292 /* we'll need to revisit this error code (we have -EINTR) */ 3293 return -ENOMEM; 3294 } 3295 /* 3296 * We charge new page before it's used/mapped. So, even if unlock_page() 3297 * is called before end_migration, we can catch all events on this new 3298 * page. In the case new page is migrated but not remapped, new page's 3299 * mapcount will be finally 0 and we call uncharge in end_migration(). 
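 *
 * Expected calling sequence for the migration pair (illustrative sketch;
 * the migration code in mm/migrate.c is assumed to be the caller):
 *
 *	ret = mem_cgroup_prepare_migration(page, newpage, &memcg, gfp_mask);
 *	if (ret)
 *		- give up migrating this page
 *	...attempt to migrate page -> newpage...
 *	mem_cgroup_end_migration(memcg, page, newpage, migration_ok);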
3300 */ 3301 pc = lookup_page_cgroup(newpage); 3302 if (PageAnon(page)) 3303 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3304 else if (page_is_file_cache(page)) 3305 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3306 else 3307 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3308 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); 3309 return ret; 3310 } 3311 3312 /* remove redundant charge if migration failed*/ 3313 void mem_cgroup_end_migration(struct mem_cgroup *memcg, 3314 struct page *oldpage, struct page *newpage, bool migration_ok) 3315 { 3316 struct page *used, *unused; 3317 struct page_cgroup *pc; 3318 bool anon; 3319 3320 if (!memcg) 3321 return; 3322 /* blocks rmdir() */ 3323 cgroup_exclude_rmdir(&memcg->css); 3324 if (!migration_ok) { 3325 used = oldpage; 3326 unused = newpage; 3327 } else { 3328 used = newpage; 3329 unused = oldpage; 3330 } 3331 /* 3332 * We disallowed uncharge of pages under migration because mapcount 3333 * of the page goes down to zero, temporarly. 3334 * Clear the flag and check the page should be charged. 3335 */ 3336 pc = lookup_page_cgroup(oldpage); 3337 lock_page_cgroup(pc); 3338 ClearPageCgroupMigration(pc); 3339 unlock_page_cgroup(pc); 3340 anon = PageAnon(used); 3341 __mem_cgroup_uncharge_common(unused, 3342 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED 3343 : MEM_CGROUP_CHARGE_TYPE_CACHE); 3344 3345 /* 3346 * If a page is a file cache, radix-tree replacement is very atomic 3347 * and we can skip this check. When it was an Anon page, its mapcount 3348 * goes down to 0. But because we added MIGRATION flage, it's not 3349 * uncharged yet. There are several case but page->mapcount check 3350 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3351 * check. (see prepare_charge() also) 3352 */ 3353 if (anon) 3354 mem_cgroup_uncharge_page(used); 3355 /* 3356 * At migration, we may charge account against cgroup which has no 3357 * tasks. 3358 * So, rmdir()->pre_destroy() can be called while we do this charge. 3359 * In that case, we need to call pre_destroy() again. check it here. 3360 */ 3361 cgroup_release_and_wakeup_rmdir(&memcg->css); 3362 } 3363 3364 /* 3365 * At replace page cache, newpage is not under any memcg but it's on 3366 * LRU. So, this function doesn't touch res_counter but handles LRU 3367 * in correct way. Both pages are locked so we cannot race with uncharge. 3368 */ 3369 void mem_cgroup_replace_page_cache(struct page *oldpage, 3370 struct page *newpage) 3371 { 3372 struct mem_cgroup *memcg; 3373 struct page_cgroup *pc; 3374 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3375 3376 if (mem_cgroup_disabled()) 3377 return; 3378 3379 pc = lookup_page_cgroup(oldpage); 3380 /* fix accounting on old pages */ 3381 lock_page_cgroup(pc); 3382 memcg = pc->mem_cgroup; 3383 mem_cgroup_charge_statistics(memcg, false, -1); 3384 ClearPageCgroupUsed(pc); 3385 unlock_page_cgroup(pc); 3386 3387 if (PageSwapBacked(oldpage)) 3388 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3389 3390 /* 3391 * Even if newpage->mapping was NULL before starting replacement, 3392 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3393 * LRU while we overwrite pc->mem_cgroup. 3394 */ 3395 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); 3396 } 3397 3398 #ifdef CONFIG_DEBUG_VM 3399 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3400 { 3401 struct page_cgroup *pc; 3402 3403 pc = lookup_page_cgroup(page); 3404 /* 3405 * Can be NULL while feeding pages into the page allocator for 3406 * the first time, i.e. 
during boot or memory hotplug; 3407 * or when mem_cgroup_disabled(). 3408 */ 3409 if (likely(pc) && PageCgroupUsed(pc)) 3410 return pc; 3411 return NULL; 3412 } 3413 3414 bool mem_cgroup_bad_page_check(struct page *page) 3415 { 3416 if (mem_cgroup_disabled()) 3417 return false; 3418 3419 return lookup_page_cgroup_used(page) != NULL; 3420 } 3421 3422 void mem_cgroup_print_bad_page(struct page *page) 3423 { 3424 struct page_cgroup *pc; 3425 3426 pc = lookup_page_cgroup_used(page); 3427 if (pc) { 3428 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 3429 pc, pc->flags, pc->mem_cgroup); 3430 } 3431 } 3432 #endif 3433 3434 static DEFINE_MUTEX(set_limit_mutex); 3435 3436 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3437 unsigned long long val) 3438 { 3439 int retry_count; 3440 u64 memswlimit, memlimit; 3441 int ret = 0; 3442 int children = mem_cgroup_count_children(memcg); 3443 u64 curusage, oldusage; 3444 int enlarge; 3445 3446 /* 3447 * For keeping hierarchical_reclaim simple, how long we should retry 3448 * is depends on callers. We set our retry-count to be function 3449 * of # of children which we should visit in this loop. 3450 */ 3451 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3452 3453 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3454 3455 enlarge = 0; 3456 while (retry_count) { 3457 if (signal_pending(current)) { 3458 ret = -EINTR; 3459 break; 3460 } 3461 /* 3462 * Rather than hide all in some function, I do this in 3463 * open coded manner. You see what this really does. 3464 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3465 */ 3466 mutex_lock(&set_limit_mutex); 3467 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3468 if (memswlimit < val) { 3469 ret = -EINVAL; 3470 mutex_unlock(&set_limit_mutex); 3471 break; 3472 } 3473 3474 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3475 if (memlimit < val) 3476 enlarge = 1; 3477 3478 ret = res_counter_set_limit(&memcg->res, val); 3479 if (!ret) { 3480 if (memswlimit == val) 3481 memcg->memsw_is_minimum = true; 3482 else 3483 memcg->memsw_is_minimum = false; 3484 } 3485 mutex_unlock(&set_limit_mutex); 3486 3487 if (!ret) 3488 break; 3489 3490 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3491 MEM_CGROUP_RECLAIM_SHRINK); 3492 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3493 /* Usage is reduced ? */ 3494 if (curusage >= oldusage) 3495 retry_count--; 3496 else 3497 oldusage = curusage; 3498 } 3499 if (!ret && enlarge) 3500 memcg_oom_recover(memcg); 3501 3502 return ret; 3503 } 3504 3505 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3506 unsigned long long val) 3507 { 3508 int retry_count; 3509 u64 memlimit, memswlimit, oldusage, curusage; 3510 int children = mem_cgroup_count_children(memcg); 3511 int ret = -EBUSY; 3512 int enlarge = 0; 3513 3514 /* see mem_cgroup_resize_res_limit */ 3515 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3516 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3517 while (retry_count) { 3518 if (signal_pending(current)) { 3519 ret = -EINTR; 3520 break; 3521 } 3522 /* 3523 * Rather than hide all in some function, I do this in 3524 * open coded manner. You see what this really does. 3525 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 
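 *
 * Example of the invariant with illustrative numbers: with
 * memory.limit_in_bytes = 100M and memory.memsw.limit_in_bytes = 120M,
 * lowering the memsw limit below 100M is rejected here with -EINVAL; the
 * memory limit has to be shrunk first. mem_cgroup_resize_limit() above
 * enforces the same ordering from the other side.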
3526 */ 3527 mutex_lock(&set_limit_mutex); 3528 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3529 if (memlimit > val) { 3530 ret = -EINVAL; 3531 mutex_unlock(&set_limit_mutex); 3532 break; 3533 } 3534 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3535 if (memswlimit < val) 3536 enlarge = 1; 3537 ret = res_counter_set_limit(&memcg->memsw, val); 3538 if (!ret) { 3539 if (memlimit == val) 3540 memcg->memsw_is_minimum = true; 3541 else 3542 memcg->memsw_is_minimum = false; 3543 } 3544 mutex_unlock(&set_limit_mutex); 3545 3546 if (!ret) 3547 break; 3548 3549 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3550 MEM_CGROUP_RECLAIM_NOSWAP | 3551 MEM_CGROUP_RECLAIM_SHRINK); 3552 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3553 /* Usage is reduced ? */ 3554 if (curusage >= oldusage) 3555 retry_count--; 3556 else 3557 oldusage = curusage; 3558 } 3559 if (!ret && enlarge) 3560 memcg_oom_recover(memcg); 3561 return ret; 3562 } 3563 3564 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3565 gfp_t gfp_mask, 3566 unsigned long *total_scanned) 3567 { 3568 unsigned long nr_reclaimed = 0; 3569 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3570 unsigned long reclaimed; 3571 int loop = 0; 3572 struct mem_cgroup_tree_per_zone *mctz; 3573 unsigned long long excess; 3574 unsigned long nr_scanned; 3575 3576 if (order > 0) 3577 return 0; 3578 3579 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3580 /* 3581 * This loop can run a while, specially if mem_cgroup's continuously 3582 * keep exceeding their soft limit and putting the system under 3583 * pressure 3584 */ 3585 do { 3586 if (next_mz) 3587 mz = next_mz; 3588 else 3589 mz = mem_cgroup_largest_soft_limit_node(mctz); 3590 if (!mz) 3591 break; 3592 3593 nr_scanned = 0; 3594 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 3595 gfp_mask, &nr_scanned); 3596 nr_reclaimed += reclaimed; 3597 *total_scanned += nr_scanned; 3598 spin_lock(&mctz->lock); 3599 3600 /* 3601 * If we failed to reclaim anything from this memory cgroup 3602 * it is time to move on to the next cgroup 3603 */ 3604 next_mz = NULL; 3605 if (!reclaimed) { 3606 do { 3607 /* 3608 * Loop until we find yet another one. 3609 * 3610 * By the time we get the soft_limit lock 3611 * again, someone might have aded the 3612 * group back on the RB tree. Iterate to 3613 * make sure we get a different mem. 3614 * mem_cgroup_largest_soft_limit_node returns 3615 * NULL if no other cgroup is present on 3616 * the tree 3617 */ 3618 next_mz = 3619 __mem_cgroup_largest_soft_limit_node(mctz); 3620 if (next_mz == mz) 3621 css_put(&next_mz->memcg->css); 3622 else /* next_mz == NULL or other memcg */ 3623 break; 3624 } while (1); 3625 } 3626 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 3627 excess = res_counter_soft_limit_excess(&mz->memcg->res); 3628 /* 3629 * One school of thought says that we should not add 3630 * back the node to the tree if reclaim returns 0. 3631 * But our reclaim could return 0, simply because due 3632 * to priority we are exposing a smaller subset of 3633 * memory to reclaim from. Consider this as a longer 3634 * term TODO. 3635 */ 3636 /* If excess == 0, no tree ops */ 3637 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); 3638 spin_unlock(&mctz->lock); 3639 css_put(&mz->memcg->css); 3640 loop++; 3641 /* 3642 * Could not reclaim anything and there are no more 3643 * mem cgroups to try or we seem to be looping without 3644 * reclaiming anything. 
3645 */ 3646 if (!nr_reclaimed && 3647 (next_mz == NULL || 3648 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3649 break; 3650 } while (!nr_reclaimed); 3651 if (next_mz) 3652 css_put(&next_mz->memcg->css); 3653 return nr_reclaimed; 3654 } 3655 3656 /* 3657 * This routine traverse page_cgroup in given list and drop them all. 3658 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3659 */ 3660 static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3661 int node, int zid, enum lru_list lru) 3662 { 3663 struct mem_cgroup_per_zone *mz; 3664 unsigned long flags, loop; 3665 struct list_head *list; 3666 struct page *busy; 3667 struct zone *zone; 3668 int ret = 0; 3669 3670 zone = &NODE_DATA(node)->node_zones[zid]; 3671 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3672 list = &mz->lruvec.lists[lru]; 3673 3674 loop = mz->lru_size[lru]; 3675 /* give some margin against EBUSY etc...*/ 3676 loop += 256; 3677 busy = NULL; 3678 while (loop--) { 3679 struct page_cgroup *pc; 3680 struct page *page; 3681 3682 ret = 0; 3683 spin_lock_irqsave(&zone->lru_lock, flags); 3684 if (list_empty(list)) { 3685 spin_unlock_irqrestore(&zone->lru_lock, flags); 3686 break; 3687 } 3688 page = list_entry(list->prev, struct page, lru); 3689 if (busy == page) { 3690 list_move(&page->lru, list); 3691 busy = NULL; 3692 spin_unlock_irqrestore(&zone->lru_lock, flags); 3693 continue; 3694 } 3695 spin_unlock_irqrestore(&zone->lru_lock, flags); 3696 3697 pc = lookup_page_cgroup(page); 3698 3699 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3700 if (ret == -ENOMEM || ret == -EINTR) 3701 break; 3702 3703 if (ret == -EBUSY || ret == -EINVAL) { 3704 /* found lock contention or "pc" is obsolete. */ 3705 busy = page; 3706 cond_resched(); 3707 } else 3708 busy = NULL; 3709 } 3710 3711 if (!ret && !list_empty(list)) 3712 return -EBUSY; 3713 return ret; 3714 } 3715 3716 /* 3717 * make mem_cgroup's charge to be 0 if there is no task. 3718 * This enables deleting this mem_cgroup. 3719 */ 3720 static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) 3721 { 3722 int ret; 3723 int node, zid, shrink; 3724 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3725 struct cgroup *cgrp = memcg->css.cgroup; 3726 3727 css_get(&memcg->css); 3728 3729 shrink = 0; 3730 /* should free all ? */ 3731 if (free_all) 3732 goto try_to_free; 3733 move_account: 3734 do { 3735 ret = -EBUSY; 3736 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3737 goto out; 3738 ret = -EINTR; 3739 if (signal_pending(current)) 3740 goto out; 3741 /* This is for making all *used* pages to be on LRU. */ 3742 lru_add_drain_all(); 3743 drain_all_stock_sync(memcg); 3744 ret = 0; 3745 mem_cgroup_start_move(memcg); 3746 for_each_node_state(node, N_HIGH_MEMORY) { 3747 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3748 enum lru_list lru; 3749 for_each_lru(lru) { 3750 ret = mem_cgroup_force_empty_list(memcg, 3751 node, zid, lru); 3752 if (ret) 3753 break; 3754 } 3755 } 3756 if (ret) 3757 break; 3758 } 3759 mem_cgroup_end_move(memcg); 3760 memcg_oom_recover(memcg); 3761 /* it seems parent cgroup doesn't have enough mem */ 3762 if (ret == -ENOMEM) 3763 goto try_to_free; 3764 cond_resched(); 3765 /* "ret" should also be checked to ensure all lists are empty. */ 3766 } while (memcg->res.usage > 0 || ret); 3767 out: 3768 css_put(&memcg->css); 3769 return ret; 3770 3771 try_to_free: 3772 /* returns EBUSY if there is a task or if we come here twice. 
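 *
 * The two phases of mem_cgroup_force_empty() (descriptive summary): the
 * "move_account" phase above walks every node/zone/LRU list and moves each
 * charged page to the parent via mem_cgroup_move_parent(); when that runs
 * out of parent memory (-ENOMEM), or when free_all was requested through
 * memory.force_empty, the "try_to_free" phase below reclaims pages with
 * try_to_free_mem_cgroup_pages() until usage drops to zero.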
*/ 3773 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3774 ret = -EBUSY; 3775 goto out; 3776 } 3777 /* we call try-to-free pages for make this cgroup empty */ 3778 lru_add_drain_all(); 3779 /* try to free all pages in this cgroup */ 3780 shrink = 1; 3781 while (nr_retries && memcg->res.usage > 0) { 3782 int progress; 3783 3784 if (signal_pending(current)) { 3785 ret = -EINTR; 3786 goto out; 3787 } 3788 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 3789 false); 3790 if (!progress) { 3791 nr_retries--; 3792 /* maybe some writeback is necessary */ 3793 congestion_wait(BLK_RW_ASYNC, HZ/10); 3794 } 3795 3796 } 3797 lru_add_drain(); 3798 /* try move_account...there may be some *locked* pages. */ 3799 goto move_account; 3800 } 3801 3802 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3803 { 3804 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3805 } 3806 3807 3808 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3809 { 3810 return mem_cgroup_from_cont(cont)->use_hierarchy; 3811 } 3812 3813 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3814 u64 val) 3815 { 3816 int retval = 0; 3817 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3818 struct cgroup *parent = cont->parent; 3819 struct mem_cgroup *parent_memcg = NULL; 3820 3821 if (parent) 3822 parent_memcg = mem_cgroup_from_cont(parent); 3823 3824 cgroup_lock(); 3825 /* 3826 * If parent's use_hierarchy is set, we can't make any modifications 3827 * in the child subtrees. If it is unset, then the change can 3828 * occur, provided the current cgroup has no children. 3829 * 3830 * For the root cgroup, parent_mem is NULL, we allow value to be 3831 * set if there are no children. 3832 */ 3833 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3834 (val == 1 || val == 0)) { 3835 if (list_empty(&cont->children)) 3836 memcg->use_hierarchy = val; 3837 else 3838 retval = -EBUSY; 3839 } else 3840 retval = -EINVAL; 3841 cgroup_unlock(); 3842 3843 return retval; 3844 } 3845 3846 3847 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 3848 enum mem_cgroup_stat_index idx) 3849 { 3850 struct mem_cgroup *iter; 3851 long val = 0; 3852 3853 /* Per-cpu values can be negative, use a signed accumulator */ 3854 for_each_mem_cgroup_tree(iter, memcg) 3855 val += mem_cgroup_read_stat(iter, idx); 3856 3857 if (val < 0) /* race ? 
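 *	(A transiently negative sum is possible because the per-cpu
 *	counters are updated without locking: the decrement from one cpu
 *	may be summed before the matching increment from another, so the
 *	total is clamped to 0 instead of being reported as negative.)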
*/ 3858 val = 0; 3859 return val; 3860 } 3861 3862 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3863 { 3864 u64 val; 3865 3866 if (!mem_cgroup_is_root(memcg)) { 3867 if (!swap) 3868 return res_counter_read_u64(&memcg->res, RES_USAGE); 3869 else 3870 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 3871 } 3872 3873 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 3874 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3875 3876 if (swap) 3877 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3878 3879 return val << PAGE_SHIFT; 3880 } 3881 3882 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3883 { 3884 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3885 u64 val; 3886 int type, name; 3887 3888 type = MEMFILE_TYPE(cft->private); 3889 name = MEMFILE_ATTR(cft->private); 3890 switch (type) { 3891 case _MEM: 3892 if (name == RES_USAGE) 3893 val = mem_cgroup_usage(memcg, false); 3894 else 3895 val = res_counter_read_u64(&memcg->res, name); 3896 break; 3897 case _MEMSWAP: 3898 if (name == RES_USAGE) 3899 val = mem_cgroup_usage(memcg, true); 3900 else 3901 val = res_counter_read_u64(&memcg->memsw, name); 3902 break; 3903 default: 3904 BUG(); 3905 } 3906 return val; 3907 } 3908 /* 3909 * The user of this function is... 3910 * RES_LIMIT. 3911 */ 3912 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3913 const char *buffer) 3914 { 3915 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3916 int type, name; 3917 unsigned long long val; 3918 int ret; 3919 3920 type = MEMFILE_TYPE(cft->private); 3921 name = MEMFILE_ATTR(cft->private); 3922 switch (name) { 3923 case RES_LIMIT: 3924 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3925 ret = -EINVAL; 3926 break; 3927 } 3928 /* This function does all necessary parse...reuse it */ 3929 ret = res_counter_memparse_write_strategy(buffer, &val); 3930 if (ret) 3931 break; 3932 if (type == _MEM) 3933 ret = mem_cgroup_resize_limit(memcg, val); 3934 else 3935 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3936 break; 3937 case RES_SOFT_LIMIT: 3938 ret = res_counter_memparse_write_strategy(buffer, &val); 3939 if (ret) 3940 break; 3941 /* 3942 * For memsw, soft limits are hard to implement in terms 3943 * of semantics, for now, we support soft limits for 3944 * control without swap 3945 */ 3946 if (type == _MEM) 3947 ret = res_counter_set_soft_limit(&memcg->res, val); 3948 else 3949 ret = -EINVAL; 3950 break; 3951 default: 3952 ret = -EINVAL; /* should be BUG() ? 
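 *
 * Illustrative usage of the limit files handled above, assuming the
 * controller is mounted at /sys/fs/cgroup/memory (the path is an
 * assumption, not part of this file):
 *
 *	echo 256M > /sys/fs/cgroup/memory/A/memory.limit_in_bytes
 *	echo -1   > /sys/fs/cgroup/memory/A/memory.limit_in_bytes
 *
 * res_counter_memparse_write_strategy() accepts K/M/G suffixes and
 * treats "-1" as "no limit".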
*/ 3953 break; 3954 } 3955 return ret; 3956 } 3957 3958 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3959 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3960 { 3961 struct cgroup *cgroup; 3962 unsigned long long min_limit, min_memsw_limit, tmp; 3963 3964 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3965 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3966 cgroup = memcg->css.cgroup; 3967 if (!memcg->use_hierarchy) 3968 goto out; 3969 3970 while (cgroup->parent) { 3971 cgroup = cgroup->parent; 3972 memcg = mem_cgroup_from_cont(cgroup); 3973 if (!memcg->use_hierarchy) 3974 break; 3975 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3976 min_limit = min(min_limit, tmp); 3977 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3978 min_memsw_limit = min(min_memsw_limit, tmp); 3979 } 3980 out: 3981 *mem_limit = min_limit; 3982 *memsw_limit = min_memsw_limit; 3983 } 3984 3985 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3986 { 3987 struct mem_cgroup *memcg; 3988 int type, name; 3989 3990 memcg = mem_cgroup_from_cont(cont); 3991 type = MEMFILE_TYPE(event); 3992 name = MEMFILE_ATTR(event); 3993 switch (name) { 3994 case RES_MAX_USAGE: 3995 if (type == _MEM) 3996 res_counter_reset_max(&memcg->res); 3997 else 3998 res_counter_reset_max(&memcg->memsw); 3999 break; 4000 case RES_FAILCNT: 4001 if (type == _MEM) 4002 res_counter_reset_failcnt(&memcg->res); 4003 else 4004 res_counter_reset_failcnt(&memcg->memsw); 4005 break; 4006 } 4007 4008 return 0; 4009 } 4010 4011 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 4012 struct cftype *cft) 4013 { 4014 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 4015 } 4016 4017 #ifdef CONFIG_MMU 4018 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4019 struct cftype *cft, u64 val) 4020 { 4021 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4022 4023 if (val >= (1 << NR_MOVE_TYPE)) 4024 return -EINVAL; 4025 /* 4026 * We check this value several times in both in can_attach() and 4027 * attach(), so we need cgroup lock to prevent this value from being 4028 * inconsistent. 
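 *
 * "val" is a bitmask of move types (anything >= 1 << NR_MOVE_TYPE is
 * rejected above); with the two move types checked by move_anon() and
 * move_file() below, bit 0 selects anonymous pages and bit 1 selects
 * file pages.  Illustrative usage, assuming the controller is mounted
 * at /sys/fs/cgroup/memory (an assumption, not part of this file):
 *
 *	echo 3 > /sys/fs/cgroup/memory/A/memory.move_charge_at_immigrate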
4029 */ 4030 cgroup_lock(); 4031 memcg->move_charge_at_immigrate = val; 4032 cgroup_unlock(); 4033 4034 return 0; 4035 } 4036 #else 4037 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4038 struct cftype *cft, u64 val) 4039 { 4040 return -ENOSYS; 4041 } 4042 #endif 4043 4044 4045 /* For read statistics */ 4046 enum { 4047 MCS_CACHE, 4048 MCS_RSS, 4049 MCS_FILE_MAPPED, 4050 MCS_PGPGIN, 4051 MCS_PGPGOUT, 4052 MCS_SWAP, 4053 MCS_PGFAULT, 4054 MCS_PGMAJFAULT, 4055 MCS_INACTIVE_ANON, 4056 MCS_ACTIVE_ANON, 4057 MCS_INACTIVE_FILE, 4058 MCS_ACTIVE_FILE, 4059 MCS_UNEVICTABLE, 4060 NR_MCS_STAT, 4061 }; 4062 4063 struct mcs_total_stat { 4064 s64 stat[NR_MCS_STAT]; 4065 }; 4066 4067 struct { 4068 char *local_name; 4069 char *total_name; 4070 } memcg_stat_strings[NR_MCS_STAT] = { 4071 {"cache", "total_cache"}, 4072 {"rss", "total_rss"}, 4073 {"mapped_file", "total_mapped_file"}, 4074 {"pgpgin", "total_pgpgin"}, 4075 {"pgpgout", "total_pgpgout"}, 4076 {"swap", "total_swap"}, 4077 {"pgfault", "total_pgfault"}, 4078 {"pgmajfault", "total_pgmajfault"}, 4079 {"inactive_anon", "total_inactive_anon"}, 4080 {"active_anon", "total_active_anon"}, 4081 {"inactive_file", "total_inactive_file"}, 4082 {"active_file", "total_active_file"}, 4083 {"unevictable", "total_unevictable"} 4084 }; 4085 4086 4087 static void 4088 mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) 4089 { 4090 s64 val; 4091 4092 /* per cpu stat */ 4093 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); 4094 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4095 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); 4096 s->stat[MCS_RSS] += val * PAGE_SIZE; 4097 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 4098 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4099 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); 4100 s->stat[MCS_PGPGIN] += val; 4101 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); 4102 s->stat[MCS_PGPGOUT] += val; 4103 if (do_swap_account) { 4104 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 4105 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4106 } 4107 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); 4108 s->stat[MCS_PGFAULT] += val; 4109 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); 4110 s->stat[MCS_PGMAJFAULT] += val; 4111 4112 /* per zone stat */ 4113 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); 4114 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4115 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); 4116 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4117 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); 4118 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4119 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); 4120 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4121 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 4122 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4123 } 4124 4125 static void 4126 mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) 4127 { 4128 struct mem_cgroup *iter; 4129 4130 for_each_mem_cgroup_tree(iter, memcg) 4131 mem_cgroup_get_local_stat(iter, s); 4132 } 4133 4134 #ifdef CONFIG_NUMA 4135 static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4136 { 4137 int nid; 4138 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4139 unsigned long node_nr; 4140 struct cgroup *cont = m->private; 4141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4142 4143 total_nr = 
mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 4144 seq_printf(m, "total=%lu", total_nr); 4145 for_each_node_state(nid, N_HIGH_MEMORY) { 4146 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 4147 seq_printf(m, " N%d=%lu", nid, node_nr); 4148 } 4149 seq_putc(m, '\n'); 4150 4151 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 4152 seq_printf(m, "file=%lu", file_nr); 4153 for_each_node_state(nid, N_HIGH_MEMORY) { 4154 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4155 LRU_ALL_FILE); 4156 seq_printf(m, " N%d=%lu", nid, node_nr); 4157 } 4158 seq_putc(m, '\n'); 4159 4160 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 4161 seq_printf(m, "anon=%lu", anon_nr); 4162 for_each_node_state(nid, N_HIGH_MEMORY) { 4163 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4164 LRU_ALL_ANON); 4165 seq_printf(m, " N%d=%lu", nid, node_nr); 4166 } 4167 seq_putc(m, '\n'); 4168 4169 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 4170 seq_printf(m, "unevictable=%lu", unevictable_nr); 4171 for_each_node_state(nid, N_HIGH_MEMORY) { 4172 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4173 BIT(LRU_UNEVICTABLE)); 4174 seq_printf(m, " N%d=%lu", nid, node_nr); 4175 } 4176 seq_putc(m, '\n'); 4177 return 0; 4178 } 4179 #endif /* CONFIG_NUMA */ 4180 4181 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4182 struct cgroup_map_cb *cb) 4183 { 4184 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4185 struct mcs_total_stat mystat; 4186 int i; 4187 4188 memset(&mystat, 0, sizeof(mystat)); 4189 mem_cgroup_get_local_stat(memcg, &mystat); 4190 4191 4192 for (i = 0; i < NR_MCS_STAT; i++) { 4193 if (i == MCS_SWAP && !do_swap_account) 4194 continue; 4195 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4196 } 4197 4198 /* Hierarchical information */ 4199 { 4200 unsigned long long limit, memsw_limit; 4201 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4202 cb->fill(cb, "hierarchical_memory_limit", limit); 4203 if (do_swap_account) 4204 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4205 } 4206 4207 memset(&mystat, 0, sizeof(mystat)); 4208 mem_cgroup_get_total_stat(memcg, &mystat); 4209 for (i = 0; i < NR_MCS_STAT; i++) { 4210 if (i == MCS_SWAP && !do_swap_account) 4211 continue; 4212 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4213 } 4214 4215 #ifdef CONFIG_DEBUG_VM 4216 { 4217 int nid, zid; 4218 struct mem_cgroup_per_zone *mz; 4219 unsigned long recent_rotated[2] = {0, 0}; 4220 unsigned long recent_scanned[2] = {0, 0}; 4221 4222 for_each_online_node(nid) 4223 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4224 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 4225 4226 recent_rotated[0] += 4227 mz->reclaim_stat.recent_rotated[0]; 4228 recent_rotated[1] += 4229 mz->reclaim_stat.recent_rotated[1]; 4230 recent_scanned[0] += 4231 mz->reclaim_stat.recent_scanned[0]; 4232 recent_scanned[1] += 4233 mz->reclaim_stat.recent_scanned[1]; 4234 } 4235 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4236 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4237 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4238 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4239 } 4240 #endif 4241 4242 return 0; 4243 } 4244 4245 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 4246 { 4247 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4248 4249 return mem_cgroup_swappiness(memcg); 4250 } 4251 4252 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype 
*cft, 4253 u64 val) 4254 { 4255 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4256 struct mem_cgroup *parent; 4257 4258 if (val > 100) 4259 return -EINVAL; 4260 4261 if (cgrp->parent == NULL) 4262 return -EINVAL; 4263 4264 parent = mem_cgroup_from_cont(cgrp->parent); 4265 4266 cgroup_lock(); 4267 4268 /* If under hierarchy, only empty-root can set this value */ 4269 if ((parent->use_hierarchy) || 4270 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4271 cgroup_unlock(); 4272 return -EINVAL; 4273 } 4274 4275 memcg->swappiness = val; 4276 4277 cgroup_unlock(); 4278 4279 return 0; 4280 } 4281 4282 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4283 { 4284 struct mem_cgroup_threshold_ary *t; 4285 u64 usage; 4286 int i; 4287 4288 rcu_read_lock(); 4289 if (!swap) 4290 t = rcu_dereference(memcg->thresholds.primary); 4291 else 4292 t = rcu_dereference(memcg->memsw_thresholds.primary); 4293 4294 if (!t) 4295 goto unlock; 4296 4297 usage = mem_cgroup_usage(memcg, swap); 4298 4299 /* 4300 * current_threshold points to threshold just below usage. 4301 * If it's not true, a threshold was crossed after last 4302 * call of __mem_cgroup_threshold(). 4303 */ 4304 i = t->current_threshold; 4305 4306 /* 4307 * Iterate backward over array of thresholds starting from 4308 * current_threshold and check if a threshold is crossed. 4309 * If none of thresholds below usage is crossed, we read 4310 * only one element of the array here. 4311 */ 4312 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4313 eventfd_signal(t->entries[i].eventfd, 1); 4314 4315 /* i = current_threshold + 1 */ 4316 i++; 4317 4318 /* 4319 * Iterate forward over array of thresholds starting from 4320 * current_threshold+1 and check if a threshold is crossed. 4321 * If none of thresholds above usage is crossed, we read 4322 * only one element of the array here. 
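 *
 * Worked example (illustrative values): with entries[] = {4M, 8M, 16M}
 * and current_threshold == 1 from a previous usage of ~9M, a new usage
 * of 20M signals only the 16M eventfd in this forward pass and leaves
 * current_threshold at 2, while a drop to 3M signals the 8M and 4M
 * eventfds in the backward pass above and leaves current_threshold
 * at -1.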
4323 */ 4324 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4325 eventfd_signal(t->entries[i].eventfd, 1); 4326 4327 /* Update current_threshold */ 4328 t->current_threshold = i - 1; 4329 unlock: 4330 rcu_read_unlock(); 4331 } 4332 4333 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4334 { 4335 while (memcg) { 4336 __mem_cgroup_threshold(memcg, false); 4337 if (do_swap_account) 4338 __mem_cgroup_threshold(memcg, true); 4339 4340 memcg = parent_mem_cgroup(memcg); 4341 } 4342 } 4343 4344 static int compare_thresholds(const void *a, const void *b) 4345 { 4346 const struct mem_cgroup_threshold *_a = a; 4347 const struct mem_cgroup_threshold *_b = b; 4348 4349 return _a->threshold - _b->threshold; 4350 } 4351 4352 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4353 { 4354 struct mem_cgroup_eventfd_list *ev; 4355 4356 list_for_each_entry(ev, &memcg->oom_notify, list) 4357 eventfd_signal(ev->eventfd, 1); 4358 return 0; 4359 } 4360 4361 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4362 { 4363 struct mem_cgroup *iter; 4364 4365 for_each_mem_cgroup_tree(iter, memcg) 4366 mem_cgroup_oom_notify_cb(iter); 4367 } 4368 4369 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4370 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4371 { 4372 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4373 struct mem_cgroup_thresholds *thresholds; 4374 struct mem_cgroup_threshold_ary *new; 4375 int type = MEMFILE_TYPE(cft->private); 4376 u64 threshold, usage; 4377 int i, size, ret; 4378 4379 ret = res_counter_memparse_write_strategy(args, &threshold); 4380 if (ret) 4381 return ret; 4382 4383 mutex_lock(&memcg->thresholds_lock); 4384 4385 if (type == _MEM) 4386 thresholds = &memcg->thresholds; 4387 else if (type == _MEMSWAP) 4388 thresholds = &memcg->memsw_thresholds; 4389 else 4390 BUG(); 4391 4392 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4393 4394 /* Check if a threshold crossed before adding a new one */ 4395 if (thresholds->primary) 4396 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4397 4398 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4399 4400 /* Allocate memory for new array of thresholds */ 4401 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4402 GFP_KERNEL); 4403 if (!new) { 4404 ret = -ENOMEM; 4405 goto unlock; 4406 } 4407 new->size = size; 4408 4409 /* Copy thresholds (if any) to new array */ 4410 if (thresholds->primary) { 4411 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4412 sizeof(struct mem_cgroup_threshold)); 4413 } 4414 4415 /* Add new threshold */ 4416 new->entries[size - 1].eventfd = eventfd; 4417 new->entries[size - 1].threshold = threshold; 4418 4419 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4420 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4421 compare_thresholds, NULL); 4422 4423 /* Find current threshold */ 4424 new->current_threshold = -1; 4425 for (i = 0; i < size; i++) { 4426 if (new->entries[i].threshold < usage) { 4427 /* 4428 * new->current_threshold will not be used until 4429 * rcu_assign_pointer(), so it's safe to increment 4430 * it here. 
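 *
 * After this loop, current_threshold is the index of the largest
 * threshold strictly below the current usage, or -1 if usage is
 * below every registered threshold.  Userspace reaches this function
 * by writing "<event_fd> <fd of memory.usage_in_bytes> <threshold>"
 * to cgroup.event_control; see the illustrative example at the end
 * of this file.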
4431 */ 4432 ++new->current_threshold; 4433 } 4434 } 4435 4436 /* Free old spare buffer and save old primary buffer as spare */ 4437 kfree(thresholds->spare); 4438 thresholds->spare = thresholds->primary; 4439 4440 rcu_assign_pointer(thresholds->primary, new); 4441 4442 /* To be sure that nobody uses thresholds */ 4443 synchronize_rcu(); 4444 4445 unlock: 4446 mutex_unlock(&memcg->thresholds_lock); 4447 4448 return ret; 4449 } 4450 4451 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4452 struct cftype *cft, struct eventfd_ctx *eventfd) 4453 { 4454 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4455 struct mem_cgroup_thresholds *thresholds; 4456 struct mem_cgroup_threshold_ary *new; 4457 int type = MEMFILE_TYPE(cft->private); 4458 u64 usage; 4459 int i, j, size; 4460 4461 mutex_lock(&memcg->thresholds_lock); 4462 if (type == _MEM) 4463 thresholds = &memcg->thresholds; 4464 else if (type == _MEMSWAP) 4465 thresholds = &memcg->memsw_thresholds; 4466 else 4467 BUG(); 4468 4469 if (!thresholds->primary) 4470 goto unlock; 4471 4472 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4473 4474 /* Check if a threshold crossed before removing */ 4475 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4476 4477 /* Calculate new number of threshold */ 4478 size = 0; 4479 for (i = 0; i < thresholds->primary->size; i++) { 4480 if (thresholds->primary->entries[i].eventfd != eventfd) 4481 size++; 4482 } 4483 4484 new = thresholds->spare; 4485 4486 /* Set thresholds array to NULL if we don't have thresholds */ 4487 if (!size) { 4488 kfree(new); 4489 new = NULL; 4490 goto swap_buffers; 4491 } 4492 4493 new->size = size; 4494 4495 /* Copy thresholds and find current threshold */ 4496 new->current_threshold = -1; 4497 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4498 if (thresholds->primary->entries[i].eventfd == eventfd) 4499 continue; 4500 4501 new->entries[j] = thresholds->primary->entries[i]; 4502 if (new->entries[j].threshold < usage) { 4503 /* 4504 * new->current_threshold will not be used 4505 * until rcu_assign_pointer(), so it's safe to increment 4506 * it here. 4507 */ 4508 ++new->current_threshold; 4509 } 4510 j++; 4511 } 4512 4513 swap_buffers: 4514 /* Swap primary and spare array */ 4515 thresholds->spare = thresholds->primary; 4516 rcu_assign_pointer(thresholds->primary, new); 4517 4518 /* To be sure that nobody uses thresholds */ 4519 synchronize_rcu(); 4520 unlock: 4521 mutex_unlock(&memcg->thresholds_lock); 4522 } 4523 4524 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4525 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4526 { 4527 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4528 struct mem_cgroup_eventfd_list *event; 4529 int type = MEMFILE_TYPE(cft->private); 4530 4531 BUG_ON(type != _OOM_TYPE); 4532 event = kmalloc(sizeof(*event), GFP_KERNEL); 4533 if (!event) 4534 return -ENOMEM; 4535 4536 spin_lock(&memcg_oom_lock); 4537 4538 event->eventfd = eventfd; 4539 list_add(&event->list, &memcg->oom_notify); 4540 4541 /* already in OOM ? 
*/ 4542 if (atomic_read(&memcg->under_oom)) 4543 eventfd_signal(eventfd, 1); 4544 spin_unlock(&memcg_oom_lock); 4545 4546 return 0; 4547 } 4548 4549 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4550 struct cftype *cft, struct eventfd_ctx *eventfd) 4551 { 4552 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4553 struct mem_cgroup_eventfd_list *ev, *tmp; 4554 int type = MEMFILE_TYPE(cft->private); 4555 4556 BUG_ON(type != _OOM_TYPE); 4557 4558 spin_lock(&memcg_oom_lock); 4559 4560 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4561 if (ev->eventfd == eventfd) { 4562 list_del(&ev->list); 4563 kfree(ev); 4564 } 4565 } 4566 4567 spin_unlock(&memcg_oom_lock); 4568 } 4569 4570 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4571 struct cftype *cft, struct cgroup_map_cb *cb) 4572 { 4573 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4574 4575 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 4576 4577 if (atomic_read(&memcg->under_oom)) 4578 cb->fill(cb, "under_oom", 1); 4579 else 4580 cb->fill(cb, "under_oom", 0); 4581 return 0; 4582 } 4583 4584 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4585 struct cftype *cft, u64 val) 4586 { 4587 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4588 struct mem_cgroup *parent; 4589 4590 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4591 if (!cgrp->parent || !((val == 0) || (val == 1))) 4592 return -EINVAL; 4593 4594 parent = mem_cgroup_from_cont(cgrp->parent); 4595 4596 cgroup_lock(); 4597 /* oom-kill-disable is a flag for subhierarchy. */ 4598 if ((parent->use_hierarchy) || 4599 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4600 cgroup_unlock(); 4601 return -EINVAL; 4602 } 4603 memcg->oom_kill_disable = val; 4604 if (!val) 4605 memcg_oom_recover(memcg); 4606 cgroup_unlock(); 4607 return 0; 4608 } 4609 4610 #ifdef CONFIG_NUMA 4611 static const struct file_operations mem_control_numa_stat_file_operations = { 4612 .read = seq_read, 4613 .llseek = seq_lseek, 4614 .release = single_release, 4615 }; 4616 4617 static int mem_control_numa_stat_open(struct inode *unused, struct file *file) 4618 { 4619 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4620 4621 file->f_op = &mem_control_numa_stat_file_operations; 4622 return single_open(file, mem_control_numa_stat_show, cont); 4623 } 4624 #endif /* CONFIG_NUMA */ 4625 4626 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4627 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4628 { 4629 /* 4630 * Part of this would be better living in a separate allocation 4631 * function, leaving us with just the cgroup tree population work. 4632 * We, however, depend on state such as network's proto_list that 4633 * is only initialized after cgroup creation. 
I found the less 4634 * cumbersome way to deal with it to defer it all to populate time 4635 */ 4636 return mem_cgroup_sockets_init(cont, ss); 4637 }; 4638 4639 static void kmem_cgroup_destroy(struct cgroup *cont) 4640 { 4641 mem_cgroup_sockets_destroy(cont); 4642 } 4643 #else 4644 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4645 { 4646 return 0; 4647 } 4648 4649 static void kmem_cgroup_destroy(struct cgroup *cont) 4650 { 4651 } 4652 #endif 4653 4654 static struct cftype mem_cgroup_files[] = { 4655 { 4656 .name = "usage_in_bytes", 4657 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4658 .read_u64 = mem_cgroup_read, 4659 .register_event = mem_cgroup_usage_register_event, 4660 .unregister_event = mem_cgroup_usage_unregister_event, 4661 }, 4662 { 4663 .name = "max_usage_in_bytes", 4664 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4665 .trigger = mem_cgroup_reset, 4666 .read_u64 = mem_cgroup_read, 4667 }, 4668 { 4669 .name = "limit_in_bytes", 4670 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4671 .write_string = mem_cgroup_write, 4672 .read_u64 = mem_cgroup_read, 4673 }, 4674 { 4675 .name = "soft_limit_in_bytes", 4676 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4677 .write_string = mem_cgroup_write, 4678 .read_u64 = mem_cgroup_read, 4679 }, 4680 { 4681 .name = "failcnt", 4682 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4683 .trigger = mem_cgroup_reset, 4684 .read_u64 = mem_cgroup_read, 4685 }, 4686 { 4687 .name = "stat", 4688 .read_map = mem_control_stat_show, 4689 }, 4690 { 4691 .name = "force_empty", 4692 .trigger = mem_cgroup_force_empty_write, 4693 }, 4694 { 4695 .name = "use_hierarchy", 4696 .write_u64 = mem_cgroup_hierarchy_write, 4697 .read_u64 = mem_cgroup_hierarchy_read, 4698 }, 4699 { 4700 .name = "swappiness", 4701 .read_u64 = mem_cgroup_swappiness_read, 4702 .write_u64 = mem_cgroup_swappiness_write, 4703 }, 4704 { 4705 .name = "move_charge_at_immigrate", 4706 .read_u64 = mem_cgroup_move_charge_read, 4707 .write_u64 = mem_cgroup_move_charge_write, 4708 }, 4709 { 4710 .name = "oom_control", 4711 .read_map = mem_cgroup_oom_control_read, 4712 .write_u64 = mem_cgroup_oom_control_write, 4713 .register_event = mem_cgroup_oom_register_event, 4714 .unregister_event = mem_cgroup_oom_unregister_event, 4715 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4716 }, 4717 #ifdef CONFIG_NUMA 4718 { 4719 .name = "numa_stat", 4720 .open = mem_control_numa_stat_open, 4721 .mode = S_IRUGO, 4722 }, 4723 #endif 4724 }; 4725 4726 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4727 static struct cftype memsw_cgroup_files[] = { 4728 { 4729 .name = "memsw.usage_in_bytes", 4730 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4731 .read_u64 = mem_cgroup_read, 4732 .register_event = mem_cgroup_usage_register_event, 4733 .unregister_event = mem_cgroup_usage_unregister_event, 4734 }, 4735 { 4736 .name = "memsw.max_usage_in_bytes", 4737 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4738 .trigger = mem_cgroup_reset, 4739 .read_u64 = mem_cgroup_read, 4740 }, 4741 { 4742 .name = "memsw.limit_in_bytes", 4743 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4744 .write_string = mem_cgroup_write, 4745 .read_u64 = mem_cgroup_read, 4746 }, 4747 { 4748 .name = "memsw.failcnt", 4749 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4750 .trigger = mem_cgroup_reset, 4751 .read_u64 = mem_cgroup_read, 4752 }, 4753 }; 4754 4755 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4756 { 4757 if (!do_swap_account) 4758 return 0; 4759 return 
cgroup_add_files(cont, ss, memsw_cgroup_files, 4760 ARRAY_SIZE(memsw_cgroup_files)); 4761 }; 4762 #else 4763 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4764 { 4765 return 0; 4766 } 4767 #endif 4768 4769 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4770 { 4771 struct mem_cgroup_per_node *pn; 4772 struct mem_cgroup_per_zone *mz; 4773 enum lru_list lru; 4774 int zone, tmp = node; 4775 /* 4776 * This routine is called against possible nodes. 4777 * But it's BUG to call kmalloc() against offline node. 4778 * 4779 * TODO: this routine can waste much memory for nodes which will 4780 * never be onlined. It's better to use memory hotplug callback 4781 * function. 4782 */ 4783 if (!node_state(node, N_NORMAL_MEMORY)) 4784 tmp = -1; 4785 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4786 if (!pn) 4787 return 1; 4788 4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4790 mz = &pn->zoneinfo[zone]; 4791 for_each_lru(lru) 4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]); 4793 mz->usage_in_excess = 0; 4794 mz->on_tree = false; 4795 mz->memcg = memcg; 4796 } 4797 memcg->info.nodeinfo[node] = pn; 4798 return 0; 4799 } 4800 4801 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4802 { 4803 kfree(memcg->info.nodeinfo[node]); 4804 } 4805 4806 static struct mem_cgroup *mem_cgroup_alloc(void) 4807 { 4808 struct mem_cgroup *memcg; 4809 int size = sizeof(struct mem_cgroup); 4810 4811 /* Can be very big if MAX_NUMNODES is very big */ 4812 if (size < PAGE_SIZE) 4813 memcg = kzalloc(size, GFP_KERNEL); 4814 else 4815 memcg = vzalloc(size); 4816 4817 if (!memcg) 4818 return NULL; 4819 4820 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4821 if (!memcg->stat) 4822 goto out_free; 4823 spin_lock_init(&memcg->pcp_counter_lock); 4824 return memcg; 4825 4826 out_free: 4827 if (size < PAGE_SIZE) 4828 kfree(memcg); 4829 else 4830 vfree(memcg); 4831 return NULL; 4832 } 4833 4834 /* 4835 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, 4836 * but in process context. The work_freeing structure is overlaid 4837 * on the rcu_freeing structure, which itself is overlaid on memsw. 4838 */ 4839 static void vfree_work(struct work_struct *work) 4840 { 4841 struct mem_cgroup *memcg; 4842 4843 memcg = container_of(work, struct mem_cgroup, work_freeing); 4844 vfree(memcg); 4845 } 4846 static void vfree_rcu(struct rcu_head *rcu_head) 4847 { 4848 struct mem_cgroup *memcg; 4849 4850 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); 4851 INIT_WORK(&memcg->work_freeing, vfree_work); 4852 schedule_work(&memcg->work_freeing); 4853 } 4854 4855 /* 4856 * At destroying mem_cgroup, references from swap_cgroup can remain. 4857 * (scanning all at force_empty is too costly...) 4858 * 4859 * Instead of clearing all references at force_empty, we remember 4860 * the number of reference from swap_cgroup and free mem_cgroup when 4861 * it goes down to 0. 4862 * 4863 * Removal of cgroup itself succeeds regardless of refs from swap. 
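 *
 * Concretely: every record left in swap_cgroup pins this mem_cgroup
 * through mem_cgroup_get(); the matching mem_cgroup_put() happens when
 * the swap entry is finally uncharged, and only when the refcnt in
 * __mem_cgroup_put() reaches zero does __mem_cgroup_free() below
 * actually release the structure.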
4864 */ 4865 4866 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4867 { 4868 int node; 4869 4870 mem_cgroup_remove_from_trees(memcg); 4871 free_css_id(&mem_cgroup_subsys, &memcg->css); 4872 4873 for_each_node(node) 4874 free_mem_cgroup_per_zone_info(memcg, node); 4875 4876 free_percpu(memcg->stat); 4877 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4878 kfree_rcu(memcg, rcu_freeing); 4879 else 4880 call_rcu(&memcg->rcu_freeing, vfree_rcu); 4881 } 4882 4883 static void mem_cgroup_get(struct mem_cgroup *memcg) 4884 { 4885 atomic_inc(&memcg->refcnt); 4886 } 4887 4888 static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) 4889 { 4890 if (atomic_sub_and_test(count, &memcg->refcnt)) { 4891 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 4892 __mem_cgroup_free(memcg); 4893 if (parent) 4894 mem_cgroup_put(parent); 4895 } 4896 } 4897 4898 static void mem_cgroup_put(struct mem_cgroup *memcg) 4899 { 4900 __mem_cgroup_put(memcg, 1); 4901 } 4902 4903 /* 4904 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4905 */ 4906 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4907 { 4908 if (!memcg->res.parent) 4909 return NULL; 4910 return mem_cgroup_from_res_counter(memcg->res.parent, res); 4911 } 4912 EXPORT_SYMBOL(parent_mem_cgroup); 4913 4914 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4915 static void __init enable_swap_cgroup(void) 4916 { 4917 if (!mem_cgroup_disabled() && really_do_swap_account) 4918 do_swap_account = 1; 4919 } 4920 #else 4921 static void __init enable_swap_cgroup(void) 4922 { 4923 } 4924 #endif 4925 4926 static int mem_cgroup_soft_limit_tree_init(void) 4927 { 4928 struct mem_cgroup_tree_per_node *rtpn; 4929 struct mem_cgroup_tree_per_zone *rtpz; 4930 int tmp, node, zone; 4931 4932 for_each_node(node) { 4933 tmp = node; 4934 if (!node_state(node, N_NORMAL_MEMORY)) 4935 tmp = -1; 4936 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4937 if (!rtpn) 4938 goto err_cleanup; 4939 4940 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4941 4942 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4943 rtpz = &rtpn->rb_tree_per_zone[zone]; 4944 rtpz->rb_root = RB_ROOT; 4945 spin_lock_init(&rtpz->lock); 4946 } 4947 } 4948 return 0; 4949 4950 err_cleanup: 4951 for_each_node(node) { 4952 if (!soft_limit_tree.rb_tree_per_node[node]) 4953 break; 4954 kfree(soft_limit_tree.rb_tree_per_node[node]); 4955 soft_limit_tree.rb_tree_per_node[node] = NULL; 4956 } 4957 return 1; 4958 4959 } 4960 4961 static struct cgroup_subsys_state * __ref 4962 mem_cgroup_create(struct cgroup *cont) 4963 { 4964 struct mem_cgroup *memcg, *parent; 4965 long error = -ENOMEM; 4966 int node; 4967 4968 memcg = mem_cgroup_alloc(); 4969 if (!memcg) 4970 return ERR_PTR(error); 4971 4972 for_each_node(node) 4973 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4974 goto free_out; 4975 4976 /* root ? 
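 *
 * The root group is created exactly once, at subsystem init, so the
 * global pieces are set up here: swap accounting (subject to the boot
 * option), the soft limit trees, the per-cpu charge stocks and the
 * CPU hotplug callback.  Children instead inherit use_hierarchy,
 * oom_kill_disable and (further down) swappiness from their parent.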
*/ 4977 if (cont->parent == NULL) { 4978 int cpu; 4979 enable_swap_cgroup(); 4980 parent = NULL; 4981 if (mem_cgroup_soft_limit_tree_init()) 4982 goto free_out; 4983 root_mem_cgroup = memcg; 4984 for_each_possible_cpu(cpu) { 4985 struct memcg_stock_pcp *stock = 4986 &per_cpu(memcg_stock, cpu); 4987 INIT_WORK(&stock->work, drain_local_stock); 4988 } 4989 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4990 } else { 4991 parent = mem_cgroup_from_cont(cont->parent); 4992 memcg->use_hierarchy = parent->use_hierarchy; 4993 memcg->oom_kill_disable = parent->oom_kill_disable; 4994 } 4995 4996 if (parent && parent->use_hierarchy) { 4997 res_counter_init(&memcg->res, &parent->res); 4998 res_counter_init(&memcg->memsw, &parent->memsw); 4999 /* 5000 * We increment refcnt of the parent to ensure that we can 5001 * safely access it on res_counter_charge/uncharge. 5002 * This refcnt will be decremented when freeing this 5003 * mem_cgroup(see mem_cgroup_put). 5004 */ 5005 mem_cgroup_get(parent); 5006 } else { 5007 res_counter_init(&memcg->res, NULL); 5008 res_counter_init(&memcg->memsw, NULL); 5009 } 5010 memcg->last_scanned_node = MAX_NUMNODES; 5011 INIT_LIST_HEAD(&memcg->oom_notify); 5012 5013 if (parent) 5014 memcg->swappiness = mem_cgroup_swappiness(parent); 5015 atomic_set(&memcg->refcnt, 1); 5016 memcg->move_charge_at_immigrate = 0; 5017 mutex_init(&memcg->thresholds_lock); 5018 spin_lock_init(&memcg->move_lock); 5019 return &memcg->css; 5020 free_out: 5021 __mem_cgroup_free(memcg); 5022 return ERR_PTR(error); 5023 } 5024 5025 static int mem_cgroup_pre_destroy(struct cgroup *cont) 5026 { 5027 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5028 5029 return mem_cgroup_force_empty(memcg, false); 5030 } 5031 5032 static void mem_cgroup_destroy(struct cgroup *cont) 5033 { 5034 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5035 5036 kmem_cgroup_destroy(cont); 5037 5038 mem_cgroup_put(memcg); 5039 } 5040 5041 static int mem_cgroup_populate(struct cgroup_subsys *ss, 5042 struct cgroup *cont) 5043 { 5044 int ret; 5045 5046 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 5047 ARRAY_SIZE(mem_cgroup_files)); 5048 5049 if (!ret) 5050 ret = register_memsw_files(cont, ss); 5051 5052 if (!ret) 5053 ret = register_kmem_files(cont, ss); 5054 5055 return ret; 5056 } 5057 5058 #ifdef CONFIG_MMU 5059 /* Handlers for move charge at task migration. */ 5060 #define PRECHARGE_COUNT_AT_ONCE 256 5061 static int mem_cgroup_do_precharge(unsigned long count) 5062 { 5063 int ret = 0; 5064 int batch_count = PRECHARGE_COUNT_AT_ONCE; 5065 struct mem_cgroup *memcg = mc.to; 5066 5067 if (mem_cgroup_is_root(memcg)) { 5068 mc.precharge += count; 5069 /* we don't need css_get for root */ 5070 return ret; 5071 } 5072 /* try to charge at once */ 5073 if (count > 1) { 5074 struct res_counter *dummy; 5075 /* 5076 * "memcg" cannot be under rmdir() because we've already checked 5077 * by cgroup_lock_live_cgroup() that it is not removed and we 5078 * are still under the same cgroup_mutex. So we can postpone 5079 * css_get(). 
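 *
 * The batch path below charges all "count" pages against res (and
 * memsw when swap accounting is on) in a single res_counter_charge();
 * only if that fails do we drop to the one_by_one loop, which charges
 * a page at a time through __mem_cgroup_try_charge() and reschedules
 * every PRECHARGE_COUNT_AT_ONCE pages.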
5080 */ 5081 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) 5082 goto one_by_one; 5083 if (do_swap_account && res_counter_charge(&memcg->memsw, 5084 PAGE_SIZE * count, &dummy)) { 5085 res_counter_uncharge(&memcg->res, PAGE_SIZE * count); 5086 goto one_by_one; 5087 } 5088 mc.precharge += count; 5089 return ret; 5090 } 5091 one_by_one: 5092 /* fall back to one by one charge */ 5093 while (count--) { 5094 if (signal_pending(current)) { 5095 ret = -EINTR; 5096 break; 5097 } 5098 if (!batch_count--) { 5099 batch_count = PRECHARGE_COUNT_AT_ONCE; 5100 cond_resched(); 5101 } 5102 ret = __mem_cgroup_try_charge(NULL, 5103 GFP_KERNEL, 1, &memcg, false); 5104 if (ret) 5105 /* mem_cgroup_clear_mc() will do uncharge later */ 5106 return ret; 5107 mc.precharge++; 5108 } 5109 return ret; 5110 } 5111 5112 /** 5113 * get_mctgt_type - get target type of moving charge 5114 * @vma: the vma the pte to be checked belongs 5115 * @addr: the address corresponding to the pte to be checked 5116 * @ptent: the pte to be checked 5117 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5118 * 5119 * Returns 5120 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5121 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5122 * move charge. if @target is not NULL, the page is stored in target->page 5123 * with extra refcnt got(Callers should handle it). 5124 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5125 * target for charge migration. if @target is not NULL, the entry is stored 5126 * in target->ent. 5127 * 5128 * Called with pte lock held. 5129 */ 5130 union mc_target { 5131 struct page *page; 5132 swp_entry_t ent; 5133 }; 5134 5135 enum mc_target_type { 5136 MC_TARGET_NONE = 0, 5137 MC_TARGET_PAGE, 5138 MC_TARGET_SWAP, 5139 }; 5140 5141 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5142 unsigned long addr, pte_t ptent) 5143 { 5144 struct page *page = vm_normal_page(vma, addr, ptent); 5145 5146 if (!page || !page_mapped(page)) 5147 return NULL; 5148 if (PageAnon(page)) { 5149 /* we don't move shared anon */ 5150 if (!move_anon() || page_mapcount(page) > 2) 5151 return NULL; 5152 } else if (!move_file()) 5153 /* we ignore mapcount for file pages */ 5154 return NULL; 5155 if (!get_page_unless_zero(page)) 5156 return NULL; 5157 5158 return page; 5159 } 5160 5161 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5162 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5163 { 5164 int usage_count; 5165 struct page *page = NULL; 5166 swp_entry_t ent = pte_to_swp_entry(ptent); 5167 5168 if (!move_anon() || non_swap_entry(ent)) 5169 return NULL; 5170 usage_count = mem_cgroup_count_swap_user(ent, &page); 5171 if (usage_count > 1) { /* we don't move shared anon */ 5172 if (page) 5173 put_page(page); 5174 return NULL; 5175 } 5176 if (do_swap_account) 5177 entry->val = ent.val; 5178 5179 return page; 5180 } 5181 5182 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5183 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5184 { 5185 struct page *page = NULL; 5186 struct inode *inode; 5187 struct address_space *mapping; 5188 pgoff_t pgoff; 5189 5190 if (!vma->vm_file) /* anonymous vma */ 5191 return NULL; 5192 if (!move_file()) 5193 return NULL; 5194 5195 inode = vma->vm_file->f_path.dentry->d_inode; 5196 mapping = vma->vm_file->f_mapping; 5197 if (pte_none(ptent)) 5198 pgoff = linear_page_index(vma, addr); 5199 else /* pte_file(ptent) is true */ 5200 pgoff = 
pte_to_pgoff(ptent); 5201 5202 /* page is moved even if it's not RSS of this task(page-faulted). */ 5203 page = find_get_page(mapping, pgoff); 5204 5205 #ifdef CONFIG_SWAP 5206 /* shmem/tmpfs may report page out on swap: account for that too. */ 5207 if (radix_tree_exceptional_entry(page)) { 5208 swp_entry_t swap = radix_to_swp_entry(page); 5209 if (do_swap_account) 5210 *entry = swap; 5211 page = find_get_page(&swapper_space, swap.val); 5212 } 5213 #endif 5214 return page; 5215 } 5216 5217 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5218 unsigned long addr, pte_t ptent, union mc_target *target) 5219 { 5220 struct page *page = NULL; 5221 struct page_cgroup *pc; 5222 enum mc_target_type ret = MC_TARGET_NONE; 5223 swp_entry_t ent = { .val = 0 }; 5224 5225 if (pte_present(ptent)) 5226 page = mc_handle_present_pte(vma, addr, ptent); 5227 else if (is_swap_pte(ptent)) 5228 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5229 else if (pte_none(ptent) || pte_file(ptent)) 5230 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5231 5232 if (!page && !ent.val) 5233 return ret; 5234 if (page) { 5235 pc = lookup_page_cgroup(page); 5236 /* 5237 * Do only loose check w/o page_cgroup lock. 5238 * mem_cgroup_move_account() checks the pc is valid or not under 5239 * the lock. 5240 */ 5241 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5242 ret = MC_TARGET_PAGE; 5243 if (target) 5244 target->page = page; 5245 } 5246 if (!ret || !target) 5247 put_page(page); 5248 } 5249 /* There is a swap entry and a page doesn't exist or isn't charged */ 5250 if (ent.val && !ret && 5251 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { 5252 ret = MC_TARGET_SWAP; 5253 if (target) 5254 target->ent = ent; 5255 } 5256 return ret; 5257 } 5258 5259 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5260 /* 5261 * We don't consider swapping or file mapped pages because THP does not 5262 * support them for now. 5263 * Caller should make sure that pmd_trans_huge(pmd) is true. 
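 *
 * Only anonymous THP head pages charged to mc.from are reported as
 * MC_TARGET_PAGE, and when a target is passed in the page comes back
 * with an extra reference (get_page()) that the caller must drop with
 * put_page().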
5264 */ 5265 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5266 unsigned long addr, pmd_t pmd, union mc_target *target) 5267 { 5268 struct page *page = NULL; 5269 struct page_cgroup *pc; 5270 enum mc_target_type ret = MC_TARGET_NONE; 5271 5272 page = pmd_page(pmd); 5273 VM_BUG_ON(!page || !PageHead(page)); 5274 if (!move_anon()) 5275 return ret; 5276 pc = lookup_page_cgroup(page); 5277 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5278 ret = MC_TARGET_PAGE; 5279 if (target) { 5280 get_page(page); 5281 target->page = page; 5282 } 5283 } 5284 return ret; 5285 } 5286 #else 5287 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5288 unsigned long addr, pmd_t pmd, union mc_target *target) 5289 { 5290 return MC_TARGET_NONE; 5291 } 5292 #endif 5293 5294 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5295 unsigned long addr, unsigned long end, 5296 struct mm_walk *walk) 5297 { 5298 struct vm_area_struct *vma = walk->private; 5299 pte_t *pte; 5300 spinlock_t *ptl; 5301 5302 if (pmd_trans_huge_lock(pmd, vma) == 1) { 5303 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5304 mc.precharge += HPAGE_PMD_NR; 5305 spin_unlock(&vma->vm_mm->page_table_lock); 5306 return 0; 5307 } 5308 5309 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5310 for (; addr != end; pte++, addr += PAGE_SIZE) 5311 if (get_mctgt_type(vma, addr, *pte, NULL)) 5312 mc.precharge++; /* increment precharge temporarily */ 5313 pte_unmap_unlock(pte - 1, ptl); 5314 cond_resched(); 5315 5316 return 0; 5317 } 5318 5319 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5320 { 5321 unsigned long precharge; 5322 struct vm_area_struct *vma; 5323 5324 down_read(&mm->mmap_sem); 5325 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5326 struct mm_walk mem_cgroup_count_precharge_walk = { 5327 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5328 .mm = mm, 5329 .private = vma, 5330 }; 5331 if (is_vm_hugetlb_page(vma)) 5332 continue; 5333 walk_page_range(vma->vm_start, vma->vm_end, 5334 &mem_cgroup_count_precharge_walk); 5335 } 5336 up_read(&mm->mmap_sem); 5337 5338 precharge = mc.precharge; 5339 mc.precharge = 0; 5340 5341 return precharge; 5342 } 5343 5344 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5345 { 5346 unsigned long precharge = mem_cgroup_count_precharge(mm); 5347 5348 VM_BUG_ON(mc.moving_task); 5349 mc.moving_task = current; 5350 return mem_cgroup_do_precharge(precharge); 5351 } 5352 5353 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5354 static void __mem_cgroup_clear_mc(void) 5355 { 5356 struct mem_cgroup *from = mc.from; 5357 struct mem_cgroup *to = mc.to; 5358 5359 /* we must uncharge all the leftover precharges from mc.to */ 5360 if (mc.precharge) { 5361 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 5362 mc.precharge = 0; 5363 } 5364 /* 5365 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5366 * we must uncharge here. 
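 *
 * mc.moved_charge counts pages whose page_cgroup was switched over to
 * mc.to while their charge was left on mc.from, so one bulk
 * __mem_cgroup_cancel_charge() below releases them all at once, e.g.
 * 512 moved pages become a single uncharge of 512 pages here.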
5367 */ 5368 if (mc.moved_charge) { 5369 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 5370 mc.moved_charge = 0; 5371 } 5372 /* we must fixup refcnts and charges */ 5373 if (mc.moved_swap) { 5374 /* uncharge swap account from the old cgroup */ 5375 if (!mem_cgroup_is_root(mc.from)) 5376 res_counter_uncharge(&mc.from->memsw, 5377 PAGE_SIZE * mc.moved_swap); 5378 __mem_cgroup_put(mc.from, mc.moved_swap); 5379 5380 if (!mem_cgroup_is_root(mc.to)) { 5381 /* 5382 * we charged both to->res and to->memsw, so we should 5383 * uncharge to->res. 5384 */ 5385 res_counter_uncharge(&mc.to->res, 5386 PAGE_SIZE * mc.moved_swap); 5387 } 5388 /* we've already done mem_cgroup_get(mc.to) */ 5389 mc.moved_swap = 0; 5390 } 5391 memcg_oom_recover(from); 5392 memcg_oom_recover(to); 5393 wake_up_all(&mc.waitq); 5394 } 5395 5396 static void mem_cgroup_clear_mc(void) 5397 { 5398 struct mem_cgroup *from = mc.from; 5399 5400 /* 5401 * we must clear moving_task before waking up waiters at the end of 5402 * task migration. 5403 */ 5404 mc.moving_task = NULL; 5405 __mem_cgroup_clear_mc(); 5406 spin_lock(&mc.lock); 5407 mc.from = NULL; 5408 mc.to = NULL; 5409 spin_unlock(&mc.lock); 5410 mem_cgroup_end_move(from); 5411 } 5412 5413 static int mem_cgroup_can_attach(struct cgroup *cgroup, 5414 struct cgroup_taskset *tset) 5415 { 5416 struct task_struct *p = cgroup_taskset_first(tset); 5417 int ret = 0; 5418 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 5419 5420 if (memcg->move_charge_at_immigrate) { 5421 struct mm_struct *mm; 5422 struct mem_cgroup *from = mem_cgroup_from_task(p); 5423 5424 VM_BUG_ON(from == memcg); 5425 5426 mm = get_task_mm(p); 5427 if (!mm) 5428 return 0; 5429 /* We move charges only when we move a owner of the mm */ 5430 if (mm->owner == p) { 5431 VM_BUG_ON(mc.from); 5432 VM_BUG_ON(mc.to); 5433 VM_BUG_ON(mc.precharge); 5434 VM_BUG_ON(mc.moved_charge); 5435 VM_BUG_ON(mc.moved_swap); 5436 mem_cgroup_start_move(from); 5437 spin_lock(&mc.lock); 5438 mc.from = from; 5439 mc.to = memcg; 5440 spin_unlock(&mc.lock); 5441 /* We set mc.moving_task later */ 5442 5443 ret = mem_cgroup_precharge_mc(mm); 5444 if (ret) 5445 mem_cgroup_clear_mc(); 5446 } 5447 mmput(mm); 5448 } 5449 return ret; 5450 } 5451 5452 static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 5453 struct cgroup_taskset *tset) 5454 { 5455 mem_cgroup_clear_mc(); 5456 } 5457 5458 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5459 unsigned long addr, unsigned long end, 5460 struct mm_walk *walk) 5461 { 5462 int ret = 0; 5463 struct vm_area_struct *vma = walk->private; 5464 pte_t *pte; 5465 spinlock_t *ptl; 5466 enum mc_target_type target_type; 5467 union mc_target target; 5468 struct page *page; 5469 struct page_cgroup *pc; 5470 5471 /* 5472 * We don't take compound_lock() here but no race with splitting thp 5473 * happens because: 5474 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 5475 * under splitting, which means there's no concurrent thp split, 5476 * - if another thread runs into split_huge_page() just after we 5477 * entered this if-block, the thread must wait for page table lock 5478 * to be unlocked in __split_huge_page_splitting(), where the main 5479 * part of thp split is not executed yet. 
5480 */ 5481 if (pmd_trans_huge_lock(pmd, vma) == 1) { 5482 if (!mc.precharge) { 5483 spin_unlock(&vma->vm_mm->page_table_lock); 5484 return 0; 5485 } 5486 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5487 if (target_type == MC_TARGET_PAGE) { 5488 page = target.page; 5489 if (!isolate_lru_page(page)) { 5490 pc = lookup_page_cgroup(page); 5491 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5492 pc, mc.from, mc.to, 5493 false)) { 5494 mc.precharge -= HPAGE_PMD_NR; 5495 mc.moved_charge += HPAGE_PMD_NR; 5496 } 5497 putback_lru_page(page); 5498 } 5499 put_page(page); 5500 } 5501 spin_unlock(&vma->vm_mm->page_table_lock); 5502 return 0; 5503 } 5504 5505 retry: 5506 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5507 for (; addr != end; addr += PAGE_SIZE) { 5508 pte_t ptent = *(pte++); 5509 swp_entry_t ent; 5510 5511 if (!mc.precharge) 5512 break; 5513 5514 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5515 case MC_TARGET_PAGE: 5516 page = target.page; 5517 if (isolate_lru_page(page)) 5518 goto put; 5519 pc = lookup_page_cgroup(page); 5520 if (!mem_cgroup_move_account(page, 1, pc, 5521 mc.from, mc.to, false)) { 5522 mc.precharge--; 5523 /* we uncharge from mc.from later. */ 5524 mc.moved_charge++; 5525 } 5526 putback_lru_page(page); 5527 put: /* get_mctgt_type() gets the page */ 5528 put_page(page); 5529 break; 5530 case MC_TARGET_SWAP: 5531 ent = target.ent; 5532 if (!mem_cgroup_move_swap_account(ent, 5533 mc.from, mc.to, false)) { 5534 mc.precharge--; 5535 /* we fixup refcnts and charges later. */ 5536 mc.moved_swap++; 5537 } 5538 break; 5539 default: 5540 break; 5541 } 5542 } 5543 pte_unmap_unlock(pte - 1, ptl); 5544 cond_resched(); 5545 5546 if (addr != end) { 5547 /* 5548 * We have consumed all precharges we got in can_attach(). 5549 * We try charge one by one, but don't do any additional 5550 * charges to mc.to if we have failed in charge once in attach() 5551 * phase. 5552 */ 5553 ret = mem_cgroup_do_precharge(1); 5554 if (!ret) 5555 goto retry; 5556 } 5557 5558 return ret; 5559 } 5560 5561 static void mem_cgroup_move_charge(struct mm_struct *mm) 5562 { 5563 struct vm_area_struct *vma; 5564 5565 lru_add_drain_all(); 5566 retry: 5567 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5568 /* 5569 * Someone who are holding the mmap_sem might be waiting in 5570 * waitq. So we cancel all extra charges, wake up all waiters, 5571 * and retry. Because we cancel precharges, we might not be able 5572 * to move enough charges, but moving charge is a best-effort 5573 * feature anyway, so it wouldn't be a big problem. 5574 */ 5575 __mem_cgroup_clear_mc(); 5576 cond_resched(); 5577 goto retry; 5578 } 5579 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5580 int ret; 5581 struct mm_walk mem_cgroup_move_charge_walk = { 5582 .pmd_entry = mem_cgroup_move_charge_pte_range, 5583 .mm = mm, 5584 .private = vma, 5585 }; 5586 if (is_vm_hugetlb_page(vma)) 5587 continue; 5588 ret = walk_page_range(vma->vm_start, vma->vm_end, 5589 &mem_cgroup_move_charge_walk); 5590 if (ret) 5591 /* 5592 * means we have consumed all precharges and failed in 5593 * doing additional charge. Just abandon here. 
5594 */ 5595 break; 5596 } 5597 up_read(&mm->mmap_sem); 5598 } 5599 5600 static void mem_cgroup_move_task(struct cgroup *cont, 5601 struct cgroup_taskset *tset) 5602 { 5603 struct task_struct *p = cgroup_taskset_first(tset); 5604 struct mm_struct *mm = get_task_mm(p); 5605 5606 if (mm) { 5607 if (mc.to) 5608 mem_cgroup_move_charge(mm); 5609 put_swap_token(mm); 5610 mmput(mm); 5611 } 5612 if (mc.to) 5613 mem_cgroup_clear_mc(); 5614 } 5615 #else /* !CONFIG_MMU */ 5616 static int mem_cgroup_can_attach(struct cgroup *cgroup, 5617 struct cgroup_taskset *tset) 5618 { 5619 return 0; 5620 } 5621 static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 5622 struct cgroup_taskset *tset) 5623 { 5624 } 5625 static void mem_cgroup_move_task(struct cgroup *cont, 5626 struct cgroup_taskset *tset) 5627 { 5628 } 5629 #endif 5630 5631 struct cgroup_subsys mem_cgroup_subsys = { 5632 .name = "memory", 5633 .subsys_id = mem_cgroup_subsys_id, 5634 .create = mem_cgroup_create, 5635 .pre_destroy = mem_cgroup_pre_destroy, 5636 .destroy = mem_cgroup_destroy, 5637 .populate = mem_cgroup_populate, 5638 .can_attach = mem_cgroup_can_attach, 5639 .cancel_attach = mem_cgroup_cancel_attach, 5640 .attach = mem_cgroup_move_task, 5641 .early_init = 0, 5642 .use_id = 1, 5643 }; 5644 5645 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5646 static int __init enable_swap_account(char *s) 5647 { 5648 /* consider enabled if no parameter or 1 is given */ 5649 if (!strcmp(s, "1")) 5650 really_do_swap_account = 1; 5651 else if (!strcmp(s, "0")) 5652 really_do_swap_account = 0; 5653 return 1; 5654 } 5655 __setup("swapaccount=", enable_swap_account); 5656 5657 #endif 5658