/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* to remember the boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		0
#endif


/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
	MEM_CGROUP_STAT_NSTATS,
};

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"swap",
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * The per-memcg event counter is incremented at every pagein/pageout. With
 * THP, it will be incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better than
 * using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	unsigned long last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their limits are maintained in an RB-tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;

	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */

	bool		oom_lock;
	atomic_t	under_oom;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

	atomic_t	dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct tcp_memcontrol tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list. per-memcg */
	struct list_head memcg_slab_caches;
	/* Not a spinlock, we can take a lot of time walking the list */
	struct mutex slab_caches_mutex;
	/* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};

static size_t memcg_size(void)
{
	return sizeof(struct mem_cgroup) +
		nr_node_ids * sizeof(struct mem_cgroup_per_node);
}

/* internal only representation about the status of kmem accounting. */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

/* We account when limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))

#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
{
	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}
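
/*
 * In short, the kmem accounting states manipulated by the helpers above:
 * KMEM_ACCOUNTED_ACTIVE means this cgroup itself accounts kernel memory,
 * KMEM_ACCOUNTED_ACTIVATED records that the static key has been switched on
 * for it, and KMEM_ACCOUNTED_DEAD marks a dead memcg that still has kmem
 * charges outstanding. KMEM_ACCOUNTED_MASK covers the first two, so charging
 * only proceeds once a limit is set and the call sites have been patched.
 */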

static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
	/*
	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
	 * will call css_put() if it sees the memcg is dead.
	 */
	smp_wmb();
	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}

static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
{
	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
				  &memcg->kmem_account_flags);
}
#endif

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long immigrate_flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
}

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child cgroups
 * appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);

static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return container_of(s, struct mem_cgroup, css);
}
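
/*
 * In short, the MEMFILE_* macros above pack the resource type and the
 * res_counter attribute into a single cft->private value: for example,
 * MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) keeps _MEMSWAP in the upper 16 bits
 * and RES_LIMIT in the lower 16, and MEMFILE_TYPE()/MEMFILE_ATTR() recover
 * the two halves again when the file handler runs.
 */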

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
	return &mem_cgroup_from_css(css)->vmpressure;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)

void sock_update_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled) {
		struct mem_cgroup *memcg;
		struct cg_proto *cg_proto;

		BUG_ON(!sk->sk_prot->proto_cgroup);

		/* Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't, however, necessarily happen from
		 * process context. So the test for root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
		 */
		if (sk->sk_cgrp) {
			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
			css_get(&sk->sk_cgrp->memcg->css);
			return;
		}

		rcu_read_lock();
		memcg = mem_cgroup_from_task(current);
		cg_proto = sk->sk_prot->proto_cgroup(memcg);
		if (!mem_cgroup_is_root(memcg) &&
		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
			sk->sk_cgrp = cg_proto;
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(sock_update_memcg);

void sock_release_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
		struct mem_cgroup *memcg;
		WARN_ON(!sk->sk_cgrp->memcg);
		memcg = sk->sk_cgrp->memcg;
		css_put(&sk->sk_cgrp->memcg->css);
	}
}

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg || mem_cgroup_is_root(memcg))
		return NULL;

	return &memcg->tcp_mem.cg_proto;
}
EXPORT_SYMBOL(tcp_proto_cgroup);

static void disarm_sock_keys(struct mem_cgroup *memcg)
{
	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
		return;
	static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif
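
/*
 * Roughly, the lifetime rule implemented above: sk->sk_cgrp pins its memcg
 * with a css reference (css_tryget() for a freshly tagged socket, css_get()
 * when a cloned socket inherits sk_cgrp), and sock_release_memcg() drops it
 * again. disarm_sock_keys() reverts the memcg_socket_limit_enabled static
 * key bump once the memcg itself is torn down.
 */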

#ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
 * There are two main reasons for not using the css_id for this:
 *  1) this works better in sparse environments, where we have a lot of memcgs,
 *     but only a few kmem-limited. Or also, if we have, for instance, 200
 *     memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *     200 entry array for that.
 *
 *  2) In order not to violate the cgroup API, we would like to do all memory
 *     allocation in ->create(). At that point, we haven't yet allocated the
 *     css_id. Having a separate index prevents us from messing with the
 *     cgroup core for this.
 *
 * The current size of the caches array is stored in
 * memcg_limited_groups_array_size. It will double each time we have to
 * increase it.
 */
static DEFINE_IDA(kmem_limited_groups);
int memcg_limited_groups_array_size;

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * css_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);

static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
	if (memcg_kmem_is_active(memcg)) {
		static_key_slow_dec(&memcg_kmem_enabled_key);
		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
	}
	/*
	 * This check can't live in kmem destruction function,
	 * since the charges will outlive the cgroup
	 */
	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
}
#else
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */

static void disarm_static_keys(struct mem_cgroup *memcg)
{
	disarm_sock_keys(memcg);
	disarm_kmem_keys(memcg);
}

static void drain_all_stock_async(struct mem_cgroup *memcg);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
	VM_BUG_ON((unsigned)nid >= nr_node_ids);
	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
	return &memcg->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(memcg, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}
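
/*
 * In short: each node/zone has one of the RB-trees declared earlier, keyed by
 * how far a memcg's usage exceeds its soft limit (usage_in_excess). The
 * rightmost node is therefore the worst offender, which is what soft limit
 * reclaim picks first (see mem_cgroup_largest_soft_limit_node() below).
 */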

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
						tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
	spin_unlock(&mctz->lock);
}


static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
		excess = res_counter_soft_limit_excess(&memcg->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node(node) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(memcg, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(memcg, mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
		!css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter have thresholds and do periodic
 * synchronization to implement "quick" reads. There is a trade-off between
 * reading cost and precision of the value. Then, we may have a chance to
 * implement a periodic synchronization of the counters in memcg as well.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact value
 * because of that accounting. Even if we provided a quick-and-fuzzy read, we
 * would still have to visit all online cpus and sum up. So, for now, the
 * extra synchronization is not implemented. (just implemented for cpu
 * hotplug)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, a threshold and synchronization as in vmstat[] should be
 * implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool anon, int nr_pages)
{
	preempt_disable();

	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

	preempt_enable();
}

unsigned long
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}

static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
			unsigned int lru_mask)
{
	struct mem_cgroup_per_zone *mz;
	enum lru_list lru;
	unsigned long ret = 0;

	mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			ret += mz->lru_size[lru];
	}
	return ret;
}

static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
			int nid, unsigned int lru_mask)
{
	u64 total = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		total += mem_cgroup_zone_nr_lru_pages(memcg,
						nid, zid, lru_mask);

	return total;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	int nid;
	u64 total = 0;

	for_each_node_state(nid, N_MEMORY)
		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return total;
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}
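
/*
 * A rough worked example of the ratelimiting above: nr_page_events is bumped
 * once per page charged or uncharged on this cpu, so with
 * THRESHOLDS_EVENTS_TARGET at 128 the threshold notifiers get re-checked
 * about every 128 page events per cpu, while the softlimit tree and the NUMA
 * scan info (target 1024) are refreshed roughly an order of magnitude less
 * often.
 */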

/*
 * Check events in order.
 *
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	preempt_disable();
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		preempt_enable();

		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	} else
		preempt_enable();
}

struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return mem_cgroup_from_css(
		cgroup_subsys_state(cont, mem_cgroup_subsys_id));
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
}

struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	if (!mm)
		return NULL;
	/*
	 * Because we have no locks, mm->owner may be being moved to another
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			break;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

/*
 * Returns a next (in a pre-order walk) alive memcg (with elevated css
 * ref. count) or NULL if the whole root's subtree has been visited.
 *
 * helper function to be used by mem_cgroup_iter
 */
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
		struct mem_cgroup *last_visited)
{
	struct cgroup *prev_cgroup, *next_cgroup;

	/*
	 * Root is not visited by cgroup iterators so it needs an
	 * explicit visit.
	 */
	if (!last_visited)
		return root;

	prev_cgroup = (last_visited == root) ? NULL
		: last_visited->css.cgroup;
skip_node:
	next_cgroup = cgroup_next_descendant_pre(
			prev_cgroup, root->css.cgroup);

	/*
	 * Even if we found a group we have to make sure it is
	 * alive. css && !memcg means that the groups should be
	 * skipped and we should continue the tree walk.
	 * last_visited css is safe to use because it is
	 * protected by css_get and the tree walk is rcu safe.
	 */
	if (next_cgroup) {
		struct mem_cgroup *mem = mem_cgroup_from_cont(
				next_cgroup);
		if (css_tryget(&mem->css))
			return mem;
		else {
			prev_cgroup = next_cgroup;
			goto skip_node;
		}
	}

	return NULL;
}

static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
{
	/*
	 * When a group in the hierarchy below root is destroyed, the
	 * hierarchy iterator can no longer be trusted since it might
	 * have pointed to the destroyed group. Invalidate it.
	 */
	atomic_inc(&root->dead_count);
}

static struct mem_cgroup *
mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
		     struct mem_cgroup *root,
		     int *sequence)
{
	struct mem_cgroup *position = NULL;
	/*
	 * A cgroup destruction happens in two stages: offlining and
	 * release. They are separated by an RCU grace period.
	 *
	 * If the iterator is valid, we may still race with an
	 * offlining. The RCU lock ensures the object won't be
	 * released, tryget will fail if we lost the race.
	 */
	*sequence = atomic_read(&root->dead_count);
	if (iter->last_dead_count == *sequence) {
		smp_rmb();
		position = iter->last_visited;
		if (position && !css_tryget(&position->css))
			position = NULL;
	}
	return position;
}

static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
				   struct mem_cgroup *last_visited,
				   struct mem_cgroup *new_position,
				   int sequence)
{
	if (last_visited)
		css_put(&last_visited->css);
	/*
	 * We store the sequence count from the time @last_visited was
	 * loaded successfully instead of rereading it here so that we
	 * don't lose destruction events in between. We could have
	 * raced with the destruction of @new_position after all.
	 */
	iter->last_visited = new_position;
	smp_wmb();
	iter->last_dead_count = sequence;
}
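
/*
 * Note on the ordering above: mem_cgroup_iter_update() publishes
 * iter->last_visited before iter->last_dead_count (the smp_wmb()), and
 * mem_cgroup_iter_load() reads them in the opposite order (smp_rmb() after
 * the dead_count check), so a last_dead_count that matches root->dead_count
 * implies the cached last_visited is at least as recent as that count.
 */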

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *last_visited = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		last_visited = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out_css_put;
		return root;
	}

	rcu_read_lock();
	while (!memcg) {
		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
		int uninitialized_var(seq);

		if (reclaim) {
			int nid = zone_to_nid(reclaim->zone);
			int zid = zone_idx(reclaim->zone);
			struct mem_cgroup_per_zone *mz;

			mz = mem_cgroup_zoneinfo(root, nid, zid);
			iter = &mz->reclaim_iter[reclaim->priority];
			if (prev && reclaim->generation != iter->generation) {
				iter->last_visited = NULL;
				goto out_unlock;
			}

			last_visited = mem_cgroup_iter_load(iter, root, &seq);
		}

		memcg = __mem_cgroup_iter_next(root, last_visited);

		if (reclaim) {
			mem_cgroup_iter_update(iter, last_visited, memcg, seq);

			if (!memcg)
				iter->generation++;
			else if (!prev && memcg)
				reclaim->generation = iter->generation;
		}

		if (prev && !memcg)
			goto out_unlock;
	}
out_unlock:
	rcu_read_unlock();
out_css_put:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!memcg))
		goto out;

	switch (idx) {
	case PGFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
		break;
	case PGMAJFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
		break;
	default:
		BUG();
	}
out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
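
/*
 * A minimal usage sketch of the iteration macros above, with a hypothetical
 * predicate some_condition(); the same pattern appears in
 * mem_cgroup_oom_lock() later in this file. When a subtree walk is abandoned
 * early, it must be finished via mem_cgroup_iter_break() so that the css
 * reference held on the current position is dropped:
 *
 *	for_each_mem_cgroup_tree(iter, memcg) {
 *		if (some_condition(iter)) {
 *			mem_cgroup_iter_break(memcg, iter);
 *			break;
 *		}
 *	}
 */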

/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg. This can be the global zone lruvec, if the memory controller
 * is disabled.
 */
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
				      struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_zone *mz;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/*
 * The following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by the global LRU routines independently from memcg.
 * What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache; it is added to the LRU before charge.
 * If the PCG_USED bit is not set, the page_cgroup is not added to this
 * private LRU. When moving account, the page is not on the LRU; it is
 * isolated.
 */

/**
 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
 * @page: the page
 * @zone: zone of the page
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct page_cgroup *pc;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	pc = lookup_page_cgroup(page);
	memcg = pc->mem_cgroup;

	/*
	 * Surreptitiously switch any uncharged offlist page to root:
	 * an uncharged page off lru does nothing to secure
	 * its former mem_cgroup from sudden removal.
	 *
	 * Our caller holds lru_lock, and PageCgroupUsed is updated
	 * under page_cgroup lock: between them, they make all uses
	 * of pc->mem_cgroup safe.
	 */
	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
		pc->mem_cgroup = memcg = root_mem_cgroup;

	mz = page_cgroup_zoneinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called when a page is added to or removed from an
 * lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int nr_pages)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long *lru_size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	lru_size = mz->lru_size + lru;
	*lru_size += nr_pages;
	VM_BUG_ON((long)(*lru_size) < 0);
}

/*
 * Checks whether the given memcg is the same as root_memcg or lies within
 * root_memcg's hierarchy subtree
 */
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				  struct mem_cgroup *memcg)
{
	if (root_memcg == memcg)
		return true;
	if (!root_memcg->use_hierarchy || !memcg)
		return false;
	return css_is_ancestor(&memcg->css, &root_memcg->css);
}

static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				       struct mem_cgroup *memcg)
{
	bool ret;

	rcu_read_lock();
	ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
	rcu_read_unlock();
	return ret;
}

bool task_in_mem_cgroup(struct task_struct *task,
			const struct mem_cgroup *memcg)
{
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		curr = try_get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		curr = mem_cgroup_from_task(task);
		if (curr)
			css_get(&curr->css);
		rcu_read_unlock();
	}
	if (!curr)
		return false;
	/*
	 * We should check use_hierarchy of "memcg" not "curr". Checking
	 * use_hierarchy of "curr" here would make this function return true
	 * if hierarchy is enabled in "curr" and "curr" is a child of "memcg"
	 * in the *cgroup* hierarchy (even if use_hierarchy is disabled in
	 * "memcg").
	 */
	ret = mem_cgroup_same_or_subtree(memcg, curr);
	css_put(&curr->css);
	return ret;
}

int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
	unsigned long inactive_ratio;
	unsigned long inactive;
	unsigned long active;
	unsigned long gb;

	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	return inactive * inactive_ratio < active;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long long margin;

	margin = res_counter_margin(&memcg->res);
	if (do_swap_account)
		margin = min(margin, res_counter_margin(&memcg->memsw));
	return margin >> PAGE_SHIFT;
}
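
/*
 * A rough example of the heuristic in mem_cgroup_inactive_anon_is_low()
 * above: with 4GB of anon pages on this lruvec, gb = 4 and inactive_ratio =
 * int_sqrt(40) = 6, so the inactive list is only reported as low while
 * inactive * 6 < active, i.e. while less than roughly one seventh of the
 * anon pages sit on the inactive list.
 */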

int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	return memcg->swappiness;
}

/*
 * memcg->moving_account is used for checking the possibility that some thread
 * is calling move_account(). When a thread on CPU-A starts moving pages under
 * a memcg, other threads should check memcg->moving_account under
 * rcu_read_lock(), like this:
 *
 *    CPU-A                          CPU-B
 *                                   rcu_read_lock()
 *    memcg->moving_account+1        if (memcg->moving_account)
 *                                           take heavy locks.
 *    synchronize_rcu()              update something.
 *    rcu_read_unlock()
 *    start move here.
 */

/* for quick checking without looking up memcg */
atomic_t memcg_moving __read_mostly;

static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg_moving);
	atomic_inc(&memcg->moving_account);
	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
	/*
	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
	 * We check NULL in callee rather than caller.
	 */
	if (memcg) {
		atomic_dec(&memcg_moving);
		atomic_dec(&memcg->moving_account);
	}
}

/*
 * Two routines for checking whether "mem" is under move_account() or not.
 *
 * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
 *			  is used for avoiding races in accounting. If true,
 *			  pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
 *			  under the hierarchy of moving cgroups. This is for
 *			  waiting at high memory pressure caused by "move".
 */

static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
{
	VM_BUG_ON(!rcu_read_lock_held());
	return atomic_read(&memcg->moving_account) > 0;
}

static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_same_or_subtree(memcg, from)
		|| mem_cgroup_same_or_subtree(memcg, to);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

/*
 * Take this lock when
 * - a code tries to modify page's memcg while it's USED.
 * - a code tries to modify page state accounting in a memcg.
 * see mem_cgroup_stolen(), too.
 */
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
				  unsigned long *flags)
{
	spin_lock_irqsave(&memcg->move_lock, *flags);
}

static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
				unsigned long *flags)
{
	spin_unlock_irqrestore(&memcg->move_lock, *flags);
}

#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	/*
	 * Need a buffer in BSS, can't rely on allocations. The code relies
	 * on the assumption that OOM is serialized for memory controller.
	 * If this assumption is broken, revisit this code.
	 */
	static char memcg_name[PATH_MAX];
	int ret;
	struct mem_cgroup *iter;
	unsigned int i;

	if (!p)
		return;

	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		/*
		 * Unfortunately, we are unable to convert to a useful name,
		 * but we'll still print out the usage information.
		 */
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	pr_info("Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	/*
	 * Continues from above, so we don't need a KERN_ level
	 */
	pr_cont(" as a result of limit of %s\n", memcg_name);
done:

	pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
	pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));

	for_each_mem_cgroup_tree(iter, memcg) {
		pr_info("Memory cgroup stats");

		rcu_read_lock();
		ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
		if (!ret)
			pr_cont(" for %s", memcg_name);
		rcu_read_unlock();
		pr_cont(":");

		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
				continue;
			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
				K(mem_cgroup_read_stat(iter, i)));
		}

		for (i = 0; i < NR_LRU_LISTS; i++)
			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));

		pr_cont("\n");
	}
}

/*
 * This function returns the number of memcg under the hierarchy tree. Returns
 * 1 (self count) if no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		num++;
	return num;
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

	/*
	 * Do not consider swap space if we cannot swap due to swappiness
	 */
	if (mem_cgroup_swappiness(memcg)) {
		u64 memsw;

		limit += total_swap_pages << PAGE_SHIFT;
		memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

		/*
		 * If memsw is finite and limits the amount of swap space
		 * available to this memcg, return that limit.
		 */
		limit = min(limit, memsw);
	}

	return limit;
}

static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct mem_cgroup *iter;
	unsigned long chosen_points = 0;
	unsigned long totalpages;
	unsigned int points = 0;
	struct task_struct *chosen = NULL;

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it. The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
	for_each_mem_cgroup_tree(iter, memcg) {
		struct cgroup *cgroup = iter->css.cgroup;
		struct cgroup_iter it;
		struct task_struct *task;

		cgroup_iter_start(cgroup, &it);
		while ((task = cgroup_iter_next(cgroup, &it))) {
			switch (oom_scan_process_thread(task, totalpages, NULL,
							false)) {
			case OOM_SCAN_SELECT:
				if (chosen)
					put_task_struct(chosen);
				chosen = task;
				chosen_points = ULONG_MAX;
				get_task_struct(chosen);
				/* fall through */
			case OOM_SCAN_CONTINUE:
				continue;
			case OOM_SCAN_ABORT:
				cgroup_iter_end(cgroup, &it);
				mem_cgroup_iter_break(memcg, iter);
				if (chosen)
					put_task_struct(chosen);
				return;
			case OOM_SCAN_OK:
				break;
			};
			points = oom_badness(task, memcg, NULL, totalpages);
			if (points > chosen_points) {
				if (chosen)
					put_task_struct(chosen);
				chosen = task;
				chosen_points = points;
				get_task_struct(chosen);
			}
		}
		cgroup_iter_end(cgroup, &it);
	}

	if (!chosen)
		return;
	points = chosen_points * 1000 / totalpages;
	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
			 NULL, "Memory cgroup out of memory");
}
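
/*
 * Note on the scaling in mem_cgroup_out_of_memory() above: the winner's raw
 * oom_badness() score is renormalized as chosen_points * 1000 / totalpages,
 * i.e. onto a 0..1000 range, before being handed to oom_kill_process().
 */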

static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
					gfp_t gfp_mask,
					unsigned long flags)
{
	unsigned long total = 0;
	bool noswap = false;
	int loop;

	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
		noswap = true;
	if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
		noswap = true;

	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
		if (loop)
			drain_all_stock_async(memcg);
		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
		/*
		 * Allow limit shrinkers, which are triggered directly
		 * by userspace, to catch signals and stop reclaim
		 * after minimal progress, regardless of the margin.
		 */
		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
			break;
		if (mem_cgroup_margin(memcg))
			break;
		/*
		 * If nothing was reclaimed after two attempts, there
		 * may be no reclaimable pages in this hierarchy.
		 */
		if (loop && !total)
			break;
	}
	return total;
}

/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap : specify true here if the caller wants file-only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
		int nid, bool noswap)
{
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
		return true;
	if (noswap || !total_swap_pages)
		return false;
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
		return true;
	return false;

}
#if MAX_NUMNODES > 1

/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist. So update the list loosely once per 10 secs.
 *
 */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	memcg->scan_nodes = node_states[N_MEMORY];

	for_each_node_mask(nid, node_states[N_MEMORY]) {

		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
			node_clear(nid, memcg->scan_nodes);
	}

	atomic_set(&memcg->numainfo_events, 0);
	atomic_set(&memcg->numainfo_updating, 0);
}

/*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is OK. Considering
 * memory reclaim from the current node, there are pros and cons:
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used. So, it may make the LRU bad. And if several threads
 * hit limits, they will see contention on a node. But freeing from a remote
 * node means more costs for memory reclaim because of memory latency.
 *
 * Now, we use round-robin. A better algorithm is welcomed.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

	node = next_node(node, memcg->scan_nodes);
	if (node == MAX_NUMNODES)
		node = first_node(memcg->scan_nodes);
	/*
	 * We call this when we hit the limit, not when pages are added to the
	 * LRU. No LRU may hold pages because all pages are UNEVICTABLE, or
	 * the memcg is too small and all pages are not on the LRU. In that
	 * case, we use the current node.
1971 */ 1972 if (unlikely(node == MAX_NUMNODES)) 1973 node = numa_node_id(); 1974 1975 memcg->last_scanned_node = node; 1976 return node; 1977 } 1978 1979 /* 1980 * Check all nodes for whether they contain reclaimable pages. 1981 * For a quick scan, we make use of scan_nodes. This will allow us to skip 1982 * unused nodes. But scan_nodes is lazily updated and may not contain 1983 * enough new information. We need to double check. 1984 */ 1985 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1986 { 1987 int nid; 1988 1989 /* 1990 * quick check... making use of scan_nodes. 1991 * We can skip unused nodes. 1992 */ 1993 if (!nodes_empty(memcg->scan_nodes)) { 1994 for (nid = first_node(memcg->scan_nodes); 1995 nid < MAX_NUMNODES; 1996 nid = next_node(nid, memcg->scan_nodes)) { 1997 1998 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1999 return true; 2000 } 2001 } 2002 /* 2003 * Check the rest of the nodes. 2004 */ 2005 for_each_node_state(nid, N_MEMORY) { 2006 if (node_isset(nid, memcg->scan_nodes)) 2007 continue; 2008 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 2009 return true; 2010 } 2011 return false; 2012 } 2013 2014 #else 2015 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 2016 { 2017 return 0; 2018 } 2019 2020 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 2021 { 2022 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 2023 } 2024 #endif 2025 2026 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 2027 struct zone *zone, 2028 gfp_t gfp_mask, 2029 unsigned long *total_scanned) 2030 { 2031 struct mem_cgroup *victim = NULL; 2032 int total = 0; 2033 int loop = 0; 2034 unsigned long excess; 2035 unsigned long nr_scanned; 2036 struct mem_cgroup_reclaim_cookie reclaim = { 2037 .zone = zone, 2038 .priority = 0, 2039 }; 2040 2041 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 2042 2043 while (1) { 2044 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 2045 if (!victim) { 2046 loop++; 2047 if (loop >= 2) { 2048 /* 2049 * If we have not been able to reclaim 2050 * anything, it might be because there are 2051 * no reclaimable pages under this hierarchy. 2052 */ 2053 if (!total) 2054 break; 2055 /* 2056 * We want to do more targeted reclaim. 2057 * excess >> 2 is not so excessive that we 2058 * reclaim too much, nor so small that we keep 2059 * coming back to reclaim from this cgroup. 2060 */ 2061 if (total >= (excess >> 2) || 2062 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 2063 break; 2064 } 2065 continue; 2066 } 2067 if (!mem_cgroup_reclaimable(victim, false)) 2068 continue; 2069 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 2070 zone, &nr_scanned); 2071 *total_scanned += nr_scanned; 2072 if (!res_counter_soft_limit_excess(&root_memcg->res)) 2073 break; 2074 } 2075 mem_cgroup_iter_break(root_memcg, victim); 2076 return total; 2077 } 2078 2079 /* 2080 * Check whether the OOM killer is already running under our hierarchy. 2081 * If someone is running, return false. 2082 * Has to be called with memcg_oom_lock held. 2083 */ 2084 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 2085 { 2086 struct mem_cgroup *iter, *failed = NULL; 2087 2088 for_each_mem_cgroup_tree(iter, memcg) { 2089 if (iter->oom_lock) { 2090 /* 2091 * this subtree of our hierarchy is already locked 2092 * so we cannot take the lock.
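 * Remember the group that failed so the unlock loop below knows where to stop unwinding.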
2093 */ 2094 failed = iter; 2095 mem_cgroup_iter_break(memcg, iter); 2096 break; 2097 } else 2098 iter->oom_lock = true; 2099 } 2100 2101 if (!failed) 2102 return true; 2103 2104 /* 2105 * OK, we failed to lock the whole subtree so we have to clean up 2106 * what we set up to the failing subtree 2107 */ 2108 for_each_mem_cgroup_tree(iter, memcg) { 2109 if (iter == failed) { 2110 mem_cgroup_iter_break(memcg, iter); 2111 break; 2112 } 2113 iter->oom_lock = false; 2114 } 2115 return false; 2116 } 2117 2118 /* 2119 * Has to be called with memcg_oom_lock 2120 */ 2121 static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 2122 { 2123 struct mem_cgroup *iter; 2124 2125 for_each_mem_cgroup_tree(iter, memcg) 2126 iter->oom_lock = false; 2127 return 0; 2128 } 2129 2130 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 2131 { 2132 struct mem_cgroup *iter; 2133 2134 for_each_mem_cgroup_tree(iter, memcg) 2135 atomic_inc(&iter->under_oom); 2136 } 2137 2138 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 2139 { 2140 struct mem_cgroup *iter; 2141 2142 /* 2143 * When a new child is created while the hierarchy is under oom, 2144 * mem_cgroup_oom_lock() may not be called. We have to use 2145 * atomic_add_unless() here. 2146 */ 2147 for_each_mem_cgroup_tree(iter, memcg) 2148 atomic_add_unless(&iter->under_oom, -1, 0); 2149 } 2150 2151 static DEFINE_SPINLOCK(memcg_oom_lock); 2152 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 2153 2154 struct oom_wait_info { 2155 struct mem_cgroup *memcg; 2156 wait_queue_t wait; 2157 }; 2158 2159 static int memcg_oom_wake_function(wait_queue_t *wait, 2160 unsigned mode, int sync, void *arg) 2161 { 2162 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2163 struct mem_cgroup *oom_wait_memcg; 2164 struct oom_wait_info *oom_wait_info; 2165 2166 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2167 oom_wait_memcg = oom_wait_info->memcg; 2168 2169 /* 2170 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 2171 * Then we can use css_is_ancestor without taking care of RCU. 2172 */ 2173 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2174 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2175 return 0; 2176 return autoremove_wake_function(wait, mode, sync, arg); 2177 } 2178 2179 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2180 { 2181 /* for filtering, pass "memcg" as argument. */ 2182 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2183 } 2184 2185 static void memcg_oom_recover(struct mem_cgroup *memcg) 2186 { 2187 if (memcg && atomic_read(&memcg->under_oom)) 2188 memcg_wakeup_oom(memcg); 2189 } 2190 2191 /* 2192 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 2193 */ 2194 static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, 2195 int order) 2196 { 2197 struct oom_wait_info owait; 2198 bool locked, need_to_kill; 2199 2200 owait.memcg = memcg; 2201 owait.wait.flags = 0; 2202 owait.wait.func = memcg_oom_wake_function; 2203 owait.wait.private = current; 2204 INIT_LIST_HEAD(&owait.wait.task_list); 2205 need_to_kill = true; 2206 mem_cgroup_mark_under_oom(memcg); 2207 2208 /* At first, try to OOM lock hierarchy under memcg.*/ 2209 spin_lock(&memcg_oom_lock); 2210 locked = mem_cgroup_oom_lock(memcg); 2211 /* 2212 * Even if signal_pending(), we can't quit charge() loop without 2213 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 2214 * under OOM is always welcomed, use TASK_KILLABLE here. 
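 * The wait entry uses memcg_oom_wake_function(), so only wakeups coming from the same memcg hierarchy (see memcg_wakeup_oom()) will wake us.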
2215 */ 2216 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2217 if (!locked || memcg->oom_kill_disable) 2218 need_to_kill = false; 2219 if (locked) 2220 mem_cgroup_oom_notify(memcg); 2221 spin_unlock(&memcg_oom_lock); 2222 2223 if (need_to_kill) { 2224 finish_wait(&memcg_oom_waitq, &owait.wait); 2225 mem_cgroup_out_of_memory(memcg, mask, order); 2226 } else { 2227 schedule(); 2228 finish_wait(&memcg_oom_waitq, &owait.wait); 2229 } 2230 spin_lock(&memcg_oom_lock); 2231 if (locked) 2232 mem_cgroup_oom_unlock(memcg); 2233 memcg_wakeup_oom(memcg); 2234 spin_unlock(&memcg_oom_lock); 2235 2236 mem_cgroup_unmark_under_oom(memcg); 2237 2238 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2239 return false; 2240 /* Give a chance to the dying process */ 2241 schedule_timeout_uninterruptible(1); 2242 return true; 2243 } 2244 2245 /* 2246 * Currently used to update mapped file statistics, but the routine can be 2247 * generalized to update other statistics as well. 2248 * 2249 * Notes: Race condition 2250 * 2251 * We usually use page_cgroup_lock() for accessing page_cgroup members but 2252 * it tends to be costly. Under some conditions, however, we don't need 2253 * to do so _always_. 2254 * 2255 * Considering "charge", lock_page_cgroup() is not required because all 2256 * file-stat operations happen after a page is attached to the radix-tree. There 2257 * is no race with "charge". 2258 * 2259 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2260 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even 2261 * if there is a race with "uncharge". The statistics themselves are properly handled 2262 * by flags. 2263 * 2264 * Considering "move", this is the only case where we see a race. To make the race 2265 * window small, we check mm->moving_account and detect the possibility of a race. 2266 * If there is one, we take a lock. 2267 */ 2268 2269 void __mem_cgroup_begin_update_page_stat(struct page *page, 2270 bool *locked, unsigned long *flags) 2271 { 2272 struct mem_cgroup *memcg; 2273 struct page_cgroup *pc; 2274 2275 pc = lookup_page_cgroup(page); 2276 again: 2277 memcg = pc->mem_cgroup; 2278 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2279 return; 2280 /* 2281 * If this memory cgroup is not under account moving, we don't 2282 * need to take move_lock_mem_cgroup(). Because we already hold 2283 * rcu_read_lock(), any calls to move_account will be delayed until 2284 * rcu_read_unlock() if mem_cgroup_stolen() == true. 2285 */ 2286 if (!mem_cgroup_stolen(memcg)) 2287 return; 2288 2289 move_lock_mem_cgroup(memcg, flags); 2290 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2291 move_unlock_mem_cgroup(memcg, flags); 2292 goto again; 2293 } 2294 *locked = true; 2295 } 2296 2297 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 2298 { 2299 struct page_cgroup *pc = lookup_page_cgroup(page); 2300 2301 /* 2302 * It's guaranteed that pc->mem_cgroup never changes while 2303 * the lock is held because a routine that modifies pc->mem_cgroup 2304 * should take move_lock_mem_cgroup().
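 * That is why it is safe to re-read pc->mem_cgroup from the page_cgroup here when unlocking.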
2305 */ 2306 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 2307 } 2308 2309 void mem_cgroup_update_page_stat(struct page *page, 2310 enum mem_cgroup_page_stat_item idx, int val) 2311 { 2312 struct mem_cgroup *memcg; 2313 struct page_cgroup *pc = lookup_page_cgroup(page); 2314 unsigned long uninitialized_var(flags); 2315 2316 if (mem_cgroup_disabled()) 2317 return; 2318 2319 memcg = pc->mem_cgroup; 2320 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2321 return; 2322 2323 switch (idx) { 2324 case MEMCG_NR_FILE_MAPPED: 2325 idx = MEM_CGROUP_STAT_FILE_MAPPED; 2326 break; 2327 default: 2328 BUG(); 2329 } 2330 2331 this_cpu_add(memcg->stat->count[idx], val); 2332 } 2333 2334 /* 2335 * size of first charge trial. "32" comes from vmscan.c's magic value. 2336 * TODO: maybe necessary to use big numbers in big irons. 2337 */ 2338 #define CHARGE_BATCH 32U 2339 struct memcg_stock_pcp { 2340 struct mem_cgroup *cached; /* this never be root cgroup */ 2341 unsigned int nr_pages; 2342 struct work_struct work; 2343 unsigned long flags; 2344 #define FLUSHING_CACHED_CHARGE 0 2345 }; 2346 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2347 static DEFINE_MUTEX(percpu_charge_mutex); 2348 2349 /** 2350 * consume_stock: Try to consume stocked charge on this cpu. 2351 * @memcg: memcg to consume from. 2352 * @nr_pages: how many pages to charge. 2353 * 2354 * The charges will only happen if @memcg matches the current cpu's memcg 2355 * stock, and at least @nr_pages are available in that stock. Failure to 2356 * service an allocation will refill the stock. 2357 * 2358 * returns true if successful, false otherwise. 2359 */ 2360 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2361 { 2362 struct memcg_stock_pcp *stock; 2363 bool ret = true; 2364 2365 if (nr_pages > CHARGE_BATCH) 2366 return false; 2367 2368 stock = &get_cpu_var(memcg_stock); 2369 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2370 stock->nr_pages -= nr_pages; 2371 else /* need to call res_counter_charge */ 2372 ret = false; 2373 put_cpu_var(memcg_stock); 2374 return ret; 2375 } 2376 2377 /* 2378 * Returns stocks cached in percpu to res_counter and reset cached information. 2379 */ 2380 static void drain_stock(struct memcg_stock_pcp *stock) 2381 { 2382 struct mem_cgroup *old = stock->cached; 2383 2384 if (stock->nr_pages) { 2385 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2386 2387 res_counter_uncharge(&old->res, bytes); 2388 if (do_swap_account) 2389 res_counter_uncharge(&old->memsw, bytes); 2390 stock->nr_pages = 0; 2391 } 2392 stock->cached = NULL; 2393 } 2394 2395 /* 2396 * This must be called under preempt disabled or must be called by 2397 * a thread which is pinned to local cpu. 2398 */ 2399 static void drain_local_stock(struct work_struct *dummy) 2400 { 2401 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2402 drain_stock(stock); 2403 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2404 } 2405 2406 static void __init memcg_stock_init(void) 2407 { 2408 int cpu; 2409 2410 for_each_possible_cpu(cpu) { 2411 struct memcg_stock_pcp *stock = 2412 &per_cpu(memcg_stock, cpu); 2413 INIT_WORK(&stock->work, drain_local_stock); 2414 } 2415 } 2416 2417 /* 2418 * Cache charges(val) which is from res_counter, to local per_cpu area. 2419 * This will be consumed by consume_stock() function, later. 
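 * For example, __mem_cgroup_try_charge() charges a full CHARGE_BATCH when it can and refills the stock with the pages it did not need.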
2420 */ 2421 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2422 { 2423 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2424 2425 if (stock->cached != memcg) { /* reset if necessary */ 2426 drain_stock(stock); 2427 stock->cached = memcg; 2428 } 2429 stock->nr_pages += nr_pages; 2430 put_cpu_var(memcg_stock); 2431 } 2432 2433 /* 2434 * Drains all per-CPU charge caches for the given root_memcg and the subtree 2435 * of the hierarchy under it. The sync flag says whether we should block 2436 * until the work is done. 2437 */ 2438 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2439 { 2440 int cpu, curcpu; 2441 2442 /* Notify other cpus that system-wide "drain" is running */ 2443 get_online_cpus(); 2444 curcpu = get_cpu(); 2445 for_each_online_cpu(cpu) { 2446 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2447 struct mem_cgroup *memcg; 2448 2449 memcg = stock->cached; 2450 if (!memcg || !stock->nr_pages) 2451 continue; 2452 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2453 continue; 2454 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2455 if (cpu == curcpu) 2456 drain_local_stock(&stock->work); 2457 else 2458 schedule_work_on(cpu, &stock->work); 2459 } 2460 } 2461 put_cpu(); 2462 2463 if (!sync) 2464 goto out; 2465 2466 for_each_online_cpu(cpu) { 2467 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2468 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2469 flush_work(&stock->work); 2470 } 2471 out: 2472 put_online_cpus(); 2473 } 2474 2475 /* 2476 * Tries to drain stocked charges on other cpus. This function is asynchronous 2477 * and just schedules a work item per cpu to drain locally on each cpu. Callers can 2478 * expect some charges to be returned to the res_counter later but cannot wait 2479 * for that. 2480 */ 2481 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2482 { 2483 /* 2484 * If someone is already draining, avoid adding more kworker runs. 2485 */ 2486 if (!mutex_trylock(&percpu_charge_mutex)) 2487 return; 2488 drain_all_stock(root_memcg, false); 2489 mutex_unlock(&percpu_charge_mutex); 2490 } 2491 2492 /* This is a synchronous drain interface. */ 2493 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2494 { 2495 /* called when force_empty is called */ 2496 mutex_lock(&percpu_charge_mutex); 2497 drain_all_stock(root_memcg, true); 2498 mutex_unlock(&percpu_charge_mutex); 2499 } 2500 2501 /* 2502 * This function drains the percpu counter values from a DEAD cpu and 2503 * moves them to the local cpu. Note that this function can be preempted.
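 * The drained values are folded into memcg->nocpu_base under pcp_counter_lock.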
2504 */ 2505 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2506 { 2507 int i; 2508 2509 spin_lock(&memcg->pcp_counter_lock); 2510 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2511 long x = per_cpu(memcg->stat->count[i], cpu); 2512 2513 per_cpu(memcg->stat->count[i], cpu) = 0; 2514 memcg->nocpu_base.count[i] += x; 2515 } 2516 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2517 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2518 2519 per_cpu(memcg->stat->events[i], cpu) = 0; 2520 memcg->nocpu_base.events[i] += x; 2521 } 2522 spin_unlock(&memcg->pcp_counter_lock); 2523 } 2524 2525 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2526 unsigned long action, 2527 void *hcpu) 2528 { 2529 int cpu = (unsigned long)hcpu; 2530 struct memcg_stock_pcp *stock; 2531 struct mem_cgroup *iter; 2532 2533 if (action == CPU_ONLINE) 2534 return NOTIFY_OK; 2535 2536 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2537 return NOTIFY_OK; 2538 2539 for_each_mem_cgroup(iter) 2540 mem_cgroup_drain_pcp_counter(iter, cpu); 2541 2542 stock = &per_cpu(memcg_stock, cpu); 2543 drain_stock(stock); 2544 return NOTIFY_OK; 2545 } 2546 2547 2548 /* See __mem_cgroup_try_charge() for details */ 2549 enum { 2550 CHARGE_OK, /* success */ 2551 CHARGE_RETRY, /* need to retry but retry is not bad */ 2552 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2553 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2554 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2555 }; 2556 2557 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2558 unsigned int nr_pages, unsigned int min_pages, 2559 bool oom_check) 2560 { 2561 unsigned long csize = nr_pages * PAGE_SIZE; 2562 struct mem_cgroup *mem_over_limit; 2563 struct res_counter *fail_res; 2564 unsigned long flags = 0; 2565 int ret; 2566 2567 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2568 2569 if (likely(!ret)) { 2570 if (!do_swap_account) 2571 return CHARGE_OK; 2572 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2573 if (likely(!ret)) 2574 return CHARGE_OK; 2575 2576 res_counter_uncharge(&memcg->res, csize); 2577 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2578 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2579 } else 2580 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2581 /* 2582 * Never reclaim on behalf of optional batching, retry with a 2583 * single page instead. 2584 */ 2585 if (nr_pages > min_pages) 2586 return CHARGE_RETRY; 2587 2588 if (!(gfp_mask & __GFP_WAIT)) 2589 return CHARGE_WOULDBLOCK; 2590 2591 if (gfp_mask & __GFP_NORETRY) 2592 return CHARGE_NOMEM; 2593 2594 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2595 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2596 return CHARGE_RETRY; 2597 /* 2598 * Even though the limit is exceeded at this point, reclaim 2599 * may have been able to free some pages. Retry the charge 2600 * before killing the task. 2601 * 2602 * Only for regular pages, though: huge pages are rather 2603 * unlikely to succeed so close to the limit, and we fall back 2604 * to regular pages anyway in case of failure. 2605 */ 2606 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) 2607 return CHARGE_RETRY; 2608 2609 /* 2610 * At task move, charge accounts can be doubly counted. So, it's 2611 * better to wait until the end of task_move if something is going on. 
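 * (If mem_cgroup_wait_acct_move() reports such a pending move, we simply retry the charge below.)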
2612 */ 2613 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2614 return CHARGE_RETRY; 2615 2616 /* If we don't need to call the oom-killer at all, return immediately */ 2617 if (!oom_check) 2618 return CHARGE_NOMEM; 2619 /* check OOM */ 2620 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) 2621 return CHARGE_OOM_DIE; 2622 2623 return CHARGE_RETRY; 2624 } 2625 2626 /* 2627 * __mem_cgroup_try_charge() does 2628 * 1. detect the memcg to be charged against from the passed *mm and *ptr, 2629 * 2. update the res_counter 2630 * 3. call memory reclaim if necessary. 2631 * 2632 * In some special cases, if the task is dying (fatal_signal_pending()) or 2633 * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup 2634 * to *ptr. There are two reasons for this. 1: dying threads should quit as soon 2635 * as possible without any hazards. 2: all pages should have a valid 2636 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg 2637 * pointer, that is treated as a charge to root_mem_cgroup. 2638 * 2639 * So __mem_cgroup_try_charge() will return 2640 * 0 ... on success, filling *ptr with a valid memcg pointer. 2641 * -ENOMEM ... charge failure because of resource limits. 2642 * -EINTR ... if the thread is dying. *ptr is filled with root_mem_cgroup. 2643 * 2644 * Unlike the exported interface, an "oom" parameter is added. If oom==true, 2645 * the oom-killer can be invoked. 2646 */ 2647 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2648 gfp_t gfp_mask, 2649 unsigned int nr_pages, 2650 struct mem_cgroup **ptr, 2651 bool oom) 2652 { 2653 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2654 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2655 struct mem_cgroup *memcg = NULL; 2656 int ret; 2657 2658 /* 2659 * Unlike the global VM's OOM kill, we're not in a system-level memory 2660 * shortage. So, allow dying processes to go ahead in addition to 2661 * MEMDIE processes. 2662 */ 2663 if (unlikely(test_thread_flag(TIF_MEMDIE) 2664 || fatal_signal_pending(current))) 2665 goto bypass; 2666 2667 /* 2668 * We always charge the cgroup the mm_struct belongs to. 2669 * The mm_struct's mem_cgroup changes on task migration if the 2670 * thread group leader migrates. It's possible that mm is not 2671 * set, if so charge the root memcg (happens for pagecache usage). 2672 */ 2673 if (!*ptr && !mm) 2674 *ptr = root_mem_cgroup; 2675 again: 2676 if (*ptr) { /* css should be a valid one */ 2677 memcg = *ptr; 2678 if (mem_cgroup_is_root(memcg)) 2679 goto done; 2680 if (consume_stock(memcg, nr_pages)) 2681 goto done; 2682 css_get(&memcg->css); 2683 } else { 2684 struct task_struct *p; 2685 2686 rcu_read_lock(); 2687 p = rcu_dereference(mm->owner); 2688 /* 2689 * Because we don't have task_lock(), "p" can exit. 2690 * In that case, "memcg" can point to root or p can be NULL due to 2691 * a race with swapoff. Then, we have a small risk of mis-accounting. 2692 * But this kind of mis-accounting by race can always happen because 2693 * we don't have cgroup_mutex(). Preventing it is overkill and we allow 2694 * that small race here. 2695 * (*) swapoff et al. will charge against the mm_struct, not against 2696 * the task_struct. So, mm->owner can be NULL. 2697 */ 2698 memcg = mem_cgroup_from_task(p); 2699 if (!memcg) 2700 memcg = root_mem_cgroup; 2701 if (mem_cgroup_is_root(memcg)) { 2702 rcu_read_unlock(); 2703 goto done; 2704 } 2705 if (consume_stock(memcg, nr_pages)) { 2706 /* 2707 * It seems dangerous to access memcg without css_get(). 2708 * But considering how consume_stock() works, it's not 2709 * necessary.
If consume_stock() succeeds, some charges 2710 * from this memcg are cached on this cpu. So, we 2711 * don't need to call css_get()/css_tryget() before 2712 * calling consume_stock(). 2713 */ 2714 rcu_read_unlock(); 2715 goto done; 2716 } 2717 /* after here, we may be blocked. We need to get a refcnt. */ 2718 if (!css_tryget(&memcg->css)) { 2719 rcu_read_unlock(); 2720 goto again; 2721 } 2722 rcu_read_unlock(); 2723 } 2724 2725 do { 2726 bool oom_check; 2727 2728 /* If killed, bypass the charge */ 2729 if (fatal_signal_pending(current)) { 2730 css_put(&memcg->css); 2731 goto bypass; 2732 } 2733 2734 oom_check = false; 2735 if (oom && !nr_oom_retries) { 2736 oom_check = true; 2737 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2738 } 2739 2740 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, 2741 oom_check); 2742 switch (ret) { 2743 case CHARGE_OK: 2744 break; 2745 case CHARGE_RETRY: /* not in OOM situation but retry */ 2746 batch = nr_pages; 2747 css_put(&memcg->css); 2748 memcg = NULL; 2749 goto again; 2750 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2751 css_put(&memcg->css); 2752 goto nomem; 2753 case CHARGE_NOMEM: /* OOM routine works */ 2754 if (!oom) { 2755 css_put(&memcg->css); 2756 goto nomem; 2757 } 2758 /* If oom, we never return -ENOMEM */ 2759 nr_oom_retries--; 2760 break; 2761 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2762 css_put(&memcg->css); 2763 goto bypass; 2764 } 2765 } while (ret != CHARGE_OK); 2766 2767 if (batch > nr_pages) 2768 refill_stock(memcg, batch - nr_pages); 2769 css_put(&memcg->css); 2770 done: 2771 *ptr = memcg; 2772 return 0; 2773 nomem: 2774 *ptr = NULL; 2775 return -ENOMEM; 2776 bypass: 2777 *ptr = root_mem_cgroup; 2778 return -EINTR; 2779 } 2780 2781 /* 2782 * Sometimes we have to undo a charge we got by try_charge(). 2783 * This function is for that: it uncharges and puts the css refcnt 2784 * taken by try_charge(). 2785 */ 2786 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2787 unsigned int nr_pages) 2788 { 2789 if (!mem_cgroup_is_root(memcg)) { 2790 unsigned long bytes = nr_pages * PAGE_SIZE; 2791 2792 res_counter_uncharge(&memcg->res, bytes); 2793 if (do_swap_account) 2794 res_counter_uncharge(&memcg->memsw, bytes); 2795 } 2796 } 2797 2798 /* 2799 * Cancel charges in this cgroup... this doesn't propagate to the parent cgroup. 2800 * This is useful when moving usage to the parent cgroup. 2801 */ 2802 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, 2803 unsigned int nr_pages) 2804 { 2805 unsigned long bytes = nr_pages * PAGE_SIZE; 2806 2807 if (mem_cgroup_is_root(memcg)) 2808 return; 2809 2810 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2811 if (do_swap_account) 2812 res_counter_uncharge_until(&memcg->memsw, 2813 memcg->memsw.parent, bytes); 2814 } 2815 2816 /* 2817 * A helper function to get a mem_cgroup from an ID. Must be called under 2818 * rcu_read_lock(). The caller is responsible for calling css_tryget if 2819 * the mem_cgroup is used for charging. (Dropping a refcnt from swap can be 2820 * called against a removed memcg.)
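 * (The ID typically comes from lookup_swap_cgroup_id(); see try_get_mem_cgroup_from_page() below.)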
2821 */ 2822 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2823 { 2824 struct cgroup_subsys_state *css; 2825 2826 /* ID 0 is unused ID */ 2827 if (!id) 2828 return NULL; 2829 css = css_lookup(&mem_cgroup_subsys, id); 2830 if (!css) 2831 return NULL; 2832 return mem_cgroup_from_css(css); 2833 } 2834 2835 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2836 { 2837 struct mem_cgroup *memcg = NULL; 2838 struct page_cgroup *pc; 2839 unsigned short id; 2840 swp_entry_t ent; 2841 2842 VM_BUG_ON(!PageLocked(page)); 2843 2844 pc = lookup_page_cgroup(page); 2845 lock_page_cgroup(pc); 2846 if (PageCgroupUsed(pc)) { 2847 memcg = pc->mem_cgroup; 2848 if (memcg && !css_tryget(&memcg->css)) 2849 memcg = NULL; 2850 } else if (PageSwapCache(page)) { 2851 ent.val = page_private(page); 2852 id = lookup_swap_cgroup_id(ent); 2853 rcu_read_lock(); 2854 memcg = mem_cgroup_lookup(id); 2855 if (memcg && !css_tryget(&memcg->css)) 2856 memcg = NULL; 2857 rcu_read_unlock(); 2858 } 2859 unlock_page_cgroup(pc); 2860 return memcg; 2861 } 2862 2863 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2864 struct page *page, 2865 unsigned int nr_pages, 2866 enum charge_type ctype, 2867 bool lrucare) 2868 { 2869 struct page_cgroup *pc = lookup_page_cgroup(page); 2870 struct zone *uninitialized_var(zone); 2871 struct lruvec *lruvec; 2872 bool was_on_lru = false; 2873 bool anon; 2874 2875 lock_page_cgroup(pc); 2876 VM_BUG_ON(PageCgroupUsed(pc)); 2877 /* 2878 * we don't need page_cgroup_lock about tail pages, becase they are not 2879 * accessed by any other context at this point. 2880 */ 2881 2882 /* 2883 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2884 * may already be on some other mem_cgroup's LRU. Take care of it. 2885 */ 2886 if (lrucare) { 2887 zone = page_zone(page); 2888 spin_lock_irq(&zone->lru_lock); 2889 if (PageLRU(page)) { 2890 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2891 ClearPageLRU(page); 2892 del_page_from_lru_list(page, lruvec, page_lru(page)); 2893 was_on_lru = true; 2894 } 2895 } 2896 2897 pc->mem_cgroup = memcg; 2898 /* 2899 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2900 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2901 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2902 * before USED bit, we need memory barrier here. 2903 * See mem_cgroup_add_lru_list(), etc. 2904 */ 2905 smp_wmb(); 2906 SetPageCgroupUsed(pc); 2907 2908 if (lrucare) { 2909 if (was_on_lru) { 2910 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2911 VM_BUG_ON(PageLRU(page)); 2912 SetPageLRU(page); 2913 add_page_to_lru_list(page, lruvec, page_lru(page)); 2914 } 2915 spin_unlock_irq(&zone->lru_lock); 2916 } 2917 2918 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) 2919 anon = true; 2920 else 2921 anon = false; 2922 2923 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); 2924 unlock_page_cgroup(pc); 2925 2926 /* 2927 * "charge_statistics" updated event counter. Then, check it. 2928 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2929 * if they exceeds softlimit. 
2930 */ 2931 memcg_check_events(memcg, page); 2932 } 2933 2934 static DEFINE_MUTEX(set_limit_mutex); 2935 2936 #ifdef CONFIG_MEMCG_KMEM 2937 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 2938 { 2939 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 2940 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 2941 } 2942 2943 /* 2944 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2945 * in the memcg_cache_params struct. 2946 */ 2947 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2948 { 2949 struct kmem_cache *cachep; 2950 2951 VM_BUG_ON(p->is_root_cache); 2952 cachep = p->root_cache; 2953 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; 2954 } 2955 2956 #ifdef CONFIG_SLABINFO 2957 static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, 2958 struct seq_file *m) 2959 { 2960 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 2961 struct memcg_cache_params *params; 2962 2963 if (!memcg_can_account_kmem(memcg)) 2964 return -EIO; 2965 2966 print_slabinfo_header(m); 2967 2968 mutex_lock(&memcg->slab_caches_mutex); 2969 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2970 cache_show(memcg_params_to_cache(params), m); 2971 mutex_unlock(&memcg->slab_caches_mutex); 2972 2973 return 0; 2974 } 2975 #endif 2976 2977 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 2978 { 2979 struct res_counter *fail_res; 2980 struct mem_cgroup *_memcg; 2981 int ret = 0; 2982 bool may_oom; 2983 2984 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2985 if (ret) 2986 return ret; 2987 2988 /* 2989 * Conditions under which we can wait for the oom_killer. Those are 2990 * the same conditions tested by the core page allocator 2991 */ 2992 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); 2993 2994 _memcg = memcg; 2995 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, 2996 &_memcg, may_oom); 2997 2998 if (ret == -EINTR) { 2999 /* 3000 * __mem_cgroup_try_charge() chosed to bypass to root due to 3001 * OOM kill or fatal signal. Since our only options are to 3002 * either fail the allocation or charge it to this cgroup, do 3003 * it as a temporary condition. But we can't fail. From a 3004 * kmem/slab perspective, the cache has already been selected, 3005 * by mem_cgroup_kmem_get_cache(), so it is too late to change 3006 * our minds. 3007 * 3008 * This condition will only trigger if the task entered 3009 * memcg_charge_kmem in a sane state, but was OOM-killed during 3010 * __mem_cgroup_try_charge() above. Tasks that were already 3011 * dying when the allocation triggers should have been already 3012 * directed to the root cgroup in memcontrol.h 3013 */ 3014 res_counter_charge_nofail(&memcg->res, size, &fail_res); 3015 if (do_swap_account) 3016 res_counter_charge_nofail(&memcg->memsw, size, 3017 &fail_res); 3018 ret = 0; 3019 } else if (ret) 3020 res_counter_uncharge(&memcg->kmem, size); 3021 3022 return ret; 3023 } 3024 3025 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 3026 { 3027 res_counter_uncharge(&memcg->res, size); 3028 if (do_swap_account) 3029 res_counter_uncharge(&memcg->memsw, size); 3030 3031 /* Not down to 0 */ 3032 if (res_counter_uncharge(&memcg->kmem, size)) 3033 return; 3034 3035 /* 3036 * Releases a reference taken in kmem_cgroup_css_offline in case 3037 * this last uncharge is racing with the offlining code or it is 3038 * outliving the memcg existence. 
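 * In either case the css_put() below may end up dropping the final reference.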
3039 * 3040 * The memory barrier imposed by test&clear is paired with the 3041 * explicit one in memcg_kmem_mark_dead(). 3042 */ 3043 if (memcg_kmem_test_and_clear_dead(memcg)) 3044 css_put(&memcg->css); 3045 } 3046 3047 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) 3048 { 3049 if (!memcg) 3050 return; 3051 3052 mutex_lock(&memcg->slab_caches_mutex); 3053 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 3054 mutex_unlock(&memcg->slab_caches_mutex); 3055 } 3056 3057 /* 3058 * helper for acessing a memcg's index. It will be used as an index in the 3059 * child cache array in kmem_cache, and also to derive its name. This function 3060 * will return -1 when this is not a kmem-limited memcg. 3061 */ 3062 int memcg_cache_id(struct mem_cgroup *memcg) 3063 { 3064 return memcg ? memcg->kmemcg_id : -1; 3065 } 3066 3067 /* 3068 * This ends up being protected by the set_limit mutex, during normal 3069 * operation, because that is its main call site. 3070 * 3071 * But when we create a new cache, we can call this as well if its parent 3072 * is kmem-limited. That will have to hold set_limit_mutex as well. 3073 */ 3074 int memcg_update_cache_sizes(struct mem_cgroup *memcg) 3075 { 3076 int num, ret; 3077 3078 num = ida_simple_get(&kmem_limited_groups, 3079 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 3080 if (num < 0) 3081 return num; 3082 /* 3083 * After this point, kmem_accounted (that we test atomically in 3084 * the beginning of this conditional), is no longer 0. This 3085 * guarantees only one process will set the following boolean 3086 * to true. We don't need test_and_set because we're protected 3087 * by the set_limit_mutex anyway. 3088 */ 3089 memcg_kmem_set_activated(memcg); 3090 3091 ret = memcg_update_all_caches(num+1); 3092 if (ret) { 3093 ida_simple_remove(&kmem_limited_groups, num); 3094 memcg_kmem_clear_activated(memcg); 3095 return ret; 3096 } 3097 3098 memcg->kmemcg_id = num; 3099 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 3100 mutex_init(&memcg->slab_caches_mutex); 3101 return 0; 3102 } 3103 3104 static size_t memcg_caches_array_size(int num_groups) 3105 { 3106 ssize_t size; 3107 if (num_groups <= 0) 3108 return 0; 3109 3110 size = 2 * num_groups; 3111 if (size < MEMCG_CACHES_MIN_SIZE) 3112 size = MEMCG_CACHES_MIN_SIZE; 3113 else if (size > MEMCG_CACHES_MAX_SIZE) 3114 size = MEMCG_CACHES_MAX_SIZE; 3115 3116 return size; 3117 } 3118 3119 /* 3120 * We should update the current array size iff all caches updates succeed. This 3121 * can only be done from the slab side. The slab mutex needs to be held when 3122 * calling this. 
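 * (Note that the size is only bumped when the new group count exceeds the current array size, so it never shrinks.)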
3123 */ 3124 void memcg_update_array_size(int num) 3125 { 3126 if (num > memcg_limited_groups_array_size) 3127 memcg_limited_groups_array_size = memcg_caches_array_size(num); 3128 } 3129 3130 static void kmem_cache_destroy_work_func(struct work_struct *w); 3131 3132 int memcg_update_cache_size(struct kmem_cache *s, int num_groups) 3133 { 3134 struct memcg_cache_params *cur_params = s->memcg_params; 3135 3136 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); 3137 3138 if (num_groups > memcg_limited_groups_array_size) { 3139 int i; 3140 ssize_t size = memcg_caches_array_size(num_groups); 3141 3142 size *= sizeof(void *); 3143 size += sizeof(struct memcg_cache_params); 3144 3145 s->memcg_params = kzalloc(size, GFP_KERNEL); 3146 if (!s->memcg_params) { 3147 s->memcg_params = cur_params; 3148 return -ENOMEM; 3149 } 3150 3151 s->memcg_params->is_root_cache = true; 3152 3153 /* 3154 * There is the chance it will be bigger than 3155 * memcg_limited_groups_array_size, if we failed an allocation 3156 * in a cache, in which case all caches updated before it, will 3157 * have a bigger array. 3158 * 3159 * But if that is the case, the data after 3160 * memcg_limited_groups_array_size is certainly unused 3161 */ 3162 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3163 if (!cur_params->memcg_caches[i]) 3164 continue; 3165 s->memcg_params->memcg_caches[i] = 3166 cur_params->memcg_caches[i]; 3167 } 3168 3169 /* 3170 * Ideally, we would wait until all caches succeed, and only 3171 * then free the old one. But this is not worth the extra 3172 * pointer per-cache we'd have to have for this. 3173 * 3174 * It is not a big deal if some caches are left with a size 3175 * bigger than the others. And all updates will reset this 3176 * anyway. 3177 */ 3178 kfree(cur_params); 3179 } 3180 return 0; 3181 } 3182 3183 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3184 struct kmem_cache *root_cache) 3185 { 3186 size_t size = sizeof(struct memcg_cache_params); 3187 3188 if (!memcg_kmem_enabled()) 3189 return 0; 3190 3191 if (!memcg) 3192 size += memcg_limited_groups_array_size * sizeof(void *); 3193 3194 s->memcg_params = kzalloc(size, GFP_KERNEL); 3195 if (!s->memcg_params) 3196 return -ENOMEM; 3197 3198 INIT_WORK(&s->memcg_params->destroy, 3199 kmem_cache_destroy_work_func); 3200 if (memcg) { 3201 s->memcg_params->memcg = memcg; 3202 s->memcg_params->root_cache = root_cache; 3203 } else 3204 s->memcg_params->is_root_cache = true; 3205 3206 return 0; 3207 } 3208 3209 void memcg_release_cache(struct kmem_cache *s) 3210 { 3211 struct kmem_cache *root; 3212 struct mem_cgroup *memcg; 3213 int id; 3214 3215 /* 3216 * This happens, for instance, when a root cache goes away before we 3217 * add any memcg. 3218 */ 3219 if (!s->memcg_params) 3220 return; 3221 3222 if (s->memcg_params->is_root_cache) 3223 goto out; 3224 3225 memcg = s->memcg_params->memcg; 3226 id = memcg_cache_id(memcg); 3227 3228 root = s->memcg_params->root_cache; 3229 root->memcg_params->memcg_caches[id] = NULL; 3230 3231 mutex_lock(&memcg->slab_caches_mutex); 3232 list_del(&s->memcg_params->list); 3233 mutex_unlock(&memcg->slab_caches_mutex); 3234 3235 css_put(&memcg->css); 3236 out: 3237 kfree(s->memcg_params); 3238 } 3239 3240 /* 3241 * During the creation a new cache, we need to disable our accounting mechanism 3242 * altogether. This is true even if we are not creating, but rather just 3243 * enqueing new caches to be created. 
3244 * 3245 * This is because that process will trigger allocations; some visible, like 3246 * explicit kmallocs to auxiliary data structures, name strings and internal 3247 * cache structures; some well concealed, like INIT_WORK() that can allocate 3248 * objects during debug. 3249 * 3250 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 3251 * to it. This may not be a bounded recursion: since the first cache creation 3252 * failed to complete (waiting on the allocation), we'll just try to create the 3253 * cache again, failing at the same point. 3254 * 3255 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 3256 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 3257 * inside the following two functions. 3258 */ 3259 static inline void memcg_stop_kmem_account(void) 3260 { 3261 VM_BUG_ON(!current->mm); 3262 current->memcg_kmem_skip_account++; 3263 } 3264 3265 static inline void memcg_resume_kmem_account(void) 3266 { 3267 VM_BUG_ON(!current->mm); 3268 current->memcg_kmem_skip_account--; 3269 } 3270 3271 static void kmem_cache_destroy_work_func(struct work_struct *w) 3272 { 3273 struct kmem_cache *cachep; 3274 struct memcg_cache_params *p; 3275 3276 p = container_of(w, struct memcg_cache_params, destroy); 3277 3278 cachep = memcg_params_to_cache(p); 3279 3280 /* 3281 * If we get down to 0 after shrink, we could delete right away. 3282 * However, memcg_release_pages() already puts us back in the workqueue 3283 * in that case. If we proceed deleting, we'll get a dangling 3284 * reference, and removing the object from the workqueue in that case 3285 * is unnecessary complication. We are not a fast path. 3286 * 3287 * Note that this case is fundamentally different from racing with 3288 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in 3289 * kmem_cache_shrink, not only we would be reinserting a dead cache 3290 * into the queue, but doing so from inside the worker racing to 3291 * destroy it. 3292 * 3293 * So if we aren't down to zero, we'll just schedule a worker and try 3294 * again 3295 */ 3296 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { 3297 kmem_cache_shrink(cachep); 3298 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 3299 return; 3300 } else 3301 kmem_cache_destroy(cachep); 3302 } 3303 3304 void mem_cgroup_destroy_cache(struct kmem_cache *cachep) 3305 { 3306 if (!cachep->memcg_params->dead) 3307 return; 3308 3309 /* 3310 * There are many ways in which we can get here. 3311 * 3312 * We can get to a memory-pressure situation while the delayed work is 3313 * still pending to run. The vmscan shrinkers can then release all 3314 * cache memory and get us to destruction. If this is the case, we'll 3315 * be executed twice, which is a bug (the second time will execute over 3316 * bogus data). In this case, cancelling the work should be fine. 3317 * 3318 * But we can also get here from the worker itself, if 3319 * kmem_cache_shrink is enough to shake all the remaining objects and 3320 * get the page count to 0. In this case, we'll deadlock if we try to 3321 * cancel the work (the worker runs with an internal lock held, which 3322 * is the same lock we would hold for cancel_work_sync().) 
3323 * 3324 * Since we can't possibly know who got us here, just refrain from 3325 * running if there is already work pending 3326 */ 3327 if (work_pending(&cachep->memcg_params->destroy)) 3328 return; 3329 /* 3330 * We have to defer the actual destroying to a workqueue, because 3331 * we might currently be in a context that cannot sleep. 3332 */ 3333 schedule_work(&cachep->memcg_params->destroy); 3334 } 3335 3336 /* 3337 * This lock protects updaters, not readers. We want readers to be as fast as 3338 * they can, and they will either see NULL or a valid cache value. Our model 3339 * allow them to see NULL, in which case the root memcg will be selected. 3340 * 3341 * We need this lock because multiple allocations to the same cache from a non 3342 * will span more than one worker. Only one of them can create the cache. 3343 */ 3344 static DEFINE_MUTEX(memcg_cache_mutex); 3345 3346 /* 3347 * Called with memcg_cache_mutex held 3348 */ 3349 static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, 3350 struct kmem_cache *s) 3351 { 3352 struct kmem_cache *new; 3353 static char *tmp_name = NULL; 3354 3355 lockdep_assert_held(&memcg_cache_mutex); 3356 3357 /* 3358 * kmem_cache_create_memcg duplicates the given name and 3359 * cgroup_name for this name requires RCU context. 3360 * This static temporary buffer is used to prevent from 3361 * pointless shortliving allocation. 3362 */ 3363 if (!tmp_name) { 3364 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3365 if (!tmp_name) 3366 return NULL; 3367 } 3368 3369 rcu_read_lock(); 3370 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, 3371 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); 3372 rcu_read_unlock(); 3373 3374 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3375 (s->flags & ~SLAB_PANIC), s->ctor, s); 3376 3377 if (new) 3378 new->allocflags |= __GFP_KMEMCG; 3379 3380 return new; 3381 } 3382 3383 static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, 3384 struct kmem_cache *cachep) 3385 { 3386 struct kmem_cache *new_cachep; 3387 int idx; 3388 3389 BUG_ON(!memcg_can_account_kmem(memcg)); 3390 3391 idx = memcg_cache_id(memcg); 3392 3393 mutex_lock(&memcg_cache_mutex); 3394 new_cachep = cachep->memcg_params->memcg_caches[idx]; 3395 if (new_cachep) { 3396 css_put(&memcg->css); 3397 goto out; 3398 } 3399 3400 new_cachep = kmem_cache_dup(memcg, cachep); 3401 if (new_cachep == NULL) { 3402 new_cachep = cachep; 3403 css_put(&memcg->css); 3404 goto out; 3405 } 3406 3407 atomic_set(&new_cachep->memcg_params->nr_pages , 0); 3408 3409 cachep->memcg_params->memcg_caches[idx] = new_cachep; 3410 /* 3411 * the readers won't lock, make sure everybody sees the updated value, 3412 * so they won't put stuff in the queue again for no reason 3413 */ 3414 wmb(); 3415 out: 3416 mutex_unlock(&memcg_cache_mutex); 3417 return new_cachep; 3418 } 3419 3420 void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3421 { 3422 struct kmem_cache *c; 3423 int i; 3424 3425 if (!s->memcg_params) 3426 return; 3427 if (!s->memcg_params->is_root_cache) 3428 return; 3429 3430 /* 3431 * If the cache is being destroyed, we trust that there is no one else 3432 * requesting objects from it. Even if there are, the sanity checks in 3433 * kmem_cache_destroy should caught this ill-case. 3434 * 3435 * Still, we don't want anyone else freeing memcg_caches under our 3436 * noses, which can happen if a new memcg comes to life. As usual, 3437 * we'll take the set_limit_mutex to protect ourselves against this. 
3438 */ 3439 mutex_lock(&set_limit_mutex); 3440 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3441 c = s->memcg_params->memcg_caches[i]; 3442 if (!c) 3443 continue; 3444 3445 /* 3446 * We will now manually delete the caches, so to avoid races 3447 * we need to cancel all pending destruction workers and 3448 * proceed with destruction ourselves. 3449 * 3450 * kmem_cache_destroy() will call kmem_cache_shrink internally, 3451 * and that could spawn the workers again: it is likely that 3452 * the cache still have active pages until this very moment. 3453 * This would lead us back to mem_cgroup_destroy_cache. 3454 * 3455 * But that will not execute at all if the "dead" flag is not 3456 * set, so flip it down to guarantee we are in control. 3457 */ 3458 c->memcg_params->dead = false; 3459 cancel_work_sync(&c->memcg_params->destroy); 3460 kmem_cache_destroy(c); 3461 } 3462 mutex_unlock(&set_limit_mutex); 3463 } 3464 3465 struct create_work { 3466 struct mem_cgroup *memcg; 3467 struct kmem_cache *cachep; 3468 struct work_struct work; 3469 }; 3470 3471 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3472 { 3473 struct kmem_cache *cachep; 3474 struct memcg_cache_params *params; 3475 3476 if (!memcg_kmem_is_active(memcg)) 3477 return; 3478 3479 mutex_lock(&memcg->slab_caches_mutex); 3480 list_for_each_entry(params, &memcg->memcg_slab_caches, list) { 3481 cachep = memcg_params_to_cache(params); 3482 cachep->memcg_params->dead = true; 3483 schedule_work(&cachep->memcg_params->destroy); 3484 } 3485 mutex_unlock(&memcg->slab_caches_mutex); 3486 } 3487 3488 static void memcg_create_cache_work_func(struct work_struct *w) 3489 { 3490 struct create_work *cw; 3491 3492 cw = container_of(w, struct create_work, work); 3493 memcg_create_kmem_cache(cw->memcg, cw->cachep); 3494 kfree(cw); 3495 } 3496 3497 /* 3498 * Enqueue the creation of a per-memcg kmem_cache. 3499 */ 3500 static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3501 struct kmem_cache *cachep) 3502 { 3503 struct create_work *cw; 3504 3505 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3506 if (cw == NULL) { 3507 css_put(&memcg->css); 3508 return; 3509 } 3510 3511 cw->memcg = memcg; 3512 cw->cachep = cachep; 3513 3514 INIT_WORK(&cw->work, memcg_create_cache_work_func); 3515 schedule_work(&cw->work); 3516 } 3517 3518 static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3519 struct kmem_cache *cachep) 3520 { 3521 /* 3522 * We need to stop accounting when we kmalloc, because if the 3523 * corresponding kmalloc cache is not yet created, the first allocation 3524 * in __memcg_create_cache_enqueue will recurse. 3525 * 3526 * However, it is better to enclose the whole function. Depending on 3527 * the debugging options enabled, INIT_WORK(), for instance, can 3528 * trigger an allocation. This too, will make us recurse. Because at 3529 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3530 * the safest choice is to do it like this, wrapping the whole function. 3531 */ 3532 memcg_stop_kmem_account(); 3533 __memcg_create_cache_enqueue(memcg, cachep); 3534 memcg_resume_kmem_account(); 3535 } 3536 /* 3537 * Return the kmem_cache we're supposed to use for a slab allocation. 3538 * We try to use the current memcg's version of the cache. 3539 * 3540 * If the cache does not exist yet, if we are the first user of it, 3541 * we either create it immediately, if possible, or create it asynchronously 3542 * in a workqueue. 
3543 * In the latter case, we will let the current allocation go through with 3544 * the original cache. 3545 * 3546 * Can't be called in interrupt context or from kernel threads. 3547 * This function needs to be called with rcu_read_lock() held. 3548 */ 3549 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3550 gfp_t gfp) 3551 { 3552 struct mem_cgroup *memcg; 3553 int idx; 3554 3555 VM_BUG_ON(!cachep->memcg_params); 3556 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3557 3558 if (!current->mm || current->memcg_kmem_skip_account) 3559 return cachep; 3560 3561 rcu_read_lock(); 3562 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3563 3564 if (!memcg_can_account_kmem(memcg)) 3565 goto out; 3566 3567 idx = memcg_cache_id(memcg); 3568 3569 /* 3570 * barrier to make sure we're always seeing the up-to-date value. The 3571 * code updating memcg_caches will issue a write barrier to match this. 3572 */ 3573 read_barrier_depends(); 3574 if (likely(cachep->memcg_params->memcg_caches[idx])) { 3575 cachep = cachep->memcg_params->memcg_caches[idx]; 3576 goto out; 3577 } 3578 3579 /* The corresponding put will be done in the workqueue. */ 3580 if (!css_tryget(&memcg->css)) 3581 goto out; 3582 rcu_read_unlock(); 3583 3584 /* 3585 * If we are in a safe context (can wait, and not in interrupt 3586 * context), we could be predictable and return right away. 3587 * This would guarantee that the allocation being performed 3588 * already belongs in the new cache. 3589 * 3590 * However, there are some clashes that can arise from locking. 3591 * For instance, because we acquire the slab_mutex while doing 3592 * kmem_cache_dup, this means no further allocation could happen 3593 * with the slab_mutex held. 3594 * 3595 * Also, because cache creation issues get_online_cpus(), this 3596 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, 3597 * that ends up reversed during cpu hotplug. (cpuset allocates 3598 * a bunch of GFP_KERNEL memory during cpuup). Due to all that, 3599 * it is better to defer everything. 3600 */ 3601 memcg_create_cache_enqueue(memcg, cachep); 3602 return cachep; 3603 out: 3604 rcu_read_unlock(); 3605 return cachep; 3606 } 3607 EXPORT_SYMBOL(__memcg_kmem_get_cache); 3608 3609 /* 3610 * We need to verify if the allocation against current->mm->owner's memcg is 3611 * possible for the given order. But the page is not allocated yet, so we'll 3612 * need a further commit step to do the final arrangements. 3613 * 3614 * It is possible for the task to switch cgroups in the meantime, so at 3615 * commit time, we can't rely on task conversion any longer. We'll then use 3616 * the handle argument to return to the caller which cgroup we should commit 3617 * against. We could also return the memcg directly and avoid the pointer 3618 * passing, but a boolean return value gives better semantics considering 3619 * the compiled-out case as well. 3620 * 3621 * Returning true means the allocation is possible. 3622 */ 3623 bool 3624 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3625 { 3626 struct mem_cgroup *memcg; 3627 int ret; 3628 3629 *_memcg = NULL; 3630 3631 /* 3632 * Disabling accounting is only relevant for some specific memcg 3633 * internal allocations. Therefore we would initially not have such a 3634 * check here, since direct calls to the page allocator that are marked 3635 * with GFP_KMEMCG only happen outside memcg core.
We are mostly 3636 * concerned with cache allocations, and by having this test at 3637 * memcg_kmem_get_cache, we are already able to relay the allocation to 3638 * the root cache and bypass the memcg cache altogether. 3639 * 3640 * There is one exception, though: the SLUB allocator does not create 3641 * large order caches, but rather service large kmallocs directly from 3642 * the page allocator. Therefore, the following sequence when backed by 3643 * the SLUB allocator: 3644 * 3645 * memcg_stop_kmem_account(); 3646 * kmalloc(<large_number>) 3647 * memcg_resume_kmem_account(); 3648 * 3649 * would effectively ignore the fact that we should skip accounting, 3650 * since it will drive us directly to this function without passing 3651 * through the cache selector memcg_kmem_get_cache. Such large 3652 * allocations are extremely rare but can happen, for instance, for the 3653 * cache arrays. We bring this test here. 3654 */ 3655 if (!current->mm || current->memcg_kmem_skip_account) 3656 return true; 3657 3658 memcg = try_get_mem_cgroup_from_mm(current->mm); 3659 3660 /* 3661 * very rare case described in mem_cgroup_from_task. Unfortunately there 3662 * isn't much we can do without complicating this too much, and it would 3663 * be gfp-dependent anyway. Just let it go 3664 */ 3665 if (unlikely(!memcg)) 3666 return true; 3667 3668 if (!memcg_can_account_kmem(memcg)) { 3669 css_put(&memcg->css); 3670 return true; 3671 } 3672 3673 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3674 if (!ret) 3675 *_memcg = memcg; 3676 3677 css_put(&memcg->css); 3678 return (ret == 0); 3679 } 3680 3681 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3682 int order) 3683 { 3684 struct page_cgroup *pc; 3685 3686 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3687 3688 /* The page allocation failed. Revert */ 3689 if (!page) { 3690 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3691 return; 3692 } 3693 3694 pc = lookup_page_cgroup(page); 3695 lock_page_cgroup(pc); 3696 pc->mem_cgroup = memcg; 3697 SetPageCgroupUsed(pc); 3698 unlock_page_cgroup(pc); 3699 } 3700 3701 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3702 { 3703 struct mem_cgroup *memcg = NULL; 3704 struct page_cgroup *pc; 3705 3706 3707 pc = lookup_page_cgroup(page); 3708 /* 3709 * Fast unlocked return. Theoretically might have changed, have to 3710 * check again after locking. 3711 */ 3712 if (!PageCgroupUsed(pc)) 3713 return; 3714 3715 lock_page_cgroup(pc); 3716 if (PageCgroupUsed(pc)) { 3717 memcg = pc->mem_cgroup; 3718 ClearPageCgroupUsed(pc); 3719 } 3720 unlock_page_cgroup(pc); 3721 3722 /* 3723 * We trust that only if there is a memcg associated with the page, it 3724 * is a valid allocation 3725 */ 3726 if (!memcg) 3727 return; 3728 3729 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3730 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3731 } 3732 #else 3733 static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3734 { 3735 } 3736 #endif /* CONFIG_MEMCG_KMEM */ 3737 3738 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3739 3740 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3741 /* 3742 * Because tail pages are not marked as "used", set it. We're under 3743 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3744 * charge/uncharge will be never happen and move_account() is done under 3745 * compound_lock(), so we don't have to take care of races. 
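 * Each tail page_cgroup simply inherits head_pc->mem_cgroup and the flags that are safe to copy (everything except PCGF_NOCOPY_AT_SPLIT).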
3746 */ 3747 void mem_cgroup_split_huge_fixup(struct page *head) 3748 { 3749 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3750 struct page_cgroup *pc; 3751 struct mem_cgroup *memcg; 3752 int i; 3753 3754 if (mem_cgroup_disabled()) 3755 return; 3756 3757 memcg = head_pc->mem_cgroup; 3758 for (i = 1; i < HPAGE_PMD_NR; i++) { 3759 pc = head_pc + i; 3760 pc->mem_cgroup = memcg; 3761 smp_wmb();/* see __commit_charge() */ 3762 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3763 } 3764 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3765 HPAGE_PMD_NR); 3766 } 3767 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3768 3769 /** 3770 * mem_cgroup_move_account - move account of the page 3771 * @page: the page 3772 * @nr_pages: number of regular pages (>1 for huge pages) 3773 * @pc: page_cgroup of the page. 3774 * @from: mem_cgroup which the page is moved from. 3775 * @to: mem_cgroup which the page is moved to. @from != @to. 3776 * 3777 * The caller must confirm following. 3778 * - page is not on LRU (isolate_page() is useful.) 3779 * - compound_lock is held when nr_pages > 1 3780 * 3781 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3782 * from old cgroup. 3783 */ 3784 static int mem_cgroup_move_account(struct page *page, 3785 unsigned int nr_pages, 3786 struct page_cgroup *pc, 3787 struct mem_cgroup *from, 3788 struct mem_cgroup *to) 3789 { 3790 unsigned long flags; 3791 int ret; 3792 bool anon = PageAnon(page); 3793 3794 VM_BUG_ON(from == to); 3795 VM_BUG_ON(PageLRU(page)); 3796 /* 3797 * The page is isolated from LRU. So, collapse function 3798 * will not handle this page. But page splitting can happen. 3799 * Do this check under compound_page_lock(). The caller should 3800 * hold it. 3801 */ 3802 ret = -EBUSY; 3803 if (nr_pages > 1 && !PageTransHuge(page)) 3804 goto out; 3805 3806 lock_page_cgroup(pc); 3807 3808 ret = -EINVAL; 3809 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3810 goto unlock; 3811 3812 move_lock_mem_cgroup(from, &flags); 3813 3814 if (!anon && page_mapped(page)) { 3815 /* Update mapped_file data for mem_cgroup */ 3816 preempt_disable(); 3817 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3818 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3819 preempt_enable(); 3820 } 3821 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3822 3823 /* caller should have done css_get */ 3824 pc->mem_cgroup = to; 3825 mem_cgroup_charge_statistics(to, page, anon, nr_pages); 3826 move_unlock_mem_cgroup(from, &flags); 3827 ret = 0; 3828 unlock: 3829 unlock_page_cgroup(pc); 3830 /* 3831 * check events 3832 */ 3833 memcg_check_events(to, page); 3834 memcg_check_events(from, page); 3835 out: 3836 return ret; 3837 } 3838 3839 /** 3840 * mem_cgroup_move_parent - moves page to the parent group 3841 * @page: the page to move 3842 * @pc: page_cgroup of the page 3843 * @child: page's cgroup 3844 * 3845 * move charges to its parent or the root cgroup if the group has no 3846 * parent (aka use_hierarchy==0). 3847 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3848 * mem_cgroup_move_account fails) the failure is always temporary and 3849 * it signals a race with a page removal/uncharge or migration. In the 3850 * first case the page is on the way out and it will vanish from the LRU 3851 * on the next attempt and the call should be retried later. 
3852 * Isolation from the LRU fails only if page has been isolated from 3853 * the LRU since we looked at it and that usually means either global 3854 * reclaim or migration going on. The page will either get back to the 3855 * LRU or vanish. 3856 * Finaly mem_cgroup_move_account fails only if the page got uncharged 3857 * (!PageCgroupUsed) or moved to a different group. The page will 3858 * disappear in the next attempt. 3859 */ 3860 static int mem_cgroup_move_parent(struct page *page, 3861 struct page_cgroup *pc, 3862 struct mem_cgroup *child) 3863 { 3864 struct mem_cgroup *parent; 3865 unsigned int nr_pages; 3866 unsigned long uninitialized_var(flags); 3867 int ret; 3868 3869 VM_BUG_ON(mem_cgroup_is_root(child)); 3870 3871 ret = -EBUSY; 3872 if (!get_page_unless_zero(page)) 3873 goto out; 3874 if (isolate_lru_page(page)) 3875 goto put; 3876 3877 nr_pages = hpage_nr_pages(page); 3878 3879 parent = parent_mem_cgroup(child); 3880 /* 3881 * If no parent, move charges to root cgroup. 3882 */ 3883 if (!parent) 3884 parent = root_mem_cgroup; 3885 3886 if (nr_pages > 1) { 3887 VM_BUG_ON(!PageTransHuge(page)); 3888 flags = compound_lock_irqsave(page); 3889 } 3890 3891 ret = mem_cgroup_move_account(page, nr_pages, 3892 pc, child, parent); 3893 if (!ret) 3894 __mem_cgroup_cancel_local_charge(child, nr_pages); 3895 3896 if (nr_pages > 1) 3897 compound_unlock_irqrestore(page, flags); 3898 putback_lru_page(page); 3899 put: 3900 put_page(page); 3901 out: 3902 return ret; 3903 } 3904 3905 /* 3906 * Charge the memory controller for page usage. 3907 * Return 3908 * 0 if the charge was successful 3909 * < 0 if the cgroup is over its limit 3910 */ 3911 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 3912 gfp_t gfp_mask, enum charge_type ctype) 3913 { 3914 struct mem_cgroup *memcg = NULL; 3915 unsigned int nr_pages = 1; 3916 bool oom = true; 3917 int ret; 3918 3919 if (PageTransHuge(page)) { 3920 nr_pages <<= compound_order(page); 3921 VM_BUG_ON(!PageTransHuge(page)); 3922 /* 3923 * Never OOM-kill a process for a huge page. The 3924 * fault handler will fall back to regular pages. 3925 */ 3926 oom = false; 3927 } 3928 3929 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 3930 if (ret == -ENOMEM) 3931 return ret; 3932 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); 3933 return 0; 3934 } 3935 3936 int mem_cgroup_newpage_charge(struct page *page, 3937 struct mm_struct *mm, gfp_t gfp_mask) 3938 { 3939 if (mem_cgroup_disabled()) 3940 return 0; 3941 VM_BUG_ON(page_mapped(page)); 3942 VM_BUG_ON(page->mapping && !PageAnon(page)); 3943 VM_BUG_ON(!mm); 3944 return mem_cgroup_charge_common(page, mm, gfp_mask, 3945 MEM_CGROUP_CHARGE_TYPE_ANON); 3946 } 3947 3948 /* 3949 * While swap-in, try_charge -> commit or cancel, the page is locked. 3950 * And when try_charge() successfully returns, one refcnt to memcg without 3951 * struct page_cgroup is acquired. This refcnt will be consumed by 3952 * "commit()" or removed by "cancel()" 3953 */ 3954 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, 3955 struct page *page, 3956 gfp_t mask, 3957 struct mem_cgroup **memcgp) 3958 { 3959 struct mem_cgroup *memcg; 3960 struct page_cgroup *pc; 3961 int ret; 3962 3963 pc = lookup_page_cgroup(page); 3964 /* 3965 * Every swap fault against a single page tries to charge the 3966 * page, bail as early as possible. shmem_unuse() encounters 3967 * already charged pages, too. 
The USED bit is protected by 3968 * the page lock, which serializes swap cache removal, which 3969 * in turn serializes uncharging. 3970 */ 3971 if (PageCgroupUsed(pc)) 3972 return 0; 3973 if (!do_swap_account) 3974 goto charge_cur_mm; 3975 memcg = try_get_mem_cgroup_from_page(page); 3976 if (!memcg) 3977 goto charge_cur_mm; 3978 *memcgp = memcg; 3979 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); 3980 css_put(&memcg->css); 3981 if (ret == -EINTR) 3982 ret = 0; 3983 return ret; 3984 charge_cur_mm: 3985 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 3986 if (ret == -EINTR) 3987 ret = 0; 3988 return ret; 3989 } 3990 3991 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, 3992 gfp_t gfp_mask, struct mem_cgroup **memcgp) 3993 { 3994 *memcgp = NULL; 3995 if (mem_cgroup_disabled()) 3996 return 0; 3997 /* 3998 * A racing thread's fault, or swapoff, may have already 3999 * updated the pte, and even removed page from swap cache: in 4000 * those cases unuse_pte()'s pte_same() test will fail; but 4001 * there's also a KSM case which does need to charge the page. 4002 */ 4003 if (!PageSwapCache(page)) { 4004 int ret; 4005 4006 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); 4007 if (ret == -EINTR) 4008 ret = 0; 4009 return ret; 4010 } 4011 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); 4012 } 4013 4014 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 4015 { 4016 if (mem_cgroup_disabled()) 4017 return; 4018 if (!memcg) 4019 return; 4020 __mem_cgroup_cancel_charge(memcg, 1); 4021 } 4022 4023 static void 4024 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 4025 enum charge_type ctype) 4026 { 4027 if (mem_cgroup_disabled()) 4028 return; 4029 if (!memcg) 4030 return; 4031 4032 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 4033 /* 4034 * Now swap is on-memory. This means this page may be 4035 * counted both as mem and swap....double count. 4036 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 4037 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 4038 * may call delete_from_swap_cache() before reach here. 
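 *
 * For orientation, the swap-in try_charge -> commit/cancel protocol
 * described earlier in this file looks roughly like this from the
 * caller's side (an illustrative sketch only, assuming the
 * do_swap_page() fault path in mm/memory.c, not copied from it):
 *
 *	struct mem_cgroup *memcg;
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &memcg))
 *		return VM_FAULT_OOM;
 *	... map the page ...
 *	mem_cgroup_commit_charge_swapin(page, memcg);
 *	(or mem_cgroup_cancel_charge_swapin(memcg) if the fault is aborted)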
4039 	 */
4040 	if (do_swap_account && PageSwapCache(page)) {
4041 		swp_entry_t ent = {.val = page_private(page)};
4042 		mem_cgroup_uncharge_swap(ent);
4043 	}
4044 }
4045 
4046 void mem_cgroup_commit_charge_swapin(struct page *page,
4047 				     struct mem_cgroup *memcg)
4048 {
4049 	__mem_cgroup_commit_charge_swapin(page, memcg,
4050 					  MEM_CGROUP_CHARGE_TYPE_ANON);
4051 }
4052 
4053 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
4054 				gfp_t gfp_mask)
4055 {
4056 	struct mem_cgroup *memcg = NULL;
4057 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4058 	int ret;
4059 
4060 	if (mem_cgroup_disabled())
4061 		return 0;
4062 	if (PageCompound(page))
4063 		return 0;
4064 
4065 	if (!PageSwapCache(page))
4066 		ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
4067 	else { /* page is swapcache/shmem */
4068 		ret = __mem_cgroup_try_charge_swapin(mm, page,
4069 						     gfp_mask, &memcg);
4070 		if (!ret)
4071 			__mem_cgroup_commit_charge_swapin(page, memcg, type);
4072 	}
4073 	return ret;
4074 }
4075 
4076 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
4077 				   unsigned int nr_pages,
4078 				   const enum charge_type ctype)
4079 {
4080 	struct memcg_batch_info *batch = NULL;
4081 	bool uncharge_memsw = true;
4082 
4083 	/* If swapout, usage of swap doesn't decrease */
4084 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
4085 		uncharge_memsw = false;
4086 
4087 	batch = &current->memcg_batch;
4088 	/*
4089 	 * Usually, we do css_get() when we remember the memcg pointer.
4090 	 * But in this case, we keep res->usage until the end of a series of
4091 	 * uncharges. Then, it's ok to ignore memcg's refcnt.
4092 	 */
4093 	if (!batch->memcg)
4094 		batch->memcg = memcg;
4095 	/*
4096 	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
4097 	 * In those cases, all pages freed continuously can be expected to be in
4098 	 * the same cgroup and we have a chance to coalesce uncharges.
4099 	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
4100 	 * because we want to do uncharge as soon as possible.
4101 	 */
4102 
4103 	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4104 		goto direct_uncharge;
4105 
4106 	if (nr_pages > 1)
4107 		goto direct_uncharge;
4108 
4109 	/*
4110 	 * In the typical case, batch->memcg == mem. This means we can
4111 	 * merge a series of uncharges to an uncharge of res_counter.
4112 	 * If not, we uncharge res_counter one by one.
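 *
 * For reference, the batching seen from a caller is roughly (an
 * illustrative sketch of the unmap/truncate pattern, not a quote from
 * those call sites):
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being released:
 *		mem_cgroup_uncharge_page(page);	(or _cache_page())
 *	mem_cgroup_uncharge_end();
 *
 * While do_batch is raised the res_counter work is deferred and done
 * once in mem_cgroup_uncharge_end() instead of once per page.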
4113 */ 4114 if (batch->memcg != memcg) 4115 goto direct_uncharge; 4116 /* remember freed charge and uncharge it later */ 4117 batch->nr_pages++; 4118 if (uncharge_memsw) 4119 batch->memsw_nr_pages++; 4120 return; 4121 direct_uncharge: 4122 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); 4123 if (uncharge_memsw) 4124 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 4125 if (unlikely(batch->memcg != memcg)) 4126 memcg_oom_recover(memcg); 4127 } 4128 4129 /* 4130 * uncharge if !page_mapped(page) 4131 */ 4132 static struct mem_cgroup * 4133 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, 4134 bool end_migration) 4135 { 4136 struct mem_cgroup *memcg = NULL; 4137 unsigned int nr_pages = 1; 4138 struct page_cgroup *pc; 4139 bool anon; 4140 4141 if (mem_cgroup_disabled()) 4142 return NULL; 4143 4144 if (PageTransHuge(page)) { 4145 nr_pages <<= compound_order(page); 4146 VM_BUG_ON(!PageTransHuge(page)); 4147 } 4148 /* 4149 * Check if our page_cgroup is valid 4150 */ 4151 pc = lookup_page_cgroup(page); 4152 if (unlikely(!PageCgroupUsed(pc))) 4153 return NULL; 4154 4155 lock_page_cgroup(pc); 4156 4157 memcg = pc->mem_cgroup; 4158 4159 if (!PageCgroupUsed(pc)) 4160 goto unlock_out; 4161 4162 anon = PageAnon(page); 4163 4164 switch (ctype) { 4165 case MEM_CGROUP_CHARGE_TYPE_ANON: 4166 /* 4167 * Generally PageAnon tells if it's the anon statistics to be 4168 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 4169 * used before page reached the stage of being marked PageAnon. 4170 */ 4171 anon = true; 4172 /* fallthrough */ 4173 case MEM_CGROUP_CHARGE_TYPE_DROP: 4174 /* See mem_cgroup_prepare_migration() */ 4175 if (page_mapped(page)) 4176 goto unlock_out; 4177 /* 4178 * Pages under migration may not be uncharged. But 4179 * end_migration() /must/ be the one uncharging the 4180 * unused post-migration page and so it has to call 4181 * here with the migration bit still set. See the 4182 * res_counter handling below. 4183 */ 4184 if (!end_migration && PageCgroupMigration(pc)) 4185 goto unlock_out; 4186 break; 4187 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 4188 if (!PageAnon(page)) { /* Shared memory */ 4189 if (page->mapping && !page_is_file_cache(page)) 4190 goto unlock_out; 4191 } else if (page_mapped(page)) /* Anon */ 4192 goto unlock_out; 4193 break; 4194 default: 4195 break; 4196 } 4197 4198 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); 4199 4200 ClearPageCgroupUsed(pc); 4201 /* 4202 * pc->mem_cgroup is not cleared here. It will be accessed when it's 4203 * freed from LRU. This is safe because uncharged page is expected not 4204 * to be reused (freed soon). Exception is SwapCache, it's handled by 4205 * special functions. 4206 */ 4207 4208 unlock_page_cgroup(pc); 4209 /* 4210 * even after unlock, we have memcg->res.usage here and this memcg 4211 * will never be freed, so it's safe to call css_get(). 4212 */ 4213 memcg_check_events(memcg, page); 4214 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 4215 mem_cgroup_swap_statistics(memcg, true); 4216 css_get(&memcg->css); 4217 } 4218 /* 4219 * Migration does not charge the res_counter for the 4220 * replacement page, so leave it alone when phasing out the 4221 * page that is unused after the migration. 
4222 	 */
4223 	if (!end_migration && !mem_cgroup_is_root(memcg))
4224 		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4225 
4226 	return memcg;
4227 
4228 unlock_out:
4229 	unlock_page_cgroup(pc);
4230 	return NULL;
4231 }
4232 
4233 void mem_cgroup_uncharge_page(struct page *page)
4234 {
4235 	/* early check. */
4236 	if (page_mapped(page))
4237 		return;
4238 	VM_BUG_ON(page->mapping && !PageAnon(page));
4239 	/*
4240 	 * If the page is in swap cache, uncharge should be deferred
4241 	 * to the swap path, which also properly accounts swap usage
4242 	 * and handles memcg lifetime.
4243 	 *
4244 	 * Note that this check is not stable and reclaim may add the
4245 	 * page to swap cache at any time after this. However, if the
4246 	 * page is not in swap cache by the time page->mapcount hits
4247 	 * 0, there won't be any page table references to the swap
4248 	 * slot, and reclaim will free it and not actually write the
4249 	 * page to disk.
4250 	 */
4251 	if (PageSwapCache(page))
4252 		return;
4253 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4254 }
4255 
4256 void mem_cgroup_uncharge_cache_page(struct page *page)
4257 {
4258 	VM_BUG_ON(page_mapped(page));
4259 	VM_BUG_ON(page->mapping);
4260 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4261 }
4262 
4263 /*
4264  * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
4265  * In those cases, pages are freed continuously and we can expect pages
4266  * are in the same memcg. Each of these callers limits the number of
4267  * pages freed at once, so uncharge_start/end() is called properly.
4268  * This may be called multiple (nested) times in a context.
4269  */
4270 
4271 void mem_cgroup_uncharge_start(void)
4272 {
4273 	current->memcg_batch.do_batch++;
4274 	/* We can do nest. */
4275 	if (current->memcg_batch.do_batch == 1) {
4276 		current->memcg_batch.memcg = NULL;
4277 		current->memcg_batch.nr_pages = 0;
4278 		current->memcg_batch.memsw_nr_pages = 0;
4279 	}
4280 }
4281 
4282 void mem_cgroup_uncharge_end(void)
4283 {
4284 	struct memcg_batch_info *batch = &current->memcg_batch;
4285 
4286 	if (!batch->do_batch)
4287 		return;
4288 
4289 	batch->do_batch--;
4290 	if (batch->do_batch) /* If stacked, do nothing. */
4291 		return;
4292 
4293 	if (!batch->memcg)
4294 		return;
4295 	/*
4296 	 * This "batch->memcg" is valid without any css_get/put etc...
4297 	 * because we hide charges behind us.
4298 	 */
4299 	if (batch->nr_pages)
4300 		res_counter_uncharge(&batch->memcg->res,
4301 				     batch->nr_pages * PAGE_SIZE);
4302 	if (batch->memsw_nr_pages)
4303 		res_counter_uncharge(&batch->memcg->memsw,
4304 				     batch->memsw_nr_pages * PAGE_SIZE);
4305 	memcg_oom_recover(batch->memcg);
4306 	/* forget this pointer (for sanity check) */
4307 	batch->memcg = NULL;
4308 }
4309 
4310 #ifdef CONFIG_SWAP
4311 /*
4312  * called after __delete_from_swap_cache() and drops the "page" account.
4313  * memcg information is recorded in the swap_cgroup of "ent"
4314  */
4315 void
4316 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4317 {
4318 	struct mem_cgroup *memcg;
4319 	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4320 
4321 	if (!swapout) /* this was a swap cache but the swap is unused ! */
4322 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4323 
4324 	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4325 
4326 	/*
4327 	 * record memcg information, if swapout && memcg != NULL,
4328 	 * css_get() was called in uncharge.
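 *
 * Rough life cycle of the swap accounting, as an illustrative summary
 * of calls that already exist in this file (not additional API):
 *
 *	swap out:	__mem_cgroup_uncharge_common(page,
 *				MEM_CGROUP_CHARGE_TYPE_SWAPOUT, false);
 *			swap_cgroup_record(ent, css_id(&memcg->css));
 *	swap free:	mem_cgroup_uncharge_swap(ent);
 *			  -> res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 *			  -> css_put(&memcg->css);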
4329 */ 4330 if (do_swap_account && swapout && memcg) 4331 swap_cgroup_record(ent, css_id(&memcg->css)); 4332 } 4333 #endif 4334 4335 #ifdef CONFIG_MEMCG_SWAP 4336 /* 4337 * called from swap_entry_free(). remove record in swap_cgroup and 4338 * uncharge "memsw" account. 4339 */ 4340 void mem_cgroup_uncharge_swap(swp_entry_t ent) 4341 { 4342 struct mem_cgroup *memcg; 4343 unsigned short id; 4344 4345 if (!do_swap_account) 4346 return; 4347 4348 id = swap_cgroup_record(ent, 0); 4349 rcu_read_lock(); 4350 memcg = mem_cgroup_lookup(id); 4351 if (memcg) { 4352 /* 4353 * We uncharge this because swap is freed. 4354 * This memcg can be obsolete one. We avoid calling css_tryget 4355 */ 4356 if (!mem_cgroup_is_root(memcg)) 4357 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 4358 mem_cgroup_swap_statistics(memcg, false); 4359 css_put(&memcg->css); 4360 } 4361 rcu_read_unlock(); 4362 } 4363 4364 /** 4365 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 4366 * @entry: swap entry to be moved 4367 * @from: mem_cgroup which the entry is moved from 4368 * @to: mem_cgroup which the entry is moved to 4369 * 4370 * It succeeds only when the swap_cgroup's record for this entry is the same 4371 * as the mem_cgroup's id of @from. 4372 * 4373 * Returns 0 on success, -EINVAL on failure. 4374 * 4375 * The caller must have charged to @to, IOW, called res_counter_charge() about 4376 * both res and memsw, and called css_get(). 4377 */ 4378 static int mem_cgroup_move_swap_account(swp_entry_t entry, 4379 struct mem_cgroup *from, struct mem_cgroup *to) 4380 { 4381 unsigned short old_id, new_id; 4382 4383 old_id = css_id(&from->css); 4384 new_id = css_id(&to->css); 4385 4386 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 4387 mem_cgroup_swap_statistics(from, false); 4388 mem_cgroup_swap_statistics(to, true); 4389 /* 4390 * This function is only called from task migration context now. 4391 * It postpones res_counter and refcount handling till the end 4392 * of task migration(mem_cgroup_clear_mc()) for performance 4393 * improvement. But we cannot postpone css_get(to) because if 4394 * the process that has been moved to @to does swap-in, the 4395 * refcount of @to might be decreased to 0. 4396 * 4397 * We are in attach() phase, so the cgroup is guaranteed to be 4398 * alive, so we can just call css_get(). 4399 */ 4400 css_get(&to->css); 4401 return 0; 4402 } 4403 return -EINVAL; 4404 } 4405 #else 4406 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 4407 struct mem_cgroup *from, struct mem_cgroup *to) 4408 { 4409 return -EINVAL; 4410 } 4411 #endif 4412 4413 /* 4414 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 4415 * page belongs to. 4416 */ 4417 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 4418 struct mem_cgroup **memcgp) 4419 { 4420 struct mem_cgroup *memcg = NULL; 4421 unsigned int nr_pages = 1; 4422 struct page_cgroup *pc; 4423 enum charge_type ctype; 4424 4425 *memcgp = NULL; 4426 4427 if (mem_cgroup_disabled()) 4428 return; 4429 4430 if (PageTransHuge(page)) 4431 nr_pages <<= compound_order(page); 4432 4433 pc = lookup_page_cgroup(page); 4434 lock_page_cgroup(pc); 4435 if (PageCgroupUsed(pc)) { 4436 memcg = pc->mem_cgroup; 4437 css_get(&memcg->css); 4438 /* 4439 * At migrating an anonymous page, its mapcount goes down 4440 * to 0 and uncharge() will be called. But, even if it's fully 4441 * unmapped, migration may fail and this page has to be 4442 * charged again. 
We set MIGRATION flag here and delay uncharge
4443 		 * until end_migration() is called
4444 		 *
4445 		 * Corner Case Thinking
4446 		 * A)
4447 		 * When the old page was mapped as Anon and it's unmap-and-freed
4448 		 * while migration was ongoing.
4449 		 * If unmap finds the old page, uncharge() of it will be delayed
4450 		 * until end_migration(). If unmap finds a new page, it's
4451 		 * uncharged when its mapcount drops from 1 to 0. If unmap code
4452 		 * finds swap_migration_entry, the new page will not be mapped
4453 		 * and end_migration() will find it (mapcount == 0).
4454 		 *
4455 		 * B)
4456 		 * When the old page was mapped but migration fails, the kernel
4457 		 * remaps it. A charge for it is kept by the MIGRATION flag even
4458 		 * if mapcount goes down to 0. We can remap it successfully
4459 		 * without charging it again.
4460 		 *
4461 		 * C)
4462 		 * The "old" page is under lock_page() until the end of
4463 		 * migration, so, the old page itself will not be swapped-out.
4464 		 * If the new page is swapped out before end_migration, our
4465 		 * hook into the usual swap-out path will catch the event.
4466 		 */
4467 		if (PageAnon(page))
4468 			SetPageCgroupMigration(pc);
4469 	}
4470 	unlock_page_cgroup(pc);
4471 	/*
4472 	 * If the page is not charged at this point,
4473 	 * we return here.
4474 	 */
4475 	if (!memcg)
4476 		return;
4477 
4478 	*memcgp = memcg;
4479 	/*
4480 	 * We charge the new page before it's used/mapped. So, even if unlock_page()
4481 	 * is called before end_migration, we can catch all events on this new
4482 	 * page. In the case the new page is migrated but not remapped, its
4483 	 * mapcount will finally be 0 and we call uncharge in end_migration().
4484 	 */
4485 	if (PageAnon(page))
4486 		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4487 	else
4488 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4489 	/*
4490 	 * The page is committed to the memcg, but it's not actually
4491 	 * charged to the res_counter since we plan on replacing the
4492 	 * old one and only one page is going to be left afterwards.
4493 	 */
4494 	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4495 }
4496 
4497 /* remove redundant charge if migration failed */
4498 void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4499 	struct page *oldpage, struct page *newpage, bool migration_ok)
4500 {
4501 	struct page *used, *unused;
4502 	struct page_cgroup *pc;
4503 	bool anon;
4504 
4505 	if (!memcg)
4506 		return;
4507 
4508 	if (!migration_ok) {
4509 		used = oldpage;
4510 		unused = newpage;
4511 	} else {
4512 		used = newpage;
4513 		unused = oldpage;
4514 	}
4515 	anon = PageAnon(used);
4516 	__mem_cgroup_uncharge_common(unused,
4517 		anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4518 		     : MEM_CGROUP_CHARGE_TYPE_CACHE,
4519 		true);
4520 	css_put(&memcg->css);
4521 	/*
4522 	 * We disallowed uncharge of pages under migration because mapcount
4523 	 * of the page goes down to zero, temporarily.
4524 	 * Clear the flag and check whether the page should be charged.
4525 	 */
4526 	pc = lookup_page_cgroup(oldpage);
4527 	lock_page_cgroup(pc);
4528 	ClearPageCgroupMigration(pc);
4529 	unlock_page_cgroup(pc);
4530 
4531 	/*
4532 	 * If a page is a file cache, the radix-tree replacement is atomic
4533 	 * and we can skip this check. When it was an Anon page, its mapcount
4534 	 * goes down to 0. But because we added the MIGRATION flag, it's not
4535 	 * uncharged yet. There are several cases, but the page->mapcount check
4536 	 * and the USED bit check in mem_cgroup_uncharge_page() will do enough
4537 	 * checking.
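 *
 * For orientation, the pairing that leads here is roughly (a sketch of
 * the migration caller, assumed rather than quoted from mm/migrate.c):
 *
 *	mem_cgroup_prepare_migration(page, newpage, &memcg);
 *	rc = ... actually move the page ...;
 *	mem_cgroup_end_migration(memcg, page, newpage, rc == 0);
 *
 * end_migration() is the one place allowed to uncharge the unused page
 * while its MIGRATION bit is still set.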
(see prepare_charge() also) 4538 */ 4539 if (anon) 4540 mem_cgroup_uncharge_page(used); 4541 } 4542 4543 /* 4544 * At replace page cache, newpage is not under any memcg but it's on 4545 * LRU. So, this function doesn't touch res_counter but handles LRU 4546 * in correct way. Both pages are locked so we cannot race with uncharge. 4547 */ 4548 void mem_cgroup_replace_page_cache(struct page *oldpage, 4549 struct page *newpage) 4550 { 4551 struct mem_cgroup *memcg = NULL; 4552 struct page_cgroup *pc; 4553 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 4554 4555 if (mem_cgroup_disabled()) 4556 return; 4557 4558 pc = lookup_page_cgroup(oldpage); 4559 /* fix accounting on old pages */ 4560 lock_page_cgroup(pc); 4561 if (PageCgroupUsed(pc)) { 4562 memcg = pc->mem_cgroup; 4563 mem_cgroup_charge_statistics(memcg, oldpage, false, -1); 4564 ClearPageCgroupUsed(pc); 4565 } 4566 unlock_page_cgroup(pc); 4567 4568 /* 4569 * When called from shmem_replace_page(), in some cases the 4570 * oldpage has already been charged, and in some cases not. 4571 */ 4572 if (!memcg) 4573 return; 4574 /* 4575 * Even if newpage->mapping was NULL before starting replacement, 4576 * the newpage may be on LRU(or pagevec for LRU) already. We lock 4577 * LRU while we overwrite pc->mem_cgroup. 4578 */ 4579 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); 4580 } 4581 4582 #ifdef CONFIG_DEBUG_VM 4583 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 4584 { 4585 struct page_cgroup *pc; 4586 4587 pc = lookup_page_cgroup(page); 4588 /* 4589 * Can be NULL while feeding pages into the page allocator for 4590 * the first time, i.e. during boot or memory hotplug; 4591 * or when mem_cgroup_disabled(). 4592 */ 4593 if (likely(pc) && PageCgroupUsed(pc)) 4594 return pc; 4595 return NULL; 4596 } 4597 4598 bool mem_cgroup_bad_page_check(struct page *page) 4599 { 4600 if (mem_cgroup_disabled()) 4601 return false; 4602 4603 return lookup_page_cgroup_used(page) != NULL; 4604 } 4605 4606 void mem_cgroup_print_bad_page(struct page *page) 4607 { 4608 struct page_cgroup *pc; 4609 4610 pc = lookup_page_cgroup_used(page); 4611 if (pc) { 4612 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4613 pc, pc->flags, pc->mem_cgroup); 4614 } 4615 } 4616 #endif 4617 4618 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4619 unsigned long long val) 4620 { 4621 int retry_count; 4622 u64 memswlimit, memlimit; 4623 int ret = 0; 4624 int children = mem_cgroup_count_children(memcg); 4625 u64 curusage, oldusage; 4626 int enlarge; 4627 4628 /* 4629 * For keeping hierarchical_reclaim simple, how long we should retry 4630 * is depends on callers. We set our retry-count to be function 4631 * of # of children which we should visit in this loop. 4632 */ 4633 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 4634 4635 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4636 4637 enlarge = 0; 4638 while (retry_count) { 4639 if (signal_pending(current)) { 4640 ret = -EINTR; 4641 break; 4642 } 4643 /* 4644 * Rather than hide all in some function, I do this in 4645 * open coded manner. You see what this really does. 4646 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 
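 *
 * A worked example with illustrative numbers: if memsw.limit_in_bytes
 * currently stands at 512M, an attempt to raise limit_in_bytes to 1G
 * is rejected with -EINVAL below; memsw.limit_in_bytes has to be
 * raised to at least 1G first, and only then the memory limit.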
4647 */ 4648 mutex_lock(&set_limit_mutex); 4649 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4650 if (memswlimit < val) { 4651 ret = -EINVAL; 4652 mutex_unlock(&set_limit_mutex); 4653 break; 4654 } 4655 4656 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4657 if (memlimit < val) 4658 enlarge = 1; 4659 4660 ret = res_counter_set_limit(&memcg->res, val); 4661 if (!ret) { 4662 if (memswlimit == val) 4663 memcg->memsw_is_minimum = true; 4664 else 4665 memcg->memsw_is_minimum = false; 4666 } 4667 mutex_unlock(&set_limit_mutex); 4668 4669 if (!ret) 4670 break; 4671 4672 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4673 MEM_CGROUP_RECLAIM_SHRINK); 4674 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4675 /* Usage is reduced ? */ 4676 if (curusage >= oldusage) 4677 retry_count--; 4678 else 4679 oldusage = curusage; 4680 } 4681 if (!ret && enlarge) 4682 memcg_oom_recover(memcg); 4683 4684 return ret; 4685 } 4686 4687 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 4688 unsigned long long val) 4689 { 4690 int retry_count; 4691 u64 memlimit, memswlimit, oldusage, curusage; 4692 int children = mem_cgroup_count_children(memcg); 4693 int ret = -EBUSY; 4694 int enlarge = 0; 4695 4696 /* see mem_cgroup_resize_res_limit */ 4697 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 4698 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4699 while (retry_count) { 4700 if (signal_pending(current)) { 4701 ret = -EINTR; 4702 break; 4703 } 4704 /* 4705 * Rather than hide all in some function, I do this in 4706 * open coded manner. You see what this really does. 4707 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 4708 */ 4709 mutex_lock(&set_limit_mutex); 4710 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4711 if (memlimit > val) { 4712 ret = -EINVAL; 4713 mutex_unlock(&set_limit_mutex); 4714 break; 4715 } 4716 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4717 if (memswlimit < val) 4718 enlarge = 1; 4719 ret = res_counter_set_limit(&memcg->memsw, val); 4720 if (!ret) { 4721 if (memlimit == val) 4722 memcg->memsw_is_minimum = true; 4723 else 4724 memcg->memsw_is_minimum = false; 4725 } 4726 mutex_unlock(&set_limit_mutex); 4727 4728 if (!ret) 4729 break; 4730 4731 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4732 MEM_CGROUP_RECLAIM_NOSWAP | 4733 MEM_CGROUP_RECLAIM_SHRINK); 4734 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4735 /* Usage is reduced ? 
 */
4736 		if (curusage >= oldusage)
4737 			retry_count--;
4738 		else
4739 			oldusage = curusage;
4740 	}
4741 	if (!ret && enlarge)
4742 		memcg_oom_recover(memcg);
4743 	return ret;
4744 }
4745 
4746 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4747 					    gfp_t gfp_mask,
4748 					    unsigned long *total_scanned)
4749 {
4750 	unsigned long nr_reclaimed = 0;
4751 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4752 	unsigned long reclaimed;
4753 	int loop = 0;
4754 	struct mem_cgroup_tree_per_zone *mctz;
4755 	unsigned long long excess;
4756 	unsigned long nr_scanned;
4757 
4758 	if (order > 0)
4759 		return 0;
4760 
4761 	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4762 	/*
4763 	 * This loop can run for a while, especially if mem_cgroups continuously
4764 	 * keep exceeding their soft limit and putting the system under
4765 	 * pressure
4766 	 */
4767 	do {
4768 		if (next_mz)
4769 			mz = next_mz;
4770 		else
4771 			mz = mem_cgroup_largest_soft_limit_node(mctz);
4772 		if (!mz)
4773 			break;
4774 
4775 		nr_scanned = 0;
4776 		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4777 						    gfp_mask, &nr_scanned);
4778 		nr_reclaimed += reclaimed;
4779 		*total_scanned += nr_scanned;
4780 		spin_lock(&mctz->lock);
4781 
4782 		/*
4783 		 * If we failed to reclaim anything from this memory cgroup
4784 		 * it is time to move on to the next cgroup
4785 		 */
4786 		next_mz = NULL;
4787 		if (!reclaimed) {
4788 			do {
4789 				/*
4790 				 * Loop until we find yet another one.
4791 				 *
4792 				 * By the time we get the soft_limit lock
4793 				 * again, someone might have added the
4794 				 * group back on the RB tree. Iterate to
4795 				 * make sure we get a different mem.
4796 				 * mem_cgroup_largest_soft_limit_node returns
4797 				 * NULL if no other cgroup is present on
4798 				 * the tree
4799 				 */
4800 				next_mz =
4801 				__mem_cgroup_largest_soft_limit_node(mctz);
4802 				if (next_mz == mz)
4803 					css_put(&next_mz->memcg->css);
4804 				else /* next_mz == NULL or other memcg */
4805 					break;
4806 			} while (1);
4807 		}
4808 		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4809 		excess = res_counter_soft_limit_excess(&mz->memcg->res);
4810 		/*
4811 		 * One school of thought says that we should not add
4812 		 * back the node to the tree if reclaim returns 0.
4813 		 * But our reclaim could return 0, simply because due
4814 		 * to priority we are exposing a smaller subset of
4815 		 * memory to reclaim from. Consider this as a longer
4816 		 * term TODO.
4817 		 */
4818 		/* If excess == 0, no tree ops */
4819 		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4820 		spin_unlock(&mctz->lock);
4821 		css_put(&mz->memcg->css);
4822 		loop++;
4823 		/*
4824 		 * Could not reclaim anything and there are no more
4825 		 * mem cgroups to try or we seem to be looping without
4826 		 * reclaiming anything.
4827 		 */
4828 		if (!nr_reclaimed &&
4829 		    (next_mz == NULL ||
4830 		     loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4831 			break;
4832 	} while (!nr_reclaimed);
4833 	if (next_mz)
4834 		css_put(&next_mz->memcg->css);
4835 	return nr_reclaimed;
4836 }
4837 
4838 /**
4839  * mem_cgroup_force_empty_list - clears LRU of a group
4840  * @memcg: group to clear
4841  * @node: NUMA node
4842  * @zid: zone id
4843  * @lru: lru to clear
4844  *
4845  * Traverse a specified page_cgroup list and try to drop them all. This doesn't
4846  * reclaim the pages themselves - pages are moved to the parent (or root)
4847  * group.
4848 */ 4849 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4850 int node, int zid, enum lru_list lru) 4851 { 4852 struct lruvec *lruvec; 4853 unsigned long flags; 4854 struct list_head *list; 4855 struct page *busy; 4856 struct zone *zone; 4857 4858 zone = &NODE_DATA(node)->node_zones[zid]; 4859 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4860 list = &lruvec->lists[lru]; 4861 4862 busy = NULL; 4863 do { 4864 struct page_cgroup *pc; 4865 struct page *page; 4866 4867 spin_lock_irqsave(&zone->lru_lock, flags); 4868 if (list_empty(list)) { 4869 spin_unlock_irqrestore(&zone->lru_lock, flags); 4870 break; 4871 } 4872 page = list_entry(list->prev, struct page, lru); 4873 if (busy == page) { 4874 list_move(&page->lru, list); 4875 busy = NULL; 4876 spin_unlock_irqrestore(&zone->lru_lock, flags); 4877 continue; 4878 } 4879 spin_unlock_irqrestore(&zone->lru_lock, flags); 4880 4881 pc = lookup_page_cgroup(page); 4882 4883 if (mem_cgroup_move_parent(page, pc, memcg)) { 4884 /* found lock contention or "pc" is obsolete. */ 4885 busy = page; 4886 cond_resched(); 4887 } else 4888 busy = NULL; 4889 } while (!list_empty(list)); 4890 } 4891 4892 /* 4893 * make mem_cgroup's charge to be 0 if there is no task by moving 4894 * all the charges and pages to the parent. 4895 * This enables deleting this mem_cgroup. 4896 * 4897 * Caller is responsible for holding css reference on the memcg. 4898 */ 4899 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 4900 { 4901 int node, zid; 4902 u64 usage; 4903 4904 do { 4905 /* This is for making all *used* pages to be on LRU. */ 4906 lru_add_drain_all(); 4907 drain_all_stock_sync(memcg); 4908 mem_cgroup_start_move(memcg); 4909 for_each_node_state(node, N_MEMORY) { 4910 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4911 enum lru_list lru; 4912 for_each_lru(lru) { 4913 mem_cgroup_force_empty_list(memcg, 4914 node, zid, lru); 4915 } 4916 } 4917 } 4918 mem_cgroup_end_move(memcg); 4919 memcg_oom_recover(memcg); 4920 cond_resched(); 4921 4922 /* 4923 * Kernel memory may not necessarily be trackable to a specific 4924 * process. So they are not migrated, and therefore we can't 4925 * expect their value to drop to 0 here. 4926 * Having res filled up with kmem only is enough. 4927 * 4928 * This is a safety check because mem_cgroup_force_empty_list 4929 * could have raced with mem_cgroup_replace_page_cache callers 4930 * so the lru seemed empty but the page could have been added 4931 * right after the check. RES_USAGE should be safe as we always 4932 * charge before adding to the LRU. 4933 */ 4934 usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 4935 res_counter_read_u64(&memcg->kmem, RES_USAGE); 4936 } while (usage > 0); 4937 } 4938 4939 /* 4940 * This mainly exists for tests during the setting of set of use_hierarchy. 4941 * Since this is the very setting we are changing, the current hierarchy value 4942 * is meaningless 4943 */ 4944 static inline bool __memcg_has_children(struct mem_cgroup *memcg) 4945 { 4946 struct cgroup *pos; 4947 4948 /* bounce at first found */ 4949 cgroup_for_each_child(pos, memcg->css.cgroup) 4950 return true; 4951 return false; 4952 } 4953 4954 /* 4955 * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed 4956 * to be already dead (as in mem_cgroup_force_empty, for instance). This is 4957 * from mem_cgroup_count_children(), in the sense that we don't really care how 4958 * many children we have; we only need to know if we have any. 
It also counts 4959 * any memcg without hierarchy as infertile. 4960 */ 4961 static inline bool memcg_has_children(struct mem_cgroup *memcg) 4962 { 4963 return memcg->use_hierarchy && __memcg_has_children(memcg); 4964 } 4965 4966 /* 4967 * Reclaims as many pages from the given memcg as possible and moves 4968 * the rest to the parent. 4969 * 4970 * Caller is responsible for holding css reference for memcg. 4971 */ 4972 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 4973 { 4974 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 4975 struct cgroup *cgrp = memcg->css.cgroup; 4976 4977 /* returns EBUSY if there is a task or if we come here twice. */ 4978 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 4979 return -EBUSY; 4980 4981 /* we call try-to-free pages for make this cgroup empty */ 4982 lru_add_drain_all(); 4983 /* try to free all pages in this cgroup */ 4984 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 4985 int progress; 4986 4987 if (signal_pending(current)) 4988 return -EINTR; 4989 4990 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 4991 false); 4992 if (!progress) { 4993 nr_retries--; 4994 /* maybe some writeback is necessary */ 4995 congestion_wait(BLK_RW_ASYNC, HZ/10); 4996 } 4997 4998 } 4999 lru_add_drain(); 5000 mem_cgroup_reparent_charges(memcg); 5001 5002 return 0; 5003 } 5004 5005 static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 5006 { 5007 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5008 int ret; 5009 5010 if (mem_cgroup_is_root(memcg)) 5011 return -EINVAL; 5012 css_get(&memcg->css); 5013 ret = mem_cgroup_force_empty(memcg); 5014 css_put(&memcg->css); 5015 5016 return ret; 5017 } 5018 5019 5020 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 5021 { 5022 return mem_cgroup_from_cont(cont)->use_hierarchy; 5023 } 5024 5025 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 5026 u64 val) 5027 { 5028 int retval = 0; 5029 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5030 struct cgroup *parent = cont->parent; 5031 struct mem_cgroup *parent_memcg = NULL; 5032 5033 if (parent) 5034 parent_memcg = mem_cgroup_from_cont(parent); 5035 5036 mutex_lock(&memcg_create_mutex); 5037 5038 if (memcg->use_hierarchy == val) 5039 goto out; 5040 5041 /* 5042 * If parent's use_hierarchy is set, we can't make any modifications 5043 * in the child subtrees. If it is unset, then the change can 5044 * occur, provided the current cgroup has no children. 5045 * 5046 * For the root cgroup, parent_mem is NULL, we allow value to be 5047 * set if there are no children. 5048 */ 5049 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 5050 (val == 1 || val == 0)) { 5051 if (!__memcg_has_children(memcg)) 5052 memcg->use_hierarchy = val; 5053 else 5054 retval = -EBUSY; 5055 } else 5056 retval = -EINVAL; 5057 5058 out: 5059 mutex_unlock(&memcg_create_mutex); 5060 5061 return retval; 5062 } 5063 5064 5065 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 5066 enum mem_cgroup_stat_index idx) 5067 { 5068 struct mem_cgroup *iter; 5069 long val = 0; 5070 5071 /* Per-cpu values can be negative, use a signed accumulator */ 5072 for_each_mem_cgroup_tree(iter, memcg) 5073 val += mem_cgroup_read_stat(iter, idx); 5074 5075 if (val < 0) /* race ? 
*/ 5076 val = 0; 5077 return val; 5078 } 5079 5080 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 5081 { 5082 u64 val; 5083 5084 if (!mem_cgroup_is_root(memcg)) { 5085 if (!swap) 5086 return res_counter_read_u64(&memcg->res, RES_USAGE); 5087 else 5088 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 5089 } 5090 5091 /* 5092 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS 5093 * as well as in MEM_CGROUP_STAT_RSS_HUGE. 5094 */ 5095 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 5096 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 5097 5098 if (swap) 5099 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); 5100 5101 return val << PAGE_SHIFT; 5102 } 5103 5104 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, 5105 struct file *file, char __user *buf, 5106 size_t nbytes, loff_t *ppos) 5107 { 5108 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5109 char str[64]; 5110 u64 val; 5111 int name, len; 5112 enum res_type type; 5113 5114 type = MEMFILE_TYPE(cft->private); 5115 name = MEMFILE_ATTR(cft->private); 5116 5117 switch (type) { 5118 case _MEM: 5119 if (name == RES_USAGE) 5120 val = mem_cgroup_usage(memcg, false); 5121 else 5122 val = res_counter_read_u64(&memcg->res, name); 5123 break; 5124 case _MEMSWAP: 5125 if (name == RES_USAGE) 5126 val = mem_cgroup_usage(memcg, true); 5127 else 5128 val = res_counter_read_u64(&memcg->memsw, name); 5129 break; 5130 case _KMEM: 5131 val = res_counter_read_u64(&memcg->kmem, name); 5132 break; 5133 default: 5134 BUG(); 5135 } 5136 5137 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5138 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 5139 } 5140 5141 static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) 5142 { 5143 int ret = -EINVAL; 5144 #ifdef CONFIG_MEMCG_KMEM 5145 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5146 /* 5147 * For simplicity, we won't allow this to be disabled. It also can't 5148 * be changed if the cgroup has children already, or if tasks had 5149 * already joined. 5150 * 5151 * If tasks join before we set the limit, a person looking at 5152 * kmem.usage_in_bytes will have no way to determine when it took 5153 * place, which makes the value quite meaningless. 5154 * 5155 * After it first became limited, changes in the value of the limit are 5156 * of course permitted. 
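 *
 * As a rough summary of the code below (not additional API), the first
 * successful write of a finite limit does:
 *
 *	res_counter_set_limit(&memcg->kmem, val);
 *	memcg_update_cache_sizes(memcg);
 *	static_key_slow_inc(&memcg_kmem_enabled_key);
 *	memcg_kmem_set_active(memcg);
 *
 * while any later write only calls res_counter_set_limit() again.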
5157 */ 5158 mutex_lock(&memcg_create_mutex); 5159 mutex_lock(&set_limit_mutex); 5160 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 5161 if (cgroup_task_count(cont) || memcg_has_children(memcg)) { 5162 ret = -EBUSY; 5163 goto out; 5164 } 5165 ret = res_counter_set_limit(&memcg->kmem, val); 5166 VM_BUG_ON(ret); 5167 5168 ret = memcg_update_cache_sizes(memcg); 5169 if (ret) { 5170 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); 5171 goto out; 5172 } 5173 static_key_slow_inc(&memcg_kmem_enabled_key); 5174 /* 5175 * setting the active bit after the inc will guarantee no one 5176 * starts accounting before all call sites are patched 5177 */ 5178 memcg_kmem_set_active(memcg); 5179 } else 5180 ret = res_counter_set_limit(&memcg->kmem, val); 5181 out: 5182 mutex_unlock(&set_limit_mutex); 5183 mutex_unlock(&memcg_create_mutex); 5184 #endif 5185 return ret; 5186 } 5187 5188 #ifdef CONFIG_MEMCG_KMEM 5189 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5190 { 5191 int ret = 0; 5192 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5193 if (!parent) 5194 goto out; 5195 5196 memcg->kmem_account_flags = parent->kmem_account_flags; 5197 /* 5198 * When that happen, we need to disable the static branch only on those 5199 * memcgs that enabled it. To achieve this, we would be forced to 5200 * complicate the code by keeping track of which memcgs were the ones 5201 * that actually enabled limits, and which ones got it from its 5202 * parents. 5203 * 5204 * It is a lot simpler just to do static_key_slow_inc() on every child 5205 * that is accounted. 5206 */ 5207 if (!memcg_kmem_is_active(memcg)) 5208 goto out; 5209 5210 /* 5211 * __mem_cgroup_free() will issue static_key_slow_dec() because this 5212 * memcg is active already. If the later initialization fails then the 5213 * cgroup core triggers the cleanup so we do not have to do it here. 5214 */ 5215 static_key_slow_inc(&memcg_kmem_enabled_key); 5216 5217 mutex_lock(&set_limit_mutex); 5218 memcg_stop_kmem_account(); 5219 ret = memcg_update_cache_sizes(memcg); 5220 memcg_resume_kmem_account(); 5221 mutex_unlock(&set_limit_mutex); 5222 out: 5223 return ret; 5224 } 5225 #endif /* CONFIG_MEMCG_KMEM */ 5226 5227 /* 5228 * The user of this function is... 5229 * RES_LIMIT. 
5230 */ 5231 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 5232 const char *buffer) 5233 { 5234 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5235 enum res_type type; 5236 int name; 5237 unsigned long long val; 5238 int ret; 5239 5240 type = MEMFILE_TYPE(cft->private); 5241 name = MEMFILE_ATTR(cft->private); 5242 5243 switch (name) { 5244 case RES_LIMIT: 5245 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 5246 ret = -EINVAL; 5247 break; 5248 } 5249 /* This function does all necessary parse...reuse it */ 5250 ret = res_counter_memparse_write_strategy(buffer, &val); 5251 if (ret) 5252 break; 5253 if (type == _MEM) 5254 ret = mem_cgroup_resize_limit(memcg, val); 5255 else if (type == _MEMSWAP) 5256 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5257 else if (type == _KMEM) 5258 ret = memcg_update_kmem_limit(cont, val); 5259 else 5260 return -EINVAL; 5261 break; 5262 case RES_SOFT_LIMIT: 5263 ret = res_counter_memparse_write_strategy(buffer, &val); 5264 if (ret) 5265 break; 5266 /* 5267 * For memsw, soft limits are hard to implement in terms 5268 * of semantics, for now, we support soft limits for 5269 * control without swap 5270 */ 5271 if (type == _MEM) 5272 ret = res_counter_set_soft_limit(&memcg->res, val); 5273 else 5274 ret = -EINVAL; 5275 break; 5276 default: 5277 ret = -EINVAL; /* should be BUG() ? */ 5278 break; 5279 } 5280 return ret; 5281 } 5282 5283 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 5284 unsigned long long *mem_limit, unsigned long long *memsw_limit) 5285 { 5286 struct cgroup *cgroup; 5287 unsigned long long min_limit, min_memsw_limit, tmp; 5288 5289 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 5290 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5291 cgroup = memcg->css.cgroup; 5292 if (!memcg->use_hierarchy) 5293 goto out; 5294 5295 while (cgroup->parent) { 5296 cgroup = cgroup->parent; 5297 memcg = mem_cgroup_from_cont(cgroup); 5298 if (!memcg->use_hierarchy) 5299 break; 5300 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 5301 min_limit = min(min_limit, tmp); 5302 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5303 min_memsw_limit = min(min_memsw_limit, tmp); 5304 } 5305 out: 5306 *mem_limit = min_limit; 5307 *memsw_limit = min_memsw_limit; 5308 } 5309 5310 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5311 { 5312 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5313 int name; 5314 enum res_type type; 5315 5316 type = MEMFILE_TYPE(event); 5317 name = MEMFILE_ATTR(event); 5318 5319 switch (name) { 5320 case RES_MAX_USAGE: 5321 if (type == _MEM) 5322 res_counter_reset_max(&memcg->res); 5323 else if (type == _MEMSWAP) 5324 res_counter_reset_max(&memcg->memsw); 5325 else if (type == _KMEM) 5326 res_counter_reset_max(&memcg->kmem); 5327 else 5328 return -EINVAL; 5329 break; 5330 case RES_FAILCNT: 5331 if (type == _MEM) 5332 res_counter_reset_failcnt(&memcg->res); 5333 else if (type == _MEMSWAP) 5334 res_counter_reset_failcnt(&memcg->memsw); 5335 else if (type == _KMEM) 5336 res_counter_reset_failcnt(&memcg->kmem); 5337 else 5338 return -EINVAL; 5339 break; 5340 } 5341 5342 return 0; 5343 } 5344 5345 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 5346 struct cftype *cft) 5347 { 5348 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 5349 } 5350 5351 #ifdef CONFIG_MMU 5352 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 5353 struct cftype *cft, u64 val) 5354 { 5355 struct mem_cgroup *memcg = 
mem_cgroup_from_cont(cgrp); 5356 5357 if (val >= (1 << NR_MOVE_TYPE)) 5358 return -EINVAL; 5359 5360 /* 5361 * No kind of locking is needed in here, because ->can_attach() will 5362 * check this value once in the beginning of the process, and then carry 5363 * on with stale data. This means that changes to this value will only 5364 * affect task migrations starting after the change. 5365 */ 5366 memcg->move_charge_at_immigrate = val; 5367 return 0; 5368 } 5369 #else 5370 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 5371 struct cftype *cft, u64 val) 5372 { 5373 return -ENOSYS; 5374 } 5375 #endif 5376 5377 #ifdef CONFIG_NUMA 5378 static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, 5379 struct seq_file *m) 5380 { 5381 int nid; 5382 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 5383 unsigned long node_nr; 5384 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5385 5386 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5387 seq_printf(m, "total=%lu", total_nr); 5388 for_each_node_state(nid, N_MEMORY) { 5389 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 5390 seq_printf(m, " N%d=%lu", nid, node_nr); 5391 } 5392 seq_putc(m, '\n'); 5393 5394 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 5395 seq_printf(m, "file=%lu", file_nr); 5396 for_each_node_state(nid, N_MEMORY) { 5397 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5398 LRU_ALL_FILE); 5399 seq_printf(m, " N%d=%lu", nid, node_nr); 5400 } 5401 seq_putc(m, '\n'); 5402 5403 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 5404 seq_printf(m, "anon=%lu", anon_nr); 5405 for_each_node_state(nid, N_MEMORY) { 5406 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5407 LRU_ALL_ANON); 5408 seq_printf(m, " N%d=%lu", nid, node_nr); 5409 } 5410 seq_putc(m, '\n'); 5411 5412 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 5413 seq_printf(m, "unevictable=%lu", unevictable_nr); 5414 for_each_node_state(nid, N_MEMORY) { 5415 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5416 BIT(LRU_UNEVICTABLE)); 5417 seq_printf(m, " N%d=%lu", nid, node_nr); 5418 } 5419 seq_putc(m, '\n'); 5420 return 0; 5421 } 5422 #endif /* CONFIG_NUMA */ 5423 5424 static inline void mem_cgroup_lru_names_not_uptodate(void) 5425 { 5426 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5427 } 5428 5429 static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, 5430 struct seq_file *m) 5431 { 5432 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5433 struct mem_cgroup *mi; 5434 unsigned int i; 5435 5436 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5437 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5438 continue; 5439 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 5440 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 5441 } 5442 5443 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 5444 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 5445 mem_cgroup_read_events(memcg, i)); 5446 5447 for (i = 0; i < NR_LRU_LISTS; i++) 5448 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 5449 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 5450 5451 /* Hierarchical information */ 5452 { 5453 unsigned long long limit, memsw_limit; 5454 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 5455 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 5456 if (do_swap_account) 5457 seq_printf(m, "hierarchical_memsw_limit %llu\n", 5458 memsw_limit); 5459 } 5460 5461 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5462 long long val = 0; 5463 5464 if 
(i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5465 continue; 5466 for_each_mem_cgroup_tree(mi, memcg) 5467 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 5468 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 5469 } 5470 5471 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 5472 unsigned long long val = 0; 5473 5474 for_each_mem_cgroup_tree(mi, memcg) 5475 val += mem_cgroup_read_events(mi, i); 5476 seq_printf(m, "total_%s %llu\n", 5477 mem_cgroup_events_names[i], val); 5478 } 5479 5480 for (i = 0; i < NR_LRU_LISTS; i++) { 5481 unsigned long long val = 0; 5482 5483 for_each_mem_cgroup_tree(mi, memcg) 5484 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 5485 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 5486 } 5487 5488 #ifdef CONFIG_DEBUG_VM 5489 { 5490 int nid, zid; 5491 struct mem_cgroup_per_zone *mz; 5492 struct zone_reclaim_stat *rstat; 5493 unsigned long recent_rotated[2] = {0, 0}; 5494 unsigned long recent_scanned[2] = {0, 0}; 5495 5496 for_each_online_node(nid) 5497 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 5498 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 5499 rstat = &mz->lruvec.reclaim_stat; 5500 5501 recent_rotated[0] += rstat->recent_rotated[0]; 5502 recent_rotated[1] += rstat->recent_rotated[1]; 5503 recent_scanned[0] += rstat->recent_scanned[0]; 5504 recent_scanned[1] += rstat->recent_scanned[1]; 5505 } 5506 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 5507 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 5508 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 5509 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 5510 } 5511 #endif 5512 5513 return 0; 5514 } 5515 5516 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 5517 { 5518 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5519 5520 return mem_cgroup_swappiness(memcg); 5521 } 5522 5523 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 5524 u64 val) 5525 { 5526 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5527 struct mem_cgroup *parent; 5528 5529 if (val > 100) 5530 return -EINVAL; 5531 5532 if (cgrp->parent == NULL) 5533 return -EINVAL; 5534 5535 parent = mem_cgroup_from_cont(cgrp->parent); 5536 5537 mutex_lock(&memcg_create_mutex); 5538 5539 /* If under hierarchy, only empty-root can set this value */ 5540 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5541 mutex_unlock(&memcg_create_mutex); 5542 return -EINVAL; 5543 } 5544 5545 memcg->swappiness = val; 5546 5547 mutex_unlock(&memcg_create_mutex); 5548 5549 return 0; 5550 } 5551 5552 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 5553 { 5554 struct mem_cgroup_threshold_ary *t; 5555 u64 usage; 5556 int i; 5557 5558 rcu_read_lock(); 5559 if (!swap) 5560 t = rcu_dereference(memcg->thresholds.primary); 5561 else 5562 t = rcu_dereference(memcg->memsw_thresholds.primary); 5563 5564 if (!t) 5565 goto unlock; 5566 5567 usage = mem_cgroup_usage(memcg, swap); 5568 5569 /* 5570 * current_threshold points to threshold just below or equal to usage. 5571 * If it's not true, a threshold was crossed after last 5572 * call of __mem_cgroup_threshold(). 5573 */ 5574 i = t->current_threshold; 5575 5576 /* 5577 * Iterate backward over array of thresholds starting from 5578 * current_threshold and check if a threshold is crossed. 5579 * If none of thresholds below usage is crossed, we read 5580 * only one element of the array here. 
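 *
 * Worked example with illustrative numbers: thresholds of 4M, 8M and
 * 16M with usage last seen at 5M give current_threshold == 0 (the 4M
 * entry). If usage has since grown to 9M, the backward scan below
 * signals nothing, the forward scan signals the 8M eventfd and
 * current_threshold becomes 1. If usage instead dropped to 3M, the
 * backward scan signals the 4M eventfd and current_threshold
 * becomes -1.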
5581 */ 5582 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 5583 eventfd_signal(t->entries[i].eventfd, 1); 5584 5585 /* i = current_threshold + 1 */ 5586 i++; 5587 5588 /* 5589 * Iterate forward over array of thresholds starting from 5590 * current_threshold+1 and check if a threshold is crossed. 5591 * If none of thresholds above usage is crossed, we read 5592 * only one element of the array here. 5593 */ 5594 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 5595 eventfd_signal(t->entries[i].eventfd, 1); 5596 5597 /* Update current_threshold */ 5598 t->current_threshold = i - 1; 5599 unlock: 5600 rcu_read_unlock(); 5601 } 5602 5603 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 5604 { 5605 while (memcg) { 5606 __mem_cgroup_threshold(memcg, false); 5607 if (do_swap_account) 5608 __mem_cgroup_threshold(memcg, true); 5609 5610 memcg = parent_mem_cgroup(memcg); 5611 } 5612 } 5613 5614 static int compare_thresholds(const void *a, const void *b) 5615 { 5616 const struct mem_cgroup_threshold *_a = a; 5617 const struct mem_cgroup_threshold *_b = b; 5618 5619 return _a->threshold - _b->threshold; 5620 } 5621 5622 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 5623 { 5624 struct mem_cgroup_eventfd_list *ev; 5625 5626 list_for_each_entry(ev, &memcg->oom_notify, list) 5627 eventfd_signal(ev->eventfd, 1); 5628 return 0; 5629 } 5630 5631 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 5632 { 5633 struct mem_cgroup *iter; 5634 5635 for_each_mem_cgroup_tree(iter, memcg) 5636 mem_cgroup_oom_notify_cb(iter); 5637 } 5638 5639 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 5640 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5641 { 5642 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5643 struct mem_cgroup_thresholds *thresholds; 5644 struct mem_cgroup_threshold_ary *new; 5645 enum res_type type = MEMFILE_TYPE(cft->private); 5646 u64 threshold, usage; 5647 int i, size, ret; 5648 5649 ret = res_counter_memparse_write_strategy(args, &threshold); 5650 if (ret) 5651 return ret; 5652 5653 mutex_lock(&memcg->thresholds_lock); 5654 5655 if (type == _MEM) 5656 thresholds = &memcg->thresholds; 5657 else if (type == _MEMSWAP) 5658 thresholds = &memcg->memsw_thresholds; 5659 else 5660 BUG(); 5661 5662 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5663 5664 /* Check if a threshold crossed before adding a new one */ 5665 if (thresholds->primary) 5666 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5667 5668 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 5669 5670 /* Allocate memory for new array of thresholds */ 5671 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 5672 GFP_KERNEL); 5673 if (!new) { 5674 ret = -ENOMEM; 5675 goto unlock; 5676 } 5677 new->size = size; 5678 5679 /* Copy thresholds (if any) to new array */ 5680 if (thresholds->primary) { 5681 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 5682 sizeof(struct mem_cgroup_threshold)); 5683 } 5684 5685 /* Add new threshold */ 5686 new->entries[size - 1].eventfd = eventfd; 5687 new->entries[size - 1].threshold = threshold; 5688 5689 /* Sort thresholds. 
Registering of new threshold isn't time-critical */ 5690 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 5691 compare_thresholds, NULL); 5692 5693 /* Find current threshold */ 5694 new->current_threshold = -1; 5695 for (i = 0; i < size; i++) { 5696 if (new->entries[i].threshold <= usage) { 5697 /* 5698 * new->current_threshold will not be used until 5699 * rcu_assign_pointer(), so it's safe to increment 5700 * it here. 5701 */ 5702 ++new->current_threshold; 5703 } else 5704 break; 5705 } 5706 5707 /* Free old spare buffer and save old primary buffer as spare */ 5708 kfree(thresholds->spare); 5709 thresholds->spare = thresholds->primary; 5710 5711 rcu_assign_pointer(thresholds->primary, new); 5712 5713 /* To be sure that nobody uses thresholds */ 5714 synchronize_rcu(); 5715 5716 unlock: 5717 mutex_unlock(&memcg->thresholds_lock); 5718 5719 return ret; 5720 } 5721 5722 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 5723 struct cftype *cft, struct eventfd_ctx *eventfd) 5724 { 5725 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5726 struct mem_cgroup_thresholds *thresholds; 5727 struct mem_cgroup_threshold_ary *new; 5728 enum res_type type = MEMFILE_TYPE(cft->private); 5729 u64 usage; 5730 int i, j, size; 5731 5732 mutex_lock(&memcg->thresholds_lock); 5733 if (type == _MEM) 5734 thresholds = &memcg->thresholds; 5735 else if (type == _MEMSWAP) 5736 thresholds = &memcg->memsw_thresholds; 5737 else 5738 BUG(); 5739 5740 if (!thresholds->primary) 5741 goto unlock; 5742 5743 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5744 5745 /* Check if a threshold crossed before removing */ 5746 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5747 5748 /* Calculate new number of threshold */ 5749 size = 0; 5750 for (i = 0; i < thresholds->primary->size; i++) { 5751 if (thresholds->primary->entries[i].eventfd != eventfd) 5752 size++; 5753 } 5754 5755 new = thresholds->spare; 5756 5757 /* Set thresholds array to NULL if we don't have thresholds */ 5758 if (!size) { 5759 kfree(new); 5760 new = NULL; 5761 goto swap_buffers; 5762 } 5763 5764 new->size = size; 5765 5766 /* Copy thresholds and find current threshold */ 5767 new->current_threshold = -1; 5768 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 5769 if (thresholds->primary->entries[i].eventfd == eventfd) 5770 continue; 5771 5772 new->entries[j] = thresholds->primary->entries[i]; 5773 if (new->entries[j].threshold <= usage) { 5774 /* 5775 * new->current_threshold will not be used 5776 * until rcu_assign_pointer(), so it's safe to increment 5777 * it here. 
5778 */ 5779 ++new->current_threshold; 5780 } 5781 j++; 5782 } 5783 5784 swap_buffers: 5785 /* Swap primary and spare array */ 5786 thresholds->spare = thresholds->primary; 5787 /* If all events are unregistered, free the spare array */ 5788 if (!new) { 5789 kfree(thresholds->spare); 5790 thresholds->spare = NULL; 5791 } 5792 5793 rcu_assign_pointer(thresholds->primary, new); 5794 5795 /* To be sure that nobody uses thresholds */ 5796 synchronize_rcu(); 5797 unlock: 5798 mutex_unlock(&memcg->thresholds_lock); 5799 } 5800 5801 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 5802 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5803 { 5804 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5805 struct mem_cgroup_eventfd_list *event; 5806 enum res_type type = MEMFILE_TYPE(cft->private); 5807 5808 BUG_ON(type != _OOM_TYPE); 5809 event = kmalloc(sizeof(*event), GFP_KERNEL); 5810 if (!event) 5811 return -ENOMEM; 5812 5813 spin_lock(&memcg_oom_lock); 5814 5815 event->eventfd = eventfd; 5816 list_add(&event->list, &memcg->oom_notify); 5817 5818 /* already in OOM ? */ 5819 if (atomic_read(&memcg->under_oom)) 5820 eventfd_signal(eventfd, 1); 5821 spin_unlock(&memcg_oom_lock); 5822 5823 return 0; 5824 } 5825 5826 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 5827 struct cftype *cft, struct eventfd_ctx *eventfd) 5828 { 5829 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5830 struct mem_cgroup_eventfd_list *ev, *tmp; 5831 enum res_type type = MEMFILE_TYPE(cft->private); 5832 5833 BUG_ON(type != _OOM_TYPE); 5834 5835 spin_lock(&memcg_oom_lock); 5836 5837 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 5838 if (ev->eventfd == eventfd) { 5839 list_del(&ev->list); 5840 kfree(ev); 5841 } 5842 } 5843 5844 spin_unlock(&memcg_oom_lock); 5845 } 5846 5847 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 5848 struct cftype *cft, struct cgroup_map_cb *cb) 5849 { 5850 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5851 5852 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 5853 5854 if (atomic_read(&memcg->under_oom)) 5855 cb->fill(cb, "under_oom", 1); 5856 else 5857 cb->fill(cb, "under_oom", 0); 5858 return 0; 5859 } 5860 5861 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 5862 struct cftype *cft, u64 val) 5863 { 5864 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5865 struct mem_cgroup *parent; 5866 5867 /* cannot set to root cgroup and only 0 and 1 are allowed */ 5868 if (!cgrp->parent || !((val == 0) || (val == 1))) 5869 return -EINVAL; 5870 5871 parent = mem_cgroup_from_cont(cgrp->parent); 5872 5873 mutex_lock(&memcg_create_mutex); 5874 /* oom-kill-disable is a flag for subhierarchy. 
*/ 5875 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5876 mutex_unlock(&memcg_create_mutex); 5877 return -EINVAL; 5878 } 5879 memcg->oom_kill_disable = val; 5880 if (!val) 5881 memcg_oom_recover(memcg); 5882 mutex_unlock(&memcg_create_mutex); 5883 return 0; 5884 } 5885 5886 #ifdef CONFIG_MEMCG_KMEM 5887 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5888 { 5889 int ret; 5890 5891 memcg->kmemcg_id = -1; 5892 ret = memcg_propagate_kmem(memcg); 5893 if (ret) 5894 return ret; 5895 5896 return mem_cgroup_sockets_init(memcg, ss); 5897 } 5898 5899 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 5900 { 5901 mem_cgroup_sockets_destroy(memcg); 5902 } 5903 5904 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 5905 { 5906 if (!memcg_kmem_is_active(memcg)) 5907 return; 5908 5909 /* 5910 * kmem charges can outlive the cgroup. In the case of slab 5911 * pages, for instance, a page contain objects from various 5912 * processes. As we prevent from taking a reference for every 5913 * such allocation we have to be careful when doing uncharge 5914 * (see memcg_uncharge_kmem) and here during offlining. 5915 * 5916 * The idea is that that only the _last_ uncharge which sees 5917 * the dead memcg will drop the last reference. An additional 5918 * reference is taken here before the group is marked dead 5919 * which is then paired with css_put during uncharge resp. here. 5920 * 5921 * Although this might sound strange as this path is called from 5922 * css_offline() when the referencemight have dropped down to 0 5923 * and shouldn't be incremented anymore (css_tryget would fail) 5924 * we do not have other options because of the kmem allocations 5925 * lifetime. 5926 */ 5927 css_get(&memcg->css); 5928 5929 memcg_kmem_mark_dead(memcg); 5930 5931 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 5932 return; 5933 5934 if (memcg_kmem_test_and_clear_dead(memcg)) 5935 css_put(&memcg->css); 5936 } 5937 #else 5938 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5939 { 5940 return 0; 5941 } 5942 5943 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 5944 { 5945 } 5946 5947 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 5948 { 5949 } 5950 #endif 5951 5952 static struct cftype mem_cgroup_files[] = { 5953 { 5954 .name = "usage_in_bytes", 5955 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5956 .read = mem_cgroup_read, 5957 .register_event = mem_cgroup_usage_register_event, 5958 .unregister_event = mem_cgroup_usage_unregister_event, 5959 }, 5960 { 5961 .name = "max_usage_in_bytes", 5962 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5963 .trigger = mem_cgroup_reset, 5964 .read = mem_cgroup_read, 5965 }, 5966 { 5967 .name = "limit_in_bytes", 5968 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5969 .write_string = mem_cgroup_write, 5970 .read = mem_cgroup_read, 5971 }, 5972 { 5973 .name = "soft_limit_in_bytes", 5974 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5975 .write_string = mem_cgroup_write, 5976 .read = mem_cgroup_read, 5977 }, 5978 { 5979 .name = "failcnt", 5980 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5981 .trigger = mem_cgroup_reset, 5982 .read = mem_cgroup_read, 5983 }, 5984 { 5985 .name = "stat", 5986 .read_seq_string = memcg_stat_show, 5987 }, 5988 { 5989 .name = "force_empty", 5990 .trigger = mem_cgroup_force_empty_write, 5991 }, 5992 { 5993 .name = "use_hierarchy", 5994 .flags = CFTYPE_INSANE, 5995 .write_u64 = mem_cgroup_hierarchy_write, 5996 .read_u64 = 
mem_cgroup_hierarchy_read, 5997 }, 5998 { 5999 .name = "swappiness", 6000 .read_u64 = mem_cgroup_swappiness_read, 6001 .write_u64 = mem_cgroup_swappiness_write, 6002 }, 6003 { 6004 .name = "move_charge_at_immigrate", 6005 .read_u64 = mem_cgroup_move_charge_read, 6006 .write_u64 = mem_cgroup_move_charge_write, 6007 }, 6008 { 6009 .name = "oom_control", 6010 .read_map = mem_cgroup_oom_control_read, 6011 .write_u64 = mem_cgroup_oom_control_write, 6012 .register_event = mem_cgroup_oom_register_event, 6013 .unregister_event = mem_cgroup_oom_unregister_event, 6014 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6015 }, 6016 { 6017 .name = "pressure_level", 6018 .register_event = vmpressure_register_event, 6019 .unregister_event = vmpressure_unregister_event, 6020 }, 6021 #ifdef CONFIG_NUMA 6022 { 6023 .name = "numa_stat", 6024 .read_seq_string = memcg_numa_stat_show, 6025 }, 6026 #endif 6027 #ifdef CONFIG_MEMCG_KMEM 6028 { 6029 .name = "kmem.limit_in_bytes", 6030 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6031 .write_string = mem_cgroup_write, 6032 .read = mem_cgroup_read, 6033 }, 6034 { 6035 .name = "kmem.usage_in_bytes", 6036 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6037 .read = mem_cgroup_read, 6038 }, 6039 { 6040 .name = "kmem.failcnt", 6041 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6042 .trigger = mem_cgroup_reset, 6043 .read = mem_cgroup_read, 6044 }, 6045 { 6046 .name = "kmem.max_usage_in_bytes", 6047 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6048 .trigger = mem_cgroup_reset, 6049 .read = mem_cgroup_read, 6050 }, 6051 #ifdef CONFIG_SLABINFO 6052 { 6053 .name = "kmem.slabinfo", 6054 .read_seq_string = mem_cgroup_slabinfo_read, 6055 }, 6056 #endif 6057 #endif 6058 { }, /* terminate */ 6059 }; 6060 6061 #ifdef CONFIG_MEMCG_SWAP 6062 static struct cftype memsw_cgroup_files[] = { 6063 { 6064 .name = "memsw.usage_in_bytes", 6065 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6066 .read = mem_cgroup_read, 6067 .register_event = mem_cgroup_usage_register_event, 6068 .unregister_event = mem_cgroup_usage_unregister_event, 6069 }, 6070 { 6071 .name = "memsw.max_usage_in_bytes", 6072 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6073 .trigger = mem_cgroup_reset, 6074 .read = mem_cgroup_read, 6075 }, 6076 { 6077 .name = "memsw.limit_in_bytes", 6078 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6079 .write_string = mem_cgroup_write, 6080 .read = mem_cgroup_read, 6081 }, 6082 { 6083 .name = "memsw.failcnt", 6084 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6085 .trigger = mem_cgroup_reset, 6086 .read = mem_cgroup_read, 6087 }, 6088 { }, /* terminate */ 6089 }; 6090 #endif 6091 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6092 { 6093 struct mem_cgroup_per_node *pn; 6094 struct mem_cgroup_per_zone *mz; 6095 int zone, tmp = node; 6096 /* 6097 * This routine is called against possible nodes. 6098 * But it's BUG to call kmalloc() against offline node. 6099 * 6100 * TODO: this routine can waste much memory for nodes which will 6101 * never be onlined. It's better to use memory hotplug callback 6102 * function. 
6103 */ 6104 if (!node_state(node, N_NORMAL_MEMORY)) 6105 tmp = -1; 6106 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 6107 if (!pn) 6108 return 1; 6109 6110 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6111 mz = &pn->zoneinfo[zone]; 6112 lruvec_init(&mz->lruvec); 6113 mz->usage_in_excess = 0; 6114 mz->on_tree = false; 6115 mz->memcg = memcg; 6116 } 6117 memcg->nodeinfo[node] = pn; 6118 return 0; 6119 } 6120 6121 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6122 { 6123 kfree(memcg->nodeinfo[node]); 6124 } 6125 6126 static struct mem_cgroup *mem_cgroup_alloc(void) 6127 { 6128 struct mem_cgroup *memcg; 6129 size_t size = memcg_size(); 6130 6131 /* Can be very big if nr_node_ids is very big */ 6132 if (size < PAGE_SIZE) 6133 memcg = kzalloc(size, GFP_KERNEL); 6134 else 6135 memcg = vzalloc(size); 6136 6137 if (!memcg) 6138 return NULL; 6139 6140 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 6141 if (!memcg->stat) 6142 goto out_free; 6143 spin_lock_init(&memcg->pcp_counter_lock); 6144 return memcg; 6145 6146 out_free: 6147 if (size < PAGE_SIZE) 6148 kfree(memcg); 6149 else 6150 vfree(memcg); 6151 return NULL; 6152 } 6153 6154 /* 6155 * At destroying mem_cgroup, references from swap_cgroup can remain. 6156 * (scanning all at force_empty is too costly...) 6157 * 6158 * Instead of clearing all references at force_empty, we remember 6159 * the number of reference from swap_cgroup and free mem_cgroup when 6160 * it goes down to 0. 6161 * 6162 * Removal of cgroup itself succeeds regardless of refs from swap. 6163 */ 6164 6165 static void __mem_cgroup_free(struct mem_cgroup *memcg) 6166 { 6167 int node; 6168 size_t size = memcg_size(); 6169 6170 mem_cgroup_remove_from_trees(memcg); 6171 free_css_id(&mem_cgroup_subsys, &memcg->css); 6172 6173 for_each_node(node) 6174 free_mem_cgroup_per_zone_info(memcg, node); 6175 6176 free_percpu(memcg->stat); 6177 6178 /* 6179 * We need to make sure that (at least for now), the jump label 6180 * destruction code runs outside of the cgroup lock. This is because 6181 * get_online_cpus(), which is called from the static_branch update, 6182 * can't be called inside the cgroup_lock. cpusets are the ones 6183 * enforcing this dependency, so if they ever change, we might as well. 6184 * 6185 * schedule_work() will guarantee this happens. Be careful if you need 6186 * to move this code around, and make sure it is outside 6187 * the cgroup_lock. 6188 */ 6189 disarm_static_keys(memcg); 6190 if (size < PAGE_SIZE) 6191 kfree(memcg); 6192 else 6193 vfree(memcg); 6194 } 6195 6196 /* 6197 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
6198 */ 6199 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 6200 { 6201 if (!memcg->res.parent) 6202 return NULL; 6203 return mem_cgroup_from_res_counter(memcg->res.parent, res); 6204 } 6205 EXPORT_SYMBOL(parent_mem_cgroup); 6206 6207 static void __init mem_cgroup_soft_limit_tree_init(void) 6208 { 6209 struct mem_cgroup_tree_per_node *rtpn; 6210 struct mem_cgroup_tree_per_zone *rtpz; 6211 int tmp, node, zone; 6212 6213 for_each_node(node) { 6214 tmp = node; 6215 if (!node_state(node, N_NORMAL_MEMORY)) 6216 tmp = -1; 6217 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6218 BUG_ON(!rtpn); 6219 6220 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6221 6222 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6223 rtpz = &rtpn->rb_tree_per_zone[zone]; 6224 rtpz->rb_root = RB_ROOT; 6225 spin_lock_init(&rtpz->lock); 6226 } 6227 } 6228 } 6229 6230 static struct cgroup_subsys_state * __ref 6231 mem_cgroup_css_alloc(struct cgroup *cont) 6232 { 6233 struct mem_cgroup *memcg; 6234 long error = -ENOMEM; 6235 int node; 6236 6237 memcg = mem_cgroup_alloc(); 6238 if (!memcg) 6239 return ERR_PTR(error); 6240 6241 for_each_node(node) 6242 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 6243 goto free_out; 6244 6245 /* root ? */ 6246 if (cont->parent == NULL) { 6247 root_mem_cgroup = memcg; 6248 res_counter_init(&memcg->res, NULL); 6249 res_counter_init(&memcg->memsw, NULL); 6250 res_counter_init(&memcg->kmem, NULL); 6251 } 6252 6253 memcg->last_scanned_node = MAX_NUMNODES; 6254 INIT_LIST_HEAD(&memcg->oom_notify); 6255 memcg->move_charge_at_immigrate = 0; 6256 mutex_init(&memcg->thresholds_lock); 6257 spin_lock_init(&memcg->move_lock); 6258 vmpressure_init(&memcg->vmpressure); 6259 6260 return &memcg->css; 6261 6262 free_out: 6263 __mem_cgroup_free(memcg); 6264 return ERR_PTR(error); 6265 } 6266 6267 static int 6268 mem_cgroup_css_online(struct cgroup *cont) 6269 { 6270 struct mem_cgroup *memcg, *parent; 6271 int error = 0; 6272 6273 if (!cont->parent) 6274 return 0; 6275 6276 mutex_lock(&memcg_create_mutex); 6277 memcg = mem_cgroup_from_cont(cont); 6278 parent = mem_cgroup_from_cont(cont->parent); 6279 6280 memcg->use_hierarchy = parent->use_hierarchy; 6281 memcg->oom_kill_disable = parent->oom_kill_disable; 6282 memcg->swappiness = mem_cgroup_swappiness(parent); 6283 6284 if (parent->use_hierarchy) { 6285 res_counter_init(&memcg->res, &parent->res); 6286 res_counter_init(&memcg->memsw, &parent->memsw); 6287 res_counter_init(&memcg->kmem, &parent->kmem); 6288 6289 /* 6290 * No need to take a reference to the parent because cgroup 6291 * core guarantees its existence. 6292 */ 6293 } else { 6294 res_counter_init(&memcg->res, NULL); 6295 res_counter_init(&memcg->memsw, NULL); 6296 res_counter_init(&memcg->kmem, NULL); 6297 /* 6298 * Deeper hierachy with use_hierarchy == false doesn't make 6299 * much sense so let cgroup subsystem know about this 6300 * unfortunate state in our controller. 6301 */ 6302 if (parent != root_mem_cgroup) 6303 mem_cgroup_subsys.broken_hierarchy = true; 6304 } 6305 6306 error = memcg_init_kmem(memcg, &mem_cgroup_subsys); 6307 mutex_unlock(&memcg_create_mutex); 6308 return error; 6309 } 6310 6311 /* 6312 * Announce all parents that a group from their hierarchy is gone. 
6313 */ 6314 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) 6315 { 6316 struct mem_cgroup *parent = memcg; 6317 6318 while ((parent = parent_mem_cgroup(parent))) 6319 mem_cgroup_iter_invalidate(parent); 6320 6321 /* 6322 * if the root memcg is not hierarchical we have to check it 6323 * explicitely. 6324 */ 6325 if (!root_mem_cgroup->use_hierarchy) 6326 mem_cgroup_iter_invalidate(root_mem_cgroup); 6327 } 6328 6329 static void mem_cgroup_css_offline(struct cgroup *cont) 6330 { 6331 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6332 6333 kmem_cgroup_css_offline(memcg); 6334 6335 mem_cgroup_invalidate_reclaim_iterators(memcg); 6336 mem_cgroup_reparent_charges(memcg); 6337 mem_cgroup_destroy_all_caches(memcg); 6338 } 6339 6340 static void mem_cgroup_css_free(struct cgroup *cont) 6341 { 6342 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6343 6344 memcg_destroy_kmem(memcg); 6345 __mem_cgroup_free(memcg); 6346 } 6347 6348 #ifdef CONFIG_MMU 6349 /* Handlers for move charge at task migration. */ 6350 #define PRECHARGE_COUNT_AT_ONCE 256 6351 static int mem_cgroup_do_precharge(unsigned long count) 6352 { 6353 int ret = 0; 6354 int batch_count = PRECHARGE_COUNT_AT_ONCE; 6355 struct mem_cgroup *memcg = mc.to; 6356 6357 if (mem_cgroup_is_root(memcg)) { 6358 mc.precharge += count; 6359 /* we don't need css_get for root */ 6360 return ret; 6361 } 6362 /* try to charge at once */ 6363 if (count > 1) { 6364 struct res_counter *dummy; 6365 /* 6366 * "memcg" cannot be under rmdir() because we've already checked 6367 * by cgroup_lock_live_cgroup() that it is not removed and we 6368 * are still under the same cgroup_mutex. So we can postpone 6369 * css_get(). 6370 */ 6371 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) 6372 goto one_by_one; 6373 if (do_swap_account && res_counter_charge(&memcg->memsw, 6374 PAGE_SIZE * count, &dummy)) { 6375 res_counter_uncharge(&memcg->res, PAGE_SIZE * count); 6376 goto one_by_one; 6377 } 6378 mc.precharge += count; 6379 return ret; 6380 } 6381 one_by_one: 6382 /* fall back to one by one charge */ 6383 while (count--) { 6384 if (signal_pending(current)) { 6385 ret = -EINTR; 6386 break; 6387 } 6388 if (!batch_count--) { 6389 batch_count = PRECHARGE_COUNT_AT_ONCE; 6390 cond_resched(); 6391 } 6392 ret = __mem_cgroup_try_charge(NULL, 6393 GFP_KERNEL, 1, &memcg, false); 6394 if (ret) 6395 /* mem_cgroup_clear_mc() will do uncharge later */ 6396 return ret; 6397 mc.precharge++; 6398 } 6399 return ret; 6400 } 6401 6402 /** 6403 * get_mctgt_type - get target type of moving charge 6404 * @vma: the vma the pte to be checked belongs 6405 * @addr: the address corresponding to the pte to be checked 6406 * @ptent: the pte to be checked 6407 * @target: the pointer the target page or swap ent will be stored(can be NULL) 6408 * 6409 * Returns 6410 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 6411 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 6412 * move charge. if @target is not NULL, the page is stored in target->page 6413 * with extra refcnt got(Callers should handle it). 6414 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 6415 * target for charge migration. if @target is not NULL, the entry is stored 6416 * in target->ent. 6417 * 6418 * Called with pte lock held. 
6419 */ 6420 union mc_target { 6421 struct page *page; 6422 swp_entry_t ent; 6423 }; 6424 6425 enum mc_target_type { 6426 MC_TARGET_NONE = 0, 6427 MC_TARGET_PAGE, 6428 MC_TARGET_SWAP, 6429 }; 6430 6431 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 6432 unsigned long addr, pte_t ptent) 6433 { 6434 struct page *page = vm_normal_page(vma, addr, ptent); 6435 6436 if (!page || !page_mapped(page)) 6437 return NULL; 6438 if (PageAnon(page)) { 6439 /* we don't move shared anon */ 6440 if (!move_anon()) 6441 return NULL; 6442 } else if (!move_file()) 6443 /* we ignore mapcount for file pages */ 6444 return NULL; 6445 if (!get_page_unless_zero(page)) 6446 return NULL; 6447 6448 return page; 6449 } 6450 6451 #ifdef CONFIG_SWAP 6452 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6453 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6454 { 6455 struct page *page = NULL; 6456 swp_entry_t ent = pte_to_swp_entry(ptent); 6457 6458 if (!move_anon() || non_swap_entry(ent)) 6459 return NULL; 6460 /* 6461 * Because lookup_swap_cache() updates some statistics counter, 6462 * we call find_get_page() with swapper_space directly. 6463 */ 6464 page = find_get_page(swap_address_space(ent), ent.val); 6465 if (do_swap_account) 6466 entry->val = ent.val; 6467 6468 return page; 6469 } 6470 #else 6471 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6472 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6473 { 6474 return NULL; 6475 } 6476 #endif 6477 6478 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 6479 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6480 { 6481 struct page *page = NULL; 6482 struct address_space *mapping; 6483 pgoff_t pgoff; 6484 6485 if (!vma->vm_file) /* anonymous vma */ 6486 return NULL; 6487 if (!move_file()) 6488 return NULL; 6489 6490 mapping = vma->vm_file->f_mapping; 6491 if (pte_none(ptent)) 6492 pgoff = linear_page_index(vma, addr); 6493 else /* pte_file(ptent) is true */ 6494 pgoff = pte_to_pgoff(ptent); 6495 6496 /* page is moved even if it's not RSS of this task(page-faulted). */ 6497 page = find_get_page(mapping, pgoff); 6498 6499 #ifdef CONFIG_SWAP 6500 /* shmem/tmpfs may report page out on swap: account for that too. */ 6501 if (radix_tree_exceptional_entry(page)) { 6502 swp_entry_t swap = radix_to_swp_entry(page); 6503 if (do_swap_account) 6504 *entry = swap; 6505 page = find_get_page(swap_address_space(swap), swap.val); 6506 } 6507 #endif 6508 return page; 6509 } 6510 6511 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 6512 unsigned long addr, pte_t ptent, union mc_target *target) 6513 { 6514 struct page *page = NULL; 6515 struct page_cgroup *pc; 6516 enum mc_target_type ret = MC_TARGET_NONE; 6517 swp_entry_t ent = { .val = 0 }; 6518 6519 if (pte_present(ptent)) 6520 page = mc_handle_present_pte(vma, addr, ptent); 6521 else if (is_swap_pte(ptent)) 6522 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 6523 else if (pte_none(ptent) || pte_file(ptent)) 6524 page = mc_handle_file_pte(vma, addr, ptent, &ent); 6525 6526 if (!page && !ent.val) 6527 return ret; 6528 if (page) { 6529 pc = lookup_page_cgroup(page); 6530 /* 6531 * Do only loose check w/o page_cgroup lock. 6532 * mem_cgroup_move_account() checks the pc is valid or not under 6533 * the lock. 
6534 */ 6535 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6536 ret = MC_TARGET_PAGE; 6537 if (target) 6538 target->page = page; 6539 } 6540 if (!ret || !target) 6541 put_page(page); 6542 } 6543 /* There is a swap entry and a page doesn't exist or isn't charged */ 6544 if (ent.val && !ret && 6545 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { 6546 ret = MC_TARGET_SWAP; 6547 if (target) 6548 target->ent = ent; 6549 } 6550 return ret; 6551 } 6552 6553 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6554 /* 6555 * We don't consider swapping or file mapped pages because THP does not 6556 * support them for now. 6557 * Caller should make sure that pmd_trans_huge(pmd) is true. 6558 */ 6559 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6560 unsigned long addr, pmd_t pmd, union mc_target *target) 6561 { 6562 struct page *page = NULL; 6563 struct page_cgroup *pc; 6564 enum mc_target_type ret = MC_TARGET_NONE; 6565 6566 page = pmd_page(pmd); 6567 VM_BUG_ON(!page || !PageHead(page)); 6568 if (!move_anon()) 6569 return ret; 6570 pc = lookup_page_cgroup(page); 6571 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6572 ret = MC_TARGET_PAGE; 6573 if (target) { 6574 get_page(page); 6575 target->page = page; 6576 } 6577 } 6578 return ret; 6579 } 6580 #else 6581 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6582 unsigned long addr, pmd_t pmd, union mc_target *target) 6583 { 6584 return MC_TARGET_NONE; 6585 } 6586 #endif 6587 6588 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 6589 unsigned long addr, unsigned long end, 6590 struct mm_walk *walk) 6591 { 6592 struct vm_area_struct *vma = walk->private; 6593 pte_t *pte; 6594 spinlock_t *ptl; 6595 6596 if (pmd_trans_huge_lock(pmd, vma) == 1) { 6597 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 6598 mc.precharge += HPAGE_PMD_NR; 6599 spin_unlock(&vma->vm_mm->page_table_lock); 6600 return 0; 6601 } 6602 6603 if (pmd_trans_unstable(pmd)) 6604 return 0; 6605 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6606 for (; addr != end; pte++, addr += PAGE_SIZE) 6607 if (get_mctgt_type(vma, addr, *pte, NULL)) 6608 mc.precharge++; /* increment precharge temporarily */ 6609 pte_unmap_unlock(pte - 1, ptl); 6610 cond_resched(); 6611 6612 return 0; 6613 } 6614 6615 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 6616 { 6617 unsigned long precharge; 6618 struct vm_area_struct *vma; 6619 6620 down_read(&mm->mmap_sem); 6621 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6622 struct mm_walk mem_cgroup_count_precharge_walk = { 6623 .pmd_entry = mem_cgroup_count_precharge_pte_range, 6624 .mm = mm, 6625 .private = vma, 6626 }; 6627 if (is_vm_hugetlb_page(vma)) 6628 continue; 6629 walk_page_range(vma->vm_start, vma->vm_end, 6630 &mem_cgroup_count_precharge_walk); 6631 } 6632 up_read(&mm->mmap_sem); 6633 6634 precharge = mc.precharge; 6635 mc.precharge = 0; 6636 6637 return precharge; 6638 } 6639 6640 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 6641 { 6642 unsigned long precharge = mem_cgroup_count_precharge(mm); 6643 6644 VM_BUG_ON(mc.moving_task); 6645 mc.moving_task = current; 6646 return mem_cgroup_do_precharge(precharge); 6647 } 6648 6649 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. 
 */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;
	int i;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fix up refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			res_counter_uncharge(&mc.from->memsw,
					     PAGE_SIZE * mc.moved_swap);

		for (i = 0; i < mc.moved_swap; i++)
			css_put(&mc.from->css);

		if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * we charged both to->res and to->memsw, so we should
			 * uncharge to->res.
			 */
			res_counter_uncharge(&mc.to->res,
					     PAGE_SIZE * mc.moved_swap);
		}
		/* we've already done css_get(mc.to) */
		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

static void mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	spin_unlock(&mc.lock);
	mem_cgroup_end_move(from);
}

static int mem_cgroup_can_attach(struct cgroup *cgroup,
				 struct cgroup_taskset *tset)
{
	struct task_struct *p = cgroup_taskset_first(tset);
	int ret = 0;
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
	unsigned long move_charge_at_immigrate;

	/*
	 * We are now committed to this value, whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
6724 */ 6725 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 6726 if (move_charge_at_immigrate) { 6727 struct mm_struct *mm; 6728 struct mem_cgroup *from = mem_cgroup_from_task(p); 6729 6730 VM_BUG_ON(from == memcg); 6731 6732 mm = get_task_mm(p); 6733 if (!mm) 6734 return 0; 6735 /* We move charges only when we move a owner of the mm */ 6736 if (mm->owner == p) { 6737 VM_BUG_ON(mc.from); 6738 VM_BUG_ON(mc.to); 6739 VM_BUG_ON(mc.precharge); 6740 VM_BUG_ON(mc.moved_charge); 6741 VM_BUG_ON(mc.moved_swap); 6742 mem_cgroup_start_move(from); 6743 spin_lock(&mc.lock); 6744 mc.from = from; 6745 mc.to = memcg; 6746 mc.immigrate_flags = move_charge_at_immigrate; 6747 spin_unlock(&mc.lock); 6748 /* We set mc.moving_task later */ 6749 6750 ret = mem_cgroup_precharge_mc(mm); 6751 if (ret) 6752 mem_cgroup_clear_mc(); 6753 } 6754 mmput(mm); 6755 } 6756 return ret; 6757 } 6758 6759 static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 6760 struct cgroup_taskset *tset) 6761 { 6762 mem_cgroup_clear_mc(); 6763 } 6764 6765 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6766 unsigned long addr, unsigned long end, 6767 struct mm_walk *walk) 6768 { 6769 int ret = 0; 6770 struct vm_area_struct *vma = walk->private; 6771 pte_t *pte; 6772 spinlock_t *ptl; 6773 enum mc_target_type target_type; 6774 union mc_target target; 6775 struct page *page; 6776 struct page_cgroup *pc; 6777 6778 /* 6779 * We don't take compound_lock() here but no race with splitting thp 6780 * happens because: 6781 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 6782 * under splitting, which means there's no concurrent thp split, 6783 * - if another thread runs into split_huge_page() just after we 6784 * entered this if-block, the thread must wait for page table lock 6785 * to be unlocked in __split_huge_page_splitting(), where the main 6786 * part of thp split is not executed yet. 6787 */ 6788 if (pmd_trans_huge_lock(pmd, vma) == 1) { 6789 if (mc.precharge < HPAGE_PMD_NR) { 6790 spin_unlock(&vma->vm_mm->page_table_lock); 6791 return 0; 6792 } 6793 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6794 if (target_type == MC_TARGET_PAGE) { 6795 page = target.page; 6796 if (!isolate_lru_page(page)) { 6797 pc = lookup_page_cgroup(page); 6798 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 6799 pc, mc.from, mc.to)) { 6800 mc.precharge -= HPAGE_PMD_NR; 6801 mc.moved_charge += HPAGE_PMD_NR; 6802 } 6803 putback_lru_page(page); 6804 } 6805 put_page(page); 6806 } 6807 spin_unlock(&vma->vm_mm->page_table_lock); 6808 return 0; 6809 } 6810 6811 if (pmd_trans_unstable(pmd)) 6812 return 0; 6813 retry: 6814 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6815 for (; addr != end; addr += PAGE_SIZE) { 6816 pte_t ptent = *(pte++); 6817 swp_entry_t ent; 6818 6819 if (!mc.precharge) 6820 break; 6821 6822 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6823 case MC_TARGET_PAGE: 6824 page = target.page; 6825 if (isolate_lru_page(page)) 6826 goto put; 6827 pc = lookup_page_cgroup(page); 6828 if (!mem_cgroup_move_account(page, 1, pc, 6829 mc.from, mc.to)) { 6830 mc.precharge--; 6831 /* we uncharge from mc.from later. */ 6832 mc.moved_charge++; 6833 } 6834 putback_lru_page(page); 6835 put: /* get_mctgt_type() gets the page */ 6836 put_page(page); 6837 break; 6838 case MC_TARGET_SWAP: 6839 ent = target.ent; 6840 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 6841 mc.precharge--; 6842 /* we fixup refcnts and charges later. 
*/ 6843 mc.moved_swap++; 6844 } 6845 break; 6846 default: 6847 break; 6848 } 6849 } 6850 pte_unmap_unlock(pte - 1, ptl); 6851 cond_resched(); 6852 6853 if (addr != end) { 6854 /* 6855 * We have consumed all precharges we got in can_attach(). 6856 * We try charge one by one, but don't do any additional 6857 * charges to mc.to if we have failed in charge once in attach() 6858 * phase. 6859 */ 6860 ret = mem_cgroup_do_precharge(1); 6861 if (!ret) 6862 goto retry; 6863 } 6864 6865 return ret; 6866 } 6867 6868 static void mem_cgroup_move_charge(struct mm_struct *mm) 6869 { 6870 struct vm_area_struct *vma; 6871 6872 lru_add_drain_all(); 6873 retry: 6874 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 6875 /* 6876 * Someone who are holding the mmap_sem might be waiting in 6877 * waitq. So we cancel all extra charges, wake up all waiters, 6878 * and retry. Because we cancel precharges, we might not be able 6879 * to move enough charges, but moving charge is a best-effort 6880 * feature anyway, so it wouldn't be a big problem. 6881 */ 6882 __mem_cgroup_clear_mc(); 6883 cond_resched(); 6884 goto retry; 6885 } 6886 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6887 int ret; 6888 struct mm_walk mem_cgroup_move_charge_walk = { 6889 .pmd_entry = mem_cgroup_move_charge_pte_range, 6890 .mm = mm, 6891 .private = vma, 6892 }; 6893 if (is_vm_hugetlb_page(vma)) 6894 continue; 6895 ret = walk_page_range(vma->vm_start, vma->vm_end, 6896 &mem_cgroup_move_charge_walk); 6897 if (ret) 6898 /* 6899 * means we have consumed all precharges and failed in 6900 * doing additional charge. Just abandon here. 6901 */ 6902 break; 6903 } 6904 up_read(&mm->mmap_sem); 6905 } 6906 6907 static void mem_cgroup_move_task(struct cgroup *cont, 6908 struct cgroup_taskset *tset) 6909 { 6910 struct task_struct *p = cgroup_taskset_first(tset); 6911 struct mm_struct *mm = get_task_mm(p); 6912 6913 if (mm) { 6914 if (mc.to) 6915 mem_cgroup_move_charge(mm); 6916 mmput(mm); 6917 } 6918 if (mc.to) 6919 mem_cgroup_clear_mc(); 6920 } 6921 #else /* !CONFIG_MMU */ 6922 static int mem_cgroup_can_attach(struct cgroup *cgroup, 6923 struct cgroup_taskset *tset) 6924 { 6925 return 0; 6926 } 6927 static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 6928 struct cgroup_taskset *tset) 6929 { 6930 } 6931 static void mem_cgroup_move_task(struct cgroup *cont, 6932 struct cgroup_taskset *tset) 6933 { 6934 } 6935 #endif 6936 6937 /* 6938 * Cgroup retains root cgroups across [un]mount cycles making it necessary 6939 * to verify sane_behavior flag on each mount attempt. 6940 */ 6941 static void mem_cgroup_bind(struct cgroup *root) 6942 { 6943 /* 6944 * use_hierarchy is forced with sane_behavior. cgroup core 6945 * guarantees that @root doesn't have any children, so turning it 6946 * on for the root memcg is enough. 
6947 */ 6948 if (cgroup_sane_behavior(root)) 6949 mem_cgroup_from_cont(root)->use_hierarchy = true; 6950 } 6951 6952 struct cgroup_subsys mem_cgroup_subsys = { 6953 .name = "memory", 6954 .subsys_id = mem_cgroup_subsys_id, 6955 .css_alloc = mem_cgroup_css_alloc, 6956 .css_online = mem_cgroup_css_online, 6957 .css_offline = mem_cgroup_css_offline, 6958 .css_free = mem_cgroup_css_free, 6959 .can_attach = mem_cgroup_can_attach, 6960 .cancel_attach = mem_cgroup_cancel_attach, 6961 .attach = mem_cgroup_move_task, 6962 .bind = mem_cgroup_bind, 6963 .base_cftypes = mem_cgroup_files, 6964 .early_init = 0, 6965 .use_id = 1, 6966 }; 6967 6968 #ifdef CONFIG_MEMCG_SWAP 6969 static int __init enable_swap_account(char *s) 6970 { 6971 /* consider enabled if no parameter or 1 is given */ 6972 if (!strcmp(s, "1")) 6973 really_do_swap_account = 1; 6974 else if (!strcmp(s, "0")) 6975 really_do_swap_account = 0; 6976 return 1; 6977 } 6978 __setup("swapaccount=", enable_swap_account); 6979 6980 static void __init memsw_file_init(void) 6981 { 6982 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); 6983 } 6984 6985 static void __init enable_swap_cgroup(void) 6986 { 6987 if (!mem_cgroup_disabled() && really_do_swap_account) { 6988 do_swap_account = 1; 6989 memsw_file_init(); 6990 } 6991 } 6992 6993 #else 6994 static void __init enable_swap_cgroup(void) 6995 { 6996 } 6997 #endif 6998 6999 /* 7000 * subsys_initcall() for memory controller. 7001 * 7002 * Some parts like hotcpu_notifier() have to be initialized from this context 7003 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 7004 * everything that doesn't depend on a specific mem_cgroup structure should 7005 * be initialized from here. 7006 */ 7007 static int __init mem_cgroup_init(void) 7008 { 7009 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 7010 enable_swap_cgroup(); 7011 mem_cgroup_soft_limit_tree_init(); 7012 memcg_stock_init(); 7013 return 0; 7014 } 7015 subsys_initcall(mem_cgroup_init); 7016