1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * Kernel Memory Controller 14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 15 * Authors: Glauber Costa and Suleiman Souhlal 16 * 17 * This program is free software; you can redistribute it and/or modify 18 * it under the terms of the GNU General Public License as published by 19 * the Free Software Foundation; either version 2 of the License, or 20 * (at your option) any later version. 21 * 22 * This program is distributed in the hope that it will be useful, 23 * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 * GNU General Public License for more details. 26 */ 27 28 #include <linux/page_counter.h> 29 #include <linux/memcontrol.h> 30 #include <linux/cgroup.h> 31 #include <linux/mm.h> 32 #include <linux/hugetlb.h> 33 #include <linux/pagemap.h> 34 #include <linux/smp.h> 35 #include <linux/page-flags.h> 36 #include <linux/backing-dev.h> 37 #include <linux/bit_spinlock.h> 38 #include <linux/rcupdate.h> 39 #include <linux/limits.h> 40 #include <linux/export.h> 41 #include <linux/mutex.h> 42 #include <linux/rbtree.h> 43 #include <linux/slab.h> 44 #include <linux/swap.h> 45 #include <linux/swapops.h> 46 #include <linux/spinlock.h> 47 #include <linux/eventfd.h> 48 #include <linux/poll.h> 49 #include <linux/sort.h> 50 #include <linux/fs.h> 51 #include <linux/seq_file.h> 52 #include <linux/vmpressure.h> 53 #include <linux/mm_inline.h> 54 #include <linux/swap_cgroup.h> 55 #include <linux/cpu.h> 56 #include <linux/oom.h> 57 #include <linux/lockdep.h> 58 #include <linux/file.h> 59 #include "internal.h" 60 #include <net/sock.h> 61 #include <net/ip.h> 62 #include <net/tcp_memcontrol.h> 63 #include "slab.h" 64 65 #include <asm/uaccess.h> 66 67 #include <trace/events/vmscan.h> 68 69 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 70 EXPORT_SYMBOL(memory_cgrp_subsys); 71 72 #define MEM_CGROUP_RECLAIM_RETRIES 5 73 static struct mem_cgroup *root_mem_cgroup __read_mostly; 74 75 #ifdef CONFIG_MEMCG_SWAP 76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 77 int do_swap_account __read_mostly; 78 79 /* for remember boot option*/ 80 #ifdef CONFIG_MEMCG_SWAP_ENABLED 81 static int really_do_swap_account __initdata = 1; 82 #else 83 static int really_do_swap_account __initdata; 84 #endif 85 86 #else 87 #define do_swap_account 0 88 #endif 89 90 91 static const char * const mem_cgroup_stat_names[] = { 92 "cache", 93 "rss", 94 "rss_huge", 95 "mapped_file", 96 "writeback", 97 "swap", 98 }; 99 100 enum mem_cgroup_events_index { 101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 105 MEM_CGROUP_EVENTS_NSTATS, 106 }; 107 108 static const char * const mem_cgroup_events_names[] = { 109 "pgpgin", 110 "pgpgout", 111 "pgfault", 112 "pgmajfault", 113 }; 114 115 static const char * const mem_cgroup_lru_names[] = { 116 "inactive_anon", 117 "active_anon", 118 "inactive_file", 119 "active_file", 120 "unevictable", 121 }; 122 123 /* 124 * Per memcg event counter is incremented at every pagein/pageout. 
With THP, 125 * it will be incremated by the number of pages. This counter is used for 126 * for trigger some periodic events. This is straightforward and better 127 * than using jiffies etc. to handle periodic memcg event. 128 */ 129 enum mem_cgroup_events_target { 130 MEM_CGROUP_TARGET_THRESH, 131 MEM_CGROUP_TARGET_SOFTLIMIT, 132 MEM_CGROUP_TARGET_NUMAINFO, 133 MEM_CGROUP_NTARGETS, 134 }; 135 #define THRESHOLDS_EVENTS_TARGET 128 136 #define SOFTLIMIT_EVENTS_TARGET 1024 137 #define NUMAINFO_EVENTS_TARGET 1024 138 139 struct mem_cgroup_stat_cpu { 140 long count[MEM_CGROUP_STAT_NSTATS]; 141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 142 unsigned long nr_page_events; 143 unsigned long targets[MEM_CGROUP_NTARGETS]; 144 }; 145 146 struct reclaim_iter { 147 struct mem_cgroup *position; 148 /* scan generation, increased every round-trip */ 149 unsigned int generation; 150 }; 151 152 /* 153 * per-zone information in memory controller. 154 */ 155 struct mem_cgroup_per_zone { 156 struct lruvec lruvec; 157 unsigned long lru_size[NR_LRU_LISTS]; 158 159 struct reclaim_iter iter[DEF_PRIORITY + 1]; 160 161 struct rb_node tree_node; /* RB tree node */ 162 unsigned long usage_in_excess;/* Set to the value by which */ 163 /* the soft limit is exceeded*/ 164 bool on_tree; 165 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 166 /* use container_of */ 167 }; 168 169 struct mem_cgroup_per_node { 170 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 171 }; 172 173 /* 174 * Cgroups above their limits are maintained in a RB-Tree, independent of 175 * their hierarchy representation 176 */ 177 178 struct mem_cgroup_tree_per_zone { 179 struct rb_root rb_root; 180 spinlock_t lock; 181 }; 182 183 struct mem_cgroup_tree_per_node { 184 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 185 }; 186 187 struct mem_cgroup_tree { 188 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 189 }; 190 191 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 192 193 struct mem_cgroup_threshold { 194 struct eventfd_ctx *eventfd; 195 unsigned long threshold; 196 }; 197 198 /* For threshold */ 199 struct mem_cgroup_threshold_ary { 200 /* An array index points to threshold just below or equal to usage. */ 201 int current_threshold; 202 /* Size of entries[] */ 203 unsigned int size; 204 /* Array of thresholds */ 205 struct mem_cgroup_threshold entries[0]; 206 }; 207 208 struct mem_cgroup_thresholds { 209 /* Primary thresholds array */ 210 struct mem_cgroup_threshold_ary *primary; 211 /* 212 * Spare threshold array. 213 * This is needed to make mem_cgroup_unregister_event() "never fail". 214 * It must be able to store at least primary->size - 1 entries. 215 */ 216 struct mem_cgroup_threshold_ary *spare; 217 }; 218 219 /* for OOM */ 220 struct mem_cgroup_eventfd_list { 221 struct list_head list; 222 struct eventfd_ctx *eventfd; 223 }; 224 225 /* 226 * cgroup_event represents events which userspace want to receive. 227 */ 228 struct mem_cgroup_event { 229 /* 230 * memcg which the event belongs to. 231 */ 232 struct mem_cgroup *memcg; 233 /* 234 * eventfd to signal userspace about the event. 235 */ 236 struct eventfd_ctx *eventfd; 237 /* 238 * Each of these stored in a list by the cgroup. 239 */ 240 struct list_head list; 241 /* 242 * register_event() callback will be used to add new userspace 243 * waiter for changes related to this event. Use eventfd_signal() 244 * on eventfd to send notification to userspace. 
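 *
 * Roughly, one of these is created when userspace writes
 * "<event_fd> <target_fd> [args]" to cgroup.event_control; the
 * target control file supplies this callback (e.g. the usage
 * threshold and OOM-control handlers later in this file).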
245 */ 246 int (*register_event)(struct mem_cgroup *memcg, 247 struct eventfd_ctx *eventfd, const char *args); 248 /* 249 * unregister_event() callback will be called when userspace closes 250 * the eventfd or on cgroup removing. This callback must be set, 251 * if you want provide notification functionality. 252 */ 253 void (*unregister_event)(struct mem_cgroup *memcg, 254 struct eventfd_ctx *eventfd); 255 /* 256 * All fields below needed to unregister event when 257 * userspace closes eventfd. 258 */ 259 poll_table pt; 260 wait_queue_head_t *wqh; 261 wait_queue_t wait; 262 struct work_struct remove; 263 }; 264 265 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 266 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 267 268 /* 269 * The memory controller data structure. The memory controller controls both 270 * page cache and RSS per cgroup. We would eventually like to provide 271 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 272 * to help the administrator determine what knobs to tune. 273 * 274 * TODO: Add a water mark for the memory controller. Reclaim will begin when 275 * we hit the water mark. May be even add a low water mark, such that 276 * no reclaim occurs from a cgroup at it's low water mark, this is 277 * a feature that will be implemented much later in the future. 278 */ 279 struct mem_cgroup { 280 struct cgroup_subsys_state css; 281 282 /* Accounted resources */ 283 struct page_counter memory; 284 struct page_counter memsw; 285 struct page_counter kmem; 286 287 unsigned long soft_limit; 288 289 /* vmpressure notifications */ 290 struct vmpressure vmpressure; 291 292 /* css_online() has been completed */ 293 int initialized; 294 295 /* 296 * Should the accounting and control be hierarchical, per subtree? 297 */ 298 bool use_hierarchy; 299 300 bool oom_lock; 301 atomic_t under_oom; 302 atomic_t oom_wakeups; 303 304 int swappiness; 305 /* OOM-Killer disable */ 306 int oom_kill_disable; 307 308 /* protect arrays of thresholds */ 309 struct mutex thresholds_lock; 310 311 /* thresholds for memory usage. RCU-protected */ 312 struct mem_cgroup_thresholds thresholds; 313 314 /* thresholds for mem+swap usage. RCU-protected */ 315 struct mem_cgroup_thresholds memsw_thresholds; 316 317 /* For oom notifier event fd */ 318 struct list_head oom_notify; 319 320 /* 321 * Should we move charges of a task when a task is moved into this 322 * mem_cgroup ? And what type of charges should we move ? 323 */ 324 unsigned long move_charge_at_immigrate; 325 /* 326 * set > 0 if pages under this cgroup are moving to other cgroup. 327 */ 328 atomic_t moving_account; 329 /* taken only while moving_account > 0 */ 330 spinlock_t move_lock; 331 /* 332 * percpu counter. 333 */ 334 struct mem_cgroup_stat_cpu __percpu *stat; 335 /* 336 * used when a cpu is offlined or other synchronizations 337 * See mem_cgroup_read_stat(). 
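 *
 * Counts from CPUs that have gone offline are folded into
 * nocpu_base under pcp_counter_lock, and readers add them back in
 * (see the CONFIG_HOTPLUG_CPU branch of mem_cgroup_read_stat()).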
338 */ 339 struct mem_cgroup_stat_cpu nocpu_base; 340 spinlock_t pcp_counter_lock; 341 342 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 343 struct cg_proto tcp_mem; 344 #endif 345 #if defined(CONFIG_MEMCG_KMEM) 346 /* analogous to slab_common's slab_caches list, but per-memcg; 347 * protected by memcg_slab_mutex */ 348 struct list_head memcg_slab_caches; 349 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 350 int kmemcg_id; 351 #endif 352 353 int last_scanned_node; 354 #if MAX_NUMNODES > 1 355 nodemask_t scan_nodes; 356 atomic_t numainfo_events; 357 atomic_t numainfo_updating; 358 #endif 359 360 /* List of events which userspace want to receive */ 361 struct list_head event_list; 362 spinlock_t event_list_lock; 363 364 struct mem_cgroup_per_node *nodeinfo[0]; 365 /* WARNING: nodeinfo must be the last member here */ 366 }; 367 368 #ifdef CONFIG_MEMCG_KMEM 369 static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 370 { 371 return memcg->kmemcg_id >= 0; 372 } 373 #endif 374 375 /* Stuffs for move charges at task migration. */ 376 /* 377 * Types of charges to be moved. "move_charge_at_immitgrate" and 378 * "immigrate_flags" are treated as a left-shifted bitmap of these types. 379 */ 380 enum move_type { 381 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 382 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 383 NR_MOVE_TYPE, 384 }; 385 386 /* "mc" and its members are protected by cgroup_mutex */ 387 static struct move_charge_struct { 388 spinlock_t lock; /* for from, to */ 389 struct mem_cgroup *from; 390 struct mem_cgroup *to; 391 unsigned long immigrate_flags; 392 unsigned long precharge; 393 unsigned long moved_charge; 394 unsigned long moved_swap; 395 struct task_struct *moving_task; /* a task moving charges */ 396 wait_queue_head_t waitq; /* a waitq for other context */ 397 } mc = { 398 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 399 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 400 }; 401 402 static bool move_anon(void) 403 { 404 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); 405 } 406 407 static bool move_file(void) 408 { 409 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); 410 } 411 412 /* 413 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 414 * limit reclaim to prevent infinite loops, if they ever occur. 415 */ 416 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 417 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 418 419 enum charge_type { 420 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 421 MEM_CGROUP_CHARGE_TYPE_ANON, 422 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 423 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 424 NR_CHARGE_TYPE, 425 }; 426 427 /* for encoding cft->private value on file */ 428 enum res_type { 429 _MEM, 430 _MEMSWAP, 431 _OOM_TYPE, 432 _KMEM, 433 }; 434 435 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 436 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 437 #define MEMFILE_ATTR(val) ((val) & 0xffff) 438 /* Used for OOM nofiier */ 439 #define OOM_CONTROL (0) 440 441 /* 442 * The memcg_create_mutex will be held whenever a new cgroup is created. 443 * As a consequence, any change that needs to protect against new child cgroups 444 * appearing has to hold it as well. 445 */ 446 static DEFINE_MUTEX(memcg_create_mutex); 447 448 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 449 { 450 return s ? container_of(s, struct mem_cgroup, css) : NULL; 451 } 452 453 /* Some nice accessors for the vmpressure. 
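 * memcg_to_vmpressure() falls back to the root memcg when passed
 * NULL, and vmpressure_to_css() maps back via container_of().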
*/ 454 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 455 { 456 if (!memcg) 457 memcg = root_mem_cgroup; 458 return &memcg->vmpressure; 459 } 460 461 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 462 { 463 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 464 } 465 466 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 467 { 468 return (memcg == root_mem_cgroup); 469 } 470 471 /* 472 * We restrict the id in the range of [1, 65535], so it can fit into 473 * an unsigned short. 474 */ 475 #define MEM_CGROUP_ID_MAX USHRT_MAX 476 477 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 478 { 479 return memcg->css.id; 480 } 481 482 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 483 { 484 struct cgroup_subsys_state *css; 485 486 css = css_from_id(id, &memory_cgrp_subsys); 487 return mem_cgroup_from_css(css); 488 } 489 490 /* Writing them here to avoid exposing memcg's inner layout */ 491 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 492 493 void sock_update_memcg(struct sock *sk) 494 { 495 if (mem_cgroup_sockets_enabled) { 496 struct mem_cgroup *memcg; 497 struct cg_proto *cg_proto; 498 499 BUG_ON(!sk->sk_prot->proto_cgroup); 500 501 /* Socket cloning can throw us here with sk_cgrp already 502 * filled. It won't however, necessarily happen from 503 * process context. So the test for root memcg given 504 * the current task's memcg won't help us in this case. 505 * 506 * Respecting the original socket's memcg is a better 507 * decision in this case. 508 */ 509 if (sk->sk_cgrp) { 510 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 511 css_get(&sk->sk_cgrp->memcg->css); 512 return; 513 } 514 515 rcu_read_lock(); 516 memcg = mem_cgroup_from_task(current); 517 cg_proto = sk->sk_prot->proto_cgroup(memcg); 518 if (!mem_cgroup_is_root(memcg) && 519 memcg_proto_active(cg_proto) && 520 css_tryget_online(&memcg->css)) { 521 sk->sk_cgrp = cg_proto; 522 } 523 rcu_read_unlock(); 524 } 525 } 526 EXPORT_SYMBOL(sock_update_memcg); 527 528 void sock_release_memcg(struct sock *sk) 529 { 530 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 531 struct mem_cgroup *memcg; 532 WARN_ON(!sk->sk_cgrp->memcg); 533 memcg = sk->sk_cgrp->memcg; 534 css_put(&sk->sk_cgrp->memcg->css); 535 } 536 } 537 538 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 539 { 540 if (!memcg || mem_cgroup_is_root(memcg)) 541 return NULL; 542 543 return &memcg->tcp_mem; 544 } 545 EXPORT_SYMBOL(tcp_proto_cgroup); 546 547 static void disarm_sock_keys(struct mem_cgroup *memcg) 548 { 549 if (!memcg_proto_activated(&memcg->tcp_mem)) 550 return; 551 static_key_slow_dec(&memcg_socket_limit_enabled); 552 } 553 #else 554 static void disarm_sock_keys(struct mem_cgroup *memcg) 555 { 556 } 557 #endif 558 559 #ifdef CONFIG_MEMCG_KMEM 560 /* 561 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 562 * The main reason for not using cgroup id for this: 563 * this works better in sparse environments, where we have a lot of memcgs, 564 * but only a few kmem-limited. Or also, if we have, for instance, 200 565 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 566 * 200 entry array for that. 567 * 568 * The current size of the caches array is stored in 569 * memcg_limited_groups_array_size. It will double each time we have to 570 * increase it. 
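 *
 * With MEMCG_CACHES_MIN_SIZE of 4 below, the array therefore grows
 * roughly as 4 -> 8 -> 16 -> ... entries as more memcgs become
 * kmem-active, instead of being sized for every possible cgroup id
 * up front.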
571 */ 572 static DEFINE_IDA(kmem_limited_groups); 573 int memcg_limited_groups_array_size; 574 575 /* 576 * MIN_SIZE is different than 1, because we would like to avoid going through 577 * the alloc/free process all the time. In a small machine, 4 kmem-limited 578 * cgroups is a reasonable guess. In the future, it could be a parameter or 579 * tunable, but that is strictly not necessary. 580 * 581 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 582 * this constant directly from cgroup, but it is understandable that this is 583 * better kept as an internal representation in cgroup.c. In any case, the 584 * cgrp_id space is not getting any smaller, and we don't have to necessarily 585 * increase ours as well if it increases. 586 */ 587 #define MEMCG_CACHES_MIN_SIZE 4 588 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 589 590 /* 591 * A lot of the calls to the cache allocation functions are expected to be 592 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are 593 * conditional to this static branch, we'll have to allow modules that does 594 * kmem_cache_alloc and the such to see this symbol as well 595 */ 596 struct static_key memcg_kmem_enabled_key; 597 EXPORT_SYMBOL(memcg_kmem_enabled_key); 598 599 static void memcg_free_cache_id(int id); 600 601 static void disarm_kmem_keys(struct mem_cgroup *memcg) 602 { 603 if (memcg_kmem_is_active(memcg)) { 604 static_key_slow_dec(&memcg_kmem_enabled_key); 605 memcg_free_cache_id(memcg->kmemcg_id); 606 } 607 /* 608 * This check can't live in kmem destruction function, 609 * since the charges will outlive the cgroup 610 */ 611 WARN_ON(page_counter_read(&memcg->kmem)); 612 } 613 #else 614 static void disarm_kmem_keys(struct mem_cgroup *memcg) 615 { 616 } 617 #endif /* CONFIG_MEMCG_KMEM */ 618 619 static void disarm_static_keys(struct mem_cgroup *memcg) 620 { 621 disarm_sock_keys(memcg); 622 disarm_kmem_keys(memcg); 623 } 624 625 static struct mem_cgroup_per_zone * 626 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 627 { 628 int nid = zone_to_nid(zone); 629 int zid = zone_idx(zone); 630 631 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 632 } 633 634 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 635 { 636 return &memcg->css; 637 } 638 639 static struct mem_cgroup_per_zone * 640 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 641 { 642 int nid = page_to_nid(page); 643 int zid = page_zonenum(page); 644 645 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 646 } 647 648 static struct mem_cgroup_tree_per_zone * 649 soft_limit_tree_node_zone(int nid, int zid) 650 { 651 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 652 } 653 654 static struct mem_cgroup_tree_per_zone * 655 soft_limit_tree_from_page(struct page *page) 656 { 657 int nid = page_to_nid(page); 658 int zid = page_zonenum(page); 659 660 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 661 } 662 663 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 664 struct mem_cgroup_tree_per_zone *mctz, 665 unsigned long new_usage_in_excess) 666 { 667 struct rb_node **p = &mctz->rb_root.rb_node; 668 struct rb_node *parent = NULL; 669 struct mem_cgroup_per_zone *mz_node; 670 671 if (mz->on_tree) 672 return; 673 674 mz->usage_in_excess = new_usage_in_excess; 675 if (!mz->usage_in_excess) 676 return; 677 while (*p) { 678 parent = *p; 679 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 680 tree_node); 681 if (mz->usage_in_excess < 
mz_node->usage_in_excess) 682 p = &(*p)->rb_left; 683 /* 684 * We can't avoid mem cgroups that are over their soft 685 * limit by the same amount 686 */ 687 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 688 p = &(*p)->rb_right; 689 } 690 rb_link_node(&mz->tree_node, parent, p); 691 rb_insert_color(&mz->tree_node, &mctz->rb_root); 692 mz->on_tree = true; 693 } 694 695 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 696 struct mem_cgroup_tree_per_zone *mctz) 697 { 698 if (!mz->on_tree) 699 return; 700 rb_erase(&mz->tree_node, &mctz->rb_root); 701 mz->on_tree = false; 702 } 703 704 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 705 struct mem_cgroup_tree_per_zone *mctz) 706 { 707 unsigned long flags; 708 709 spin_lock_irqsave(&mctz->lock, flags); 710 __mem_cgroup_remove_exceeded(mz, mctz); 711 spin_unlock_irqrestore(&mctz->lock, flags); 712 } 713 714 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 715 { 716 unsigned long nr_pages = page_counter_read(&memcg->memory); 717 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); 718 unsigned long excess = 0; 719 720 if (nr_pages > soft_limit) 721 excess = nr_pages - soft_limit; 722 723 return excess; 724 } 725 726 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 727 { 728 unsigned long excess; 729 struct mem_cgroup_per_zone *mz; 730 struct mem_cgroup_tree_per_zone *mctz; 731 732 mctz = soft_limit_tree_from_page(page); 733 /* 734 * Necessary to update all ancestors when hierarchy is used. 735 * because their event counter is not touched. 736 */ 737 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 738 mz = mem_cgroup_page_zoneinfo(memcg, page); 739 excess = soft_limit_excess(memcg); 740 /* 741 * We have to update the tree if mz is on RB-tree or 742 * mem is over its softlimit. 743 */ 744 if (excess || mz->on_tree) { 745 unsigned long flags; 746 747 spin_lock_irqsave(&mctz->lock, flags); 748 /* if on-tree, remove it */ 749 if (mz->on_tree) 750 __mem_cgroup_remove_exceeded(mz, mctz); 751 /* 752 * Insert again. mz->usage_in_excess will be updated. 753 * If excess is 0, no tree ops. 754 */ 755 __mem_cgroup_insert_exceeded(mz, mctz, excess); 756 spin_unlock_irqrestore(&mctz->lock, flags); 757 } 758 } 759 } 760 761 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 762 { 763 struct mem_cgroup_tree_per_zone *mctz; 764 struct mem_cgroup_per_zone *mz; 765 int nid, zid; 766 767 for_each_node(nid) { 768 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 769 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 770 mctz = soft_limit_tree_node_zone(nid, zid); 771 mem_cgroup_remove_exceeded(mz, mctz); 772 } 773 } 774 } 775 776 static struct mem_cgroup_per_zone * 777 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 778 { 779 struct rb_node *rightmost = NULL; 780 struct mem_cgroup_per_zone *mz; 781 782 retry: 783 mz = NULL; 784 rightmost = rb_last(&mctz->rb_root); 785 if (!rightmost) 786 goto done; /* Nothing to reclaim from */ 787 788 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 789 /* 790 * Remove the node now but someone else can add it back, 791 * we will to add it back at the end of reclaim to its correct 792 * position in the tree. 
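 *
 * If the memcg turns out to be no longer over its soft limit, or
 * its css can no longer be pinned with css_tryget_online(), the
 * code below simply retries with the next-rightmost node.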
793 */ 794 __mem_cgroup_remove_exceeded(mz, mctz); 795 if (!soft_limit_excess(mz->memcg) || 796 !css_tryget_online(&mz->memcg->css)) 797 goto retry; 798 done: 799 return mz; 800 } 801 802 static struct mem_cgroup_per_zone * 803 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 804 { 805 struct mem_cgroup_per_zone *mz; 806 807 spin_lock_irq(&mctz->lock); 808 mz = __mem_cgroup_largest_soft_limit_node(mctz); 809 spin_unlock_irq(&mctz->lock); 810 return mz; 811 } 812 813 /* 814 * Implementation Note: reading percpu statistics for memcg. 815 * 816 * Both of vmstat[] and percpu_counter has threshold and do periodic 817 * synchronization to implement "quick" read. There are trade-off between 818 * reading cost and precision of value. Then, we may have a chance to implement 819 * a periodic synchronizion of counter in memcg's counter. 820 * 821 * But this _read() function is used for user interface now. The user accounts 822 * memory usage by memory cgroup and he _always_ requires exact value because 823 * he accounts memory. Even if we provide quick-and-fuzzy read, we always 824 * have to visit all online cpus and make sum. So, for now, unnecessary 825 * synchronization is not implemented. (just implemented for cpu hotplug) 826 * 827 * If there are kernel internal actions which can make use of some not-exact 828 * value, and reading all cpu value can be performance bottleneck in some 829 * common workload, threashold and synchonization as vmstat[] should be 830 * implemented. 831 */ 832 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 833 enum mem_cgroup_stat_index idx) 834 { 835 long val = 0; 836 int cpu; 837 838 get_online_cpus(); 839 for_each_online_cpu(cpu) 840 val += per_cpu(memcg->stat->count[idx], cpu); 841 #ifdef CONFIG_HOTPLUG_CPU 842 spin_lock(&memcg->pcp_counter_lock); 843 val += memcg->nocpu_base.count[idx]; 844 spin_unlock(&memcg->pcp_counter_lock); 845 #endif 846 put_online_cpus(); 847 return val; 848 } 849 850 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 851 enum mem_cgroup_events_index idx) 852 { 853 unsigned long val = 0; 854 int cpu; 855 856 get_online_cpus(); 857 for_each_online_cpu(cpu) 858 val += per_cpu(memcg->stat->events[idx], cpu); 859 #ifdef CONFIG_HOTPLUG_CPU 860 spin_lock(&memcg->pcp_counter_lock); 861 val += memcg->nocpu_base.events[idx]; 862 spin_unlock(&memcg->pcp_counter_lock); 863 #endif 864 put_online_cpus(); 865 return val; 866 } 867 868 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 869 struct page *page, 870 int nr_pages) 871 { 872 /* 873 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 874 * counted as CACHE even if it's on ANON LRU. 875 */ 876 if (PageAnon(page)) 877 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 878 nr_pages); 879 else 880 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 881 nr_pages); 882 883 if (PageTransHuge(page)) 884 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 885 nr_pages); 886 887 /* pagein of a big page is an event. 
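 * (PGPGIN/PGPGOUT count charge/uncharge events, not pages; the
 * per-page count goes into nr_page_events below, which is what
 * memcg_check_events() rate-limits on.)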
So, ignore page size */ 888 if (nr_pages > 0) 889 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 890 else { 891 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 892 nr_pages = -nr_pages; /* for event */ 893 } 894 895 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 896 } 897 898 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 899 { 900 struct mem_cgroup_per_zone *mz; 901 902 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 903 return mz->lru_size[lru]; 904 } 905 906 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 907 int nid, 908 unsigned int lru_mask) 909 { 910 unsigned long nr = 0; 911 int zid; 912 913 VM_BUG_ON((unsigned)nid >= nr_node_ids); 914 915 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 916 struct mem_cgroup_per_zone *mz; 917 enum lru_list lru; 918 919 for_each_lru(lru) { 920 if (!(BIT(lru) & lru_mask)) 921 continue; 922 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 923 nr += mz->lru_size[lru]; 924 } 925 } 926 return nr; 927 } 928 929 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 930 unsigned int lru_mask) 931 { 932 unsigned long nr = 0; 933 int nid; 934 935 for_each_node_state(nid, N_MEMORY) 936 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 937 return nr; 938 } 939 940 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 941 enum mem_cgroup_events_target target) 942 { 943 unsigned long val, next; 944 945 val = __this_cpu_read(memcg->stat->nr_page_events); 946 next = __this_cpu_read(memcg->stat->targets[target]); 947 /* from time_after() in jiffies.h */ 948 if ((long)next - (long)val < 0) { 949 switch (target) { 950 case MEM_CGROUP_TARGET_THRESH: 951 next = val + THRESHOLDS_EVENTS_TARGET; 952 break; 953 case MEM_CGROUP_TARGET_SOFTLIMIT: 954 next = val + SOFTLIMIT_EVENTS_TARGET; 955 break; 956 case MEM_CGROUP_TARGET_NUMAINFO: 957 next = val + NUMAINFO_EVENTS_TARGET; 958 break; 959 default: 960 break; 961 } 962 __this_cpu_write(memcg->stat->targets[target], next); 963 return true; 964 } 965 return false; 966 } 967 968 /* 969 * Check events in order. 970 * 971 */ 972 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 973 { 974 /* threshold event is triggered in finer grain than soft limit */ 975 if (unlikely(mem_cgroup_event_ratelimit(memcg, 976 MEM_CGROUP_TARGET_THRESH))) { 977 bool do_softlimit; 978 bool do_numainfo __maybe_unused; 979 980 do_softlimit = mem_cgroup_event_ratelimit(memcg, 981 MEM_CGROUP_TARGET_SOFTLIMIT); 982 #if MAX_NUMNODES > 1 983 do_numainfo = mem_cgroup_event_ratelimit(memcg, 984 MEM_CGROUP_TARGET_NUMAINFO); 985 #endif 986 mem_cgroup_threshold(memcg); 987 if (unlikely(do_softlimit)) 988 mem_cgroup_update_tree(memcg, page); 989 #if MAX_NUMNODES > 1 990 if (unlikely(do_numainfo)) 991 atomic_inc(&memcg->numainfo_events); 992 #endif 993 } 994 } 995 996 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 997 { 998 /* 999 * mm_update_next_owner() may clear mm->owner to NULL 1000 * if it races with swapoff, page migration, etc. 1001 * So this can be called with p == NULL. 1002 */ 1003 if (unlikely(!p)) 1004 return NULL; 1005 1006 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 1007 } 1008 1009 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 1010 { 1011 struct mem_cgroup *memcg = NULL; 1012 1013 rcu_read_lock(); 1014 do { 1015 /* 1016 * Page cache insertions can happen withou an 1017 * actual mm context, e.g. 
during disk probing 1018 * on boot, loopback IO, acct() writes etc. 1019 */ 1020 if (unlikely(!mm)) 1021 memcg = root_mem_cgroup; 1022 else { 1023 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1024 if (unlikely(!memcg)) 1025 memcg = root_mem_cgroup; 1026 } 1027 } while (!css_tryget_online(&memcg->css)); 1028 rcu_read_unlock(); 1029 return memcg; 1030 } 1031 1032 /** 1033 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1034 * @root: hierarchy root 1035 * @prev: previously returned memcg, NULL on first invocation 1036 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1037 * 1038 * Returns references to children of the hierarchy below @root, or 1039 * @root itself, or %NULL after a full round-trip. 1040 * 1041 * Caller must pass the return value in @prev on subsequent 1042 * invocations for reference counting, or use mem_cgroup_iter_break() 1043 * to cancel a hierarchy walk before the round-trip is complete. 1044 * 1045 * Reclaimers can specify a zone and a priority level in @reclaim to 1046 * divide up the memcgs in the hierarchy among all concurrent 1047 * reclaimers operating on the same zone and priority. 1048 */ 1049 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1050 struct mem_cgroup *prev, 1051 struct mem_cgroup_reclaim_cookie *reclaim) 1052 { 1053 struct reclaim_iter *uninitialized_var(iter); 1054 struct cgroup_subsys_state *css = NULL; 1055 struct mem_cgroup *memcg = NULL; 1056 struct mem_cgroup *pos = NULL; 1057 1058 if (mem_cgroup_disabled()) 1059 return NULL; 1060 1061 if (!root) 1062 root = root_mem_cgroup; 1063 1064 if (prev && !reclaim) 1065 pos = prev; 1066 1067 if (!root->use_hierarchy && root != root_mem_cgroup) { 1068 if (prev) 1069 goto out; 1070 return root; 1071 } 1072 1073 rcu_read_lock(); 1074 1075 if (reclaim) { 1076 struct mem_cgroup_per_zone *mz; 1077 1078 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); 1079 iter = &mz->iter[reclaim->priority]; 1080 1081 if (prev && reclaim->generation != iter->generation) 1082 goto out_unlock; 1083 1084 do { 1085 pos = ACCESS_ONCE(iter->position); 1086 /* 1087 * A racing update may change the position and 1088 * put the last reference, hence css_tryget(), 1089 * or retry to see the updated position. 1090 */ 1091 } while (pos && !css_tryget(&pos->css)); 1092 } 1093 1094 if (pos) 1095 css = &pos->css; 1096 1097 for (;;) { 1098 css = css_next_descendant_pre(css, &root->css); 1099 if (!css) { 1100 /* 1101 * Reclaimers share the hierarchy walk, and a 1102 * new one might jump in right at the end of 1103 * the hierarchy - make sure they see at least 1104 * one group and restart from the beginning. 1105 */ 1106 if (!prev) 1107 continue; 1108 break; 1109 } 1110 1111 /* 1112 * Verify the css and acquire a reference. The root 1113 * is provided by the caller, so we know it's alive 1114 * and kicking, and don't take an extra reference. 1115 */ 1116 memcg = mem_cgroup_from_css(css); 1117 1118 if (css == &root->css) 1119 break; 1120 1121 if (css_tryget(css)) { 1122 /* 1123 * Make sure the memcg is initialized: 1124 * mem_cgroup_css_online() orders the the 1125 * initialization against setting the flag. 1126 */ 1127 if (smp_load_acquire(&memcg->initialized)) 1128 break; 1129 1130 css_put(css); 1131 } 1132 1133 memcg = NULL; 1134 } 1135 1136 if (reclaim) { 1137 if (cmpxchg(&iter->position, pos, memcg) == pos) { 1138 if (memcg) 1139 css_get(&memcg->css); 1140 if (pos) 1141 css_put(&pos->css); 1142 } 1143 1144 /* 1145 * pairs with css_tryget when dereferencing iter->position 1146 * above. 
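 *
 * iter->position itself is only advanced through the cmpxchg()
 * above, so reclaimers sharing the same cookie cannot both move
 * it; only the winner transfers the iterator's own css references.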
1147 */ 1148 if (pos) 1149 css_put(&pos->css); 1150 1151 if (!memcg) 1152 iter->generation++; 1153 else if (!prev) 1154 reclaim->generation = iter->generation; 1155 } 1156 1157 out_unlock: 1158 rcu_read_unlock(); 1159 out: 1160 if (prev && prev != root) 1161 css_put(&prev->css); 1162 1163 return memcg; 1164 } 1165 1166 /** 1167 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1168 * @root: hierarchy root 1169 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1170 */ 1171 void mem_cgroup_iter_break(struct mem_cgroup *root, 1172 struct mem_cgroup *prev) 1173 { 1174 if (!root) 1175 root = root_mem_cgroup; 1176 if (prev && prev != root) 1177 css_put(&prev->css); 1178 } 1179 1180 /* 1181 * Iteration constructs for visiting all cgroups (under a tree). If 1182 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1183 * be used for reference counting. 1184 */ 1185 #define for_each_mem_cgroup_tree(iter, root) \ 1186 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1187 iter != NULL; \ 1188 iter = mem_cgroup_iter(root, iter, NULL)) 1189 1190 #define for_each_mem_cgroup(iter) \ 1191 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1192 iter != NULL; \ 1193 iter = mem_cgroup_iter(NULL, iter, NULL)) 1194 1195 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1196 { 1197 struct mem_cgroup *memcg; 1198 1199 rcu_read_lock(); 1200 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1201 if (unlikely(!memcg)) 1202 goto out; 1203 1204 switch (idx) { 1205 case PGFAULT: 1206 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1207 break; 1208 case PGMAJFAULT: 1209 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1210 break; 1211 default: 1212 BUG(); 1213 } 1214 out: 1215 rcu_read_unlock(); 1216 } 1217 EXPORT_SYMBOL(__mem_cgroup_count_vm_event); 1218 1219 /** 1220 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1221 * @zone: zone of the wanted lruvec 1222 * @memcg: memcg of the wanted lruvec 1223 * 1224 * Returns the lru list vector holding pages for the given @zone and 1225 * @mem. This can be the global zone lruvec, if the memory controller 1226 * is disabled. 1227 */ 1228 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1229 struct mem_cgroup *memcg) 1230 { 1231 struct mem_cgroup_per_zone *mz; 1232 struct lruvec *lruvec; 1233 1234 if (mem_cgroup_disabled()) { 1235 lruvec = &zone->lruvec; 1236 goto out; 1237 } 1238 1239 mz = mem_cgroup_zone_zoneinfo(memcg, zone); 1240 lruvec = &mz->lruvec; 1241 out: 1242 /* 1243 * Since a node can be onlined after the mem_cgroup was created, 1244 * we have to be prepared to initialize lruvec->zone here; 1245 * and if offlined then reonlined, we need to reinitialize it. 1246 */ 1247 if (unlikely(lruvec->zone != zone)) 1248 lruvec->zone = zone; 1249 return lruvec; 1250 } 1251 1252 /** 1253 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 1254 * @page: the page 1255 * @zone: zone of the page 1256 * 1257 * This function is only safe when following the LRU page isolation 1258 * and putback protocol: the LRU lock must be held, and the page must 1259 * either be PageLRU() or the caller must have isolated/allocated it. 
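 *
 * A typical caller pattern looks roughly like this (sketch,
 * assuming the usual zone->lru_lock protocol used by vmscan):
 *
 *    spin_lock_irq(&zone->lru_lock);
 *    lruvec = mem_cgroup_page_lruvec(page, zone);
 *    ClearPageLRU(page);
 *    del_page_from_lru_list(page, lruvec, page_lru(page));
 *    spin_unlock_irq(&zone->lru_lock);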
1260 */ 1261 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1262 { 1263 struct mem_cgroup_per_zone *mz; 1264 struct mem_cgroup *memcg; 1265 struct lruvec *lruvec; 1266 1267 if (mem_cgroup_disabled()) { 1268 lruvec = &zone->lruvec; 1269 goto out; 1270 } 1271 1272 memcg = page->mem_cgroup; 1273 /* 1274 * Swapcache readahead pages are added to the LRU - and 1275 * possibly migrated - before they are charged. 1276 */ 1277 if (!memcg) 1278 memcg = root_mem_cgroup; 1279 1280 mz = mem_cgroup_page_zoneinfo(memcg, page); 1281 lruvec = &mz->lruvec; 1282 out: 1283 /* 1284 * Since a node can be onlined after the mem_cgroup was created, 1285 * we have to be prepared to initialize lruvec->zone here; 1286 * and if offlined then reonlined, we need to reinitialize it. 1287 */ 1288 if (unlikely(lruvec->zone != zone)) 1289 lruvec->zone = zone; 1290 return lruvec; 1291 } 1292 1293 /** 1294 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1295 * @lruvec: mem_cgroup per zone lru vector 1296 * @lru: index of lru list the page is sitting on 1297 * @nr_pages: positive when adding or negative when removing 1298 * 1299 * This function must be called when a page is added to or removed from an 1300 * lru list. 1301 */ 1302 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1303 int nr_pages) 1304 { 1305 struct mem_cgroup_per_zone *mz; 1306 unsigned long *lru_size; 1307 1308 if (mem_cgroup_disabled()) 1309 return; 1310 1311 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1312 lru_size = mz->lru_size + lru; 1313 *lru_size += nr_pages; 1314 VM_BUG_ON((long)(*lru_size) < 0); 1315 } 1316 1317 bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) 1318 { 1319 if (root == memcg) 1320 return true; 1321 if (!root->use_hierarchy) 1322 return false; 1323 return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); 1324 } 1325 1326 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) 1327 { 1328 struct mem_cgroup *task_memcg; 1329 struct task_struct *p; 1330 bool ret; 1331 1332 p = find_lock_task_mm(task); 1333 if (p) { 1334 task_memcg = get_mem_cgroup_from_mm(p->mm); 1335 task_unlock(p); 1336 } else { 1337 /* 1338 * All threads may have already detached their mm's, but the oom 1339 * killer still needs to detect if they have already been oom 1340 * killed to prevent needlessly killing additional tasks. 
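 *
 * Falling back to mem_cgroup_from_task() still works without an mm
 * because it resolves the memcg through task_css(), not the mm owner.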
1341 */ 1342 rcu_read_lock(); 1343 task_memcg = mem_cgroup_from_task(task); 1344 css_get(&task_memcg->css); 1345 rcu_read_unlock(); 1346 } 1347 ret = mem_cgroup_is_descendant(task_memcg, memcg); 1348 css_put(&task_memcg->css); 1349 return ret; 1350 } 1351 1352 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1353 { 1354 unsigned long inactive_ratio; 1355 unsigned long inactive; 1356 unsigned long active; 1357 unsigned long gb; 1358 1359 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1360 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1361 1362 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1363 if (gb) 1364 inactive_ratio = int_sqrt(10 * gb); 1365 else 1366 inactive_ratio = 1; 1367 1368 return inactive * inactive_ratio < active; 1369 } 1370 1371 #define mem_cgroup_from_counter(counter, member) \ 1372 container_of(counter, struct mem_cgroup, member) 1373 1374 /** 1375 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1376 * @memcg: the memory cgroup 1377 * 1378 * Returns the maximum amount of memory @mem can be charged with, in 1379 * pages. 1380 */ 1381 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1382 { 1383 unsigned long margin = 0; 1384 unsigned long count; 1385 unsigned long limit; 1386 1387 count = page_counter_read(&memcg->memory); 1388 limit = ACCESS_ONCE(memcg->memory.limit); 1389 if (count < limit) 1390 margin = limit - count; 1391 1392 if (do_swap_account) { 1393 count = page_counter_read(&memcg->memsw); 1394 limit = ACCESS_ONCE(memcg->memsw.limit); 1395 if (count <= limit) 1396 margin = min(margin, limit - count); 1397 } 1398 1399 return margin; 1400 } 1401 1402 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1403 { 1404 /* root ? */ 1405 if (mem_cgroup_disabled() || !memcg->css.parent) 1406 return vm_swappiness; 1407 1408 return memcg->swappiness; 1409 } 1410 1411 /* 1412 * A routine for checking "mem" is under move_account() or not. 1413 * 1414 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1415 * moving cgroups. This is for waiting at high-memory pressure 1416 * caused by "move". 1417 */ 1418 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1419 { 1420 struct mem_cgroup *from; 1421 struct mem_cgroup *to; 1422 bool ret = false; 1423 /* 1424 * Unlike task_move routines, we access mc.to, mc.from not under 1425 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1426 */ 1427 spin_lock(&mc.lock); 1428 from = mc.from; 1429 to = mc.to; 1430 if (!from) 1431 goto unlock; 1432 1433 ret = mem_cgroup_is_descendant(from, memcg) || 1434 mem_cgroup_is_descendant(to, memcg); 1435 unlock: 1436 spin_unlock(&mc.lock); 1437 return ret; 1438 } 1439 1440 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1441 { 1442 if (mc.moving_task && current != mc.moving_task) { 1443 if (mem_cgroup_under_move(memcg)) { 1444 DEFINE_WAIT(wait); 1445 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1446 /* moving charge context might have finished. */ 1447 if (mc.moving_task) 1448 schedule(); 1449 finish_wait(&mc.waitq, &wait); 1450 return true; 1451 } 1452 } 1453 return false; 1454 } 1455 1456 #define K(x) ((x) << (PAGE_SHIFT-10)) 1457 /** 1458 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 
1459 * @memcg: The memory cgroup that went over limit 1460 * @p: Task that is going to be killed 1461 * 1462 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1463 * enabled 1464 */ 1465 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1466 { 1467 /* oom_info_lock ensures that parallel ooms do not interleave */ 1468 static DEFINE_MUTEX(oom_info_lock); 1469 struct mem_cgroup *iter; 1470 unsigned int i; 1471 1472 if (!p) 1473 return; 1474 1475 mutex_lock(&oom_info_lock); 1476 rcu_read_lock(); 1477 1478 pr_info("Task in "); 1479 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1480 pr_info(" killed as a result of limit of "); 1481 pr_cont_cgroup_path(memcg->css.cgroup); 1482 pr_info("\n"); 1483 1484 rcu_read_unlock(); 1485 1486 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1487 K((u64)page_counter_read(&memcg->memory)), 1488 K((u64)memcg->memory.limit), memcg->memory.failcnt); 1489 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1490 K((u64)page_counter_read(&memcg->memsw)), 1491 K((u64)memcg->memsw.limit), memcg->memsw.failcnt); 1492 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1493 K((u64)page_counter_read(&memcg->kmem)), 1494 K((u64)memcg->kmem.limit), memcg->kmem.failcnt); 1495 1496 for_each_mem_cgroup_tree(iter, memcg) { 1497 pr_info("Memory cgroup stats for "); 1498 pr_cont_cgroup_path(iter->css.cgroup); 1499 pr_cont(":"); 1500 1501 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1502 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1503 continue; 1504 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1505 K(mem_cgroup_read_stat(iter, i))); 1506 } 1507 1508 for (i = 0; i < NR_LRU_LISTS; i++) 1509 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1510 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1511 1512 pr_cont("\n"); 1513 } 1514 mutex_unlock(&oom_info_lock); 1515 } 1516 1517 /* 1518 * This function returns the number of memcg under hierarchy tree. Returns 1519 * 1(self count) if no children. 1520 */ 1521 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1522 { 1523 int num = 0; 1524 struct mem_cgroup *iter; 1525 1526 for_each_mem_cgroup_tree(iter, memcg) 1527 num++; 1528 return num; 1529 } 1530 1531 /* 1532 * Return the memory (and swap, if configured) limit for a memcg. 1533 */ 1534 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) 1535 { 1536 unsigned long limit; 1537 1538 limit = memcg->memory.limit; 1539 if (mem_cgroup_swappiness(memcg)) { 1540 unsigned long memsw_limit; 1541 1542 memsw_limit = memcg->memsw.limit; 1543 limit = min(limit + total_swap_pages, memsw_limit); 1544 } 1545 return limit; 1546 } 1547 1548 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1549 int order) 1550 { 1551 struct mem_cgroup *iter; 1552 unsigned long chosen_points = 0; 1553 unsigned long totalpages; 1554 unsigned int points = 0; 1555 struct task_struct *chosen = NULL; 1556 1557 /* 1558 * If current has a pending SIGKILL or is exiting, then automatically 1559 * select it. The goal is to allow it to allocate so that it may 1560 * quickly exit and free its memory. 1561 */ 1562 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1563 set_thread_flag(TIF_MEMDIE); 1564 return; 1565 } 1566 1567 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1568 totalpages = mem_cgroup_get_limit(memcg) ? 
: 1; 1569 for_each_mem_cgroup_tree(iter, memcg) { 1570 struct css_task_iter it; 1571 struct task_struct *task; 1572 1573 css_task_iter_start(&iter->css, &it); 1574 while ((task = css_task_iter_next(&it))) { 1575 switch (oom_scan_process_thread(task, totalpages, NULL, 1576 false)) { 1577 case OOM_SCAN_SELECT: 1578 if (chosen) 1579 put_task_struct(chosen); 1580 chosen = task; 1581 chosen_points = ULONG_MAX; 1582 get_task_struct(chosen); 1583 /* fall through */ 1584 case OOM_SCAN_CONTINUE: 1585 continue; 1586 case OOM_SCAN_ABORT: 1587 css_task_iter_end(&it); 1588 mem_cgroup_iter_break(memcg, iter); 1589 if (chosen) 1590 put_task_struct(chosen); 1591 return; 1592 case OOM_SCAN_OK: 1593 break; 1594 }; 1595 points = oom_badness(task, memcg, NULL, totalpages); 1596 if (!points || points < chosen_points) 1597 continue; 1598 /* Prefer thread group leaders for display purposes */ 1599 if (points == chosen_points && 1600 thread_group_leader(chosen)) 1601 continue; 1602 1603 if (chosen) 1604 put_task_struct(chosen); 1605 chosen = task; 1606 chosen_points = points; 1607 get_task_struct(chosen); 1608 } 1609 css_task_iter_end(&it); 1610 } 1611 1612 if (!chosen) 1613 return; 1614 points = chosen_points * 1000 / totalpages; 1615 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1616 NULL, "Memory cgroup out of memory"); 1617 } 1618 1619 #if MAX_NUMNODES > 1 1620 1621 /** 1622 * test_mem_cgroup_node_reclaimable 1623 * @memcg: the target memcg 1624 * @nid: the node ID to be checked. 1625 * @noswap : specify true here if the user wants flle only information. 1626 * 1627 * This function returns whether the specified memcg contains any 1628 * reclaimable pages on a node. Returns true if there are any reclaimable 1629 * pages in the node. 1630 */ 1631 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1632 int nid, bool noswap) 1633 { 1634 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1635 return true; 1636 if (noswap || !total_swap_pages) 1637 return false; 1638 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1639 return true; 1640 return false; 1641 1642 } 1643 1644 /* 1645 * Always updating the nodemask is not very good - even if we have an empty 1646 * list or the wrong list here, we can start from some node and traverse all 1647 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1648 * 1649 */ 1650 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1651 { 1652 int nid; 1653 /* 1654 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1655 * pagein/pageout changes since the last update. 1656 */ 1657 if (!atomic_read(&memcg->numainfo_events)) 1658 return; 1659 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1660 return; 1661 1662 /* make a nodemask where this memcg uses memory from */ 1663 memcg->scan_nodes = node_states[N_MEMORY]; 1664 1665 for_each_node_mask(nid, node_states[N_MEMORY]) { 1666 1667 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1668 node_clear(nid, memcg->scan_nodes); 1669 } 1670 1671 atomic_set(&memcg->numainfo_events, 0); 1672 atomic_set(&memcg->numainfo_updating, 0); 1673 } 1674 1675 /* 1676 * Selecting a node where we start reclaim from. Because what we need is just 1677 * reducing usage counter, start from anywhere is O,K. Considering 1678 * memory reclaim from current node, there are pros. and cons. 1679 * 1680 * Freeing memory from current node means freeing memory from a node which 1681 * we'll use or we've used. So, it may make LRU bad. 
And if several threads 1682 * hit limits, it will see a contention on a node. But freeing from remote 1683 * node means more costs for memory reclaim because of memory latency. 1684 * 1685 * Now, we use round-robin. Better algorithm is welcomed. 1686 */ 1687 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1688 { 1689 int node; 1690 1691 mem_cgroup_may_update_nodemask(memcg); 1692 node = memcg->last_scanned_node; 1693 1694 node = next_node(node, memcg->scan_nodes); 1695 if (node == MAX_NUMNODES) 1696 node = first_node(memcg->scan_nodes); 1697 /* 1698 * We call this when we hit limit, not when pages are added to LRU. 1699 * No LRU may hold pages because all pages are UNEVICTABLE or 1700 * memcg is too small and all pages are not on LRU. In that case, 1701 * we use curret node. 1702 */ 1703 if (unlikely(node == MAX_NUMNODES)) 1704 node = numa_node_id(); 1705 1706 memcg->last_scanned_node = node; 1707 return node; 1708 } 1709 #else 1710 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1711 { 1712 return 0; 1713 } 1714 #endif 1715 1716 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1717 struct zone *zone, 1718 gfp_t gfp_mask, 1719 unsigned long *total_scanned) 1720 { 1721 struct mem_cgroup *victim = NULL; 1722 int total = 0; 1723 int loop = 0; 1724 unsigned long excess; 1725 unsigned long nr_scanned; 1726 struct mem_cgroup_reclaim_cookie reclaim = { 1727 .zone = zone, 1728 .priority = 0, 1729 }; 1730 1731 excess = soft_limit_excess(root_memcg); 1732 1733 while (1) { 1734 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1735 if (!victim) { 1736 loop++; 1737 if (loop >= 2) { 1738 /* 1739 * If we have not been able to reclaim 1740 * anything, it might because there are 1741 * no reclaimable pages under this hierarchy 1742 */ 1743 if (!total) 1744 break; 1745 /* 1746 * We want to do more targeted reclaim. 1747 * excess >> 2 is not to excessive so as to 1748 * reclaim too much, nor too less that we keep 1749 * coming back to reclaim from this cgroup 1750 */ 1751 if (total >= (excess >> 2) || 1752 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1753 break; 1754 } 1755 continue; 1756 } 1757 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1758 zone, &nr_scanned); 1759 *total_scanned += nr_scanned; 1760 if (!soft_limit_excess(root_memcg)) 1761 break; 1762 } 1763 mem_cgroup_iter_break(root_memcg, victim); 1764 return total; 1765 } 1766 1767 #ifdef CONFIG_LOCKDEP 1768 static struct lockdep_map memcg_oom_lock_dep_map = { 1769 .name = "memcg_oom_lock", 1770 }; 1771 #endif 1772 1773 static DEFINE_SPINLOCK(memcg_oom_lock); 1774 1775 /* 1776 * Check OOM-Killer is already running under our hierarchy. 1777 * If someone is running, return false. 1778 */ 1779 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1780 { 1781 struct mem_cgroup *iter, *failed = NULL; 1782 1783 spin_lock(&memcg_oom_lock); 1784 1785 for_each_mem_cgroup_tree(iter, memcg) { 1786 if (iter->oom_lock) { 1787 /* 1788 * this subtree of our hierarchy is already locked 1789 * so we cannot give a lock. 
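 *
 * Remember the contended memcg so the cleanup pass below can
 * unwind exactly the oom_lock bits that were already set.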
1790 */ 1791 failed = iter; 1792 mem_cgroup_iter_break(memcg, iter); 1793 break; 1794 } else 1795 iter->oom_lock = true; 1796 } 1797 1798 if (failed) { 1799 /* 1800 * OK, we failed to lock the whole subtree so we have 1801 * to clean up what we set up to the failing subtree 1802 */ 1803 for_each_mem_cgroup_tree(iter, memcg) { 1804 if (iter == failed) { 1805 mem_cgroup_iter_break(memcg, iter); 1806 break; 1807 } 1808 iter->oom_lock = false; 1809 } 1810 } else 1811 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1812 1813 spin_unlock(&memcg_oom_lock); 1814 1815 return !failed; 1816 } 1817 1818 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1819 { 1820 struct mem_cgroup *iter; 1821 1822 spin_lock(&memcg_oom_lock); 1823 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 1824 for_each_mem_cgroup_tree(iter, memcg) 1825 iter->oom_lock = false; 1826 spin_unlock(&memcg_oom_lock); 1827 } 1828 1829 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1830 { 1831 struct mem_cgroup *iter; 1832 1833 for_each_mem_cgroup_tree(iter, memcg) 1834 atomic_inc(&iter->under_oom); 1835 } 1836 1837 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1838 { 1839 struct mem_cgroup *iter; 1840 1841 /* 1842 * When a new child is created while the hierarchy is under oom, 1843 * mem_cgroup_oom_lock() may not be called. We have to use 1844 * atomic_add_unless() here. 1845 */ 1846 for_each_mem_cgroup_tree(iter, memcg) 1847 atomic_add_unless(&iter->under_oom, -1, 0); 1848 } 1849 1850 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1851 1852 struct oom_wait_info { 1853 struct mem_cgroup *memcg; 1854 wait_queue_t wait; 1855 }; 1856 1857 static int memcg_oom_wake_function(wait_queue_t *wait, 1858 unsigned mode, int sync, void *arg) 1859 { 1860 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1861 struct mem_cgroup *oom_wait_memcg; 1862 struct oom_wait_info *oom_wait_info; 1863 1864 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1865 oom_wait_memcg = oom_wait_info->memcg; 1866 1867 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1868 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1869 return 0; 1870 return autoremove_wake_function(wait, mode, sync, arg); 1871 } 1872 1873 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 1874 { 1875 atomic_inc(&memcg->oom_wakeups); 1876 /* for filtering, pass "memcg" as argument. */ 1877 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1878 } 1879 1880 static void memcg_oom_recover(struct mem_cgroup *memcg) 1881 { 1882 if (memcg && atomic_read(&memcg->under_oom)) 1883 memcg_wakeup_oom(memcg); 1884 } 1885 1886 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1887 { 1888 if (!current->memcg_oom.may_oom) 1889 return; 1890 /* 1891 * We are in the middle of the charge context here, so we 1892 * don't want to block when potentially sitting on a callstack 1893 * that holds all kinds of filesystem and mm locks. 1894 * 1895 * Also, the caller may handle a failed allocation gracefully 1896 * (like optional page cache readahead) and so an OOM killer 1897 * invocation might not even be necessary. 1898 * 1899 * That's why we don't do anything here except remember the 1900 * OOM context and then deal with it at the end of the page 1901 * fault when the stack is unwound, the locks are released, 1902 * and when we know whether the fault was overall successful. 
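 *
 * The css reference taken below is dropped again in
 * mem_cgroup_oom_synchronize() once the page fault has unwound.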
1903 */ 1904 css_get(&memcg->css); 1905 current->memcg_oom.memcg = memcg; 1906 current->memcg_oom.gfp_mask = mask; 1907 current->memcg_oom.order = order; 1908 } 1909 1910 /** 1911 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1912 * @handle: actually kill/wait or just clean up the OOM state 1913 * 1914 * This has to be called at the end of a page fault if the memcg OOM 1915 * handler was enabled. 1916 * 1917 * Memcg supports userspace OOM handling where failed allocations must 1918 * sleep on a waitqueue until the userspace task resolves the 1919 * situation. Sleeping directly in the charge context with all kinds 1920 * of locks held is not a good idea, instead we remember an OOM state 1921 * in the task and mem_cgroup_oom_synchronize() has to be called at 1922 * the end of the page fault to complete the OOM handling. 1923 * 1924 * Returns %true if an ongoing memcg OOM situation was detected and 1925 * completed, %false otherwise. 1926 */ 1927 bool mem_cgroup_oom_synchronize(bool handle) 1928 { 1929 struct mem_cgroup *memcg = current->memcg_oom.memcg; 1930 struct oom_wait_info owait; 1931 bool locked; 1932 1933 /* OOM is global, do not handle */ 1934 if (!memcg) 1935 return false; 1936 1937 if (!handle) 1938 goto cleanup; 1939 1940 owait.memcg = memcg; 1941 owait.wait.flags = 0; 1942 owait.wait.func = memcg_oom_wake_function; 1943 owait.wait.private = current; 1944 INIT_LIST_HEAD(&owait.wait.task_list); 1945 1946 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1947 mem_cgroup_mark_under_oom(memcg); 1948 1949 locked = mem_cgroup_oom_trylock(memcg); 1950 1951 if (locked) 1952 mem_cgroup_oom_notify(memcg); 1953 1954 if (locked && !memcg->oom_kill_disable) { 1955 mem_cgroup_unmark_under_oom(memcg); 1956 finish_wait(&memcg_oom_waitq, &owait.wait); 1957 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 1958 current->memcg_oom.order); 1959 } else { 1960 schedule(); 1961 mem_cgroup_unmark_under_oom(memcg); 1962 finish_wait(&memcg_oom_waitq, &owait.wait); 1963 } 1964 1965 if (locked) { 1966 mem_cgroup_oom_unlock(memcg); 1967 /* 1968 * There is no guarantee that an OOM-lock contender 1969 * sees the wakeups triggered by the OOM kill 1970 * uncharges. Wake any sleepers explicitely. 1971 */ 1972 memcg_oom_recover(memcg); 1973 } 1974 cleanup: 1975 current->memcg_oom.memcg = NULL; 1976 css_put(&memcg->css); 1977 return true; 1978 } 1979 1980 /** 1981 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 1982 * @page: page that is going to change accounted state 1983 * @locked: &memcg->move_lock slowpath was taken 1984 * @flags: IRQ-state flags for &memcg->move_lock 1985 * 1986 * This function must mark the beginning of an accounted page state 1987 * change to prevent double accounting when the page is concurrently 1988 * being moved to another memcg: 1989 * 1990 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1991 * if (TestClearPageState(page)) 1992 * mem_cgroup_update_page_stat(memcg, state, -1); 1993 * mem_cgroup_end_page_stat(memcg, locked, flags); 1994 * 1995 * The RCU lock is held throughout the transaction. The fast path can 1996 * get away without acquiring the memcg->move_lock (@locked is false) 1997 * because page moving starts with an RCU grace period. 1998 * 1999 * The RCU lock also protects the memcg from being freed when the page 2000 * state that is going to change is the only thing preventing the page 2001 * from being uncharged. E.g. 
end-writeback clearing PageWriteback(), 2002 * which allows migration to go ahead and uncharge the page before the 2003 * account transaction might be complete. 2004 */ 2005 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, 2006 bool *locked, 2007 unsigned long *flags) 2008 { 2009 struct mem_cgroup *memcg; 2010 2011 rcu_read_lock(); 2012 2013 if (mem_cgroup_disabled()) 2014 return NULL; 2015 again: 2016 memcg = page->mem_cgroup; 2017 if (unlikely(!memcg)) 2018 return NULL; 2019 2020 *locked = false; 2021 if (atomic_read(&memcg->moving_account) <= 0) 2022 return memcg; 2023 2024 spin_lock_irqsave(&memcg->move_lock, *flags); 2025 if (memcg != page->mem_cgroup) { 2026 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2027 goto again; 2028 } 2029 *locked = true; 2030 2031 return memcg; 2032 } 2033 2034 /** 2035 * mem_cgroup_end_page_stat - finish a page state statistics transaction 2036 * @memcg: the memcg that was accounted against 2037 * @locked: value received from mem_cgroup_begin_page_stat() 2038 * @flags: value received from mem_cgroup_begin_page_stat() 2039 */ 2040 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, 2041 unsigned long *flags) 2042 { 2043 if (memcg && *locked) 2044 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2045 2046 rcu_read_unlock(); 2047 } 2048 2049 /** 2050 * mem_cgroup_update_page_stat - update page state statistics 2051 * @memcg: memcg to account against 2052 * @idx: page state item to account 2053 * @val: number of pages (positive or negative) 2054 * 2055 * See mem_cgroup_begin_page_stat() for locking requirements. 2056 */ 2057 void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, 2058 enum mem_cgroup_stat_index idx, int val) 2059 { 2060 VM_BUG_ON(!rcu_read_lock_held()); 2061 2062 if (memcg) 2063 this_cpu_add(memcg->stat->count[idx], val); 2064 } 2065 2066 /* 2067 * size of first charge trial. "32" comes from vmscan.c's magic value. 2068 * TODO: maybe necessary to use big numbers in big irons. 2069 */ 2070 #define CHARGE_BATCH 32U 2071 struct memcg_stock_pcp { 2072 struct mem_cgroup *cached; /* this never be root cgroup */ 2073 unsigned int nr_pages; 2074 struct work_struct work; 2075 unsigned long flags; 2076 #define FLUSHING_CACHED_CHARGE 0 2077 }; 2078 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2079 static DEFINE_MUTEX(percpu_charge_mutex); 2080 2081 /** 2082 * consume_stock: Try to consume stocked charge on this cpu. 2083 * @memcg: memcg to consume from. 2084 * @nr_pages: how many pages to charge. 2085 * 2086 * The charges will only happen if @memcg matches the current cpu's memcg 2087 * stock, and at least @nr_pages are available in that stock. Failure to 2088 * service an allocation will refill the stock. 2089 * 2090 * returns true if successful, false otherwise. 2091 */ 2092 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2093 { 2094 struct memcg_stock_pcp *stock; 2095 bool ret = false; 2096 2097 if (nr_pages > CHARGE_BATCH) 2098 return ret; 2099 2100 stock = &get_cpu_var(memcg_stock); 2101 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2102 stock->nr_pages -= nr_pages; 2103 ret = true; 2104 } 2105 put_cpu_var(memcg_stock); 2106 return ret; 2107 } 2108 2109 /* 2110 * Returns stocks cached in percpu and reset cached information. 
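 *
 * For illustration, with CHARGE_BATCH == 32 a typical round trip on
 * one cpu looks like this:
 *
 *	try_charge(memcg, gfp, 1)   - charges a batch of 32 pages to the
 *	                              page counters, stocks the unused 31
 *	consume_stock(memcg, 1) x31 - served from the local stock, the
 *	                              page counters are not touched
 *	drain_stock(stock)          - returns whatever is left to the
 *	                              memory (and memsw) counters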
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->nr_pages) {
		page_counter_uncharge(&old->memory, stock->nr_pages);
		if (do_swap_account)
			page_counter_uncharge(&old->memsw, stock->nr_pages);
		css_put_many(&old->css, stock->nr_pages);
		stock->nr_pages = 0;
	}
	stock->cached = NULL;
}

/*
 * This must be called with preemption disabled, or from a thread that
 * is pinned to the local cpu.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}

static void __init memcg_stock_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct memcg_stock_pcp *stock =
					&per_cpu(memcg_stock, cpu);
		INIT_WORK(&stock->work, drain_local_stock);
	}
}

/*
 * Cache charges (nr_pages) in the local per-cpu stock, to be consumed
 * by consume_stock() later.
 */
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != memcg) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = memcg;
	}
	stock->nr_pages += nr_pages;
	put_cpu_var(memcg_stock);
}

/*
 * Drain all per-CPU charge caches belonging to @root_memcg or to any
 * memcg in the hierarchy below it.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid adding more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	get_online_cpus();
	curcpu = get_cpu();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;

		memcg = stock->cached;
		if (!memcg || !stock->nr_pages)
			continue;
		if (!mem_cgroup_is_descendant(memcg, root_memcg))
			continue;
		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else
				schedule_work_on(cpu, &stock->work);
		}
	}
	put_cpu();
	put_online_cpus();
	mutex_unlock(&percpu_charge_mutex);
}

/*
 * This function drains the percpu counter values from a dead cpu and
 * moves them to the local cpu. Note that this function can be preempted.
2202 */ 2203 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2204 { 2205 int i; 2206 2207 spin_lock(&memcg->pcp_counter_lock); 2208 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2209 long x = per_cpu(memcg->stat->count[i], cpu); 2210 2211 per_cpu(memcg->stat->count[i], cpu) = 0; 2212 memcg->nocpu_base.count[i] += x; 2213 } 2214 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2215 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2216 2217 per_cpu(memcg->stat->events[i], cpu) = 0; 2218 memcg->nocpu_base.events[i] += x; 2219 } 2220 spin_unlock(&memcg->pcp_counter_lock); 2221 } 2222 2223 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2224 unsigned long action, 2225 void *hcpu) 2226 { 2227 int cpu = (unsigned long)hcpu; 2228 struct memcg_stock_pcp *stock; 2229 struct mem_cgroup *iter; 2230 2231 if (action == CPU_ONLINE) 2232 return NOTIFY_OK; 2233 2234 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2235 return NOTIFY_OK; 2236 2237 for_each_mem_cgroup(iter) 2238 mem_cgroup_drain_pcp_counter(iter, cpu); 2239 2240 stock = &per_cpu(memcg_stock, cpu); 2241 drain_stock(stock); 2242 return NOTIFY_OK; 2243 } 2244 2245 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2246 unsigned int nr_pages) 2247 { 2248 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2249 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2250 struct mem_cgroup *mem_over_limit; 2251 struct page_counter *counter; 2252 unsigned long nr_reclaimed; 2253 bool may_swap = true; 2254 bool drained = false; 2255 int ret = 0; 2256 2257 if (mem_cgroup_is_root(memcg)) 2258 goto done; 2259 retry: 2260 if (consume_stock(memcg, nr_pages)) 2261 goto done; 2262 2263 if (!do_swap_account || 2264 !page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2265 if (!page_counter_try_charge(&memcg->memory, batch, &counter)) 2266 goto done_restock; 2267 if (do_swap_account) 2268 page_counter_uncharge(&memcg->memsw, batch); 2269 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2270 } else { 2271 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2272 may_swap = false; 2273 } 2274 2275 if (batch > nr_pages) { 2276 batch = nr_pages; 2277 goto retry; 2278 } 2279 2280 /* 2281 * Unlike in global OOM situations, memcg is not in a physical 2282 * memory shortage. Allow dying and OOM-killed tasks to 2283 * bypass the last charges so that they can exit quickly and 2284 * free their memory. 2285 */ 2286 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2287 fatal_signal_pending(current) || 2288 current->flags & PF_EXITING)) 2289 goto bypass; 2290 2291 if (unlikely(task_in_memcg_oom(current))) 2292 goto nomem; 2293 2294 if (!(gfp_mask & __GFP_WAIT)) 2295 goto nomem; 2296 2297 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2298 gfp_mask, may_swap); 2299 2300 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2301 goto retry; 2302 2303 if (!drained) { 2304 drain_all_stock(mem_over_limit); 2305 drained = true; 2306 goto retry; 2307 } 2308 2309 if (gfp_mask & __GFP_NORETRY) 2310 goto nomem; 2311 /* 2312 * Even though the limit is exceeded at this point, reclaim 2313 * may have been able to free some pages. Retry the charge 2314 * before killing the task. 2315 * 2316 * Only for regular pages, though: huge pages are rather 2317 * unlikely to succeed so close to the limit, and we fall back 2318 * to regular pages anyway in case of failure. 
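	 *
	 * E.g. with PAGE_ALLOC_COSTLY_ORDER == 3, charges of up to 8 pages
	 * are retried here after reclaim made progress, while a THP charge
	 * (HPAGE_PMD_NR == 512 pages with 2MB huge pages on x86-64) falls
	 * through to the checks below.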
 */
	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
		goto retry;
	/*
	 * During task move, charges can temporarily be accounted twice.
	 * So it's better to wait until the move has finished if one is
	 * in progress.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		goto retry;

	if (nr_retries--)
		goto retry;

	if (gfp_mask & __GFP_NOFAIL)
		goto bypass;

	if (fatal_signal_pending(current))
		goto bypass;

	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
nomem:
	if (!(gfp_mask & __GFP_NOFAIL))
		return -ENOMEM;
bypass:
	return -EINTR;

done_restock:
	css_get_many(&memcg->css, batch);
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);
done:
	return ret;
}

static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (mem_cgroup_is_root(memcg))
		return;

	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_swap_account)
		page_counter_uncharge(&memcg->memsw, nr_pages);

	css_put_many(&memcg->css, nr_pages);
}

/*
 * A helper function to look up a mem_cgroup by ID.  Must be called
 * under rcu_read_lock().  The caller is responsible for calling
 * css_tryget_online() if the mem_cgroup is used for charging, because
 * this lookup may also return an already removed memcg (e.g. when a
 * swap reference is dropped).
 */
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
	/* ID 0 is unused */
	if (!id)
		return NULL;
	return mem_cgroup_from_id(id);
}

/*
 * try_get_mem_cgroup_from_page - look up page's memcg association
 * @page: the page
 *
 * Look up, get a css reference, and return the memcg that owns @page.
 *
 * The page must be locked to prevent racing with swap-in and page
 * cache charges.  If coming from an unlocked page table, the caller
 * must ensure the page is on the LRU or this can race with charging.
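 *
 * Minimal (illustrative) call pattern:
 *
 *	lock_page(page);
 *	memcg = try_get_mem_cgroup_from_page(page);
 *	if (memcg) {
 *		... inspect or account against memcg ...
 *		css_put(&memcg->css);
 *	}
 *	unlock_page(page);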
2388 */ 2389 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2390 { 2391 struct mem_cgroup *memcg; 2392 unsigned short id; 2393 swp_entry_t ent; 2394 2395 VM_BUG_ON_PAGE(!PageLocked(page), page); 2396 2397 memcg = page->mem_cgroup; 2398 if (memcg) { 2399 if (!css_tryget_online(&memcg->css)) 2400 memcg = NULL; 2401 } else if (PageSwapCache(page)) { 2402 ent.val = page_private(page); 2403 id = lookup_swap_cgroup_id(ent); 2404 rcu_read_lock(); 2405 memcg = mem_cgroup_lookup(id); 2406 if (memcg && !css_tryget_online(&memcg->css)) 2407 memcg = NULL; 2408 rcu_read_unlock(); 2409 } 2410 return memcg; 2411 } 2412 2413 static void lock_page_lru(struct page *page, int *isolated) 2414 { 2415 struct zone *zone = page_zone(page); 2416 2417 spin_lock_irq(&zone->lru_lock); 2418 if (PageLRU(page)) { 2419 struct lruvec *lruvec; 2420 2421 lruvec = mem_cgroup_page_lruvec(page, zone); 2422 ClearPageLRU(page); 2423 del_page_from_lru_list(page, lruvec, page_lru(page)); 2424 *isolated = 1; 2425 } else 2426 *isolated = 0; 2427 } 2428 2429 static void unlock_page_lru(struct page *page, int isolated) 2430 { 2431 struct zone *zone = page_zone(page); 2432 2433 if (isolated) { 2434 struct lruvec *lruvec; 2435 2436 lruvec = mem_cgroup_page_lruvec(page, zone); 2437 VM_BUG_ON_PAGE(PageLRU(page), page); 2438 SetPageLRU(page); 2439 add_page_to_lru_list(page, lruvec, page_lru(page)); 2440 } 2441 spin_unlock_irq(&zone->lru_lock); 2442 } 2443 2444 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2445 bool lrucare) 2446 { 2447 int isolated; 2448 2449 VM_BUG_ON_PAGE(page->mem_cgroup, page); 2450 2451 /* 2452 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2453 * may already be on some other mem_cgroup's LRU. Take care of it. 2454 */ 2455 if (lrucare) 2456 lock_page_lru(page, &isolated); 2457 2458 /* 2459 * Nobody should be changing or seriously looking at 2460 * page->mem_cgroup at this point: 2461 * 2462 * - the page is uncharged 2463 * 2464 * - the page is off-LRU 2465 * 2466 * - an anonymous fault has exclusive page access, except for 2467 * a locked page table 2468 * 2469 * - a page cache insertion, a swapin fault, or a migration 2470 * have the page locked 2471 */ 2472 page->mem_cgroup = memcg; 2473 2474 if (lrucare) 2475 unlock_page_lru(page, isolated); 2476 } 2477 2478 #ifdef CONFIG_MEMCG_KMEM 2479 /* 2480 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2481 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. 2482 */ 2483 static DEFINE_MUTEX(memcg_slab_mutex); 2484 2485 /* 2486 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2487 * in the memcg_cache_params struct. 2488 */ 2489 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2490 { 2491 struct kmem_cache *cachep; 2492 2493 VM_BUG_ON(p->is_root_cache); 2494 cachep = p->root_cache; 2495 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2496 } 2497 2498 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, 2499 unsigned long nr_pages) 2500 { 2501 struct page_counter *counter; 2502 int ret = 0; 2503 2504 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); 2505 if (ret < 0) 2506 return ret; 2507 2508 ret = try_charge(memcg, gfp, nr_pages); 2509 if (ret == -EINTR) { 2510 /* 2511 * try_charge() chose to bypass to root due to OOM kill or 2512 * fatal signal. 
Since our only options are to either fail 2513 * the allocation or charge it to this cgroup, do it as a 2514 * temporary condition. But we can't fail. From a kmem/slab 2515 * perspective, the cache has already been selected, by 2516 * mem_cgroup_kmem_get_cache(), so it is too late to change 2517 * our minds. 2518 * 2519 * This condition will only trigger if the task entered 2520 * memcg_charge_kmem in a sane state, but was OOM-killed 2521 * during try_charge() above. Tasks that were already dying 2522 * when the allocation triggers should have been already 2523 * directed to the root cgroup in memcontrol.h 2524 */ 2525 page_counter_charge(&memcg->memory, nr_pages); 2526 if (do_swap_account) 2527 page_counter_charge(&memcg->memsw, nr_pages); 2528 css_get_many(&memcg->css, nr_pages); 2529 ret = 0; 2530 } else if (ret) 2531 page_counter_uncharge(&memcg->kmem, nr_pages); 2532 2533 return ret; 2534 } 2535 2536 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, 2537 unsigned long nr_pages) 2538 { 2539 page_counter_uncharge(&memcg->memory, nr_pages); 2540 if (do_swap_account) 2541 page_counter_uncharge(&memcg->memsw, nr_pages); 2542 2543 page_counter_uncharge(&memcg->kmem, nr_pages); 2544 2545 css_put_many(&memcg->css, nr_pages); 2546 } 2547 2548 /* 2549 * helper for acessing a memcg's index. It will be used as an index in the 2550 * child cache array in kmem_cache, and also to derive its name. This function 2551 * will return -1 when this is not a kmem-limited memcg. 2552 */ 2553 int memcg_cache_id(struct mem_cgroup *memcg) 2554 { 2555 return memcg ? memcg->kmemcg_id : -1; 2556 } 2557 2558 static int memcg_alloc_cache_id(void) 2559 { 2560 int id, size; 2561 int err; 2562 2563 id = ida_simple_get(&kmem_limited_groups, 2564 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2565 if (id < 0) 2566 return id; 2567 2568 if (id < memcg_limited_groups_array_size) 2569 return id; 2570 2571 /* 2572 * There's no space for the new id in memcg_caches arrays, 2573 * so we have to grow them. 2574 */ 2575 2576 size = 2 * (id + 1); 2577 if (size < MEMCG_CACHES_MIN_SIZE) 2578 size = MEMCG_CACHES_MIN_SIZE; 2579 else if (size > MEMCG_CACHES_MAX_SIZE) 2580 size = MEMCG_CACHES_MAX_SIZE; 2581 2582 mutex_lock(&memcg_slab_mutex); 2583 err = memcg_update_all_caches(size); 2584 mutex_unlock(&memcg_slab_mutex); 2585 2586 if (err) { 2587 ida_simple_remove(&kmem_limited_groups, id); 2588 return err; 2589 } 2590 return id; 2591 } 2592 2593 static void memcg_free_cache_id(int id) 2594 { 2595 ida_simple_remove(&kmem_limited_groups, id); 2596 } 2597 2598 /* 2599 * We should update the current array size iff all caches updates succeed. This 2600 * can only be done from the slab side. The slab mutex needs to be held when 2601 * calling this. 2602 */ 2603 void memcg_update_array_size(int num) 2604 { 2605 memcg_limited_groups_array_size = num; 2606 } 2607 2608 static void memcg_register_cache(struct mem_cgroup *memcg, 2609 struct kmem_cache *root_cache) 2610 { 2611 static char memcg_name_buf[NAME_MAX + 1]; /* protected by 2612 memcg_slab_mutex */ 2613 struct kmem_cache *cachep; 2614 int id; 2615 2616 lockdep_assert_held(&memcg_slab_mutex); 2617 2618 id = memcg_cache_id(memcg); 2619 2620 /* 2621 * Since per-memcg caches are created asynchronously on first 2622 * allocation (see memcg_kmem_get_cache()), several threads can try to 2623 * create the same cache, but only one of them may succeed. 
2624 */ 2625 if (cache_from_memcg_idx(root_cache, id)) 2626 return; 2627 2628 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); 2629 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); 2630 /* 2631 * If we could not create a memcg cache, do not complain, because 2632 * that's not critical at all as we can always proceed with the root 2633 * cache. 2634 */ 2635 if (!cachep) 2636 return; 2637 2638 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 2639 2640 /* 2641 * Since readers won't lock (see cache_from_memcg_idx()), we need a 2642 * barrier here to ensure nobody will see the kmem_cache partially 2643 * initialized. 2644 */ 2645 smp_wmb(); 2646 2647 BUG_ON(root_cache->memcg_params->memcg_caches[id]); 2648 root_cache->memcg_params->memcg_caches[id] = cachep; 2649 } 2650 2651 static void memcg_unregister_cache(struct kmem_cache *cachep) 2652 { 2653 struct kmem_cache *root_cache; 2654 struct mem_cgroup *memcg; 2655 int id; 2656 2657 lockdep_assert_held(&memcg_slab_mutex); 2658 2659 BUG_ON(is_root_cache(cachep)); 2660 2661 root_cache = cachep->memcg_params->root_cache; 2662 memcg = cachep->memcg_params->memcg; 2663 id = memcg_cache_id(memcg); 2664 2665 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); 2666 root_cache->memcg_params->memcg_caches[id] = NULL; 2667 2668 list_del(&cachep->memcg_params->list); 2669 2670 kmem_cache_destroy(cachep); 2671 } 2672 2673 int __memcg_cleanup_cache_params(struct kmem_cache *s) 2674 { 2675 struct kmem_cache *c; 2676 int i, failed = 0; 2677 2678 mutex_lock(&memcg_slab_mutex); 2679 for_each_memcg_cache_index(i) { 2680 c = cache_from_memcg_idx(s, i); 2681 if (!c) 2682 continue; 2683 2684 memcg_unregister_cache(c); 2685 2686 if (cache_from_memcg_idx(s, i)) 2687 failed++; 2688 } 2689 mutex_unlock(&memcg_slab_mutex); 2690 return failed; 2691 } 2692 2693 static void memcg_unregister_all_caches(struct mem_cgroup *memcg) 2694 { 2695 struct kmem_cache *cachep; 2696 struct memcg_cache_params *params, *tmp; 2697 2698 if (!memcg_kmem_is_active(memcg)) 2699 return; 2700 2701 mutex_lock(&memcg_slab_mutex); 2702 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { 2703 cachep = memcg_params_to_cache(params); 2704 memcg_unregister_cache(cachep); 2705 } 2706 mutex_unlock(&memcg_slab_mutex); 2707 } 2708 2709 struct memcg_register_cache_work { 2710 struct mem_cgroup *memcg; 2711 struct kmem_cache *cachep; 2712 struct work_struct work; 2713 }; 2714 2715 static void memcg_register_cache_func(struct work_struct *w) 2716 { 2717 struct memcg_register_cache_work *cw = 2718 container_of(w, struct memcg_register_cache_work, work); 2719 struct mem_cgroup *memcg = cw->memcg; 2720 struct kmem_cache *cachep = cw->cachep; 2721 2722 mutex_lock(&memcg_slab_mutex); 2723 memcg_register_cache(memcg, cachep); 2724 mutex_unlock(&memcg_slab_mutex); 2725 2726 css_put(&memcg->css); 2727 kfree(cw); 2728 } 2729 2730 /* 2731 * Enqueue the creation of a per-memcg kmem_cache. 
2732 */ 2733 static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, 2734 struct kmem_cache *cachep) 2735 { 2736 struct memcg_register_cache_work *cw; 2737 2738 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2739 if (!cw) 2740 return; 2741 2742 css_get(&memcg->css); 2743 2744 cw->memcg = memcg; 2745 cw->cachep = cachep; 2746 2747 INIT_WORK(&cw->work, memcg_register_cache_func); 2748 schedule_work(&cw->work); 2749 } 2750 2751 static void memcg_schedule_register_cache(struct mem_cgroup *memcg, 2752 struct kmem_cache *cachep) 2753 { 2754 /* 2755 * We need to stop accounting when we kmalloc, because if the 2756 * corresponding kmalloc cache is not yet created, the first allocation 2757 * in __memcg_schedule_register_cache will recurse. 2758 * 2759 * However, it is better to enclose the whole function. Depending on 2760 * the debugging options enabled, INIT_WORK(), for instance, can 2761 * trigger an allocation. This too, will make us recurse. Because at 2762 * this point we can't allow ourselves back into memcg_kmem_get_cache, 2763 * the safest choice is to do it like this, wrapping the whole function. 2764 */ 2765 current->memcg_kmem_skip_account = 1; 2766 __memcg_schedule_register_cache(memcg, cachep); 2767 current->memcg_kmem_skip_account = 0; 2768 } 2769 2770 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 2771 { 2772 unsigned int nr_pages = 1 << order; 2773 2774 return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); 2775 } 2776 2777 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 2778 { 2779 unsigned int nr_pages = 1 << order; 2780 2781 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); 2782 } 2783 2784 /* 2785 * Return the kmem_cache we're supposed to use for a slab allocation. 2786 * We try to use the current memcg's version of the cache. 2787 * 2788 * If the cache does not exist yet, if we are the first user of it, 2789 * we either create it immediately, if possible, or create it asynchronously 2790 * in a workqueue. 2791 * In the latter case, we will let the current allocation go through with 2792 * the original cache. 2793 * 2794 * Can't be called in interrupt context or from kernel threads. 2795 * This function needs to be called with rcu_read_lock() held. 2796 */ 2797 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) 2798 { 2799 struct mem_cgroup *memcg; 2800 struct kmem_cache *memcg_cachep; 2801 2802 VM_BUG_ON(!cachep->memcg_params); 2803 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 2804 2805 if (current->memcg_kmem_skip_account) 2806 return cachep; 2807 2808 memcg = get_mem_cgroup_from_mm(current->mm); 2809 if (!memcg_kmem_is_active(memcg)) 2810 goto out; 2811 2812 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 2813 if (likely(memcg_cachep)) 2814 return memcg_cachep; 2815 2816 /* 2817 * If we are in a safe context (can wait, and not in interrupt 2818 * context), we could be be predictable and return right away. 2819 * This would guarantee that the allocation being performed 2820 * already belongs in the new cache. 2821 * 2822 * However, there are some clashes that can arrive from locking. 2823 * For instance, because we acquire the slab_mutex while doing 2824 * memcg_create_kmem_cache, this means no further allocation 2825 * could happen with the slab_mutex held. So it's better to 2826 * defer everything. 
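 *
 * For reference, the slab allocators are expected to bracket an
 * accounted allocation roughly as follows, via the memcg_kmem_get_cache()
 * and memcg_kmem_put_cache() wrappers in memcontrol.h (simplified
 * sketch, not exact code):
 *
 *	s = memcg_kmem_get_cache(cachep, gfpflags);
 *	objp = ...allocate from s...;
 *	memcg_kmem_put_cache(s);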
2827 */ 2828 memcg_schedule_register_cache(memcg, cachep); 2829 out: 2830 css_put(&memcg->css); 2831 return cachep; 2832 } 2833 2834 void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2835 { 2836 if (!is_root_cache(cachep)) 2837 css_put(&cachep->memcg_params->memcg->css); 2838 } 2839 2840 /* 2841 * We need to verify if the allocation against current->mm->owner's memcg is 2842 * possible for the given order. But the page is not allocated yet, so we'll 2843 * need a further commit step to do the final arrangements. 2844 * 2845 * It is possible for the task to switch cgroups in this mean time, so at 2846 * commit time, we can't rely on task conversion any longer. We'll then use 2847 * the handle argument to return to the caller which cgroup we should commit 2848 * against. We could also return the memcg directly and avoid the pointer 2849 * passing, but a boolean return value gives better semantics considering 2850 * the compiled-out case as well. 2851 * 2852 * Returning true means the allocation is possible. 2853 */ 2854 bool 2855 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 2856 { 2857 struct mem_cgroup *memcg; 2858 int ret; 2859 2860 *_memcg = NULL; 2861 2862 memcg = get_mem_cgroup_from_mm(current->mm); 2863 2864 if (!memcg_kmem_is_active(memcg)) { 2865 css_put(&memcg->css); 2866 return true; 2867 } 2868 2869 ret = memcg_charge_kmem(memcg, gfp, 1 << order); 2870 if (!ret) 2871 *_memcg = memcg; 2872 2873 css_put(&memcg->css); 2874 return (ret == 0); 2875 } 2876 2877 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2878 int order) 2879 { 2880 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2881 2882 /* The page allocation failed. Revert */ 2883 if (!page) { 2884 memcg_uncharge_kmem(memcg, 1 << order); 2885 return; 2886 } 2887 page->mem_cgroup = memcg; 2888 } 2889 2890 void __memcg_kmem_uncharge_pages(struct page *page, int order) 2891 { 2892 struct mem_cgroup *memcg = page->mem_cgroup; 2893 2894 if (!memcg) 2895 return; 2896 2897 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2898 2899 memcg_uncharge_kmem(memcg, 1 << order); 2900 page->mem_cgroup = NULL; 2901 } 2902 #endif /* CONFIG_MEMCG_KMEM */ 2903 2904 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2905 2906 /* 2907 * Because tail pages are not marked as "used", set it. We're under 2908 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2909 * charge/uncharge will be never happen and move_account() is done under 2910 * compound_lock(), so we don't have to take care of races. 2911 */ 2912 void mem_cgroup_split_huge_fixup(struct page *head) 2913 { 2914 int i; 2915 2916 if (mem_cgroup_disabled()) 2917 return; 2918 2919 for (i = 1; i < HPAGE_PMD_NR; i++) 2920 head[i].mem_cgroup = head->mem_cgroup; 2921 2922 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 2923 HPAGE_PMD_NR); 2924 } 2925 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2926 2927 /** 2928 * mem_cgroup_move_account - move account of the page 2929 * @page: the page 2930 * @nr_pages: number of regular pages (>1 for huge pages) 2931 * @from: mem_cgroup which the page is moved from. 2932 * @to: mem_cgroup which the page is moved to. @from != @to. 2933 * 2934 * The caller must confirm following. 2935 * - page is not on LRU (isolate_page() is useful.) 2936 * - compound_lock is held when nr_pages > 1 2937 * 2938 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 2939 * from old cgroup. 
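 *
 * A caller (e.g. the task-migration path) is therefore expected to do
 * roughly the following (illustrative, not exact code):
 *
 *	precharge nr_pages to @to;
 *	isolate_lru_page(page);
 *	if (!mem_cgroup_move_account(page, nr_pages, from, to))
 *		uncharge nr_pages from @from;
 *	putback_lru_page(page);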
2940 */ 2941 static int mem_cgroup_move_account(struct page *page, 2942 unsigned int nr_pages, 2943 struct mem_cgroup *from, 2944 struct mem_cgroup *to) 2945 { 2946 unsigned long flags; 2947 int ret; 2948 2949 VM_BUG_ON(from == to); 2950 VM_BUG_ON_PAGE(PageLRU(page), page); 2951 /* 2952 * The page is isolated from LRU. So, collapse function 2953 * will not handle this page. But page splitting can happen. 2954 * Do this check under compound_page_lock(). The caller should 2955 * hold it. 2956 */ 2957 ret = -EBUSY; 2958 if (nr_pages > 1 && !PageTransHuge(page)) 2959 goto out; 2960 2961 /* 2962 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup 2963 * of its source page while we change it: page migration takes 2964 * both pages off the LRU, but page cache replacement doesn't. 2965 */ 2966 if (!trylock_page(page)) 2967 goto out; 2968 2969 ret = -EINVAL; 2970 if (page->mem_cgroup != from) 2971 goto out_unlock; 2972 2973 spin_lock_irqsave(&from->move_lock, flags); 2974 2975 if (!PageAnon(page) && page_mapped(page)) { 2976 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 2977 nr_pages); 2978 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 2979 nr_pages); 2980 } 2981 2982 if (PageWriteback(page)) { 2983 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 2984 nr_pages); 2985 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 2986 nr_pages); 2987 } 2988 2989 /* 2990 * It is safe to change page->mem_cgroup here because the page 2991 * is referenced, charged, and isolated - we can't race with 2992 * uncharging, charging, migration, or LRU putback. 2993 */ 2994 2995 /* caller should have done css_get */ 2996 page->mem_cgroup = to; 2997 spin_unlock_irqrestore(&from->move_lock, flags); 2998 2999 ret = 0; 3000 3001 local_irq_disable(); 3002 mem_cgroup_charge_statistics(to, page, nr_pages); 3003 memcg_check_events(to, page); 3004 mem_cgroup_charge_statistics(from, page, -nr_pages); 3005 memcg_check_events(from, page); 3006 local_irq_enable(); 3007 out_unlock: 3008 unlock_page(page); 3009 out: 3010 return ret; 3011 } 3012 3013 #ifdef CONFIG_MEMCG_SWAP 3014 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 3015 bool charge) 3016 { 3017 int val = (charge) ? 1 : -1; 3018 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 3019 } 3020 3021 /** 3022 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3023 * @entry: swap entry to be moved 3024 * @from: mem_cgroup which the entry is moved from 3025 * @to: mem_cgroup which the entry is moved to 3026 * 3027 * It succeeds only when the swap_cgroup's record for this entry is the same 3028 * as the mem_cgroup's id of @from. 3029 * 3030 * Returns 0 on success, -EINVAL on failure. 3031 * 3032 * The caller must have charged to @to, IOW, called page_counter_charge() about 3033 * both res and memsw, and called css_get(). 3034 */ 3035 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3036 struct mem_cgroup *from, struct mem_cgroup *to) 3037 { 3038 unsigned short old_id, new_id; 3039 3040 old_id = mem_cgroup_id(from); 3041 new_id = mem_cgroup_id(to); 3042 3043 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3044 mem_cgroup_swap_statistics(from, false); 3045 mem_cgroup_swap_statistics(to, true); 3046 /* 3047 * This function is only called from task migration context now. 3048 * It postpones page_counter and refcount handling till the end 3049 * of task migration(mem_cgroup_clear_mc()) for performance 3050 * improvement. 
But we cannot postpone css_get(to) because if 3051 * the process that has been moved to @to does swap-in, the 3052 * refcount of @to might be decreased to 0. 3053 * 3054 * We are in attach() phase, so the cgroup is guaranteed to be 3055 * alive, so we can just call css_get(). 3056 */ 3057 css_get(&to->css); 3058 return 0; 3059 } 3060 return -EINVAL; 3061 } 3062 #else 3063 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3064 struct mem_cgroup *from, struct mem_cgroup *to) 3065 { 3066 return -EINVAL; 3067 } 3068 #endif 3069 3070 static DEFINE_MUTEX(memcg_limit_mutex); 3071 3072 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3073 unsigned long limit) 3074 { 3075 unsigned long curusage; 3076 unsigned long oldusage; 3077 bool enlarge = false; 3078 int retry_count; 3079 int ret; 3080 3081 /* 3082 * For keeping hierarchical_reclaim simple, how long we should retry 3083 * is depends on callers. We set our retry-count to be function 3084 * of # of children which we should visit in this loop. 3085 */ 3086 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3087 mem_cgroup_count_children(memcg); 3088 3089 oldusage = page_counter_read(&memcg->memory); 3090 3091 do { 3092 if (signal_pending(current)) { 3093 ret = -EINTR; 3094 break; 3095 } 3096 3097 mutex_lock(&memcg_limit_mutex); 3098 if (limit > memcg->memsw.limit) { 3099 mutex_unlock(&memcg_limit_mutex); 3100 ret = -EINVAL; 3101 break; 3102 } 3103 if (limit > memcg->memory.limit) 3104 enlarge = true; 3105 ret = page_counter_limit(&memcg->memory, limit); 3106 mutex_unlock(&memcg_limit_mutex); 3107 3108 if (!ret) 3109 break; 3110 3111 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3112 3113 curusage = page_counter_read(&memcg->memory); 3114 /* Usage is reduced ? */ 3115 if (curusage >= oldusage) 3116 retry_count--; 3117 else 3118 oldusage = curusage; 3119 } while (retry_count); 3120 3121 if (!ret && enlarge) 3122 memcg_oom_recover(memcg); 3123 3124 return ret; 3125 } 3126 3127 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3128 unsigned long limit) 3129 { 3130 unsigned long curusage; 3131 unsigned long oldusage; 3132 bool enlarge = false; 3133 int retry_count; 3134 int ret; 3135 3136 /* see mem_cgroup_resize_res_limit */ 3137 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3138 mem_cgroup_count_children(memcg); 3139 3140 oldusage = page_counter_read(&memcg->memsw); 3141 3142 do { 3143 if (signal_pending(current)) { 3144 ret = -EINTR; 3145 break; 3146 } 3147 3148 mutex_lock(&memcg_limit_mutex); 3149 if (limit < memcg->memory.limit) { 3150 mutex_unlock(&memcg_limit_mutex); 3151 ret = -EINVAL; 3152 break; 3153 } 3154 if (limit > memcg->memsw.limit) 3155 enlarge = true; 3156 ret = page_counter_limit(&memcg->memsw, limit); 3157 mutex_unlock(&memcg_limit_mutex); 3158 3159 if (!ret) 3160 break; 3161 3162 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3163 3164 curusage = page_counter_read(&memcg->memsw); 3165 /* Usage is reduced ? 
*/ 3166 if (curusage >= oldusage) 3167 retry_count--; 3168 else 3169 oldusage = curusage; 3170 } while (retry_count); 3171 3172 if (!ret && enlarge) 3173 memcg_oom_recover(memcg); 3174 3175 return ret; 3176 } 3177 3178 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3179 gfp_t gfp_mask, 3180 unsigned long *total_scanned) 3181 { 3182 unsigned long nr_reclaimed = 0; 3183 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3184 unsigned long reclaimed; 3185 int loop = 0; 3186 struct mem_cgroup_tree_per_zone *mctz; 3187 unsigned long excess; 3188 unsigned long nr_scanned; 3189 3190 if (order > 0) 3191 return 0; 3192 3193 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3194 /* 3195 * This loop can run a while, specially if mem_cgroup's continuously 3196 * keep exceeding their soft limit and putting the system under 3197 * pressure 3198 */ 3199 do { 3200 if (next_mz) 3201 mz = next_mz; 3202 else 3203 mz = mem_cgroup_largest_soft_limit_node(mctz); 3204 if (!mz) 3205 break; 3206 3207 nr_scanned = 0; 3208 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 3209 gfp_mask, &nr_scanned); 3210 nr_reclaimed += reclaimed; 3211 *total_scanned += nr_scanned; 3212 spin_lock_irq(&mctz->lock); 3213 __mem_cgroup_remove_exceeded(mz, mctz); 3214 3215 /* 3216 * If we failed to reclaim anything from this memory cgroup 3217 * it is time to move on to the next cgroup 3218 */ 3219 next_mz = NULL; 3220 if (!reclaimed) 3221 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 3222 3223 excess = soft_limit_excess(mz->memcg); 3224 /* 3225 * One school of thought says that we should not add 3226 * back the node to the tree if reclaim returns 0. 3227 * But our reclaim could return 0, simply because due 3228 * to priority we are exposing a smaller subset of 3229 * memory to reclaim from. Consider this as a longer 3230 * term TODO. 3231 */ 3232 /* If excess == 0, no tree ops */ 3233 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3234 spin_unlock_irq(&mctz->lock); 3235 css_put(&mz->memcg->css); 3236 loop++; 3237 /* 3238 * Could not reclaim anything and there are no more 3239 * mem cgroups to try or we seem to be looping without 3240 * reclaiming anything. 3241 */ 3242 if (!nr_reclaimed && 3243 (next_mz == NULL || 3244 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3245 break; 3246 } while (!nr_reclaimed); 3247 if (next_mz) 3248 css_put(&next_mz->memcg->css); 3249 return nr_reclaimed; 3250 } 3251 3252 /* 3253 * Test whether @memcg has children, dead or alive. Note that this 3254 * function doesn't care whether @memcg has use_hierarchy enabled and 3255 * returns %true if there are child csses according to the cgroup 3256 * hierarchy. Testing use_hierarchy is the caller's responsiblity. 3257 */ 3258 static inline bool memcg_has_children(struct mem_cgroup *memcg) 3259 { 3260 bool ret; 3261 3262 /* 3263 * The lock does not prevent addition or deletion of children, but 3264 * it prevents a new child from being initialized based on this 3265 * parent in css_online(), so it's enough to decide whether 3266 * hierarchically inherited attributes can still be changed or not. 3267 */ 3268 lockdep_assert_held(&memcg_create_mutex); 3269 3270 rcu_read_lock(); 3271 ret = css_next_child(NULL, &memcg->css); 3272 rcu_read_unlock(); 3273 return ret; 3274 } 3275 3276 /* 3277 * Reclaims as many pages from the given memcg as possible and moves 3278 * the rest to the parent. 3279 * 3280 * Caller is responsible for holding css reference for memcg. 
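 *
 * This is what backs the memory.force_empty control file (see
 * mem_cgroup_force_empty_write() below), typically used as, assuming
 * the usual cgroup mount point:
 *
 *	# echo 0 > /sys/fs/cgroup/memory/<group>/memory.force_empty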
3281 */ 3282 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3283 { 3284 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3285 3286 /* we call try-to-free pages for make this cgroup empty */ 3287 lru_add_drain_all(); 3288 /* try to free all pages in this cgroup */ 3289 while (nr_retries && page_counter_read(&memcg->memory)) { 3290 int progress; 3291 3292 if (signal_pending(current)) 3293 return -EINTR; 3294 3295 progress = try_to_free_mem_cgroup_pages(memcg, 1, 3296 GFP_KERNEL, true); 3297 if (!progress) { 3298 nr_retries--; 3299 /* maybe some writeback is necessary */ 3300 congestion_wait(BLK_RW_ASYNC, HZ/10); 3301 } 3302 3303 } 3304 3305 return 0; 3306 } 3307 3308 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3309 char *buf, size_t nbytes, 3310 loff_t off) 3311 { 3312 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3313 3314 if (mem_cgroup_is_root(memcg)) 3315 return -EINVAL; 3316 return mem_cgroup_force_empty(memcg) ?: nbytes; 3317 } 3318 3319 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3320 struct cftype *cft) 3321 { 3322 return mem_cgroup_from_css(css)->use_hierarchy; 3323 } 3324 3325 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3326 struct cftype *cft, u64 val) 3327 { 3328 int retval = 0; 3329 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3330 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 3331 3332 mutex_lock(&memcg_create_mutex); 3333 3334 if (memcg->use_hierarchy == val) 3335 goto out; 3336 3337 /* 3338 * If parent's use_hierarchy is set, we can't make any modifications 3339 * in the child subtrees. If it is unset, then the change can 3340 * occur, provided the current cgroup has no children. 3341 * 3342 * For the root cgroup, parent_mem is NULL, we allow value to be 3343 * set if there are no children. 3344 */ 3345 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3346 (val == 1 || val == 0)) { 3347 if (!memcg_has_children(memcg)) 3348 memcg->use_hierarchy = val; 3349 else 3350 retval = -EBUSY; 3351 } else 3352 retval = -EINVAL; 3353 3354 out: 3355 mutex_unlock(&memcg_create_mutex); 3356 3357 return retval; 3358 } 3359 3360 static unsigned long tree_stat(struct mem_cgroup *memcg, 3361 enum mem_cgroup_stat_index idx) 3362 { 3363 struct mem_cgroup *iter; 3364 long val = 0; 3365 3366 /* Per-cpu values can be negative, use a signed accumulator */ 3367 for_each_mem_cgroup_tree(iter, memcg) 3368 val += mem_cgroup_read_stat(iter, idx); 3369 3370 if (val < 0) /* race ? 
*/ 3371 val = 0; 3372 return val; 3373 } 3374 3375 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3376 { 3377 u64 val; 3378 3379 if (mem_cgroup_is_root(memcg)) { 3380 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); 3381 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); 3382 if (swap) 3383 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); 3384 } else { 3385 if (!swap) 3386 val = page_counter_read(&memcg->memory); 3387 else 3388 val = page_counter_read(&memcg->memsw); 3389 } 3390 return val << PAGE_SHIFT; 3391 } 3392 3393 enum { 3394 RES_USAGE, 3395 RES_LIMIT, 3396 RES_MAX_USAGE, 3397 RES_FAILCNT, 3398 RES_SOFT_LIMIT, 3399 }; 3400 3401 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3402 struct cftype *cft) 3403 { 3404 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3405 struct page_counter *counter; 3406 3407 switch (MEMFILE_TYPE(cft->private)) { 3408 case _MEM: 3409 counter = &memcg->memory; 3410 break; 3411 case _MEMSWAP: 3412 counter = &memcg->memsw; 3413 break; 3414 case _KMEM: 3415 counter = &memcg->kmem; 3416 break; 3417 default: 3418 BUG(); 3419 } 3420 3421 switch (MEMFILE_ATTR(cft->private)) { 3422 case RES_USAGE: 3423 if (counter == &memcg->memory) 3424 return mem_cgroup_usage(memcg, false); 3425 if (counter == &memcg->memsw) 3426 return mem_cgroup_usage(memcg, true); 3427 return (u64)page_counter_read(counter) * PAGE_SIZE; 3428 case RES_LIMIT: 3429 return (u64)counter->limit * PAGE_SIZE; 3430 case RES_MAX_USAGE: 3431 return (u64)counter->watermark * PAGE_SIZE; 3432 case RES_FAILCNT: 3433 return counter->failcnt; 3434 case RES_SOFT_LIMIT: 3435 return (u64)memcg->soft_limit * PAGE_SIZE; 3436 default: 3437 BUG(); 3438 } 3439 } 3440 3441 #ifdef CONFIG_MEMCG_KMEM 3442 static int memcg_activate_kmem(struct mem_cgroup *memcg, 3443 unsigned long nr_pages) 3444 { 3445 int err = 0; 3446 int memcg_id; 3447 3448 if (memcg_kmem_is_active(memcg)) 3449 return 0; 3450 3451 /* 3452 * For simplicity, we won't allow this to be disabled. It also can't 3453 * be changed if the cgroup has children already, or if tasks had 3454 * already joined. 3455 * 3456 * If tasks join before we set the limit, a person looking at 3457 * kmem.usage_in_bytes will have no way to determine when it took 3458 * place, which makes the value quite meaningless. 3459 * 3460 * After it first became limited, changes in the value of the limit are 3461 * of course permitted. 3462 */ 3463 mutex_lock(&memcg_create_mutex); 3464 if (cgroup_has_tasks(memcg->css.cgroup) || 3465 (memcg->use_hierarchy && memcg_has_children(memcg))) 3466 err = -EBUSY; 3467 mutex_unlock(&memcg_create_mutex); 3468 if (err) 3469 goto out; 3470 3471 memcg_id = memcg_alloc_cache_id(); 3472 if (memcg_id < 0) { 3473 err = memcg_id; 3474 goto out; 3475 } 3476 3477 /* 3478 * We couldn't have accounted to this cgroup, because it hasn't got 3479 * activated yet, so this should succeed. 3480 */ 3481 err = page_counter_limit(&memcg->kmem, nr_pages); 3482 VM_BUG_ON(err); 3483 3484 static_key_slow_inc(&memcg_kmem_enabled_key); 3485 /* 3486 * A memory cgroup is considered kmem-active as soon as it gets 3487 * kmemcg_id. Setting the id after enabling static branching will 3488 * guarantee no one starts accounting before all call sites are 3489 * patched. 
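	 *
	 * Activation is normally triggered from userspace by writing the
	 * first kmem limit, e.g. (assuming the usual cgroup mount point):
	 *
	 *	# echo 512M > /sys/fs/cgroup/memory/<group>/memory.kmem.limit_in_bytes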
3490 */ 3491 memcg->kmemcg_id = memcg_id; 3492 out: 3493 return err; 3494 } 3495 3496 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3497 unsigned long limit) 3498 { 3499 int ret; 3500 3501 mutex_lock(&memcg_limit_mutex); 3502 if (!memcg_kmem_is_active(memcg)) 3503 ret = memcg_activate_kmem(memcg, limit); 3504 else 3505 ret = page_counter_limit(&memcg->kmem, limit); 3506 mutex_unlock(&memcg_limit_mutex); 3507 return ret; 3508 } 3509 3510 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 3511 { 3512 int ret = 0; 3513 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 3514 3515 if (!parent) 3516 return 0; 3517 3518 mutex_lock(&memcg_limit_mutex); 3519 /* 3520 * If the parent cgroup is not kmem-active now, it cannot be activated 3521 * after this point, because it has at least one child already. 3522 */ 3523 if (memcg_kmem_is_active(parent)) 3524 ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); 3525 mutex_unlock(&memcg_limit_mutex); 3526 return ret; 3527 } 3528 #else 3529 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3530 unsigned long limit) 3531 { 3532 return -EINVAL; 3533 } 3534 #endif /* CONFIG_MEMCG_KMEM */ 3535 3536 /* 3537 * The user of this function is... 3538 * RES_LIMIT. 3539 */ 3540 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3541 char *buf, size_t nbytes, loff_t off) 3542 { 3543 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3544 unsigned long nr_pages; 3545 int ret; 3546 3547 buf = strstrip(buf); 3548 ret = page_counter_memparse(buf, &nr_pages); 3549 if (ret) 3550 return ret; 3551 3552 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3553 case RES_LIMIT: 3554 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3555 ret = -EINVAL; 3556 break; 3557 } 3558 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3559 case _MEM: 3560 ret = mem_cgroup_resize_limit(memcg, nr_pages); 3561 break; 3562 case _MEMSWAP: 3563 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); 3564 break; 3565 case _KMEM: 3566 ret = memcg_update_kmem_limit(memcg, nr_pages); 3567 break; 3568 } 3569 break; 3570 case RES_SOFT_LIMIT: 3571 memcg->soft_limit = nr_pages; 3572 ret = 0; 3573 break; 3574 } 3575 return ret ?: nbytes; 3576 } 3577 3578 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3579 size_t nbytes, loff_t off) 3580 { 3581 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3582 struct page_counter *counter; 3583 3584 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3585 case _MEM: 3586 counter = &memcg->memory; 3587 break; 3588 case _MEMSWAP: 3589 counter = &memcg->memsw; 3590 break; 3591 case _KMEM: 3592 counter = &memcg->kmem; 3593 break; 3594 default: 3595 BUG(); 3596 } 3597 3598 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3599 case RES_MAX_USAGE: 3600 page_counter_reset_watermark(counter); 3601 break; 3602 case RES_FAILCNT: 3603 counter->failcnt = 0; 3604 break; 3605 default: 3606 BUG(); 3607 } 3608 3609 return nbytes; 3610 } 3611 3612 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3613 struct cftype *cft) 3614 { 3615 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3616 } 3617 3618 #ifdef CONFIG_MMU 3619 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3620 struct cftype *cft, u64 val) 3621 { 3622 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3623 3624 if (val >= (1 << NR_MOVE_TYPE)) 3625 return -EINVAL; 3626 3627 /* 3628 * No kind of locking is needed in here, because ->can_attach() will 3629 * check this value once in the beginning 
of the process, and then carry 3630 * on with stale data. This means that changes to this value will only 3631 * affect task migrations starting after the change. 3632 */ 3633 memcg->move_charge_at_immigrate = val; 3634 return 0; 3635 } 3636 #else 3637 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3638 struct cftype *cft, u64 val) 3639 { 3640 return -ENOSYS; 3641 } 3642 #endif 3643 3644 #ifdef CONFIG_NUMA 3645 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3646 { 3647 struct numa_stat { 3648 const char *name; 3649 unsigned int lru_mask; 3650 }; 3651 3652 static const struct numa_stat stats[] = { 3653 { "total", LRU_ALL }, 3654 { "file", LRU_ALL_FILE }, 3655 { "anon", LRU_ALL_ANON }, 3656 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3657 }; 3658 const struct numa_stat *stat; 3659 int nid; 3660 unsigned long nr; 3661 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3662 3663 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3664 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 3665 seq_printf(m, "%s=%lu", stat->name, nr); 3666 for_each_node_state(nid, N_MEMORY) { 3667 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 3668 stat->lru_mask); 3669 seq_printf(m, " N%d=%lu", nid, nr); 3670 } 3671 seq_putc(m, '\n'); 3672 } 3673 3674 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3675 struct mem_cgroup *iter; 3676 3677 nr = 0; 3678 for_each_mem_cgroup_tree(iter, memcg) 3679 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 3680 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 3681 for_each_node_state(nid, N_MEMORY) { 3682 nr = 0; 3683 for_each_mem_cgroup_tree(iter, memcg) 3684 nr += mem_cgroup_node_nr_lru_pages( 3685 iter, nid, stat->lru_mask); 3686 seq_printf(m, " N%d=%lu", nid, nr); 3687 } 3688 seq_putc(m, '\n'); 3689 } 3690 3691 return 0; 3692 } 3693 #endif /* CONFIG_NUMA */ 3694 3695 static int memcg_stat_show(struct seq_file *m, void *v) 3696 { 3697 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3698 unsigned long memory, memsw; 3699 struct mem_cgroup *mi; 3700 unsigned int i; 3701 3702 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3703 3704 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3705 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3706 continue; 3707 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 3708 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 3709 } 3710 3711 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 3712 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 3713 mem_cgroup_read_events(memcg, i)); 3714 3715 for (i = 0; i < NR_LRU_LISTS; i++) 3716 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 3717 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3718 3719 /* Hierarchical information */ 3720 memory = memsw = PAGE_COUNTER_MAX; 3721 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 3722 memory = min(memory, mi->memory.limit); 3723 memsw = min(memsw, mi->memsw.limit); 3724 } 3725 seq_printf(m, "hierarchical_memory_limit %llu\n", 3726 (u64)memory * PAGE_SIZE); 3727 if (do_swap_account) 3728 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3729 (u64)memsw * PAGE_SIZE); 3730 3731 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3732 long long val = 0; 3733 3734 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3735 continue; 3736 for_each_mem_cgroup_tree(mi, memcg) 3737 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 3738 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 3739 } 3740 3741 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 3742 
unsigned long long val = 0; 3743 3744 for_each_mem_cgroup_tree(mi, memcg) 3745 val += mem_cgroup_read_events(mi, i); 3746 seq_printf(m, "total_%s %llu\n", 3747 mem_cgroup_events_names[i], val); 3748 } 3749 3750 for (i = 0; i < NR_LRU_LISTS; i++) { 3751 unsigned long long val = 0; 3752 3753 for_each_mem_cgroup_tree(mi, memcg) 3754 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 3755 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 3756 } 3757 3758 #ifdef CONFIG_DEBUG_VM 3759 { 3760 int nid, zid; 3761 struct mem_cgroup_per_zone *mz; 3762 struct zone_reclaim_stat *rstat; 3763 unsigned long recent_rotated[2] = {0, 0}; 3764 unsigned long recent_scanned[2] = {0, 0}; 3765 3766 for_each_online_node(nid) 3767 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3768 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 3769 rstat = &mz->lruvec.reclaim_stat; 3770 3771 recent_rotated[0] += rstat->recent_rotated[0]; 3772 recent_rotated[1] += rstat->recent_rotated[1]; 3773 recent_scanned[0] += rstat->recent_scanned[0]; 3774 recent_scanned[1] += rstat->recent_scanned[1]; 3775 } 3776 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 3777 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 3778 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 3779 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 3780 } 3781 #endif 3782 3783 return 0; 3784 } 3785 3786 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 3787 struct cftype *cft) 3788 { 3789 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3790 3791 return mem_cgroup_swappiness(memcg); 3792 } 3793 3794 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 3795 struct cftype *cft, u64 val) 3796 { 3797 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3798 3799 if (val > 100) 3800 return -EINVAL; 3801 3802 if (css->parent) 3803 memcg->swappiness = val; 3804 else 3805 vm_swappiness = val; 3806 3807 return 0; 3808 } 3809 3810 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3811 { 3812 struct mem_cgroup_threshold_ary *t; 3813 unsigned long usage; 3814 int i; 3815 3816 rcu_read_lock(); 3817 if (!swap) 3818 t = rcu_dereference(memcg->thresholds.primary); 3819 else 3820 t = rcu_dereference(memcg->memsw_thresholds.primary); 3821 3822 if (!t) 3823 goto unlock; 3824 3825 usage = mem_cgroup_usage(memcg, swap); 3826 3827 /* 3828 * current_threshold points to threshold just below or equal to usage. 3829 * If it's not true, a threshold was crossed after last 3830 * call of __mem_cgroup_threshold(). 3831 */ 3832 i = t->current_threshold; 3833 3834 /* 3835 * Iterate backward over array of thresholds starting from 3836 * current_threshold and check if a threshold is crossed. 3837 * If none of thresholds below usage is crossed, we read 3838 * only one element of the array here. 3839 */ 3840 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3841 eventfd_signal(t->entries[i].eventfd, 1); 3842 3843 /* i = current_threshold + 1 */ 3844 i++; 3845 3846 /* 3847 * Iterate forward over array of thresholds starting from 3848 * current_threshold+1 and check if a threshold is crossed. 3849 * If none of thresholds above usage is crossed, we read 3850 * only one element of the array here. 
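	 *
	 * Worked example: with thresholds at 4M, 8M and 16M and
	 * current_threshold at the 4M entry, a usage of 12M signals only
	 * the 8M eventfd in this loop and leaves current_threshold at the
	 * 8M entry afterwards.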
3851 */ 3852 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3853 eventfd_signal(t->entries[i].eventfd, 1); 3854 3855 /* Update current_threshold */ 3856 t->current_threshold = i - 1; 3857 unlock: 3858 rcu_read_unlock(); 3859 } 3860 3861 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3862 { 3863 while (memcg) { 3864 __mem_cgroup_threshold(memcg, false); 3865 if (do_swap_account) 3866 __mem_cgroup_threshold(memcg, true); 3867 3868 memcg = parent_mem_cgroup(memcg); 3869 } 3870 } 3871 3872 static int compare_thresholds(const void *a, const void *b) 3873 { 3874 const struct mem_cgroup_threshold *_a = a; 3875 const struct mem_cgroup_threshold *_b = b; 3876 3877 if (_a->threshold > _b->threshold) 3878 return 1; 3879 3880 if (_a->threshold < _b->threshold) 3881 return -1; 3882 3883 return 0; 3884 } 3885 3886 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 3887 { 3888 struct mem_cgroup_eventfd_list *ev; 3889 3890 spin_lock(&memcg_oom_lock); 3891 3892 list_for_each_entry(ev, &memcg->oom_notify, list) 3893 eventfd_signal(ev->eventfd, 1); 3894 3895 spin_unlock(&memcg_oom_lock); 3896 return 0; 3897 } 3898 3899 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 3900 { 3901 struct mem_cgroup *iter; 3902 3903 for_each_mem_cgroup_tree(iter, memcg) 3904 mem_cgroup_oom_notify_cb(iter); 3905 } 3906 3907 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3908 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 3909 { 3910 struct mem_cgroup_thresholds *thresholds; 3911 struct mem_cgroup_threshold_ary *new; 3912 unsigned long threshold; 3913 unsigned long usage; 3914 int i, size, ret; 3915 3916 ret = page_counter_memparse(args, &threshold); 3917 if (ret) 3918 return ret; 3919 3920 mutex_lock(&memcg->thresholds_lock); 3921 3922 if (type == _MEM) { 3923 thresholds = &memcg->thresholds; 3924 usage = mem_cgroup_usage(memcg, false); 3925 } else if (type == _MEMSWAP) { 3926 thresholds = &memcg->memsw_thresholds; 3927 usage = mem_cgroup_usage(memcg, true); 3928 } else 3929 BUG(); 3930 3931 /* Check if a threshold crossed before adding a new one */ 3932 if (thresholds->primary) 3933 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3934 3935 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3936 3937 /* Allocate memory for new array of thresholds */ 3938 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3939 GFP_KERNEL); 3940 if (!new) { 3941 ret = -ENOMEM; 3942 goto unlock; 3943 } 3944 new->size = size; 3945 3946 /* Copy thresholds (if any) to new array */ 3947 if (thresholds->primary) { 3948 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3949 sizeof(struct mem_cgroup_threshold)); 3950 } 3951 3952 /* Add new threshold */ 3953 new->entries[size - 1].eventfd = eventfd; 3954 new->entries[size - 1].threshold = threshold; 3955 3956 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3957 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3958 compare_thresholds, NULL); 3959 3960 /* Find current threshold */ 3961 new->current_threshold = -1; 3962 for (i = 0; i < size; i++) { 3963 if (new->entries[i].threshold <= usage) { 3964 /* 3965 * new->current_threshold will not be used until 3966 * rcu_assign_pointer(), so it's safe to increment 3967 * it here. 
3968 */ 3969 ++new->current_threshold; 3970 } else 3971 break; 3972 } 3973 3974 /* Free old spare buffer and save old primary buffer as spare */ 3975 kfree(thresholds->spare); 3976 thresholds->spare = thresholds->primary; 3977 3978 rcu_assign_pointer(thresholds->primary, new); 3979 3980 /* To be sure that nobody uses thresholds */ 3981 synchronize_rcu(); 3982 3983 unlock: 3984 mutex_unlock(&memcg->thresholds_lock); 3985 3986 return ret; 3987 } 3988 3989 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3990 struct eventfd_ctx *eventfd, const char *args) 3991 { 3992 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 3993 } 3994 3995 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 3996 struct eventfd_ctx *eventfd, const char *args) 3997 { 3998 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 3999 } 4000 4001 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4002 struct eventfd_ctx *eventfd, enum res_type type) 4003 { 4004 struct mem_cgroup_thresholds *thresholds; 4005 struct mem_cgroup_threshold_ary *new; 4006 unsigned long usage; 4007 int i, j, size; 4008 4009 mutex_lock(&memcg->thresholds_lock); 4010 4011 if (type == _MEM) { 4012 thresholds = &memcg->thresholds; 4013 usage = mem_cgroup_usage(memcg, false); 4014 } else if (type == _MEMSWAP) { 4015 thresholds = &memcg->memsw_thresholds; 4016 usage = mem_cgroup_usage(memcg, true); 4017 } else 4018 BUG(); 4019 4020 if (!thresholds->primary) 4021 goto unlock; 4022 4023 /* Check if a threshold crossed before removing */ 4024 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4025 4026 /* Calculate new number of threshold */ 4027 size = 0; 4028 for (i = 0; i < thresholds->primary->size; i++) { 4029 if (thresholds->primary->entries[i].eventfd != eventfd) 4030 size++; 4031 } 4032 4033 new = thresholds->spare; 4034 4035 /* Set thresholds array to NULL if we don't have thresholds */ 4036 if (!size) { 4037 kfree(new); 4038 new = NULL; 4039 goto swap_buffers; 4040 } 4041 4042 new->size = size; 4043 4044 /* Copy thresholds and find current threshold */ 4045 new->current_threshold = -1; 4046 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4047 if (thresholds->primary->entries[i].eventfd == eventfd) 4048 continue; 4049 4050 new->entries[j] = thresholds->primary->entries[i]; 4051 if (new->entries[j].threshold <= usage) { 4052 /* 4053 * new->current_threshold will not be used 4054 * until rcu_assign_pointer(), so it's safe to increment 4055 * it here. 
4056 */ 4057 ++new->current_threshold; 4058 } 4059 j++; 4060 } 4061 4062 swap_buffers: 4063 /* Swap primary and spare array */ 4064 thresholds->spare = thresholds->primary; 4065 /* If all events are unregistered, free the spare array */ 4066 if (!new) { 4067 kfree(thresholds->spare); 4068 thresholds->spare = NULL; 4069 } 4070 4071 rcu_assign_pointer(thresholds->primary, new); 4072 4073 /* To be sure that nobody uses thresholds */ 4074 synchronize_rcu(); 4075 unlock: 4076 mutex_unlock(&memcg->thresholds_lock); 4077 } 4078 4079 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4080 struct eventfd_ctx *eventfd) 4081 { 4082 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4083 } 4084 4085 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4086 struct eventfd_ctx *eventfd) 4087 { 4088 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4089 } 4090 4091 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4092 struct eventfd_ctx *eventfd, const char *args) 4093 { 4094 struct mem_cgroup_eventfd_list *event; 4095 4096 event = kmalloc(sizeof(*event), GFP_KERNEL); 4097 if (!event) 4098 return -ENOMEM; 4099 4100 spin_lock(&memcg_oom_lock); 4101 4102 event->eventfd = eventfd; 4103 list_add(&event->list, &memcg->oom_notify); 4104 4105 /* already in OOM ? */ 4106 if (atomic_read(&memcg->under_oom)) 4107 eventfd_signal(eventfd, 1); 4108 spin_unlock(&memcg_oom_lock); 4109 4110 return 0; 4111 } 4112 4113 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4114 struct eventfd_ctx *eventfd) 4115 { 4116 struct mem_cgroup_eventfd_list *ev, *tmp; 4117 4118 spin_lock(&memcg_oom_lock); 4119 4120 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4121 if (ev->eventfd == eventfd) { 4122 list_del(&ev->list); 4123 kfree(ev); 4124 } 4125 } 4126 4127 spin_unlock(&memcg_oom_lock); 4128 } 4129 4130 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4131 { 4132 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 4133 4134 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4135 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 4136 return 0; 4137 } 4138 4139 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4140 struct cftype *cft, u64 val) 4141 { 4142 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4143 4144 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4145 if (!css->parent || !((val == 0) || (val == 1))) 4146 return -EINVAL; 4147 4148 memcg->oom_kill_disable = val; 4149 if (!val) 4150 memcg_oom_recover(memcg); 4151 4152 return 0; 4153 } 4154 4155 #ifdef CONFIG_MEMCG_KMEM 4156 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4157 { 4158 int ret; 4159 4160 ret = memcg_propagate_kmem(memcg); 4161 if (ret) 4162 return ret; 4163 4164 return mem_cgroup_sockets_init(memcg, ss); 4165 } 4166 4167 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4168 { 4169 memcg_unregister_all_caches(memcg); 4170 mem_cgroup_sockets_destroy(memcg); 4171 } 4172 #else 4173 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4174 { 4175 return 0; 4176 } 4177 4178 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4179 { 4180 } 4181 #endif 4182 4183 /* 4184 * DO NOT USE IN NEW FILES. 4185 * 4186 * "cgroup.event_control" implementation. 4187 * 4188 * This is way over-engineered. It tries to support fully configurable 4189 * events for each user. 
Such level of flexibility is completely 4190 * unnecessary especially in the light of the planned unified hierarchy. 4191 * 4192 * Please deprecate this and replace with something simpler if at all 4193 * possible. 4194 */ 4195 4196 /* 4197 * Unregister event and free resources. 4198 * 4199 * Gets called from workqueue. 4200 */ 4201 static void memcg_event_remove(struct work_struct *work) 4202 { 4203 struct mem_cgroup_event *event = 4204 container_of(work, struct mem_cgroup_event, remove); 4205 struct mem_cgroup *memcg = event->memcg; 4206 4207 remove_wait_queue(event->wqh, &event->wait); 4208 4209 event->unregister_event(memcg, event->eventfd); 4210 4211 /* Notify userspace the event is going away. */ 4212 eventfd_signal(event->eventfd, 1); 4213 4214 eventfd_ctx_put(event->eventfd); 4215 kfree(event); 4216 css_put(&memcg->css); 4217 } 4218 4219 /* 4220 * Gets called on POLLHUP on eventfd when user closes it. 4221 * 4222 * Called with wqh->lock held and interrupts disabled. 4223 */ 4224 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 4225 int sync, void *key) 4226 { 4227 struct mem_cgroup_event *event = 4228 container_of(wait, struct mem_cgroup_event, wait); 4229 struct mem_cgroup *memcg = event->memcg; 4230 unsigned long flags = (unsigned long)key; 4231 4232 if (flags & POLLHUP) { 4233 /* 4234 * If the event has been detached at cgroup removal, we 4235 * can simply return knowing the other side will cleanup 4236 * for us. 4237 * 4238 * We can't race against event freeing since the other 4239 * side will require wqh->lock via remove_wait_queue(), 4240 * which we hold. 4241 */ 4242 spin_lock(&memcg->event_list_lock); 4243 if (!list_empty(&event->list)) { 4244 list_del_init(&event->list); 4245 /* 4246 * We are in atomic context, but cgroup_event_remove() 4247 * may sleep, so we have to call it in workqueue. 4248 */ 4249 schedule_work(&event->remove); 4250 } 4251 spin_unlock(&memcg->event_list_lock); 4252 } 4253 4254 return 0; 4255 } 4256 4257 static void memcg_event_ptable_queue_proc(struct file *file, 4258 wait_queue_head_t *wqh, poll_table *pt) 4259 { 4260 struct mem_cgroup_event *event = 4261 container_of(pt, struct mem_cgroup_event, pt); 4262 4263 event->wqh = wqh; 4264 add_wait_queue(wqh, &event->wait); 4265 } 4266 4267 /* 4268 * DO NOT USE IN NEW FILES. 4269 * 4270 * Parse input and register new cgroup event handler. 4271 * 4272 * Input must be in format '<event_fd> <control_fd> <args>'. 4273 * Interpretation of args is defined by control file implementation. 
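 *
 * For illustration only, the usual userspace sequence for arming a
 * usage threshold through this legacy interface looks roughly like:
 *
 *   efd = eventfd(0, 0);
 *   cfd = open("<memcg dir>/memory.usage_in_bytes", O_RDONLY);
 *   write "<efd> <cfd> 100M" to <memcg dir>/cgroup.event_control
 *
 * after which a read() on efd returns whenever usage crosses the
 * 100M mark in either direction.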
4274 */ 4275 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4276 char *buf, size_t nbytes, loff_t off) 4277 { 4278 struct cgroup_subsys_state *css = of_css(of); 4279 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4280 struct mem_cgroup_event *event; 4281 struct cgroup_subsys_state *cfile_css; 4282 unsigned int efd, cfd; 4283 struct fd efile; 4284 struct fd cfile; 4285 const char *name; 4286 char *endp; 4287 int ret; 4288 4289 buf = strstrip(buf); 4290 4291 efd = simple_strtoul(buf, &endp, 10); 4292 if (*endp != ' ') 4293 return -EINVAL; 4294 buf = endp + 1; 4295 4296 cfd = simple_strtoul(buf, &endp, 10); 4297 if ((*endp != ' ') && (*endp != '\0')) 4298 return -EINVAL; 4299 buf = endp + 1; 4300 4301 event = kzalloc(sizeof(*event), GFP_KERNEL); 4302 if (!event) 4303 return -ENOMEM; 4304 4305 event->memcg = memcg; 4306 INIT_LIST_HEAD(&event->list); 4307 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4308 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4309 INIT_WORK(&event->remove, memcg_event_remove); 4310 4311 efile = fdget(efd); 4312 if (!efile.file) { 4313 ret = -EBADF; 4314 goto out_kfree; 4315 } 4316 4317 event->eventfd = eventfd_ctx_fileget(efile.file); 4318 if (IS_ERR(event->eventfd)) { 4319 ret = PTR_ERR(event->eventfd); 4320 goto out_put_efile; 4321 } 4322 4323 cfile = fdget(cfd); 4324 if (!cfile.file) { 4325 ret = -EBADF; 4326 goto out_put_eventfd; 4327 } 4328 4329 /* the process need read permission on control file */ 4330 /* AV: shouldn't we check that it's been opened for read instead? */ 4331 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4332 if (ret < 0) 4333 goto out_put_cfile; 4334 4335 /* 4336 * Determine the event callbacks and set them in @event. This used 4337 * to be done via struct cftype but cgroup core no longer knows 4338 * about these events. The following is crude but the whole thing 4339 * is for compatibility anyway. 4340 * 4341 * DO NOT ADD NEW FILES. 4342 */ 4343 name = cfile.file->f_path.dentry->d_name.name; 4344 4345 if (!strcmp(name, "memory.usage_in_bytes")) { 4346 event->register_event = mem_cgroup_usage_register_event; 4347 event->unregister_event = mem_cgroup_usage_unregister_event; 4348 } else if (!strcmp(name, "memory.oom_control")) { 4349 event->register_event = mem_cgroup_oom_register_event; 4350 event->unregister_event = mem_cgroup_oom_unregister_event; 4351 } else if (!strcmp(name, "memory.pressure_level")) { 4352 event->register_event = vmpressure_register_event; 4353 event->unregister_event = vmpressure_unregister_event; 4354 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4355 event->register_event = memsw_cgroup_usage_register_event; 4356 event->unregister_event = memsw_cgroup_usage_unregister_event; 4357 } else { 4358 ret = -EINVAL; 4359 goto out_put_cfile; 4360 } 4361 4362 /* 4363 * Verify @cfile should belong to @css. Also, remaining events are 4364 * automatically removed on cgroup destruction but the removal is 4365 * asynchronous, so take an extra ref on @css. 
4366 */ 4367 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 4368 &memory_cgrp_subsys); 4369 ret = -EINVAL; 4370 if (IS_ERR(cfile_css)) 4371 goto out_put_cfile; 4372 if (cfile_css != css) { 4373 css_put(cfile_css); 4374 goto out_put_cfile; 4375 } 4376 4377 ret = event->register_event(memcg, event->eventfd, buf); 4378 if (ret) 4379 goto out_put_css; 4380 4381 efile.file->f_op->poll(efile.file, &event->pt); 4382 4383 spin_lock(&memcg->event_list_lock); 4384 list_add(&event->list, &memcg->event_list); 4385 spin_unlock(&memcg->event_list_lock); 4386 4387 fdput(cfile); 4388 fdput(efile); 4389 4390 return nbytes; 4391 4392 out_put_css: 4393 css_put(css); 4394 out_put_cfile: 4395 fdput(cfile); 4396 out_put_eventfd: 4397 eventfd_ctx_put(event->eventfd); 4398 out_put_efile: 4399 fdput(efile); 4400 out_kfree: 4401 kfree(event); 4402 4403 return ret; 4404 } 4405 4406 static struct cftype mem_cgroup_files[] = { 4407 { 4408 .name = "usage_in_bytes", 4409 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4410 .read_u64 = mem_cgroup_read_u64, 4411 }, 4412 { 4413 .name = "max_usage_in_bytes", 4414 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4415 .write = mem_cgroup_reset, 4416 .read_u64 = mem_cgroup_read_u64, 4417 }, 4418 { 4419 .name = "limit_in_bytes", 4420 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4421 .write = mem_cgroup_write, 4422 .read_u64 = mem_cgroup_read_u64, 4423 }, 4424 { 4425 .name = "soft_limit_in_bytes", 4426 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4427 .write = mem_cgroup_write, 4428 .read_u64 = mem_cgroup_read_u64, 4429 }, 4430 { 4431 .name = "failcnt", 4432 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4433 .write = mem_cgroup_reset, 4434 .read_u64 = mem_cgroup_read_u64, 4435 }, 4436 { 4437 .name = "stat", 4438 .seq_show = memcg_stat_show, 4439 }, 4440 { 4441 .name = "force_empty", 4442 .write = mem_cgroup_force_empty_write, 4443 }, 4444 { 4445 .name = "use_hierarchy", 4446 .write_u64 = mem_cgroup_hierarchy_write, 4447 .read_u64 = mem_cgroup_hierarchy_read, 4448 }, 4449 { 4450 .name = "cgroup.event_control", /* XXX: for compat */ 4451 .write = memcg_write_event_control, 4452 .flags = CFTYPE_NO_PREFIX, 4453 .mode = S_IWUGO, 4454 }, 4455 { 4456 .name = "swappiness", 4457 .read_u64 = mem_cgroup_swappiness_read, 4458 .write_u64 = mem_cgroup_swappiness_write, 4459 }, 4460 { 4461 .name = "move_charge_at_immigrate", 4462 .read_u64 = mem_cgroup_move_charge_read, 4463 .write_u64 = mem_cgroup_move_charge_write, 4464 }, 4465 { 4466 .name = "oom_control", 4467 .seq_show = mem_cgroup_oom_control_read, 4468 .write_u64 = mem_cgroup_oom_control_write, 4469 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4470 }, 4471 { 4472 .name = "pressure_level", 4473 }, 4474 #ifdef CONFIG_NUMA 4475 { 4476 .name = "numa_stat", 4477 .seq_show = memcg_numa_stat_show, 4478 }, 4479 #endif 4480 #ifdef CONFIG_MEMCG_KMEM 4481 { 4482 .name = "kmem.limit_in_bytes", 4483 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 4484 .write = mem_cgroup_write, 4485 .read_u64 = mem_cgroup_read_u64, 4486 }, 4487 { 4488 .name = "kmem.usage_in_bytes", 4489 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 4490 .read_u64 = mem_cgroup_read_u64, 4491 }, 4492 { 4493 .name = "kmem.failcnt", 4494 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 4495 .write = mem_cgroup_reset, 4496 .read_u64 = mem_cgroup_read_u64, 4497 }, 4498 { 4499 .name = "kmem.max_usage_in_bytes", 4500 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4501 .write = mem_cgroup_reset, 4502 .read_u64 = mem_cgroup_read_u64, 4503 }, 4504 #ifdef 
CONFIG_SLABINFO 4505 { 4506 .name = "kmem.slabinfo", 4507 .seq_start = slab_start, 4508 .seq_next = slab_next, 4509 .seq_stop = slab_stop, 4510 .seq_show = memcg_slab_show, 4511 }, 4512 #endif 4513 #endif 4514 { }, /* terminate */ 4515 }; 4516 4517 #ifdef CONFIG_MEMCG_SWAP 4518 static struct cftype memsw_cgroup_files[] = { 4519 { 4520 .name = "memsw.usage_in_bytes", 4521 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4522 .read_u64 = mem_cgroup_read_u64, 4523 }, 4524 { 4525 .name = "memsw.max_usage_in_bytes", 4526 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4527 .write = mem_cgroup_reset, 4528 .read_u64 = mem_cgroup_read_u64, 4529 }, 4530 { 4531 .name = "memsw.limit_in_bytes", 4532 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4533 .write = mem_cgroup_write, 4534 .read_u64 = mem_cgroup_read_u64, 4535 }, 4536 { 4537 .name = "memsw.failcnt", 4538 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4539 .write = mem_cgroup_reset, 4540 .read_u64 = mem_cgroup_read_u64, 4541 }, 4542 { }, /* terminate */ 4543 }; 4544 #endif 4545 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4546 { 4547 struct mem_cgroup_per_node *pn; 4548 struct mem_cgroup_per_zone *mz; 4549 int zone, tmp = node; 4550 /* 4551 * This routine is called against possible nodes. 4552 * But it's BUG to call kmalloc() against offline node. 4553 * 4554 * TODO: this routine can waste much memory for nodes which will 4555 * never be onlined. It's better to use memory hotplug callback 4556 * function. 4557 */ 4558 if (!node_state(node, N_NORMAL_MEMORY)) 4559 tmp = -1; 4560 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4561 if (!pn) 4562 return 1; 4563 4564 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4565 mz = &pn->zoneinfo[zone]; 4566 lruvec_init(&mz->lruvec); 4567 mz->usage_in_excess = 0; 4568 mz->on_tree = false; 4569 mz->memcg = memcg; 4570 } 4571 memcg->nodeinfo[node] = pn; 4572 return 0; 4573 } 4574 4575 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4576 { 4577 kfree(memcg->nodeinfo[node]); 4578 } 4579 4580 static struct mem_cgroup *mem_cgroup_alloc(void) 4581 { 4582 struct mem_cgroup *memcg; 4583 size_t size; 4584 4585 size = sizeof(struct mem_cgroup); 4586 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 4587 4588 memcg = kzalloc(size, GFP_KERNEL); 4589 if (!memcg) 4590 return NULL; 4591 4592 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4593 if (!memcg->stat) 4594 goto out_free; 4595 spin_lock_init(&memcg->pcp_counter_lock); 4596 return memcg; 4597 4598 out_free: 4599 kfree(memcg); 4600 return NULL; 4601 } 4602 4603 /* 4604 * At destroying mem_cgroup, references from swap_cgroup can remain. 4605 * (scanning all at force_empty is too costly...) 4606 * 4607 * Instead of clearing all references at force_empty, we remember 4608 * the number of reference from swap_cgroup and free mem_cgroup when 4609 * it goes down to 0. 4610 * 4611 * Removal of cgroup itself succeeds regardless of refs from swap. 4612 */ 4613 4614 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4615 { 4616 int node; 4617 4618 mem_cgroup_remove_from_trees(memcg); 4619 4620 for_each_node(node) 4621 free_mem_cgroup_per_zone_info(memcg, node); 4622 4623 free_percpu(memcg->stat); 4624 4625 disarm_static_keys(memcg); 4626 kfree(memcg); 4627 } 4628 4629 /* 4630 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
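 * Returns NULL when @memcg's memory counter has no parent, i.e. for
 * the root memcg or for a child created while its parent had
 * use_hierarchy disabled.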
4631 */ 4632 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4633 { 4634 if (!memcg->memory.parent) 4635 return NULL; 4636 return mem_cgroup_from_counter(memcg->memory.parent, memory); 4637 } 4638 EXPORT_SYMBOL(parent_mem_cgroup); 4639 4640 static void __init mem_cgroup_soft_limit_tree_init(void) 4641 { 4642 struct mem_cgroup_tree_per_node *rtpn; 4643 struct mem_cgroup_tree_per_zone *rtpz; 4644 int tmp, node, zone; 4645 4646 for_each_node(node) { 4647 tmp = node; 4648 if (!node_state(node, N_NORMAL_MEMORY)) 4649 tmp = -1; 4650 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4651 BUG_ON(!rtpn); 4652 4653 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4654 4655 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4656 rtpz = &rtpn->rb_tree_per_zone[zone]; 4657 rtpz->rb_root = RB_ROOT; 4658 spin_lock_init(&rtpz->lock); 4659 } 4660 } 4661 } 4662 4663 static struct cgroup_subsys_state * __ref 4664 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4665 { 4666 struct mem_cgroup *memcg; 4667 long error = -ENOMEM; 4668 int node; 4669 4670 memcg = mem_cgroup_alloc(); 4671 if (!memcg) 4672 return ERR_PTR(error); 4673 4674 for_each_node(node) 4675 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4676 goto free_out; 4677 4678 /* root ? */ 4679 if (parent_css == NULL) { 4680 root_mem_cgroup = memcg; 4681 page_counter_init(&memcg->memory, NULL); 4682 page_counter_init(&memcg->memsw, NULL); 4683 page_counter_init(&memcg->kmem, NULL); 4684 } 4685 4686 memcg->last_scanned_node = MAX_NUMNODES; 4687 INIT_LIST_HEAD(&memcg->oom_notify); 4688 memcg->move_charge_at_immigrate = 0; 4689 mutex_init(&memcg->thresholds_lock); 4690 spin_lock_init(&memcg->move_lock); 4691 vmpressure_init(&memcg->vmpressure); 4692 INIT_LIST_HEAD(&memcg->event_list); 4693 spin_lock_init(&memcg->event_list_lock); 4694 #ifdef CONFIG_MEMCG_KMEM 4695 memcg->kmemcg_id = -1; 4696 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4697 #endif 4698 4699 return &memcg->css; 4700 4701 free_out: 4702 __mem_cgroup_free(memcg); 4703 return ERR_PTR(error); 4704 } 4705 4706 static int 4707 mem_cgroup_css_online(struct cgroup_subsys_state *css) 4708 { 4709 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4710 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 4711 int ret; 4712 4713 if (css->id > MEM_CGROUP_ID_MAX) 4714 return -ENOSPC; 4715 4716 if (!parent) 4717 return 0; 4718 4719 mutex_lock(&memcg_create_mutex); 4720 4721 memcg->use_hierarchy = parent->use_hierarchy; 4722 memcg->oom_kill_disable = parent->oom_kill_disable; 4723 memcg->swappiness = mem_cgroup_swappiness(parent); 4724 4725 if (parent->use_hierarchy) { 4726 page_counter_init(&memcg->memory, &parent->memory); 4727 page_counter_init(&memcg->memsw, &parent->memsw); 4728 page_counter_init(&memcg->kmem, &parent->kmem); 4729 4730 /* 4731 * No need to take a reference to the parent because cgroup 4732 * core guarantees its existence. 4733 */ 4734 } else { 4735 page_counter_init(&memcg->memory, NULL); 4736 page_counter_init(&memcg->memsw, NULL); 4737 page_counter_init(&memcg->kmem, NULL); 4738 /* 4739 * Deeper hierachy with use_hierarchy == false doesn't make 4740 * much sense so let cgroup subsystem know about this 4741 * unfortunate state in our controller. 
4742 */ 4743 if (parent != root_mem_cgroup) 4744 memory_cgrp_subsys.broken_hierarchy = true; 4745 } 4746 mutex_unlock(&memcg_create_mutex); 4747 4748 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); 4749 if (ret) 4750 return ret; 4751 4752 /* 4753 * Make sure the memcg is initialized: mem_cgroup_iter() 4754 * orders reading memcg->initialized against its callers 4755 * reading the memcg members. 4756 */ 4757 smp_store_release(&memcg->initialized, 1); 4758 4759 return 0; 4760 } 4761 4762 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 4763 { 4764 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4765 struct mem_cgroup_event *event, *tmp; 4766 4767 /* 4768 * Unregister events and notify userspace. 4769 * Notify userspace about cgroup removing only after rmdir of cgroup 4770 * directory to avoid race between userspace and kernelspace. 4771 */ 4772 spin_lock(&memcg->event_list_lock); 4773 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 4774 list_del_init(&event->list); 4775 schedule_work(&event->remove); 4776 } 4777 spin_unlock(&memcg->event_list_lock); 4778 4779 vmpressure_cleanup(&memcg->vmpressure); 4780 } 4781 4782 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4783 { 4784 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4785 4786 memcg_destroy_kmem(memcg); 4787 __mem_cgroup_free(memcg); 4788 } 4789 4790 /** 4791 * mem_cgroup_css_reset - reset the states of a mem_cgroup 4792 * @css: the target css 4793 * 4794 * Reset the states of the mem_cgroup associated with @css. This is 4795 * invoked when the userland requests disabling on the default hierarchy 4796 * but the memcg is pinned through dependency. The memcg should stop 4797 * applying policies and should revert to the vanilla state as it may be 4798 * made visible again. 4799 * 4800 * The current implementation only resets the essential configurations. 4801 * This needs to be expanded to cover all the visible parts. 4802 */ 4803 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 4804 { 4805 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4806 4807 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4808 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4809 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4810 memcg->soft_limit = 0; 4811 } 4812 4813 #ifdef CONFIG_MMU 4814 /* Handlers for move charge at task migration. */ 4815 static int mem_cgroup_do_precharge(unsigned long count) 4816 { 4817 int ret; 4818 4819 /* Try a single bulk charge without reclaim first */ 4820 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); 4821 if (!ret) { 4822 mc.precharge += count; 4823 return ret; 4824 } 4825 if (ret == -EINTR) { 4826 cancel_charge(root_mem_cgroup, count); 4827 return ret; 4828 } 4829 4830 /* Try charges one by one with reclaim */ 4831 while (count--) { 4832 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); 4833 /* 4834 * In case of failure, any residual charges against 4835 * mc.to will be dropped by mem_cgroup_clear_mc() 4836 * later on. However, cancel any charges that are 4837 * bypassed to root right away or they'll be lost. 
4838 */ 4839 if (ret == -EINTR) 4840 cancel_charge(root_mem_cgroup, 1); 4841 if (ret) 4842 return ret; 4843 mc.precharge++; 4844 cond_resched(); 4845 } 4846 return 0; 4847 } 4848 4849 /** 4850 * get_mctgt_type - get target type of moving charge 4851 * @vma: the vma the pte to be checked belongs 4852 * @addr: the address corresponding to the pte to be checked 4853 * @ptent: the pte to be checked 4854 * @target: the pointer the target page or swap ent will be stored(can be NULL) 4855 * 4856 * Returns 4857 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4858 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4859 * move charge. if @target is not NULL, the page is stored in target->page 4860 * with extra refcnt got(Callers should handle it). 4861 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4862 * target for charge migration. if @target is not NULL, the entry is stored 4863 * in target->ent. 4864 * 4865 * Called with pte lock held. 4866 */ 4867 union mc_target { 4868 struct page *page; 4869 swp_entry_t ent; 4870 }; 4871 4872 enum mc_target_type { 4873 MC_TARGET_NONE = 0, 4874 MC_TARGET_PAGE, 4875 MC_TARGET_SWAP, 4876 }; 4877 4878 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4879 unsigned long addr, pte_t ptent) 4880 { 4881 struct page *page = vm_normal_page(vma, addr, ptent); 4882 4883 if (!page || !page_mapped(page)) 4884 return NULL; 4885 if (PageAnon(page)) { 4886 /* we don't move shared anon */ 4887 if (!move_anon()) 4888 return NULL; 4889 } else if (!move_file()) 4890 /* we ignore mapcount for file pages */ 4891 return NULL; 4892 if (!get_page_unless_zero(page)) 4893 return NULL; 4894 4895 return page; 4896 } 4897 4898 #ifdef CONFIG_SWAP 4899 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4900 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4901 { 4902 struct page *page = NULL; 4903 swp_entry_t ent = pte_to_swp_entry(ptent); 4904 4905 if (!move_anon() || non_swap_entry(ent)) 4906 return NULL; 4907 /* 4908 * Because lookup_swap_cache() updates some statistics counter, 4909 * we call find_get_page() with swapper_space directly. 4910 */ 4911 page = find_get_page(swap_address_space(ent), ent.val); 4912 if (do_swap_account) 4913 entry->val = ent.val; 4914 4915 return page; 4916 } 4917 #else 4918 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4919 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4920 { 4921 return NULL; 4922 } 4923 #endif 4924 4925 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4926 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4927 { 4928 struct page *page = NULL; 4929 struct address_space *mapping; 4930 pgoff_t pgoff; 4931 4932 if (!vma->vm_file) /* anonymous vma */ 4933 return NULL; 4934 if (!move_file()) 4935 return NULL; 4936 4937 mapping = vma->vm_file->f_mapping; 4938 if (pte_none(ptent)) 4939 pgoff = linear_page_index(vma, addr); 4940 else /* pte_file(ptent) is true */ 4941 pgoff = pte_to_pgoff(ptent); 4942 4943 /* page is moved even if it's not RSS of this task(page-faulted). */ 4944 #ifdef CONFIG_SWAP 4945 /* shmem/tmpfs may report page out on swap: account for that too. 
*/ 4946 if (shmem_mapping(mapping)) { 4947 page = find_get_entry(mapping, pgoff); 4948 if (radix_tree_exceptional_entry(page)) { 4949 swp_entry_t swp = radix_to_swp_entry(page); 4950 if (do_swap_account) 4951 *entry = swp; 4952 page = find_get_page(swap_address_space(swp), swp.val); 4953 } 4954 } else 4955 page = find_get_page(mapping, pgoff); 4956 #else 4957 page = find_get_page(mapping, pgoff); 4958 #endif 4959 return page; 4960 } 4961 4962 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 4963 unsigned long addr, pte_t ptent, union mc_target *target) 4964 { 4965 struct page *page = NULL; 4966 enum mc_target_type ret = MC_TARGET_NONE; 4967 swp_entry_t ent = { .val = 0 }; 4968 4969 if (pte_present(ptent)) 4970 page = mc_handle_present_pte(vma, addr, ptent); 4971 else if (is_swap_pte(ptent)) 4972 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4973 else if (pte_none(ptent) || pte_file(ptent)) 4974 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4975 4976 if (!page && !ent.val) 4977 return ret; 4978 if (page) { 4979 /* 4980 * Do only loose check w/o serialization. 4981 * mem_cgroup_move_account() checks the page is valid or 4982 * not under LRU exclusion. 4983 */ 4984 if (page->mem_cgroup == mc.from) { 4985 ret = MC_TARGET_PAGE; 4986 if (target) 4987 target->page = page; 4988 } 4989 if (!ret || !target) 4990 put_page(page); 4991 } 4992 /* There is a swap entry and a page doesn't exist or isn't charged */ 4993 if (ent.val && !ret && 4994 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 4995 ret = MC_TARGET_SWAP; 4996 if (target) 4997 target->ent = ent; 4998 } 4999 return ret; 5000 } 5001 5002 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5003 /* 5004 * We don't consider swapping or file mapped pages because THP does not 5005 * support them for now. 5006 * Caller should make sure that pmd_trans_huge(pmd) is true. 
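 * Both callers in this file only invoke it after pmd_trans_huge_lock()
 * has returned 1 for the pmd, which guarantees exactly that.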
5007 */ 5008 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5009 unsigned long addr, pmd_t pmd, union mc_target *target) 5010 { 5011 struct page *page = NULL; 5012 enum mc_target_type ret = MC_TARGET_NONE; 5013 5014 page = pmd_page(pmd); 5015 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5016 if (!move_anon()) 5017 return ret; 5018 if (page->mem_cgroup == mc.from) { 5019 ret = MC_TARGET_PAGE; 5020 if (target) { 5021 get_page(page); 5022 target->page = page; 5023 } 5024 } 5025 return ret; 5026 } 5027 #else 5028 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5029 unsigned long addr, pmd_t pmd, union mc_target *target) 5030 { 5031 return MC_TARGET_NONE; 5032 } 5033 #endif 5034 5035 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5036 unsigned long addr, unsigned long end, 5037 struct mm_walk *walk) 5038 { 5039 struct vm_area_struct *vma = walk->private; 5040 pte_t *pte; 5041 spinlock_t *ptl; 5042 5043 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5044 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5045 mc.precharge += HPAGE_PMD_NR; 5046 spin_unlock(ptl); 5047 return 0; 5048 } 5049 5050 if (pmd_trans_unstable(pmd)) 5051 return 0; 5052 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5053 for (; addr != end; pte++, addr += PAGE_SIZE) 5054 if (get_mctgt_type(vma, addr, *pte, NULL)) 5055 mc.precharge++; /* increment precharge temporarily */ 5056 pte_unmap_unlock(pte - 1, ptl); 5057 cond_resched(); 5058 5059 return 0; 5060 } 5061 5062 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5063 { 5064 unsigned long precharge; 5065 struct vm_area_struct *vma; 5066 5067 down_read(&mm->mmap_sem); 5068 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5069 struct mm_walk mem_cgroup_count_precharge_walk = { 5070 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5071 .mm = mm, 5072 .private = vma, 5073 }; 5074 if (is_vm_hugetlb_page(vma)) 5075 continue; 5076 walk_page_range(vma->vm_start, vma->vm_end, 5077 &mem_cgroup_count_precharge_walk); 5078 } 5079 up_read(&mm->mmap_sem); 5080 5081 precharge = mc.precharge; 5082 mc.precharge = 0; 5083 5084 return precharge; 5085 } 5086 5087 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5088 { 5089 unsigned long precharge = mem_cgroup_count_precharge(mm); 5090 5091 VM_BUG_ON(mc.moving_task); 5092 mc.moving_task = current; 5093 return mem_cgroup_do_precharge(precharge); 5094 } 5095 5096 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5097 static void __mem_cgroup_clear_mc(void) 5098 { 5099 struct mem_cgroup *from = mc.from; 5100 struct mem_cgroup *to = mc.to; 5101 5102 /* we must uncharge all the leftover precharges from mc.to */ 5103 if (mc.precharge) { 5104 cancel_charge(mc.to, mc.precharge); 5105 mc.precharge = 0; 5106 } 5107 /* 5108 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5109 * we must uncharge here. 5110 */ 5111 if (mc.moved_charge) { 5112 cancel_charge(mc.from, mc.moved_charge); 5113 mc.moved_charge = 0; 5114 } 5115 /* we must fixup refcnts and charges */ 5116 if (mc.moved_swap) { 5117 /* uncharge swap account from the old cgroup */ 5118 if (!mem_cgroup_is_root(mc.from)) 5119 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5120 5121 /* 5122 * we charged both to->memory and to->memsw, so we 5123 * should uncharge to->memory. 
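 * That leaves only the memsw charge on mc.to, which is all a
 * swapped-out entry needs.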
5124 */ 5125 if (!mem_cgroup_is_root(mc.to)) 5126 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5127 5128 css_put_many(&mc.from->css, mc.moved_swap); 5129 5130 /* we've already done css_get(mc.to) */ 5131 mc.moved_swap = 0; 5132 } 5133 memcg_oom_recover(from); 5134 memcg_oom_recover(to); 5135 wake_up_all(&mc.waitq); 5136 } 5137 5138 static void mem_cgroup_clear_mc(void) 5139 { 5140 /* 5141 * we must clear moving_task before waking up waiters at the end of 5142 * task migration. 5143 */ 5144 mc.moving_task = NULL; 5145 __mem_cgroup_clear_mc(); 5146 spin_lock(&mc.lock); 5147 mc.from = NULL; 5148 mc.to = NULL; 5149 spin_unlock(&mc.lock); 5150 } 5151 5152 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5153 struct cgroup_taskset *tset) 5154 { 5155 struct task_struct *p = cgroup_taskset_first(tset); 5156 int ret = 0; 5157 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5158 unsigned long move_charge_at_immigrate; 5159 5160 /* 5161 * We are now commited to this value whatever it is. Changes in this 5162 * tunable will only affect upcoming migrations, not the current one. 5163 * So we need to save it, and keep it going. 5164 */ 5165 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 5166 if (move_charge_at_immigrate) { 5167 struct mm_struct *mm; 5168 struct mem_cgroup *from = mem_cgroup_from_task(p); 5169 5170 VM_BUG_ON(from == memcg); 5171 5172 mm = get_task_mm(p); 5173 if (!mm) 5174 return 0; 5175 /* We move charges only when we move a owner of the mm */ 5176 if (mm->owner == p) { 5177 VM_BUG_ON(mc.from); 5178 VM_BUG_ON(mc.to); 5179 VM_BUG_ON(mc.precharge); 5180 VM_BUG_ON(mc.moved_charge); 5181 VM_BUG_ON(mc.moved_swap); 5182 5183 spin_lock(&mc.lock); 5184 mc.from = from; 5185 mc.to = memcg; 5186 mc.immigrate_flags = move_charge_at_immigrate; 5187 spin_unlock(&mc.lock); 5188 /* We set mc.moving_task later */ 5189 5190 ret = mem_cgroup_precharge_mc(mm); 5191 if (ret) 5192 mem_cgroup_clear_mc(); 5193 } 5194 mmput(mm); 5195 } 5196 return ret; 5197 } 5198 5199 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5200 struct cgroup_taskset *tset) 5201 { 5202 if (mc.to) 5203 mem_cgroup_clear_mc(); 5204 } 5205 5206 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5207 unsigned long addr, unsigned long end, 5208 struct mm_walk *walk) 5209 { 5210 int ret = 0; 5211 struct vm_area_struct *vma = walk->private; 5212 pte_t *pte; 5213 spinlock_t *ptl; 5214 enum mc_target_type target_type; 5215 union mc_target target; 5216 struct page *page; 5217 5218 /* 5219 * We don't take compound_lock() here but no race with splitting thp 5220 * happens because: 5221 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 5222 * under splitting, which means there's no concurrent thp split, 5223 * - if another thread runs into split_huge_page() just after we 5224 * entered this if-block, the thread must wait for page table lock 5225 * to be unlocked in __split_huge_page_splitting(), where the main 5226 * part of thp split is not executed yet. 
5227 */ 5228 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5229 if (mc.precharge < HPAGE_PMD_NR) { 5230 spin_unlock(ptl); 5231 return 0; 5232 } 5233 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5234 if (target_type == MC_TARGET_PAGE) { 5235 page = target.page; 5236 if (!isolate_lru_page(page)) { 5237 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5238 mc.from, mc.to)) { 5239 mc.precharge -= HPAGE_PMD_NR; 5240 mc.moved_charge += HPAGE_PMD_NR; 5241 } 5242 putback_lru_page(page); 5243 } 5244 put_page(page); 5245 } 5246 spin_unlock(ptl); 5247 return 0; 5248 } 5249 5250 if (pmd_trans_unstable(pmd)) 5251 return 0; 5252 retry: 5253 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5254 for (; addr != end; addr += PAGE_SIZE) { 5255 pte_t ptent = *(pte++); 5256 swp_entry_t ent; 5257 5258 if (!mc.precharge) 5259 break; 5260 5261 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5262 case MC_TARGET_PAGE: 5263 page = target.page; 5264 if (isolate_lru_page(page)) 5265 goto put; 5266 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { 5267 mc.precharge--; 5268 /* we uncharge from mc.from later. */ 5269 mc.moved_charge++; 5270 } 5271 putback_lru_page(page); 5272 put: /* get_mctgt_type() gets the page */ 5273 put_page(page); 5274 break; 5275 case MC_TARGET_SWAP: 5276 ent = target.ent; 5277 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 5278 mc.precharge--; 5279 /* we fixup refcnts and charges later. */ 5280 mc.moved_swap++; 5281 } 5282 break; 5283 default: 5284 break; 5285 } 5286 } 5287 pte_unmap_unlock(pte - 1, ptl); 5288 cond_resched(); 5289 5290 if (addr != end) { 5291 /* 5292 * We have consumed all precharges we got in can_attach(). 5293 * We try charge one by one, but don't do any additional 5294 * charges to mc.to if we have failed in charge once in attach() 5295 * phase. 5296 */ 5297 ret = mem_cgroup_do_precharge(1); 5298 if (!ret) 5299 goto retry; 5300 } 5301 5302 return ret; 5303 } 5304 5305 static void mem_cgroup_move_charge(struct mm_struct *mm) 5306 { 5307 struct vm_area_struct *vma; 5308 5309 lru_add_drain_all(); 5310 /* 5311 * Signal mem_cgroup_begin_page_stat() to take the memcg's 5312 * move_lock while we're moving its pages to another memcg. 5313 * Then wait for already started RCU-only updates to finish. 5314 */ 5315 atomic_inc(&mc.from->moving_account); 5316 synchronize_rcu(); 5317 retry: 5318 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5319 /* 5320 * Someone who are holding the mmap_sem might be waiting in 5321 * waitq. So we cancel all extra charges, wake up all waiters, 5322 * and retry. Because we cancel precharges, we might not be able 5323 * to move enough charges, but moving charge is a best-effort 5324 * feature anyway, so it wouldn't be a big problem. 5325 */ 5326 __mem_cgroup_clear_mc(); 5327 cond_resched(); 5328 goto retry; 5329 } 5330 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5331 int ret; 5332 struct mm_walk mem_cgroup_move_charge_walk = { 5333 .pmd_entry = mem_cgroup_move_charge_pte_range, 5334 .mm = mm, 5335 .private = vma, 5336 }; 5337 if (is_vm_hugetlb_page(vma)) 5338 continue; 5339 ret = walk_page_range(vma->vm_start, vma->vm_end, 5340 &mem_cgroup_move_charge_walk); 5341 if (ret) 5342 /* 5343 * means we have consumed all precharges and failed in 5344 * doing additional charge. Just abandon here. 
5345 */ 5346 break; 5347 } 5348 up_read(&mm->mmap_sem); 5349 atomic_dec(&mc.from->moving_account); 5350 } 5351 5352 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5353 struct cgroup_taskset *tset) 5354 { 5355 struct task_struct *p = cgroup_taskset_first(tset); 5356 struct mm_struct *mm = get_task_mm(p); 5357 5358 if (mm) { 5359 if (mc.to) 5360 mem_cgroup_move_charge(mm); 5361 mmput(mm); 5362 } 5363 if (mc.to) 5364 mem_cgroup_clear_mc(); 5365 } 5366 #else /* !CONFIG_MMU */ 5367 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5368 struct cgroup_taskset *tset) 5369 { 5370 return 0; 5371 } 5372 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5373 struct cgroup_taskset *tset) 5374 { 5375 } 5376 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5377 struct cgroup_taskset *tset) 5378 { 5379 } 5380 #endif 5381 5382 /* 5383 * Cgroup retains root cgroups across [un]mount cycles making it necessary 5384 * to verify whether we're attached to the default hierarchy on each mount 5385 * attempt. 5386 */ 5387 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 5388 { 5389 /* 5390 * use_hierarchy is forced on the default hierarchy. cgroup core 5391 * guarantees that @root doesn't have any children, so turning it 5392 * on for the root memcg is enough. 5393 */ 5394 if (cgroup_on_dfl(root_css->cgroup)) 5395 mem_cgroup_from_css(root_css)->use_hierarchy = true; 5396 } 5397 5398 struct cgroup_subsys memory_cgrp_subsys = { 5399 .css_alloc = mem_cgroup_css_alloc, 5400 .css_online = mem_cgroup_css_online, 5401 .css_offline = mem_cgroup_css_offline, 5402 .css_free = mem_cgroup_css_free, 5403 .css_reset = mem_cgroup_css_reset, 5404 .can_attach = mem_cgroup_can_attach, 5405 .cancel_attach = mem_cgroup_cancel_attach, 5406 .attach = mem_cgroup_move_task, 5407 .bind = mem_cgroup_bind, 5408 .legacy_cftypes = mem_cgroup_files, 5409 .early_init = 0, 5410 }; 5411 5412 #ifdef CONFIG_MEMCG_SWAP 5413 static int __init enable_swap_account(char *s) 5414 { 5415 if (!strcmp(s, "1")) 5416 really_do_swap_account = 1; 5417 else if (!strcmp(s, "0")) 5418 really_do_swap_account = 0; 5419 return 1; 5420 } 5421 __setup("swapaccount=", enable_swap_account); 5422 5423 static void __init memsw_file_init(void) 5424 { 5425 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 5426 memsw_cgroup_files)); 5427 } 5428 5429 static void __init enable_swap_cgroup(void) 5430 { 5431 if (!mem_cgroup_disabled() && really_do_swap_account) { 5432 do_swap_account = 1; 5433 memsw_file_init(); 5434 } 5435 } 5436 5437 #else 5438 static void __init enable_swap_cgroup(void) 5439 { 5440 } 5441 #endif 5442 5443 #ifdef CONFIG_MEMCG_SWAP 5444 /** 5445 * mem_cgroup_swapout - transfer a memsw charge to swap 5446 * @page: page whose memsw charge to transfer 5447 * @entry: swap entry to move the charge to 5448 * 5449 * Transfer the memsw charge of @page to @entry. 
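 *
 * The caller is expected to pass a page that is already off the LRU
 * and has no remaining references; see the VM_BUG_ONs below.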
5450 */ 5451 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5452 { 5453 struct mem_cgroup *memcg; 5454 unsigned short oldid; 5455 5456 VM_BUG_ON_PAGE(PageLRU(page), page); 5457 VM_BUG_ON_PAGE(page_count(page), page); 5458 5459 if (!do_swap_account) 5460 return; 5461 5462 memcg = page->mem_cgroup; 5463 5464 /* Readahead page, never charged */ 5465 if (!memcg) 5466 return; 5467 5468 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); 5469 VM_BUG_ON_PAGE(oldid, page); 5470 mem_cgroup_swap_statistics(memcg, true); 5471 5472 page->mem_cgroup = NULL; 5473 5474 if (!mem_cgroup_is_root(memcg)) 5475 page_counter_uncharge(&memcg->memory, 1); 5476 5477 /* XXX: caller holds IRQ-safe mapping->tree_lock */ 5478 VM_BUG_ON(!irqs_disabled()); 5479 5480 mem_cgroup_charge_statistics(memcg, page, -1); 5481 memcg_check_events(memcg, page); 5482 } 5483 5484 /** 5485 * mem_cgroup_uncharge_swap - uncharge a swap entry 5486 * @entry: swap entry to uncharge 5487 * 5488 * Drop the memsw charge associated with @entry. 5489 */ 5490 void mem_cgroup_uncharge_swap(swp_entry_t entry) 5491 { 5492 struct mem_cgroup *memcg; 5493 unsigned short id; 5494 5495 if (!do_swap_account) 5496 return; 5497 5498 id = swap_cgroup_record(entry, 0); 5499 rcu_read_lock(); 5500 memcg = mem_cgroup_lookup(id); 5501 if (memcg) { 5502 if (!mem_cgroup_is_root(memcg)) 5503 page_counter_uncharge(&memcg->memsw, 1); 5504 mem_cgroup_swap_statistics(memcg, false); 5505 css_put(&memcg->css); 5506 } 5507 rcu_read_unlock(); 5508 } 5509 #endif 5510 5511 /** 5512 * mem_cgroup_try_charge - try charging a page 5513 * @page: page to charge 5514 * @mm: mm context of the victim 5515 * @gfp_mask: reclaim mode 5516 * @memcgp: charged memcg return 5517 * 5518 * Try to charge @page to the memcg that @mm belongs to, reclaiming 5519 * pages according to @gfp_mask if necessary. 5520 * 5521 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 5522 * Otherwise, an error code is returned. 5523 * 5524 * After page->mapping has been set up, the caller must finalize the 5525 * charge with mem_cgroup_commit_charge(). Or abort the transaction 5526 * with mem_cgroup_cancel_charge() in case page instantiation fails. 5527 */ 5528 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 5529 gfp_t gfp_mask, struct mem_cgroup **memcgp) 5530 { 5531 struct mem_cgroup *memcg = NULL; 5532 unsigned int nr_pages = 1; 5533 int ret = 0; 5534 5535 if (mem_cgroup_disabled()) 5536 goto out; 5537 5538 if (PageSwapCache(page)) { 5539 /* 5540 * Every swap fault against a single page tries to charge the 5541 * page, bail as early as possible. shmem_unuse() encounters 5542 * already charged pages, too. The USED bit is protected by 5543 * the page lock, which serializes swap cache removal, which 5544 * in turn serializes uncharging. 
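 *
 * For reference, the protocol described in the comment above the
 * function amounts to the following caller-side sketch (illustrative
 * only, error handling and locking elided):
 *
 *   if (mem_cgroup_try_charge(page, mm, gfp_mask, &memcg))
 *           goto out_error;
 *   ... set up page->mapping, map or insert the page ...
 *   mem_cgroup_commit_charge(page, memcg, false);
 *
 * and, on a failed instantiation after a successful try_charge:
 *
 *   mem_cgroup_cancel_charge(page, memcg);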
5545 */ 5546 if (page->mem_cgroup) 5547 goto out; 5548 } 5549 5550 if (PageTransHuge(page)) { 5551 nr_pages <<= compound_order(page); 5552 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5553 } 5554 5555 if (do_swap_account && PageSwapCache(page)) 5556 memcg = try_get_mem_cgroup_from_page(page); 5557 if (!memcg) 5558 memcg = get_mem_cgroup_from_mm(mm); 5559 5560 ret = try_charge(memcg, gfp_mask, nr_pages); 5561 5562 css_put(&memcg->css); 5563 5564 if (ret == -EINTR) { 5565 memcg = root_mem_cgroup; 5566 ret = 0; 5567 } 5568 out: 5569 *memcgp = memcg; 5570 return ret; 5571 } 5572 5573 /** 5574 * mem_cgroup_commit_charge - commit a page charge 5575 * @page: page to charge 5576 * @memcg: memcg to charge the page to 5577 * @lrucare: page might be on LRU already 5578 * 5579 * Finalize a charge transaction started by mem_cgroup_try_charge(), 5580 * after page->mapping has been set up. This must happen atomically 5581 * as part of the page instantiation, i.e. under the page table lock 5582 * for anonymous pages, under the page lock for page and swap cache. 5583 * 5584 * In addition, the page must not be on the LRU during the commit, to 5585 * prevent racing with task migration. If it might be, use @lrucare. 5586 * 5587 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 5588 */ 5589 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 5590 bool lrucare) 5591 { 5592 unsigned int nr_pages = 1; 5593 5594 VM_BUG_ON_PAGE(!page->mapping, page); 5595 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 5596 5597 if (mem_cgroup_disabled()) 5598 return; 5599 /* 5600 * Swap faults will attempt to charge the same page multiple 5601 * times. But reuse_swap_page() might have removed the page 5602 * from swapcache already, so we can't check PageSwapCache(). 5603 */ 5604 if (!memcg) 5605 return; 5606 5607 commit_charge(page, memcg, lrucare); 5608 5609 if (PageTransHuge(page)) { 5610 nr_pages <<= compound_order(page); 5611 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5612 } 5613 5614 local_irq_disable(); 5615 mem_cgroup_charge_statistics(memcg, page, nr_pages); 5616 memcg_check_events(memcg, page); 5617 local_irq_enable(); 5618 5619 if (do_swap_account && PageSwapCache(page)) { 5620 swp_entry_t entry = { .val = page_private(page) }; 5621 /* 5622 * The swap entry might not get freed for a long time, 5623 * let's not wait for it. The page already received a 5624 * memory+swap charge, drop the swap entry duplicate. 5625 */ 5626 mem_cgroup_uncharge_swap(entry); 5627 } 5628 } 5629 5630 /** 5631 * mem_cgroup_cancel_charge - cancel a page charge 5632 * @page: page to charge 5633 * @memcg: memcg to charge the page to 5634 * 5635 * Cancel a charge transaction started by mem_cgroup_try_charge(). 5636 */ 5637 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 5638 { 5639 unsigned int nr_pages = 1; 5640 5641 if (mem_cgroup_disabled()) 5642 return; 5643 /* 5644 * Swap faults will attempt to charge the same page multiple 5645 * times. But reuse_swap_page() might have removed the page 5646 * from swapcache already, so we can't check PageSwapCache(). 
5647 */ 5648 if (!memcg) 5649 return; 5650 5651 if (PageTransHuge(page)) { 5652 nr_pages <<= compound_order(page); 5653 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5654 } 5655 5656 cancel_charge(memcg, nr_pages); 5657 } 5658 5659 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5660 unsigned long nr_anon, unsigned long nr_file, 5661 unsigned long nr_huge, struct page *dummy_page) 5662 { 5663 unsigned long nr_pages = nr_anon + nr_file; 5664 unsigned long flags; 5665 5666 if (!mem_cgroup_is_root(memcg)) { 5667 page_counter_uncharge(&memcg->memory, nr_pages); 5668 if (do_swap_account) 5669 page_counter_uncharge(&memcg->memsw, nr_pages); 5670 memcg_oom_recover(memcg); 5671 } 5672 5673 local_irq_save(flags); 5674 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 5675 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 5676 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 5677 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 5678 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 5679 memcg_check_events(memcg, dummy_page); 5680 local_irq_restore(flags); 5681 5682 if (!mem_cgroup_is_root(memcg)) 5683 css_put_many(&memcg->css, nr_pages); 5684 } 5685 5686 static void uncharge_list(struct list_head *page_list) 5687 { 5688 struct mem_cgroup *memcg = NULL; 5689 unsigned long nr_anon = 0; 5690 unsigned long nr_file = 0; 5691 unsigned long nr_huge = 0; 5692 unsigned long pgpgout = 0; 5693 struct list_head *next; 5694 struct page *page; 5695 5696 next = page_list->next; 5697 do { 5698 unsigned int nr_pages = 1; 5699 5700 page = list_entry(next, struct page, lru); 5701 next = page->lru.next; 5702 5703 VM_BUG_ON_PAGE(PageLRU(page), page); 5704 VM_BUG_ON_PAGE(page_count(page), page); 5705 5706 if (!page->mem_cgroup) 5707 continue; 5708 5709 /* 5710 * Nobody should be changing or seriously looking at 5711 * page->mem_cgroup at this point, we have fully 5712 * exclusive access to the page. 5713 */ 5714 5715 if (memcg != page->mem_cgroup) { 5716 if (memcg) { 5717 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5718 nr_huge, page); 5719 pgpgout = nr_anon = nr_file = nr_huge = 0; 5720 } 5721 memcg = page->mem_cgroup; 5722 } 5723 5724 if (PageTransHuge(page)) { 5725 nr_pages <<= compound_order(page); 5726 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5727 nr_huge += nr_pages; 5728 } 5729 5730 if (PageAnon(page)) 5731 nr_anon += nr_pages; 5732 else 5733 nr_file += nr_pages; 5734 5735 page->mem_cgroup = NULL; 5736 5737 pgpgout++; 5738 } while (next != page_list); 5739 5740 if (memcg) 5741 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5742 nr_huge, page); 5743 } 5744 5745 /** 5746 * mem_cgroup_uncharge - uncharge a page 5747 * @page: page to uncharge 5748 * 5749 * Uncharge a page previously charged with mem_cgroup_try_charge() and 5750 * mem_cgroup_commit_charge(). 5751 */ 5752 void mem_cgroup_uncharge(struct page *page) 5753 { 5754 if (mem_cgroup_disabled()) 5755 return; 5756 5757 /* Don't touch page->lru of any random page, pre-check: */ 5758 if (!page->mem_cgroup) 5759 return; 5760 5761 INIT_LIST_HEAD(&page->lru); 5762 uncharge_list(&page->lru); 5763 } 5764 5765 /** 5766 * mem_cgroup_uncharge_list - uncharge a list of page 5767 * @page_list: list of pages to uncharge 5768 * 5769 * Uncharge a list of pages previously charged with 5770 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 
5771 */ 5772 void mem_cgroup_uncharge_list(struct list_head *page_list) 5773 { 5774 if (mem_cgroup_disabled()) 5775 return; 5776 5777 if (!list_empty(page_list)) 5778 uncharge_list(page_list); 5779 } 5780 5781 /** 5782 * mem_cgroup_migrate - migrate a charge to another page 5783 * @oldpage: currently charged page 5784 * @newpage: page to transfer the charge to 5785 * @lrucare: both pages might be on the LRU already 5786 * 5787 * Migrate the charge from @oldpage to @newpage. 5788 * 5789 * Both pages must be locked, @newpage->mapping must be set up. 5790 */ 5791 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 5792 bool lrucare) 5793 { 5794 struct mem_cgroup *memcg; 5795 int isolated; 5796 5797 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5798 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 5799 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); 5800 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); 5801 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 5802 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 5803 newpage); 5804 5805 if (mem_cgroup_disabled()) 5806 return; 5807 5808 /* Page cache replacement: new page already charged? */ 5809 if (newpage->mem_cgroup) 5810 return; 5811 5812 /* 5813 * Swapcache readahead pages can get migrated before being 5814 * charged, and migration from compaction can happen to an 5815 * uncharged page when the PFN walker finds a page that 5816 * reclaim just put back on the LRU but has not released yet. 5817 */ 5818 memcg = oldpage->mem_cgroup; 5819 if (!memcg) 5820 return; 5821 5822 if (lrucare) 5823 lock_page_lru(oldpage, &isolated); 5824 5825 oldpage->mem_cgroup = NULL; 5826 5827 if (lrucare) 5828 unlock_page_lru(oldpage, isolated); 5829 5830 commit_charge(newpage, memcg, lrucare); 5831 } 5832 5833 /* 5834 * subsys_initcall() for memory controller. 5835 * 5836 * Some parts like hotcpu_notifier() have to be initialized from this context 5837 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 5838 * everything that doesn't depend on a specific mem_cgroup structure should 5839 * be initialized from here. 5840 */ 5841 static int __init mem_cgroup_init(void) 5842 { 5843 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5844 enable_swap_cgroup(); 5845 mem_cgroup_soft_limit_tree_init(); 5846 memcg_stock_init(); 5847 return 0; 5848 } 5849 subsys_initcall(mem_cgroup_init); 5850