1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * Kernel Memory Controller 14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 15 * Authors: Glauber Costa and Suleiman Souhlal 16 * 17 * This program is free software; you can redistribute it and/or modify 18 * it under the terms of the GNU General Public License as published by 19 * the Free Software Foundation; either version 2 of the License, or 20 * (at your option) any later version. 21 * 22 * This program is distributed in the hope that it will be useful, 23 * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 * GNU General Public License for more details. 26 */ 27 28 #include <linux/res_counter.h> 29 #include <linux/memcontrol.h> 30 #include <linux/cgroup.h> 31 #include <linux/mm.h> 32 #include <linux/hugetlb.h> 33 #include <linux/pagemap.h> 34 #include <linux/smp.h> 35 #include <linux/page-flags.h> 36 #include <linux/backing-dev.h> 37 #include <linux/bit_spinlock.h> 38 #include <linux/rcupdate.h> 39 #include <linux/limits.h> 40 #include <linux/export.h> 41 #include <linux/mutex.h> 42 #include <linux/rbtree.h> 43 #include <linux/slab.h> 44 #include <linux/swap.h> 45 #include <linux/swapops.h> 46 #include <linux/spinlock.h> 47 #include <linux/eventfd.h> 48 #include <linux/poll.h> 49 #include <linux/sort.h> 50 #include <linux/fs.h> 51 #include <linux/seq_file.h> 52 #include <linux/vmpressure.h> 53 #include <linux/mm_inline.h> 54 #include <linux/page_cgroup.h> 55 #include <linux/cpu.h> 56 #include <linux/oom.h> 57 #include <linux/lockdep.h> 58 #include <linux/file.h> 59 #include "internal.h" 60 #include <net/sock.h> 61 #include <net/ip.h> 62 #include <net/tcp_memcontrol.h> 63 #include "slab.h" 64 65 #include <asm/uaccess.h> 66 67 #include <trace/events/vmscan.h> 68 69 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 70 EXPORT_SYMBOL(memory_cgrp_subsys); 71 72 #define MEM_CGROUP_RECLAIM_RETRIES 5 73 static struct mem_cgroup *root_mem_cgroup __read_mostly; 74 75 #ifdef CONFIG_MEMCG_SWAP 76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 77 int do_swap_account __read_mostly; 78 79 /* for remember boot option*/ 80 #ifdef CONFIG_MEMCG_SWAP_ENABLED 81 static int really_do_swap_account __initdata = 1; 82 #else 83 static int really_do_swap_account __initdata; 84 #endif 85 86 #else 87 #define do_swap_account 0 88 #endif 89 90 91 static const char * const mem_cgroup_stat_names[] = { 92 "cache", 93 "rss", 94 "rss_huge", 95 "mapped_file", 96 "writeback", 97 "swap", 98 }; 99 100 enum mem_cgroup_events_index { 101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 105 MEM_CGROUP_EVENTS_NSTATS, 106 }; 107 108 static const char * const mem_cgroup_events_names[] = { 109 "pgpgin", 110 "pgpgout", 111 "pgfault", 112 "pgmajfault", 113 }; 114 115 static const char * const mem_cgroup_lru_names[] = { 116 "inactive_anon", 117 "active_anon", 118 "inactive_file", 119 "active_file", 120 "unevictable", 121 }; 122 123 /* 124 * Per memcg event counter is incremented at every pagein/pageout. 
 * With THP, it will be incremented by the number of pages. This counter is
 * used to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg event.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	int last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec lruvec;
	unsigned long lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node tree_node;		/* RB tree node */
	unsigned long long usage_in_excess;	/* Set to the value by which
						   the soft limit is exceeded */
	bool on_tree;
	struct mem_cgroup *memcg;		/* Back pointer, we cannot
						   use container_of */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
245 */ 246 struct list_head list; 247 /* 248 * register_event() callback will be used to add new userspace 249 * waiter for changes related to this event. Use eventfd_signal() 250 * on eventfd to send notification to userspace. 251 */ 252 int (*register_event)(struct mem_cgroup *memcg, 253 struct eventfd_ctx *eventfd, const char *args); 254 /* 255 * unregister_event() callback will be called when userspace closes 256 * the eventfd or on cgroup removing. This callback must be set, 257 * if you want provide notification functionality. 258 */ 259 void (*unregister_event)(struct mem_cgroup *memcg, 260 struct eventfd_ctx *eventfd); 261 /* 262 * All fields below needed to unregister event when 263 * userspace closes eventfd. 264 */ 265 poll_table pt; 266 wait_queue_head_t *wqh; 267 wait_queue_t wait; 268 struct work_struct remove; 269 }; 270 271 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 272 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 273 274 /* 275 * The memory controller data structure. The memory controller controls both 276 * page cache and RSS per cgroup. We would eventually like to provide 277 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 278 * to help the administrator determine what knobs to tune. 279 * 280 * TODO: Add a water mark for the memory controller. Reclaim will begin when 281 * we hit the water mark. May be even add a low water mark, such that 282 * no reclaim occurs from a cgroup at it's low water mark, this is 283 * a feature that will be implemented much later in the future. 284 */ 285 struct mem_cgroup { 286 struct cgroup_subsys_state css; 287 /* 288 * the counter to account for memory usage 289 */ 290 struct res_counter res; 291 292 /* vmpressure notifications */ 293 struct vmpressure vmpressure; 294 295 /* 296 * the counter to account for mem+swap usage. 297 */ 298 struct res_counter memsw; 299 300 /* 301 * the counter to account for kernel memory usage. 302 */ 303 struct res_counter kmem; 304 /* 305 * Should the accounting and control be hierarchical, per subtree? 306 */ 307 bool use_hierarchy; 308 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ 309 310 bool oom_lock; 311 atomic_t under_oom; 312 atomic_t oom_wakeups; 313 314 int swappiness; 315 /* OOM-Killer disable */ 316 int oom_kill_disable; 317 318 /* set when res.limit == memsw.limit */ 319 bool memsw_is_minimum; 320 321 /* protect arrays of thresholds */ 322 struct mutex thresholds_lock; 323 324 /* thresholds for memory usage. RCU-protected */ 325 struct mem_cgroup_thresholds thresholds; 326 327 /* thresholds for mem+swap usage. RCU-protected */ 328 struct mem_cgroup_thresholds memsw_thresholds; 329 330 /* For oom notifier event fd */ 331 struct list_head oom_notify; 332 333 /* 334 * Should we move charges of a task when a task is moved into this 335 * mem_cgroup ? And what type of charges should we move ? 336 */ 337 unsigned long move_charge_at_immigrate; 338 /* 339 * set > 0 if pages under this cgroup are moving to other cgroup. 340 */ 341 atomic_t moving_account; 342 /* taken only while moving_account > 0 */ 343 spinlock_t move_lock; 344 /* 345 * percpu counter. 346 */ 347 struct mem_cgroup_stat_cpu __percpu *stat; 348 /* 349 * used when a cpu is offlined or other synchronizations 350 * See mem_cgroup_read_stat(). 
351 */ 352 struct mem_cgroup_stat_cpu nocpu_base; 353 spinlock_t pcp_counter_lock; 354 355 atomic_t dead_count; 356 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 357 struct cg_proto tcp_mem; 358 #endif 359 #if defined(CONFIG_MEMCG_KMEM) 360 /* analogous to slab_common's slab_caches list, but per-memcg; 361 * protected by memcg_slab_mutex */ 362 struct list_head memcg_slab_caches; 363 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 364 int kmemcg_id; 365 #endif 366 367 int last_scanned_node; 368 #if MAX_NUMNODES > 1 369 nodemask_t scan_nodes; 370 atomic_t numainfo_events; 371 atomic_t numainfo_updating; 372 #endif 373 374 /* List of events which userspace want to receive */ 375 struct list_head event_list; 376 spinlock_t event_list_lock; 377 378 struct mem_cgroup_per_node *nodeinfo[0]; 379 /* WARNING: nodeinfo must be the last member here */ 380 }; 381 382 /* internal only representation about the status of kmem accounting. */ 383 enum { 384 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ 385 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 386 }; 387 388 #ifdef CONFIG_MEMCG_KMEM 389 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 390 { 391 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 392 } 393 394 static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 395 { 396 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 397 } 398 399 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 400 { 401 /* 402 * Our caller must use css_get() first, because memcg_uncharge_kmem() 403 * will call css_put() if it sees the memcg is dead. 404 */ 405 smp_wmb(); 406 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) 407 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); 408 } 409 410 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) 411 { 412 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, 413 &memcg->kmem_account_flags); 414 } 415 #endif 416 417 /* Stuffs for move charges at task migration. */ 418 /* 419 * Types of charges to be moved. "move_charge_at_immitgrate" and 420 * "immigrate_flags" are treated as a left-shifted bitmap of these types. 421 */ 422 enum move_type { 423 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 424 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 425 NR_MOVE_TYPE, 426 }; 427 428 /* "mc" and its members are protected by cgroup_mutex */ 429 static struct move_charge_struct { 430 spinlock_t lock; /* for from, to */ 431 struct mem_cgroup *from; 432 struct mem_cgroup *to; 433 unsigned long immigrate_flags; 434 unsigned long precharge; 435 unsigned long moved_charge; 436 unsigned long moved_swap; 437 struct task_struct *moving_task; /* a task moving charges */ 438 wait_queue_head_t waitq; /* a waitq for other context */ 439 } mc = { 440 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 441 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 442 }; 443 444 static bool move_anon(void) 445 { 446 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); 447 } 448 449 static bool move_file(void) 450 { 451 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); 452 } 453 454 /* 455 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 456 * limit reclaim to prevent infinite loops, if they ever occur. 
457 */ 458 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 459 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 460 461 enum charge_type { 462 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 463 MEM_CGROUP_CHARGE_TYPE_ANON, 464 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 465 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 466 NR_CHARGE_TYPE, 467 }; 468 469 /* for encoding cft->private value on file */ 470 enum res_type { 471 _MEM, 472 _MEMSWAP, 473 _OOM_TYPE, 474 _KMEM, 475 }; 476 477 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 478 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 479 #define MEMFILE_ATTR(val) ((val) & 0xffff) 480 /* Used for OOM nofiier */ 481 #define OOM_CONTROL (0) 482 483 /* 484 * Reclaim flags for mem_cgroup_hierarchical_reclaim 485 */ 486 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 487 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 488 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 489 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 490 491 /* 492 * The memcg_create_mutex will be held whenever a new cgroup is created. 493 * As a consequence, any change that needs to protect against new child cgroups 494 * appearing has to hold it as well. 495 */ 496 static DEFINE_MUTEX(memcg_create_mutex); 497 498 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 499 { 500 return s ? container_of(s, struct mem_cgroup, css) : NULL; 501 } 502 503 /* Some nice accessors for the vmpressure. */ 504 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 505 { 506 if (!memcg) 507 memcg = root_mem_cgroup; 508 return &memcg->vmpressure; 509 } 510 511 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 512 { 513 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 514 } 515 516 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 517 { 518 return (memcg == root_mem_cgroup); 519 } 520 521 /* 522 * We restrict the id in the range of [1, 65535], so it can fit into 523 * an unsigned short. 524 */ 525 #define MEM_CGROUP_ID_MAX USHRT_MAX 526 527 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 528 { 529 return memcg->css.id; 530 } 531 532 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 533 { 534 struct cgroup_subsys_state *css; 535 536 css = css_from_id(id, &memory_cgrp_subsys); 537 return mem_cgroup_from_css(css); 538 } 539 540 /* Writing them here to avoid exposing memcg's inner layout */ 541 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 542 543 void sock_update_memcg(struct sock *sk) 544 { 545 if (mem_cgroup_sockets_enabled) { 546 struct mem_cgroup *memcg; 547 struct cg_proto *cg_proto; 548 549 BUG_ON(!sk->sk_prot->proto_cgroup); 550 551 /* Socket cloning can throw us here with sk_cgrp already 552 * filled. It won't however, necessarily happen from 553 * process context. So the test for root memcg given 554 * the current task's memcg won't help us in this case. 555 * 556 * Respecting the original socket's memcg is a better 557 * decision in this case. 
558 */ 559 if (sk->sk_cgrp) { 560 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 561 css_get(&sk->sk_cgrp->memcg->css); 562 return; 563 } 564 565 rcu_read_lock(); 566 memcg = mem_cgroup_from_task(current); 567 cg_proto = sk->sk_prot->proto_cgroup(memcg); 568 if (!mem_cgroup_is_root(memcg) && 569 memcg_proto_active(cg_proto) && 570 css_tryget_online(&memcg->css)) { 571 sk->sk_cgrp = cg_proto; 572 } 573 rcu_read_unlock(); 574 } 575 } 576 EXPORT_SYMBOL(sock_update_memcg); 577 578 void sock_release_memcg(struct sock *sk) 579 { 580 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 581 struct mem_cgroup *memcg; 582 WARN_ON(!sk->sk_cgrp->memcg); 583 memcg = sk->sk_cgrp->memcg; 584 css_put(&sk->sk_cgrp->memcg->css); 585 } 586 } 587 588 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 589 { 590 if (!memcg || mem_cgroup_is_root(memcg)) 591 return NULL; 592 593 return &memcg->tcp_mem; 594 } 595 EXPORT_SYMBOL(tcp_proto_cgroup); 596 597 static void disarm_sock_keys(struct mem_cgroup *memcg) 598 { 599 if (!memcg_proto_activated(&memcg->tcp_mem)) 600 return; 601 static_key_slow_dec(&memcg_socket_limit_enabled); 602 } 603 #else 604 static void disarm_sock_keys(struct mem_cgroup *memcg) 605 { 606 } 607 #endif 608 609 #ifdef CONFIG_MEMCG_KMEM 610 /* 611 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 612 * The main reason for not using cgroup id for this: 613 * this works better in sparse environments, where we have a lot of memcgs, 614 * but only a few kmem-limited. Or also, if we have, for instance, 200 615 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 616 * 200 entry array for that. 617 * 618 * The current size of the caches array is stored in 619 * memcg_limited_groups_array_size. It will double each time we have to 620 * increase it. 621 */ 622 static DEFINE_IDA(kmem_limited_groups); 623 int memcg_limited_groups_array_size; 624 625 /* 626 * MIN_SIZE is different than 1, because we would like to avoid going through 627 * the alloc/free process all the time. In a small machine, 4 kmem-limited 628 * cgroups is a reasonable guess. In the future, it could be a parameter or 629 * tunable, but that is strictly not necessary. 630 * 631 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 632 * this constant directly from cgroup, but it is understandable that this is 633 * better kept as an internal representation in cgroup.c. In any case, the 634 * cgrp_id space is not getting any smaller, and we don't have to necessarily 635 * increase ours as well if it increases. 636 */ 637 #define MEMCG_CACHES_MIN_SIZE 4 638 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 639 640 /* 641 * A lot of the calls to the cache allocation functions are expected to be 642 * inlined by the compiler. 
Since the calls to memcg_kmem_get_cache are 643 * conditional to this static branch, we'll have to allow modules that does 644 * kmem_cache_alloc and the such to see this symbol as well 645 */ 646 struct static_key memcg_kmem_enabled_key; 647 EXPORT_SYMBOL(memcg_kmem_enabled_key); 648 649 static void disarm_kmem_keys(struct mem_cgroup *memcg) 650 { 651 if (memcg_kmem_is_active(memcg)) { 652 static_key_slow_dec(&memcg_kmem_enabled_key); 653 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); 654 } 655 /* 656 * This check can't live in kmem destruction function, 657 * since the charges will outlive the cgroup 658 */ 659 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); 660 } 661 #else 662 static void disarm_kmem_keys(struct mem_cgroup *memcg) 663 { 664 } 665 #endif /* CONFIG_MEMCG_KMEM */ 666 667 static void disarm_static_keys(struct mem_cgroup *memcg) 668 { 669 disarm_sock_keys(memcg); 670 disarm_kmem_keys(memcg); 671 } 672 673 static void drain_all_stock_async(struct mem_cgroup *memcg); 674 675 static struct mem_cgroup_per_zone * 676 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 677 { 678 int nid = zone_to_nid(zone); 679 int zid = zone_idx(zone); 680 681 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 682 } 683 684 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 685 { 686 return &memcg->css; 687 } 688 689 static struct mem_cgroup_per_zone * 690 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 691 { 692 int nid = page_to_nid(page); 693 int zid = page_zonenum(page); 694 695 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 696 } 697 698 static struct mem_cgroup_tree_per_zone * 699 soft_limit_tree_node_zone(int nid, int zid) 700 { 701 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 702 } 703 704 static struct mem_cgroup_tree_per_zone * 705 soft_limit_tree_from_page(struct page *page) 706 { 707 int nid = page_to_nid(page); 708 int zid = page_zonenum(page); 709 710 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 711 } 712 713 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 714 struct mem_cgroup_tree_per_zone *mctz, 715 unsigned long long new_usage_in_excess) 716 { 717 struct rb_node **p = &mctz->rb_root.rb_node; 718 struct rb_node *parent = NULL; 719 struct mem_cgroup_per_zone *mz_node; 720 721 if (mz->on_tree) 722 return; 723 724 mz->usage_in_excess = new_usage_in_excess; 725 if (!mz->usage_in_excess) 726 return; 727 while (*p) { 728 parent = *p; 729 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 730 tree_node); 731 if (mz->usage_in_excess < mz_node->usage_in_excess) 732 p = &(*p)->rb_left; 733 /* 734 * We can't avoid mem cgroups that are over their soft 735 * limit by the same amount 736 */ 737 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 738 p = &(*p)->rb_right; 739 } 740 rb_link_node(&mz->tree_node, parent, p); 741 rb_insert_color(&mz->tree_node, &mctz->rb_root); 742 mz->on_tree = true; 743 } 744 745 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 746 struct mem_cgroup_tree_per_zone *mctz) 747 { 748 if (!mz->on_tree) 749 return; 750 rb_erase(&mz->tree_node, &mctz->rb_root); 751 mz->on_tree = false; 752 } 753 754 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 755 struct mem_cgroup_tree_per_zone *mctz) 756 { 757 unsigned long flags; 758 759 spin_lock_irqsave(&mctz->lock, flags); 760 __mem_cgroup_remove_exceeded(mz, mctz); 761 spin_unlock_irqrestore(&mctz->lock, flags); 762 } 763 
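/*
 * Illustrative sketch, not used anywhere: how the helpers above are meant
 * to be combined.  Given a page and a memcg the caller holds a reference
 * on, look up the per-zone soft-limit tree and requeue the memcg's
 * per-zone node keyed by how far the group is over its soft limit.  This
 * roughly mirrors one step of mem_cgroup_update_tree() below; the function
 * name is hypothetical.
 */
static void __maybe_unused soft_limit_requeue_sketch(struct mem_cgroup *memcg,
						     struct page *page)
{
	struct mem_cgroup_per_zone *mz = mem_cgroup_page_zoneinfo(memcg, page);
	struct mem_cgroup_tree_per_zone *mctz = soft_limit_tree_from_page(page);
	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	/* drop any stale position, then reinsert keyed by the new excess */
	__mem_cgroup_remove_exceeded(mz, mctz);
	__mem_cgroup_insert_exceeded(mz, mctz, excess);
	spin_unlock_irqrestore(&mctz->lock, flags);
}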
764 765 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 766 { 767 unsigned long long excess; 768 struct mem_cgroup_per_zone *mz; 769 struct mem_cgroup_tree_per_zone *mctz; 770 771 mctz = soft_limit_tree_from_page(page); 772 /* 773 * Necessary to update all ancestors when hierarchy is used. 774 * because their event counter is not touched. 775 */ 776 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 777 mz = mem_cgroup_page_zoneinfo(memcg, page); 778 excess = res_counter_soft_limit_excess(&memcg->res); 779 /* 780 * We have to update the tree if mz is on RB-tree or 781 * mem is over its softlimit. 782 */ 783 if (excess || mz->on_tree) { 784 unsigned long flags; 785 786 spin_lock_irqsave(&mctz->lock, flags); 787 /* if on-tree, remove it */ 788 if (mz->on_tree) 789 __mem_cgroup_remove_exceeded(mz, mctz); 790 /* 791 * Insert again. mz->usage_in_excess will be updated. 792 * If excess is 0, no tree ops. 793 */ 794 __mem_cgroup_insert_exceeded(mz, mctz, excess); 795 spin_unlock_irqrestore(&mctz->lock, flags); 796 } 797 } 798 } 799 800 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 801 { 802 struct mem_cgroup_tree_per_zone *mctz; 803 struct mem_cgroup_per_zone *mz; 804 int nid, zid; 805 806 for_each_node(nid) { 807 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 808 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 809 mctz = soft_limit_tree_node_zone(nid, zid); 810 mem_cgroup_remove_exceeded(mz, mctz); 811 } 812 } 813 } 814 815 static struct mem_cgroup_per_zone * 816 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 817 { 818 struct rb_node *rightmost = NULL; 819 struct mem_cgroup_per_zone *mz; 820 821 retry: 822 mz = NULL; 823 rightmost = rb_last(&mctz->rb_root); 824 if (!rightmost) 825 goto done; /* Nothing to reclaim from */ 826 827 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 828 /* 829 * Remove the node now but someone else can add it back, 830 * we will to add it back at the end of reclaim to its correct 831 * position in the tree. 832 */ 833 __mem_cgroup_remove_exceeded(mz, mctz); 834 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 835 !css_tryget_online(&mz->memcg->css)) 836 goto retry; 837 done: 838 return mz; 839 } 840 841 static struct mem_cgroup_per_zone * 842 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 843 { 844 struct mem_cgroup_per_zone *mz; 845 846 spin_lock_irq(&mctz->lock); 847 mz = __mem_cgroup_largest_soft_limit_node(mctz); 848 spin_unlock_irq(&mctz->lock); 849 return mz; 850 } 851 852 /* 853 * Implementation Note: reading percpu statistics for memcg. 854 * 855 * Both of vmstat[] and percpu_counter has threshold and do periodic 856 * synchronization to implement "quick" read. There are trade-off between 857 * reading cost and precision of value. Then, we may have a chance to implement 858 * a periodic synchronizion of counter in memcg's counter. 859 * 860 * But this _read() function is used for user interface now. The user accounts 861 * memory usage by memory cgroup and he _always_ requires exact value because 862 * he accounts memory. Even if we provide quick-and-fuzzy read, we always 863 * have to visit all online cpus and make sum. So, for now, unnecessary 864 * synchronization is not implemented. 
 * (just implemented for cpu hotplug)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, a threshold and synchronization as in vmstat[] should be
 * implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}
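/*
 * Illustrative sketch, not used anywhere: how the _read helpers above are
 * meant to be combined with the name tables at the top of this file to
 * produce a human-readable dump.  The function name is hypothetical; the
 * real consumers are the memory.stat handlers further down in this file.
 */
static void __maybe_unused mem_cgroup_dump_stats_sketch(struct mem_cgroup *memcg)
{
	unsigned int i;

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
		pr_info("%s: %ld\n", mem_cgroup_stat_names[i],
			mem_cgroup_read_stat(memcg, i));
	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
		pr_info("%s: %lu\n", mem_cgroup_events_names[i],
			mem_cgroup_read_events(memcg, i));
}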

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
			       nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
			       nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
			       nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}

unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}

static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
						  int nid,
						  unsigned int lru_mask)
{
	unsigned long nr = 0;
	int zid;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct mem_cgroup_per_zone *mz;
		enum lru_list lru;

		for_each_lru(lru) {
			if (!(BIT(lru) & lru_mask))
				continue;
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			nr += mz->lru_size[lru];
		}
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
					     unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 *
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}

static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g.
during disk probing 1057 * on boot, loopback IO, acct() writes etc. 1058 */ 1059 if (unlikely(!mm)) 1060 memcg = root_mem_cgroup; 1061 else { 1062 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1063 if (unlikely(!memcg)) 1064 memcg = root_mem_cgroup; 1065 } 1066 } while (!css_tryget_online(&memcg->css)); 1067 rcu_read_unlock(); 1068 return memcg; 1069 } 1070 1071 /* 1072 * Returns a next (in a pre-order walk) alive memcg (with elevated css 1073 * ref. count) or NULL if the whole root's subtree has been visited. 1074 * 1075 * helper function to be used by mem_cgroup_iter 1076 */ 1077 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 1078 struct mem_cgroup *last_visited) 1079 { 1080 struct cgroup_subsys_state *prev_css, *next_css; 1081 1082 prev_css = last_visited ? &last_visited->css : NULL; 1083 skip_node: 1084 next_css = css_next_descendant_pre(prev_css, &root->css); 1085 1086 /* 1087 * Even if we found a group we have to make sure it is 1088 * alive. css && !memcg means that the groups should be 1089 * skipped and we should continue the tree walk. 1090 * last_visited css is safe to use because it is 1091 * protected by css_get and the tree walk is rcu safe. 1092 * 1093 * We do not take a reference on the root of the tree walk 1094 * because we might race with the root removal when it would 1095 * be the only node in the iterated hierarchy and mem_cgroup_iter 1096 * would end up in an endless loop because it expects that at 1097 * least one valid node will be returned. Root cannot disappear 1098 * because caller of the iterator should hold it already so 1099 * skipping css reference should be safe. 1100 */ 1101 if (next_css) { 1102 if ((next_css == &root->css) || 1103 ((next_css->flags & CSS_ONLINE) && 1104 css_tryget_online(next_css))) 1105 return mem_cgroup_from_css(next_css); 1106 1107 prev_css = next_css; 1108 goto skip_node; 1109 } 1110 1111 return NULL; 1112 } 1113 1114 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) 1115 { 1116 /* 1117 * When a group in the hierarchy below root is destroyed, the 1118 * hierarchy iterator can no longer be trusted since it might 1119 * have pointed to the destroyed group. Invalidate it. 1120 */ 1121 atomic_inc(&root->dead_count); 1122 } 1123 1124 static struct mem_cgroup * 1125 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, 1126 struct mem_cgroup *root, 1127 int *sequence) 1128 { 1129 struct mem_cgroup *position = NULL; 1130 /* 1131 * A cgroup destruction happens in two stages: offlining and 1132 * release. They are separated by a RCU grace period. 1133 * 1134 * If the iterator is valid, we may still race with an 1135 * offlining. The RCU lock ensures the object won't be 1136 * released, tryget will fail if we lost the race. 1137 */ 1138 *sequence = atomic_read(&root->dead_count); 1139 if (iter->last_dead_count == *sequence) { 1140 smp_rmb(); 1141 position = iter->last_visited; 1142 1143 /* 1144 * We cannot take a reference to root because we might race 1145 * with root removal and returning NULL would end up in 1146 * an endless loop on the iterator user level when root 1147 * would be returned all the time. 
1148 */ 1149 if (position && position != root && 1150 !css_tryget_online(&position->css)) 1151 position = NULL; 1152 } 1153 return position; 1154 } 1155 1156 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, 1157 struct mem_cgroup *last_visited, 1158 struct mem_cgroup *new_position, 1159 struct mem_cgroup *root, 1160 int sequence) 1161 { 1162 /* root reference counting symmetric to mem_cgroup_iter_load */ 1163 if (last_visited && last_visited != root) 1164 css_put(&last_visited->css); 1165 /* 1166 * We store the sequence count from the time @last_visited was 1167 * loaded successfully instead of rereading it here so that we 1168 * don't lose destruction events in between. We could have 1169 * raced with the destruction of @new_position after all. 1170 */ 1171 iter->last_visited = new_position; 1172 smp_wmb(); 1173 iter->last_dead_count = sequence; 1174 } 1175 1176 /** 1177 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1178 * @root: hierarchy root 1179 * @prev: previously returned memcg, NULL on first invocation 1180 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1181 * 1182 * Returns references to children of the hierarchy below @root, or 1183 * @root itself, or %NULL after a full round-trip. 1184 * 1185 * Caller must pass the return value in @prev on subsequent 1186 * invocations for reference counting, or use mem_cgroup_iter_break() 1187 * to cancel a hierarchy walk before the round-trip is complete. 1188 * 1189 * Reclaimers can specify a zone and a priority level in @reclaim to 1190 * divide up the memcgs in the hierarchy among all concurrent 1191 * reclaimers operating on the same zone and priority. 1192 */ 1193 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1194 struct mem_cgroup *prev, 1195 struct mem_cgroup_reclaim_cookie *reclaim) 1196 { 1197 struct mem_cgroup *memcg = NULL; 1198 struct mem_cgroup *last_visited = NULL; 1199 1200 if (mem_cgroup_disabled()) 1201 return NULL; 1202 1203 if (!root) 1204 root = root_mem_cgroup; 1205 1206 if (prev && !reclaim) 1207 last_visited = prev; 1208 1209 if (!root->use_hierarchy && root != root_mem_cgroup) { 1210 if (prev) 1211 goto out_css_put; 1212 return root; 1213 } 1214 1215 rcu_read_lock(); 1216 while (!memcg) { 1217 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1218 int uninitialized_var(seq); 1219 1220 if (reclaim) { 1221 struct mem_cgroup_per_zone *mz; 1222 1223 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); 1224 iter = &mz->reclaim_iter[reclaim->priority]; 1225 if (prev && reclaim->generation != iter->generation) { 1226 iter->last_visited = NULL; 1227 goto out_unlock; 1228 } 1229 1230 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1231 } 1232 1233 memcg = __mem_cgroup_iter_next(root, last_visited); 1234 1235 if (reclaim) { 1236 mem_cgroup_iter_update(iter, last_visited, memcg, root, 1237 seq); 1238 1239 if (!memcg) 1240 iter->generation++; 1241 else if (!prev && memcg) 1242 reclaim->generation = iter->generation; 1243 } 1244 1245 if (prev && !memcg) 1246 goto out_unlock; 1247 } 1248 out_unlock: 1249 rcu_read_unlock(); 1250 out_css_put: 1251 if (prev && prev != root) 1252 css_put(&prev->css); 1253 1254 return memcg; 1255 } 1256 1257 /** 1258 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1259 * @root: hierarchy root 1260 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1261 */ 1262 void mem_cgroup_iter_break(struct mem_cgroup *root, 1263 struct mem_cgroup *prev) 1264 { 1265 if (!root) 1266 root = root_mem_cgroup; 
1267 if (prev && prev != root) 1268 css_put(&prev->css); 1269 } 1270 1271 /* 1272 * Iteration constructs for visiting all cgroups (under a tree). If 1273 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1274 * be used for reference counting. 1275 */ 1276 #define for_each_mem_cgroup_tree(iter, root) \ 1277 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1278 iter != NULL; \ 1279 iter = mem_cgroup_iter(root, iter, NULL)) 1280 1281 #define for_each_mem_cgroup(iter) \ 1282 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1283 iter != NULL; \ 1284 iter = mem_cgroup_iter(NULL, iter, NULL)) 1285 1286 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1287 { 1288 struct mem_cgroup *memcg; 1289 1290 rcu_read_lock(); 1291 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1292 if (unlikely(!memcg)) 1293 goto out; 1294 1295 switch (idx) { 1296 case PGFAULT: 1297 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1298 break; 1299 case PGMAJFAULT: 1300 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1301 break; 1302 default: 1303 BUG(); 1304 } 1305 out: 1306 rcu_read_unlock(); 1307 } 1308 EXPORT_SYMBOL(__mem_cgroup_count_vm_event); 1309 1310 /** 1311 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1312 * @zone: zone of the wanted lruvec 1313 * @memcg: memcg of the wanted lruvec 1314 * 1315 * Returns the lru list vector holding pages for the given @zone and 1316 * @mem. This can be the global zone lruvec, if the memory controller 1317 * is disabled. 1318 */ 1319 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1320 struct mem_cgroup *memcg) 1321 { 1322 struct mem_cgroup_per_zone *mz; 1323 struct lruvec *lruvec; 1324 1325 if (mem_cgroup_disabled()) { 1326 lruvec = &zone->lruvec; 1327 goto out; 1328 } 1329 1330 mz = mem_cgroup_zone_zoneinfo(memcg, zone); 1331 lruvec = &mz->lruvec; 1332 out: 1333 /* 1334 * Since a node can be onlined after the mem_cgroup was created, 1335 * we have to be prepared to initialize lruvec->zone here; 1336 * and if offlined then reonlined, we need to reinitialize it. 1337 */ 1338 if (unlikely(lruvec->zone != zone)) 1339 lruvec->zone = zone; 1340 return lruvec; 1341 } 1342 1343 /** 1344 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1345 * @page: the page 1346 * @zone: zone of the page 1347 */ 1348 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1349 { 1350 struct mem_cgroup_per_zone *mz; 1351 struct mem_cgroup *memcg; 1352 struct page_cgroup *pc; 1353 struct lruvec *lruvec; 1354 1355 if (mem_cgroup_disabled()) { 1356 lruvec = &zone->lruvec; 1357 goto out; 1358 } 1359 1360 pc = lookup_page_cgroup(page); 1361 memcg = pc->mem_cgroup; 1362 1363 /* 1364 * Surreptitiously switch any uncharged offlist page to root: 1365 * an uncharged page off lru does nothing to secure 1366 * its former mem_cgroup from sudden removal. 1367 * 1368 * Our caller holds lru_lock, and PageCgroupUsed is updated 1369 * under page_cgroup lock: between them, they make all uses 1370 * of pc->mem_cgroup safe. 1371 */ 1372 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1373 pc->mem_cgroup = memcg = root_mem_cgroup; 1374 1375 mz = mem_cgroup_page_zoneinfo(memcg, page); 1376 lruvec = &mz->lruvec; 1377 out: 1378 /* 1379 * Since a node can be onlined after the mem_cgroup was created, 1380 * we have to be prepared to initialize lruvec->zone here; 1381 * and if offlined then reonlined, we need to reinitialize it. 
1382 */ 1383 if (unlikely(lruvec->zone != zone)) 1384 lruvec->zone = zone; 1385 return lruvec; 1386 } 1387 1388 /** 1389 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1390 * @lruvec: mem_cgroup per zone lru vector 1391 * @lru: index of lru list the page is sitting on 1392 * @nr_pages: positive when adding or negative when removing 1393 * 1394 * This function must be called when a page is added to or removed from an 1395 * lru list. 1396 */ 1397 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1398 int nr_pages) 1399 { 1400 struct mem_cgroup_per_zone *mz; 1401 unsigned long *lru_size; 1402 1403 if (mem_cgroup_disabled()) 1404 return; 1405 1406 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1407 lru_size = mz->lru_size + lru; 1408 *lru_size += nr_pages; 1409 VM_BUG_ON((long)(*lru_size) < 0); 1410 } 1411 1412 /* 1413 * Checks whether given mem is same or in the root_mem_cgroup's 1414 * hierarchy subtree 1415 */ 1416 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1417 struct mem_cgroup *memcg) 1418 { 1419 if (root_memcg == memcg) 1420 return true; 1421 if (!root_memcg->use_hierarchy || !memcg) 1422 return false; 1423 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); 1424 } 1425 1426 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1427 struct mem_cgroup *memcg) 1428 { 1429 bool ret; 1430 1431 rcu_read_lock(); 1432 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); 1433 rcu_read_unlock(); 1434 return ret; 1435 } 1436 1437 bool task_in_mem_cgroup(struct task_struct *task, 1438 const struct mem_cgroup *memcg) 1439 { 1440 struct mem_cgroup *curr = NULL; 1441 struct task_struct *p; 1442 bool ret; 1443 1444 p = find_lock_task_mm(task); 1445 if (p) { 1446 curr = get_mem_cgroup_from_mm(p->mm); 1447 task_unlock(p); 1448 } else { 1449 /* 1450 * All threads may have already detached their mm's, but the oom 1451 * killer still needs to detect if they have already been oom 1452 * killed to prevent needlessly killing additional tasks. 1453 */ 1454 rcu_read_lock(); 1455 curr = mem_cgroup_from_task(task); 1456 if (curr) 1457 css_get(&curr->css); 1458 rcu_read_unlock(); 1459 } 1460 /* 1461 * We should check use_hierarchy of "memcg" not "curr". Because checking 1462 * use_hierarchy of "curr" here make this function true if hierarchy is 1463 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* 1464 * hierarchy(even if use_hierarchy is disabled in "memcg"). 1465 */ 1466 ret = mem_cgroup_same_or_subtree(memcg, curr); 1467 css_put(&curr->css); 1468 return ret; 1469 } 1470 1471 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1472 { 1473 unsigned long inactive_ratio; 1474 unsigned long inactive; 1475 unsigned long active; 1476 unsigned long gb; 1477 1478 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1479 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1480 1481 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1482 if (gb) 1483 inactive_ratio = int_sqrt(10 * gb); 1484 else 1485 inactive_ratio = 1; 1486 1487 return inactive * inactive_ratio < active; 1488 } 1489 1490 #define mem_cgroup_from_res_counter(counter, member) \ 1491 container_of(counter, struct mem_cgroup, member) 1492 1493 /** 1494 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1495 * @memcg: the memory cgroup 1496 * 1497 * Returns the maximum amount of memory @mem can be charged with, in 1498 * pages. 
1499 */ 1500 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1501 { 1502 unsigned long long margin; 1503 1504 margin = res_counter_margin(&memcg->res); 1505 if (do_swap_account) 1506 margin = min(margin, res_counter_margin(&memcg->memsw)); 1507 return margin >> PAGE_SHIFT; 1508 } 1509 1510 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1511 { 1512 /* root ? */ 1513 if (mem_cgroup_disabled() || !memcg->css.parent) 1514 return vm_swappiness; 1515 1516 return memcg->swappiness; 1517 } 1518 1519 /* 1520 * memcg->moving_account is used for checking possibility that some thread is 1521 * calling move_account(). When a thread on CPU-A starts moving pages under 1522 * a memcg, other threads should check memcg->moving_account under 1523 * rcu_read_lock(), like this: 1524 * 1525 * CPU-A CPU-B 1526 * rcu_read_lock() 1527 * memcg->moving_account+1 if (memcg->mocing_account) 1528 * take heavy locks. 1529 * synchronize_rcu() update something. 1530 * rcu_read_unlock() 1531 * start move here. 1532 */ 1533 1534 /* for quick checking without looking up memcg */ 1535 atomic_t memcg_moving __read_mostly; 1536 1537 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1538 { 1539 atomic_inc(&memcg_moving); 1540 atomic_inc(&memcg->moving_account); 1541 synchronize_rcu(); 1542 } 1543 1544 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1545 { 1546 /* 1547 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1548 * We check NULL in callee rather than caller. 1549 */ 1550 if (memcg) { 1551 atomic_dec(&memcg_moving); 1552 atomic_dec(&memcg->moving_account); 1553 } 1554 } 1555 1556 /* 1557 * A routine for checking "mem" is under move_account() or not. 1558 * 1559 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1560 * moving cgroups. This is for waiting at high-memory pressure 1561 * caused by "move". 1562 */ 1563 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1564 { 1565 struct mem_cgroup *from; 1566 struct mem_cgroup *to; 1567 bool ret = false; 1568 /* 1569 * Unlike task_move routines, we access mc.to, mc.from not under 1570 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1571 */ 1572 spin_lock(&mc.lock); 1573 from = mc.from; 1574 to = mc.to; 1575 if (!from) 1576 goto unlock; 1577 1578 ret = mem_cgroup_same_or_subtree(memcg, from) 1579 || mem_cgroup_same_or_subtree(memcg, to); 1580 unlock: 1581 spin_unlock(&mc.lock); 1582 return ret; 1583 } 1584 1585 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1586 { 1587 if (mc.moving_task && current != mc.moving_task) { 1588 if (mem_cgroup_under_move(memcg)) { 1589 DEFINE_WAIT(wait); 1590 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1591 /* moving charge context might have finished. */ 1592 if (mc.moving_task) 1593 schedule(); 1594 finish_wait(&mc.waitq, &wait); 1595 return true; 1596 } 1597 } 1598 return false; 1599 } 1600 1601 /* 1602 * Take this lock when 1603 * - a code tries to modify page's memcg while it's USED. 1604 * - a code tries to modify page state accounting in a memcg. 1605 */ 1606 static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1607 unsigned long *flags) 1608 { 1609 spin_lock_irqsave(&memcg->move_lock, *flags); 1610 } 1611 1612 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1613 unsigned long *flags) 1614 { 1615 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1616 } 1617 1618 #define K(x) ((x) << (PAGE_SHIFT-10)) 1619 /** 1620 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 
1621 * @memcg: The memory cgroup that went over limit 1622 * @p: Task that is going to be killed 1623 * 1624 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1625 * enabled 1626 */ 1627 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1628 { 1629 /* oom_info_lock ensures that parallel ooms do not interleave */ 1630 static DEFINE_MUTEX(oom_info_lock); 1631 struct mem_cgroup *iter; 1632 unsigned int i; 1633 1634 if (!p) 1635 return; 1636 1637 mutex_lock(&oom_info_lock); 1638 rcu_read_lock(); 1639 1640 pr_info("Task in "); 1641 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1642 pr_info(" killed as a result of limit of "); 1643 pr_cont_cgroup_path(memcg->css.cgroup); 1644 pr_info("\n"); 1645 1646 rcu_read_unlock(); 1647 1648 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1649 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1650 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1651 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1652 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1653 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1654 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1655 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1656 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1657 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1658 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1659 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1660 1661 for_each_mem_cgroup_tree(iter, memcg) { 1662 pr_info("Memory cgroup stats for "); 1663 pr_cont_cgroup_path(iter->css.cgroup); 1664 pr_cont(":"); 1665 1666 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1667 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1668 continue; 1669 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1670 K(mem_cgroup_read_stat(iter, i))); 1671 } 1672 1673 for (i = 0; i < NR_LRU_LISTS; i++) 1674 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1675 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1676 1677 pr_cont("\n"); 1678 } 1679 mutex_unlock(&oom_info_lock); 1680 } 1681 1682 /* 1683 * This function returns the number of memcg under hierarchy tree. Returns 1684 * 1(self count) if no children. 1685 */ 1686 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1687 { 1688 int num = 0; 1689 struct mem_cgroup *iter; 1690 1691 for_each_mem_cgroup_tree(iter, memcg) 1692 num++; 1693 return num; 1694 } 1695 1696 /* 1697 * Return the memory (and swap, if configured) limit for a memcg. 1698 */ 1699 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1700 { 1701 u64 limit; 1702 1703 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1704 1705 /* 1706 * Do not consider swap space if we cannot swap due to swappiness 1707 */ 1708 if (mem_cgroup_swappiness(memcg)) { 1709 u64 memsw; 1710 1711 limit += total_swap_pages << PAGE_SHIFT; 1712 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1713 1714 /* 1715 * If memsw is finite and limits the amount of swap space 1716 * available to this memcg, return that limit. 1717 */ 1718 limit = min(limit, memsw); 1719 } 1720 1721 return limit; 1722 } 1723 1724 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1725 int order) 1726 { 1727 struct mem_cgroup *iter; 1728 unsigned long chosen_points = 0; 1729 unsigned long totalpages; 1730 unsigned int points = 0; 1731 struct task_struct *chosen = NULL; 1732 1733 /* 1734 * If current has a pending SIGKILL or is exiting, then automatically 1735 * select it. 
The goal is to allow it to allocate so that it may 1736 * quickly exit and free its memory. 1737 */ 1738 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 1739 set_thread_flag(TIF_MEMDIE); 1740 return; 1741 } 1742 1743 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1744 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1745 for_each_mem_cgroup_tree(iter, memcg) { 1746 struct css_task_iter it; 1747 struct task_struct *task; 1748 1749 css_task_iter_start(&iter->css, &it); 1750 while ((task = css_task_iter_next(&it))) { 1751 switch (oom_scan_process_thread(task, totalpages, NULL, 1752 false)) { 1753 case OOM_SCAN_SELECT: 1754 if (chosen) 1755 put_task_struct(chosen); 1756 chosen = task; 1757 chosen_points = ULONG_MAX; 1758 get_task_struct(chosen); 1759 /* fall through */ 1760 case OOM_SCAN_CONTINUE: 1761 continue; 1762 case OOM_SCAN_ABORT: 1763 css_task_iter_end(&it); 1764 mem_cgroup_iter_break(memcg, iter); 1765 if (chosen) 1766 put_task_struct(chosen); 1767 return; 1768 case OOM_SCAN_OK: 1769 break; 1770 }; 1771 points = oom_badness(task, memcg, NULL, totalpages); 1772 if (!points || points < chosen_points) 1773 continue; 1774 /* Prefer thread group leaders for display purposes */ 1775 if (points == chosen_points && 1776 thread_group_leader(chosen)) 1777 continue; 1778 1779 if (chosen) 1780 put_task_struct(chosen); 1781 chosen = task; 1782 chosen_points = points; 1783 get_task_struct(chosen); 1784 } 1785 css_task_iter_end(&it); 1786 } 1787 1788 if (!chosen) 1789 return; 1790 points = chosen_points * 1000 / totalpages; 1791 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1792 NULL, "Memory cgroup out of memory"); 1793 } 1794 1795 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1796 gfp_t gfp_mask, 1797 unsigned long flags) 1798 { 1799 unsigned long total = 0; 1800 bool noswap = false; 1801 int loop; 1802 1803 if (flags & MEM_CGROUP_RECLAIM_NOSWAP) 1804 noswap = true; 1805 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) 1806 noswap = true; 1807 1808 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { 1809 if (loop) 1810 drain_all_stock_async(memcg); 1811 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); 1812 /* 1813 * Allow limit shrinkers, which are triggered directly 1814 * by userspace, to catch signals and stop reclaim 1815 * after minimal progress, regardless of the margin. 1816 */ 1817 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) 1818 break; 1819 if (mem_cgroup_margin(memcg)) 1820 break; 1821 /* 1822 * If nothing was reclaimed after two attempts, there 1823 * may be no reclaimable pages in this hierarchy. 1824 */ 1825 if (loop && !total) 1826 break; 1827 } 1828 return total; 1829 } 1830 1831 /** 1832 * test_mem_cgroup_node_reclaimable 1833 * @memcg: the target memcg 1834 * @nid: the node ID to be checked. 1835 * @noswap : specify true here if the user wants flle only information. 1836 * 1837 * This function returns whether the specified memcg contains any 1838 * reclaimable pages on a node. Returns true if there are any reclaimable 1839 * pages in the node. 
 */
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
					     int nid, bool noswap)
{
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
		return true;
	if (noswap || !total_swap_pages)
		return false;
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
		return true;
	return false;

}
#if MAX_NUMNODES > 1

/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist. So update the list loosely once per 10 secs.
 *
 */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	memcg->scan_nodes = node_states[N_MEMORY];

	for_each_node_mask(nid, node_states[N_MEMORY]) {

		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
			node_clear(nid, memcg->scan_nodes);
	}

	atomic_set(&memcg->numainfo_events, 0);
	atomic_set(&memcg->numainfo_updating, 0);
}

/*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is OK. Considering
 * memory reclaim from the current node, there are pros and cons.
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used. So, it may make the LRU bad. And if several threads
 * hit limits, they will see contention on a node. But freeing from a remote
 * node means more costs for memory reclaim because of memory latency.
 *
 * Now, we use round-robin. A better algorithm is welcome.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

	node = next_node(node, memcg->scan_nodes);
	if (node == MAX_NUMNODES)
		node = first_node(memcg->scan_nodes);
	/*
	 * We call this when we hit limit, not when pages are added to LRU.
	 * No LRU may hold pages because all pages are UNEVICTABLE or
	 * memcg is too small and all pages are not on LRU. In that case,
	 * we use the current node.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	memcg->last_scanned_node = node;
	return node;
}

/*
 * Check all nodes whether they contain reclaimable pages or not.
 * For quick scan, we make use of scan_nodes. This will allow us to skip
 * unused nodes. But scan_nodes is lazily updated and may not contain
 * enough new information. We need to do a double check.
 */
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	int nid;

	/*
	 * quick check...making use of scan_node.
	 * We can skip unused nodes.
	 */
	if (!nodes_empty(memcg->scan_nodes)) {
		for (nid = first_node(memcg->scan_nodes);
		     nid < MAX_NUMNODES;
		     nid = next_node(nid, memcg->scan_nodes)) {

			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
				return true;
		}
	}
	/*
	 * Check rest of nodes.
1946 */ 1947 for_each_node_state(nid, N_MEMORY) { 1948 if (node_isset(nid, memcg->scan_nodes)) 1949 continue; 1950 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1951 return true; 1952 } 1953 return false; 1954 } 1955 1956 #else 1957 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1958 { 1959 return 0; 1960 } 1961 1962 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1963 { 1964 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1965 } 1966 #endif 1967 1968 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1969 struct zone *zone, 1970 gfp_t gfp_mask, 1971 unsigned long *total_scanned) 1972 { 1973 struct mem_cgroup *victim = NULL; 1974 int total = 0; 1975 int loop = 0; 1976 unsigned long excess; 1977 unsigned long nr_scanned; 1978 struct mem_cgroup_reclaim_cookie reclaim = { 1979 .zone = zone, 1980 .priority = 0, 1981 }; 1982 1983 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1984 1985 while (1) { 1986 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1987 if (!victim) { 1988 loop++; 1989 if (loop >= 2) { 1990 /* 1991 * If we have not been able to reclaim 1992 * anything, it might because there are 1993 * no reclaimable pages under this hierarchy 1994 */ 1995 if (!total) 1996 break; 1997 /* 1998 * We want to do more targeted reclaim. 1999 * excess >> 2 is not to excessive so as to 2000 * reclaim too much, nor too less that we keep 2001 * coming back to reclaim from this cgroup 2002 */ 2003 if (total >= (excess >> 2) || 2004 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 2005 break; 2006 } 2007 continue; 2008 } 2009 if (!mem_cgroup_reclaimable(victim, false)) 2010 continue; 2011 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 2012 zone, &nr_scanned); 2013 *total_scanned += nr_scanned; 2014 if (!res_counter_soft_limit_excess(&root_memcg->res)) 2015 break; 2016 } 2017 mem_cgroup_iter_break(root_memcg, victim); 2018 return total; 2019 } 2020 2021 #ifdef CONFIG_LOCKDEP 2022 static struct lockdep_map memcg_oom_lock_dep_map = { 2023 .name = "memcg_oom_lock", 2024 }; 2025 #endif 2026 2027 static DEFINE_SPINLOCK(memcg_oom_lock); 2028 2029 /* 2030 * Check OOM-Killer is already running under our hierarchy. 2031 * If someone is running, return false. 2032 */ 2033 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 2034 { 2035 struct mem_cgroup *iter, *failed = NULL; 2036 2037 spin_lock(&memcg_oom_lock); 2038 2039 for_each_mem_cgroup_tree(iter, memcg) { 2040 if (iter->oom_lock) { 2041 /* 2042 * this subtree of our hierarchy is already locked 2043 * so we cannot give a lock. 
2044 */ 2045 failed = iter; 2046 mem_cgroup_iter_break(memcg, iter); 2047 break; 2048 } else 2049 iter->oom_lock = true; 2050 } 2051 2052 if (failed) { 2053 /* 2054 * OK, we failed to lock the whole subtree so we have 2055 * to clean up what we set up to the failing subtree 2056 */ 2057 for_each_mem_cgroup_tree(iter, memcg) { 2058 if (iter == failed) { 2059 mem_cgroup_iter_break(memcg, iter); 2060 break; 2061 } 2062 iter->oom_lock = false; 2063 } 2064 } else 2065 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 2066 2067 spin_unlock(&memcg_oom_lock); 2068 2069 return !failed; 2070 } 2071 2072 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 2073 { 2074 struct mem_cgroup *iter; 2075 2076 spin_lock(&memcg_oom_lock); 2077 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 2078 for_each_mem_cgroup_tree(iter, memcg) 2079 iter->oom_lock = false; 2080 spin_unlock(&memcg_oom_lock); 2081 } 2082 2083 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 2084 { 2085 struct mem_cgroup *iter; 2086 2087 for_each_mem_cgroup_tree(iter, memcg) 2088 atomic_inc(&iter->under_oom); 2089 } 2090 2091 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 2092 { 2093 struct mem_cgroup *iter; 2094 2095 /* 2096 * When a new child is created while the hierarchy is under oom, 2097 * mem_cgroup_oom_lock() may not be called. We have to use 2098 * atomic_add_unless() here. 2099 */ 2100 for_each_mem_cgroup_tree(iter, memcg) 2101 atomic_add_unless(&iter->under_oom, -1, 0); 2102 } 2103 2104 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 2105 2106 struct oom_wait_info { 2107 struct mem_cgroup *memcg; 2108 wait_queue_t wait; 2109 }; 2110 2111 static int memcg_oom_wake_function(wait_queue_t *wait, 2112 unsigned mode, int sync, void *arg) 2113 { 2114 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2115 struct mem_cgroup *oom_wait_memcg; 2116 struct oom_wait_info *oom_wait_info; 2117 2118 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2119 oom_wait_memcg = oom_wait_info->memcg; 2120 2121 /* 2122 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 2123 * Then we can use css_is_ancestor without taking care of RCU. 2124 */ 2125 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2126 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2127 return 0; 2128 return autoremove_wake_function(wait, mode, sync, arg); 2129 } 2130 2131 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2132 { 2133 atomic_inc(&memcg->oom_wakeups); 2134 /* for filtering, pass "memcg" as argument. */ 2135 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2136 } 2137 2138 static void memcg_oom_recover(struct mem_cgroup *memcg) 2139 { 2140 if (memcg && atomic_read(&memcg->under_oom)) 2141 memcg_wakeup_oom(memcg); 2142 } 2143 2144 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2145 { 2146 if (!current->memcg_oom.may_oom) 2147 return; 2148 /* 2149 * We are in the middle of the charge context here, so we 2150 * don't want to block when potentially sitting on a callstack 2151 * that holds all kinds of filesystem and mm locks. 2152 * 2153 * Also, the caller may handle a failed allocation gracefully 2154 * (like optional page cache readahead) and so an OOM killer 2155 * invocation might not even be necessary. 
2156 * 2157 * That's why we don't do anything here except remember the 2158 * OOM context and then deal with it at the end of the page 2159 * fault when the stack is unwound, the locks are released, 2160 * and when we know whether the fault was overall successful. 2161 */ 2162 css_get(&memcg->css); 2163 current->memcg_oom.memcg = memcg; 2164 current->memcg_oom.gfp_mask = mask; 2165 current->memcg_oom.order = order; 2166 } 2167 2168 /** 2169 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2170 * @handle: actually kill/wait or just clean up the OOM state 2171 * 2172 * This has to be called at the end of a page fault if the memcg OOM 2173 * handler was enabled. 2174 * 2175 * Memcg supports userspace OOM handling where failed allocations must 2176 * sleep on a waitqueue until the userspace task resolves the 2177 * situation. Sleeping directly in the charge context with all kinds 2178 * of locks held is not a good idea, instead we remember an OOM state 2179 * in the task and mem_cgroup_oom_synchronize() has to be called at 2180 * the end of the page fault to complete the OOM handling. 2181 * 2182 * Returns %true if an ongoing memcg OOM situation was detected and 2183 * completed, %false otherwise. 2184 */ 2185 bool mem_cgroup_oom_synchronize(bool handle) 2186 { 2187 struct mem_cgroup *memcg = current->memcg_oom.memcg; 2188 struct oom_wait_info owait; 2189 bool locked; 2190 2191 /* OOM is global, do not handle */ 2192 if (!memcg) 2193 return false; 2194 2195 if (!handle) 2196 goto cleanup; 2197 2198 owait.memcg = memcg; 2199 owait.wait.flags = 0; 2200 owait.wait.func = memcg_oom_wake_function; 2201 owait.wait.private = current; 2202 INIT_LIST_HEAD(&owait.wait.task_list); 2203 2204 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2205 mem_cgroup_mark_under_oom(memcg); 2206 2207 locked = mem_cgroup_oom_trylock(memcg); 2208 2209 if (locked) 2210 mem_cgroup_oom_notify(memcg); 2211 2212 if (locked && !memcg->oom_kill_disable) { 2213 mem_cgroup_unmark_under_oom(memcg); 2214 finish_wait(&memcg_oom_waitq, &owait.wait); 2215 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2216 current->memcg_oom.order); 2217 } else { 2218 schedule(); 2219 mem_cgroup_unmark_under_oom(memcg); 2220 finish_wait(&memcg_oom_waitq, &owait.wait); 2221 } 2222 2223 if (locked) { 2224 mem_cgroup_oom_unlock(memcg); 2225 /* 2226 * There is no guarantee that an OOM-lock contender 2227 * sees the wakeups triggered by the OOM kill 2228 * uncharges. Wake any sleepers explicitely. 2229 */ 2230 memcg_oom_recover(memcg); 2231 } 2232 cleanup: 2233 current->memcg_oom.memcg = NULL; 2234 css_put(&memcg->css); 2235 return true; 2236 } 2237 2238 /* 2239 * Used to update mapped file or writeback or other statistics. 2240 * 2241 * Notes: Race condition 2242 * 2243 * Charging occurs during page instantiation, while the page is 2244 * unmapped and locked in page migration, or while the page table is 2245 * locked in THP migration. No race is possible. 2246 * 2247 * Uncharge happens to pages with zero references, no race possible. 2248 * 2249 * Charge moving between groups is protected by checking mm->moving 2250 * account and taking the move_lock in the slowpath. 
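 *
 * A minimal caller sketch of the protocol defined below (illustrative
 * only, not lifted from a real call site; real callers are assumed to
 * go through the non-underscore wrappers in memcontrol.h):
 *
 *	bool locked = false;
 *	unsigned long flags;
 *
 *	rcu_read_lock();
 *	__mem_cgroup_begin_update_page_stat(page, &locked, &flags);
 *	mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, 1);
 *	if (locked)
 *		__mem_cgroup_end_update_page_stat(page, &flags);
 *	rcu_read_unlock();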
2251 */ 2252 2253 void __mem_cgroup_begin_update_page_stat(struct page *page, 2254 bool *locked, unsigned long *flags) 2255 { 2256 struct mem_cgroup *memcg; 2257 struct page_cgroup *pc; 2258 2259 pc = lookup_page_cgroup(page); 2260 again: 2261 memcg = pc->mem_cgroup; 2262 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2263 return; 2264 /* 2265 * If this memory cgroup is not under account moving, we don't 2266 * need to take move_lock_mem_cgroup(). Because we already hold 2267 * rcu_read_lock(), any calls to move_account will be delayed until 2268 * rcu_read_unlock(). 2269 */ 2270 VM_BUG_ON(!rcu_read_lock_held()); 2271 if (atomic_read(&memcg->moving_account) <= 0) 2272 return; 2273 2274 move_lock_mem_cgroup(memcg, flags); 2275 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2276 move_unlock_mem_cgroup(memcg, flags); 2277 goto again; 2278 } 2279 *locked = true; 2280 } 2281 2282 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 2283 { 2284 struct page_cgroup *pc = lookup_page_cgroup(page); 2285 2286 /* 2287 * It's guaranteed that pc->mem_cgroup never changes while 2288 * lock is held because a routine modifies pc->mem_cgroup 2289 * should take move_lock_mem_cgroup(). 2290 */ 2291 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 2292 } 2293 2294 void mem_cgroup_update_page_stat(struct page *page, 2295 enum mem_cgroup_stat_index idx, int val) 2296 { 2297 struct mem_cgroup *memcg; 2298 struct page_cgroup *pc = lookup_page_cgroup(page); 2299 unsigned long uninitialized_var(flags); 2300 2301 if (mem_cgroup_disabled()) 2302 return; 2303 2304 VM_BUG_ON(!rcu_read_lock_held()); 2305 memcg = pc->mem_cgroup; 2306 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2307 return; 2308 2309 this_cpu_add(memcg->stat->count[idx], val); 2310 } 2311 2312 /* 2313 * size of first charge trial. "32" comes from vmscan.c's magic value. 2314 * TODO: maybe necessary to use big numbers in big irons. 2315 */ 2316 #define CHARGE_BATCH 32U 2317 struct memcg_stock_pcp { 2318 struct mem_cgroup *cached; /* this never be root cgroup */ 2319 unsigned int nr_pages; 2320 struct work_struct work; 2321 unsigned long flags; 2322 #define FLUSHING_CACHED_CHARGE 0 2323 }; 2324 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2325 static DEFINE_MUTEX(percpu_charge_mutex); 2326 2327 /** 2328 * consume_stock: Try to consume stocked charge on this cpu. 2329 * @memcg: memcg to consume from. 2330 * @nr_pages: how many pages to charge. 2331 * 2332 * The charges will only happen if @memcg matches the current cpu's memcg 2333 * stock, and at least @nr_pages are available in that stock. Failure to 2334 * service an allocation will refill the stock. 2335 * 2336 * returns true if successful, false otherwise. 2337 */ 2338 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2339 { 2340 struct memcg_stock_pcp *stock; 2341 bool ret = true; 2342 2343 if (nr_pages > CHARGE_BATCH) 2344 return false; 2345 2346 stock = &get_cpu_var(memcg_stock); 2347 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2348 stock->nr_pages -= nr_pages; 2349 else /* need to call res_counter_charge */ 2350 ret = false; 2351 put_cpu_var(memcg_stock); 2352 return ret; 2353 } 2354 2355 /* 2356 * Returns stocks cached in percpu to res_counter and reset cached information. 
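 *
 * The stock is filled by refill_stock() - typically with the
 * CHARGE_BATCH overshoot left over by try_charge() - and consumed by
 * consume_stock(); drain_stock() hands whatever remains back to the
 * res_counter.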
2357 */ 2358 static void drain_stock(struct memcg_stock_pcp *stock) 2359 { 2360 struct mem_cgroup *old = stock->cached; 2361 2362 if (stock->nr_pages) { 2363 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2364 2365 res_counter_uncharge(&old->res, bytes); 2366 if (do_swap_account) 2367 res_counter_uncharge(&old->memsw, bytes); 2368 stock->nr_pages = 0; 2369 } 2370 stock->cached = NULL; 2371 } 2372 2373 /* 2374 * This must be called under preempt disabled or must be called by 2375 * a thread which is pinned to local cpu. 2376 */ 2377 static void drain_local_stock(struct work_struct *dummy) 2378 { 2379 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); 2380 drain_stock(stock); 2381 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2382 } 2383 2384 static void __init memcg_stock_init(void) 2385 { 2386 int cpu; 2387 2388 for_each_possible_cpu(cpu) { 2389 struct memcg_stock_pcp *stock = 2390 &per_cpu(memcg_stock, cpu); 2391 INIT_WORK(&stock->work, drain_local_stock); 2392 } 2393 } 2394 2395 /* 2396 * Cache charges(val) which is from res_counter, to local per_cpu area. 2397 * This will be consumed by consume_stock() function, later. 2398 */ 2399 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2400 { 2401 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2402 2403 if (stock->cached != memcg) { /* reset if necessary */ 2404 drain_stock(stock); 2405 stock->cached = memcg; 2406 } 2407 stock->nr_pages += nr_pages; 2408 put_cpu_var(memcg_stock); 2409 } 2410 2411 /* 2412 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2413 * of the hierarchy under it. sync flag says whether we should block 2414 * until the work is done. 2415 */ 2416 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2417 { 2418 int cpu, curcpu; 2419 2420 /* Notify other cpus that system-wide "drain" is running */ 2421 get_online_cpus(); 2422 curcpu = get_cpu(); 2423 for_each_online_cpu(cpu) { 2424 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2425 struct mem_cgroup *memcg; 2426 2427 memcg = stock->cached; 2428 if (!memcg || !stock->nr_pages) 2429 continue; 2430 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2431 continue; 2432 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2433 if (cpu == curcpu) 2434 drain_local_stock(&stock->work); 2435 else 2436 schedule_work_on(cpu, &stock->work); 2437 } 2438 } 2439 put_cpu(); 2440 2441 if (!sync) 2442 goto out; 2443 2444 for_each_online_cpu(cpu) { 2445 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2446 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2447 flush_work(&stock->work); 2448 } 2449 out: 2450 put_online_cpus(); 2451 } 2452 2453 /* 2454 * Tries to drain stocked charges in other cpus. This function is asynchronous 2455 * and just put a work per cpu for draining localy on each cpu. Caller can 2456 * expects some charges will be back to res_counter later but cannot wait for 2457 * it. 2458 */ 2459 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2460 { 2461 /* 2462 * If someone calls draining, avoid adding more kworker runs. 2463 */ 2464 if (!mutex_trylock(&percpu_charge_mutex)) 2465 return; 2466 drain_all_stock(root_memcg, false); 2467 mutex_unlock(&percpu_charge_mutex); 2468 } 2469 2470 /* This is a synchronous drain interface. 
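 * Unlike drain_all_stock_async() above, it blocks on
 * percpu_charge_mutex and passes sync=true, so drain_all_stock() will
 * flush_work() every pending per-cpu drain before returning.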
*/ 2471 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2472 { 2473 /* called when force_empty is called */ 2474 mutex_lock(&percpu_charge_mutex); 2475 drain_all_stock(root_memcg, true); 2476 mutex_unlock(&percpu_charge_mutex); 2477 } 2478 2479 /* 2480 * This function drains percpu counter value from DEAD cpu and 2481 * move it to local cpu. Note that this function can be preempted. 2482 */ 2483 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2484 { 2485 int i; 2486 2487 spin_lock(&memcg->pcp_counter_lock); 2488 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2489 long x = per_cpu(memcg->stat->count[i], cpu); 2490 2491 per_cpu(memcg->stat->count[i], cpu) = 0; 2492 memcg->nocpu_base.count[i] += x; 2493 } 2494 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2495 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2496 2497 per_cpu(memcg->stat->events[i], cpu) = 0; 2498 memcg->nocpu_base.events[i] += x; 2499 } 2500 spin_unlock(&memcg->pcp_counter_lock); 2501 } 2502 2503 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2504 unsigned long action, 2505 void *hcpu) 2506 { 2507 int cpu = (unsigned long)hcpu; 2508 struct memcg_stock_pcp *stock; 2509 struct mem_cgroup *iter; 2510 2511 if (action == CPU_ONLINE) 2512 return NOTIFY_OK; 2513 2514 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2515 return NOTIFY_OK; 2516 2517 for_each_mem_cgroup(iter) 2518 mem_cgroup_drain_pcp_counter(iter, cpu); 2519 2520 stock = &per_cpu(memcg_stock, cpu); 2521 drain_stock(stock); 2522 return NOTIFY_OK; 2523 } 2524 2525 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2526 unsigned int nr_pages) 2527 { 2528 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2529 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2530 struct mem_cgroup *mem_over_limit; 2531 struct res_counter *fail_res; 2532 unsigned long nr_reclaimed; 2533 unsigned long flags = 0; 2534 unsigned long long size; 2535 int ret = 0; 2536 2537 retry: 2538 if (consume_stock(memcg, nr_pages)) 2539 goto done; 2540 2541 size = batch * PAGE_SIZE; 2542 if (!res_counter_charge(&memcg->res, size, &fail_res)) { 2543 if (!do_swap_account) 2544 goto done_restock; 2545 if (!res_counter_charge(&memcg->memsw, size, &fail_res)) 2546 goto done_restock; 2547 res_counter_uncharge(&memcg->res, size); 2548 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2549 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2550 } else 2551 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2552 2553 if (batch > nr_pages) { 2554 batch = nr_pages; 2555 goto retry; 2556 } 2557 2558 /* 2559 * Unlike in global OOM situations, memcg is not in a physical 2560 * memory shortage. Allow dying and OOM-killed tasks to 2561 * bypass the last charges so that they can exit quickly and 2562 * free their memory. 2563 */ 2564 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2565 fatal_signal_pending(current) || 2566 current->flags & PF_EXITING)) 2567 goto bypass; 2568 2569 if (unlikely(task_in_memcg_oom(current))) 2570 goto nomem; 2571 2572 if (!(gfp_mask & __GFP_WAIT)) 2573 goto nomem; 2574 2575 nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2576 2577 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2578 goto retry; 2579 2580 if (gfp_mask & __GFP_NORETRY) 2581 goto nomem; 2582 /* 2583 * Even though the limit is exceeded at this point, reclaim 2584 * may have been able to free some pages. Retry the charge 2585 * before killing the task. 
2586 * 2587 * Only for regular pages, though: huge pages are rather 2588 * unlikely to succeed so close to the limit, and we fall back 2589 * to regular pages anyway in case of failure. 2590 */ 2591 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2592 goto retry; 2593 /* 2594 * At task move, charge accounts can be doubly counted. So, it's 2595 * better to wait until the end of task_move if something is going on. 2596 */ 2597 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2598 goto retry; 2599 2600 if (nr_retries--) 2601 goto retry; 2602 2603 if (gfp_mask & __GFP_NOFAIL) 2604 goto bypass; 2605 2606 if (fatal_signal_pending(current)) 2607 goto bypass; 2608 2609 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2610 nomem: 2611 if (!(gfp_mask & __GFP_NOFAIL)) 2612 return -ENOMEM; 2613 bypass: 2614 memcg = root_mem_cgroup; 2615 ret = -EINTR; 2616 goto retry; 2617 2618 done_restock: 2619 if (batch > nr_pages) 2620 refill_stock(memcg, batch - nr_pages); 2621 done: 2622 return ret; 2623 } 2624 2625 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2626 { 2627 unsigned long bytes = nr_pages * PAGE_SIZE; 2628 2629 res_counter_uncharge(&memcg->res, bytes); 2630 if (do_swap_account) 2631 res_counter_uncharge(&memcg->memsw, bytes); 2632 } 2633 2634 /* 2635 * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. 2636 * This is useful when moving usage to parent cgroup. 2637 */ 2638 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, 2639 unsigned int nr_pages) 2640 { 2641 unsigned long bytes = nr_pages * PAGE_SIZE; 2642 2643 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2644 if (do_swap_account) 2645 res_counter_uncharge_until(&memcg->memsw, 2646 memcg->memsw.parent, bytes); 2647 } 2648 2649 /* 2650 * A helper function to get mem_cgroup from ID. must be called under 2651 * rcu_read_lock(). The caller is responsible for calling 2652 * css_tryget_online() if the mem_cgroup is used for charging. (dropping 2653 * refcnt from swap can be called against removed memcg.) 2654 */ 2655 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2656 { 2657 /* ID 0 is unused ID */ 2658 if (!id) 2659 return NULL; 2660 return mem_cgroup_from_id(id); 2661 } 2662 2663 /* 2664 * try_get_mem_cgroup_from_page - look up page's memcg association 2665 * @page: the page 2666 * 2667 * Look up, get a css reference, and return the memcg that owns @page. 2668 * 2669 * The page must be locked to prevent racing with swap-in and page 2670 * cache charges. If coming from an unlocked page table, the caller 2671 * must ensure the page is on the LRU or this can race with charging. 
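 *
 * A caller sketch (illustrative only); the css reference taken here
 * must be dropped with css_put() once the memcg is no longer needed:
 *
 *	memcg = try_get_mem_cgroup_from_page(page);
 *	if (memcg) {
 *		... use memcg ...
 *		css_put(&memcg->css);
 *	}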
2672 */ 2673 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2674 { 2675 struct mem_cgroup *memcg = NULL; 2676 struct page_cgroup *pc; 2677 unsigned short id; 2678 swp_entry_t ent; 2679 2680 VM_BUG_ON_PAGE(!PageLocked(page), page); 2681 2682 pc = lookup_page_cgroup(page); 2683 if (PageCgroupUsed(pc)) { 2684 memcg = pc->mem_cgroup; 2685 if (memcg && !css_tryget_online(&memcg->css)) 2686 memcg = NULL; 2687 } else if (PageSwapCache(page)) { 2688 ent.val = page_private(page); 2689 id = lookup_swap_cgroup_id(ent); 2690 rcu_read_lock(); 2691 memcg = mem_cgroup_lookup(id); 2692 if (memcg && !css_tryget_online(&memcg->css)) 2693 memcg = NULL; 2694 rcu_read_unlock(); 2695 } 2696 return memcg; 2697 } 2698 2699 static void lock_page_lru(struct page *page, int *isolated) 2700 { 2701 struct zone *zone = page_zone(page); 2702 2703 spin_lock_irq(&zone->lru_lock); 2704 if (PageLRU(page)) { 2705 struct lruvec *lruvec; 2706 2707 lruvec = mem_cgroup_page_lruvec(page, zone); 2708 ClearPageLRU(page); 2709 del_page_from_lru_list(page, lruvec, page_lru(page)); 2710 *isolated = 1; 2711 } else 2712 *isolated = 0; 2713 } 2714 2715 static void unlock_page_lru(struct page *page, int isolated) 2716 { 2717 struct zone *zone = page_zone(page); 2718 2719 if (isolated) { 2720 struct lruvec *lruvec; 2721 2722 lruvec = mem_cgroup_page_lruvec(page, zone); 2723 VM_BUG_ON_PAGE(PageLRU(page), page); 2724 SetPageLRU(page); 2725 add_page_to_lru_list(page, lruvec, page_lru(page)); 2726 } 2727 spin_unlock_irq(&zone->lru_lock); 2728 } 2729 2730 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2731 bool lrucare) 2732 { 2733 struct page_cgroup *pc = lookup_page_cgroup(page); 2734 int isolated; 2735 2736 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2737 /* 2738 * we don't need page_cgroup_lock about tail pages, becase they are not 2739 * accessed by any other context at this point. 2740 */ 2741 2742 /* 2743 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2744 * may already be on some other mem_cgroup's LRU. Take care of it. 2745 */ 2746 if (lrucare) 2747 lock_page_lru(page, &isolated); 2748 2749 /* 2750 * Nobody should be changing or seriously looking at 2751 * pc->mem_cgroup and pc->flags at this point: 2752 * 2753 * - the page is uncharged 2754 * 2755 * - the page is off-LRU 2756 * 2757 * - an anonymous fault has exclusive page access, except for 2758 * a locked page table 2759 * 2760 * - a page cache insertion, a swapin fault, or a migration 2761 * have the page locked 2762 */ 2763 pc->mem_cgroup = memcg; 2764 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); 2765 2766 if (lrucare) 2767 unlock_page_lru(page, isolated); 2768 } 2769 2770 static DEFINE_MUTEX(set_limit_mutex); 2771 2772 #ifdef CONFIG_MEMCG_KMEM 2773 /* 2774 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2775 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. 2776 */ 2777 static DEFINE_MUTEX(memcg_slab_mutex); 2778 2779 static DEFINE_MUTEX(activate_kmem_mutex); 2780 2781 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 2782 { 2783 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 2784 memcg_kmem_is_active(memcg); 2785 } 2786 2787 /* 2788 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2789 * in the memcg_cache_params struct. 
2790 */ 2791 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2792 { 2793 struct kmem_cache *cachep; 2794 2795 VM_BUG_ON(p->is_root_cache); 2796 cachep = p->root_cache; 2797 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2798 } 2799 2800 #ifdef CONFIG_SLABINFO 2801 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 2802 { 2803 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 2804 struct memcg_cache_params *params; 2805 2806 if (!memcg_can_account_kmem(memcg)) 2807 return -EIO; 2808 2809 print_slabinfo_header(m); 2810 2811 mutex_lock(&memcg_slab_mutex); 2812 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2813 cache_show(memcg_params_to_cache(params), m); 2814 mutex_unlock(&memcg_slab_mutex); 2815 2816 return 0; 2817 } 2818 #endif 2819 2820 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 2821 { 2822 struct res_counter *fail_res; 2823 int ret = 0; 2824 2825 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2826 if (ret) 2827 return ret; 2828 2829 ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); 2830 if (ret == -EINTR) { 2831 /* 2832 * try_charge() chose to bypass to root due to OOM kill or 2833 * fatal signal. Since our only options are to either fail 2834 * the allocation or charge it to this cgroup, do it as a 2835 * temporary condition. But we can't fail. From a kmem/slab 2836 * perspective, the cache has already been selected, by 2837 * mem_cgroup_kmem_get_cache(), so it is too late to change 2838 * our minds. 2839 * 2840 * This condition will only trigger if the task entered 2841 * memcg_charge_kmem in a sane state, but was OOM-killed 2842 * during try_charge() above. Tasks that were already dying 2843 * when the allocation triggers should have been already 2844 * directed to the root cgroup in memcontrol.h 2845 */ 2846 res_counter_charge_nofail(&memcg->res, size, &fail_res); 2847 if (do_swap_account) 2848 res_counter_charge_nofail(&memcg->memsw, size, 2849 &fail_res); 2850 ret = 0; 2851 } else if (ret) 2852 res_counter_uncharge(&memcg->kmem, size); 2853 2854 return ret; 2855 } 2856 2857 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 2858 { 2859 res_counter_uncharge(&memcg->res, size); 2860 if (do_swap_account) 2861 res_counter_uncharge(&memcg->memsw, size); 2862 2863 /* Not down to 0 */ 2864 if (res_counter_uncharge(&memcg->kmem, size)) 2865 return; 2866 2867 /* 2868 * Releases a reference taken in kmem_cgroup_css_offline in case 2869 * this last uncharge is racing with the offlining code or it is 2870 * outliving the memcg existence. 2871 * 2872 * The memory barrier imposed by test&clear is paired with the 2873 * explicit one in memcg_kmem_mark_dead(). 2874 */ 2875 if (memcg_kmem_test_and_clear_dead(memcg)) 2876 css_put(&memcg->css); 2877 } 2878 2879 /* 2880 * helper for acessing a memcg's index. It will be used as an index in the 2881 * child cache array in kmem_cache, and also to derive its name. This function 2882 * will return -1 when this is not a kmem-limited memcg. 2883 */ 2884 int memcg_cache_id(struct mem_cgroup *memcg) 2885 { 2886 return memcg ? 
memcg->kmemcg_id : -1; 2887 } 2888 2889 static size_t memcg_caches_array_size(int num_groups) 2890 { 2891 ssize_t size; 2892 if (num_groups <= 0) 2893 return 0; 2894 2895 size = 2 * num_groups; 2896 if (size < MEMCG_CACHES_MIN_SIZE) 2897 size = MEMCG_CACHES_MIN_SIZE; 2898 else if (size > MEMCG_CACHES_MAX_SIZE) 2899 size = MEMCG_CACHES_MAX_SIZE; 2900 2901 return size; 2902 } 2903 2904 /* 2905 * We should update the current array size iff all caches updates succeed. This 2906 * can only be done from the slab side. The slab mutex needs to be held when 2907 * calling this. 2908 */ 2909 void memcg_update_array_size(int num) 2910 { 2911 if (num > memcg_limited_groups_array_size) 2912 memcg_limited_groups_array_size = memcg_caches_array_size(num); 2913 } 2914 2915 int memcg_update_cache_size(struct kmem_cache *s, int num_groups) 2916 { 2917 struct memcg_cache_params *cur_params = s->memcg_params; 2918 2919 VM_BUG_ON(!is_root_cache(s)); 2920 2921 if (num_groups > memcg_limited_groups_array_size) { 2922 int i; 2923 struct memcg_cache_params *new_params; 2924 ssize_t size = memcg_caches_array_size(num_groups); 2925 2926 size *= sizeof(void *); 2927 size += offsetof(struct memcg_cache_params, memcg_caches); 2928 2929 new_params = kzalloc(size, GFP_KERNEL); 2930 if (!new_params) 2931 return -ENOMEM; 2932 2933 new_params->is_root_cache = true; 2934 2935 /* 2936 * There is the chance it will be bigger than 2937 * memcg_limited_groups_array_size, if we failed an allocation 2938 * in a cache, in which case all caches updated before it, will 2939 * have a bigger array. 2940 * 2941 * But if that is the case, the data after 2942 * memcg_limited_groups_array_size is certainly unused 2943 */ 2944 for (i = 0; i < memcg_limited_groups_array_size; i++) { 2945 if (!cur_params->memcg_caches[i]) 2946 continue; 2947 new_params->memcg_caches[i] = 2948 cur_params->memcg_caches[i]; 2949 } 2950 2951 /* 2952 * Ideally, we would wait until all caches succeed, and only 2953 * then free the old one. But this is not worth the extra 2954 * pointer per-cache we'd have to have for this. 2955 * 2956 * It is not a big deal if some caches are left with a size 2957 * bigger than the others. And all updates will reset this 2958 * anyway. 
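 *
 * Lockless readers (see cache_from_memcg_idx()) may still be walking
 * the old array, which is why the switch below publishes new_params
 * with rcu_assign_pointer() and defers freeing cur_params with
 * kfree_rcu().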
2959 */ 2960 rcu_assign_pointer(s->memcg_params, new_params); 2961 if (cur_params) 2962 kfree_rcu(cur_params, rcu_head); 2963 } 2964 return 0; 2965 } 2966 2967 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, 2968 struct kmem_cache *root_cache) 2969 { 2970 size_t size; 2971 2972 if (!memcg_kmem_enabled()) 2973 return 0; 2974 2975 if (!memcg) { 2976 size = offsetof(struct memcg_cache_params, memcg_caches); 2977 size += memcg_limited_groups_array_size * sizeof(void *); 2978 } else 2979 size = sizeof(struct memcg_cache_params); 2980 2981 s->memcg_params = kzalloc(size, GFP_KERNEL); 2982 if (!s->memcg_params) 2983 return -ENOMEM; 2984 2985 if (memcg) { 2986 s->memcg_params->memcg = memcg; 2987 s->memcg_params->root_cache = root_cache; 2988 css_get(&memcg->css); 2989 } else 2990 s->memcg_params->is_root_cache = true; 2991 2992 return 0; 2993 } 2994 2995 void memcg_free_cache_params(struct kmem_cache *s) 2996 { 2997 if (!s->memcg_params) 2998 return; 2999 if (!s->memcg_params->is_root_cache) 3000 css_put(&s->memcg_params->memcg->css); 3001 kfree(s->memcg_params); 3002 } 3003 3004 static void memcg_register_cache(struct mem_cgroup *memcg, 3005 struct kmem_cache *root_cache) 3006 { 3007 static char memcg_name_buf[NAME_MAX + 1]; /* protected by 3008 memcg_slab_mutex */ 3009 struct kmem_cache *cachep; 3010 int id; 3011 3012 lockdep_assert_held(&memcg_slab_mutex); 3013 3014 id = memcg_cache_id(memcg); 3015 3016 /* 3017 * Since per-memcg caches are created asynchronously on first 3018 * allocation (see memcg_kmem_get_cache()), several threads can try to 3019 * create the same cache, but only one of them may succeed. 3020 */ 3021 if (cache_from_memcg_idx(root_cache, id)) 3022 return; 3023 3024 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); 3025 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); 3026 /* 3027 * If we could not create a memcg cache, do not complain, because 3028 * that's not critical at all as we can always proceed with the root 3029 * cache. 3030 */ 3031 if (!cachep) 3032 return; 3033 3034 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 3035 3036 /* 3037 * Since readers won't lock (see cache_from_memcg_idx()), we need a 3038 * barrier here to ensure nobody will see the kmem_cache partially 3039 * initialized. 3040 */ 3041 smp_wmb(); 3042 3043 BUG_ON(root_cache->memcg_params->memcg_caches[id]); 3044 root_cache->memcg_params->memcg_caches[id] = cachep; 3045 } 3046 3047 static void memcg_unregister_cache(struct kmem_cache *cachep) 3048 { 3049 struct kmem_cache *root_cache; 3050 struct mem_cgroup *memcg; 3051 int id; 3052 3053 lockdep_assert_held(&memcg_slab_mutex); 3054 3055 BUG_ON(is_root_cache(cachep)); 3056 3057 root_cache = cachep->memcg_params->root_cache; 3058 memcg = cachep->memcg_params->memcg; 3059 id = memcg_cache_id(memcg); 3060 3061 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); 3062 root_cache->memcg_params->memcg_caches[id] = NULL; 3063 3064 list_del(&cachep->memcg_params->list); 3065 3066 kmem_cache_destroy(cachep); 3067 } 3068 3069 /* 3070 * During the creation a new cache, we need to disable our accounting mechanism 3071 * altogether. This is true even if we are not creating, but rather just 3072 * enqueing new caches to be created. 
3073 * 3074 * This is because that process will trigger allocations; some visible, like 3075 * explicit kmallocs to auxiliary data structures, name strings and internal 3076 * cache structures; some well concealed, like INIT_WORK() that can allocate 3077 * objects during debug. 3078 * 3079 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 3080 * to it. This may not be a bounded recursion: since the first cache creation 3081 * failed to complete (waiting on the allocation), we'll just try to create the 3082 * cache again, failing at the same point. 3083 * 3084 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 3085 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 3086 * inside the following two functions. 3087 */ 3088 static inline void memcg_stop_kmem_account(void) 3089 { 3090 VM_BUG_ON(!current->mm); 3091 current->memcg_kmem_skip_account++; 3092 } 3093 3094 static inline void memcg_resume_kmem_account(void) 3095 { 3096 VM_BUG_ON(!current->mm); 3097 current->memcg_kmem_skip_account--; 3098 } 3099 3100 int __memcg_cleanup_cache_params(struct kmem_cache *s) 3101 { 3102 struct kmem_cache *c; 3103 int i, failed = 0; 3104 3105 mutex_lock(&memcg_slab_mutex); 3106 for_each_memcg_cache_index(i) { 3107 c = cache_from_memcg_idx(s, i); 3108 if (!c) 3109 continue; 3110 3111 memcg_unregister_cache(c); 3112 3113 if (cache_from_memcg_idx(s, i)) 3114 failed++; 3115 } 3116 mutex_unlock(&memcg_slab_mutex); 3117 return failed; 3118 } 3119 3120 static void memcg_unregister_all_caches(struct mem_cgroup *memcg) 3121 { 3122 struct kmem_cache *cachep; 3123 struct memcg_cache_params *params, *tmp; 3124 3125 if (!memcg_kmem_is_active(memcg)) 3126 return; 3127 3128 mutex_lock(&memcg_slab_mutex); 3129 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { 3130 cachep = memcg_params_to_cache(params); 3131 kmem_cache_shrink(cachep); 3132 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 3133 memcg_unregister_cache(cachep); 3134 } 3135 mutex_unlock(&memcg_slab_mutex); 3136 } 3137 3138 struct memcg_register_cache_work { 3139 struct mem_cgroup *memcg; 3140 struct kmem_cache *cachep; 3141 struct work_struct work; 3142 }; 3143 3144 static void memcg_register_cache_func(struct work_struct *w) 3145 { 3146 struct memcg_register_cache_work *cw = 3147 container_of(w, struct memcg_register_cache_work, work); 3148 struct mem_cgroup *memcg = cw->memcg; 3149 struct kmem_cache *cachep = cw->cachep; 3150 3151 mutex_lock(&memcg_slab_mutex); 3152 memcg_register_cache(memcg, cachep); 3153 mutex_unlock(&memcg_slab_mutex); 3154 3155 css_put(&memcg->css); 3156 kfree(cw); 3157 } 3158 3159 /* 3160 * Enqueue the creation of a per-memcg kmem_cache. 3161 */ 3162 static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, 3163 struct kmem_cache *cachep) 3164 { 3165 struct memcg_register_cache_work *cw; 3166 3167 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 3168 if (cw == NULL) { 3169 css_put(&memcg->css); 3170 return; 3171 } 3172 3173 cw->memcg = memcg; 3174 cw->cachep = cachep; 3175 3176 INIT_WORK(&cw->work, memcg_register_cache_func); 3177 schedule_work(&cw->work); 3178 } 3179 3180 static void memcg_schedule_register_cache(struct mem_cgroup *memcg, 3181 struct kmem_cache *cachep) 3182 { 3183 /* 3184 * We need to stop accounting when we kmalloc, because if the 3185 * corresponding kmalloc cache is not yet created, the first allocation 3186 * in __memcg_schedule_register_cache will recurse. 
3187 * 3188 * However, it is better to enclose the whole function. Depending on 3189 * the debugging options enabled, INIT_WORK(), for instance, can 3190 * trigger an allocation. This too, will make us recurse. Because at 3191 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3192 * the safest choice is to do it like this, wrapping the whole function. 3193 */ 3194 memcg_stop_kmem_account(); 3195 __memcg_schedule_register_cache(memcg, cachep); 3196 memcg_resume_kmem_account(); 3197 } 3198 3199 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 3200 { 3201 int res; 3202 3203 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, 3204 PAGE_SIZE << order); 3205 if (!res) 3206 atomic_add(1 << order, &cachep->memcg_params->nr_pages); 3207 return res; 3208 } 3209 3210 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 3211 { 3212 memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); 3213 atomic_sub(1 << order, &cachep->memcg_params->nr_pages); 3214 } 3215 3216 /* 3217 * Return the kmem_cache we're supposed to use for a slab allocation. 3218 * We try to use the current memcg's version of the cache. 3219 * 3220 * If the cache does not exist yet, if we are the first user of it, 3221 * we either create it immediately, if possible, or create it asynchronously 3222 * in a workqueue. 3223 * In the latter case, we will let the current allocation go through with 3224 * the original cache. 3225 * 3226 * Can't be called in interrupt context or from kernel threads. 3227 * This function needs to be called with rcu_read_lock() held. 3228 */ 3229 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3230 gfp_t gfp) 3231 { 3232 struct mem_cgroup *memcg; 3233 struct kmem_cache *memcg_cachep; 3234 3235 VM_BUG_ON(!cachep->memcg_params); 3236 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3237 3238 if (!current->mm || current->memcg_kmem_skip_account) 3239 return cachep; 3240 3241 rcu_read_lock(); 3242 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3243 3244 if (!memcg_can_account_kmem(memcg)) 3245 goto out; 3246 3247 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 3248 if (likely(memcg_cachep)) { 3249 cachep = memcg_cachep; 3250 goto out; 3251 } 3252 3253 /* The corresponding put will be done in the workqueue. */ 3254 if (!css_tryget_online(&memcg->css)) 3255 goto out; 3256 rcu_read_unlock(); 3257 3258 /* 3259 * If we are in a safe context (can wait, and not in interrupt 3260 * context), we could be be predictable and return right away. 3261 * This would guarantee that the allocation being performed 3262 * already belongs in the new cache. 3263 * 3264 * However, there are some clashes that can arrive from locking. 3265 * For instance, because we acquire the slab_mutex while doing 3266 * memcg_create_kmem_cache, this means no further allocation 3267 * could happen with the slab_mutex held. So it's better to 3268 * defer everything. 3269 */ 3270 memcg_schedule_register_cache(memcg, cachep); 3271 return cachep; 3272 out: 3273 rcu_read_unlock(); 3274 return cachep; 3275 } 3276 3277 /* 3278 * We need to verify if the allocation against current->mm->owner's memcg is 3279 * possible for the given order. But the page is not allocated yet, so we'll 3280 * need a further commit step to do the final arrangements. 3281 * 3282 * It is possible for the task to switch cgroups in this mean time, so at 3283 * commit time, we can't rely on task conversion any longer. 
We'll then use 3284 * the handle argument to return to the caller which cgroup we should commit 3285 * against. We could also return the memcg directly and avoid the pointer 3286 * passing, but a boolean return value gives better semantics considering 3287 * the compiled-out case as well. 3288 * 3289 * Returning true means the allocation is possible. 3290 */ 3291 bool 3292 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3293 { 3294 struct mem_cgroup *memcg; 3295 int ret; 3296 3297 *_memcg = NULL; 3298 3299 /* 3300 * Disabling accounting is only relevant for some specific memcg 3301 * internal allocations. Therefore we would initially not have such 3302 * check here, since direct calls to the page allocator that are 3303 * accounted to kmemcg (alloc_kmem_pages and friends) only happen 3304 * outside memcg core. We are mostly concerned with cache allocations, 3305 * and by having this test at memcg_kmem_get_cache, we are already able 3306 * to relay the allocation to the root cache and bypass the memcg cache 3307 * altogether. 3308 * 3309 * There is one exception, though: the SLUB allocator does not create 3310 * large order caches, but rather service large kmallocs directly from 3311 * the page allocator. Therefore, the following sequence when backed by 3312 * the SLUB allocator: 3313 * 3314 * memcg_stop_kmem_account(); 3315 * kmalloc(<large_number>) 3316 * memcg_resume_kmem_account(); 3317 * 3318 * would effectively ignore the fact that we should skip accounting, 3319 * since it will drive us directly to this function without passing 3320 * through the cache selector memcg_kmem_get_cache. Such large 3321 * allocations are extremely rare but can happen, for instance, for the 3322 * cache arrays. We bring this test here. 3323 */ 3324 if (!current->mm || current->memcg_kmem_skip_account) 3325 return true; 3326 3327 memcg = get_mem_cgroup_from_mm(current->mm); 3328 3329 if (!memcg_can_account_kmem(memcg)) { 3330 css_put(&memcg->css); 3331 return true; 3332 } 3333 3334 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3335 if (!ret) 3336 *_memcg = memcg; 3337 3338 css_put(&memcg->css); 3339 return (ret == 0); 3340 } 3341 3342 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3343 int order) 3344 { 3345 struct page_cgroup *pc; 3346 3347 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3348 3349 /* The page allocation failed. Revert */ 3350 if (!page) { 3351 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3352 return; 3353 } 3354 /* 3355 * The page is freshly allocated and not visible to any 3356 * outside callers yet. Set up pc non-atomically. 
3357 */ 3358 pc = lookup_page_cgroup(page); 3359 pc->mem_cgroup = memcg; 3360 pc->flags = PCG_USED; 3361 } 3362 3363 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3364 { 3365 struct mem_cgroup *memcg = NULL; 3366 struct page_cgroup *pc; 3367 3368 3369 pc = lookup_page_cgroup(page); 3370 if (!PageCgroupUsed(pc)) 3371 return; 3372 3373 memcg = pc->mem_cgroup; 3374 pc->flags = 0; 3375 3376 /* 3377 * We trust that only if there is a memcg associated with the page, it 3378 * is a valid allocation 3379 */ 3380 if (!memcg) 3381 return; 3382 3383 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3384 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3385 } 3386 #else 3387 static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) 3388 { 3389 } 3390 #endif /* CONFIG_MEMCG_KMEM */ 3391 3392 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3393 3394 /* 3395 * Because tail pages are not marked as "used", set it. We're under 3396 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3397 * charge/uncharge will be never happen and move_account() is done under 3398 * compound_lock(), so we don't have to take care of races. 3399 */ 3400 void mem_cgroup_split_huge_fixup(struct page *head) 3401 { 3402 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3403 struct page_cgroup *pc; 3404 struct mem_cgroup *memcg; 3405 int i; 3406 3407 if (mem_cgroup_disabled()) 3408 return; 3409 3410 memcg = head_pc->mem_cgroup; 3411 for (i = 1; i < HPAGE_PMD_NR; i++) { 3412 pc = head_pc + i; 3413 pc->mem_cgroup = memcg; 3414 pc->flags = head_pc->flags; 3415 } 3416 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3417 HPAGE_PMD_NR); 3418 } 3419 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3420 3421 /** 3422 * mem_cgroup_move_account - move account of the page 3423 * @page: the page 3424 * @nr_pages: number of regular pages (>1 for huge pages) 3425 * @pc: page_cgroup of the page. 3426 * @from: mem_cgroup which the page is moved from. 3427 * @to: mem_cgroup which the page is moved to. @from != @to. 3428 * 3429 * The caller must confirm following. 3430 * - page is not on LRU (isolate_page() is useful.) 3431 * - compound_lock is held when nr_pages > 1 3432 * 3433 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3434 * from old cgroup. 3435 */ 3436 static int mem_cgroup_move_account(struct page *page, 3437 unsigned int nr_pages, 3438 struct page_cgroup *pc, 3439 struct mem_cgroup *from, 3440 struct mem_cgroup *to) 3441 { 3442 unsigned long flags; 3443 int ret; 3444 3445 VM_BUG_ON(from == to); 3446 VM_BUG_ON_PAGE(PageLRU(page), page); 3447 /* 3448 * The page is isolated from LRU. So, collapse function 3449 * will not handle this page. But page splitting can happen. 3450 * Do this check under compound_page_lock(). The caller should 3451 * hold it. 3452 */ 3453 ret = -EBUSY; 3454 if (nr_pages > 1 && !PageTransHuge(page)) 3455 goto out; 3456 3457 /* 3458 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup 3459 * of its source page while we change it: page migration takes 3460 * both pages off the LRU, but page cache replacement doesn't. 
3461 */ 3462 if (!trylock_page(page)) 3463 goto out; 3464 3465 ret = -EINVAL; 3466 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3467 goto out_unlock; 3468 3469 move_lock_mem_cgroup(from, &flags); 3470 3471 if (!PageAnon(page) && page_mapped(page)) { 3472 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3473 nr_pages); 3474 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3475 nr_pages); 3476 } 3477 3478 if (PageWriteback(page)) { 3479 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3480 nr_pages); 3481 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3482 nr_pages); 3483 } 3484 3485 /* 3486 * It is safe to change pc->mem_cgroup here because the page 3487 * is referenced, charged, and isolated - we can't race with 3488 * uncharging, charging, migration, or LRU putback. 3489 */ 3490 3491 /* caller should have done css_get */ 3492 pc->mem_cgroup = to; 3493 move_unlock_mem_cgroup(from, &flags); 3494 ret = 0; 3495 3496 local_irq_disable(); 3497 mem_cgroup_charge_statistics(to, page, nr_pages); 3498 memcg_check_events(to, page); 3499 mem_cgroup_charge_statistics(from, page, -nr_pages); 3500 memcg_check_events(from, page); 3501 local_irq_enable(); 3502 out_unlock: 3503 unlock_page(page); 3504 out: 3505 return ret; 3506 } 3507 3508 /** 3509 * mem_cgroup_move_parent - moves page to the parent group 3510 * @page: the page to move 3511 * @pc: page_cgroup of the page 3512 * @child: page's cgroup 3513 * 3514 * move charges to its parent or the root cgroup if the group has no 3515 * parent (aka use_hierarchy==0). 3516 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3517 * mem_cgroup_move_account fails) the failure is always temporary and 3518 * it signals a race with a page removal/uncharge or migration. In the 3519 * first case the page is on the way out and it will vanish from the LRU 3520 * on the next attempt and the call should be retried later. 3521 * Isolation from the LRU fails only if page has been isolated from 3522 * the LRU since we looked at it and that usually means either global 3523 * reclaim or migration going on. The page will either get back to the 3524 * LRU or vanish. 3525 * Finaly mem_cgroup_move_account fails only if the page got uncharged 3526 * (!PageCgroupUsed) or moved to a different group. The page will 3527 * disappear in the next attempt. 3528 */ 3529 static int mem_cgroup_move_parent(struct page *page, 3530 struct page_cgroup *pc, 3531 struct mem_cgroup *child) 3532 { 3533 struct mem_cgroup *parent; 3534 unsigned int nr_pages; 3535 unsigned long uninitialized_var(flags); 3536 int ret; 3537 3538 VM_BUG_ON(mem_cgroup_is_root(child)); 3539 3540 ret = -EBUSY; 3541 if (!get_page_unless_zero(page)) 3542 goto out; 3543 if (isolate_lru_page(page)) 3544 goto put; 3545 3546 nr_pages = hpage_nr_pages(page); 3547 3548 parent = parent_mem_cgroup(child); 3549 /* 3550 * If no parent, move charges to root cgroup. 
3551 */ 3552 if (!parent) 3553 parent = root_mem_cgroup; 3554 3555 if (nr_pages > 1) { 3556 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3557 flags = compound_lock_irqsave(page); 3558 } 3559 3560 ret = mem_cgroup_move_account(page, nr_pages, 3561 pc, child, parent); 3562 if (!ret) 3563 __mem_cgroup_cancel_local_charge(child, nr_pages); 3564 3565 if (nr_pages > 1) 3566 compound_unlock_irqrestore(page, flags); 3567 putback_lru_page(page); 3568 put: 3569 put_page(page); 3570 out: 3571 return ret; 3572 } 3573 3574 #ifdef CONFIG_MEMCG_SWAP 3575 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 3576 bool charge) 3577 { 3578 int val = (charge) ? 1 : -1; 3579 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 3580 } 3581 3582 /** 3583 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3584 * @entry: swap entry to be moved 3585 * @from: mem_cgroup which the entry is moved from 3586 * @to: mem_cgroup which the entry is moved to 3587 * 3588 * It succeeds only when the swap_cgroup's record for this entry is the same 3589 * as the mem_cgroup's id of @from. 3590 * 3591 * Returns 0 on success, -EINVAL on failure. 3592 * 3593 * The caller must have charged to @to, IOW, called res_counter_charge() about 3594 * both res and memsw, and called css_get(). 3595 */ 3596 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3597 struct mem_cgroup *from, struct mem_cgroup *to) 3598 { 3599 unsigned short old_id, new_id; 3600 3601 old_id = mem_cgroup_id(from); 3602 new_id = mem_cgroup_id(to); 3603 3604 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3605 mem_cgroup_swap_statistics(from, false); 3606 mem_cgroup_swap_statistics(to, true); 3607 /* 3608 * This function is only called from task migration context now. 3609 * It postpones res_counter and refcount handling till the end 3610 * of task migration(mem_cgroup_clear_mc()) for performance 3611 * improvement. But we cannot postpone css_get(to) because if 3612 * the process that has been moved to @to does swap-in, the 3613 * refcount of @to might be decreased to 0. 3614 * 3615 * We are in attach() phase, so the cgroup is guaranteed to be 3616 * alive, so we can just call css_get(). 3617 */ 3618 css_get(&to->css); 3619 return 0; 3620 } 3621 return -EINVAL; 3622 } 3623 #else 3624 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3625 struct mem_cgroup *from, struct mem_cgroup *to) 3626 { 3627 return -EINVAL; 3628 } 3629 #endif 3630 3631 #ifdef CONFIG_DEBUG_VM 3632 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3633 { 3634 struct page_cgroup *pc; 3635 3636 pc = lookup_page_cgroup(page); 3637 /* 3638 * Can be NULL while feeding pages into the page allocator for 3639 * the first time, i.e. during boot or memory hotplug; 3640 * or when mem_cgroup_disabled(). 
3641 */ 3642 if (likely(pc) && PageCgroupUsed(pc)) 3643 return pc; 3644 return NULL; 3645 } 3646 3647 bool mem_cgroup_bad_page_check(struct page *page) 3648 { 3649 if (mem_cgroup_disabled()) 3650 return false; 3651 3652 return lookup_page_cgroup_used(page) != NULL; 3653 } 3654 3655 void mem_cgroup_print_bad_page(struct page *page) 3656 { 3657 struct page_cgroup *pc; 3658 3659 pc = lookup_page_cgroup_used(page); 3660 if (pc) { 3661 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 3662 pc, pc->flags, pc->mem_cgroup); 3663 } 3664 } 3665 #endif 3666 3667 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3668 unsigned long long val) 3669 { 3670 int retry_count; 3671 u64 memswlimit, memlimit; 3672 int ret = 0; 3673 int children = mem_cgroup_count_children(memcg); 3674 u64 curusage, oldusage; 3675 int enlarge; 3676 3677 /* 3678 * For keeping hierarchical_reclaim simple, how long we should retry 3679 * is depends on callers. We set our retry-count to be function 3680 * of # of children which we should visit in this loop. 3681 */ 3682 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3683 3684 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3685 3686 enlarge = 0; 3687 while (retry_count) { 3688 if (signal_pending(current)) { 3689 ret = -EINTR; 3690 break; 3691 } 3692 /* 3693 * Rather than hide all in some function, I do this in 3694 * open coded manner. You see what this really does. 3695 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3696 */ 3697 mutex_lock(&set_limit_mutex); 3698 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3699 if (memswlimit < val) { 3700 ret = -EINVAL; 3701 mutex_unlock(&set_limit_mutex); 3702 break; 3703 } 3704 3705 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3706 if (memlimit < val) 3707 enlarge = 1; 3708 3709 ret = res_counter_set_limit(&memcg->res, val); 3710 if (!ret) { 3711 if (memswlimit == val) 3712 memcg->memsw_is_minimum = true; 3713 else 3714 memcg->memsw_is_minimum = false; 3715 } 3716 mutex_unlock(&set_limit_mutex); 3717 3718 if (!ret) 3719 break; 3720 3721 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3722 MEM_CGROUP_RECLAIM_SHRINK); 3723 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3724 /* Usage is reduced ? */ 3725 if (curusage >= oldusage) 3726 retry_count--; 3727 else 3728 oldusage = curusage; 3729 } 3730 if (!ret && enlarge) 3731 memcg_oom_recover(memcg); 3732 3733 return ret; 3734 } 3735 3736 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3737 unsigned long long val) 3738 { 3739 int retry_count; 3740 u64 memlimit, memswlimit, oldusage, curusage; 3741 int children = mem_cgroup_count_children(memcg); 3742 int ret = -EBUSY; 3743 int enlarge = 0; 3744 3745 /* see mem_cgroup_resize_res_limit */ 3746 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3747 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3748 while (retry_count) { 3749 if (signal_pending(current)) { 3750 ret = -EINTR; 3751 break; 3752 } 3753 /* 3754 * Rather than hide all in some function, I do this in 3755 * open coded manner. You see what this really does. 3756 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 
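 *
 * For example, with memory.limit_in_bytes at 512M, an attempt to set
 * memory.memsw.limit_in_bytes below 512M fails with -EINVAL via the
 * memlimit > val check below, mirroring the memswlimit < val check in
 * mem_cgroup_resize_limit() above.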
3757 */ 3758 mutex_lock(&set_limit_mutex); 3759 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3760 if (memlimit > val) { 3761 ret = -EINVAL; 3762 mutex_unlock(&set_limit_mutex); 3763 break; 3764 } 3765 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3766 if (memswlimit < val) 3767 enlarge = 1; 3768 ret = res_counter_set_limit(&memcg->memsw, val); 3769 if (!ret) { 3770 if (memlimit == val) 3771 memcg->memsw_is_minimum = true; 3772 else 3773 memcg->memsw_is_minimum = false; 3774 } 3775 mutex_unlock(&set_limit_mutex); 3776 3777 if (!ret) 3778 break; 3779 3780 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3781 MEM_CGROUP_RECLAIM_NOSWAP | 3782 MEM_CGROUP_RECLAIM_SHRINK); 3783 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3784 /* Usage is reduced ? */ 3785 if (curusage >= oldusage) 3786 retry_count--; 3787 else 3788 oldusage = curusage; 3789 } 3790 if (!ret && enlarge) 3791 memcg_oom_recover(memcg); 3792 return ret; 3793 } 3794 3795 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3796 gfp_t gfp_mask, 3797 unsigned long *total_scanned) 3798 { 3799 unsigned long nr_reclaimed = 0; 3800 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3801 unsigned long reclaimed; 3802 int loop = 0; 3803 struct mem_cgroup_tree_per_zone *mctz; 3804 unsigned long long excess; 3805 unsigned long nr_scanned; 3806 3807 if (order > 0) 3808 return 0; 3809 3810 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3811 /* 3812 * This loop can run a while, specially if mem_cgroup's continuously 3813 * keep exceeding their soft limit and putting the system under 3814 * pressure 3815 */ 3816 do { 3817 if (next_mz) 3818 mz = next_mz; 3819 else 3820 mz = mem_cgroup_largest_soft_limit_node(mctz); 3821 if (!mz) 3822 break; 3823 3824 nr_scanned = 0; 3825 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 3826 gfp_mask, &nr_scanned); 3827 nr_reclaimed += reclaimed; 3828 *total_scanned += nr_scanned; 3829 spin_lock_irq(&mctz->lock); 3830 3831 /* 3832 * If we failed to reclaim anything from this memory cgroup 3833 * it is time to move on to the next cgroup 3834 */ 3835 next_mz = NULL; 3836 if (!reclaimed) { 3837 do { 3838 /* 3839 * Loop until we find yet another one. 3840 * 3841 * By the time we get the soft_limit lock 3842 * again, someone might have aded the 3843 * group back on the RB tree. Iterate to 3844 * make sure we get a different mem. 3845 * mem_cgroup_largest_soft_limit_node returns 3846 * NULL if no other cgroup is present on 3847 * the tree 3848 */ 3849 next_mz = 3850 __mem_cgroup_largest_soft_limit_node(mctz); 3851 if (next_mz == mz) 3852 css_put(&next_mz->memcg->css); 3853 else /* next_mz == NULL or other memcg */ 3854 break; 3855 } while (1); 3856 } 3857 __mem_cgroup_remove_exceeded(mz, mctz); 3858 excess = res_counter_soft_limit_excess(&mz->memcg->res); 3859 /* 3860 * One school of thought says that we should not add 3861 * back the node to the tree if reclaim returns 0. 3862 * But our reclaim could return 0, simply because due 3863 * to priority we are exposing a smaller subset of 3864 * memory to reclaim from. Consider this as a longer 3865 * term TODO. 3866 */ 3867 /* If excess == 0, no tree ops */ 3868 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3869 spin_unlock_irq(&mctz->lock); 3870 css_put(&mz->memcg->css); 3871 loop++; 3872 /* 3873 * Could not reclaim anything and there are no more 3874 * mem cgroups to try or we seem to be looping without 3875 * reclaiming anything. 
3876 */ 3877 if (!nr_reclaimed && 3878 (next_mz == NULL || 3879 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3880 break; 3881 } while (!nr_reclaimed); 3882 if (next_mz) 3883 css_put(&next_mz->memcg->css); 3884 return nr_reclaimed; 3885 } 3886
3887 /** 3888 * mem_cgroup_force_empty_list - clears LRU of a group 3889 * @memcg: group to clear 3890 * @node: NUMA node 3891 * @zid: zone id 3892 * @lru: lru to clear 3893 * 3894 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 3895 * reclaim the pages themselves - pages are moved to the parent (or root) 3896 * group. 3897 */ 3898 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3899 int node, int zid, enum lru_list lru) 3900 { 3901 struct lruvec *lruvec; 3902 unsigned long flags; 3903 struct list_head *list; 3904 struct page *busy; 3905 struct zone *zone; 3906 3907 zone = &NODE_DATA(node)->node_zones[zid]; 3908 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 3909 list = &lruvec->lists[lru]; 3910 3911 busy = NULL; 3912 do { 3913 struct page_cgroup *pc; 3914 struct page *page; 3915 3916 spin_lock_irqsave(&zone->lru_lock, flags); 3917 if (list_empty(list)) { 3918 spin_unlock_irqrestore(&zone->lru_lock, flags); 3919 break; 3920 } 3921 page = list_entry(list->prev, struct page, lru); 3922 if (busy == page) { 3923 list_move(&page->lru, list); 3924 busy = NULL; 3925 spin_unlock_irqrestore(&zone->lru_lock, flags); 3926 continue; 3927 } 3928 spin_unlock_irqrestore(&zone->lru_lock, flags); 3929 3930 pc = lookup_page_cgroup(page); 3931 3932 if (mem_cgroup_move_parent(page, pc, memcg)) { 3933 /* found lock contention or "pc" is obsolete. */ 3934 busy = page; 3935 } else 3936 busy = NULL; 3937 cond_resched(); 3938 } while (!list_empty(list)); 3939 } 3940
3941 /* 3942 * Make the mem_cgroup's charge 0 if there are no tasks, by moving 3943 * all the charges and pages to the parent. 3944 * This enables deleting this mem_cgroup. 3945 * 3946 * Caller is responsible for holding css reference on the memcg. 3947 */ 3948 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 3949 { 3950 int node, zid; 3951 u64 usage; 3952 3953 do { 3954 /* This is for making all *used* pages to be on LRU. */ 3955 lru_add_drain_all(); 3956 drain_all_stock_sync(memcg); 3957 mem_cgroup_start_move(memcg); 3958 for_each_node_state(node, N_MEMORY) { 3959 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3960 enum lru_list lru; 3961 for_each_lru(lru) { 3962 mem_cgroup_force_empty_list(memcg, 3963 node, zid, lru); 3964 } 3965 } 3966 } 3967 mem_cgroup_end_move(memcg); 3968 memcg_oom_recover(memcg); 3969 cond_resched(); 3970 3971 /* 3972 * Kernel memory may not necessarily be trackable to a specific 3973 * process, so such charges are not migrated, and therefore we can't 3974 * expect their value to drop to 0 here. 3975 * Having res filled up with kmem only is enough. 3976 * 3977 * This is a safety check because mem_cgroup_force_empty_list 3978 * could have raced with mem_cgroup_replace_page_cache callers 3979 * so the lru seemed empty but the page could have been added 3980 * right after the check. RES_USAGE should be safe as we always 3981 * charge before adding to the LRU. 3982 */ 3983 usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 3984 res_counter_read_u64(&memcg->kmem, RES_USAGE); 3985 } while (usage > 0); 3986 } 3987
3988 /* 3989 * Test whether @memcg has children, dead or alive. Note that this 3990 * function doesn't care whether @memcg has use_hierarchy enabled and 3991 * returns %true if there are child csses according to the cgroup 3992 * hierarchy. Testing use_hierarchy is the caller's responsibility. 3993 */ 3994 static inline bool memcg_has_children(struct mem_cgroup *memcg) 3995 { 3996 bool ret; 3997 3998 /* 3999 * The lock does not prevent addition or deletion of children, but 4000 * it prevents a new child from being initialized based on this 4001 * parent in css_online(), so it's enough to decide whether 4002 * hierarchically inherited attributes can still be changed or not. 4003 */ 4004 lockdep_assert_held(&memcg_create_mutex); 4005 4006 rcu_read_lock(); 4007 ret = css_next_child(NULL, &memcg->css); 4008 rcu_read_unlock(); 4009 return ret; 4010 } 4011
4012 /* 4013 * Reclaims as many pages from the given memcg as possible and moves 4014 * the rest to the parent. 4015 * 4016 * Caller is responsible for holding css reference for memcg. 4017 */ 4018 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 4019 { 4020 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 4021 4022 /* we call try-to-free pages to make this cgroup empty */ 4023 lru_add_drain_all(); 4024 /* try to free all pages in this cgroup */ 4025 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 4026 int progress; 4027 4028 if (signal_pending(current)) 4029 return -EINTR; 4030 4031 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 4032 false); 4033 if (!progress) { 4034 nr_retries--; 4035 /* maybe some writeback is necessary */ 4036 congestion_wait(BLK_RW_ASYNC, HZ/10); 4037 } 4038 4039 } 4040 4041 return 0; 4042 } 4043
4044 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 4045 char *buf, size_t nbytes, 4046 loff_t off) 4047 { 4048 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4049 4050 if (mem_cgroup_is_root(memcg)) 4051 return -EINVAL; 4052 return mem_cgroup_force_empty(memcg) ?: nbytes; 4053 } 4054
4055 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 4056 struct cftype *cft) 4057 { 4058 return mem_cgroup_from_css(css)->use_hierarchy; 4059 } 4060
4061 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 4062 struct cftype *cft, u64 val) 4063 { 4064 int retval = 0; 4065 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4066 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 4067 4068 mutex_lock(&memcg_create_mutex); 4069 4070 if (memcg->use_hierarchy == val) 4071 goto out; 4072 4073 /* 4074 * If parent's use_hierarchy is set, we can't make any modifications 4075 * in the child subtrees. If it is unset, then the change can 4076 * occur, provided the current cgroup has no children. 4077 * 4078 * For the root cgroup, parent_memcg is NULL; we allow the value to be 4079 * set if there are no children.
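* (E.g. 'echo 1 > memory.use_hierarchy' from userspace ends up here; the write only succeeds while the parent has not itself enabled use_hierarchy and this group has no children yet, otherwise -EINVAL or -EBUSY is returned below.)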
4080 */ 4081 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 4082 (val == 1 || val == 0)) { 4083 if (!memcg_has_children(memcg)) 4084 memcg->use_hierarchy = val; 4085 else 4086 retval = -EBUSY; 4087 } else 4088 retval = -EINVAL; 4089 4090 out: 4091 mutex_unlock(&memcg_create_mutex); 4092 4093 return retval; 4094 } 4095 4096 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4097 struct cftype *cft) 4098 { 4099 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4100 enum res_type type = MEMFILE_TYPE(cft->private); 4101 int name = MEMFILE_ATTR(cft->private); 4102 4103 switch (type) { 4104 case _MEM: 4105 return res_counter_read_u64(&memcg->res, name); 4106 case _MEMSWAP: 4107 return res_counter_read_u64(&memcg->memsw, name); 4108 case _KMEM: 4109 return res_counter_read_u64(&memcg->kmem, name); 4110 break; 4111 default: 4112 BUG(); 4113 } 4114 } 4115 4116 #ifdef CONFIG_MEMCG_KMEM 4117 /* should be called with activate_kmem_mutex held */ 4118 static int __memcg_activate_kmem(struct mem_cgroup *memcg, 4119 unsigned long long limit) 4120 { 4121 int err = 0; 4122 int memcg_id; 4123 4124 if (memcg_kmem_is_active(memcg)) 4125 return 0; 4126 4127 /* 4128 * We are going to allocate memory for data shared by all memory 4129 * cgroups so let's stop accounting here. 4130 */ 4131 memcg_stop_kmem_account(); 4132 4133 /* 4134 * For simplicity, we won't allow this to be disabled. It also can't 4135 * be changed if the cgroup has children already, or if tasks had 4136 * already joined. 4137 * 4138 * If tasks join before we set the limit, a person looking at 4139 * kmem.usage_in_bytes will have no way to determine when it took 4140 * place, which makes the value quite meaningless. 4141 * 4142 * After it first became limited, changes in the value of the limit are 4143 * of course permitted. 4144 */ 4145 mutex_lock(&memcg_create_mutex); 4146 if (cgroup_has_tasks(memcg->css.cgroup) || 4147 (memcg->use_hierarchy && memcg_has_children(memcg))) 4148 err = -EBUSY; 4149 mutex_unlock(&memcg_create_mutex); 4150 if (err) 4151 goto out; 4152 4153 memcg_id = ida_simple_get(&kmem_limited_groups, 4154 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 4155 if (memcg_id < 0) { 4156 err = memcg_id; 4157 goto out; 4158 } 4159 4160 /* 4161 * Make sure we have enough space for this cgroup in each root cache's 4162 * memcg_params. 4163 */ 4164 mutex_lock(&memcg_slab_mutex); 4165 err = memcg_update_all_caches(memcg_id + 1); 4166 mutex_unlock(&memcg_slab_mutex); 4167 if (err) 4168 goto out_rmid; 4169 4170 memcg->kmemcg_id = memcg_id; 4171 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4172 4173 /* 4174 * We couldn't have accounted to this cgroup, because it hasn't got the 4175 * active bit set yet, so this should succeed. 4176 */ 4177 err = res_counter_set_limit(&memcg->kmem, limit); 4178 VM_BUG_ON(err); 4179 4180 static_key_slow_inc(&memcg_kmem_enabled_key); 4181 /* 4182 * Setting the active bit after enabling static branching will 4183 * guarantee no one starts accounting before all call sites are 4184 * patched. 
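* (Doing it the other way round could let call sites that were already patched start charging while unpatched ones still skip the memcg path, leaving kmem accounting inconsistent for this group.)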
4185 */ 4186 memcg_kmem_set_active(memcg); 4187 out: 4188 memcg_resume_kmem_account(); 4189 return err; 4190 4191 out_rmid: 4192 ida_simple_remove(&kmem_limited_groups, memcg_id); 4193 goto out; 4194 } 4195 4196 static int memcg_activate_kmem(struct mem_cgroup *memcg, 4197 unsigned long long limit) 4198 { 4199 int ret; 4200 4201 mutex_lock(&activate_kmem_mutex); 4202 ret = __memcg_activate_kmem(memcg, limit); 4203 mutex_unlock(&activate_kmem_mutex); 4204 return ret; 4205 } 4206 4207 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4208 unsigned long long val) 4209 { 4210 int ret; 4211 4212 if (!memcg_kmem_is_active(memcg)) 4213 ret = memcg_activate_kmem(memcg, val); 4214 else 4215 ret = res_counter_set_limit(&memcg->kmem, val); 4216 return ret; 4217 } 4218 4219 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 4220 { 4221 int ret = 0; 4222 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 4223 4224 if (!parent) 4225 return 0; 4226 4227 mutex_lock(&activate_kmem_mutex); 4228 /* 4229 * If the parent cgroup is not kmem-active now, it cannot be activated 4230 * after this point, because it has at least one child already. 4231 */ 4232 if (memcg_kmem_is_active(parent)) 4233 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); 4234 mutex_unlock(&activate_kmem_mutex); 4235 return ret; 4236 } 4237 #else 4238 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4239 unsigned long long val) 4240 { 4241 return -EINVAL; 4242 } 4243 #endif /* CONFIG_MEMCG_KMEM */ 4244 4245 /* 4246 * The user of this function is... 4247 * RES_LIMIT. 4248 */ 4249 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 4250 char *buf, size_t nbytes, loff_t off) 4251 { 4252 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4253 enum res_type type; 4254 int name; 4255 unsigned long long val; 4256 int ret; 4257 4258 buf = strstrip(buf); 4259 type = MEMFILE_TYPE(of_cft(of)->private); 4260 name = MEMFILE_ATTR(of_cft(of)->private); 4261 4262 switch (name) { 4263 case RES_LIMIT: 4264 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4265 ret = -EINVAL; 4266 break; 4267 } 4268 /* This function does all necessary parse...reuse it */ 4269 ret = res_counter_memparse_write_strategy(buf, &val); 4270 if (ret) 4271 break; 4272 if (type == _MEM) 4273 ret = mem_cgroup_resize_limit(memcg, val); 4274 else if (type == _MEMSWAP) 4275 ret = mem_cgroup_resize_memsw_limit(memcg, val); 4276 else if (type == _KMEM) 4277 ret = memcg_update_kmem_limit(memcg, val); 4278 else 4279 return -EINVAL; 4280 break; 4281 case RES_SOFT_LIMIT: 4282 ret = res_counter_memparse_write_strategy(buf, &val); 4283 if (ret) 4284 break; 4285 /* 4286 * For memsw, soft limits are hard to implement in terms 4287 * of semantics, for now, we support soft limits for 4288 * control without swap 4289 */ 4290 if (type == _MEM) 4291 ret = res_counter_set_soft_limit(&memcg->res, val); 4292 else 4293 ret = -EINVAL; 4294 break; 4295 default: 4296 ret = -EINVAL; /* should be BUG() ? 
*/ 4297 break; 4298 } 4299 return ret ?: nbytes; 4300 } 4301 4302 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 4303 unsigned long long *mem_limit, unsigned long long *memsw_limit) 4304 { 4305 unsigned long long min_limit, min_memsw_limit, tmp; 4306 4307 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4308 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4309 if (!memcg->use_hierarchy) 4310 goto out; 4311 4312 while (memcg->css.parent) { 4313 memcg = mem_cgroup_from_css(memcg->css.parent); 4314 if (!memcg->use_hierarchy) 4315 break; 4316 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 4317 min_limit = min(min_limit, tmp); 4318 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4319 min_memsw_limit = min(min_memsw_limit, tmp); 4320 } 4321 out: 4322 *mem_limit = min_limit; 4323 *memsw_limit = min_memsw_limit; 4324 } 4325 4326 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 4327 size_t nbytes, loff_t off) 4328 { 4329 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4330 int name; 4331 enum res_type type; 4332 4333 type = MEMFILE_TYPE(of_cft(of)->private); 4334 name = MEMFILE_ATTR(of_cft(of)->private); 4335 4336 switch (name) { 4337 case RES_MAX_USAGE: 4338 if (type == _MEM) 4339 res_counter_reset_max(&memcg->res); 4340 else if (type == _MEMSWAP) 4341 res_counter_reset_max(&memcg->memsw); 4342 else if (type == _KMEM) 4343 res_counter_reset_max(&memcg->kmem); 4344 else 4345 return -EINVAL; 4346 break; 4347 case RES_FAILCNT: 4348 if (type == _MEM) 4349 res_counter_reset_failcnt(&memcg->res); 4350 else if (type == _MEMSWAP) 4351 res_counter_reset_failcnt(&memcg->memsw); 4352 else if (type == _KMEM) 4353 res_counter_reset_failcnt(&memcg->kmem); 4354 else 4355 return -EINVAL; 4356 break; 4357 } 4358 4359 return nbytes; 4360 } 4361 4362 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 4363 struct cftype *cft) 4364 { 4365 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 4366 } 4367 4368 #ifdef CONFIG_MMU 4369 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4370 struct cftype *cft, u64 val) 4371 { 4372 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4373 4374 if (val >= (1 << NR_MOVE_TYPE)) 4375 return -EINVAL; 4376 4377 /* 4378 * No kind of locking is needed in here, because ->can_attach() will 4379 * check this value once in the beginning of the process, and then carry 4380 * on with stale data. This means that changes to this value will only 4381 * affect task migrations starting after the change. 
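* (The value is a bitmask of charge types to move: bit 0 selects anonymous pages (and their swap), bit 1 selects file pages, so e.g. 'echo 3 > memory.move_charge_at_immigrate' enables both; anything with bits at or above NR_MOVE_TYPE set was already rejected above.)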
4382 */ 4383 memcg->move_charge_at_immigrate = val; 4384 return 0; 4385 } 4386 #else 4387 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4388 struct cftype *cft, u64 val) 4389 { 4390 return -ENOSYS; 4391 } 4392 #endif 4393 4394 #ifdef CONFIG_NUMA 4395 static int memcg_numa_stat_show(struct seq_file *m, void *v) 4396 { 4397 struct numa_stat { 4398 const char *name; 4399 unsigned int lru_mask; 4400 }; 4401 4402 static const struct numa_stat stats[] = { 4403 { "total", LRU_ALL }, 4404 { "file", LRU_ALL_FILE }, 4405 { "anon", LRU_ALL_ANON }, 4406 { "unevictable", BIT(LRU_UNEVICTABLE) }, 4407 }; 4408 const struct numa_stat *stat; 4409 int nid; 4410 unsigned long nr; 4411 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4412 4413 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4414 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 4415 seq_printf(m, "%s=%lu", stat->name, nr); 4416 for_each_node_state(nid, N_MEMORY) { 4417 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4418 stat->lru_mask); 4419 seq_printf(m, " N%d=%lu", nid, nr); 4420 } 4421 seq_putc(m, '\n'); 4422 } 4423 4424 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4425 struct mem_cgroup *iter; 4426 4427 nr = 0; 4428 for_each_mem_cgroup_tree(iter, memcg) 4429 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 4430 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 4431 for_each_node_state(nid, N_MEMORY) { 4432 nr = 0; 4433 for_each_mem_cgroup_tree(iter, memcg) 4434 nr += mem_cgroup_node_nr_lru_pages( 4435 iter, nid, stat->lru_mask); 4436 seq_printf(m, " N%d=%lu", nid, nr); 4437 } 4438 seq_putc(m, '\n'); 4439 } 4440 4441 return 0; 4442 } 4443 #endif /* CONFIG_NUMA */ 4444 4445 static inline void mem_cgroup_lru_names_not_uptodate(void) 4446 { 4447 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4448 } 4449 4450 static int memcg_stat_show(struct seq_file *m, void *v) 4451 { 4452 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4453 struct mem_cgroup *mi; 4454 unsigned int i; 4455 4456 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4457 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 4458 continue; 4459 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4460 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4461 } 4462 4463 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 4464 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 4465 mem_cgroup_read_events(memcg, i)); 4466 4467 for (i = 0; i < NR_LRU_LISTS; i++) 4468 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 4469 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 4470 4471 /* Hierarchical information */ 4472 { 4473 unsigned long long limit, memsw_limit; 4474 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4475 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 4476 if (do_swap_account) 4477 seq_printf(m, "hierarchical_memsw_limit %llu\n", 4478 memsw_limit); 4479 } 4480 4481 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4482 long long val = 0; 4483 4484 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 4485 continue; 4486 for_each_mem_cgroup_tree(mi, memcg) 4487 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4488 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 4489 } 4490 4491 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 4492 unsigned long long val = 0; 4493 4494 for_each_mem_cgroup_tree(mi, memcg) 4495 val += mem_cgroup_read_events(mi, i); 4496 seq_printf(m, "total_%s %llu\n", 4497 mem_cgroup_events_names[i], val); 4498 } 4499 4500 for (i = 
0; i < NR_LRU_LISTS; i++) { 4501 unsigned long long val = 0; 4502 4503 for_each_mem_cgroup_tree(mi, memcg) 4504 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 4505 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 4506 } 4507 4508 #ifdef CONFIG_DEBUG_VM 4509 { 4510 int nid, zid; 4511 struct mem_cgroup_per_zone *mz; 4512 struct zone_reclaim_stat *rstat; 4513 unsigned long recent_rotated[2] = {0, 0}; 4514 unsigned long recent_scanned[2] = {0, 0}; 4515 4516 for_each_online_node(nid) 4517 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4518 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 4519 rstat = &mz->lruvec.reclaim_stat; 4520 4521 recent_rotated[0] += rstat->recent_rotated[0]; 4522 recent_rotated[1] += rstat->recent_rotated[1]; 4523 recent_scanned[0] += rstat->recent_scanned[0]; 4524 recent_scanned[1] += rstat->recent_scanned[1]; 4525 } 4526 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 4527 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 4528 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 4529 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 4530 } 4531 #endif 4532 4533 return 0; 4534 } 4535 4536 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4537 struct cftype *cft) 4538 { 4539 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4540 4541 return mem_cgroup_swappiness(memcg); 4542 } 4543 4544 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4545 struct cftype *cft, u64 val) 4546 { 4547 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4548 4549 if (val > 100) 4550 return -EINVAL; 4551 4552 if (css->parent) 4553 memcg->swappiness = val; 4554 else 4555 vm_swappiness = val; 4556 4557 return 0; 4558 } 4559 4560 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4561 { 4562 struct mem_cgroup_threshold_ary *t; 4563 u64 usage; 4564 int i; 4565 4566 rcu_read_lock(); 4567 if (!swap) 4568 t = rcu_dereference(memcg->thresholds.primary); 4569 else 4570 t = rcu_dereference(memcg->memsw_thresholds.primary); 4571 4572 if (!t) 4573 goto unlock; 4574 4575 if (!swap) 4576 usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4577 else 4578 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4579 4580 /* 4581 * current_threshold points to threshold just below or equal to usage. 4582 * If it's not true, a threshold was crossed after last 4583 * call of __mem_cgroup_threshold(). 4584 */ 4585 i = t->current_threshold; 4586 4587 /* 4588 * Iterate backward over array of thresholds starting from 4589 * current_threshold and check if a threshold is crossed. 4590 * If none of thresholds below usage is crossed, we read 4591 * only one element of the array here. 4592 */ 4593 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4594 eventfd_signal(t->entries[i].eventfd, 1); 4595 4596 /* i = current_threshold + 1 */ 4597 i++; 4598 4599 /* 4600 * Iterate forward over array of thresholds starting from 4601 * current_threshold+1 and check if a threshold is crossed. 4602 * If none of thresholds above usage is crossed, we read 4603 * only one element of the array here. 
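* (Worked example: with thresholds of 4M, 8M and 16M registered and usage previously at 5M, a rise to 9M makes the backward scan above signal nothing, makes this forward scan signal the 8M eventfd once, and leaves current_threshold pointing at the 8M entry.)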
4604 */ 4605 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4606 eventfd_signal(t->entries[i].eventfd, 1); 4607 4608 /* Update current_threshold */ 4609 t->current_threshold = i - 1; 4610 unlock: 4611 rcu_read_unlock(); 4612 } 4613 4614 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4615 { 4616 while (memcg) { 4617 __mem_cgroup_threshold(memcg, false); 4618 if (do_swap_account) 4619 __mem_cgroup_threshold(memcg, true); 4620 4621 memcg = parent_mem_cgroup(memcg); 4622 } 4623 } 4624 4625 static int compare_thresholds(const void *a, const void *b) 4626 { 4627 const struct mem_cgroup_threshold *_a = a; 4628 const struct mem_cgroup_threshold *_b = b; 4629 4630 if (_a->threshold > _b->threshold) 4631 return 1; 4632 4633 if (_a->threshold < _b->threshold) 4634 return -1; 4635 4636 return 0; 4637 } 4638 4639 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4640 { 4641 struct mem_cgroup_eventfd_list *ev; 4642 4643 spin_lock(&memcg_oom_lock); 4644 4645 list_for_each_entry(ev, &memcg->oom_notify, list) 4646 eventfd_signal(ev->eventfd, 1); 4647 4648 spin_unlock(&memcg_oom_lock); 4649 return 0; 4650 } 4651 4652 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4653 { 4654 struct mem_cgroup *iter; 4655 4656 for_each_mem_cgroup_tree(iter, memcg) 4657 mem_cgroup_oom_notify_cb(iter); 4658 } 4659 4660 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4661 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4662 { 4663 struct mem_cgroup_thresholds *thresholds; 4664 struct mem_cgroup_threshold_ary *new; 4665 u64 threshold, usage; 4666 int i, size, ret; 4667 4668 ret = res_counter_memparse_write_strategy(args, &threshold); 4669 if (ret) 4670 return ret; 4671 4672 mutex_lock(&memcg->thresholds_lock); 4673 4674 if (type == _MEM) { 4675 thresholds = &memcg->thresholds; 4676 usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4677 } else if (type == _MEMSWAP) { 4678 thresholds = &memcg->memsw_thresholds; 4679 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4680 } else 4681 BUG(); 4682 4683 /* Check if a threshold crossed before adding a new one */ 4684 if (thresholds->primary) 4685 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4686 4687 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4688 4689 /* Allocate memory for new array of thresholds */ 4690 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4691 GFP_KERNEL); 4692 if (!new) { 4693 ret = -ENOMEM; 4694 goto unlock; 4695 } 4696 new->size = size; 4697 4698 /* Copy thresholds (if any) to new array */ 4699 if (thresholds->primary) { 4700 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4701 sizeof(struct mem_cgroup_threshold)); 4702 } 4703 4704 /* Add new threshold */ 4705 new->entries[size - 1].eventfd = eventfd; 4706 new->entries[size - 1].threshold = threshold; 4707 4708 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4709 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4710 compare_thresholds, NULL); 4711 4712 /* Find current threshold */ 4713 new->current_threshold = -1; 4714 for (i = 0; i < size; i++) { 4715 if (new->entries[i].threshold <= usage) { 4716 /* 4717 * new->current_threshold will not be used until 4718 * rcu_assign_pointer(), so it's safe to increment 4719 * it here. 
4720 */ 4721 ++new->current_threshold; 4722 } else 4723 break; 4724 } 4725 4726 /* Free old spare buffer and save old primary buffer as spare */ 4727 kfree(thresholds->spare); 4728 thresholds->spare = thresholds->primary; 4729 4730 rcu_assign_pointer(thresholds->primary, new); 4731 4732 /* To be sure that nobody uses thresholds */ 4733 synchronize_rcu(); 4734 4735 unlock: 4736 mutex_unlock(&memcg->thresholds_lock); 4737 4738 return ret; 4739 } 4740 4741 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4742 struct eventfd_ctx *eventfd, const char *args) 4743 { 4744 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4745 } 4746 4747 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4748 struct eventfd_ctx *eventfd, const char *args) 4749 { 4750 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4751 } 4752 4753 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4754 struct eventfd_ctx *eventfd, enum res_type type) 4755 { 4756 struct mem_cgroup_thresholds *thresholds; 4757 struct mem_cgroup_threshold_ary *new; 4758 u64 usage; 4759 int i, j, size; 4760 4761 mutex_lock(&memcg->thresholds_lock); 4762 4763 if (type == _MEM) { 4764 thresholds = &memcg->thresholds; 4765 usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4766 } else if (type == _MEMSWAP) { 4767 thresholds = &memcg->memsw_thresholds; 4768 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4769 } else 4770 BUG(); 4771 4772 if (!thresholds->primary) 4773 goto unlock; 4774 4775 /* Check if a threshold crossed before removing */ 4776 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4777 4778 /* Calculate new number of threshold */ 4779 size = 0; 4780 for (i = 0; i < thresholds->primary->size; i++) { 4781 if (thresholds->primary->entries[i].eventfd != eventfd) 4782 size++; 4783 } 4784 4785 new = thresholds->spare; 4786 4787 /* Set thresholds array to NULL if we don't have thresholds */ 4788 if (!size) { 4789 kfree(new); 4790 new = NULL; 4791 goto swap_buffers; 4792 } 4793 4794 new->size = size; 4795 4796 /* Copy thresholds and find current threshold */ 4797 new->current_threshold = -1; 4798 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4799 if (thresholds->primary->entries[i].eventfd == eventfd) 4800 continue; 4801 4802 new->entries[j] = thresholds->primary->entries[i]; 4803 if (new->entries[j].threshold <= usage) { 4804 /* 4805 * new->current_threshold will not be used 4806 * until rcu_assign_pointer(), so it's safe to increment 4807 * it here. 
4808 */ 4809 ++new->current_threshold; 4810 } 4811 j++; 4812 } 4813 4814 swap_buffers: 4815 /* Swap primary and spare array */ 4816 thresholds->spare = thresholds->primary; 4817 /* If all events are unregistered, free the spare array */ 4818 if (!new) { 4819 kfree(thresholds->spare); 4820 thresholds->spare = NULL; 4821 } 4822 4823 rcu_assign_pointer(thresholds->primary, new); 4824 4825 /* To be sure that nobody uses thresholds */ 4826 synchronize_rcu(); 4827 unlock: 4828 mutex_unlock(&memcg->thresholds_lock); 4829 } 4830 4831 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4832 struct eventfd_ctx *eventfd) 4833 { 4834 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4835 } 4836 4837 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4838 struct eventfd_ctx *eventfd) 4839 { 4840 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4841 } 4842 4843 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4844 struct eventfd_ctx *eventfd, const char *args) 4845 { 4846 struct mem_cgroup_eventfd_list *event; 4847 4848 event = kmalloc(sizeof(*event), GFP_KERNEL); 4849 if (!event) 4850 return -ENOMEM; 4851 4852 spin_lock(&memcg_oom_lock); 4853 4854 event->eventfd = eventfd; 4855 list_add(&event->list, &memcg->oom_notify); 4856 4857 /* already in OOM ? */ 4858 if (atomic_read(&memcg->under_oom)) 4859 eventfd_signal(eventfd, 1); 4860 spin_unlock(&memcg_oom_lock); 4861 4862 return 0; 4863 } 4864 4865 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4866 struct eventfd_ctx *eventfd) 4867 { 4868 struct mem_cgroup_eventfd_list *ev, *tmp; 4869 4870 spin_lock(&memcg_oom_lock); 4871 4872 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4873 if (ev->eventfd == eventfd) { 4874 list_del(&ev->list); 4875 kfree(ev); 4876 } 4877 } 4878 4879 spin_unlock(&memcg_oom_lock); 4880 } 4881 4882 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4883 { 4884 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 4885 4886 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4887 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 4888 return 0; 4889 } 4890 4891 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4892 struct cftype *cft, u64 val) 4893 { 4894 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4895 4896 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4897 if (!css->parent || !((val == 0) || (val == 1))) 4898 return -EINVAL; 4899 4900 memcg->oom_kill_disable = val; 4901 if (!val) 4902 memcg_oom_recover(memcg); 4903 4904 return 0; 4905 } 4906 4907 #ifdef CONFIG_MEMCG_KMEM 4908 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4909 { 4910 int ret; 4911 4912 memcg->kmemcg_id = -1; 4913 ret = memcg_propagate_kmem(memcg); 4914 if (ret) 4915 return ret; 4916 4917 return mem_cgroup_sockets_init(memcg, ss); 4918 } 4919 4920 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4921 { 4922 mem_cgroup_sockets_destroy(memcg); 4923 } 4924 4925 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 4926 { 4927 if (!memcg_kmem_is_active(memcg)) 4928 return; 4929 4930 /* 4931 * kmem charges can outlive the cgroup. In the case of slab 4932 * pages, for instance, a page contain objects from various 4933 * processes. 
As we do not take a reference for every 4934 * such allocation, we have to be careful when doing uncharge 4935 * (see memcg_uncharge_kmem) and here during offlining. 4936 * 4937 * The idea is that only the _last_ uncharge which sees 4938 * the dead memcg will drop the last reference. An additional 4939 * reference is taken here before the group is marked dead 4940 * which is then paired with css_put during uncharge resp. here. 4941 * 4942 * Although this might sound strange as this path is called from 4943 * css_offline() when the reference might have dropped down to 0 and 4944 * shouldn't be incremented anymore (css_tryget_online() would 4945 * fail) we do not have other options because of the kmem 4946 * allocations lifetime. 4947 */ 4948 css_get(&memcg->css); 4949 4950 memcg_kmem_mark_dead(memcg); 4951 4952 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 4953 return; 4954 4955 if (memcg_kmem_test_and_clear_dead(memcg)) 4956 css_put(&memcg->css); 4957 } 4958 #else 4959 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4960 { 4961 return 0; 4962 } 4963 4964 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4965 { 4966 } 4967 4968 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 4969 { 4970 } 4971 #endif 4972
4973 /* 4974 * DO NOT USE IN NEW FILES. 4975 * 4976 * "cgroup.event_control" implementation. 4977 * 4978 * This is way over-engineered. It tries to support fully configurable 4979 * events for each user. Such level of flexibility is completely 4980 * unnecessary especially in the light of the planned unified hierarchy. 4981 * 4982 * Please deprecate this and replace with something simpler if at all 4983 * possible. 4984 */ 4985
4986 /* 4987 * Unregister event and free resources. 4988 * 4989 * Gets called from workqueue. 4990 */ 4991 static void memcg_event_remove(struct work_struct *work) 4992 { 4993 struct mem_cgroup_event *event = 4994 container_of(work, struct mem_cgroup_event, remove); 4995 struct mem_cgroup *memcg = event->memcg; 4996 4997 remove_wait_queue(event->wqh, &event->wait); 4998 4999 event->unregister_event(memcg, event->eventfd); 5000 5001 /* Notify userspace the event is going away. */ 5002 eventfd_signal(event->eventfd, 1); 5003 5004 eventfd_ctx_put(event->eventfd); 5005 kfree(event); 5006 css_put(&memcg->css); 5007 } 5008
5009 /* 5010 * Gets called on POLLHUP on eventfd when user closes it. 5011 * 5012 * Called with wqh->lock held and interrupts disabled. 5013 */ 5014 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 5015 int sync, void *key) 5016 { 5017 struct mem_cgroup_event *event = 5018 container_of(wait, struct mem_cgroup_event, wait); 5019 struct mem_cgroup *memcg = event->memcg; 5020 unsigned long flags = (unsigned long)key; 5021 5022 if (flags & POLLHUP) { 5023 /* 5024 * If the event has been detached at cgroup removal, we 5025 * can simply return knowing the other side will cleanup 5026 * for us. 5027 * 5028 * We can't race against event freeing since the other 5029 * side will require wqh->lock via remove_wait_queue(), 5030 * which we hold. 5031 */ 5032 spin_lock(&memcg->event_list_lock); 5033 if (!list_empty(&event->list)) { 5034 list_del_init(&event->list); 5035 /* 5036 * We are in atomic context, but memcg_event_remove() 5037 * may sleep, so we have to call it in workqueue.
5038 */ 5039 schedule_work(&event->remove); 5040 } 5041 spin_unlock(&memcg->event_list_lock); 5042 } 5043 5044 return 0; 5045 } 5046 5047 static void memcg_event_ptable_queue_proc(struct file *file, 5048 wait_queue_head_t *wqh, poll_table *pt) 5049 { 5050 struct mem_cgroup_event *event = 5051 container_of(pt, struct mem_cgroup_event, pt); 5052 5053 event->wqh = wqh; 5054 add_wait_queue(wqh, &event->wait); 5055 } 5056 5057 /* 5058 * DO NOT USE IN NEW FILES. 5059 * 5060 * Parse input and register new cgroup event handler. 5061 * 5062 * Input must be in format '<event_fd> <control_fd> <args>'. 5063 * Interpretation of args is defined by control file implementation. 5064 */ 5065 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 5066 char *buf, size_t nbytes, loff_t off) 5067 { 5068 struct cgroup_subsys_state *css = of_css(of); 5069 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5070 struct mem_cgroup_event *event; 5071 struct cgroup_subsys_state *cfile_css; 5072 unsigned int efd, cfd; 5073 struct fd efile; 5074 struct fd cfile; 5075 const char *name; 5076 char *endp; 5077 int ret; 5078 5079 buf = strstrip(buf); 5080 5081 efd = simple_strtoul(buf, &endp, 10); 5082 if (*endp != ' ') 5083 return -EINVAL; 5084 buf = endp + 1; 5085 5086 cfd = simple_strtoul(buf, &endp, 10); 5087 if ((*endp != ' ') && (*endp != '\0')) 5088 return -EINVAL; 5089 buf = endp + 1; 5090 5091 event = kzalloc(sizeof(*event), GFP_KERNEL); 5092 if (!event) 5093 return -ENOMEM; 5094 5095 event->memcg = memcg; 5096 INIT_LIST_HEAD(&event->list); 5097 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 5098 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 5099 INIT_WORK(&event->remove, memcg_event_remove); 5100 5101 efile = fdget(efd); 5102 if (!efile.file) { 5103 ret = -EBADF; 5104 goto out_kfree; 5105 } 5106 5107 event->eventfd = eventfd_ctx_fileget(efile.file); 5108 if (IS_ERR(event->eventfd)) { 5109 ret = PTR_ERR(event->eventfd); 5110 goto out_put_efile; 5111 } 5112 5113 cfile = fdget(cfd); 5114 if (!cfile.file) { 5115 ret = -EBADF; 5116 goto out_put_eventfd; 5117 } 5118 5119 /* the process need read permission on control file */ 5120 /* AV: shouldn't we check that it's been opened for read instead? */ 5121 ret = inode_permission(file_inode(cfile.file), MAY_READ); 5122 if (ret < 0) 5123 goto out_put_cfile; 5124 5125 /* 5126 * Determine the event callbacks and set them in @event. This used 5127 * to be done via struct cftype but cgroup core no longer knows 5128 * about these events. The following is crude but the whole thing 5129 * is for compatibility anyway. 5130 * 5131 * DO NOT ADD NEW FILES. 
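* (For example, userspace writes '<eventfd> <fd of memory.usage_in_bytes> 50M' into cgroup.event_control; everything after the two descriptors is handed untouched to the register_event callback chosen below, which here would arm a 50M usage threshold.)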
5132 */ 5133 name = cfile.file->f_dentry->d_name.name; 5134 5135 if (!strcmp(name, "memory.usage_in_bytes")) { 5136 event->register_event = mem_cgroup_usage_register_event; 5137 event->unregister_event = mem_cgroup_usage_unregister_event; 5138 } else if (!strcmp(name, "memory.oom_control")) { 5139 event->register_event = mem_cgroup_oom_register_event; 5140 event->unregister_event = mem_cgroup_oom_unregister_event; 5141 } else if (!strcmp(name, "memory.pressure_level")) { 5142 event->register_event = vmpressure_register_event; 5143 event->unregister_event = vmpressure_unregister_event; 5144 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 5145 event->register_event = memsw_cgroup_usage_register_event; 5146 event->unregister_event = memsw_cgroup_usage_unregister_event; 5147 } else { 5148 ret = -EINVAL; 5149 goto out_put_cfile; 5150 } 5151 5152 /* 5153 * Verify @cfile should belong to @css. Also, remaining events are 5154 * automatically removed on cgroup destruction but the removal is 5155 * asynchronous, so take an extra ref on @css. 5156 */ 5157 cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, 5158 &memory_cgrp_subsys); 5159 ret = -EINVAL; 5160 if (IS_ERR(cfile_css)) 5161 goto out_put_cfile; 5162 if (cfile_css != css) { 5163 css_put(cfile_css); 5164 goto out_put_cfile; 5165 } 5166 5167 ret = event->register_event(memcg, event->eventfd, buf); 5168 if (ret) 5169 goto out_put_css; 5170 5171 efile.file->f_op->poll(efile.file, &event->pt); 5172 5173 spin_lock(&memcg->event_list_lock); 5174 list_add(&event->list, &memcg->event_list); 5175 spin_unlock(&memcg->event_list_lock); 5176 5177 fdput(cfile); 5178 fdput(efile); 5179 5180 return nbytes; 5181 5182 out_put_css: 5183 css_put(css); 5184 out_put_cfile: 5185 fdput(cfile); 5186 out_put_eventfd: 5187 eventfd_ctx_put(event->eventfd); 5188 out_put_efile: 5189 fdput(efile); 5190 out_kfree: 5191 kfree(event); 5192 5193 return ret; 5194 } 5195 5196 static struct cftype mem_cgroup_files[] = { 5197 { 5198 .name = "usage_in_bytes", 5199 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5200 .read_u64 = mem_cgroup_read_u64, 5201 }, 5202 { 5203 .name = "max_usage_in_bytes", 5204 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5205 .write = mem_cgroup_reset, 5206 .read_u64 = mem_cgroup_read_u64, 5207 }, 5208 { 5209 .name = "limit_in_bytes", 5210 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5211 .write = mem_cgroup_write, 5212 .read_u64 = mem_cgroup_read_u64, 5213 }, 5214 { 5215 .name = "soft_limit_in_bytes", 5216 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5217 .write = mem_cgroup_write, 5218 .read_u64 = mem_cgroup_read_u64, 5219 }, 5220 { 5221 .name = "failcnt", 5222 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5223 .write = mem_cgroup_reset, 5224 .read_u64 = mem_cgroup_read_u64, 5225 }, 5226 { 5227 .name = "stat", 5228 .seq_show = memcg_stat_show, 5229 }, 5230 { 5231 .name = "force_empty", 5232 .write = mem_cgroup_force_empty_write, 5233 }, 5234 { 5235 .name = "use_hierarchy", 5236 .write_u64 = mem_cgroup_hierarchy_write, 5237 .read_u64 = mem_cgroup_hierarchy_read, 5238 }, 5239 { 5240 .name = "cgroup.event_control", /* XXX: for compat */ 5241 .write = memcg_write_event_control, 5242 .flags = CFTYPE_NO_PREFIX, 5243 .mode = S_IWUGO, 5244 }, 5245 { 5246 .name = "swappiness", 5247 .read_u64 = mem_cgroup_swappiness_read, 5248 .write_u64 = mem_cgroup_swappiness_write, 5249 }, 5250 { 5251 .name = "move_charge_at_immigrate", 5252 .read_u64 = mem_cgroup_move_charge_read, 5253 .write_u64 = mem_cgroup_move_charge_write, 5254 }, 
5255 { 5256 .name = "oom_control", 5257 .seq_show = mem_cgroup_oom_control_read, 5258 .write_u64 = mem_cgroup_oom_control_write, 5259 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5260 }, 5261 { 5262 .name = "pressure_level", 5263 }, 5264 #ifdef CONFIG_NUMA 5265 { 5266 .name = "numa_stat", 5267 .seq_show = memcg_numa_stat_show, 5268 }, 5269 #endif 5270 #ifdef CONFIG_MEMCG_KMEM 5271 { 5272 .name = "kmem.limit_in_bytes", 5273 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5274 .write = mem_cgroup_write, 5275 .read_u64 = mem_cgroup_read_u64, 5276 }, 5277 { 5278 .name = "kmem.usage_in_bytes", 5279 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5280 .read_u64 = mem_cgroup_read_u64, 5281 }, 5282 { 5283 .name = "kmem.failcnt", 5284 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5285 .write = mem_cgroup_reset, 5286 .read_u64 = mem_cgroup_read_u64, 5287 }, 5288 { 5289 .name = "kmem.max_usage_in_bytes", 5290 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5291 .write = mem_cgroup_reset, 5292 .read_u64 = mem_cgroup_read_u64, 5293 }, 5294 #ifdef CONFIG_SLABINFO 5295 { 5296 .name = "kmem.slabinfo", 5297 .seq_show = mem_cgroup_slabinfo_read, 5298 }, 5299 #endif 5300 #endif 5301 { }, /* terminate */ 5302 }; 5303 5304 #ifdef CONFIG_MEMCG_SWAP 5305 static struct cftype memsw_cgroup_files[] = { 5306 { 5307 .name = "memsw.usage_in_bytes", 5308 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 5309 .read_u64 = mem_cgroup_read_u64, 5310 }, 5311 { 5312 .name = "memsw.max_usage_in_bytes", 5313 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 5314 .write = mem_cgroup_reset, 5315 .read_u64 = mem_cgroup_read_u64, 5316 }, 5317 { 5318 .name = "memsw.limit_in_bytes", 5319 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 5320 .write = mem_cgroup_write, 5321 .read_u64 = mem_cgroup_read_u64, 5322 }, 5323 { 5324 .name = "memsw.failcnt", 5325 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 5326 .write = mem_cgroup_reset, 5327 .read_u64 = mem_cgroup_read_u64, 5328 }, 5329 { }, /* terminate */ 5330 }; 5331 #endif 5332 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5333 { 5334 struct mem_cgroup_per_node *pn; 5335 struct mem_cgroup_per_zone *mz; 5336 int zone, tmp = node; 5337 /* 5338 * This routine is called against possible nodes. 5339 * But it's BUG to call kmalloc() against offline node. 5340 * 5341 * TODO: this routine can waste much memory for nodes which will 5342 * never be onlined. It's better to use memory hotplug callback 5343 * function. 
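* (When the node has no normal memory, tmp is set to -1 (NUMA_NO_NODE) below so that kzalloc_node() allocates without a node preference instead of insisting on a memoryless node.)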
5344 */ 5345 if (!node_state(node, N_NORMAL_MEMORY)) 5346 tmp = -1; 5347 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 5348 if (!pn) 5349 return 1; 5350 5351 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5352 mz = &pn->zoneinfo[zone]; 5353 lruvec_init(&mz->lruvec); 5354 mz->usage_in_excess = 0; 5355 mz->on_tree = false; 5356 mz->memcg = memcg; 5357 } 5358 memcg->nodeinfo[node] = pn; 5359 return 0; 5360 } 5361 5362 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5363 { 5364 kfree(memcg->nodeinfo[node]); 5365 } 5366 5367 static struct mem_cgroup *mem_cgroup_alloc(void) 5368 { 5369 struct mem_cgroup *memcg; 5370 size_t size; 5371 5372 size = sizeof(struct mem_cgroup); 5373 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 5374 5375 memcg = kzalloc(size, GFP_KERNEL); 5376 if (!memcg) 5377 return NULL; 5378 5379 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 5380 if (!memcg->stat) 5381 goto out_free; 5382 spin_lock_init(&memcg->pcp_counter_lock); 5383 return memcg; 5384 5385 out_free: 5386 kfree(memcg); 5387 return NULL; 5388 } 5389 5390 /* 5391 * At destroying mem_cgroup, references from swap_cgroup can remain. 5392 * (scanning all at force_empty is too costly...) 5393 * 5394 * Instead of clearing all references at force_empty, we remember 5395 * the number of reference from swap_cgroup and free mem_cgroup when 5396 * it goes down to 0. 5397 * 5398 * Removal of cgroup itself succeeds regardless of refs from swap. 5399 */ 5400 5401 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5402 { 5403 int node; 5404 5405 mem_cgroup_remove_from_trees(memcg); 5406 5407 for_each_node(node) 5408 free_mem_cgroup_per_zone_info(memcg, node); 5409 5410 free_percpu(memcg->stat); 5411 5412 /* 5413 * We need to make sure that (at least for now), the jump label 5414 * destruction code runs outside of the cgroup lock. This is because 5415 * get_online_cpus(), which is called from the static_branch update, 5416 * can't be called inside the cgroup_lock. cpusets are the ones 5417 * enforcing this dependency, so if they ever change, we might as well. 5418 * 5419 * schedule_work() will guarantee this happens. Be careful if you need 5420 * to move this code around, and make sure it is outside 5421 * the cgroup_lock. 5422 */ 5423 disarm_static_keys(memcg); 5424 kfree(memcg); 5425 } 5426 5427 /* 5428 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
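* In practice this returns NULL only for root_mem_cgroup; a child created with use_hierarchy disabled has its res counters parented to root_mem_cgroup (see mem_cgroup_css_online() below), so that is what gets returned for it.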
5429 */ 5430 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 5431 { 5432 if (!memcg->res.parent) 5433 return NULL; 5434 return mem_cgroup_from_res_counter(memcg->res.parent, res); 5435 } 5436 EXPORT_SYMBOL(parent_mem_cgroup); 5437 5438 static void __init mem_cgroup_soft_limit_tree_init(void) 5439 { 5440 struct mem_cgroup_tree_per_node *rtpn; 5441 struct mem_cgroup_tree_per_zone *rtpz; 5442 int tmp, node, zone; 5443 5444 for_each_node(node) { 5445 tmp = node; 5446 if (!node_state(node, N_NORMAL_MEMORY)) 5447 tmp = -1; 5448 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 5449 BUG_ON(!rtpn); 5450 5451 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5452 5453 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5454 rtpz = &rtpn->rb_tree_per_zone[zone]; 5455 rtpz->rb_root = RB_ROOT; 5456 spin_lock_init(&rtpz->lock); 5457 } 5458 } 5459 } 5460 5461 static struct cgroup_subsys_state * __ref 5462 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5463 { 5464 struct mem_cgroup *memcg; 5465 long error = -ENOMEM; 5466 int node; 5467 5468 memcg = mem_cgroup_alloc(); 5469 if (!memcg) 5470 return ERR_PTR(error); 5471 5472 for_each_node(node) 5473 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 5474 goto free_out; 5475 5476 /* root ? */ 5477 if (parent_css == NULL) { 5478 root_mem_cgroup = memcg; 5479 res_counter_init(&memcg->res, NULL); 5480 res_counter_init(&memcg->memsw, NULL); 5481 res_counter_init(&memcg->kmem, NULL); 5482 } 5483 5484 memcg->last_scanned_node = MAX_NUMNODES; 5485 INIT_LIST_HEAD(&memcg->oom_notify); 5486 memcg->move_charge_at_immigrate = 0; 5487 mutex_init(&memcg->thresholds_lock); 5488 spin_lock_init(&memcg->move_lock); 5489 vmpressure_init(&memcg->vmpressure); 5490 INIT_LIST_HEAD(&memcg->event_list); 5491 spin_lock_init(&memcg->event_list_lock); 5492 5493 return &memcg->css; 5494 5495 free_out: 5496 __mem_cgroup_free(memcg); 5497 return ERR_PTR(error); 5498 } 5499 5500 static int 5501 mem_cgroup_css_online(struct cgroup_subsys_state *css) 5502 { 5503 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5504 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 5505 5506 if (css->id > MEM_CGROUP_ID_MAX) 5507 return -ENOSPC; 5508 5509 if (!parent) 5510 return 0; 5511 5512 mutex_lock(&memcg_create_mutex); 5513 5514 memcg->use_hierarchy = parent->use_hierarchy; 5515 memcg->oom_kill_disable = parent->oom_kill_disable; 5516 memcg->swappiness = mem_cgroup_swappiness(parent); 5517 5518 if (parent->use_hierarchy) { 5519 res_counter_init(&memcg->res, &parent->res); 5520 res_counter_init(&memcg->memsw, &parent->memsw); 5521 res_counter_init(&memcg->kmem, &parent->kmem); 5522 5523 /* 5524 * No need to take a reference to the parent because cgroup 5525 * core guarantees its existence. 5526 */ 5527 } else { 5528 res_counter_init(&memcg->res, &root_mem_cgroup->res); 5529 res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); 5530 res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); 5531 /* 5532 * Deeper hierachy with use_hierarchy == false doesn't make 5533 * much sense so let cgroup subsystem know about this 5534 * unfortunate state in our controller. 5535 */ 5536 if (parent != root_mem_cgroup) 5537 memory_cgrp_subsys.broken_hierarchy = true; 5538 } 5539 mutex_unlock(&memcg_create_mutex); 5540 5541 return memcg_init_kmem(memcg, &memory_cgrp_subsys); 5542 } 5543 5544 /* 5545 * Announce all parents that a group from their hierarchy is gone. 
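* (Each parent's cached reclaim iterator state is invalidated so that concurrent memcg reclaim walks restart instead of revisiting the group that is going away.)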
5546 */ 5547 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) 5548 { 5549 struct mem_cgroup *parent = memcg; 5550 5551 while ((parent = parent_mem_cgroup(parent))) 5552 mem_cgroup_iter_invalidate(parent); 5553 5554 /* 5555 * if the root memcg is not hierarchical we have to check it 5556 * explicitely. 5557 */ 5558 if (!root_mem_cgroup->use_hierarchy) 5559 mem_cgroup_iter_invalidate(root_mem_cgroup); 5560 } 5561 5562 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5563 { 5564 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5565 struct mem_cgroup_event *event, *tmp; 5566 struct cgroup_subsys_state *iter; 5567 5568 /* 5569 * Unregister events and notify userspace. 5570 * Notify userspace about cgroup removing only after rmdir of cgroup 5571 * directory to avoid race between userspace and kernelspace. 5572 */ 5573 spin_lock(&memcg->event_list_lock); 5574 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5575 list_del_init(&event->list); 5576 schedule_work(&event->remove); 5577 } 5578 spin_unlock(&memcg->event_list_lock); 5579 5580 kmem_cgroup_css_offline(memcg); 5581 5582 mem_cgroup_invalidate_reclaim_iterators(memcg); 5583 5584 /* 5585 * This requires that offlining is serialized. Right now that is 5586 * guaranteed because css_killed_work_fn() holds the cgroup_mutex. 5587 */ 5588 css_for_each_descendant_post(iter, css) 5589 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); 5590 5591 memcg_unregister_all_caches(memcg); 5592 vmpressure_cleanup(&memcg->vmpressure); 5593 } 5594 5595 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5596 { 5597 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5598 /* 5599 * XXX: css_offline() would be where we should reparent all 5600 * memory to prepare the cgroup for destruction. However, 5601 * memcg does not do css_tryget_online() and res_counter charging 5602 * under the same RCU lock region, which means that charging 5603 * could race with offlining. Offlining only happens to 5604 * cgroups with no tasks in them but charges can show up 5605 * without any tasks from the swapin path when the target 5606 * memcg is looked up from the swapout record and not from the 5607 * current task as it usually is. A race like this can leak 5608 * charges and put pages with stale cgroup pointers into 5609 * circulation: 5610 * 5611 * #0 #1 5612 * lookup_swap_cgroup_id() 5613 * rcu_read_lock() 5614 * mem_cgroup_lookup() 5615 * css_tryget_online() 5616 * rcu_read_unlock() 5617 * disable css_tryget_online() 5618 * call_rcu() 5619 * offline_css() 5620 * reparent_charges() 5621 * res_counter_charge() 5622 * css_put() 5623 * css_free() 5624 * pc->mem_cgroup = dead memcg 5625 * add page to lru 5626 * 5627 * The bulk of the charges are still moved in offline_css() to 5628 * avoid pinning a lot of pages in case a long-term reference 5629 * like a swapout record is deferring the css_free() to long 5630 * after offlining. But this makes sure we catch any charges 5631 * made after offlining: 5632 */ 5633 mem_cgroup_reparent_charges(memcg); 5634 5635 memcg_destroy_kmem(memcg); 5636 __mem_cgroup_free(memcg); 5637 } 5638 5639 /** 5640 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5641 * @css: the target css 5642 * 5643 * Reset the states of the mem_cgroup associated with @css. This is 5644 * invoked when the userland requests disabling on the default hierarchy 5645 * but the memcg is pinned through dependency. 
The memcg should stop 5646 * applying policies and should revert to the vanilla state as it may be 5647 * made visible again. 5648 * 5649 * The current implementation only resets the essential configurations. 5650 * This needs to be expanded to cover all the visible parts. 5651 */ 5652 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5653 { 5654 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5655 5656 mem_cgroup_resize_limit(memcg, ULLONG_MAX); 5657 mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); 5658 memcg_update_kmem_limit(memcg, ULLONG_MAX); 5659 res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); 5660 } 5661 5662 #ifdef CONFIG_MMU 5663 /* Handlers for move charge at task migration. */ 5664 static int mem_cgroup_do_precharge(unsigned long count) 5665 { 5666 int ret; 5667 5668 /* Try a single bulk charge without reclaim first */ 5669 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); 5670 if (!ret) { 5671 mc.precharge += count; 5672 return ret; 5673 } 5674 if (ret == -EINTR) { 5675 cancel_charge(root_mem_cgroup, count); 5676 return ret; 5677 } 5678 5679 /* Try charges one by one with reclaim */ 5680 while (count--) { 5681 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); 5682 /* 5683 * In case of failure, any residual charges against 5684 * mc.to will be dropped by mem_cgroup_clear_mc() 5685 * later on. However, cancel any charges that are 5686 * bypassed to root right away or they'll be lost. 5687 */ 5688 if (ret == -EINTR) 5689 cancel_charge(root_mem_cgroup, 1); 5690 if (ret) 5691 return ret; 5692 mc.precharge++; 5693 cond_resched(); 5694 } 5695 return 0; 5696 } 5697 5698 /** 5699 * get_mctgt_type - get target type of moving charge 5700 * @vma: the vma the pte to be checked belongs 5701 * @addr: the address corresponding to the pte to be checked 5702 * @ptent: the pte to be checked 5703 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5704 * 5705 * Returns 5706 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5707 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5708 * move charge. if @target is not NULL, the page is stored in target->page 5709 * with extra refcnt got(Callers should handle it). 5710 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5711 * target for charge migration. if @target is not NULL, the entry is stored 5712 * in target->ent. 5713 * 5714 * Called with pte lock held. 
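* (The union and enum defined next are the @target cookie and the return codes listed above; get_mctgt_type() itself follows further down, after the pte/swap/file helper lookups.)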
5715 */ 5716 union mc_target { 5717 struct page *page; 5718 swp_entry_t ent; 5719 }; 5720 5721 enum mc_target_type { 5722 MC_TARGET_NONE = 0, 5723 MC_TARGET_PAGE, 5724 MC_TARGET_SWAP, 5725 }; 5726 5727 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5728 unsigned long addr, pte_t ptent) 5729 { 5730 struct page *page = vm_normal_page(vma, addr, ptent); 5731 5732 if (!page || !page_mapped(page)) 5733 return NULL; 5734 if (PageAnon(page)) { 5735 /* we don't move shared anon */ 5736 if (!move_anon()) 5737 return NULL; 5738 } else if (!move_file()) 5739 /* we ignore mapcount for file pages */ 5740 return NULL; 5741 if (!get_page_unless_zero(page)) 5742 return NULL; 5743 5744 return page; 5745 } 5746 5747 #ifdef CONFIG_SWAP 5748 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5749 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5750 { 5751 struct page *page = NULL; 5752 swp_entry_t ent = pte_to_swp_entry(ptent); 5753 5754 if (!move_anon() || non_swap_entry(ent)) 5755 return NULL; 5756 /* 5757 * Because lookup_swap_cache() updates some statistics counter, 5758 * we call find_get_page() with swapper_space directly. 5759 */ 5760 page = find_get_page(swap_address_space(ent), ent.val); 5761 if (do_swap_account) 5762 entry->val = ent.val; 5763 5764 return page; 5765 } 5766 #else 5767 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5768 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5769 { 5770 return NULL; 5771 } 5772 #endif 5773 5774 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5775 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5776 { 5777 struct page *page = NULL; 5778 struct address_space *mapping; 5779 pgoff_t pgoff; 5780 5781 if (!vma->vm_file) /* anonymous vma */ 5782 return NULL; 5783 if (!move_file()) 5784 return NULL; 5785 5786 mapping = vma->vm_file->f_mapping; 5787 if (pte_none(ptent)) 5788 pgoff = linear_page_index(vma, addr); 5789 else /* pte_file(ptent) is true */ 5790 pgoff = pte_to_pgoff(ptent); 5791 5792 /* page is moved even if it's not RSS of this task(page-faulted). */ 5793 #ifdef CONFIG_SWAP 5794 /* shmem/tmpfs may report page out on swap: account for that too. */ 5795 if (shmem_mapping(mapping)) { 5796 page = find_get_entry(mapping, pgoff); 5797 if (radix_tree_exceptional_entry(page)) { 5798 swp_entry_t swp = radix_to_swp_entry(page); 5799 if (do_swap_account) 5800 *entry = swp; 5801 page = find_get_page(swap_address_space(swp), swp.val); 5802 } 5803 } else 5804 page = find_get_page(mapping, pgoff); 5805 #else 5806 page = find_get_page(mapping, pgoff); 5807 #endif 5808 return page; 5809 } 5810 5811 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5812 unsigned long addr, pte_t ptent, union mc_target *target) 5813 { 5814 struct page *page = NULL; 5815 struct page_cgroup *pc; 5816 enum mc_target_type ret = MC_TARGET_NONE; 5817 swp_entry_t ent = { .val = 0 }; 5818 5819 if (pte_present(ptent)) 5820 page = mc_handle_present_pte(vma, addr, ptent); 5821 else if (is_swap_pte(ptent)) 5822 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5823 else if (pte_none(ptent) || pte_file(ptent)) 5824 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5825 5826 if (!page && !ent.val) 5827 return ret; 5828 if (page) { 5829 pc = lookup_page_cgroup(page); 5830 /* 5831 * Do only loose check w/o serialization. 5832 * mem_cgroup_move_account() checks the pc is valid or 5833 * not under LRU exclusion. 
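* A stale hit here is tolerable: the precharge pass only bumps mc.precharge, and the actual move re-checks the page_cgroup under the proper locking before any charge is transferred.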
5834 */ 5835 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5836 ret = MC_TARGET_PAGE; 5837 if (target) 5838 target->page = page; 5839 } 5840 if (!ret || !target) 5841 put_page(page); 5842 } 5843 /* There is a swap entry and a page doesn't exist or isn't charged */ 5844 if (ent.val && !ret && 5845 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5846 ret = MC_TARGET_SWAP; 5847 if (target) 5848 target->ent = ent; 5849 } 5850 return ret; 5851 } 5852 5853 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5854 /* 5855 * We don't consider swapping or file mapped pages because THP does not 5856 * support them for now. 5857 * Caller should make sure that pmd_trans_huge(pmd) is true. 5858 */ 5859 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5860 unsigned long addr, pmd_t pmd, union mc_target *target) 5861 { 5862 struct page *page = NULL; 5863 struct page_cgroup *pc; 5864 enum mc_target_type ret = MC_TARGET_NONE; 5865 5866 page = pmd_page(pmd); 5867 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5868 if (!move_anon()) 5869 return ret; 5870 pc = lookup_page_cgroup(page); 5871 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5872 ret = MC_TARGET_PAGE; 5873 if (target) { 5874 get_page(page); 5875 target->page = page; 5876 } 5877 } 5878 return ret; 5879 } 5880 #else 5881 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5882 unsigned long addr, pmd_t pmd, union mc_target *target) 5883 { 5884 return MC_TARGET_NONE; 5885 } 5886 #endif 5887 5888 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5889 unsigned long addr, unsigned long end, 5890 struct mm_walk *walk) 5891 { 5892 struct vm_area_struct *vma = walk->private; 5893 pte_t *pte; 5894 spinlock_t *ptl; 5895 5896 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5897 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5898 mc.precharge += HPAGE_PMD_NR; 5899 spin_unlock(ptl); 5900 return 0; 5901 } 5902 5903 if (pmd_trans_unstable(pmd)) 5904 return 0; 5905 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5906 for (; addr != end; pte++, addr += PAGE_SIZE) 5907 if (get_mctgt_type(vma, addr, *pte, NULL)) 5908 mc.precharge++; /* increment precharge temporarily */ 5909 pte_unmap_unlock(pte - 1, ptl); 5910 cond_resched(); 5911 5912 return 0; 5913 } 5914 5915 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5916 { 5917 unsigned long precharge; 5918 struct vm_area_struct *vma; 5919 5920 down_read(&mm->mmap_sem); 5921 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5922 struct mm_walk mem_cgroup_count_precharge_walk = { 5923 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5924 .mm = mm, 5925 .private = vma, 5926 }; 5927 if (is_vm_hugetlb_page(vma)) 5928 continue; 5929 walk_page_range(vma->vm_start, vma->vm_end, 5930 &mem_cgroup_count_precharge_walk); 5931 } 5932 up_read(&mm->mmap_sem); 5933 5934 precharge = mc.precharge; 5935 mc.precharge = 0; 5936 5937 return precharge; 5938 } 5939 5940 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5941 { 5942 unsigned long precharge = mem_cgroup_count_precharge(mm); 5943 5944 VM_BUG_ON(mc.moving_task); 5945 mc.moving_task = current; 5946 return mem_cgroup_do_precharge(precharge); 5947 } 5948 5949 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. 
*/ 5950 static void __mem_cgroup_clear_mc(void) 5951 { 5952 struct mem_cgroup *from = mc.from; 5953 struct mem_cgroup *to = mc.to; 5954 int i; 5955 5956 /* we must uncharge all the leftover precharges from mc.to */ 5957 if (mc.precharge) { 5958 cancel_charge(mc.to, mc.precharge); 5959 mc.precharge = 0; 5960 } 5961 /* 5962 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5963 * we must uncharge here. 5964 */ 5965 if (mc.moved_charge) { 5966 cancel_charge(mc.from, mc.moved_charge); 5967 mc.moved_charge = 0; 5968 } 5969 /* we must fixup refcnts and charges */ 5970 if (mc.moved_swap) { 5971 /* uncharge swap account from the old cgroup */ 5972 res_counter_uncharge(&mc.from->memsw, 5973 PAGE_SIZE * mc.moved_swap); 5974 5975 for (i = 0; i < mc.moved_swap; i++) 5976 css_put(&mc.from->css); 5977 5978 /* 5979 * we charged both to->res and to->memsw, so we should 5980 * uncharge to->res. 5981 */ 5982 res_counter_uncharge(&mc.to->res, 5983 PAGE_SIZE * mc.moved_swap); 5984 /* we've already done css_get(mc.to) */ 5985 mc.moved_swap = 0; 5986 } 5987 memcg_oom_recover(from); 5988 memcg_oom_recover(to); 5989 wake_up_all(&mc.waitq); 5990 } 5991
5992 static void mem_cgroup_clear_mc(void) 5993 { 5994 struct mem_cgroup *from = mc.from; 5995 5996 /* 5997 * we must clear moving_task before waking up waiters at the end of 5998 * task migration. 5999 */ 6000 mc.moving_task = NULL; 6001 __mem_cgroup_clear_mc(); 6002 spin_lock(&mc.lock); 6003 mc.from = NULL; 6004 mc.to = NULL; 6005 spin_unlock(&mc.lock); 6006 mem_cgroup_end_move(from); 6007 } 6008
6009 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 6010 struct cgroup_taskset *tset) 6011 { 6012 struct task_struct *p = cgroup_taskset_first(tset); 6013 int ret = 0; 6014 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6015 unsigned long move_charge_at_immigrate; 6016 6017 /* 6018 * We are now committed to this value whatever it is. Changes in this 6019 * tunable will only affect upcoming migrations, not the current one. 6020 * So we need to save it, and keep it going. 
6021 */ 6022 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 6023 if (move_charge_at_immigrate) { 6024 struct mm_struct *mm; 6025 struct mem_cgroup *from = mem_cgroup_from_task(p); 6026 6027 VM_BUG_ON(from == memcg); 6028 6029 mm = get_task_mm(p); 6030 if (!mm) 6031 return 0; 6032 /* We move charges only when we move an owner of the mm */ 6033 if (mm->owner == p) { 6034 VM_BUG_ON(mc.from); 6035 VM_BUG_ON(mc.to); 6036 VM_BUG_ON(mc.precharge); 6037 VM_BUG_ON(mc.moved_charge); 6038 VM_BUG_ON(mc.moved_swap); 6039 mem_cgroup_start_move(from); 6040 spin_lock(&mc.lock); 6041 mc.from = from; 6042 mc.to = memcg; 6043 mc.immigrate_flags = move_charge_at_immigrate; 6044 spin_unlock(&mc.lock); 6045 /* We set mc.moving_task later */ 6046 6047 ret = mem_cgroup_precharge_mc(mm); 6048 if (ret) 6049 mem_cgroup_clear_mc(); 6050 } 6051 mmput(mm); 6052 } 6053 return ret; 6054 } 6055
6056 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 6057 struct cgroup_taskset *tset) 6058 { 6059 mem_cgroup_clear_mc(); 6060 } 6061
6062 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6063 unsigned long addr, unsigned long end, 6064 struct mm_walk *walk) 6065 { 6066 int ret = 0; 6067 struct vm_area_struct *vma = walk->private; 6068 pte_t *pte; 6069 spinlock_t *ptl; 6070 enum mc_target_type target_type; 6071 union mc_target target; 6072 struct page *page; 6073 struct page_cgroup *pc; 6074 6075 /* 6076 * We don't take compound_lock() here but no race with splitting thp 6077 * happens because: 6078 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 6079 * under splitting, which means there's no concurrent thp split, 6080 * - if another thread runs into split_huge_page() just after we 6081 * entered this if-block, the thread must wait for page table lock 6082 * to be unlocked in __split_huge_page_splitting(), where the main 6083 * part of thp split is not executed yet. 6084 */ 6085 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 6086 if (mc.precharge < HPAGE_PMD_NR) { 6087 spin_unlock(ptl); 6088 return 0; 6089 } 6090 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6091 if (target_type == MC_TARGET_PAGE) { 6092 page = target.page; 6093 if (!isolate_lru_page(page)) { 6094 pc = lookup_page_cgroup(page); 6095 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 6096 pc, mc.from, mc.to)) { 6097 mc.precharge -= HPAGE_PMD_NR; 6098 mc.moved_charge += HPAGE_PMD_NR; 6099 } 6100 putback_lru_page(page); 6101 } 6102 put_page(page); 6103 } 6104 spin_unlock(ptl); 6105 return 0; 6106 } 6107
6108 if (pmd_trans_unstable(pmd)) 6109 return 0; 6110 retry: 6111 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6112 for (; addr != end; addr += PAGE_SIZE) { 6113 pte_t ptent = *(pte++); 6114 swp_entry_t ent; 6115 6116 if (!mc.precharge) 6117 break; 6118 6119 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6120 case MC_TARGET_PAGE: 6121 page = target.page; 6122 if (isolate_lru_page(page)) 6123 goto put; 6124 pc = lookup_page_cgroup(page); 6125 if (!mem_cgroup_move_account(page, 1, pc, 6126 mc.from, mc.to)) { 6127 mc.precharge--; 6128 /* we uncharge from mc.from later. */ 6129 mc.moved_charge++; 6130 } 6131 putback_lru_page(page); 6132 put: /* get_mctgt_type() gets the page */ 6133 put_page(page); 6134 break; 6135 case MC_TARGET_SWAP: 6136 ent = target.ent; 6137 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 6138 mc.precharge--; 6139 /* we fixup refcnts and charges later. 
*/ 6140 mc.moved_swap++; 6141 } 6142 break; 6143 default: 6144 break; 6145 } 6146 } 6147 pte_unmap_unlock(pte - 1, ptl); 6148 cond_resched(); 6149 6150 if (addr != end) { 6151 /* 6152 * We have consumed all precharges we got in can_attach(). 6153 * We try to charge one by one, but don't do any additional 6154 * charges to mc.to if we have failed to charge once in the attach() 6155 * phase. 6156 */ 6157 ret = mem_cgroup_do_precharge(1); 6158 if (!ret) 6159 goto retry; 6160 } 6161 6162 return ret; 6163 } 6164
6165 static void mem_cgroup_move_charge(struct mm_struct *mm) 6166 { 6167 struct vm_area_struct *vma; 6168 6169 lru_add_drain_all(); 6170 retry: 6171 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 6172 /* 6173 * Someone who is holding the mmap_sem might be waiting in 6174 * waitq. So we cancel all extra charges, wake up all waiters, 6175 * and retry. Because we cancel precharges, we might not be able 6176 * to move enough charges, but moving charge is a best-effort 6177 * feature anyway, so it wouldn't be a big problem. 6178 */ 6179 __mem_cgroup_clear_mc(); 6180 cond_resched(); 6181 goto retry; 6182 } 6183 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6184 int ret; 6185 struct mm_walk mem_cgroup_move_charge_walk = { 6186 .pmd_entry = mem_cgroup_move_charge_pte_range, 6187 .mm = mm, 6188 .private = vma, 6189 }; 6190 if (is_vm_hugetlb_page(vma)) 6191 continue; 6192 ret = walk_page_range(vma->vm_start, vma->vm_end, 6193 &mem_cgroup_move_charge_walk); 6194 if (ret) 6195 /* 6196 * This means we have consumed all precharges and failed to 6197 * do an additional charge. Just abandon here. 6198 */ 6199 break; 6200 } 6201 up_read(&mm->mmap_sem); 6202 } 6203
6204 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 6205 struct cgroup_taskset *tset) 6206 { 6207 struct task_struct *p = cgroup_taskset_first(tset); 6208 struct mm_struct *mm = get_task_mm(p); 6209 6210 if (mm) { 6211 if (mc.to) 6212 mem_cgroup_move_charge(mm); 6213 mmput(mm); 6214 } 6215 if (mc.to) 6216 mem_cgroup_clear_mc(); 6217 } 6218 #else /* !CONFIG_MMU */ 6219 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 6220 struct cgroup_taskset *tset) 6221 { 6222 return 0; 6223 } 6224 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 6225 struct cgroup_taskset *tset) 6226 { 6227 } 6228 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 6229 struct cgroup_taskset *tset) 6230 { 6231 } 6232 #endif 6233
6234 /* 6235 * Cgroup retains root cgroups across [un]mount cycles, making it necessary 6236 * to verify whether we're attached to the default hierarchy on each mount 6237 * attempt. 6238 */ 6239 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 6240 { 6241 /* 6242 * use_hierarchy is forced on the default hierarchy. cgroup core 6243 * guarantees that @root doesn't have any children, so turning it 6244 * on for the root memcg is enough. 
6245 */ 6246 if (cgroup_on_dfl(root_css->cgroup)) 6247 mem_cgroup_from_css(root_css)->use_hierarchy = true; 6248 } 6249 6250 struct cgroup_subsys memory_cgrp_subsys = { 6251 .css_alloc = mem_cgroup_css_alloc, 6252 .css_online = mem_cgroup_css_online, 6253 .css_offline = mem_cgroup_css_offline, 6254 .css_free = mem_cgroup_css_free, 6255 .css_reset = mem_cgroup_css_reset, 6256 .can_attach = mem_cgroup_can_attach, 6257 .cancel_attach = mem_cgroup_cancel_attach, 6258 .attach = mem_cgroup_move_task, 6259 .bind = mem_cgroup_bind, 6260 .legacy_cftypes = mem_cgroup_files, 6261 .early_init = 0, 6262 }; 6263 6264 #ifdef CONFIG_MEMCG_SWAP 6265 static int __init enable_swap_account(char *s) 6266 { 6267 if (!strcmp(s, "1")) 6268 really_do_swap_account = 1; 6269 else if (!strcmp(s, "0")) 6270 really_do_swap_account = 0; 6271 return 1; 6272 } 6273 __setup("swapaccount=", enable_swap_account); 6274 6275 static void __init memsw_file_init(void) 6276 { 6277 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 6278 memsw_cgroup_files)); 6279 } 6280 6281 static void __init enable_swap_cgroup(void) 6282 { 6283 if (!mem_cgroup_disabled() && really_do_swap_account) { 6284 do_swap_account = 1; 6285 memsw_file_init(); 6286 } 6287 } 6288 6289 #else 6290 static void __init enable_swap_cgroup(void) 6291 { 6292 } 6293 #endif 6294 6295 #ifdef CONFIG_MEMCG_SWAP 6296 /** 6297 * mem_cgroup_swapout - transfer a memsw charge to swap 6298 * @page: page whose memsw charge to transfer 6299 * @entry: swap entry to move the charge to 6300 * 6301 * Transfer the memsw charge of @page to @entry. 6302 */ 6303 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 6304 { 6305 struct page_cgroup *pc; 6306 unsigned short oldid; 6307 6308 VM_BUG_ON_PAGE(PageLRU(page), page); 6309 VM_BUG_ON_PAGE(page_count(page), page); 6310 6311 if (!do_swap_account) 6312 return; 6313 6314 pc = lookup_page_cgroup(page); 6315 6316 /* Readahead page, never charged */ 6317 if (!PageCgroupUsed(pc)) 6318 return; 6319 6320 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); 6321 6322 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); 6323 VM_BUG_ON_PAGE(oldid, page); 6324 6325 pc->flags &= ~PCG_MEMSW; 6326 css_get(&pc->mem_cgroup->css); 6327 mem_cgroup_swap_statistics(pc->mem_cgroup, true); 6328 } 6329 6330 /** 6331 * mem_cgroup_uncharge_swap - uncharge a swap entry 6332 * @entry: swap entry to uncharge 6333 * 6334 * Drop the memsw charge associated with @entry. 6335 */ 6336 void mem_cgroup_uncharge_swap(swp_entry_t entry) 6337 { 6338 struct mem_cgroup *memcg; 6339 unsigned short id; 6340 6341 if (!do_swap_account) 6342 return; 6343 6344 id = swap_cgroup_record(entry, 0); 6345 rcu_read_lock(); 6346 memcg = mem_cgroup_lookup(id); 6347 if (memcg) { 6348 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 6349 mem_cgroup_swap_statistics(memcg, false); 6350 css_put(&memcg->css); 6351 } 6352 rcu_read_unlock(); 6353 } 6354 #endif 6355 6356 /** 6357 * mem_cgroup_try_charge - try charging a page 6358 * @page: page to charge 6359 * @mm: mm context of the victim 6360 * @gfp_mask: reclaim mode 6361 * @memcgp: charged memcg return 6362 * 6363 * Try to charge @page to the memcg that @mm belongs to, reclaiming 6364 * pages according to @gfp_mask if necessary. 6365 * 6366 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 6367 * Otherwise, an error code is returned. 6368 * 6369 * After page->mapping has been set up, the caller must finalize the 6370 * charge with mem_cgroup_commit_charge(). 
Or abort the transaction 6371 * with mem_cgroup_cancel_charge() in case page instantiation fails. 6372 */ 6373 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 6374 gfp_t gfp_mask, struct mem_cgroup **memcgp) 6375 { 6376 struct mem_cgroup *memcg = NULL; 6377 unsigned int nr_pages = 1; 6378 int ret = 0; 6379 6380 if (mem_cgroup_disabled()) 6381 goto out; 6382 6383 if (PageSwapCache(page)) { 6384 struct page_cgroup *pc = lookup_page_cgroup(page); 6385 /* 6386 * Every swap fault against a single page tries to charge the 6387 * page, bail as early as possible. shmem_unuse() encounters 6388 * already charged pages, too. The USED bit is protected by 6389 * the page lock, which serializes swap cache removal, which 6390 * in turn serializes uncharging. 6391 */ 6392 if (PageCgroupUsed(pc)) 6393 goto out; 6394 } 6395 6396 if (PageTransHuge(page)) { 6397 nr_pages <<= compound_order(page); 6398 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6399 } 6400 6401 if (do_swap_account && PageSwapCache(page)) 6402 memcg = try_get_mem_cgroup_from_page(page); 6403 if (!memcg) 6404 memcg = get_mem_cgroup_from_mm(mm); 6405 6406 ret = try_charge(memcg, gfp_mask, nr_pages); 6407 6408 css_put(&memcg->css); 6409 6410 if (ret == -EINTR) { 6411 memcg = root_mem_cgroup; 6412 ret = 0; 6413 } 6414 out: 6415 *memcgp = memcg; 6416 return ret; 6417 } 6418 6419 /** 6420 * mem_cgroup_commit_charge - commit a page charge 6421 * @page: page to charge 6422 * @memcg: memcg to charge the page to 6423 * @lrucare: page might be on LRU already 6424 * 6425 * Finalize a charge transaction started by mem_cgroup_try_charge(), 6426 * after page->mapping has been set up. This must happen atomically 6427 * as part of the page instantiation, i.e. under the page table lock 6428 * for anonymous pages, under the page lock for page and swap cache. 6429 * 6430 * In addition, the page must not be on the LRU during the commit, to 6431 * prevent racing with task migration. If it might be, use @lrucare. 6432 * 6433 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 6434 */ 6435 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 6436 bool lrucare) 6437 { 6438 unsigned int nr_pages = 1; 6439 6440 VM_BUG_ON_PAGE(!page->mapping, page); 6441 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 6442 6443 if (mem_cgroup_disabled()) 6444 return; 6445 /* 6446 * Swap faults will attempt to charge the same page multiple 6447 * times. But reuse_swap_page() might have removed the page 6448 * from swapcache already, so we can't check PageSwapCache(). 6449 */ 6450 if (!memcg) 6451 return; 6452 6453 commit_charge(page, memcg, lrucare); 6454 6455 if (PageTransHuge(page)) { 6456 nr_pages <<= compound_order(page); 6457 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6458 } 6459 6460 local_irq_disable(); 6461 mem_cgroup_charge_statistics(memcg, page, nr_pages); 6462 memcg_check_events(memcg, page); 6463 local_irq_enable(); 6464 6465 if (do_swap_account && PageSwapCache(page)) { 6466 swp_entry_t entry = { .val = page_private(page) }; 6467 /* 6468 * The swap entry might not get freed for a long time, 6469 * let's not wait for it. The page already received a 6470 * memory+swap charge, drop the swap entry duplicate. 6471 */ 6472 mem_cgroup_uncharge_swap(entry); 6473 } 6474 } 6475 6476 /** 6477 * mem_cgroup_cancel_charge - cancel a page charge 6478 * @page: page to charge 6479 * @memcg: memcg to charge the page to 6480 * 6481 * Cancel a charge transaction started by mem_cgroup_try_charge(). 
6482 */ 6483 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 6484 { 6485 unsigned int nr_pages = 1; 6486 6487 if (mem_cgroup_disabled()) 6488 return; 6489 /* 6490 * Swap faults will attempt to charge the same page multiple 6491 * times. But reuse_swap_page() might have removed the page 6492 * from swapcache already, so we can't check PageSwapCache(). 6493 */ 6494 if (!memcg) 6495 return; 6496 6497 if (PageTransHuge(page)) { 6498 nr_pages <<= compound_order(page); 6499 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6500 } 6501 6502 cancel_charge(memcg, nr_pages); 6503 } 6504 6505 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 6506 unsigned long nr_mem, unsigned long nr_memsw, 6507 unsigned long nr_anon, unsigned long nr_file, 6508 unsigned long nr_huge, struct page *dummy_page) 6509 { 6510 unsigned long flags; 6511 6512 if (nr_mem) 6513 res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); 6514 if (nr_memsw) 6515 res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); 6516 6517 memcg_oom_recover(memcg); 6518 6519 local_irq_save(flags); 6520 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 6521 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 6522 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 6523 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 6524 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); 6525 memcg_check_events(memcg, dummy_page); 6526 local_irq_restore(flags); 6527 } 6528 6529 static void uncharge_list(struct list_head *page_list) 6530 { 6531 struct mem_cgroup *memcg = NULL; 6532 unsigned long nr_memsw = 0; 6533 unsigned long nr_anon = 0; 6534 unsigned long nr_file = 0; 6535 unsigned long nr_huge = 0; 6536 unsigned long pgpgout = 0; 6537 unsigned long nr_mem = 0; 6538 struct list_head *next; 6539 struct page *page; 6540 6541 next = page_list->next; 6542 do { 6543 unsigned int nr_pages = 1; 6544 struct page_cgroup *pc; 6545 6546 page = list_entry(next, struct page, lru); 6547 next = page->lru.next; 6548 6549 VM_BUG_ON_PAGE(PageLRU(page), page); 6550 VM_BUG_ON_PAGE(page_count(page), page); 6551 6552 pc = lookup_page_cgroup(page); 6553 if (!PageCgroupUsed(pc)) 6554 continue; 6555 6556 /* 6557 * Nobody should be changing or seriously looking at 6558 * pc->mem_cgroup and pc->flags at this point, we have 6559 * fully exclusive access to the page. 6560 */ 6561 6562 if (memcg != pc->mem_cgroup) { 6563 if (memcg) { 6564 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 6565 nr_anon, nr_file, nr_huge, page); 6566 pgpgout = nr_mem = nr_memsw = 0; 6567 nr_anon = nr_file = nr_huge = 0; 6568 } 6569 memcg = pc->mem_cgroup; 6570 } 6571 6572 if (PageTransHuge(page)) { 6573 nr_pages <<= compound_order(page); 6574 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6575 nr_huge += nr_pages; 6576 } 6577 6578 if (PageAnon(page)) 6579 nr_anon += nr_pages; 6580 else 6581 nr_file += nr_pages; 6582 6583 if (pc->flags & PCG_MEM) 6584 nr_mem += nr_pages; 6585 if (pc->flags & PCG_MEMSW) 6586 nr_memsw += nr_pages; 6587 pc->flags = 0; 6588 6589 pgpgout++; 6590 } while (next != page_list); 6591 6592 if (memcg) 6593 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 6594 nr_anon, nr_file, nr_huge, page); 6595 } 6596 6597 /** 6598 * mem_cgroup_uncharge - uncharge a page 6599 * @page: page to uncharge 6600 * 6601 * Uncharge a page previously charged with mem_cgroup_try_charge() and 6602 * mem_cgroup_commit_charge(). 
6603 */ 6604 void mem_cgroup_uncharge(struct page *page) 6605 { 6606 struct page_cgroup *pc; 6607 6608 if (mem_cgroup_disabled()) 6609 return; 6610 6611 /* Don't touch page->lru of any random page, pre-check: */ 6612 pc = lookup_page_cgroup(page); 6613 if (!PageCgroupUsed(pc)) 6614 return; 6615 6616 INIT_LIST_HEAD(&page->lru); 6617 uncharge_list(&page->lru); 6618 } 6619
6620 /** 6621 * mem_cgroup_uncharge_list - uncharge a list of pages 6622 * @page_list: list of pages to uncharge 6623 * 6624 * Uncharge a list of pages previously charged with 6625 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 6626 */ 6627 void mem_cgroup_uncharge_list(struct list_head *page_list) 6628 { 6629 if (mem_cgroup_disabled()) 6630 return; 6631 6632 if (!list_empty(page_list)) 6633 uncharge_list(page_list); 6634 } 6635
6636 /** 6637 * mem_cgroup_migrate - migrate a charge to another page 6638 * @oldpage: currently charged page 6639 * @newpage: page to transfer the charge to 6640 * @lrucare: both pages might be on the LRU already 6641 * 6642 * Migrate the charge from @oldpage to @newpage. 6643 * 6644 * Both pages must be locked, @newpage->mapping must be set up. 6645 */ 6646 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 6647 bool lrucare) 6648 { 6649 struct page_cgroup *pc; 6650 int isolated; 6651 6652 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6653 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6654 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); 6655 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); 6656 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6657 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6658 newpage); 6659 6660 if (mem_cgroup_disabled()) 6661 return; 6662 6663 /* Page cache replacement: new page already charged? */ 6664 pc = lookup_page_cgroup(newpage); 6665 if (PageCgroupUsed(pc)) 6666 return; 6667 6668 /* Re-entrant migration: old page already uncharged? */ 6669 pc = lookup_page_cgroup(oldpage); 6670 if (!PageCgroupUsed(pc)) 6671 return; 6672 6673 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); 6674 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); 6675 6676 if (lrucare) 6677 lock_page_lru(oldpage, &isolated); 6678 6679 pc->flags = 0; 6680 6681 if (lrucare) 6682 unlock_page_lru(oldpage, isolated); 6683 6684 commit_charge(newpage, pc->mem_cgroup, lrucare); 6685 } 6686
6687 /* 6688 * subsys_initcall() for memory controller. 6689 * 6690 * Some parts like hotcpu_notifier() have to be initialized from this context 6691 * because of lock dependencies (cgroup_lock -> cpu hotplug), but basically 6692 * everything that doesn't depend on a specific mem_cgroup structure should 6693 * be initialized from here. 6694 */ 6695 static int __init mem_cgroup_init(void) 6696 { 6697 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 6698 enable_swap_cgroup(); 6699 mem_cgroup_soft_limit_tree_init(); 6700 memcg_stock_init(); 6701 return 0; 6702 } 6703 subsys_initcall(mem_cgroup_init); 6704
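/*
 * Rough map of the move-charge path above (a descriptive sketch only; the
 * callback ordering is dictated by the cgroup core through the
 * memory_cgrp_subsys callbacks, not by this file):
 *
 *	mem_cgroup_can_attach()
 *	    snapshots move_charge_at_immigrate, sets up mc.from/mc.to and
 *	    precharges via mem_cgroup_precharge_mc()
 *	mem_cgroup_move_task()			(.attach)
 *	    walks the mm with mem_cgroup_move_charge(), then drops any
 *	    leftover precharges in mem_cgroup_clear_mc()
 *	mem_cgroup_cancel_attach()
 *	    clears the move state if the migration is aborted
 */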
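/*
 * Note on the "swapaccount=" boot parameter handled by enable_swap_account()
 * above (a summary of the existing behaviour, nothing new): "swapaccount=0"
 * forces really_do_swap_account to 0, overriding a CONFIG_MEMCG_SWAP_ENABLED
 * default, and "swapaccount=1" forces it to 1. Memory+swap accounting
 * (do_swap_account) is then turned on from enable_swap_cgroup() only if the
 * memory controller itself is not disabled.
 */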
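/*
 * Minimal sketch of the charge transaction documented in the kerneldoc for
 * mem_cgroup_try_charge(), mem_cgroup_commit_charge() and
 * mem_cgroup_cancel_charge(). This is an illustrative caller only;
 * instantiate_page() is a hypothetical placeholder for whatever step actually
 * instantiates the page (page cache insertion, rmap setup, ...):
 *
 *	struct mem_cgroup *memcg;
 *	int error;
 *
 *	error = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg);
 *	if (error)
 *		return error;
 *
 *	error = instantiate_page(page);		// hypothetical instantiation step
 *	if (error) {
 *		mem_cgroup_cancel_charge(page, memcg);
 *		return error;
 *	}
 *
 *	mem_cgroup_commit_charge(page, memcg, false);	// false: page not on LRU yet
 */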