/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* for remembering the boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata;
#endif

#else
#define do_swap_account		0
#endif


static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"writeback",
	"swap",
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * The per-memcg event counter is incremented at every pagein/pageout.
 * With THP, it will be incremented by the number of pages. This counter is
 * used to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	int last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal. This callback must be set,
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	/* css_online() has been completed */
	int initialized;

	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;

	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */

	bool		oom_lock;
	atomic_t	under_oom;
	atomic_t	oom_wakeups;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup? And what type of charges should we move?
	 */
	unsigned long move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to another cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or in other synchronizations.
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

	atomic_t	dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list, but per-memcg;
	 * protected by memcg_slab_mutex */
	struct list_head memcg_slab_caches;
	/* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	/* List of events which userspace wants to receive */
	struct list_head event_list;
	spinlock_t event_list_lock;

	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};

/* internal only representation about the status of kmem accounting. */
enum {
	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
	/*
	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
	 * will call css_put() if it sees the memcg is dead.
	 */
	smp_wmb();
	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}

static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
{
	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
				  &memcg->kmem_account_flags);
}
#endif

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long immigrate_flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
}
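/*
 * Illustrative note (not from the original source): move_charge_at_immigrate
 * is interpreted as a bitmap of the move_type values above, so bit 0
 * (MOVE_CHARGE_TYPE_ANON) selects private anonymous pages and bit 1
 * (MOVE_CHARGE_TYPE_FILE) selects file/tmpfs pages. For example, writing 3
 * to memory.move_charge_at_immigrate is expected to make both move_anon()
 * and move_file() above return true once the flags have been copied into
 * mc.immigrate_flags for a task move.
 */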
/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child cgroups
 * appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);

struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct mem_cgroup, css) : NULL;
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

/*
 * We restrict the id in the range of [1, 65535], so it can fit into
 * an unsigned short.
 */
#define MEM_CGROUP_ID_MAX	USHRT_MAX

static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
	return memcg->css.id;
}

static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
	struct cgroup_subsys_state *css;

	css = css_from_id(id, &memory_cgrp_subsys);
	return mem_cgroup_from_css(css);
}
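/*
 * Illustrative example (not from the original source): the MEMFILE_* macros
 * above pack a resource type and an attribute into a single cft->private
 * value. For instance, MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) stores _MEMSWAP
 * in the upper 16 bits and RES_LIMIT in the lower 16 bits; MEMFILE_TYPE()
 * and MEMFILE_ATTR() recover the two halves when the file handler runs.
 */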
/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)

void sock_update_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled) {
		struct mem_cgroup *memcg;
		struct cg_proto *cg_proto;

		BUG_ON(!sk->sk_prot->proto_cgroup);

		/* Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't, however, necessarily happen from
		 * process context. So the test for root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
		 */
		if (sk->sk_cgrp) {
			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
			css_get(&sk->sk_cgrp->memcg->css);
			return;
		}

		rcu_read_lock();
		memcg = mem_cgroup_from_task(current);
		cg_proto = sk->sk_prot->proto_cgroup(memcg);
		if (!mem_cgroup_is_root(memcg) &&
		    memcg_proto_active(cg_proto) &&
		    css_tryget_online(&memcg->css)) {
			sk->sk_cgrp = cg_proto;
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(sock_update_memcg);

void sock_release_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
		struct mem_cgroup *memcg;
		WARN_ON(!sk->sk_cgrp->memcg);
		memcg = sk->sk_cgrp->memcg;
		css_put(&sk->sk_cgrp->memcg->css);
	}
}

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg || mem_cgroup_is_root(memcg))
		return NULL;

	return &memcg->tcp_mem;
}
EXPORT_SYMBOL(tcp_proto_cgroup);

static void disarm_sock_keys(struct mem_cgroup *memcg)
{
	if (!memcg_proto_activated(&memcg->tcp_mem))
		return;
	static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif

#ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
 * The main reason for not using the cgroup id for this:
 * this works better in sparse environments, where we have a lot of memcgs,
 * but only a few are kmem-limited. Or also, if we have, for instance, 200
 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
 * 200 entry array for that.
 *
 * The current size of the caches array is stored in
 * memcg_limited_groups_array_size. It will double each time we have to
 * increase it.
 */
static DEFINE_IDA(kmem_limited_groups);
int memcg_limited_groups_array_size;

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
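/*
 * Illustrative example (not from the original source): with
 * MEMCG_CACHES_MIN_SIZE == 4, the first four kmem-limited memcgs receive
 * indexes 0-3 from the kmem_limited_groups IDA and fit in the initial
 * per-cache memcg_caches array. Making a fifth memcg kmem-limited would
 * force memcg_limited_groups_array_size to double (4 -> 8, then 8 -> 16,
 * and so on), as described in the comments above.
 */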
/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);

static void memcg_free_cache_id(int id);

static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
	if (memcg_kmem_is_active(memcg)) {
		static_key_slow_dec(&memcg_kmem_enabled_key);
		memcg_free_cache_id(memcg->kmemcg_id);
	}
	/*
	 * This check can't live in the kmem destruction function,
	 * since the charges will outlive the cgroup.
	 */
	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
}
#else
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */

static void disarm_static_keys(struct mem_cgroup *memcg)
{
	disarm_sock_keys(memcg);
	disarm_kmem_keys(memcg);
}

static void drain_all_stock_async(struct mem_cgroup *memcg);

static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
	return &memcg->css;
}

static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
					 struct mem_cgroup_tree_per_zone *mctz,
					 unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
					 struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}
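/*
 * Illustrative note (not from the original source): the soft limit tree is
 * keyed by usage_in_excess, so the rightmost node is always the worst
 * offender, which is what __mem_cgroup_largest_soft_limit_node() below picks
 * first. For example, if three groups on a zone exceed their soft limits by
 * 100, 300 and 300 pages, the 100-page group sits leftmost and one of the
 * 300-page groups is reclaimed from first; equal keys deliberately go to the
 * right so they are not skipped.
 */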
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
				       struct mem_cgroup_tree_per_zone *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}


static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	mctz = soft_limit_tree_from_page(page);
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_zoneinfo(memcg, page);
		excess = res_counter_soft_limit_excess(&memcg->res);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * mem is over its soft limit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_zone *mctz;
	struct mem_cgroup_per_zone *mz;
	int nid, zid;

	for_each_node(nid) {
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			mctz = soft_limit_tree_node_zone(nid, zid);
			mem_cgroup_remove_exceeded(mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter have thresholds and do periodic
 * synchronization to implement "quick" reads. There is a trade-off between
 * reading cost and precision of the value. Then, we may have a chance to
 * implement a periodic synchronization of the counters in memcg's counter.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact
 * value because he accounts memory. Even if we provide a quick-and-fuzzy
 * read, we always have to visit all online cpus and make the sum. So, for
 * now, unnecessary synchronization is not implemented
 * (it is just implemented for cpu hotplug).
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, thresholds and synchronization as in vmstat[] should be
 * implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on the ANON LRU.
	 */
	if (PageAnon(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* pagein of a big page is one event. So, ignore the page size. */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
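/*
 * Illustrative example (not from the original source): charging an anonymous
 * transparent huge page calls mem_cgroup_charge_statistics() with nr_pages
 * equal to the number of base pages in the THP (512 with 4KB pages and a 2MB
 * huge page). That single call adds 512 to both the RSS and RSS_HUGE
 * counters, bumps PGPGIN by exactly one, and adds 512 to nr_page_events,
 * which is what later drives memcg_check_events().
 */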
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}

static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
						  int nid,
						  unsigned int lru_mask)
{
	unsigned long nr = 0;
	int zid;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct mem_cgroup_per_zone *mz;
		enum lru_list lru;

		for_each_lru(lru) {
			if (!(BIT(lru) & lru_mask))
				continue;
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			nr += mz->lru_size[lru];
		}
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
					     unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
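/*
 * Illustrative example (not from the original source): because the per-cpu
 * nr_page_events counter feeds mem_cgroup_event_ratelimit() above, threshold
 * notifications are re-evaluated roughly every THRESHOLDS_EVENTS_TARGET (128)
 * page events on a cpu, while soft-limit tree updates and NUMA info refreshes
 * only fire about every SOFTLIMIT_EVENTS_TARGET / NUMAINFO_EVENTS_TARGET
 * (1024) page events, keeping the common charge path cheap.
 */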
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

/*
 * Returns the next (in a pre-order walk) alive memcg (with elevated css
 * ref. count) or NULL if the whole root's subtree has been visited.
 *
 * helper function to be used by mem_cgroup_iter
 */
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
		struct mem_cgroup *last_visited)
{
	struct cgroup_subsys_state *prev_css, *next_css;

	prev_css = last_visited ? &last_visited->css : NULL;
skip_node:
	next_css = css_next_descendant_pre(prev_css, &root->css);

	/*
	 * Even if we found a group we have to make sure it is
	 * alive. css && !memcg means that the groups should be
	 * skipped and we should continue the tree walk.
	 * last_visited css is safe to use because it is
	 * protected by css_get and the tree walk is rcu safe.
	 *
	 * We do not take a reference on the root of the tree walk
	 * because we might race with the root removal when it would
	 * be the only node in the iterated hierarchy and mem_cgroup_iter
	 * would end up in an endless loop because it expects that at
	 * least one valid node will be returned. Root cannot disappear
	 * because the caller of the iterator should hold it already, so
	 * skipping the css reference should be safe.
	 */
	if (next_css) {
		struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);

		if (next_css == &root->css)
			return memcg;

		if (css_tryget_online(next_css)) {
			/*
			 * Make sure the memcg is initialized:
			 * mem_cgroup_css_online() orders the
			 * initialization against setting the flag.
			 */
			if (smp_load_acquire(&memcg->initialized))
				return memcg;
			css_put(next_css);
		}

		prev_css = next_css;
		goto skip_node;
	}

	return NULL;
}

static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
{
	/*
	 * When a group in the hierarchy below root is destroyed, the
	 * hierarchy iterator can no longer be trusted since it might
	 * have pointed to the destroyed group. Invalidate it.
	 */
	atomic_inc(&root->dead_count);
}

static struct mem_cgroup *
mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
		     struct mem_cgroup *root,
		     int *sequence)
{
	struct mem_cgroup *position = NULL;
	/*
	 * A cgroup destruction happens in two stages: offlining and
	 * release. They are separated by a RCU grace period.
	 *
	 * If the iterator is valid, we may still race with an
	 * offlining. The RCU lock ensures the object won't be
	 * released, tryget will fail if we lost the race.
	 */
	*sequence = atomic_read(&root->dead_count);
	if (iter->last_dead_count == *sequence) {
		smp_rmb();
		position = iter->last_visited;

		/*
		 * We cannot take a reference to root because we might race
		 * with root removal and returning NULL would end up in
		 * an endless loop on the iterator user level when root
		 * would be returned all the time.
		 */
		if (position && position != root &&
		    !css_tryget_online(&position->css))
			position = NULL;
	}
	return position;
}

static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
				   struct mem_cgroup *last_visited,
				   struct mem_cgroup *new_position,
				   struct mem_cgroup *root,
				   int sequence)
{
	/* root reference counting symmetric to mem_cgroup_iter_load */
	if (last_visited && last_visited != root)
		css_put(&last_visited->css);
	/*
	 * We store the sequence count from the time @last_visited was
	 * loaded successfully instead of rereading it here so that we
	 * don't lose destruction events in between. We could have
	 * raced with the destruction of @new_position after all.
	 */
	iter->last_visited = new_position;
	smp_wmb();
	iter->last_dead_count = sequence;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *last_visited = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		last_visited = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out_css_put;
		return root;
	}

	rcu_read_lock();
	while (!memcg) {
		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
		int uninitialized_var(seq);

		if (reclaim) {
			struct mem_cgroup_per_zone *mz;

			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
			iter = &mz->reclaim_iter[reclaim->priority];
			if (prev && reclaim->generation != iter->generation) {
				iter->last_visited = NULL;
				goto out_unlock;
			}

			last_visited = mem_cgroup_iter_load(iter, root, &seq);
		}

		memcg = __mem_cgroup_iter_next(root, last_visited);

		if (reclaim) {
			mem_cgroup_iter_update(iter, last_visited, memcg, root,
					seq);

			if (!memcg)
				iter->generation++;
			else if (!prev && memcg)
				reclaim->generation = iter->generation;
		}

		if (prev && !memcg)
			goto out_unlock;
	}
out_unlock:
	rcu_read_unlock();
out_css_put:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}
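/*
 * Illustrative usage sketch (not from the original source): a partial walk
 * over a hierarchy is expected to look like the loop below, releasing the
 * reference via mem_cgroup_iter_break() when bailing out early.
 *
 *	struct mem_cgroup *iter;
 *
 *	for (iter = mem_cgroup_iter(root, NULL, NULL); iter;
 *	     iter = mem_cgroup_iter(root, iter, NULL)) {
 *		if (should_stop(iter)) {	// should_stop() is hypothetical
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 *
 * The for_each_mem_cgroup_tree() helpers below wrap exactly this pattern.
 */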
/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!memcg))
		goto out;

	switch (idx) {
	case PGFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
		break;
	case PGMAJFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
		break;
	default:
		BUG();
	}
out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mem_cgroup_count_vm_event);

/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg. This can be the global zone lruvec, if the memory controller
 * is disabled.
 */
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
				      struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_zone *mz;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/**
 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
 * @page: the page
 * @zone: zone of the page
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct page_cgroup *pc;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	pc = lookup_page_cgroup(page);
	memcg = pc->mem_cgroup;

	/*
	 * Surreptitiously switch any uncharged offlist page to root:
	 * an uncharged page off lru does nothing to secure
	 * its former mem_cgroup from sudden removal.
	 *
	 * Our caller holds lru_lock, and PageCgroupUsed is updated
	 * under page_cgroup lock: between them, they make all uses
	 * of pc->mem_cgroup safe.
	 */
	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
		pc->mem_cgroup = memcg = root_mem_cgroup;

	mz = mem_cgroup_page_zoneinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called when a page is added to or removed from an
 * lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int nr_pages)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long *lru_size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	lru_size = mz->lru_size + lru;
	*lru_size += nr_pages;
	VM_BUG_ON((long)(*lru_size) < 0);
}

/*
 * Checks whether the given memcg is the same as, or within,
 * root_memcg's hierarchy subtree.
 */
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				  struct mem_cgroup *memcg)
{
	if (root_memcg == memcg)
		return true;
	if (!root_memcg->use_hierarchy || !memcg)
		return false;
	return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
}

static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				       struct mem_cgroup *memcg)
{
	bool ret;

	rcu_read_lock();
	ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
	rcu_read_unlock();
	return ret;
}

bool task_in_mem_cgroup(struct task_struct *task,
			const struct mem_cgroup *memcg)
{
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		curr = get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		curr = mem_cgroup_from_task(task);
		if (curr)
			css_get(&curr->css);
		rcu_read_unlock();
	}
	/*
	 * We should check use_hierarchy of "memcg", not "curr". Checking
	 * use_hierarchy of "curr" here would make this function return true
	 * if hierarchy is enabled in "curr" and "curr" is a child of "memcg"
	 * in the *cgroup* hierarchy (even if use_hierarchy is disabled in
	 * "memcg").
	 */
	ret = mem_cgroup_same_or_subtree(memcg, curr);
	css_put(&curr->css);
	return ret;
}

int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
	unsigned long inactive_ratio;
	unsigned long inactive;
	unsigned long active;
	unsigned long gb;

	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	return inactive * inactive_ratio < active;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)
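/*
 * Illustrative example (not from the original source): in
 * mem_cgroup_inactive_anon_is_low() above, a memcg/zone pair holding 4GB of
 * anonymous pages gets inactive_ratio = int_sqrt(10 * 4) = 6, so the
 * inactive anon list is considered low only while inactive * 6 < active.
 * Below 1GB the ratio falls back to 1.
 */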
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long long margin;

	margin = res_counter_margin(&memcg->res);
	if (do_swap_account)
		margin = min(margin, res_counter_margin(&memcg->memsw));
	return margin >> PAGE_SHIFT;
}

int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
	/* root ? */
	if (mem_cgroup_disabled() || !memcg->css.parent)
		return vm_swappiness;

	return memcg->swappiness;
}

/*
 * memcg->moving_account is used for checking the possibility that some thread
 * is calling move_account(). When a thread on CPU-A starts moving pages under
 * a memcg, other threads should check memcg->moving_account under
 * rcu_read_lock(), like this:
 *
 *	CPU-A					CPU-B
 *						rcu_read_lock()
 *	memcg->moving_account+1			if (memcg->moving_account)
 *							take heavy locks.
 *	synchronize_rcu()			update something.
 *						rcu_read_unlock()
 *	start move here.
 */

/* for quick checking without looking up memcg */
atomic_t memcg_moving __read_mostly;

static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg_moving);
	atomic_inc(&memcg->moving_account);
	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
	/*
	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
	 * We check NULL in the callee rather than in the caller.
	 */
	if (memcg) {
		atomic_dec(&memcg_moving);
		atomic_dec(&memcg->moving_account);
	}
}

/*
 * A routine for checking whether "mem" is under move_account() or not.
 *
 * Checking whether a cgroup is mc.from, mc.to, or under the hierarchy of
 * moving cgroups. This is for waiting at high memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take a spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_same_or_subtree(memcg, from)
		|| mem_cgroup_same_or_subtree(memcg, to);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

/*
 * Take this lock when
 * - a code path tries to modify a page's memcg while it's USED.
 * - a code path tries to modify page state accounting in a memcg.
 */
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
				 unsigned long *flags)
{
	spin_lock_irqsave(&memcg->move_lock, *flags);
}

static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
				   unsigned long *flags)
{
	spin_unlock_irqrestore(&memcg->move_lock, *flags);
}

#define K(x) ((x) << (PAGE_SHIFT-10))
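/*
 * Illustrative example (not from the original source): with 4KB pages
 * (PAGE_SHIFT == 12), K(x) converts a page count into kilobytes, so
 * K(256) == 1024, i.e. 1MB. Similarly, mem_cgroup_margin() above reports
 * pages: a cgroup limited to 512MB that currently uses 300MB has a margin
 * of (512 - 300) * 256 = 54272 pages when swap accounting is disabled.
 */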
/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to the memory
 * controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	/* oom_info_lock ensures that parallel ooms do not interleave */
	static DEFINE_MUTEX(oom_info_lock);
	struct mem_cgroup *iter;
	unsigned int i;

	if (!p)
		return;

	mutex_lock(&oom_info_lock);
	rcu_read_lock();

	pr_info("Task in ");
	pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	pr_info(" killed as a result of limit of ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_info("\n");

	rcu_read_unlock();

	pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
	pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));

	for_each_mem_cgroup_tree(iter, memcg) {
		pr_info("Memory cgroup stats for ");
		pr_cont_cgroup_path(iter->css.cgroup);
		pr_cont(":");

		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
				continue;
			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
				K(mem_cgroup_read_stat(iter, i)));
		}

		for (i = 0; i < NR_LRU_LISTS; i++)
			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));

		pr_cont("\n");
	}
	mutex_unlock(&oom_info_lock);
}

/*
 * This function returns the number of memcgs under the hierarchy tree.
 * Returns 1 (self count) if no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		num++;
	return num;
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

	/*
	 * Do not consider swap space if we cannot swap due to swappiness
	 */
	if (mem_cgroup_swappiness(memcg)) {
		u64 memsw;

		limit += total_swap_pages << PAGE_SHIFT;
		memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

		/*
		 * If memsw is finite and limits the amount of swap space
		 * available to this memcg, return that limit.
		 */
		limit = min(limit, memsw);
	}

	return limit;
}

static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct mem_cgroup *iter;
	unsigned long chosen_points = 0;
	unsigned long totalpages;
	unsigned int points = 0;
	struct task_struct *chosen = NULL;

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.
	 * The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, &it);
		while ((task = css_task_iter_next(&it))) {
			switch (oom_scan_process_thread(task, totalpages, NULL,
							false)) {
			case OOM_SCAN_SELECT:
				if (chosen)
					put_task_struct(chosen);
				chosen = task;
				chosen_points = ULONG_MAX;
				get_task_struct(chosen);
				/* fall through */
			case OOM_SCAN_CONTINUE:
				continue;
			case OOM_SCAN_ABORT:
				css_task_iter_end(&it);
				mem_cgroup_iter_break(memcg, iter);
				if (chosen)
					put_task_struct(chosen);
				return;
			case OOM_SCAN_OK:
				break;
			};
			points = oom_badness(task, memcg, NULL, totalpages);
			if (!points || points < chosen_points)
				continue;
			/* Prefer thread group leaders for display purposes */
			if (points == chosen_points &&
			    thread_group_leader(chosen))
				continue;

			if (chosen)
				put_task_struct(chosen);
			chosen = task;
			chosen_points = points;
			get_task_struct(chosen);
		}
		css_task_iter_end(&it);
	}

	if (!chosen)
		return;
	points = chosen_points * 1000 / totalpages;
	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
			 NULL, "Memory cgroup out of memory");
}

/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap: specify true here if the user wants file-only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
		int nid, bool noswap)
{
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
		return true;
	if (noswap || !total_swap_pages)
		return false;
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
		return true;
	return false;

}
#if MAX_NUMNODES > 1

/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist. So update the list loosely once per 10 secs.
 *
 */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	memcg->scan_nodes = node_states[N_MEMORY];

	for_each_node_mask(nid, node_states[N_MEMORY]) {

		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
			node_clear(nid, memcg->scan_nodes);
	}

	atomic_set(&memcg->numainfo_events, 0);
	atomic_set(&memcg->numainfo_updating, 0);
}

/*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is OK. Considering
 * memory reclaim from the current node, there are pros and cons:
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used. So, it may make the LRU bad. And if several threads
 * hit limits, they will see contention on a node. But freeing from a remote
 * node means more costs for memory reclaim because of memory latency.
 *
 * Now, we use round-robin. A better algorithm is welcomed.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

	node = next_node(node, memcg->scan_nodes);
	if (node == MAX_NUMNODES)
		node = first_node(memcg->scan_nodes);
	/*
	 * We call this when we hit the limit, not when pages are added to the
	 * LRU. No LRU may hold pages because all pages are UNEVICTABLE or
	 * the memcg is too small and all pages are not on the LRU. In that
	 * case, we use the current node.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	memcg->last_scanned_node = node;
	return node;
}
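/*
 * Illustrative example (not from the original source): if scan_nodes
 * currently contains nodes {0, 2, 3} and last_scanned_node is 2,
 * mem_cgroup_select_victim_node() above returns 3; the next call wraps
 * around via first_node() and returns 0. If the nodemask is empty (all
 * nodes were found unreclaimable), it falls back to numa_node_id().
 */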
1915 */ 1916 for_each_node_state(nid, N_MEMORY) { 1917 if (node_isset(nid, memcg->scan_nodes)) 1918 continue; 1919 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1920 return true; 1921 } 1922 return false; 1923 } 1924 1925 #else 1926 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1927 { 1928 return 0; 1929 } 1930 1931 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1932 { 1933 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1934 } 1935 #endif 1936 1937 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1938 struct zone *zone, 1939 gfp_t gfp_mask, 1940 unsigned long *total_scanned) 1941 { 1942 struct mem_cgroup *victim = NULL; 1943 int total = 0; 1944 int loop = 0; 1945 unsigned long excess; 1946 unsigned long nr_scanned; 1947 struct mem_cgroup_reclaim_cookie reclaim = { 1948 .zone = zone, 1949 .priority = 0, 1950 }; 1951 1952 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1953 1954 while (1) { 1955 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1956 if (!victim) { 1957 loop++; 1958 if (loop >= 2) { 1959 /* 1960 * If we have not been able to reclaim 1961 * anything, it might because there are 1962 * no reclaimable pages under this hierarchy 1963 */ 1964 if (!total) 1965 break; 1966 /* 1967 * We want to do more targeted reclaim. 1968 * excess >> 2 is not to excessive so as to 1969 * reclaim too much, nor too less that we keep 1970 * coming back to reclaim from this cgroup 1971 */ 1972 if (total >= (excess >> 2) || 1973 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1974 break; 1975 } 1976 continue; 1977 } 1978 if (!mem_cgroup_reclaimable(victim, false)) 1979 continue; 1980 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1981 zone, &nr_scanned); 1982 *total_scanned += nr_scanned; 1983 if (!res_counter_soft_limit_excess(&root_memcg->res)) 1984 break; 1985 } 1986 mem_cgroup_iter_break(root_memcg, victim); 1987 return total; 1988 } 1989 1990 #ifdef CONFIG_LOCKDEP 1991 static struct lockdep_map memcg_oom_lock_dep_map = { 1992 .name = "memcg_oom_lock", 1993 }; 1994 #endif 1995 1996 static DEFINE_SPINLOCK(memcg_oom_lock); 1997 1998 /* 1999 * Check OOM-Killer is already running under our hierarchy. 2000 * If someone is running, return false. 2001 */ 2002 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 2003 { 2004 struct mem_cgroup *iter, *failed = NULL; 2005 2006 spin_lock(&memcg_oom_lock); 2007 2008 for_each_mem_cgroup_tree(iter, memcg) { 2009 if (iter->oom_lock) { 2010 /* 2011 * this subtree of our hierarchy is already locked 2012 * so we cannot give a lock. 
2013 */ 2014 failed = iter; 2015 mem_cgroup_iter_break(memcg, iter); 2016 break; 2017 } else 2018 iter->oom_lock = true; 2019 } 2020 2021 if (failed) { 2022 /* 2023 * OK, we failed to lock the whole subtree so we have 2024 * to clean up what we set up to the failing subtree 2025 */ 2026 for_each_mem_cgroup_tree(iter, memcg) { 2027 if (iter == failed) { 2028 mem_cgroup_iter_break(memcg, iter); 2029 break; 2030 } 2031 iter->oom_lock = false; 2032 } 2033 } else 2034 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 2035 2036 spin_unlock(&memcg_oom_lock); 2037 2038 return !failed; 2039 } 2040 2041 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 2042 { 2043 struct mem_cgroup *iter; 2044 2045 spin_lock(&memcg_oom_lock); 2046 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 2047 for_each_mem_cgroup_tree(iter, memcg) 2048 iter->oom_lock = false; 2049 spin_unlock(&memcg_oom_lock); 2050 } 2051 2052 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 2053 { 2054 struct mem_cgroup *iter; 2055 2056 for_each_mem_cgroup_tree(iter, memcg) 2057 atomic_inc(&iter->under_oom); 2058 } 2059 2060 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 2061 { 2062 struct mem_cgroup *iter; 2063 2064 /* 2065 * When a new child is created while the hierarchy is under oom, 2066 * mem_cgroup_oom_lock() may not be called. We have to use 2067 * atomic_add_unless() here. 2068 */ 2069 for_each_mem_cgroup_tree(iter, memcg) 2070 atomic_add_unless(&iter->under_oom, -1, 0); 2071 } 2072 2073 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 2074 2075 struct oom_wait_info { 2076 struct mem_cgroup *memcg; 2077 wait_queue_t wait; 2078 }; 2079 2080 static int memcg_oom_wake_function(wait_queue_t *wait, 2081 unsigned mode, int sync, void *arg) 2082 { 2083 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2084 struct mem_cgroup *oom_wait_memcg; 2085 struct oom_wait_info *oom_wait_info; 2086 2087 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2088 oom_wait_memcg = oom_wait_info->memcg; 2089 2090 /* 2091 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 2092 * Then we can use css_is_ancestor without taking care of RCU. 2093 */ 2094 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2095 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2096 return 0; 2097 return autoremove_wake_function(wait, mode, sync, arg); 2098 } 2099 2100 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2101 { 2102 atomic_inc(&memcg->oom_wakeups); 2103 /* for filtering, pass "memcg" as argument. */ 2104 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2105 } 2106 2107 static void memcg_oom_recover(struct mem_cgroup *memcg) 2108 { 2109 if (memcg && atomic_read(&memcg->under_oom)) 2110 memcg_wakeup_oom(memcg); 2111 } 2112 2113 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2114 { 2115 if (!current->memcg_oom.may_oom) 2116 return; 2117 /* 2118 * We are in the middle of the charge context here, so we 2119 * don't want to block when potentially sitting on a callstack 2120 * that holds all kinds of filesystem and mm locks. 2121 * 2122 * Also, the caller may handle a failed allocation gracefully 2123 * (like optional page cache readahead) and so an OOM killer 2124 * invocation might not even be necessary. 
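 *
 * (Illustration, not part of the original comment: if the fault
 * succeeds despite the failed charge, the fault path merely cleans
 * up the recorded state,
 *
 *	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
 *		mem_cgroup_oom_synchronize(false);	-- just clean up
 *
 * while a fault that really fails reaches pagefault_out_of_memory(),
 * which calls mem_cgroup_oom_synchronize(true) to kill or wait.)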
2125 * 2126 * That's why we don't do anything here except remember the 2127 * OOM context and then deal with it at the end of the page 2128 * fault when the stack is unwound, the locks are released, 2129 * and when we know whether the fault was overall successful. 2130 */ 2131 css_get(&memcg->css); 2132 current->memcg_oom.memcg = memcg; 2133 current->memcg_oom.gfp_mask = mask; 2134 current->memcg_oom.order = order; 2135 } 2136 2137 /** 2138 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2139 * @handle: actually kill/wait or just clean up the OOM state 2140 * 2141 * This has to be called at the end of a page fault if the memcg OOM 2142 * handler was enabled. 2143 * 2144 * Memcg supports userspace OOM handling where failed allocations must 2145 * sleep on a waitqueue until the userspace task resolves the 2146 * situation. Sleeping directly in the charge context with all kinds 2147 * of locks held is not a good idea, instead we remember an OOM state 2148 * in the task and mem_cgroup_oom_synchronize() has to be called at 2149 * the end of the page fault to complete the OOM handling. 2150 * 2151 * Returns %true if an ongoing memcg OOM situation was detected and 2152 * completed, %false otherwise. 2153 */ 2154 bool mem_cgroup_oom_synchronize(bool handle) 2155 { 2156 struct mem_cgroup *memcg = current->memcg_oom.memcg; 2157 struct oom_wait_info owait; 2158 bool locked; 2159 2160 /* OOM is global, do not handle */ 2161 if (!memcg) 2162 return false; 2163 2164 if (!handle) 2165 goto cleanup; 2166 2167 owait.memcg = memcg; 2168 owait.wait.flags = 0; 2169 owait.wait.func = memcg_oom_wake_function; 2170 owait.wait.private = current; 2171 INIT_LIST_HEAD(&owait.wait.task_list); 2172 2173 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2174 mem_cgroup_mark_under_oom(memcg); 2175 2176 locked = mem_cgroup_oom_trylock(memcg); 2177 2178 if (locked) 2179 mem_cgroup_oom_notify(memcg); 2180 2181 if (locked && !memcg->oom_kill_disable) { 2182 mem_cgroup_unmark_under_oom(memcg); 2183 finish_wait(&memcg_oom_waitq, &owait.wait); 2184 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2185 current->memcg_oom.order); 2186 } else { 2187 schedule(); 2188 mem_cgroup_unmark_under_oom(memcg); 2189 finish_wait(&memcg_oom_waitq, &owait.wait); 2190 } 2191 2192 if (locked) { 2193 mem_cgroup_oom_unlock(memcg); 2194 /* 2195 * There is no guarantee that an OOM-lock contender 2196 * sees the wakeups triggered by the OOM kill 2197 * uncharges. Wake any sleepers explicitely. 2198 */ 2199 memcg_oom_recover(memcg); 2200 } 2201 cleanup: 2202 current->memcg_oom.memcg = NULL; 2203 css_put(&memcg->css); 2204 return true; 2205 } 2206 2207 /* 2208 * Used to update mapped file or writeback or other statistics. 2209 * 2210 * Notes: Race condition 2211 * 2212 * Charging occurs during page instantiation, while the page is 2213 * unmapped and locked in page migration, or while the page table is 2214 * locked in THP migration. No race is possible. 2215 * 2216 * Uncharge happens to pages with zero references, no race possible. 2217 * 2218 * Charge moving between groups is protected by checking mm->moving 2219 * account and taking the move_lock in the slowpath. 
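 *
 * A sketch of the expected calling pattern on the update side (shown
 * for illustration; the callers live in mm/rmap.c and friends):
 *
 *	mem_cgroup_begin_update_page_stat(page, &locked, &flags);
 *	if (atomic_inc_and_test(&page->_mapcount))
 *		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
 *	mem_cgroup_end_update_page_stat(page, &locked, &flags);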
2220 */ 2221 2222 void __mem_cgroup_begin_update_page_stat(struct page *page, 2223 bool *locked, unsigned long *flags) 2224 { 2225 struct mem_cgroup *memcg; 2226 struct page_cgroup *pc; 2227 2228 pc = lookup_page_cgroup(page); 2229 again: 2230 memcg = pc->mem_cgroup; 2231 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2232 return; 2233 /* 2234 * If this memory cgroup is not under account moving, we don't 2235 * need to take move_lock_mem_cgroup(). Because we already hold 2236 * rcu_read_lock(), any calls to move_account will be delayed until 2237 * rcu_read_unlock(). 2238 */ 2239 VM_BUG_ON(!rcu_read_lock_held()); 2240 if (atomic_read(&memcg->moving_account) <= 0) 2241 return; 2242 2243 move_lock_mem_cgroup(memcg, flags); 2244 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2245 move_unlock_mem_cgroup(memcg, flags); 2246 goto again; 2247 } 2248 *locked = true; 2249 } 2250 2251 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 2252 { 2253 struct page_cgroup *pc = lookup_page_cgroup(page); 2254 2255 /* 2256 * It's guaranteed that pc->mem_cgroup never changes while 2257 * lock is held because a routine modifies pc->mem_cgroup 2258 * should take move_lock_mem_cgroup(). 2259 */ 2260 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 2261 } 2262 2263 void mem_cgroup_update_page_stat(struct page *page, 2264 enum mem_cgroup_stat_index idx, int val) 2265 { 2266 struct mem_cgroup *memcg; 2267 struct page_cgroup *pc = lookup_page_cgroup(page); 2268 unsigned long uninitialized_var(flags); 2269 2270 if (mem_cgroup_disabled()) 2271 return; 2272 2273 VM_BUG_ON(!rcu_read_lock_held()); 2274 memcg = pc->mem_cgroup; 2275 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2276 return; 2277 2278 this_cpu_add(memcg->stat->count[idx], val); 2279 } 2280 2281 /* 2282 * size of first charge trial. "32" comes from vmscan.c's magic value. 2283 * TODO: maybe necessary to use big numbers in big irons. 2284 */ 2285 #define CHARGE_BATCH 32U 2286 struct memcg_stock_pcp { 2287 struct mem_cgroup *cached; /* this never be root cgroup */ 2288 unsigned int nr_pages; 2289 struct work_struct work; 2290 unsigned long flags; 2291 #define FLUSHING_CACHED_CHARGE 0 2292 }; 2293 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2294 static DEFINE_MUTEX(percpu_charge_mutex); 2295 2296 /** 2297 * consume_stock: Try to consume stocked charge on this cpu. 2298 * @memcg: memcg to consume from. 2299 * @nr_pages: how many pages to charge. 2300 * 2301 * The charges will only happen if @memcg matches the current cpu's memcg 2302 * stock, and at least @nr_pages are available in that stock. Failure to 2303 * service an allocation will refill the stock. 2304 * 2305 * returns true if successful, false otherwise. 2306 */ 2307 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2308 { 2309 struct memcg_stock_pcp *stock; 2310 bool ret = true; 2311 2312 if (nr_pages > CHARGE_BATCH) 2313 return false; 2314 2315 stock = &get_cpu_var(memcg_stock); 2316 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2317 stock->nr_pages -= nr_pages; 2318 else /* need to call res_counter_charge */ 2319 ret = false; 2320 put_cpu_var(memcg_stock); 2321 return ret; 2322 } 2323 2324 /* 2325 * Returns stocks cached in percpu to res_counter and reset cached information. 
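 *
 * The life cycle of a stock, roughly (see try_charge() below):
 *
 *	consume_stock(memcg, nr_pages)		-- fast path, no res_counter ops
 *	res_counter_charge(&memcg->res, batch * PAGE_SIZE, ...)
 *						-- slow path charges a whole batch
 *	refill_stock(memcg, batch - nr_pages)	-- leftover parked on this cpu
 *	drain_stock(stock)			-- leftover handed back on reuse or drain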
*/ 2327 static void drain_stock(struct memcg_stock_pcp *stock) 2328 { 2329 struct mem_cgroup *old = stock->cached; 2330 2331 if (stock->nr_pages) { 2332 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2333 2334 res_counter_uncharge(&old->res, bytes); 2335 if (do_swap_account) 2336 res_counter_uncharge(&old->memsw, bytes); 2337 stock->nr_pages = 0; 2338 } 2339 stock->cached = NULL; 2340 } 2341 2342 /* 2343 * This must be called with preemption disabled, or by a thread which is 2344 * pinned to the local cpu. 2345 */ 2346 static void drain_local_stock(struct work_struct *dummy) 2347 { 2348 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); 2349 drain_stock(stock); 2350 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2351 } 2352 2353 static void __init memcg_stock_init(void) 2354 { 2355 int cpu; 2356 2357 for_each_possible_cpu(cpu) { 2358 struct memcg_stock_pcp *stock = 2359 &per_cpu(memcg_stock, cpu); 2360 INIT_WORK(&stock->work, drain_local_stock); 2361 } 2362 } 2363 2364 /* 2365 * Cache charges taken from the res_counter in the local per-cpu area. 2366 * They will be consumed by consume_stock() later. 2367 */ 2368 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2369 { 2370 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2371 2372 if (stock->cached != memcg) { /* reset if necessary */ 2373 drain_stock(stock); 2374 stock->cached = memcg; 2375 } 2376 stock->nr_pages += nr_pages; 2377 put_cpu_var(memcg_stock); 2378 } 2379 2380 /* 2381 * Drains all per-CPU charge caches for the given root_memcg and the 2382 * subtree of the hierarchy under it. The sync flag says whether we should 2383 * block until the work is done. 2384 */ 2385 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2386 { 2387 int cpu, curcpu; 2388 2389 /* Notify other cpus that system-wide "drain" is running */ 2390 get_online_cpus(); 2391 curcpu = get_cpu(); 2392 for_each_online_cpu(cpu) { 2393 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2394 struct mem_cgroup *memcg; 2395 2396 memcg = stock->cached; 2397 if (!memcg || !stock->nr_pages) 2398 continue; 2399 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2400 continue; 2401 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2402 if (cpu == curcpu) 2403 drain_local_stock(&stock->work); 2404 else 2405 schedule_work_on(cpu, &stock->work); 2406 } 2407 } 2408 put_cpu(); 2409 2410 if (!sync) 2411 goto out; 2412 2413 for_each_online_cpu(cpu) { 2414 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2415 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2416 flush_work(&stock->work); 2417 } 2418 out: 2419 put_online_cpus(); 2420 } 2421 2422 /* 2423 * Tries to drain stocked charges on other cpus. This function is asynchronous 2424 * and just schedules a work item per cpu to drain locally on each cpu. The 2425 * caller can expect some charges to come back to the res_counter later, but 2426 * cannot wait for that. 2427 */ 2428 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2429 { 2430 /* 2431 * If someone is already draining, avoid adding more kworker runs. 2432 */ 2433 if (!mutex_trylock(&percpu_charge_mutex)) 2434 return; 2435 drain_all_stock(root_memcg, false); 2436 mutex_unlock(&percpu_charge_mutex); 2437 } 2438 2439 /* This is a synchronous drain interface.
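 *
 * The asynchronous variant above is used from the charge slow path,
 * which cannot wait for the per-cpu work to finish; this synchronous
 * one is used from force_empty, which must not proceed until every
 * cpu has handed its stock back.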
*/ 2440 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2441 { 2442 /* called when force_empty is called */ 2443 mutex_lock(&percpu_charge_mutex); 2444 drain_all_stock(root_memcg, true); 2445 mutex_unlock(&percpu_charge_mutex); 2446 } 2447 2448 /* 2449 * This function drains percpu counter value from DEAD cpu and 2450 * move it to local cpu. Note that this function can be preempted. 2451 */ 2452 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2453 { 2454 int i; 2455 2456 spin_lock(&memcg->pcp_counter_lock); 2457 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2458 long x = per_cpu(memcg->stat->count[i], cpu); 2459 2460 per_cpu(memcg->stat->count[i], cpu) = 0; 2461 memcg->nocpu_base.count[i] += x; 2462 } 2463 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2464 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2465 2466 per_cpu(memcg->stat->events[i], cpu) = 0; 2467 memcg->nocpu_base.events[i] += x; 2468 } 2469 spin_unlock(&memcg->pcp_counter_lock); 2470 } 2471 2472 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2473 unsigned long action, 2474 void *hcpu) 2475 { 2476 int cpu = (unsigned long)hcpu; 2477 struct memcg_stock_pcp *stock; 2478 struct mem_cgroup *iter; 2479 2480 if (action == CPU_ONLINE) 2481 return NOTIFY_OK; 2482 2483 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2484 return NOTIFY_OK; 2485 2486 for_each_mem_cgroup(iter) 2487 mem_cgroup_drain_pcp_counter(iter, cpu); 2488 2489 stock = &per_cpu(memcg_stock, cpu); 2490 drain_stock(stock); 2491 return NOTIFY_OK; 2492 } 2493 2494 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2495 unsigned int nr_pages) 2496 { 2497 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2498 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2499 struct mem_cgroup *mem_over_limit; 2500 struct res_counter *fail_res; 2501 unsigned long nr_reclaimed; 2502 unsigned long long size; 2503 bool may_swap = true; 2504 bool drained = false; 2505 int ret = 0; 2506 2507 if (mem_cgroup_is_root(memcg)) 2508 goto done; 2509 retry: 2510 if (consume_stock(memcg, nr_pages)) 2511 goto done; 2512 2513 size = batch * PAGE_SIZE; 2514 if (!do_swap_account || 2515 !res_counter_charge(&memcg->memsw, size, &fail_res)) { 2516 if (!res_counter_charge(&memcg->res, size, &fail_res)) 2517 goto done_restock; 2518 if (do_swap_account) 2519 res_counter_uncharge(&memcg->memsw, size); 2520 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2521 } else { 2522 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2523 may_swap = false; 2524 } 2525 2526 if (batch > nr_pages) { 2527 batch = nr_pages; 2528 goto retry; 2529 } 2530 2531 /* 2532 * Unlike in global OOM situations, memcg is not in a physical 2533 * memory shortage. Allow dying and OOM-killed tasks to 2534 * bypass the last charges so that they can exit quickly and 2535 * free their memory. 
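 *
 * "Bypass" means returning -EINTR below; the caller then accounts the
 * charge to the root group rather than failing, roughly:
 *
 *	ret = try_charge(memcg, gfp_mask, nr_pages);
 *	if (ret == -EINTR) {
 *		memcg = root_mem_cgroup;	-- commit against root instead
 *		ret = 0;
 *	}
 *
 * (see also the -EINTR handling in memcg_charge_kmem() below).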
2536 */ 2537 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2538 fatal_signal_pending(current) || 2539 current->flags & PF_EXITING)) 2540 goto bypass; 2541 2542 if (unlikely(task_in_memcg_oom(current))) 2543 goto nomem; 2544 2545 if (!(gfp_mask & __GFP_WAIT)) 2546 goto nomem; 2547 2548 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2549 gfp_mask, may_swap); 2550 2551 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2552 goto retry; 2553 2554 if (!drained) { 2555 drain_all_stock_async(mem_over_limit); 2556 drained = true; 2557 goto retry; 2558 } 2559 2560 if (gfp_mask & __GFP_NORETRY) 2561 goto nomem; 2562 /* 2563 * Even though the limit is exceeded at this point, reclaim 2564 * may have been able to free some pages. Retry the charge 2565 * before killing the task. 2566 * 2567 * Only for regular pages, though: huge pages are rather 2568 * unlikely to succeed so close to the limit, and we fall back 2569 * to regular pages anyway in case of failure. 2570 */ 2571 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2572 goto retry; 2573 /* 2574 * At task move, charge accounts can be doubly counted. So, it's 2575 * better to wait until the end of task_move if something is going on. 2576 */ 2577 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2578 goto retry; 2579 2580 if (nr_retries--) 2581 goto retry; 2582 2583 if (gfp_mask & __GFP_NOFAIL) 2584 goto bypass; 2585 2586 if (fatal_signal_pending(current)) 2587 goto bypass; 2588 2589 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2590 nomem: 2591 if (!(gfp_mask & __GFP_NOFAIL)) 2592 return -ENOMEM; 2593 bypass: 2594 return -EINTR; 2595 2596 done_restock: 2597 if (batch > nr_pages) 2598 refill_stock(memcg, batch - nr_pages); 2599 done: 2600 return ret; 2601 } 2602 2603 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2604 { 2605 unsigned long bytes = nr_pages * PAGE_SIZE; 2606 2607 if (mem_cgroup_is_root(memcg)) 2608 return; 2609 2610 res_counter_uncharge(&memcg->res, bytes); 2611 if (do_swap_account) 2612 res_counter_uncharge(&memcg->memsw, bytes); 2613 } 2614 2615 /* 2616 * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. 2617 * This is useful when moving usage to parent cgroup. 2618 */ 2619 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, 2620 unsigned int nr_pages) 2621 { 2622 unsigned long bytes = nr_pages * PAGE_SIZE; 2623 2624 if (mem_cgroup_is_root(memcg)) 2625 return; 2626 2627 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2628 if (do_swap_account) 2629 res_counter_uncharge_until(&memcg->memsw, 2630 memcg->memsw.parent, bytes); 2631 } 2632 2633 /* 2634 * A helper function to get mem_cgroup from ID. must be called under 2635 * rcu_read_lock(). The caller is responsible for calling 2636 * css_tryget_online() if the mem_cgroup is used for charging. (dropping 2637 * refcnt from swap can be called against removed memcg.) 2638 */ 2639 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2640 { 2641 /* ID 0 is unused ID */ 2642 if (!id) 2643 return NULL; 2644 return mem_cgroup_from_id(id); 2645 } 2646 2647 /* 2648 * try_get_mem_cgroup_from_page - look up page's memcg association 2649 * @page: the page 2650 * 2651 * Look up, get a css reference, and return the memcg that owns @page. 2652 * 2653 * The page must be locked to prevent racing with swap-in and page 2654 * cache charges. If coming from an unlocked page table, the caller 2655 * must ensure the page is on the LRU or this can race with charging. 
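 *
 * Typical usage (illustrative only):
 *
 *	lock_page(page);
 *	memcg = try_get_mem_cgroup_from_page(page);
 *	if (memcg) {
 *		... inspect or charge against memcg ...
 *		css_put(&memcg->css);	-- drop the reference taken here
 *	}
 *	unlock_page(page);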
2656 */ 2657 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2658 { 2659 struct mem_cgroup *memcg = NULL; 2660 struct page_cgroup *pc; 2661 unsigned short id; 2662 swp_entry_t ent; 2663 2664 VM_BUG_ON_PAGE(!PageLocked(page), page); 2665 2666 pc = lookup_page_cgroup(page); 2667 if (PageCgroupUsed(pc)) { 2668 memcg = pc->mem_cgroup; 2669 if (memcg && !css_tryget_online(&memcg->css)) 2670 memcg = NULL; 2671 } else if (PageSwapCache(page)) { 2672 ent.val = page_private(page); 2673 id = lookup_swap_cgroup_id(ent); 2674 rcu_read_lock(); 2675 memcg = mem_cgroup_lookup(id); 2676 if (memcg && !css_tryget_online(&memcg->css)) 2677 memcg = NULL; 2678 rcu_read_unlock(); 2679 } 2680 return memcg; 2681 } 2682 2683 static void lock_page_lru(struct page *page, int *isolated) 2684 { 2685 struct zone *zone = page_zone(page); 2686 2687 spin_lock_irq(&zone->lru_lock); 2688 if (PageLRU(page)) { 2689 struct lruvec *lruvec; 2690 2691 lruvec = mem_cgroup_page_lruvec(page, zone); 2692 ClearPageLRU(page); 2693 del_page_from_lru_list(page, lruvec, page_lru(page)); 2694 *isolated = 1; 2695 } else 2696 *isolated = 0; 2697 } 2698 2699 static void unlock_page_lru(struct page *page, int isolated) 2700 { 2701 struct zone *zone = page_zone(page); 2702 2703 if (isolated) { 2704 struct lruvec *lruvec; 2705 2706 lruvec = mem_cgroup_page_lruvec(page, zone); 2707 VM_BUG_ON_PAGE(PageLRU(page), page); 2708 SetPageLRU(page); 2709 add_page_to_lru_list(page, lruvec, page_lru(page)); 2710 } 2711 spin_unlock_irq(&zone->lru_lock); 2712 } 2713 2714 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2715 bool lrucare) 2716 { 2717 struct page_cgroup *pc = lookup_page_cgroup(page); 2718 int isolated; 2719 2720 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2721 /* 2722 * we don't need page_cgroup_lock about tail pages, becase they are not 2723 * accessed by any other context at this point. 2724 */ 2725 2726 /* 2727 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2728 * may already be on some other mem_cgroup's LRU. Take care of it. 2729 */ 2730 if (lrucare) 2731 lock_page_lru(page, &isolated); 2732 2733 /* 2734 * Nobody should be changing or seriously looking at 2735 * pc->mem_cgroup and pc->flags at this point: 2736 * 2737 * - the page is uncharged 2738 * 2739 * - the page is off-LRU 2740 * 2741 * - an anonymous fault has exclusive page access, except for 2742 * a locked page table 2743 * 2744 * - a page cache insertion, a swapin fault, or a migration 2745 * have the page locked 2746 */ 2747 pc->mem_cgroup = memcg; 2748 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); 2749 2750 if (lrucare) 2751 unlock_page_lru(page, isolated); 2752 } 2753 2754 static DEFINE_MUTEX(set_limit_mutex); 2755 2756 #ifdef CONFIG_MEMCG_KMEM 2757 /* 2758 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2759 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. 2760 */ 2761 static DEFINE_MUTEX(memcg_slab_mutex); 2762 2763 static DEFINE_MUTEX(activate_kmem_mutex); 2764 2765 /* 2766 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2767 * in the memcg_cache_params struct. 
2768 */ 2769 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2770 { 2771 struct kmem_cache *cachep; 2772 2773 VM_BUG_ON(p->is_root_cache); 2774 cachep = p->root_cache; 2775 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2776 } 2777 2778 #ifdef CONFIG_SLABINFO 2779 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 2780 { 2781 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 2782 struct memcg_cache_params *params; 2783 2784 if (!memcg_kmem_is_active(memcg)) 2785 return -EIO; 2786 2787 print_slabinfo_header(m); 2788 2789 mutex_lock(&memcg_slab_mutex); 2790 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2791 cache_show(memcg_params_to_cache(params), m); 2792 mutex_unlock(&memcg_slab_mutex); 2793 2794 return 0; 2795 } 2796 #endif 2797 2798 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 2799 { 2800 struct res_counter *fail_res; 2801 int ret = 0; 2802 2803 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2804 if (ret) 2805 return ret; 2806 2807 ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); 2808 if (ret == -EINTR) { 2809 /* 2810 * try_charge() chose to bypass to root due to OOM kill or 2811 * fatal signal. Since our only options are to either fail 2812 * the allocation or charge it to this cgroup, do it as a 2813 * temporary condition. But we can't fail. From a kmem/slab 2814 * perspective, the cache has already been selected, by 2815 * mem_cgroup_kmem_get_cache(), so it is too late to change 2816 * our minds. 2817 * 2818 * This condition will only trigger if the task entered 2819 * memcg_charge_kmem in a sane state, but was OOM-killed 2820 * during try_charge() above. Tasks that were already dying 2821 * when the allocation triggers should have been already 2822 * directed to the root cgroup in memcontrol.h 2823 */ 2824 res_counter_charge_nofail(&memcg->res, size, &fail_res); 2825 if (do_swap_account) 2826 res_counter_charge_nofail(&memcg->memsw, size, 2827 &fail_res); 2828 ret = 0; 2829 } else if (ret) 2830 res_counter_uncharge(&memcg->kmem, size); 2831 2832 return ret; 2833 } 2834 2835 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 2836 { 2837 res_counter_uncharge(&memcg->res, size); 2838 if (do_swap_account) 2839 res_counter_uncharge(&memcg->memsw, size); 2840 2841 /* Not down to 0 */ 2842 if (res_counter_uncharge(&memcg->kmem, size)) 2843 return; 2844 2845 /* 2846 * Releases a reference taken in kmem_cgroup_css_offline in case 2847 * this last uncharge is racing with the offlining code or it is 2848 * outliving the memcg existence. 2849 * 2850 * The memory barrier imposed by test&clear is paired with the 2851 * explicit one in memcg_kmem_mark_dead(). 2852 */ 2853 if (memcg_kmem_test_and_clear_dead(memcg)) 2854 css_put(&memcg->css); 2855 } 2856 2857 /* 2858 * helper for acessing a memcg's index. It will be used as an index in the 2859 * child cache array in kmem_cache, and also to derive its name. This function 2860 * will return -1 when this is not a kmem-limited memcg. 2861 */ 2862 int memcg_cache_id(struct mem_cgroup *memcg) 2863 { 2864 return memcg ? 
memcg->kmemcg_id : -1; 2865 } 2866 2867 static int memcg_alloc_cache_id(void) 2868 { 2869 int id, size; 2870 int err; 2871 2872 id = ida_simple_get(&kmem_limited_groups, 2873 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2874 if (id < 0) 2875 return id; 2876 2877 if (id < memcg_limited_groups_array_size) 2878 return id; 2879 2880 /* 2881 * There's no space for the new id in memcg_caches arrays, 2882 * so we have to grow them. 2883 */ 2884 2885 size = 2 * (id + 1); 2886 if (size < MEMCG_CACHES_MIN_SIZE) 2887 size = MEMCG_CACHES_MIN_SIZE; 2888 else if (size > MEMCG_CACHES_MAX_SIZE) 2889 size = MEMCG_CACHES_MAX_SIZE; 2890 2891 mutex_lock(&memcg_slab_mutex); 2892 err = memcg_update_all_caches(size); 2893 mutex_unlock(&memcg_slab_mutex); 2894 2895 if (err) { 2896 ida_simple_remove(&kmem_limited_groups, id); 2897 return err; 2898 } 2899 return id; 2900 } 2901 2902 static void memcg_free_cache_id(int id) 2903 { 2904 ida_simple_remove(&kmem_limited_groups, id); 2905 } 2906 2907 /* 2908 * We should update the current array size iff all caches updates succeed. This 2909 * can only be done from the slab side. The slab mutex needs to be held when 2910 * calling this. 2911 */ 2912 void memcg_update_array_size(int num) 2913 { 2914 memcg_limited_groups_array_size = num; 2915 } 2916 2917 static void memcg_register_cache(struct mem_cgroup *memcg, 2918 struct kmem_cache *root_cache) 2919 { 2920 static char memcg_name_buf[NAME_MAX + 1]; /* protected by 2921 memcg_slab_mutex */ 2922 struct kmem_cache *cachep; 2923 int id; 2924 2925 lockdep_assert_held(&memcg_slab_mutex); 2926 2927 id = memcg_cache_id(memcg); 2928 2929 /* 2930 * Since per-memcg caches are created asynchronously on first 2931 * allocation (see memcg_kmem_get_cache()), several threads can try to 2932 * create the same cache, but only one of them may succeed. 2933 */ 2934 if (cache_from_memcg_idx(root_cache, id)) 2935 return; 2936 2937 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); 2938 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); 2939 /* 2940 * If we could not create a memcg cache, do not complain, because 2941 * that's not critical at all as we can always proceed with the root 2942 * cache. 2943 */ 2944 if (!cachep) 2945 return; 2946 2947 css_get(&memcg->css); 2948 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 2949 2950 /* 2951 * Since readers won't lock (see cache_from_memcg_idx()), we need a 2952 * barrier here to ensure nobody will see the kmem_cache partially 2953 * initialized. 2954 */ 2955 smp_wmb(); 2956 2957 BUG_ON(root_cache->memcg_params->memcg_caches[id]); 2958 root_cache->memcg_params->memcg_caches[id] = cachep; 2959 } 2960 2961 static void memcg_unregister_cache(struct kmem_cache *cachep) 2962 { 2963 struct kmem_cache *root_cache; 2964 struct mem_cgroup *memcg; 2965 int id; 2966 2967 lockdep_assert_held(&memcg_slab_mutex); 2968 2969 BUG_ON(is_root_cache(cachep)); 2970 2971 root_cache = cachep->memcg_params->root_cache; 2972 memcg = cachep->memcg_params->memcg; 2973 id = memcg_cache_id(memcg); 2974 2975 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); 2976 root_cache->memcg_params->memcg_caches[id] = NULL; 2977 2978 list_del(&cachep->memcg_params->list); 2979 2980 kmem_cache_destroy(cachep); 2981 2982 /* drop the reference taken in memcg_register_cache */ 2983 css_put(&memcg->css); 2984 } 2985 2986 /* 2987 * During the creation a new cache, we need to disable our accounting mechanism 2988 * altogether. 
This is true even if we are not creating, but rather just 2989 * enqueing new caches to be created. 2990 * 2991 * This is because that process will trigger allocations; some visible, like 2992 * explicit kmallocs to auxiliary data structures, name strings and internal 2993 * cache structures; some well concealed, like INIT_WORK() that can allocate 2994 * objects during debug. 2995 * 2996 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 2997 * to it. This may not be a bounded recursion: since the first cache creation 2998 * failed to complete (waiting on the allocation), we'll just try to create the 2999 * cache again, failing at the same point. 3000 * 3001 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 3002 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 3003 * inside the following two functions. 3004 */ 3005 static inline void memcg_stop_kmem_account(void) 3006 { 3007 VM_BUG_ON(!current->mm); 3008 current->memcg_kmem_skip_account++; 3009 } 3010 3011 static inline void memcg_resume_kmem_account(void) 3012 { 3013 VM_BUG_ON(!current->mm); 3014 current->memcg_kmem_skip_account--; 3015 } 3016 3017 int __memcg_cleanup_cache_params(struct kmem_cache *s) 3018 { 3019 struct kmem_cache *c; 3020 int i, failed = 0; 3021 3022 mutex_lock(&memcg_slab_mutex); 3023 for_each_memcg_cache_index(i) { 3024 c = cache_from_memcg_idx(s, i); 3025 if (!c) 3026 continue; 3027 3028 memcg_unregister_cache(c); 3029 3030 if (cache_from_memcg_idx(s, i)) 3031 failed++; 3032 } 3033 mutex_unlock(&memcg_slab_mutex); 3034 return failed; 3035 } 3036 3037 static void memcg_unregister_all_caches(struct mem_cgroup *memcg) 3038 { 3039 struct kmem_cache *cachep; 3040 struct memcg_cache_params *params, *tmp; 3041 3042 if (!memcg_kmem_is_active(memcg)) 3043 return; 3044 3045 mutex_lock(&memcg_slab_mutex); 3046 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { 3047 cachep = memcg_params_to_cache(params); 3048 kmem_cache_shrink(cachep); 3049 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 3050 memcg_unregister_cache(cachep); 3051 } 3052 mutex_unlock(&memcg_slab_mutex); 3053 } 3054 3055 struct memcg_register_cache_work { 3056 struct mem_cgroup *memcg; 3057 struct kmem_cache *cachep; 3058 struct work_struct work; 3059 }; 3060 3061 static void memcg_register_cache_func(struct work_struct *w) 3062 { 3063 struct memcg_register_cache_work *cw = 3064 container_of(w, struct memcg_register_cache_work, work); 3065 struct mem_cgroup *memcg = cw->memcg; 3066 struct kmem_cache *cachep = cw->cachep; 3067 3068 mutex_lock(&memcg_slab_mutex); 3069 memcg_register_cache(memcg, cachep); 3070 mutex_unlock(&memcg_slab_mutex); 3071 3072 css_put(&memcg->css); 3073 kfree(cw); 3074 } 3075 3076 /* 3077 * Enqueue the creation of a per-memcg kmem_cache. 
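 *
 * The overall flow, for orientation (all of it in this file):
 *
 *	__memcg_kmem_get_cache()		-- cache miss for this memcg
 *	  memcg_schedule_register_cache()	-- allocates a work item, GFP_NOWAIT
 *	    memcg_register_cache_func()		-- runs from a workqueue
 *	      memcg_register_cache()		-- creates and publishes the cache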
3078 */ 3079 static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, 3080 struct kmem_cache *cachep) 3081 { 3082 struct memcg_register_cache_work *cw; 3083 3084 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 3085 if (cw == NULL) { 3086 css_put(&memcg->css); 3087 return; 3088 } 3089 3090 cw->memcg = memcg; 3091 cw->cachep = cachep; 3092 3093 INIT_WORK(&cw->work, memcg_register_cache_func); 3094 schedule_work(&cw->work); 3095 } 3096 3097 static void memcg_schedule_register_cache(struct mem_cgroup *memcg, 3098 struct kmem_cache *cachep) 3099 { 3100 /* 3101 * We need to stop accounting when we kmalloc, because if the 3102 * corresponding kmalloc cache is not yet created, the first allocation 3103 * in __memcg_schedule_register_cache will recurse. 3104 * 3105 * However, it is better to enclose the whole function. Depending on 3106 * the debugging options enabled, INIT_WORK(), for instance, can 3107 * trigger an allocation. This too, will make us recurse. Because at 3108 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3109 * the safest choice is to do it like this, wrapping the whole function. 3110 */ 3111 memcg_stop_kmem_account(); 3112 __memcg_schedule_register_cache(memcg, cachep); 3113 memcg_resume_kmem_account(); 3114 } 3115 3116 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 3117 { 3118 int res; 3119 3120 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, 3121 PAGE_SIZE << order); 3122 if (!res) 3123 atomic_add(1 << order, &cachep->memcg_params->nr_pages); 3124 return res; 3125 } 3126 3127 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 3128 { 3129 memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); 3130 atomic_sub(1 << order, &cachep->memcg_params->nr_pages); 3131 } 3132 3133 /* 3134 * Return the kmem_cache we're supposed to use for a slab allocation. 3135 * We try to use the current memcg's version of the cache. 3136 * 3137 * If the cache does not exist yet, if we are the first user of it, 3138 * we either create it immediately, if possible, or create it asynchronously 3139 * in a workqueue. 3140 * In the latter case, we will let the current allocation go through with 3141 * the original cache. 3142 * 3143 * Can't be called in interrupt context or from kernel threads. 3144 * This function needs to be called with rcu_read_lock() held. 3145 */ 3146 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3147 gfp_t gfp) 3148 { 3149 struct mem_cgroup *memcg; 3150 struct kmem_cache *memcg_cachep; 3151 3152 VM_BUG_ON(!cachep->memcg_params); 3153 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3154 3155 if (!current->mm || current->memcg_kmem_skip_account) 3156 return cachep; 3157 3158 rcu_read_lock(); 3159 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3160 3161 if (!memcg_kmem_is_active(memcg)) 3162 goto out; 3163 3164 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 3165 if (likely(memcg_cachep)) { 3166 cachep = memcg_cachep; 3167 goto out; 3168 } 3169 3170 /* The corresponding put will be done in the workqueue. */ 3171 if (!css_tryget_online(&memcg->css)) 3172 goto out; 3173 rcu_read_unlock(); 3174 3175 /* 3176 * If we are in a safe context (can wait, and not in interrupt 3177 * context), we could be be predictable and return right away. 3178 * This would guarantee that the allocation being performed 3179 * already belongs in the new cache. 3180 * 3181 * However, there are some clashes that can arrive from locking. 
3182 * For instance, because we acquire the slab_mutex while doing 3183 * memcg_create_kmem_cache, this means no further allocation 3184 * could happen with the slab_mutex held. So it's better to 3185 * defer everything. 3186 */ 3187 memcg_schedule_register_cache(memcg, cachep); 3188 return cachep; 3189 out: 3190 rcu_read_unlock(); 3191 return cachep; 3192 } 3193 3194 /* 3195 * We need to verify if the allocation against current->mm->owner's memcg is 3196 * possible for the given order. But the page is not allocated yet, so we'll 3197 * need a further commit step to do the final arrangements. 3198 * 3199 * It is possible for the task to switch cgroups in this mean time, so at 3200 * commit time, we can't rely on task conversion any longer. We'll then use 3201 * the handle argument to return to the caller which cgroup we should commit 3202 * against. We could also return the memcg directly and avoid the pointer 3203 * passing, but a boolean return value gives better semantics considering 3204 * the compiled-out case as well. 3205 * 3206 * Returning true means the allocation is possible. 3207 */ 3208 bool 3209 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3210 { 3211 struct mem_cgroup *memcg; 3212 int ret; 3213 3214 *_memcg = NULL; 3215 3216 /* 3217 * Disabling accounting is only relevant for some specific memcg 3218 * internal allocations. Therefore we would initially not have such 3219 * check here, since direct calls to the page allocator that are 3220 * accounted to kmemcg (alloc_kmem_pages and friends) only happen 3221 * outside memcg core. We are mostly concerned with cache allocations, 3222 * and by having this test at memcg_kmem_get_cache, we are already able 3223 * to relay the allocation to the root cache and bypass the memcg cache 3224 * altogether. 3225 * 3226 * There is one exception, though: the SLUB allocator does not create 3227 * large order caches, but rather service large kmallocs directly from 3228 * the page allocator. Therefore, the following sequence when backed by 3229 * the SLUB allocator: 3230 * 3231 * memcg_stop_kmem_account(); 3232 * kmalloc(<large_number>) 3233 * memcg_resume_kmem_account(); 3234 * 3235 * would effectively ignore the fact that we should skip accounting, 3236 * since it will drive us directly to this function without passing 3237 * through the cache selector memcg_kmem_get_cache. Such large 3238 * allocations are extremely rare but can happen, for instance, for the 3239 * cache arrays. We bring this test here. 3240 */ 3241 if (!current->mm || current->memcg_kmem_skip_account) 3242 return true; 3243 3244 memcg = get_mem_cgroup_from_mm(current->mm); 3245 3246 if (!memcg_kmem_is_active(memcg)) { 3247 css_put(&memcg->css); 3248 return true; 3249 } 3250 3251 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3252 if (!ret) 3253 *_memcg = memcg; 3254 3255 css_put(&memcg->css); 3256 return (ret == 0); 3257 } 3258 3259 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3260 int order) 3261 { 3262 struct page_cgroup *pc; 3263 3264 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3265 3266 /* The page allocation failed. Revert */ 3267 if (!page) { 3268 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3269 return; 3270 } 3271 /* 3272 * The page is freshly allocated and not visible to any 3273 * outside callers yet. Set up pc non-atomically. 
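 *
 * For orientation, the page allocator pairs these two hooks roughly
 * like this (illustrative; the caller, alloc_kmem_pages(), lives
 * outside this file):
 *
 *	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
 *		return NULL;			-- charge refused
 *	page = alloc_pages(gfp_mask, order);
 *	memcg_kmem_commit_charge(page, memcg, order);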
3274 */ 3275 pc = lookup_page_cgroup(page); 3276 pc->mem_cgroup = memcg; 3277 pc->flags = PCG_USED; 3278 } 3279 3280 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3281 { 3282 struct mem_cgroup *memcg = NULL; 3283 struct page_cgroup *pc; 3284 3285 3286 pc = lookup_page_cgroup(page); 3287 if (!PageCgroupUsed(pc)) 3288 return; 3289 3290 memcg = pc->mem_cgroup; 3291 pc->flags = 0; 3292 3293 /* 3294 * We trust that only if there is a memcg associated with the page, it 3295 * is a valid allocation 3296 */ 3297 if (!memcg) 3298 return; 3299 3300 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3301 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3302 } 3303 #else 3304 static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) 3305 { 3306 } 3307 #endif /* CONFIG_MEMCG_KMEM */ 3308 3309 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3310 3311 /* 3312 * Because tail pages are not marked as "used", set it. We're under 3313 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3314 * charge/uncharge will be never happen and move_account() is done under 3315 * compound_lock(), so we don't have to take care of races. 3316 */ 3317 void mem_cgroup_split_huge_fixup(struct page *head) 3318 { 3319 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3320 struct page_cgroup *pc; 3321 struct mem_cgroup *memcg; 3322 int i; 3323 3324 if (mem_cgroup_disabled()) 3325 return; 3326 3327 memcg = head_pc->mem_cgroup; 3328 for (i = 1; i < HPAGE_PMD_NR; i++) { 3329 pc = head_pc + i; 3330 pc->mem_cgroup = memcg; 3331 pc->flags = head_pc->flags; 3332 } 3333 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3334 HPAGE_PMD_NR); 3335 } 3336 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3337 3338 /** 3339 * mem_cgroup_move_account - move account of the page 3340 * @page: the page 3341 * @nr_pages: number of regular pages (>1 for huge pages) 3342 * @pc: page_cgroup of the page. 3343 * @from: mem_cgroup which the page is moved from. 3344 * @to: mem_cgroup which the page is moved to. @from != @to. 3345 * 3346 * The caller must confirm following. 3347 * - page is not on LRU (isolate_page() is useful.) 3348 * - compound_lock is held when nr_pages > 1 3349 * 3350 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3351 * from old cgroup. 3352 */ 3353 static int mem_cgroup_move_account(struct page *page, 3354 unsigned int nr_pages, 3355 struct page_cgroup *pc, 3356 struct mem_cgroup *from, 3357 struct mem_cgroup *to) 3358 { 3359 unsigned long flags; 3360 int ret; 3361 3362 VM_BUG_ON(from == to); 3363 VM_BUG_ON_PAGE(PageLRU(page), page); 3364 /* 3365 * The page is isolated from LRU. So, collapse function 3366 * will not handle this page. But page splitting can happen. 3367 * Do this check under compound_page_lock(). The caller should 3368 * hold it. 3369 */ 3370 ret = -EBUSY; 3371 if (nr_pages > 1 && !PageTransHuge(page)) 3372 goto out; 3373 3374 /* 3375 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup 3376 * of its source page while we change it: page migration takes 3377 * both pages off the LRU, but page cache replacement doesn't. 
3378 */ 3379 if (!trylock_page(page)) 3380 goto out; 3381 3382 ret = -EINVAL; 3383 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3384 goto out_unlock; 3385 3386 move_lock_mem_cgroup(from, &flags); 3387 3388 if (!PageAnon(page) && page_mapped(page)) { 3389 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3390 nr_pages); 3391 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3392 nr_pages); 3393 } 3394 3395 if (PageWriteback(page)) { 3396 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3397 nr_pages); 3398 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3399 nr_pages); 3400 } 3401 3402 /* 3403 * It is safe to change pc->mem_cgroup here because the page 3404 * is referenced, charged, and isolated - we can't race with 3405 * uncharging, charging, migration, or LRU putback. 3406 */ 3407 3408 /* caller should have done css_get */ 3409 pc->mem_cgroup = to; 3410 move_unlock_mem_cgroup(from, &flags); 3411 ret = 0; 3412 3413 local_irq_disable(); 3414 mem_cgroup_charge_statistics(to, page, nr_pages); 3415 memcg_check_events(to, page); 3416 mem_cgroup_charge_statistics(from, page, -nr_pages); 3417 memcg_check_events(from, page); 3418 local_irq_enable(); 3419 out_unlock: 3420 unlock_page(page); 3421 out: 3422 return ret; 3423 } 3424 3425 /** 3426 * mem_cgroup_move_parent - moves page to the parent group 3427 * @page: the page to move 3428 * @pc: page_cgroup of the page 3429 * @child: page's cgroup 3430 * 3431 * move charges to its parent or the root cgroup if the group has no 3432 * parent (aka use_hierarchy==0). 3433 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3434 * mem_cgroup_move_account fails) the failure is always temporary and 3435 * it signals a race with a page removal/uncharge or migration. In the 3436 * first case the page is on the way out and it will vanish from the LRU 3437 * on the next attempt and the call should be retried later. 3438 * Isolation from the LRU fails only if page has been isolated from 3439 * the LRU since we looked at it and that usually means either global 3440 * reclaim or migration going on. The page will either get back to the 3441 * LRU or vanish. 3442 * Finaly mem_cgroup_move_account fails only if the page got uncharged 3443 * (!PageCgroupUsed) or moved to a different group. The page will 3444 * disappear in the next attempt. 3445 */ 3446 static int mem_cgroup_move_parent(struct page *page, 3447 struct page_cgroup *pc, 3448 struct mem_cgroup *child) 3449 { 3450 struct mem_cgroup *parent; 3451 unsigned int nr_pages; 3452 unsigned long uninitialized_var(flags); 3453 int ret; 3454 3455 VM_BUG_ON(mem_cgroup_is_root(child)); 3456 3457 ret = -EBUSY; 3458 if (!get_page_unless_zero(page)) 3459 goto out; 3460 if (isolate_lru_page(page)) 3461 goto put; 3462 3463 nr_pages = hpage_nr_pages(page); 3464 3465 parent = parent_mem_cgroup(child); 3466 /* 3467 * If no parent, move charges to root cgroup. 
3468 */ 3469 if (!parent) 3470 parent = root_mem_cgroup; 3471 3472 if (nr_pages > 1) { 3473 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3474 flags = compound_lock_irqsave(page); 3475 } 3476 3477 ret = mem_cgroup_move_account(page, nr_pages, 3478 pc, child, parent); 3479 if (!ret) 3480 __mem_cgroup_cancel_local_charge(child, nr_pages); 3481 3482 if (nr_pages > 1) 3483 compound_unlock_irqrestore(page, flags); 3484 putback_lru_page(page); 3485 put: 3486 put_page(page); 3487 out: 3488 return ret; 3489 } 3490 3491 #ifdef CONFIG_MEMCG_SWAP 3492 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 3493 bool charge) 3494 { 3495 int val = (charge) ? 1 : -1; 3496 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 3497 } 3498 3499 /** 3500 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3501 * @entry: swap entry to be moved 3502 * @from: mem_cgroup which the entry is moved from 3503 * @to: mem_cgroup which the entry is moved to 3504 * 3505 * It succeeds only when the swap_cgroup's record for this entry is the same 3506 * as the mem_cgroup's id of @from. 3507 * 3508 * Returns 0 on success, -EINVAL on failure. 3509 * 3510 * The caller must have charged to @to, IOW, called res_counter_charge() about 3511 * both res and memsw, and called css_get(). 3512 */ 3513 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3514 struct mem_cgroup *from, struct mem_cgroup *to) 3515 { 3516 unsigned short old_id, new_id; 3517 3518 old_id = mem_cgroup_id(from); 3519 new_id = mem_cgroup_id(to); 3520 3521 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3522 mem_cgroup_swap_statistics(from, false); 3523 mem_cgroup_swap_statistics(to, true); 3524 /* 3525 * This function is only called from task migration context now. 3526 * It postpones res_counter and refcount handling till the end 3527 * of task migration(mem_cgroup_clear_mc()) for performance 3528 * improvement. But we cannot postpone css_get(to) because if 3529 * the process that has been moved to @to does swap-in, the 3530 * refcount of @to might be decreased to 0. 3531 * 3532 * We are in attach() phase, so the cgroup is guaranteed to be 3533 * alive, so we can just call css_get(). 3534 */ 3535 css_get(&to->css); 3536 return 0; 3537 } 3538 return -EINVAL; 3539 } 3540 #else 3541 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3542 struct mem_cgroup *from, struct mem_cgroup *to) 3543 { 3544 return -EINVAL; 3545 } 3546 #endif 3547 3548 #ifdef CONFIG_DEBUG_VM 3549 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3550 { 3551 struct page_cgroup *pc; 3552 3553 pc = lookup_page_cgroup(page); 3554 /* 3555 * Can be NULL while feeding pages into the page allocator for 3556 * the first time, i.e. during boot or memory hotplug; 3557 * or when mem_cgroup_disabled(). 
3558 */ 3559 if (likely(pc) && PageCgroupUsed(pc)) 3560 return pc; 3561 return NULL; 3562 } 3563 3564 bool mem_cgroup_bad_page_check(struct page *page) 3565 { 3566 if (mem_cgroup_disabled()) 3567 return false; 3568 3569 return lookup_page_cgroup_used(page) != NULL; 3570 } 3571 3572 void mem_cgroup_print_bad_page(struct page *page) 3573 { 3574 struct page_cgroup *pc; 3575 3576 pc = lookup_page_cgroup_used(page); 3577 if (pc) { 3578 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 3579 pc, pc->flags, pc->mem_cgroup); 3580 } 3581 } 3582 #endif 3583 3584 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3585 unsigned long long val) 3586 { 3587 int retry_count; 3588 int ret = 0; 3589 int children = mem_cgroup_count_children(memcg); 3590 u64 curusage, oldusage; 3591 int enlarge; 3592 3593 /* 3594 * For keeping hierarchical_reclaim simple, how long we should retry 3595 * is depends on callers. We set our retry-count to be function 3596 * of # of children which we should visit in this loop. 3597 */ 3598 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3599 3600 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3601 3602 enlarge = 0; 3603 while (retry_count) { 3604 if (signal_pending(current)) { 3605 ret = -EINTR; 3606 break; 3607 } 3608 /* 3609 * Rather than hide all in some function, I do this in 3610 * open coded manner. You see what this really does. 3611 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3612 */ 3613 mutex_lock(&set_limit_mutex); 3614 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { 3615 ret = -EINVAL; 3616 mutex_unlock(&set_limit_mutex); 3617 break; 3618 } 3619 3620 if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) 3621 enlarge = 1; 3622 3623 ret = res_counter_set_limit(&memcg->res, val); 3624 mutex_unlock(&set_limit_mutex); 3625 3626 if (!ret) 3627 break; 3628 3629 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3630 3631 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3632 /* Usage is reduced ? */ 3633 if (curusage >= oldusage) 3634 retry_count--; 3635 else 3636 oldusage = curusage; 3637 } 3638 if (!ret && enlarge) 3639 memcg_oom_recover(memcg); 3640 3641 return ret; 3642 } 3643 3644 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3645 unsigned long long val) 3646 { 3647 int retry_count; 3648 u64 oldusage, curusage; 3649 int children = mem_cgroup_count_children(memcg); 3650 int ret = -EBUSY; 3651 int enlarge = 0; 3652 3653 /* see mem_cgroup_resize_res_limit */ 3654 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3655 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3656 while (retry_count) { 3657 if (signal_pending(current)) { 3658 ret = -EINTR; 3659 break; 3660 } 3661 /* 3662 * Rather than hide all in some function, I do this in 3663 * open coded manner. You see what this really does. 3664 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3665 */ 3666 mutex_lock(&set_limit_mutex); 3667 if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { 3668 ret = -EINVAL; 3669 mutex_unlock(&set_limit_mutex); 3670 break; 3671 } 3672 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) 3673 enlarge = 1; 3674 ret = res_counter_set_limit(&memcg->memsw, val); 3675 mutex_unlock(&set_limit_mutex); 3676 3677 if (!ret) 3678 break; 3679 3680 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3681 3682 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3683 /* Usage is reduced ? 
*/ 3684 if (curusage >= oldusage) 3685 retry_count--; 3686 else 3687 oldusage = curusage; 3688 } 3689 if (!ret && enlarge) 3690 memcg_oom_recover(memcg); 3691 return ret; 3692 } 3693 3694 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3695 gfp_t gfp_mask, 3696 unsigned long *total_scanned) 3697 { 3698 unsigned long nr_reclaimed = 0; 3699 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3700 unsigned long reclaimed; 3701 int loop = 0; 3702 struct mem_cgroup_tree_per_zone *mctz; 3703 unsigned long long excess; 3704 unsigned long nr_scanned; 3705 3706 if (order > 0) 3707 return 0; 3708 3709 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3710 /* 3711 * This loop can run for a while, especially if mem_cgroups continuously 3712 * keep exceeding their soft limit and putting the system under 3713 * pressure. 3714 */ 3715 do { 3716 if (next_mz) 3717 mz = next_mz; 3718 else 3719 mz = mem_cgroup_largest_soft_limit_node(mctz); 3720 if (!mz) 3721 break; 3722 3723 nr_scanned = 0; 3724 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 3725 gfp_mask, &nr_scanned); 3726 nr_reclaimed += reclaimed; 3727 *total_scanned += nr_scanned; 3728 spin_lock_irq(&mctz->lock); 3729 3730 /* 3731 * If we failed to reclaim anything from this memory cgroup, 3732 * it is time to move on to the next cgroup. 3733 */ 3734 next_mz = NULL; 3735 if (!reclaimed) { 3736 do { 3737 /* 3738 * Loop until we find yet another one. 3739 * 3740 * By the time we get the soft_limit lock 3741 * again, someone might have added the 3742 * group back on the RB tree. Iterate to 3743 * make sure we get a different memcg. 3744 * mem_cgroup_largest_soft_limit_node returns 3745 * NULL if no other cgroup is present on 3746 * the tree. 3747 */ 3748 next_mz = 3749 __mem_cgroup_largest_soft_limit_node(mctz); 3750 if (next_mz == mz) 3751 css_put(&next_mz->memcg->css); 3752 else /* next_mz == NULL or other memcg */ 3753 break; 3754 } while (1); 3755 } 3756 __mem_cgroup_remove_exceeded(mz, mctz); 3757 excess = res_counter_soft_limit_excess(&mz->memcg->res); 3758 /* 3759 * One school of thought says that we should not add 3760 * the node back to the tree if reclaim returns 0. 3761 * But our reclaim could return 0 simply because, due 3762 * to priority, we are exposing a smaller subset of 3763 * memory to reclaim from. Consider this as a longer 3764 * term TODO. 3765 */ 3766 /* If excess == 0, no tree ops */ 3767 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3768 spin_unlock_irq(&mctz->lock); 3769 css_put(&mz->memcg->css); 3770 loop++; 3771 /* 3772 * Could not reclaim anything and there are no more 3773 * mem cgroups to try, or we seem to be looping without 3774 * reclaiming anything. 3775 */ 3776 if (!nr_reclaimed && 3777 (next_mz == NULL || 3778 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3779 break; 3780 } while (!nr_reclaimed); 3781 if (next_mz) 3782 css_put(&next_mz->memcg->css); 3783 return nr_reclaimed; 3784 } 3785 3786 /** 3787 * mem_cgroup_force_empty_list - clears the LRU of a group 3788 * @memcg: group to clear 3789 * @node: NUMA node 3790 * @zid: zone id 3791 * @lru: lru to clear 3792 * 3793 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 3794 * reclaim the pages themselves - pages are moved to the parent (or root) 3795 * group.
3796 */ 3797 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3798 int node, int zid, enum lru_list lru) 3799 { 3800 struct lruvec *lruvec; 3801 unsigned long flags; 3802 struct list_head *list; 3803 struct page *busy; 3804 struct zone *zone; 3805 3806 zone = &NODE_DATA(node)->node_zones[zid]; 3807 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 3808 list = &lruvec->lists[lru]; 3809 3810 busy = NULL; 3811 do { 3812 struct page_cgroup *pc; 3813 struct page *page; 3814 3815 spin_lock_irqsave(&zone->lru_lock, flags); 3816 if (list_empty(list)) { 3817 spin_unlock_irqrestore(&zone->lru_lock, flags); 3818 break; 3819 } 3820 page = list_entry(list->prev, struct page, lru); 3821 if (busy == page) { 3822 list_move(&page->lru, list); 3823 busy = NULL; 3824 spin_unlock_irqrestore(&zone->lru_lock, flags); 3825 continue; 3826 } 3827 spin_unlock_irqrestore(&zone->lru_lock, flags); 3828 3829 pc = lookup_page_cgroup(page); 3830 3831 if (mem_cgroup_move_parent(page, pc, memcg)) { 3832 /* found lock contention or "pc" is obsolete. */ 3833 busy = page; 3834 } else 3835 busy = NULL; 3836 cond_resched(); 3837 } while (!list_empty(list)); 3838 } 3839 3840 /* 3841 * make mem_cgroup's charge to be 0 if there is no task by moving 3842 * all the charges and pages to the parent. 3843 * This enables deleting this mem_cgroup. 3844 * 3845 * Caller is responsible for holding css reference on the memcg. 3846 */ 3847 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 3848 { 3849 int node, zid; 3850 u64 usage; 3851 3852 do { 3853 /* This is for making all *used* pages to be on LRU. */ 3854 lru_add_drain_all(); 3855 drain_all_stock_sync(memcg); 3856 mem_cgroup_start_move(memcg); 3857 for_each_node_state(node, N_MEMORY) { 3858 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3859 enum lru_list lru; 3860 for_each_lru(lru) { 3861 mem_cgroup_force_empty_list(memcg, 3862 node, zid, lru); 3863 } 3864 } 3865 } 3866 mem_cgroup_end_move(memcg); 3867 memcg_oom_recover(memcg); 3868 cond_resched(); 3869 3870 /* 3871 * Kernel memory may not necessarily be trackable to a specific 3872 * process. So they are not migrated, and therefore we can't 3873 * expect their value to drop to 0 here. 3874 * Having res filled up with kmem only is enough. 3875 * 3876 * This is a safety check because mem_cgroup_force_empty_list 3877 * could have raced with mem_cgroup_replace_page_cache callers 3878 * so the lru seemed empty but the page could have been added 3879 * right after the check. RES_USAGE should be safe as we always 3880 * charge before adding to the LRU. 3881 */ 3882 usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 3883 res_counter_read_u64(&memcg->kmem, RES_USAGE); 3884 } while (usage > 0); 3885 } 3886 3887 /* 3888 * Test whether @memcg has children, dead or alive. Note that this 3889 * function doesn't care whether @memcg has use_hierarchy enabled and 3890 * returns %true if there are child csses according to the cgroup 3891 * hierarchy. Testing use_hierarchy is the caller's responsiblity. 3892 */ 3893 static inline bool memcg_has_children(struct mem_cgroup *memcg) 3894 { 3895 bool ret; 3896 3897 /* 3898 * The lock does not prevent addition or deletion of children, but 3899 * it prevents a new child from being initialized based on this 3900 * parent in css_online(), so it's enough to decide whether 3901 * hierarchically inherited attributes can still be changed or not. 
3902 */ 3903 lockdep_assert_held(&memcg_create_mutex); 3904 3905 rcu_read_lock(); 3906 ret = css_next_child(NULL, &memcg->css); 3907 rcu_read_unlock(); 3908 return ret; 3909 } 3910 3911 /* 3912 * Reclaims as many pages from the given memcg as possible and moves 3913 * the rest to the parent. 3914 * 3915 * Caller is responsible for holding css reference for memcg. 3916 */ 3917 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3918 { 3919 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3920 3921 /* we call try-to-free pages for make this cgroup empty */ 3922 lru_add_drain_all(); 3923 /* try to free all pages in this cgroup */ 3924 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 3925 int progress; 3926 3927 if (signal_pending(current)) 3928 return -EINTR; 3929 3930 progress = try_to_free_mem_cgroup_pages(memcg, 1, 3931 GFP_KERNEL, true); 3932 if (!progress) { 3933 nr_retries--; 3934 /* maybe some writeback is necessary */ 3935 congestion_wait(BLK_RW_ASYNC, HZ/10); 3936 } 3937 3938 } 3939 3940 return 0; 3941 } 3942 3943 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3944 char *buf, size_t nbytes, 3945 loff_t off) 3946 { 3947 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3948 3949 if (mem_cgroup_is_root(memcg)) 3950 return -EINVAL; 3951 return mem_cgroup_force_empty(memcg) ?: nbytes; 3952 } 3953 3954 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3955 struct cftype *cft) 3956 { 3957 return mem_cgroup_from_css(css)->use_hierarchy; 3958 } 3959 3960 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3961 struct cftype *cft, u64 val) 3962 { 3963 int retval = 0; 3964 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3965 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 3966 3967 mutex_lock(&memcg_create_mutex); 3968 3969 if (memcg->use_hierarchy == val) 3970 goto out; 3971 3972 /* 3973 * If parent's use_hierarchy is set, we can't make any modifications 3974 * in the child subtrees. If it is unset, then the change can 3975 * occur, provided the current cgroup has no children. 3976 * 3977 * For the root cgroup, parent_mem is NULL, we allow value to be 3978 * set if there are no children. 3979 */ 3980 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3981 (val == 1 || val == 0)) { 3982 if (!memcg_has_children(memcg)) 3983 memcg->use_hierarchy = val; 3984 else 3985 retval = -EBUSY; 3986 } else 3987 retval = -EINVAL; 3988 3989 out: 3990 mutex_unlock(&memcg_create_mutex); 3991 3992 return retval; 3993 } 3994 3995 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 3996 enum mem_cgroup_stat_index idx) 3997 { 3998 struct mem_cgroup *iter; 3999 long val = 0; 4000 4001 /* Per-cpu values can be negative, use a signed accumulator */ 4002 for_each_mem_cgroup_tree(iter, memcg) 4003 val += mem_cgroup_read_stat(iter, idx); 4004 4005 if (val < 0) /* race ? */ 4006 val = 0; 4007 return val; 4008 } 4009 4010 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 4011 { 4012 u64 val; 4013 4014 if (!mem_cgroup_is_root(memcg)) { 4015 if (!swap) 4016 return res_counter_read_u64(&memcg->res, RES_USAGE); 4017 else 4018 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 4019 } 4020 4021 /* 4022 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS 4023 * as well as in MEM_CGROUP_STAT_RSS_HUGE. 
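 *
 * Worked example (numbers invented for illustration): with 4K pages, a
 * hierarchy-wide total of 300 cache pages and 200 rss pages gives
 *
 *	val = (300 + 200) << PAGE_SHIFT = 500 * 4096 = 2048000 bytes
 *
 * and with swap accounting enabled the recursive swap count is added
 * before the shift in the same way.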
4024 */ 4025 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 4026 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 4027 4028 if (swap) 4029 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); 4030 4031 return val << PAGE_SHIFT; 4032 } 4033 4034 4035 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4036 struct cftype *cft) 4037 { 4038 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4039 enum res_type type = MEMFILE_TYPE(cft->private); 4040 int name = MEMFILE_ATTR(cft->private); 4041 4042 switch (type) { 4043 case _MEM: 4044 if (name == RES_USAGE) 4045 return mem_cgroup_usage(memcg, false); 4046 return res_counter_read_u64(&memcg->res, name); 4047 case _MEMSWAP: 4048 if (name == RES_USAGE) 4049 return mem_cgroup_usage(memcg, true); 4050 return res_counter_read_u64(&memcg->memsw, name); 4051 case _KMEM: 4052 return res_counter_read_u64(&memcg->kmem, name); 4053 break; 4054 default: 4055 BUG(); 4056 } 4057 } 4058 4059 #ifdef CONFIG_MEMCG_KMEM 4060 /* should be called with activate_kmem_mutex held */ 4061 static int __memcg_activate_kmem(struct mem_cgroup *memcg, 4062 unsigned long long limit) 4063 { 4064 int err = 0; 4065 int memcg_id; 4066 4067 if (memcg_kmem_is_active(memcg)) 4068 return 0; 4069 4070 /* 4071 * We are going to allocate memory for data shared by all memory 4072 * cgroups so let's stop accounting here. 4073 */ 4074 memcg_stop_kmem_account(); 4075 4076 /* 4077 * For simplicity, we won't allow this to be disabled. It also can't 4078 * be changed if the cgroup has children already, or if tasks had 4079 * already joined. 4080 * 4081 * If tasks join before we set the limit, a person looking at 4082 * kmem.usage_in_bytes will have no way to determine when it took 4083 * place, which makes the value quite meaningless. 4084 * 4085 * After it first became limited, changes in the value of the limit are 4086 * of course permitted. 4087 */ 4088 mutex_lock(&memcg_create_mutex); 4089 if (cgroup_has_tasks(memcg->css.cgroup) || 4090 (memcg->use_hierarchy && memcg_has_children(memcg))) 4091 err = -EBUSY; 4092 mutex_unlock(&memcg_create_mutex); 4093 if (err) 4094 goto out; 4095 4096 memcg_id = memcg_alloc_cache_id(); 4097 if (memcg_id < 0) { 4098 err = memcg_id; 4099 goto out; 4100 } 4101 4102 memcg->kmemcg_id = memcg_id; 4103 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4104 4105 /* 4106 * We couldn't have accounted to this cgroup, because it hasn't got the 4107 * active bit set yet, so this should succeed. 4108 */ 4109 err = res_counter_set_limit(&memcg->kmem, limit); 4110 VM_BUG_ON(err); 4111 4112 static_key_slow_inc(&memcg_kmem_enabled_key); 4113 /* 4114 * Setting the active bit after enabling static branching will 4115 * guarantee no one starts accounting before all call sites are 4116 * patched. 
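 *
 * Hedged, simplified sketch of what an accounting site is assumed to
 * check (helper names as used elsewhere in the memcg/kmem code, not
 * defined in this hunk):
 *
 *	if (!memcg_kmem_enabled())
 *		return true;
 *	if (!memcg_kmem_is_active(memcg))
 *		return true;
 *	... only now attempt the actual kmem charge ...
 *
 * Both the static key (incremented above) and the per-group active bit
 * (set below) are observed before charging, which is why the bit is
 * only flipped after the key.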
4117 */ 4118 memcg_kmem_set_active(memcg); 4119 out: 4120 memcg_resume_kmem_account(); 4121 return err; 4122 } 4123 4124 static int memcg_activate_kmem(struct mem_cgroup *memcg, 4125 unsigned long long limit) 4126 { 4127 int ret; 4128 4129 mutex_lock(&activate_kmem_mutex); 4130 ret = __memcg_activate_kmem(memcg, limit); 4131 mutex_unlock(&activate_kmem_mutex); 4132 return ret; 4133 } 4134 4135 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4136 unsigned long long val) 4137 { 4138 int ret; 4139 4140 if (!memcg_kmem_is_active(memcg)) 4141 ret = memcg_activate_kmem(memcg, val); 4142 else 4143 ret = res_counter_set_limit(&memcg->kmem, val); 4144 return ret; 4145 } 4146 4147 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 4148 { 4149 int ret = 0; 4150 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 4151 4152 if (!parent) 4153 return 0; 4154 4155 mutex_lock(&activate_kmem_mutex); 4156 /* 4157 * If the parent cgroup is not kmem-active now, it cannot be activated 4158 * after this point, because it has at least one child already. 4159 */ 4160 if (memcg_kmem_is_active(parent)) 4161 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); 4162 mutex_unlock(&activate_kmem_mutex); 4163 return ret; 4164 } 4165 #else 4166 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4167 unsigned long long val) 4168 { 4169 return -EINVAL; 4170 } 4171 #endif /* CONFIG_MEMCG_KMEM */ 4172 4173 /* 4174 * The user of this function is... 4175 * RES_LIMIT. 4176 */ 4177 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 4178 char *buf, size_t nbytes, loff_t off) 4179 { 4180 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4181 enum res_type type; 4182 int name; 4183 unsigned long long val; 4184 int ret; 4185 4186 buf = strstrip(buf); 4187 type = MEMFILE_TYPE(of_cft(of)->private); 4188 name = MEMFILE_ATTR(of_cft(of)->private); 4189 4190 switch (name) { 4191 case RES_LIMIT: 4192 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4193 ret = -EINVAL; 4194 break; 4195 } 4196 /* This function does all necessary parse...reuse it */ 4197 ret = res_counter_memparse_write_strategy(buf, &val); 4198 if (ret) 4199 break; 4200 if (type == _MEM) 4201 ret = mem_cgroup_resize_limit(memcg, val); 4202 else if (type == _MEMSWAP) 4203 ret = mem_cgroup_resize_memsw_limit(memcg, val); 4204 else if (type == _KMEM) 4205 ret = memcg_update_kmem_limit(memcg, val); 4206 else 4207 return -EINVAL; 4208 break; 4209 case RES_SOFT_LIMIT: 4210 ret = res_counter_memparse_write_strategy(buf, &val); 4211 if (ret) 4212 break; 4213 /* 4214 * For memsw, soft limits are hard to implement in terms 4215 * of semantics, for now, we support soft limits for 4216 * control without swap 4217 */ 4218 if (type == _MEM) 4219 ret = res_counter_set_soft_limit(&memcg->res, val); 4220 else 4221 ret = -EINVAL; 4222 break; 4223 default: 4224 ret = -EINVAL; /* should be BUG() ? 
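 *
 * Input note, hedged (the parsing happens in
 * res_counter_memparse_write_strategy(), which is not part of this
 * file): the limit files handled above accept plain byte counts as
 * well as K/M/G suffixed values, and "-1" is conventionally used for
 * "no limit", e.g.
 *
 *	write(fd, "512M", 4);
 *	write(fd, "-1", 2);
 *
 * A parse failure is reported from the RES_LIMIT/RES_SOFT_LIMIT cases
 * above; this default case only catches unknown attributes.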
*/ 4225 break; 4226 } 4227 return ret ?: nbytes; 4228 } 4229 4230 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 4231 unsigned long long *mem_limit, unsigned long long *memsw_limit) 4232 { 4233 unsigned long long min_limit, min_memsw_limit, tmp; 4234 4235 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4236 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4237 if (!memcg->use_hierarchy) 4238 goto out; 4239 4240 while (memcg->css.parent) { 4241 memcg = mem_cgroup_from_css(memcg->css.parent); 4242 if (!memcg->use_hierarchy) 4243 break; 4244 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 4245 min_limit = min(min_limit, tmp); 4246 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4247 min_memsw_limit = min(min_memsw_limit, tmp); 4248 } 4249 out: 4250 *mem_limit = min_limit; 4251 *memsw_limit = min_memsw_limit; 4252 } 4253 4254 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 4255 size_t nbytes, loff_t off) 4256 { 4257 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4258 int name; 4259 enum res_type type; 4260 4261 type = MEMFILE_TYPE(of_cft(of)->private); 4262 name = MEMFILE_ATTR(of_cft(of)->private); 4263 4264 switch (name) { 4265 case RES_MAX_USAGE: 4266 if (type == _MEM) 4267 res_counter_reset_max(&memcg->res); 4268 else if (type == _MEMSWAP) 4269 res_counter_reset_max(&memcg->memsw); 4270 else if (type == _KMEM) 4271 res_counter_reset_max(&memcg->kmem); 4272 else 4273 return -EINVAL; 4274 break; 4275 case RES_FAILCNT: 4276 if (type == _MEM) 4277 res_counter_reset_failcnt(&memcg->res); 4278 else if (type == _MEMSWAP) 4279 res_counter_reset_failcnt(&memcg->memsw); 4280 else if (type == _KMEM) 4281 res_counter_reset_failcnt(&memcg->kmem); 4282 else 4283 return -EINVAL; 4284 break; 4285 } 4286 4287 return nbytes; 4288 } 4289 4290 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 4291 struct cftype *cft) 4292 { 4293 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 4294 } 4295 4296 #ifdef CONFIG_MMU 4297 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4298 struct cftype *cft, u64 val) 4299 { 4300 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4301 4302 if (val >= (1 << NR_MOVE_TYPE)) 4303 return -EINVAL; 4304 4305 /* 4306 * No kind of locking is needed in here, because ->can_attach() will 4307 * check this value once in the beginning of the process, and then carry 4308 * on with stale data. This means that changes to this value will only 4309 * affect task migrations starting after the change. 
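 *
 * Usage illustration, hedged (the bit meanings follow the memcg
 * documentation rather than anything defined in this function): bit 0
 * selects anonymous pages and bit 1 file-backed pages, so with fd open
 * on memory.move_charge_at_immigrate
 *
 *	write(fd, "3", 1);
 *
 * enables both kinds of charge moving for tasks migrated into the
 * group after the write, as the note above explains.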
4310 */ 4311 memcg->move_charge_at_immigrate = val; 4312 return 0; 4313 } 4314 #else 4315 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4316 struct cftype *cft, u64 val) 4317 { 4318 return -ENOSYS; 4319 } 4320 #endif 4321 4322 #ifdef CONFIG_NUMA 4323 static int memcg_numa_stat_show(struct seq_file *m, void *v) 4324 { 4325 struct numa_stat { 4326 const char *name; 4327 unsigned int lru_mask; 4328 }; 4329 4330 static const struct numa_stat stats[] = { 4331 { "total", LRU_ALL }, 4332 { "file", LRU_ALL_FILE }, 4333 { "anon", LRU_ALL_ANON }, 4334 { "unevictable", BIT(LRU_UNEVICTABLE) }, 4335 }; 4336 const struct numa_stat *stat; 4337 int nid; 4338 unsigned long nr; 4339 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4340 4341 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4342 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 4343 seq_printf(m, "%s=%lu", stat->name, nr); 4344 for_each_node_state(nid, N_MEMORY) { 4345 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4346 stat->lru_mask); 4347 seq_printf(m, " N%d=%lu", nid, nr); 4348 } 4349 seq_putc(m, '\n'); 4350 } 4351 4352 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4353 struct mem_cgroup *iter; 4354 4355 nr = 0; 4356 for_each_mem_cgroup_tree(iter, memcg) 4357 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 4358 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 4359 for_each_node_state(nid, N_MEMORY) { 4360 nr = 0; 4361 for_each_mem_cgroup_tree(iter, memcg) 4362 nr += mem_cgroup_node_nr_lru_pages( 4363 iter, nid, stat->lru_mask); 4364 seq_printf(m, " N%d=%lu", nid, nr); 4365 } 4366 seq_putc(m, '\n'); 4367 } 4368 4369 return 0; 4370 } 4371 #endif /* CONFIG_NUMA */ 4372 4373 static inline void mem_cgroup_lru_names_not_uptodate(void) 4374 { 4375 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4376 } 4377 4378 static int memcg_stat_show(struct seq_file *m, void *v) 4379 { 4380 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4381 struct mem_cgroup *mi; 4382 unsigned int i; 4383 4384 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4385 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 4386 continue; 4387 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4388 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4389 } 4390 4391 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 4392 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 4393 mem_cgroup_read_events(memcg, i)); 4394 4395 for (i = 0; i < NR_LRU_LISTS; i++) 4396 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 4397 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 4398 4399 /* Hierarchical information */ 4400 { 4401 unsigned long long limit, memsw_limit; 4402 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4403 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 4404 if (do_swap_account) 4405 seq_printf(m, "hierarchical_memsw_limit %llu\n", 4406 memsw_limit); 4407 } 4408 4409 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4410 long long val = 0; 4411 4412 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 4413 continue; 4414 for_each_mem_cgroup_tree(mi, memcg) 4415 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4416 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 4417 } 4418 4419 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 4420 unsigned long long val = 0; 4421 4422 for_each_mem_cgroup_tree(mi, memcg) 4423 val += mem_cgroup_read_events(mi, i); 4424 seq_printf(m, "total_%s %llu\n", 4425 mem_cgroup_events_names[i], val); 4426 } 4427 4428 for (i = 
0; i < NR_LRU_LISTS; i++) { 4429 unsigned long long val = 0; 4430 4431 for_each_mem_cgroup_tree(mi, memcg) 4432 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 4433 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 4434 } 4435 4436 #ifdef CONFIG_DEBUG_VM 4437 { 4438 int nid, zid; 4439 struct mem_cgroup_per_zone *mz; 4440 struct zone_reclaim_stat *rstat; 4441 unsigned long recent_rotated[2] = {0, 0}; 4442 unsigned long recent_scanned[2] = {0, 0}; 4443 4444 for_each_online_node(nid) 4445 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4446 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 4447 rstat = &mz->lruvec.reclaim_stat; 4448 4449 recent_rotated[0] += rstat->recent_rotated[0]; 4450 recent_rotated[1] += rstat->recent_rotated[1]; 4451 recent_scanned[0] += rstat->recent_scanned[0]; 4452 recent_scanned[1] += rstat->recent_scanned[1]; 4453 } 4454 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 4455 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 4456 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 4457 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 4458 } 4459 #endif 4460 4461 return 0; 4462 } 4463 4464 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4465 struct cftype *cft) 4466 { 4467 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4468 4469 return mem_cgroup_swappiness(memcg); 4470 } 4471 4472 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4473 struct cftype *cft, u64 val) 4474 { 4475 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4476 4477 if (val > 100) 4478 return -EINVAL; 4479 4480 if (css->parent) 4481 memcg->swappiness = val; 4482 else 4483 vm_swappiness = val; 4484 4485 return 0; 4486 } 4487 4488 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4489 { 4490 struct mem_cgroup_threshold_ary *t; 4491 u64 usage; 4492 int i; 4493 4494 rcu_read_lock(); 4495 if (!swap) 4496 t = rcu_dereference(memcg->thresholds.primary); 4497 else 4498 t = rcu_dereference(memcg->memsw_thresholds.primary); 4499 4500 if (!t) 4501 goto unlock; 4502 4503 usage = mem_cgroup_usage(memcg, swap); 4504 4505 /* 4506 * current_threshold points to threshold just below or equal to usage. 4507 * If it's not true, a threshold was crossed after last 4508 * call of __mem_cgroup_threshold(). 4509 */ 4510 i = t->current_threshold; 4511 4512 /* 4513 * Iterate backward over array of thresholds starting from 4514 * current_threshold and check if a threshold is crossed. 4515 * If none of thresholds below usage is crossed, we read 4516 * only one element of the array here. 4517 */ 4518 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4519 eventfd_signal(t->entries[i].eventfd, 1); 4520 4521 /* i = current_threshold + 1 */ 4522 i++; 4523 4524 /* 4525 * Iterate forward over array of thresholds starting from 4526 * current_threshold+1 and check if a threshold is crossed. 4527 * If none of thresholds above usage is crossed, we read 4528 * only one element of the array here. 
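 *
 * Worked example (made-up numbers): entries[] = {4M, 8M, 16M} sorted
 * ascending, current_threshold == 1 (the 8M entry), and usage has just
 * grown to 17M.  The backward scan above finds nothing to signal
 * (8M is not above 17M), the forward scan below signals the 16M
 * eventfd and stops, leaving current_threshold == 2.  If usage later
 * drops to 3M, the backward scan signals 16M, 8M and 4M in turn and
 * current_threshold ends up at -1, i.e. "below the smallest threshold".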
4529 */ 4530 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4531 eventfd_signal(t->entries[i].eventfd, 1); 4532 4533 /* Update current_threshold */ 4534 t->current_threshold = i - 1; 4535 unlock: 4536 rcu_read_unlock(); 4537 } 4538 4539 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4540 { 4541 while (memcg) { 4542 __mem_cgroup_threshold(memcg, false); 4543 if (do_swap_account) 4544 __mem_cgroup_threshold(memcg, true); 4545 4546 memcg = parent_mem_cgroup(memcg); 4547 } 4548 } 4549 4550 static int compare_thresholds(const void *a, const void *b) 4551 { 4552 const struct mem_cgroup_threshold *_a = a; 4553 const struct mem_cgroup_threshold *_b = b; 4554 4555 if (_a->threshold > _b->threshold) 4556 return 1; 4557 4558 if (_a->threshold < _b->threshold) 4559 return -1; 4560 4561 return 0; 4562 } 4563 4564 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4565 { 4566 struct mem_cgroup_eventfd_list *ev; 4567 4568 spin_lock(&memcg_oom_lock); 4569 4570 list_for_each_entry(ev, &memcg->oom_notify, list) 4571 eventfd_signal(ev->eventfd, 1); 4572 4573 spin_unlock(&memcg_oom_lock); 4574 return 0; 4575 } 4576 4577 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4578 { 4579 struct mem_cgroup *iter; 4580 4581 for_each_mem_cgroup_tree(iter, memcg) 4582 mem_cgroup_oom_notify_cb(iter); 4583 } 4584 4585 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4586 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4587 { 4588 struct mem_cgroup_thresholds *thresholds; 4589 struct mem_cgroup_threshold_ary *new; 4590 u64 threshold, usage; 4591 int i, size, ret; 4592 4593 ret = res_counter_memparse_write_strategy(args, &threshold); 4594 if (ret) 4595 return ret; 4596 4597 mutex_lock(&memcg->thresholds_lock); 4598 4599 if (type == _MEM) { 4600 thresholds = &memcg->thresholds; 4601 usage = mem_cgroup_usage(memcg, false); 4602 } else if (type == _MEMSWAP) { 4603 thresholds = &memcg->memsw_thresholds; 4604 usage = mem_cgroup_usage(memcg, true); 4605 } else 4606 BUG(); 4607 4608 /* Check if a threshold crossed before adding a new one */ 4609 if (thresholds->primary) 4610 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4611 4612 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4613 4614 /* Allocate memory for new array of thresholds */ 4615 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4616 GFP_KERNEL); 4617 if (!new) { 4618 ret = -ENOMEM; 4619 goto unlock; 4620 } 4621 new->size = size; 4622 4623 /* Copy thresholds (if any) to new array */ 4624 if (thresholds->primary) { 4625 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4626 sizeof(struct mem_cgroup_threshold)); 4627 } 4628 4629 /* Add new threshold */ 4630 new->entries[size - 1].eventfd = eventfd; 4631 new->entries[size - 1].threshold = threshold; 4632 4633 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4634 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4635 compare_thresholds, NULL); 4636 4637 /* Find current threshold */ 4638 new->current_threshold = -1; 4639 for (i = 0; i < size; i++) { 4640 if (new->entries[i].threshold <= usage) { 4641 /* 4642 * new->current_threshold will not be used until 4643 * rcu_assign_pointer(), so it's safe to increment 4644 * it here. 
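 *
 * Example with made-up numbers: after sorting, new->entries[] holds
 * {4M, 8M, 16M} and current usage is 9M.  The surrounding loop
 * increments current_threshold once for 4M and once for 8M, then
 * breaks at 16M, leaving it at 1 -- the largest threshold that is
 * still <= usage, which is exactly the invariant
 * __mem_cgroup_threshold() relies on.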
4645 */ 4646 ++new->current_threshold; 4647 } else 4648 break; 4649 } 4650 4651 /* Free old spare buffer and save old primary buffer as spare */ 4652 kfree(thresholds->spare); 4653 thresholds->spare = thresholds->primary; 4654 4655 rcu_assign_pointer(thresholds->primary, new); 4656 4657 /* To be sure that nobody uses thresholds */ 4658 synchronize_rcu(); 4659 4660 unlock: 4661 mutex_unlock(&memcg->thresholds_lock); 4662 4663 return ret; 4664 } 4665 4666 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4667 struct eventfd_ctx *eventfd, const char *args) 4668 { 4669 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4670 } 4671 4672 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4673 struct eventfd_ctx *eventfd, const char *args) 4674 { 4675 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4676 } 4677 4678 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4679 struct eventfd_ctx *eventfd, enum res_type type) 4680 { 4681 struct mem_cgroup_thresholds *thresholds; 4682 struct mem_cgroup_threshold_ary *new; 4683 u64 usage; 4684 int i, j, size; 4685 4686 mutex_lock(&memcg->thresholds_lock); 4687 4688 if (type == _MEM) { 4689 thresholds = &memcg->thresholds; 4690 usage = mem_cgroup_usage(memcg, false); 4691 } else if (type == _MEMSWAP) { 4692 thresholds = &memcg->memsw_thresholds; 4693 usage = mem_cgroup_usage(memcg, true); 4694 } else 4695 BUG(); 4696 4697 if (!thresholds->primary) 4698 goto unlock; 4699 4700 /* Check if a threshold crossed before removing */ 4701 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4702 4703 /* Calculate new number of threshold */ 4704 size = 0; 4705 for (i = 0; i < thresholds->primary->size; i++) { 4706 if (thresholds->primary->entries[i].eventfd != eventfd) 4707 size++; 4708 } 4709 4710 new = thresholds->spare; 4711 4712 /* Set thresholds array to NULL if we don't have thresholds */ 4713 if (!size) { 4714 kfree(new); 4715 new = NULL; 4716 goto swap_buffers; 4717 } 4718 4719 new->size = size; 4720 4721 /* Copy thresholds and find current threshold */ 4722 new->current_threshold = -1; 4723 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4724 if (thresholds->primary->entries[i].eventfd == eventfd) 4725 continue; 4726 4727 new->entries[j] = thresholds->primary->entries[i]; 4728 if (new->entries[j].threshold <= usage) { 4729 /* 4730 * new->current_threshold will not be used 4731 * until rcu_assign_pointer(), so it's safe to increment 4732 * it here. 
4733 */ 4734 ++new->current_threshold; 4735 } 4736 j++; 4737 } 4738 4739 swap_buffers: 4740 /* Swap primary and spare array */ 4741 thresholds->spare = thresholds->primary; 4742 /* If all events are unregistered, free the spare array */ 4743 if (!new) { 4744 kfree(thresholds->spare); 4745 thresholds->spare = NULL; 4746 } 4747 4748 rcu_assign_pointer(thresholds->primary, new); 4749 4750 /* To be sure that nobody uses thresholds */ 4751 synchronize_rcu(); 4752 unlock: 4753 mutex_unlock(&memcg->thresholds_lock); 4754 } 4755 4756 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4757 struct eventfd_ctx *eventfd) 4758 { 4759 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4760 } 4761 4762 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4763 struct eventfd_ctx *eventfd) 4764 { 4765 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4766 } 4767 4768 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4769 struct eventfd_ctx *eventfd, const char *args) 4770 { 4771 struct mem_cgroup_eventfd_list *event; 4772 4773 event = kmalloc(sizeof(*event), GFP_KERNEL); 4774 if (!event) 4775 return -ENOMEM; 4776 4777 spin_lock(&memcg_oom_lock); 4778 4779 event->eventfd = eventfd; 4780 list_add(&event->list, &memcg->oom_notify); 4781 4782 /* already in OOM ? */ 4783 if (atomic_read(&memcg->under_oom)) 4784 eventfd_signal(eventfd, 1); 4785 spin_unlock(&memcg_oom_lock); 4786 4787 return 0; 4788 } 4789 4790 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4791 struct eventfd_ctx *eventfd) 4792 { 4793 struct mem_cgroup_eventfd_list *ev, *tmp; 4794 4795 spin_lock(&memcg_oom_lock); 4796 4797 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4798 if (ev->eventfd == eventfd) { 4799 list_del(&ev->list); 4800 kfree(ev); 4801 } 4802 } 4803 4804 spin_unlock(&memcg_oom_lock); 4805 } 4806 4807 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4808 { 4809 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 4810 4811 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4812 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 4813 return 0; 4814 } 4815 4816 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4817 struct cftype *cft, u64 val) 4818 { 4819 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4820 4821 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4822 if (!css->parent || !((val == 0) || (val == 1))) 4823 return -EINVAL; 4824 4825 memcg->oom_kill_disable = val; 4826 if (!val) 4827 memcg_oom_recover(memcg); 4828 4829 return 0; 4830 } 4831 4832 #ifdef CONFIG_MEMCG_KMEM 4833 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4834 { 4835 int ret; 4836 4837 memcg->kmemcg_id = -1; 4838 ret = memcg_propagate_kmem(memcg); 4839 if (ret) 4840 return ret; 4841 4842 return mem_cgroup_sockets_init(memcg, ss); 4843 } 4844 4845 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4846 { 4847 mem_cgroup_sockets_destroy(memcg); 4848 } 4849 4850 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 4851 { 4852 if (!memcg_kmem_is_active(memcg)) 4853 return; 4854 4855 /* 4856 * kmem charges can outlive the cgroup. In the case of slab 4857 * pages, for instance, a page contain objects from various 4858 * processes. 
As we prevent from taking a reference for every 4859 * such allocation we have to be careful when doing uncharge 4860 * (see memcg_uncharge_kmem) and here during offlining. 4861 * 4862 * The idea is that that only the _last_ uncharge which sees 4863 * the dead memcg will drop the last reference. An additional 4864 * reference is taken here before the group is marked dead 4865 * which is then paired with css_put during uncharge resp. here. 4866 * 4867 * Although this might sound strange as this path is called from 4868 * css_offline() when the referencemight have dropped down to 0 and 4869 * shouldn't be incremented anymore (css_tryget_online() would 4870 * fail) we do not have other options because of the kmem 4871 * allocations lifetime. 4872 */ 4873 css_get(&memcg->css); 4874 4875 memcg_kmem_mark_dead(memcg); 4876 4877 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 4878 return; 4879 4880 if (memcg_kmem_test_and_clear_dead(memcg)) 4881 css_put(&memcg->css); 4882 } 4883 #else 4884 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4885 { 4886 return 0; 4887 } 4888 4889 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4890 { 4891 } 4892 4893 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 4894 { 4895 } 4896 #endif 4897 4898 /* 4899 * DO NOT USE IN NEW FILES. 4900 * 4901 * "cgroup.event_control" implementation. 4902 * 4903 * This is way over-engineered. It tries to support fully configurable 4904 * events for each user. Such level of flexibility is completely 4905 * unnecessary especially in the light of the planned unified hierarchy. 4906 * 4907 * Please deprecate this and replace with something simpler if at all 4908 * possible. 4909 */ 4910 4911 /* 4912 * Unregister event and free resources. 4913 * 4914 * Gets called from workqueue. 4915 */ 4916 static void memcg_event_remove(struct work_struct *work) 4917 { 4918 struct mem_cgroup_event *event = 4919 container_of(work, struct mem_cgroup_event, remove); 4920 struct mem_cgroup *memcg = event->memcg; 4921 4922 remove_wait_queue(event->wqh, &event->wait); 4923 4924 event->unregister_event(memcg, event->eventfd); 4925 4926 /* Notify userspace the event is going away. */ 4927 eventfd_signal(event->eventfd, 1); 4928 4929 eventfd_ctx_put(event->eventfd); 4930 kfree(event); 4931 css_put(&memcg->css); 4932 } 4933 4934 /* 4935 * Gets called on POLLHUP on eventfd when user closes it. 4936 * 4937 * Called with wqh->lock held and interrupts disabled. 4938 */ 4939 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 4940 int sync, void *key) 4941 { 4942 struct mem_cgroup_event *event = 4943 container_of(wait, struct mem_cgroup_event, wait); 4944 struct mem_cgroup *memcg = event->memcg; 4945 unsigned long flags = (unsigned long)key; 4946 4947 if (flags & POLLHUP) { 4948 /* 4949 * If the event has been detached at cgroup removal, we 4950 * can simply return knowing the other side will cleanup 4951 * for us. 4952 * 4953 * We can't race against event freeing since the other 4954 * side will require wqh->lock via remove_wait_queue(), 4955 * which we hold. 4956 */ 4957 spin_lock(&memcg->event_list_lock); 4958 if (!list_empty(&event->list)) { 4959 list_del_init(&event->list); 4960 /* 4961 * We are in atomic context, but cgroup_event_remove() 4962 * may sleep, so we have to call it in workqueue. 
4963 */ 4964 schedule_work(&event->remove); 4965 } 4966 spin_unlock(&memcg->event_list_lock); 4967 } 4968 4969 return 0; 4970 } 4971 4972 static void memcg_event_ptable_queue_proc(struct file *file, 4973 wait_queue_head_t *wqh, poll_table *pt) 4974 { 4975 struct mem_cgroup_event *event = 4976 container_of(pt, struct mem_cgroup_event, pt); 4977 4978 event->wqh = wqh; 4979 add_wait_queue(wqh, &event->wait); 4980 } 4981 4982 /* 4983 * DO NOT USE IN NEW FILES. 4984 * 4985 * Parse input and register new cgroup event handler. 4986 * 4987 * Input must be in format '<event_fd> <control_fd> <args>'. 4988 * Interpretation of args is defined by control file implementation. 4989 */ 4990 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4991 char *buf, size_t nbytes, loff_t off) 4992 { 4993 struct cgroup_subsys_state *css = of_css(of); 4994 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4995 struct mem_cgroup_event *event; 4996 struct cgroup_subsys_state *cfile_css; 4997 unsigned int efd, cfd; 4998 struct fd efile; 4999 struct fd cfile; 5000 const char *name; 5001 char *endp; 5002 int ret; 5003 5004 buf = strstrip(buf); 5005 5006 efd = simple_strtoul(buf, &endp, 10); 5007 if (*endp != ' ') 5008 return -EINVAL; 5009 buf = endp + 1; 5010 5011 cfd = simple_strtoul(buf, &endp, 10); 5012 if ((*endp != ' ') && (*endp != '\0')) 5013 return -EINVAL; 5014 buf = endp + 1; 5015 5016 event = kzalloc(sizeof(*event), GFP_KERNEL); 5017 if (!event) 5018 return -ENOMEM; 5019 5020 event->memcg = memcg; 5021 INIT_LIST_HEAD(&event->list); 5022 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 5023 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 5024 INIT_WORK(&event->remove, memcg_event_remove); 5025 5026 efile = fdget(efd); 5027 if (!efile.file) { 5028 ret = -EBADF; 5029 goto out_kfree; 5030 } 5031 5032 event->eventfd = eventfd_ctx_fileget(efile.file); 5033 if (IS_ERR(event->eventfd)) { 5034 ret = PTR_ERR(event->eventfd); 5035 goto out_put_efile; 5036 } 5037 5038 cfile = fdget(cfd); 5039 if (!cfile.file) { 5040 ret = -EBADF; 5041 goto out_put_eventfd; 5042 } 5043 5044 /* the process need read permission on control file */ 5045 /* AV: shouldn't we check that it's been opened for read instead? */ 5046 ret = inode_permission(file_inode(cfile.file), MAY_READ); 5047 if (ret < 0) 5048 goto out_put_cfile; 5049 5050 /* 5051 * Determine the event callbacks and set them in @event. This used 5052 * to be done via struct cftype but cgroup core no longer knows 5053 * about these events. The following is crude but the whole thing 5054 * is for compatibility anyway. 5055 * 5056 * DO NOT ADD NEW FILES. 
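 *
 * Userspace usage sketch (an illustration only; mount point, group
 * name and threshold value are assumptions): to be notified when usage
 * crosses 48M, a monitor would do roughly
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ecd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
 *		       O_WRONLY);
 *	char line[64];
 *	int n = snprintf(line, sizeof(line), "%d %d 50331648", efd, cfd);
 *	uint64_t hits;
 *
 *	write(ecd, line, n);
 *	read(efd, &hits, sizeof(hits));
 *
 * The write exercises the "<event_fd> <control_fd> <args>" format
 * parsed above and ends up in mem_cgroup_usage_register_event(); the
 * read blocks until the 48M threshold is crossed.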
5057 */ 5058 name = cfile.file->f_dentry->d_name.name; 5059 5060 if (!strcmp(name, "memory.usage_in_bytes")) { 5061 event->register_event = mem_cgroup_usage_register_event; 5062 event->unregister_event = mem_cgroup_usage_unregister_event; 5063 } else if (!strcmp(name, "memory.oom_control")) { 5064 event->register_event = mem_cgroup_oom_register_event; 5065 event->unregister_event = mem_cgroup_oom_unregister_event; 5066 } else if (!strcmp(name, "memory.pressure_level")) { 5067 event->register_event = vmpressure_register_event; 5068 event->unregister_event = vmpressure_unregister_event; 5069 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 5070 event->register_event = memsw_cgroup_usage_register_event; 5071 event->unregister_event = memsw_cgroup_usage_unregister_event; 5072 } else { 5073 ret = -EINVAL; 5074 goto out_put_cfile; 5075 } 5076 5077 /* 5078 * Verify @cfile should belong to @css. Also, remaining events are 5079 * automatically removed on cgroup destruction but the removal is 5080 * asynchronous, so take an extra ref on @css. 5081 */ 5082 cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, 5083 &memory_cgrp_subsys); 5084 ret = -EINVAL; 5085 if (IS_ERR(cfile_css)) 5086 goto out_put_cfile; 5087 if (cfile_css != css) { 5088 css_put(cfile_css); 5089 goto out_put_cfile; 5090 } 5091 5092 ret = event->register_event(memcg, event->eventfd, buf); 5093 if (ret) 5094 goto out_put_css; 5095 5096 efile.file->f_op->poll(efile.file, &event->pt); 5097 5098 spin_lock(&memcg->event_list_lock); 5099 list_add(&event->list, &memcg->event_list); 5100 spin_unlock(&memcg->event_list_lock); 5101 5102 fdput(cfile); 5103 fdput(efile); 5104 5105 return nbytes; 5106 5107 out_put_css: 5108 css_put(css); 5109 out_put_cfile: 5110 fdput(cfile); 5111 out_put_eventfd: 5112 eventfd_ctx_put(event->eventfd); 5113 out_put_efile: 5114 fdput(efile); 5115 out_kfree: 5116 kfree(event); 5117 5118 return ret; 5119 } 5120 5121 static struct cftype mem_cgroup_files[] = { 5122 { 5123 .name = "usage_in_bytes", 5124 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5125 .read_u64 = mem_cgroup_read_u64, 5126 }, 5127 { 5128 .name = "max_usage_in_bytes", 5129 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5130 .write = mem_cgroup_reset, 5131 .read_u64 = mem_cgroup_read_u64, 5132 }, 5133 { 5134 .name = "limit_in_bytes", 5135 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5136 .write = mem_cgroup_write, 5137 .read_u64 = mem_cgroup_read_u64, 5138 }, 5139 { 5140 .name = "soft_limit_in_bytes", 5141 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5142 .write = mem_cgroup_write, 5143 .read_u64 = mem_cgroup_read_u64, 5144 }, 5145 { 5146 .name = "failcnt", 5147 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5148 .write = mem_cgroup_reset, 5149 .read_u64 = mem_cgroup_read_u64, 5150 }, 5151 { 5152 .name = "stat", 5153 .seq_show = memcg_stat_show, 5154 }, 5155 { 5156 .name = "force_empty", 5157 .write = mem_cgroup_force_empty_write, 5158 }, 5159 { 5160 .name = "use_hierarchy", 5161 .write_u64 = mem_cgroup_hierarchy_write, 5162 .read_u64 = mem_cgroup_hierarchy_read, 5163 }, 5164 { 5165 .name = "cgroup.event_control", /* XXX: for compat */ 5166 .write = memcg_write_event_control, 5167 .flags = CFTYPE_NO_PREFIX, 5168 .mode = S_IWUGO, 5169 }, 5170 { 5171 .name = "swappiness", 5172 .read_u64 = mem_cgroup_swappiness_read, 5173 .write_u64 = mem_cgroup_swappiness_write, 5174 }, 5175 { 5176 .name = "move_charge_at_immigrate", 5177 .read_u64 = mem_cgroup_move_charge_read, 5178 .write_u64 = mem_cgroup_move_charge_write, 5179 }, 
5180 { 5181 .name = "oom_control", 5182 .seq_show = mem_cgroup_oom_control_read, 5183 .write_u64 = mem_cgroup_oom_control_write, 5184 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5185 }, 5186 { 5187 .name = "pressure_level", 5188 }, 5189 #ifdef CONFIG_NUMA 5190 { 5191 .name = "numa_stat", 5192 .seq_show = memcg_numa_stat_show, 5193 }, 5194 #endif 5195 #ifdef CONFIG_MEMCG_KMEM 5196 { 5197 .name = "kmem.limit_in_bytes", 5198 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5199 .write = mem_cgroup_write, 5200 .read_u64 = mem_cgroup_read_u64, 5201 }, 5202 { 5203 .name = "kmem.usage_in_bytes", 5204 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5205 .read_u64 = mem_cgroup_read_u64, 5206 }, 5207 { 5208 .name = "kmem.failcnt", 5209 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5210 .write = mem_cgroup_reset, 5211 .read_u64 = mem_cgroup_read_u64, 5212 }, 5213 { 5214 .name = "kmem.max_usage_in_bytes", 5215 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5216 .write = mem_cgroup_reset, 5217 .read_u64 = mem_cgroup_read_u64, 5218 }, 5219 #ifdef CONFIG_SLABINFO 5220 { 5221 .name = "kmem.slabinfo", 5222 .seq_show = mem_cgroup_slabinfo_read, 5223 }, 5224 #endif 5225 #endif 5226 { }, /* terminate */ 5227 }; 5228 5229 #ifdef CONFIG_MEMCG_SWAP 5230 static struct cftype memsw_cgroup_files[] = { 5231 { 5232 .name = "memsw.usage_in_bytes", 5233 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 5234 .read_u64 = mem_cgroup_read_u64, 5235 }, 5236 { 5237 .name = "memsw.max_usage_in_bytes", 5238 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 5239 .write = mem_cgroup_reset, 5240 .read_u64 = mem_cgroup_read_u64, 5241 }, 5242 { 5243 .name = "memsw.limit_in_bytes", 5244 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 5245 .write = mem_cgroup_write, 5246 .read_u64 = mem_cgroup_read_u64, 5247 }, 5248 { 5249 .name = "memsw.failcnt", 5250 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 5251 .write = mem_cgroup_reset, 5252 .read_u64 = mem_cgroup_read_u64, 5253 }, 5254 { }, /* terminate */ 5255 }; 5256 #endif 5257 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5258 { 5259 struct mem_cgroup_per_node *pn; 5260 struct mem_cgroup_per_zone *mz; 5261 int zone, tmp = node; 5262 /* 5263 * This routine is called against possible nodes. 5264 * But it's BUG to call kmalloc() against offline node. 5265 * 5266 * TODO: this routine can waste much memory for nodes which will 5267 * never be onlined. It's better to use memory hotplug callback 5268 * function. 
5269 */ 5270 if (!node_state(node, N_NORMAL_MEMORY)) 5271 tmp = -1; 5272 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 5273 if (!pn) 5274 return 1; 5275 5276 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5277 mz = &pn->zoneinfo[zone]; 5278 lruvec_init(&mz->lruvec); 5279 mz->usage_in_excess = 0; 5280 mz->on_tree = false; 5281 mz->memcg = memcg; 5282 } 5283 memcg->nodeinfo[node] = pn; 5284 return 0; 5285 } 5286 5287 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5288 { 5289 kfree(memcg->nodeinfo[node]); 5290 } 5291 5292 static struct mem_cgroup *mem_cgroup_alloc(void) 5293 { 5294 struct mem_cgroup *memcg; 5295 size_t size; 5296 5297 size = sizeof(struct mem_cgroup); 5298 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 5299 5300 memcg = kzalloc(size, GFP_KERNEL); 5301 if (!memcg) 5302 return NULL; 5303 5304 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 5305 if (!memcg->stat) 5306 goto out_free; 5307 spin_lock_init(&memcg->pcp_counter_lock); 5308 return memcg; 5309 5310 out_free: 5311 kfree(memcg); 5312 return NULL; 5313 } 5314 5315 /* 5316 * At destroying mem_cgroup, references from swap_cgroup can remain. 5317 * (scanning all at force_empty is too costly...) 5318 * 5319 * Instead of clearing all references at force_empty, we remember 5320 * the number of reference from swap_cgroup and free mem_cgroup when 5321 * it goes down to 0. 5322 * 5323 * Removal of cgroup itself succeeds regardless of refs from swap. 5324 */ 5325 5326 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5327 { 5328 int node; 5329 5330 mem_cgroup_remove_from_trees(memcg); 5331 5332 for_each_node(node) 5333 free_mem_cgroup_per_zone_info(memcg, node); 5334 5335 free_percpu(memcg->stat); 5336 5337 /* 5338 * We need to make sure that (at least for now), the jump label 5339 * destruction code runs outside of the cgroup lock. This is because 5340 * get_online_cpus(), which is called from the static_branch update, 5341 * can't be called inside the cgroup_lock. cpusets are the ones 5342 * enforcing this dependency, so if they ever change, we might as well. 5343 * 5344 * schedule_work() will guarantee this happens. Be careful if you need 5345 * to move this code around, and make sure it is outside 5346 * the cgroup_lock. 5347 */ 5348 disarm_static_keys(memcg); 5349 kfree(memcg); 5350 } 5351 5352 /* 5353 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
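 *
 * Typical use is walking towards the root, as mem_cgroup_threshold()
 * does earlier in this file:
 *
 *	while (memcg) {
 *		do_something(memcg);
 *		memcg = parent_mem_cgroup(memcg);
 *	}
 *
 * (do_something() is a placeholder.)  The walk terminates at the root
 * group, whose res counter has no parent.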
5354 */ 5355 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 5356 { 5357 if (!memcg->res.parent) 5358 return NULL; 5359 return mem_cgroup_from_res_counter(memcg->res.parent, res); 5360 } 5361 EXPORT_SYMBOL(parent_mem_cgroup); 5362 5363 static void __init mem_cgroup_soft_limit_tree_init(void) 5364 { 5365 struct mem_cgroup_tree_per_node *rtpn; 5366 struct mem_cgroup_tree_per_zone *rtpz; 5367 int tmp, node, zone; 5368 5369 for_each_node(node) { 5370 tmp = node; 5371 if (!node_state(node, N_NORMAL_MEMORY)) 5372 tmp = -1; 5373 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 5374 BUG_ON(!rtpn); 5375 5376 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5377 5378 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5379 rtpz = &rtpn->rb_tree_per_zone[zone]; 5380 rtpz->rb_root = RB_ROOT; 5381 spin_lock_init(&rtpz->lock); 5382 } 5383 } 5384 } 5385 5386 static struct cgroup_subsys_state * __ref 5387 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5388 { 5389 struct mem_cgroup *memcg; 5390 long error = -ENOMEM; 5391 int node; 5392 5393 memcg = mem_cgroup_alloc(); 5394 if (!memcg) 5395 return ERR_PTR(error); 5396 5397 for_each_node(node) 5398 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 5399 goto free_out; 5400 5401 /* root ? */ 5402 if (parent_css == NULL) { 5403 root_mem_cgroup = memcg; 5404 res_counter_init(&memcg->res, NULL); 5405 res_counter_init(&memcg->memsw, NULL); 5406 res_counter_init(&memcg->kmem, NULL); 5407 } 5408 5409 memcg->last_scanned_node = MAX_NUMNODES; 5410 INIT_LIST_HEAD(&memcg->oom_notify); 5411 memcg->move_charge_at_immigrate = 0; 5412 mutex_init(&memcg->thresholds_lock); 5413 spin_lock_init(&memcg->move_lock); 5414 vmpressure_init(&memcg->vmpressure); 5415 INIT_LIST_HEAD(&memcg->event_list); 5416 spin_lock_init(&memcg->event_list_lock); 5417 5418 return &memcg->css; 5419 5420 free_out: 5421 __mem_cgroup_free(memcg); 5422 return ERR_PTR(error); 5423 } 5424 5425 static int 5426 mem_cgroup_css_online(struct cgroup_subsys_state *css) 5427 { 5428 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5429 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 5430 int ret; 5431 5432 if (css->id > MEM_CGROUP_ID_MAX) 5433 return -ENOSPC; 5434 5435 if (!parent) 5436 return 0; 5437 5438 mutex_lock(&memcg_create_mutex); 5439 5440 memcg->use_hierarchy = parent->use_hierarchy; 5441 memcg->oom_kill_disable = parent->oom_kill_disable; 5442 memcg->swappiness = mem_cgroup_swappiness(parent); 5443 5444 if (parent->use_hierarchy) { 5445 res_counter_init(&memcg->res, &parent->res); 5446 res_counter_init(&memcg->memsw, &parent->memsw); 5447 res_counter_init(&memcg->kmem, &parent->kmem); 5448 5449 /* 5450 * No need to take a reference to the parent because cgroup 5451 * core guarantees its existence. 5452 */ 5453 } else { 5454 res_counter_init(&memcg->res, NULL); 5455 res_counter_init(&memcg->memsw, NULL); 5456 res_counter_init(&memcg->kmem, NULL); 5457 /* 5458 * Deeper hierachy with use_hierarchy == false doesn't make 5459 * much sense so let cgroup subsystem know about this 5460 * unfortunate state in our controller. 5461 */ 5462 if (parent != root_mem_cgroup) 5463 memory_cgrp_subsys.broken_hierarchy = true; 5464 } 5465 mutex_unlock(&memcg_create_mutex); 5466 5467 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); 5468 if (ret) 5469 return ret; 5470 5471 /* 5472 * Make sure the memcg is initialized: mem_cgroup_iter() 5473 * orders reading memcg->initialized against its callers 5474 * reading the memcg members. 
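 *
 * Hedged sketch of the consumer side (mem_cgroup_iter() is not shown
 * here, so the exact shape is an assumption): the release store below
 * is expected to pair with an acquire load along the lines of
 *
 *	if (smp_load_acquire(&memcg->initialized))
 *		use_memcg_members(memcg);
 *
 * so that a CPU observing initialized == 1 also observes every field
 * written before the store.  (use_memcg_members() is a placeholder.)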
5475 */ 5476 smp_store_release(&memcg->initialized, 1); 5477 5478 return 0; 5479 } 5480 5481 /* 5482 * Announce all parents that a group from their hierarchy is gone. 5483 */ 5484 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) 5485 { 5486 struct mem_cgroup *parent = memcg; 5487 5488 while ((parent = parent_mem_cgroup(parent))) 5489 mem_cgroup_iter_invalidate(parent); 5490 5491 /* 5492 * if the root memcg is not hierarchical we have to check it 5493 * explicitely. 5494 */ 5495 if (!root_mem_cgroup->use_hierarchy) 5496 mem_cgroup_iter_invalidate(root_mem_cgroup); 5497 } 5498 5499 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5500 { 5501 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5502 struct mem_cgroup_event *event, *tmp; 5503 struct cgroup_subsys_state *iter; 5504 5505 /* 5506 * Unregister events and notify userspace. 5507 * Notify userspace about cgroup removing only after rmdir of cgroup 5508 * directory to avoid race between userspace and kernelspace. 5509 */ 5510 spin_lock(&memcg->event_list_lock); 5511 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5512 list_del_init(&event->list); 5513 schedule_work(&event->remove); 5514 } 5515 spin_unlock(&memcg->event_list_lock); 5516 5517 kmem_cgroup_css_offline(memcg); 5518 5519 mem_cgroup_invalidate_reclaim_iterators(memcg); 5520 5521 /* 5522 * This requires that offlining is serialized. Right now that is 5523 * guaranteed because css_killed_work_fn() holds the cgroup_mutex. 5524 */ 5525 css_for_each_descendant_post(iter, css) 5526 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); 5527 5528 memcg_unregister_all_caches(memcg); 5529 vmpressure_cleanup(&memcg->vmpressure); 5530 } 5531 5532 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5533 { 5534 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5535 /* 5536 * XXX: css_offline() would be where we should reparent all 5537 * memory to prepare the cgroup for destruction. However, 5538 * memcg does not do css_tryget_online() and res_counter charging 5539 * under the same RCU lock region, which means that charging 5540 * could race with offlining. Offlining only happens to 5541 * cgroups with no tasks in them but charges can show up 5542 * without any tasks from the swapin path when the target 5543 * memcg is looked up from the swapout record and not from the 5544 * current task as it usually is. A race like this can leak 5545 * charges and put pages with stale cgroup pointers into 5546 * circulation: 5547 * 5548 * #0 #1 5549 * lookup_swap_cgroup_id() 5550 * rcu_read_lock() 5551 * mem_cgroup_lookup() 5552 * css_tryget_online() 5553 * rcu_read_unlock() 5554 * disable css_tryget_online() 5555 * call_rcu() 5556 * offline_css() 5557 * reparent_charges() 5558 * res_counter_charge() 5559 * css_put() 5560 * css_free() 5561 * pc->mem_cgroup = dead memcg 5562 * add page to lru 5563 * 5564 * The bulk of the charges are still moved in offline_css() to 5565 * avoid pinning a lot of pages in case a long-term reference 5566 * like a swapout record is deferring the css_free() to long 5567 * after offlining. But this makes sure we catch any charges 5568 * made after offlining: 5569 */ 5570 mem_cgroup_reparent_charges(memcg); 5571 5572 memcg_destroy_kmem(memcg); 5573 __mem_cgroup_free(memcg); 5574 } 5575 5576 /** 5577 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5578 * @css: the target css 5579 * 5580 * Reset the states of the mem_cgroup associated with @css. 
This is 5581 * invoked when the userland requests disabling on the default hierarchy 5582 * but the memcg is pinned through dependency. The memcg should stop 5583 * applying policies and should revert to the vanilla state as it may be 5584 * made visible again. 5585 * 5586 * The current implementation only resets the essential configurations. 5587 * This needs to be expanded to cover all the visible parts. 5588 */ 5589 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5590 { 5591 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5592 5593 mem_cgroup_resize_limit(memcg, ULLONG_MAX); 5594 mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); 5595 memcg_update_kmem_limit(memcg, ULLONG_MAX); 5596 res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); 5597 } 5598 5599 #ifdef CONFIG_MMU 5600 /* Handlers for move charge at task migration. */ 5601 static int mem_cgroup_do_precharge(unsigned long count) 5602 { 5603 int ret; 5604 5605 /* Try a single bulk charge without reclaim first */ 5606 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); 5607 if (!ret) { 5608 mc.precharge += count; 5609 return ret; 5610 } 5611 if (ret == -EINTR) { 5612 cancel_charge(root_mem_cgroup, count); 5613 return ret; 5614 } 5615 5616 /* Try charges one by one with reclaim */ 5617 while (count--) { 5618 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); 5619 /* 5620 * In case of failure, any residual charges against 5621 * mc.to will be dropped by mem_cgroup_clear_mc() 5622 * later on. However, cancel any charges that are 5623 * bypassed to root right away or they'll be lost. 5624 */ 5625 if (ret == -EINTR) 5626 cancel_charge(root_mem_cgroup, 1); 5627 if (ret) 5628 return ret; 5629 mc.precharge++; 5630 cond_resched(); 5631 } 5632 return 0; 5633 } 5634 5635 /** 5636 * get_mctgt_type - get target type of moving charge 5637 * @vma: the vma the pte to be checked belongs 5638 * @addr: the address corresponding to the pte to be checked 5639 * @ptent: the pte to be checked 5640 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5641 * 5642 * Returns 5643 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5644 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5645 * move charge. if @target is not NULL, the page is stored in target->page 5646 * with extra refcnt got(Callers should handle it). 5647 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5648 * target for charge migration. if @target is not NULL, the entry is stored 5649 * in target->ent. 5650 * 5651 * Called with pte lock held. 
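 *
 * Hedged caller sketch (the real pte walker lives further down the
 * file; this only illustrates the contract spelled out above):
 *
 *	union mc_target target;
 *
 *	switch (get_mctgt_type(vma, addr, ptent, &target)) {
 *	case MC_TARGET_PAGE:
 *		move_the_charge(target.page);
 *		break;
 *	case MC_TARGET_SWAP:
 *		move_the_swap_charge(target.ent);
 *		break;
 *	default:
 *		break;
 *	}
 *
 * (move_the_charge() and move_the_swap_charge() are placeholders; the
 * extra page reference taken for MC_TARGET_PAGE must be dropped with
 * put_page() once the caller is done with it.)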
5652 */ 5653 union mc_target { 5654 struct page *page; 5655 swp_entry_t ent; 5656 }; 5657 5658 enum mc_target_type { 5659 MC_TARGET_NONE = 0, 5660 MC_TARGET_PAGE, 5661 MC_TARGET_SWAP, 5662 }; 5663 5664 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5665 unsigned long addr, pte_t ptent) 5666 { 5667 struct page *page = vm_normal_page(vma, addr, ptent); 5668 5669 if (!page || !page_mapped(page)) 5670 return NULL; 5671 if (PageAnon(page)) { 5672 /* we don't move shared anon */ 5673 if (!move_anon()) 5674 return NULL; 5675 } else if (!move_file()) 5676 /* we ignore mapcount for file pages */ 5677 return NULL; 5678 if (!get_page_unless_zero(page)) 5679 return NULL; 5680 5681 return page; 5682 } 5683 5684 #ifdef CONFIG_SWAP 5685 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5686 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5687 { 5688 struct page *page = NULL; 5689 swp_entry_t ent = pte_to_swp_entry(ptent); 5690 5691 if (!move_anon() || non_swap_entry(ent)) 5692 return NULL; 5693 /* 5694 * Because lookup_swap_cache() updates some statistics counter, 5695 * we call find_get_page() with swapper_space directly. 5696 */ 5697 page = find_get_page(swap_address_space(ent), ent.val); 5698 if (do_swap_account) 5699 entry->val = ent.val; 5700 5701 return page; 5702 } 5703 #else 5704 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5705 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5706 { 5707 return NULL; 5708 } 5709 #endif 5710 5711 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5712 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5713 { 5714 struct page *page = NULL; 5715 struct address_space *mapping; 5716 pgoff_t pgoff; 5717 5718 if (!vma->vm_file) /* anonymous vma */ 5719 return NULL; 5720 if (!move_file()) 5721 return NULL; 5722 5723 mapping = vma->vm_file->f_mapping; 5724 if (pte_none(ptent)) 5725 pgoff = linear_page_index(vma, addr); 5726 else /* pte_file(ptent) is true */ 5727 pgoff = pte_to_pgoff(ptent); 5728 5729 /* page is moved even if it's not RSS of this task(page-faulted). */ 5730 #ifdef CONFIG_SWAP 5731 /* shmem/tmpfs may report page out on swap: account for that too. */ 5732 if (shmem_mapping(mapping)) { 5733 page = find_get_entry(mapping, pgoff); 5734 if (radix_tree_exceptional_entry(page)) { 5735 swp_entry_t swp = radix_to_swp_entry(page); 5736 if (do_swap_account) 5737 *entry = swp; 5738 page = find_get_page(swap_address_space(swp), swp.val); 5739 } 5740 } else 5741 page = find_get_page(mapping, pgoff); 5742 #else 5743 page = find_get_page(mapping, pgoff); 5744 #endif 5745 return page; 5746 } 5747 5748 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5749 unsigned long addr, pte_t ptent, union mc_target *target) 5750 { 5751 struct page *page = NULL; 5752 struct page_cgroup *pc; 5753 enum mc_target_type ret = MC_TARGET_NONE; 5754 swp_entry_t ent = { .val = 0 }; 5755 5756 if (pte_present(ptent)) 5757 page = mc_handle_present_pte(vma, addr, ptent); 5758 else if (is_swap_pte(ptent)) 5759 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5760 else if (pte_none(ptent) || pte_file(ptent)) 5761 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5762 5763 if (!page && !ent.val) 5764 return ret; 5765 if (page) { 5766 pc = lookup_page_cgroup(page); 5767 /* 5768 * Do only loose check w/o serialization. 5769 * mem_cgroup_move_account() checks the pc is valid or 5770 * not under LRU exclusion. 
5771 */ 5772 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5773 ret = MC_TARGET_PAGE; 5774 if (target) 5775 target->page = page; 5776 } 5777 if (!ret || !target) 5778 put_page(page); 5779 } 5780 /* There is a swap entry and a page doesn't exist or isn't charged */ 5781 if (ent.val && !ret && 5782 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5783 ret = MC_TARGET_SWAP; 5784 if (target) 5785 target->ent = ent; 5786 } 5787 return ret; 5788 } 5789 5790 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5791 /* 5792 * We don't consider swapping or file mapped pages because THP does not 5793 * support them for now. 5794 * Caller should make sure that pmd_trans_huge(pmd) is true. 5795 */ 5796 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5797 unsigned long addr, pmd_t pmd, union mc_target *target) 5798 { 5799 struct page *page = NULL; 5800 struct page_cgroup *pc; 5801 enum mc_target_type ret = MC_TARGET_NONE; 5802 5803 page = pmd_page(pmd); 5804 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5805 if (!move_anon()) 5806 return ret; 5807 pc = lookup_page_cgroup(page); 5808 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5809 ret = MC_TARGET_PAGE; 5810 if (target) { 5811 get_page(page); 5812 target->page = page; 5813 } 5814 } 5815 return ret; 5816 } 5817 #else 5818 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5819 unsigned long addr, pmd_t pmd, union mc_target *target) 5820 { 5821 return MC_TARGET_NONE; 5822 } 5823 #endif 5824 5825 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5826 unsigned long addr, unsigned long end, 5827 struct mm_walk *walk) 5828 { 5829 struct vm_area_struct *vma = walk->private; 5830 pte_t *pte; 5831 spinlock_t *ptl; 5832 5833 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5834 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5835 mc.precharge += HPAGE_PMD_NR; 5836 spin_unlock(ptl); 5837 return 0; 5838 } 5839 5840 if (pmd_trans_unstable(pmd)) 5841 return 0; 5842 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5843 for (; addr != end; pte++, addr += PAGE_SIZE) 5844 if (get_mctgt_type(vma, addr, *pte, NULL)) 5845 mc.precharge++; /* increment precharge temporarily */ 5846 pte_unmap_unlock(pte - 1, ptl); 5847 cond_resched(); 5848 5849 return 0; 5850 } 5851 5852 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5853 { 5854 unsigned long precharge; 5855 struct vm_area_struct *vma; 5856 5857 down_read(&mm->mmap_sem); 5858 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5859 struct mm_walk mem_cgroup_count_precharge_walk = { 5860 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5861 .mm = mm, 5862 .private = vma, 5863 }; 5864 if (is_vm_hugetlb_page(vma)) 5865 continue; 5866 walk_page_range(vma->vm_start, vma->vm_end, 5867 &mem_cgroup_count_precharge_walk); 5868 } 5869 up_read(&mm->mmap_sem); 5870 5871 precharge = mc.precharge; 5872 mc.precharge = 0; 5873 5874 return precharge; 5875 } 5876 5877 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5878 { 5879 unsigned long precharge = mem_cgroup_count_precharge(mm); 5880 5881 VM_BUG_ON(mc.moving_task); 5882 mc.moving_task = current; 5883 return mem_cgroup_do_precharge(precharge); 5884 } 5885 5886 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. 
*/
5887 static void __mem_cgroup_clear_mc(void)
5888 {
5889 struct mem_cgroup *from = mc.from;
5890 struct mem_cgroup *to = mc.to;
5891 int i;
5892
5893 /* we must uncharge all the leftover precharges from mc.to */
5894 if (mc.precharge) {
5895 cancel_charge(mc.to, mc.precharge);
5896 mc.precharge = 0;
5897 }
5898 /*
5899 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5900 * we must uncharge here.
5901 */
5902 if (mc.moved_charge) {
5903 cancel_charge(mc.from, mc.moved_charge);
5904 mc.moved_charge = 0;
5905 }
5906 /* we must fix up refcnts and charges */
5907 if (mc.moved_swap) {
5908 /* uncharge swap account from the old cgroup */
5909 if (!mem_cgroup_is_root(mc.from))
5910 res_counter_uncharge(&mc.from->memsw,
5911 PAGE_SIZE * mc.moved_swap);
5912
5913 for (i = 0; i < mc.moved_swap; i++)
5914 css_put(&mc.from->css);
5915
5916 /*
5917 * we charged both to->res and to->memsw, so we should
5918 * uncharge to->res.
5919 */
5920 if (!mem_cgroup_is_root(mc.to))
5921 res_counter_uncharge(&mc.to->res,
5922 PAGE_SIZE * mc.moved_swap);
5923 /* we've already done css_get(mc.to) */
5924 mc.moved_swap = 0;
5925 }
5926 memcg_oom_recover(from);
5927 memcg_oom_recover(to);
5928 wake_up_all(&mc.waitq);
5929 }
5930
5931 static void mem_cgroup_clear_mc(void)
5932 {
5933 struct mem_cgroup *from = mc.from;
5934
5935 /*
5936 * we must clear moving_task before waking up waiters at the end of
5937 * task migration.
5938 */
5939 mc.moving_task = NULL;
5940 __mem_cgroup_clear_mc();
5941 spin_lock(&mc.lock);
5942 mc.from = NULL;
5943 mc.to = NULL;
5944 spin_unlock(&mc.lock);
5945 mem_cgroup_end_move(from);
5946 }
5947
5948 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5949 struct cgroup_taskset *tset)
5950 {
5951 struct task_struct *p = cgroup_taskset_first(tset);
5952 int ret = 0;
5953 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5954 unsigned long move_charge_at_immigrate;
5955
5956 /*
5957 * We are now committed to this value whatever it is. Changes in this
5958 * tunable will only affect upcoming migrations, not the current one.
5959 * So we need to save it, and keep it going.
5960 */ 5961 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 5962 if (move_charge_at_immigrate) { 5963 struct mm_struct *mm; 5964 struct mem_cgroup *from = mem_cgroup_from_task(p); 5965 5966 VM_BUG_ON(from == memcg); 5967 5968 mm = get_task_mm(p); 5969 if (!mm) 5970 return 0; 5971 /* We move charges only when we move a owner of the mm */ 5972 if (mm->owner == p) { 5973 VM_BUG_ON(mc.from); 5974 VM_BUG_ON(mc.to); 5975 VM_BUG_ON(mc.precharge); 5976 VM_BUG_ON(mc.moved_charge); 5977 VM_BUG_ON(mc.moved_swap); 5978 mem_cgroup_start_move(from); 5979 spin_lock(&mc.lock); 5980 mc.from = from; 5981 mc.to = memcg; 5982 mc.immigrate_flags = move_charge_at_immigrate; 5983 spin_unlock(&mc.lock); 5984 /* We set mc.moving_task later */ 5985 5986 ret = mem_cgroup_precharge_mc(mm); 5987 if (ret) 5988 mem_cgroup_clear_mc(); 5989 } 5990 mmput(mm); 5991 } 5992 return ret; 5993 } 5994 5995 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5996 struct cgroup_taskset *tset) 5997 { 5998 mem_cgroup_clear_mc(); 5999 } 6000 6001 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6002 unsigned long addr, unsigned long end, 6003 struct mm_walk *walk) 6004 { 6005 int ret = 0; 6006 struct vm_area_struct *vma = walk->private; 6007 pte_t *pte; 6008 spinlock_t *ptl; 6009 enum mc_target_type target_type; 6010 union mc_target target; 6011 struct page *page; 6012 struct page_cgroup *pc; 6013 6014 /* 6015 * We don't take compound_lock() here but no race with splitting thp 6016 * happens because: 6017 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 6018 * under splitting, which means there's no concurrent thp split, 6019 * - if another thread runs into split_huge_page() just after we 6020 * entered this if-block, the thread must wait for page table lock 6021 * to be unlocked in __split_huge_page_splitting(), where the main 6022 * part of thp split is not executed yet. 6023 */ 6024 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 6025 if (mc.precharge < HPAGE_PMD_NR) { 6026 spin_unlock(ptl); 6027 return 0; 6028 } 6029 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6030 if (target_type == MC_TARGET_PAGE) { 6031 page = target.page; 6032 if (!isolate_lru_page(page)) { 6033 pc = lookup_page_cgroup(page); 6034 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 6035 pc, mc.from, mc.to)) { 6036 mc.precharge -= HPAGE_PMD_NR; 6037 mc.moved_charge += HPAGE_PMD_NR; 6038 } 6039 putback_lru_page(page); 6040 } 6041 put_page(page); 6042 } 6043 spin_unlock(ptl); 6044 return 0; 6045 } 6046 6047 if (pmd_trans_unstable(pmd)) 6048 return 0; 6049 retry: 6050 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6051 for (; addr != end; addr += PAGE_SIZE) { 6052 pte_t ptent = *(pte++); 6053 swp_entry_t ent; 6054 6055 if (!mc.precharge) 6056 break; 6057 6058 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6059 case MC_TARGET_PAGE: 6060 page = target.page; 6061 if (isolate_lru_page(page)) 6062 goto put; 6063 pc = lookup_page_cgroup(page); 6064 if (!mem_cgroup_move_account(page, 1, pc, 6065 mc.from, mc.to)) { 6066 mc.precharge--; 6067 /* we uncharge from mc.from later. */ 6068 mc.moved_charge++; 6069 } 6070 putback_lru_page(page); 6071 put: /* get_mctgt_type() gets the page */ 6072 put_page(page); 6073 break; 6074 case MC_TARGET_SWAP: 6075 ent = target.ent; 6076 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 6077 mc.precharge--; 6078 /* we fixup refcnts and charges later. 
*/ 6079 mc.moved_swap++; 6080 } 6081 break; 6082 default: 6083 break; 6084 } 6085 } 6086 pte_unmap_unlock(pte - 1, ptl); 6087 cond_resched(); 6088 6089 if (addr != end) { 6090 /* 6091 * We have consumed all precharges we got in can_attach(). 6092 * We try charge one by one, but don't do any additional 6093 * charges to mc.to if we have failed in charge once in attach() 6094 * phase. 6095 */ 6096 ret = mem_cgroup_do_precharge(1); 6097 if (!ret) 6098 goto retry; 6099 } 6100 6101 return ret; 6102 } 6103 6104 static void mem_cgroup_move_charge(struct mm_struct *mm) 6105 { 6106 struct vm_area_struct *vma; 6107 6108 lru_add_drain_all(); 6109 retry: 6110 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 6111 /* 6112 * Someone who are holding the mmap_sem might be waiting in 6113 * waitq. So we cancel all extra charges, wake up all waiters, 6114 * and retry. Because we cancel precharges, we might not be able 6115 * to move enough charges, but moving charge is a best-effort 6116 * feature anyway, so it wouldn't be a big problem. 6117 */ 6118 __mem_cgroup_clear_mc(); 6119 cond_resched(); 6120 goto retry; 6121 } 6122 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6123 int ret; 6124 struct mm_walk mem_cgroup_move_charge_walk = { 6125 .pmd_entry = mem_cgroup_move_charge_pte_range, 6126 .mm = mm, 6127 .private = vma, 6128 }; 6129 if (is_vm_hugetlb_page(vma)) 6130 continue; 6131 ret = walk_page_range(vma->vm_start, vma->vm_end, 6132 &mem_cgroup_move_charge_walk); 6133 if (ret) 6134 /* 6135 * means we have consumed all precharges and failed in 6136 * doing additional charge. Just abandon here. 6137 */ 6138 break; 6139 } 6140 up_read(&mm->mmap_sem); 6141 } 6142 6143 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 6144 struct cgroup_taskset *tset) 6145 { 6146 struct task_struct *p = cgroup_taskset_first(tset); 6147 struct mm_struct *mm = get_task_mm(p); 6148 6149 if (mm) { 6150 if (mc.to) 6151 mem_cgroup_move_charge(mm); 6152 mmput(mm); 6153 } 6154 if (mc.to) 6155 mem_cgroup_clear_mc(); 6156 } 6157 #else /* !CONFIG_MMU */ 6158 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 6159 struct cgroup_taskset *tset) 6160 { 6161 return 0; 6162 } 6163 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 6164 struct cgroup_taskset *tset) 6165 { 6166 } 6167 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 6168 struct cgroup_taskset *tset) 6169 { 6170 } 6171 #endif 6172 6173 /* 6174 * Cgroup retains root cgroups across [un]mount cycles making it necessary 6175 * to verify whether we're attached to the default hierarchy on each mount 6176 * attempt. 6177 */ 6178 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 6179 { 6180 /* 6181 * use_hierarchy is forced on the default hierarchy. cgroup core 6182 * guarantees that @root doesn't have any children, so turning it 6183 * on for the root memcg is enough. 
6184 */ 6185 if (cgroup_on_dfl(root_css->cgroup)) 6186 mem_cgroup_from_css(root_css)->use_hierarchy = true; 6187 } 6188 6189 struct cgroup_subsys memory_cgrp_subsys = { 6190 .css_alloc = mem_cgroup_css_alloc, 6191 .css_online = mem_cgroup_css_online, 6192 .css_offline = mem_cgroup_css_offline, 6193 .css_free = mem_cgroup_css_free, 6194 .css_reset = mem_cgroup_css_reset, 6195 .can_attach = mem_cgroup_can_attach, 6196 .cancel_attach = mem_cgroup_cancel_attach, 6197 .attach = mem_cgroup_move_task, 6198 .bind = mem_cgroup_bind, 6199 .legacy_cftypes = mem_cgroup_files, 6200 .early_init = 0, 6201 }; 6202 6203 #ifdef CONFIG_MEMCG_SWAP 6204 static int __init enable_swap_account(char *s) 6205 { 6206 if (!strcmp(s, "1")) 6207 really_do_swap_account = 1; 6208 else if (!strcmp(s, "0")) 6209 really_do_swap_account = 0; 6210 return 1; 6211 } 6212 __setup("swapaccount=", enable_swap_account); 6213 6214 static void __init memsw_file_init(void) 6215 { 6216 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 6217 memsw_cgroup_files)); 6218 } 6219 6220 static void __init enable_swap_cgroup(void) 6221 { 6222 if (!mem_cgroup_disabled() && really_do_swap_account) { 6223 do_swap_account = 1; 6224 memsw_file_init(); 6225 } 6226 } 6227 6228 #else 6229 static void __init enable_swap_cgroup(void) 6230 { 6231 } 6232 #endif 6233 6234 #ifdef CONFIG_MEMCG_SWAP 6235 /** 6236 * mem_cgroup_swapout - transfer a memsw charge to swap 6237 * @page: page whose memsw charge to transfer 6238 * @entry: swap entry to move the charge to 6239 * 6240 * Transfer the memsw charge of @page to @entry. 6241 */ 6242 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 6243 { 6244 struct page_cgroup *pc; 6245 unsigned short oldid; 6246 6247 VM_BUG_ON_PAGE(PageLRU(page), page); 6248 VM_BUG_ON_PAGE(page_count(page), page); 6249 6250 if (!do_swap_account) 6251 return; 6252 6253 pc = lookup_page_cgroup(page); 6254 6255 /* Readahead page, never charged */ 6256 if (!PageCgroupUsed(pc)) 6257 return; 6258 6259 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); 6260 6261 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); 6262 VM_BUG_ON_PAGE(oldid, page); 6263 6264 pc->flags &= ~PCG_MEMSW; 6265 css_get(&pc->mem_cgroup->css); 6266 mem_cgroup_swap_statistics(pc->mem_cgroup, true); 6267 } 6268 6269 /** 6270 * mem_cgroup_uncharge_swap - uncharge a swap entry 6271 * @entry: swap entry to uncharge 6272 * 6273 * Drop the memsw charge associated with @entry. 6274 */ 6275 void mem_cgroup_uncharge_swap(swp_entry_t entry) 6276 { 6277 struct mem_cgroup *memcg; 6278 unsigned short id; 6279 6280 if (!do_swap_account) 6281 return; 6282 6283 id = swap_cgroup_record(entry, 0); 6284 rcu_read_lock(); 6285 memcg = mem_cgroup_lookup(id); 6286 if (memcg) { 6287 if (!mem_cgroup_is_root(memcg)) 6288 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 6289 mem_cgroup_swap_statistics(memcg, false); 6290 css_put(&memcg->css); 6291 } 6292 rcu_read_unlock(); 6293 } 6294 #endif 6295 6296 /** 6297 * mem_cgroup_try_charge - try charging a page 6298 * @page: page to charge 6299 * @mm: mm context of the victim 6300 * @gfp_mask: reclaim mode 6301 * @memcgp: charged memcg return 6302 * 6303 * Try to charge @page to the memcg that @mm belongs to, reclaiming 6304 * pages according to @gfp_mask if necessary. 6305 * 6306 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 6307 * Otherwise, an error code is returned. 6308 * 6309 * After page->mapping has been set up, the caller must finalize the 6310 * charge with mem_cgroup_commit_charge(). 
Or abort the transaction 6311 * with mem_cgroup_cancel_charge() in case page instantiation fails. 6312 */ 6313 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 6314 gfp_t gfp_mask, struct mem_cgroup **memcgp) 6315 { 6316 struct mem_cgroup *memcg = NULL; 6317 unsigned int nr_pages = 1; 6318 int ret = 0; 6319 6320 if (mem_cgroup_disabled()) 6321 goto out; 6322 6323 if (PageSwapCache(page)) { 6324 struct page_cgroup *pc = lookup_page_cgroup(page); 6325 /* 6326 * Every swap fault against a single page tries to charge the 6327 * page, bail as early as possible. shmem_unuse() encounters 6328 * already charged pages, too. The USED bit is protected by 6329 * the page lock, which serializes swap cache removal, which 6330 * in turn serializes uncharging. 6331 */ 6332 if (PageCgroupUsed(pc)) 6333 goto out; 6334 } 6335 6336 if (PageTransHuge(page)) { 6337 nr_pages <<= compound_order(page); 6338 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6339 } 6340 6341 if (do_swap_account && PageSwapCache(page)) 6342 memcg = try_get_mem_cgroup_from_page(page); 6343 if (!memcg) 6344 memcg = get_mem_cgroup_from_mm(mm); 6345 6346 ret = try_charge(memcg, gfp_mask, nr_pages); 6347 6348 css_put(&memcg->css); 6349 6350 if (ret == -EINTR) { 6351 memcg = root_mem_cgroup; 6352 ret = 0; 6353 } 6354 out: 6355 *memcgp = memcg; 6356 return ret; 6357 } 6358 6359 /** 6360 * mem_cgroup_commit_charge - commit a page charge 6361 * @page: page to charge 6362 * @memcg: memcg to charge the page to 6363 * @lrucare: page might be on LRU already 6364 * 6365 * Finalize a charge transaction started by mem_cgroup_try_charge(), 6366 * after page->mapping has been set up. This must happen atomically 6367 * as part of the page instantiation, i.e. under the page table lock 6368 * for anonymous pages, under the page lock for page and swap cache. 6369 * 6370 * In addition, the page must not be on the LRU during the commit, to 6371 * prevent racing with task migration. If it might be, use @lrucare. 6372 * 6373 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 6374 */ 6375 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 6376 bool lrucare) 6377 { 6378 unsigned int nr_pages = 1; 6379 6380 VM_BUG_ON_PAGE(!page->mapping, page); 6381 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 6382 6383 if (mem_cgroup_disabled()) 6384 return; 6385 /* 6386 * Swap faults will attempt to charge the same page multiple 6387 * times. But reuse_swap_page() might have removed the page 6388 * from swapcache already, so we can't check PageSwapCache(). 6389 */ 6390 if (!memcg) 6391 return; 6392 6393 commit_charge(page, memcg, lrucare); 6394 6395 if (PageTransHuge(page)) { 6396 nr_pages <<= compound_order(page); 6397 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6398 } 6399 6400 local_irq_disable(); 6401 mem_cgroup_charge_statistics(memcg, page, nr_pages); 6402 memcg_check_events(memcg, page); 6403 local_irq_enable(); 6404 6405 if (do_swap_account && PageSwapCache(page)) { 6406 swp_entry_t entry = { .val = page_private(page) }; 6407 /* 6408 * The swap entry might not get freed for a long time, 6409 * let's not wait for it. The page already received a 6410 * memory+swap charge, drop the swap entry duplicate. 6411 */ 6412 mem_cgroup_uncharge_swap(entry); 6413 } 6414 } 6415 6416 /** 6417 * mem_cgroup_cancel_charge - cancel a page charge 6418 * @page: page to charge 6419 * @memcg: memcg to charge the page to 6420 * 6421 * Cancel a charge transaction started by mem_cgroup_try_charge(). 
6422 */ 6423 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 6424 { 6425 unsigned int nr_pages = 1; 6426 6427 if (mem_cgroup_disabled()) 6428 return; 6429 /* 6430 * Swap faults will attempt to charge the same page multiple 6431 * times. But reuse_swap_page() might have removed the page 6432 * from swapcache already, so we can't check PageSwapCache(). 6433 */ 6434 if (!memcg) 6435 return; 6436 6437 if (PageTransHuge(page)) { 6438 nr_pages <<= compound_order(page); 6439 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6440 } 6441 6442 cancel_charge(memcg, nr_pages); 6443 } 6444 6445 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 6446 unsigned long nr_mem, unsigned long nr_memsw, 6447 unsigned long nr_anon, unsigned long nr_file, 6448 unsigned long nr_huge, struct page *dummy_page) 6449 { 6450 unsigned long flags; 6451 6452 if (!mem_cgroup_is_root(memcg)) { 6453 if (nr_mem) 6454 res_counter_uncharge(&memcg->res, 6455 nr_mem * PAGE_SIZE); 6456 if (nr_memsw) 6457 res_counter_uncharge(&memcg->memsw, 6458 nr_memsw * PAGE_SIZE); 6459 memcg_oom_recover(memcg); 6460 } 6461 6462 local_irq_save(flags); 6463 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 6464 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 6465 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 6466 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 6467 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); 6468 memcg_check_events(memcg, dummy_page); 6469 local_irq_restore(flags); 6470 } 6471 6472 static void uncharge_list(struct list_head *page_list) 6473 { 6474 struct mem_cgroup *memcg = NULL; 6475 unsigned long nr_memsw = 0; 6476 unsigned long nr_anon = 0; 6477 unsigned long nr_file = 0; 6478 unsigned long nr_huge = 0; 6479 unsigned long pgpgout = 0; 6480 unsigned long nr_mem = 0; 6481 struct list_head *next; 6482 struct page *page; 6483 6484 next = page_list->next; 6485 do { 6486 unsigned int nr_pages = 1; 6487 struct page_cgroup *pc; 6488 6489 page = list_entry(next, struct page, lru); 6490 next = page->lru.next; 6491 6492 VM_BUG_ON_PAGE(PageLRU(page), page); 6493 VM_BUG_ON_PAGE(page_count(page), page); 6494 6495 pc = lookup_page_cgroup(page); 6496 if (!PageCgroupUsed(pc)) 6497 continue; 6498 6499 /* 6500 * Nobody should be changing or seriously looking at 6501 * pc->mem_cgroup and pc->flags at this point, we have 6502 * fully exclusive access to the page. 6503 */ 6504 6505 if (memcg != pc->mem_cgroup) { 6506 if (memcg) { 6507 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 6508 nr_anon, nr_file, nr_huge, page); 6509 pgpgout = nr_mem = nr_memsw = 0; 6510 nr_anon = nr_file = nr_huge = 0; 6511 } 6512 memcg = pc->mem_cgroup; 6513 } 6514 6515 if (PageTransHuge(page)) { 6516 nr_pages <<= compound_order(page); 6517 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6518 nr_huge += nr_pages; 6519 } 6520 6521 if (PageAnon(page)) 6522 nr_anon += nr_pages; 6523 else 6524 nr_file += nr_pages; 6525 6526 if (pc->flags & PCG_MEM) 6527 nr_mem += nr_pages; 6528 if (pc->flags & PCG_MEMSW) 6529 nr_memsw += nr_pages; 6530 pc->flags = 0; 6531 6532 pgpgout++; 6533 } while (next != page_list); 6534 6535 if (memcg) 6536 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 6537 nr_anon, nr_file, nr_huge, page); 6538 } 6539 6540 /** 6541 * mem_cgroup_uncharge - uncharge a page 6542 * @page: page to uncharge 6543 * 6544 * Uncharge a page previously charged with mem_cgroup_try_charge() and 6545 * mem_cgroup_commit_charge(). 
6546 */ 6547 void mem_cgroup_uncharge(struct page *page) 6548 { 6549 struct page_cgroup *pc; 6550 6551 if (mem_cgroup_disabled()) 6552 return; 6553 6554 /* Don't touch page->lru of any random page, pre-check: */ 6555 pc = lookup_page_cgroup(page); 6556 if (!PageCgroupUsed(pc)) 6557 return; 6558 6559 INIT_LIST_HEAD(&page->lru); 6560 uncharge_list(&page->lru); 6561 } 6562 6563 /** 6564 * mem_cgroup_uncharge_list - uncharge a list of page 6565 * @page_list: list of pages to uncharge 6566 * 6567 * Uncharge a list of pages previously charged with 6568 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 6569 */ 6570 void mem_cgroup_uncharge_list(struct list_head *page_list) 6571 { 6572 if (mem_cgroup_disabled()) 6573 return; 6574 6575 if (!list_empty(page_list)) 6576 uncharge_list(page_list); 6577 } 6578 6579 /** 6580 * mem_cgroup_migrate - migrate a charge to another page 6581 * @oldpage: currently charged page 6582 * @newpage: page to transfer the charge to 6583 * @lrucare: both pages might be on the LRU already 6584 * 6585 * Migrate the charge from @oldpage to @newpage. 6586 * 6587 * Both pages must be locked, @newpage->mapping must be set up. 6588 */ 6589 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 6590 bool lrucare) 6591 { 6592 struct page_cgroup *pc; 6593 int isolated; 6594 6595 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6596 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6597 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); 6598 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); 6599 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6600 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6601 newpage); 6602 6603 if (mem_cgroup_disabled()) 6604 return; 6605 6606 /* Page cache replacement: new page already charged? */ 6607 pc = lookup_page_cgroup(newpage); 6608 if (PageCgroupUsed(pc)) 6609 return; 6610 6611 /* Re-entrant migration: old page already uncharged? */ 6612 pc = lookup_page_cgroup(oldpage); 6613 if (!PageCgroupUsed(pc)) 6614 return; 6615 6616 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); 6617 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); 6618 6619 if (lrucare) 6620 lock_page_lru(oldpage, &isolated); 6621 6622 pc->flags = 0; 6623 6624 if (lrucare) 6625 unlock_page_lru(oldpage, isolated); 6626 6627 commit_charge(newpage, pc->mem_cgroup, lrucare); 6628 } 6629 6630 /* 6631 * subsys_initcall() for memory controller. 6632 * 6633 * Some parts like hotcpu_notifier() have to be initialized from this context 6634 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 6635 * everything that doesn't depend on a specific mem_cgroup structure should 6636 * be initialized from here. 6637 */ 6638 static int __init mem_cgroup_init(void) 6639 { 6640 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 6641 enable_swap_cgroup(); 6642 mem_cgroup_soft_limit_tree_init(); 6643 memcg_stock_init(); 6644 return 0; 6645 } 6646 subsys_initcall(mem_cgroup_init); 6647