1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * Kernel Memory Controller 14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 15 * Authors: Glauber Costa and Suleiman Souhlal 16 * 17 * Native page reclaim 18 * Charge lifetime sanitation 19 * Lockless page tracking & accounting 20 * Unified hierarchy configuration model 21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner 22 * 23 * This program is free software; you can redistribute it and/or modify 24 * it under the terms of the GNU General Public License as published by 25 * the Free Software Foundation; either version 2 of the License, or 26 * (at your option) any later version. 27 * 28 * This program is distributed in the hope that it will be useful, 29 * but WITHOUT ANY WARRANTY; without even the implied warranty of 30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 * GNU General Public License for more details. 
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

/* The memory cgroup subsystem descriptor; exported for use by modules. */
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

/* NOTE(review): no user is visible in this part of the file - confirm usage */
#define MEM_CGROUP_RECLAIM_RETRIES	5
/* Fallback memcg used by this file whenever no specific memcg applies */
static struct mem_cgroup *root_mem_cgroup __read_mostly;
struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

/*
 * Printable names of the statistics counters.
 * NOTE(review): order presumably mirrors enum mem_cgroup_stat_index - confirm.
 */
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"dirty",
	"writeback",
	"swap",
};

/*
 * Printable names of the event counters.
 * NOTE(review): order presumably mirrors enum mem_cgroup_events_index - confirm.
 */
static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

/*
 * Printable names of the LRU lists.
 * NOTE(review): order presumably mirrors enum lru_list - confirm.
 */
static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg event.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
/* Number of page events between two firings of the respective target */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

/* Per-cpu statistics; struct mem_cgroup points at it via a __percpu pointer */
struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEMCG_NR_EVENTS];
	/* running count of charged/uncharged pages, compared with targets[] */
	unsigned long nr_page_events;
	/* next nr_page_events value at which each event target fires */
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

/* Shared position of a reclaim hierarchy walk; used by mem_cgroup_iter() */
struct reclaim_iter {
	struct mem_cgroup *position;
	/* scan generation, increased every round-trip */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
145 */ 146 struct mem_cgroup_per_zone { 147 struct lruvec lruvec; 148 unsigned long lru_size[NR_LRU_LISTS]; 149 150 struct reclaim_iter iter[DEF_PRIORITY + 1]; 151 152 struct rb_node tree_node; /* RB tree node */ 153 unsigned long usage_in_excess;/* Set to the value by which */ 154 /* the soft limit is exceeded*/ 155 bool on_tree; 156 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 157 /* use container_of */ 158 }; 159 160 struct mem_cgroup_per_node { 161 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 162 }; 163 164 /* 165 * Cgroups above their limits are maintained in a RB-Tree, independent of 166 * their hierarchy representation 167 */ 168 169 struct mem_cgroup_tree_per_zone { 170 struct rb_root rb_root; 171 spinlock_t lock; 172 }; 173 174 struct mem_cgroup_tree_per_node { 175 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 176 }; 177 178 struct mem_cgroup_tree { 179 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 180 }; 181 182 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 183 184 struct mem_cgroup_threshold { 185 struct eventfd_ctx *eventfd; 186 unsigned long threshold; 187 }; 188 189 /* For threshold */ 190 struct mem_cgroup_threshold_ary { 191 /* An array index points to threshold just below or equal to usage. */ 192 int current_threshold; 193 /* Size of entries[] */ 194 unsigned int size; 195 /* Array of thresholds */ 196 struct mem_cgroup_threshold entries[0]; 197 }; 198 199 struct mem_cgroup_thresholds { 200 /* Primary thresholds array */ 201 struct mem_cgroup_threshold_ary *primary; 202 /* 203 * Spare threshold array. 204 * This is needed to make mem_cgroup_unregister_event() "never fail". 205 * It must be able to store at least primary->size - 1 entries. 
206 */ 207 struct mem_cgroup_threshold_ary *spare; 208 }; 209 210 /* for OOM */ 211 struct mem_cgroup_eventfd_list { 212 struct list_head list; 213 struct eventfd_ctx *eventfd; 214 }; 215 216 /* 217 * cgroup_event represents events which userspace want to receive. 218 */ 219 struct mem_cgroup_event { 220 /* 221 * memcg which the event belongs to. 222 */ 223 struct mem_cgroup *memcg; 224 /* 225 * eventfd to signal userspace about the event. 226 */ 227 struct eventfd_ctx *eventfd; 228 /* 229 * Each of these stored in a list by the cgroup. 230 */ 231 struct list_head list; 232 /* 233 * register_event() callback will be used to add new userspace 234 * waiter for changes related to this event. Use eventfd_signal() 235 * on eventfd to send notification to userspace. 236 */ 237 int (*register_event)(struct mem_cgroup *memcg, 238 struct eventfd_ctx *eventfd, const char *args); 239 /* 240 * unregister_event() callback will be called when userspace closes 241 * the eventfd or on cgroup removing. This callback must be set, 242 * if you want provide notification functionality. 243 */ 244 void (*unregister_event)(struct mem_cgroup *memcg, 245 struct eventfd_ctx *eventfd); 246 /* 247 * All fields below needed to unregister event when 248 * userspace closes eventfd. 249 */ 250 poll_table pt; 251 wait_queue_head_t *wqh; 252 wait_queue_t wait; 253 struct work_struct remove; 254 }; 255 256 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 257 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 258 259 /* 260 * The memory controller data structure. The memory controller controls both 261 * page cache and RSS per cgroup. We would eventually like to provide 262 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 263 * to help the administrator determine what knobs to tune. 
264 */ 265 struct mem_cgroup { 266 struct cgroup_subsys_state css; 267 268 /* Accounted resources */ 269 struct page_counter memory; 270 struct page_counter memsw; 271 struct page_counter kmem; 272 273 /* Normal memory consumption range */ 274 unsigned long low; 275 unsigned long high; 276 277 unsigned long soft_limit; 278 279 /* vmpressure notifications */ 280 struct vmpressure vmpressure; 281 282 /* css_online() has been completed */ 283 int initialized; 284 285 /* 286 * Should the accounting and control be hierarchical, per subtree? 287 */ 288 bool use_hierarchy; 289 290 /* protected by memcg_oom_lock */ 291 bool oom_lock; 292 int under_oom; 293 294 int swappiness; 295 /* OOM-Killer disable */ 296 int oom_kill_disable; 297 298 /* protect arrays of thresholds */ 299 struct mutex thresholds_lock; 300 301 /* thresholds for memory usage. RCU-protected */ 302 struct mem_cgroup_thresholds thresholds; 303 304 /* thresholds for mem+swap usage. RCU-protected */ 305 struct mem_cgroup_thresholds memsw_thresholds; 306 307 /* For oom notifier event fd */ 308 struct list_head oom_notify; 309 310 /* 311 * Should we move charges of a task when a task is moved into this 312 * mem_cgroup ? And what type of charges should we move ? 313 */ 314 unsigned long move_charge_at_immigrate; 315 /* 316 * set > 0 if pages under this cgroup are moving to other cgroup. 317 */ 318 atomic_t moving_account; 319 /* taken only while moving_account > 0 */ 320 spinlock_t move_lock; 321 struct task_struct *move_lock_task; 322 unsigned long move_lock_flags; 323 /* 324 * percpu counter. 
325 */ 326 struct mem_cgroup_stat_cpu __percpu *stat; 327 spinlock_t pcp_counter_lock; 328 329 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 330 struct cg_proto tcp_mem; 331 #endif 332 #if defined(CONFIG_MEMCG_KMEM) 333 /* Index in the kmem_cache->memcg_params.memcg_caches array */ 334 int kmemcg_id; 335 bool kmem_acct_activated; 336 bool kmem_acct_active; 337 #endif 338 339 int last_scanned_node; 340 #if MAX_NUMNODES > 1 341 nodemask_t scan_nodes; 342 atomic_t numainfo_events; 343 atomic_t numainfo_updating; 344 #endif 345 346 #ifdef CONFIG_CGROUP_WRITEBACK 347 struct list_head cgwb_list; 348 struct wb_domain cgwb_domain; 349 #endif 350 351 /* List of events which userspace want to receive */ 352 struct list_head event_list; 353 spinlock_t event_list_lock; 354 355 struct mem_cgroup_per_node *nodeinfo[0]; 356 /* WARNING: nodeinfo must be the last member here */ 357 }; 358 359 #ifdef CONFIG_MEMCG_KMEM 360 bool memcg_kmem_is_active(struct mem_cgroup *memcg) 361 { 362 return memcg->kmem_acct_active; 363 } 364 #endif 365 366 /* Stuffs for move charges at task migration. */ 367 /* 368 * Types of charges to be moved. 369 */ 370 #define MOVE_ANON 0x1U 371 #define MOVE_FILE 0x2U 372 #define MOVE_MASK (MOVE_ANON | MOVE_FILE) 373 374 /* "mc" and its members are protected by cgroup_mutex */ 375 static struct move_charge_struct { 376 spinlock_t lock; /* for from, to */ 377 struct mem_cgroup *from; 378 struct mem_cgroup *to; 379 unsigned long flags; 380 unsigned long precharge; 381 unsigned long moved_charge; 382 unsigned long moved_swap; 383 struct task_struct *moving_task; /* a task moving charges */ 384 wait_queue_head_t waitq; /* a waitq for other context */ 385 } mc = { 386 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 387 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 388 }; 389 390 /* 391 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 392 * limit reclaim to prevent infinite loops, if they ever occur. 
393 */ 394 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 395 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 396 397 enum charge_type { 398 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 399 MEM_CGROUP_CHARGE_TYPE_ANON, 400 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 401 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 402 NR_CHARGE_TYPE, 403 }; 404 405 /* for encoding cft->private value on file */ 406 enum res_type { 407 _MEM, 408 _MEMSWAP, 409 _OOM_TYPE, 410 _KMEM, 411 }; 412 413 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 414 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 415 #define MEMFILE_ATTR(val) ((val) & 0xffff) 416 /* Used for OOM nofiier */ 417 #define OOM_CONTROL (0) 418 419 /* 420 * The memcg_create_mutex will be held whenever a new cgroup is created. 421 * As a consequence, any change that needs to protect against new child cgroups 422 * appearing has to hold it as well. 423 */ 424 static DEFINE_MUTEX(memcg_create_mutex); 425 426 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 427 { 428 return s ? container_of(s, struct mem_cgroup, css) : NULL; 429 } 430 431 /* Some nice accessors for the vmpressure. */ 432 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 433 { 434 if (!memcg) 435 memcg = root_mem_cgroup; 436 return &memcg->vmpressure; 437 } 438 439 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 440 { 441 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 442 } 443 444 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 445 { 446 return (memcg == root_mem_cgroup); 447 } 448 449 /* 450 * We restrict the id in the range of [1, 65535], so it can fit into 451 * an unsigned short. 452 */ 453 #define MEM_CGROUP_ID_MAX USHRT_MAX 454 455 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 456 { 457 return memcg->css.id; 458 } 459 460 /* 461 * A helper function to get mem_cgroup from ID. must be called under 462 * rcu_read_lock(). 
The caller is responsible for calling 463 * css_tryget_online() if the mem_cgroup is used for charging. (dropping 464 * refcnt from swap can be called against removed memcg.) 465 */ 466 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 467 { 468 struct cgroup_subsys_state *css; 469 470 css = css_from_id(id, &memory_cgrp_subsys); 471 return mem_cgroup_from_css(css); 472 } 473 474 /* Writing them here to avoid exposing memcg's inner layout */ 475 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 476 477 void sock_update_memcg(struct sock *sk) 478 { 479 if (mem_cgroup_sockets_enabled) { 480 struct mem_cgroup *memcg; 481 struct cg_proto *cg_proto; 482 483 BUG_ON(!sk->sk_prot->proto_cgroup); 484 485 /* Socket cloning can throw us here with sk_cgrp already 486 * filled. It won't however, necessarily happen from 487 * process context. So the test for root memcg given 488 * the current task's memcg won't help us in this case. 489 * 490 * Respecting the original socket's memcg is a better 491 * decision in this case. 
492 */ 493 if (sk->sk_cgrp) { 494 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 495 css_get(&sk->sk_cgrp->memcg->css); 496 return; 497 } 498 499 rcu_read_lock(); 500 memcg = mem_cgroup_from_task(current); 501 cg_proto = sk->sk_prot->proto_cgroup(memcg); 502 if (!mem_cgroup_is_root(memcg) && 503 memcg_proto_active(cg_proto) && 504 css_tryget_online(&memcg->css)) { 505 sk->sk_cgrp = cg_proto; 506 } 507 rcu_read_unlock(); 508 } 509 } 510 EXPORT_SYMBOL(sock_update_memcg); 511 512 void sock_release_memcg(struct sock *sk) 513 { 514 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 515 struct mem_cgroup *memcg; 516 WARN_ON(!sk->sk_cgrp->memcg); 517 memcg = sk->sk_cgrp->memcg; 518 css_put(&sk->sk_cgrp->memcg->css); 519 } 520 } 521 522 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 523 { 524 if (!memcg || mem_cgroup_is_root(memcg)) 525 return NULL; 526 527 return &memcg->tcp_mem; 528 } 529 EXPORT_SYMBOL(tcp_proto_cgroup); 530 531 #endif 532 533 #ifdef CONFIG_MEMCG_KMEM 534 /* 535 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 536 * The main reason for not using cgroup id for this: 537 * this works better in sparse environments, where we have a lot of memcgs, 538 * but only a few kmem-limited. Or also, if we have, for instance, 200 539 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 540 * 200 entry array for that. 541 * 542 * The current size of the caches array is stored in memcg_nr_cache_ids. It 543 * will double each time we have to increase it. 
544 */ 545 static DEFINE_IDA(memcg_cache_ida); 546 int memcg_nr_cache_ids; 547 548 /* Protects memcg_nr_cache_ids */ 549 static DECLARE_RWSEM(memcg_cache_ids_sem); 550 551 void memcg_get_cache_ids(void) 552 { 553 down_read(&memcg_cache_ids_sem); 554 } 555 556 void memcg_put_cache_ids(void) 557 { 558 up_read(&memcg_cache_ids_sem); 559 } 560 561 /* 562 * MIN_SIZE is different than 1, because we would like to avoid going through 563 * the alloc/free process all the time. In a small machine, 4 kmem-limited 564 * cgroups is a reasonable guess. In the future, it could be a parameter or 565 * tunable, but that is strictly not necessary. 566 * 567 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 568 * this constant directly from cgroup, but it is understandable that this is 569 * better kept as an internal representation in cgroup.c. In any case, the 570 * cgrp_id space is not getting any smaller, and we don't have to necessarily 571 * increase ours as well if it increases. 572 */ 573 #define MEMCG_CACHES_MIN_SIZE 4 574 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 575 576 /* 577 * A lot of the calls to the cache allocation functions are expected to be 578 * inlined by the compiler. 
Since the calls to memcg_kmem_get_cache are 579 * conditional to this static branch, we'll have to allow modules that does 580 * kmem_cache_alloc and the such to see this symbol as well 581 */ 582 struct static_key memcg_kmem_enabled_key; 583 EXPORT_SYMBOL(memcg_kmem_enabled_key); 584 585 #endif /* CONFIG_MEMCG_KMEM */ 586 587 static struct mem_cgroup_per_zone * 588 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 589 { 590 int nid = zone_to_nid(zone); 591 int zid = zone_idx(zone); 592 593 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 594 } 595 596 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 597 { 598 return &memcg->css; 599 } 600 601 /** 602 * mem_cgroup_css_from_page - css of the memcg associated with a page 603 * @page: page of interest 604 * 605 * If memcg is bound to the default hierarchy, css of the memcg associated 606 * with @page is returned. The returned css remains associated with @page 607 * until it is released. 608 * 609 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup 610 * is returned. 611 * 612 * XXX: The above description of behavior on the default hierarchy isn't 613 * strictly true yet as replace_page_cache_page() can modify the 614 * association before @page is released even on the default hierarchy; 615 * however, the current and planned usages don't mix the the two functions 616 * and replace_page_cache_page() will soon be updated to make the invariant 617 * actually true. 
618 */ 619 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) 620 { 621 struct mem_cgroup *memcg; 622 623 rcu_read_lock(); 624 625 memcg = page->mem_cgroup; 626 627 if (!memcg || !cgroup_on_dfl(memcg->css.cgroup)) 628 memcg = root_mem_cgroup; 629 630 rcu_read_unlock(); 631 return &memcg->css; 632 } 633 634 static struct mem_cgroup_per_zone * 635 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 636 { 637 int nid = page_to_nid(page); 638 int zid = page_zonenum(page); 639 640 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 641 } 642 643 static struct mem_cgroup_tree_per_zone * 644 soft_limit_tree_node_zone(int nid, int zid) 645 { 646 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 647 } 648 649 static struct mem_cgroup_tree_per_zone * 650 soft_limit_tree_from_page(struct page *page) 651 { 652 int nid = page_to_nid(page); 653 int zid = page_zonenum(page); 654 655 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 656 } 657 658 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 659 struct mem_cgroup_tree_per_zone *mctz, 660 unsigned long new_usage_in_excess) 661 { 662 struct rb_node **p = &mctz->rb_root.rb_node; 663 struct rb_node *parent = NULL; 664 struct mem_cgroup_per_zone *mz_node; 665 666 if (mz->on_tree) 667 return; 668 669 mz->usage_in_excess = new_usage_in_excess; 670 if (!mz->usage_in_excess) 671 return; 672 while (*p) { 673 parent = *p; 674 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 675 tree_node); 676 if (mz->usage_in_excess < mz_node->usage_in_excess) 677 p = &(*p)->rb_left; 678 /* 679 * We can't avoid mem cgroups that are over their soft 680 * limit by the same amount 681 */ 682 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 683 p = &(*p)->rb_right; 684 } 685 rb_link_node(&mz->tree_node, parent, p); 686 rb_insert_color(&mz->tree_node, &mctz->rb_root); 687 mz->on_tree = true; 688 } 689 690 static void 
__mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 691 struct mem_cgroup_tree_per_zone *mctz) 692 { 693 if (!mz->on_tree) 694 return; 695 rb_erase(&mz->tree_node, &mctz->rb_root); 696 mz->on_tree = false; 697 } 698 699 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 700 struct mem_cgroup_tree_per_zone *mctz) 701 { 702 unsigned long flags; 703 704 spin_lock_irqsave(&mctz->lock, flags); 705 __mem_cgroup_remove_exceeded(mz, mctz); 706 spin_unlock_irqrestore(&mctz->lock, flags); 707 } 708 709 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 710 { 711 unsigned long nr_pages = page_counter_read(&memcg->memory); 712 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 713 unsigned long excess = 0; 714 715 if (nr_pages > soft_limit) 716 excess = nr_pages - soft_limit; 717 718 return excess; 719 } 720 721 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 722 { 723 unsigned long excess; 724 struct mem_cgroup_per_zone *mz; 725 struct mem_cgroup_tree_per_zone *mctz; 726 727 mctz = soft_limit_tree_from_page(page); 728 /* 729 * Necessary to update all ancestors when hierarchy is used. 730 * because their event counter is not touched. 731 */ 732 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 733 mz = mem_cgroup_page_zoneinfo(memcg, page); 734 excess = soft_limit_excess(memcg); 735 /* 736 * We have to update the tree if mz is on RB-tree or 737 * mem is over its softlimit. 738 */ 739 if (excess || mz->on_tree) { 740 unsigned long flags; 741 742 spin_lock_irqsave(&mctz->lock, flags); 743 /* if on-tree, remove it */ 744 if (mz->on_tree) 745 __mem_cgroup_remove_exceeded(mz, mctz); 746 /* 747 * Insert again. mz->usage_in_excess will be updated. 748 * If excess is 0, no tree ops. 
749 */ 750 __mem_cgroup_insert_exceeded(mz, mctz, excess); 751 spin_unlock_irqrestore(&mctz->lock, flags); 752 } 753 } 754 } 755 756 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 757 { 758 struct mem_cgroup_tree_per_zone *mctz; 759 struct mem_cgroup_per_zone *mz; 760 int nid, zid; 761 762 for_each_node(nid) { 763 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 764 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 765 mctz = soft_limit_tree_node_zone(nid, zid); 766 mem_cgroup_remove_exceeded(mz, mctz); 767 } 768 } 769 } 770 771 static struct mem_cgroup_per_zone * 772 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 773 { 774 struct rb_node *rightmost = NULL; 775 struct mem_cgroup_per_zone *mz; 776 777 retry: 778 mz = NULL; 779 rightmost = rb_last(&mctz->rb_root); 780 if (!rightmost) 781 goto done; /* Nothing to reclaim from */ 782 783 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 784 /* 785 * Remove the node now but someone else can add it back, 786 * we will to add it back at the end of reclaim to its correct 787 * position in the tree. 788 */ 789 __mem_cgroup_remove_exceeded(mz, mctz); 790 if (!soft_limit_excess(mz->memcg) || 791 !css_tryget_online(&mz->memcg->css)) 792 goto retry; 793 done: 794 return mz; 795 } 796 797 static struct mem_cgroup_per_zone * 798 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 799 { 800 struct mem_cgroup_per_zone *mz; 801 802 spin_lock_irq(&mctz->lock); 803 mz = __mem_cgroup_largest_soft_limit_node(mctz); 804 spin_unlock_irq(&mctz->lock); 805 return mz; 806 } 807 808 /* 809 * Implementation Note: reading percpu statistics for memcg. 810 * 811 * Both of vmstat[] and percpu_counter has threshold and do periodic 812 * synchronization to implement "quick" read. There are trade-off between 813 * reading cost and precision of value. Then, we may have a chance to implement 814 * a periodic synchronizion of counter in memcg's counter. 
815 * 816 * But this _read() function is used for user interface now. The user accounts 817 * memory usage by memory cgroup and he _always_ requires exact value because 818 * he accounts memory. Even if we provide quick-and-fuzzy read, we always 819 * have to visit all online cpus and make sum. So, for now, unnecessary 820 * synchronization is not implemented. (just implemented for cpu hotplug) 821 * 822 * If there are kernel internal actions which can make use of some not-exact 823 * value, and reading all cpu value can be performance bottleneck in some 824 * common workload, threashold and synchonization as vmstat[] should be 825 * implemented. 826 */ 827 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 828 enum mem_cgroup_stat_index idx) 829 { 830 long val = 0; 831 int cpu; 832 833 for_each_possible_cpu(cpu) 834 val += per_cpu(memcg->stat->count[idx], cpu); 835 return val; 836 } 837 838 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 839 enum mem_cgroup_events_index idx) 840 { 841 unsigned long val = 0; 842 int cpu; 843 844 for_each_possible_cpu(cpu) 845 val += per_cpu(memcg->stat->events[idx], cpu); 846 return val; 847 } 848 849 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 850 struct page *page, 851 int nr_pages) 852 { 853 /* 854 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 855 * counted as CACHE even if it's on ANON LRU. 856 */ 857 if (PageAnon(page)) 858 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 859 nr_pages); 860 else 861 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 862 nr_pages); 863 864 if (PageTransHuge(page)) 865 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 866 nr_pages); 867 868 /* pagein of a big page is an event. 
So, ignore page size */ 869 if (nr_pages > 0) 870 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 871 else { 872 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 873 nr_pages = -nr_pages; /* for event */ 874 } 875 876 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 877 } 878 879 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 880 { 881 struct mem_cgroup_per_zone *mz; 882 883 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 884 return mz->lru_size[lru]; 885 } 886 887 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 888 int nid, 889 unsigned int lru_mask) 890 { 891 unsigned long nr = 0; 892 int zid; 893 894 VM_BUG_ON((unsigned)nid >= nr_node_ids); 895 896 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 897 struct mem_cgroup_per_zone *mz; 898 enum lru_list lru; 899 900 for_each_lru(lru) { 901 if (!(BIT(lru) & lru_mask)) 902 continue; 903 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 904 nr += mz->lru_size[lru]; 905 } 906 } 907 return nr; 908 } 909 910 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 911 unsigned int lru_mask) 912 { 913 unsigned long nr = 0; 914 int nid; 915 916 for_each_node_state(nid, N_MEMORY) 917 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 918 return nr; 919 } 920 921 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 922 enum mem_cgroup_events_target target) 923 { 924 unsigned long val, next; 925 926 val = __this_cpu_read(memcg->stat->nr_page_events); 927 next = __this_cpu_read(memcg->stat->targets[target]); 928 /* from time_after() in jiffies.h */ 929 if ((long)next - (long)val < 0) { 930 switch (target) { 931 case MEM_CGROUP_TARGET_THRESH: 932 next = val + THRESHOLDS_EVENTS_TARGET; 933 break; 934 case MEM_CGROUP_TARGET_SOFTLIMIT: 935 next = val + SOFTLIMIT_EVENTS_TARGET; 936 break; 937 case MEM_CGROUP_TARGET_NUMAINFO: 938 next = val + NUMAINFO_EVENTS_TARGET; 939 break; 940 default: 941 break; 942 
} 943 __this_cpu_write(memcg->stat->targets[target], next); 944 return true; 945 } 946 return false; 947 } 948 949 /* 950 * Check events in order. 951 * 952 */ 953 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 954 { 955 /* threshold event is triggered in finer grain than soft limit */ 956 if (unlikely(mem_cgroup_event_ratelimit(memcg, 957 MEM_CGROUP_TARGET_THRESH))) { 958 bool do_softlimit; 959 bool do_numainfo __maybe_unused; 960 961 do_softlimit = mem_cgroup_event_ratelimit(memcg, 962 MEM_CGROUP_TARGET_SOFTLIMIT); 963 #if MAX_NUMNODES > 1 964 do_numainfo = mem_cgroup_event_ratelimit(memcg, 965 MEM_CGROUP_TARGET_NUMAINFO); 966 #endif 967 mem_cgroup_threshold(memcg); 968 if (unlikely(do_softlimit)) 969 mem_cgroup_update_tree(memcg, page); 970 #if MAX_NUMNODES > 1 971 if (unlikely(do_numainfo)) 972 atomic_inc(&memcg->numainfo_events); 973 #endif 974 } 975 } 976 977 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 978 { 979 /* 980 * mm_update_next_owner() may clear mm->owner to NULL 981 * if it races with swapoff, page migration, etc. 982 * So this can be called with p == NULL. 983 */ 984 if (unlikely(!p)) 985 return NULL; 986 987 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 988 } 989 990 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 991 { 992 struct mem_cgroup *memcg = NULL; 993 994 rcu_read_lock(); 995 do { 996 /* 997 * Page cache insertions can happen withou an 998 * actual mm context, e.g. during disk probing 999 * on boot, loopback IO, acct() writes etc. 
1000 */ 1001 if (unlikely(!mm)) 1002 memcg = root_mem_cgroup; 1003 else { 1004 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1005 if (unlikely(!memcg)) 1006 memcg = root_mem_cgroup; 1007 } 1008 } while (!css_tryget_online(&memcg->css)); 1009 rcu_read_unlock(); 1010 return memcg; 1011 } 1012 1013 /** 1014 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1015 * @root: hierarchy root 1016 * @prev: previously returned memcg, NULL on first invocation 1017 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1018 * 1019 * Returns references to children of the hierarchy below @root, or 1020 * @root itself, or %NULL after a full round-trip. 1021 * 1022 * Caller must pass the return value in @prev on subsequent 1023 * invocations for reference counting, or use mem_cgroup_iter_break() 1024 * to cancel a hierarchy walk before the round-trip is complete. 1025 * 1026 * Reclaimers can specify a zone and a priority level in @reclaim to 1027 * divide up the memcgs in the hierarchy among all concurrent 1028 * reclaimers operating on the same zone and priority. 
1029 */ 1030 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1031 struct mem_cgroup *prev, 1032 struct mem_cgroup_reclaim_cookie *reclaim) 1033 { 1034 struct reclaim_iter *uninitialized_var(iter); 1035 struct cgroup_subsys_state *css = NULL; 1036 struct mem_cgroup *memcg = NULL; 1037 struct mem_cgroup *pos = NULL; 1038 1039 if (mem_cgroup_disabled()) 1040 return NULL; 1041 1042 if (!root) 1043 root = root_mem_cgroup; 1044 1045 if (prev && !reclaim) 1046 pos = prev; 1047 1048 if (!root->use_hierarchy && root != root_mem_cgroup) { 1049 if (prev) 1050 goto out; 1051 return root; 1052 } 1053 1054 rcu_read_lock(); 1055 1056 if (reclaim) { 1057 struct mem_cgroup_per_zone *mz; 1058 1059 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); 1060 iter = &mz->iter[reclaim->priority]; 1061 1062 if (prev && reclaim->generation != iter->generation) 1063 goto out_unlock; 1064 1065 do { 1066 pos = READ_ONCE(iter->position); 1067 /* 1068 * A racing update may change the position and 1069 * put the last reference, hence css_tryget(), 1070 * or retry to see the updated position. 1071 */ 1072 } while (pos && !css_tryget(&pos->css)); 1073 } 1074 1075 if (pos) 1076 css = &pos->css; 1077 1078 for (;;) { 1079 css = css_next_descendant_pre(css, &root->css); 1080 if (!css) { 1081 /* 1082 * Reclaimers share the hierarchy walk, and a 1083 * new one might jump in right at the end of 1084 * the hierarchy - make sure they see at least 1085 * one group and restart from the beginning. 1086 */ 1087 if (!prev) 1088 continue; 1089 break; 1090 } 1091 1092 /* 1093 * Verify the css and acquire a reference. The root 1094 * is provided by the caller, so we know it's alive 1095 * and kicking, and don't take an extra reference. 1096 */ 1097 memcg = mem_cgroup_from_css(css); 1098 1099 if (css == &root->css) 1100 break; 1101 1102 if (css_tryget(css)) { 1103 /* 1104 * Make sure the memcg is initialized: 1105 * mem_cgroup_css_online() orders the the 1106 * initialization against setting the flag. 
1107 */ 1108 if (smp_load_acquire(&memcg->initialized)) 1109 break; 1110 1111 css_put(css); 1112 } 1113 1114 memcg = NULL; 1115 } 1116 1117 if (reclaim) { 1118 if (cmpxchg(&iter->position, pos, memcg) == pos) { 1119 if (memcg) 1120 css_get(&memcg->css); 1121 if (pos) 1122 css_put(&pos->css); 1123 } 1124 1125 /* 1126 * pairs with css_tryget when dereferencing iter->position 1127 * above. 1128 */ 1129 if (pos) 1130 css_put(&pos->css); 1131 1132 if (!memcg) 1133 iter->generation++; 1134 else if (!prev) 1135 reclaim->generation = iter->generation; 1136 } 1137 1138 out_unlock: 1139 rcu_read_unlock(); 1140 out: 1141 if (prev && prev != root) 1142 css_put(&prev->css); 1143 1144 return memcg; 1145 } 1146 1147 /** 1148 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1149 * @root: hierarchy root 1150 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1151 */ 1152 void mem_cgroup_iter_break(struct mem_cgroup *root, 1153 struct mem_cgroup *prev) 1154 { 1155 if (!root) 1156 root = root_mem_cgroup; 1157 if (prev && prev != root) 1158 css_put(&prev->css); 1159 } 1160 1161 /* 1162 * Iteration constructs for visiting all cgroups (under a tree). If 1163 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1164 * be used for reference counting. 
1165 */ 1166 #define for_each_mem_cgroup_tree(iter, root) \ 1167 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1168 iter != NULL; \ 1169 iter = mem_cgroup_iter(root, iter, NULL)) 1170 1171 #define for_each_mem_cgroup(iter) \ 1172 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1173 iter != NULL; \ 1174 iter = mem_cgroup_iter(NULL, iter, NULL)) 1175 1176 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1177 { 1178 struct mem_cgroup *memcg; 1179 1180 rcu_read_lock(); 1181 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1182 if (unlikely(!memcg)) 1183 goto out; 1184 1185 switch (idx) { 1186 case PGFAULT: 1187 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1188 break; 1189 case PGMAJFAULT: 1190 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1191 break; 1192 default: 1193 BUG(); 1194 } 1195 out: 1196 rcu_read_unlock(); 1197 } 1198 EXPORT_SYMBOL(__mem_cgroup_count_vm_event); 1199 1200 /** 1201 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1202 * @zone: zone of the wanted lruvec 1203 * @memcg: memcg of the wanted lruvec 1204 * 1205 * Returns the lru list vector holding pages for the given @zone and 1206 * @mem. This can be the global zone lruvec, if the memory controller 1207 * is disabled. 1208 */ 1209 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1210 struct mem_cgroup *memcg) 1211 { 1212 struct mem_cgroup_per_zone *mz; 1213 struct lruvec *lruvec; 1214 1215 if (mem_cgroup_disabled()) { 1216 lruvec = &zone->lruvec; 1217 goto out; 1218 } 1219 1220 mz = mem_cgroup_zone_zoneinfo(memcg, zone); 1221 lruvec = &mz->lruvec; 1222 out: 1223 /* 1224 * Since a node can be onlined after the mem_cgroup was created, 1225 * we have to be prepared to initialize lruvec->zone here; 1226 * and if offlined then reonlined, we need to reinitialize it. 
1227 */ 1228 if (unlikely(lruvec->zone != zone)) 1229 lruvec->zone = zone; 1230 return lruvec; 1231 } 1232 1233 /** 1234 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 1235 * @page: the page 1236 * @zone: zone of the page 1237 * 1238 * This function is only safe when following the LRU page isolation 1239 * and putback protocol: the LRU lock must be held, and the page must 1240 * either be PageLRU() or the caller must have isolated/allocated it. 1241 */ 1242 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1243 { 1244 struct mem_cgroup_per_zone *mz; 1245 struct mem_cgroup *memcg; 1246 struct lruvec *lruvec; 1247 1248 if (mem_cgroup_disabled()) { 1249 lruvec = &zone->lruvec; 1250 goto out; 1251 } 1252 1253 memcg = page->mem_cgroup; 1254 /* 1255 * Swapcache readahead pages are added to the LRU - and 1256 * possibly migrated - before they are charged. 1257 */ 1258 if (!memcg) 1259 memcg = root_mem_cgroup; 1260 1261 mz = mem_cgroup_page_zoneinfo(memcg, page); 1262 lruvec = &mz->lruvec; 1263 out: 1264 /* 1265 * Since a node can be onlined after the mem_cgroup was created, 1266 * we have to be prepared to initialize lruvec->zone here; 1267 * and if offlined then reonlined, we need to reinitialize it. 1268 */ 1269 if (unlikely(lruvec->zone != zone)) 1270 lruvec->zone = zone; 1271 return lruvec; 1272 } 1273 1274 /** 1275 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1276 * @lruvec: mem_cgroup per zone lru vector 1277 * @lru: index of lru list the page is sitting on 1278 * @nr_pages: positive when adding or negative when removing 1279 * 1280 * This function must be called when a page is added to or removed from an 1281 * lru list. 
1282 */ 1283 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1284 int nr_pages) 1285 { 1286 struct mem_cgroup_per_zone *mz; 1287 unsigned long *lru_size; 1288 1289 if (mem_cgroup_disabled()) 1290 return; 1291 1292 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1293 lru_size = mz->lru_size + lru; 1294 *lru_size += nr_pages; 1295 VM_BUG_ON((long)(*lru_size) < 0); 1296 } 1297 1298 bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) 1299 { 1300 if (root == memcg) 1301 return true; 1302 if (!root->use_hierarchy) 1303 return false; 1304 return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); 1305 } 1306 1307 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) 1308 { 1309 struct mem_cgroup *task_memcg; 1310 struct task_struct *p; 1311 bool ret; 1312 1313 p = find_lock_task_mm(task); 1314 if (p) { 1315 task_memcg = get_mem_cgroup_from_mm(p->mm); 1316 task_unlock(p); 1317 } else { 1318 /* 1319 * All threads may have already detached their mm's, but the oom 1320 * killer still needs to detect if they have already been oom 1321 * killed to prevent needlessly killing additional tasks. 
1322 */ 1323 rcu_read_lock(); 1324 task_memcg = mem_cgroup_from_task(task); 1325 css_get(&task_memcg->css); 1326 rcu_read_unlock(); 1327 } 1328 ret = mem_cgroup_is_descendant(task_memcg, memcg); 1329 css_put(&task_memcg->css); 1330 return ret; 1331 } 1332 1333 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1334 { 1335 unsigned long inactive_ratio; 1336 unsigned long inactive; 1337 unsigned long active; 1338 unsigned long gb; 1339 1340 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1341 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1342 1343 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1344 if (gb) 1345 inactive_ratio = int_sqrt(10 * gb); 1346 else 1347 inactive_ratio = 1; 1348 1349 return inactive * inactive_ratio < active; 1350 } 1351 1352 bool mem_cgroup_lruvec_online(struct lruvec *lruvec) 1353 { 1354 struct mem_cgroup_per_zone *mz; 1355 struct mem_cgroup *memcg; 1356 1357 if (mem_cgroup_disabled()) 1358 return true; 1359 1360 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1361 memcg = mz->memcg; 1362 1363 return !!(memcg->css.flags & CSS_ONLINE); 1364 } 1365 1366 #define mem_cgroup_from_counter(counter, member) \ 1367 container_of(counter, struct mem_cgroup, member) 1368 1369 /** 1370 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1371 * @memcg: the memory cgroup 1372 * 1373 * Returns the maximum amount of memory @mem can be charged with, in 1374 * pages. 
1375 */ 1376 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1377 { 1378 unsigned long margin = 0; 1379 unsigned long count; 1380 unsigned long limit; 1381 1382 count = page_counter_read(&memcg->memory); 1383 limit = READ_ONCE(memcg->memory.limit); 1384 if (count < limit) 1385 margin = limit - count; 1386 1387 if (do_swap_account) { 1388 count = page_counter_read(&memcg->memsw); 1389 limit = READ_ONCE(memcg->memsw.limit); 1390 if (count <= limit) 1391 margin = min(margin, limit - count); 1392 } 1393 1394 return margin; 1395 } 1396 1397 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1398 { 1399 /* root ? */ 1400 if (mem_cgroup_disabled() || !memcg->css.parent) 1401 return vm_swappiness; 1402 1403 return memcg->swappiness; 1404 } 1405 1406 /* 1407 * A routine for checking "mem" is under move_account() or not. 1408 * 1409 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1410 * moving cgroups. This is for waiting at high-memory pressure 1411 * caused by "move". 1412 */ 1413 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1414 { 1415 struct mem_cgroup *from; 1416 struct mem_cgroup *to; 1417 bool ret = false; 1418 /* 1419 * Unlike task_move routines, we access mc.to, mc.from not under 1420 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1421 */ 1422 spin_lock(&mc.lock); 1423 from = mc.from; 1424 to = mc.to; 1425 if (!from) 1426 goto unlock; 1427 1428 ret = mem_cgroup_is_descendant(from, memcg) || 1429 mem_cgroup_is_descendant(to, memcg); 1430 unlock: 1431 spin_unlock(&mc.lock); 1432 return ret; 1433 } 1434 1435 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1436 { 1437 if (mc.moving_task && current != mc.moving_task) { 1438 if (mem_cgroup_under_move(memcg)) { 1439 DEFINE_WAIT(wait); 1440 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1441 /* moving charge context might have finished. 
*/ 1442 if (mc.moving_task) 1443 schedule(); 1444 finish_wait(&mc.waitq, &wait); 1445 return true; 1446 } 1447 } 1448 return false; 1449 } 1450 1451 #define K(x) ((x) << (PAGE_SHIFT-10)) 1452 /** 1453 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1454 * @memcg: The memory cgroup that went over limit 1455 * @p: Task that is going to be killed 1456 * 1457 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1458 * enabled 1459 */ 1460 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1461 { 1462 /* oom_info_lock ensures that parallel ooms do not interleave */ 1463 static DEFINE_MUTEX(oom_info_lock); 1464 struct mem_cgroup *iter; 1465 unsigned int i; 1466 1467 mutex_lock(&oom_info_lock); 1468 rcu_read_lock(); 1469 1470 if (p) { 1471 pr_info("Task in "); 1472 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1473 pr_cont(" killed as a result of limit of "); 1474 } else { 1475 pr_info("Memory limit reached of cgroup "); 1476 } 1477 1478 pr_cont_cgroup_path(memcg->css.cgroup); 1479 pr_cont("\n"); 1480 1481 rcu_read_unlock(); 1482 1483 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1484 K((u64)page_counter_read(&memcg->memory)), 1485 K((u64)memcg->memory.limit), memcg->memory.failcnt); 1486 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1487 K((u64)page_counter_read(&memcg->memsw)), 1488 K((u64)memcg->memsw.limit), memcg->memsw.failcnt); 1489 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1490 K((u64)page_counter_read(&memcg->kmem)), 1491 K((u64)memcg->kmem.limit), memcg->kmem.failcnt); 1492 1493 for_each_mem_cgroup_tree(iter, memcg) { 1494 pr_info("Memory cgroup stats for "); 1495 pr_cont_cgroup_path(iter->css.cgroup); 1496 pr_cont(":"); 1497 1498 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1499 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1500 continue; 1501 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1502 
K(mem_cgroup_read_stat(iter, i))); 1503 } 1504 1505 for (i = 0; i < NR_LRU_LISTS; i++) 1506 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1507 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1508 1509 pr_cont("\n"); 1510 } 1511 mutex_unlock(&oom_info_lock); 1512 } 1513 1514 /* 1515 * This function returns the number of memcg under hierarchy tree. Returns 1516 * 1(self count) if no children. 1517 */ 1518 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1519 { 1520 int num = 0; 1521 struct mem_cgroup *iter; 1522 1523 for_each_mem_cgroup_tree(iter, memcg) 1524 num++; 1525 return num; 1526 } 1527 1528 /* 1529 * Return the memory (and swap, if configured) limit for a memcg. 1530 */ 1531 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) 1532 { 1533 unsigned long limit; 1534 1535 limit = memcg->memory.limit; 1536 if (mem_cgroup_swappiness(memcg)) { 1537 unsigned long memsw_limit; 1538 1539 memsw_limit = memcg->memsw.limit; 1540 limit = min(limit + total_swap_pages, memsw_limit); 1541 } 1542 return limit; 1543 } 1544 1545 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1546 int order) 1547 { 1548 struct mem_cgroup *iter; 1549 unsigned long chosen_points = 0; 1550 unsigned long totalpages; 1551 unsigned int points = 0; 1552 struct task_struct *chosen = NULL; 1553 1554 mutex_lock(&oom_lock); 1555 1556 /* 1557 * If current has a pending SIGKILL or is exiting, then automatically 1558 * select it. The goal is to allow it to allocate so that it may 1559 * quickly exit and free its memory. 1560 */ 1561 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1562 mark_oom_victim(current); 1563 goto unlock; 1564 } 1565 1566 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); 1567 totalpages = mem_cgroup_get_limit(memcg) ? 
: 1; 1568 for_each_mem_cgroup_tree(iter, memcg) { 1569 struct css_task_iter it; 1570 struct task_struct *task; 1571 1572 css_task_iter_start(&iter->css, &it); 1573 while ((task = css_task_iter_next(&it))) { 1574 switch (oom_scan_process_thread(task, totalpages, NULL, 1575 false)) { 1576 case OOM_SCAN_SELECT: 1577 if (chosen) 1578 put_task_struct(chosen); 1579 chosen = task; 1580 chosen_points = ULONG_MAX; 1581 get_task_struct(chosen); 1582 /* fall through */ 1583 case OOM_SCAN_CONTINUE: 1584 continue; 1585 case OOM_SCAN_ABORT: 1586 css_task_iter_end(&it); 1587 mem_cgroup_iter_break(memcg, iter); 1588 if (chosen) 1589 put_task_struct(chosen); 1590 goto unlock; 1591 case OOM_SCAN_OK: 1592 break; 1593 }; 1594 points = oom_badness(task, memcg, NULL, totalpages); 1595 if (!points || points < chosen_points) 1596 continue; 1597 /* Prefer thread group leaders for display purposes */ 1598 if (points == chosen_points && 1599 thread_group_leader(chosen)) 1600 continue; 1601 1602 if (chosen) 1603 put_task_struct(chosen); 1604 chosen = task; 1605 chosen_points = points; 1606 get_task_struct(chosen); 1607 } 1608 css_task_iter_end(&it); 1609 } 1610 1611 if (chosen) { 1612 points = chosen_points * 1000 / totalpages; 1613 oom_kill_process(chosen, gfp_mask, order, points, totalpages, 1614 memcg, NULL, "Memory cgroup out of memory"); 1615 } 1616 unlock: 1617 mutex_unlock(&oom_lock); 1618 } 1619 1620 #if MAX_NUMNODES > 1 1621 1622 /** 1623 * test_mem_cgroup_node_reclaimable 1624 * @memcg: the target memcg 1625 * @nid: the node ID to be checked. 1626 * @noswap : specify true here if the user wants flle only information. 1627 * 1628 * This function returns whether the specified memcg contains any 1629 * reclaimable pages on a node. Returns true if there are any reclaimable 1630 * pages in the node. 
1631 */ 1632 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1633 int nid, bool noswap) 1634 { 1635 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1636 return true; 1637 if (noswap || !total_swap_pages) 1638 return false; 1639 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1640 return true; 1641 return false; 1642 1643 } 1644 1645 /* 1646 * Always updating the nodemask is not very good - even if we have an empty 1647 * list or the wrong list here, we can start from some node and traverse all 1648 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1649 * 1650 */ 1651 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1652 { 1653 int nid; 1654 /* 1655 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1656 * pagein/pageout changes since the last update. 1657 */ 1658 if (!atomic_read(&memcg->numainfo_events)) 1659 return; 1660 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1661 return; 1662 1663 /* make a nodemask where this memcg uses memory from */ 1664 memcg->scan_nodes = node_states[N_MEMORY]; 1665 1666 for_each_node_mask(nid, node_states[N_MEMORY]) { 1667 1668 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1669 node_clear(nid, memcg->scan_nodes); 1670 } 1671 1672 atomic_set(&memcg->numainfo_events, 0); 1673 atomic_set(&memcg->numainfo_updating, 0); 1674 } 1675 1676 /* 1677 * Selecting a node where we start reclaim from. Because what we need is just 1678 * reducing usage counter, start from anywhere is O,K. Considering 1679 * memory reclaim from current node, there are pros. and cons. 1680 * 1681 * Freeing memory from current node means freeing memory from a node which 1682 * we'll use or we've used. So, it may make LRU bad. And if several threads 1683 * hit limits, it will see a contention on a node. But freeing from remote 1684 * node means more costs for memory reclaim because of memory latency. 1685 * 1686 * Now, we use round-robin. 
Better algorithm is welcomed. 1687 */ 1688 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1689 { 1690 int node; 1691 1692 mem_cgroup_may_update_nodemask(memcg); 1693 node = memcg->last_scanned_node; 1694 1695 node = next_node(node, memcg->scan_nodes); 1696 if (node == MAX_NUMNODES) 1697 node = first_node(memcg->scan_nodes); 1698 /* 1699 * We call this when we hit limit, not when pages are added to LRU. 1700 * No LRU may hold pages because all pages are UNEVICTABLE or 1701 * memcg is too small and all pages are not on LRU. In that case, 1702 * we use curret node. 1703 */ 1704 if (unlikely(node == MAX_NUMNODES)) 1705 node = numa_node_id(); 1706 1707 memcg->last_scanned_node = node; 1708 return node; 1709 } 1710 #else 1711 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1712 { 1713 return 0; 1714 } 1715 #endif 1716 1717 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1718 struct zone *zone, 1719 gfp_t gfp_mask, 1720 unsigned long *total_scanned) 1721 { 1722 struct mem_cgroup *victim = NULL; 1723 int total = 0; 1724 int loop = 0; 1725 unsigned long excess; 1726 unsigned long nr_scanned; 1727 struct mem_cgroup_reclaim_cookie reclaim = { 1728 .zone = zone, 1729 .priority = 0, 1730 }; 1731 1732 excess = soft_limit_excess(root_memcg); 1733 1734 while (1) { 1735 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1736 if (!victim) { 1737 loop++; 1738 if (loop >= 2) { 1739 /* 1740 * If we have not been able to reclaim 1741 * anything, it might because there are 1742 * no reclaimable pages under this hierarchy 1743 */ 1744 if (!total) 1745 break; 1746 /* 1747 * We want to do more targeted reclaim. 
1748 * excess >> 2 is not to excessive so as to 1749 * reclaim too much, nor too less that we keep 1750 * coming back to reclaim from this cgroup 1751 */ 1752 if (total >= (excess >> 2) || 1753 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1754 break; 1755 } 1756 continue; 1757 } 1758 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1759 zone, &nr_scanned); 1760 *total_scanned += nr_scanned; 1761 if (!soft_limit_excess(root_memcg)) 1762 break; 1763 } 1764 mem_cgroup_iter_break(root_memcg, victim); 1765 return total; 1766 } 1767 1768 #ifdef CONFIG_LOCKDEP 1769 static struct lockdep_map memcg_oom_lock_dep_map = { 1770 .name = "memcg_oom_lock", 1771 }; 1772 #endif 1773 1774 static DEFINE_SPINLOCK(memcg_oom_lock); 1775 1776 /* 1777 * Check OOM-Killer is already running under our hierarchy. 1778 * If someone is running, return false. 1779 */ 1780 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1781 { 1782 struct mem_cgroup *iter, *failed = NULL; 1783 1784 spin_lock(&memcg_oom_lock); 1785 1786 for_each_mem_cgroup_tree(iter, memcg) { 1787 if (iter->oom_lock) { 1788 /* 1789 * this subtree of our hierarchy is already locked 1790 * so we cannot give a lock. 
1791 */ 1792 failed = iter; 1793 mem_cgroup_iter_break(memcg, iter); 1794 break; 1795 } else 1796 iter->oom_lock = true; 1797 } 1798 1799 if (failed) { 1800 /* 1801 * OK, we failed to lock the whole subtree so we have 1802 * to clean up what we set up to the failing subtree 1803 */ 1804 for_each_mem_cgroup_tree(iter, memcg) { 1805 if (iter == failed) { 1806 mem_cgroup_iter_break(memcg, iter); 1807 break; 1808 } 1809 iter->oom_lock = false; 1810 } 1811 } else 1812 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1813 1814 spin_unlock(&memcg_oom_lock); 1815 1816 return !failed; 1817 } 1818 1819 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1820 { 1821 struct mem_cgroup *iter; 1822 1823 spin_lock(&memcg_oom_lock); 1824 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 1825 for_each_mem_cgroup_tree(iter, memcg) 1826 iter->oom_lock = false; 1827 spin_unlock(&memcg_oom_lock); 1828 } 1829 1830 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1831 { 1832 struct mem_cgroup *iter; 1833 1834 spin_lock(&memcg_oom_lock); 1835 for_each_mem_cgroup_tree(iter, memcg) 1836 iter->under_oom++; 1837 spin_unlock(&memcg_oom_lock); 1838 } 1839 1840 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1841 { 1842 struct mem_cgroup *iter; 1843 1844 /* 1845 * When a new child is created while the hierarchy is under oom, 1846 * mem_cgroup_oom_lock() may not be called. Watch for underflow. 
1847 */ 1848 spin_lock(&memcg_oom_lock); 1849 for_each_mem_cgroup_tree(iter, memcg) 1850 if (iter->under_oom > 0) 1851 iter->under_oom--; 1852 spin_unlock(&memcg_oom_lock); 1853 } 1854 1855 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1856 1857 struct oom_wait_info { 1858 struct mem_cgroup *memcg; 1859 wait_queue_t wait; 1860 }; 1861 1862 static int memcg_oom_wake_function(wait_queue_t *wait, 1863 unsigned mode, int sync, void *arg) 1864 { 1865 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1866 struct mem_cgroup *oom_wait_memcg; 1867 struct oom_wait_info *oom_wait_info; 1868 1869 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1870 oom_wait_memcg = oom_wait_info->memcg; 1871 1872 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1873 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1874 return 0; 1875 return autoremove_wake_function(wait, mode, sync, arg); 1876 } 1877 1878 static void memcg_oom_recover(struct mem_cgroup *memcg) 1879 { 1880 /* 1881 * For the following lockless ->under_oom test, the only required 1882 * guarantee is that it must see the state asserted by an OOM when 1883 * this function is called as a result of userland actions 1884 * triggered by the notification of the OOM. This is trivially 1885 * achieved by invoking mem_cgroup_mark_under_oom() before 1886 * triggering notification. 1887 */ 1888 if (memcg && memcg->under_oom) 1889 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1890 } 1891 1892 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1893 { 1894 if (!current->memcg_oom.may_oom) 1895 return; 1896 /* 1897 * We are in the middle of the charge context here, so we 1898 * don't want to block when potentially sitting on a callstack 1899 * that holds all kinds of filesystem and mm locks. 
1900 * 1901 * Also, the caller may handle a failed allocation gracefully 1902 * (like optional page cache readahead) and so an OOM killer 1903 * invocation might not even be necessary. 1904 * 1905 * That's why we don't do anything here except remember the 1906 * OOM context and then deal with it at the end of the page 1907 * fault when the stack is unwound, the locks are released, 1908 * and when we know whether the fault was overall successful. 1909 */ 1910 css_get(&memcg->css); 1911 current->memcg_oom.memcg = memcg; 1912 current->memcg_oom.gfp_mask = mask; 1913 current->memcg_oom.order = order; 1914 } 1915 1916 /** 1917 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1918 * @handle: actually kill/wait or just clean up the OOM state 1919 * 1920 * This has to be called at the end of a page fault if the memcg OOM 1921 * handler was enabled. 1922 * 1923 * Memcg supports userspace OOM handling where failed allocations must 1924 * sleep on a waitqueue until the userspace task resolves the 1925 * situation. Sleeping directly in the charge context with all kinds 1926 * of locks held is not a good idea, instead we remember an OOM state 1927 * in the task and mem_cgroup_oom_synchronize() has to be called at 1928 * the end of the page fault to complete the OOM handling. 1929 * 1930 * Returns %true if an ongoing memcg OOM situation was detected and 1931 * completed, %false otherwise. 
1932 */ 1933 bool mem_cgroup_oom_synchronize(bool handle) 1934 { 1935 struct mem_cgroup *memcg = current->memcg_oom.memcg; 1936 struct oom_wait_info owait; 1937 bool locked; 1938 1939 /* OOM is global, do not handle */ 1940 if (!memcg) 1941 return false; 1942 1943 if (!handle || oom_killer_disabled) 1944 goto cleanup; 1945 1946 owait.memcg = memcg; 1947 owait.wait.flags = 0; 1948 owait.wait.func = memcg_oom_wake_function; 1949 owait.wait.private = current; 1950 INIT_LIST_HEAD(&owait.wait.task_list); 1951 1952 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1953 mem_cgroup_mark_under_oom(memcg); 1954 1955 locked = mem_cgroup_oom_trylock(memcg); 1956 1957 if (locked) 1958 mem_cgroup_oom_notify(memcg); 1959 1960 if (locked && !memcg->oom_kill_disable) { 1961 mem_cgroup_unmark_under_oom(memcg); 1962 finish_wait(&memcg_oom_waitq, &owait.wait); 1963 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 1964 current->memcg_oom.order); 1965 } else { 1966 schedule(); 1967 mem_cgroup_unmark_under_oom(memcg); 1968 finish_wait(&memcg_oom_waitq, &owait.wait); 1969 } 1970 1971 if (locked) { 1972 mem_cgroup_oom_unlock(memcg); 1973 /* 1974 * There is no guarantee that an OOM-lock contender 1975 * sees the wakeups triggered by the OOM kill 1976 * uncharges. Wake any sleepers explicitely. 
1977 */ 1978 memcg_oom_recover(memcg); 1979 } 1980 cleanup: 1981 current->memcg_oom.memcg = NULL; 1982 css_put(&memcg->css); 1983 return true; 1984 } 1985 1986 /** 1987 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 1988 * @page: page that is going to change accounted state 1989 * 1990 * This function must mark the beginning of an accounted page state 1991 * change to prevent double accounting when the page is concurrently 1992 * being moved to another memcg: 1993 * 1994 * memcg = mem_cgroup_begin_page_stat(page); 1995 * if (TestClearPageState(page)) 1996 * mem_cgroup_update_page_stat(memcg, state, -1); 1997 * mem_cgroup_end_page_stat(memcg); 1998 */ 1999 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) 2000 { 2001 struct mem_cgroup *memcg; 2002 unsigned long flags; 2003 2004 /* 2005 * The RCU lock is held throughout the transaction. The fast 2006 * path can get away without acquiring the memcg->move_lock 2007 * because page moving starts with an RCU grace period. 2008 * 2009 * The RCU lock also protects the memcg from being freed when 2010 * the page state that is going to change is the only thing 2011 * preventing the page from being uncharged. 2012 * E.g. end-writeback clearing PageWriteback(), which allows 2013 * migration to go ahead and uncharge the page before the 2014 * account transaction might be complete. 2015 */ 2016 rcu_read_lock(); 2017 2018 if (mem_cgroup_disabled()) 2019 return NULL; 2020 again: 2021 memcg = page->mem_cgroup; 2022 if (unlikely(!memcg)) 2023 return NULL; 2024 2025 if (atomic_read(&memcg->moving_account) <= 0) 2026 return memcg; 2027 2028 spin_lock_irqsave(&memcg->move_lock, flags); 2029 if (memcg != page->mem_cgroup) { 2030 spin_unlock_irqrestore(&memcg->move_lock, flags); 2031 goto again; 2032 } 2033 2034 /* 2035 * When charge migration first begins, we can have locked and 2036 * unlocked page stat updates happening concurrently. 
Track 2037 * the task who has the lock for mem_cgroup_end_page_stat(). 2038 */ 2039 memcg->move_lock_task = current; 2040 memcg->move_lock_flags = flags; 2041 2042 return memcg; 2043 } 2044 EXPORT_SYMBOL(mem_cgroup_begin_page_stat); 2045 2046 /** 2047 * mem_cgroup_end_page_stat - finish a page state statistics transaction 2048 * @memcg: the memcg that was accounted against 2049 */ 2050 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) 2051 { 2052 if (memcg && memcg->move_lock_task == current) { 2053 unsigned long flags = memcg->move_lock_flags; 2054 2055 memcg->move_lock_task = NULL; 2056 memcg->move_lock_flags = 0; 2057 2058 spin_unlock_irqrestore(&memcg->move_lock, flags); 2059 } 2060 2061 rcu_read_unlock(); 2062 } 2063 EXPORT_SYMBOL(mem_cgroup_end_page_stat); 2064 2065 /** 2066 * mem_cgroup_update_page_stat - update page state statistics 2067 * @memcg: memcg to account against 2068 * @idx: page state item to account 2069 * @val: number of pages (positive or negative) 2070 * 2071 * See mem_cgroup_begin_page_stat() for locking requirements. 2072 */ 2073 void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, 2074 enum mem_cgroup_stat_index idx, int val) 2075 { 2076 VM_BUG_ON(!rcu_read_lock_held()); 2077 2078 if (memcg) 2079 this_cpu_add(memcg->stat->count[idx], val); 2080 } 2081 2082 /* 2083 * size of first charge trial. "32" comes from vmscan.c's magic value. 2084 * TODO: maybe necessary to use big numbers in big irons. 2085 */ 2086 #define CHARGE_BATCH 32U 2087 struct memcg_stock_pcp { 2088 struct mem_cgroup *cached; /* this never be root cgroup */ 2089 unsigned int nr_pages; 2090 struct work_struct work; 2091 unsigned long flags; 2092 #define FLUSHING_CACHED_CHARGE 0 2093 }; 2094 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2095 static DEFINE_MUTEX(percpu_charge_mutex); 2096 2097 /** 2098 * consume_stock: Try to consume stocked charge on this cpu. 2099 * @memcg: memcg to consume from. 2100 * @nr_pages: how many pages to charge. 
2101 * 2102 * The charges will only happen if @memcg matches the current cpu's memcg 2103 * stock, and at least @nr_pages are available in that stock. Failure to 2104 * service an allocation will refill the stock. 2105 * 2106 * returns true if successful, false otherwise. 2107 */ 2108 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2109 { 2110 struct memcg_stock_pcp *stock; 2111 bool ret = false; 2112 2113 if (nr_pages > CHARGE_BATCH) 2114 return ret; 2115 2116 stock = &get_cpu_var(memcg_stock); 2117 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2118 stock->nr_pages -= nr_pages; 2119 ret = true; 2120 } 2121 put_cpu_var(memcg_stock); 2122 return ret; 2123 } 2124 2125 /* 2126 * Returns stocks cached in percpu and reset cached information. 2127 */ 2128 static void drain_stock(struct memcg_stock_pcp *stock) 2129 { 2130 struct mem_cgroup *old = stock->cached; 2131 2132 if (stock->nr_pages) { 2133 page_counter_uncharge(&old->memory, stock->nr_pages); 2134 if (do_swap_account) 2135 page_counter_uncharge(&old->memsw, stock->nr_pages); 2136 css_put_many(&old->css, stock->nr_pages); 2137 stock->nr_pages = 0; 2138 } 2139 stock->cached = NULL; 2140 } 2141 2142 /* 2143 * This must be called under preempt disabled or must be called by 2144 * a thread which is pinned to local cpu. 2145 */ 2146 static void drain_local_stock(struct work_struct *dummy) 2147 { 2148 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); 2149 drain_stock(stock); 2150 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2151 } 2152 2153 /* 2154 * Cache charges(val) to local per_cpu area. 2155 * This will be consumed by consume_stock() function, later. 
*/
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	/* get_cpu_var() is paired with the put_cpu_var() at the end. */
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != memcg) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = memcg;
	}
	stock->nr_pages += nr_pages;
	put_cpu_var(memcg_stock);
}

/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 *
 * The stock of the CPU we are running on is drained directly; for every
 * other online CPU the stock's work item is scheduled on that CPU.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid adding more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	get_online_cpus();
	curcpu = get_cpu();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;

		memcg = stock->cached;
		/* Nothing cached on this CPU. */
		if (!memcg || !stock->nr_pages)
			continue;
		/* Cached charges belong to a memcg outside the subtree. */
		if (!mem_cgroup_is_descendant(memcg, root_memcg))
			continue;
		/* Only act if we are the one who set the flushing bit. */
		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else
				schedule_work_on(cpu, &stock->work);
		}
	}
	put_cpu();
	put_online_cpus();
	mutex_unlock(&percpu_charge_mutex);
}

/*
 * CPU hotplug notifier: drain the stock of a CPU that has gone away
 * (CPU_DEAD or CPU_DEAD_FROZEN).  Every other action is ignored.
 * Always returns NOTIFY_OK.
 */
static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;

	if (action == CPU_ONLINE)
		return NOTIFY_OK;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}

/*
 * try_charge - charge @nr_pages to @memcg
 * @memcg: memcg to charge
 * @gfp_mask: allocation context; __GFP_WAIT, __GFP_NORETRY and
 *            __GFP_NOFAIL are inspected here
 * @nr_pages: number of pages to charge
 *
 * Returns 0 when the charge is in place (immediately so for the root
 * memcg), -ENOMEM when it failed, and -EINTR when the charge is bypassed:
 * for dying or OOM-killed tasks, and for __GFP_NOFAIL requests that could
 * not be satisfied.
 */
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		      unsigned int nr_pages)
{
	/* Try to charge a whole batch first; the surplus goes to the stock. */
	unsigned int batch = max(CHARGE_BATCH, nr_pages);
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *mem_over_limit;
	struct page_counter *counter;
	unsigned long nr_reclaimed;
	bool may_swap = true;
	bool drained = false;
	int ret = 0;

	if (mem_cgroup_is_root(memcg))
		goto done;
retry:
	if (consume_stock(memcg, nr_pages))
		goto done;

	/*
	 * With swap accounting, memsw is charged before memory.  If the
	 * memory charge then fails, the memsw charge is undone again.
	 * @counter is set by the failing call and identifies the memcg
	 * that is over its limit.
	 */
	if (!do_swap_account ||
	    !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
		if (!page_counter_try_charge(&memcg->memory, batch, &counter))
			goto done_restock;
		if (do_swap_account)
			page_counter_uncharge(&memcg->memsw, batch);
		mem_over_limit = mem_cgroup_from_counter(counter, memory);
	} else {
		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
		may_swap = false;
	}

	/* The batched charge failed: retry with just the requested pages. */
	if (batch > nr_pages) {
		batch = nr_pages;
		goto retry;
	}

	/*
	 * Unlike in global OOM situations, memcg is not in a physical
	 * memory shortage.  Allow dying and OOM-killed tasks to
	 * bypass the last charges so that they can exit quickly and
	 * free their memory.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
		     fatal_signal_pending(current) ||
		     current->flags & PF_EXITING))
		goto bypass;

	if (unlikely(task_in_memcg_oom(current)))
		goto nomem;

	/* No reclaim is attempted without __GFP_WAIT. */
	if (!(gfp_mask & __GFP_WAIT))
		goto nomem;

	mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);

	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, may_swap);

	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;

	/* Drain the per-cpu stocks once before giving up on this round. */
	if (!drained) {
		drain_all_stock(mem_over_limit);
		drained = true;
		goto retry;
	}

	if (gfp_mask & __GFP_NORETRY)
		goto nomem;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
		goto retry;
	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		goto retry;

	if (nr_retries--)
		goto retry;

	if (gfp_mask & __GFP_NOFAIL)
		goto bypass;

	if (fatal_signal_pending(current))
		goto bypass;

	mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);

	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
nomem:
	/* __GFP_NOFAIL requests fall through to the bypass return below. */
	if (!(gfp_mask & __GFP_NOFAIL))
		return -ENOMEM;
bypass:
	return -EINTR;

done_restock:
	/* One css reference per charged page, stocked surplus included. */
	css_get_many(&memcg->css, batch);
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);
	if (!(gfp_mask & __GFP_WAIT))
		goto done;
	/*
	 * If the hierarchy is above the normal consumption range,
	 * make the charging task trim their excess contribution.
	 */
	do {
		/* "continue" in a do-while jumps to the loop condition. */
		if (page_counter_read(&memcg->memory) <= memcg->high)
			continue;
		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
	} while ((memcg = parent_mem_cgroup(memcg)));
done:
	return ret;
}

/*
 * Undo a charge of @nr_pages against @memcg: uncharge the memory counter
 * (and memsw when swap accounting is on) and drop the matching css
 * references.  Does nothing for the root memcg.
 */
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (mem_cgroup_is_root(memcg))
		return;

	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_swap_account)
		page_counter_uncharge(&memcg->memsw, nr_pages);

	css_put_many(&memcg->css, nr_pages);
}

/*
 * try_get_mem_cgroup_from_page - look up page's memcg association
 * @page: the page
 *
 * Look up, get a css reference, and return the memcg that owns @page.
 *
 * The page must be locked to prevent racing with swap-in and page
 * cache charges.  If coming from an unlocked page table, the caller
 * must ensure the page is on the LRU or this can race with charging.
*/
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned short id;
	swp_entry_t ent;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	memcg = page->mem_cgroup;
	if (memcg) {
		/* Already charged: NULL if the reference cannot be taken. */
		if (!css_tryget_online(&memcg->css))
			memcg = NULL;
	} else if (PageSwapCache(page)) {
		/*
		 * Not charged but in swap cache: find the owner through the
		 * swap cgroup id recorded for the entry in page_private().
		 */
		ent.val = page_private(page);
		id = lookup_swap_cgroup_id(ent);
		rcu_read_lock();
		memcg = mem_cgroup_from_id(id);
		if (memcg && !css_tryget_online(&memcg->css))
			memcg = NULL;
		rcu_read_unlock();
	}
	return memcg;
}

/*
 * Take zone->lru_lock with interrupts disabled and, if @page is on an LRU
 * list, remove it.  *@isolated is set to 1 if the page was removed and to
 * 0 otherwise.  The lock is still held on return; unlock_page_lru()
 * releases it.
 */
static void lock_page_lru(struct page *page, int *isolated)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page)) {
		struct lruvec *lruvec;

		lruvec = mem_cgroup_page_lruvec(page, zone);
		ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_lru(page));
		*isolated = 1;
	} else
		*isolated = 0;
}

/*
 * Counterpart of lock_page_lru(): if the page was isolated, put it back
 * on the LRU list of the lruvec it maps to now, then release
 * zone->lru_lock.
 */
static void unlock_page_lru(struct page *page, int isolated)
{
	struct zone *zone = page_zone(page);

	if (isolated) {
		struct lruvec *lruvec;

		lruvec = mem_cgroup_page_lruvec(page, zone);
		VM_BUG_ON_PAGE(PageLRU(page), page);
		SetPageLRU(page);
		add_page_to_lru_list(page, lruvec, page_lru(page));
	}
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * Record @memcg as the owner of @page.  The page must not have an owner
 * yet.  With @lrucare set, the assignment is made with the page taken
 * off its LRU list under zone->lru_lock, and the page is put back
 * afterwards.
 */
static void commit_charge(struct page *page, struct mem_cgroup *memcg,
			  bool lrucare)
{
	/* Only written and read when @lrucare is set. */
	int isolated;

	VM_BUG_ON_PAGE(page->mem_cgroup, page);

	/*
	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
	 * may already be on some other mem_cgroup's LRU.  Take care of it.
	 */
	if (lrucare)
		lock_page_lru(page, &isolated);

	/*
	 * Nobody should be changing or seriously looking at
	 * page->mem_cgroup at this point:
	 *
	 * - the page is uncharged
	 *
	 * - the page is off-LRU
	 *
	 * - an anonymous fault has exclusive page access, except for
	 *   a locked page table
	 *
	 * - a page cache insertion, a swapin fault, or a migration
	 *   have the page locked
	 */
	page->mem_cgroup = memcg;

	if (lrucare)
		unlock_page_lru(page, isolated);
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Charge @nr_pages of kernel memory to @memcg: first against the kmem
 * counter, then against memory (and memsw) through try_charge().
 *
 * Returns 0 on success or a negative error.  When try_charge() fails with
 * anything but -EINTR, the kmem charge is rolled back.
 */
int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
		      unsigned long nr_pages)
{
	struct page_counter *counter;
	int ret = 0;

	ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
	if (ret < 0)
		return ret;

	ret = try_charge(memcg, gfp, nr_pages);
	if (ret == -EINTR)  {
		/*
		 * try_charge() chose to bypass to root due to OOM kill or
		 * fatal signal.  Since our only options are to either fail
		 * the allocation or charge it to this cgroup, do it as a
		 * temporary condition. But we can't fail. From a kmem/slab
		 * perspective, the cache has already been selected, by
		 * mem_cgroup_kmem_get_cache(), so it is too late to change
		 * our minds.
		 *
		 * This condition will only trigger if the task entered
		 * memcg_charge_kmem in a sane state, but was OOM-killed
		 * during try_charge() above. Tasks that were already dying
		 * when the allocation triggers should have been already
		 * directed to the root cgroup in memcontrol.h
		 */
		page_counter_charge(&memcg->memory, nr_pages);
		if (do_swap_account)
			page_counter_charge(&memcg->memsw, nr_pages);
		css_get_many(&memcg->css, nr_pages);
		ret = 0;
	} else if (ret)
		page_counter_uncharge(&memcg->kmem, nr_pages);

	return ret;
}

/*
 * Undo memcg_charge_kmem(): uncharge @nr_pages from the memory, memsw
 * (when swap accounting is on) and kmem counters and drop the css
 * references.
 */
void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
{
	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_swap_account)
		page_counter_uncharge(&memcg->memsw, nr_pages);

	page_counter_uncharge(&memcg->kmem, nr_pages);

	css_put_many(&memcg->css, nr_pages);
}

/*
 * Helper for accessing a memcg's index. It will be used as an index in the
 * child cache array in kmem_cache, and also to derive its name. This function
 * will return -1 when this is not a kmem-limited memcg.
 *
 * A NULL @memcg also yields -1.
 */
int memcg_cache_id(struct mem_cgroup *memcg)
{
	return memcg ? memcg->kmemcg_id : -1;
}

/*
 * Allocate a cache id from memcg_cache_ida.  If the id does not fit into
 * the current array size (memcg_nr_cache_ids), the arrays are grown under
 * memcg_cache_ids_sem.
 *
 * Returns the id, or a negative error.  If growing fails, the id is
 * released again.
 */
static int memcg_alloc_cache_id(void)
{
	int id, size;
	int err;

	id = ida_simple_get(&memcg_cache_ida,
			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
	if (id < 0)
		return id;

	if (id < memcg_nr_cache_ids)
		return id;

	/*
	 * There's no space for the new id in memcg_caches arrays,
	 * so we have to grow them.
	 */
	down_write(&memcg_cache_ids_sem);

	/* Double the required size, clamped to [MIN_SIZE, MAX_SIZE]. */
	size = 2 * (id + 1);
	if (size < MEMCG_CACHES_MIN_SIZE)
		size = MEMCG_CACHES_MIN_SIZE;
	else if (size > MEMCG_CACHES_MAX_SIZE)
		size = MEMCG_CACHES_MAX_SIZE;

	err = memcg_update_all_caches(size);
	if (!err)
		err = memcg_update_all_list_lrus(size);
	/* The new size is published only if both updates succeeded. */
	if (!err)
		memcg_nr_cache_ids = size;

	up_write(&memcg_cache_ids_sem);

	if (err) {
		ida_simple_remove(&memcg_cache_ida, id);
		return err;
	}
	return id;
}

/* Return @id to memcg_cache_ida. */
static void memcg_free_cache_id(int id)
{
	ida_simple_remove(&memcg_cache_ida, id);
}

/* Work item describing one deferred per-memcg kmem_cache creation. */
struct memcg_kmem_cache_create_work {
	struct mem_cgroup *memcg;	/* memcg the cache is created for */
	struct kmem_cache *cachep;	/* cache to create the copy of */
	struct work_struct work;
};

/*
 * Work function: create the per-memcg cache, then drop the css reference
 * taken in __memcg_schedule_kmem_cache_create() and free the work item.
 */
static void memcg_kmem_cache_create_func(struct work_struct *w)
{
	struct memcg_kmem_cache_create_work *cw =
		container_of(w, struct memcg_kmem_cache_create_work, work);
	struct mem_cgroup *memcg = cw->memcg;
	struct kmem_cache *cachep = cw->cachep;

	memcg_create_kmem_cache(memcg, cachep);

	css_put(&memcg->css);
	kfree(cw);
}

/*
 * Enqueue the creation of a per-memcg kmem_cache.
2583 */ 2584 static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2585 struct kmem_cache *cachep) 2586 { 2587 struct memcg_kmem_cache_create_work *cw; 2588 2589 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2590 if (!cw) 2591 return; 2592 2593 css_get(&memcg->css); 2594 2595 cw->memcg = memcg; 2596 cw->cachep = cachep; 2597 INIT_WORK(&cw->work, memcg_kmem_cache_create_func); 2598 2599 schedule_work(&cw->work); 2600 } 2601 2602 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2603 struct kmem_cache *cachep) 2604 { 2605 /* 2606 * We need to stop accounting when we kmalloc, because if the 2607 * corresponding kmalloc cache is not yet created, the first allocation 2608 * in __memcg_schedule_kmem_cache_create will recurse. 2609 * 2610 * However, it is better to enclose the whole function. Depending on 2611 * the debugging options enabled, INIT_WORK(), for instance, can 2612 * trigger an allocation. This too, will make us recurse. Because at 2613 * this point we can't allow ourselves back into memcg_kmem_get_cache, 2614 * the safest choice is to do it like this, wrapping the whole function. 2615 */ 2616 current->memcg_kmem_skip_account = 1; 2617 __memcg_schedule_kmem_cache_create(memcg, cachep); 2618 current->memcg_kmem_skip_account = 0; 2619 } 2620 2621 /* 2622 * Return the kmem_cache we're supposed to use for a slab allocation. 2623 * We try to use the current memcg's version of the cache. 2624 * 2625 * If the cache does not exist yet, if we are the first user of it, 2626 * we either create it immediately, if possible, or create it asynchronously 2627 * in a workqueue. 2628 * In the latter case, we will let the current allocation go through with 2629 * the original cache. 2630 * 2631 * Can't be called in interrupt context or from kernel threads. 2632 * This function needs to be called with rcu_read_lock() held. 
2633 */ 2634 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) 2635 { 2636 struct mem_cgroup *memcg; 2637 struct kmem_cache *memcg_cachep; 2638 int kmemcg_id; 2639 2640 VM_BUG_ON(!is_root_cache(cachep)); 2641 2642 if (current->memcg_kmem_skip_account) 2643 return cachep; 2644 2645 memcg = get_mem_cgroup_from_mm(current->mm); 2646 kmemcg_id = READ_ONCE(memcg->kmemcg_id); 2647 if (kmemcg_id < 0) 2648 goto out; 2649 2650 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); 2651 if (likely(memcg_cachep)) 2652 return memcg_cachep; 2653 2654 /* 2655 * If we are in a safe context (can wait, and not in interrupt 2656 * context), we could be be predictable and return right away. 2657 * This would guarantee that the allocation being performed 2658 * already belongs in the new cache. 2659 * 2660 * However, there are some clashes that can arrive from locking. 2661 * For instance, because we acquire the slab_mutex while doing 2662 * memcg_create_kmem_cache, this means no further allocation 2663 * could happen with the slab_mutex held. So it's better to 2664 * defer everything. 2665 */ 2666 memcg_schedule_kmem_cache_create(memcg, cachep); 2667 out: 2668 css_put(&memcg->css); 2669 return cachep; 2670 } 2671 2672 void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2673 { 2674 if (!is_root_cache(cachep)) 2675 css_put(&cachep->memcg_params.memcg->css); 2676 } 2677 2678 /* 2679 * We need to verify if the allocation against current->mm->owner's memcg is 2680 * possible for the given order. But the page is not allocated yet, so we'll 2681 * need a further commit step to do the final arrangements. 2682 * 2683 * It is possible for the task to switch cgroups in this mean time, so at 2684 * commit time, we can't rely on task conversion any longer. We'll then use 2685 * the handle argument to return to the caller which cgroup we should commit 2686 * against. 
We could also return the memcg directly and avoid the pointer 2687 * passing, but a boolean return value gives better semantics considering 2688 * the compiled-out case as well. 2689 * 2690 * Returning true means the allocation is possible. 2691 */ 2692 bool 2693 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 2694 { 2695 struct mem_cgroup *memcg; 2696 int ret; 2697 2698 *_memcg = NULL; 2699 2700 memcg = get_mem_cgroup_from_mm(current->mm); 2701 2702 if (!memcg_kmem_is_active(memcg)) { 2703 css_put(&memcg->css); 2704 return true; 2705 } 2706 2707 ret = memcg_charge_kmem(memcg, gfp, 1 << order); 2708 if (!ret) 2709 *_memcg = memcg; 2710 2711 css_put(&memcg->css); 2712 return (ret == 0); 2713 } 2714 2715 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2716 int order) 2717 { 2718 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2719 2720 /* The page allocation failed. Revert */ 2721 if (!page) { 2722 memcg_uncharge_kmem(memcg, 1 << order); 2723 return; 2724 } 2725 page->mem_cgroup = memcg; 2726 } 2727 2728 void __memcg_kmem_uncharge_pages(struct page *page, int order) 2729 { 2730 struct mem_cgroup *memcg = page->mem_cgroup; 2731 2732 if (!memcg) 2733 return; 2734 2735 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2736 2737 memcg_uncharge_kmem(memcg, 1 << order); 2738 page->mem_cgroup = NULL; 2739 } 2740 2741 struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) 2742 { 2743 struct mem_cgroup *memcg = NULL; 2744 struct kmem_cache *cachep; 2745 struct page *page; 2746 2747 page = virt_to_head_page(ptr); 2748 if (PageSlab(page)) { 2749 cachep = page->slab_cache; 2750 if (!is_root_cache(cachep)) 2751 memcg = cachep->memcg_params.memcg; 2752 } else 2753 /* page allocated by alloc_kmem_pages */ 2754 memcg = page->mem_cgroup; 2755 2756 return memcg; 2757 } 2758 #endif /* CONFIG_MEMCG_KMEM */ 2759 2760 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2761 2762 /* 2763 * Because tail pages are not marked as "used", set it. 
We're under 2764 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2765 * charge/uncharge will be never happen and move_account() is done under 2766 * compound_lock(), so we don't have to take care of races. 2767 */ 2768 void mem_cgroup_split_huge_fixup(struct page *head) 2769 { 2770 int i; 2771 2772 if (mem_cgroup_disabled()) 2773 return; 2774 2775 for (i = 1; i < HPAGE_PMD_NR; i++) 2776 head[i].mem_cgroup = head->mem_cgroup; 2777 2778 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 2779 HPAGE_PMD_NR); 2780 } 2781 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2782 2783 #ifdef CONFIG_MEMCG_SWAP 2784 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 2785 bool charge) 2786 { 2787 int val = (charge) ? 1 : -1; 2788 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 2789 } 2790 2791 /** 2792 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2793 * @entry: swap entry to be moved 2794 * @from: mem_cgroup which the entry is moved from 2795 * @to: mem_cgroup which the entry is moved to 2796 * 2797 * It succeeds only when the swap_cgroup's record for this entry is the same 2798 * as the mem_cgroup's id of @from. 2799 * 2800 * Returns 0 on success, -EINVAL on failure. 2801 * 2802 * The caller must have charged to @to, IOW, called page_counter_charge() about 2803 * both res and memsw, and called css_get(). 
*/
static int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	unsigned short old_id, new_id;

	old_id = mem_cgroup_id(from);
	new_id = mem_cgroup_id(to);

	/* The record is switched only if it still names @from. */
	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
		mem_cgroup_swap_statistics(from, false);
		mem_cgroup_swap_statistics(to, true);
		return 0;
	}
	return -EINVAL;
}
#else
/* Without CONFIG_MEMCG_SWAP there is no record to move: always -EINVAL. */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	return -EINVAL;
}
#endif

/* Held while the memory, memsw and kmem limits are checked and changed. */
static DEFINE_MUTEX(memcg_limit_mutex);

/*
 * Set the memory limit of @memcg to @limit (in pages).
 *
 * Returns 0 on success, -EINVAL if @limit is above the memsw limit,
 * -EINTR if a signal is pending, or the error of the last
 * page_counter_limit() attempt once the retries are used up.  After each
 * failed attempt, reclaim is run against @memcg before trying again.
 */
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				   unsigned long limit)
{
	unsigned long curusage;
	unsigned long oldusage;
	bool enlarge = false;
	int retry_count;
	int ret;

	/*
	 * For keeping hierarchical_reclaim simple, how long we should retry
	 * depends on callers. We set our retry-count to be function
	 * of # of children which we should visit in this loop.
	 */
	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
		      mem_cgroup_count_children(memcg);

	oldusage = page_counter_read(&memcg->memory);

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_limit_mutex);
		/* The memory limit may not exceed the memsw limit. */
		if (limit > memcg->memsw.limit) {
			mutex_unlock(&memcg_limit_mutex);
			ret = -EINVAL;
			break;
		}
		if (limit > memcg->memory.limit)
			enlarge = true;
		ret = page_counter_limit(&memcg->memory, limit);
		mutex_unlock(&memcg_limit_mutex);

		if (!ret)
			break;

		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);

		curusage = page_counter_read(&memcg->memory);
		/* Usage is reduced ? */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	} while (retry_count);

	if (!ret && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}

/*
 * Set the memory+swap limit of @memcg to @limit (in pages).
 *
 * Same scheme as mem_cgroup_resize_limit(), with the check reversed:
 * -EINVAL is returned if @limit is below the memory limit.  The reclaim
 * between attempts is run with may_swap == false.
 */
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
					 unsigned long limit)
{
	unsigned long curusage;
	unsigned long oldusage;
	bool enlarge = false;
	int retry_count;
	int ret;

	/* see mem_cgroup_resize_limit */
	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
		      mem_cgroup_count_children(memcg);

	oldusage = page_counter_read(&memcg->memsw);

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_limit_mutex);
		/* The memsw limit may not drop below the memory limit. */
		if (limit < memcg->memory.limit) {
			mutex_unlock(&memcg_limit_mutex);
			ret = -EINVAL;
			break;
		}
		if (limit > memcg->memsw.limit)
			enlarge = true;
		ret = page_counter_limit(&memcg->memsw, limit);
		mutex_unlock(&memcg_limit_mutex);

		if (!ret)
			break;

		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);

		curusage = page_counter_read(&memcg->memsw);
		/* Usage is reduced ? */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	} while (retry_count);

	if (!ret && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}

/*
 * Soft limit reclaim for @zone.  Only order-0 requests are handled;
 * for order > 0 this returns 0 without doing anything.
 *
 * Returns the number of pages reclaimed and adds the number of pages
 * scanned to *@total_scanned.
 */
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_zone *mctz;
	unsigned long excess;
	unsigned long nr_scanned;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
	/*
	 * This loop can run a while, especially if mem_cgroup's continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		nr_scanned = 0;
		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
						    gfp_mask, &nr_scanned);
		nr_reclaimed += reclaimed;
		*total_scanned += nr_scanned;
		spin_lock_irq(&mctz->lock);
		__mem_cgroup_remove_exceeded(mz, mctz);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
		    (next_mz == NULL ||
		    loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	/* A next candidate that was looked up but never used is released. */
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}

/*
 * Test whether @memcg has children, dead or alive.  Note that this
 * function doesn't care whether @memcg has use_hierarchy enabled and
 * returns %true if there are child csses according to the cgroup
 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
 */
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
	bool ret;

	/*
	 * The lock does not prevent addition or deletion of children, but
	 * it prevents a new child from being initialized based on this
	 * parent in css_online(), so it's enough to decide whether
	 * hierarchically inherited attributes can still be changed or not.
	 */
	lockdep_assert_held(&memcg_create_mutex);

	rcu_read_lock();
	ret = css_next_child(NULL, &memcg->css);
	rcu_read_unlock();
	return ret;
}

/*
 * Reclaims as many pages from the given memcg as possible and moves
 * the rest to the parent.
 *
 * Caller is responsible for holding css reference for memcg.
*/
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
	while (nr_retries && page_counter_read(&memcg->memory)) {
		int progress;

		if (signal_pending(current))
			return -EINTR;

		progress = try_to_free_mem_cgroup_pages(memcg, 1,
							GFP_KERNEL, true);
		/* Only rounds without progress use up a retry. */
		if (!progress) {
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(BLK_RW_ASYNC, HZ/10);
		}

	}

	return 0;
}

/*
 * Write handler that empties the memcg behind @of.  Rejected with -EINVAL
 * for the root memcg; otherwise returns the error from
 * mem_cgroup_force_empty(), or @nbytes on success.
 */
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

	if (mem_cgroup_is_root(memcg))
		return -EINVAL;
	return mem_cgroup_force_empty(memcg) ?: nbytes;
}

/* Read handler: report the memcg's use_hierarchy setting. */
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return mem_cgroup_from_css(css)->use_hierarchy;
}

/*
 * Write handler for use_hierarchy.
 *
 * Returns 0 on success or if the value is unchanged, -EBUSY if the memcg
 * has children, and -EINVAL if the parent has use_hierarchy set or @val
 * is neither 0 nor 1.
 */
static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, u64 val)
{
	int retval = 0;
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);

	mutex_lock(&memcg_create_mutex);

	if (memcg->use_hierarchy == val)
		goto out;

	/*
	 * If parent's use_hierarchy is set, we can't make any modifications
	 * in the child subtrees. If it is unset, then the change can
	 * occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_mem is NULL, we allow value to be
	 * set if there are no children.
	 */
	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
				(val == 1 || val == 0)) {
		if (!memcg_has_children(memcg))
			memcg->use_hierarchy = val;
		else
			retval = -EBUSY;
	} else
		retval = -EINVAL;

out:
	mutex_unlock(&memcg_create_mutex);

	return retval;
}

/*
 * Sum statistic @idx over @memcg and every memcg visited by
 * for_each_mem_cgroup_tree().  A negative sum is reported as 0.
 */
static unsigned long tree_stat(struct mem_cgroup *memcg,
			       enum mem_cgroup_stat_index idx)
{
	struct mem_cgroup *iter;
	long val = 0;

	/* Per-cpu values can be negative, use a signed accumulator */
	for_each_mem_cgroup_tree(iter, memcg)
		val += mem_cgroup_read_stat(iter, idx);

	if (val < 0) /* race ? */
		val = 0;
	return val;
}

/*
 * Usage of @memcg in bytes; with @swap set, memory+swap usage.
 *
 * For the root memcg the value is summed up from the cache, rss and
 * (with @swap) swap statistics of the tree; for any other memcg it is
 * read from the memory or memsw page counter.
 */
static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	u64 val;

	if (mem_cgroup_is_root(memcg)) {
		val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
		val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
		if (swap)
			val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
	} else {
		if (!swap)
			val = page_counter_read(&memcg->memory);
		else
			val = page_counter_read(&memcg->memsw);
	}
	/* Pages to bytes. */
	return val << PAGE_SHIFT;
}

/* Attribute selector, extracted from cft->private with MEMFILE_ATTR(). */
enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};

/*
 * Read handler for the counter files.  MEMFILE_TYPE() of cft->private
 * selects the counter (memory, memsw or kmem), MEMFILE_ATTR() the
 * attribute.  All values except the fail count are returned in bytes.
 */
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		/* memory and memsw usage go through mem_cgroup_usage(). */
		if (counter == &memcg->memory)
			return mem_cgroup_usage(memcg, false);
		if (counter == &memcg->memsw)
			return mem_cgroup_usage(memcg, true);
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->limit * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		/* The soft limit is per memcg, not per counter. */
		return (u64)memcg->soft_limit * PAGE_SIZE;
	default:
		BUG();
	}
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Turn on kmem accounting for @memcg with a kmem limit of @nr_pages.
 *
 * Returns 0 on success, -EBUSY if the cgroup already has tasks or (with
 * use_hierarchy) children, or the error from memcg_alloc_cache_id().
 * Must not be called for a memcg that has been activated before.
 */
static int memcg_activate_kmem(struct mem_cgroup *memcg,
			       unsigned long nr_pages)
{
	int err = 0;
	int memcg_id;

	BUG_ON(memcg->kmemcg_id >= 0);
	BUG_ON(memcg->kmem_acct_activated);
	BUG_ON(memcg->kmem_acct_active);

	/*
	 * For simplicity, we won't allow this to be disabled.  It also can't
	 * be changed if the cgroup has children already, or if tasks had
	 * already joined.
	 *
	 * If tasks join before we set the limit, a person looking at
	 * kmem.usage_in_bytes will have no way to determine when it took
	 * place, which makes the value quite meaningless.
	 *
	 * After it first became limited, changes in the value of the limit are
	 * of course permitted.
	 */
	mutex_lock(&memcg_create_mutex);
	if (cgroup_has_tasks(memcg->css.cgroup) ||
	    (memcg->use_hierarchy && memcg_has_children(memcg)))
		err = -EBUSY;
	mutex_unlock(&memcg_create_mutex);
	if (err)
		goto out;

	memcg_id = memcg_alloc_cache_id();
	if (memcg_id < 0) {
		err = memcg_id;
		goto out;
	}

	/*
	 * We couldn't have accounted to this cgroup, because it hasn't got
	 * activated yet, so this should succeed.
	 */
	err = page_counter_limit(&memcg->kmem, nr_pages);
	VM_BUG_ON(err);

	static_key_slow_inc(&memcg_kmem_enabled_key);
	/*
	 * A memory cgroup is considered kmem-active as soon as it gets
	 * kmemcg_id. Setting the id after enabling static branching will
	 * guarantee no one starts accounting before all call sites are
	 * patched.
	 */
	memcg->kmemcg_id = memcg_id;
	memcg->kmem_acct_activated = true;
	memcg->kmem_acct_active = true;
out:
	return err;
}

/*
 * Set the kmem limit of @memcg to @limit (in pages).  The first call
 * activates kmem accounting; later calls only change the limit.
 */
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
				   unsigned long limit)
{
	int ret;

	mutex_lock(&memcg_limit_mutex);
	if (!memcg_kmem_is_active(memcg))
		ret = memcg_activate_kmem(memcg, limit);
	else
		ret = page_counter_limit(&memcg->kmem, limit);
	mutex_unlock(&memcg_limit_mutex);
	return ret;
}

/*
 * If the parent of @memcg is kmem-active, activate @memcg as well, with
 * a limit of PAGE_COUNTER_MAX.  Returns 0 if there is no parent or
 * nothing had to be done, otherwise the result of memcg_activate_kmem().
 */
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
	int ret = 0;
	struct mem_cgroup *parent = parent_mem_cgroup(memcg);

	if (!parent)
		return 0;

	mutex_lock(&memcg_limit_mutex);
	/*
	 * If the parent cgroup is not kmem-active now, it cannot be activated
	 * after this point, because it has at least one child already.
	 */
	if (memcg_kmem_is_active(parent))
		ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
	mutex_unlock(&memcg_limit_mutex);
	return ret;
}
#else
/* Without CONFIG_MEMCG_KMEM a kmem limit cannot be set: always -EINVAL. */
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
				   unsigned long limit)
{
	return -EINVAL;
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * The user of this function is...
 * RES_LIMIT.
*/
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	/* Parse the written value into a page count. */
	buf = strstrip(buf);
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _MEM:
			ret = mem_cgroup_resize_limit(memcg, nr_pages);
			break;
		case _MEMSWAP:
			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
			break;
		case _KMEM:
			ret = memcg_update_kmem_limit(memcg, nr_pages);
			break;
		}
		break;
	case RES_SOFT_LIMIT:
		/* Stored as is; unlike RES_LIMIT, root is not rejected. */
		memcg->soft_limit = nr_pages;
		ret = 0;
		break;
	}
	/* An error, or the number of bytes consumed on success. */
	return ret ?: nbytes;
}

/*
 * Write handler that resets a counter attribute: the watermark for
 * RES_MAX_USAGE, the fail count for RES_FAILCNT.  The written data is
 * ignored; @nbytes is always returned.
 */
static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct page_counter *counter;

	switch (MEMFILE_TYPE(of_cft(of)->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		BUG();
	}

	return nbytes;
}

/* Read handler: report the memcg's move_charge_at_immigrate setting. */
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
					struct cftype *cft)
{
	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
}

#ifdef CONFIG_MMU
/*
 * Write handler for move_charge_at_immigrate.  Values with bits outside
 * MOVE_MASK are rejected with -EINVAL.
 */
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val & ~MOVE_MASK)
		return -EINVAL;

	/*
	 * No kind of locking is needed in here, because ->can_attach() will
	 * check this value once in the beginning of the process, and then carry
	 * on with stale data. This means that changes to this value will only
	 * affect task migrations starting after the change.
	 */
	memcg->move_charge_at_immigrate = val;
	return 0;
}
#else
/* Without CONFIG_MMU the setting cannot be changed: always -ENOSYS. */
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif

#ifdef CONFIG_NUMA
/*
 * Show per-node LRU page counts.  For each of "total", "file", "anon" and
 * "unevictable", one line with the memcg's own counts is printed; then,
 * prefixed with "hierarchical_", one line each with the counts summed
 * over the memcg tree.
 */
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
	struct numa_stat {
		const char *name;
		unsigned int lru_mask;
	};

	static const struct numa_stat stats[] = {
		{ "total", LRU_ALL },
		{ "file", LRU_ALL_FILE },
		{ "anon", LRU_ALL_ANON },
		{ "unevictable", BIT(LRU_UNEVICTABLE) },
	};
	const struct numa_stat *stat;
	int nid;
	unsigned long nr;
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));

	/* Counts of this memcg alone. */
	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
		seq_printf(m, "%s=%lu", stat->name, nr);
		for_each_node_state(nid, N_MEMORY) {
			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
							  stat->lru_mask);
			seq_printf(m, " N%d=%lu", nid, nr);
		}
		seq_putc(m, '\n');
	}

	/* Counts summed over the tree under this memcg. */
	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
		struct mem_cgroup *iter;

		nr = 0;
		for_each_mem_cgroup_tree(iter, memcg)
			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
		for_each_node_state(nid, N_MEMORY) {
			nr = 0;
			for_each_mem_cgroup_tree(iter, memcg)
				nr += mem_cgroup_node_nr_lru_pages(
					iter, nid, stat->lru_mask);
			seq_printf(m, " N%d=%lu", nid, nr);
		}
		seq_putc(m, '\n');
	}

	return 0;
}
#endif /* CONFIG_NUMA */

static int memcg_stat_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long memory, memsw;
	struct mem_cgroup *mi;
	unsigned int i;

	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
		     MEM_CGROUP_STAT_NSTATS);
	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
		     MEM_CGROUP_EVENTS_NSTATS);
	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
			continue;
		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
	}

	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
			   mem_cgroup_read_events(memcg, i));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

	/* Hierarchical information */
	memory = memsw = PAGE_COUNTER_MAX;
	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
		memory = min(memory, mi->memory.limit);
		memsw = min(memsw, mi->memsw.limit);
	}
	seq_printf(m, "hierarchical_memory_limit %llu\n",
		   (u64)memory * PAGE_SIZE);
	if (do_swap_account)
		seq_printf(m, "hierarchical_memsw_limit %llu\n",
			   (u64)memsw * PAGE_SIZE);

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		long long val = 0;

		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
			continue;
		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
	}

	for (i =
0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 3507 unsigned long long val = 0; 3508 3509 for_each_mem_cgroup_tree(mi, memcg) 3510 val += mem_cgroup_read_events(mi, i); 3511 seq_printf(m, "total_%s %llu\n", 3512 mem_cgroup_events_names[i], val); 3513 } 3514 3515 for (i = 0; i < NR_LRU_LISTS; i++) { 3516 unsigned long long val = 0; 3517 3518 for_each_mem_cgroup_tree(mi, memcg) 3519 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 3520 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 3521 } 3522 3523 #ifdef CONFIG_DEBUG_VM 3524 { 3525 int nid, zid; 3526 struct mem_cgroup_per_zone *mz; 3527 struct zone_reclaim_stat *rstat; 3528 unsigned long recent_rotated[2] = {0, 0}; 3529 unsigned long recent_scanned[2] = {0, 0}; 3530 3531 for_each_online_node(nid) 3532 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3533 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 3534 rstat = &mz->lruvec.reclaim_stat; 3535 3536 recent_rotated[0] += rstat->recent_rotated[0]; 3537 recent_rotated[1] += rstat->recent_rotated[1]; 3538 recent_scanned[0] += rstat->recent_scanned[0]; 3539 recent_scanned[1] += rstat->recent_scanned[1]; 3540 } 3541 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 3542 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 3543 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 3544 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 3545 } 3546 #endif 3547 3548 return 0; 3549 } 3550 3551 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 3552 struct cftype *cft) 3553 { 3554 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3555 3556 return mem_cgroup_swappiness(memcg); 3557 } 3558 3559 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 3560 struct cftype *cft, u64 val) 3561 { 3562 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3563 3564 if (val > 100) 3565 return -EINVAL; 3566 3567 if (css->parent) 3568 memcg->swappiness = val; 3569 else 3570 vm_swappiness = val; 3571 
3572 return 0; 3573 } 3574 3575 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3576 { 3577 struct mem_cgroup_threshold_ary *t; 3578 unsigned long usage; 3579 int i; 3580 3581 rcu_read_lock(); 3582 if (!swap) 3583 t = rcu_dereference(memcg->thresholds.primary); 3584 else 3585 t = rcu_dereference(memcg->memsw_thresholds.primary); 3586 3587 if (!t) 3588 goto unlock; 3589 3590 usage = mem_cgroup_usage(memcg, swap); 3591 3592 /* 3593 * current_threshold points to threshold just below or equal to usage. 3594 * If it's not true, a threshold was crossed after last 3595 * call of __mem_cgroup_threshold(). 3596 */ 3597 i = t->current_threshold; 3598 3599 /* 3600 * Iterate backward over array of thresholds starting from 3601 * current_threshold and check if a threshold is crossed. 3602 * If none of thresholds below usage is crossed, we read 3603 * only one element of the array here. 3604 */ 3605 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3606 eventfd_signal(t->entries[i].eventfd, 1); 3607 3608 /* i = current_threshold + 1 */ 3609 i++; 3610 3611 /* 3612 * Iterate forward over array of thresholds starting from 3613 * current_threshold+1 and check if a threshold is crossed. 3614 * If none of thresholds above usage is crossed, we read 3615 * only one element of the array here. 
3616 */ 3617 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3618 eventfd_signal(t->entries[i].eventfd, 1); 3619 3620 /* Update current_threshold */ 3621 t->current_threshold = i - 1; 3622 unlock: 3623 rcu_read_unlock(); 3624 } 3625 3626 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3627 { 3628 while (memcg) { 3629 __mem_cgroup_threshold(memcg, false); 3630 if (do_swap_account) 3631 __mem_cgroup_threshold(memcg, true); 3632 3633 memcg = parent_mem_cgroup(memcg); 3634 } 3635 } 3636 3637 static int compare_thresholds(const void *a, const void *b) 3638 { 3639 const struct mem_cgroup_threshold *_a = a; 3640 const struct mem_cgroup_threshold *_b = b; 3641 3642 if (_a->threshold > _b->threshold) 3643 return 1; 3644 3645 if (_a->threshold < _b->threshold) 3646 return -1; 3647 3648 return 0; 3649 } 3650 3651 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 3652 { 3653 struct mem_cgroup_eventfd_list *ev; 3654 3655 spin_lock(&memcg_oom_lock); 3656 3657 list_for_each_entry(ev, &memcg->oom_notify, list) 3658 eventfd_signal(ev->eventfd, 1); 3659 3660 spin_unlock(&memcg_oom_lock); 3661 return 0; 3662 } 3663 3664 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 3665 { 3666 struct mem_cgroup *iter; 3667 3668 for_each_mem_cgroup_tree(iter, memcg) 3669 mem_cgroup_oom_notify_cb(iter); 3670 } 3671 3672 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3673 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 3674 { 3675 struct mem_cgroup_thresholds *thresholds; 3676 struct mem_cgroup_threshold_ary *new; 3677 unsigned long threshold; 3678 unsigned long usage; 3679 int i, size, ret; 3680 3681 ret = page_counter_memparse(args, "-1", &threshold); 3682 if (ret) 3683 return ret; 3684 3685 mutex_lock(&memcg->thresholds_lock); 3686 3687 if (type == _MEM) { 3688 thresholds = &memcg->thresholds; 3689 usage = mem_cgroup_usage(memcg, false); 3690 } else if (type == _MEMSWAP) { 3691 thresholds = 
&memcg->memsw_thresholds; 3692 usage = mem_cgroup_usage(memcg, true); 3693 } else 3694 BUG(); 3695 3696 /* Check if a threshold crossed before adding a new one */ 3697 if (thresholds->primary) 3698 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3699 3700 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3701 3702 /* Allocate memory for new array of thresholds */ 3703 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3704 GFP_KERNEL); 3705 if (!new) { 3706 ret = -ENOMEM; 3707 goto unlock; 3708 } 3709 new->size = size; 3710 3711 /* Copy thresholds (if any) to new array */ 3712 if (thresholds->primary) { 3713 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3714 sizeof(struct mem_cgroup_threshold)); 3715 } 3716 3717 /* Add new threshold */ 3718 new->entries[size - 1].eventfd = eventfd; 3719 new->entries[size - 1].threshold = threshold; 3720 3721 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3722 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3723 compare_thresholds, NULL); 3724 3725 /* Find current threshold */ 3726 new->current_threshold = -1; 3727 for (i = 0; i < size; i++) { 3728 if (new->entries[i].threshold <= usage) { 3729 /* 3730 * new->current_threshold will not be used until 3731 * rcu_assign_pointer(), so it's safe to increment 3732 * it here. 
3733 */ 3734 ++new->current_threshold; 3735 } else 3736 break; 3737 } 3738 3739 /* Free old spare buffer and save old primary buffer as spare */ 3740 kfree(thresholds->spare); 3741 thresholds->spare = thresholds->primary; 3742 3743 rcu_assign_pointer(thresholds->primary, new); 3744 3745 /* To be sure that nobody uses thresholds */ 3746 synchronize_rcu(); 3747 3748 unlock: 3749 mutex_unlock(&memcg->thresholds_lock); 3750 3751 return ret; 3752 } 3753 3754 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3755 struct eventfd_ctx *eventfd, const char *args) 3756 { 3757 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 3758 } 3759 3760 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 3761 struct eventfd_ctx *eventfd, const char *args) 3762 { 3763 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 3764 } 3765 3766 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3767 struct eventfd_ctx *eventfd, enum res_type type) 3768 { 3769 struct mem_cgroup_thresholds *thresholds; 3770 struct mem_cgroup_threshold_ary *new; 3771 unsigned long usage; 3772 int i, j, size; 3773 3774 mutex_lock(&memcg->thresholds_lock); 3775 3776 if (type == _MEM) { 3777 thresholds = &memcg->thresholds; 3778 usage = mem_cgroup_usage(memcg, false); 3779 } else if (type == _MEMSWAP) { 3780 thresholds = &memcg->memsw_thresholds; 3781 usage = mem_cgroup_usage(memcg, true); 3782 } else 3783 BUG(); 3784 3785 if (!thresholds->primary) 3786 goto unlock; 3787 3788 /* Check if a threshold crossed before removing */ 3789 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3790 3791 /* Calculate new number of threshold */ 3792 size = 0; 3793 for (i = 0; i < thresholds->primary->size; i++) { 3794 if (thresholds->primary->entries[i].eventfd != eventfd) 3795 size++; 3796 } 3797 3798 new = thresholds->spare; 3799 3800 /* Set thresholds array to NULL if we don't have thresholds */ 3801 if (!size) { 3802 kfree(new); 
3803 new = NULL; 3804 goto swap_buffers; 3805 } 3806 3807 new->size = size; 3808 3809 /* Copy thresholds and find current threshold */ 3810 new->current_threshold = -1; 3811 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 3812 if (thresholds->primary->entries[i].eventfd == eventfd) 3813 continue; 3814 3815 new->entries[j] = thresholds->primary->entries[i]; 3816 if (new->entries[j].threshold <= usage) { 3817 /* 3818 * new->current_threshold will not be used 3819 * until rcu_assign_pointer(), so it's safe to increment 3820 * it here. 3821 */ 3822 ++new->current_threshold; 3823 } 3824 j++; 3825 } 3826 3827 swap_buffers: 3828 /* Swap primary and spare array */ 3829 thresholds->spare = thresholds->primary; 3830 /* If all events are unregistered, free the spare array */ 3831 if (!new) { 3832 kfree(thresholds->spare); 3833 thresholds->spare = NULL; 3834 } 3835 3836 rcu_assign_pointer(thresholds->primary, new); 3837 3838 /* To be sure that nobody uses thresholds */ 3839 synchronize_rcu(); 3840 unlock: 3841 mutex_unlock(&memcg->thresholds_lock); 3842 } 3843 3844 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3845 struct eventfd_ctx *eventfd) 3846 { 3847 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 3848 } 3849 3850 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3851 struct eventfd_ctx *eventfd) 3852 { 3853 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 3854 } 3855 3856 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 3857 struct eventfd_ctx *eventfd, const char *args) 3858 { 3859 struct mem_cgroup_eventfd_list *event; 3860 3861 event = kmalloc(sizeof(*event), GFP_KERNEL); 3862 if (!event) 3863 return -ENOMEM; 3864 3865 spin_lock(&memcg_oom_lock); 3866 3867 event->eventfd = eventfd; 3868 list_add(&event->list, &memcg->oom_notify); 3869 3870 /* already in OOM ? 
*/ 3871 if (memcg->under_oom) 3872 eventfd_signal(eventfd, 1); 3873 spin_unlock(&memcg_oom_lock); 3874 3875 return 0; 3876 } 3877 3878 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 3879 struct eventfd_ctx *eventfd) 3880 { 3881 struct mem_cgroup_eventfd_list *ev, *tmp; 3882 3883 spin_lock(&memcg_oom_lock); 3884 3885 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 3886 if (ev->eventfd == eventfd) { 3887 list_del(&ev->list); 3888 kfree(ev); 3889 } 3890 } 3891 3892 spin_unlock(&memcg_oom_lock); 3893 } 3894 3895 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 3896 { 3897 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3898 3899 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 3900 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 3901 return 0; 3902 } 3903 3904 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 3905 struct cftype *cft, u64 val) 3906 { 3907 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3908 3909 /* cannot set to root cgroup and only 0 and 1 are allowed */ 3910 if (!css->parent || !((val == 0) || (val == 1))) 3911 return -EINVAL; 3912 3913 memcg->oom_kill_disable = val; 3914 if (!val) 3915 memcg_oom_recover(memcg); 3916 3917 return 0; 3918 } 3919 3920 #ifdef CONFIG_MEMCG_KMEM 3921 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 3922 { 3923 int ret; 3924 3925 ret = memcg_propagate_kmem(memcg); 3926 if (ret) 3927 return ret; 3928 3929 return mem_cgroup_sockets_init(memcg, ss); 3930 } 3931 3932 static void memcg_deactivate_kmem(struct mem_cgroup *memcg) 3933 { 3934 struct cgroup_subsys_state *css; 3935 struct mem_cgroup *parent, *child; 3936 int kmemcg_id; 3937 3938 if (!memcg->kmem_acct_active) 3939 return; 3940 3941 /* 3942 * Clear the 'active' flag before clearing memcg_caches arrays entries. 
3943 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it 3944 * guarantees no cache will be created for this cgroup after we are 3945 * done (see memcg_create_kmem_cache()). 3946 */ 3947 memcg->kmem_acct_active = false; 3948 3949 memcg_deactivate_kmem_caches(memcg); 3950 3951 kmemcg_id = memcg->kmemcg_id; 3952 BUG_ON(kmemcg_id < 0); 3953 3954 parent = parent_mem_cgroup(memcg); 3955 if (!parent) 3956 parent = root_mem_cgroup; 3957 3958 /* 3959 * Change kmemcg_id of this cgroup and all its descendants to the 3960 * parent's id, and then move all entries from this cgroup's list_lrus 3961 * to ones of the parent. After we have finished, all list_lrus 3962 * corresponding to this cgroup are guaranteed to remain empty. The 3963 * ordering is imposed by list_lru_node->lock taken by 3964 * memcg_drain_all_list_lrus(). 3965 */ 3966 css_for_each_descendant_pre(css, &memcg->css) { 3967 child = mem_cgroup_from_css(css); 3968 BUG_ON(child->kmemcg_id != kmemcg_id); 3969 child->kmemcg_id = parent->kmemcg_id; 3970 if (!memcg->use_hierarchy) 3971 break; 3972 } 3973 memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); 3974 3975 memcg_free_cache_id(kmemcg_id); 3976 } 3977 3978 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 3979 { 3980 if (memcg->kmem_acct_activated) { 3981 memcg_destroy_kmem_caches(memcg); 3982 static_key_slow_dec(&memcg_kmem_enabled_key); 3983 WARN_ON(page_counter_read(&memcg->kmem)); 3984 } 3985 mem_cgroup_sockets_destroy(memcg); 3986 } 3987 #else 3988 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 3989 { 3990 return 0; 3991 } 3992 3993 static void memcg_deactivate_kmem(struct mem_cgroup *memcg) 3994 { 3995 } 3996 3997 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 3998 { 3999 } 4000 #endif 4001 4002 #ifdef CONFIG_CGROUP_WRITEBACK 4003 4004 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) 4005 { 4006 return &memcg->cgwb_list; 4007 } 4008 4009 static int 
memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4010 { 4011 return wb_domain_init(&memcg->cgwb_domain, gfp); 4012 } 4013 4014 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4015 { 4016 wb_domain_exit(&memcg->cgwb_domain); 4017 } 4018 4019 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4020 { 4021 wb_domain_size_changed(&memcg->cgwb_domain); 4022 } 4023 4024 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4025 { 4026 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4027 4028 if (!memcg->css.parent) 4029 return NULL; 4030 4031 return &memcg->cgwb_domain; 4032 } 4033 4034 /** 4035 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4036 * @wb: bdi_writeback in question 4037 * @pavail: out parameter for number of available pages 4038 * @pdirty: out parameter for number of dirty pages 4039 * @pwriteback: out parameter for number of pages under writeback 4040 * 4041 * Determine the numbers of available, dirty, and writeback pages in @wb's 4042 * memcg. Dirty and writeback are self-explanatory. Available is a bit 4043 * more involved. 4044 * 4045 * A memcg's headroom is "min(max, high) - used". The available memory is 4046 * calculated as the lowest headroom of itself and the ancestors plus the 4047 * number of pages already being used for file pages. Note that this 4048 * doesn't consider the actual amount of available memory in the system. 4049 * The caller should further cap *@pavail accordingly. 
4050 */ 4051 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, 4052 unsigned long *pdirty, unsigned long *pwriteback) 4053 { 4054 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4055 struct mem_cgroup *parent; 4056 unsigned long head_room = PAGE_COUNTER_MAX; 4057 unsigned long file_pages; 4058 4059 *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); 4060 4061 /* this should eventually include NR_UNSTABLE_NFS */ 4062 *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 4063 4064 file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | 4065 (1 << LRU_ACTIVE_FILE)); 4066 while ((parent = parent_mem_cgroup(memcg))) { 4067 unsigned long ceiling = min(memcg->memory.limit, memcg->high); 4068 unsigned long used = page_counter_read(&memcg->memory); 4069 4070 head_room = min(head_room, ceiling - min(ceiling, used)); 4071 memcg = parent; 4072 } 4073 4074 *pavail = file_pages + head_room; 4075 } 4076 4077 #else /* CONFIG_CGROUP_WRITEBACK */ 4078 4079 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4080 { 4081 return 0; 4082 } 4083 4084 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4085 { 4086 } 4087 4088 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4089 { 4090 } 4091 4092 #endif /* CONFIG_CGROUP_WRITEBACK */ 4093 4094 /* 4095 * DO NOT USE IN NEW FILES. 4096 * 4097 * "cgroup.event_control" implementation. 4098 * 4099 * This is way over-engineered. It tries to support fully configurable 4100 * events for each user. Such level of flexibility is completely 4101 * unnecessary especially in the light of the planned unified hierarchy. 4102 * 4103 * Please deprecate this and replace with something simpler if at all 4104 * possible. 4105 */ 4106 4107 /* 4108 * Unregister event and free resources. 4109 * 4110 * Gets called from workqueue. 
4111 */ 4112 static void memcg_event_remove(struct work_struct *work) 4113 { 4114 struct mem_cgroup_event *event = 4115 container_of(work, struct mem_cgroup_event, remove); 4116 struct mem_cgroup *memcg = event->memcg; 4117 4118 remove_wait_queue(event->wqh, &event->wait); 4119 4120 event->unregister_event(memcg, event->eventfd); 4121 4122 /* Notify userspace the event is going away. */ 4123 eventfd_signal(event->eventfd, 1); 4124 4125 eventfd_ctx_put(event->eventfd); 4126 kfree(event); 4127 css_put(&memcg->css); 4128 } 4129 4130 /* 4131 * Gets called on POLLHUP on eventfd when user closes it. 4132 * 4133 * Called with wqh->lock held and interrupts disabled. 4134 */ 4135 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 4136 int sync, void *key) 4137 { 4138 struct mem_cgroup_event *event = 4139 container_of(wait, struct mem_cgroup_event, wait); 4140 struct mem_cgroup *memcg = event->memcg; 4141 unsigned long flags = (unsigned long)key; 4142 4143 if (flags & POLLHUP) { 4144 /* 4145 * If the event has been detached at cgroup removal, we 4146 * can simply return knowing the other side will cleanup 4147 * for us. 4148 * 4149 * We can't race against event freeing since the other 4150 * side will require wqh->lock via remove_wait_queue(), 4151 * which we hold. 4152 */ 4153 spin_lock(&memcg->event_list_lock); 4154 if (!list_empty(&event->list)) { 4155 list_del_init(&event->list); 4156 /* 4157 * We are in atomic context, but cgroup_event_remove() 4158 * may sleep, so we have to call it in workqueue. 4159 */ 4160 schedule_work(&event->remove); 4161 } 4162 spin_unlock(&memcg->event_list_lock); 4163 } 4164 4165 return 0; 4166 } 4167 4168 static void memcg_event_ptable_queue_proc(struct file *file, 4169 wait_queue_head_t *wqh, poll_table *pt) 4170 { 4171 struct mem_cgroup_event *event = 4172 container_of(pt, struct mem_cgroup_event, pt); 4173 4174 event->wqh = wqh; 4175 add_wait_queue(wqh, &event->wait); 4176 } 4177 4178 /* 4179 * DO NOT USE IN NEW FILES. 
4180 * 4181 * Parse input and register new cgroup event handler. 4182 * 4183 * Input must be in format '<event_fd> <control_fd> <args>'. 4184 * Interpretation of args is defined by control file implementation. 4185 */ 4186 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4187 char *buf, size_t nbytes, loff_t off) 4188 { 4189 struct cgroup_subsys_state *css = of_css(of); 4190 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4191 struct mem_cgroup_event *event; 4192 struct cgroup_subsys_state *cfile_css; 4193 unsigned int efd, cfd; 4194 struct fd efile; 4195 struct fd cfile; 4196 const char *name; 4197 char *endp; 4198 int ret; 4199 4200 buf = strstrip(buf); 4201 4202 efd = simple_strtoul(buf, &endp, 10); 4203 if (*endp != ' ') 4204 return -EINVAL; 4205 buf = endp + 1; 4206 4207 cfd = simple_strtoul(buf, &endp, 10); 4208 if ((*endp != ' ') && (*endp != '\0')) 4209 return -EINVAL; 4210 buf = endp + 1; 4211 4212 event = kzalloc(sizeof(*event), GFP_KERNEL); 4213 if (!event) 4214 return -ENOMEM; 4215 4216 event->memcg = memcg; 4217 INIT_LIST_HEAD(&event->list); 4218 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4219 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4220 INIT_WORK(&event->remove, memcg_event_remove); 4221 4222 efile = fdget(efd); 4223 if (!efile.file) { 4224 ret = -EBADF; 4225 goto out_kfree; 4226 } 4227 4228 event->eventfd = eventfd_ctx_fileget(efile.file); 4229 if (IS_ERR(event->eventfd)) { 4230 ret = PTR_ERR(event->eventfd); 4231 goto out_put_efile; 4232 } 4233 4234 cfile = fdget(cfd); 4235 if (!cfile.file) { 4236 ret = -EBADF; 4237 goto out_put_eventfd; 4238 } 4239 4240 /* the process need read permission on control file */ 4241 /* AV: shouldn't we check that it's been opened for read instead? */ 4242 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4243 if (ret < 0) 4244 goto out_put_cfile; 4245 4246 /* 4247 * Determine the event callbacks and set them in @event. 
This used 4248 * to be done via struct cftype but cgroup core no longer knows 4249 * about these events. The following is crude but the whole thing 4250 * is for compatibility anyway. 4251 * 4252 * DO NOT ADD NEW FILES. 4253 */ 4254 name = cfile.file->f_path.dentry->d_name.name; 4255 4256 if (!strcmp(name, "memory.usage_in_bytes")) { 4257 event->register_event = mem_cgroup_usage_register_event; 4258 event->unregister_event = mem_cgroup_usage_unregister_event; 4259 } else if (!strcmp(name, "memory.oom_control")) { 4260 event->register_event = mem_cgroup_oom_register_event; 4261 event->unregister_event = mem_cgroup_oom_unregister_event; 4262 } else if (!strcmp(name, "memory.pressure_level")) { 4263 event->register_event = vmpressure_register_event; 4264 event->unregister_event = vmpressure_unregister_event; 4265 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4266 event->register_event = memsw_cgroup_usage_register_event; 4267 event->unregister_event = memsw_cgroup_usage_unregister_event; 4268 } else { 4269 ret = -EINVAL; 4270 goto out_put_cfile; 4271 } 4272 4273 /* 4274 * Verify @cfile should belong to @css. Also, remaining events are 4275 * automatically removed on cgroup destruction but the removal is 4276 * asynchronous, so take an extra ref on @css. 
4277 */ 4278 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 4279 &memory_cgrp_subsys); 4280 ret = -EINVAL; 4281 if (IS_ERR(cfile_css)) 4282 goto out_put_cfile; 4283 if (cfile_css != css) { 4284 css_put(cfile_css); 4285 goto out_put_cfile; 4286 } 4287 4288 ret = event->register_event(memcg, event->eventfd, buf); 4289 if (ret) 4290 goto out_put_css; 4291 4292 efile.file->f_op->poll(efile.file, &event->pt); 4293 4294 spin_lock(&memcg->event_list_lock); 4295 list_add(&event->list, &memcg->event_list); 4296 spin_unlock(&memcg->event_list_lock); 4297 4298 fdput(cfile); 4299 fdput(efile); 4300 4301 return nbytes; 4302 4303 out_put_css: 4304 css_put(css); 4305 out_put_cfile: 4306 fdput(cfile); 4307 out_put_eventfd: 4308 eventfd_ctx_put(event->eventfd); 4309 out_put_efile: 4310 fdput(efile); 4311 out_kfree: 4312 kfree(event); 4313 4314 return ret; 4315 } 4316 4317 static struct cftype mem_cgroup_legacy_files[] = { 4318 { 4319 .name = "usage_in_bytes", 4320 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4321 .read_u64 = mem_cgroup_read_u64, 4322 }, 4323 { 4324 .name = "max_usage_in_bytes", 4325 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4326 .write = mem_cgroup_reset, 4327 .read_u64 = mem_cgroup_read_u64, 4328 }, 4329 { 4330 .name = "limit_in_bytes", 4331 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4332 .write = mem_cgroup_write, 4333 .read_u64 = mem_cgroup_read_u64, 4334 }, 4335 { 4336 .name = "soft_limit_in_bytes", 4337 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4338 .write = mem_cgroup_write, 4339 .read_u64 = mem_cgroup_read_u64, 4340 }, 4341 { 4342 .name = "failcnt", 4343 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4344 .write = mem_cgroup_reset, 4345 .read_u64 = mem_cgroup_read_u64, 4346 }, 4347 { 4348 .name = "stat", 4349 .seq_show = memcg_stat_show, 4350 }, 4351 { 4352 .name = "force_empty", 4353 .write = mem_cgroup_force_empty_write, 4354 }, 4355 { 4356 .name = "use_hierarchy", 4357 .write_u64 = mem_cgroup_hierarchy_write, 
4358 .read_u64 = mem_cgroup_hierarchy_read, 4359 }, 4360 { 4361 .name = "cgroup.event_control", /* XXX: for compat */ 4362 .write = memcg_write_event_control, 4363 .flags = CFTYPE_NO_PREFIX, 4364 .mode = S_IWUGO, 4365 }, 4366 { 4367 .name = "swappiness", 4368 .read_u64 = mem_cgroup_swappiness_read, 4369 .write_u64 = mem_cgroup_swappiness_write, 4370 }, 4371 { 4372 .name = "move_charge_at_immigrate", 4373 .read_u64 = mem_cgroup_move_charge_read, 4374 .write_u64 = mem_cgroup_move_charge_write, 4375 }, 4376 { 4377 .name = "oom_control", 4378 .seq_show = mem_cgroup_oom_control_read, 4379 .write_u64 = mem_cgroup_oom_control_write, 4380 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4381 }, 4382 { 4383 .name = "pressure_level", 4384 }, 4385 #ifdef CONFIG_NUMA 4386 { 4387 .name = "numa_stat", 4388 .seq_show = memcg_numa_stat_show, 4389 }, 4390 #endif 4391 #ifdef CONFIG_MEMCG_KMEM 4392 { 4393 .name = "kmem.limit_in_bytes", 4394 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 4395 .write = mem_cgroup_write, 4396 .read_u64 = mem_cgroup_read_u64, 4397 }, 4398 { 4399 .name = "kmem.usage_in_bytes", 4400 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 4401 .read_u64 = mem_cgroup_read_u64, 4402 }, 4403 { 4404 .name = "kmem.failcnt", 4405 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 4406 .write = mem_cgroup_reset, 4407 .read_u64 = mem_cgroup_read_u64, 4408 }, 4409 { 4410 .name = "kmem.max_usage_in_bytes", 4411 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4412 .write = mem_cgroup_reset, 4413 .read_u64 = mem_cgroup_read_u64, 4414 }, 4415 #ifdef CONFIG_SLABINFO 4416 { 4417 .name = "kmem.slabinfo", 4418 .seq_start = slab_start, 4419 .seq_next = slab_next, 4420 .seq_stop = slab_stop, 4421 .seq_show = memcg_slab_show, 4422 }, 4423 #endif 4424 #endif 4425 { }, /* terminate */ 4426 }; 4427 4428 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4429 { 4430 struct mem_cgroup_per_node *pn; 4431 struct mem_cgroup_per_zone *mz; 4432 int zone, tmp = node; 
4433 /* 4434 * This routine is called against possible nodes. 4435 * But it's BUG to call kmalloc() against offline node. 4436 * 4437 * TODO: this routine can waste much memory for nodes which will 4438 * never be onlined. It's better to use memory hotplug callback 4439 * function. 4440 */ 4441 if (!node_state(node, N_NORMAL_MEMORY)) 4442 tmp = -1; 4443 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4444 if (!pn) 4445 return 1; 4446 4447 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4448 mz = &pn->zoneinfo[zone]; 4449 lruvec_init(&mz->lruvec); 4450 mz->usage_in_excess = 0; 4451 mz->on_tree = false; 4452 mz->memcg = memcg; 4453 } 4454 memcg->nodeinfo[node] = pn; 4455 return 0; 4456 } 4457 4458 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4459 { 4460 kfree(memcg->nodeinfo[node]); 4461 } 4462 4463 static struct mem_cgroup *mem_cgroup_alloc(void) 4464 { 4465 struct mem_cgroup *memcg; 4466 size_t size; 4467 4468 size = sizeof(struct mem_cgroup); 4469 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 4470 4471 memcg = kzalloc(size, GFP_KERNEL); 4472 if (!memcg) 4473 return NULL; 4474 4475 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4476 if (!memcg->stat) 4477 goto out_free; 4478 4479 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 4480 goto out_free_stat; 4481 4482 spin_lock_init(&memcg->pcp_counter_lock); 4483 return memcg; 4484 4485 out_free_stat: 4486 free_percpu(memcg->stat); 4487 out_free: 4488 kfree(memcg); 4489 return NULL; 4490 } 4491 4492 /* 4493 * At destroying mem_cgroup, references from swap_cgroup can remain. 4494 * (scanning all at force_empty is too costly...) 4495 * 4496 * Instead of clearing all references at force_empty, we remember 4497 * the number of reference from swap_cgroup and free mem_cgroup when 4498 * it goes down to 0. 4499 * 4500 * Removal of cgroup itself succeeds regardless of refs from swap. 
*/

/*
 * Final teardown of @memcg: take it off the trees, free the per-node info
 * of every node, the per-cpu statistics and the writeback domain, and then
 * the structure itself.
 */
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_remove_from_trees(memcg);

	for_each_node(node)
		free_mem_cgroup_per_zone_info(memcg, node);

	free_percpu(memcg->stat);
	memcg_wb_domain_exit(memcg);
	kfree(memcg);
}

/*
 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
 * Returns NULL when the memory counter of @memcg has no parent counter.
 */
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg->memory.parent)
		return NULL;
	return mem_cgroup_from_counter(memcg->memory.parent, memory);
}
EXPORT_SYMBOL(parent_mem_cgroup);

/*
 * cgroup ->css_alloc callback: allocate a memcg and its per-node info and
 * initialise the fields that do not depend on the parent.
 *
 * When @parent_css is NULL the new memcg becomes root_mem_cgroup and its
 * page counters are set up here without a parent; for every other memcg the
 * counters are initialised in mem_cgroup_css_online().
 *
 * Returns the embedded css, or ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct mem_cgroup *memcg;
	long error = -ENOMEM;
	int node;

	memcg = mem_cgroup_alloc();
	if (!memcg)
		return ERR_PTR(error);

	for_each_node(node)
		if (alloc_mem_cgroup_per_zone_info(memcg, node))
			goto free_out;

	/* root ? */
	if (parent_css == NULL) {
		root_mem_cgroup = memcg;
		mem_cgroup_root_css = &memcg->css;
		page_counter_init(&memcg->memory, NULL);
		memcg->high = PAGE_COUNTER_MAX;
		memcg->soft_limit = PAGE_COUNTER_MAX;
		page_counter_init(&memcg->memsw, NULL);
		page_counter_init(&memcg->kmem, NULL);
	}

	memcg->last_scanned_node = MAX_NUMNODES;
	INIT_LIST_HEAD(&memcg->oom_notify);
	memcg->move_charge_at_immigrate = 0;
	mutex_init(&memcg->thresholds_lock);
	spin_lock_init(&memcg->move_lock);
	vmpressure_init(&memcg->vmpressure);
	INIT_LIST_HEAD(&memcg->event_list);
	spin_lock_init(&memcg->event_list_lock);
#ifdef CONFIG_MEMCG_KMEM
	memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
	return &memcg->css;

free_out:
	__mem_cgroup_free(memcg);
	return ERR_PTR(error);
}

/*
 * cgroup ->css_online callback: inherit settings from the parent and set up
 * the page counters, either chained to the parent's counters (use_hierarchy)
 * or stand-alone.
 *
 * Returns 0 on success, -ENOSPC if the css id exceeds MEM_CGROUP_ID_MAX, or
 * the error returned by memcg_init_kmem().  The root memcg (no parent)
 * returns 0 right after the id check.
 */
static int
mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
	int ret;

	if (css->id > MEM_CGROUP_ID_MAX)
		return -ENOSPC;

	if (!parent)
		return 0;

	mutex_lock(&memcg_create_mutex);

	memcg->use_hierarchy = parent->use_hierarchy;
	memcg->oom_kill_disable = parent->oom_kill_disable;
	memcg->swappiness = mem_cgroup_swappiness(parent);

	if (parent->use_hierarchy) {
		page_counter_init(&memcg->memory, &parent->memory);
		memcg->high = PAGE_COUNTER_MAX;
		memcg->soft_limit = PAGE_COUNTER_MAX;
		page_counter_init(&memcg->memsw, &parent->memsw);
		page_counter_init(&memcg->kmem, &parent->kmem);

		/*
		 * No need to take a reference to the parent because cgroup
		 * core guarantees its existence.
		 */
	} else {
		page_counter_init(&memcg->memory, NULL);
		memcg->high = PAGE_COUNTER_MAX;
		memcg->soft_limit = PAGE_COUNTER_MAX;
		page_counter_init(&memcg->memsw, NULL);
		page_counter_init(&memcg->kmem, NULL);
		/*
		 * Deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense so let cgroup subsystem know about this
		 * unfortunate state in our controller.
		 */
		if (parent != root_mem_cgroup)
			memory_cgrp_subsys.broken_hierarchy = true;
	}
	mutex_unlock(&memcg_create_mutex);

	ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
	if (ret)
		return ret;

	/*
	 * Make sure the memcg is initialized: mem_cgroup_iter()
	 * orders reading memcg->initialized against its callers
	 * reading the memcg members.
	 */
	smp_store_release(&memcg->initialized, 1);

	return 0;
}

/*
 * cgroup ->css_offline callback: drop all registered events, then shut down
 * vmpressure, kmem accounting and writeback state for this memcg.
 */
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup_event *event, *tmp;

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace.
	 */
	spin_lock(&memcg->event_list_lock);
	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
		list_del_init(&event->list);
		/* the actual removal runs later from the event's work item */
		schedule_work(&event->remove);
	}
	spin_unlock(&memcg->event_list_lock);

	vmpressure_cleanup(&memcg->vmpressure);

	memcg_deactivate_kmem(memcg);

	wb_memcg_offline(memcg);
}

/* cgroup ->css_free callback: destroy kmem state, then free the memcg. */
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	memcg_destroy_kmem(memcg);
	__mem_cgroup_free(memcg);
}

/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/* return values of the limit updates are deliberately not checked */
	mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
	mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
	memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
	memcg->low = 0;
	memcg->high = PAGE_COUNTER_MAX;
	memcg->soft_limit = PAGE_COUNTER_MAX;
	memcg_wb_domain_size_changed(memcg);
}

#ifdef CONFIG_MMU
/* Handlers for move charge at task migration.
*/
/*
 * mem_cgroup_do_precharge - charge @count pages to mc.to ahead of a move
 * @count: number of pages to precharge
 *
 * First tries one bulk charge of @count pages with __GFP_WAIT cleared.  If
 * that fails, falls back to charging one page at a time with reclaim
 * allowed but with __GFP_NORETRY set, so that a charge which cannot be
 * satisfied fails instead of being retried over and over.  Precharging is
 * best effort, so giving up early is the desired behaviour.
 *
 * Every successfully charged page is added to mc.precharge.
 *
 * Returns 0 if all @count pages were charged, otherwise the error returned
 * by try_charge().  Pages charged before a failure stay in mc.precharge.
 */
static int mem_cgroup_do_precharge(unsigned long count)
{
	int ret;

	/* Try a single bulk charge without reclaim first */
	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
	if (!ret) {
		mc.precharge += count;
		return ret;
	}
	if (ret == -EINTR) {
		/* the charge was bypassed to root: undo it there */
		cancel_charge(root_mem_cgroup, count);
		return ret;
	}

	/* Try charges one by one with reclaim, but do not retry */
	while (count--) {
		/*
		 * GFP_KERNEL does not contain __GFP_NORETRY, so masking it
		 * out would be a no-op; the flag has to be OR-ed in.
		 */
		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
		/*
		 * In case of failure, any residual charges against
		 * mc.to will be dropped by mem_cgroup_clear_mc()
		 * later on.  However, cancel any charges that are
		 * bypassed to root right away or they'll be lost.
		 */
		if (ret == -EINTR)
			cancel_charge(root_mem_cgroup, 1);
		if (ret)
			return ret;
		mc.precharge++;
		cond_resched();
	}
	return 0;
}

/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap ent will be stored(can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. if @target is not NULL, the page is stored in target->page
 *     with extra refcnt got(Callers should handle it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. if @target is not NULL, the entry is stored
 *     in target->ent.
 *
 * Called with pte lock held.
*/
/* What get_mctgt_type() found: a page (MC_TARGET_PAGE) or a swap entry. */
union mc_target {
	struct page	*page;
	swp_entry_t	ent;
};

enum mc_target_type {
	MC_TARGET_NONE = 0,
	MC_TARGET_PAGE,
	MC_TARGET_SWAP,
};

/*
 * Look up the page behind a present pte.  Returns the page with an extra
 * reference, or NULL if there is no normal mapped page, if moving this kind
 * of page (anon vs. file) is not enabled in mc.flags, or if the reference
 * could not be taken.
 */
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
						unsigned long addr, pte_t ptent)
{
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page || !page_mapped(page))
		return NULL;
	if (PageAnon(page)) {
		if (!(mc.flags & MOVE_ANON))
			return NULL;
	} else {
		if (!(mc.flags & MOVE_FILE))
			return NULL;
	}
	if (!get_page_unless_zero(page))
		return NULL;

	return page;
}

#ifdef CONFIG_SWAP
/*
 * Handle a swap pte.  Returns the page found for the entry in its swap
 * address space (may be NULL) and, when swap accounting is active, stores
 * the swap entry value in @entry.  Returns NULL without touching @entry if
 * MOVE_ANON is not set or the pte is a non-swap entry.
 */
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct page *page = NULL;
	swp_entry_t ent = pte_to_swp_entry(ptent);

	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
		return NULL;
	/*
	 * Because lookup_swap_cache() updates some statistics counter,
	 * we call find_get_page() with swapper_space directly.
	 */
	page = find_get_page(swap_address_space(ent), ent.val);
	if (do_swap_account)
		entry->val = ent.val;

	return page;
}
#else
/* Without CONFIG_SWAP there is never a swap target. */
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	return NULL;
}
#endif

/*
 * Handle an empty pte in a file-backed vma: look up the page at the
 * corresponding file offset in the mapping.  Returns the page found (may be
 * NULL); for shmem mappings whose slot holds a swap entry, the entry is
 * reported through @entry when swap accounting is active and the swap
 * address space is searched instead.
 */
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct page *page = NULL;
	struct address_space *mapping;
	pgoff_t pgoff;

	if (!vma->vm_file) /* anonymous vma */
		return NULL;
	if (!(mc.flags & MOVE_FILE))
		return NULL;

	mapping = vma->vm_file->f_mapping;
	pgoff = linear_page_index(vma, addr);

	/* page is moved even if it's not RSS of this task(page-faulted). */
#ifdef CONFIG_SWAP
	/* shmem/tmpfs may report page out on swap: account for that too. */
	if (shmem_mapping(mapping)) {
		page = find_get_entry(mapping, pgoff);
		if (radix_tree_exceptional_entry(page)) {
			swp_entry_t swp = radix_to_swp_entry(page);
			if (do_swap_account)
				*entry = swp;
			page = find_get_page(swap_address_space(swp), swp.val);
		}
	} else
		page = find_get_page(mapping, pgoff);
#else
	page = find_get_page(mapping, pgoff);
#endif
	return page;
}

/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @nr_pages: number of regular pages (>1 for huge pages)
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_page() is useful.)
 * - compound_lock is held when nr_pages > 1
 *
 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
 * from old cgroup.
 *
 * Returns 0 on success, -EBUSY if @nr_pages > 1 but the page is no longer a
 * transparent huge page or if the page lock could not be taken, and -EINVAL
 * if the page is not charged to @from.
 */
static int mem_cgroup_move_account(struct page *page,
				   unsigned int nr_pages,
				   struct mem_cgroup *from,
				   struct mem_cgroup *to)
{
	unsigned long flags;
	int ret;
	bool anon;

	VM_BUG_ON(from == to);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	/*
	 * The page is isolated from LRU. So, collapse function
	 * will not handle this page. But page splitting can happen.
	 * Do this check under compound_page_lock(). The caller should
	 * hold it.
	 */
	ret = -EBUSY;
	if (nr_pages > 1 && !PageTransHuge(page))
		goto out;

	/*
	 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
	 * of its source page while we change it: page migration takes
	 * both pages off the LRU, but page cache replacement doesn't.
	 */
	if (!trylock_page(page))
		goto out;

	ret = -EINVAL;
	if (page->mem_cgroup != from)
		goto out_unlock;

	anon = PageAnon(page);

	spin_lock_irqsave(&from->move_lock, flags);

	/* transfer the per-state statistics that follow the page */
	if (!anon && page_mapped(page)) {
		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
			       nr_pages);
		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
			       nr_pages);
	}

	/*
	 * move_lock grabbed above and caller set from->moving_account, so
	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
	 * So mapping should be stable for dirty pages.
	 */
	if (!anon && PageDirty(page)) {
		struct address_space *mapping = page_mapping(page);

		if (mapping_cap_account_dirty(mapping)) {
			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
				       nr_pages);
			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
				       nr_pages);
		}
	}

	if (PageWriteback(page)) {
		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
			       nr_pages);
		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
			       nr_pages);
	}

	/*
	 * It is safe to change page->mem_cgroup here because the page
	 * is referenced, charged, and isolated - we can't race with
	 * uncharging, charging, migration, or LRU putback.
	 */

	/* caller should have done css_get */
	page->mem_cgroup = to;
	spin_unlock_irqrestore(&from->move_lock, flags);

	ret = 0;

	/* account the move in both groups' charge statistics, irqs off */
	local_irq_disable();
	mem_cgroup_charge_statistics(to, page, nr_pages);
	memcg_check_events(to, page);
	mem_cgroup_charge_statistics(from, page, -nr_pages);
	memcg_check_events(from, page);
	local_irq_enable();
out_unlock:
	unlock_page(page);
out:
	return ret;
}

/*
 * Classify @ptent as a move-charge target: dispatch on the pte state to one
 * of the mc_handle_*_pte() helpers, then check whether the page or the swap
 * entry found is charged to mc.from.  The page reference taken by the helper
 * is dropped again unless the page is reported to the caller via @target.
 */
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;
	swp_entry_t ent = { .val = 0 };

	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
	else if (pte_none(ptent))
		page = mc_handle_file_pte(vma, addr, ptent, &ent);

	if (!page && !ent.val)
		return ret;
	if (page) {
		/*
		 * Do only loose check w/o serialization.
		 * mem_cgroup_move_account() checks the page is valid or
		 * not under LRU exclusion.
		 */
		if (page->mem_cgroup == mc.from) {
			ret = MC_TARGET_PAGE;
			if (target)
				target->page = page;
		}
		if (!ret || !target)
			put_page(page);
	}
	/* There is a swap entry and a page doesn't exist or isn't charged */
	if (ent.val && !ret &&
	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider swapping or file mapped pages because THP does not
 * support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
*/
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;

	page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
	if (!(mc.flags & MOVE_ANON))
		return ret;
	if (page->mem_cgroup == mc.from) {
		ret = MC_TARGET_PAGE;
		if (target) {
			/* the reference is only taken when the page is handed out */
			get_page(page);
			target->page = page;
		}
	}
	return ret;
}
#else
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	return MC_TARGET_NONE;
}
#endif

/*
 * Page-walk pmd callback of the counting pass: add the number of pages under
 * @pmd that are move-charge targets to mc.precharge.  Always returns 0.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (get_mctgt_type(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}

/*
 * Walk the whole address space of @mm under mmap_sem (read) and return the
 * number of pages that would have to be precharged.  mc.precharge is only
 * used as scratch space for the walk and is reset to 0 before returning.
 */
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long precharge;

	struct mm_walk mem_cgroup_count_precharge_walk = {
		.pmd_entry = mem_cgroup_count_precharge_pte_range,
		.mm = mm,
	};
	down_read(&mm->mmap_sem);
	walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
	up_read(&mm->mmap_sem);

	precharge = mc.precharge;
	mc.precharge = 0;

	return precharge;
}

/*
 * Count the pages to move for @mm, record current as mc.moving_task and
 * precharge that many pages.  Returns the result of
 * mem_cgroup_do_precharge().
 */
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long precharge = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;
	return mem_cgroup_do_precharge(precharge);
}

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fixup refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);

		/*
		 * we charged both to->memory and to->memsw, so we
		 * should uncharge to->memory.
		 */
		if (!mem_cgroup_is_root(mc.to))
			page_counter_uncharge(&mc.to->memory, mc.moved_swap);

		css_put_many(&mc.from->css, mc.moved_swap);

		/* we've already done css_get(mc.to) */
		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

/* End a charge move: drop leftover charges and reset mc.from / mc.to. */
static void mem_cgroup_clear_mc(void)
{
	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	spin_unlock(&mc.lock);
}

/*
 * cgroup ->can_attach callback.  If the destination memcg has
 * move_charge_at_immigrate set and the first task of @tset owns its mm,
 * set up the move context (mc.from, mc.to, mc.flags) and precharge the
 * destination.
 *
 * Returns 0 when nothing has to be moved or precharging succeeded, otherwise
 * the precharge error (after the move context has been cleared again).
 */
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
	struct task_struct *p = cgroup_taskset_first(tset);
	int ret = 0;
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	unsigned long move_flags;

	/*
	 * We are now committed to this value whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
	 */
	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
	if (move_flags) {
		struct mm_struct *mm;
		struct mem_cgroup *from = mem_cgroup_from_task(p);

		VM_BUG_ON(from == memcg);

		mm = get_task_mm(p);
		if (!mm)
			return 0;
		/* We move charges only when we move a owner of the mm */
		if (mm->owner == p) {
			VM_BUG_ON(mc.from);
			VM_BUG_ON(mc.to);
			VM_BUG_ON(mc.precharge);
			VM_BUG_ON(mc.moved_charge);
			VM_BUG_ON(mc.moved_swap);

			spin_lock(&mc.lock);
			mc.from = from;
			mc.to = memcg;
			mc.flags = move_flags;
			spin_unlock(&mc.lock);
			/* We set mc.moving_task later */

			ret = mem_cgroup_precharge_mc(mm);
			if (ret)
				mem_cgroup_clear_mc();
		}
		mmput(mm);
	}
	return ret;
}

/* cgroup ->cancel_attach callback: undo a move that was set up, if any. */
static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
				     struct cgroup_taskset *tset)
{
	if (mc.to)
		mem_cgroup_clear_mc();
}

/*
 * Page-walk pmd callback of the moving pass: move the charge of every target
 * page or swap entry under @pmd from mc.from to mc.to, consuming one
 * precharge per moved page.  When the precharges run out in the middle of
 * the range, one more page is precharged and the loop resumes.
 *
 * Returns 0, or the error from mem_cgroup_do_precharge() if that additional
 * precharge fails.
 */
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;

	/*
	 * We don't take compound_lock() here but no race with splitting thp
	 * happens because:
	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
	 *    under splitting, which means there's no concurrent thp split,
	 *  - if another thread runs into split_huge_page() just after we
	 *    entered this if-block, the thread must wait for page table lock
	 *    to be unlocked in __split_huge_page_splitting(), where the main
	 *    part of thp split is not executed yet.
	 */
	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(ptl);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
							     mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			put_page(page);
		}
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* get_mctgt_type() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				/* we fixup refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in attach()
		 * phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

/*
 * Move the charges of all target pages mapped by @mm from mc.from to mc.to.
 * mc.from->moving_account is raised for the duration of the walk.
 */
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
	struct mm_walk mem_cgroup_move_charge_walk = {
		.pmd_entry = mem_cgroup_move_charge_pte_range,
		.mm = mm,
	};

	lru_add_drain_all();
	/*
	 * Signal mem_cgroup_begin_page_stat() to take the memcg's
	 * move_lock while we're moving its pages to another memcg.
	 * Then wait for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();
retry:
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone who is holding the mmap_sem might be waiting in
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be able
		 * to move enough charges, but moving charge is a best-effort
		 * feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	/*
	 * When we have consumed all precharges and failed in doing
	 * additional charge, the page walk just aborts.
	 */
	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
	up_read(&mm->mmap_sem);
	atomic_dec(&mc.from->moving_account);
}

/*
 * cgroup ->attach callback: if a move was set up in can_attach (mc.to set),
 * move the charges of the first task's mm and then clear the move context.
 */
static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
	struct task_struct *p = cgroup_taskset_first(tset);
	struct mm_struct *mm = get_task_mm(p);

	if (mm) {
		if (mc.to)
			mem_cgroup_move_charge(mm);
		mmput(mm);
	}
	if (mc.to)
		mem_cgroup_clear_mc();
}
#else	/* !CONFIG_MMU */
/* Without an MMU charges are never moved: the callbacks are no-ops. */
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
	return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
				     struct cgroup_taskset *tset)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
}
#endif

/*
 * Cgroup retains root cgroups across [un]mount cycles making it necessary
 * to verify whether we're attached to the default hierarchy on each mount
 * attempt.
 */
static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
{
	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
	if (cgroup_on_dfl(root_css->cgroup))
		root_mem_cgroup->use_hierarchy = true;
	else
		root_mem_cgroup->use_hierarchy = false;
}

/* "current" file: report the value of mem_cgroup_usage() without swap. */
static u64 memory_current_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return mem_cgroup_usage(mem_cgroup_from_css(css), false);
}

/* "low" file, read side: prints "max" or the setting converted to bytes. */
static int memory_low_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long low = READ_ONCE(memcg->low);

	if (low == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);

	return 0;
}

/*
 * "low" file, write side: parse @buf (a size or the keyword "max") and
 * store it in memcg->low.  Returns @nbytes, or the parse error.
 */
static ssize_t memory_low_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long low;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &low);
	if (err)
		return err;

	memcg->low = low;

	return nbytes;
}

/* "high" file, read side: prints "max" or the setting converted to bytes. */
static int memory_high_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long high = READ_ONCE(memcg->high);

	if (high == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);

	return 0;
}

/*
 * "high" file, write side: parse @buf (a size or the keyword "max"), store
 * it in memcg->high and notify the writeback domain of the size change.
 * Returns @nbytes, or the parse error.
 */
static ssize_t memory_high_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	memcg->high = high;

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}

/* "max" file, read side: prints "max" or the memory limit in bytes. */
static int memory_max_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long max = READ_ONCE(memcg->memory.limit);

	if (max == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);

	return 0;
}

/*
 * "max" file, write side: parse @buf (a size or the keyword "max") and
 * resize the memory limit accordingly.  Returns @nbytes, or the error from
 * parsing or from mem_cgroup_resize_limit().
 */
static ssize_t memory_max_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	err = mem_cgroup_resize_limit(memcg, max);
	if (err)
		return err;

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}

/* "events" file: one "<name> <count>" line per low/high/max/oom event. */
static int memory_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));

	seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
	seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
	seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
	seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));

	return 0;
}

/* Control files registered as .dfl_cftypes of memory_cgrp_subsys. */
static struct cftype memory_files[] = {
	{
		.name = "current",
		.read_u64 = memory_current_read,
	},
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_events_show,
	},
	{ }	/* terminate */
};

/* The memory controller's cgroup subsystem descriptor. */
struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.attach = mem_cgroup_move_task,
	.bind = mem_cgroup_bind,
	.dfl_cftypes = memory_files,
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};

/**
 * mem_cgroup_events - count memory events against a cgroup
 * @memcg: the memory cgroup
 * @idx: the event index
 * @nr: the number of events to account for
 */
void mem_cgroup_events(struct mem_cgroup *memcg,
		       enum mem_cgroup_events_index idx,
		       unsigned int nr)
{
	this_cpu_add(memcg->stat->events[idx], nr);
}

/**
 * mem_cgroup_low - check if memory consumption is below the normal range
 * @root: the highest ancestor to consider
 * @memcg: the memory cgroup to check
 *
 * Returns %true if memory consumption of @memcg, and that of all
 * configurable ancestors up to @root, is below the normal range.
 */
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
	if (mem_cgroup_disabled())
		return false;

	/*
	 * The toplevel group doesn't have a configurable range, so
	 * it's never low when looked at directly, and it is not
	 * considered an ancestor when assessing the hierarchy.
	 */

	if (memcg == root_mem_cgroup)
		return false;

	if (page_counter_read(&memcg->memory) >= memcg->low)
		return false;

	/* every ancestor up to @root must be below its own low setting too */
	while (memcg != root) {
		memcg = parent_mem_cgroup(memcg);

		if (memcg == root_mem_cgroup)
			break;

		if (page_counter_read(&memcg->memory) >= memcg->low)
			return false;
	}
	return true;
}

/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge().  Or abort the transaction
 * with mem_cgroup_cancel_charge() in case page instantiation fails.
 */
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
	struct mem_cgroup *memcg = NULL;
	unsigned int nr_pages = 1;
	int ret = 0;

	if (mem_cgroup_disabled())
		goto out;

	if (PageSwapCache(page)) {
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible.  shmem_unuse() encounters
		 * already charged pages, too.  The USED bit is protected by
		 * the page lock, which serializes swap cache removal, which
		 * in turn serializes uncharging.
		 */
		if (page->mem_cgroup)
			goto out;
	}

	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
	}

	if (do_swap_account && PageSwapCache(page))
		memcg = try_get_mem_cgroup_from_page(page);
	if (!memcg)
		memcg = get_mem_cgroup_from_mm(mm);

	ret = try_charge(memcg, gfp_mask, nr_pages);

	css_put(&memcg->css);

	/* -EINTR is reported as success, with root as the charged memcg */
	if (ret == -EINTR) {
		memcg = root_mem_cgroup;
		ret = 0;
	}
out:
	*memcgp = memcg;
	return ret;
}

/**
 * mem_cgroup_commit_charge - commit a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @lrucare: page might be on LRU already
 *
 * Finalize a charge transaction started by mem_cgroup_try_charge(),
 * after page->mapping has been set up.  This must happen atomically
 * as part of the page instantiation, i.e. under the page table lock
 * for anonymous pages, under the page lock for page and swap cache.
 *
 * In addition, the page must not be on the LRU during the commit, to
 * prevent racing with task migration.  If it might be, use @lrucare.
 *
 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
 */
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      bool lrucare)
{
	unsigned int nr_pages = 1;

	VM_BUG_ON_PAGE(!page->mapping, page);
	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
	if (!memcg)
		return;

	commit_charge(page, memcg, lrucare);

	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
	}

	local_irq_disable();
	mem_cgroup_charge_statistics(memcg, page, nr_pages);
	memcg_check_events(memcg, page);
	local_irq_enable();

	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t entry = { .val = page_private(page) };
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it.  The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
		mem_cgroup_uncharge_swap(entry);
	}
}

/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */
void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
{
	unsigned int nr_pages = 1;

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
5709 */ 5710 if (!memcg) 5711 return; 5712 5713 if (PageTransHuge(page)) { 5714 nr_pages <<= compound_order(page); 5715 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5716 } 5717 5718 cancel_charge(memcg, nr_pages); 5719 } 5720 5721 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5722 unsigned long nr_anon, unsigned long nr_file, 5723 unsigned long nr_huge, struct page *dummy_page) 5724 { 5725 unsigned long nr_pages = nr_anon + nr_file; 5726 unsigned long flags; 5727 5728 if (!mem_cgroup_is_root(memcg)) { 5729 page_counter_uncharge(&memcg->memory, nr_pages); 5730 if (do_swap_account) 5731 page_counter_uncharge(&memcg->memsw, nr_pages); 5732 memcg_oom_recover(memcg); 5733 } 5734 5735 local_irq_save(flags); 5736 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 5737 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 5738 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 5739 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 5740 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 5741 memcg_check_events(memcg, dummy_page); 5742 local_irq_restore(flags); 5743 5744 if (!mem_cgroup_is_root(memcg)) 5745 css_put_many(&memcg->css, nr_pages); 5746 } 5747 5748 static void uncharge_list(struct list_head *page_list) 5749 { 5750 struct mem_cgroup *memcg = NULL; 5751 unsigned long nr_anon = 0; 5752 unsigned long nr_file = 0; 5753 unsigned long nr_huge = 0; 5754 unsigned long pgpgout = 0; 5755 struct list_head *next; 5756 struct page *page; 5757 5758 next = page_list->next; 5759 do { 5760 unsigned int nr_pages = 1; 5761 5762 page = list_entry(next, struct page, lru); 5763 next = page->lru.next; 5764 5765 VM_BUG_ON_PAGE(PageLRU(page), page); 5766 VM_BUG_ON_PAGE(page_count(page), page); 5767 5768 if (!page->mem_cgroup) 5769 continue; 5770 5771 /* 5772 * Nobody should be changing or seriously looking at 5773 * page->mem_cgroup at this point, we have fully 5774 * exclusive access 
to the page. 5775 */ 5776 5777 if (memcg != page->mem_cgroup) { 5778 if (memcg) { 5779 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5780 nr_huge, page); 5781 pgpgout = nr_anon = nr_file = nr_huge = 0; 5782 } 5783 memcg = page->mem_cgroup; 5784 } 5785 5786 if (PageTransHuge(page)) { 5787 nr_pages <<= compound_order(page); 5788 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5789 nr_huge += nr_pages; 5790 } 5791 5792 if (PageAnon(page)) 5793 nr_anon += nr_pages; 5794 else 5795 nr_file += nr_pages; 5796 5797 page->mem_cgroup = NULL; 5798 5799 pgpgout++; 5800 } while (next != page_list); 5801 5802 if (memcg) 5803 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5804 nr_huge, page); 5805 } 5806 5807 /** 5808 * mem_cgroup_uncharge - uncharge a page 5809 * @page: page to uncharge 5810 * 5811 * Uncharge a page previously charged with mem_cgroup_try_charge() and 5812 * mem_cgroup_commit_charge(). 5813 */ 5814 void mem_cgroup_uncharge(struct page *page) 5815 { 5816 if (mem_cgroup_disabled()) 5817 return; 5818 5819 /* Don't touch page->lru of any random page, pre-check: */ 5820 if (!page->mem_cgroup) 5821 return; 5822 5823 INIT_LIST_HEAD(&page->lru); 5824 uncharge_list(&page->lru); 5825 } 5826 5827 /** 5828 * mem_cgroup_uncharge_list - uncharge a list of page 5829 * @page_list: list of pages to uncharge 5830 * 5831 * Uncharge a list of pages previously charged with 5832 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 5833 */ 5834 void mem_cgroup_uncharge_list(struct list_head *page_list) 5835 { 5836 if (mem_cgroup_disabled()) 5837 return; 5838 5839 if (!list_empty(page_list)) 5840 uncharge_list(page_list); 5841 } 5842 5843 /** 5844 * mem_cgroup_migrate - migrate a charge to another page 5845 * @oldpage: currently charged page 5846 * @newpage: page to transfer the charge to 5847 * @lrucare: either or both pages might be on the LRU already 5848 * 5849 * Migrate the charge from @oldpage to @newpage. 
5850 * 5851 * Both pages must be locked, @newpage->mapping must be set up. 5852 */ 5853 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 5854 bool lrucare) 5855 { 5856 struct mem_cgroup *memcg; 5857 int isolated; 5858 5859 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5860 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 5861 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); 5862 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); 5863 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 5864 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 5865 newpage); 5866 5867 if (mem_cgroup_disabled()) 5868 return; 5869 5870 /* Page cache replacement: new page already charged? */ 5871 if (newpage->mem_cgroup) 5872 return; 5873 5874 /* 5875 * Swapcache readahead pages can get migrated before being 5876 * charged, and migration from compaction can happen to an 5877 * uncharged page when the PFN walker finds a page that 5878 * reclaim just put back on the LRU but has not released yet. 5879 */ 5880 memcg = oldpage->mem_cgroup; 5881 if (!memcg) 5882 return; 5883 5884 if (lrucare) 5885 lock_page_lru(oldpage, &isolated); 5886 5887 oldpage->mem_cgroup = NULL; 5888 5889 if (lrucare) 5890 unlock_page_lru(oldpage, isolated); 5891 5892 commit_charge(newpage, memcg, lrucare); 5893 } 5894 5895 /* 5896 * subsys_initcall() for memory controller. 5897 * 5898 * Some parts like hotcpu_notifier() have to be initialized from this context 5899 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 5900 * everything that doesn't depend on a specific mem_cgroup structure should 5901 * be initialized from here. 
5902 */ 5903 static int __init mem_cgroup_init(void) 5904 { 5905 int cpu, node; 5906 5907 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5908 5909 for_each_possible_cpu(cpu) 5910 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 5911 drain_local_stock); 5912 5913 for_each_node(node) { 5914 struct mem_cgroup_tree_per_node *rtpn; 5915 int zone; 5916 5917 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 5918 node_online(node) ? node : NUMA_NO_NODE); 5919 5920 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5921 struct mem_cgroup_tree_per_zone *rtpz; 5922 5923 rtpz = &rtpn->rb_tree_per_zone[zone]; 5924 rtpz->rb_root = RB_ROOT; 5925 spin_lock_init(&rtpz->lock); 5926 } 5927 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5928 } 5929 5930 return 0; 5931 } 5932 subsys_initcall(mem_cgroup_init); 5933 5934 #ifdef CONFIG_MEMCG_SWAP 5935 /** 5936 * mem_cgroup_swapout - transfer a memsw charge to swap 5937 * @page: page whose memsw charge to transfer 5938 * @entry: swap entry to move the charge to 5939 * 5940 * Transfer the memsw charge of @page to @entry. 
5941 */ 5942 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5943 { 5944 struct mem_cgroup *memcg; 5945 unsigned short oldid; 5946 5947 VM_BUG_ON_PAGE(PageLRU(page), page); 5948 VM_BUG_ON_PAGE(page_count(page), page); 5949 5950 if (!do_swap_account) 5951 return; 5952 5953 memcg = page->mem_cgroup; 5954 5955 /* Readahead page, never charged */ 5956 if (!memcg) 5957 return; 5958 5959 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); 5960 VM_BUG_ON_PAGE(oldid, page); 5961 mem_cgroup_swap_statistics(memcg, true); 5962 5963 page->mem_cgroup = NULL; 5964 5965 if (!mem_cgroup_is_root(memcg)) 5966 page_counter_uncharge(&memcg->memory, 1); 5967 5968 /* Caller disabled preemption with mapping->tree_lock */ 5969 mem_cgroup_charge_statistics(memcg, page, -1); 5970 memcg_check_events(memcg, page); 5971 } 5972 5973 /** 5974 * mem_cgroup_uncharge_swap - uncharge a swap entry 5975 * @entry: swap entry to uncharge 5976 * 5977 * Drop the memsw charge associated with @entry. 5978 */ 5979 void mem_cgroup_uncharge_swap(swp_entry_t entry) 5980 { 5981 struct mem_cgroup *memcg; 5982 unsigned short id; 5983 5984 if (!do_swap_account) 5985 return; 5986 5987 id = swap_cgroup_record(entry, 0); 5988 rcu_read_lock(); 5989 memcg = mem_cgroup_from_id(id); 5990 if (memcg) { 5991 if (!mem_cgroup_is_root(memcg)) 5992 page_counter_uncharge(&memcg->memsw, 1); 5993 mem_cgroup_swap_statistics(memcg, false); 5994 css_put(&memcg->css); 5995 } 5996 rcu_read_unlock(); 5997 } 5998 5999 /* for remember boot option*/ 6000 #ifdef CONFIG_MEMCG_SWAP_ENABLED 6001 static int really_do_swap_account __initdata = 1; 6002 #else 6003 static int really_do_swap_account __initdata; 6004 #endif 6005 6006 static int __init enable_swap_account(char *s) 6007 { 6008 if (!strcmp(s, "1")) 6009 really_do_swap_account = 1; 6010 else if (!strcmp(s, "0")) 6011 really_do_swap_account = 0; 6012 return 1; 6013 } 6014 __setup("swapaccount=", enable_swap_account); 6015 6016 static struct cftype memsw_cgroup_files[] = 
{ 6017 { 6018 .name = "memsw.usage_in_bytes", 6019 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6020 .read_u64 = mem_cgroup_read_u64, 6021 }, 6022 { 6023 .name = "memsw.max_usage_in_bytes", 6024 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6025 .write = mem_cgroup_reset, 6026 .read_u64 = mem_cgroup_read_u64, 6027 }, 6028 { 6029 .name = "memsw.limit_in_bytes", 6030 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6031 .write = mem_cgroup_write, 6032 .read_u64 = mem_cgroup_read_u64, 6033 }, 6034 { 6035 .name = "memsw.failcnt", 6036 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6037 .write = mem_cgroup_reset, 6038 .read_u64 = mem_cgroup_read_u64, 6039 }, 6040 { }, /* terminate */ 6041 }; 6042 6043 static int __init mem_cgroup_swap_init(void) 6044 { 6045 if (!mem_cgroup_disabled() && really_do_swap_account) { 6046 do_swap_account = 1; 6047 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 6048 memsw_cgroup_files)); 6049 } 6050 return 0; 6051 } 6052 subsys_initcall(mem_cgroup_swap_init); 6053 6054 #endif /* CONFIG_MEMCG_SWAP */ 6055