1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * Kernel Memory Controller 14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 15 * Authors: Glauber Costa and Suleiman Souhlal 16 * 17 * Native page reclaim 18 * Charge lifetime sanitation 19 * Lockless page tracking & accounting 20 * Unified hierarchy configuration model 21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner 22 * 23 * This program is free software; you can redistribute it and/or modify 24 * it under the terms of the GNU General Public License as published by 25 * the Free Software Foundation; either version 2 of the License, or 26 * (at your option) any later version. 27 * 28 * This program is distributed in the hope that it will be useful, 29 * but WITHOUT ANY WARRANTY; without even the implied warranty of 30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 * GNU General Public License for more details. 32 */ 33 34 #include <linux/page_counter.h> 35 #include <linux/memcontrol.h> 36 #include <linux/cgroup.h> 37 #include <linux/mm.h> 38 #include <linux/hugetlb.h> 39 #include <linux/pagemap.h> 40 #include <linux/smp.h> 41 #include <linux/page-flags.h> 42 #include <linux/backing-dev.h> 43 #include <linux/bit_spinlock.h> 44 #include <linux/rcupdate.h> 45 #include <linux/limits.h> 46 #include <linux/export.h> 47 #include <linux/mutex.h> 48 #include <linux/rbtree.h> 49 #include <linux/slab.h> 50 #include <linux/swap.h> 51 #include <linux/swapops.h> 52 #include <linux/spinlock.h> 53 #include <linux/eventfd.h> 54 #include <linux/poll.h> 55 #include <linux/sort.h> 56 #include <linux/fs.h> 57 #include <linux/seq_file.h> 58 #include <linux/vmpressure.h> 59 #include <linux/mm_inline.h> 60 #include <linux/swap_cgroup.h> 61 #include <linux/cpu.h> 62 #include <linux/oom.h> 63 #include <linux/lockdep.h> 64 #include <linux/file.h> 65 #include "internal.h" 66 #include <net/sock.h> 67 #include <net/ip.h> 68 #include <net/tcp_memcontrol.h> 69 #include "slab.h" 70 71 #include <asm/uaccess.h> 72 73 #include <trace/events/vmscan.h> 74 75 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 76 EXPORT_SYMBOL(memory_cgrp_subsys); 77 78 #define MEM_CGROUP_RECLAIM_RETRIES 5 79 static struct mem_cgroup *root_mem_cgroup __read_mostly; 80 struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; 81 82 /* Whether the swap controller is active */ 83 #ifdef CONFIG_MEMCG_SWAP 84 int do_swap_account __read_mostly; 85 #else 86 #define do_swap_account 0 87 #endif 88 89 static const char * const mem_cgroup_stat_names[] = { 90 "cache", 91 "rss", 92 "rss_huge", 93 "mapped_file", 94 "dirty", 95 "writeback", 96 "swap", 97 }; 98 99 static const char * const mem_cgroup_events_names[] = { 100 "pgpgin", 101 "pgpgout", 102 "pgfault", 103 "pgmajfault", 104 }; 105 106 static const char * const mem_cgroup_lru_names[] = { 107 "inactive_anon", 108 "active_anon", 109 "inactive_file", 110 "active_file", 111 "unevictable", 112 }; 113 114 #define THRESHOLDS_EVENTS_TARGET 128 115 #define SOFTLIMIT_EVENTS_TARGET 1024 116 #define NUMAINFO_EVENTS_TARGET 1024 117 118 /* 119 * Cgroups above their limits are maintained in a RB-Tree, independent of 120 * their hierarchy representation 121 */ 122 123 struct mem_cgroup_tree_per_zone { 124 struct rb_root rb_root; 125 spinlock_t lock; 126 }; 
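/*
 * For example, together with the two structures declared just below, this
 * forms a global set of per-node, per-zone soft-limit RB-trees; the tree
 * covering zone index zid on node nid is looked up as
 *
 *	&soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]
 *
 * (see soft_limit_tree_node_zone() below), and each tree orders
 * mem_cgroup_per_zone entries by their usage_in_excess.
 */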

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on the eventfd to send a notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal. This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t lock;	/* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
242 * As a consequence, any change that needs to protect against new child cgroups 243 * appearing has to hold it as well. 244 */ 245 static DEFINE_MUTEX(memcg_create_mutex); 246 247 /* Some nice accessors for the vmpressure. */ 248 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 249 { 250 if (!memcg) 251 memcg = root_mem_cgroup; 252 return &memcg->vmpressure; 253 } 254 255 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 256 { 257 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 258 } 259 260 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 261 { 262 return (memcg == root_mem_cgroup); 263 } 264 265 /* 266 * We restrict the id in the range of [1, 65535], so it can fit into 267 * an unsigned short. 268 */ 269 #define MEM_CGROUP_ID_MAX USHRT_MAX 270 271 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 272 { 273 return memcg->css.id; 274 } 275 276 /* 277 * A helper function to get mem_cgroup from ID. must be called under 278 * rcu_read_lock(). The caller is responsible for calling 279 * css_tryget_online() if the mem_cgroup is used for charging. (dropping 280 * refcnt from swap can be called against removed memcg.) 281 */ 282 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 283 { 284 struct cgroup_subsys_state *css; 285 286 css = css_from_id(id, &memory_cgrp_subsys); 287 return mem_cgroup_from_css(css); 288 } 289 290 /* Writing them here to avoid exposing memcg's inner layout */ 291 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 292 293 void sock_update_memcg(struct sock *sk) 294 { 295 if (mem_cgroup_sockets_enabled) { 296 struct mem_cgroup *memcg; 297 struct cg_proto *cg_proto; 298 299 BUG_ON(!sk->sk_prot->proto_cgroup); 300 301 /* Socket cloning can throw us here with sk_cgrp already 302 * filled. It won't however, necessarily happen from 303 * process context. So the test for root memcg given 304 * the current task's memcg won't help us in this case. 305 * 306 * Respecting the original socket's memcg is a better 307 * decision in this case. 308 */ 309 if (sk->sk_cgrp) { 310 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 311 css_get(&sk->sk_cgrp->memcg->css); 312 return; 313 } 314 315 rcu_read_lock(); 316 memcg = mem_cgroup_from_task(current); 317 cg_proto = sk->sk_prot->proto_cgroup(memcg); 318 if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && 319 css_tryget_online(&memcg->css)) { 320 sk->sk_cgrp = cg_proto; 321 } 322 rcu_read_unlock(); 323 } 324 } 325 EXPORT_SYMBOL(sock_update_memcg); 326 327 void sock_release_memcg(struct sock *sk) 328 { 329 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 330 struct mem_cgroup *memcg; 331 WARN_ON(!sk->sk_cgrp->memcg); 332 memcg = sk->sk_cgrp->memcg; 333 css_put(&sk->sk_cgrp->memcg->css); 334 } 335 } 336 337 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 338 { 339 if (!memcg || mem_cgroup_is_root(memcg)) 340 return NULL; 341 342 return &memcg->tcp_mem; 343 } 344 EXPORT_SYMBOL(tcp_proto_cgroup); 345 346 #endif 347 348 #ifdef CONFIG_MEMCG_KMEM 349 /* 350 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 351 * The main reason for not using cgroup id for this: 352 * this works better in sparse environments, where we have a lot of memcgs, 353 * but only a few kmem-limited. Or also, if we have, for instance, 200 354 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 355 * 200 entry array for that. 
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is not strictly necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't necessarily have to
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);

#endif /* CONFIG_MEMCG_KMEM */

static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned. The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 *
 * XXX: The above description of behavior on the default hierarchy isn't
 * strictly true yet as replace_page_cache_page() can modify the
 * association before @page is released even on the default hierarchy;
 * however, the current and planned usages don't mix the two functions
 * and replace_page_cache_page() will soon be updated to make the invariant
 * actually true.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
		memcg = root_mem_cgroup;

	rcu_read_unlock();
	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
451 * 452 * Note, this function is inherently racy, because there is nothing to prevent 453 * the cgroup inode from getting torn down and potentially reallocated a moment 454 * after page_cgroup_ino() returns, so it only should be used by callers that 455 * do not care (such as procfs interfaces). 456 */ 457 ino_t page_cgroup_ino(struct page *page) 458 { 459 struct mem_cgroup *memcg; 460 unsigned long ino = 0; 461 462 rcu_read_lock(); 463 memcg = READ_ONCE(page->mem_cgroup); 464 while (memcg && !(memcg->css.flags & CSS_ONLINE)) 465 memcg = parent_mem_cgroup(memcg); 466 if (memcg) 467 ino = cgroup_ino(memcg->css.cgroup); 468 rcu_read_unlock(); 469 return ino; 470 } 471 472 static struct mem_cgroup_per_zone * 473 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 474 { 475 int nid = page_to_nid(page); 476 int zid = page_zonenum(page); 477 478 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 479 } 480 481 static struct mem_cgroup_tree_per_zone * 482 soft_limit_tree_node_zone(int nid, int zid) 483 { 484 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 485 } 486 487 static struct mem_cgroup_tree_per_zone * 488 soft_limit_tree_from_page(struct page *page) 489 { 490 int nid = page_to_nid(page); 491 int zid = page_zonenum(page); 492 493 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 494 } 495 496 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 497 struct mem_cgroup_tree_per_zone *mctz, 498 unsigned long new_usage_in_excess) 499 { 500 struct rb_node **p = &mctz->rb_root.rb_node; 501 struct rb_node *parent = NULL; 502 struct mem_cgroup_per_zone *mz_node; 503 504 if (mz->on_tree) 505 return; 506 507 mz->usage_in_excess = new_usage_in_excess; 508 if (!mz->usage_in_excess) 509 return; 510 while (*p) { 511 parent = *p; 512 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 513 tree_node); 514 if (mz->usage_in_excess < mz_node->usage_in_excess) 515 p = &(*p)->rb_left; 516 /* 517 * We can't avoid mem cgroups that are over their soft 518 * limit by the same amount 519 */ 520 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 521 p = &(*p)->rb_right; 522 } 523 rb_link_node(&mz->tree_node, parent, p); 524 rb_insert_color(&mz->tree_node, &mctz->rb_root); 525 mz->on_tree = true; 526 } 527 528 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 529 struct mem_cgroup_tree_per_zone *mctz) 530 { 531 if (!mz->on_tree) 532 return; 533 rb_erase(&mz->tree_node, &mctz->rb_root); 534 mz->on_tree = false; 535 } 536 537 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 538 struct mem_cgroup_tree_per_zone *mctz) 539 { 540 unsigned long flags; 541 542 spin_lock_irqsave(&mctz->lock, flags); 543 __mem_cgroup_remove_exceeded(mz, mctz); 544 spin_unlock_irqrestore(&mctz->lock, flags); 545 } 546 547 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 548 { 549 unsigned long nr_pages = page_counter_read(&memcg->memory); 550 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 551 unsigned long excess = 0; 552 553 if (nr_pages > soft_limit) 554 excess = nr_pages - soft_limit; 555 556 return excess; 557 } 558 559 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 560 { 561 unsigned long excess; 562 struct mem_cgroup_per_zone *mz; 563 struct mem_cgroup_tree_per_zone *mctz; 564 565 mctz = soft_limit_tree_from_page(page); 566 /* 567 * Necessary to update all ancestors when hierarchy is used. 568 * because their event counter is not touched. 
 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_zoneinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_zone *mctz;
	struct mem_cgroup_per_zone *mz;
	int nid, zid;

	for_each_node(nid) {
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			mctz = soft_limit_tree_node_zone(nid, zid);
			mem_cgroup_remove_exceeded(mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Return the page count for a single (non-recursive) @memcg.
 *
 * Implementation note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use thresholds and periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * reading cost and the precision of the value, so we might implement a
 * similar periodic synchronization for memcg's counters one day.
 *
 * But this _read() function is currently used for the user interface. The
 * user accounts memory usage per memory cgroup and always requires an exact
 * value, because the numbers feed into accounting. Even if we provided a
 * quick-and-fuzzy read, we would still have to visit all online cpus and sum
 * their values, so for now the extra synchronization is not implemented
 * (it is only done for cpu hotplug).
 *
 * If kernel-internal users appear that can make do with an inexact value,
 * and reading all cpu values becomes a performance bottleneck in some common
 * workload, thresholds and synchronization as in vmstat[] should be
 * implemented.
666 */ 667 static unsigned long 668 mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) 669 { 670 long val = 0; 671 int cpu; 672 673 /* Per-cpu values can be negative, use a signed accumulator */ 674 for_each_possible_cpu(cpu) 675 val += per_cpu(memcg->stat->count[idx], cpu); 676 /* 677 * Summing races with updates, so val may be negative. Avoid exposing 678 * transient negative values. 679 */ 680 if (val < 0) 681 val = 0; 682 return val; 683 } 684 685 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 686 enum mem_cgroup_events_index idx) 687 { 688 unsigned long val = 0; 689 int cpu; 690 691 for_each_possible_cpu(cpu) 692 val += per_cpu(memcg->stat->events[idx], cpu); 693 return val; 694 } 695 696 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 697 struct page *page, 698 int nr_pages) 699 { 700 /* 701 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 702 * counted as CACHE even if it's on ANON LRU. 703 */ 704 if (PageAnon(page)) 705 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 706 nr_pages); 707 else 708 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 709 nr_pages); 710 711 if (PageTransHuge(page)) 712 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 713 nr_pages); 714 715 /* pagein of a big page is an event. So, ignore page size */ 716 if (nr_pages > 0) 717 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 718 else { 719 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 720 nr_pages = -nr_pages; /* for event */ 721 } 722 723 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 724 } 725 726 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 727 int nid, 728 unsigned int lru_mask) 729 { 730 unsigned long nr = 0; 731 int zid; 732 733 VM_BUG_ON((unsigned)nid >= nr_node_ids); 734 735 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 736 struct mem_cgroup_per_zone *mz; 737 enum lru_list lru; 738 739 for_each_lru(lru) { 740 if (!(BIT(lru) & lru_mask)) 741 continue; 742 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 743 nr += mz->lru_size[lru]; 744 } 745 } 746 return nr; 747 } 748 749 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 750 unsigned int lru_mask) 751 { 752 unsigned long nr = 0; 753 int nid; 754 755 for_each_node_state(nid, N_MEMORY) 756 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 757 return nr; 758 } 759 760 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 761 enum mem_cgroup_events_target target) 762 { 763 unsigned long val, next; 764 765 val = __this_cpu_read(memcg->stat->nr_page_events); 766 next = __this_cpu_read(memcg->stat->targets[target]); 767 /* from time_after() in jiffies.h */ 768 if ((long)next - (long)val < 0) { 769 switch (target) { 770 case MEM_CGROUP_TARGET_THRESH: 771 next = val + THRESHOLDS_EVENTS_TARGET; 772 break; 773 case MEM_CGROUP_TARGET_SOFTLIMIT: 774 next = val + SOFTLIMIT_EVENTS_TARGET; 775 break; 776 case MEM_CGROUP_TARGET_NUMAINFO: 777 next = val + NUMAINFO_EVENTS_TARGET; 778 break; 779 default: 780 break; 781 } 782 __this_cpu_write(memcg->stat->targets[target], next); 783 return true; 784 } 785 return false; 786 } 787 788 /* 789 * Check events in order. 
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
869 */ 870 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 871 struct mem_cgroup *prev, 872 struct mem_cgroup_reclaim_cookie *reclaim) 873 { 874 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 875 struct cgroup_subsys_state *css = NULL; 876 struct mem_cgroup *memcg = NULL; 877 struct mem_cgroup *pos = NULL; 878 879 if (mem_cgroup_disabled()) 880 return NULL; 881 882 if (!root) 883 root = root_mem_cgroup; 884 885 if (prev && !reclaim) 886 pos = prev; 887 888 if (!root->use_hierarchy && root != root_mem_cgroup) { 889 if (prev) 890 goto out; 891 return root; 892 } 893 894 rcu_read_lock(); 895 896 if (reclaim) { 897 struct mem_cgroup_per_zone *mz; 898 899 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); 900 iter = &mz->iter[reclaim->priority]; 901 902 if (prev && reclaim->generation != iter->generation) 903 goto out_unlock; 904 905 do { 906 pos = READ_ONCE(iter->position); 907 /* 908 * A racing update may change the position and 909 * put the last reference, hence css_tryget(), 910 * or retry to see the updated position. 911 */ 912 } while (pos && !css_tryget(&pos->css)); 913 } 914 915 if (pos) 916 css = &pos->css; 917 918 for (;;) { 919 css = css_next_descendant_pre(css, &root->css); 920 if (!css) { 921 /* 922 * Reclaimers share the hierarchy walk, and a 923 * new one might jump in right at the end of 924 * the hierarchy - make sure they see at least 925 * one group and restart from the beginning. 926 */ 927 if (!prev) 928 continue; 929 break; 930 } 931 932 /* 933 * Verify the css and acquire a reference. The root 934 * is provided by the caller, so we know it's alive 935 * and kicking, and don't take an extra reference. 936 */ 937 memcg = mem_cgroup_from_css(css); 938 939 if (css == &root->css) 940 break; 941 942 if (css_tryget(css)) { 943 /* 944 * Make sure the memcg is initialized: 945 * mem_cgroup_css_online() orders the the 946 * initialization against setting the flag. 947 */ 948 if (smp_load_acquire(&memcg->initialized)) 949 break; 950 951 css_put(css); 952 } 953 954 memcg = NULL; 955 } 956 957 if (reclaim) { 958 if (cmpxchg(&iter->position, pos, memcg) == pos) { 959 if (memcg) 960 css_get(&memcg->css); 961 if (pos) 962 css_put(&pos->css); 963 } 964 965 /* 966 * pairs with css_tryget when dereferencing iter->position 967 * above. 968 */ 969 if (pos) 970 css_put(&pos->css); 971 972 if (!memcg) 973 iter->generation++; 974 else if (!prev) 975 reclaim->generation = iter->generation; 976 } 977 978 out_unlock: 979 rcu_read_unlock(); 980 out: 981 if (prev && prev != root) 982 css_put(&prev->css); 983 984 return memcg; 985 } 986 987 /** 988 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 989 * @root: hierarchy root 990 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 991 */ 992 void mem_cgroup_iter_break(struct mem_cgroup *root, 993 struct mem_cgroup *prev) 994 { 995 if (!root) 996 root = root_mem_cgroup; 997 if (prev && prev != root) 998 css_put(&prev->css); 999 } 1000 1001 /* 1002 * Iteration constructs for visiting all cgroups (under a tree). If 1003 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1004 * be used for reference counting. 
1005 */ 1006 #define for_each_mem_cgroup_tree(iter, root) \ 1007 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1008 iter != NULL; \ 1009 iter = mem_cgroup_iter(root, iter, NULL)) 1010 1011 #define for_each_mem_cgroup(iter) \ 1012 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1013 iter != NULL; \ 1014 iter = mem_cgroup_iter(NULL, iter, NULL)) 1015 1016 /** 1017 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1018 * @zone: zone of the wanted lruvec 1019 * @memcg: memcg of the wanted lruvec 1020 * 1021 * Returns the lru list vector holding pages for the given @zone and 1022 * @mem. This can be the global zone lruvec, if the memory controller 1023 * is disabled. 1024 */ 1025 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1026 struct mem_cgroup *memcg) 1027 { 1028 struct mem_cgroup_per_zone *mz; 1029 struct lruvec *lruvec; 1030 1031 if (mem_cgroup_disabled()) { 1032 lruvec = &zone->lruvec; 1033 goto out; 1034 } 1035 1036 mz = mem_cgroup_zone_zoneinfo(memcg, zone); 1037 lruvec = &mz->lruvec; 1038 out: 1039 /* 1040 * Since a node can be onlined after the mem_cgroup was created, 1041 * we have to be prepared to initialize lruvec->zone here; 1042 * and if offlined then reonlined, we need to reinitialize it. 1043 */ 1044 if (unlikely(lruvec->zone != zone)) 1045 lruvec->zone = zone; 1046 return lruvec; 1047 } 1048 1049 /** 1050 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 1051 * @page: the page 1052 * @zone: zone of the page 1053 * 1054 * This function is only safe when following the LRU page isolation 1055 * and putback protocol: the LRU lock must be held, and the page must 1056 * either be PageLRU() or the caller must have isolated/allocated it. 1057 */ 1058 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1059 { 1060 struct mem_cgroup_per_zone *mz; 1061 struct mem_cgroup *memcg; 1062 struct lruvec *lruvec; 1063 1064 if (mem_cgroup_disabled()) { 1065 lruvec = &zone->lruvec; 1066 goto out; 1067 } 1068 1069 memcg = page->mem_cgroup; 1070 /* 1071 * Swapcache readahead pages are added to the LRU - and 1072 * possibly migrated - before they are charged. 1073 */ 1074 if (!memcg) 1075 memcg = root_mem_cgroup; 1076 1077 mz = mem_cgroup_page_zoneinfo(memcg, page); 1078 lruvec = &mz->lruvec; 1079 out: 1080 /* 1081 * Since a node can be onlined after the mem_cgroup was created, 1082 * we have to be prepared to initialize lruvec->zone here; 1083 * and if offlined then reonlined, we need to reinitialize it. 1084 */ 1085 if (unlikely(lruvec->zone != zone)) 1086 lruvec->zone = zone; 1087 return lruvec; 1088 } 1089 1090 /** 1091 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1092 * @lruvec: mem_cgroup per zone lru vector 1093 * @lru: index of lru list the page is sitting on 1094 * @nr_pages: positive when adding or negative when removing 1095 * 1096 * This function must be called when a page is added to or removed from an 1097 * lru list. 
1098 */ 1099 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1100 int nr_pages) 1101 { 1102 struct mem_cgroup_per_zone *mz; 1103 unsigned long *lru_size; 1104 1105 if (mem_cgroup_disabled()) 1106 return; 1107 1108 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1109 lru_size = mz->lru_size + lru; 1110 *lru_size += nr_pages; 1111 VM_BUG_ON((long)(*lru_size) < 0); 1112 } 1113 1114 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) 1115 { 1116 struct mem_cgroup *task_memcg; 1117 struct task_struct *p; 1118 bool ret; 1119 1120 p = find_lock_task_mm(task); 1121 if (p) { 1122 task_memcg = get_mem_cgroup_from_mm(p->mm); 1123 task_unlock(p); 1124 } else { 1125 /* 1126 * All threads may have already detached their mm's, but the oom 1127 * killer still needs to detect if they have already been oom 1128 * killed to prevent needlessly killing additional tasks. 1129 */ 1130 rcu_read_lock(); 1131 task_memcg = mem_cgroup_from_task(task); 1132 css_get(&task_memcg->css); 1133 rcu_read_unlock(); 1134 } 1135 ret = mem_cgroup_is_descendant(task_memcg, memcg); 1136 css_put(&task_memcg->css); 1137 return ret; 1138 } 1139 1140 #define mem_cgroup_from_counter(counter, member) \ 1141 container_of(counter, struct mem_cgroup, member) 1142 1143 /** 1144 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1145 * @memcg: the memory cgroup 1146 * 1147 * Returns the maximum amount of memory @mem can be charged with, in 1148 * pages. 1149 */ 1150 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1151 { 1152 unsigned long margin = 0; 1153 unsigned long count; 1154 unsigned long limit; 1155 1156 count = page_counter_read(&memcg->memory); 1157 limit = READ_ONCE(memcg->memory.limit); 1158 if (count < limit) 1159 margin = limit - count; 1160 1161 if (do_swap_account) { 1162 count = page_counter_read(&memcg->memsw); 1163 limit = READ_ONCE(memcg->memsw.limit); 1164 if (count <= limit) 1165 margin = min(margin, limit - count); 1166 } 1167 1168 return margin; 1169 } 1170 1171 /* 1172 * A routine for checking "mem" is under move_account() or not. 1173 * 1174 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1175 * moving cgroups. This is for waiting at high-memory pressure 1176 * caused by "move". 1177 */ 1178 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1179 { 1180 struct mem_cgroup *from; 1181 struct mem_cgroup *to; 1182 bool ret = false; 1183 /* 1184 * Unlike task_move routines, we access mc.to, mc.from not under 1185 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1186 */ 1187 spin_lock(&mc.lock); 1188 from = mc.from; 1189 to = mc.to; 1190 if (!from) 1191 goto unlock; 1192 1193 ret = mem_cgroup_is_descendant(from, memcg) || 1194 mem_cgroup_is_descendant(to, memcg); 1195 unlock: 1196 spin_unlock(&mc.lock); 1197 return ret; 1198 } 1199 1200 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1201 { 1202 if (mc.moving_task && current != mc.moving_task) { 1203 if (mem_cgroup_under_move(memcg)) { 1204 DEFINE_WAIT(wait); 1205 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1206 /* moving charge context might have finished. */ 1207 if (mc.moving_task) 1208 schedule(); 1209 finish_wait(&mc.waitq, &wait); 1210 return true; 1211 } 1212 } 1213 return false; 1214 } 1215 1216 #define K(x) ((x) << (PAGE_SHIFT-10)) 1217 /** 1218 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 
1219 * @memcg: The memory cgroup that went over limit 1220 * @p: Task that is going to be killed 1221 * 1222 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1223 * enabled 1224 */ 1225 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1226 { 1227 /* oom_info_lock ensures that parallel ooms do not interleave */ 1228 static DEFINE_MUTEX(oom_info_lock); 1229 struct mem_cgroup *iter; 1230 unsigned int i; 1231 1232 mutex_lock(&oom_info_lock); 1233 rcu_read_lock(); 1234 1235 if (p) { 1236 pr_info("Task in "); 1237 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1238 pr_cont(" killed as a result of limit of "); 1239 } else { 1240 pr_info("Memory limit reached of cgroup "); 1241 } 1242 1243 pr_cont_cgroup_path(memcg->css.cgroup); 1244 pr_cont("\n"); 1245 1246 rcu_read_unlock(); 1247 1248 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1249 K((u64)page_counter_read(&memcg->memory)), 1250 K((u64)memcg->memory.limit), memcg->memory.failcnt); 1251 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1252 K((u64)page_counter_read(&memcg->memsw)), 1253 K((u64)memcg->memsw.limit), memcg->memsw.failcnt); 1254 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1255 K((u64)page_counter_read(&memcg->kmem)), 1256 K((u64)memcg->kmem.limit), memcg->kmem.failcnt); 1257 1258 for_each_mem_cgroup_tree(iter, memcg) { 1259 pr_info("Memory cgroup stats for "); 1260 pr_cont_cgroup_path(iter->css.cgroup); 1261 pr_cont(":"); 1262 1263 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1264 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1265 continue; 1266 pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], 1267 K(mem_cgroup_read_stat(iter, i))); 1268 } 1269 1270 for (i = 0; i < NR_LRU_LISTS; i++) 1271 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1272 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1273 1274 pr_cont("\n"); 1275 } 1276 mutex_unlock(&oom_info_lock); 1277 } 1278 1279 /* 1280 * This function returns the number of memcg under hierarchy tree. Returns 1281 * 1(self count) if no children. 1282 */ 1283 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1284 { 1285 int num = 0; 1286 struct mem_cgroup *iter; 1287 1288 for_each_mem_cgroup_tree(iter, memcg) 1289 num++; 1290 return num; 1291 } 1292 1293 /* 1294 * Return the memory (and swap, if configured) limit for a memcg. 1295 */ 1296 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) 1297 { 1298 unsigned long limit; 1299 1300 limit = memcg->memory.limit; 1301 if (mem_cgroup_swappiness(memcg)) { 1302 unsigned long memsw_limit; 1303 1304 memsw_limit = memcg->memsw.limit; 1305 limit = min(limit + total_swap_pages, memsw_limit); 1306 } 1307 return limit; 1308 } 1309 1310 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1311 int order) 1312 { 1313 struct oom_control oc = { 1314 .zonelist = NULL, 1315 .nodemask = NULL, 1316 .gfp_mask = gfp_mask, 1317 .order = order, 1318 }; 1319 struct mem_cgroup *iter; 1320 unsigned long chosen_points = 0; 1321 unsigned long totalpages; 1322 unsigned int points = 0; 1323 struct task_struct *chosen = NULL; 1324 1325 mutex_lock(&oom_lock); 1326 1327 /* 1328 * If current has a pending SIGKILL or is exiting, then automatically 1329 * select it. The goal is to allow it to allocate so that it may 1330 * quickly exit and free its memory. 
1331 */ 1332 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1333 mark_oom_victim(current); 1334 goto unlock; 1335 } 1336 1337 check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); 1338 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1339 for_each_mem_cgroup_tree(iter, memcg) { 1340 struct css_task_iter it; 1341 struct task_struct *task; 1342 1343 css_task_iter_start(&iter->css, &it); 1344 while ((task = css_task_iter_next(&it))) { 1345 switch (oom_scan_process_thread(&oc, task, totalpages)) { 1346 case OOM_SCAN_SELECT: 1347 if (chosen) 1348 put_task_struct(chosen); 1349 chosen = task; 1350 chosen_points = ULONG_MAX; 1351 get_task_struct(chosen); 1352 /* fall through */ 1353 case OOM_SCAN_CONTINUE: 1354 continue; 1355 case OOM_SCAN_ABORT: 1356 css_task_iter_end(&it); 1357 mem_cgroup_iter_break(memcg, iter); 1358 if (chosen) 1359 put_task_struct(chosen); 1360 goto unlock; 1361 case OOM_SCAN_OK: 1362 break; 1363 }; 1364 points = oom_badness(task, memcg, NULL, totalpages); 1365 if (!points || points < chosen_points) 1366 continue; 1367 /* Prefer thread group leaders for display purposes */ 1368 if (points == chosen_points && 1369 thread_group_leader(chosen)) 1370 continue; 1371 1372 if (chosen) 1373 put_task_struct(chosen); 1374 chosen = task; 1375 chosen_points = points; 1376 get_task_struct(chosen); 1377 } 1378 css_task_iter_end(&it); 1379 } 1380 1381 if (chosen) { 1382 points = chosen_points * 1000 / totalpages; 1383 oom_kill_process(&oc, chosen, points, totalpages, memcg, 1384 "Memory cgroup out of memory"); 1385 } 1386 unlock: 1387 mutex_unlock(&oom_lock); 1388 } 1389 1390 #if MAX_NUMNODES > 1 1391 1392 /** 1393 * test_mem_cgroup_node_reclaimable 1394 * @memcg: the target memcg 1395 * @nid: the node ID to be checked. 1396 * @noswap : specify true here if the user wants flle only information. 1397 * 1398 * This function returns whether the specified memcg contains any 1399 * reclaimable pages on a node. Returns true if there are any reclaimable 1400 * pages in the node. 1401 */ 1402 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1403 int nid, bool noswap) 1404 { 1405 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1406 return true; 1407 if (noswap || !total_swap_pages) 1408 return false; 1409 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1410 return true; 1411 return false; 1412 1413 } 1414 1415 /* 1416 * Always updating the nodemask is not very good - even if we have an empty 1417 * list or the wrong list here, we can start from some node and traverse all 1418 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1419 * 1420 */ 1421 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1422 { 1423 int nid; 1424 /* 1425 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1426 * pagein/pageout changes since the last update. 1427 */ 1428 if (!atomic_read(&memcg->numainfo_events)) 1429 return; 1430 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1431 return; 1432 1433 /* make a nodemask where this memcg uses memory from */ 1434 memcg->scan_nodes = node_states[N_MEMORY]; 1435 1436 for_each_node_mask(nid, node_states[N_MEMORY]) { 1437 1438 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1439 node_clear(nid, memcg->scan_nodes); 1440 } 1441 1442 atomic_set(&memcg->numainfo_events, 0); 1443 atomic_set(&memcg->numainfo_updating, 0); 1444 } 1445 1446 /* 1447 * Selecting a node where we start reclaim from. 
 * Because all we need is to reduce the usage counter, starting from any node
 * is OK. Reclaiming from the current node has both pros and cons.
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or have used, so it may hurt that node's LRU behaviour. And if
 * several threads hit their limits, they will contend on that node. But
 * freeing from a remote node costs more for memory reclaim because of memory
 * latency.
 *
 * For now we use round-robin. A better algorithm is welcome.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

	node = next_node(node, memcg->scan_nodes);
	if (node == MAX_NUMNODES)
		node = first_node(memcg->scan_nodes);
	/*
	 * We call this when we hit the limit, not when pages are added to
	 * the LRU. The LRU lists may hold no pages, either because all pages
	 * are UNEVICTABLE or because the memcg is so small that none of its
	 * pages are on an LRU yet. In that case, use the current node.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	memcg->last_scanned_node = node;
	return node;
}
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
#endif

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   struct zone *zone,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = 0,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too much, so we won't
				 * reclaim excessively, yet not too little,
				 * so we won't keep coming back to reclaim
				 * from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
						     zone, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * This subtree of our hierarchy is already locked,
			 * so we cannot take the lock.
1561 */ 1562 failed = iter; 1563 mem_cgroup_iter_break(memcg, iter); 1564 break; 1565 } else 1566 iter->oom_lock = true; 1567 } 1568 1569 if (failed) { 1570 /* 1571 * OK, we failed to lock the whole subtree so we have 1572 * to clean up what we set up to the failing subtree 1573 */ 1574 for_each_mem_cgroup_tree(iter, memcg) { 1575 if (iter == failed) { 1576 mem_cgroup_iter_break(memcg, iter); 1577 break; 1578 } 1579 iter->oom_lock = false; 1580 } 1581 } else 1582 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1583 1584 spin_unlock(&memcg_oom_lock); 1585 1586 return !failed; 1587 } 1588 1589 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1590 { 1591 struct mem_cgroup *iter; 1592 1593 spin_lock(&memcg_oom_lock); 1594 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 1595 for_each_mem_cgroup_tree(iter, memcg) 1596 iter->oom_lock = false; 1597 spin_unlock(&memcg_oom_lock); 1598 } 1599 1600 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1601 { 1602 struct mem_cgroup *iter; 1603 1604 spin_lock(&memcg_oom_lock); 1605 for_each_mem_cgroup_tree(iter, memcg) 1606 iter->under_oom++; 1607 spin_unlock(&memcg_oom_lock); 1608 } 1609 1610 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1611 { 1612 struct mem_cgroup *iter; 1613 1614 /* 1615 * When a new child is created while the hierarchy is under oom, 1616 * mem_cgroup_oom_lock() may not be called. Watch for underflow. 1617 */ 1618 spin_lock(&memcg_oom_lock); 1619 for_each_mem_cgroup_tree(iter, memcg) 1620 if (iter->under_oom > 0) 1621 iter->under_oom--; 1622 spin_unlock(&memcg_oom_lock); 1623 } 1624 1625 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1626 1627 struct oom_wait_info { 1628 struct mem_cgroup *memcg; 1629 wait_queue_t wait; 1630 }; 1631 1632 static int memcg_oom_wake_function(wait_queue_t *wait, 1633 unsigned mode, int sync, void *arg) 1634 { 1635 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1636 struct mem_cgroup *oom_wait_memcg; 1637 struct oom_wait_info *oom_wait_info; 1638 1639 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1640 oom_wait_memcg = oom_wait_info->memcg; 1641 1642 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1643 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1644 return 0; 1645 return autoremove_wake_function(wait, mode, sync, arg); 1646 } 1647 1648 static void memcg_oom_recover(struct mem_cgroup *memcg) 1649 { 1650 /* 1651 * For the following lockless ->under_oom test, the only required 1652 * guarantee is that it must see the state asserted by an OOM when 1653 * this function is called as a result of userland actions 1654 * triggered by the notification of the OOM. This is trivially 1655 * achieved by invoking mem_cgroup_mark_under_oom() before 1656 * triggering notification. 1657 */ 1658 if (memcg && memcg->under_oom) 1659 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1660 } 1661 1662 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1663 { 1664 if (!current->memcg_oom.may_oom) 1665 return; 1666 /* 1667 * We are in the middle of the charge context here, so we 1668 * don't want to block when potentially sitting on a callstack 1669 * that holds all kinds of filesystem and mm locks. 1670 * 1671 * Also, the caller may handle a failed allocation gracefully 1672 * (like optional page cache readahead) and so an OOM killer 1673 * invocation might not even be necessary. 
 *
 * That's why we don't do anything here except remember the
 * OOM context and then deal with it at the end of the page
 * fault when the stack is unwound, the locks are released,
 * and when we know whether the fault was overall successful.
 */
	css_get(&memcg->css);
	current->memcg_oom.memcg = memcg;
	current->memcg_oom.gfp_mask = mask;
	current->memcg_oom.order = order;
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea; instead we remember an OOM state
 * in the task, and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_oom.memcg;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle || oom_killer_disabled)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	if (locked && !memcg->oom_kill_disable) {
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
					 current->memcg_oom.order);
	} else {
		schedule();
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}

	if (locked) {
		mem_cgroup_oom_unlock(memcg);
		/*
		 * There is no guarantee that an OOM-lock contender
		 * sees the wakeups triggered by the OOM kill
		 * uncharges.  Wake any sleepers explicitly.
		 */
		memcg_oom_recover(memcg);
	}
cleanup:
	current->memcg_oom.memcg = NULL;
	css_put(&memcg->css);
	return true;
}

/**
 * mem_cgroup_begin_page_stat - begin a page state statistics transaction
 * @page: page that is going to change accounted state
 *
 * This function must mark the beginning of an accounted page state
 * change to prevent double accounting when the page is concurrently
 * being moved to another memcg:
 *
 *   memcg = mem_cgroup_begin_page_stat(page);
 *   if (TestClearPageState(page))
 *     mem_cgroup_update_page_stat(memcg, state, -1);
 *   mem_cgroup_end_page_stat(memcg);
 */
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;

	/*
	 * The RCU lock is held throughout the transaction.
The fast 1776 * path can get away without acquiring the memcg->move_lock 1777 * because page moving starts with an RCU grace period. 1778 * 1779 * The RCU lock also protects the memcg from being freed when 1780 * the page state that is going to change is the only thing 1781 * preventing the page from being uncharged. 1782 * E.g. end-writeback clearing PageWriteback(), which allows 1783 * migration to go ahead and uncharge the page before the 1784 * account transaction might be complete. 1785 */ 1786 rcu_read_lock(); 1787 1788 if (mem_cgroup_disabled()) 1789 return NULL; 1790 again: 1791 memcg = page->mem_cgroup; 1792 if (unlikely(!memcg)) 1793 return NULL; 1794 1795 if (atomic_read(&memcg->moving_account) <= 0) 1796 return memcg; 1797 1798 spin_lock_irqsave(&memcg->move_lock, flags); 1799 if (memcg != page->mem_cgroup) { 1800 spin_unlock_irqrestore(&memcg->move_lock, flags); 1801 goto again; 1802 } 1803 1804 /* 1805 * When charge migration first begins, we can have locked and 1806 * unlocked page stat updates happening concurrently. Track 1807 * the task who has the lock for mem_cgroup_end_page_stat(). 1808 */ 1809 memcg->move_lock_task = current; 1810 memcg->move_lock_flags = flags; 1811 1812 return memcg; 1813 } 1814 EXPORT_SYMBOL(mem_cgroup_begin_page_stat); 1815 1816 /** 1817 * mem_cgroup_end_page_stat - finish a page state statistics transaction 1818 * @memcg: the memcg that was accounted against 1819 */ 1820 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) 1821 { 1822 if (memcg && memcg->move_lock_task == current) { 1823 unsigned long flags = memcg->move_lock_flags; 1824 1825 memcg->move_lock_task = NULL; 1826 memcg->move_lock_flags = 0; 1827 1828 spin_unlock_irqrestore(&memcg->move_lock, flags); 1829 } 1830 1831 rcu_read_unlock(); 1832 } 1833 EXPORT_SYMBOL(mem_cgroup_end_page_stat); 1834 1835 /* 1836 * size of first charge trial. "32" comes from vmscan.c's magic value. 1837 * TODO: maybe necessary to use big numbers in big irons. 1838 */ 1839 #define CHARGE_BATCH 32U 1840 struct memcg_stock_pcp { 1841 struct mem_cgroup *cached; /* this never be root cgroup */ 1842 unsigned int nr_pages; 1843 struct work_struct work; 1844 unsigned long flags; 1845 #define FLUSHING_CACHED_CHARGE 0 1846 }; 1847 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1848 static DEFINE_MUTEX(percpu_charge_mutex); 1849 1850 /** 1851 * consume_stock: Try to consume stocked charge on this cpu. 1852 * @memcg: memcg to consume from. 1853 * @nr_pages: how many pages to charge. 1854 * 1855 * The charges will only happen if @memcg matches the current cpu's memcg 1856 * stock, and at least @nr_pages are available in that stock. Failure to 1857 * service an allocation will refill the stock. 1858 * 1859 * returns true if successful, false otherwise. 1860 */ 1861 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 1862 { 1863 struct memcg_stock_pcp *stock; 1864 bool ret = false; 1865 1866 if (nr_pages > CHARGE_BATCH) 1867 return ret; 1868 1869 stock = &get_cpu_var(memcg_stock); 1870 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 1871 stock->nr_pages -= nr_pages; 1872 ret = true; 1873 } 1874 put_cpu_var(memcg_stock); 1875 return ret; 1876 } 1877 1878 /* 1879 * Returns stocks cached in percpu and reset cached information. 
1880 */ 1881 static void drain_stock(struct memcg_stock_pcp *stock) 1882 { 1883 struct mem_cgroup *old = stock->cached; 1884 1885 if (stock->nr_pages) { 1886 page_counter_uncharge(&old->memory, stock->nr_pages); 1887 if (do_swap_account) 1888 page_counter_uncharge(&old->memsw, stock->nr_pages); 1889 css_put_many(&old->css, stock->nr_pages); 1890 stock->nr_pages = 0; 1891 } 1892 stock->cached = NULL; 1893 } 1894 1895 /* 1896 * This must be called under preempt disabled or must be called by 1897 * a thread which is pinned to local cpu. 1898 */ 1899 static void drain_local_stock(struct work_struct *dummy) 1900 { 1901 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); 1902 drain_stock(stock); 1903 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 1904 } 1905 1906 /* 1907 * Cache charges(val) to local per_cpu area. 1908 * This will be consumed by consume_stock() function, later. 1909 */ 1910 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 1911 { 1912 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 1913 1914 if (stock->cached != memcg) { /* reset if necessary */ 1915 drain_stock(stock); 1916 stock->cached = memcg; 1917 } 1918 stock->nr_pages += nr_pages; 1919 put_cpu_var(memcg_stock); 1920 } 1921 1922 /* 1923 * Drains all per-CPU charge caches for given root_memcg resp. subtree 1924 * of the hierarchy under it. 1925 */ 1926 static void drain_all_stock(struct mem_cgroup *root_memcg) 1927 { 1928 int cpu, curcpu; 1929 1930 /* If someone's already draining, avoid adding running more workers. */ 1931 if (!mutex_trylock(&percpu_charge_mutex)) 1932 return; 1933 /* Notify other cpus that system-wide "drain" is running */ 1934 get_online_cpus(); 1935 curcpu = get_cpu(); 1936 for_each_online_cpu(cpu) { 1937 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 1938 struct mem_cgroup *memcg; 1939 1940 memcg = stock->cached; 1941 if (!memcg || !stock->nr_pages) 1942 continue; 1943 if (!mem_cgroup_is_descendant(memcg, root_memcg)) 1944 continue; 1945 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 1946 if (cpu == curcpu) 1947 drain_local_stock(&stock->work); 1948 else 1949 schedule_work_on(cpu, &stock->work); 1950 } 1951 } 1952 put_cpu(); 1953 put_online_cpus(); 1954 mutex_unlock(&percpu_charge_mutex); 1955 } 1956 1957 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 1958 unsigned long action, 1959 void *hcpu) 1960 { 1961 int cpu = (unsigned long)hcpu; 1962 struct memcg_stock_pcp *stock; 1963 1964 if (action == CPU_ONLINE) 1965 return NOTIFY_OK; 1966 1967 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 1968 return NOTIFY_OK; 1969 1970 stock = &per_cpu(memcg_stock, cpu); 1971 drain_stock(stock); 1972 return NOTIFY_OK; 1973 } 1974 1975 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 1976 unsigned int nr_pages) 1977 { 1978 unsigned int batch = max(CHARGE_BATCH, nr_pages); 1979 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1980 struct mem_cgroup *mem_over_limit; 1981 struct page_counter *counter; 1982 unsigned long nr_reclaimed; 1983 bool may_swap = true; 1984 bool drained = false; 1985 int ret = 0; 1986 1987 if (mem_cgroup_is_root(memcg)) 1988 goto done; 1989 retry: 1990 if (consume_stock(memcg, nr_pages)) 1991 goto done; 1992 1993 if (!do_swap_account || 1994 !page_counter_try_charge(&memcg->memsw, batch, &counter)) { 1995 if (!page_counter_try_charge(&memcg->memory, batch, &counter)) 1996 goto done_restock; 1997 if (do_swap_account) 1998 page_counter_uncharge(&memcg->memsw, batch); 1999 mem_over_limit = 
mem_cgroup_from_counter(counter, memory); 2000 } else { 2001 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2002 may_swap = false; 2003 } 2004 2005 if (batch > nr_pages) { 2006 batch = nr_pages; 2007 goto retry; 2008 } 2009 2010 /* 2011 * Unlike in global OOM situations, memcg is not in a physical 2012 * memory shortage. Allow dying and OOM-killed tasks to 2013 * bypass the last charges so that they can exit quickly and 2014 * free their memory. 2015 */ 2016 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2017 fatal_signal_pending(current) || 2018 current->flags & PF_EXITING)) 2019 goto bypass; 2020 2021 if (unlikely(task_in_memcg_oom(current))) 2022 goto nomem; 2023 2024 if (!(gfp_mask & __GFP_WAIT)) 2025 goto nomem; 2026 2027 mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); 2028 2029 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2030 gfp_mask, may_swap); 2031 2032 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2033 goto retry; 2034 2035 if (!drained) { 2036 drain_all_stock(mem_over_limit); 2037 drained = true; 2038 goto retry; 2039 } 2040 2041 if (gfp_mask & __GFP_NORETRY) 2042 goto nomem; 2043 /* 2044 * Even though the limit is exceeded at this point, reclaim 2045 * may have been able to free some pages. Retry the charge 2046 * before killing the task. 2047 * 2048 * Only for regular pages, though: huge pages are rather 2049 * unlikely to succeed so close to the limit, and we fall back 2050 * to regular pages anyway in case of failure. 2051 */ 2052 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2053 goto retry; 2054 /* 2055 * At task move, charge accounts can be doubly counted. So, it's 2056 * better to wait until the end of task_move if something is going on. 2057 */ 2058 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2059 goto retry; 2060 2061 if (nr_retries--) 2062 goto retry; 2063 2064 if (gfp_mask & __GFP_NOFAIL) 2065 goto bypass; 2066 2067 if (fatal_signal_pending(current)) 2068 goto bypass; 2069 2070 mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); 2071 2072 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2073 nomem: 2074 if (!(gfp_mask & __GFP_NOFAIL)) 2075 return -ENOMEM; 2076 bypass: 2077 return -EINTR; 2078 2079 done_restock: 2080 css_get_many(&memcg->css, batch); 2081 if (batch > nr_pages) 2082 refill_stock(memcg, batch - nr_pages); 2083 if (!(gfp_mask & __GFP_WAIT)) 2084 goto done; 2085 /* 2086 * If the hierarchy is above the normal consumption range, 2087 * make the charging task trim their excess contribution. 
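* The loop below walks from the charged memcg up through its ancestors and,
* for every level whose usage is still above its 'high' threshold, counts a
* MEMCG_HIGH event and reclaims nr_pages from that level.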
2088 */ 2089 do { 2090 if (page_counter_read(&memcg->memory) <= memcg->high) 2091 continue; 2092 mem_cgroup_events(memcg, MEMCG_HIGH, 1); 2093 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); 2094 } while ((memcg = parent_mem_cgroup(memcg))); 2095 done: 2096 return ret; 2097 } 2098 2099 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2100 { 2101 if (mem_cgroup_is_root(memcg)) 2102 return; 2103 2104 page_counter_uncharge(&memcg->memory, nr_pages); 2105 if (do_swap_account) 2106 page_counter_uncharge(&memcg->memsw, nr_pages); 2107 2108 css_put_many(&memcg->css, nr_pages); 2109 } 2110 2111 static void lock_page_lru(struct page *page, int *isolated) 2112 { 2113 struct zone *zone = page_zone(page); 2114 2115 spin_lock_irq(&zone->lru_lock); 2116 if (PageLRU(page)) { 2117 struct lruvec *lruvec; 2118 2119 lruvec = mem_cgroup_page_lruvec(page, zone); 2120 ClearPageLRU(page); 2121 del_page_from_lru_list(page, lruvec, page_lru(page)); 2122 *isolated = 1; 2123 } else 2124 *isolated = 0; 2125 } 2126 2127 static void unlock_page_lru(struct page *page, int isolated) 2128 { 2129 struct zone *zone = page_zone(page); 2130 2131 if (isolated) { 2132 struct lruvec *lruvec; 2133 2134 lruvec = mem_cgroup_page_lruvec(page, zone); 2135 VM_BUG_ON_PAGE(PageLRU(page), page); 2136 SetPageLRU(page); 2137 add_page_to_lru_list(page, lruvec, page_lru(page)); 2138 } 2139 spin_unlock_irq(&zone->lru_lock); 2140 } 2141 2142 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2143 bool lrucare) 2144 { 2145 int isolated; 2146 2147 VM_BUG_ON_PAGE(page->mem_cgroup, page); 2148 2149 /* 2150 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2151 * may already be on some other mem_cgroup's LRU. Take care of it. 2152 */ 2153 if (lrucare) 2154 lock_page_lru(page, &isolated); 2155 2156 /* 2157 * Nobody should be changing or seriously looking at 2158 * page->mem_cgroup at this point: 2159 * 2160 * - the page is uncharged 2161 * 2162 * - the page is off-LRU 2163 * 2164 * - an anonymous fault has exclusive page access, except for 2165 * a locked page table 2166 * 2167 * - a page cache insertion, a swapin fault, or a migration 2168 * have the page locked 2169 */ 2170 page->mem_cgroup = memcg; 2171 2172 if (lrucare) 2173 unlock_page_lru(page, isolated); 2174 } 2175 2176 #ifdef CONFIG_MEMCG_KMEM 2177 int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, 2178 unsigned long nr_pages) 2179 { 2180 struct page_counter *counter; 2181 int ret = 0; 2182 2183 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); 2184 if (ret < 0) 2185 return ret; 2186 2187 ret = try_charge(memcg, gfp, nr_pages); 2188 if (ret == -EINTR) { 2189 /* 2190 * try_charge() chose to bypass to root due to OOM kill or 2191 * fatal signal. Since our only options are to either fail 2192 * the allocation or charge it to this cgroup, do it as a 2193 * temporary condition. But we can't fail. From a kmem/slab 2194 * perspective, the cache has already been selected, by 2195 * mem_cgroup_kmem_get_cache(), so it is too late to change 2196 * our minds. 2197 * 2198 * This condition will only trigger if the task entered 2199 * memcg_charge_kmem in a sane state, but was OOM-killed 2200 * during try_charge() above. 
Tasks that were already dying 2201 * when the allocation triggers should have been already 2202 * directed to the root cgroup in memcontrol.h 2203 */ 2204 page_counter_charge(&memcg->memory, nr_pages); 2205 if (do_swap_account) 2206 page_counter_charge(&memcg->memsw, nr_pages); 2207 css_get_many(&memcg->css, nr_pages); 2208 ret = 0; 2209 } else if (ret) 2210 page_counter_uncharge(&memcg->kmem, nr_pages); 2211 2212 return ret; 2213 } 2214 2215 void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) 2216 { 2217 page_counter_uncharge(&memcg->memory, nr_pages); 2218 if (do_swap_account) 2219 page_counter_uncharge(&memcg->memsw, nr_pages); 2220 2221 page_counter_uncharge(&memcg->kmem, nr_pages); 2222 2223 css_put_many(&memcg->css, nr_pages); 2224 } 2225 2226 static int memcg_alloc_cache_id(void) 2227 { 2228 int id, size; 2229 int err; 2230 2231 id = ida_simple_get(&memcg_cache_ida, 2232 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2233 if (id < 0) 2234 return id; 2235 2236 if (id < memcg_nr_cache_ids) 2237 return id; 2238 2239 /* 2240 * There's no space for the new id in memcg_caches arrays, 2241 * so we have to grow them. 2242 */ 2243 down_write(&memcg_cache_ids_sem); 2244 2245 size = 2 * (id + 1); 2246 if (size < MEMCG_CACHES_MIN_SIZE) 2247 size = MEMCG_CACHES_MIN_SIZE; 2248 else if (size > MEMCG_CACHES_MAX_SIZE) 2249 size = MEMCG_CACHES_MAX_SIZE; 2250 2251 err = memcg_update_all_caches(size); 2252 if (!err) 2253 err = memcg_update_all_list_lrus(size); 2254 if (!err) 2255 memcg_nr_cache_ids = size; 2256 2257 up_write(&memcg_cache_ids_sem); 2258 2259 if (err) { 2260 ida_simple_remove(&memcg_cache_ida, id); 2261 return err; 2262 } 2263 return id; 2264 } 2265 2266 static void memcg_free_cache_id(int id) 2267 { 2268 ida_simple_remove(&memcg_cache_ida, id); 2269 } 2270 2271 struct memcg_kmem_cache_create_work { 2272 struct mem_cgroup *memcg; 2273 struct kmem_cache *cachep; 2274 struct work_struct work; 2275 }; 2276 2277 static void memcg_kmem_cache_create_func(struct work_struct *w) 2278 { 2279 struct memcg_kmem_cache_create_work *cw = 2280 container_of(w, struct memcg_kmem_cache_create_work, work); 2281 struct mem_cgroup *memcg = cw->memcg; 2282 struct kmem_cache *cachep = cw->cachep; 2283 2284 memcg_create_kmem_cache(memcg, cachep); 2285 2286 css_put(&memcg->css); 2287 kfree(cw); 2288 } 2289 2290 /* 2291 * Enqueue the creation of a per-memcg kmem_cache. 2292 */ 2293 static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2294 struct kmem_cache *cachep) 2295 { 2296 struct memcg_kmem_cache_create_work *cw; 2297 2298 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2299 if (!cw) 2300 return; 2301 2302 css_get(&memcg->css); 2303 2304 cw->memcg = memcg; 2305 cw->cachep = cachep; 2306 INIT_WORK(&cw->work, memcg_kmem_cache_create_func); 2307 2308 schedule_work(&cw->work); 2309 } 2310 2311 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2312 struct kmem_cache *cachep) 2313 { 2314 /* 2315 * We need to stop accounting when we kmalloc, because if the 2316 * corresponding kmalloc cache is not yet created, the first allocation 2317 * in __memcg_schedule_kmem_cache_create will recurse. 2318 * 2319 * However, it is better to enclose the whole function. Depending on 2320 * the debugging options enabled, INIT_WORK(), for instance, can 2321 * trigger an allocation. This too, will make us recurse. Because at 2322 * this point we can't allow ourselves back into memcg_kmem_get_cache, 2323 * the safest choice is to do it like this, wrapping the whole function. 
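* The memcg_kmem_skip_account flag set below is what __memcg_kmem_get_cache()
* checks to bail out early, so any allocation made while the flag is set is
* served from the root cache instead of recursing back in here.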
2324 */ 2325 current->memcg_kmem_skip_account = 1; 2326 __memcg_schedule_kmem_cache_create(memcg, cachep); 2327 current->memcg_kmem_skip_account = 0; 2328 } 2329 2330 /* 2331 * Return the kmem_cache we're supposed to use for a slab allocation. 2332 * We try to use the current memcg's version of the cache. 2333 * 2334 * If the cache does not exist yet, i.e. we are the first user of it, 2335 * we either create it immediately, if possible, or create it asynchronously 2336 * in a workqueue. 2337 * In the latter case, we will let the current allocation go through with 2338 * the original cache. 2339 * 2340 * Can't be called in interrupt context or from kernel threads. 2341 * This function needs to be called with rcu_read_lock() held. 2342 */ 2343 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) 2344 { 2345 struct mem_cgroup *memcg; 2346 struct kmem_cache *memcg_cachep; 2347 int kmemcg_id; 2348 2349 VM_BUG_ON(!is_root_cache(cachep)); 2350 2351 if (current->memcg_kmem_skip_account) 2352 return cachep; 2353 2354 memcg = get_mem_cgroup_from_mm(current->mm); 2355 kmemcg_id = READ_ONCE(memcg->kmemcg_id); 2356 if (kmemcg_id < 0) 2357 goto out; 2358 2359 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); 2360 if (likely(memcg_cachep)) 2361 return memcg_cachep; 2362 2363 /* 2364 * If we are in a safe context (can wait, and not in interrupt 2365 * context), we could be predictable and return right away. 2366 * This would guarantee that the allocation being performed 2367 * already belongs in the new cache. 2368 * 2369 * However, there are some clashes that can arise from locking. 2370 * For instance, because we acquire the slab_mutex while doing 2371 * memcg_create_kmem_cache, this means no further allocation 2372 * could happen with the slab_mutex held. So it's better to 2373 * defer everything. 2374 */ 2375 memcg_schedule_kmem_cache_create(memcg, cachep); 2376 out: 2377 css_put(&memcg->css); 2378 return cachep; 2379 } 2380 2381 void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2382 { 2383 if (!is_root_cache(cachep)) 2384 css_put(&cachep->memcg_params.memcg->css); 2385 } 2386 2387 /* 2388 * We need to verify if the allocation against current->mm->owner's memcg is 2389 * possible for the given order. But the page is not allocated yet, so we'll 2390 * need a further commit step to do the final arrangements. 2391 * 2392 * It is possible for the task to switch cgroups in the meantime, so at 2393 * commit time, we can't rely on task conversion any longer. We'll then use 2394 * the handle argument to return to the caller which cgroup we should commit 2395 * against. We could also return the memcg directly and avoid the pointer 2396 * passing, but a boolean return value gives better semantics considering 2397 * the compiled-out case as well. 2398 * 2399 * Returning true means the allocation is possible. 2400 */ 2401 bool 2402 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 2403 { 2404 struct mem_cgroup *memcg; 2405 int ret; 2406 2407 *_memcg = NULL; 2408 2409 memcg = get_mem_cgroup_from_mm(current->mm); 2410 2411 if (!memcg_kmem_is_active(memcg)) { 2412 css_put(&memcg->css); 2413 return true; 2414 } 2415 2416 ret = memcg_charge_kmem(memcg, gfp, 1 << order); 2417 if (!ret) 2418 *_memcg = memcg; 2419 2420 css_put(&memcg->css); 2421 return (ret == 0); 2422 } 2423 2424 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2425 int order) 2426 { 2427 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2428 2429 /* The page allocation failed.
Revert */ 2430 if (!page) { 2431 memcg_uncharge_kmem(memcg, 1 << order); 2432 return; 2433 } 2434 page->mem_cgroup = memcg; 2435 } 2436 2437 void __memcg_kmem_uncharge_pages(struct page *page, int order) 2438 { 2439 struct mem_cgroup *memcg = page->mem_cgroup; 2440 2441 if (!memcg) 2442 return; 2443 2444 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2445 2446 memcg_uncharge_kmem(memcg, 1 << order); 2447 page->mem_cgroup = NULL; 2448 } 2449 2450 struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) 2451 { 2452 struct mem_cgroup *memcg = NULL; 2453 struct kmem_cache *cachep; 2454 struct page *page; 2455 2456 page = virt_to_head_page(ptr); 2457 if (PageSlab(page)) { 2458 cachep = page->slab_cache; 2459 if (!is_root_cache(cachep)) 2460 memcg = cachep->memcg_params.memcg; 2461 } else 2462 /* page allocated by alloc_kmem_pages */ 2463 memcg = page->mem_cgroup; 2464 2465 return memcg; 2466 } 2467 #endif /* CONFIG_MEMCG_KMEM */ 2468 2469 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2470 2471 /* 2472 * Because tail pages are not charged individually, set their memcg from 2473 * the head page. We're under zone->lru_lock, 'splitting on pmd' and compound_lock. 2474 * charge/uncharge will never happen and move_account() is done under 2475 * compound_lock(), so we don't have to take care of races. 2476 */ 2477 void mem_cgroup_split_huge_fixup(struct page *head) 2478 { 2479 int i; 2480 2481 if (mem_cgroup_disabled()) 2482 return; 2483 2484 for (i = 1; i < HPAGE_PMD_NR; i++) 2485 head[i].mem_cgroup = head->mem_cgroup; 2486 2487 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 2488 HPAGE_PMD_NR); 2489 } 2490 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2491 2492 #ifdef CONFIG_MEMCG_SWAP 2493 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 2494 bool charge) 2495 { 2496 int val = (charge) ? 1 : -1; 2497 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 2498 } 2499 2500 /** 2501 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2502 * @entry: swap entry to be moved 2503 * @from: mem_cgroup which the entry is moved from 2504 * @to: mem_cgroup which the entry is moved to 2505 * 2506 * It succeeds only when the swap_cgroup's record for this entry is the same 2507 * as the mem_cgroup's id of @from. 2508 * 2509 * Returns 0 on success, -EINVAL on failure. 2510 * 2511 * The caller must have charged to @to, IOW, called page_counter_charge() on 2512 * both res and memsw, and called css_get(). 2513 */ 2514 static int mem_cgroup_move_swap_account(swp_entry_t entry, 2515 struct mem_cgroup *from, struct mem_cgroup *to) 2516 { 2517 unsigned short old_id, new_id; 2518 2519 old_id = mem_cgroup_id(from); 2520 new_id = mem_cgroup_id(to); 2521 2522 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2523 mem_cgroup_swap_statistics(from, false); 2524 mem_cgroup_swap_statistics(to, true); 2525 return 0; 2526 } 2527 return -EINVAL; 2528 } 2529 #else 2530 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2531 struct mem_cgroup *from, struct mem_cgroup *to) 2532 { 2533 return -EINVAL; 2534 } 2535 #endif 2536 2537 static DEFINE_MUTEX(memcg_limit_mutex); 2538 2539 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2540 unsigned long limit) 2541 { 2542 unsigned long curusage; 2543 unsigned long oldusage; 2544 bool enlarge = false; 2545 int retry_count; 2546 int ret; 2547 2548 /* 2549 * To keep hierarchical_reclaim simple, how long we should retry 2550 * depends on the caller.
We set our retry-count to be a function 2551 * of the number of children which we should visit in this loop. 2552 */ 2553 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 2554 mem_cgroup_count_children(memcg); 2555 2556 oldusage = page_counter_read(&memcg->memory); 2557 2558 do { 2559 if (signal_pending(current)) { 2560 ret = -EINTR; 2561 break; 2562 } 2563 2564 mutex_lock(&memcg_limit_mutex); 2565 if (limit > memcg->memsw.limit) { 2566 mutex_unlock(&memcg_limit_mutex); 2567 ret = -EINVAL; 2568 break; 2569 } 2570 if (limit > memcg->memory.limit) 2571 enlarge = true; 2572 ret = page_counter_limit(&memcg->memory, limit); 2573 mutex_unlock(&memcg_limit_mutex); 2574 2575 if (!ret) 2576 break; 2577 2578 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 2579 2580 curusage = page_counter_read(&memcg->memory); 2581 /* Was usage reduced? */ 2582 if (curusage >= oldusage) 2583 retry_count--; 2584 else 2585 oldusage = curusage; 2586 } while (retry_count); 2587 2588 if (!ret && enlarge) 2589 memcg_oom_recover(memcg); 2590 2591 return ret; 2592 } 2593 2594 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2595 unsigned long limit) 2596 { 2597 unsigned long curusage; 2598 unsigned long oldusage; 2599 bool enlarge = false; 2600 int retry_count; 2601 int ret; 2602 2603 /* see mem_cgroup_resize_limit() */ 2604 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 2605 mem_cgroup_count_children(memcg); 2606 2607 oldusage = page_counter_read(&memcg->memsw); 2608 2609 do { 2610 if (signal_pending(current)) { 2611 ret = -EINTR; 2612 break; 2613 } 2614 2615 mutex_lock(&memcg_limit_mutex); 2616 if (limit < memcg->memory.limit) { 2617 mutex_unlock(&memcg_limit_mutex); 2618 ret = -EINVAL; 2619 break; 2620 } 2621 if (limit > memcg->memsw.limit) 2622 enlarge = true; 2623 ret = page_counter_limit(&memcg->memsw, limit); 2624 mutex_unlock(&memcg_limit_mutex); 2625 2626 if (!ret) 2627 break; 2628 2629 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 2630 2631 curusage = page_counter_read(&memcg->memsw); 2632 /* Was usage reduced?
*/ 2633 if (curusage >= oldusage) 2634 retry_count--; 2635 else 2636 oldusage = curusage; 2637 } while (retry_count); 2638 2639 if (!ret && enlarge) 2640 memcg_oom_recover(memcg); 2641 2642 return ret; 2643 } 2644 2645 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2646 gfp_t gfp_mask, 2647 unsigned long *total_scanned) 2648 { 2649 unsigned long nr_reclaimed = 0; 2650 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2651 unsigned long reclaimed; 2652 int loop = 0; 2653 struct mem_cgroup_tree_per_zone *mctz; 2654 unsigned long excess; 2655 unsigned long nr_scanned; 2656 2657 if (order > 0) 2658 return 0; 2659 2660 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 2661 /* 2662 * This loop can run for a while, especially if memcgs continuously 2663 * keep exceeding their soft limit and putting the system under 2664 * pressure. 2665 */ 2666 do { 2667 if (next_mz) 2668 mz = next_mz; 2669 else 2670 mz = mem_cgroup_largest_soft_limit_node(mctz); 2671 if (!mz) 2672 break; 2673 2674 nr_scanned = 0; 2675 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 2676 gfp_mask, &nr_scanned); 2677 nr_reclaimed += reclaimed; 2678 *total_scanned += nr_scanned; 2679 spin_lock_irq(&mctz->lock); 2680 __mem_cgroup_remove_exceeded(mz, mctz); 2681 2682 /* 2683 * If we failed to reclaim anything from this memory cgroup 2684 * it is time to move on to the next cgroup 2685 */ 2686 next_mz = NULL; 2687 if (!reclaimed) 2688 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 2689 2690 excess = soft_limit_excess(mz->memcg); 2691 /* 2692 * One school of thought says that we should not add 2693 * back the node to the tree if reclaim returns 0. 2694 * But our reclaim could return 0, simply because due 2695 * to priority we are exposing a smaller subset of 2696 * memory to reclaim from. Consider this as a longer 2697 * term TODO. 2698 */ 2699 /* If excess == 0, no tree ops */ 2700 __mem_cgroup_insert_exceeded(mz, mctz, excess); 2701 spin_unlock_irq(&mctz->lock); 2702 css_put(&mz->memcg->css); 2703 loop++; 2704 /* 2705 * Could not reclaim anything and there are no more 2706 * mem cgroups to try or we seem to be looping without 2707 * reclaiming anything. 2708 */ 2709 if (!nr_reclaimed && 2710 (next_mz == NULL || 2711 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2712 break; 2713 } while (!nr_reclaimed); 2714 if (next_mz) 2715 css_put(&next_mz->memcg->css); 2716 return nr_reclaimed; 2717 } 2718 2719 /* 2720 * Test whether @memcg has children, dead or alive. Note that this 2721 * function doesn't care whether @memcg has use_hierarchy enabled and 2722 * returns %true if there are child csses according to the cgroup 2723 * hierarchy. Testing use_hierarchy is the caller's responsibility. 2724 */ 2725 static inline bool memcg_has_children(struct mem_cgroup *memcg) 2726 { 2727 bool ret; 2728 2729 /* 2730 * The lock does not prevent addition or deletion of children, but 2731 * it prevents a new child from being initialized based on this 2732 * parent in css_online(), so it's enough to decide whether 2733 * hierarchically inherited attributes can still be changed or not. 2734 */ 2735 lockdep_assert_held(&memcg_create_mutex); 2736 2737 rcu_read_lock(); 2738 ret = css_next_child(NULL, &memcg->css); 2739 rcu_read_unlock(); 2740 return ret; 2741 } 2742 2743 /* 2744 * Reclaims as many pages from the given memcg as possible and moves 2745 * the rest to the parent. 2746 * 2747 * Caller is responsible for holding css reference for memcg.
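* Returns 0 once the cgroup's memory counter reaches zero or the retries are
* exhausted, and -EINTR if a pending signal interrupts the loop.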
2748 */ 2749 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 2750 { 2751 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2752 2753 /* we call try-to-free pages to make this cgroup empty */ 2754 lru_add_drain_all(); 2755 /* try to free all pages in this cgroup */ 2756 while (nr_retries && page_counter_read(&memcg->memory)) { 2757 int progress; 2758 2759 if (signal_pending(current)) 2760 return -EINTR; 2761 2762 progress = try_to_free_mem_cgroup_pages(memcg, 1, 2763 GFP_KERNEL, true); 2764 if (!progress) { 2765 nr_retries--; 2766 /* maybe some writeback is necessary */ 2767 congestion_wait(BLK_RW_ASYNC, HZ/10); 2768 } 2769 2770 } 2771 2772 return 0; 2773 } 2774 2775 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 2776 char *buf, size_t nbytes, 2777 loff_t off) 2778 { 2779 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 2780 2781 if (mem_cgroup_is_root(memcg)) 2782 return -EINVAL; 2783 return mem_cgroup_force_empty(memcg) ?: nbytes; 2784 } 2785 2786 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 2787 struct cftype *cft) 2788 { 2789 return mem_cgroup_from_css(css)->use_hierarchy; 2790 } 2791 2792 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 2793 struct cftype *cft, u64 val) 2794 { 2795 int retval = 0; 2796 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2797 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 2798 2799 mutex_lock(&memcg_create_mutex); 2800 2801 if (memcg->use_hierarchy == val) 2802 goto out; 2803 2804 /* 2805 * If parent's use_hierarchy is set, we can't make any modifications 2806 * in the child subtrees. If it is unset, then the change can 2807 * occur, provided the current cgroup has no children. 2808 * 2809 * For the root cgroup, parent_memcg is NULL; we allow the value to be 2810 * set if there are no children.
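* Only 0 and 1 are accepted: any other value, or an attempt to change the
* setting while the parent has use_hierarchy enabled, fails with -EINVAL,
* and a cgroup that already has children fails with -EBUSY.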
2811 */ 2812 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 2813 (val == 1 || val == 0)) { 2814 if (!memcg_has_children(memcg)) 2815 memcg->use_hierarchy = val; 2816 else 2817 retval = -EBUSY; 2818 } else 2819 retval = -EINVAL; 2820 2821 out: 2822 mutex_unlock(&memcg_create_mutex); 2823 2824 return retval; 2825 } 2826 2827 static unsigned long tree_stat(struct mem_cgroup *memcg, 2828 enum mem_cgroup_stat_index idx) 2829 { 2830 struct mem_cgroup *iter; 2831 unsigned long val = 0; 2832 2833 for_each_mem_cgroup_tree(iter, memcg) 2834 val += mem_cgroup_read_stat(iter, idx); 2835 2836 return val; 2837 } 2838 2839 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 2840 { 2841 u64 val; 2842 2843 if (mem_cgroup_is_root(memcg)) { 2844 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); 2845 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); 2846 if (swap) 2847 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); 2848 } else { 2849 if (!swap) 2850 val = page_counter_read(&memcg->memory); 2851 else 2852 val = page_counter_read(&memcg->memsw); 2853 } 2854 return val << PAGE_SHIFT; 2855 } 2856 2857 enum { 2858 RES_USAGE, 2859 RES_LIMIT, 2860 RES_MAX_USAGE, 2861 RES_FAILCNT, 2862 RES_SOFT_LIMIT, 2863 }; 2864 2865 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 2866 struct cftype *cft) 2867 { 2868 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2869 struct page_counter *counter; 2870 2871 switch (MEMFILE_TYPE(cft->private)) { 2872 case _MEM: 2873 counter = &memcg->memory; 2874 break; 2875 case _MEMSWAP: 2876 counter = &memcg->memsw; 2877 break; 2878 case _KMEM: 2879 counter = &memcg->kmem; 2880 break; 2881 default: 2882 BUG(); 2883 } 2884 2885 switch (MEMFILE_ATTR(cft->private)) { 2886 case RES_USAGE: 2887 if (counter == &memcg->memory) 2888 return mem_cgroup_usage(memcg, false); 2889 if (counter == &memcg->memsw) 2890 return mem_cgroup_usage(memcg, true); 2891 return (u64)page_counter_read(counter) * PAGE_SIZE; 2892 case RES_LIMIT: 2893 return (u64)counter->limit * PAGE_SIZE; 2894 case RES_MAX_USAGE: 2895 return (u64)counter->watermark * PAGE_SIZE; 2896 case RES_FAILCNT: 2897 return counter->failcnt; 2898 case RES_SOFT_LIMIT: 2899 return (u64)memcg->soft_limit * PAGE_SIZE; 2900 default: 2901 BUG(); 2902 } 2903 } 2904 2905 #ifdef CONFIG_MEMCG_KMEM 2906 static int memcg_activate_kmem(struct mem_cgroup *memcg, 2907 unsigned long nr_pages) 2908 { 2909 int err = 0; 2910 int memcg_id; 2911 2912 BUG_ON(memcg->kmemcg_id >= 0); 2913 BUG_ON(memcg->kmem_acct_activated); 2914 BUG_ON(memcg->kmem_acct_active); 2915 2916 /* 2917 * For simplicity, we won't allow this to be disabled. It also can't 2918 * be changed if the cgroup has children already, or if tasks had 2919 * already joined. 2920 * 2921 * If tasks join before we set the limit, a person looking at 2922 * kmem.usage_in_bytes will have no way to determine when it took 2923 * place, which makes the value quite meaningless. 2924 * 2925 * After it first became limited, changes in the value of the limit are 2926 * of course permitted. 2927 */ 2928 mutex_lock(&memcg_create_mutex); 2929 if (cgroup_has_tasks(memcg->css.cgroup) || 2930 (memcg->use_hierarchy && memcg_has_children(memcg))) 2931 err = -EBUSY; 2932 mutex_unlock(&memcg_create_mutex); 2933 if (err) 2934 goto out; 2935 2936 memcg_id = memcg_alloc_cache_id(); 2937 if (memcg_id < 0) { 2938 err = memcg_id; 2939 goto out; 2940 } 2941 2942 /* 2943 * We couldn't have accounted to this cgroup, because it hasn't got 2944 * activated yet, so this should succeed. 
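* page_counter_limit() is expected to fail only when the new limit is below
* the current usage, which cannot be the case here, hence the VM_BUG_ON()
* below.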
2945 */ 2946 err = page_counter_limit(&memcg->kmem, nr_pages); 2947 VM_BUG_ON(err); 2948 2949 static_key_slow_inc(&memcg_kmem_enabled_key); 2950 /* 2951 * A memory cgroup is considered kmem-active as soon as it gets 2952 * kmemcg_id. Setting the id after enabling static branching will 2953 * guarantee no one starts accounting before all call sites are 2954 * patched. 2955 */ 2956 memcg->kmemcg_id = memcg_id; 2957 memcg->kmem_acct_activated = true; 2958 memcg->kmem_acct_active = true; 2959 out: 2960 return err; 2961 } 2962 2963 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 2964 unsigned long limit) 2965 { 2966 int ret; 2967 2968 mutex_lock(&memcg_limit_mutex); 2969 if (!memcg_kmem_is_active(memcg)) 2970 ret = memcg_activate_kmem(memcg, limit); 2971 else 2972 ret = page_counter_limit(&memcg->kmem, limit); 2973 mutex_unlock(&memcg_limit_mutex); 2974 return ret; 2975 } 2976 2977 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 2978 { 2979 int ret = 0; 2980 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 2981 2982 if (!parent) 2983 return 0; 2984 2985 mutex_lock(&memcg_limit_mutex); 2986 /* 2987 * If the parent cgroup is not kmem-active now, it cannot be activated 2988 * after this point, because it has at least one child already. 2989 */ 2990 if (memcg_kmem_is_active(parent)) 2991 ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); 2992 mutex_unlock(&memcg_limit_mutex); 2993 return ret; 2994 } 2995 #else 2996 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 2997 unsigned long limit) 2998 { 2999 return -EINVAL; 3000 } 3001 #endif /* CONFIG_MEMCG_KMEM */ 3002 3003 /* 3004 * The user of this function is... 3005 * RES_LIMIT. 3006 */ 3007 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3008 char *buf, size_t nbytes, loff_t off) 3009 { 3010 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3011 unsigned long nr_pages; 3012 int ret; 3013 3014 buf = strstrip(buf); 3015 ret = page_counter_memparse(buf, "-1", &nr_pages); 3016 if (ret) 3017 return ret; 3018 3019 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3020 case RES_LIMIT: 3021 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3022 ret = -EINVAL; 3023 break; 3024 } 3025 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3026 case _MEM: 3027 ret = mem_cgroup_resize_limit(memcg, nr_pages); 3028 break; 3029 case _MEMSWAP: 3030 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); 3031 break; 3032 case _KMEM: 3033 ret = memcg_update_kmem_limit(memcg, nr_pages); 3034 break; 3035 } 3036 break; 3037 case RES_SOFT_LIMIT: 3038 memcg->soft_limit = nr_pages; 3039 ret = 0; 3040 break; 3041 } 3042 return ret ?: nbytes; 3043 } 3044 3045 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3046 size_t nbytes, loff_t off) 3047 { 3048 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3049 struct page_counter *counter; 3050 3051 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3052 case _MEM: 3053 counter = &memcg->memory; 3054 break; 3055 case _MEMSWAP: 3056 counter = &memcg->memsw; 3057 break; 3058 case _KMEM: 3059 counter = &memcg->kmem; 3060 break; 3061 default: 3062 BUG(); 3063 } 3064 3065 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3066 case RES_MAX_USAGE: 3067 page_counter_reset_watermark(counter); 3068 break; 3069 case RES_FAILCNT: 3070 counter->failcnt = 0; 3071 break; 3072 default: 3073 BUG(); 3074 } 3075 3076 return nbytes; 3077 } 3078 3079 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3080 struct cftype *cft) 3081 { 3082 return 
mem_cgroup_from_css(css)->move_charge_at_immigrate; 3083 } 3084 3085 #ifdef CONFIG_MMU 3086 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3087 struct cftype *cft, u64 val) 3088 { 3089 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3090 3091 if (val & ~MOVE_MASK) 3092 return -EINVAL; 3093 3094 /* 3095 * No kind of locking is needed in here, because ->can_attach() will 3096 * check this value once in the beginning of the process, and then carry 3097 * on with stale data. This means that changes to this value will only 3098 * affect task migrations starting after the change. 3099 */ 3100 memcg->move_charge_at_immigrate = val; 3101 return 0; 3102 } 3103 #else 3104 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3105 struct cftype *cft, u64 val) 3106 { 3107 return -ENOSYS; 3108 } 3109 #endif 3110 3111 #ifdef CONFIG_NUMA 3112 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3113 { 3114 struct numa_stat { 3115 const char *name; 3116 unsigned int lru_mask; 3117 }; 3118 3119 static const struct numa_stat stats[] = { 3120 { "total", LRU_ALL }, 3121 { "file", LRU_ALL_FILE }, 3122 { "anon", LRU_ALL_ANON }, 3123 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3124 }; 3125 const struct numa_stat *stat; 3126 int nid; 3127 unsigned long nr; 3128 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3129 3130 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3131 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 3132 seq_printf(m, "%s=%lu", stat->name, nr); 3133 for_each_node_state(nid, N_MEMORY) { 3134 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 3135 stat->lru_mask); 3136 seq_printf(m, " N%d=%lu", nid, nr); 3137 } 3138 seq_putc(m, '\n'); 3139 } 3140 3141 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3142 struct mem_cgroup *iter; 3143 3144 nr = 0; 3145 for_each_mem_cgroup_tree(iter, memcg) 3146 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 3147 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 3148 for_each_node_state(nid, N_MEMORY) { 3149 nr = 0; 3150 for_each_mem_cgroup_tree(iter, memcg) 3151 nr += mem_cgroup_node_nr_lru_pages( 3152 iter, nid, stat->lru_mask); 3153 seq_printf(m, " N%d=%lu", nid, nr); 3154 } 3155 seq_putc(m, '\n'); 3156 } 3157 3158 return 0; 3159 } 3160 #endif /* CONFIG_NUMA */ 3161 3162 static int memcg_stat_show(struct seq_file *m, void *v) 3163 { 3164 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3165 unsigned long memory, memsw; 3166 struct mem_cgroup *mi; 3167 unsigned int i; 3168 3169 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != 3170 MEM_CGROUP_STAT_NSTATS); 3171 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != 3172 MEM_CGROUP_EVENTS_NSTATS); 3173 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3174 3175 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3176 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3177 continue; 3178 seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], 3179 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 3180 } 3181 3182 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 3183 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 3184 mem_cgroup_read_events(memcg, i)); 3185 3186 for (i = 0; i < NR_LRU_LISTS; i++) 3187 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 3188 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3189 3190 /* Hierarchical information */ 3191 memory = memsw = PAGE_COUNTER_MAX; 3192 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 3193 memory = min(memory, mi->memory.limit); 3194 memsw = 
min(memsw, mi->memsw.limit); 3195 } 3196 seq_printf(m, "hierarchical_memory_limit %llu\n", 3197 (u64)memory * PAGE_SIZE); 3198 if (do_swap_account) 3199 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3200 (u64)memsw * PAGE_SIZE); 3201 3202 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3203 unsigned long long val = 0; 3204 3205 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3206 continue; 3207 for_each_mem_cgroup_tree(mi, memcg) 3208 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 3209 seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val); 3210 } 3211 3212 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 3213 unsigned long long val = 0; 3214 3215 for_each_mem_cgroup_tree(mi, memcg) 3216 val += mem_cgroup_read_events(mi, i); 3217 seq_printf(m, "total_%s %llu\n", 3218 mem_cgroup_events_names[i], val); 3219 } 3220 3221 for (i = 0; i < NR_LRU_LISTS; i++) { 3222 unsigned long long val = 0; 3223 3224 for_each_mem_cgroup_tree(mi, memcg) 3225 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 3226 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 3227 } 3228 3229 #ifdef CONFIG_DEBUG_VM 3230 { 3231 int nid, zid; 3232 struct mem_cgroup_per_zone *mz; 3233 struct zone_reclaim_stat *rstat; 3234 unsigned long recent_rotated[2] = {0, 0}; 3235 unsigned long recent_scanned[2] = {0, 0}; 3236 3237 for_each_online_node(nid) 3238 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3239 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 3240 rstat = &mz->lruvec.reclaim_stat; 3241 3242 recent_rotated[0] += rstat->recent_rotated[0]; 3243 recent_rotated[1] += rstat->recent_rotated[1]; 3244 recent_scanned[0] += rstat->recent_scanned[0]; 3245 recent_scanned[1] += rstat->recent_scanned[1]; 3246 } 3247 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 3248 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 3249 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 3250 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 3251 } 3252 #endif 3253 3254 return 0; 3255 } 3256 3257 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 3258 struct cftype *cft) 3259 { 3260 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3261 3262 return mem_cgroup_swappiness(memcg); 3263 } 3264 3265 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 3266 struct cftype *cft, u64 val) 3267 { 3268 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3269 3270 if (val > 100) 3271 return -EINVAL; 3272 3273 if (css->parent) 3274 memcg->swappiness = val; 3275 else 3276 vm_swappiness = val; 3277 3278 return 0; 3279 } 3280 3281 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3282 { 3283 struct mem_cgroup_threshold_ary *t; 3284 unsigned long usage; 3285 int i; 3286 3287 rcu_read_lock(); 3288 if (!swap) 3289 t = rcu_dereference(memcg->thresholds.primary); 3290 else 3291 t = rcu_dereference(memcg->memsw_thresholds.primary); 3292 3293 if (!t) 3294 goto unlock; 3295 3296 usage = mem_cgroup_usage(memcg, swap); 3297 3298 /* 3299 * current_threshold points to threshold just below or equal to usage. 3300 * If it's not true, a threshold was crossed after last 3301 * call of __mem_cgroup_threshold(). 3302 */ 3303 i = t->current_threshold; 3304 3305 /* 3306 * Iterate backward over array of thresholds starting from 3307 * current_threshold and check if a threshold is crossed. 3308 * If none of thresholds below usage is crossed, we read 3309 * only one element of the array here. 
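* Illustrative example (numbers are made up): with thresholds at 4M, 8M and
* 16M, current_threshold pointing at 8M and usage now down to 5M, the
* backward walk signals the 8M eventfd and stops at 4M, which is no longer
* above usage.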
3310 */ 3311 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3312 eventfd_signal(t->entries[i].eventfd, 1); 3313 3314 /* i = current_threshold + 1 */ 3315 i++; 3316 3317 /* 3318 * Iterate forward over array of thresholds starting from 3319 * current_threshold+1 and check if a threshold is crossed. 3320 * If none of thresholds above usage is crossed, we read 3321 * only one element of the array here. 3322 */ 3323 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3324 eventfd_signal(t->entries[i].eventfd, 1); 3325 3326 /* Update current_threshold */ 3327 t->current_threshold = i - 1; 3328 unlock: 3329 rcu_read_unlock(); 3330 } 3331 3332 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3333 { 3334 while (memcg) { 3335 __mem_cgroup_threshold(memcg, false); 3336 if (do_swap_account) 3337 __mem_cgroup_threshold(memcg, true); 3338 3339 memcg = parent_mem_cgroup(memcg); 3340 } 3341 } 3342 3343 static int compare_thresholds(const void *a, const void *b) 3344 { 3345 const struct mem_cgroup_threshold *_a = a; 3346 const struct mem_cgroup_threshold *_b = b; 3347 3348 if (_a->threshold > _b->threshold) 3349 return 1; 3350 3351 if (_a->threshold < _b->threshold) 3352 return -1; 3353 3354 return 0; 3355 } 3356 3357 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 3358 { 3359 struct mem_cgroup_eventfd_list *ev; 3360 3361 spin_lock(&memcg_oom_lock); 3362 3363 list_for_each_entry(ev, &memcg->oom_notify, list) 3364 eventfd_signal(ev->eventfd, 1); 3365 3366 spin_unlock(&memcg_oom_lock); 3367 return 0; 3368 } 3369 3370 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 3371 { 3372 struct mem_cgroup *iter; 3373 3374 for_each_mem_cgroup_tree(iter, memcg) 3375 mem_cgroup_oom_notify_cb(iter); 3376 } 3377 3378 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3379 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 3380 { 3381 struct mem_cgroup_thresholds *thresholds; 3382 struct mem_cgroup_threshold_ary *new; 3383 unsigned long threshold; 3384 unsigned long usage; 3385 int i, size, ret; 3386 3387 ret = page_counter_memparse(args, "-1", &threshold); 3388 if (ret) 3389 return ret; 3390 3391 mutex_lock(&memcg->thresholds_lock); 3392 3393 if (type == _MEM) { 3394 thresholds = &memcg->thresholds; 3395 usage = mem_cgroup_usage(memcg, false); 3396 } else if (type == _MEMSWAP) { 3397 thresholds = &memcg->memsw_thresholds; 3398 usage = mem_cgroup_usage(memcg, true); 3399 } else 3400 BUG(); 3401 3402 /* Check if a threshold crossed before adding a new one */ 3403 if (thresholds->primary) 3404 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3405 3406 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3407 3408 /* Allocate memory for new array of thresholds */ 3409 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3410 GFP_KERNEL); 3411 if (!new) { 3412 ret = -ENOMEM; 3413 goto unlock; 3414 } 3415 new->size = size; 3416 3417 /* Copy thresholds (if any) to new array */ 3418 if (thresholds->primary) { 3419 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3420 sizeof(struct mem_cgroup_threshold)); 3421 } 3422 3423 /* Add new threshold */ 3424 new->entries[size - 1].eventfd = eventfd; 3425 new->entries[size - 1].threshold = threshold; 3426 3427 /* Sort thresholds. 
Registering of new threshold isn't time-critical */ 3428 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3429 compare_thresholds, NULL); 3430 3431 /* Find current threshold */ 3432 new->current_threshold = -1; 3433 for (i = 0; i < size; i++) { 3434 if (new->entries[i].threshold <= usage) { 3435 /* 3436 * new->current_threshold will not be used until 3437 * rcu_assign_pointer(), so it's safe to increment 3438 * it here. 3439 */ 3440 ++new->current_threshold; 3441 } else 3442 break; 3443 } 3444 3445 /* Free old spare buffer and save old primary buffer as spare */ 3446 kfree(thresholds->spare); 3447 thresholds->spare = thresholds->primary; 3448 3449 rcu_assign_pointer(thresholds->primary, new); 3450 3451 /* To be sure that nobody uses thresholds */ 3452 synchronize_rcu(); 3453 3454 unlock: 3455 mutex_unlock(&memcg->thresholds_lock); 3456 3457 return ret; 3458 } 3459 3460 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3461 struct eventfd_ctx *eventfd, const char *args) 3462 { 3463 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 3464 } 3465 3466 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 3467 struct eventfd_ctx *eventfd, const char *args) 3468 { 3469 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 3470 } 3471 3472 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3473 struct eventfd_ctx *eventfd, enum res_type type) 3474 { 3475 struct mem_cgroup_thresholds *thresholds; 3476 struct mem_cgroup_threshold_ary *new; 3477 unsigned long usage; 3478 int i, j, size; 3479 3480 mutex_lock(&memcg->thresholds_lock); 3481 3482 if (type == _MEM) { 3483 thresholds = &memcg->thresholds; 3484 usage = mem_cgroup_usage(memcg, false); 3485 } else if (type == _MEMSWAP) { 3486 thresholds = &memcg->memsw_thresholds; 3487 usage = mem_cgroup_usage(memcg, true); 3488 } else 3489 BUG(); 3490 3491 if (!thresholds->primary) 3492 goto unlock; 3493 3494 /* Check if a threshold crossed before removing */ 3495 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3496 3497 /* Calculate new number of threshold */ 3498 size = 0; 3499 for (i = 0; i < thresholds->primary->size; i++) { 3500 if (thresholds->primary->entries[i].eventfd != eventfd) 3501 size++; 3502 } 3503 3504 new = thresholds->spare; 3505 3506 /* Set thresholds array to NULL if we don't have thresholds */ 3507 if (!size) { 3508 kfree(new); 3509 new = NULL; 3510 goto swap_buffers; 3511 } 3512 3513 new->size = size; 3514 3515 /* Copy thresholds and find current threshold */ 3516 new->current_threshold = -1; 3517 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 3518 if (thresholds->primary->entries[i].eventfd == eventfd) 3519 continue; 3520 3521 new->entries[j] = thresholds->primary->entries[i]; 3522 if (new->entries[j].threshold <= usage) { 3523 /* 3524 * new->current_threshold will not be used 3525 * until rcu_assign_pointer(), so it's safe to increment 3526 * it here. 
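* Readers still see the old array through thresholds->primary at this point;
* the new one is only published by the rcu_assign_pointer() below.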
3527 */ 3528 ++new->current_threshold; 3529 } 3530 j++; 3531 } 3532 3533 swap_buffers: 3534 /* Swap primary and spare array */ 3535 thresholds->spare = thresholds->primary; 3536 /* If all events are unregistered, free the spare array */ 3537 if (!new) { 3538 kfree(thresholds->spare); 3539 thresholds->spare = NULL; 3540 } 3541 3542 rcu_assign_pointer(thresholds->primary, new); 3543 3544 /* To be sure that nobody uses thresholds */ 3545 synchronize_rcu(); 3546 unlock: 3547 mutex_unlock(&memcg->thresholds_lock); 3548 } 3549 3550 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3551 struct eventfd_ctx *eventfd) 3552 { 3553 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 3554 } 3555 3556 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3557 struct eventfd_ctx *eventfd) 3558 { 3559 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 3560 } 3561 3562 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 3563 struct eventfd_ctx *eventfd, const char *args) 3564 { 3565 struct mem_cgroup_eventfd_list *event; 3566 3567 event = kmalloc(sizeof(*event), GFP_KERNEL); 3568 if (!event) 3569 return -ENOMEM; 3570 3571 spin_lock(&memcg_oom_lock); 3572 3573 event->eventfd = eventfd; 3574 list_add(&event->list, &memcg->oom_notify); 3575 3576 /* already in OOM ? */ 3577 if (memcg->under_oom) 3578 eventfd_signal(eventfd, 1); 3579 spin_unlock(&memcg_oom_lock); 3580 3581 return 0; 3582 } 3583 3584 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 3585 struct eventfd_ctx *eventfd) 3586 { 3587 struct mem_cgroup_eventfd_list *ev, *tmp; 3588 3589 spin_lock(&memcg_oom_lock); 3590 3591 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 3592 if (ev->eventfd == eventfd) { 3593 list_del(&ev->list); 3594 kfree(ev); 3595 } 3596 } 3597 3598 spin_unlock(&memcg_oom_lock); 3599 } 3600 3601 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 3602 { 3603 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3604 3605 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 3606 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 3607 return 0; 3608 } 3609 3610 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 3611 struct cftype *cft, u64 val) 3612 { 3613 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3614 3615 /* cannot set to root cgroup and only 0 and 1 are allowed */ 3616 if (!css->parent || !((val == 0) || (val == 1))) 3617 return -EINVAL; 3618 3619 memcg->oom_kill_disable = val; 3620 if (!val) 3621 memcg_oom_recover(memcg); 3622 3623 return 0; 3624 } 3625 3626 #ifdef CONFIG_MEMCG_KMEM 3627 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 3628 { 3629 int ret; 3630 3631 ret = memcg_propagate_kmem(memcg); 3632 if (ret) 3633 return ret; 3634 3635 return mem_cgroup_sockets_init(memcg, ss); 3636 } 3637 3638 static void memcg_deactivate_kmem(struct mem_cgroup *memcg) 3639 { 3640 struct cgroup_subsys_state *css; 3641 struct mem_cgroup *parent, *child; 3642 int kmemcg_id; 3643 3644 if (!memcg->kmem_acct_active) 3645 return; 3646 3647 /* 3648 * Clear the 'active' flag before clearing memcg_caches arrays entries. 3649 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it 3650 * guarantees no cache will be created for this cgroup after we are 3651 * done (see memcg_create_kmem_cache()). 
3652 */ 3653 memcg->kmem_acct_active = false; 3654 3655 memcg_deactivate_kmem_caches(memcg); 3656 3657 kmemcg_id = memcg->kmemcg_id; 3658 BUG_ON(kmemcg_id < 0); 3659 3660 parent = parent_mem_cgroup(memcg); 3661 if (!parent) 3662 parent = root_mem_cgroup; 3663 3664 /* 3665 * Change kmemcg_id of this cgroup and all its descendants to the 3666 * parent's id, and then move all entries from this cgroup's list_lrus 3667 * to ones of the parent. After we have finished, all list_lrus 3668 * corresponding to this cgroup are guaranteed to remain empty. The 3669 * ordering is imposed by list_lru_node->lock taken by 3670 * memcg_drain_all_list_lrus(). 3671 */ 3672 css_for_each_descendant_pre(css, &memcg->css) { 3673 child = mem_cgroup_from_css(css); 3674 BUG_ON(child->kmemcg_id != kmemcg_id); 3675 child->kmemcg_id = parent->kmemcg_id; 3676 if (!memcg->use_hierarchy) 3677 break; 3678 } 3679 memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); 3680 3681 memcg_free_cache_id(kmemcg_id); 3682 } 3683 3684 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 3685 { 3686 if (memcg->kmem_acct_activated) { 3687 memcg_destroy_kmem_caches(memcg); 3688 static_key_slow_dec(&memcg_kmem_enabled_key); 3689 WARN_ON(page_counter_read(&memcg->kmem)); 3690 } 3691 mem_cgroup_sockets_destroy(memcg); 3692 } 3693 #else 3694 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 3695 { 3696 return 0; 3697 } 3698 3699 static void memcg_deactivate_kmem(struct mem_cgroup *memcg) 3700 { 3701 } 3702 3703 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 3704 { 3705 } 3706 #endif 3707 3708 #ifdef CONFIG_CGROUP_WRITEBACK 3709 3710 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) 3711 { 3712 return &memcg->cgwb_list; 3713 } 3714 3715 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 3716 { 3717 return wb_domain_init(&memcg->cgwb_domain, gfp); 3718 } 3719 3720 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 3721 { 3722 wb_domain_exit(&memcg->cgwb_domain); 3723 } 3724 3725 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 3726 { 3727 wb_domain_size_changed(&memcg->cgwb_domain); 3728 } 3729 3730 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 3731 { 3732 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3733 3734 if (!memcg->css.parent) 3735 return NULL; 3736 3737 return &memcg->cgwb_domain; 3738 } 3739 3740 /** 3741 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 3742 * @wb: bdi_writeback in question 3743 * @pavail: out parameter for number of available pages 3744 * @pdirty: out parameter for number of dirty pages 3745 * @pwriteback: out parameter for number of pages under writeback 3746 * 3747 * Determine the numbers of available, dirty, and writeback pages in @wb's 3748 * memcg. Dirty and writeback are self-explanatory. Available is a bit 3749 * more involved. 3750 * 3751 * A memcg's headroom is "min(max, high) - used". The available memory is 3752 * calculated as the lowest headroom of itself and the ancestors plus the 3753 * number of pages already being used for file pages. Note that this 3754 * doesn't consider the actual amount of available memory in the system. 3755 * The caller should further cap *@pavail accordingly. 
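* Illustrative example (numbers are made up): a memcg with a 1G limit, a
* 512M high threshold and 300M in use contributes min(1G, 512M) - 300M = 212M
* of headroom; *@pavail is the smallest such headroom along the hierarchy
* plus the memcg's own file pages.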
3756 */ 3757 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, 3758 unsigned long *pdirty, unsigned long *pwriteback) 3759 { 3760 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3761 struct mem_cgroup *parent; 3762 unsigned long head_room = PAGE_COUNTER_MAX; 3763 unsigned long file_pages; 3764 3765 *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); 3766 3767 /* this should eventually include NR_UNSTABLE_NFS */ 3768 *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 3769 3770 file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | 3771 (1 << LRU_ACTIVE_FILE)); 3772 while ((parent = parent_mem_cgroup(memcg))) { 3773 unsigned long ceiling = min(memcg->memory.limit, memcg->high); 3774 unsigned long used = page_counter_read(&memcg->memory); 3775 3776 head_room = min(head_room, ceiling - min(ceiling, used)); 3777 memcg = parent; 3778 } 3779 3780 *pavail = file_pages + head_room; 3781 } 3782 3783 #else /* CONFIG_CGROUP_WRITEBACK */ 3784 3785 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 3786 { 3787 return 0; 3788 } 3789 3790 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 3791 { 3792 } 3793 3794 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 3795 { 3796 } 3797 3798 #endif /* CONFIG_CGROUP_WRITEBACK */ 3799 3800 /* 3801 * DO NOT USE IN NEW FILES. 3802 * 3803 * "cgroup.event_control" implementation. 3804 * 3805 * This is way over-engineered. It tries to support fully configurable 3806 * events for each user. Such level of flexibility is completely 3807 * unnecessary especially in the light of the planned unified hierarchy. 3808 * 3809 * Please deprecate this and replace with something simpler if at all 3810 * possible. 3811 */ 3812 3813 /* 3814 * Unregister event and free resources. 3815 * 3816 * Gets called from workqueue. 3817 */ 3818 static void memcg_event_remove(struct work_struct *work) 3819 { 3820 struct mem_cgroup_event *event = 3821 container_of(work, struct mem_cgroup_event, remove); 3822 struct mem_cgroup *memcg = event->memcg; 3823 3824 remove_wait_queue(event->wqh, &event->wait); 3825 3826 event->unregister_event(memcg, event->eventfd); 3827 3828 /* Notify userspace the event is going away. */ 3829 eventfd_signal(event->eventfd, 1); 3830 3831 eventfd_ctx_put(event->eventfd); 3832 kfree(event); 3833 css_put(&memcg->css); 3834 } 3835 3836 /* 3837 * Gets called on POLLHUP on eventfd when user closes it. 3838 * 3839 * Called with wqh->lock held and interrupts disabled. 3840 */ 3841 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 3842 int sync, void *key) 3843 { 3844 struct mem_cgroup_event *event = 3845 container_of(wait, struct mem_cgroup_event, wait); 3846 struct mem_cgroup *memcg = event->memcg; 3847 unsigned long flags = (unsigned long)key; 3848 3849 if (flags & POLLHUP) { 3850 /* 3851 * If the event has been detached at cgroup removal, we 3852 * can simply return knowing the other side will cleanup 3853 * for us. 3854 * 3855 * We can't race against event freeing since the other 3856 * side will require wqh->lock via remove_wait_queue(), 3857 * which we hold. 3858 */ 3859 spin_lock(&memcg->event_list_lock); 3860 if (!list_empty(&event->list)) { 3861 list_del_init(&event->list); 3862 /* 3863 * We are in atomic context, but cgroup_event_remove() 3864 * may sleep, so we have to call it in workqueue. 
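* The work item scheduled below runs memcg_event_remove().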
3865 */ 3866 schedule_work(&event->remove); 3867 } 3868 spin_unlock(&memcg->event_list_lock); 3869 } 3870 3871 return 0; 3872 } 3873 3874 static void memcg_event_ptable_queue_proc(struct file *file, 3875 wait_queue_head_t *wqh, poll_table *pt) 3876 { 3877 struct mem_cgroup_event *event = 3878 container_of(pt, struct mem_cgroup_event, pt); 3879 3880 event->wqh = wqh; 3881 add_wait_queue(wqh, &event->wait); 3882 } 3883 3884 /* 3885 * DO NOT USE IN NEW FILES. 3886 * 3887 * Parse input and register new cgroup event handler. 3888 * 3889 * Input must be in format '<event_fd> <control_fd> <args>'. 3890 * Interpretation of args is defined by control file implementation. 3891 */ 3892 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 3893 char *buf, size_t nbytes, loff_t off) 3894 { 3895 struct cgroup_subsys_state *css = of_css(of); 3896 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3897 struct mem_cgroup_event *event; 3898 struct cgroup_subsys_state *cfile_css; 3899 unsigned int efd, cfd; 3900 struct fd efile; 3901 struct fd cfile; 3902 const char *name; 3903 char *endp; 3904 int ret; 3905 3906 buf = strstrip(buf); 3907 3908 efd = simple_strtoul(buf, &endp, 10); 3909 if (*endp != ' ') 3910 return -EINVAL; 3911 buf = endp + 1; 3912 3913 cfd = simple_strtoul(buf, &endp, 10); 3914 if ((*endp != ' ') && (*endp != '\0')) 3915 return -EINVAL; 3916 buf = endp + 1; 3917 3918 event = kzalloc(sizeof(*event), GFP_KERNEL); 3919 if (!event) 3920 return -ENOMEM; 3921 3922 event->memcg = memcg; 3923 INIT_LIST_HEAD(&event->list); 3924 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 3925 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 3926 INIT_WORK(&event->remove, memcg_event_remove); 3927 3928 efile = fdget(efd); 3929 if (!efile.file) { 3930 ret = -EBADF; 3931 goto out_kfree; 3932 } 3933 3934 event->eventfd = eventfd_ctx_fileget(efile.file); 3935 if (IS_ERR(event->eventfd)) { 3936 ret = PTR_ERR(event->eventfd); 3937 goto out_put_efile; 3938 } 3939 3940 cfile = fdget(cfd); 3941 if (!cfile.file) { 3942 ret = -EBADF; 3943 goto out_put_eventfd; 3944 } 3945 3946 /* the process need read permission on control file */ 3947 /* AV: shouldn't we check that it's been opened for read instead? */ 3948 ret = inode_permission(file_inode(cfile.file), MAY_READ); 3949 if (ret < 0) 3950 goto out_put_cfile; 3951 3952 /* 3953 * Determine the event callbacks and set them in @event. This used 3954 * to be done via struct cftype but cgroup core no longer knows 3955 * about these events. The following is crude but the whole thing 3956 * is for compatibility anyway. 3957 * 3958 * DO NOT ADD NEW FILES. 
3959 */ 3960 name = cfile.file->f_path.dentry->d_name.name; 3961 3962 if (!strcmp(name, "memory.usage_in_bytes")) { 3963 event->register_event = mem_cgroup_usage_register_event; 3964 event->unregister_event = mem_cgroup_usage_unregister_event; 3965 } else if (!strcmp(name, "memory.oom_control")) { 3966 event->register_event = mem_cgroup_oom_register_event; 3967 event->unregister_event = mem_cgroup_oom_unregister_event; 3968 } else if (!strcmp(name, "memory.pressure_level")) { 3969 event->register_event = vmpressure_register_event; 3970 event->unregister_event = vmpressure_unregister_event; 3971 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 3972 event->register_event = memsw_cgroup_usage_register_event; 3973 event->unregister_event = memsw_cgroup_usage_unregister_event; 3974 } else { 3975 ret = -EINVAL; 3976 goto out_put_cfile; 3977 } 3978 3979 /* 3980 * Verify @cfile should belong to @css. Also, remaining events are 3981 * automatically removed on cgroup destruction but the removal is 3982 * asynchronous, so take an extra ref on @css. 3983 */ 3984 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 3985 &memory_cgrp_subsys); 3986 ret = -EINVAL; 3987 if (IS_ERR(cfile_css)) 3988 goto out_put_cfile; 3989 if (cfile_css != css) { 3990 css_put(cfile_css); 3991 goto out_put_cfile; 3992 } 3993 3994 ret = event->register_event(memcg, event->eventfd, buf); 3995 if (ret) 3996 goto out_put_css; 3997 3998 efile.file->f_op->poll(efile.file, &event->pt); 3999 4000 spin_lock(&memcg->event_list_lock); 4001 list_add(&event->list, &memcg->event_list); 4002 spin_unlock(&memcg->event_list_lock); 4003 4004 fdput(cfile); 4005 fdput(efile); 4006 4007 return nbytes; 4008 4009 out_put_css: 4010 css_put(css); 4011 out_put_cfile: 4012 fdput(cfile); 4013 out_put_eventfd: 4014 eventfd_ctx_put(event->eventfd); 4015 out_put_efile: 4016 fdput(efile); 4017 out_kfree: 4018 kfree(event); 4019 4020 return ret; 4021 } 4022 4023 static struct cftype mem_cgroup_legacy_files[] = { 4024 { 4025 .name = "usage_in_bytes", 4026 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4027 .read_u64 = mem_cgroup_read_u64, 4028 }, 4029 { 4030 .name = "max_usage_in_bytes", 4031 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4032 .write = mem_cgroup_reset, 4033 .read_u64 = mem_cgroup_read_u64, 4034 }, 4035 { 4036 .name = "limit_in_bytes", 4037 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4038 .write = mem_cgroup_write, 4039 .read_u64 = mem_cgroup_read_u64, 4040 }, 4041 { 4042 .name = "soft_limit_in_bytes", 4043 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4044 .write = mem_cgroup_write, 4045 .read_u64 = mem_cgroup_read_u64, 4046 }, 4047 { 4048 .name = "failcnt", 4049 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4050 .write = mem_cgroup_reset, 4051 .read_u64 = mem_cgroup_read_u64, 4052 }, 4053 { 4054 .name = "stat", 4055 .seq_show = memcg_stat_show, 4056 }, 4057 { 4058 .name = "force_empty", 4059 .write = mem_cgroup_force_empty_write, 4060 }, 4061 { 4062 .name = "use_hierarchy", 4063 .write_u64 = mem_cgroup_hierarchy_write, 4064 .read_u64 = mem_cgroup_hierarchy_read, 4065 }, 4066 { 4067 .name = "cgroup.event_control", /* XXX: for compat */ 4068 .write = memcg_write_event_control, 4069 .flags = CFTYPE_NO_PREFIX, 4070 .mode = S_IWUGO, 4071 }, 4072 { 4073 .name = "swappiness", 4074 .read_u64 = mem_cgroup_swappiness_read, 4075 .write_u64 = mem_cgroup_swappiness_write, 4076 }, 4077 { 4078 .name = "move_charge_at_immigrate", 4079 .read_u64 = mem_cgroup_move_charge_read, 4080 .write_u64 = 
mem_cgroup_move_charge_write, 4081 }, 4082 { 4083 .name = "oom_control", 4084 .seq_show = mem_cgroup_oom_control_read, 4085 .write_u64 = mem_cgroup_oom_control_write, 4086 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4087 }, 4088 { 4089 .name = "pressure_level", 4090 }, 4091 #ifdef CONFIG_NUMA 4092 { 4093 .name = "numa_stat", 4094 .seq_show = memcg_numa_stat_show, 4095 }, 4096 #endif 4097 #ifdef CONFIG_MEMCG_KMEM 4098 { 4099 .name = "kmem.limit_in_bytes", 4100 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 4101 .write = mem_cgroup_write, 4102 .read_u64 = mem_cgroup_read_u64, 4103 }, 4104 { 4105 .name = "kmem.usage_in_bytes", 4106 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 4107 .read_u64 = mem_cgroup_read_u64, 4108 }, 4109 { 4110 .name = "kmem.failcnt", 4111 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 4112 .write = mem_cgroup_reset, 4113 .read_u64 = mem_cgroup_read_u64, 4114 }, 4115 { 4116 .name = "kmem.max_usage_in_bytes", 4117 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4118 .write = mem_cgroup_reset, 4119 .read_u64 = mem_cgroup_read_u64, 4120 }, 4121 #ifdef CONFIG_SLABINFO 4122 { 4123 .name = "kmem.slabinfo", 4124 .seq_start = slab_start, 4125 .seq_next = slab_next, 4126 .seq_stop = slab_stop, 4127 .seq_show = memcg_slab_show, 4128 }, 4129 #endif 4130 #endif 4131 { }, /* terminate */ 4132 }; 4133 4134 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4135 { 4136 struct mem_cgroup_per_node *pn; 4137 struct mem_cgroup_per_zone *mz; 4138 int zone, tmp = node; 4139 /* 4140 * This routine is called against possible nodes. 4141 * But it's BUG to call kmalloc() against offline node. 4142 * 4143 * TODO: this routine can waste much memory for nodes which will 4144 * never be onlined. It's better to use memory hotplug callback 4145 * function. 4146 */ 4147 if (!node_state(node, N_NORMAL_MEMORY)) 4148 tmp = -1; 4149 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4150 if (!pn) 4151 return 1; 4152 4153 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4154 mz = &pn->zoneinfo[zone]; 4155 lruvec_init(&mz->lruvec); 4156 mz->usage_in_excess = 0; 4157 mz->on_tree = false; 4158 mz->memcg = memcg; 4159 } 4160 memcg->nodeinfo[node] = pn; 4161 return 0; 4162 } 4163 4164 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4165 { 4166 kfree(memcg->nodeinfo[node]); 4167 } 4168 4169 static struct mem_cgroup *mem_cgroup_alloc(void) 4170 { 4171 struct mem_cgroup *memcg; 4172 size_t size; 4173 4174 size = sizeof(struct mem_cgroup); 4175 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 4176 4177 memcg = kzalloc(size, GFP_KERNEL); 4178 if (!memcg) 4179 return NULL; 4180 4181 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4182 if (!memcg->stat) 4183 goto out_free; 4184 4185 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 4186 goto out_free_stat; 4187 4188 return memcg; 4189 4190 out_free_stat: 4191 free_percpu(memcg->stat); 4192 out_free: 4193 kfree(memcg); 4194 return NULL; 4195 } 4196 4197 /* 4198 * At destroying mem_cgroup, references from swap_cgroup can remain. 4199 * (scanning all at force_empty is too costly...) 4200 * 4201 * Instead of clearing all references at force_empty, we remember 4202 * the number of reference from swap_cgroup and free mem_cgroup when 4203 * it goes down to 0. 4204 * 4205 * Removal of cgroup itself succeeds regardless of refs from swap. 
4206 */ 4207 4208 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4209 { 4210 int node; 4211 4212 mem_cgroup_remove_from_trees(memcg); 4213 4214 for_each_node(node) 4215 free_mem_cgroup_per_zone_info(memcg, node); 4216 4217 free_percpu(memcg->stat); 4218 memcg_wb_domain_exit(memcg); 4219 kfree(memcg); 4220 } 4221 4222 /* 4223 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4224 */ 4225 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4226 { 4227 if (!memcg->memory.parent) 4228 return NULL; 4229 return mem_cgroup_from_counter(memcg->memory.parent, memory); 4230 } 4231 EXPORT_SYMBOL(parent_mem_cgroup); 4232 4233 static struct cgroup_subsys_state * __ref 4234 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4235 { 4236 struct mem_cgroup *memcg; 4237 long error = -ENOMEM; 4238 int node; 4239 4240 memcg = mem_cgroup_alloc(); 4241 if (!memcg) 4242 return ERR_PTR(error); 4243 4244 for_each_node(node) 4245 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4246 goto free_out; 4247 4248 /* root ? */ 4249 if (parent_css == NULL) { 4250 root_mem_cgroup = memcg; 4251 mem_cgroup_root_css = &memcg->css; 4252 page_counter_init(&memcg->memory, NULL); 4253 memcg->high = PAGE_COUNTER_MAX; 4254 memcg->soft_limit = PAGE_COUNTER_MAX; 4255 page_counter_init(&memcg->memsw, NULL); 4256 page_counter_init(&memcg->kmem, NULL); 4257 } 4258 4259 memcg->last_scanned_node = MAX_NUMNODES; 4260 INIT_LIST_HEAD(&memcg->oom_notify); 4261 memcg->move_charge_at_immigrate = 0; 4262 mutex_init(&memcg->thresholds_lock); 4263 spin_lock_init(&memcg->move_lock); 4264 vmpressure_init(&memcg->vmpressure); 4265 INIT_LIST_HEAD(&memcg->event_list); 4266 spin_lock_init(&memcg->event_list_lock); 4267 #ifdef CONFIG_MEMCG_KMEM 4268 memcg->kmemcg_id = -1; 4269 #endif 4270 #ifdef CONFIG_CGROUP_WRITEBACK 4271 INIT_LIST_HEAD(&memcg->cgwb_list); 4272 #endif 4273 return &memcg->css; 4274 4275 free_out: 4276 __mem_cgroup_free(memcg); 4277 return ERR_PTR(error); 4278 } 4279 4280 static int 4281 mem_cgroup_css_online(struct cgroup_subsys_state *css) 4282 { 4283 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4284 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 4285 int ret; 4286 4287 if (css->id > MEM_CGROUP_ID_MAX) 4288 return -ENOSPC; 4289 4290 if (!parent) 4291 return 0; 4292 4293 mutex_lock(&memcg_create_mutex); 4294 4295 memcg->use_hierarchy = parent->use_hierarchy; 4296 memcg->oom_kill_disable = parent->oom_kill_disable; 4297 memcg->swappiness = mem_cgroup_swappiness(parent); 4298 4299 if (parent->use_hierarchy) { 4300 page_counter_init(&memcg->memory, &parent->memory); 4301 memcg->high = PAGE_COUNTER_MAX; 4302 memcg->soft_limit = PAGE_COUNTER_MAX; 4303 page_counter_init(&memcg->memsw, &parent->memsw); 4304 page_counter_init(&memcg->kmem, &parent->kmem); 4305 4306 /* 4307 * No need to take a reference to the parent because cgroup 4308 * core guarantees its existence. 4309 */ 4310 } else { 4311 page_counter_init(&memcg->memory, NULL); 4312 memcg->high = PAGE_COUNTER_MAX; 4313 memcg->soft_limit = PAGE_COUNTER_MAX; 4314 page_counter_init(&memcg->memsw, NULL); 4315 page_counter_init(&memcg->kmem, NULL); 4316 /* 4317 * Deeper hierachy with use_hierarchy == false doesn't make 4318 * much sense so let cgroup subsystem know about this 4319 * unfortunate state in our controller. 
4320 */ 4321 if (parent != root_mem_cgroup) 4322 memory_cgrp_subsys.broken_hierarchy = true; 4323 } 4324 mutex_unlock(&memcg_create_mutex); 4325 4326 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); 4327 if (ret) 4328 return ret; 4329 4330 /* 4331 * Make sure the memcg is initialized: mem_cgroup_iter() 4332 * orders reading memcg->initialized against its callers 4333 * reading the memcg members. 4334 */ 4335 smp_store_release(&memcg->initialized, 1); 4336 4337 return 0; 4338 } 4339 4340 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 4341 { 4342 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4343 struct mem_cgroup_event *event, *tmp; 4344 4345 /* 4346 * Unregister events and notify userspace. 4347 * Notify userspace about cgroup removing only after rmdir of cgroup 4348 * directory to avoid race between userspace and kernelspace. 4349 */ 4350 spin_lock(&memcg->event_list_lock); 4351 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 4352 list_del_init(&event->list); 4353 schedule_work(&event->remove); 4354 } 4355 spin_unlock(&memcg->event_list_lock); 4356 4357 vmpressure_cleanup(&memcg->vmpressure); 4358 4359 memcg_deactivate_kmem(memcg); 4360 4361 wb_memcg_offline(memcg); 4362 } 4363 4364 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4365 { 4366 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4367 4368 memcg_destroy_kmem(memcg); 4369 __mem_cgroup_free(memcg); 4370 } 4371 4372 /** 4373 * mem_cgroup_css_reset - reset the states of a mem_cgroup 4374 * @css: the target css 4375 * 4376 * Reset the states of the mem_cgroup associated with @css. This is 4377 * invoked when the userland requests disabling on the default hierarchy 4378 * but the memcg is pinned through dependency. The memcg should stop 4379 * applying policies and should revert to the vanilla state as it may be 4380 * made visible again. 4381 * 4382 * The current implementation only resets the essential configurations. 4383 * This needs to be expanded to cover all the visible parts. 4384 */ 4385 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 4386 { 4387 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4388 4389 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4390 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4391 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4392 memcg->low = 0; 4393 memcg->high = PAGE_COUNTER_MAX; 4394 memcg->soft_limit = PAGE_COUNTER_MAX; 4395 memcg_wb_domain_size_changed(memcg); 4396 } 4397 4398 #ifdef CONFIG_MMU 4399 /* Handlers for move charge at task migration. */ 4400 static int mem_cgroup_do_precharge(unsigned long count) 4401 { 4402 int ret; 4403 4404 /* Try a single bulk charge without reclaim first */ 4405 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); 4406 if (!ret) { 4407 mc.precharge += count; 4408 return ret; 4409 } 4410 if (ret == -EINTR) { 4411 cancel_charge(root_mem_cgroup, count); 4412 return ret; 4413 } 4414 4415 /* Try charges one by one with reclaim */ 4416 while (count--) { 4417 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); 4418 /* 4419 * In case of failure, any residual charges against 4420 * mc.to will be dropped by mem_cgroup_clear_mc() 4421 * later on. However, cancel any charges that are 4422 * bypassed to root right away or they'll be lost. 
4423 */ 4424 if (ret == -EINTR) 4425 cancel_charge(root_mem_cgroup, 1); 4426 if (ret) 4427 return ret; 4428 mc.precharge++; 4429 cond_resched(); 4430 } 4431 return 0; 4432 } 4433 4434 /** 4435 * get_mctgt_type - get target type of moving charge 4436 * @vma: the vma the pte to be checked belongs 4437 * @addr: the address corresponding to the pte to be checked 4438 * @ptent: the pte to be checked 4439 * @target: the pointer the target page or swap ent will be stored(can be NULL) 4440 * 4441 * Returns 4442 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4443 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4444 * move charge. if @target is not NULL, the page is stored in target->page 4445 * with extra refcnt got(Callers should handle it). 4446 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4447 * target for charge migration. if @target is not NULL, the entry is stored 4448 * in target->ent. 4449 * 4450 * Called with pte lock held. 4451 */ 4452 union mc_target { 4453 struct page *page; 4454 swp_entry_t ent; 4455 }; 4456 4457 enum mc_target_type { 4458 MC_TARGET_NONE = 0, 4459 MC_TARGET_PAGE, 4460 MC_TARGET_SWAP, 4461 }; 4462 4463 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4464 unsigned long addr, pte_t ptent) 4465 { 4466 struct page *page = vm_normal_page(vma, addr, ptent); 4467 4468 if (!page || !page_mapped(page)) 4469 return NULL; 4470 if (PageAnon(page)) { 4471 if (!(mc.flags & MOVE_ANON)) 4472 return NULL; 4473 } else { 4474 if (!(mc.flags & MOVE_FILE)) 4475 return NULL; 4476 } 4477 if (!get_page_unless_zero(page)) 4478 return NULL; 4479 4480 return page; 4481 } 4482 4483 #ifdef CONFIG_SWAP 4484 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4485 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4486 { 4487 struct page *page = NULL; 4488 swp_entry_t ent = pte_to_swp_entry(ptent); 4489 4490 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) 4491 return NULL; 4492 /* 4493 * Because lookup_swap_cache() updates some statistics counter, 4494 * we call find_get_page() with swapper_space directly. 4495 */ 4496 page = find_get_page(swap_address_space(ent), ent.val); 4497 if (do_swap_account) 4498 entry->val = ent.val; 4499 4500 return page; 4501 } 4502 #else 4503 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4504 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4505 { 4506 return NULL; 4507 } 4508 #endif 4509 4510 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4511 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4512 { 4513 struct page *page = NULL; 4514 struct address_space *mapping; 4515 pgoff_t pgoff; 4516 4517 if (!vma->vm_file) /* anonymous vma */ 4518 return NULL; 4519 if (!(mc.flags & MOVE_FILE)) 4520 return NULL; 4521 4522 mapping = vma->vm_file->f_mapping; 4523 pgoff = linear_page_index(vma, addr); 4524 4525 /* page is moved even if it's not RSS of this task(page-faulted). */ 4526 #ifdef CONFIG_SWAP 4527 /* shmem/tmpfs may report page out on swap: account for that too. 
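 * find_get_entry() may then return a radix tree exceptional entry,
 * which encodes the swap slot the shmem page was written to; it is
 * translated back with radix_to_swp_entry() below.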
*/ 4528 if (shmem_mapping(mapping)) { 4529 page = find_get_entry(mapping, pgoff); 4530 if (radix_tree_exceptional_entry(page)) { 4531 swp_entry_t swp = radix_to_swp_entry(page); 4532 if (do_swap_account) 4533 *entry = swp; 4534 page = find_get_page(swap_address_space(swp), swp.val); 4535 } 4536 } else 4537 page = find_get_page(mapping, pgoff); 4538 #else 4539 page = find_get_page(mapping, pgoff); 4540 #endif 4541 return page; 4542 } 4543 4544 /** 4545 * mem_cgroup_move_account - move account of the page 4546 * @page: the page 4547 * @nr_pages: number of regular pages (>1 for huge pages) 4548 * @from: mem_cgroup which the page is moved from. 4549 * @to: mem_cgroup which the page is moved to. @from != @to. 4550 * 4551 * The caller must confirm following. 4552 * - page is not on LRU (isolate_page() is useful.) 4553 * - compound_lock is held when nr_pages > 1 4554 * 4555 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 4556 * from old cgroup. 4557 */ 4558 static int mem_cgroup_move_account(struct page *page, 4559 unsigned int nr_pages, 4560 struct mem_cgroup *from, 4561 struct mem_cgroup *to) 4562 { 4563 unsigned long flags; 4564 int ret; 4565 bool anon; 4566 4567 VM_BUG_ON(from == to); 4568 VM_BUG_ON_PAGE(PageLRU(page), page); 4569 /* 4570 * The page is isolated from LRU. So, collapse function 4571 * will not handle this page. But page splitting can happen. 4572 * Do this check under compound_page_lock(). The caller should 4573 * hold it. 4574 */ 4575 ret = -EBUSY; 4576 if (nr_pages > 1 && !PageTransHuge(page)) 4577 goto out; 4578 4579 /* 4580 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup 4581 * of its source page while we change it: page migration takes 4582 * both pages off the LRU, but page cache replacement doesn't. 4583 */ 4584 if (!trylock_page(page)) 4585 goto out; 4586 4587 ret = -EINVAL; 4588 if (page->mem_cgroup != from) 4589 goto out_unlock; 4590 4591 anon = PageAnon(page); 4592 4593 spin_lock_irqsave(&from->move_lock, flags); 4594 4595 if (!anon && page_mapped(page)) { 4596 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 4597 nr_pages); 4598 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 4599 nr_pages); 4600 } 4601 4602 /* 4603 * move_lock grabbed above and caller set from->moving_account, so 4604 * mem_cgroup_update_page_stat() will serialize updates to PageDirty. 4605 * So mapping should be stable for dirty pages. 4606 */ 4607 if (!anon && PageDirty(page)) { 4608 struct address_space *mapping = page_mapping(page); 4609 4610 if (mapping_cap_account_dirty(mapping)) { 4611 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], 4612 nr_pages); 4613 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], 4614 nr_pages); 4615 } 4616 } 4617 4618 if (PageWriteback(page)) { 4619 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 4620 nr_pages); 4621 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 4622 nr_pages); 4623 } 4624 4625 /* 4626 * It is safe to change page->mem_cgroup here because the page 4627 * is referenced, charged, and isolated - we can't race with 4628 * uncharging, charging, migration, or LRU putback. 
4629 */ 4630 4631 /* caller should have done css_get */ 4632 page->mem_cgroup = to; 4633 spin_unlock_irqrestore(&from->move_lock, flags); 4634 4635 ret = 0; 4636 4637 local_irq_disable(); 4638 mem_cgroup_charge_statistics(to, page, nr_pages); 4639 memcg_check_events(to, page); 4640 mem_cgroup_charge_statistics(from, page, -nr_pages); 4641 memcg_check_events(from, page); 4642 local_irq_enable(); 4643 out_unlock: 4644 unlock_page(page); 4645 out: 4646 return ret; 4647 } 4648 4649 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 4650 unsigned long addr, pte_t ptent, union mc_target *target) 4651 { 4652 struct page *page = NULL; 4653 enum mc_target_type ret = MC_TARGET_NONE; 4654 swp_entry_t ent = { .val = 0 }; 4655 4656 if (pte_present(ptent)) 4657 page = mc_handle_present_pte(vma, addr, ptent); 4658 else if (is_swap_pte(ptent)) 4659 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4660 else if (pte_none(ptent)) 4661 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4662 4663 if (!page && !ent.val) 4664 return ret; 4665 if (page) { 4666 /* 4667 * Do only loose check w/o serialization. 4668 * mem_cgroup_move_account() checks the page is valid or 4669 * not under LRU exclusion. 4670 */ 4671 if (page->mem_cgroup == mc.from) { 4672 ret = MC_TARGET_PAGE; 4673 if (target) 4674 target->page = page; 4675 } 4676 if (!ret || !target) 4677 put_page(page); 4678 } 4679 /* There is a swap entry and a page doesn't exist or isn't charged */ 4680 if (ent.val && !ret && 4681 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 4682 ret = MC_TARGET_SWAP; 4683 if (target) 4684 target->ent = ent; 4685 } 4686 return ret; 4687 } 4688 4689 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4690 /* 4691 * We don't consider swapping or file mapped pages because THP does not 4692 * support them for now. 4693 * Caller should make sure that pmd_trans_huge(pmd) is true. 
4694 */ 4695 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4696 unsigned long addr, pmd_t pmd, union mc_target *target) 4697 { 4698 struct page *page = NULL; 4699 enum mc_target_type ret = MC_TARGET_NONE; 4700 4701 page = pmd_page(pmd); 4702 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 4703 if (!(mc.flags & MOVE_ANON)) 4704 return ret; 4705 if (page->mem_cgroup == mc.from) { 4706 ret = MC_TARGET_PAGE; 4707 if (target) { 4708 get_page(page); 4709 target->page = page; 4710 } 4711 } 4712 return ret; 4713 } 4714 #else 4715 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4716 unsigned long addr, pmd_t pmd, union mc_target *target) 4717 { 4718 return MC_TARGET_NONE; 4719 } 4720 #endif 4721 4722 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 4723 unsigned long addr, unsigned long end, 4724 struct mm_walk *walk) 4725 { 4726 struct vm_area_struct *vma = walk->vma; 4727 pte_t *pte; 4728 spinlock_t *ptl; 4729 4730 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 4731 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 4732 mc.precharge += HPAGE_PMD_NR; 4733 spin_unlock(ptl); 4734 return 0; 4735 } 4736 4737 if (pmd_trans_unstable(pmd)) 4738 return 0; 4739 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4740 for (; addr != end; pte++, addr += PAGE_SIZE) 4741 if (get_mctgt_type(vma, addr, *pte, NULL)) 4742 mc.precharge++; /* increment precharge temporarily */ 4743 pte_unmap_unlock(pte - 1, ptl); 4744 cond_resched(); 4745 4746 return 0; 4747 } 4748 4749 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4750 { 4751 unsigned long precharge; 4752 4753 struct mm_walk mem_cgroup_count_precharge_walk = { 4754 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4755 .mm = mm, 4756 }; 4757 down_read(&mm->mmap_sem); 4758 walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); 4759 up_read(&mm->mmap_sem); 4760 4761 precharge = mc.precharge; 4762 mc.precharge = 0; 4763 4764 return precharge; 4765 } 4766 4767 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4768 { 4769 unsigned long precharge = mem_cgroup_count_precharge(mm); 4770 4771 VM_BUG_ON(mc.moving_task); 4772 mc.moving_task = current; 4773 return mem_cgroup_do_precharge(precharge); 4774 } 4775 4776 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 4777 static void __mem_cgroup_clear_mc(void) 4778 { 4779 struct mem_cgroup *from = mc.from; 4780 struct mem_cgroup *to = mc.to; 4781 4782 /* we must uncharge all the leftover precharges from mc.to */ 4783 if (mc.precharge) { 4784 cancel_charge(mc.to, mc.precharge); 4785 mc.precharge = 0; 4786 } 4787 /* 4788 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4789 * we must uncharge here. 4790 */ 4791 if (mc.moved_charge) { 4792 cancel_charge(mc.from, mc.moved_charge); 4793 mc.moved_charge = 0; 4794 } 4795 /* we must fixup refcnts and charges */ 4796 if (mc.moved_swap) { 4797 /* uncharge swap account from the old cgroup */ 4798 if (!mem_cgroup_is_root(mc.from)) 4799 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 4800 4801 /* 4802 * we charged both to->memory and to->memsw, so we 4803 * should uncharge to->memory. 
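 * The memsw charge stays with mc.to: the swap entries now carry
 * mc.to's id (see mem_cgroup_move_swap_account()), while mc.from's
 * memsw charge for them was dropped above.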
4804 */ 4805 if (!mem_cgroup_is_root(mc.to)) 4806 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 4807 4808 css_put_many(&mc.from->css, mc.moved_swap); 4809 4810 /* we've already done css_get(mc.to) */ 4811 mc.moved_swap = 0; 4812 } 4813 memcg_oom_recover(from); 4814 memcg_oom_recover(to); 4815 wake_up_all(&mc.waitq); 4816 } 4817 4818 static void mem_cgroup_clear_mc(void) 4819 { 4820 /* 4821 * we must clear moving_task before waking up waiters at the end of 4822 * task migration. 4823 */ 4824 mc.moving_task = NULL; 4825 __mem_cgroup_clear_mc(); 4826 spin_lock(&mc.lock); 4827 mc.from = NULL; 4828 mc.to = NULL; 4829 spin_unlock(&mc.lock); 4830 } 4831 4832 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 4833 struct cgroup_taskset *tset) 4834 { 4835 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4836 struct mem_cgroup *from; 4837 struct task_struct *p; 4838 struct mm_struct *mm; 4839 unsigned long move_flags; 4840 int ret = 0; 4841 4842 /* 4843 * We are now commited to this value whatever it is. Changes in this 4844 * tunable will only affect upcoming migrations, not the current one. 4845 * So we need to save it, and keep it going. 4846 */ 4847 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 4848 if (!move_flags) 4849 return 0; 4850 4851 p = cgroup_taskset_first(tset); 4852 from = mem_cgroup_from_task(p); 4853 4854 VM_BUG_ON(from == memcg); 4855 4856 mm = get_task_mm(p); 4857 if (!mm) 4858 return 0; 4859 /* We move charges only when we move a owner of the mm */ 4860 if (mm->owner == p) { 4861 VM_BUG_ON(mc.from); 4862 VM_BUG_ON(mc.to); 4863 VM_BUG_ON(mc.precharge); 4864 VM_BUG_ON(mc.moved_charge); 4865 VM_BUG_ON(mc.moved_swap); 4866 4867 spin_lock(&mc.lock); 4868 mc.from = from; 4869 mc.to = memcg; 4870 mc.flags = move_flags; 4871 spin_unlock(&mc.lock); 4872 /* We set mc.moving_task later */ 4873 4874 ret = mem_cgroup_precharge_mc(mm); 4875 if (ret) 4876 mem_cgroup_clear_mc(); 4877 } 4878 mmput(mm); 4879 return ret; 4880 } 4881 4882 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 4883 struct cgroup_taskset *tset) 4884 { 4885 if (mc.to) 4886 mem_cgroup_clear_mc(); 4887 } 4888 4889 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 4890 unsigned long addr, unsigned long end, 4891 struct mm_walk *walk) 4892 { 4893 int ret = 0; 4894 struct vm_area_struct *vma = walk->vma; 4895 pte_t *pte; 4896 spinlock_t *ptl; 4897 enum mc_target_type target_type; 4898 union mc_target target; 4899 struct page *page; 4900 4901 /* 4902 * We don't take compound_lock() here but no race with splitting thp 4903 * happens because: 4904 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 4905 * under splitting, which means there's no concurrent thp split, 4906 * - if another thread runs into split_huge_page() just after we 4907 * entered this if-block, the thread must wait for page table lock 4908 * to be unlocked in __split_huge_page_splitting(), where the main 4909 * part of thp split is not executed yet. 
4910 */ 4911 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 4912 if (mc.precharge < HPAGE_PMD_NR) { 4913 spin_unlock(ptl); 4914 return 0; 4915 } 4916 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 4917 if (target_type == MC_TARGET_PAGE) { 4918 page = target.page; 4919 if (!isolate_lru_page(page)) { 4920 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 4921 mc.from, mc.to)) { 4922 mc.precharge -= HPAGE_PMD_NR; 4923 mc.moved_charge += HPAGE_PMD_NR; 4924 } 4925 putback_lru_page(page); 4926 } 4927 put_page(page); 4928 } 4929 spin_unlock(ptl); 4930 return 0; 4931 } 4932 4933 if (pmd_trans_unstable(pmd)) 4934 return 0; 4935 retry: 4936 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4937 for (; addr != end; addr += PAGE_SIZE) { 4938 pte_t ptent = *(pte++); 4939 swp_entry_t ent; 4940 4941 if (!mc.precharge) 4942 break; 4943 4944 switch (get_mctgt_type(vma, addr, ptent, &target)) { 4945 case MC_TARGET_PAGE: 4946 page = target.page; 4947 if (isolate_lru_page(page)) 4948 goto put; 4949 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { 4950 mc.precharge--; 4951 /* we uncharge from mc.from later. */ 4952 mc.moved_charge++; 4953 } 4954 putback_lru_page(page); 4955 put: /* get_mctgt_type() gets the page */ 4956 put_page(page); 4957 break; 4958 case MC_TARGET_SWAP: 4959 ent = target.ent; 4960 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 4961 mc.precharge--; 4962 /* we fixup refcnts and charges later. */ 4963 mc.moved_swap++; 4964 } 4965 break; 4966 default: 4967 break; 4968 } 4969 } 4970 pte_unmap_unlock(pte - 1, ptl); 4971 cond_resched(); 4972 4973 if (addr != end) { 4974 /* 4975 * We have consumed all precharges we got in can_attach(). 4976 * We try charge one by one, but don't do any additional 4977 * charges to mc.to if we have failed in charge once in attach() 4978 * phase. 4979 */ 4980 ret = mem_cgroup_do_precharge(1); 4981 if (!ret) 4982 goto retry; 4983 } 4984 4985 return ret; 4986 } 4987 4988 static void mem_cgroup_move_charge(struct mm_struct *mm) 4989 { 4990 struct mm_walk mem_cgroup_move_charge_walk = { 4991 .pmd_entry = mem_cgroup_move_charge_pte_range, 4992 .mm = mm, 4993 }; 4994 4995 lru_add_drain_all(); 4996 /* 4997 * Signal mem_cgroup_begin_page_stat() to take the memcg's 4998 * move_lock while we're moving its pages to another memcg. 4999 * Then wait for already started RCU-only updates to finish. 5000 */ 5001 atomic_inc(&mc.from->moving_account); 5002 synchronize_rcu(); 5003 retry: 5004 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5005 /* 5006 * Someone who are holding the mmap_sem might be waiting in 5007 * waitq. So we cancel all extra charges, wake up all waiters, 5008 * and retry. Because we cancel precharges, we might not be able 5009 * to move enough charges, but moving charge is a best-effort 5010 * feature anyway, so it wouldn't be a big problem. 5011 */ 5012 __mem_cgroup_clear_mc(); 5013 cond_resched(); 5014 goto retry; 5015 } 5016 /* 5017 * When we have consumed all precharges and failed in doing 5018 * additional charge, the page walk just aborts. 
5019 */ 5020 walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); 5021 up_read(&mm->mmap_sem); 5022 atomic_dec(&mc.from->moving_account); 5023 } 5024 5025 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5026 struct cgroup_taskset *tset) 5027 { 5028 struct task_struct *p = cgroup_taskset_first(tset); 5029 struct mm_struct *mm = get_task_mm(p); 5030 5031 if (mm) { 5032 if (mc.to) 5033 mem_cgroup_move_charge(mm); 5034 mmput(mm); 5035 } 5036 if (mc.to) 5037 mem_cgroup_clear_mc(); 5038 } 5039 #else /* !CONFIG_MMU */ 5040 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5041 struct cgroup_taskset *tset) 5042 { 5043 return 0; 5044 } 5045 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5046 struct cgroup_taskset *tset) 5047 { 5048 } 5049 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5050 struct cgroup_taskset *tset) 5051 { 5052 } 5053 #endif 5054 5055 /* 5056 * Cgroup retains root cgroups across [un]mount cycles making it necessary 5057 * to verify whether we're attached to the default hierarchy on each mount 5058 * attempt. 5059 */ 5060 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 5061 { 5062 /* 5063 * use_hierarchy is forced on the default hierarchy. cgroup core 5064 * guarantees that @root doesn't have any children, so turning it 5065 * on for the root memcg is enough. 5066 */ 5067 if (cgroup_on_dfl(root_css->cgroup)) 5068 root_mem_cgroup->use_hierarchy = true; 5069 else 5070 root_mem_cgroup->use_hierarchy = false; 5071 } 5072 5073 static u64 memory_current_read(struct cgroup_subsys_state *css, 5074 struct cftype *cft) 5075 { 5076 return mem_cgroup_usage(mem_cgroup_from_css(css), false); 5077 } 5078 5079 static int memory_low_show(struct seq_file *m, void *v) 5080 { 5081 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5082 unsigned long low = READ_ONCE(memcg->low); 5083 5084 if (low == PAGE_COUNTER_MAX) 5085 seq_puts(m, "max\n"); 5086 else 5087 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); 5088 5089 return 0; 5090 } 5091 5092 static ssize_t memory_low_write(struct kernfs_open_file *of, 5093 char *buf, size_t nbytes, loff_t off) 5094 { 5095 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5096 unsigned long low; 5097 int err; 5098 5099 buf = strstrip(buf); 5100 err = page_counter_memparse(buf, "max", &low); 5101 if (err) 5102 return err; 5103 5104 memcg->low = low; 5105 5106 return nbytes; 5107 } 5108 5109 static int memory_high_show(struct seq_file *m, void *v) 5110 { 5111 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5112 unsigned long high = READ_ONCE(memcg->high); 5113 5114 if (high == PAGE_COUNTER_MAX) 5115 seq_puts(m, "max\n"); 5116 else 5117 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); 5118 5119 return 0; 5120 } 5121 5122 static ssize_t memory_high_write(struct kernfs_open_file *of, 5123 char *buf, size_t nbytes, loff_t off) 5124 { 5125 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5126 unsigned long high; 5127 int err; 5128 5129 buf = strstrip(buf); 5130 err = page_counter_memparse(buf, "max", &high); 5131 if (err) 5132 return err; 5133 5134 memcg->high = high; 5135 5136 memcg_wb_domain_size_changed(memcg); 5137 return nbytes; 5138 } 5139 5140 static int memory_max_show(struct seq_file *m, void *v) 5141 { 5142 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5143 unsigned long max = READ_ONCE(memcg->memory.limit); 5144 5145 if (max == PAGE_COUNTER_MAX) 5146 seq_puts(m, "max\n"); 5147 else 5148 seq_printf(m, "%llu\n", 
(u64)max * PAGE_SIZE); 5149 5150 return 0; 5151 } 5152 5153 static ssize_t memory_max_write(struct kernfs_open_file *of, 5154 char *buf, size_t nbytes, loff_t off) 5155 { 5156 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5157 unsigned long max; 5158 int err; 5159 5160 buf = strstrip(buf); 5161 err = page_counter_memparse(buf, "max", &max); 5162 if (err) 5163 return err; 5164 5165 err = mem_cgroup_resize_limit(memcg, max); 5166 if (err) 5167 return err; 5168 5169 memcg_wb_domain_size_changed(memcg); 5170 return nbytes; 5171 } 5172 5173 static int memory_events_show(struct seq_file *m, void *v) 5174 { 5175 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5176 5177 seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); 5178 seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); 5179 seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); 5180 seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); 5181 5182 return 0; 5183 } 5184 5185 static struct cftype memory_files[] = { 5186 { 5187 .name = "current", 5188 .read_u64 = memory_current_read, 5189 }, 5190 { 5191 .name = "low", 5192 .flags = CFTYPE_NOT_ON_ROOT, 5193 .seq_show = memory_low_show, 5194 .write = memory_low_write, 5195 }, 5196 { 5197 .name = "high", 5198 .flags = CFTYPE_NOT_ON_ROOT, 5199 .seq_show = memory_high_show, 5200 .write = memory_high_write, 5201 }, 5202 { 5203 .name = "max", 5204 .flags = CFTYPE_NOT_ON_ROOT, 5205 .seq_show = memory_max_show, 5206 .write = memory_max_write, 5207 }, 5208 { 5209 .name = "events", 5210 .flags = CFTYPE_NOT_ON_ROOT, 5211 .seq_show = memory_events_show, 5212 }, 5213 { } /* terminate */ 5214 }; 5215 5216 struct cgroup_subsys memory_cgrp_subsys = { 5217 .css_alloc = mem_cgroup_css_alloc, 5218 .css_online = mem_cgroup_css_online, 5219 .css_offline = mem_cgroup_css_offline, 5220 .css_free = mem_cgroup_css_free, 5221 .css_reset = mem_cgroup_css_reset, 5222 .can_attach = mem_cgroup_can_attach, 5223 .cancel_attach = mem_cgroup_cancel_attach, 5224 .attach = mem_cgroup_move_task, 5225 .bind = mem_cgroup_bind, 5226 .dfl_cftypes = memory_files, 5227 .legacy_cftypes = mem_cgroup_legacy_files, 5228 .early_init = 0, 5229 }; 5230 5231 /** 5232 * mem_cgroup_low - check if memory consumption is below the normal range 5233 * @root: the highest ancestor to consider 5234 * @memcg: the memory cgroup to check 5235 * 5236 * Returns %true if memory consumption of @memcg, and that of all 5237 * configurable ancestors up to @root, is below the normal range. 5238 */ 5239 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) 5240 { 5241 if (mem_cgroup_disabled()) 5242 return false; 5243 5244 /* 5245 * The toplevel group doesn't have a configurable range, so 5246 * it's never low when looked at directly, and it is not 5247 * considered an ancestor when assessing the hierarchy. 
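 *
 * For example, in a hierarchy (toplevel) -> A -> B with @root being the
 * toplevel group, the result depends only on the usage and "low"
 * settings of B and A; the toplevel group itself is ignored.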
5248 */ 5249 5250 if (memcg == root_mem_cgroup) 5251 return false; 5252 5253 if (page_counter_read(&memcg->memory) >= memcg->low) 5254 return false; 5255 5256 while (memcg != root) { 5257 memcg = parent_mem_cgroup(memcg); 5258 5259 if (memcg == root_mem_cgroup) 5260 break; 5261 5262 if (page_counter_read(&memcg->memory) >= memcg->low) 5263 return false; 5264 } 5265 return true; 5266 } 5267 5268 /** 5269 * mem_cgroup_try_charge - try charging a page 5270 * @page: page to charge 5271 * @mm: mm context of the victim 5272 * @gfp_mask: reclaim mode 5273 * @memcgp: charged memcg return 5274 * 5275 * Try to charge @page to the memcg that @mm belongs to, reclaiming 5276 * pages according to @gfp_mask if necessary. 5277 * 5278 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 5279 * Otherwise, an error code is returned. 5280 * 5281 * After page->mapping has been set up, the caller must finalize the 5282 * charge with mem_cgroup_commit_charge(). Or abort the transaction 5283 * with mem_cgroup_cancel_charge() in case page instantiation fails. 5284 */ 5285 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 5286 gfp_t gfp_mask, struct mem_cgroup **memcgp) 5287 { 5288 struct mem_cgroup *memcg = NULL; 5289 unsigned int nr_pages = 1; 5290 int ret = 0; 5291 5292 if (mem_cgroup_disabled()) 5293 goto out; 5294 5295 if (PageSwapCache(page)) { 5296 /* 5297 * Every swap fault against a single page tries to charge the 5298 * page, bail as early as possible. shmem_unuse() encounters 5299 * already charged pages, too. The USED bit is protected by 5300 * the page lock, which serializes swap cache removal, which 5301 * in turn serializes uncharging. 5302 */ 5303 VM_BUG_ON_PAGE(!PageLocked(page), page); 5304 if (page->mem_cgroup) 5305 goto out; 5306 5307 if (do_swap_account) { 5308 swp_entry_t ent = { .val = page_private(page), }; 5309 unsigned short id = lookup_swap_cgroup_id(ent); 5310 5311 rcu_read_lock(); 5312 memcg = mem_cgroup_from_id(id); 5313 if (memcg && !css_tryget_online(&memcg->css)) 5314 memcg = NULL; 5315 rcu_read_unlock(); 5316 } 5317 } 5318 5319 if (PageTransHuge(page)) { 5320 nr_pages <<= compound_order(page); 5321 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5322 } 5323 5324 if (!memcg) 5325 memcg = get_mem_cgroup_from_mm(mm); 5326 5327 ret = try_charge(memcg, gfp_mask, nr_pages); 5328 5329 css_put(&memcg->css); 5330 5331 if (ret == -EINTR) { 5332 memcg = root_mem_cgroup; 5333 ret = 0; 5334 } 5335 out: 5336 *memcgp = memcg; 5337 return ret; 5338 } 5339 5340 /** 5341 * mem_cgroup_commit_charge - commit a page charge 5342 * @page: page to charge 5343 * @memcg: memcg to charge the page to 5344 * @lrucare: page might be on LRU already 5345 * 5346 * Finalize a charge transaction started by mem_cgroup_try_charge(), 5347 * after page->mapping has been set up. This must happen atomically 5348 * as part of the page instantiation, i.e. under the page table lock 5349 * for anonymous pages, under the page lock for page and swap cache. 5350 * 5351 * In addition, the page must not be on the LRU during the commit, to 5352 * prevent racing with task migration. If it might be, use @lrucare. 5353 * 5354 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 
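 *
 * A minimal sketch of the whole transaction, assuming a fault handler
 * that has just allocated @page (the failure label and the step in the
 * middle are illustrative, not a specific in-tree call site):
 *
 *	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
 *		goto fail;
 *	...
 *	set up page->mapping / map the page
 *	...
 *	if (instantiation_failed) {
 *		mem_cgroup_cancel_charge(page, memcg);
 *		goto fail;
 *	}
 *	mem_cgroup_commit_charge(page, memcg, false);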
5355 */ 5356 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 5357 bool lrucare) 5358 { 5359 unsigned int nr_pages = 1; 5360 5361 VM_BUG_ON_PAGE(!page->mapping, page); 5362 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 5363 5364 if (mem_cgroup_disabled()) 5365 return; 5366 /* 5367 * Swap faults will attempt to charge the same page multiple 5368 * times. But reuse_swap_page() might have removed the page 5369 * from swapcache already, so we can't check PageSwapCache(). 5370 */ 5371 if (!memcg) 5372 return; 5373 5374 commit_charge(page, memcg, lrucare); 5375 5376 if (PageTransHuge(page)) { 5377 nr_pages <<= compound_order(page); 5378 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5379 } 5380 5381 local_irq_disable(); 5382 mem_cgroup_charge_statistics(memcg, page, nr_pages); 5383 memcg_check_events(memcg, page); 5384 local_irq_enable(); 5385 5386 if (do_swap_account && PageSwapCache(page)) { 5387 swp_entry_t entry = { .val = page_private(page) }; 5388 /* 5389 * The swap entry might not get freed for a long time, 5390 * let's not wait for it. The page already received a 5391 * memory+swap charge, drop the swap entry duplicate. 5392 */ 5393 mem_cgroup_uncharge_swap(entry); 5394 } 5395 } 5396 5397 /** 5398 * mem_cgroup_cancel_charge - cancel a page charge 5399 * @page: page to charge 5400 * @memcg: memcg to charge the page to 5401 * 5402 * Cancel a charge transaction started by mem_cgroup_try_charge(). 5403 */ 5404 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 5405 { 5406 unsigned int nr_pages = 1; 5407 5408 if (mem_cgroup_disabled()) 5409 return; 5410 /* 5411 * Swap faults will attempt to charge the same page multiple 5412 * times. But reuse_swap_page() might have removed the page 5413 * from swapcache already, so we can't check PageSwapCache(). 
5414 */ 5415 if (!memcg) 5416 return; 5417 5418 if (PageTransHuge(page)) { 5419 nr_pages <<= compound_order(page); 5420 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5421 } 5422 5423 cancel_charge(memcg, nr_pages); 5424 } 5425 5426 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5427 unsigned long nr_anon, unsigned long nr_file, 5428 unsigned long nr_huge, struct page *dummy_page) 5429 { 5430 unsigned long nr_pages = nr_anon + nr_file; 5431 unsigned long flags; 5432 5433 if (!mem_cgroup_is_root(memcg)) { 5434 page_counter_uncharge(&memcg->memory, nr_pages); 5435 if (do_swap_account) 5436 page_counter_uncharge(&memcg->memsw, nr_pages); 5437 memcg_oom_recover(memcg); 5438 } 5439 5440 local_irq_save(flags); 5441 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 5442 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 5443 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 5444 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 5445 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 5446 memcg_check_events(memcg, dummy_page); 5447 local_irq_restore(flags); 5448 5449 if (!mem_cgroup_is_root(memcg)) 5450 css_put_many(&memcg->css, nr_pages); 5451 } 5452 5453 static void uncharge_list(struct list_head *page_list) 5454 { 5455 struct mem_cgroup *memcg = NULL; 5456 unsigned long nr_anon = 0; 5457 unsigned long nr_file = 0; 5458 unsigned long nr_huge = 0; 5459 unsigned long pgpgout = 0; 5460 struct list_head *next; 5461 struct page *page; 5462 5463 next = page_list->next; 5464 do { 5465 unsigned int nr_pages = 1; 5466 5467 page = list_entry(next, struct page, lru); 5468 next = page->lru.next; 5469 5470 VM_BUG_ON_PAGE(PageLRU(page), page); 5471 VM_BUG_ON_PAGE(page_count(page), page); 5472 5473 if (!page->mem_cgroup) 5474 continue; 5475 5476 /* 5477 * Nobody should be changing or seriously looking at 5478 * page->mem_cgroup at this point, we have fully 5479 * exclusive access to the page. 5480 */ 5481 5482 if (memcg != page->mem_cgroup) { 5483 if (memcg) { 5484 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5485 nr_huge, page); 5486 pgpgout = nr_anon = nr_file = nr_huge = 0; 5487 } 5488 memcg = page->mem_cgroup; 5489 } 5490 5491 if (PageTransHuge(page)) { 5492 nr_pages <<= compound_order(page); 5493 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5494 nr_huge += nr_pages; 5495 } 5496 5497 if (PageAnon(page)) 5498 nr_anon += nr_pages; 5499 else 5500 nr_file += nr_pages; 5501 5502 page->mem_cgroup = NULL; 5503 5504 pgpgout++; 5505 } while (next != page_list); 5506 5507 if (memcg) 5508 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5509 nr_huge, page); 5510 } 5511 5512 /** 5513 * mem_cgroup_uncharge - uncharge a page 5514 * @page: page to uncharge 5515 * 5516 * Uncharge a page previously charged with mem_cgroup_try_charge() and 5517 * mem_cgroup_commit_charge(). 5518 */ 5519 void mem_cgroup_uncharge(struct page *page) 5520 { 5521 if (mem_cgroup_disabled()) 5522 return; 5523 5524 /* Don't touch page->lru of any random page, pre-check: */ 5525 if (!page->mem_cgroup) 5526 return; 5527 5528 INIT_LIST_HEAD(&page->lru); 5529 uncharge_list(&page->lru); 5530 } 5531 5532 /** 5533 * mem_cgroup_uncharge_list - uncharge a list of page 5534 * @page_list: list of pages to uncharge 5535 * 5536 * Uncharge a list of pages previously charged with 5537 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 
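 *
 * A minimal sketch of a bulk-freeing caller (the surrounding list
 * handling is illustrative, not a specific in-tree call site); the
 * pages must already be off the LRU with no references left:
 *
 *	LIST_HEAD(pages_to_free);
 *
 *	...collect such pages on &pages_to_free via their page->lru...
 *	mem_cgroup_uncharge_list(&pages_to_free);
 *	...hand the list back to the page allocator...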
5538 */ 5539 void mem_cgroup_uncharge_list(struct list_head *page_list) 5540 { 5541 if (mem_cgroup_disabled()) 5542 return; 5543 5544 if (!list_empty(page_list)) 5545 uncharge_list(page_list); 5546 } 5547 5548 /** 5549 * mem_cgroup_migrate - migrate a charge to another page 5550 * @oldpage: currently charged page 5551 * @newpage: page to transfer the charge to 5552 * @lrucare: either or both pages might be on the LRU already 5553 * 5554 * Migrate the charge from @oldpage to @newpage. 5555 * 5556 * Both pages must be locked, @newpage->mapping must be set up. 5557 */ 5558 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 5559 bool lrucare) 5560 { 5561 struct mem_cgroup *memcg; 5562 int isolated; 5563 5564 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5565 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 5566 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); 5567 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); 5568 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 5569 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 5570 newpage); 5571 5572 if (mem_cgroup_disabled()) 5573 return; 5574 5575 /* Page cache replacement: new page already charged? */ 5576 if (newpage->mem_cgroup) 5577 return; 5578 5579 /* 5580 * Swapcache readahead pages can get migrated before being 5581 * charged, and migration from compaction can happen to an 5582 * uncharged page when the PFN walker finds a page that 5583 * reclaim just put back on the LRU but has not released yet. 5584 */ 5585 memcg = oldpage->mem_cgroup; 5586 if (!memcg) 5587 return; 5588 5589 if (lrucare) 5590 lock_page_lru(oldpage, &isolated); 5591 5592 oldpage->mem_cgroup = NULL; 5593 5594 if (lrucare) 5595 unlock_page_lru(oldpage, isolated); 5596 5597 commit_charge(newpage, memcg, lrucare); 5598 } 5599 5600 /* 5601 * subsys_initcall() for memory controller. 5602 * 5603 * Some parts like hotcpu_notifier() have to be initialized from this context 5604 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 5605 * everything that doesn't depend on a specific mem_cgroup structure should 5606 * be initialized from here. 5607 */ 5608 static int __init mem_cgroup_init(void) 5609 { 5610 int cpu, node; 5611 5612 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5613 5614 for_each_possible_cpu(cpu) 5615 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 5616 drain_local_stock); 5617 5618 for_each_node(node) { 5619 struct mem_cgroup_tree_per_node *rtpn; 5620 int zone; 5621 5622 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 5623 node_online(node) ? node : NUMA_NO_NODE); 5624 5625 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5626 struct mem_cgroup_tree_per_zone *rtpz; 5627 5628 rtpz = &rtpn->rb_tree_per_zone[zone]; 5629 rtpz->rb_root = RB_ROOT; 5630 spin_lock_init(&rtpz->lock); 5631 } 5632 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5633 } 5634 5635 return 0; 5636 } 5637 subsys_initcall(mem_cgroup_init); 5638 5639 #ifdef CONFIG_MEMCG_SWAP 5640 /** 5641 * mem_cgroup_swapout - transfer a memsw charge to swap 5642 * @page: page whose memsw charge to transfer 5643 * @entry: swap entry to move the charge to 5644 * 5645 * Transfer the memsw charge of @page to @entry. 
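 *
 * The caller must own @page exclusively (off the LRU, no references
 * left) and, as noted at the bottom of the function, must hold
 * mapping->tree_lock with interrupts disabled.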
5646 */ 5647 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5648 { 5649 struct mem_cgroup *memcg; 5650 unsigned short oldid; 5651 5652 VM_BUG_ON_PAGE(PageLRU(page), page); 5653 VM_BUG_ON_PAGE(page_count(page), page); 5654 5655 if (!do_swap_account) 5656 return; 5657 5658 memcg = page->mem_cgroup; 5659 5660 /* Readahead page, never charged */ 5661 if (!memcg) 5662 return; 5663 5664 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); 5665 VM_BUG_ON_PAGE(oldid, page); 5666 mem_cgroup_swap_statistics(memcg, true); 5667 5668 page->mem_cgroup = NULL; 5669 5670 if (!mem_cgroup_is_root(memcg)) 5671 page_counter_uncharge(&memcg->memory, 1); 5672 5673 /* 5674 * Interrupts should be disabled here because the caller holds the 5675 * mapping->tree_lock lock which is taken with interrupts-off. It is 5676 * important here to have the interrupts disabled because it is the 5677 * only synchronisation we have for udpating the per-CPU variables. 5678 */ 5679 VM_BUG_ON(!irqs_disabled()); 5680 mem_cgroup_charge_statistics(memcg, page, -1); 5681 memcg_check_events(memcg, page); 5682 } 5683 5684 /** 5685 * mem_cgroup_uncharge_swap - uncharge a swap entry 5686 * @entry: swap entry to uncharge 5687 * 5688 * Drop the memsw charge associated with @entry. 5689 */ 5690 void mem_cgroup_uncharge_swap(swp_entry_t entry) 5691 { 5692 struct mem_cgroup *memcg; 5693 unsigned short id; 5694 5695 if (!do_swap_account) 5696 return; 5697 5698 id = swap_cgroup_record(entry, 0); 5699 rcu_read_lock(); 5700 memcg = mem_cgroup_from_id(id); 5701 if (memcg) { 5702 if (!mem_cgroup_is_root(memcg)) 5703 page_counter_uncharge(&memcg->memsw, 1); 5704 mem_cgroup_swap_statistics(memcg, false); 5705 css_put(&memcg->css); 5706 } 5707 rcu_read_unlock(); 5708 } 5709 5710 /* for remember boot option*/ 5711 #ifdef CONFIG_MEMCG_SWAP_ENABLED 5712 static int really_do_swap_account __initdata = 1; 5713 #else 5714 static int really_do_swap_account __initdata; 5715 #endif 5716 5717 static int __init enable_swap_account(char *s) 5718 { 5719 if (!strcmp(s, "1")) 5720 really_do_swap_account = 1; 5721 else if (!strcmp(s, "0")) 5722 really_do_swap_account = 0; 5723 return 1; 5724 } 5725 __setup("swapaccount=", enable_swap_account); 5726 5727 static struct cftype memsw_cgroup_files[] = { 5728 { 5729 .name = "memsw.usage_in_bytes", 5730 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 5731 .read_u64 = mem_cgroup_read_u64, 5732 }, 5733 { 5734 .name = "memsw.max_usage_in_bytes", 5735 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 5736 .write = mem_cgroup_reset, 5737 .read_u64 = mem_cgroup_read_u64, 5738 }, 5739 { 5740 .name = "memsw.limit_in_bytes", 5741 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 5742 .write = mem_cgroup_write, 5743 .read_u64 = mem_cgroup_read_u64, 5744 }, 5745 { 5746 .name = "memsw.failcnt", 5747 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 5748 .write = mem_cgroup_reset, 5749 .read_u64 = mem_cgroup_read_u64, 5750 }, 5751 { }, /* terminate */ 5752 }; 5753 5754 static int __init mem_cgroup_swap_init(void) 5755 { 5756 if (!mem_cgroup_disabled() && really_do_swap_account) { 5757 do_swap_account = 1; 5758 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 5759 memsw_cgroup_files)); 5760 } 5761 return 0; 5762 } 5763 subsys_initcall(mem_cgroup_swap_init); 5764 5765 #endif /* CONFIG_MEMCG_SWAP */ 5766
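/*
 * Note on the swapaccount= parameter handled above: booting with
 * "swapaccount=0" leaves do_swap_account at zero even when
 * CONFIG_MEMCG_SWAP_ENABLED is set, so the memsw.* control files are
 * never registered and mem_cgroup_swapout()/mem_cgroup_uncharge_swap()
 * return early.
 */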