1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * Kernel Memory Controller 14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 15 * Authors: Glauber Costa and Suleiman Souhlal 16 * 17 * Native page reclaim 18 * Charge lifetime sanitation 19 * Lockless page tracking & accounting 20 * Unified hierarchy configuration model 21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner 22 * 23 * This program is free software; you can redistribute it and/or modify 24 * it under the terms of the GNU General Public License as published by 25 * the Free Software Foundation; either version 2 of the License, or 26 * (at your option) any later version. 27 * 28 * This program is distributed in the hope that it will be useful, 29 * but WITHOUT ANY WARRANTY; without even the implied warranty of 30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 * GNU General Public License for more details. 32 */ 33 34 #include <linux/page_counter.h> 35 #include <linux/memcontrol.h> 36 #include <linux/cgroup.h> 37 #include <linux/mm.h> 38 #include <linux/hugetlb.h> 39 #include <linux/pagemap.h> 40 #include <linux/smp.h> 41 #include <linux/page-flags.h> 42 #include <linux/backing-dev.h> 43 #include <linux/bit_spinlock.h> 44 #include <linux/rcupdate.h> 45 #include <linux/limits.h> 46 #include <linux/export.h> 47 #include <linux/mutex.h> 48 #include <linux/rbtree.h> 49 #include <linux/slab.h> 50 #include <linux/swap.h> 51 #include <linux/swapops.h> 52 #include <linux/spinlock.h> 53 #include <linux/eventfd.h> 54 #include <linux/poll.h> 55 #include <linux/sort.h> 56 #include <linux/fs.h> 57 #include <linux/seq_file.h> 58 #include <linux/vmpressure.h> 59 #include <linux/mm_inline.h> 60 #include <linux/swap_cgroup.h> 61 #include <linux/cpu.h> 62 #include <linux/oom.h> 63 #include <linux/lockdep.h> 64 #include <linux/file.h> 65 #include "internal.h" 66 #include <net/sock.h> 67 #include <net/ip.h> 68 #include <net/tcp_memcontrol.h> 69 #include "slab.h" 70 71 #include <asm/uaccess.h> 72 73 #include <trace/events/vmscan.h> 74 75 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 76 EXPORT_SYMBOL(memory_cgrp_subsys); 77 78 #define MEM_CGROUP_RECLAIM_RETRIES 5 79 static struct mem_cgroup *root_mem_cgroup __read_mostly; 80 struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; 81 82 /* Whether the swap controller is active */ 83 #ifdef CONFIG_MEMCG_SWAP 84 int do_swap_account __read_mostly; 85 #else 86 #define do_swap_account 0 87 #endif 88 89 static const char * const mem_cgroup_stat_names[] = { 90 "cache", 91 "rss", 92 "rss_huge", 93 "mapped_file", 94 "dirty", 95 "writeback", 96 "swap", 97 }; 98 99 static const char * const mem_cgroup_events_names[] = { 100 "pgpgin", 101 "pgpgout", 102 "pgfault", 103 "pgmajfault", 104 }; 105 106 static const char * const mem_cgroup_lru_names[] = { 107 "inactive_anon", 108 "active_anon", 109 "inactive_file", 110 "active_file", 111 "unevictable", 112 }; 113 114 #define THRESHOLDS_EVENTS_TARGET 128 115 #define SOFTLIMIT_EVENTS_TARGET 1024 116 #define NUMAINFO_EVENTS_TARGET 1024 117 118 /* 119 * Cgroups above their limits are maintained in a RB-Tree, independent of 120 * their hierarchy representation 121 */ 122 123 struct mem_cgroup_tree_per_zone { 124 struct rb_root rb_root; 125 spinlock_t lock; 126 }; 
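/*
 * Illustrative sketch only (kept out of the build with #if 0, not part of
 * the original file): the name arrays above are indexed by the same enums
 * as the per-cpu statistics, so reporting code simply walks the enum and
 * pairs each name with its counter, as mem_cgroup_print_oom_info() does
 * later in this file.  mem_cgroup_read_stat() is also defined further down.
 */
#if 0
static void example_dump_stat_names(struct mem_cgroup *memcg)
{
	unsigned int i;

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
		pr_info("%s: %ld\n", mem_cgroup_stat_names[i],
			mem_cgroup_read_stat(memcg, i));
}
#endif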
127 128 struct mem_cgroup_tree_per_node { 129 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 130 }; 131 132 struct mem_cgroup_tree { 133 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 134 }; 135 136 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 137 138 /* for OOM */ 139 struct mem_cgroup_eventfd_list { 140 struct list_head list; 141 struct eventfd_ctx *eventfd; 142 }; 143 144 /* 145 * cgroup_event represents events which userspace want to receive. 146 */ 147 struct mem_cgroup_event { 148 /* 149 * memcg which the event belongs to. 150 */ 151 struct mem_cgroup *memcg; 152 /* 153 * eventfd to signal userspace about the event. 154 */ 155 struct eventfd_ctx *eventfd; 156 /* 157 * Each of these stored in a list by the cgroup. 158 */ 159 struct list_head list; 160 /* 161 * register_event() callback will be used to add new userspace 162 * waiter for changes related to this event. Use eventfd_signal() 163 * on eventfd to send notification to userspace. 164 */ 165 int (*register_event)(struct mem_cgroup *memcg, 166 struct eventfd_ctx *eventfd, const char *args); 167 /* 168 * unregister_event() callback will be called when userspace closes 169 * the eventfd or on cgroup removing. This callback must be set, 170 * if you want provide notification functionality. 171 */ 172 void (*unregister_event)(struct mem_cgroup *memcg, 173 struct eventfd_ctx *eventfd); 174 /* 175 * All fields below needed to unregister event when 176 * userspace closes eventfd. 177 */ 178 poll_table pt; 179 wait_queue_head_t *wqh; 180 wait_queue_t wait; 181 struct work_struct remove; 182 }; 183 184 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 185 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 186 187 /* Stuffs for move charges at task migration. */ 188 /* 189 * Types of charges to be moved. 190 */ 191 #define MOVE_ANON 0x1U 192 #define MOVE_FILE 0x2U 193 #define MOVE_MASK (MOVE_ANON | MOVE_FILE) 194 195 /* "mc" and its members are protected by cgroup_mutex */ 196 static struct move_charge_struct { 197 spinlock_t lock; /* for from, to */ 198 struct mem_cgroup *from; 199 struct mem_cgroup *to; 200 unsigned long flags; 201 unsigned long precharge; 202 unsigned long moved_charge; 203 unsigned long moved_swap; 204 struct task_struct *moving_task; /* a task moving charges */ 205 wait_queue_head_t waitq; /* a waitq for other context */ 206 } mc = { 207 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 208 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 209 }; 210 211 /* 212 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 213 * limit reclaim to prevent infinite loops, if they ever occur. 214 */ 215 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 216 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 217 218 enum charge_type { 219 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 220 MEM_CGROUP_CHARGE_TYPE_ANON, 221 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 222 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 223 NR_CHARGE_TYPE, 224 }; 225 226 /* for encoding cft->private value on file */ 227 enum res_type { 228 _MEM, 229 _MEMSWAP, 230 _OOM_TYPE, 231 _KMEM, 232 }; 233 234 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 235 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 236 #define MEMFILE_ATTR(val) ((val) & 0xffff) 237 /* Used for OOM nofiier */ 238 #define OOM_CONTROL (0) 239 240 /* 241 * The memcg_create_mutex will be held whenever a new cgroup is created. 
242 * As a consequence, any change that needs to protect against new child cgroups 243 * appearing has to hold it as well. 244 */ 245 static DEFINE_MUTEX(memcg_create_mutex); 246 247 /* Some nice accessors for the vmpressure. */ 248 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 249 { 250 if (!memcg) 251 memcg = root_mem_cgroup; 252 return &memcg->vmpressure; 253 } 254 255 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 256 { 257 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 258 } 259 260 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 261 { 262 return (memcg == root_mem_cgroup); 263 } 264 265 /* 266 * We restrict the id in the range of [1, 65535], so it can fit into 267 * an unsigned short. 268 */ 269 #define MEM_CGROUP_ID_MAX USHRT_MAX 270 271 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 272 { 273 return memcg->css.id; 274 } 275 276 /* 277 * A helper function to get mem_cgroup from ID. must be called under 278 * rcu_read_lock(). The caller is responsible for calling 279 * css_tryget_online() if the mem_cgroup is used for charging. (dropping 280 * refcnt from swap can be called against removed memcg.) 281 */ 282 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 283 { 284 struct cgroup_subsys_state *css; 285 286 css = css_from_id(id, &memory_cgrp_subsys); 287 return mem_cgroup_from_css(css); 288 } 289 290 /* Writing them here to avoid exposing memcg's inner layout */ 291 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 292 293 void sock_update_memcg(struct sock *sk) 294 { 295 if (mem_cgroup_sockets_enabled) { 296 struct mem_cgroup *memcg; 297 struct cg_proto *cg_proto; 298 299 BUG_ON(!sk->sk_prot->proto_cgroup); 300 301 /* Socket cloning can throw us here with sk_cgrp already 302 * filled. It won't however, necessarily happen from 303 * process context. So the test for root memcg given 304 * the current task's memcg won't help us in this case. 305 * 306 * Respecting the original socket's memcg is a better 307 * decision in this case. 308 */ 309 if (sk->sk_cgrp) { 310 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 311 css_get(&sk->sk_cgrp->memcg->css); 312 return; 313 } 314 315 rcu_read_lock(); 316 memcg = mem_cgroup_from_task(current); 317 cg_proto = sk->sk_prot->proto_cgroup(memcg); 318 if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && 319 css_tryget_online(&memcg->css)) { 320 sk->sk_cgrp = cg_proto; 321 } 322 rcu_read_unlock(); 323 } 324 } 325 EXPORT_SYMBOL(sock_update_memcg); 326 327 void sock_release_memcg(struct sock *sk) 328 { 329 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 330 struct mem_cgroup *memcg; 331 WARN_ON(!sk->sk_cgrp->memcg); 332 memcg = sk->sk_cgrp->memcg; 333 css_put(&sk->sk_cgrp->memcg->css); 334 } 335 } 336 337 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 338 { 339 if (!memcg || mem_cgroup_is_root(memcg)) 340 return NULL; 341 342 return &memcg->tcp_mem; 343 } 344 EXPORT_SYMBOL(tcp_proto_cgroup); 345 346 #endif 347 348 #ifdef CONFIG_MEMCG_KMEM 349 /* 350 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 351 * The main reason for not using cgroup id for this: 352 * this works better in sparse environments, where we have a lot of memcgs, 353 * but only a few kmem-limited. Or also, if we have, for instance, 200 354 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 355 * 200 entry array for that. 
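 *
 * With the ida below, such a lone kmem-limited cgroup instead gets the
 * first free index in an array that starts at MEMCG_CACHES_MIN_SIZE
 * entries and only grows when more kmem-limited cgroups appear.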
356 * 357 * The current size of the caches array is stored in memcg_nr_cache_ids. It 358 * will double each time we have to increase it. 359 */ 360 static DEFINE_IDA(memcg_cache_ida); 361 int memcg_nr_cache_ids; 362 363 /* Protects memcg_nr_cache_ids */ 364 static DECLARE_RWSEM(memcg_cache_ids_sem); 365 366 void memcg_get_cache_ids(void) 367 { 368 down_read(&memcg_cache_ids_sem); 369 } 370 371 void memcg_put_cache_ids(void) 372 { 373 up_read(&memcg_cache_ids_sem); 374 } 375 376 /* 377 * MIN_SIZE is different than 1, because we would like to avoid going through 378 * the alloc/free process all the time. In a small machine, 4 kmem-limited 379 * cgroups is a reasonable guess. In the future, it could be a parameter or 380 * tunable, but that is strictly not necessary. 381 * 382 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 383 * this constant directly from cgroup, but it is understandable that this is 384 * better kept as an internal representation in cgroup.c. In any case, the 385 * cgrp_id space is not getting any smaller, and we don't have to necessarily 386 * increase ours as well if it increases. 387 */ 388 #define MEMCG_CACHES_MIN_SIZE 4 389 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 390 391 /* 392 * A lot of the calls to the cache allocation functions are expected to be 393 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are 394 * conditional to this static branch, we'll have to allow modules that does 395 * kmem_cache_alloc and the such to see this symbol as well 396 */ 397 struct static_key memcg_kmem_enabled_key; 398 EXPORT_SYMBOL(memcg_kmem_enabled_key); 399 400 #endif /* CONFIG_MEMCG_KMEM */ 401 402 static struct mem_cgroup_per_zone * 403 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 404 { 405 int nid = zone_to_nid(zone); 406 int zid = zone_idx(zone); 407 408 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 409 } 410 411 /** 412 * mem_cgroup_css_from_page - css of the memcg associated with a page 413 * @page: page of interest 414 * 415 * If memcg is bound to the default hierarchy, css of the memcg associated 416 * with @page is returned. The returned css remains associated with @page 417 * until it is released. 418 * 419 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup 420 * is returned. 421 * 422 * XXX: The above description of behavior on the default hierarchy isn't 423 * strictly true yet as replace_page_cache_page() can modify the 424 * association before @page is released even on the default hierarchy; 425 * however, the current and planned usages don't mix the the two functions 426 * and replace_page_cache_page() will soon be updated to make the invariant 427 * actually true. 428 */ 429 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) 430 { 431 struct mem_cgroup *memcg; 432 433 rcu_read_lock(); 434 435 memcg = page->mem_cgroup; 436 437 if (!memcg || !cgroup_on_dfl(memcg->css.cgroup)) 438 memcg = root_mem_cgroup; 439 440 rcu_read_unlock(); 441 return &memcg->css; 442 } 443 444 /** 445 * page_cgroup_ino - return inode number of the memcg a page is charged to 446 * @page: the page 447 * 448 * Look up the closest online ancestor of the memory cgroup @page is charged to 449 * and return its inode number or 0 if @page is not charged to any cgroup. It 450 * is safe to call this function without holding a reference to @page. 
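 *
 * A typical reader (illustrative sketch; "out" is a hypothetical
 * destination pointer) just copies the result out:
 *
 *	ino = page_cgroup_ino(page);
 *	if (put_user(ino, out))
 *		return -EFAULT;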
451 * 452 * Note, this function is inherently racy, because there is nothing to prevent 453 * the cgroup inode from getting torn down and potentially reallocated a moment 454 * after page_cgroup_ino() returns, so it only should be used by callers that 455 * do not care (such as procfs interfaces). 456 */ 457 ino_t page_cgroup_ino(struct page *page) 458 { 459 struct mem_cgroup *memcg; 460 unsigned long ino = 0; 461 462 rcu_read_lock(); 463 memcg = READ_ONCE(page->mem_cgroup); 464 while (memcg && !(memcg->css.flags & CSS_ONLINE)) 465 memcg = parent_mem_cgroup(memcg); 466 if (memcg) 467 ino = cgroup_ino(memcg->css.cgroup); 468 rcu_read_unlock(); 469 return ino; 470 } 471 472 static struct mem_cgroup_per_zone * 473 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 474 { 475 int nid = page_to_nid(page); 476 int zid = page_zonenum(page); 477 478 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 479 } 480 481 static struct mem_cgroup_tree_per_zone * 482 soft_limit_tree_node_zone(int nid, int zid) 483 { 484 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 485 } 486 487 static struct mem_cgroup_tree_per_zone * 488 soft_limit_tree_from_page(struct page *page) 489 { 490 int nid = page_to_nid(page); 491 int zid = page_zonenum(page); 492 493 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 494 } 495 496 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 497 struct mem_cgroup_tree_per_zone *mctz, 498 unsigned long new_usage_in_excess) 499 { 500 struct rb_node **p = &mctz->rb_root.rb_node; 501 struct rb_node *parent = NULL; 502 struct mem_cgroup_per_zone *mz_node; 503 504 if (mz->on_tree) 505 return; 506 507 mz->usage_in_excess = new_usage_in_excess; 508 if (!mz->usage_in_excess) 509 return; 510 while (*p) { 511 parent = *p; 512 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 513 tree_node); 514 if (mz->usage_in_excess < mz_node->usage_in_excess) 515 p = &(*p)->rb_left; 516 /* 517 * We can't avoid mem cgroups that are over their soft 518 * limit by the same amount 519 */ 520 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 521 p = &(*p)->rb_right; 522 } 523 rb_link_node(&mz->tree_node, parent, p); 524 rb_insert_color(&mz->tree_node, &mctz->rb_root); 525 mz->on_tree = true; 526 } 527 528 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 529 struct mem_cgroup_tree_per_zone *mctz) 530 { 531 if (!mz->on_tree) 532 return; 533 rb_erase(&mz->tree_node, &mctz->rb_root); 534 mz->on_tree = false; 535 } 536 537 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 538 struct mem_cgroup_tree_per_zone *mctz) 539 { 540 unsigned long flags; 541 542 spin_lock_irqsave(&mctz->lock, flags); 543 __mem_cgroup_remove_exceeded(mz, mctz); 544 spin_unlock_irqrestore(&mctz->lock, flags); 545 } 546 547 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 548 { 549 unsigned long nr_pages = page_counter_read(&memcg->memory); 550 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 551 unsigned long excess = 0; 552 553 if (nr_pages > soft_limit) 554 excess = nr_pages - soft_limit; 555 556 return excess; 557 } 558 559 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 560 { 561 unsigned long excess; 562 struct mem_cgroup_per_zone *mz; 563 struct mem_cgroup_tree_per_zone *mctz; 564 565 mctz = soft_limit_tree_from_page(page); 566 /* 567 * Necessary to update all ancestors when hierarchy is used. 568 * because their event counter is not touched. 
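	 * (memcg_check_events() only fires for the memcg the page is
	 * charged to, so without this walk the ancestors would never be
	 * reinserted into their soft-limit trees.)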
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_zoneinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_zone *mctz;
	struct mem_cgroup_per_zone *mz;
	int nid, zid;

	for_each_node(nid) {
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			mctz = soft_limit_tree_node_zone(nid, zid);
			mem_cgroup_remove_exceeded(mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use thresholds and periodic
 * synchronization to implement "quick" reads; there is a trade-off between
 * reading cost and the precision of the value. We could implement a similar
 * periodic synchronization for memcg's counters.
 *
 * But this _read() function is currently used for the user interface.
 * Users account memory usage per memory cgroup and always require an exact
 * value. Even if we provided a quick-and-fuzzy read, we would still have to
 * visit all online cpus and compute the sum, so for now the extra
 * synchronization is not implemented (it exists only for cpu hotplug).
 *
 * If kernel-internal users appear that can make do with a not-exact value,
 * and reading all cpu values becomes a performance bottleneck in some
 * common workload, a threshold and synchronization scheme like vmstat's
 * should be implemented.
664 */ 665 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 666 enum mem_cgroup_stat_index idx) 667 { 668 long val = 0; 669 int cpu; 670 671 for_each_possible_cpu(cpu) 672 val += per_cpu(memcg->stat->count[idx], cpu); 673 return val; 674 } 675 676 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 677 enum mem_cgroup_events_index idx) 678 { 679 unsigned long val = 0; 680 int cpu; 681 682 for_each_possible_cpu(cpu) 683 val += per_cpu(memcg->stat->events[idx], cpu); 684 return val; 685 } 686 687 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 688 struct page *page, 689 int nr_pages) 690 { 691 /* 692 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 693 * counted as CACHE even if it's on ANON LRU. 694 */ 695 if (PageAnon(page)) 696 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 697 nr_pages); 698 else 699 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 700 nr_pages); 701 702 if (PageTransHuge(page)) 703 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 704 nr_pages); 705 706 /* pagein of a big page is an event. So, ignore page size */ 707 if (nr_pages > 0) 708 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 709 else { 710 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 711 nr_pages = -nr_pages; /* for event */ 712 } 713 714 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 715 } 716 717 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 718 int nid, 719 unsigned int lru_mask) 720 { 721 unsigned long nr = 0; 722 int zid; 723 724 VM_BUG_ON((unsigned)nid >= nr_node_ids); 725 726 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 727 struct mem_cgroup_per_zone *mz; 728 enum lru_list lru; 729 730 for_each_lru(lru) { 731 if (!(BIT(lru) & lru_mask)) 732 continue; 733 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 734 nr += mz->lru_size[lru]; 735 } 736 } 737 return nr; 738 } 739 740 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 741 unsigned int lru_mask) 742 { 743 unsigned long nr = 0; 744 int nid; 745 746 for_each_node_state(nid, N_MEMORY) 747 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 748 return nr; 749 } 750 751 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 752 enum mem_cgroup_events_target target) 753 { 754 unsigned long val, next; 755 756 val = __this_cpu_read(memcg->stat->nr_page_events); 757 next = __this_cpu_read(memcg->stat->targets[target]); 758 /* from time_after() in jiffies.h */ 759 if ((long)next - (long)val < 0) { 760 switch (target) { 761 case MEM_CGROUP_TARGET_THRESH: 762 next = val + THRESHOLDS_EVENTS_TARGET; 763 break; 764 case MEM_CGROUP_TARGET_SOFTLIMIT: 765 next = val + SOFTLIMIT_EVENTS_TARGET; 766 break; 767 case MEM_CGROUP_TARGET_NUMAINFO: 768 next = val + NUMAINFO_EVENTS_TARGET; 769 break; 770 default: 771 break; 772 } 773 __this_cpu_write(memcg->stat->targets[target], next); 774 return true; 775 } 776 return false; 777 } 778 779 /* 780 * Check events in order. 
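 * "In order" means finest-grained first: the THRESH target fires every
 * THRESHOLDS_EVENTS_TARGET (128) page events, and only when it fires are
 * the coarser SOFTLIMIT and NUMAINFO targets (1024 events each) checked.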
781 * 782 */ 783 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 784 { 785 /* threshold event is triggered in finer grain than soft limit */ 786 if (unlikely(mem_cgroup_event_ratelimit(memcg, 787 MEM_CGROUP_TARGET_THRESH))) { 788 bool do_softlimit; 789 bool do_numainfo __maybe_unused; 790 791 do_softlimit = mem_cgroup_event_ratelimit(memcg, 792 MEM_CGROUP_TARGET_SOFTLIMIT); 793 #if MAX_NUMNODES > 1 794 do_numainfo = mem_cgroup_event_ratelimit(memcg, 795 MEM_CGROUP_TARGET_NUMAINFO); 796 #endif 797 mem_cgroup_threshold(memcg); 798 if (unlikely(do_softlimit)) 799 mem_cgroup_update_tree(memcg, page); 800 #if MAX_NUMNODES > 1 801 if (unlikely(do_numainfo)) 802 atomic_inc(&memcg->numainfo_events); 803 #endif 804 } 805 } 806 807 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 808 { 809 /* 810 * mm_update_next_owner() may clear mm->owner to NULL 811 * if it races with swapoff, page migration, etc. 812 * So this can be called with p == NULL. 813 */ 814 if (unlikely(!p)) 815 return NULL; 816 817 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 818 } 819 EXPORT_SYMBOL(mem_cgroup_from_task); 820 821 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 822 { 823 struct mem_cgroup *memcg = NULL; 824 825 rcu_read_lock(); 826 do { 827 /* 828 * Page cache insertions can happen withou an 829 * actual mm context, e.g. during disk probing 830 * on boot, loopback IO, acct() writes etc. 831 */ 832 if (unlikely(!mm)) 833 memcg = root_mem_cgroup; 834 else { 835 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 836 if (unlikely(!memcg)) 837 memcg = root_mem_cgroup; 838 } 839 } while (!css_tryget_online(&memcg->css)); 840 rcu_read_unlock(); 841 return memcg; 842 } 843 844 /** 845 * mem_cgroup_iter - iterate over memory cgroup hierarchy 846 * @root: hierarchy root 847 * @prev: previously returned memcg, NULL on first invocation 848 * @reclaim: cookie for shared reclaim walks, NULL for full walks 849 * 850 * Returns references to children of the hierarchy below @root, or 851 * @root itself, or %NULL after a full round-trip. 852 * 853 * Caller must pass the return value in @prev on subsequent 854 * invocations for reference counting, or use mem_cgroup_iter_break() 855 * to cancel a hierarchy walk before the round-trip is complete. 856 * 857 * Reclaimers can specify a zone and a priority level in @reclaim to 858 * divide up the memcgs in the hierarchy among all concurrent 859 * reclaimers operating on the same zone and priority. 
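 *
 * A shared reclaim walk (illustrative sketch, modelled on
 * mem_cgroup_soft_reclaim() below) looks like:
 *
 *	struct mem_cgroup_reclaim_cookie reclaim = {
 *		.zone = zone,
 *		.priority = priority,
 *	};
 *	struct mem_cgroup *memcg;
 *
 *	memcg = mem_cgroup_iter(root, NULL, &reclaim);
 *	do {
 *		... shrink memcg's lru lists for this zone ...
 *	} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));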
860 */ 861 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 862 struct mem_cgroup *prev, 863 struct mem_cgroup_reclaim_cookie *reclaim) 864 { 865 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 866 struct cgroup_subsys_state *css = NULL; 867 struct mem_cgroup *memcg = NULL; 868 struct mem_cgroup *pos = NULL; 869 870 if (mem_cgroup_disabled()) 871 return NULL; 872 873 if (!root) 874 root = root_mem_cgroup; 875 876 if (prev && !reclaim) 877 pos = prev; 878 879 if (!root->use_hierarchy && root != root_mem_cgroup) { 880 if (prev) 881 goto out; 882 return root; 883 } 884 885 rcu_read_lock(); 886 887 if (reclaim) { 888 struct mem_cgroup_per_zone *mz; 889 890 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); 891 iter = &mz->iter[reclaim->priority]; 892 893 if (prev && reclaim->generation != iter->generation) 894 goto out_unlock; 895 896 do { 897 pos = READ_ONCE(iter->position); 898 /* 899 * A racing update may change the position and 900 * put the last reference, hence css_tryget(), 901 * or retry to see the updated position. 902 */ 903 } while (pos && !css_tryget(&pos->css)); 904 } 905 906 if (pos) 907 css = &pos->css; 908 909 for (;;) { 910 css = css_next_descendant_pre(css, &root->css); 911 if (!css) { 912 /* 913 * Reclaimers share the hierarchy walk, and a 914 * new one might jump in right at the end of 915 * the hierarchy - make sure they see at least 916 * one group and restart from the beginning. 917 */ 918 if (!prev) 919 continue; 920 break; 921 } 922 923 /* 924 * Verify the css and acquire a reference. The root 925 * is provided by the caller, so we know it's alive 926 * and kicking, and don't take an extra reference. 927 */ 928 memcg = mem_cgroup_from_css(css); 929 930 if (css == &root->css) 931 break; 932 933 if (css_tryget(css)) { 934 /* 935 * Make sure the memcg is initialized: 936 * mem_cgroup_css_online() orders the the 937 * initialization against setting the flag. 938 */ 939 if (smp_load_acquire(&memcg->initialized)) 940 break; 941 942 css_put(css); 943 } 944 945 memcg = NULL; 946 } 947 948 if (reclaim) { 949 if (cmpxchg(&iter->position, pos, memcg) == pos) { 950 if (memcg) 951 css_get(&memcg->css); 952 if (pos) 953 css_put(&pos->css); 954 } 955 956 /* 957 * pairs with css_tryget when dereferencing iter->position 958 * above. 959 */ 960 if (pos) 961 css_put(&pos->css); 962 963 if (!memcg) 964 iter->generation++; 965 else if (!prev) 966 reclaim->generation = iter->generation; 967 } 968 969 out_unlock: 970 rcu_read_unlock(); 971 out: 972 if (prev && prev != root) 973 css_put(&prev->css); 974 975 return memcg; 976 } 977 978 /** 979 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 980 * @root: hierarchy root 981 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 982 */ 983 void mem_cgroup_iter_break(struct mem_cgroup *root, 984 struct mem_cgroup *prev) 985 { 986 if (!root) 987 root = root_mem_cgroup; 988 if (prev && prev != root) 989 css_put(&prev->css); 990 } 991 992 /* 993 * Iteration constructs for visiting all cgroups (under a tree). If 994 * loops are exited prematurely (break), mem_cgroup_iter_break() must 995 * be used for reference counting. 
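 *
 * For example (sketch; should_stop() stands in for any early-exit
 * condition):
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (should_stop(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}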
996 */ 997 #define for_each_mem_cgroup_tree(iter, root) \ 998 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 999 iter != NULL; \ 1000 iter = mem_cgroup_iter(root, iter, NULL)) 1001 1002 #define for_each_mem_cgroup(iter) \ 1003 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1004 iter != NULL; \ 1005 iter = mem_cgroup_iter(NULL, iter, NULL)) 1006 1007 /** 1008 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1009 * @zone: zone of the wanted lruvec 1010 * @memcg: memcg of the wanted lruvec 1011 * 1012 * Returns the lru list vector holding pages for the given @zone and 1013 * @mem. This can be the global zone lruvec, if the memory controller 1014 * is disabled. 1015 */ 1016 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1017 struct mem_cgroup *memcg) 1018 { 1019 struct mem_cgroup_per_zone *mz; 1020 struct lruvec *lruvec; 1021 1022 if (mem_cgroup_disabled()) { 1023 lruvec = &zone->lruvec; 1024 goto out; 1025 } 1026 1027 mz = mem_cgroup_zone_zoneinfo(memcg, zone); 1028 lruvec = &mz->lruvec; 1029 out: 1030 /* 1031 * Since a node can be onlined after the mem_cgroup was created, 1032 * we have to be prepared to initialize lruvec->zone here; 1033 * and if offlined then reonlined, we need to reinitialize it. 1034 */ 1035 if (unlikely(lruvec->zone != zone)) 1036 lruvec->zone = zone; 1037 return lruvec; 1038 } 1039 1040 /** 1041 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 1042 * @page: the page 1043 * @zone: zone of the page 1044 * 1045 * This function is only safe when following the LRU page isolation 1046 * and putback protocol: the LRU lock must be held, and the page must 1047 * either be PageLRU() or the caller must have isolated/allocated it. 1048 */ 1049 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1050 { 1051 struct mem_cgroup_per_zone *mz; 1052 struct mem_cgroup *memcg; 1053 struct lruvec *lruvec; 1054 1055 if (mem_cgroup_disabled()) { 1056 lruvec = &zone->lruvec; 1057 goto out; 1058 } 1059 1060 memcg = page->mem_cgroup; 1061 /* 1062 * Swapcache readahead pages are added to the LRU - and 1063 * possibly migrated - before they are charged. 1064 */ 1065 if (!memcg) 1066 memcg = root_mem_cgroup; 1067 1068 mz = mem_cgroup_page_zoneinfo(memcg, page); 1069 lruvec = &mz->lruvec; 1070 out: 1071 /* 1072 * Since a node can be onlined after the mem_cgroup was created, 1073 * we have to be prepared to initialize lruvec->zone here; 1074 * and if offlined then reonlined, we need to reinitialize it. 1075 */ 1076 if (unlikely(lruvec->zone != zone)) 1077 lruvec->zone = zone; 1078 return lruvec; 1079 } 1080 1081 /** 1082 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1083 * @lruvec: mem_cgroup per zone lru vector 1084 * @lru: index of lru list the page is sitting on 1085 * @nr_pages: positive when adding or negative when removing 1086 * 1087 * This function must be called when a page is added to or removed from an 1088 * lru list. 
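 *
 * Callers pair it with the actual list manipulation (illustrative sketch,
 * mirroring add_page_to_lru_list()):
 *
 *	mem_cgroup_update_lru_size(lruvec, lru, hpage_nr_pages(page));
 *	list_add(&page->lru, &lruvec->lists[lru]);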
1089 */ 1090 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1091 int nr_pages) 1092 { 1093 struct mem_cgroup_per_zone *mz; 1094 unsigned long *lru_size; 1095 1096 if (mem_cgroup_disabled()) 1097 return; 1098 1099 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1100 lru_size = mz->lru_size + lru; 1101 *lru_size += nr_pages; 1102 VM_BUG_ON((long)(*lru_size) < 0); 1103 } 1104 1105 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) 1106 { 1107 struct mem_cgroup *task_memcg; 1108 struct task_struct *p; 1109 bool ret; 1110 1111 p = find_lock_task_mm(task); 1112 if (p) { 1113 task_memcg = get_mem_cgroup_from_mm(p->mm); 1114 task_unlock(p); 1115 } else { 1116 /* 1117 * All threads may have already detached their mm's, but the oom 1118 * killer still needs to detect if they have already been oom 1119 * killed to prevent needlessly killing additional tasks. 1120 */ 1121 rcu_read_lock(); 1122 task_memcg = mem_cgroup_from_task(task); 1123 css_get(&task_memcg->css); 1124 rcu_read_unlock(); 1125 } 1126 ret = mem_cgroup_is_descendant(task_memcg, memcg); 1127 css_put(&task_memcg->css); 1128 return ret; 1129 } 1130 1131 #define mem_cgroup_from_counter(counter, member) \ 1132 container_of(counter, struct mem_cgroup, member) 1133 1134 /** 1135 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1136 * @memcg: the memory cgroup 1137 * 1138 * Returns the maximum amount of memory @mem can be charged with, in 1139 * pages. 1140 */ 1141 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1142 { 1143 unsigned long margin = 0; 1144 unsigned long count; 1145 unsigned long limit; 1146 1147 count = page_counter_read(&memcg->memory); 1148 limit = READ_ONCE(memcg->memory.limit); 1149 if (count < limit) 1150 margin = limit - count; 1151 1152 if (do_swap_account) { 1153 count = page_counter_read(&memcg->memsw); 1154 limit = READ_ONCE(memcg->memsw.limit); 1155 if (count <= limit) 1156 margin = min(margin, limit - count); 1157 } 1158 1159 return margin; 1160 } 1161 1162 /* 1163 * A routine for checking "mem" is under move_account() or not. 1164 * 1165 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1166 * moving cgroups. This is for waiting at high-memory pressure 1167 * caused by "move". 1168 */ 1169 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1170 { 1171 struct mem_cgroup *from; 1172 struct mem_cgroup *to; 1173 bool ret = false; 1174 /* 1175 * Unlike task_move routines, we access mc.to, mc.from not under 1176 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1177 */ 1178 spin_lock(&mc.lock); 1179 from = mc.from; 1180 to = mc.to; 1181 if (!from) 1182 goto unlock; 1183 1184 ret = mem_cgroup_is_descendant(from, memcg) || 1185 mem_cgroup_is_descendant(to, memcg); 1186 unlock: 1187 spin_unlock(&mc.lock); 1188 return ret; 1189 } 1190 1191 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1192 { 1193 if (mc.moving_task && current != mc.moving_task) { 1194 if (mem_cgroup_under_move(memcg)) { 1195 DEFINE_WAIT(wait); 1196 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1197 /* moving charge context might have finished. */ 1198 if (mc.moving_task) 1199 schedule(); 1200 finish_wait(&mc.waitq, &wait); 1201 return true; 1202 } 1203 } 1204 return false; 1205 } 1206 1207 #define K(x) ((x) << (PAGE_SHIFT-10)) 1208 /** 1209 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 
1210 * @memcg: The memory cgroup that went over limit 1211 * @p: Task that is going to be killed 1212 * 1213 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1214 * enabled 1215 */ 1216 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1217 { 1218 /* oom_info_lock ensures that parallel ooms do not interleave */ 1219 static DEFINE_MUTEX(oom_info_lock); 1220 struct mem_cgroup *iter; 1221 unsigned int i; 1222 1223 mutex_lock(&oom_info_lock); 1224 rcu_read_lock(); 1225 1226 if (p) { 1227 pr_info("Task in "); 1228 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1229 pr_cont(" killed as a result of limit of "); 1230 } else { 1231 pr_info("Memory limit reached of cgroup "); 1232 } 1233 1234 pr_cont_cgroup_path(memcg->css.cgroup); 1235 pr_cont("\n"); 1236 1237 rcu_read_unlock(); 1238 1239 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1240 K((u64)page_counter_read(&memcg->memory)), 1241 K((u64)memcg->memory.limit), memcg->memory.failcnt); 1242 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1243 K((u64)page_counter_read(&memcg->memsw)), 1244 K((u64)memcg->memsw.limit), memcg->memsw.failcnt); 1245 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1246 K((u64)page_counter_read(&memcg->kmem)), 1247 K((u64)memcg->kmem.limit), memcg->kmem.failcnt); 1248 1249 for_each_mem_cgroup_tree(iter, memcg) { 1250 pr_info("Memory cgroup stats for "); 1251 pr_cont_cgroup_path(iter->css.cgroup); 1252 pr_cont(":"); 1253 1254 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1255 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1256 continue; 1257 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1258 K(mem_cgroup_read_stat(iter, i))); 1259 } 1260 1261 for (i = 0; i < NR_LRU_LISTS; i++) 1262 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1263 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1264 1265 pr_cont("\n"); 1266 } 1267 mutex_unlock(&oom_info_lock); 1268 } 1269 1270 /* 1271 * This function returns the number of memcg under hierarchy tree. Returns 1272 * 1(self count) if no children. 1273 */ 1274 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1275 { 1276 int num = 0; 1277 struct mem_cgroup *iter; 1278 1279 for_each_mem_cgroup_tree(iter, memcg) 1280 num++; 1281 return num; 1282 } 1283 1284 /* 1285 * Return the memory (and swap, if configured) limit for a memcg. 1286 */ 1287 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) 1288 { 1289 unsigned long limit; 1290 1291 limit = memcg->memory.limit; 1292 if (mem_cgroup_swappiness(memcg)) { 1293 unsigned long memsw_limit; 1294 1295 memsw_limit = memcg->memsw.limit; 1296 limit = min(limit + total_swap_pages, memsw_limit); 1297 } 1298 return limit; 1299 } 1300 1301 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1302 int order) 1303 { 1304 struct oom_control oc = { 1305 .zonelist = NULL, 1306 .nodemask = NULL, 1307 .gfp_mask = gfp_mask, 1308 .order = order, 1309 }; 1310 struct mem_cgroup *iter; 1311 unsigned long chosen_points = 0; 1312 unsigned long totalpages; 1313 unsigned int points = 0; 1314 struct task_struct *chosen = NULL; 1315 1316 mutex_lock(&oom_lock); 1317 1318 /* 1319 * If current has a pending SIGKILL or is exiting, then automatically 1320 * select it. The goal is to allow it to allocate so that it may 1321 * quickly exit and free its memory. 
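	 * (mark_oom_victim() sets TIF_MEMDIE, which is what later lets
	 * try_charge() bypass the limit for this task.)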
1322 */ 1323 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1324 mark_oom_victim(current); 1325 goto unlock; 1326 } 1327 1328 check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); 1329 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1330 for_each_mem_cgroup_tree(iter, memcg) { 1331 struct css_task_iter it; 1332 struct task_struct *task; 1333 1334 css_task_iter_start(&iter->css, &it); 1335 while ((task = css_task_iter_next(&it))) { 1336 switch (oom_scan_process_thread(&oc, task, totalpages)) { 1337 case OOM_SCAN_SELECT: 1338 if (chosen) 1339 put_task_struct(chosen); 1340 chosen = task; 1341 chosen_points = ULONG_MAX; 1342 get_task_struct(chosen); 1343 /* fall through */ 1344 case OOM_SCAN_CONTINUE: 1345 continue; 1346 case OOM_SCAN_ABORT: 1347 css_task_iter_end(&it); 1348 mem_cgroup_iter_break(memcg, iter); 1349 if (chosen) 1350 put_task_struct(chosen); 1351 goto unlock; 1352 case OOM_SCAN_OK: 1353 break; 1354 }; 1355 points = oom_badness(task, memcg, NULL, totalpages); 1356 if (!points || points < chosen_points) 1357 continue; 1358 /* Prefer thread group leaders for display purposes */ 1359 if (points == chosen_points && 1360 thread_group_leader(chosen)) 1361 continue; 1362 1363 if (chosen) 1364 put_task_struct(chosen); 1365 chosen = task; 1366 chosen_points = points; 1367 get_task_struct(chosen); 1368 } 1369 css_task_iter_end(&it); 1370 } 1371 1372 if (chosen) { 1373 points = chosen_points * 1000 / totalpages; 1374 oom_kill_process(&oc, chosen, points, totalpages, memcg, 1375 "Memory cgroup out of memory"); 1376 } 1377 unlock: 1378 mutex_unlock(&oom_lock); 1379 } 1380 1381 #if MAX_NUMNODES > 1 1382 1383 /** 1384 * test_mem_cgroup_node_reclaimable 1385 * @memcg: the target memcg 1386 * @nid: the node ID to be checked. 1387 * @noswap : specify true here if the user wants flle only information. 1388 * 1389 * This function returns whether the specified memcg contains any 1390 * reclaimable pages on a node. Returns true if there are any reclaimable 1391 * pages in the node. 1392 */ 1393 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1394 int nid, bool noswap) 1395 { 1396 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1397 return true; 1398 if (noswap || !total_swap_pages) 1399 return false; 1400 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1401 return true; 1402 return false; 1403 1404 } 1405 1406 /* 1407 * Always updating the nodemask is not very good - even if we have an empty 1408 * list or the wrong list here, we can start from some node and traverse all 1409 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1410 * 1411 */ 1412 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1413 { 1414 int nid; 1415 /* 1416 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1417 * pagein/pageout changes since the last update. 1418 */ 1419 if (!atomic_read(&memcg->numainfo_events)) 1420 return; 1421 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1422 return; 1423 1424 /* make a nodemask where this memcg uses memory from */ 1425 memcg->scan_nodes = node_states[N_MEMORY]; 1426 1427 for_each_node_mask(nid, node_states[N_MEMORY]) { 1428 1429 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1430 node_clear(nid, memcg->scan_nodes); 1431 } 1432 1433 atomic_set(&memcg->numainfo_events, 0); 1434 atomic_set(&memcg->numainfo_updating, 0); 1435 } 1436 1437 /* 1438 * Selecting a node where we start reclaim from. 
 * Because what we need is just to reduce the usage counter, it is OK to
 * start from any node. Reclaiming from the current node has both pros
 * and cons:
 *
 * Freeing memory from the current node means freeing memory from a node
 * which we'll use or have used, so it may hurt that node's LRU. And if
 * several threads hit their limits, they will contend on one node. But
 * freeing from a remote node costs more because of memory latency.
 *
 * For now we use round-robin; a better algorithm is welcome.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

	node = next_node(node, memcg->scan_nodes);
	if (node == MAX_NUMNODES)
		node = first_node(memcg->scan_nodes);
	/*
	 * We call this when we hit the limit, not when pages are added to
	 * the LRU. No LRU may hold pages because all pages are UNEVICTABLE,
	 * or the memcg is too small and none of its pages are on an LRU.
	 * In that case, we use the current node.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	memcg->last_scanned_node = node;
	return node;
}
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
#endif

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   struct zone *zone,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = 0,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we
				 * don't reclaim too much, but not too small
				 * either, so we don't keep coming back to
				 * reclaim from this cgroup.
				 */
				if (total >= (excess >> 2) ||
				    (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
						     zone, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * This subtree of our hierarchy is already locked,
			 * so we cannot grant the lock.
1552 */ 1553 failed = iter; 1554 mem_cgroup_iter_break(memcg, iter); 1555 break; 1556 } else 1557 iter->oom_lock = true; 1558 } 1559 1560 if (failed) { 1561 /* 1562 * OK, we failed to lock the whole subtree so we have 1563 * to clean up what we set up to the failing subtree 1564 */ 1565 for_each_mem_cgroup_tree(iter, memcg) { 1566 if (iter == failed) { 1567 mem_cgroup_iter_break(memcg, iter); 1568 break; 1569 } 1570 iter->oom_lock = false; 1571 } 1572 } else 1573 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1574 1575 spin_unlock(&memcg_oom_lock); 1576 1577 return !failed; 1578 } 1579 1580 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1581 { 1582 struct mem_cgroup *iter; 1583 1584 spin_lock(&memcg_oom_lock); 1585 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 1586 for_each_mem_cgroup_tree(iter, memcg) 1587 iter->oom_lock = false; 1588 spin_unlock(&memcg_oom_lock); 1589 } 1590 1591 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1592 { 1593 struct mem_cgroup *iter; 1594 1595 spin_lock(&memcg_oom_lock); 1596 for_each_mem_cgroup_tree(iter, memcg) 1597 iter->under_oom++; 1598 spin_unlock(&memcg_oom_lock); 1599 } 1600 1601 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1602 { 1603 struct mem_cgroup *iter; 1604 1605 /* 1606 * When a new child is created while the hierarchy is under oom, 1607 * mem_cgroup_oom_lock() may not be called. Watch for underflow. 1608 */ 1609 spin_lock(&memcg_oom_lock); 1610 for_each_mem_cgroup_tree(iter, memcg) 1611 if (iter->under_oom > 0) 1612 iter->under_oom--; 1613 spin_unlock(&memcg_oom_lock); 1614 } 1615 1616 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1617 1618 struct oom_wait_info { 1619 struct mem_cgroup *memcg; 1620 wait_queue_t wait; 1621 }; 1622 1623 static int memcg_oom_wake_function(wait_queue_t *wait, 1624 unsigned mode, int sync, void *arg) 1625 { 1626 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1627 struct mem_cgroup *oom_wait_memcg; 1628 struct oom_wait_info *oom_wait_info; 1629 1630 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1631 oom_wait_memcg = oom_wait_info->memcg; 1632 1633 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1634 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1635 return 0; 1636 return autoremove_wake_function(wait, mode, sync, arg); 1637 } 1638 1639 static void memcg_oom_recover(struct mem_cgroup *memcg) 1640 { 1641 /* 1642 * For the following lockless ->under_oom test, the only required 1643 * guarantee is that it must see the state asserted by an OOM when 1644 * this function is called as a result of userland actions 1645 * triggered by the notification of the OOM. This is trivially 1646 * achieved by invoking mem_cgroup_mark_under_oom() before 1647 * triggering notification. 1648 */ 1649 if (memcg && memcg->under_oom) 1650 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1651 } 1652 1653 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1654 { 1655 if (!current->memcg_oom.may_oom) 1656 return; 1657 /* 1658 * We are in the middle of the charge context here, so we 1659 * don't want to block when potentially sitting on a callstack 1660 * that holds all kinds of filesystem and mm locks. 1661 * 1662 * Also, the caller may handle a failed allocation gracefully 1663 * (like optional page cache readahead) and so an OOM killer 1664 * invocation might not even be necessary. 
1665 * 1666 * That's why we don't do anything here except remember the 1667 * OOM context and then deal with it at the end of the page 1668 * fault when the stack is unwound, the locks are released, 1669 * and when we know whether the fault was overall successful. 1670 */ 1671 css_get(&memcg->css); 1672 current->memcg_oom.memcg = memcg; 1673 current->memcg_oom.gfp_mask = mask; 1674 current->memcg_oom.order = order; 1675 } 1676 1677 /** 1678 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1679 * @handle: actually kill/wait or just clean up the OOM state 1680 * 1681 * This has to be called at the end of a page fault if the memcg OOM 1682 * handler was enabled. 1683 * 1684 * Memcg supports userspace OOM handling where failed allocations must 1685 * sleep on a waitqueue until the userspace task resolves the 1686 * situation. Sleeping directly in the charge context with all kinds 1687 * of locks held is not a good idea, instead we remember an OOM state 1688 * in the task and mem_cgroup_oom_synchronize() has to be called at 1689 * the end of the page fault to complete the OOM handling. 1690 * 1691 * Returns %true if an ongoing memcg OOM situation was detected and 1692 * completed, %false otherwise. 1693 */ 1694 bool mem_cgroup_oom_synchronize(bool handle) 1695 { 1696 struct mem_cgroup *memcg = current->memcg_oom.memcg; 1697 struct oom_wait_info owait; 1698 bool locked; 1699 1700 /* OOM is global, do not handle */ 1701 if (!memcg) 1702 return false; 1703 1704 if (!handle || oom_killer_disabled) 1705 goto cleanup; 1706 1707 owait.memcg = memcg; 1708 owait.wait.flags = 0; 1709 owait.wait.func = memcg_oom_wake_function; 1710 owait.wait.private = current; 1711 INIT_LIST_HEAD(&owait.wait.task_list); 1712 1713 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1714 mem_cgroup_mark_under_oom(memcg); 1715 1716 locked = mem_cgroup_oom_trylock(memcg); 1717 1718 if (locked) 1719 mem_cgroup_oom_notify(memcg); 1720 1721 if (locked && !memcg->oom_kill_disable) { 1722 mem_cgroup_unmark_under_oom(memcg); 1723 finish_wait(&memcg_oom_waitq, &owait.wait); 1724 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 1725 current->memcg_oom.order); 1726 } else { 1727 schedule(); 1728 mem_cgroup_unmark_under_oom(memcg); 1729 finish_wait(&memcg_oom_waitq, &owait.wait); 1730 } 1731 1732 if (locked) { 1733 mem_cgroup_oom_unlock(memcg); 1734 /* 1735 * There is no guarantee that an OOM-lock contender 1736 * sees the wakeups triggered by the OOM kill 1737 * uncharges. Wake any sleepers explicitely. 1738 */ 1739 memcg_oom_recover(memcg); 1740 } 1741 cleanup: 1742 current->memcg_oom.memcg = NULL; 1743 css_put(&memcg->css); 1744 return true; 1745 } 1746 1747 /** 1748 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 1749 * @page: page that is going to change accounted state 1750 * 1751 * This function must mark the beginning of an accounted page state 1752 * change to prevent double accounting when the page is concurrently 1753 * being moved to another memcg: 1754 * 1755 * memcg = mem_cgroup_begin_page_stat(page); 1756 * if (TestClearPageState(page)) 1757 * mem_cgroup_update_page_stat(memcg, state, -1); 1758 * mem_cgroup_end_page_stat(memcg); 1759 */ 1760 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) 1761 { 1762 struct mem_cgroup *memcg; 1763 unsigned long flags; 1764 1765 /* 1766 * The RCU lock is held throughout the transaction. 
The fast 1767 * path can get away without acquiring the memcg->move_lock 1768 * because page moving starts with an RCU grace period. 1769 * 1770 * The RCU lock also protects the memcg from being freed when 1771 * the page state that is going to change is the only thing 1772 * preventing the page from being uncharged. 1773 * E.g. end-writeback clearing PageWriteback(), which allows 1774 * migration to go ahead and uncharge the page before the 1775 * account transaction might be complete. 1776 */ 1777 rcu_read_lock(); 1778 1779 if (mem_cgroup_disabled()) 1780 return NULL; 1781 again: 1782 memcg = page->mem_cgroup; 1783 if (unlikely(!memcg)) 1784 return NULL; 1785 1786 if (atomic_read(&memcg->moving_account) <= 0) 1787 return memcg; 1788 1789 spin_lock_irqsave(&memcg->move_lock, flags); 1790 if (memcg != page->mem_cgroup) { 1791 spin_unlock_irqrestore(&memcg->move_lock, flags); 1792 goto again; 1793 } 1794 1795 /* 1796 * When charge migration first begins, we can have locked and 1797 * unlocked page stat updates happening concurrently. Track 1798 * the task who has the lock for mem_cgroup_end_page_stat(). 1799 */ 1800 memcg->move_lock_task = current; 1801 memcg->move_lock_flags = flags; 1802 1803 return memcg; 1804 } 1805 EXPORT_SYMBOL(mem_cgroup_begin_page_stat); 1806 1807 /** 1808 * mem_cgroup_end_page_stat - finish a page state statistics transaction 1809 * @memcg: the memcg that was accounted against 1810 */ 1811 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) 1812 { 1813 if (memcg && memcg->move_lock_task == current) { 1814 unsigned long flags = memcg->move_lock_flags; 1815 1816 memcg->move_lock_task = NULL; 1817 memcg->move_lock_flags = 0; 1818 1819 spin_unlock_irqrestore(&memcg->move_lock, flags); 1820 } 1821 1822 rcu_read_unlock(); 1823 } 1824 EXPORT_SYMBOL(mem_cgroup_end_page_stat); 1825 1826 /* 1827 * size of first charge trial. "32" comes from vmscan.c's magic value. 1828 * TODO: maybe necessary to use big numbers in big irons. 1829 */ 1830 #define CHARGE_BATCH 32U 1831 struct memcg_stock_pcp { 1832 struct mem_cgroup *cached; /* this never be root cgroup */ 1833 unsigned int nr_pages; 1834 struct work_struct work; 1835 unsigned long flags; 1836 #define FLUSHING_CACHED_CHARGE 0 1837 }; 1838 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1839 static DEFINE_MUTEX(percpu_charge_mutex); 1840 1841 /** 1842 * consume_stock: Try to consume stocked charge on this cpu. 1843 * @memcg: memcg to consume from. 1844 * @nr_pages: how many pages to charge. 1845 * 1846 * The charges will only happen if @memcg matches the current cpu's memcg 1847 * stock, and at least @nr_pages are available in that stock. Failure to 1848 * service an allocation will refill the stock. 1849 * 1850 * returns true if successful, false otherwise. 1851 */ 1852 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 1853 { 1854 struct memcg_stock_pcp *stock; 1855 bool ret = false; 1856 1857 if (nr_pages > CHARGE_BATCH) 1858 return ret; 1859 1860 stock = &get_cpu_var(memcg_stock); 1861 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 1862 stock->nr_pages -= nr_pages; 1863 ret = true; 1864 } 1865 put_cpu_var(memcg_stock); 1866 return ret; 1867 } 1868 1869 /* 1870 * Returns stocks cached in percpu and reset cached information. 
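 * Called from refill_stock() when the cached memcg changes, from the
 * per-cpu drain work, and from the CPU hotplug callback for dead CPUs.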
1871 */ 1872 static void drain_stock(struct memcg_stock_pcp *stock) 1873 { 1874 struct mem_cgroup *old = stock->cached; 1875 1876 if (stock->nr_pages) { 1877 page_counter_uncharge(&old->memory, stock->nr_pages); 1878 if (do_swap_account) 1879 page_counter_uncharge(&old->memsw, stock->nr_pages); 1880 css_put_many(&old->css, stock->nr_pages); 1881 stock->nr_pages = 0; 1882 } 1883 stock->cached = NULL; 1884 } 1885 1886 /* 1887 * This must be called under preempt disabled or must be called by 1888 * a thread which is pinned to local cpu. 1889 */ 1890 static void drain_local_stock(struct work_struct *dummy) 1891 { 1892 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); 1893 drain_stock(stock); 1894 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 1895 } 1896 1897 /* 1898 * Cache charges(val) to local per_cpu area. 1899 * This will be consumed by consume_stock() function, later. 1900 */ 1901 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 1902 { 1903 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 1904 1905 if (stock->cached != memcg) { /* reset if necessary */ 1906 drain_stock(stock); 1907 stock->cached = memcg; 1908 } 1909 stock->nr_pages += nr_pages; 1910 put_cpu_var(memcg_stock); 1911 } 1912 1913 /* 1914 * Drains all per-CPU charge caches for given root_memcg resp. subtree 1915 * of the hierarchy under it. 1916 */ 1917 static void drain_all_stock(struct mem_cgroup *root_memcg) 1918 { 1919 int cpu, curcpu; 1920 1921 /* If someone's already draining, avoid adding running more workers. */ 1922 if (!mutex_trylock(&percpu_charge_mutex)) 1923 return; 1924 /* Notify other cpus that system-wide "drain" is running */ 1925 get_online_cpus(); 1926 curcpu = get_cpu(); 1927 for_each_online_cpu(cpu) { 1928 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 1929 struct mem_cgroup *memcg; 1930 1931 memcg = stock->cached; 1932 if (!memcg || !stock->nr_pages) 1933 continue; 1934 if (!mem_cgroup_is_descendant(memcg, root_memcg)) 1935 continue; 1936 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 1937 if (cpu == curcpu) 1938 drain_local_stock(&stock->work); 1939 else 1940 schedule_work_on(cpu, &stock->work); 1941 } 1942 } 1943 put_cpu(); 1944 put_online_cpus(); 1945 mutex_unlock(&percpu_charge_mutex); 1946 } 1947 1948 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 1949 unsigned long action, 1950 void *hcpu) 1951 { 1952 int cpu = (unsigned long)hcpu; 1953 struct memcg_stock_pcp *stock; 1954 1955 if (action == CPU_ONLINE) 1956 return NOTIFY_OK; 1957 1958 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 1959 return NOTIFY_OK; 1960 1961 stock = &per_cpu(memcg_stock, cpu); 1962 drain_stock(stock); 1963 return NOTIFY_OK; 1964 } 1965 1966 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 1967 unsigned int nr_pages) 1968 { 1969 unsigned int batch = max(CHARGE_BATCH, nr_pages); 1970 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1971 struct mem_cgroup *mem_over_limit; 1972 struct page_counter *counter; 1973 unsigned long nr_reclaimed; 1974 bool may_swap = true; 1975 bool drained = false; 1976 int ret = 0; 1977 1978 if (mem_cgroup_is_root(memcg)) 1979 goto done; 1980 retry: 1981 if (consume_stock(memcg, nr_pages)) 1982 goto done; 1983 1984 if (!do_swap_account || 1985 !page_counter_try_charge(&memcg->memsw, batch, &counter)) { 1986 if (!page_counter_try_charge(&memcg->memory, batch, &counter)) 1987 goto done_restock; 1988 if (do_swap_account) 1989 page_counter_uncharge(&memcg->memsw, batch); 1990 mem_over_limit = 
mem_cgroup_from_counter(counter, memory); 1991 } else { 1992 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 1993 may_swap = false; 1994 } 1995 1996 if (batch > nr_pages) { 1997 batch = nr_pages; 1998 goto retry; 1999 } 2000 2001 /* 2002 * Unlike in global OOM situations, memcg is not in a physical 2003 * memory shortage. Allow dying and OOM-killed tasks to 2004 * bypass the last charges so that they can exit quickly and 2005 * free their memory. 2006 */ 2007 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2008 fatal_signal_pending(current) || 2009 current->flags & PF_EXITING)) 2010 goto bypass; 2011 2012 if (unlikely(task_in_memcg_oom(current))) 2013 goto nomem; 2014 2015 if (!(gfp_mask & __GFP_WAIT)) 2016 goto nomem; 2017 2018 mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); 2019 2020 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2021 gfp_mask, may_swap); 2022 2023 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2024 goto retry; 2025 2026 if (!drained) { 2027 drain_all_stock(mem_over_limit); 2028 drained = true; 2029 goto retry; 2030 } 2031 2032 if (gfp_mask & __GFP_NORETRY) 2033 goto nomem; 2034 /* 2035 * Even though the limit is exceeded at this point, reclaim 2036 * may have been able to free some pages. Retry the charge 2037 * before killing the task. 2038 * 2039 * Only for regular pages, though: huge pages are rather 2040 * unlikely to succeed so close to the limit, and we fall back 2041 * to regular pages anyway in case of failure. 2042 */ 2043 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2044 goto retry; 2045 /* 2046 * At task move, charge accounts can be doubly counted. So, it's 2047 * better to wait until the end of task_move if something is going on. 2048 */ 2049 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2050 goto retry; 2051 2052 if (nr_retries--) 2053 goto retry; 2054 2055 if (gfp_mask & __GFP_NOFAIL) 2056 goto bypass; 2057 2058 if (fatal_signal_pending(current)) 2059 goto bypass; 2060 2061 mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); 2062 2063 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2064 nomem: 2065 if (!(gfp_mask & __GFP_NOFAIL)) 2066 return -ENOMEM; 2067 bypass: 2068 return -EINTR; 2069 2070 done_restock: 2071 css_get_many(&memcg->css, batch); 2072 if (batch > nr_pages) 2073 refill_stock(memcg, batch - nr_pages); 2074 if (!(gfp_mask & __GFP_WAIT)) 2075 goto done; 2076 /* 2077 * If the hierarchy is above the normal consumption range, 2078 * make the charging task trim their excess contribution. 
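 *
 * E.g. if this memcg and its parent both sit above their high boundary
 * (memcg->high), the loop below makes the charging task reclaim
 * nr_pages from each of them (and records one MEMCG_HIGH event per
 * level) before the charge returns.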
2079 */ 2080 do { 2081 if (page_counter_read(&memcg->memory) <= memcg->high) 2082 continue; 2083 mem_cgroup_events(memcg, MEMCG_HIGH, 1); 2084 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); 2085 } while ((memcg = parent_mem_cgroup(memcg))); 2086 done: 2087 return ret; 2088 } 2089 2090 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2091 { 2092 if (mem_cgroup_is_root(memcg)) 2093 return; 2094 2095 page_counter_uncharge(&memcg->memory, nr_pages); 2096 if (do_swap_account) 2097 page_counter_uncharge(&memcg->memsw, nr_pages); 2098 2099 css_put_many(&memcg->css, nr_pages); 2100 } 2101 2102 static void lock_page_lru(struct page *page, int *isolated) 2103 { 2104 struct zone *zone = page_zone(page); 2105 2106 spin_lock_irq(&zone->lru_lock); 2107 if (PageLRU(page)) { 2108 struct lruvec *lruvec; 2109 2110 lruvec = mem_cgroup_page_lruvec(page, zone); 2111 ClearPageLRU(page); 2112 del_page_from_lru_list(page, lruvec, page_lru(page)); 2113 *isolated = 1; 2114 } else 2115 *isolated = 0; 2116 } 2117 2118 static void unlock_page_lru(struct page *page, int isolated) 2119 { 2120 struct zone *zone = page_zone(page); 2121 2122 if (isolated) { 2123 struct lruvec *lruvec; 2124 2125 lruvec = mem_cgroup_page_lruvec(page, zone); 2126 VM_BUG_ON_PAGE(PageLRU(page), page); 2127 SetPageLRU(page); 2128 add_page_to_lru_list(page, lruvec, page_lru(page)); 2129 } 2130 spin_unlock_irq(&zone->lru_lock); 2131 } 2132 2133 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2134 bool lrucare) 2135 { 2136 int isolated; 2137 2138 VM_BUG_ON_PAGE(page->mem_cgroup, page); 2139 2140 /* 2141 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2142 * may already be on some other mem_cgroup's LRU. Take care of it. 2143 */ 2144 if (lrucare) 2145 lock_page_lru(page, &isolated); 2146 2147 /* 2148 * Nobody should be changing or seriously looking at 2149 * page->mem_cgroup at this point: 2150 * 2151 * - the page is uncharged 2152 * 2153 * - the page is off-LRU 2154 * 2155 * - an anonymous fault has exclusive page access, except for 2156 * a locked page table 2157 * 2158 * - a page cache insertion, a swapin fault, or a migration 2159 * have the page locked 2160 */ 2161 page->mem_cgroup = memcg; 2162 2163 if (lrucare) 2164 unlock_page_lru(page, isolated); 2165 } 2166 2167 #ifdef CONFIG_MEMCG_KMEM 2168 int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, 2169 unsigned long nr_pages) 2170 { 2171 struct page_counter *counter; 2172 int ret = 0; 2173 2174 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); 2175 if (ret < 0) 2176 return ret; 2177 2178 ret = try_charge(memcg, gfp, nr_pages); 2179 if (ret == -EINTR) { 2180 /* 2181 * try_charge() chose to bypass to root due to OOM kill or 2182 * fatal signal. Since our only options are to either fail 2183 * the allocation or charge it to this cgroup, do it as a 2184 * temporary condition. But we can't fail. From a kmem/slab 2185 * perspective, the cache has already been selected, by 2186 * mem_cgroup_kmem_get_cache(), so it is too late to change 2187 * our minds. 2188 * 2189 * This condition will only trigger if the task entered 2190 * memcg_charge_kmem in a sane state, but was OOM-killed 2191 * during try_charge() above. 
Tasks that were already dying 2192 * when the allocation triggers should have been already 2193 * directed to the root cgroup in memcontrol.h 2194 */ 2195 page_counter_charge(&memcg->memory, nr_pages); 2196 if (do_swap_account) 2197 page_counter_charge(&memcg->memsw, nr_pages); 2198 css_get_many(&memcg->css, nr_pages); 2199 ret = 0; 2200 } else if (ret) 2201 page_counter_uncharge(&memcg->kmem, nr_pages); 2202 2203 return ret; 2204 } 2205 2206 void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) 2207 { 2208 page_counter_uncharge(&memcg->memory, nr_pages); 2209 if (do_swap_account) 2210 page_counter_uncharge(&memcg->memsw, nr_pages); 2211 2212 page_counter_uncharge(&memcg->kmem, nr_pages); 2213 2214 css_put_many(&memcg->css, nr_pages); 2215 } 2216 2217 static int memcg_alloc_cache_id(void) 2218 { 2219 int id, size; 2220 int err; 2221 2222 id = ida_simple_get(&memcg_cache_ida, 2223 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2224 if (id < 0) 2225 return id; 2226 2227 if (id < memcg_nr_cache_ids) 2228 return id; 2229 2230 /* 2231 * There's no space for the new id in memcg_caches arrays, 2232 * so we have to grow them. 2233 */ 2234 down_write(&memcg_cache_ids_sem); 2235 2236 size = 2 * (id + 1); 2237 if (size < MEMCG_CACHES_MIN_SIZE) 2238 size = MEMCG_CACHES_MIN_SIZE; 2239 else if (size > MEMCG_CACHES_MAX_SIZE) 2240 size = MEMCG_CACHES_MAX_SIZE; 2241 2242 err = memcg_update_all_caches(size); 2243 if (!err) 2244 err = memcg_update_all_list_lrus(size); 2245 if (!err) 2246 memcg_nr_cache_ids = size; 2247 2248 up_write(&memcg_cache_ids_sem); 2249 2250 if (err) { 2251 ida_simple_remove(&memcg_cache_ida, id); 2252 return err; 2253 } 2254 return id; 2255 } 2256 2257 static void memcg_free_cache_id(int id) 2258 { 2259 ida_simple_remove(&memcg_cache_ida, id); 2260 } 2261 2262 struct memcg_kmem_cache_create_work { 2263 struct mem_cgroup *memcg; 2264 struct kmem_cache *cachep; 2265 struct work_struct work; 2266 }; 2267 2268 static void memcg_kmem_cache_create_func(struct work_struct *w) 2269 { 2270 struct memcg_kmem_cache_create_work *cw = 2271 container_of(w, struct memcg_kmem_cache_create_work, work); 2272 struct mem_cgroup *memcg = cw->memcg; 2273 struct kmem_cache *cachep = cw->cachep; 2274 2275 memcg_create_kmem_cache(memcg, cachep); 2276 2277 css_put(&memcg->css); 2278 kfree(cw); 2279 } 2280 2281 /* 2282 * Enqueue the creation of a per-memcg kmem_cache. 2283 */ 2284 static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2285 struct kmem_cache *cachep) 2286 { 2287 struct memcg_kmem_cache_create_work *cw; 2288 2289 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2290 if (!cw) 2291 return; 2292 2293 css_get(&memcg->css); 2294 2295 cw->memcg = memcg; 2296 cw->cachep = cachep; 2297 INIT_WORK(&cw->work, memcg_kmem_cache_create_func); 2298 2299 schedule_work(&cw->work); 2300 } 2301 2302 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2303 struct kmem_cache *cachep) 2304 { 2305 /* 2306 * We need to stop accounting when we kmalloc, because if the 2307 * corresponding kmalloc cache is not yet created, the first allocation 2308 * in __memcg_schedule_kmem_cache_create will recurse. 2309 * 2310 * However, it is better to enclose the whole function. Depending on 2311 * the debugging options enabled, INIT_WORK(), for instance, can 2312 * trigger an allocation. This too, will make us recurse. Because at 2313 * this point we can't allow ourselves back into memcg_kmem_get_cache, 2314 * the safest choice is to do it like this, wrapping the whole function. 
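 *
 * Without the guard the chain would look like: kmalloc() the work item
 * -> memcg_kmem_get_cache() -> per-memcg kmalloc cache still missing ->
 * schedule another creation -> kmalloc() another work item -> ...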
2315 */ 2316 current->memcg_kmem_skip_account = 1; 2317 __memcg_schedule_kmem_cache_create(memcg, cachep); 2318 current->memcg_kmem_skip_account = 0; 2319 } 2320 2321 /* 2322 * Return the kmem_cache we're supposed to use for a slab allocation. 2323 * We try to use the current memcg's version of the cache. 2324 * 2325 * If the cache does not exist yet, if we are the first user of it, 2326 * we either create it immediately, if possible, or create it asynchronously 2327 * in a workqueue. 2328 * In the latter case, we will let the current allocation go through with 2329 * the original cache. 2330 * 2331 * Can't be called in interrupt context or from kernel threads. 2332 * This function needs to be called with rcu_read_lock() held. 2333 */ 2334 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) 2335 { 2336 struct mem_cgroup *memcg; 2337 struct kmem_cache *memcg_cachep; 2338 int kmemcg_id; 2339 2340 VM_BUG_ON(!is_root_cache(cachep)); 2341 2342 if (current->memcg_kmem_skip_account) 2343 return cachep; 2344 2345 memcg = get_mem_cgroup_from_mm(current->mm); 2346 kmemcg_id = READ_ONCE(memcg->kmemcg_id); 2347 if (kmemcg_id < 0) 2348 goto out; 2349 2350 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); 2351 if (likely(memcg_cachep)) 2352 return memcg_cachep; 2353 2354 /* 2355 * If we are in a safe context (can wait, and not in interrupt 2356 * context), we could be be predictable and return right away. 2357 * This would guarantee that the allocation being performed 2358 * already belongs in the new cache. 2359 * 2360 * However, there are some clashes that can arrive from locking. 2361 * For instance, because we acquire the slab_mutex while doing 2362 * memcg_create_kmem_cache, this means no further allocation 2363 * could happen with the slab_mutex held. So it's better to 2364 * defer everything. 2365 */ 2366 memcg_schedule_kmem_cache_create(memcg, cachep); 2367 out: 2368 css_put(&memcg->css); 2369 return cachep; 2370 } 2371 2372 void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2373 { 2374 if (!is_root_cache(cachep)) 2375 css_put(&cachep->memcg_params.memcg->css); 2376 } 2377 2378 /* 2379 * We need to verify if the allocation against current->mm->owner's memcg is 2380 * possible for the given order. But the page is not allocated yet, so we'll 2381 * need a further commit step to do the final arrangements. 2382 * 2383 * It is possible for the task to switch cgroups in this mean time, so at 2384 * commit time, we can't rely on task conversion any longer. We'll then use 2385 * the handle argument to return to the caller which cgroup we should commit 2386 * against. We could also return the memcg directly and avoid the pointer 2387 * passing, but a boolean return value gives better semantics considering 2388 * the compiled-out case as well. 2389 * 2390 * Returning true means the allocation is possible. 2391 */ 2392 bool 2393 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 2394 { 2395 struct mem_cgroup *memcg; 2396 int ret; 2397 2398 *_memcg = NULL; 2399 2400 memcg = get_mem_cgroup_from_mm(current->mm); 2401 2402 if (!memcg_kmem_is_active(memcg)) { 2403 css_put(&memcg->css); 2404 return true; 2405 } 2406 2407 ret = memcg_charge_kmem(memcg, gfp, 1 << order); 2408 if (!ret) 2409 *_memcg = memcg; 2410 2411 css_put(&memcg->css); 2412 return (ret == 0); 2413 } 2414 2415 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2416 int order) 2417 { 2418 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2419 2420 /* The page allocation failed. 
Revert */ 2421 if (!page) { 2422 memcg_uncharge_kmem(memcg, 1 << order); 2423 return; 2424 } 2425 page->mem_cgroup = memcg; 2426 } 2427 2428 void __memcg_kmem_uncharge_pages(struct page *page, int order) 2429 { 2430 struct mem_cgroup *memcg = page->mem_cgroup; 2431 2432 if (!memcg) 2433 return; 2434 2435 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2436 2437 memcg_uncharge_kmem(memcg, 1 << order); 2438 page->mem_cgroup = NULL; 2439 } 2440 2441 struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) 2442 { 2443 struct mem_cgroup *memcg = NULL; 2444 struct kmem_cache *cachep; 2445 struct page *page; 2446 2447 page = virt_to_head_page(ptr); 2448 if (PageSlab(page)) { 2449 cachep = page->slab_cache; 2450 if (!is_root_cache(cachep)) 2451 memcg = cachep->memcg_params.memcg; 2452 } else 2453 /* page allocated by alloc_kmem_pages */ 2454 memcg = page->mem_cgroup; 2455 2456 return memcg; 2457 } 2458 #endif /* CONFIG_MEMCG_KMEM */ 2459 2460 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2461 2462 /* 2463 * Because tail pages are not marked as "used", set it. We're under 2464 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2465 * charge/uncharge will be never happen and move_account() is done under 2466 * compound_lock(), so we don't have to take care of races. 2467 */ 2468 void mem_cgroup_split_huge_fixup(struct page *head) 2469 { 2470 int i; 2471 2472 if (mem_cgroup_disabled()) 2473 return; 2474 2475 for (i = 1; i < HPAGE_PMD_NR; i++) 2476 head[i].mem_cgroup = head->mem_cgroup; 2477 2478 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 2479 HPAGE_PMD_NR); 2480 } 2481 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2482 2483 #ifdef CONFIG_MEMCG_SWAP 2484 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 2485 bool charge) 2486 { 2487 int val = (charge) ? 1 : -1; 2488 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 2489 } 2490 2491 /** 2492 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2493 * @entry: swap entry to be moved 2494 * @from: mem_cgroup which the entry is moved from 2495 * @to: mem_cgroup which the entry is moved to 2496 * 2497 * It succeeds only when the swap_cgroup's record for this entry is the same 2498 * as the mem_cgroup's id of @from. 2499 * 2500 * Returns 0 on success, -EINVAL on failure. 2501 * 2502 * The caller must have charged to @to, IOW, called page_counter_charge() about 2503 * both res and memsw, and called css_get(). 2504 */ 2505 static int mem_cgroup_move_swap_account(swp_entry_t entry, 2506 struct mem_cgroup *from, struct mem_cgroup *to) 2507 { 2508 unsigned short old_id, new_id; 2509 2510 old_id = mem_cgroup_id(from); 2511 new_id = mem_cgroup_id(to); 2512 2513 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2514 mem_cgroup_swap_statistics(from, false); 2515 mem_cgroup_swap_statistics(to, true); 2516 return 0; 2517 } 2518 return -EINVAL; 2519 } 2520 #else 2521 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2522 struct mem_cgroup *from, struct mem_cgroup *to) 2523 { 2524 return -EINVAL; 2525 } 2526 #endif 2527 2528 static DEFINE_MUTEX(memcg_limit_mutex); 2529 2530 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2531 unsigned long limit) 2532 { 2533 unsigned long curusage; 2534 unsigned long oldusage; 2535 bool enlarge = false; 2536 int retry_count; 2537 int ret; 2538 2539 /* 2540 * For keeping hierarchical_reclaim simple, how long we should retry 2541 * is depends on callers. 
We set our retry-count to be function 2542 * of # of children which we should visit in this loop. 2543 */ 2544 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 2545 mem_cgroup_count_children(memcg); 2546 2547 oldusage = page_counter_read(&memcg->memory); 2548 2549 do { 2550 if (signal_pending(current)) { 2551 ret = -EINTR; 2552 break; 2553 } 2554 2555 mutex_lock(&memcg_limit_mutex); 2556 if (limit > memcg->memsw.limit) { 2557 mutex_unlock(&memcg_limit_mutex); 2558 ret = -EINVAL; 2559 break; 2560 } 2561 if (limit > memcg->memory.limit) 2562 enlarge = true; 2563 ret = page_counter_limit(&memcg->memory, limit); 2564 mutex_unlock(&memcg_limit_mutex); 2565 2566 if (!ret) 2567 break; 2568 2569 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 2570 2571 curusage = page_counter_read(&memcg->memory); 2572 /* Usage is reduced ? */ 2573 if (curusage >= oldusage) 2574 retry_count--; 2575 else 2576 oldusage = curusage; 2577 } while (retry_count); 2578 2579 if (!ret && enlarge) 2580 memcg_oom_recover(memcg); 2581 2582 return ret; 2583 } 2584 2585 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2586 unsigned long limit) 2587 { 2588 unsigned long curusage; 2589 unsigned long oldusage; 2590 bool enlarge = false; 2591 int retry_count; 2592 int ret; 2593 2594 /* see mem_cgroup_resize_res_limit */ 2595 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 2596 mem_cgroup_count_children(memcg); 2597 2598 oldusage = page_counter_read(&memcg->memsw); 2599 2600 do { 2601 if (signal_pending(current)) { 2602 ret = -EINTR; 2603 break; 2604 } 2605 2606 mutex_lock(&memcg_limit_mutex); 2607 if (limit < memcg->memory.limit) { 2608 mutex_unlock(&memcg_limit_mutex); 2609 ret = -EINVAL; 2610 break; 2611 } 2612 if (limit > memcg->memsw.limit) 2613 enlarge = true; 2614 ret = page_counter_limit(&memcg->memsw, limit); 2615 mutex_unlock(&memcg_limit_mutex); 2616 2617 if (!ret) 2618 break; 2619 2620 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 2621 2622 curusage = page_counter_read(&memcg->memsw); 2623 /* Usage is reduced ? 
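(If not, burn one retry; if it is, take the lower usage as the new baseline and keep trying to hit the requested limit.)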
*/ 2624 if (curusage >= oldusage) 2625 retry_count--; 2626 else 2627 oldusage = curusage; 2628 } while (retry_count); 2629 2630 if (!ret && enlarge) 2631 memcg_oom_recover(memcg); 2632 2633 return ret; 2634 } 2635 2636 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2637 gfp_t gfp_mask, 2638 unsigned long *total_scanned) 2639 { 2640 unsigned long nr_reclaimed = 0; 2641 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2642 unsigned long reclaimed; 2643 int loop = 0; 2644 struct mem_cgroup_tree_per_zone *mctz; 2645 unsigned long excess; 2646 unsigned long nr_scanned; 2647 2648 if (order > 0) 2649 return 0; 2650 2651 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 2652 /* 2653 * This loop can run a while, specially if mem_cgroup's continuously 2654 * keep exceeding their soft limit and putting the system under 2655 * pressure 2656 */ 2657 do { 2658 if (next_mz) 2659 mz = next_mz; 2660 else 2661 mz = mem_cgroup_largest_soft_limit_node(mctz); 2662 if (!mz) 2663 break; 2664 2665 nr_scanned = 0; 2666 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 2667 gfp_mask, &nr_scanned); 2668 nr_reclaimed += reclaimed; 2669 *total_scanned += nr_scanned; 2670 spin_lock_irq(&mctz->lock); 2671 __mem_cgroup_remove_exceeded(mz, mctz); 2672 2673 /* 2674 * If we failed to reclaim anything from this memory cgroup 2675 * it is time to move on to the next cgroup 2676 */ 2677 next_mz = NULL; 2678 if (!reclaimed) 2679 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 2680 2681 excess = soft_limit_excess(mz->memcg); 2682 /* 2683 * One school of thought says that we should not add 2684 * back the node to the tree if reclaim returns 0. 2685 * But our reclaim could return 0, simply because due 2686 * to priority we are exposing a smaller subset of 2687 * memory to reclaim from. Consider this as a longer 2688 * term TODO. 2689 */ 2690 /* If excess == 0, no tree ops */ 2691 __mem_cgroup_insert_exceeded(mz, mctz, excess); 2692 spin_unlock_irq(&mctz->lock); 2693 css_put(&mz->memcg->css); 2694 loop++; 2695 /* 2696 * Could not reclaim anything and there are no more 2697 * mem cgroups to try or we seem to be looping without 2698 * reclaiming anything. 2699 */ 2700 if (!nr_reclaimed && 2701 (next_mz == NULL || 2702 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2703 break; 2704 } while (!nr_reclaimed); 2705 if (next_mz) 2706 css_put(&next_mz->memcg->css); 2707 return nr_reclaimed; 2708 } 2709 2710 /* 2711 * Test whether @memcg has children, dead or alive. Note that this 2712 * function doesn't care whether @memcg has use_hierarchy enabled and 2713 * returns %true if there are child csses according to the cgroup 2714 * hierarchy. Testing use_hierarchy is the caller's responsiblity. 2715 */ 2716 static inline bool memcg_has_children(struct mem_cgroup *memcg) 2717 { 2718 bool ret; 2719 2720 /* 2721 * The lock does not prevent addition or deletion of children, but 2722 * it prevents a new child from being initialized based on this 2723 * parent in css_online(), so it's enough to decide whether 2724 * hierarchically inherited attributes can still be changed or not. 2725 */ 2726 lockdep_assert_held(&memcg_create_mutex); 2727 2728 rcu_read_lock(); 2729 ret = css_next_child(NULL, &memcg->css); 2730 rcu_read_unlock(); 2731 return ret; 2732 } 2733 2734 /* 2735 * Reclaims as many pages from the given memcg as possible and moves 2736 * the rest to the parent. 2737 * 2738 * Caller is responsible for holding css reference for memcg. 
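 *
 * For illustration only (not in the original file), a hedged sketch of a
 * caller honouring that rule; example_empty_memcg() is a made-up name and
 * the block is compiled out.
 */
#if 0	/* illustrative sketch only, never built */
static int example_empty_memcg(struct mem_cgroup *memcg)
{
	int ret;

	css_get(&memcg->css);			/* pin the memcg across the reclaim loop */
	ret = mem_cgroup_force_empty(memcg);	/* 0 on success, -EINTR if signalled */
	css_put(&memcg->css);
	return ret;
}
#endif
/* (end of illustrative aside)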
2739 */ 2740 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 2741 { 2742 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2743 2744 /* we call try-to-free pages for make this cgroup empty */ 2745 lru_add_drain_all(); 2746 /* try to free all pages in this cgroup */ 2747 while (nr_retries && page_counter_read(&memcg->memory)) { 2748 int progress; 2749 2750 if (signal_pending(current)) 2751 return -EINTR; 2752 2753 progress = try_to_free_mem_cgroup_pages(memcg, 1, 2754 GFP_KERNEL, true); 2755 if (!progress) { 2756 nr_retries--; 2757 /* maybe some writeback is necessary */ 2758 congestion_wait(BLK_RW_ASYNC, HZ/10); 2759 } 2760 2761 } 2762 2763 return 0; 2764 } 2765 2766 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 2767 char *buf, size_t nbytes, 2768 loff_t off) 2769 { 2770 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 2771 2772 if (mem_cgroup_is_root(memcg)) 2773 return -EINVAL; 2774 return mem_cgroup_force_empty(memcg) ?: nbytes; 2775 } 2776 2777 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 2778 struct cftype *cft) 2779 { 2780 return mem_cgroup_from_css(css)->use_hierarchy; 2781 } 2782 2783 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 2784 struct cftype *cft, u64 val) 2785 { 2786 int retval = 0; 2787 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2788 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 2789 2790 mutex_lock(&memcg_create_mutex); 2791 2792 if (memcg->use_hierarchy == val) 2793 goto out; 2794 2795 /* 2796 * If parent's use_hierarchy is set, we can't make any modifications 2797 * in the child subtrees. If it is unset, then the change can 2798 * occur, provided the current cgroup has no children. 2799 * 2800 * For the root cgroup, parent_mem is NULL, we allow value to be 2801 * set if there are no children. 2802 */ 2803 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 2804 (val == 1 || val == 0)) { 2805 if (!memcg_has_children(memcg)) 2806 memcg->use_hierarchy = val; 2807 else 2808 retval = -EBUSY; 2809 } else 2810 retval = -EINVAL; 2811 2812 out: 2813 mutex_unlock(&memcg_create_mutex); 2814 2815 return retval; 2816 } 2817 2818 static unsigned long tree_stat(struct mem_cgroup *memcg, 2819 enum mem_cgroup_stat_index idx) 2820 { 2821 struct mem_cgroup *iter; 2822 long val = 0; 2823 2824 /* Per-cpu values can be negative, use a signed accumulator */ 2825 for_each_mem_cgroup_tree(iter, memcg) 2826 val += mem_cgroup_read_stat(iter, idx); 2827 2828 if (val < 0) /* race ? 
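(per-CPU counters are updated without cross-CPU synchronization, see the signed accumulator note above, so a racing charge/uncharge pair split across CPUs can leave a transiently negative sum; clamp it)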
*/ 2829 val = 0; 2830 return val; 2831 } 2832 2833 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 2834 { 2835 u64 val; 2836 2837 if (mem_cgroup_is_root(memcg)) { 2838 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); 2839 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); 2840 if (swap) 2841 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); 2842 } else { 2843 if (!swap) 2844 val = page_counter_read(&memcg->memory); 2845 else 2846 val = page_counter_read(&memcg->memsw); 2847 } 2848 return val << PAGE_SHIFT; 2849 } 2850 2851 enum { 2852 RES_USAGE, 2853 RES_LIMIT, 2854 RES_MAX_USAGE, 2855 RES_FAILCNT, 2856 RES_SOFT_LIMIT, 2857 }; 2858 2859 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 2860 struct cftype *cft) 2861 { 2862 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2863 struct page_counter *counter; 2864 2865 switch (MEMFILE_TYPE(cft->private)) { 2866 case _MEM: 2867 counter = &memcg->memory; 2868 break; 2869 case _MEMSWAP: 2870 counter = &memcg->memsw; 2871 break; 2872 case _KMEM: 2873 counter = &memcg->kmem; 2874 break; 2875 default: 2876 BUG(); 2877 } 2878 2879 switch (MEMFILE_ATTR(cft->private)) { 2880 case RES_USAGE: 2881 if (counter == &memcg->memory) 2882 return mem_cgroup_usage(memcg, false); 2883 if (counter == &memcg->memsw) 2884 return mem_cgroup_usage(memcg, true); 2885 return (u64)page_counter_read(counter) * PAGE_SIZE; 2886 case RES_LIMIT: 2887 return (u64)counter->limit * PAGE_SIZE; 2888 case RES_MAX_USAGE: 2889 return (u64)counter->watermark * PAGE_SIZE; 2890 case RES_FAILCNT: 2891 return counter->failcnt; 2892 case RES_SOFT_LIMIT: 2893 return (u64)memcg->soft_limit * PAGE_SIZE; 2894 default: 2895 BUG(); 2896 } 2897 } 2898 2899 #ifdef CONFIG_MEMCG_KMEM 2900 static int memcg_activate_kmem(struct mem_cgroup *memcg, 2901 unsigned long nr_pages) 2902 { 2903 int err = 0; 2904 int memcg_id; 2905 2906 BUG_ON(memcg->kmemcg_id >= 0); 2907 BUG_ON(memcg->kmem_acct_activated); 2908 BUG_ON(memcg->kmem_acct_active); 2909 2910 /* 2911 * For simplicity, we won't allow this to be disabled. It also can't 2912 * be changed if the cgroup has children already, or if tasks had 2913 * already joined. 2914 * 2915 * If tasks join before we set the limit, a person looking at 2916 * kmem.usage_in_bytes will have no way to determine when it took 2917 * place, which makes the value quite meaningless. 2918 * 2919 * After it first became limited, changes in the value of the limit are 2920 * of course permitted. 2921 */ 2922 mutex_lock(&memcg_create_mutex); 2923 if (cgroup_has_tasks(memcg->css.cgroup) || 2924 (memcg->use_hierarchy && memcg_has_children(memcg))) 2925 err = -EBUSY; 2926 mutex_unlock(&memcg_create_mutex); 2927 if (err) 2928 goto out; 2929 2930 memcg_id = memcg_alloc_cache_id(); 2931 if (memcg_id < 0) { 2932 err = memcg_id; 2933 goto out; 2934 } 2935 2936 /* 2937 * We couldn't have accounted to this cgroup, because it hasn't got 2938 * activated yet, so this should succeed. 2939 */ 2940 err = page_counter_limit(&memcg->kmem, nr_pages); 2941 VM_BUG_ON(err); 2942 2943 static_key_slow_inc(&memcg_kmem_enabled_key); 2944 /* 2945 * A memory cgroup is considered kmem-active as soon as it gets 2946 * kmemcg_id. Setting the id after enabling static branching will 2947 * guarantee no one starts accounting before all call sites are 2948 * patched. 
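 *
 * (The call sites in question are the kmem charge paths keyed off
 * memcg_kmem_enabled_key, see the static_key_slow_inc() just above.)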
2949 */ 2950 memcg->kmemcg_id = memcg_id; 2951 memcg->kmem_acct_activated = true; 2952 memcg->kmem_acct_active = true; 2953 out: 2954 return err; 2955 } 2956 2957 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 2958 unsigned long limit) 2959 { 2960 int ret; 2961 2962 mutex_lock(&memcg_limit_mutex); 2963 if (!memcg_kmem_is_active(memcg)) 2964 ret = memcg_activate_kmem(memcg, limit); 2965 else 2966 ret = page_counter_limit(&memcg->kmem, limit); 2967 mutex_unlock(&memcg_limit_mutex); 2968 return ret; 2969 } 2970 2971 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 2972 { 2973 int ret = 0; 2974 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 2975 2976 if (!parent) 2977 return 0; 2978 2979 mutex_lock(&memcg_limit_mutex); 2980 /* 2981 * If the parent cgroup is not kmem-active now, it cannot be activated 2982 * after this point, because it has at least one child already. 2983 */ 2984 if (memcg_kmem_is_active(parent)) 2985 ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); 2986 mutex_unlock(&memcg_limit_mutex); 2987 return ret; 2988 } 2989 #else 2990 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 2991 unsigned long limit) 2992 { 2993 return -EINVAL; 2994 } 2995 #endif /* CONFIG_MEMCG_KMEM */ 2996 2997 /* 2998 * The user of this function is... 2999 * RES_LIMIT. 3000 */ 3001 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3002 char *buf, size_t nbytes, loff_t off) 3003 { 3004 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3005 unsigned long nr_pages; 3006 int ret; 3007 3008 buf = strstrip(buf); 3009 ret = page_counter_memparse(buf, "-1", &nr_pages); 3010 if (ret) 3011 return ret; 3012 3013 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3014 case RES_LIMIT: 3015 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3016 ret = -EINVAL; 3017 break; 3018 } 3019 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3020 case _MEM: 3021 ret = mem_cgroup_resize_limit(memcg, nr_pages); 3022 break; 3023 case _MEMSWAP: 3024 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); 3025 break; 3026 case _KMEM: 3027 ret = memcg_update_kmem_limit(memcg, nr_pages); 3028 break; 3029 } 3030 break; 3031 case RES_SOFT_LIMIT: 3032 memcg->soft_limit = nr_pages; 3033 ret = 0; 3034 break; 3035 } 3036 return ret ?: nbytes; 3037 } 3038 3039 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3040 size_t nbytes, loff_t off) 3041 { 3042 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3043 struct page_counter *counter; 3044 3045 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3046 case _MEM: 3047 counter = &memcg->memory; 3048 break; 3049 case _MEMSWAP: 3050 counter = &memcg->memsw; 3051 break; 3052 case _KMEM: 3053 counter = &memcg->kmem; 3054 break; 3055 default: 3056 BUG(); 3057 } 3058 3059 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3060 case RES_MAX_USAGE: 3061 page_counter_reset_watermark(counter); 3062 break; 3063 case RES_FAILCNT: 3064 counter->failcnt = 0; 3065 break; 3066 default: 3067 BUG(); 3068 } 3069 3070 return nbytes; 3071 } 3072 3073 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3074 struct cftype *cft) 3075 { 3076 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3077 } 3078 3079 #ifdef CONFIG_MMU 3080 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3081 struct cftype *cft, u64 val) 3082 { 3083 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3084 3085 if (val & ~MOVE_MASK) 3086 return -EINVAL; 3087 3088 /* 3089 * No kind of locking is needed in here, 
because ->can_attach() will 3090 * check this value once in the beginning of the process, and then carry 3091 * on with stale data. This means that changes to this value will only 3092 * affect task migrations starting after the change. 3093 */ 3094 memcg->move_charge_at_immigrate = val; 3095 return 0; 3096 } 3097 #else 3098 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3099 struct cftype *cft, u64 val) 3100 { 3101 return -ENOSYS; 3102 } 3103 #endif 3104 3105 #ifdef CONFIG_NUMA 3106 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3107 { 3108 struct numa_stat { 3109 const char *name; 3110 unsigned int lru_mask; 3111 }; 3112 3113 static const struct numa_stat stats[] = { 3114 { "total", LRU_ALL }, 3115 { "file", LRU_ALL_FILE }, 3116 { "anon", LRU_ALL_ANON }, 3117 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3118 }; 3119 const struct numa_stat *stat; 3120 int nid; 3121 unsigned long nr; 3122 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3123 3124 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3125 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 3126 seq_printf(m, "%s=%lu", stat->name, nr); 3127 for_each_node_state(nid, N_MEMORY) { 3128 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 3129 stat->lru_mask); 3130 seq_printf(m, " N%d=%lu", nid, nr); 3131 } 3132 seq_putc(m, '\n'); 3133 } 3134 3135 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3136 struct mem_cgroup *iter; 3137 3138 nr = 0; 3139 for_each_mem_cgroup_tree(iter, memcg) 3140 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 3141 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 3142 for_each_node_state(nid, N_MEMORY) { 3143 nr = 0; 3144 for_each_mem_cgroup_tree(iter, memcg) 3145 nr += mem_cgroup_node_nr_lru_pages( 3146 iter, nid, stat->lru_mask); 3147 seq_printf(m, " N%d=%lu", nid, nr); 3148 } 3149 seq_putc(m, '\n'); 3150 } 3151 3152 return 0; 3153 } 3154 #endif /* CONFIG_NUMA */ 3155 3156 static int memcg_stat_show(struct seq_file *m, void *v) 3157 { 3158 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3159 unsigned long memory, memsw; 3160 struct mem_cgroup *mi; 3161 unsigned int i; 3162 3163 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != 3164 MEM_CGROUP_STAT_NSTATS); 3165 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != 3166 MEM_CGROUP_EVENTS_NSTATS); 3167 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3168 3169 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3170 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3171 continue; 3172 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 3173 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 3174 } 3175 3176 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 3177 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 3178 mem_cgroup_read_events(memcg, i)); 3179 3180 for (i = 0; i < NR_LRU_LISTS; i++) 3181 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 3182 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3183 3184 /* Hierarchical information */ 3185 memory = memsw = PAGE_COUNTER_MAX; 3186 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 3187 memory = min(memory, mi->memory.limit); 3188 memsw = min(memsw, mi->memsw.limit); 3189 } 3190 seq_printf(m, "hierarchical_memory_limit %llu\n", 3191 (u64)memory * PAGE_SIZE); 3192 if (do_swap_account) 3193 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3194 (u64)memsw * PAGE_SIZE); 3195 3196 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3197 long long val = 0; 3198 3199 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 
3200 continue; 3201 for_each_mem_cgroup_tree(mi, memcg) 3202 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 3203 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 3204 } 3205 3206 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 3207 unsigned long long val = 0; 3208 3209 for_each_mem_cgroup_tree(mi, memcg) 3210 val += mem_cgroup_read_events(mi, i); 3211 seq_printf(m, "total_%s %llu\n", 3212 mem_cgroup_events_names[i], val); 3213 } 3214 3215 for (i = 0; i < NR_LRU_LISTS; i++) { 3216 unsigned long long val = 0; 3217 3218 for_each_mem_cgroup_tree(mi, memcg) 3219 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 3220 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 3221 } 3222 3223 #ifdef CONFIG_DEBUG_VM 3224 { 3225 int nid, zid; 3226 struct mem_cgroup_per_zone *mz; 3227 struct zone_reclaim_stat *rstat; 3228 unsigned long recent_rotated[2] = {0, 0}; 3229 unsigned long recent_scanned[2] = {0, 0}; 3230 3231 for_each_online_node(nid) 3232 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3233 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 3234 rstat = &mz->lruvec.reclaim_stat; 3235 3236 recent_rotated[0] += rstat->recent_rotated[0]; 3237 recent_rotated[1] += rstat->recent_rotated[1]; 3238 recent_scanned[0] += rstat->recent_scanned[0]; 3239 recent_scanned[1] += rstat->recent_scanned[1]; 3240 } 3241 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 3242 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 3243 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 3244 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 3245 } 3246 #endif 3247 3248 return 0; 3249 } 3250 3251 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 3252 struct cftype *cft) 3253 { 3254 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3255 3256 return mem_cgroup_swappiness(memcg); 3257 } 3258 3259 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 3260 struct cftype *cft, u64 val) 3261 { 3262 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3263 3264 if (val > 100) 3265 return -EINVAL; 3266 3267 if (css->parent) 3268 memcg->swappiness = val; 3269 else 3270 vm_swappiness = val; 3271 3272 return 0; 3273 } 3274 3275 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3276 { 3277 struct mem_cgroup_threshold_ary *t; 3278 unsigned long usage; 3279 int i; 3280 3281 rcu_read_lock(); 3282 if (!swap) 3283 t = rcu_dereference(memcg->thresholds.primary); 3284 else 3285 t = rcu_dereference(memcg->memsw_thresholds.primary); 3286 3287 if (!t) 3288 goto unlock; 3289 3290 usage = mem_cgroup_usage(memcg, swap); 3291 3292 /* 3293 * current_threshold points to threshold just below or equal to usage. 3294 * If it's not true, a threshold was crossed after last 3295 * call of __mem_cgroup_threshold(). 3296 */ 3297 i = t->current_threshold; 3298 3299 /* 3300 * Iterate backward over array of thresholds starting from 3301 * current_threshold and check if a threshold is crossed. 3302 * If none of thresholds below usage is crossed, we read 3303 * only one element of the array here. 3304 */ 3305 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3306 eventfd_signal(t->entries[i].eventfd, 1); 3307 3308 /* i = current_threshold + 1 */ 3309 i++; 3310 3311 /* 3312 * Iterate forward over array of thresholds starting from 3313 * current_threshold+1 and check if a threshold is crossed. 3314 * If none of thresholds above usage is crossed, we read 3315 * only one element of the array here. 
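 *
 * Worked example (hypothetical thresholds): with entries sorted as
 * {4M, 8M, 16M} and current_threshold == 0 because usage was last seen
 * at 5M, a new usage of 9M skips the backward loop above, signals the
 * 8M eventfd in the forward loop below, and leaves current_threshold
 * at 1.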
3316 */ 3317 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3318 eventfd_signal(t->entries[i].eventfd, 1); 3319 3320 /* Update current_threshold */ 3321 t->current_threshold = i - 1; 3322 unlock: 3323 rcu_read_unlock(); 3324 } 3325 3326 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3327 { 3328 while (memcg) { 3329 __mem_cgroup_threshold(memcg, false); 3330 if (do_swap_account) 3331 __mem_cgroup_threshold(memcg, true); 3332 3333 memcg = parent_mem_cgroup(memcg); 3334 } 3335 } 3336 3337 static int compare_thresholds(const void *a, const void *b) 3338 { 3339 const struct mem_cgroup_threshold *_a = a; 3340 const struct mem_cgroup_threshold *_b = b; 3341 3342 if (_a->threshold > _b->threshold) 3343 return 1; 3344 3345 if (_a->threshold < _b->threshold) 3346 return -1; 3347 3348 return 0; 3349 } 3350 3351 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 3352 { 3353 struct mem_cgroup_eventfd_list *ev; 3354 3355 spin_lock(&memcg_oom_lock); 3356 3357 list_for_each_entry(ev, &memcg->oom_notify, list) 3358 eventfd_signal(ev->eventfd, 1); 3359 3360 spin_unlock(&memcg_oom_lock); 3361 return 0; 3362 } 3363 3364 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 3365 { 3366 struct mem_cgroup *iter; 3367 3368 for_each_mem_cgroup_tree(iter, memcg) 3369 mem_cgroup_oom_notify_cb(iter); 3370 } 3371 3372 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3373 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 3374 { 3375 struct mem_cgroup_thresholds *thresholds; 3376 struct mem_cgroup_threshold_ary *new; 3377 unsigned long threshold; 3378 unsigned long usage; 3379 int i, size, ret; 3380 3381 ret = page_counter_memparse(args, "-1", &threshold); 3382 if (ret) 3383 return ret; 3384 3385 mutex_lock(&memcg->thresholds_lock); 3386 3387 if (type == _MEM) { 3388 thresholds = &memcg->thresholds; 3389 usage = mem_cgroup_usage(memcg, false); 3390 } else if (type == _MEMSWAP) { 3391 thresholds = &memcg->memsw_thresholds; 3392 usage = mem_cgroup_usage(memcg, true); 3393 } else 3394 BUG(); 3395 3396 /* Check if a threshold crossed before adding a new one */ 3397 if (thresholds->primary) 3398 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3399 3400 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3401 3402 /* Allocate memory for new array of thresholds */ 3403 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3404 GFP_KERNEL); 3405 if (!new) { 3406 ret = -ENOMEM; 3407 goto unlock; 3408 } 3409 new->size = size; 3410 3411 /* Copy thresholds (if any) to new array */ 3412 if (thresholds->primary) { 3413 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3414 sizeof(struct mem_cgroup_threshold)); 3415 } 3416 3417 /* Add new threshold */ 3418 new->entries[size - 1].eventfd = eventfd; 3419 new->entries[size - 1].threshold = threshold; 3420 3421 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3422 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3423 compare_thresholds, NULL); 3424 3425 /* Find current threshold */ 3426 new->current_threshold = -1; 3427 for (i = 0; i < size; i++) { 3428 if (new->entries[i].threshold <= usage) { 3429 /* 3430 * new->current_threshold will not be used until 3431 * rcu_assign_pointer(), so it's safe to increment 3432 * it here. 
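 *
 * Note that an entry whose threshold is already at or below the
 * current usage is only counted into current_threshold here, not
 * signalled; it fires on a later crossing.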
3433 */ 3434 ++new->current_threshold; 3435 } else 3436 break; 3437 } 3438 3439 /* Free old spare buffer and save old primary buffer as spare */ 3440 kfree(thresholds->spare); 3441 thresholds->spare = thresholds->primary; 3442 3443 rcu_assign_pointer(thresholds->primary, new); 3444 3445 /* To be sure that nobody uses thresholds */ 3446 synchronize_rcu(); 3447 3448 unlock: 3449 mutex_unlock(&memcg->thresholds_lock); 3450 3451 return ret; 3452 } 3453 3454 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3455 struct eventfd_ctx *eventfd, const char *args) 3456 { 3457 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 3458 } 3459 3460 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 3461 struct eventfd_ctx *eventfd, const char *args) 3462 { 3463 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 3464 } 3465 3466 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3467 struct eventfd_ctx *eventfd, enum res_type type) 3468 { 3469 struct mem_cgroup_thresholds *thresholds; 3470 struct mem_cgroup_threshold_ary *new; 3471 unsigned long usage; 3472 int i, j, size; 3473 3474 mutex_lock(&memcg->thresholds_lock); 3475 3476 if (type == _MEM) { 3477 thresholds = &memcg->thresholds; 3478 usage = mem_cgroup_usage(memcg, false); 3479 } else if (type == _MEMSWAP) { 3480 thresholds = &memcg->memsw_thresholds; 3481 usage = mem_cgroup_usage(memcg, true); 3482 } else 3483 BUG(); 3484 3485 if (!thresholds->primary) 3486 goto unlock; 3487 3488 /* Check if a threshold crossed before removing */ 3489 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3490 3491 /* Calculate new number of threshold */ 3492 size = 0; 3493 for (i = 0; i < thresholds->primary->size; i++) { 3494 if (thresholds->primary->entries[i].eventfd != eventfd) 3495 size++; 3496 } 3497 3498 new = thresholds->spare; 3499 3500 /* Set thresholds array to NULL if we don't have thresholds */ 3501 if (!size) { 3502 kfree(new); 3503 new = NULL; 3504 goto swap_buffers; 3505 } 3506 3507 new->size = size; 3508 3509 /* Copy thresholds and find current threshold */ 3510 new->current_threshold = -1; 3511 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 3512 if (thresholds->primary->entries[i].eventfd == eventfd) 3513 continue; 3514 3515 new->entries[j] = thresholds->primary->entries[i]; 3516 if (new->entries[j].threshold <= usage) { 3517 /* 3518 * new->current_threshold will not be used 3519 * until rcu_assign_pointer(), so it's safe to increment 3520 * it here. 
3521 */ 3522 ++new->current_threshold; 3523 } 3524 j++; 3525 } 3526 3527 swap_buffers: 3528 /* Swap primary and spare array */ 3529 thresholds->spare = thresholds->primary; 3530 /* If all events are unregistered, free the spare array */ 3531 if (!new) { 3532 kfree(thresholds->spare); 3533 thresholds->spare = NULL; 3534 } 3535 3536 rcu_assign_pointer(thresholds->primary, new); 3537 3538 /* To be sure that nobody uses thresholds */ 3539 synchronize_rcu(); 3540 unlock: 3541 mutex_unlock(&memcg->thresholds_lock); 3542 } 3543 3544 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3545 struct eventfd_ctx *eventfd) 3546 { 3547 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 3548 } 3549 3550 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3551 struct eventfd_ctx *eventfd) 3552 { 3553 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 3554 } 3555 3556 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 3557 struct eventfd_ctx *eventfd, const char *args) 3558 { 3559 struct mem_cgroup_eventfd_list *event; 3560 3561 event = kmalloc(sizeof(*event), GFP_KERNEL); 3562 if (!event) 3563 return -ENOMEM; 3564 3565 spin_lock(&memcg_oom_lock); 3566 3567 event->eventfd = eventfd; 3568 list_add(&event->list, &memcg->oom_notify); 3569 3570 /* already in OOM ? */ 3571 if (memcg->under_oom) 3572 eventfd_signal(eventfd, 1); 3573 spin_unlock(&memcg_oom_lock); 3574 3575 return 0; 3576 } 3577 3578 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 3579 struct eventfd_ctx *eventfd) 3580 { 3581 struct mem_cgroup_eventfd_list *ev, *tmp; 3582 3583 spin_lock(&memcg_oom_lock); 3584 3585 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 3586 if (ev->eventfd == eventfd) { 3587 list_del(&ev->list); 3588 kfree(ev); 3589 } 3590 } 3591 3592 spin_unlock(&memcg_oom_lock); 3593 } 3594 3595 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 3596 { 3597 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3598 3599 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 3600 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 3601 return 0; 3602 } 3603 3604 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 3605 struct cftype *cft, u64 val) 3606 { 3607 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3608 3609 /* cannot set to root cgroup and only 0 and 1 are allowed */ 3610 if (!css->parent || !((val == 0) || (val == 1))) 3611 return -EINVAL; 3612 3613 memcg->oom_kill_disable = val; 3614 if (!val) 3615 memcg_oom_recover(memcg); 3616 3617 return 0; 3618 } 3619 3620 #ifdef CONFIG_MEMCG_KMEM 3621 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 3622 { 3623 int ret; 3624 3625 ret = memcg_propagate_kmem(memcg); 3626 if (ret) 3627 return ret; 3628 3629 return mem_cgroup_sockets_init(memcg, ss); 3630 } 3631 3632 static void memcg_deactivate_kmem(struct mem_cgroup *memcg) 3633 { 3634 struct cgroup_subsys_state *css; 3635 struct mem_cgroup *parent, *child; 3636 int kmemcg_id; 3637 3638 if (!memcg->kmem_acct_active) 3639 return; 3640 3641 /* 3642 * Clear the 'active' flag before clearing memcg_caches arrays entries. 3643 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it 3644 * guarantees no cache will be created for this cgroup after we are 3645 * done (see memcg_create_kmem_cache()). 
3646 */ 3647 memcg->kmem_acct_active = false; 3648 3649 memcg_deactivate_kmem_caches(memcg); 3650 3651 kmemcg_id = memcg->kmemcg_id; 3652 BUG_ON(kmemcg_id < 0); 3653 3654 parent = parent_mem_cgroup(memcg); 3655 if (!parent) 3656 parent = root_mem_cgroup; 3657 3658 /* 3659 * Change kmemcg_id of this cgroup and all its descendants to the 3660 * parent's id, and then move all entries from this cgroup's list_lrus 3661 * to ones of the parent. After we have finished, all list_lrus 3662 * corresponding to this cgroup are guaranteed to remain empty. The 3663 * ordering is imposed by list_lru_node->lock taken by 3664 * memcg_drain_all_list_lrus(). 3665 */ 3666 css_for_each_descendant_pre(css, &memcg->css) { 3667 child = mem_cgroup_from_css(css); 3668 BUG_ON(child->kmemcg_id != kmemcg_id); 3669 child->kmemcg_id = parent->kmemcg_id; 3670 if (!memcg->use_hierarchy) 3671 break; 3672 } 3673 memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); 3674 3675 memcg_free_cache_id(kmemcg_id); 3676 } 3677 3678 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 3679 { 3680 if (memcg->kmem_acct_activated) { 3681 memcg_destroy_kmem_caches(memcg); 3682 static_key_slow_dec(&memcg_kmem_enabled_key); 3683 WARN_ON(page_counter_read(&memcg->kmem)); 3684 } 3685 mem_cgroup_sockets_destroy(memcg); 3686 } 3687 #else 3688 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 3689 { 3690 return 0; 3691 } 3692 3693 static void memcg_deactivate_kmem(struct mem_cgroup *memcg) 3694 { 3695 } 3696 3697 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 3698 { 3699 } 3700 #endif 3701 3702 #ifdef CONFIG_CGROUP_WRITEBACK 3703 3704 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) 3705 { 3706 return &memcg->cgwb_list; 3707 } 3708 3709 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 3710 { 3711 return wb_domain_init(&memcg->cgwb_domain, gfp); 3712 } 3713 3714 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 3715 { 3716 wb_domain_exit(&memcg->cgwb_domain); 3717 } 3718 3719 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 3720 { 3721 wb_domain_size_changed(&memcg->cgwb_domain); 3722 } 3723 3724 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 3725 { 3726 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3727 3728 if (!memcg->css.parent) 3729 return NULL; 3730 3731 return &memcg->cgwb_domain; 3732 } 3733 3734 /** 3735 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 3736 * @wb: bdi_writeback in question 3737 * @pavail: out parameter for number of available pages 3738 * @pdirty: out parameter for number of dirty pages 3739 * @pwriteback: out parameter for number of pages under writeback 3740 * 3741 * Determine the numbers of available, dirty, and writeback pages in @wb's 3742 * memcg. Dirty and writeback are self-explanatory. Available is a bit 3743 * more involved. 3744 * 3745 * A memcg's headroom is "min(max, high) - used". The available memory is 3746 * calculated as the lowest headroom of itself and the ancestors plus the 3747 * number of pages already being used for file pages. Note that this 3748 * doesn't consider the actual amount of available memory in the system. 3749 * The caller should further cap *@pavail accordingly. 
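 *
 * Worked example (hypothetical numbers): for a memcg with limit = 1024
 * pages, high = 512 and usage = 384, and ancestors with more headroom,
 * the binding level contributes min(1024, 512) - 384 = 128 pages, so
 * *@pavail ends up as 128 plus the memcg's file LRU pages.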
3750 */ 3751 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, 3752 unsigned long *pdirty, unsigned long *pwriteback) 3753 { 3754 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3755 struct mem_cgroup *parent; 3756 unsigned long head_room = PAGE_COUNTER_MAX; 3757 unsigned long file_pages; 3758 3759 *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); 3760 3761 /* this should eventually include NR_UNSTABLE_NFS */ 3762 *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 3763 3764 file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | 3765 (1 << LRU_ACTIVE_FILE)); 3766 while ((parent = parent_mem_cgroup(memcg))) { 3767 unsigned long ceiling = min(memcg->memory.limit, memcg->high); 3768 unsigned long used = page_counter_read(&memcg->memory); 3769 3770 head_room = min(head_room, ceiling - min(ceiling, used)); 3771 memcg = parent; 3772 } 3773 3774 *pavail = file_pages + head_room; 3775 } 3776 3777 #else /* CONFIG_CGROUP_WRITEBACK */ 3778 3779 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 3780 { 3781 return 0; 3782 } 3783 3784 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 3785 { 3786 } 3787 3788 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 3789 { 3790 } 3791 3792 #endif /* CONFIG_CGROUP_WRITEBACK */ 3793 3794 /* 3795 * DO NOT USE IN NEW FILES. 3796 * 3797 * "cgroup.event_control" implementation. 3798 * 3799 * This is way over-engineered. It tries to support fully configurable 3800 * events for each user. Such level of flexibility is completely 3801 * unnecessary especially in the light of the planned unified hierarchy. 3802 * 3803 * Please deprecate this and replace with something simpler if at all 3804 * possible. 3805 */ 3806 3807 /* 3808 * Unregister event and free resources. 3809 * 3810 * Gets called from workqueue. 3811 */ 3812 static void memcg_event_remove(struct work_struct *work) 3813 { 3814 struct mem_cgroup_event *event = 3815 container_of(work, struct mem_cgroup_event, remove); 3816 struct mem_cgroup *memcg = event->memcg; 3817 3818 remove_wait_queue(event->wqh, &event->wait); 3819 3820 event->unregister_event(memcg, event->eventfd); 3821 3822 /* Notify userspace the event is going away. */ 3823 eventfd_signal(event->eventfd, 1); 3824 3825 eventfd_ctx_put(event->eventfd); 3826 kfree(event); 3827 css_put(&memcg->css); 3828 } 3829 3830 /* 3831 * Gets called on POLLHUP on eventfd when user closes it. 3832 * 3833 * Called with wqh->lock held and interrupts disabled. 3834 */ 3835 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 3836 int sync, void *key) 3837 { 3838 struct mem_cgroup_event *event = 3839 container_of(wait, struct mem_cgroup_event, wait); 3840 struct mem_cgroup *memcg = event->memcg; 3841 unsigned long flags = (unsigned long)key; 3842 3843 if (flags & POLLHUP) { 3844 /* 3845 * If the event has been detached at cgroup removal, we 3846 * can simply return knowing the other side will cleanup 3847 * for us. 3848 * 3849 * We can't race against event freeing since the other 3850 * side will require wqh->lock via remove_wait_queue(), 3851 * which we hold. 3852 */ 3853 spin_lock(&memcg->event_list_lock); 3854 if (!list_empty(&event->list)) { 3855 list_del_init(&event->list); 3856 /* 3857 * We are in atomic context, but cgroup_event_remove() 3858 * may sleep, so we have to call it in workqueue. 
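 *
 * (The unregister callbacks it runs take mutexes and call
 * synchronize_rcu(), and remove_wait_queue() needs wqh->lock,
 * which is held here.)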
3859 */ 3860 schedule_work(&event->remove); 3861 } 3862 spin_unlock(&memcg->event_list_lock); 3863 } 3864 3865 return 0; 3866 } 3867 3868 static void memcg_event_ptable_queue_proc(struct file *file, 3869 wait_queue_head_t *wqh, poll_table *pt) 3870 { 3871 struct mem_cgroup_event *event = 3872 container_of(pt, struct mem_cgroup_event, pt); 3873 3874 event->wqh = wqh; 3875 add_wait_queue(wqh, &event->wait); 3876 } 3877 3878 /* 3879 * DO NOT USE IN NEW FILES. 3880 * 3881 * Parse input and register new cgroup event handler. 3882 * 3883 * Input must be in format '<event_fd> <control_fd> <args>'. 3884 * Interpretation of args is defined by control file implementation. 3885 */ 3886 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 3887 char *buf, size_t nbytes, loff_t off) 3888 { 3889 struct cgroup_subsys_state *css = of_css(of); 3890 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3891 struct mem_cgroup_event *event; 3892 struct cgroup_subsys_state *cfile_css; 3893 unsigned int efd, cfd; 3894 struct fd efile; 3895 struct fd cfile; 3896 const char *name; 3897 char *endp; 3898 int ret; 3899 3900 buf = strstrip(buf); 3901 3902 efd = simple_strtoul(buf, &endp, 10); 3903 if (*endp != ' ') 3904 return -EINVAL; 3905 buf = endp + 1; 3906 3907 cfd = simple_strtoul(buf, &endp, 10); 3908 if ((*endp != ' ') && (*endp != '\0')) 3909 return -EINVAL; 3910 buf = endp + 1; 3911 3912 event = kzalloc(sizeof(*event), GFP_KERNEL); 3913 if (!event) 3914 return -ENOMEM; 3915 3916 event->memcg = memcg; 3917 INIT_LIST_HEAD(&event->list); 3918 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 3919 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 3920 INIT_WORK(&event->remove, memcg_event_remove); 3921 3922 efile = fdget(efd); 3923 if (!efile.file) { 3924 ret = -EBADF; 3925 goto out_kfree; 3926 } 3927 3928 event->eventfd = eventfd_ctx_fileget(efile.file); 3929 if (IS_ERR(event->eventfd)) { 3930 ret = PTR_ERR(event->eventfd); 3931 goto out_put_efile; 3932 } 3933 3934 cfile = fdget(cfd); 3935 if (!cfile.file) { 3936 ret = -EBADF; 3937 goto out_put_eventfd; 3938 } 3939 3940 /* the process need read permission on control file */ 3941 /* AV: shouldn't we check that it's been opened for read instead? */ 3942 ret = inode_permission(file_inode(cfile.file), MAY_READ); 3943 if (ret < 0) 3944 goto out_put_cfile; 3945 3946 /* 3947 * Determine the event callbacks and set them in @event. This used 3948 * to be done via struct cftype but cgroup core no longer knows 3949 * about these events. The following is crude but the whole thing 3950 * is for compatibility anyway. 3951 * 3952 * DO NOT ADD NEW FILES. 
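 *
 * (For example, a control fd opened on memory.oom_control selects the
 * OOM register/unregister pair below, which puts the eventfd on
 * memcg->oom_notify.)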
3953 */ 3954 name = cfile.file->f_path.dentry->d_name.name; 3955 3956 if (!strcmp(name, "memory.usage_in_bytes")) { 3957 event->register_event = mem_cgroup_usage_register_event; 3958 event->unregister_event = mem_cgroup_usage_unregister_event; 3959 } else if (!strcmp(name, "memory.oom_control")) { 3960 event->register_event = mem_cgroup_oom_register_event; 3961 event->unregister_event = mem_cgroup_oom_unregister_event; 3962 } else if (!strcmp(name, "memory.pressure_level")) { 3963 event->register_event = vmpressure_register_event; 3964 event->unregister_event = vmpressure_unregister_event; 3965 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 3966 event->register_event = memsw_cgroup_usage_register_event; 3967 event->unregister_event = memsw_cgroup_usage_unregister_event; 3968 } else { 3969 ret = -EINVAL; 3970 goto out_put_cfile; 3971 } 3972 3973 /* 3974 * Verify @cfile should belong to @css. Also, remaining events are 3975 * automatically removed on cgroup destruction but the removal is 3976 * asynchronous, so take an extra ref on @css. 3977 */ 3978 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 3979 &memory_cgrp_subsys); 3980 ret = -EINVAL; 3981 if (IS_ERR(cfile_css)) 3982 goto out_put_cfile; 3983 if (cfile_css != css) { 3984 css_put(cfile_css); 3985 goto out_put_cfile; 3986 } 3987 3988 ret = event->register_event(memcg, event->eventfd, buf); 3989 if (ret) 3990 goto out_put_css; 3991 3992 efile.file->f_op->poll(efile.file, &event->pt); 3993 3994 spin_lock(&memcg->event_list_lock); 3995 list_add(&event->list, &memcg->event_list); 3996 spin_unlock(&memcg->event_list_lock); 3997 3998 fdput(cfile); 3999 fdput(efile); 4000 4001 return nbytes; 4002 4003 out_put_css: 4004 css_put(css); 4005 out_put_cfile: 4006 fdput(cfile); 4007 out_put_eventfd: 4008 eventfd_ctx_put(event->eventfd); 4009 out_put_efile: 4010 fdput(efile); 4011 out_kfree: 4012 kfree(event); 4013 4014 return ret; 4015 } 4016 4017 static struct cftype mem_cgroup_legacy_files[] = { 4018 { 4019 .name = "usage_in_bytes", 4020 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4021 .read_u64 = mem_cgroup_read_u64, 4022 }, 4023 { 4024 .name = "max_usage_in_bytes", 4025 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4026 .write = mem_cgroup_reset, 4027 .read_u64 = mem_cgroup_read_u64, 4028 }, 4029 { 4030 .name = "limit_in_bytes", 4031 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4032 .write = mem_cgroup_write, 4033 .read_u64 = mem_cgroup_read_u64, 4034 }, 4035 { 4036 .name = "soft_limit_in_bytes", 4037 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4038 .write = mem_cgroup_write, 4039 .read_u64 = mem_cgroup_read_u64, 4040 }, 4041 { 4042 .name = "failcnt", 4043 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4044 .write = mem_cgroup_reset, 4045 .read_u64 = mem_cgroup_read_u64, 4046 }, 4047 { 4048 .name = "stat", 4049 .seq_show = memcg_stat_show, 4050 }, 4051 { 4052 .name = "force_empty", 4053 .write = mem_cgroup_force_empty_write, 4054 }, 4055 { 4056 .name = "use_hierarchy", 4057 .write_u64 = mem_cgroup_hierarchy_write, 4058 .read_u64 = mem_cgroup_hierarchy_read, 4059 }, 4060 { 4061 .name = "cgroup.event_control", /* XXX: for compat */ 4062 .write = memcg_write_event_control, 4063 .flags = CFTYPE_NO_PREFIX, 4064 .mode = S_IWUGO, 4065 }, 4066 { 4067 .name = "swappiness", 4068 .read_u64 = mem_cgroup_swappiness_read, 4069 .write_u64 = mem_cgroup_swappiness_write, 4070 }, 4071 { 4072 .name = "move_charge_at_immigrate", 4073 .read_u64 = mem_cgroup_move_charge_read, 4074 .write_u64 = 
mem_cgroup_move_charge_write, 4075 }, 4076 { 4077 .name = "oom_control", 4078 .seq_show = mem_cgroup_oom_control_read, 4079 .write_u64 = mem_cgroup_oom_control_write, 4080 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4081 }, 4082 { 4083 .name = "pressure_level", 4084 }, 4085 #ifdef CONFIG_NUMA 4086 { 4087 .name = "numa_stat", 4088 .seq_show = memcg_numa_stat_show, 4089 }, 4090 #endif 4091 #ifdef CONFIG_MEMCG_KMEM 4092 { 4093 .name = "kmem.limit_in_bytes", 4094 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 4095 .write = mem_cgroup_write, 4096 .read_u64 = mem_cgroup_read_u64, 4097 }, 4098 { 4099 .name = "kmem.usage_in_bytes", 4100 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 4101 .read_u64 = mem_cgroup_read_u64, 4102 }, 4103 { 4104 .name = "kmem.failcnt", 4105 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 4106 .write = mem_cgroup_reset, 4107 .read_u64 = mem_cgroup_read_u64, 4108 }, 4109 { 4110 .name = "kmem.max_usage_in_bytes", 4111 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4112 .write = mem_cgroup_reset, 4113 .read_u64 = mem_cgroup_read_u64, 4114 }, 4115 #ifdef CONFIG_SLABINFO 4116 { 4117 .name = "kmem.slabinfo", 4118 .seq_start = slab_start, 4119 .seq_next = slab_next, 4120 .seq_stop = slab_stop, 4121 .seq_show = memcg_slab_show, 4122 }, 4123 #endif 4124 #endif 4125 { }, /* terminate */ 4126 }; 4127 4128 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4129 { 4130 struct mem_cgroup_per_node *pn; 4131 struct mem_cgroup_per_zone *mz; 4132 int zone, tmp = node; 4133 /* 4134 * This routine is called against possible nodes. 4135 * But it's BUG to call kmalloc() against offline node. 4136 * 4137 * TODO: this routine can waste much memory for nodes which will 4138 * never be onlined. It's better to use memory hotplug callback 4139 * function. 4140 */ 4141 if (!node_state(node, N_NORMAL_MEMORY)) 4142 tmp = -1; 4143 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4144 if (!pn) 4145 return 1; 4146 4147 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4148 mz = &pn->zoneinfo[zone]; 4149 lruvec_init(&mz->lruvec); 4150 mz->usage_in_excess = 0; 4151 mz->on_tree = false; 4152 mz->memcg = memcg; 4153 } 4154 memcg->nodeinfo[node] = pn; 4155 return 0; 4156 } 4157 4158 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4159 { 4160 kfree(memcg->nodeinfo[node]); 4161 } 4162 4163 static struct mem_cgroup *mem_cgroup_alloc(void) 4164 { 4165 struct mem_cgroup *memcg; 4166 size_t size; 4167 4168 size = sizeof(struct mem_cgroup); 4169 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 4170 4171 memcg = kzalloc(size, GFP_KERNEL); 4172 if (!memcg) 4173 return NULL; 4174 4175 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4176 if (!memcg->stat) 4177 goto out_free; 4178 4179 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 4180 goto out_free_stat; 4181 4182 spin_lock_init(&memcg->pcp_counter_lock); 4183 return memcg; 4184 4185 out_free_stat: 4186 free_percpu(memcg->stat); 4187 out_free: 4188 kfree(memcg); 4189 return NULL; 4190 } 4191 4192 /* 4193 * At destroying mem_cgroup, references from swap_cgroup can remain. 4194 * (scanning all at force_empty is too costly...) 4195 * 4196 * Instead of clearing all references at force_empty, we remember 4197 * the number of reference from swap_cgroup and free mem_cgroup when 4198 * it goes down to 0. 4199 * 4200 * Removal of cgroup itself succeeds regardless of refs from swap. 
4201 */ 4202 4203 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4204 { 4205 int node; 4206 4207 mem_cgroup_remove_from_trees(memcg); 4208 4209 for_each_node(node) 4210 free_mem_cgroup_per_zone_info(memcg, node); 4211 4212 free_percpu(memcg->stat); 4213 memcg_wb_domain_exit(memcg); 4214 kfree(memcg); 4215 } 4216 4217 /* 4218 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4219 */ 4220 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4221 { 4222 if (!memcg->memory.parent) 4223 return NULL; 4224 return mem_cgroup_from_counter(memcg->memory.parent, memory); 4225 } 4226 EXPORT_SYMBOL(parent_mem_cgroup); 4227 4228 static struct cgroup_subsys_state * __ref 4229 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4230 { 4231 struct mem_cgroup *memcg; 4232 long error = -ENOMEM; 4233 int node; 4234 4235 memcg = mem_cgroup_alloc(); 4236 if (!memcg) 4237 return ERR_PTR(error); 4238 4239 for_each_node(node) 4240 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4241 goto free_out; 4242 4243 /* root ? */ 4244 if (parent_css == NULL) { 4245 root_mem_cgroup = memcg; 4246 mem_cgroup_root_css = &memcg->css; 4247 page_counter_init(&memcg->memory, NULL); 4248 memcg->high = PAGE_COUNTER_MAX; 4249 memcg->soft_limit = PAGE_COUNTER_MAX; 4250 page_counter_init(&memcg->memsw, NULL); 4251 page_counter_init(&memcg->kmem, NULL); 4252 } 4253 4254 memcg->last_scanned_node = MAX_NUMNODES; 4255 INIT_LIST_HEAD(&memcg->oom_notify); 4256 memcg->move_charge_at_immigrate = 0; 4257 mutex_init(&memcg->thresholds_lock); 4258 spin_lock_init(&memcg->move_lock); 4259 vmpressure_init(&memcg->vmpressure); 4260 INIT_LIST_HEAD(&memcg->event_list); 4261 spin_lock_init(&memcg->event_list_lock); 4262 #ifdef CONFIG_MEMCG_KMEM 4263 memcg->kmemcg_id = -1; 4264 #endif 4265 #ifdef CONFIG_CGROUP_WRITEBACK 4266 INIT_LIST_HEAD(&memcg->cgwb_list); 4267 #endif 4268 return &memcg->css; 4269 4270 free_out: 4271 __mem_cgroup_free(memcg); 4272 return ERR_PTR(error); 4273 } 4274 4275 static int 4276 mem_cgroup_css_online(struct cgroup_subsys_state *css) 4277 { 4278 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4279 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 4280 int ret; 4281 4282 if (css->id > MEM_CGROUP_ID_MAX) 4283 return -ENOSPC; 4284 4285 if (!parent) 4286 return 0; 4287 4288 mutex_lock(&memcg_create_mutex); 4289 4290 memcg->use_hierarchy = parent->use_hierarchy; 4291 memcg->oom_kill_disable = parent->oom_kill_disable; 4292 memcg->swappiness = mem_cgroup_swappiness(parent); 4293 4294 if (parent->use_hierarchy) { 4295 page_counter_init(&memcg->memory, &parent->memory); 4296 memcg->high = PAGE_COUNTER_MAX; 4297 memcg->soft_limit = PAGE_COUNTER_MAX; 4298 page_counter_init(&memcg->memsw, &parent->memsw); 4299 page_counter_init(&memcg->kmem, &parent->kmem); 4300 4301 /* 4302 * No need to take a reference to the parent because cgroup 4303 * core guarantees its existence. 4304 */ 4305 } else { 4306 page_counter_init(&memcg->memory, NULL); 4307 memcg->high = PAGE_COUNTER_MAX; 4308 memcg->soft_limit = PAGE_COUNTER_MAX; 4309 page_counter_init(&memcg->memsw, NULL); 4310 page_counter_init(&memcg->kmem, NULL); 4311 /* 4312 * Deeper hierachy with use_hierarchy == false doesn't make 4313 * much sense so let cgroup subsystem know about this 4314 * unfortunate state in our controller. 
4315 */ 4316 if (parent != root_mem_cgroup) 4317 memory_cgrp_subsys.broken_hierarchy = true; 4318 } 4319 mutex_unlock(&memcg_create_mutex); 4320 4321 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); 4322 if (ret) 4323 return ret; 4324 4325 /* 4326 * Make sure the memcg is initialized: mem_cgroup_iter() 4327 * orders reading memcg->initialized against its callers 4328 * reading the memcg members. 4329 */ 4330 smp_store_release(&memcg->initialized, 1); 4331 4332 return 0; 4333 } 4334 4335 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 4336 { 4337 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4338 struct mem_cgroup_event *event, *tmp; 4339 4340 /* 4341 * Unregister events and notify userspace. 4342 * Notify userspace about cgroup removing only after rmdir of cgroup 4343 * directory to avoid race between userspace and kernelspace. 4344 */ 4345 spin_lock(&memcg->event_list_lock); 4346 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 4347 list_del_init(&event->list); 4348 schedule_work(&event->remove); 4349 } 4350 spin_unlock(&memcg->event_list_lock); 4351 4352 vmpressure_cleanup(&memcg->vmpressure); 4353 4354 memcg_deactivate_kmem(memcg); 4355 4356 wb_memcg_offline(memcg); 4357 } 4358 4359 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4360 { 4361 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4362 4363 memcg_destroy_kmem(memcg); 4364 __mem_cgroup_free(memcg); 4365 } 4366 4367 /** 4368 * mem_cgroup_css_reset - reset the states of a mem_cgroup 4369 * @css: the target css 4370 * 4371 * Reset the states of the mem_cgroup associated with @css. This is 4372 * invoked when the userland requests disabling on the default hierarchy 4373 * but the memcg is pinned through dependency. The memcg should stop 4374 * applying policies and should revert to the vanilla state as it may be 4375 * made visible again. 4376 * 4377 * The current implementation only resets the essential configurations. 4378 * This needs to be expanded to cover all the visible parts. 4379 */ 4380 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 4381 { 4382 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4383 4384 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4385 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4386 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4387 memcg->low = 0; 4388 memcg->high = PAGE_COUNTER_MAX; 4389 memcg->soft_limit = PAGE_COUNTER_MAX; 4390 memcg_wb_domain_size_changed(memcg); 4391 } 4392 4393 #ifdef CONFIG_MMU 4394 /* Handlers for move charge at task migration. */ 4395 static int mem_cgroup_do_precharge(unsigned long count) 4396 { 4397 int ret; 4398 4399 /* Try a single bulk charge without reclaim first */ 4400 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); 4401 if (!ret) { 4402 mc.precharge += count; 4403 return ret; 4404 } 4405 if (ret == -EINTR) { 4406 cancel_charge(root_mem_cgroup, count); 4407 return ret; 4408 } 4409 4410 /* Try charges one by one with reclaim */ 4411 while (count--) { 4412 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); 4413 /* 4414 * In case of failure, any residual charges against 4415 * mc.to will be dropped by mem_cgroup_clear_mc() 4416 * later on. However, cancel any charges that are 4417 * bypassed to root right away or they'll be lost. 
4418 */ 4419 if (ret == -EINTR) 4420 cancel_charge(root_mem_cgroup, 1); 4421 if (ret) 4422 return ret; 4423 mc.precharge++; 4424 cond_resched(); 4425 } 4426 return 0; 4427 } 4428 4429 /** 4430 * get_mctgt_type - get target type of moving charge 4431 * @vma: the vma the pte to be checked belongs 4432 * @addr: the address corresponding to the pte to be checked 4433 * @ptent: the pte to be checked 4434 * @target: the pointer the target page or swap ent will be stored(can be NULL) 4435 * 4436 * Returns 4437 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4438 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4439 * move charge. if @target is not NULL, the page is stored in target->page 4440 * with extra refcnt got(Callers should handle it). 4441 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4442 * target for charge migration. if @target is not NULL, the entry is stored 4443 * in target->ent. 4444 * 4445 * Called with pte lock held. 4446 */ 4447 union mc_target { 4448 struct page *page; 4449 swp_entry_t ent; 4450 }; 4451 4452 enum mc_target_type { 4453 MC_TARGET_NONE = 0, 4454 MC_TARGET_PAGE, 4455 MC_TARGET_SWAP, 4456 }; 4457 4458 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4459 unsigned long addr, pte_t ptent) 4460 { 4461 struct page *page = vm_normal_page(vma, addr, ptent); 4462 4463 if (!page || !page_mapped(page)) 4464 return NULL; 4465 if (PageAnon(page)) { 4466 if (!(mc.flags & MOVE_ANON)) 4467 return NULL; 4468 } else { 4469 if (!(mc.flags & MOVE_FILE)) 4470 return NULL; 4471 } 4472 if (!get_page_unless_zero(page)) 4473 return NULL; 4474 4475 return page; 4476 } 4477 4478 #ifdef CONFIG_SWAP 4479 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4480 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4481 { 4482 struct page *page = NULL; 4483 swp_entry_t ent = pte_to_swp_entry(ptent); 4484 4485 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) 4486 return NULL; 4487 /* 4488 * Because lookup_swap_cache() updates some statistics counter, 4489 * we call find_get_page() with swapper_space directly. 4490 */ 4491 page = find_get_page(swap_address_space(ent), ent.val); 4492 if (do_swap_account) 4493 entry->val = ent.val; 4494 4495 return page; 4496 } 4497 #else 4498 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4499 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4500 { 4501 return NULL; 4502 } 4503 #endif 4504 4505 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4506 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4507 { 4508 struct page *page = NULL; 4509 struct address_space *mapping; 4510 pgoff_t pgoff; 4511 4512 if (!vma->vm_file) /* anonymous vma */ 4513 return NULL; 4514 if (!(mc.flags & MOVE_FILE)) 4515 return NULL; 4516 4517 mapping = vma->vm_file->f_mapping; 4518 pgoff = linear_page_index(vma, addr); 4519 4520 /* page is moved even if it's not RSS of this task(page-faulted). */ 4521 #ifdef CONFIG_SWAP 4522 /* shmem/tmpfs may report page out on swap: account for that too. 
*/ 4523 if (shmem_mapping(mapping)) { 4524 page = find_get_entry(mapping, pgoff); 4525 if (radix_tree_exceptional_entry(page)) { 4526 swp_entry_t swp = radix_to_swp_entry(page); 4527 if (do_swap_account) 4528 *entry = swp; 4529 page = find_get_page(swap_address_space(swp), swp.val); 4530 } 4531 } else 4532 page = find_get_page(mapping, pgoff); 4533 #else 4534 page = find_get_page(mapping, pgoff); 4535 #endif 4536 return page; 4537 } 4538 4539 /** 4540 * mem_cgroup_move_account - move account of the page 4541 * @page: the page 4542 * @nr_pages: number of regular pages (>1 for huge pages) 4543 * @from: mem_cgroup which the page is moved from. 4544 * @to: mem_cgroup which the page is moved to. @from != @to. 4545 * 4546 * The caller must confirm following. 4547 * - page is not on LRU (isolate_page() is useful.) 4548 * - compound_lock is held when nr_pages > 1 4549 * 4550 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 4551 * from old cgroup. 4552 */ 4553 static int mem_cgroup_move_account(struct page *page, 4554 unsigned int nr_pages, 4555 struct mem_cgroup *from, 4556 struct mem_cgroup *to) 4557 { 4558 unsigned long flags; 4559 int ret; 4560 bool anon; 4561 4562 VM_BUG_ON(from == to); 4563 VM_BUG_ON_PAGE(PageLRU(page), page); 4564 /* 4565 * The page is isolated from LRU. So, collapse function 4566 * will not handle this page. But page splitting can happen. 4567 * Do this check under compound_page_lock(). The caller should 4568 * hold it. 4569 */ 4570 ret = -EBUSY; 4571 if (nr_pages > 1 && !PageTransHuge(page)) 4572 goto out; 4573 4574 /* 4575 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup 4576 * of its source page while we change it: page migration takes 4577 * both pages off the LRU, but page cache replacement doesn't. 4578 */ 4579 if (!trylock_page(page)) 4580 goto out; 4581 4582 ret = -EINVAL; 4583 if (page->mem_cgroup != from) 4584 goto out_unlock; 4585 4586 anon = PageAnon(page); 4587 4588 spin_lock_irqsave(&from->move_lock, flags); 4589 4590 if (!anon && page_mapped(page)) { 4591 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 4592 nr_pages); 4593 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 4594 nr_pages); 4595 } 4596 4597 /* 4598 * move_lock grabbed above and caller set from->moving_account, so 4599 * mem_cgroup_update_page_stat() will serialize updates to PageDirty. 4600 * So mapping should be stable for dirty pages. 4601 */ 4602 if (!anon && PageDirty(page)) { 4603 struct address_space *mapping = page_mapping(page); 4604 4605 if (mapping_cap_account_dirty(mapping)) { 4606 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], 4607 nr_pages); 4608 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], 4609 nr_pages); 4610 } 4611 } 4612 4613 if (PageWriteback(page)) { 4614 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 4615 nr_pages); 4616 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 4617 nr_pages); 4618 } 4619 4620 /* 4621 * It is safe to change page->mem_cgroup here because the page 4622 * is referenced, charged, and isolated - we can't race with 4623 * uncharging, charging, migration, or LRU putback. 
4624 */ 4625 4626 /* caller should have done css_get */ 4627 page->mem_cgroup = to; 4628 spin_unlock_irqrestore(&from->move_lock, flags); 4629 4630 ret = 0; 4631 4632 local_irq_disable(); 4633 mem_cgroup_charge_statistics(to, page, nr_pages); 4634 memcg_check_events(to, page); 4635 mem_cgroup_charge_statistics(from, page, -nr_pages); 4636 memcg_check_events(from, page); 4637 local_irq_enable(); 4638 out_unlock: 4639 unlock_page(page); 4640 out: 4641 return ret; 4642 } 4643 4644 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 4645 unsigned long addr, pte_t ptent, union mc_target *target) 4646 { 4647 struct page *page = NULL; 4648 enum mc_target_type ret = MC_TARGET_NONE; 4649 swp_entry_t ent = { .val = 0 }; 4650 4651 if (pte_present(ptent)) 4652 page = mc_handle_present_pte(vma, addr, ptent); 4653 else if (is_swap_pte(ptent)) 4654 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4655 else if (pte_none(ptent)) 4656 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4657 4658 if (!page && !ent.val) 4659 return ret; 4660 if (page) { 4661 /* 4662 * Do only loose check w/o serialization. 4663 * mem_cgroup_move_account() checks the page is valid or 4664 * not under LRU exclusion. 4665 */ 4666 if (page->mem_cgroup == mc.from) { 4667 ret = MC_TARGET_PAGE; 4668 if (target) 4669 target->page = page; 4670 } 4671 if (!ret || !target) 4672 put_page(page); 4673 } 4674 /* There is a swap entry and a page doesn't exist or isn't charged */ 4675 if (ent.val && !ret && 4676 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 4677 ret = MC_TARGET_SWAP; 4678 if (target) 4679 target->ent = ent; 4680 } 4681 return ret; 4682 } 4683 4684 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4685 /* 4686 * We don't consider swapping or file mapped pages because THP does not 4687 * support them for now. 4688 * Caller should make sure that pmd_trans_huge(pmd) is true. 
4689 */ 4690 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4691 unsigned long addr, pmd_t pmd, union mc_target *target) 4692 { 4693 struct page *page = NULL; 4694 enum mc_target_type ret = MC_TARGET_NONE; 4695 4696 page = pmd_page(pmd); 4697 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 4698 if (!(mc.flags & MOVE_ANON)) 4699 return ret; 4700 if (page->mem_cgroup == mc.from) { 4701 ret = MC_TARGET_PAGE; 4702 if (target) { 4703 get_page(page); 4704 target->page = page; 4705 } 4706 } 4707 return ret; 4708 } 4709 #else 4710 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4711 unsigned long addr, pmd_t pmd, union mc_target *target) 4712 { 4713 return MC_TARGET_NONE; 4714 } 4715 #endif 4716 4717 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 4718 unsigned long addr, unsigned long end, 4719 struct mm_walk *walk) 4720 { 4721 struct vm_area_struct *vma = walk->vma; 4722 pte_t *pte; 4723 spinlock_t *ptl; 4724 4725 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 4726 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 4727 mc.precharge += HPAGE_PMD_NR; 4728 spin_unlock(ptl); 4729 return 0; 4730 } 4731 4732 if (pmd_trans_unstable(pmd)) 4733 return 0; 4734 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4735 for (; addr != end; pte++, addr += PAGE_SIZE) 4736 if (get_mctgt_type(vma, addr, *pte, NULL)) 4737 mc.precharge++; /* increment precharge temporarily */ 4738 pte_unmap_unlock(pte - 1, ptl); 4739 cond_resched(); 4740 4741 return 0; 4742 } 4743 4744 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4745 { 4746 unsigned long precharge; 4747 4748 struct mm_walk mem_cgroup_count_precharge_walk = { 4749 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4750 .mm = mm, 4751 }; 4752 down_read(&mm->mmap_sem); 4753 walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); 4754 up_read(&mm->mmap_sem); 4755 4756 precharge = mc.precharge; 4757 mc.precharge = 0; 4758 4759 return precharge; 4760 } 4761 4762 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4763 { 4764 unsigned long precharge = mem_cgroup_count_precharge(mm); 4765 4766 VM_BUG_ON(mc.moving_task); 4767 mc.moving_task = current; 4768 return mem_cgroup_do_precharge(precharge); 4769 } 4770 4771 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 4772 static void __mem_cgroup_clear_mc(void) 4773 { 4774 struct mem_cgroup *from = mc.from; 4775 struct mem_cgroup *to = mc.to; 4776 4777 /* we must uncharge all the leftover precharges from mc.to */ 4778 if (mc.precharge) { 4779 cancel_charge(mc.to, mc.precharge); 4780 mc.precharge = 0; 4781 } 4782 /* 4783 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4784 * we must uncharge here. 4785 */ 4786 if (mc.moved_charge) { 4787 cancel_charge(mc.from, mc.moved_charge); 4788 mc.moved_charge = 0; 4789 } 4790 /* we must fixup refcnts and charges */ 4791 if (mc.moved_swap) { 4792 /* uncharge swap account from the old cgroup */ 4793 if (!mem_cgroup_is_root(mc.from)) 4794 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 4795 4796 /* 4797 * we charged both to->memory and to->memsw, so we 4798 * should uncharge to->memory. 
	 */
		if (!mem_cgroup_is_root(mc.to))
			page_counter_uncharge(&mc.to->memory, mc.moved_swap);

		css_put_many(&mc.from->css, mc.moved_swap);

		/* we've already done css_get(mc.to) */
		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

static void mem_cgroup_clear_mc(void)
{
	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	spin_unlock(&mc.lock);
}

static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *from;
	struct task_struct *p;
	struct mm_struct *mm;
	unsigned long move_flags;
	int ret = 0;

	/*
	 * We are now committed to this value whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
	 */
	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
	if (!move_flags)
		return 0;

	p = cgroup_taskset_first(tset);
	from = mem_cgroup_from_task(p);

	VM_BUG_ON(from == memcg);

	mm = get_task_mm(p);
	if (!mm)
		return 0;
	/* We move charges only when we move an owner of the mm */
	if (mm->owner == p) {
		VM_BUG_ON(mc.from);
		VM_BUG_ON(mc.to);
		VM_BUG_ON(mc.precharge);
		VM_BUG_ON(mc.moved_charge);
		VM_BUG_ON(mc.moved_swap);

		spin_lock(&mc.lock);
		mc.from = from;
		mc.to = memcg;
		mc.flags = move_flags;
		spin_unlock(&mc.lock);
		/* We set mc.moving_task later */

		ret = mem_cgroup_precharge_mc(mm);
		if (ret)
			mem_cgroup_clear_mc();
	}
	mmput(mm);
	return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
				     struct cgroup_taskset *tset)
{
	if (mc.to)
		mem_cgroup_clear_mc();
}

static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;

	/*
	 * We don't take compound_lock() here but no race with splitting thp
	 * happens because:
	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
	 *    under splitting, which means there's no concurrent thp split,
	 *  - if another thread runs into split_huge_page() just after we
	 *    entered this if-block, the thread must wait for page table lock
	 *    to be unlocked in __split_huge_page_splitting(), where the main
	 *    part of thp split is not executed yet.
4905 */ 4906 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 4907 if (mc.precharge < HPAGE_PMD_NR) { 4908 spin_unlock(ptl); 4909 return 0; 4910 } 4911 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 4912 if (target_type == MC_TARGET_PAGE) { 4913 page = target.page; 4914 if (!isolate_lru_page(page)) { 4915 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 4916 mc.from, mc.to)) { 4917 mc.precharge -= HPAGE_PMD_NR; 4918 mc.moved_charge += HPAGE_PMD_NR; 4919 } 4920 putback_lru_page(page); 4921 } 4922 put_page(page); 4923 } 4924 spin_unlock(ptl); 4925 return 0; 4926 } 4927 4928 if (pmd_trans_unstable(pmd)) 4929 return 0; 4930 retry: 4931 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4932 for (; addr != end; addr += PAGE_SIZE) { 4933 pte_t ptent = *(pte++); 4934 swp_entry_t ent; 4935 4936 if (!mc.precharge) 4937 break; 4938 4939 switch (get_mctgt_type(vma, addr, ptent, &target)) { 4940 case MC_TARGET_PAGE: 4941 page = target.page; 4942 if (isolate_lru_page(page)) 4943 goto put; 4944 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { 4945 mc.precharge--; 4946 /* we uncharge from mc.from later. */ 4947 mc.moved_charge++; 4948 } 4949 putback_lru_page(page); 4950 put: /* get_mctgt_type() gets the page */ 4951 put_page(page); 4952 break; 4953 case MC_TARGET_SWAP: 4954 ent = target.ent; 4955 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 4956 mc.precharge--; 4957 /* we fixup refcnts and charges later. */ 4958 mc.moved_swap++; 4959 } 4960 break; 4961 default: 4962 break; 4963 } 4964 } 4965 pte_unmap_unlock(pte - 1, ptl); 4966 cond_resched(); 4967 4968 if (addr != end) { 4969 /* 4970 * We have consumed all precharges we got in can_attach(). 4971 * We try charge one by one, but don't do any additional 4972 * charges to mc.to if we have failed in charge once in attach() 4973 * phase. 4974 */ 4975 ret = mem_cgroup_do_precharge(1); 4976 if (!ret) 4977 goto retry; 4978 } 4979 4980 return ret; 4981 } 4982 4983 static void mem_cgroup_move_charge(struct mm_struct *mm) 4984 { 4985 struct mm_walk mem_cgroup_move_charge_walk = { 4986 .pmd_entry = mem_cgroup_move_charge_pte_range, 4987 .mm = mm, 4988 }; 4989 4990 lru_add_drain_all(); 4991 /* 4992 * Signal mem_cgroup_begin_page_stat() to take the memcg's 4993 * move_lock while we're moving its pages to another memcg. 4994 * Then wait for already started RCU-only updates to finish. 4995 */ 4996 atomic_inc(&mc.from->moving_account); 4997 synchronize_rcu(); 4998 retry: 4999 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5000 /* 5001 * Someone who are holding the mmap_sem might be waiting in 5002 * waitq. So we cancel all extra charges, wake up all waiters, 5003 * and retry. Because we cancel precharges, we might not be able 5004 * to move enough charges, but moving charge is a best-effort 5005 * feature anyway, so it wouldn't be a big problem. 5006 */ 5007 __mem_cgroup_clear_mc(); 5008 cond_resched(); 5009 goto retry; 5010 } 5011 /* 5012 * When we have consumed all precharges and failed in doing 5013 * additional charge, the page walk just aborts. 
5014 */ 5015 walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); 5016 up_read(&mm->mmap_sem); 5017 atomic_dec(&mc.from->moving_account); 5018 } 5019 5020 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5021 struct cgroup_taskset *tset) 5022 { 5023 struct task_struct *p = cgroup_taskset_first(tset); 5024 struct mm_struct *mm = get_task_mm(p); 5025 5026 if (mm) { 5027 if (mc.to) 5028 mem_cgroup_move_charge(mm); 5029 mmput(mm); 5030 } 5031 if (mc.to) 5032 mem_cgroup_clear_mc(); 5033 } 5034 #else /* !CONFIG_MMU */ 5035 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5036 struct cgroup_taskset *tset) 5037 { 5038 return 0; 5039 } 5040 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5041 struct cgroup_taskset *tset) 5042 { 5043 } 5044 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5045 struct cgroup_taskset *tset) 5046 { 5047 } 5048 #endif 5049 5050 /* 5051 * Cgroup retains root cgroups across [un]mount cycles making it necessary 5052 * to verify whether we're attached to the default hierarchy on each mount 5053 * attempt. 5054 */ 5055 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 5056 { 5057 /* 5058 * use_hierarchy is forced on the default hierarchy. cgroup core 5059 * guarantees that @root doesn't have any children, so turning it 5060 * on for the root memcg is enough. 5061 */ 5062 if (cgroup_on_dfl(root_css->cgroup)) 5063 root_mem_cgroup->use_hierarchy = true; 5064 else 5065 root_mem_cgroup->use_hierarchy = false; 5066 } 5067 5068 static u64 memory_current_read(struct cgroup_subsys_state *css, 5069 struct cftype *cft) 5070 { 5071 return mem_cgroup_usage(mem_cgroup_from_css(css), false); 5072 } 5073 5074 static int memory_low_show(struct seq_file *m, void *v) 5075 { 5076 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5077 unsigned long low = READ_ONCE(memcg->low); 5078 5079 if (low == PAGE_COUNTER_MAX) 5080 seq_puts(m, "max\n"); 5081 else 5082 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); 5083 5084 return 0; 5085 } 5086 5087 static ssize_t memory_low_write(struct kernfs_open_file *of, 5088 char *buf, size_t nbytes, loff_t off) 5089 { 5090 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5091 unsigned long low; 5092 int err; 5093 5094 buf = strstrip(buf); 5095 err = page_counter_memparse(buf, "max", &low); 5096 if (err) 5097 return err; 5098 5099 memcg->low = low; 5100 5101 return nbytes; 5102 } 5103 5104 static int memory_high_show(struct seq_file *m, void *v) 5105 { 5106 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5107 unsigned long high = READ_ONCE(memcg->high); 5108 5109 if (high == PAGE_COUNTER_MAX) 5110 seq_puts(m, "max\n"); 5111 else 5112 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); 5113 5114 return 0; 5115 } 5116 5117 static ssize_t memory_high_write(struct kernfs_open_file *of, 5118 char *buf, size_t nbytes, loff_t off) 5119 { 5120 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5121 unsigned long high; 5122 int err; 5123 5124 buf = strstrip(buf); 5125 err = page_counter_memparse(buf, "max", &high); 5126 if (err) 5127 return err; 5128 5129 memcg->high = high; 5130 5131 memcg_wb_domain_size_changed(memcg); 5132 return nbytes; 5133 } 5134 5135 static int memory_max_show(struct seq_file *m, void *v) 5136 { 5137 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5138 unsigned long max = READ_ONCE(memcg->memory.limit); 5139 5140 if (max == PAGE_COUNTER_MAX) 5141 seq_puts(m, "max\n"); 5142 else 5143 seq_printf(m, "%llu\n", 
(u64)max * PAGE_SIZE); 5144 5145 return 0; 5146 } 5147 5148 static ssize_t memory_max_write(struct kernfs_open_file *of, 5149 char *buf, size_t nbytes, loff_t off) 5150 { 5151 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5152 unsigned long max; 5153 int err; 5154 5155 buf = strstrip(buf); 5156 err = page_counter_memparse(buf, "max", &max); 5157 if (err) 5158 return err; 5159 5160 err = mem_cgroup_resize_limit(memcg, max); 5161 if (err) 5162 return err; 5163 5164 memcg_wb_domain_size_changed(memcg); 5165 return nbytes; 5166 } 5167 5168 static int memory_events_show(struct seq_file *m, void *v) 5169 { 5170 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5171 5172 seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); 5173 seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); 5174 seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); 5175 seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); 5176 5177 return 0; 5178 } 5179 5180 static struct cftype memory_files[] = { 5181 { 5182 .name = "current", 5183 .read_u64 = memory_current_read, 5184 }, 5185 { 5186 .name = "low", 5187 .flags = CFTYPE_NOT_ON_ROOT, 5188 .seq_show = memory_low_show, 5189 .write = memory_low_write, 5190 }, 5191 { 5192 .name = "high", 5193 .flags = CFTYPE_NOT_ON_ROOT, 5194 .seq_show = memory_high_show, 5195 .write = memory_high_write, 5196 }, 5197 { 5198 .name = "max", 5199 .flags = CFTYPE_NOT_ON_ROOT, 5200 .seq_show = memory_max_show, 5201 .write = memory_max_write, 5202 }, 5203 { 5204 .name = "events", 5205 .flags = CFTYPE_NOT_ON_ROOT, 5206 .seq_show = memory_events_show, 5207 }, 5208 { } /* terminate */ 5209 }; 5210 5211 struct cgroup_subsys memory_cgrp_subsys = { 5212 .css_alloc = mem_cgroup_css_alloc, 5213 .css_online = mem_cgroup_css_online, 5214 .css_offline = mem_cgroup_css_offline, 5215 .css_free = mem_cgroup_css_free, 5216 .css_reset = mem_cgroup_css_reset, 5217 .can_attach = mem_cgroup_can_attach, 5218 .cancel_attach = mem_cgroup_cancel_attach, 5219 .attach = mem_cgroup_move_task, 5220 .bind = mem_cgroup_bind, 5221 .dfl_cftypes = memory_files, 5222 .legacy_cftypes = mem_cgroup_legacy_files, 5223 .early_init = 0, 5224 }; 5225 5226 /** 5227 * mem_cgroup_low - check if memory consumption is below the normal range 5228 * @root: the highest ancestor to consider 5229 * @memcg: the memory cgroup to check 5230 * 5231 * Returns %true if memory consumption of @memcg, and that of all 5232 * configurable ancestors up to @root, is below the normal range. 5233 */ 5234 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) 5235 { 5236 if (mem_cgroup_disabled()) 5237 return false; 5238 5239 /* 5240 * The toplevel group doesn't have a configurable range, so 5241 * it's never low when looked at directly, and it is not 5242 * considered an ancestor when assessing the hierarchy. 
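 *
 * For example (illustrative only): with a hierarchy root -> A -> B and
 * A.low = 1G, mem_cgroup_low(root, B) returns true only if both B and A
 * are below their respective low settings; if A's usage exceeds 1G, B is
 * not considered low even when B itself is within its own low range.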
5243 */ 5244 5245 if (memcg == root_mem_cgroup) 5246 return false; 5247 5248 if (page_counter_read(&memcg->memory) >= memcg->low) 5249 return false; 5250 5251 while (memcg != root) { 5252 memcg = parent_mem_cgroup(memcg); 5253 5254 if (memcg == root_mem_cgroup) 5255 break; 5256 5257 if (page_counter_read(&memcg->memory) >= memcg->low) 5258 return false; 5259 } 5260 return true; 5261 } 5262 5263 /** 5264 * mem_cgroup_try_charge - try charging a page 5265 * @page: page to charge 5266 * @mm: mm context of the victim 5267 * @gfp_mask: reclaim mode 5268 * @memcgp: charged memcg return 5269 * 5270 * Try to charge @page to the memcg that @mm belongs to, reclaiming 5271 * pages according to @gfp_mask if necessary. 5272 * 5273 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 5274 * Otherwise, an error code is returned. 5275 * 5276 * After page->mapping has been set up, the caller must finalize the 5277 * charge with mem_cgroup_commit_charge(). Or abort the transaction 5278 * with mem_cgroup_cancel_charge() in case page instantiation fails. 5279 */ 5280 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 5281 gfp_t gfp_mask, struct mem_cgroup **memcgp) 5282 { 5283 struct mem_cgroup *memcg = NULL; 5284 unsigned int nr_pages = 1; 5285 int ret = 0; 5286 5287 if (mem_cgroup_disabled()) 5288 goto out; 5289 5290 if (PageSwapCache(page)) { 5291 /* 5292 * Every swap fault against a single page tries to charge the 5293 * page, bail as early as possible. shmem_unuse() encounters 5294 * already charged pages, too. The USED bit is protected by 5295 * the page lock, which serializes swap cache removal, which 5296 * in turn serializes uncharging. 5297 */ 5298 VM_BUG_ON_PAGE(!PageLocked(page), page); 5299 if (page->mem_cgroup) 5300 goto out; 5301 5302 if (do_swap_account) { 5303 swp_entry_t ent = { .val = page_private(page), }; 5304 unsigned short id = lookup_swap_cgroup_id(ent); 5305 5306 rcu_read_lock(); 5307 memcg = mem_cgroup_from_id(id); 5308 if (memcg && !css_tryget_online(&memcg->css)) 5309 memcg = NULL; 5310 rcu_read_unlock(); 5311 } 5312 } 5313 5314 if (PageTransHuge(page)) { 5315 nr_pages <<= compound_order(page); 5316 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5317 } 5318 5319 if (!memcg) 5320 memcg = get_mem_cgroup_from_mm(mm); 5321 5322 ret = try_charge(memcg, gfp_mask, nr_pages); 5323 5324 css_put(&memcg->css); 5325 5326 if (ret == -EINTR) { 5327 memcg = root_mem_cgroup; 5328 ret = 0; 5329 } 5330 out: 5331 *memcgp = memcg; 5332 return ret; 5333 } 5334 5335 /** 5336 * mem_cgroup_commit_charge - commit a page charge 5337 * @page: page to charge 5338 * @memcg: memcg to charge the page to 5339 * @lrucare: page might be on LRU already 5340 * 5341 * Finalize a charge transaction started by mem_cgroup_try_charge(), 5342 * after page->mapping has been set up. This must happen atomically 5343 * as part of the page instantiation, i.e. under the page table lock 5344 * for anonymous pages, under the page lock for page and swap cache. 5345 * 5346 * In addition, the page must not be on the LRU during the commit, to 5347 * prevent racing with task migration. If it might be, use @lrucare. 5348 * 5349 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 
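 *
 * A minimal caller-side sketch (illustrative only, not copied from a
 * real call site) of the expected sequence:
 *
 *	struct mem_cgroup *memcg;
 *
 *	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
 *		goto fail;
 *	... install the page (set up page->mapping, page tables etc.) ...
 *	mem_cgroup_commit_charge(page, memcg, false);
 *
 * and, if instantiation fails after a successful try_charge:
 *
 *	mem_cgroup_cancel_charge(page, memcg);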
5350 */ 5351 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 5352 bool lrucare) 5353 { 5354 unsigned int nr_pages = 1; 5355 5356 VM_BUG_ON_PAGE(!page->mapping, page); 5357 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 5358 5359 if (mem_cgroup_disabled()) 5360 return; 5361 /* 5362 * Swap faults will attempt to charge the same page multiple 5363 * times. But reuse_swap_page() might have removed the page 5364 * from swapcache already, so we can't check PageSwapCache(). 5365 */ 5366 if (!memcg) 5367 return; 5368 5369 commit_charge(page, memcg, lrucare); 5370 5371 if (PageTransHuge(page)) { 5372 nr_pages <<= compound_order(page); 5373 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5374 } 5375 5376 local_irq_disable(); 5377 mem_cgroup_charge_statistics(memcg, page, nr_pages); 5378 memcg_check_events(memcg, page); 5379 local_irq_enable(); 5380 5381 if (do_swap_account && PageSwapCache(page)) { 5382 swp_entry_t entry = { .val = page_private(page) }; 5383 /* 5384 * The swap entry might not get freed for a long time, 5385 * let's not wait for it. The page already received a 5386 * memory+swap charge, drop the swap entry duplicate. 5387 */ 5388 mem_cgroup_uncharge_swap(entry); 5389 } 5390 } 5391 5392 /** 5393 * mem_cgroup_cancel_charge - cancel a page charge 5394 * @page: page to charge 5395 * @memcg: memcg to charge the page to 5396 * 5397 * Cancel a charge transaction started by mem_cgroup_try_charge(). 5398 */ 5399 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 5400 { 5401 unsigned int nr_pages = 1; 5402 5403 if (mem_cgroup_disabled()) 5404 return; 5405 /* 5406 * Swap faults will attempt to charge the same page multiple 5407 * times. But reuse_swap_page() might have removed the page 5408 * from swapcache already, so we can't check PageSwapCache(). 
5409 */ 5410 if (!memcg) 5411 return; 5412 5413 if (PageTransHuge(page)) { 5414 nr_pages <<= compound_order(page); 5415 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5416 } 5417 5418 cancel_charge(memcg, nr_pages); 5419 } 5420 5421 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5422 unsigned long nr_anon, unsigned long nr_file, 5423 unsigned long nr_huge, struct page *dummy_page) 5424 { 5425 unsigned long nr_pages = nr_anon + nr_file; 5426 unsigned long flags; 5427 5428 if (!mem_cgroup_is_root(memcg)) { 5429 page_counter_uncharge(&memcg->memory, nr_pages); 5430 if (do_swap_account) 5431 page_counter_uncharge(&memcg->memsw, nr_pages); 5432 memcg_oom_recover(memcg); 5433 } 5434 5435 local_irq_save(flags); 5436 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 5437 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 5438 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 5439 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 5440 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 5441 memcg_check_events(memcg, dummy_page); 5442 local_irq_restore(flags); 5443 5444 if (!mem_cgroup_is_root(memcg)) 5445 css_put_many(&memcg->css, nr_pages); 5446 } 5447 5448 static void uncharge_list(struct list_head *page_list) 5449 { 5450 struct mem_cgroup *memcg = NULL; 5451 unsigned long nr_anon = 0; 5452 unsigned long nr_file = 0; 5453 unsigned long nr_huge = 0; 5454 unsigned long pgpgout = 0; 5455 struct list_head *next; 5456 struct page *page; 5457 5458 next = page_list->next; 5459 do { 5460 unsigned int nr_pages = 1; 5461 5462 page = list_entry(next, struct page, lru); 5463 next = page->lru.next; 5464 5465 VM_BUG_ON_PAGE(PageLRU(page), page); 5466 VM_BUG_ON_PAGE(page_count(page), page); 5467 5468 if (!page->mem_cgroup) 5469 continue; 5470 5471 /* 5472 * Nobody should be changing or seriously looking at 5473 * page->mem_cgroup at this point, we have fully 5474 * exclusive access to the page. 5475 */ 5476 5477 if (memcg != page->mem_cgroup) { 5478 if (memcg) { 5479 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5480 nr_huge, page); 5481 pgpgout = nr_anon = nr_file = nr_huge = 0; 5482 } 5483 memcg = page->mem_cgroup; 5484 } 5485 5486 if (PageTransHuge(page)) { 5487 nr_pages <<= compound_order(page); 5488 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5489 nr_huge += nr_pages; 5490 } 5491 5492 if (PageAnon(page)) 5493 nr_anon += nr_pages; 5494 else 5495 nr_file += nr_pages; 5496 5497 page->mem_cgroup = NULL; 5498 5499 pgpgout++; 5500 } while (next != page_list); 5501 5502 if (memcg) 5503 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5504 nr_huge, page); 5505 } 5506 5507 /** 5508 * mem_cgroup_uncharge - uncharge a page 5509 * @page: page to uncharge 5510 * 5511 * Uncharge a page previously charged with mem_cgroup_try_charge() and 5512 * mem_cgroup_commit_charge(). 5513 */ 5514 void mem_cgroup_uncharge(struct page *page) 5515 { 5516 if (mem_cgroup_disabled()) 5517 return; 5518 5519 /* Don't touch page->lru of any random page, pre-check: */ 5520 if (!page->mem_cgroup) 5521 return; 5522 5523 INIT_LIST_HEAD(&page->lru); 5524 uncharge_list(&page->lru); 5525 } 5526 5527 /** 5528 * mem_cgroup_uncharge_list - uncharge a list of page 5529 * @page_list: list of pages to uncharge 5530 * 5531 * Uncharge a list of pages previously charged with 5532 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 
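 *
 * A typical caller (illustrative sketch, not a specific call site)
 * batches pages on a private list through their page->lru linkage and
 * drops all charges in one pass before freeing them:
 *
 *	LIST_HEAD(pages_to_free);
 *	... list_add(&page->lru, &pages_to_free) for each released page ...
 *	mem_cgroup_uncharge_list(&pages_to_free);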
5533 */ 5534 void mem_cgroup_uncharge_list(struct list_head *page_list) 5535 { 5536 if (mem_cgroup_disabled()) 5537 return; 5538 5539 if (!list_empty(page_list)) 5540 uncharge_list(page_list); 5541 } 5542 5543 /** 5544 * mem_cgroup_migrate - migrate a charge to another page 5545 * @oldpage: currently charged page 5546 * @newpage: page to transfer the charge to 5547 * @lrucare: either or both pages might be on the LRU already 5548 * 5549 * Migrate the charge from @oldpage to @newpage. 5550 * 5551 * Both pages must be locked, @newpage->mapping must be set up. 5552 */ 5553 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 5554 bool lrucare) 5555 { 5556 struct mem_cgroup *memcg; 5557 int isolated; 5558 5559 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5560 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 5561 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); 5562 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); 5563 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 5564 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 5565 newpage); 5566 5567 if (mem_cgroup_disabled()) 5568 return; 5569 5570 /* Page cache replacement: new page already charged? */ 5571 if (newpage->mem_cgroup) 5572 return; 5573 5574 /* 5575 * Swapcache readahead pages can get migrated before being 5576 * charged, and migration from compaction can happen to an 5577 * uncharged page when the PFN walker finds a page that 5578 * reclaim just put back on the LRU but has not released yet. 5579 */ 5580 memcg = oldpage->mem_cgroup; 5581 if (!memcg) 5582 return; 5583 5584 if (lrucare) 5585 lock_page_lru(oldpage, &isolated); 5586 5587 oldpage->mem_cgroup = NULL; 5588 5589 if (lrucare) 5590 unlock_page_lru(oldpage, isolated); 5591 5592 commit_charge(newpage, memcg, lrucare); 5593 } 5594 5595 /* 5596 * subsys_initcall() for memory controller. 5597 * 5598 * Some parts like hotcpu_notifier() have to be initialized from this context 5599 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 5600 * everything that doesn't depend on a specific mem_cgroup structure should 5601 * be initialized from here. 5602 */ 5603 static int __init mem_cgroup_init(void) 5604 { 5605 int cpu, node; 5606 5607 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5608 5609 for_each_possible_cpu(cpu) 5610 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 5611 drain_local_stock); 5612 5613 for_each_node(node) { 5614 struct mem_cgroup_tree_per_node *rtpn; 5615 int zone; 5616 5617 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 5618 node_online(node) ? node : NUMA_NO_NODE); 5619 5620 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5621 struct mem_cgroup_tree_per_zone *rtpz; 5622 5623 rtpz = &rtpn->rb_tree_per_zone[zone]; 5624 rtpz->rb_root = RB_ROOT; 5625 spin_lock_init(&rtpz->lock); 5626 } 5627 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5628 } 5629 5630 return 0; 5631 } 5632 subsys_initcall(mem_cgroup_init); 5633 5634 #ifdef CONFIG_MEMCG_SWAP 5635 /** 5636 * mem_cgroup_swapout - transfer a memsw charge to swap 5637 * @page: page whose memsw charge to transfer 5638 * @entry: swap entry to move the charge to 5639 * 5640 * Transfer the memsw charge of @page to @entry. 
 */
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
	struct mem_cgroup *memcg;
	unsigned short oldid;

	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page), page);

	if (!do_swap_account)
		return;

	memcg = page->mem_cgroup;

	/* Readahead page, never charged */
	if (!memcg)
		return;

	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
	VM_BUG_ON_PAGE(oldid, page);
	mem_cgroup_swap_statistics(memcg, true);

	page->mem_cgroup = NULL;

	if (!mem_cgroup_is_root(memcg))
		page_counter_uncharge(&memcg->memory, 1);

	/*
	 * Interrupts should be disabled here because the caller holds the
	 * mapping->tree_lock lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	VM_BUG_ON(!irqs_disabled());
	mem_cgroup_charge_statistics(memcg, page, -1);
	memcg_check_events(memcg, page);
}

/**
 * mem_cgroup_uncharge_swap - uncharge a swap entry
 * @entry: swap entry to uncharge
 *
 * Drop the memsw charge associated with @entry.
 */
void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	if (!do_swap_account)
		return;

	id = swap_cgroup_record(entry, 0);
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (memcg) {
		if (!mem_cgroup_is_root(memcg))
			page_counter_uncharge(&memcg->memsw, 1);
		mem_cgroup_swap_statistics(memcg, false);
		css_put(&memcg->css);
	}
	rcu_read_unlock();
}

/* for remembering the boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata;
#endif

static int __init enable_swap_account(char *s)
{
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);

static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

static int __init mem_cgroup_swap_init(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account) {
		do_swap_account = 1;
		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
						  memsw_cgroup_files));
	}
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_MEMCG_SWAP */