// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

#define MEM_CGROUP_RECLAIM_RETRIES	5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}

static const char *const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

#define THRESHOLDS_EVENTS_TARGET	128
#define SOFTLIMIT_EVENTS_TARGET		1024
#define NUMAINFO_EVENTS_TARGET		1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};
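/*
 * Illustrative note (cgroup v1 semantics as described in the cgroup v1
 * memory documentation; not new code): userspace arms one of these events
 * by writing "<event_fd> <control_fd> [args]" to the cgroup's
 * cgroup.event_control file - e.g. an eventfd plus the fd of
 * memory.usage_in_bytes and a threshold in bytes.  The register_event()
 * callback of the targeted control file parses the argument string, and
 * the eventfd is later signalled via eventfd_signal() when the condition
 * triggers.
 */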
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
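/*
 * Worked example (illustration only): MEMFILE_PRIVATE(_MEMSWAP, 2) packs
 * the resource type into the upper 16 bits and the attribute index into
 * the lower 16 bits, so MEMFILE_TYPE() recovers _MEMSWAP and
 * MEMFILE_ATTR() recovers 2.  OOM_CONTROL is such an attribute index,
 * paired with _OOM_TYPE for the cgroup v1 OOM control files.
 */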
/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

static inline bool should_force_charge(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited ones.  Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time.  In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess.  In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids.  Ideally, we could
 * get this constant directly from cgroup, but it is understandable that this
 * is better kept as an internal representation in cgroup.c.  In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler.  Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);

struct workqueue_struct *memcg_kmem_cache_wq;

static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);

static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
{
	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
}

static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
					 int size, int old_size)
{
	struct memcg_shrinker_map *new, *old;
	int nid;

	lockdep_assert_held(&memcg_shrinker_map_mutex);

	for_each_node(nid) {
		old = rcu_dereference_protected(
			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
		/* Not yet online memcg */
		if (!old)
			return 0;

		new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
		if (!new)
			return -ENOMEM;

		/* Set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_size);
		memset((void *)new->map + old_size, 0, size - old_size);

		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
	}

	return 0;
}

static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct memcg_shrinker_map *map;
	int nid;

	if (mem_cgroup_is_root(memcg))
		return;

	for_each_node(nid) {
		pn = mem_cgroup_nodeinfo(memcg, nid);
		map = rcu_dereference_protected(pn->shrinker_map, true);
		if (map)
			kvfree(map);
		rcu_assign_pointer(pn->shrinker_map, NULL);
	}
}

static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	struct memcg_shrinker_map *map;
	int nid, size, ret = 0;

	if (mem_cgroup_is_root(memcg))
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	size = memcg_shrinker_map_size;
	for_each_node(nid) {
		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
		if (!map) {
			memcg_free_shrinker_maps(memcg);
			ret = -ENOMEM;
			break;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map,
				   map);
	}
	mutex_unlock(&memcg_shrinker_map_mutex);

	return ret;
}

int memcg_expand_shrinker_maps(int new_id)
{
	int size, old_size, ret = 0;
	struct mem_cgroup *memcg;

	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
	old_size = memcg_shrinker_map_size;
	if (size <= old_size)
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	if (!root_mem_cgroup)
		goto unlock;

	for_each_mem_cgroup(memcg) {
		if (mem_cgroup_is_root(memcg))
			continue;
		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
		if (ret)
			goto unlock;
	}
unlock:
	if (!ret)
		memcg_shrinker_map_size = size;
	mutex_unlock(&memcg_shrinker_map_mutex);
	return ret;
}

void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct memcg_shrinker_map *map;

		rcu_read_lock();
		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
		set_bit(shrinker_id, map->map);
		rcu_read_unlock();
	}
}

#else /* CONFIG_MEMCG_KMEM */
static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	return 0;
}
static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
#endif /* CONFIG_MEMCG_KMEM */
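/*
 * Worked example (illustration only): with BITS_PER_LONG == 64, registering
 * shrinker id 100 makes memcg_expand_shrinker_maps() compute
 * DIV_ROUND_UP(101, 64) * sizeof(unsigned long) == 2 * 8 == 16 bytes per
 * node, i.e. the per-memcg bitmap grows in whole-long steps and only when
 * the new id no longer fits in the current memcg_shrinker_map_size.
 */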
/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it should only be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	if (PageHead(page) && PageSlab(page))
		memcg = memcg_from_slab_page(page);
	else
		memcg = READ_ONCE(page->mem_cgroup);
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
				   tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		}

		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}
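/*
 * Worked example (illustration only): if a memcg currently holds 768 pages
 * (page_counter_read(&memcg->memory) == 768) and its soft_limit is 512
 * pages, soft_limit_excess() returns 256.  A usage at or below the soft
 * limit yields 0, which keeps the group off the soft-limit tree.
 */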
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	long x;

	if (mem_cgroup_disabled())
		return;

	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmstats[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
}

static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
	struct mem_cgroup *parent;

	parent = parent_mem_cgroup(pn->memcg);
	if (!parent)
		return NULL;
	return mem_cgroup_nodeinfo(parent, nid);
}
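/*
 * Illustrative note on the batching in __mod_memcg_state() above (numbers
 * made up): deltas accumulate in a per-CPU counter and are only folded into
 * the atomic hierarchical counters once the pending absolute value exceeds
 * MEMCG_CHARGE_BATCH.  A long run of single-page updates therefore stays
 * CPU-local; the update that pushes the pending sum past the batch size
 * flushes the whole amount into this memcg and every ancestor, and the
 * per-CPU counter starts again from zero.
 */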
/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup.  This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	pg_data_t *pgdat = lruvec_pgdat(lruvec);
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	long x;

	/* Update node */
	__mod_node_page_state(pgdat, idx, val);

	if (mem_cgroup_disabled())
		return;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/* Update memcg */
	__mod_memcg_state(memcg, idx, val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);

	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup_per_node *pi;

		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
			atomic_long_add(x, &pi->lruvec_stat[idx]);
		x = 0;
	}
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}

void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
	struct page *page = virt_to_head_page(p);
	pg_data_t *pgdat = page_pgdat(page);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = memcg_from_slab_page(page);

	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg || memcg == root_mem_cgroup) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(pgdat, memcg);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	unsigned long x;

	if (mem_cgroup_disabled())
		return;

	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->events[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmevents[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
}

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	return atomic_long_read(&memcg->vmevents[event]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_local->events[event], cpu);
	return x;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool compound, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
	else {
		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
		if (PageSwapBacked(page))
			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
	}

	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
	}

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Obtain a reference on page->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	if (!memcg || !css_tryget_online(&memcg->css))
		memcg = root_mem_cgroup;
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);

/**
 * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
 */
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	if (unlikely(current->active_memcg)) {
		struct mem_cgroup *memcg = root_mem_cgroup;

		rcu_read_lock();
		if (css_tryget_online(&current->active_memcg->css))
			memcg = current->active_memcg;
		rcu_read_unlock();
		return memcg;
	}
	return get_mem_cgroup_from_mm(current->mm);
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter[reclaim->priority];

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it.
			 * So we clear iter->position right away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;
	int i;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(from, nid);
		for (i = 0; i <= DEF_PRIORITY; i++) {
			iter = &mz->iter[i];
			cmpxchg(&iter->position,
				dead_memcg, NULL);
		}
	}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (last != root_mem_cgroup)
		__invalidate_reclaim_iterators(root_mem_cgroup,
						dead_memcg);
}
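/*
 * Illustrative sketch (not new kernel API; stop_condition() is a made-up
 * placeholder): the intended calling convention for mem_cgroup_iter() when
 * a walk may terminate early, which is what for_each_mem_cgroup_tree()
 * expands to plus the mandatory mem_cgroup_iter_break() on early exit so
 * the css reference held on the current position is dropped:
 *
 *	struct mem_cgroup *iter;
 *
 *	for (iter = mem_cgroup_iter(root, NULL, NULL); iter;
 *	     iter = mem_cgroup_iter(root, iter, NULL)) {
 *		if (stop_condition(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */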
/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(memcg == root_mem_cgroup);

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @pgdat: pgdat of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &pgdat->lruvec;
		goto out;
	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;

	mz = mem_cgroup_page_nodeinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->pgdat here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->pgdat != pgdat))
		lruvec->pgdat = pgdat;
	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list (that ordering being
 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	if (nr_pages < 0)
		*lru_size += nr_pages;

	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		*lru_size = 0;
	}

	if (nr_pages > 0)
		*lru_size += nr_pages;
}

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @mem can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.max);
	if (count < limit)
		margin = limit - count;

	if (do_memsw_account()) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.max);
		if (count <= limit)
			margin = min(margin, limit - count);
		else
			margin = 0;
	}

	return margin;
}

/*
 * A routine for checking whether "mem" is under move_account() or not.
 *
 * Checks whether a cgroup is mc.from or mc.to, or under the hierarchy of
 * moving cgroups. This is for waiting at high memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_is_descendant(from, memcg) ||
		mem_cgroup_is_descendant(to, memcg);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

static char *memory_stat_format(struct mem_cgroup *memcg)
{
	struct seq_buf s;
	int i;

	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
	if (!s.buffer)
		return NULL;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */

	seq_buf_printf(&s, "anon %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_RSS) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_CACHE) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "kernel_stack %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
		       1024);
	seq_buf_printf(&s, "slab %llu\n",
		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "sock %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
		       PAGE_SIZE);

	seq_buf_printf(&s, "shmem %llu\n",
		       (u64)memcg_page_state(memcg, NR_SHMEM) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file_mapped %llu\n",
		       (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file_dirty %llu\n",
		       (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file_writeback %llu\n",
		       (u64)memcg_page_state(memcg, NR_WRITEBACK) *
		       PAGE_SIZE);

	/*
	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
	 * arse because it requires migrating the work out of rmap to a place
	 * where the page->mem_cgroup is set up and stable.
	 */
	seq_buf_printf(&s, "anon_thp %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
		       PAGE_SIZE);

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

	seq_buf_printf(&s, "slab_reclaimable %llu\n",
		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "slab_unreclaimable %llu\n",
		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
		       PAGE_SIZE);

	/* Accumulated memory events */

	seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
	seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));

	seq_buf_printf(&s, "workingset_refault %lu\n",
		       memcg_page_state(memcg, WORKINGSET_REFAULT));
	seq_buf_printf(&s, "workingset_activate %lu\n",
		       memcg_page_state(memcg, WORKINGSET_ACTIVATE));
	seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
		       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));

	seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
	seq_buf_printf(&s, "pgscan %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD) +
		       memcg_events(memcg, PGSCAN_DIRECT));
	seq_buf_printf(&s, "pgsteal %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD) +
		       memcg_events(memcg, PGSTEAL_DIRECT));
	seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
	seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
	seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
	seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	seq_buf_printf(&s, "thp_fault_alloc %lu\n",
		       memcg_events(memcg, THP_FAULT_ALLOC));
	seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
		       memcg_events(memcg,
				    THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

	/* The above should easily fit into one page */
	WARN_ON_ONCE(seq_buf_has_overflowed(&s));

	return s.buffer;
}

#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
	rcu_read_lock();

	if (memcg) {
		pr_cont(",oom_memcg=");
		pr_cont_cgroup_path(memcg->css.cgroup);
	} else
		pr_cont(",global_oom");
	if (p) {
		pr_cont(",task_memcg=");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	}
	rcu_read_unlock();
}

/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
	char *buf;

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)memcg->memory.max), memcg->memory.failcnt);
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->swap)),
			K((u64)memcg->swap.max), memcg->swap.failcnt);
	else {
		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->memsw)),
			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->kmem)),
			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
	}

	pr_info("Memory cgroup stats for ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(":");
	buf = memory_stat_format(memcg);
	if (!buf)
		return;
	pr_info("%s", buf);
	kfree(buf);
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
	unsigned long max;

	max = memcg->memory.max;
	if (mem_cgroup_swappiness(memcg)) {
		unsigned long memsw_max;
		unsigned long swap_max;

		memsw_max = memcg->memsw.max;
		swap_max = memcg->swap.max;
		swap_max = min(swap_max, (unsigned long)total_swap_pages);
		max = min(max + swap_max, memsw_max);
	}
	return max;
}

static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = memcg,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	bool ret;

	if (mutex_lock_killable(&oom_lock))
		return true;
	/*
	 * A few threads which were not waiting at mutex_lock_killable() can
	 * fail to bail out. Therefore, check again after holding oom_lock.
	 */
	ret = should_force_charge() || out_of_memory(&oc);
	mutex_unlock(&oom_lock);
	return ret;
}
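/*
 * Worked example for mem_cgroup_get_max() above (numbers are made up):
 * with memory.max == 1000 pages, swap.max == 500 pages, 300 pages of swap
 * present in the system (total_swap_pages) and memsw.max == 1200 pages,
 * the result is min(1000 + min(500, 300), 1200) == min(1300, 1200) == 1200
 * pages.  With swappiness 0 the swap terms are ignored and 1000 is returned.
 */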
#if MAX_NUMNODES > 1

/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap: specify true here if the user wants file only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
		int nid, bool noswap)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);

	if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
	    lruvec_page_state(lruvec, NR_ACTIVE_FILE))
		return true;
	if (noswap || !total_swap_pages)
		return false;
	if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
	    lruvec_page_state(lruvec, NR_ACTIVE_ANON))
		return true;
	return false;

}

/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist. So update the list loosely once per 10 secs.
 */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	memcg->scan_nodes = node_states[N_MEMORY];

	for_each_node_mask(nid, node_states[N_MEMORY]) {

		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
			node_clear(nid, memcg->scan_nodes);
	}

	atomic_set(&memcg->numainfo_events, 0);
	atomic_set(&memcg->numainfo_updating, 0);
}

/*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is OK. Considering
 * memory reclaim from the current node, there are pros and cons.
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used. So, it may make LRU bad. And if several threads
 * hit limits, it will see a contention on a node. But freeing from a remote
 * node means more costs for memory reclaim because of memory latency.
 *
 * Now, we use round-robin. A better algorithm is welcomed.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

	node = next_node_in(node, memcg->scan_nodes);
	/*
	 * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
	 * last time it really checked all the LRUs due to rate limiting.
	 * Fallback to the current node in that case for simplicity.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	memcg->last_scanned_node = node;
	return node;
}
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
#endif

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
		.priority = 0,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive so as to
				 * reclaim too much, nor too little so that we
				 * keep coming back to reclaim from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
					pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up to the failing subtree
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
	 */
	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
			iter->under_oom--;
	spin_unlock(&memcg_oom_lock);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_entry_t	wait;
};

static int memcg_oom_wake_function(wait_queue_entry_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM. This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
	if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

enum oom_status {
	OOM_SUCCESS,
	OOM_FAILED,
	OOM_ASYNC,
	OOM_SKIPPED
};

static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	enum oom_status ret;
	bool locked;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return OOM_SKIPPED;

	memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, the in-kernel OOM killer allows for an async
	 * victim memory reclaim (oom_reaper) and that means that we are not
	 * solely relying on the oom victim to make forward progress and we
	 * can invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
	if (memcg->oom_kill_disable) {
		if (!current->in_user_fault)
			return OOM_SKIPPED;
		css_get(&memcg->css);
		current->memcg_in_oom = memcg;
		current->memcg_oom_gfp_mask = mask;
		current->memcg_oom_order = order;

		return OOM_ASYNC;
	}

	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);
	if (mem_cgroup_out_of_memory(memcg, mask, order))
		ret = OOM_SUCCESS;
	else
		ret = OOM_FAILED;

	if (locked)
		mem_cgroup_oom_unlock(memcg);

	return ret;
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	if (locked && !memcg->oom_kill_disable) {
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
					 current->memcg_oom_order);
	} else {
		schedule();
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}

	if (locked) {
		mem_cgroup_oom_unlock(memcg);
		/*
		 * There is no guarantee that an OOM-lock contender
		 * sees the wakeups triggered by the OOM kill
		 * uncharges.  Wake any sleepers explicitly.
1998 */ 1999 memcg_oom_recover(memcg); 2000 } 2001 cleanup: 2002 current->memcg_in_oom = NULL; 2003 css_put(&memcg->css); 2004 return true; 2005 } 2006 2007 /** 2008 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM 2009 * @victim: task to be killed by the OOM killer 2010 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM 2011 * 2012 * Returns a pointer to a memory cgroup, which has to be cleaned up 2013 * by killing all belonging OOM-killable tasks. 2014 * 2015 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. 2016 */ 2017 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 2018 struct mem_cgroup *oom_domain) 2019 { 2020 struct mem_cgroup *oom_group = NULL; 2021 struct mem_cgroup *memcg; 2022 2023 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2024 return NULL; 2025 2026 if (!oom_domain) 2027 oom_domain = root_mem_cgroup; 2028 2029 rcu_read_lock(); 2030 2031 memcg = mem_cgroup_from_task(victim); 2032 if (memcg == root_mem_cgroup) 2033 goto out; 2034 2035 /* 2036 * Traverse the memory cgroup hierarchy from the victim task's 2037 * cgroup up to the OOMing cgroup (or root) to find the 2038 * highest-level memory cgroup with oom.group set. 2039 */ 2040 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 2041 if (memcg->oom_group) 2042 oom_group = memcg; 2043 2044 if (memcg == oom_domain) 2045 break; 2046 } 2047 2048 if (oom_group) 2049 css_get(&oom_group->css); 2050 out: 2051 rcu_read_unlock(); 2052 2053 return oom_group; 2054 } 2055 2056 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 2057 { 2058 pr_info("Tasks in "); 2059 pr_cont_cgroup_path(memcg->css.cgroup); 2060 pr_cont(" are going to be killed due to memory.oom.group set\n"); 2061 } 2062 2063 /** 2064 * lock_page_memcg - lock a page->mem_cgroup binding 2065 * @page: the page 2066 * 2067 * This function protects unlocked LRU pages from being moved to 2068 * another cgroup. 2069 * 2070 * It ensures lifetime of the returned memcg. Caller is responsible 2071 * for the lifetime of the page; __unlock_page_memcg() is available 2072 * when @page might get freed inside the locked section. 2073 */ 2074 struct mem_cgroup *lock_page_memcg(struct page *page) 2075 { 2076 struct mem_cgroup *memcg; 2077 unsigned long flags; 2078 2079 /* 2080 * The RCU lock is held throughout the transaction. The fast 2081 * path can get away without acquiring the memcg->move_lock 2082 * because page moving starts with an RCU grace period. 2083 * 2084 * The RCU lock also protects the memcg from being freed when 2085 * the page state that is going to change is the only thing 2086 * preventing the page itself from being freed. E.g. writeback 2087 * doesn't hold a page reference and relies on PG_writeback to 2088 * keep off truncation, migration and so forth. 2089 */ 2090 rcu_read_lock(); 2091 2092 if (mem_cgroup_disabled()) 2093 return NULL; 2094 again: 2095 memcg = page->mem_cgroup; 2096 if (unlikely(!memcg)) 2097 return NULL; 2098 2099 if (atomic_read(&memcg->moving_account) <= 0) 2100 return memcg; 2101 2102 spin_lock_irqsave(&memcg->move_lock, flags); 2103 if (memcg != page->mem_cgroup) { 2104 spin_unlock_irqrestore(&memcg->move_lock, flags); 2105 goto again; 2106 } 2107 2108 /* 2109 * When charge migration first begins, we can have locked and 2110 * unlocked page stat updates happening concurrently. Track 2111 * the task who has the lock for unlock_page_memcg(). 
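 *
 * Illustrative usage sketch (not taken from a particular caller; the
 * stat item below is an arbitrary example):
 *
 *	memcg = lock_page_memcg(page);
 *	if (memcg)
 *		__mod_memcg_state(memcg, NR_FILE_DIRTY, -1);
 *	__unlock_page_memcg(memcg);
 *
 * __unlock_page_memcg() must be called even if NULL was returned, so that
 * the RCU read lock taken above is dropped.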
2112 */ 2113 memcg->move_lock_task = current; 2114 memcg->move_lock_flags = flags; 2115 2116 return memcg; 2117 } 2118 EXPORT_SYMBOL(lock_page_memcg); 2119 2120 /** 2121 * __unlock_page_memcg - unlock and unpin a memcg 2122 * @memcg: the memcg 2123 * 2124 * Unlock and unpin a memcg returned by lock_page_memcg(). 2125 */ 2126 void __unlock_page_memcg(struct mem_cgroup *memcg) 2127 { 2128 if (memcg && memcg->move_lock_task == current) { 2129 unsigned long flags = memcg->move_lock_flags; 2130 2131 memcg->move_lock_task = NULL; 2132 memcg->move_lock_flags = 0; 2133 2134 spin_unlock_irqrestore(&memcg->move_lock, flags); 2135 } 2136 2137 rcu_read_unlock(); 2138 } 2139 2140 /** 2141 * unlock_page_memcg - unlock a page->mem_cgroup binding 2142 * @page: the page 2143 */ 2144 void unlock_page_memcg(struct page *page) 2145 { 2146 __unlock_page_memcg(page->mem_cgroup); 2147 } 2148 EXPORT_SYMBOL(unlock_page_memcg); 2149 2150 struct memcg_stock_pcp { 2151 struct mem_cgroup *cached; /* this never be root cgroup */ 2152 unsigned int nr_pages; 2153 struct work_struct work; 2154 unsigned long flags; 2155 #define FLUSHING_CACHED_CHARGE 0 2156 }; 2157 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2158 static DEFINE_MUTEX(percpu_charge_mutex); 2159 2160 /** 2161 * consume_stock: Try to consume stocked charge on this cpu. 2162 * @memcg: memcg to consume from. 2163 * @nr_pages: how many pages to charge. 2164 * 2165 * The charges will only happen if @memcg matches the current cpu's memcg 2166 * stock, and at least @nr_pages are available in that stock. Failure to 2167 * service an allocation will refill the stock. 2168 * 2169 * returns true if successful, false otherwise. 2170 */ 2171 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2172 { 2173 struct memcg_stock_pcp *stock; 2174 unsigned long flags; 2175 bool ret = false; 2176 2177 if (nr_pages > MEMCG_CHARGE_BATCH) 2178 return ret; 2179 2180 local_irq_save(flags); 2181 2182 stock = this_cpu_ptr(&memcg_stock); 2183 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2184 stock->nr_pages -= nr_pages; 2185 ret = true; 2186 } 2187 2188 local_irq_restore(flags); 2189 2190 return ret; 2191 } 2192 2193 /* 2194 * Returns stocks cached in percpu and reset cached information. 2195 */ 2196 static void drain_stock(struct memcg_stock_pcp *stock) 2197 { 2198 struct mem_cgroup *old = stock->cached; 2199 2200 if (stock->nr_pages) { 2201 page_counter_uncharge(&old->memory, stock->nr_pages); 2202 if (do_memsw_account()) 2203 page_counter_uncharge(&old->memsw, stock->nr_pages); 2204 css_put_many(&old->css, stock->nr_pages); 2205 stock->nr_pages = 0; 2206 } 2207 stock->cached = NULL; 2208 } 2209 2210 static void drain_local_stock(struct work_struct *dummy) 2211 { 2212 struct memcg_stock_pcp *stock; 2213 unsigned long flags; 2214 2215 /* 2216 * The only protection from memory hotplug vs. drain_stock races is 2217 * that we always operate on local CPU stock here with IRQ disabled 2218 */ 2219 local_irq_save(flags); 2220 2221 stock = this_cpu_ptr(&memcg_stock); 2222 drain_stock(stock); 2223 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2224 2225 local_irq_restore(flags); 2226 } 2227 2228 /* 2229 * Cache charges(val) to local per_cpu area. 2230 * This will be consumed by consume_stock() function, later. 
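 *
 * An illustrative round trip, paraphrasing try_charge() below:
 *
 *	if (consume_stock(memcg, nr_pages))
 *		return 0;
 *	...
 *	page_counter_try_charge(&memcg->memory, batch, &counter);
 *	...
 *	if (batch > nr_pages)
 *		refill_stock(memcg, batch - nr_pages);
 *
 * i.e. a hit is served entirely from the per-cpu stock, while a miss
 * charges a larger batch against the page_counter and parks the surplus
 * here for subsequent allocations.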
2231 */ 2232 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2233 { 2234 struct memcg_stock_pcp *stock; 2235 unsigned long flags; 2236 2237 local_irq_save(flags); 2238 2239 stock = this_cpu_ptr(&memcg_stock); 2240 if (stock->cached != memcg) { /* reset if necessary */ 2241 drain_stock(stock); 2242 stock->cached = memcg; 2243 } 2244 stock->nr_pages += nr_pages; 2245 2246 if (stock->nr_pages > MEMCG_CHARGE_BATCH) 2247 drain_stock(stock); 2248 2249 local_irq_restore(flags); 2250 } 2251 2252 /* 2253 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2254 * of the hierarchy under it. 2255 */ 2256 static void drain_all_stock(struct mem_cgroup *root_memcg) 2257 { 2258 int cpu, curcpu; 2259 2260 /* If someone's already draining, avoid adding running more workers. */ 2261 if (!mutex_trylock(&percpu_charge_mutex)) 2262 return; 2263 /* 2264 * Notify other cpus that system-wide "drain" is running 2265 * We do not care about races with the cpu hotplug because cpu down 2266 * as well as workers from this path always operate on the local 2267 * per-cpu data. CPU up doesn't touch memcg_stock at all. 2268 */ 2269 curcpu = get_cpu(); 2270 for_each_online_cpu(cpu) { 2271 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2272 struct mem_cgroup *memcg; 2273 2274 memcg = stock->cached; 2275 if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) 2276 continue; 2277 if (!mem_cgroup_is_descendant(memcg, root_memcg)) { 2278 css_put(&memcg->css); 2279 continue; 2280 } 2281 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2282 if (cpu == curcpu) 2283 drain_local_stock(&stock->work); 2284 else 2285 schedule_work_on(cpu, &stock->work); 2286 } 2287 css_put(&memcg->css); 2288 } 2289 put_cpu(); 2290 mutex_unlock(&percpu_charge_mutex); 2291 } 2292 2293 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2294 { 2295 struct memcg_stock_pcp *stock; 2296 struct mem_cgroup *memcg, *mi; 2297 2298 stock = &per_cpu(memcg_stock, cpu); 2299 drain_stock(stock); 2300 2301 for_each_mem_cgroup(memcg) { 2302 int i; 2303 2304 for (i = 0; i < MEMCG_NR_STAT; i++) { 2305 int nid; 2306 long x; 2307 2308 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); 2309 if (x) 2310 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 2311 atomic_long_add(x, &memcg->vmstats[i]); 2312 2313 if (i >= NR_VM_NODE_STAT_ITEMS) 2314 continue; 2315 2316 for_each_node(nid) { 2317 struct mem_cgroup_per_node *pn; 2318 2319 pn = mem_cgroup_nodeinfo(memcg, nid); 2320 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); 2321 if (x) 2322 do { 2323 atomic_long_add(x, &pn->lruvec_stat[i]); 2324 } while ((pn = parent_nodeinfo(pn, nid))); 2325 } 2326 } 2327 2328 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { 2329 long x; 2330 2331 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); 2332 if (x) 2333 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 2334 atomic_long_add(x, &memcg->vmevents[i]); 2335 } 2336 } 2337 2338 return 0; 2339 } 2340 2341 static void reclaim_high(struct mem_cgroup *memcg, 2342 unsigned int nr_pages, 2343 gfp_t gfp_mask) 2344 { 2345 do { 2346 if (page_counter_read(&memcg->memory) <= memcg->high) 2347 continue; 2348 memcg_memory_event(memcg, MEMCG_HIGH); 2349 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); 2350 } while ((memcg = parent_mem_cgroup(memcg))); 2351 } 2352 2353 static void high_work_func(struct work_struct *work) 2354 { 2355 struct mem_cgroup *memcg; 2356 2357 memcg = container_of(work, struct mem_cgroup, high_work); 2358 reclaim_high(memcg, 
MEMCG_CHARGE_BATCH, GFP_KERNEL); 2359 } 2360 2361 /* 2362 * Scheduled by try_charge() to be executed from the userland return path 2363 * and reclaims memory over the high limit. 2364 */ 2365 void mem_cgroup_handle_over_high(void) 2366 { 2367 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2368 struct mem_cgroup *memcg; 2369 2370 if (likely(!nr_pages)) 2371 return; 2372 2373 memcg = get_mem_cgroup_from_mm(current->mm); 2374 reclaim_high(memcg, nr_pages, GFP_KERNEL); 2375 css_put(&memcg->css); 2376 current->memcg_nr_pages_over_high = 0; 2377 } 2378 2379 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2380 unsigned int nr_pages) 2381 { 2382 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2383 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2384 struct mem_cgroup *mem_over_limit; 2385 struct page_counter *counter; 2386 unsigned long nr_reclaimed; 2387 bool may_swap = true; 2388 bool drained = false; 2389 enum oom_status oom_status; 2390 2391 if (mem_cgroup_is_root(memcg)) 2392 return 0; 2393 retry: 2394 if (consume_stock(memcg, nr_pages)) 2395 return 0; 2396 2397 if (!do_memsw_account() || 2398 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2399 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2400 goto done_restock; 2401 if (do_memsw_account()) 2402 page_counter_uncharge(&memcg->memsw, batch); 2403 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2404 } else { 2405 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2406 may_swap = false; 2407 } 2408 2409 if (batch > nr_pages) { 2410 batch = nr_pages; 2411 goto retry; 2412 } 2413 2414 /* 2415 * Unlike in global OOM situations, memcg is not in a physical 2416 * memory shortage. Allow dying and OOM-killed tasks to 2417 * bypass the last charges so that they can exit quickly and 2418 * free their memory. 2419 */ 2420 if (unlikely(should_force_charge())) 2421 goto force; 2422 2423 /* 2424 * Prevent unbounded recursion when reclaim operations need to 2425 * allocate memory. This might exceed the limits temporarily, 2426 * but we prefer facilitating memory reclaim and getting back 2427 * under the limit over triggering OOM kills in these cases. 2428 */ 2429 if (unlikely(current->flags & PF_MEMALLOC)) 2430 goto force; 2431 2432 if (unlikely(task_in_memcg_oom(current))) 2433 goto nomem; 2434 2435 if (!gfpflags_allow_blocking(gfp_mask)) 2436 goto nomem; 2437 2438 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2439 2440 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2441 gfp_mask, may_swap); 2442 2443 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2444 goto retry; 2445 2446 if (!drained) { 2447 drain_all_stock(mem_over_limit); 2448 drained = true; 2449 goto retry; 2450 } 2451 2452 if (gfp_mask & __GFP_NORETRY) 2453 goto nomem; 2454 /* 2455 * Even though the limit is exceeded at this point, reclaim 2456 * may have been able to free some pages. Retry the charge 2457 * before killing the task. 2458 * 2459 * Only for regular pages, though: huge pages are rather 2460 * unlikely to succeed so close to the limit, and we fall back 2461 * to regular pages anyway in case of failure. 2462 */ 2463 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2464 goto retry; 2465 /* 2466 * At task move, charge accounts can be doubly counted. So, it's 2467 * better to wait until the end of task_move if something is going on. 
2468 */ 2469 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2470 goto retry; 2471 2472 if (nr_retries--) 2473 goto retry; 2474 2475 if (gfp_mask & __GFP_RETRY_MAYFAIL) 2476 goto nomem; 2477 2478 if (gfp_mask & __GFP_NOFAIL) 2479 goto force; 2480 2481 if (fatal_signal_pending(current)) 2482 goto force; 2483 2484 /* 2485 * keep retrying as long as the memcg oom killer is able to make 2486 * a forward progress or bypass the charge if the oom killer 2487 * couldn't make any progress. 2488 */ 2489 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, 2490 get_order(nr_pages * PAGE_SIZE)); 2491 switch (oom_status) { 2492 case OOM_SUCCESS: 2493 nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2494 goto retry; 2495 case OOM_FAILED: 2496 goto force; 2497 default: 2498 goto nomem; 2499 } 2500 nomem: 2501 if (!(gfp_mask & __GFP_NOFAIL)) 2502 return -ENOMEM; 2503 force: 2504 /* 2505 * The allocation either can't fail or will lead to more memory 2506 * being freed very soon. Allow memory usage go over the limit 2507 * temporarily by force charging it. 2508 */ 2509 page_counter_charge(&memcg->memory, nr_pages); 2510 if (do_memsw_account()) 2511 page_counter_charge(&memcg->memsw, nr_pages); 2512 css_get_many(&memcg->css, nr_pages); 2513 2514 return 0; 2515 2516 done_restock: 2517 css_get_many(&memcg->css, batch); 2518 if (batch > nr_pages) 2519 refill_stock(memcg, batch - nr_pages); 2520 2521 /* 2522 * If the hierarchy is above the normal consumption range, schedule 2523 * reclaim on returning to userland. We can perform reclaim here 2524 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2525 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2526 * not recorded as it most likely matches current's and won't 2527 * change in the meantime. As high limit is checked again before 2528 * reclaim, the cost of mismatch is negligible. 
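 *
 * The userland-return half of this handshake is
 * mem_cgroup_handle_over_high() above, which does roughly (paraphrased):
 *
 *	nr_pages = current->memcg_nr_pages_over_high;
 *	memcg = get_mem_cgroup_from_mm(current->mm);
 *	reclaim_high(memcg, nr_pages, GFP_KERNEL);
 *	css_put(&memcg->css);
 *	current->memcg_nr_pages_over_high = 0;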
2529 */ 2530 do { 2531 if (page_counter_read(&memcg->memory) > memcg->high) { 2532 /* Don't bother a random interrupted task */ 2533 if (in_interrupt()) { 2534 schedule_work(&memcg->high_work); 2535 break; 2536 } 2537 current->memcg_nr_pages_over_high += batch; 2538 set_notify_resume(current); 2539 break; 2540 } 2541 } while ((memcg = parent_mem_cgroup(memcg))); 2542 2543 return 0; 2544 } 2545 2546 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2547 { 2548 if (mem_cgroup_is_root(memcg)) 2549 return; 2550 2551 page_counter_uncharge(&memcg->memory, nr_pages); 2552 if (do_memsw_account()) 2553 page_counter_uncharge(&memcg->memsw, nr_pages); 2554 2555 css_put_many(&memcg->css, nr_pages); 2556 } 2557 2558 static void lock_page_lru(struct page *page, int *isolated) 2559 { 2560 pg_data_t *pgdat = page_pgdat(page); 2561 2562 spin_lock_irq(&pgdat->lru_lock); 2563 if (PageLRU(page)) { 2564 struct lruvec *lruvec; 2565 2566 lruvec = mem_cgroup_page_lruvec(page, pgdat); 2567 ClearPageLRU(page); 2568 del_page_from_lru_list(page, lruvec, page_lru(page)); 2569 *isolated = 1; 2570 } else 2571 *isolated = 0; 2572 } 2573 2574 static void unlock_page_lru(struct page *page, int isolated) 2575 { 2576 pg_data_t *pgdat = page_pgdat(page); 2577 2578 if (isolated) { 2579 struct lruvec *lruvec; 2580 2581 lruvec = mem_cgroup_page_lruvec(page, pgdat); 2582 VM_BUG_ON_PAGE(PageLRU(page), page); 2583 SetPageLRU(page); 2584 add_page_to_lru_list(page, lruvec, page_lru(page)); 2585 } 2586 spin_unlock_irq(&pgdat->lru_lock); 2587 } 2588 2589 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2590 bool lrucare) 2591 { 2592 int isolated; 2593 2594 VM_BUG_ON_PAGE(page->mem_cgroup, page); 2595 2596 /* 2597 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2598 * may already be on some other mem_cgroup's LRU. Take care of it. 2599 */ 2600 if (lrucare) 2601 lock_page_lru(page, &isolated); 2602 2603 /* 2604 * Nobody should be changing or seriously looking at 2605 * page->mem_cgroup at this point: 2606 * 2607 * - the page is uncharged 2608 * 2609 * - the page is off-LRU 2610 * 2611 * - an anonymous fault has exclusive page access, except for 2612 * a locked page table 2613 * 2614 * - a page cache insertion, a swapin fault, or a migration 2615 * have the page locked 2616 */ 2617 page->mem_cgroup = memcg; 2618 2619 if (lrucare) 2620 unlock_page_lru(page, isolated); 2621 } 2622 2623 #ifdef CONFIG_MEMCG_KMEM 2624 static int memcg_alloc_cache_id(void) 2625 { 2626 int id, size; 2627 int err; 2628 2629 id = ida_simple_get(&memcg_cache_ida, 2630 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2631 if (id < 0) 2632 return id; 2633 2634 if (id < memcg_nr_cache_ids) 2635 return id; 2636 2637 /* 2638 * There's no space for the new id in memcg_caches arrays, 2639 * so we have to grow them. 
2640 */ 2641 down_write(&memcg_cache_ids_sem); 2642 2643 size = 2 * (id + 1); 2644 if (size < MEMCG_CACHES_MIN_SIZE) 2645 size = MEMCG_CACHES_MIN_SIZE; 2646 else if (size > MEMCG_CACHES_MAX_SIZE) 2647 size = MEMCG_CACHES_MAX_SIZE; 2648 2649 err = memcg_update_all_caches(size); 2650 if (!err) 2651 err = memcg_update_all_list_lrus(size); 2652 if (!err) 2653 memcg_nr_cache_ids = size; 2654 2655 up_write(&memcg_cache_ids_sem); 2656 2657 if (err) { 2658 ida_simple_remove(&memcg_cache_ida, id); 2659 return err; 2660 } 2661 return id; 2662 } 2663 2664 static void memcg_free_cache_id(int id) 2665 { 2666 ida_simple_remove(&memcg_cache_ida, id); 2667 } 2668 2669 struct memcg_kmem_cache_create_work { 2670 struct mem_cgroup *memcg; 2671 struct kmem_cache *cachep; 2672 struct work_struct work; 2673 }; 2674 2675 static void memcg_kmem_cache_create_func(struct work_struct *w) 2676 { 2677 struct memcg_kmem_cache_create_work *cw = 2678 container_of(w, struct memcg_kmem_cache_create_work, work); 2679 struct mem_cgroup *memcg = cw->memcg; 2680 struct kmem_cache *cachep = cw->cachep; 2681 2682 memcg_create_kmem_cache(memcg, cachep); 2683 2684 css_put(&memcg->css); 2685 kfree(cw); 2686 } 2687 2688 /* 2689 * Enqueue the creation of a per-memcg kmem_cache. 2690 */ 2691 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2692 struct kmem_cache *cachep) 2693 { 2694 struct memcg_kmem_cache_create_work *cw; 2695 2696 if (!css_tryget_online(&memcg->css)) 2697 return; 2698 2699 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); 2700 if (!cw) 2701 return; 2702 2703 cw->memcg = memcg; 2704 cw->cachep = cachep; 2705 INIT_WORK(&cw->work, memcg_kmem_cache_create_func); 2706 2707 queue_work(memcg_kmem_cache_wq, &cw->work); 2708 } 2709 2710 static inline bool memcg_kmem_bypass(void) 2711 { 2712 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) 2713 return true; 2714 return false; 2715 } 2716 2717 /** 2718 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation 2719 * @cachep: the original global kmem cache 2720 * 2721 * Return the kmem_cache we're supposed to use for a slab allocation. 2722 * We try to use the current memcg's version of the cache. 2723 * 2724 * If the cache does not exist yet, if we are the first user of it, we 2725 * create it asynchronously in a workqueue and let the current allocation 2726 * go through with the original cache. 2727 * 2728 * This function takes a reference to the cache it returns to assure it 2729 * won't get destroyed while we are working with it. Once the caller is 2730 * done with it, memcg_kmem_put_cache() must be called to release the 2731 * reference. 2732 */ 2733 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) 2734 { 2735 struct mem_cgroup *memcg; 2736 struct kmem_cache *memcg_cachep; 2737 struct memcg_cache_array *arr; 2738 int kmemcg_id; 2739 2740 VM_BUG_ON(!is_root_cache(cachep)); 2741 2742 if (memcg_kmem_bypass()) 2743 return cachep; 2744 2745 rcu_read_lock(); 2746 2747 if (unlikely(current->active_memcg)) 2748 memcg = current->active_memcg; 2749 else 2750 memcg = mem_cgroup_from_task(current); 2751 2752 if (!memcg || memcg == root_mem_cgroup) 2753 goto out_unlock; 2754 2755 kmemcg_id = READ_ONCE(memcg->kmemcg_id); 2756 if (kmemcg_id < 0) 2757 goto out_unlock; 2758 2759 arr = rcu_dereference(cachep->memcg_params.memcg_caches); 2760 2761 /* 2762 * Make sure we will access the up-to-date value. 
The code updating
2763 * memcg_caches issues a write barrier to match the data dependency
2764 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
2765 */
2766 memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
2767
2768 /*
2769 * If we are in a safe context (can wait, and not in interrupt
2770 * context), we could be predictable and return right away.
2771 * This would guarantee that the allocation being performed
2772 * already belongs in the new cache.
2773 *
2774 * However, there are some clashes that can arise from locking.
2775 * For instance, because we acquire the slab_mutex while doing
2776 * memcg_create_kmem_cache, this means no further allocation
2777 * could happen with the slab_mutex held. So it's better to
2778 * defer everything.
2779 *
2780 * If the memcg is dying or memcg_cache is about to be released,
2781 * don't bother creating new kmem_caches. Because memcg_cachep
2782 * is ZEROed as the first step of kmem offlining, we don't need
2783 * percpu_ref_tryget_live() here. The css_tryget_online() check in
2784 * memcg_schedule_kmem_cache_create() will prevent us from
2785 * creating a new kmem_cache.
2786 */
2787 if (unlikely(!memcg_cachep))
2788 memcg_schedule_kmem_cache_create(memcg, cachep);
2789 else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
2790 cachep = memcg_cachep;
2791 out_unlock:
2792 rcu_read_unlock();
2793 return cachep;
2794 }
2795
2796 /**
2797 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2798 * @cachep: the cache returned by memcg_kmem_get_cache
2799 */
2800 void memcg_kmem_put_cache(struct kmem_cache *cachep)
2801 {
2802 if (!is_root_cache(cachep))
2803 percpu_ref_put(&cachep->memcg_params.refcnt);
2804 }
2805
2806 /**
2807 * __memcg_kmem_charge_memcg: charge a kmem page
2808 * @page: page to charge
2809 * @gfp: reclaim mode
2810 * @order: allocation order
2811 * @memcg: memory cgroup to charge
2812 *
2813 * Returns 0 on success, an error code on failure.
2814 */
2815 int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2816 struct mem_cgroup *memcg)
2817 {
2818 unsigned int nr_pages = 1 << order;
2819 struct page_counter *counter;
2820 int ret;
2821
2822 ret = try_charge(memcg, gfp, nr_pages);
2823 if (ret)
2824 return ret;
2825
2826 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2827 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2828 cancel_charge(memcg, nr_pages);
2829 return -ENOMEM;
2830 }
2831 return 0;
2832 }
2833
2834 /**
2835 * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
2836 * @page: page to charge
2837 * @gfp: reclaim mode
2838 * @order: allocation order
2839 *
2840 * Returns 0 on success, an error code on failure.
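 *
 * An illustrative pairing with the uncharge side; a sketch of what an
 * accounted allocation path might look like, not a verbatim caller:
 *
 *	page = alloc_pages(gfp, order);
 *	if (page && __memcg_kmem_charge(page, gfp, order)) {
 *		__free_pages(page, order);
 *		page = NULL;
 *	}
 *	...
 *	__memcg_kmem_uncharge(page, order);
 *
 * with the uncharge issued from the corresponding freeing path.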
2841 */ 2842 int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 2843 { 2844 struct mem_cgroup *memcg; 2845 int ret = 0; 2846 2847 if (memcg_kmem_bypass()) 2848 return 0; 2849 2850 memcg = get_mem_cgroup_from_current(); 2851 if (!mem_cgroup_is_root(memcg)) { 2852 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); 2853 if (!ret) { 2854 page->mem_cgroup = memcg; 2855 __SetPageKmemcg(page); 2856 } 2857 } 2858 css_put(&memcg->css); 2859 return ret; 2860 } 2861 2862 /** 2863 * __memcg_kmem_uncharge_memcg: uncharge a kmem page 2864 * @memcg: memcg to uncharge 2865 * @nr_pages: number of pages to uncharge 2866 */ 2867 void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg, 2868 unsigned int nr_pages) 2869 { 2870 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2871 page_counter_uncharge(&memcg->kmem, nr_pages); 2872 2873 page_counter_uncharge(&memcg->memory, nr_pages); 2874 if (do_memsw_account()) 2875 page_counter_uncharge(&memcg->memsw, nr_pages); 2876 } 2877 /** 2878 * __memcg_kmem_uncharge: uncharge a kmem page 2879 * @page: page to uncharge 2880 * @order: allocation order 2881 */ 2882 void __memcg_kmem_uncharge(struct page *page, int order) 2883 { 2884 struct mem_cgroup *memcg = page->mem_cgroup; 2885 unsigned int nr_pages = 1 << order; 2886 2887 if (!memcg) 2888 return; 2889 2890 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2891 __memcg_kmem_uncharge_memcg(memcg, nr_pages); 2892 page->mem_cgroup = NULL; 2893 2894 /* slab pages do not have PageKmemcg flag set */ 2895 if (PageKmemcg(page)) 2896 __ClearPageKmemcg(page); 2897 2898 css_put_many(&memcg->css, nr_pages); 2899 } 2900 #endif /* CONFIG_MEMCG_KMEM */ 2901 2902 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2903 2904 /* 2905 * Because tail pages are not marked as "used", set it. We're under 2906 * pgdat->lru_lock and migration entries setup in all page mappings. 2907 */ 2908 void mem_cgroup_split_huge_fixup(struct page *head) 2909 { 2910 int i; 2911 2912 if (mem_cgroup_disabled()) 2913 return; 2914 2915 for (i = 1; i < HPAGE_PMD_NR; i++) 2916 head[i].mem_cgroup = head->mem_cgroup; 2917 2918 __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); 2919 } 2920 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2921 2922 #ifdef CONFIG_MEMCG_SWAP 2923 /** 2924 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2925 * @entry: swap entry to be moved 2926 * @from: mem_cgroup which the entry is moved from 2927 * @to: mem_cgroup which the entry is moved to 2928 * 2929 * It succeeds only when the swap_cgroup's record for this entry is the same 2930 * as the mem_cgroup's id of @from. 2931 * 2932 * Returns 0 on success, -EINVAL on failure. 2933 * 2934 * The caller must have charged to @to, IOW, called page_counter_charge() about 2935 * both res and memsw, and called css_get(). 
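 *
 * On success the swap_cgroup record for @entry points at @to and one page
 * worth of MEMCG_SWAP accounting is moved from @from to @to (see the body
 * below); on failure nothing has been changed.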
2936 */ 2937 static int mem_cgroup_move_swap_account(swp_entry_t entry, 2938 struct mem_cgroup *from, struct mem_cgroup *to) 2939 { 2940 unsigned short old_id, new_id; 2941 2942 old_id = mem_cgroup_id(from); 2943 new_id = mem_cgroup_id(to); 2944 2945 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2946 mod_memcg_state(from, MEMCG_SWAP, -1); 2947 mod_memcg_state(to, MEMCG_SWAP, 1); 2948 return 0; 2949 } 2950 return -EINVAL; 2951 } 2952 #else 2953 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2954 struct mem_cgroup *from, struct mem_cgroup *to) 2955 { 2956 return -EINVAL; 2957 } 2958 #endif 2959 2960 static DEFINE_MUTEX(memcg_max_mutex); 2961 2962 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 2963 unsigned long max, bool memsw) 2964 { 2965 bool enlarge = false; 2966 bool drained = false; 2967 int ret; 2968 bool limits_invariant; 2969 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 2970 2971 do { 2972 if (signal_pending(current)) { 2973 ret = -EINTR; 2974 break; 2975 } 2976 2977 mutex_lock(&memcg_max_mutex); 2978 /* 2979 * Make sure that the new limit (memsw or memory limit) doesn't 2980 * break our basic invariant rule memory.max <= memsw.max. 2981 */ 2982 limits_invariant = memsw ? max >= memcg->memory.max : 2983 max <= memcg->memsw.max; 2984 if (!limits_invariant) { 2985 mutex_unlock(&memcg_max_mutex); 2986 ret = -EINVAL; 2987 break; 2988 } 2989 if (max > counter->max) 2990 enlarge = true; 2991 ret = page_counter_set_max(counter, max); 2992 mutex_unlock(&memcg_max_mutex); 2993 2994 if (!ret) 2995 break; 2996 2997 if (!drained) { 2998 drain_all_stock(memcg); 2999 drained = true; 3000 continue; 3001 } 3002 3003 if (!try_to_free_mem_cgroup_pages(memcg, 1, 3004 GFP_KERNEL, !memsw)) { 3005 ret = -EBUSY; 3006 break; 3007 } 3008 } while (true); 3009 3010 if (!ret && enlarge) 3011 memcg_oom_recover(memcg); 3012 3013 return ret; 3014 } 3015 3016 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 3017 gfp_t gfp_mask, 3018 unsigned long *total_scanned) 3019 { 3020 unsigned long nr_reclaimed = 0; 3021 struct mem_cgroup_per_node *mz, *next_mz = NULL; 3022 unsigned long reclaimed; 3023 int loop = 0; 3024 struct mem_cgroup_tree_per_node *mctz; 3025 unsigned long excess; 3026 unsigned long nr_scanned; 3027 3028 if (order > 0) 3029 return 0; 3030 3031 mctz = soft_limit_tree_node(pgdat->node_id); 3032 3033 /* 3034 * Do not even bother to check the largest node if the root 3035 * is empty. Do it lockless to prevent lock bouncing. Races 3036 * are acceptable as soft limit is best effort anyway. 
3037 */
3038 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3039 return 0;
3040
3041 /*
3042 * This loop can run a while, especially if memory cgroups continuously
3043 * keep exceeding their soft limit and putting the system under
3044 * pressure.
3045 */
3046 do {
3047 if (next_mz)
3048 mz = next_mz;
3049 else
3050 mz = mem_cgroup_largest_soft_limit_node(mctz);
3051 if (!mz)
3052 break;
3053
3054 nr_scanned = 0;
3055 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3056 gfp_mask, &nr_scanned);
3057 nr_reclaimed += reclaimed;
3058 *total_scanned += nr_scanned;
3059 spin_lock_irq(&mctz->lock);
3060 __mem_cgroup_remove_exceeded(mz, mctz);
3061
3062 /*
3063 * If we failed to reclaim anything from this memory cgroup,
3064 * it is time to move on to the next cgroup.
3065 */
3066 next_mz = NULL;
3067 if (!reclaimed)
3068 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3069
3070 excess = soft_limit_excess(mz->memcg);
3071 /*
3072 * One school of thought says that we should not add
3073 * back the node to the tree if reclaim returns 0.
3074 * But our reclaim could return 0 simply because, due
3075 * to priority, we are exposing a smaller subset of
3076 * memory to reclaim from. Consider this as a longer
3077 * term TODO.
3078 */
3079 /* If excess == 0, no tree ops */
3080 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3081 spin_unlock_irq(&mctz->lock);
3082 css_put(&mz->memcg->css);
3083 loop++;
3084 /*
3085 * Could not reclaim anything and there are no more
3086 * mem cgroups to try, or we seem to be looping without
3087 * reclaiming anything.
3088 */
3089 if (!nr_reclaimed &&
3090 (next_mz == NULL ||
3091 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3092 break;
3093 } while (!nr_reclaimed);
3094 if (next_mz)
3095 css_put(&next_mz->memcg->css);
3096 return nr_reclaimed;
3097 }
3098
3099 /*
3100 * Test whether @memcg has children, dead or alive. Note that this
3101 * function doesn't care whether @memcg has use_hierarchy enabled and
3102 * returns %true if there are child csses according to the cgroup
3103 * hierarchy. Testing use_hierarchy is the caller's responsibility.
3104 */
3105 static inline bool memcg_has_children(struct mem_cgroup *memcg)
3106 {
3107 bool ret;
3108
3109 rcu_read_lock();
3110 ret = css_next_child(NULL, &memcg->css);
3111 rcu_read_unlock();
3112 return ret;
3113 }
3114
3115 /*
3116 * Reclaims as many pages from the given memcg as possible.
3117 *
3118 * Caller is responsible for holding a css reference for memcg.
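 *
 * This is wired up below, via mem_cgroup_force_empty_write(), as the
 * handler for the cgroup1 force_empty control file. From userspace the
 * trigger is roughly (path and file name shown for illustration only):
 *
 *	echo 0 > /sys/fs/cgroup/memory/<group>/memory.force_empty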
3119 */
3120 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3121 {
3122 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3123
3124 /* we call try-to-free pages to make this cgroup empty */
3125 lru_add_drain_all();
3126
3127 drain_all_stock(memcg);
3128
3129 /* try to free all pages in this cgroup */
3130 while (nr_retries && page_counter_read(&memcg->memory)) {
3131 int progress;
3132
3133 if (signal_pending(current))
3134 return -EINTR;
3135
3136 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3137 GFP_KERNEL, true);
3138 if (!progress) {
3139 nr_retries--;
3140 /* maybe some writeback is necessary */
3141 congestion_wait(BLK_RW_ASYNC, HZ/10);
3142 }
3143
3144 }
3145
3146 return 0;
3147 }
3148
3149 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3150 char *buf, size_t nbytes,
3151 loff_t off)
3152 {
3153 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3154
3155 if (mem_cgroup_is_root(memcg))
3156 return -EINVAL;
3157 return mem_cgroup_force_empty(memcg) ?: nbytes;
3158 }
3159
3160 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3161 struct cftype *cft)
3162 {
3163 return mem_cgroup_from_css(css)->use_hierarchy;
3164 }
3165
3166 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3167 struct cftype *cft, u64 val)
3168 {
3169 int retval = 0;
3170 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3171 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3172
3173 if (memcg->use_hierarchy == val)
3174 return 0;
3175
3176 /*
3177 * If the parent's use_hierarchy is set, we can't make any modifications
3178 * in the child subtrees. If it is unset, then the change can
3179 * occur, provided the current cgroup has no children.
3180 *
3181 * For the root cgroup, parent_memcg is NULL; we allow the value to be
3182 * set if there are no children.
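 *
 * In short, an illustrative summary of the checks below:
 *
 *	parent use_hierarchy == 1                 -> -EINVAL
 *	parent use_hierarchy == 0, has children   -> -EBUSY
 *	parent use_hierarchy == 0, no children    -> accept val of 0 or 1
 *
 * (any value other than 0 or 1 is rejected with -EINVAL as well).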
3183 */ 3184 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3185 (val == 1 || val == 0)) { 3186 if (!memcg_has_children(memcg)) 3187 memcg->use_hierarchy = val; 3188 else 3189 retval = -EBUSY; 3190 } else 3191 retval = -EINVAL; 3192 3193 return retval; 3194 } 3195 3196 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3197 { 3198 unsigned long val; 3199 3200 if (mem_cgroup_is_root(memcg)) { 3201 val = memcg_page_state(memcg, MEMCG_CACHE) + 3202 memcg_page_state(memcg, MEMCG_RSS); 3203 if (swap) 3204 val += memcg_page_state(memcg, MEMCG_SWAP); 3205 } else { 3206 if (!swap) 3207 val = page_counter_read(&memcg->memory); 3208 else 3209 val = page_counter_read(&memcg->memsw); 3210 } 3211 return val; 3212 } 3213 3214 enum { 3215 RES_USAGE, 3216 RES_LIMIT, 3217 RES_MAX_USAGE, 3218 RES_FAILCNT, 3219 RES_SOFT_LIMIT, 3220 }; 3221 3222 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3223 struct cftype *cft) 3224 { 3225 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3226 struct page_counter *counter; 3227 3228 switch (MEMFILE_TYPE(cft->private)) { 3229 case _MEM: 3230 counter = &memcg->memory; 3231 break; 3232 case _MEMSWAP: 3233 counter = &memcg->memsw; 3234 break; 3235 case _KMEM: 3236 counter = &memcg->kmem; 3237 break; 3238 case _TCP: 3239 counter = &memcg->tcpmem; 3240 break; 3241 default: 3242 BUG(); 3243 } 3244 3245 switch (MEMFILE_ATTR(cft->private)) { 3246 case RES_USAGE: 3247 if (counter == &memcg->memory) 3248 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3249 if (counter == &memcg->memsw) 3250 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3251 return (u64)page_counter_read(counter) * PAGE_SIZE; 3252 case RES_LIMIT: 3253 return (u64)counter->max * PAGE_SIZE; 3254 case RES_MAX_USAGE: 3255 return (u64)counter->watermark * PAGE_SIZE; 3256 case RES_FAILCNT: 3257 return counter->failcnt; 3258 case RES_SOFT_LIMIT: 3259 return (u64)memcg->soft_limit * PAGE_SIZE; 3260 default: 3261 BUG(); 3262 } 3263 } 3264 3265 static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) 3266 { 3267 unsigned long stat[MEMCG_NR_STAT]; 3268 struct mem_cgroup *mi; 3269 int node, cpu, i; 3270 int min_idx, max_idx; 3271 3272 if (slab_only) { 3273 min_idx = NR_SLAB_RECLAIMABLE; 3274 max_idx = NR_SLAB_UNRECLAIMABLE; 3275 } else { 3276 min_idx = 0; 3277 max_idx = MEMCG_NR_STAT; 3278 } 3279 3280 for (i = min_idx; i < max_idx; i++) 3281 stat[i] = 0; 3282 3283 for_each_online_cpu(cpu) 3284 for (i = min_idx; i < max_idx; i++) 3285 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); 3286 3287 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 3288 for (i = min_idx; i < max_idx; i++) 3289 atomic_long_add(stat[i], &mi->vmstats[i]); 3290 3291 if (!slab_only) 3292 max_idx = NR_VM_NODE_STAT_ITEMS; 3293 3294 for_each_node(node) { 3295 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 3296 struct mem_cgroup_per_node *pi; 3297 3298 for (i = min_idx; i < max_idx; i++) 3299 stat[i] = 0; 3300 3301 for_each_online_cpu(cpu) 3302 for (i = min_idx; i < max_idx; i++) 3303 stat[i] += per_cpu( 3304 pn->lruvec_stat_cpu->count[i], cpu); 3305 3306 for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) 3307 for (i = min_idx; i < max_idx; i++) 3308 atomic_long_add(stat[i], &pi->lruvec_stat[i]); 3309 } 3310 } 3311 3312 static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) 3313 { 3314 unsigned long events[NR_VM_EVENT_ITEMS]; 3315 struct mem_cgroup *mi; 3316 int cpu, i; 3317 3318 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3319 events[i] = 0; 3320 
3321 for_each_online_cpu(cpu) 3322 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3323 events[i] += per_cpu(memcg->vmstats_percpu->events[i], 3324 cpu); 3325 3326 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 3327 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3328 atomic_long_add(events[i], &mi->vmevents[i]); 3329 } 3330 3331 #ifdef CONFIG_MEMCG_KMEM 3332 static int memcg_online_kmem(struct mem_cgroup *memcg) 3333 { 3334 int memcg_id; 3335 3336 if (cgroup_memory_nokmem) 3337 return 0; 3338 3339 BUG_ON(memcg->kmemcg_id >= 0); 3340 BUG_ON(memcg->kmem_state); 3341 3342 memcg_id = memcg_alloc_cache_id(); 3343 if (memcg_id < 0) 3344 return memcg_id; 3345 3346 static_branch_inc(&memcg_kmem_enabled_key); 3347 /* 3348 * A memory cgroup is considered kmem-online as soon as it gets 3349 * kmemcg_id. Setting the id after enabling static branching will 3350 * guarantee no one starts accounting before all call sites are 3351 * patched. 3352 */ 3353 memcg->kmemcg_id = memcg_id; 3354 memcg->kmem_state = KMEM_ONLINE; 3355 INIT_LIST_HEAD(&memcg->kmem_caches); 3356 3357 return 0; 3358 } 3359 3360 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3361 { 3362 struct cgroup_subsys_state *css; 3363 struct mem_cgroup *parent, *child; 3364 int kmemcg_id; 3365 3366 if (memcg->kmem_state != KMEM_ONLINE) 3367 return; 3368 /* 3369 * Clear the online state before clearing memcg_caches array 3370 * entries. The slab_mutex in memcg_deactivate_kmem_caches() 3371 * guarantees that no cache will be created for this cgroup 3372 * after we are done (see memcg_create_kmem_cache()). 3373 */ 3374 memcg->kmem_state = KMEM_ALLOCATED; 3375 3376 parent = parent_mem_cgroup(memcg); 3377 if (!parent) 3378 parent = root_mem_cgroup; 3379 3380 /* 3381 * Deactivate and reparent kmem_caches. Then flush percpu 3382 * slab statistics to have precise values at the parent and 3383 * all ancestor levels. It's required to keep slab stats 3384 * accurate after the reparenting of kmem_caches. 3385 */ 3386 memcg_deactivate_kmem_caches(memcg, parent); 3387 memcg_flush_percpu_vmstats(memcg, true); 3388 3389 kmemcg_id = memcg->kmemcg_id; 3390 BUG_ON(kmemcg_id < 0); 3391 3392 /* 3393 * Change kmemcg_id of this cgroup and all its descendants to the 3394 * parent's id, and then move all entries from this cgroup's list_lrus 3395 * to ones of the parent. After we have finished, all list_lrus 3396 * corresponding to this cgroup are guaranteed to remain empty. The 3397 * ordering is imposed by list_lru_node->lock taken by 3398 * memcg_drain_all_list_lrus(). 
3399 */ 3400 rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ 3401 css_for_each_descendant_pre(css, &memcg->css) { 3402 child = mem_cgroup_from_css(css); 3403 BUG_ON(child->kmemcg_id != kmemcg_id); 3404 child->kmemcg_id = parent->kmemcg_id; 3405 if (!memcg->use_hierarchy) 3406 break; 3407 } 3408 rcu_read_unlock(); 3409 3410 memcg_drain_all_list_lrus(kmemcg_id, parent); 3411 3412 memcg_free_cache_id(kmemcg_id); 3413 } 3414 3415 static void memcg_free_kmem(struct mem_cgroup *memcg) 3416 { 3417 /* css_alloc() failed, offlining didn't happen */ 3418 if (unlikely(memcg->kmem_state == KMEM_ONLINE)) 3419 memcg_offline_kmem(memcg); 3420 3421 if (memcg->kmem_state == KMEM_ALLOCATED) { 3422 WARN_ON(!list_empty(&memcg->kmem_caches)); 3423 static_branch_dec(&memcg_kmem_enabled_key); 3424 } 3425 } 3426 #else 3427 static int memcg_online_kmem(struct mem_cgroup *memcg) 3428 { 3429 return 0; 3430 } 3431 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3432 { 3433 } 3434 static void memcg_free_kmem(struct mem_cgroup *memcg) 3435 { 3436 } 3437 #endif /* CONFIG_MEMCG_KMEM */ 3438 3439 static int memcg_update_kmem_max(struct mem_cgroup *memcg, 3440 unsigned long max) 3441 { 3442 int ret; 3443 3444 mutex_lock(&memcg_max_mutex); 3445 ret = page_counter_set_max(&memcg->kmem, max); 3446 mutex_unlock(&memcg_max_mutex); 3447 return ret; 3448 } 3449 3450 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3451 { 3452 int ret; 3453 3454 mutex_lock(&memcg_max_mutex); 3455 3456 ret = page_counter_set_max(&memcg->tcpmem, max); 3457 if (ret) 3458 goto out; 3459 3460 if (!memcg->tcpmem_active) { 3461 /* 3462 * The active flag needs to be written after the static_key 3463 * update. This is what guarantees that the socket activation 3464 * function is the last one to run. See mem_cgroup_sk_alloc() 3465 * for details, and note that we don't mark any socket as 3466 * belonging to this memcg until that flag is up. 3467 * 3468 * We need to do this, because static_keys will span multiple 3469 * sites, but we can't control their order. If we mark a socket 3470 * as accounted, but the accounting functions are not patched in 3471 * yet, we'll lose accounting. 3472 * 3473 * We never race with the readers in mem_cgroup_sk_alloc(), 3474 * because when this value change, the code to process it is not 3475 * patched in yet. 3476 */ 3477 static_branch_inc(&memcg_sockets_enabled_key); 3478 memcg->tcpmem_active = true; 3479 } 3480 out: 3481 mutex_unlock(&memcg_max_mutex); 3482 return ret; 3483 } 3484 3485 /* 3486 * The user of this function is... 3487 * RES_LIMIT. 
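 *
 * (It also handles RES_SOFT_LIMIT. These back the cgroup1 limit files;
 * an illustrative invocation, assuming the conventional v1 mount point:
 *
 *	echo 512M > /sys/fs/cgroup/memory/<group>/memory.limit_in_bytes
 *
 * which ends up in mem_cgroup_resize_max() below.)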
3488 */ 3489 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3490 char *buf, size_t nbytes, loff_t off) 3491 { 3492 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3493 unsigned long nr_pages; 3494 int ret; 3495 3496 buf = strstrip(buf); 3497 ret = page_counter_memparse(buf, "-1", &nr_pages); 3498 if (ret) 3499 return ret; 3500 3501 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3502 case RES_LIMIT: 3503 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3504 ret = -EINVAL; 3505 break; 3506 } 3507 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3508 case _MEM: 3509 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3510 break; 3511 case _MEMSWAP: 3512 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3513 break; 3514 case _KMEM: 3515 ret = memcg_update_kmem_max(memcg, nr_pages); 3516 break; 3517 case _TCP: 3518 ret = memcg_update_tcp_max(memcg, nr_pages); 3519 break; 3520 } 3521 break; 3522 case RES_SOFT_LIMIT: 3523 memcg->soft_limit = nr_pages; 3524 ret = 0; 3525 break; 3526 } 3527 return ret ?: nbytes; 3528 } 3529 3530 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3531 size_t nbytes, loff_t off) 3532 { 3533 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3534 struct page_counter *counter; 3535 3536 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3537 case _MEM: 3538 counter = &memcg->memory; 3539 break; 3540 case _MEMSWAP: 3541 counter = &memcg->memsw; 3542 break; 3543 case _KMEM: 3544 counter = &memcg->kmem; 3545 break; 3546 case _TCP: 3547 counter = &memcg->tcpmem; 3548 break; 3549 default: 3550 BUG(); 3551 } 3552 3553 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3554 case RES_MAX_USAGE: 3555 page_counter_reset_watermark(counter); 3556 break; 3557 case RES_FAILCNT: 3558 counter->failcnt = 0; 3559 break; 3560 default: 3561 BUG(); 3562 } 3563 3564 return nbytes; 3565 } 3566 3567 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3568 struct cftype *cft) 3569 { 3570 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3571 } 3572 3573 #ifdef CONFIG_MMU 3574 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3575 struct cftype *cft, u64 val) 3576 { 3577 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3578 3579 if (val & ~MOVE_MASK) 3580 return -EINVAL; 3581 3582 /* 3583 * No kind of locking is needed in here, because ->can_attach() will 3584 * check this value once in the beginning of the process, and then carry 3585 * on with stale data. This means that changes to this value will only 3586 * affect task migrations starting after the change. 
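 *
 * Illustrative example: assuming the conventional bit layout (0x1 for
 * anon, 0x2 for file, not visible in this excerpt), writing 3 to the
 * cgroup1 memory.move_charge_at_immigrate file asks for both anon and
 * file charges to follow a migrating task; any bit outside MOVE_MASK is
 * rejected above.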
3587 */ 3588 memcg->move_charge_at_immigrate = val; 3589 return 0; 3590 } 3591 #else 3592 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3593 struct cftype *cft, u64 val) 3594 { 3595 return -ENOSYS; 3596 } 3597 #endif 3598 3599 #ifdef CONFIG_NUMA 3600 3601 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3602 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3603 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3604 3605 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3606 int nid, unsigned int lru_mask) 3607 { 3608 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); 3609 unsigned long nr = 0; 3610 enum lru_list lru; 3611 3612 VM_BUG_ON((unsigned)nid >= nr_node_ids); 3613 3614 for_each_lru(lru) { 3615 if (!(BIT(lru) & lru_mask)) 3616 continue; 3617 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 3618 } 3619 return nr; 3620 } 3621 3622 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 3623 unsigned int lru_mask) 3624 { 3625 unsigned long nr = 0; 3626 enum lru_list lru; 3627 3628 for_each_lru(lru) { 3629 if (!(BIT(lru) & lru_mask)) 3630 continue; 3631 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 3632 } 3633 return nr; 3634 } 3635 3636 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3637 { 3638 struct numa_stat { 3639 const char *name; 3640 unsigned int lru_mask; 3641 }; 3642 3643 static const struct numa_stat stats[] = { 3644 { "total", LRU_ALL }, 3645 { "file", LRU_ALL_FILE }, 3646 { "anon", LRU_ALL_ANON }, 3647 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3648 }; 3649 const struct numa_stat *stat; 3650 int nid; 3651 unsigned long nr; 3652 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3653 3654 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3655 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 3656 seq_printf(m, "%s=%lu", stat->name, nr); 3657 for_each_node_state(nid, N_MEMORY) { 3658 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 3659 stat->lru_mask); 3660 seq_printf(m, " N%d=%lu", nid, nr); 3661 } 3662 seq_putc(m, '\n'); 3663 } 3664 3665 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3666 struct mem_cgroup *iter; 3667 3668 nr = 0; 3669 for_each_mem_cgroup_tree(iter, memcg) 3670 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 3671 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 3672 for_each_node_state(nid, N_MEMORY) { 3673 nr = 0; 3674 for_each_mem_cgroup_tree(iter, memcg) 3675 nr += mem_cgroup_node_nr_lru_pages( 3676 iter, nid, stat->lru_mask); 3677 seq_printf(m, " N%d=%lu", nid, nr); 3678 } 3679 seq_putc(m, '\n'); 3680 } 3681 3682 return 0; 3683 } 3684 #endif /* CONFIG_NUMA */ 3685 3686 static const unsigned int memcg1_stats[] = { 3687 MEMCG_CACHE, 3688 MEMCG_RSS, 3689 MEMCG_RSS_HUGE, 3690 NR_SHMEM, 3691 NR_FILE_MAPPED, 3692 NR_FILE_DIRTY, 3693 NR_WRITEBACK, 3694 MEMCG_SWAP, 3695 }; 3696 3697 static const char *const memcg1_stat_names[] = { 3698 "cache", 3699 "rss", 3700 "rss_huge", 3701 "shmem", 3702 "mapped_file", 3703 "dirty", 3704 "writeback", 3705 "swap", 3706 }; 3707 3708 /* Universal VM events cgroup1 shows, original sort order */ 3709 static const unsigned int memcg1_events[] = { 3710 PGPGIN, 3711 PGPGOUT, 3712 PGFAULT, 3713 PGMAJFAULT, 3714 }; 3715 3716 static const char *const memcg1_event_names[] = { 3717 "pgpgin", 3718 "pgpgout", 3719 "pgfault", 3720 "pgmajfault", 3721 }; 3722 3723 static int memcg_stat_show(struct seq_file *m, void *v) 3724 { 3725 struct mem_cgroup *memcg = 
mem_cgroup_from_seq(m); 3726 unsigned long memory, memsw; 3727 struct mem_cgroup *mi; 3728 unsigned int i; 3729 3730 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 3731 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3732 3733 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 3734 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 3735 continue; 3736 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], 3737 memcg_page_state_local(memcg, memcg1_stats[i]) * 3738 PAGE_SIZE); 3739 } 3740 3741 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 3742 seq_printf(m, "%s %lu\n", memcg1_event_names[i], 3743 memcg_events_local(memcg, memcg1_events[i])); 3744 3745 for (i = 0; i < NR_LRU_LISTS; i++) 3746 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 3747 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 3748 PAGE_SIZE); 3749 3750 /* Hierarchical information */ 3751 memory = memsw = PAGE_COUNTER_MAX; 3752 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 3753 memory = min(memory, mi->memory.max); 3754 memsw = min(memsw, mi->memsw.max); 3755 } 3756 seq_printf(m, "hierarchical_memory_limit %llu\n", 3757 (u64)memory * PAGE_SIZE); 3758 if (do_memsw_account()) 3759 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3760 (u64)memsw * PAGE_SIZE); 3761 3762 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 3763 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 3764 continue; 3765 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 3766 (u64)memcg_page_state(memcg, memcg1_stats[i]) * 3767 PAGE_SIZE); 3768 } 3769 3770 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 3771 seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], 3772 (u64)memcg_events(memcg, memcg1_events[i])); 3773 3774 for (i = 0; i < NR_LRU_LISTS; i++) 3775 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], 3776 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 3777 PAGE_SIZE); 3778 3779 #ifdef CONFIG_DEBUG_VM 3780 { 3781 pg_data_t *pgdat; 3782 struct mem_cgroup_per_node *mz; 3783 struct zone_reclaim_stat *rstat; 3784 unsigned long recent_rotated[2] = {0, 0}; 3785 unsigned long recent_scanned[2] = {0, 0}; 3786 3787 for_each_online_pgdat(pgdat) { 3788 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); 3789 rstat = &mz->lruvec.reclaim_stat; 3790 3791 recent_rotated[0] += rstat->recent_rotated[0]; 3792 recent_rotated[1] += rstat->recent_rotated[1]; 3793 recent_scanned[0] += rstat->recent_scanned[0]; 3794 recent_scanned[1] += rstat->recent_scanned[1]; 3795 } 3796 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 3797 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 3798 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 3799 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 3800 } 3801 #endif 3802 3803 return 0; 3804 } 3805 3806 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 3807 struct cftype *cft) 3808 { 3809 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3810 3811 return mem_cgroup_swappiness(memcg); 3812 } 3813 3814 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 3815 struct cftype *cft, u64 val) 3816 { 3817 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3818 3819 if (val > 100) 3820 return -EINVAL; 3821 3822 if (css->parent) 3823 memcg->swappiness = val; 3824 else 3825 vm_swappiness = val; 3826 3827 return 0; 3828 } 3829 3830 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3831 { 3832 struct mem_cgroup_threshold_ary *t; 3833 unsigned long usage; 3834 int i; 
3835 3836 rcu_read_lock(); 3837 if (!swap) 3838 t = rcu_dereference(memcg->thresholds.primary); 3839 else 3840 t = rcu_dereference(memcg->memsw_thresholds.primary); 3841 3842 if (!t) 3843 goto unlock; 3844 3845 usage = mem_cgroup_usage(memcg, swap); 3846 3847 /* 3848 * current_threshold points to threshold just below or equal to usage. 3849 * If it's not true, a threshold was crossed after last 3850 * call of __mem_cgroup_threshold(). 3851 */ 3852 i = t->current_threshold; 3853 3854 /* 3855 * Iterate backward over array of thresholds starting from 3856 * current_threshold and check if a threshold is crossed. 3857 * If none of thresholds below usage is crossed, we read 3858 * only one element of the array here. 3859 */ 3860 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3861 eventfd_signal(t->entries[i].eventfd, 1); 3862 3863 /* i = current_threshold + 1 */ 3864 i++; 3865 3866 /* 3867 * Iterate forward over array of thresholds starting from 3868 * current_threshold+1 and check if a threshold is crossed. 3869 * If none of thresholds above usage is crossed, we read 3870 * only one element of the array here. 3871 */ 3872 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3873 eventfd_signal(t->entries[i].eventfd, 1); 3874 3875 /* Update current_threshold */ 3876 t->current_threshold = i - 1; 3877 unlock: 3878 rcu_read_unlock(); 3879 } 3880 3881 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3882 { 3883 while (memcg) { 3884 __mem_cgroup_threshold(memcg, false); 3885 if (do_memsw_account()) 3886 __mem_cgroup_threshold(memcg, true); 3887 3888 memcg = parent_mem_cgroup(memcg); 3889 } 3890 } 3891 3892 static int compare_thresholds(const void *a, const void *b) 3893 { 3894 const struct mem_cgroup_threshold *_a = a; 3895 const struct mem_cgroup_threshold *_b = b; 3896 3897 if (_a->threshold > _b->threshold) 3898 return 1; 3899 3900 if (_a->threshold < _b->threshold) 3901 return -1; 3902 3903 return 0; 3904 } 3905 3906 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 3907 { 3908 struct mem_cgroup_eventfd_list *ev; 3909 3910 spin_lock(&memcg_oom_lock); 3911 3912 list_for_each_entry(ev, &memcg->oom_notify, list) 3913 eventfd_signal(ev->eventfd, 1); 3914 3915 spin_unlock(&memcg_oom_lock); 3916 return 0; 3917 } 3918 3919 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 3920 { 3921 struct mem_cgroup *iter; 3922 3923 for_each_mem_cgroup_tree(iter, memcg) 3924 mem_cgroup_oom_notify_cb(iter); 3925 } 3926 3927 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3928 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 3929 { 3930 struct mem_cgroup_thresholds *thresholds; 3931 struct mem_cgroup_threshold_ary *new; 3932 unsigned long threshold; 3933 unsigned long usage; 3934 int i, size, ret; 3935 3936 ret = page_counter_memparse(args, "-1", &threshold); 3937 if (ret) 3938 return ret; 3939 3940 mutex_lock(&memcg->thresholds_lock); 3941 3942 if (type == _MEM) { 3943 thresholds = &memcg->thresholds; 3944 usage = mem_cgroup_usage(memcg, false); 3945 } else if (type == _MEMSWAP) { 3946 thresholds = &memcg->memsw_thresholds; 3947 usage = mem_cgroup_usage(memcg, true); 3948 } else 3949 BUG(); 3950 3951 /* Check if a threshold crossed before adding a new one */ 3952 if (thresholds->primary) 3953 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3954 3955 size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; 3956 3957 /* Allocate memory for new array of thresholds */ 3958 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 3959 if (!new) { 3960 ret = -ENOMEM; 3961 goto unlock; 3962 } 3963 new->size = size; 3964 3965 /* Copy thresholds (if any) to new array */ 3966 if (thresholds->primary) { 3967 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3968 sizeof(struct mem_cgroup_threshold)); 3969 } 3970 3971 /* Add new threshold */ 3972 new->entries[size - 1].eventfd = eventfd; 3973 new->entries[size - 1].threshold = threshold; 3974 3975 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3976 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3977 compare_thresholds, NULL); 3978 3979 /* Find current threshold */ 3980 new->current_threshold = -1; 3981 for (i = 0; i < size; i++) { 3982 if (new->entries[i].threshold <= usage) { 3983 /* 3984 * new->current_threshold will not be used until 3985 * rcu_assign_pointer(), so it's safe to increment 3986 * it here. 3987 */ 3988 ++new->current_threshold; 3989 } else 3990 break; 3991 } 3992 3993 /* Free old spare buffer and save old primary buffer as spare */ 3994 kfree(thresholds->spare); 3995 thresholds->spare = thresholds->primary; 3996 3997 rcu_assign_pointer(thresholds->primary, new); 3998 3999 /* To be sure that nobody uses thresholds */ 4000 synchronize_rcu(); 4001 4002 unlock: 4003 mutex_unlock(&memcg->thresholds_lock); 4004 4005 return ret; 4006 } 4007 4008 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4009 struct eventfd_ctx *eventfd, const char *args) 4010 { 4011 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4012 } 4013 4014 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4015 struct eventfd_ctx *eventfd, const char *args) 4016 { 4017 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4018 } 4019 4020 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4021 struct eventfd_ctx *eventfd, enum res_type type) 4022 { 4023 struct mem_cgroup_thresholds *thresholds; 4024 struct mem_cgroup_threshold_ary *new; 4025 unsigned long usage; 4026 int i, j, size; 4027 4028 mutex_lock(&memcg->thresholds_lock); 4029 4030 if (type == _MEM) { 4031 thresholds = &memcg->thresholds; 4032 usage = mem_cgroup_usage(memcg, false); 4033 } else if (type == _MEMSWAP) { 4034 thresholds = &memcg->memsw_thresholds; 4035 usage = mem_cgroup_usage(memcg, true); 4036 } else 4037 BUG(); 4038 4039 if (!thresholds->primary) 4040 goto unlock; 4041 4042 /* Check if a threshold crossed before removing */ 4043 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4044 4045 /* Calculate new number of threshold */ 4046 size = 0; 4047 for (i = 0; i < thresholds->primary->size; i++) { 4048 if (thresholds->primary->entries[i].eventfd != eventfd) 4049 size++; 4050 } 4051 4052 new = thresholds->spare; 4053 4054 /* Set thresholds array to NULL if we don't have thresholds */ 4055 if (!size) { 4056 kfree(new); 4057 new = NULL; 4058 goto swap_buffers; 4059 } 4060 4061 new->size = size; 4062 4063 /* Copy thresholds and find current threshold */ 4064 new->current_threshold = -1; 4065 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4066 if (thresholds->primary->entries[i].eventfd == eventfd) 4067 continue; 4068 4069 new->entries[j] = thresholds->primary->entries[i]; 4070 if (new->entries[j].threshold <= usage) { 4071 /* 4072 * new->current_threshold will not be used 4073 * until rcu_assign_pointer(), so 
it's safe to increment 4074 * it here. 4075 */ 4076 ++new->current_threshold; 4077 } 4078 j++; 4079 } 4080 4081 swap_buffers: 4082 /* Swap primary and spare array */ 4083 thresholds->spare = thresholds->primary; 4084 4085 rcu_assign_pointer(thresholds->primary, new); 4086 4087 /* To be sure that nobody uses thresholds */ 4088 synchronize_rcu(); 4089 4090 /* If all events are unregistered, free the spare array */ 4091 if (!new) { 4092 kfree(thresholds->spare); 4093 thresholds->spare = NULL; 4094 } 4095 unlock: 4096 mutex_unlock(&memcg->thresholds_lock); 4097 } 4098 4099 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4100 struct eventfd_ctx *eventfd) 4101 { 4102 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4103 } 4104 4105 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4106 struct eventfd_ctx *eventfd) 4107 { 4108 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4109 } 4110 4111 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4112 struct eventfd_ctx *eventfd, const char *args) 4113 { 4114 struct mem_cgroup_eventfd_list *event; 4115 4116 event = kmalloc(sizeof(*event), GFP_KERNEL); 4117 if (!event) 4118 return -ENOMEM; 4119 4120 spin_lock(&memcg_oom_lock); 4121 4122 event->eventfd = eventfd; 4123 list_add(&event->list, &memcg->oom_notify); 4124 4125 /* already in OOM ? */ 4126 if (memcg->under_oom) 4127 eventfd_signal(eventfd, 1); 4128 spin_unlock(&memcg_oom_lock); 4129 4130 return 0; 4131 } 4132 4133 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4134 struct eventfd_ctx *eventfd) 4135 { 4136 struct mem_cgroup_eventfd_list *ev, *tmp; 4137 4138 spin_lock(&memcg_oom_lock); 4139 4140 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4141 if (ev->eventfd == eventfd) { 4142 list_del(&ev->list); 4143 kfree(ev); 4144 } 4145 } 4146 4147 spin_unlock(&memcg_oom_lock); 4148 } 4149 4150 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4151 { 4152 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4153 4154 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4155 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4156 seq_printf(sf, "oom_kill %lu\n", 4157 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4158 return 0; 4159 } 4160 4161 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4162 struct cftype *cft, u64 val) 4163 { 4164 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4165 4166 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4167 if (!css->parent || !((val == 0) || (val == 1))) 4168 return -EINVAL; 4169 4170 memcg->oom_kill_disable = val; 4171 if (!val) 4172 memcg_oom_recover(memcg); 4173 4174 return 0; 4175 } 4176 4177 #ifdef CONFIG_CGROUP_WRITEBACK 4178 4179 #include <trace/events/writeback.h> 4180 4181 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4182 { 4183 return wb_domain_init(&memcg->cgwb_domain, gfp); 4184 } 4185 4186 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4187 { 4188 wb_domain_exit(&memcg->cgwb_domain); 4189 } 4190 4191 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4192 { 4193 wb_domain_size_changed(&memcg->cgwb_domain); 4194 } 4195 4196 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4197 { 4198 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4199 4200 if (!memcg->css.parent) 4201 return NULL; 4202 4203 return &memcg->cgwb_domain; 4204 } 4205 4206 /* 4207 * 
idx can be of type enum memcg_stat_item or node_stat_item. 4208 * Keep in sync with memcg_exact_page(). 4209 */ 4210 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) 4211 { 4212 long x = atomic_long_read(&memcg->vmstats[idx]); 4213 int cpu; 4214 4215 for_each_online_cpu(cpu) 4216 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; 4217 if (x < 0) 4218 x = 0; 4219 return x; 4220 } 4221 4222 /** 4223 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4224 * @wb: bdi_writeback in question 4225 * @pfilepages: out parameter for number of file pages 4226 * @pheadroom: out parameter for number of allocatable pages according to memcg 4227 * @pdirty: out parameter for number of dirty pages 4228 * @pwriteback: out parameter for number of pages under writeback 4229 * 4230 * Determine the numbers of file, headroom, dirty, and writeback pages in 4231 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4232 * is a bit more involved. 4233 * 4234 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4235 * headroom is calculated as the lowest headroom of itself and the 4236 * ancestors. Note that this doesn't consider the actual amount of 4237 * available memory in the system. The caller should further cap 4238 * *@pheadroom accordingly. 4239 */ 4240 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4241 unsigned long *pheadroom, unsigned long *pdirty, 4242 unsigned long *pwriteback) 4243 { 4244 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4245 struct mem_cgroup *parent; 4246 4247 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); 4248 4249 /* this should eventually include NR_UNSTABLE_NFS */ 4250 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); 4251 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + 4252 memcg_exact_page_state(memcg, NR_ACTIVE_FILE); 4253 *pheadroom = PAGE_COUNTER_MAX; 4254 4255 while ((parent = parent_mem_cgroup(memcg))) { 4256 unsigned long ceiling = min(memcg->memory.max, memcg->high); 4257 unsigned long used = page_counter_read(&memcg->memory); 4258 4259 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4260 memcg = parent; 4261 } 4262 } 4263 4264 /* 4265 * Foreign dirty flushing 4266 * 4267 * There's an inherent mismatch between memcg and writeback. The former 4268 * tracks ownership per-page while the latter per-inode. This was a 4269 * deliberate design decision because honoring per-page ownership in the 4270 * writeback path is complicated, may lead to higher CPU and IO overheads 4271 * and deemed unnecessary given that write-sharing an inode across 4272 * different cgroups isn't a common use-case. 4273 * 4274 * Combined with inode majority-writer ownership switching, this works well 4275 * enough in most cases but there are some pathological cases. For 4276 * example, let's say there are two cgroups A and B which keep writing to 4277 * different but confined parts of the same inode. B owns the inode and 4278 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4279 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4280 * triggering background writeback. A will be slowed down without a way to 4281 * make writeback of the dirty pages happen.
4282 * 4283 * Conditions like the above can lead to a cgroup getting repeatedly and 4284 * severely throttled after making some progress after each 4285 * dirty_expire_interval while the underlying IO device is almost 4286 * completely idle. 4287 * 4288 * Solving this problem completely requires matching the ownership tracking 4289 * granularities between memcg and writeback in either direction. However, 4290 * the more egregious behaviors can be avoided by simply remembering the 4291 * most recent foreign dirtying events and initiating remote flushes on 4292 * them when local writeback isn't enough to keep the memory clean enough. 4293 * 4294 * The following two functions implement such a mechanism. When a foreign 4295 * page - a page whose memcg and writeback ownerships don't match - is 4296 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4297 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4298 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4299 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4300 * foreign bdi_writebacks which haven't expired. Both the numbers of 4301 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4302 * limited to MEMCG_CGWB_FRN_CNT. 4303 * 4304 * The mechanism only remembers IDs and doesn't hold any object references. 4305 * As being wrong occasionally doesn't matter, updates and accesses to the 4306 * records are lockless and racy. 4307 */ 4308 void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, 4309 struct bdi_writeback *wb) 4310 { 4311 struct mem_cgroup *memcg = page->mem_cgroup; 4312 struct memcg_cgwb_frn *frn; 4313 u64 now = get_jiffies_64(); 4314 u64 oldest_at = now; 4315 int oldest = -1; 4316 int i; 4317 4318 trace_track_foreign_dirty(page, wb); 4319 4320 /* 4321 * Pick the slot to use. If there is already a slot for @wb, keep 4322 * using it. If not, replace the oldest one which isn't being 4323 * written out. 4324 */ 4325 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4326 frn = &memcg->cgwb_frn[i]; 4327 if (frn->bdi_id == wb->bdi->id && 4328 frn->memcg_id == wb->memcg_css->id) 4329 break; 4330 if (time_before64(frn->at, oldest_at) && 4331 atomic_read(&frn->done.cnt) == 1) { 4332 oldest = i; 4333 oldest_at = frn->at; 4334 } 4335 } 4336 4337 if (i < MEMCG_CGWB_FRN_CNT) { 4338 /* 4339 * Re-using an existing one. Update timestamp lazily to 4340 * avoid making the cacheline hot. We want them to be 4341 * reasonably up-to-date and significantly shorter than 4342 * dirty_expire_interval as that's what expires the record. 4343 * Use the shorter of 1s and dirty_expire_interval / 8.
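 *
 * (Illustrative arithmetic, assuming the default dirty_expire_interval of
 * 3000 centisecs, i.e. 30s: 30s / 8 = 3.75s, so the 1s cap is what takes
 * effect here; the interval only drops below one second once the expiry
 * is configured to less than 8 seconds.)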
4344 */ 4345 unsigned long update_intv = 4346 min_t(unsigned long, HZ, 4347 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4348 4349 if (time_before64(frn->at, now - update_intv)) 4350 frn->at = now; 4351 } else if (oldest >= 0) { 4352 /* replace the oldest free one */ 4353 frn = &memcg->cgwb_frn[oldest]; 4354 frn->bdi_id = wb->bdi->id; 4355 frn->memcg_id = wb->memcg_css->id; 4356 frn->at = now; 4357 } 4358 } 4359 4360 /* issue foreign writeback flushes for recorded foreign dirtying events */ 4361 void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4362 { 4363 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4364 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4365 u64 now = jiffies_64; 4366 int i; 4367 4368 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4369 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4370 4371 /* 4372 * If the record is older than dirty_expire_interval, 4373 * writeback on it has already started. No need to kick it 4374 * off again. Also, don't start a new one if there's 4375 * already one in flight. 4376 */ 4377 if (time_after64(frn->at, now - intv) && 4378 atomic_read(&frn->done.cnt) == 1) { 4379 frn->at = 0; 4380 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4381 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, 4382 WB_REASON_FOREIGN_FLUSH, 4383 &frn->done); 4384 } 4385 } 4386 } 4387 4388 #else /* CONFIG_CGROUP_WRITEBACK */ 4389 4390 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4391 { 4392 return 0; 4393 } 4394 4395 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4396 { 4397 } 4398 4399 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4400 { 4401 } 4402 4403 #endif /* CONFIG_CGROUP_WRITEBACK */ 4404 4405 /* 4406 * DO NOT USE IN NEW FILES. 4407 * 4408 * "cgroup.event_control" implementation. 4409 * 4410 * This is way over-engineered. It tries to support fully configurable 4411 * events for each user. Such level of flexibility is completely 4412 * unnecessary especially in the light of the planned unified hierarchy. 4413 * 4414 * Please deprecate this and replace with something simpler if at all 4415 * possible. 4416 */ 4417 4418 /* 4419 * Unregister event and free resources. 4420 * 4421 * Gets called from workqueue. 4422 */ 4423 static void memcg_event_remove(struct work_struct *work) 4424 { 4425 struct mem_cgroup_event *event = 4426 container_of(work, struct mem_cgroup_event, remove); 4427 struct mem_cgroup *memcg = event->memcg; 4428 4429 remove_wait_queue(event->wqh, &event->wait); 4430 4431 event->unregister_event(memcg, event->eventfd); 4432 4433 /* Notify userspace the event is going away. */ 4434 eventfd_signal(event->eventfd, 1); 4435 4436 eventfd_ctx_put(event->eventfd); 4437 kfree(event); 4438 css_put(&memcg->css); 4439 } 4440 4441 /* 4442 * Gets called on EPOLLHUP on eventfd when user closes it. 4443 * 4444 * Called with wqh->lock held and interrupts disabled. 4445 */ 4446 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4447 int sync, void *key) 4448 { 4449 struct mem_cgroup_event *event = 4450 container_of(wait, struct mem_cgroup_event, wait); 4451 struct mem_cgroup *memcg = event->memcg; 4452 __poll_t flags = key_to_poll(key); 4453 4454 if (flags & EPOLLHUP) { 4455 /* 4456 * If the event has been detached at cgroup removal, we 4457 * can simply return knowing the other side will cleanup 4458 * for us. 
4459 * 4460 * We can't race against event freeing since the other 4461 * side will require wqh->lock via remove_wait_queue(), 4462 * which we hold. 4463 */ 4464 spin_lock(&memcg->event_list_lock); 4465 if (!list_empty(&event->list)) { 4466 list_del_init(&event->list); 4467 /* 4468 * We are in atomic context, but cgroup_event_remove() 4469 * may sleep, so we have to call it in workqueue. 4470 */ 4471 schedule_work(&event->remove); 4472 } 4473 spin_unlock(&memcg->event_list_lock); 4474 } 4475 4476 return 0; 4477 } 4478 4479 static void memcg_event_ptable_queue_proc(struct file *file, 4480 wait_queue_head_t *wqh, poll_table *pt) 4481 { 4482 struct mem_cgroup_event *event = 4483 container_of(pt, struct mem_cgroup_event, pt); 4484 4485 event->wqh = wqh; 4486 add_wait_queue(wqh, &event->wait); 4487 } 4488 4489 /* 4490 * DO NOT USE IN NEW FILES. 4491 * 4492 * Parse input and register new cgroup event handler. 4493 * 4494 * Input must be in format '<event_fd> <control_fd> <args>'. 4495 * Interpretation of args is defined by control file implementation. 4496 */ 4497 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4498 char *buf, size_t nbytes, loff_t off) 4499 { 4500 struct cgroup_subsys_state *css = of_css(of); 4501 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4502 struct mem_cgroup_event *event; 4503 struct cgroup_subsys_state *cfile_css; 4504 unsigned int efd, cfd; 4505 struct fd efile; 4506 struct fd cfile; 4507 const char *name; 4508 char *endp; 4509 int ret; 4510 4511 buf = strstrip(buf); 4512 4513 efd = simple_strtoul(buf, &endp, 10); 4514 if (*endp != ' ') 4515 return -EINVAL; 4516 buf = endp + 1; 4517 4518 cfd = simple_strtoul(buf, &endp, 10); 4519 if ((*endp != ' ') && (*endp != '\0')) 4520 return -EINVAL; 4521 buf = endp + 1; 4522 4523 event = kzalloc(sizeof(*event), GFP_KERNEL); 4524 if (!event) 4525 return -ENOMEM; 4526 4527 event->memcg = memcg; 4528 INIT_LIST_HEAD(&event->list); 4529 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4530 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4531 INIT_WORK(&event->remove, memcg_event_remove); 4532 4533 efile = fdget(efd); 4534 if (!efile.file) { 4535 ret = -EBADF; 4536 goto out_kfree; 4537 } 4538 4539 event->eventfd = eventfd_ctx_fileget(efile.file); 4540 if (IS_ERR(event->eventfd)) { 4541 ret = PTR_ERR(event->eventfd); 4542 goto out_put_efile; 4543 } 4544 4545 cfile = fdget(cfd); 4546 if (!cfile.file) { 4547 ret = -EBADF; 4548 goto out_put_eventfd; 4549 } 4550 4551 /* the process need read permission on control file */ 4552 /* AV: shouldn't we check that it's been opened for read instead? */ 4553 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4554 if (ret < 0) 4555 goto out_put_cfile; 4556 4557 /* 4558 * Determine the event callbacks and set them in @event. This used 4559 * to be done via struct cftype but cgroup core no longer knows 4560 * about these events. The following is crude but the whole thing 4561 * is for compatibility anyway. 4562 * 4563 * DO NOT ADD NEW FILES. 
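 *
 * (Illustrative legacy-hierarchy usage, with made-up fd numbers: open an
 * eventfd and a control file such as memory.usage_in_bytes, then write
 * "<event_fd> <control_fd> <args>", e.g. "7 8 104857600", to
 * cgroup.event_control; the args string ("104857600", i.e. 100M) is handed
 * to the register_event() callback selected below and the eventfd is
 * signalled when that usage threshold is crossed.)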
4564 */ 4565 name = cfile.file->f_path.dentry->d_name.name; 4566 4567 if (!strcmp(name, "memory.usage_in_bytes")) { 4568 event->register_event = mem_cgroup_usage_register_event; 4569 event->unregister_event = mem_cgroup_usage_unregister_event; 4570 } else if (!strcmp(name, "memory.oom_control")) { 4571 event->register_event = mem_cgroup_oom_register_event; 4572 event->unregister_event = mem_cgroup_oom_unregister_event; 4573 } else if (!strcmp(name, "memory.pressure_level")) { 4574 event->register_event = vmpressure_register_event; 4575 event->unregister_event = vmpressure_unregister_event; 4576 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4577 event->register_event = memsw_cgroup_usage_register_event; 4578 event->unregister_event = memsw_cgroup_usage_unregister_event; 4579 } else { 4580 ret = -EINVAL; 4581 goto out_put_cfile; 4582 } 4583 4584 /* 4585 * Verify @cfile should belong to @css. Also, remaining events are 4586 * automatically removed on cgroup destruction but the removal is 4587 * asynchronous, so take an extra ref on @css. 4588 */ 4589 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 4590 &memory_cgrp_subsys); 4591 ret = -EINVAL; 4592 if (IS_ERR(cfile_css)) 4593 goto out_put_cfile; 4594 if (cfile_css != css) { 4595 css_put(cfile_css); 4596 goto out_put_cfile; 4597 } 4598 4599 ret = event->register_event(memcg, event->eventfd, buf); 4600 if (ret) 4601 goto out_put_css; 4602 4603 vfs_poll(efile.file, &event->pt); 4604 4605 spin_lock(&memcg->event_list_lock); 4606 list_add(&event->list, &memcg->event_list); 4607 spin_unlock(&memcg->event_list_lock); 4608 4609 fdput(cfile); 4610 fdput(efile); 4611 4612 return nbytes; 4613 4614 out_put_css: 4615 css_put(css); 4616 out_put_cfile: 4617 fdput(cfile); 4618 out_put_eventfd: 4619 eventfd_ctx_put(event->eventfd); 4620 out_put_efile: 4621 fdput(efile); 4622 out_kfree: 4623 kfree(event); 4624 4625 return ret; 4626 } 4627 4628 static struct cftype mem_cgroup_legacy_files[] = { 4629 { 4630 .name = "usage_in_bytes", 4631 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4632 .read_u64 = mem_cgroup_read_u64, 4633 }, 4634 { 4635 .name = "max_usage_in_bytes", 4636 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4637 .write = mem_cgroup_reset, 4638 .read_u64 = mem_cgroup_read_u64, 4639 }, 4640 { 4641 .name = "limit_in_bytes", 4642 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4643 .write = mem_cgroup_write, 4644 .read_u64 = mem_cgroup_read_u64, 4645 }, 4646 { 4647 .name = "soft_limit_in_bytes", 4648 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4649 .write = mem_cgroup_write, 4650 .read_u64 = mem_cgroup_read_u64, 4651 }, 4652 { 4653 .name = "failcnt", 4654 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4655 .write = mem_cgroup_reset, 4656 .read_u64 = mem_cgroup_read_u64, 4657 }, 4658 { 4659 .name = "stat", 4660 .seq_show = memcg_stat_show, 4661 }, 4662 { 4663 .name = "force_empty", 4664 .write = mem_cgroup_force_empty_write, 4665 }, 4666 { 4667 .name = "use_hierarchy", 4668 .write_u64 = mem_cgroup_hierarchy_write, 4669 .read_u64 = mem_cgroup_hierarchy_read, 4670 }, 4671 { 4672 .name = "cgroup.event_control", /* XXX: for compat */ 4673 .write = memcg_write_event_control, 4674 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 4675 }, 4676 { 4677 .name = "swappiness", 4678 .read_u64 = mem_cgroup_swappiness_read, 4679 .write_u64 = mem_cgroup_swappiness_write, 4680 }, 4681 { 4682 .name = "move_charge_at_immigrate", 4683 .read_u64 = mem_cgroup_move_charge_read, 4684 .write_u64 = mem_cgroup_move_charge_write, 4685 
}, 4686 { 4687 .name = "oom_control", 4688 .seq_show = mem_cgroup_oom_control_read, 4689 .write_u64 = mem_cgroup_oom_control_write, 4690 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4691 }, 4692 { 4693 .name = "pressure_level", 4694 }, 4695 #ifdef CONFIG_NUMA 4696 { 4697 .name = "numa_stat", 4698 .seq_show = memcg_numa_stat_show, 4699 }, 4700 #endif 4701 { 4702 .name = "kmem.limit_in_bytes", 4703 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 4704 .write = mem_cgroup_write, 4705 .read_u64 = mem_cgroup_read_u64, 4706 }, 4707 { 4708 .name = "kmem.usage_in_bytes", 4709 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 4710 .read_u64 = mem_cgroup_read_u64, 4711 }, 4712 { 4713 .name = "kmem.failcnt", 4714 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 4715 .write = mem_cgroup_reset, 4716 .read_u64 = mem_cgroup_read_u64, 4717 }, 4718 { 4719 .name = "kmem.max_usage_in_bytes", 4720 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4721 .write = mem_cgroup_reset, 4722 .read_u64 = mem_cgroup_read_u64, 4723 }, 4724 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) 4725 { 4726 .name = "kmem.slabinfo", 4727 .seq_start = memcg_slab_start, 4728 .seq_next = memcg_slab_next, 4729 .seq_stop = memcg_slab_stop, 4730 .seq_show = memcg_slab_show, 4731 }, 4732 #endif 4733 { 4734 .name = "kmem.tcp.limit_in_bytes", 4735 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 4736 .write = mem_cgroup_write, 4737 .read_u64 = mem_cgroup_read_u64, 4738 }, 4739 { 4740 .name = "kmem.tcp.usage_in_bytes", 4741 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 4742 .read_u64 = mem_cgroup_read_u64, 4743 }, 4744 { 4745 .name = "kmem.tcp.failcnt", 4746 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 4747 .write = mem_cgroup_reset, 4748 .read_u64 = mem_cgroup_read_u64, 4749 }, 4750 { 4751 .name = "kmem.tcp.max_usage_in_bytes", 4752 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 4753 .write = mem_cgroup_reset, 4754 .read_u64 = mem_cgroup_read_u64, 4755 }, 4756 { }, /* terminate */ 4757 }; 4758 4759 /* 4760 * Private memory cgroup IDR 4761 * 4762 * Swap-out records and page cache shadow entries need to store memcg 4763 * references in constrained space, so we maintain an ID space that is 4764 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 4765 * memory-controlled cgroups to 64k. 4766 * 4767 * However, there usually are many references to the offline CSS after 4768 * the cgroup has been destroyed, such as page cache or reclaimable 4769 * slab objects, that don't need to hang on to the ID. We want to keep 4770 * those dead CSS from occupying IDs, or we might quickly exhaust the 4771 * relatively small ID space and prevent the creation of new cgroups 4772 * even when there are much fewer than 64k cgroups - possibly none. 4773 * 4774 * Maintain a private 16-bit ID space for memcg, and allow the ID to 4775 * be freed and recycled when it's no longer needed, which is usually 4776 * when the CSS is offlined. 4777 * 4778 * The only exception to that are records of swapped out tmpfs/shmem 4779 * pages that need to be attributed to live ancestors on swapin. But 4780 * those references are manageable from userspace.
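 *
 * (Illustrative sketch: resolving a stored ID back to a memcg mirrors the
 * helpers below; assuming the caller takes a css reference for anything it
 * keeps past the RCU read section, it looks roughly like:
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);		/* may be NULL if recycled */
 *	if (memcg && css_tryget_online(&memcg->css)) {
 *		... use memcg, then css_put(&memcg->css) ...
 *	}
 *	rcu_read_unlock();
 * )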
4781 */ 4782 4783 static DEFINE_IDR(mem_cgroup_idr); 4784 4785 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 4786 { 4787 if (memcg->id.id > 0) { 4788 idr_remove(&mem_cgroup_idr, memcg->id.id); 4789 memcg->id.id = 0; 4790 } 4791 } 4792 4793 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) 4794 { 4795 refcount_add(n, &memcg->id.ref); 4796 } 4797 4798 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 4799 { 4800 if (refcount_sub_and_test(n, &memcg->id.ref)) { 4801 mem_cgroup_id_remove(memcg); 4802 4803 /* Memcg ID pins CSS */ 4804 css_put(&memcg->css); 4805 } 4806 } 4807 4808 static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) 4809 { 4810 mem_cgroup_id_get_many(memcg, 1); 4811 } 4812 4813 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 4814 { 4815 mem_cgroup_id_put_many(memcg, 1); 4816 } 4817 4818 /** 4819 * mem_cgroup_from_id - look up a memcg from a memcg id 4820 * @id: the memcg id to look up 4821 * 4822 * Caller must hold rcu_read_lock(). 4823 */ 4824 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 4825 { 4826 WARN_ON_ONCE(!rcu_read_lock_held()); 4827 return idr_find(&mem_cgroup_idr, id); 4828 } 4829 4830 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 4831 { 4832 struct mem_cgroup_per_node *pn; 4833 int tmp = node; 4834 /* 4835 * This routine is called against possible nodes. 4836 * But it's BUG to call kmalloc() against offline node. 4837 * 4838 * TODO: this routine can waste much memory for nodes which will 4839 * never be onlined. It's better to use memory hotplug callback 4840 * function. 4841 */ 4842 if (!node_state(node, N_NORMAL_MEMORY)) 4843 tmp = -1; 4844 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4845 if (!pn) 4846 return 1; 4847 4848 pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat); 4849 if (!pn->lruvec_stat_local) { 4850 kfree(pn); 4851 return 1; 4852 } 4853 4854 pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); 4855 if (!pn->lruvec_stat_cpu) { 4856 free_percpu(pn->lruvec_stat_local); 4857 kfree(pn); 4858 return 1; 4859 } 4860 4861 lruvec_init(&pn->lruvec); 4862 pn->usage_in_excess = 0; 4863 pn->on_tree = false; 4864 pn->memcg = memcg; 4865 4866 memcg->nodeinfo[node] = pn; 4867 return 0; 4868 } 4869 4870 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 4871 { 4872 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 4873 4874 if (!pn) 4875 return; 4876 4877 free_percpu(pn->lruvec_stat_cpu); 4878 free_percpu(pn->lruvec_stat_local); 4879 kfree(pn); 4880 } 4881 4882 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4883 { 4884 int node; 4885 4886 /* 4887 * Flush percpu vmstats and vmevents to guarantee the value correctness 4888 * on parent's and all ancestor levels. 
4889 */ 4890 memcg_flush_percpu_vmstats(memcg, false); 4891 memcg_flush_percpu_vmevents(memcg); 4892 for_each_node(node) 4893 free_mem_cgroup_per_node_info(memcg, node); 4894 free_percpu(memcg->vmstats_percpu); 4895 free_percpu(memcg->vmstats_local); 4896 kfree(memcg); 4897 } 4898 4899 static void mem_cgroup_free(struct mem_cgroup *memcg) 4900 { 4901 memcg_wb_domain_exit(memcg); 4902 __mem_cgroup_free(memcg); 4903 } 4904 4905 static struct mem_cgroup *mem_cgroup_alloc(void) 4906 { 4907 struct mem_cgroup *memcg; 4908 unsigned int size; 4909 int node; 4910 int __maybe_unused i; 4911 4912 size = sizeof(struct mem_cgroup); 4913 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 4914 4915 memcg = kzalloc(size, GFP_KERNEL); 4916 if (!memcg) 4917 return NULL; 4918 4919 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 4920 1, MEM_CGROUP_ID_MAX, 4921 GFP_KERNEL); 4922 if (memcg->id.id < 0) 4923 goto fail; 4924 4925 memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu); 4926 if (!memcg->vmstats_local) 4927 goto fail; 4928 4929 memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu); 4930 if (!memcg->vmstats_percpu) 4931 goto fail; 4932 4933 for_each_node(node) 4934 if (alloc_mem_cgroup_per_node_info(memcg, node)) 4935 goto fail; 4936 4937 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 4938 goto fail; 4939 4940 INIT_WORK(&memcg->high_work, high_work_func); 4941 memcg->last_scanned_node = MAX_NUMNODES; 4942 INIT_LIST_HEAD(&memcg->oom_notify); 4943 mutex_init(&memcg->thresholds_lock); 4944 spin_lock_init(&memcg->move_lock); 4945 vmpressure_init(&memcg->vmpressure); 4946 INIT_LIST_HEAD(&memcg->event_list); 4947 spin_lock_init(&memcg->event_list_lock); 4948 memcg->socket_pressure = jiffies; 4949 #ifdef CONFIG_MEMCG_KMEM 4950 memcg->kmemcg_id = -1; 4951 #endif 4952 #ifdef CONFIG_CGROUP_WRITEBACK 4953 INIT_LIST_HEAD(&memcg->cgwb_list); 4954 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 4955 memcg->cgwb_frn[i].done = 4956 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 4957 #endif 4958 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 4959 return memcg; 4960 fail: 4961 mem_cgroup_id_remove(memcg); 4962 __mem_cgroup_free(memcg); 4963 return NULL; 4964 } 4965 4966 static struct cgroup_subsys_state * __ref 4967 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4968 { 4969 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 4970 struct mem_cgroup *memcg; 4971 long error = -ENOMEM; 4972 4973 memcg = mem_cgroup_alloc(); 4974 if (!memcg) 4975 return ERR_PTR(error); 4976 4977 memcg->high = PAGE_COUNTER_MAX; 4978 memcg->soft_limit = PAGE_COUNTER_MAX; 4979 if (parent) { 4980 memcg->swappiness = mem_cgroup_swappiness(parent); 4981 memcg->oom_kill_disable = parent->oom_kill_disable; 4982 } 4983 if (parent && parent->use_hierarchy) { 4984 memcg->use_hierarchy = true; 4985 page_counter_init(&memcg->memory, &parent->memory); 4986 page_counter_init(&memcg->swap, &parent->swap); 4987 page_counter_init(&memcg->memsw, &parent->memsw); 4988 page_counter_init(&memcg->kmem, &parent->kmem); 4989 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 4990 } else { 4991 page_counter_init(&memcg->memory, NULL); 4992 page_counter_init(&memcg->swap, NULL); 4993 page_counter_init(&memcg->memsw, NULL); 4994 page_counter_init(&memcg->kmem, NULL); 4995 page_counter_init(&memcg->tcpmem, NULL); 4996 /* 4997 * Deeper hierachy with use_hierarchy == false doesn't make 4998 * much sense so let cgroup subsystem know about this 4999 * unfortunate state in our controller. 
5000 */ 5001 if (parent != root_mem_cgroup) 5002 memory_cgrp_subsys.broken_hierarchy = true; 5003 } 5004 5005 /* The following stuff does not apply to the root */ 5006 if (!parent) { 5007 #ifdef CONFIG_MEMCG_KMEM 5008 INIT_LIST_HEAD(&memcg->kmem_caches); 5009 #endif 5010 root_mem_cgroup = memcg; 5011 return &memcg->css; 5012 } 5013 5014 error = memcg_online_kmem(memcg); 5015 if (error) 5016 goto fail; 5017 5018 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5019 static_branch_inc(&memcg_sockets_enabled_key); 5020 5021 return &memcg->css; 5022 fail: 5023 mem_cgroup_id_remove(memcg); 5024 mem_cgroup_free(memcg); 5025 return ERR_PTR(-ENOMEM); 5026 } 5027 5028 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 5029 { 5030 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5031 5032 /* 5033 * A memcg must be visible for memcg_expand_shrinker_maps() 5034 * by the time the maps are allocated. So, we allocate maps 5035 * here, when for_each_mem_cgroup() can't skip it. 5036 */ 5037 if (memcg_alloc_shrinker_maps(memcg)) { 5038 mem_cgroup_id_remove(memcg); 5039 return -ENOMEM; 5040 } 5041 5042 /* Online state pins memcg ID, memcg ID pins CSS */ 5043 refcount_set(&memcg->id.ref, 1); 5044 css_get(css); 5045 return 0; 5046 } 5047 5048 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5049 { 5050 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5051 struct mem_cgroup_event *event, *tmp; 5052 5053 /* 5054 * Unregister events and notify userspace. 5055 * Notify userspace about cgroup removing only after rmdir of cgroup 5056 * directory to avoid race between userspace and kernelspace. 5057 */ 5058 spin_lock(&memcg->event_list_lock); 5059 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5060 list_del_init(&event->list); 5061 schedule_work(&event->remove); 5062 } 5063 spin_unlock(&memcg->event_list_lock); 5064 5065 page_counter_set_min(&memcg->memory, 0); 5066 page_counter_set_low(&memcg->memory, 0); 5067 5068 memcg_offline_kmem(memcg); 5069 wb_memcg_offline(memcg); 5070 5071 drain_all_stock(memcg); 5072 5073 mem_cgroup_id_put(memcg); 5074 } 5075 5076 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5077 { 5078 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5079 5080 invalidate_reclaim_iterators(memcg); 5081 } 5082 5083 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5084 { 5085 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5086 int __maybe_unused i; 5087 5088 #ifdef CONFIG_CGROUP_WRITEBACK 5089 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5090 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5091 #endif 5092 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5093 static_branch_dec(&memcg_sockets_enabled_key); 5094 5095 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5096 static_branch_dec(&memcg_sockets_enabled_key); 5097 5098 vmpressure_cleanup(&memcg->vmpressure); 5099 cancel_work_sync(&memcg->high_work); 5100 mem_cgroup_remove_from_trees(memcg); 5101 memcg_free_shrinker_maps(memcg); 5102 memcg_free_kmem(memcg); 5103 mem_cgroup_free(memcg); 5104 } 5105 5106 /** 5107 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5108 * @css: the target css 5109 * 5110 * Reset the states of the mem_cgroup associated with @css. This is 5111 * invoked when the userland requests disabling on the default hierarchy 5112 * but the memcg is pinned through dependency. 
The memcg should stop 5113 * applying policies and should revert to the vanilla state as it may be 5114 * made visible again. 5115 * 5116 * The current implementation only resets the essential configurations. 5117 * This needs to be expanded to cover all the visible parts. 5118 */ 5119 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5120 { 5121 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5122 5123 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5124 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5125 page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); 5126 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5127 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5128 page_counter_set_min(&memcg->memory, 0); 5129 page_counter_set_low(&memcg->memory, 0); 5130 memcg->high = PAGE_COUNTER_MAX; 5131 memcg->soft_limit = PAGE_COUNTER_MAX; 5132 memcg_wb_domain_size_changed(memcg); 5133 } 5134 5135 #ifdef CONFIG_MMU 5136 /* Handlers for move charge at task migration. */ 5137 static int mem_cgroup_do_precharge(unsigned long count) 5138 { 5139 int ret; 5140 5141 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5142 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5143 if (!ret) { 5144 mc.precharge += count; 5145 return ret; 5146 } 5147 5148 /* Try charges one by one with reclaim, but do not retry */ 5149 while (count--) { 5150 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5151 if (ret) 5152 return ret; 5153 mc.precharge++; 5154 cond_resched(); 5155 } 5156 return 0; 5157 } 5158 5159 union mc_target { 5160 struct page *page; 5161 swp_entry_t ent; 5162 }; 5163 5164 enum mc_target_type { 5165 MC_TARGET_NONE = 0, 5166 MC_TARGET_PAGE, 5167 MC_TARGET_SWAP, 5168 MC_TARGET_DEVICE, 5169 }; 5170 5171 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5172 unsigned long addr, pte_t ptent) 5173 { 5174 struct page *page = vm_normal_page(vma, addr, ptent); 5175 5176 if (!page || !page_mapped(page)) 5177 return NULL; 5178 if (PageAnon(page)) { 5179 if (!(mc.flags & MOVE_ANON)) 5180 return NULL; 5181 } else { 5182 if (!(mc.flags & MOVE_FILE)) 5183 return NULL; 5184 } 5185 if (!get_page_unless_zero(page)) 5186 return NULL; 5187 5188 return page; 5189 } 5190 5191 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5192 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5193 pte_t ptent, swp_entry_t *entry) 5194 { 5195 struct page *page = NULL; 5196 swp_entry_t ent = pte_to_swp_entry(ptent); 5197 5198 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) 5199 return NULL; 5200 5201 /* 5202 * Handle MEMORY_DEVICE_PRIVATE, which are ZONE_DEVICE pages belonging to 5203 * a device; because they are not accessible by the CPU they are stored 5204 * as special swap entries in the CPU page table. 5205 */ 5206 if (is_device_private_entry(ent)) { 5207 page = device_private_entry_to_page(ent); 5208 /* 5209 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has 5210 * a refcount of 1 when free (unlike a normal page) 5211 */ 5212 if (!page_ref_add_unless(page, 1, 1)) 5213 return NULL; 5214 return page; 5215 } 5216 5217 /* 5218 * Because lookup_swap_cache() updates some statistics counters, 5219 * we call find_get_page() with swapper_space directly.
5220 */ 5221 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5222 if (do_memsw_account()) 5223 entry->val = ent.val; 5224 5225 return page; 5226 } 5227 #else 5228 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5229 pte_t ptent, swp_entry_t *entry) 5230 { 5231 return NULL; 5232 } 5233 #endif 5234 5235 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5236 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5237 { 5238 struct page *page = NULL; 5239 struct address_space *mapping; 5240 pgoff_t pgoff; 5241 5242 if (!vma->vm_file) /* anonymous vma */ 5243 return NULL; 5244 if (!(mc.flags & MOVE_FILE)) 5245 return NULL; 5246 5247 mapping = vma->vm_file->f_mapping; 5248 pgoff = linear_page_index(vma, addr); 5249 5250 /* page is moved even if it's not RSS of this task(page-faulted). */ 5251 #ifdef CONFIG_SWAP 5252 /* shmem/tmpfs may report page out on swap: account for that too. */ 5253 if (shmem_mapping(mapping)) { 5254 page = find_get_entry(mapping, pgoff); 5255 if (xa_is_value(page)) { 5256 swp_entry_t swp = radix_to_swp_entry(page); 5257 if (do_memsw_account()) 5258 *entry = swp; 5259 page = find_get_page(swap_address_space(swp), 5260 swp_offset(swp)); 5261 } 5262 } else 5263 page = find_get_page(mapping, pgoff); 5264 #else 5265 page = find_get_page(mapping, pgoff); 5266 #endif 5267 return page; 5268 } 5269 5270 /** 5271 * mem_cgroup_move_account - move account of the page 5272 * @page: the page 5273 * @compound: charge the page as compound or small page 5274 * @from: mem_cgroup which the page is moved from. 5275 * @to: mem_cgroup which the page is moved to. @from != @to. 5276 * 5277 * The caller must make sure the page is not on LRU (isolate_page() is useful.) 5278 * 5279 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5280 * from old cgroup. 5281 */ 5282 static int mem_cgroup_move_account(struct page *page, 5283 bool compound, 5284 struct mem_cgroup *from, 5285 struct mem_cgroup *to) 5286 { 5287 unsigned long flags; 5288 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; 5289 int ret; 5290 bool anon; 5291 5292 VM_BUG_ON(from == to); 5293 VM_BUG_ON_PAGE(PageLRU(page), page); 5294 VM_BUG_ON(compound && !PageTransHuge(page)); 5295 5296 /* 5297 * Prevent mem_cgroup_migrate() from looking at 5298 * page->mem_cgroup of its source page while we change it. 5299 */ 5300 ret = -EBUSY; 5301 if (!trylock_page(page)) 5302 goto out; 5303 5304 ret = -EINVAL; 5305 if (page->mem_cgroup != from) 5306 goto out_unlock; 5307 5308 anon = PageAnon(page); 5309 5310 spin_lock_irqsave(&from->move_lock, flags); 5311 5312 if (!anon && page_mapped(page)) { 5313 __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages); 5314 __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages); 5315 } 5316 5317 /* 5318 * move_lock grabbed above and caller set from->moving_account, so 5319 * mod_memcg_page_state will serialize updates to PageDirty. 5320 * So mapping should be stable for dirty pages. 
5321 */ 5322 if (!anon && PageDirty(page)) { 5323 struct address_space *mapping = page_mapping(page); 5324 5325 if (mapping_cap_account_dirty(mapping)) { 5326 __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages); 5327 __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages); 5328 } 5329 } 5330 5331 if (PageWriteback(page)) { 5332 __mod_memcg_state(from, NR_WRITEBACK, -nr_pages); 5333 __mod_memcg_state(to, NR_WRITEBACK, nr_pages); 5334 } 5335 5336 /* 5337 * It is safe to change page->mem_cgroup here because the page 5338 * is referenced, charged, and isolated - we can't race with 5339 * uncharging, charging, migration, or LRU putback. 5340 */ 5341 5342 /* caller should have done css_get */ 5343 page->mem_cgroup = to; 5344 spin_unlock_irqrestore(&from->move_lock, flags); 5345 5346 ret = 0; 5347 5348 local_irq_disable(); 5349 mem_cgroup_charge_statistics(to, page, compound, nr_pages); 5350 memcg_check_events(to, page); 5351 mem_cgroup_charge_statistics(from, page, compound, -nr_pages); 5352 memcg_check_events(from, page); 5353 local_irq_enable(); 5354 out_unlock: 5355 unlock_page(page); 5356 out: 5357 return ret; 5358 } 5359 5360 /** 5361 * get_mctgt_type - get target type of moving charge 5362 * @vma: the vma the pte to be checked belongs to 5363 * @addr: the address corresponding to the pte to be checked 5364 * @ptent: the pte to be checked 5365 * @target: the pointer where the target page or swap entry will be stored (can be NULL) 5366 * 5367 * Returns 5368 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5369 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5370 * move charge. if @target is not NULL, the page is stored in target->page 5371 * with an extra refcount taken (callers should handle it). 5372 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5373 * target for charge migration. if @target is not NULL, the entry is stored 5374 * in target->ent. 5375 * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PRIVATE 5376 * (so a ZONE_DEVICE page and thus not on the LRU). 5377 * For now such a page is charged like a regular page would be, as for all 5378 * intents and purposes it is just special memory taking the place of a 5379 * regular page. 5380 * 5381 * See Documentation/vm/hmm.txt and include/linux/hmm.h 5382 * 5383 * Called with pte lock held. 5384 */ 5385 5386 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5387 unsigned long addr, pte_t ptent, union mc_target *target) 5388 { 5389 struct page *page = NULL; 5390 enum mc_target_type ret = MC_TARGET_NONE; 5391 swp_entry_t ent = { .val = 0 }; 5392 5393 if (pte_present(ptent)) 5394 page = mc_handle_present_pte(vma, addr, ptent); 5395 else if (is_swap_pte(ptent)) 5396 page = mc_handle_swap_pte(vma, ptent, &ent); 5397 else if (pte_none(ptent)) 5398 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5399 5400 if (!page && !ent.val) 5401 return ret; 5402 if (page) { 5403 /* 5404 * Do only a loose check w/o serialization. 5405 * mem_cgroup_move_account() checks the page is valid or 5406 * not under LRU exclusion. 5407 */ 5408 if (page->mem_cgroup == mc.from) { 5409 ret = MC_TARGET_PAGE; 5410 if (is_device_private_page(page)) 5411 ret = MC_TARGET_DEVICE; 5412 if (target) 5413 target->page = page; 5414 } 5415 if (!ret || !target) 5416 put_page(page); 5417 } 5418 /* 5419 * There is a swap entry and a page doesn't exist or isn't charged. 5420 * But we cannot move a tail-page in a THP.
5421 */ 5422 if (ent.val && !ret && (!page || !PageTransCompound(page)) && 5423 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5424 ret = MC_TARGET_SWAP; 5425 if (target) 5426 target->ent = ent; 5427 } 5428 return ret; 5429 } 5430 5431 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5432 /* 5433 * We don't consider PMD mapped swapping or file mapped pages because THP does 5434 * not support them for now. 5435 * Caller should make sure that pmd_trans_huge(pmd) is true. 5436 */ 5437 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5438 unsigned long addr, pmd_t pmd, union mc_target *target) 5439 { 5440 struct page *page = NULL; 5441 enum mc_target_type ret = MC_TARGET_NONE; 5442 5443 if (unlikely(is_swap_pmd(pmd))) { 5444 VM_BUG_ON(thp_migration_supported() && 5445 !is_pmd_migration_entry(pmd)); 5446 return ret; 5447 } 5448 page = pmd_page(pmd); 5449 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5450 if (!(mc.flags & MOVE_ANON)) 5451 return ret; 5452 if (page->mem_cgroup == mc.from) { 5453 ret = MC_TARGET_PAGE; 5454 if (target) { 5455 get_page(page); 5456 target->page = page; 5457 } 5458 } 5459 return ret; 5460 } 5461 #else 5462 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5463 unsigned long addr, pmd_t pmd, union mc_target *target) 5464 { 5465 return MC_TARGET_NONE; 5466 } 5467 #endif 5468 5469 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5470 unsigned long addr, unsigned long end, 5471 struct mm_walk *walk) 5472 { 5473 struct vm_area_struct *vma = walk->vma; 5474 pte_t *pte; 5475 spinlock_t *ptl; 5476 5477 ptl = pmd_trans_huge_lock(pmd, vma); 5478 if (ptl) { 5479 /* 5480 * Note their can not be MC_TARGET_DEVICE for now as we do not 5481 * support transparent huge page with MEMORY_DEVICE_PRIVATE but 5482 * this might change. 5483 */ 5484 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5485 mc.precharge += HPAGE_PMD_NR; 5486 spin_unlock(ptl); 5487 return 0; 5488 } 5489 5490 if (pmd_trans_unstable(pmd)) 5491 return 0; 5492 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5493 for (; addr != end; pte++, addr += PAGE_SIZE) 5494 if (get_mctgt_type(vma, addr, *pte, NULL)) 5495 mc.precharge++; /* increment precharge temporarily */ 5496 pte_unmap_unlock(pte - 1, ptl); 5497 cond_resched(); 5498 5499 return 0; 5500 } 5501 5502 static const struct mm_walk_ops precharge_walk_ops = { 5503 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5504 }; 5505 5506 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5507 { 5508 unsigned long precharge; 5509 5510 down_read(&mm->mmap_sem); 5511 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); 5512 up_read(&mm->mmap_sem); 5513 5514 precharge = mc.precharge; 5515 mc.precharge = 0; 5516 5517 return precharge; 5518 } 5519 5520 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5521 { 5522 unsigned long precharge = mem_cgroup_count_precharge(mm); 5523 5524 VM_BUG_ON(mc.moving_task); 5525 mc.moving_task = current; 5526 return mem_cgroup_do_precharge(precharge); 5527 } 5528 5529 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. 
*/ 5530 static void __mem_cgroup_clear_mc(void) 5531 { 5532 struct mem_cgroup *from = mc.from; 5533 struct mem_cgroup *to = mc.to; 5534 5535 /* we must uncharge all the leftover precharges from mc.to */ 5536 if (mc.precharge) { 5537 cancel_charge(mc.to, mc.precharge); 5538 mc.precharge = 0; 5539 } 5540 /* 5541 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5542 * we must uncharge here. 5543 */ 5544 if (mc.moved_charge) { 5545 cancel_charge(mc.from, mc.moved_charge); 5546 mc.moved_charge = 0; 5547 } 5548 /* we must fixup refcnts and charges */ 5549 if (mc.moved_swap) { 5550 /* uncharge swap account from the old cgroup */ 5551 if (!mem_cgroup_is_root(mc.from)) 5552 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5553 5554 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 5555 5556 /* 5557 * we charged both to->memory and to->memsw, so we 5558 * should uncharge to->memory. 5559 */ 5560 if (!mem_cgroup_is_root(mc.to)) 5561 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5562 5563 mem_cgroup_id_get_many(mc.to, mc.moved_swap); 5564 css_put_many(&mc.to->css, mc.moved_swap); 5565 5566 mc.moved_swap = 0; 5567 } 5568 memcg_oom_recover(from); 5569 memcg_oom_recover(to); 5570 wake_up_all(&mc.waitq); 5571 } 5572 5573 static void mem_cgroup_clear_mc(void) 5574 { 5575 struct mm_struct *mm = mc.mm; 5576 5577 /* 5578 * we must clear moving_task before waking up waiters at the end of 5579 * task migration. 5580 */ 5581 mc.moving_task = NULL; 5582 __mem_cgroup_clear_mc(); 5583 spin_lock(&mc.lock); 5584 mc.from = NULL; 5585 mc.to = NULL; 5586 mc.mm = NULL; 5587 spin_unlock(&mc.lock); 5588 5589 mmput(mm); 5590 } 5591 5592 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 5593 { 5594 struct cgroup_subsys_state *css; 5595 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 5596 struct mem_cgroup *from; 5597 struct task_struct *leader, *p; 5598 struct mm_struct *mm; 5599 unsigned long move_flags; 5600 int ret = 0; 5601 5602 /* charge immigration isn't supported on the default hierarchy */ 5603 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5604 return 0; 5605 5606 /* 5607 * Multi-process migrations only happen on the default hierarchy 5608 * where charge immigration is not used. Perform charge 5609 * immigration if @tset contains a leader and whine if there are 5610 * multiple. 5611 */ 5612 p = NULL; 5613 cgroup_taskset_for_each_leader(leader, css, tset) { 5614 WARN_ON_ONCE(p); 5615 p = leader; 5616 memcg = mem_cgroup_from_css(css); 5617 } 5618 if (!p) 5619 return 0; 5620 5621 /* 5622 * We are now committed to this value whatever it is. Changes in this 5623 * tunable will only affect upcoming migrations, not the current one. 5624 * So we need to save it, and keep it going.
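 *
 * (For reference, per the cgroup-v1 memory controller documentation this
 * tunable is a bitmask: bit 0 requests moving charges of anonymous pages
 * (and their swap), bit 1 requests moving charges of file pages; e.g.
 * writing 3 to memory.move_charge_at_immigrate of the destination cgroup
 * asks for both kinds to be moved on subsequent migrations.)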
5625 */ 5626 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 5627 if (!move_flags) 5628 return 0; 5629 5630 from = mem_cgroup_from_task(p); 5631 5632 VM_BUG_ON(from == memcg); 5633 5634 mm = get_task_mm(p); 5635 if (!mm) 5636 return 0; 5637 /* We move charges only when we move a owner of the mm */ 5638 if (mm->owner == p) { 5639 VM_BUG_ON(mc.from); 5640 VM_BUG_ON(mc.to); 5641 VM_BUG_ON(mc.precharge); 5642 VM_BUG_ON(mc.moved_charge); 5643 VM_BUG_ON(mc.moved_swap); 5644 5645 spin_lock(&mc.lock); 5646 mc.mm = mm; 5647 mc.from = from; 5648 mc.to = memcg; 5649 mc.flags = move_flags; 5650 spin_unlock(&mc.lock); 5651 /* We set mc.moving_task later */ 5652 5653 ret = mem_cgroup_precharge_mc(mm); 5654 if (ret) 5655 mem_cgroup_clear_mc(); 5656 } else { 5657 mmput(mm); 5658 } 5659 return ret; 5660 } 5661 5662 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 5663 { 5664 if (mc.to) 5665 mem_cgroup_clear_mc(); 5666 } 5667 5668 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5669 unsigned long addr, unsigned long end, 5670 struct mm_walk *walk) 5671 { 5672 int ret = 0; 5673 struct vm_area_struct *vma = walk->vma; 5674 pte_t *pte; 5675 spinlock_t *ptl; 5676 enum mc_target_type target_type; 5677 union mc_target target; 5678 struct page *page; 5679 5680 ptl = pmd_trans_huge_lock(pmd, vma); 5681 if (ptl) { 5682 if (mc.precharge < HPAGE_PMD_NR) { 5683 spin_unlock(ptl); 5684 return 0; 5685 } 5686 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5687 if (target_type == MC_TARGET_PAGE) { 5688 page = target.page; 5689 if (!isolate_lru_page(page)) { 5690 if (!mem_cgroup_move_account(page, true, 5691 mc.from, mc.to)) { 5692 mc.precharge -= HPAGE_PMD_NR; 5693 mc.moved_charge += HPAGE_PMD_NR; 5694 } 5695 putback_lru_page(page); 5696 } 5697 put_page(page); 5698 } else if (target_type == MC_TARGET_DEVICE) { 5699 page = target.page; 5700 if (!mem_cgroup_move_account(page, true, 5701 mc.from, mc.to)) { 5702 mc.precharge -= HPAGE_PMD_NR; 5703 mc.moved_charge += HPAGE_PMD_NR; 5704 } 5705 put_page(page); 5706 } 5707 spin_unlock(ptl); 5708 return 0; 5709 } 5710 5711 if (pmd_trans_unstable(pmd)) 5712 return 0; 5713 retry: 5714 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5715 for (; addr != end; addr += PAGE_SIZE) { 5716 pte_t ptent = *(pte++); 5717 bool device = false; 5718 swp_entry_t ent; 5719 5720 if (!mc.precharge) 5721 break; 5722 5723 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5724 case MC_TARGET_DEVICE: 5725 device = true; 5726 /* fall through */ 5727 case MC_TARGET_PAGE: 5728 page = target.page; 5729 /* 5730 * We can have a part of the split pmd here. Moving it 5731 * can be done but it would be too convoluted so simply 5732 * ignore such a partial THP and keep it in original 5733 * memcg. There should be somebody mapping the head. 5734 */ 5735 if (PageTransCompound(page)) 5736 goto put; 5737 if (!device && isolate_lru_page(page)) 5738 goto put; 5739 if (!mem_cgroup_move_account(page, false, 5740 mc.from, mc.to)) { 5741 mc.precharge--; 5742 /* we uncharge from mc.from later. */ 5743 mc.moved_charge++; 5744 } 5745 if (!device) 5746 putback_lru_page(page); 5747 put: /* get_mctgt_type() gets the page */ 5748 put_page(page); 5749 break; 5750 case MC_TARGET_SWAP: 5751 ent = target.ent; 5752 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 5753 mc.precharge--; 5754 /* we fixup refcnts and charges later. 
*/ 5755 mc.moved_swap++; 5756 } 5757 break; 5758 default: 5759 break; 5760 } 5761 } 5762 pte_unmap_unlock(pte - 1, ptl); 5763 cond_resched(); 5764 5765 if (addr != end) { 5766 /* 5767 * We have consumed all precharges we got in can_attach(). 5768 * We try to charge one by one, but don't do any additional 5769 * charges to mc.to if we have failed to charge once in the attach() 5770 * phase. 5771 */ 5772 ret = mem_cgroup_do_precharge(1); 5773 if (!ret) 5774 goto retry; 5775 } 5776 5777 return ret; 5778 } 5779 5780 static const struct mm_walk_ops charge_walk_ops = { 5781 .pmd_entry = mem_cgroup_move_charge_pte_range, 5782 }; 5783 5784 static void mem_cgroup_move_charge(void) 5785 { 5786 lru_add_drain_all(); 5787 /* 5788 * Signal lock_page_memcg() to take the memcg's move_lock 5789 * while we're moving its pages to another memcg. Then wait 5790 * for already started RCU-only updates to finish. 5791 */ 5792 atomic_inc(&mc.from->moving_account); 5793 synchronize_rcu(); 5794 retry: 5795 if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { 5796 /* 5797 * Someone who is holding the mmap_sem might be waiting in 5798 * the waitq. So we cancel all extra charges, wake up all waiters, 5799 * and retry. Because we cancel precharges, we might not be able 5800 * to move enough charges, but moving charge is a best-effort 5801 * feature anyway, so it wouldn't be a big problem. 5802 */ 5803 __mem_cgroup_clear_mc(); 5804 cond_resched(); 5805 goto retry; 5806 } 5807 /* 5808 * When we have consumed all precharges and failed to do an 5809 * additional charge, the page walk just aborts. 5810 */ 5811 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, 5812 NULL); 5813 5814 up_read(&mc.mm->mmap_sem); 5815 atomic_dec(&mc.from->moving_account); 5816 } 5817 5818 static void mem_cgroup_move_task(void) 5819 { 5820 if (mc.to) { 5821 mem_cgroup_move_charge(); 5822 mem_cgroup_clear_mc(); 5823 } 5824 } 5825 #else /* !CONFIG_MMU */ 5826 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 5827 { 5828 return 0; 5829 } 5830 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 5831 { 5832 } 5833 static void mem_cgroup_move_task(void) 5834 { 5835 } 5836 #endif 5837 5838 /* 5839 * Cgroup retains root cgroups across [un]mount cycles, making it necessary 5840 * to verify whether we're attached to the default hierarchy on each mount 5841 * attempt. 5842 */ 5843 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 5844 { 5845 /* 5846 * use_hierarchy is forced on the default hierarchy. cgroup core 5847 * guarantees that @root doesn't have any children, so turning it 5848 * on for the root memcg is enough.
5849 */ 5850 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5851 root_mem_cgroup->use_hierarchy = true; 5852 else 5853 root_mem_cgroup->use_hierarchy = false; 5854 } 5855 5856 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 5857 { 5858 if (value == PAGE_COUNTER_MAX) 5859 seq_puts(m, "max\n"); 5860 else 5861 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 5862 5863 return 0; 5864 } 5865 5866 static u64 memory_current_read(struct cgroup_subsys_state *css, 5867 struct cftype *cft) 5868 { 5869 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5870 5871 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 5872 } 5873 5874 static int memory_min_show(struct seq_file *m, void *v) 5875 { 5876 return seq_puts_memcg_tunable(m, 5877 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 5878 } 5879 5880 static ssize_t memory_min_write(struct kernfs_open_file *of, 5881 char *buf, size_t nbytes, loff_t off) 5882 { 5883 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5884 unsigned long min; 5885 int err; 5886 5887 buf = strstrip(buf); 5888 err = page_counter_memparse(buf, "max", &min); 5889 if (err) 5890 return err; 5891 5892 page_counter_set_min(&memcg->memory, min); 5893 5894 return nbytes; 5895 } 5896 5897 static int memory_low_show(struct seq_file *m, void *v) 5898 { 5899 return seq_puts_memcg_tunable(m, 5900 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 5901 } 5902 5903 static ssize_t memory_low_write(struct kernfs_open_file *of, 5904 char *buf, size_t nbytes, loff_t off) 5905 { 5906 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5907 unsigned long low; 5908 int err; 5909 5910 buf = strstrip(buf); 5911 err = page_counter_memparse(buf, "max", &low); 5912 if (err) 5913 return err; 5914 5915 page_counter_set_low(&memcg->memory, low); 5916 5917 return nbytes; 5918 } 5919 5920 static int memory_high_show(struct seq_file *m, void *v) 5921 { 5922 return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); 5923 } 5924 5925 static ssize_t memory_high_write(struct kernfs_open_file *of, 5926 char *buf, size_t nbytes, loff_t off) 5927 { 5928 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5929 unsigned long nr_pages; 5930 unsigned long high; 5931 int err; 5932 5933 buf = strstrip(buf); 5934 err = page_counter_memparse(buf, "max", &high); 5935 if (err) 5936 return err; 5937 5938 memcg->high = high; 5939 5940 nr_pages = page_counter_read(&memcg->memory); 5941 if (nr_pages > high) 5942 try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 5943 GFP_KERNEL, true); 5944 5945 memcg_wb_domain_size_changed(memcg); 5946 return nbytes; 5947 } 5948 5949 static int memory_max_show(struct seq_file *m, void *v) 5950 { 5951 return seq_puts_memcg_tunable(m, 5952 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 5953 } 5954 5955 static ssize_t memory_max_write(struct kernfs_open_file *of, 5956 char *buf, size_t nbytes, loff_t off) 5957 { 5958 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5959 unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; 5960 bool drained = false; 5961 unsigned long max; 5962 int err; 5963 5964 buf = strstrip(buf); 5965 err = page_counter_memparse(buf, "max", &max); 5966 if (err) 5967 return err; 5968 5969 xchg(&memcg->memory.max, max); 5970 5971 for (;;) { 5972 unsigned long nr_pages = page_counter_read(&memcg->memory); 5973 5974 if (nr_pages <= max) 5975 break; 5976 5977 if (signal_pending(current)) { 5978 err = -EINTR; 5979 break; 5980 } 5981 5982 if (!drained) { 5983 drain_all_stock(memcg); 5984 drained = 
true; 5985 continue; 5986 } 5987 5988 if (nr_reclaims) { 5989 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 5990 GFP_KERNEL, true)) 5991 nr_reclaims--; 5992 continue; 5993 } 5994 5995 memcg_memory_event(memcg, MEMCG_OOM); 5996 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 5997 break; 5998 } 5999 6000 memcg_wb_domain_size_changed(memcg); 6001 return nbytes; 6002 } 6003 6004 static void __memory_events_show(struct seq_file *m, atomic_long_t *events) 6005 { 6006 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 6007 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 6008 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 6009 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 6010 seq_printf(m, "oom_kill %lu\n", 6011 atomic_long_read(&events[MEMCG_OOM_KILL])); 6012 } 6013 6014 static int memory_events_show(struct seq_file *m, void *v) 6015 { 6016 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6017 6018 __memory_events_show(m, memcg->memory_events); 6019 return 0; 6020 } 6021 6022 static int memory_events_local_show(struct seq_file *m, void *v) 6023 { 6024 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6025 6026 __memory_events_show(m, memcg->memory_events_local); 6027 return 0; 6028 } 6029 6030 static int memory_stat_show(struct seq_file *m, void *v) 6031 { 6032 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6033 char *buf; 6034 6035 buf = memory_stat_format(memcg); 6036 if (!buf) 6037 return -ENOMEM; 6038 seq_puts(m, buf); 6039 kfree(buf); 6040 return 0; 6041 } 6042 6043 static int memory_oom_group_show(struct seq_file *m, void *v) 6044 { 6045 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6046 6047 seq_printf(m, "%d\n", memcg->oom_group); 6048 6049 return 0; 6050 } 6051 6052 static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 6053 char *buf, size_t nbytes, loff_t off) 6054 { 6055 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6056 int ret, oom_group; 6057 6058 buf = strstrip(buf); 6059 if (!buf) 6060 return -EINVAL; 6061 6062 ret = kstrtoint(buf, 0, &oom_group); 6063 if (ret) 6064 return ret; 6065 6066 if (oom_group != 0 && oom_group != 1) 6067 return -EINVAL; 6068 6069 memcg->oom_group = oom_group; 6070 6071 return nbytes; 6072 } 6073 6074 static struct cftype memory_files[] = { 6075 { 6076 .name = "current", 6077 .flags = CFTYPE_NOT_ON_ROOT, 6078 .read_u64 = memory_current_read, 6079 }, 6080 { 6081 .name = "min", 6082 .flags = CFTYPE_NOT_ON_ROOT, 6083 .seq_show = memory_min_show, 6084 .write = memory_min_write, 6085 }, 6086 { 6087 .name = "low", 6088 .flags = CFTYPE_NOT_ON_ROOT, 6089 .seq_show = memory_low_show, 6090 .write = memory_low_write, 6091 }, 6092 { 6093 .name = "high", 6094 .flags = CFTYPE_NOT_ON_ROOT, 6095 .seq_show = memory_high_show, 6096 .write = memory_high_write, 6097 }, 6098 { 6099 .name = "max", 6100 .flags = CFTYPE_NOT_ON_ROOT, 6101 .seq_show = memory_max_show, 6102 .write = memory_max_write, 6103 }, 6104 { 6105 .name = "events", 6106 .flags = CFTYPE_NOT_ON_ROOT, 6107 .file_offset = offsetof(struct mem_cgroup, events_file), 6108 .seq_show = memory_events_show, 6109 }, 6110 { 6111 .name = "events.local", 6112 .flags = CFTYPE_NOT_ON_ROOT, 6113 .file_offset = offsetof(struct mem_cgroup, events_local_file), 6114 .seq_show = memory_events_local_show, 6115 }, 6116 { 6117 .name = "stat", 6118 .flags = CFTYPE_NOT_ON_ROOT, 6119 .seq_show = memory_stat_show, 6120 }, 6121 { 6122 .name = "oom.group", 6123 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 
		.seq_show = memory_oom_group_show,
		.write = memory_oom_group_write,
	},
	{ }	/* terminate */
};

struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.post_attach = mem_cgroup_move_task,
	.bind = mem_cgroup_bind,
	.dfl_cftypes = memory_files,
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};

/**
 * mem_cgroup_protected - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *   of a top-down tree iteration, not for isolated queries.
 *
 * Returns one of the following:
 *   MEMCG_PROT_NONE: cgroup memory is not protected
 *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is
 *     an unprotected supply of reclaimable memory from other cgroups.
 *   MEMCG_PROT_MIN: cgroup memory is protected
 *
 * @root is exclusive; it is never protected when looked at directly.
 *
 * To provide proper hierarchical behavior, effective memory.min/low values
 * are used. Below is a description of how effective memory.low is
 * calculated. The effective memory.min value is calculated in the same way.
 *
 * Effective memory.low is always less than or equal to the original
 * memory.low. If there is no memory.low overcommitment (which is always
 * true for top-level memory cgroups), these two values are equal.
 * Otherwise, it's a portion of the parent's effective memory.low,
 * calculated as the cgroup's memory.low usage divided by the sum of the
 * siblings' memory.low usages, where memory.low usage is the amount of
 * actually protected memory.
 *
 *                                             low_usage
 * elow = min( memory.low, parent->elow * ------------------ ),
 *                                        siblings_low_usage
 *
 * low_usage = min(memory.current, memory.low)
 *
 * This definition of effective memory.low provides the expected
 * hierarchical behavior: the parent's memory.low value limits its
 * children, unprotected memory is reclaimed first, and cgroups that are
 * not using their guarantee do not affect the actual memory distribution.
 *
 * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
 *
 *     A      A/memory.low = 2G, A/memory.current = 6G
 *    //\\
 *   BC  DE   B/memory.low = 3G  B/memory.current = 2G
 *            C/memory.low = 1G  C/memory.current = 2G
 *            D/memory.low = 0   D/memory.current = 2G
 *            E/memory.low = 10G E/memory.current = 0
 *
 * and memory pressure is applied, the following memory distribution
 * is expected (approximately):
 *
 *     A/memory.current = 2G
 *
 *     B/memory.current = 1.3G
 *     C/memory.current = 0.6G
 *     D/memory.current = 0
 *     E/memory.current = 0
 *
 * (For instance, B's effective low works out to
 *  min(3G, 2G * 2G / (2G + 1G)) ~= 1.3G, because siblings_low_usage is
 *  min(2G, 3G) + min(2G, 1G) + 0 + 0 = 3G.)
 *
 * These calculations require constant tracking of the actual low usages
 * (see propagate_protected_usage()), as well as recursive calculation of
 * effective memory.low values. But since reclaim calls
 * mem_cgroup_protected() for each memory cgroup top-down anyway, we can
 * cache the calculated elow and reuse it on the next invocation. This
 * caching is intentionally racy, but that's OK, as memory.low is a
 * best-effort mechanism.
 */
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
						struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent;
	unsigned long emin, parent_emin;
	unsigned long elow, parent_elow;
	unsigned long usage;

	if (mem_cgroup_disabled())
		return MEMCG_PROT_NONE;

	if (!root)
		root = root_mem_cgroup;
	if (memcg == root)
		return MEMCG_PROT_NONE;

	usage = page_counter_read(&memcg->memory);
	if (!usage)
		return MEMCG_PROT_NONE;

	emin = memcg->memory.min;
	elow = memcg->memory.low;

	parent = parent_mem_cgroup(memcg);
	/* No parent means a non-hierarchical mode on v1 memcg */
	if (!parent)
		return MEMCG_PROT_NONE;

	if (parent == root)
		goto exit;

	parent_emin = READ_ONCE(parent->memory.emin);
	emin = min(emin, parent_emin);
	if (emin && parent_emin) {
		unsigned long min_usage, siblings_min_usage;

		min_usage = min(usage, memcg->memory.min);
		siblings_min_usage = atomic_long_read(
			&parent->memory.children_min_usage);

		if (min_usage && siblings_min_usage)
			emin = min(emin, parent_emin * min_usage /
				   siblings_min_usage);
	}

	parent_elow = READ_ONCE(parent->memory.elow);
	elow = min(elow, parent_elow);
	if (elow && parent_elow) {
		unsigned long low_usage, siblings_low_usage;

		low_usage = min(usage, memcg->memory.low);
		siblings_low_usage = atomic_long_read(
			&parent->memory.children_low_usage);

		if (low_usage && siblings_low_usage)
			elow = min(elow, parent_elow * low_usage /
				   siblings_low_usage);
	}

exit:
	memcg->memory.emin = emin;
	memcg->memory.elow = elow;

	if (usage <= emin)
		return MEMCG_PROT_MIN;
	else if (usage <= elow)
		return MEMCG_PROT_LOW;
	else
		return MEMCG_PROT_NONE;
}

/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 * @compound: charge the page as compound or small page
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge(), or abort the transaction
 * with mem_cgroup_cancel_charge() if page instantiation fails.
 */
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
			  bool compound)
{
	struct mem_cgroup *memcg = NULL;
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
	int ret = 0;

	if (mem_cgroup_disabled())
		goto out;

	if (PageSwapCache(page)) {
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible.  shmem_unuse() encounters
		 * already charged pages, too.
The USED bit is protected by 6321 * the page lock, which serializes swap cache removal, which 6322 * in turn serializes uncharging. 6323 */ 6324 VM_BUG_ON_PAGE(!PageLocked(page), page); 6325 if (compound_head(page)->mem_cgroup) 6326 goto out; 6327 6328 if (do_swap_account) { 6329 swp_entry_t ent = { .val = page_private(page), }; 6330 unsigned short id = lookup_swap_cgroup_id(ent); 6331 6332 rcu_read_lock(); 6333 memcg = mem_cgroup_from_id(id); 6334 if (memcg && !css_tryget_online(&memcg->css)) 6335 memcg = NULL; 6336 rcu_read_unlock(); 6337 } 6338 } 6339 6340 if (!memcg) 6341 memcg = get_mem_cgroup_from_mm(mm); 6342 6343 ret = try_charge(memcg, gfp_mask, nr_pages); 6344 6345 css_put(&memcg->css); 6346 out: 6347 *memcgp = memcg; 6348 return ret; 6349 } 6350 6351 int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, 6352 gfp_t gfp_mask, struct mem_cgroup **memcgp, 6353 bool compound) 6354 { 6355 struct mem_cgroup *memcg; 6356 int ret; 6357 6358 ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); 6359 memcg = *memcgp; 6360 mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); 6361 return ret; 6362 } 6363 6364 /** 6365 * mem_cgroup_commit_charge - commit a page charge 6366 * @page: page to charge 6367 * @memcg: memcg to charge the page to 6368 * @lrucare: page might be on LRU already 6369 * @compound: charge the page as compound or small page 6370 * 6371 * Finalize a charge transaction started by mem_cgroup_try_charge(), 6372 * after page->mapping has been set up. This must happen atomically 6373 * as part of the page instantiation, i.e. under the page table lock 6374 * for anonymous pages, under the page lock for page and swap cache. 6375 * 6376 * In addition, the page must not be on the LRU during the commit, to 6377 * prevent racing with task migration. If it might be, use @lrucare. 6378 * 6379 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 6380 */ 6381 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 6382 bool lrucare, bool compound) 6383 { 6384 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; 6385 6386 VM_BUG_ON_PAGE(!page->mapping, page); 6387 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 6388 6389 if (mem_cgroup_disabled()) 6390 return; 6391 /* 6392 * Swap faults will attempt to charge the same page multiple 6393 * times. But reuse_swap_page() might have removed the page 6394 * from swapcache already, so we can't check PageSwapCache(). 6395 */ 6396 if (!memcg) 6397 return; 6398 6399 commit_charge(page, memcg, lrucare); 6400 6401 local_irq_disable(); 6402 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); 6403 memcg_check_events(memcg, page); 6404 local_irq_enable(); 6405 6406 if (do_memsw_account() && PageSwapCache(page)) { 6407 swp_entry_t entry = { .val = page_private(page) }; 6408 /* 6409 * The swap entry might not get freed for a long time, 6410 * let's not wait for it. The page already received a 6411 * memory+swap charge, drop the swap entry duplicate. 6412 */ 6413 mem_cgroup_uncharge_swap(entry, nr_pages); 6414 } 6415 } 6416 6417 /** 6418 * mem_cgroup_cancel_charge - cancel a page charge 6419 * @page: page to charge 6420 * @memcg: memcg to charge the page to 6421 * @compound: charge the page as compound or small page 6422 * 6423 * Cancel a charge transaction started by mem_cgroup_try_charge(). 6424 */ 6425 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, 6426 bool compound) 6427 { 6428 unsigned int nr_pages = compound ? 
hpage_nr_pages(page) : 1; 6429 6430 if (mem_cgroup_disabled()) 6431 return; 6432 /* 6433 * Swap faults will attempt to charge the same page multiple 6434 * times. But reuse_swap_page() might have removed the page 6435 * from swapcache already, so we can't check PageSwapCache(). 6436 */ 6437 if (!memcg) 6438 return; 6439 6440 cancel_charge(memcg, nr_pages); 6441 } 6442 6443 struct uncharge_gather { 6444 struct mem_cgroup *memcg; 6445 unsigned long pgpgout; 6446 unsigned long nr_anon; 6447 unsigned long nr_file; 6448 unsigned long nr_kmem; 6449 unsigned long nr_huge; 6450 unsigned long nr_shmem; 6451 struct page *dummy_page; 6452 }; 6453 6454 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 6455 { 6456 memset(ug, 0, sizeof(*ug)); 6457 } 6458 6459 static void uncharge_batch(const struct uncharge_gather *ug) 6460 { 6461 unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; 6462 unsigned long flags; 6463 6464 if (!mem_cgroup_is_root(ug->memcg)) { 6465 page_counter_uncharge(&ug->memcg->memory, nr_pages); 6466 if (do_memsw_account()) 6467 page_counter_uncharge(&ug->memcg->memsw, nr_pages); 6468 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) 6469 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); 6470 memcg_oom_recover(ug->memcg); 6471 } 6472 6473 local_irq_save(flags); 6474 __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); 6475 __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); 6476 __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); 6477 __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); 6478 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 6479 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages); 6480 memcg_check_events(ug->memcg, ug->dummy_page); 6481 local_irq_restore(flags); 6482 6483 if (!mem_cgroup_is_root(ug->memcg)) 6484 css_put_many(&ug->memcg->css, nr_pages); 6485 } 6486 6487 static void uncharge_page(struct page *page, struct uncharge_gather *ug) 6488 { 6489 VM_BUG_ON_PAGE(PageLRU(page), page); 6490 VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && 6491 !PageHWPoison(page) , page); 6492 6493 if (!page->mem_cgroup) 6494 return; 6495 6496 /* 6497 * Nobody should be changing or seriously looking at 6498 * page->mem_cgroup at this point, we have fully 6499 * exclusive access to the page. 6500 */ 6501 6502 if (ug->memcg != page->mem_cgroup) { 6503 if (ug->memcg) { 6504 uncharge_batch(ug); 6505 uncharge_gather_clear(ug); 6506 } 6507 ug->memcg = page->mem_cgroup; 6508 } 6509 6510 if (!PageKmemcg(page)) { 6511 unsigned int nr_pages = 1; 6512 6513 if (PageTransHuge(page)) { 6514 nr_pages <<= compound_order(page); 6515 ug->nr_huge += nr_pages; 6516 } 6517 if (PageAnon(page)) 6518 ug->nr_anon += nr_pages; 6519 else { 6520 ug->nr_file += nr_pages; 6521 if (PageSwapBacked(page)) 6522 ug->nr_shmem += nr_pages; 6523 } 6524 ug->pgpgout++; 6525 } else { 6526 ug->nr_kmem += 1 << compound_order(page); 6527 __ClearPageKmemcg(page); 6528 } 6529 6530 ug->dummy_page = page; 6531 page->mem_cgroup = NULL; 6532 } 6533 6534 static void uncharge_list(struct list_head *page_list) 6535 { 6536 struct uncharge_gather ug; 6537 struct list_head *next; 6538 6539 uncharge_gather_clear(&ug); 6540 6541 /* 6542 * Note that the list can be a single page->lru; hence the 6543 * do-while loop instead of a simple list_for_each_entry(). 
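	 * (list_for_each_entry() never visits the entry embedded in the
	 * list head itself, so a head that is a page's own &page->lru
	 * would be skipped; the do-while below does visit it.)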
6544 */ 6545 next = page_list->next; 6546 do { 6547 struct page *page; 6548 6549 page = list_entry(next, struct page, lru); 6550 next = page->lru.next; 6551 6552 uncharge_page(page, &ug); 6553 } while (next != page_list); 6554 6555 if (ug.memcg) 6556 uncharge_batch(&ug); 6557 } 6558 6559 /** 6560 * mem_cgroup_uncharge - uncharge a page 6561 * @page: page to uncharge 6562 * 6563 * Uncharge a page previously charged with mem_cgroup_try_charge() and 6564 * mem_cgroup_commit_charge(). 6565 */ 6566 void mem_cgroup_uncharge(struct page *page) 6567 { 6568 struct uncharge_gather ug; 6569 6570 if (mem_cgroup_disabled()) 6571 return; 6572 6573 /* Don't touch page->lru of any random page, pre-check: */ 6574 if (!page->mem_cgroup) 6575 return; 6576 6577 uncharge_gather_clear(&ug); 6578 uncharge_page(page, &ug); 6579 uncharge_batch(&ug); 6580 } 6581 6582 /** 6583 * mem_cgroup_uncharge_list - uncharge a list of page 6584 * @page_list: list of pages to uncharge 6585 * 6586 * Uncharge a list of pages previously charged with 6587 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 6588 */ 6589 void mem_cgroup_uncharge_list(struct list_head *page_list) 6590 { 6591 if (mem_cgroup_disabled()) 6592 return; 6593 6594 if (!list_empty(page_list)) 6595 uncharge_list(page_list); 6596 } 6597 6598 /** 6599 * mem_cgroup_migrate - charge a page's replacement 6600 * @oldpage: currently circulating page 6601 * @newpage: replacement page 6602 * 6603 * Charge @newpage as a replacement page for @oldpage. @oldpage will 6604 * be uncharged upon free. 6605 * 6606 * Both pages must be locked, @newpage->mapping must be set up. 6607 */ 6608 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) 6609 { 6610 struct mem_cgroup *memcg; 6611 unsigned int nr_pages; 6612 bool compound; 6613 unsigned long flags; 6614 6615 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6616 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6617 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6618 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6619 newpage); 6620 6621 if (mem_cgroup_disabled()) 6622 return; 6623 6624 /* Page cache replacement: new page already charged? */ 6625 if (newpage->mem_cgroup) 6626 return; 6627 6628 /* Swapcache readahead pages can get replaced before being charged */ 6629 memcg = oldpage->mem_cgroup; 6630 if (!memcg) 6631 return; 6632 6633 /* Force-charge the new page. The old one will be freed soon */ 6634 compound = PageTransHuge(newpage); 6635 nr_pages = compound ? hpage_nr_pages(newpage) : 1; 6636 6637 page_counter_charge(&memcg->memory, nr_pages); 6638 if (do_memsw_account()) 6639 page_counter_charge(&memcg->memsw, nr_pages); 6640 css_get_many(&memcg->css, nr_pages); 6641 6642 commit_charge(newpage, memcg, false); 6643 6644 local_irq_save(flags); 6645 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); 6646 memcg_check_events(memcg, newpage); 6647 local_irq_restore(flags); 6648 } 6649 6650 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 6651 EXPORT_SYMBOL(memcg_sockets_enabled_key); 6652 6653 void mem_cgroup_sk_alloc(struct sock *sk) 6654 { 6655 struct mem_cgroup *memcg; 6656 6657 if (!mem_cgroup_sockets_enabled) 6658 return; 6659 6660 /* 6661 * Socket cloning can throw us here with sk_memcg already 6662 * filled. It won't however, necessarily happen from 6663 * process context. So the test for root memcg given 6664 * the current task's memcg won't help us in this case. 
6665 * 6666 * Respecting the original socket's memcg is a better 6667 * decision in this case. 6668 */ 6669 if (sk->sk_memcg) { 6670 css_get(&sk->sk_memcg->css); 6671 return; 6672 } 6673 6674 rcu_read_lock(); 6675 memcg = mem_cgroup_from_task(current); 6676 if (memcg == root_mem_cgroup) 6677 goto out; 6678 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 6679 goto out; 6680 if (css_tryget_online(&memcg->css)) 6681 sk->sk_memcg = memcg; 6682 out: 6683 rcu_read_unlock(); 6684 } 6685 6686 void mem_cgroup_sk_free(struct sock *sk) 6687 { 6688 if (sk->sk_memcg) 6689 css_put(&sk->sk_memcg->css); 6690 } 6691 6692 /** 6693 * mem_cgroup_charge_skmem - charge socket memory 6694 * @memcg: memcg to charge 6695 * @nr_pages: number of pages to charge 6696 * 6697 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 6698 * @memcg's configured limit, %false if the charge had to be forced. 6699 */ 6700 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 6701 { 6702 gfp_t gfp_mask = GFP_KERNEL; 6703 6704 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 6705 struct page_counter *fail; 6706 6707 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 6708 memcg->tcpmem_pressure = 0; 6709 return true; 6710 } 6711 page_counter_charge(&memcg->tcpmem, nr_pages); 6712 memcg->tcpmem_pressure = 1; 6713 return false; 6714 } 6715 6716 /* Don't block in the packet receive path */ 6717 if (in_softirq()) 6718 gfp_mask = GFP_NOWAIT; 6719 6720 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 6721 6722 if (try_charge(memcg, gfp_mask, nr_pages) == 0) 6723 return true; 6724 6725 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); 6726 return false; 6727 } 6728 6729 /** 6730 * mem_cgroup_uncharge_skmem - uncharge socket memory 6731 * @memcg: memcg to uncharge 6732 * @nr_pages: number of pages to uncharge 6733 */ 6734 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 6735 { 6736 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 6737 page_counter_uncharge(&memcg->tcpmem, nr_pages); 6738 return; 6739 } 6740 6741 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 6742 6743 refill_stock(memcg, nr_pages); 6744 } 6745 6746 static int __init cgroup_memory(char *s) 6747 { 6748 char *token; 6749 6750 while ((token = strsep(&s, ",")) != NULL) { 6751 if (!*token) 6752 continue; 6753 if (!strcmp(token, "nosocket")) 6754 cgroup_memory_nosocket = true; 6755 if (!strcmp(token, "nokmem")) 6756 cgroup_memory_nokmem = true; 6757 } 6758 return 0; 6759 } 6760 __setup("cgroup.memory=", cgroup_memory); 6761 6762 /* 6763 * subsys_initcall() for memory controller. 6764 * 6765 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 6766 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 6767 * basically everything that doesn't depend on a specific mem_cgroup structure 6768 * should be initialized from here. 6769 */ 6770 static int __init mem_cgroup_init(void) 6771 { 6772 int cpu, node; 6773 6774 #ifdef CONFIG_MEMCG_KMEM 6775 /* 6776 * Kmem cache creation is mostly done with the slab_mutex held, 6777 * so use a workqueue with limited concurrency to avoid stalling 6778 * all worker threads in case lots of cgroups are created and 6779 * destroyed simultaneously. 
6780 */ 6781 memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); 6782 BUG_ON(!memcg_kmem_cache_wq); 6783 #endif 6784 6785 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 6786 memcg_hotplug_cpu_dead); 6787 6788 for_each_possible_cpu(cpu) 6789 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 6790 drain_local_stock); 6791 6792 for_each_node(node) { 6793 struct mem_cgroup_tree_per_node *rtpn; 6794 6795 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 6796 node_online(node) ? node : NUMA_NO_NODE); 6797 6798 rtpn->rb_root = RB_ROOT; 6799 rtpn->rb_rightmost = NULL; 6800 spin_lock_init(&rtpn->lock); 6801 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6802 } 6803 6804 return 0; 6805 } 6806 subsys_initcall(mem_cgroup_init); 6807 6808 #ifdef CONFIG_MEMCG_SWAP 6809 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 6810 { 6811 while (!refcount_inc_not_zero(&memcg->id.ref)) { 6812 /* 6813 * The root cgroup cannot be destroyed, so it's refcount must 6814 * always be >= 1. 6815 */ 6816 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { 6817 VM_BUG_ON(1); 6818 break; 6819 } 6820 memcg = parent_mem_cgroup(memcg); 6821 if (!memcg) 6822 memcg = root_mem_cgroup; 6823 } 6824 return memcg; 6825 } 6826 6827 /** 6828 * mem_cgroup_swapout - transfer a memsw charge to swap 6829 * @page: page whose memsw charge to transfer 6830 * @entry: swap entry to move the charge to 6831 * 6832 * Transfer the memsw charge of @page to @entry. 6833 */ 6834 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 6835 { 6836 struct mem_cgroup *memcg, *swap_memcg; 6837 unsigned int nr_entries; 6838 unsigned short oldid; 6839 6840 VM_BUG_ON_PAGE(PageLRU(page), page); 6841 VM_BUG_ON_PAGE(page_count(page), page); 6842 6843 if (!do_memsw_account()) 6844 return; 6845 6846 memcg = page->mem_cgroup; 6847 6848 /* Readahead page, never charged */ 6849 if (!memcg) 6850 return; 6851 6852 /* 6853 * In case the memcg owning these pages has been offlined and doesn't 6854 * have an ID allocated to it anymore, charge the closest online 6855 * ancestor for the swap instead and transfer the memory+swap charge. 6856 */ 6857 swap_memcg = mem_cgroup_id_get_online(memcg); 6858 nr_entries = hpage_nr_pages(page); 6859 /* Get references for the tail pages, too */ 6860 if (nr_entries > 1) 6861 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 6862 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 6863 nr_entries); 6864 VM_BUG_ON_PAGE(oldid, page); 6865 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 6866 6867 page->mem_cgroup = NULL; 6868 6869 if (!mem_cgroup_is_root(memcg)) 6870 page_counter_uncharge(&memcg->memory, nr_entries); 6871 6872 if (memcg != swap_memcg) { 6873 if (!mem_cgroup_is_root(swap_memcg)) 6874 page_counter_charge(&swap_memcg->memsw, nr_entries); 6875 page_counter_uncharge(&memcg->memsw, nr_entries); 6876 } 6877 6878 /* 6879 * Interrupts should be disabled here because the caller holds the 6880 * i_pages lock which is taken with interrupts-off. It is 6881 * important here to have the interrupts disabled because it is the 6882 * only synchronisation we have for updating the per-CPU variables. 
6883 */ 6884 VM_BUG_ON(!irqs_disabled()); 6885 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), 6886 -nr_entries); 6887 memcg_check_events(memcg, page); 6888 6889 if (!mem_cgroup_is_root(memcg)) 6890 css_put_many(&memcg->css, nr_entries); 6891 } 6892 6893 /** 6894 * mem_cgroup_try_charge_swap - try charging swap space for a page 6895 * @page: page being added to swap 6896 * @entry: swap entry to charge 6897 * 6898 * Try to charge @page's memcg for the swap space at @entry. 6899 * 6900 * Returns 0 on success, -ENOMEM on failure. 6901 */ 6902 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) 6903 { 6904 unsigned int nr_pages = hpage_nr_pages(page); 6905 struct page_counter *counter; 6906 struct mem_cgroup *memcg; 6907 unsigned short oldid; 6908 6909 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) 6910 return 0; 6911 6912 memcg = page->mem_cgroup; 6913 6914 /* Readahead page, never charged */ 6915 if (!memcg) 6916 return 0; 6917 6918 if (!entry.val) { 6919 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 6920 return 0; 6921 } 6922 6923 memcg = mem_cgroup_id_get_online(memcg); 6924 6925 if (!mem_cgroup_is_root(memcg) && 6926 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 6927 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 6928 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 6929 mem_cgroup_id_put(memcg); 6930 return -ENOMEM; 6931 } 6932 6933 /* Get references for the tail pages, too */ 6934 if (nr_pages > 1) 6935 mem_cgroup_id_get_many(memcg, nr_pages - 1); 6936 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 6937 VM_BUG_ON_PAGE(oldid, page); 6938 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 6939 6940 return 0; 6941 } 6942 6943 /** 6944 * mem_cgroup_uncharge_swap - uncharge swap space 6945 * @entry: swap entry to uncharge 6946 * @nr_pages: the amount of swap space to uncharge 6947 */ 6948 void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 6949 { 6950 struct mem_cgroup *memcg; 6951 unsigned short id; 6952 6953 if (!do_swap_account) 6954 return; 6955 6956 id = swap_cgroup_record(entry, 0, nr_pages); 6957 rcu_read_lock(); 6958 memcg = mem_cgroup_from_id(id); 6959 if (memcg) { 6960 if (!mem_cgroup_is_root(memcg)) { 6961 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6962 page_counter_uncharge(&memcg->swap, nr_pages); 6963 else 6964 page_counter_uncharge(&memcg->memsw, nr_pages); 6965 } 6966 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 6967 mem_cgroup_id_put_many(memcg, nr_pages); 6968 } 6969 rcu_read_unlock(); 6970 } 6971 6972 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 6973 { 6974 long nr_swap_pages = get_nr_swap_pages(); 6975 6976 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6977 return nr_swap_pages; 6978 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 6979 nr_swap_pages = min_t(long, nr_swap_pages, 6980 READ_ONCE(memcg->swap.max) - 6981 page_counter_read(&memcg->swap)); 6982 return nr_swap_pages; 6983 } 6984 6985 bool mem_cgroup_swap_full(struct page *page) 6986 { 6987 struct mem_cgroup *memcg; 6988 6989 VM_BUG_ON_PAGE(!PageLocked(page), page); 6990 6991 if (vm_swap_full()) 6992 return true; 6993 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6994 return false; 6995 6996 memcg = page->mem_cgroup; 6997 if (!memcg) 6998 return false; 6999 7000 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 7001 if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) 7002 return true; 7003 7004 return false; 7005 } 7006 
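
/*
 * An illustrative worked example of the two helpers above; the numbers are
 * hypothetical and only show how the hierarchy walk combines per-memcg swap
 * limits with the global swap state.
 *
 * Assume 8G of free swap system-wide and a hierarchy root/a/b, where
 * a's swap.max = 2G, a's swap usage = 1.5G, and b is unlimited:
 *
 *   mem_cgroup_get_nr_swap_pages(b)
 *     = min(8G, b: (unlimited - usage), a: (2G - 1.5G))
 *     = 0.5G worth of pages
 *
 *   mem_cgroup_swap_full(page in b) is true, because for ancestor a
 *     1.5G * 2 >= 2G, i.e. a has used at least half of its allowed swap
 *     (or earlier, if vm_swap_full() reports the global swap as full).
 */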
/* Remember the swapaccount= boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata;
#endif

static int __init enable_swap_account(char *s)
{
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);

static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}

static int swap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}

static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->swap.max, max);

	return nbytes;
}

static int swap_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "max %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
	seq_printf(m, "fail %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));

	return 0;
}

static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ }	/* terminate */
};

static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

static int __init mem_cgroup_swap_init(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account) {
		do_swap_account = 1;
		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
					       swap_files));
		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
						  memsw_cgroup_files));
	}
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_MEMCG_SWAP */
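
/*
 * Userspace usage sketch (not kernel code). The memory_files[] and
 * swap_files[] entries above appear on cgroup v2 as memory.* and
 * memory.swap.* files, and the write handlers accept either a byte count
 * (page_counter_memparse() also understands K/M/G suffixes) or "max".
 * The mount point and group name below are assumptions for illustration;
 * memory.swap.* is only present when swap accounting is enabled.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int cg_write(const char *file, const char *val)
 *	{
 *		char path[256];
 *		int fd, ret;
 *
 *		snprintf(path, sizeof(path),
 *			 "/sys/fs/cgroup/example/%s", file);
 *		fd = open(path, O_WRONLY);
 *		if (fd < 0)
 *			return -1;
 *		ret = write(fd, val, strlen(val)) < 0 ? -1 : 0;
 *		close(fd);
 *		return ret;
 *	}
 *
 *	int main(void)
 *	{
 *		cg_write("memory.high", "512M");     // reclaim above 512M
 *		cg_write("memory.max", "1G");        // hard limit, may OOM
 *		cg_write("memory.swap.max", "0");    // no additional swap
 *		return 0;
 *	}
 */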