1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* memcontrol.c - Memory Controller 3 * 4 * Copyright IBM Corporation, 2007 5 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 6 * 7 * Copyright 2007 OpenVZ SWsoft Inc 8 * Author: Pavel Emelianov <xemul@openvz.org> 9 * 10 * Memory thresholds 11 * Copyright (C) 2009 Nokia Corporation 12 * Author: Kirill A. Shutemov 13 * 14 * Kernel Memory Controller 15 * Copyright (C) 2012 Parallels Inc. and Google Inc. 16 * Authors: Glauber Costa and Suleiman Souhlal 17 * 18 * Native page reclaim 19 * Charge lifetime sanitation 20 * Lockless page tracking & accounting 21 * Unified hierarchy configuration model 22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner 23 */ 24 25 #include <linux/page_counter.h> 26 #include <linux/memcontrol.h> 27 #include <linux/cgroup.h> 28 #include <linux/pagewalk.h> 29 #include <linux/sched/mm.h> 30 #include <linux/shmem_fs.h> 31 #include <linux/hugetlb.h> 32 #include <linux/pagemap.h> 33 #include <linux/vm_event_item.h> 34 #include <linux/smp.h> 35 #include <linux/page-flags.h> 36 #include <linux/backing-dev.h> 37 #include <linux/bit_spinlock.h> 38 #include <linux/rcupdate.h> 39 #include <linux/limits.h> 40 #include <linux/export.h> 41 #include <linux/mutex.h> 42 #include <linux/rbtree.h> 43 #include <linux/slab.h> 44 #include <linux/swap.h> 45 #include <linux/swapops.h> 46 #include <linux/spinlock.h> 47 #include <linux/eventfd.h> 48 #include <linux/poll.h> 49 #include <linux/sort.h> 50 #include <linux/fs.h> 51 #include <linux/seq_file.h> 52 #include <linux/vmpressure.h> 53 #include <linux/mm_inline.h> 54 #include <linux/swap_cgroup.h> 55 #include <linux/cpu.h> 56 #include <linux/oom.h> 57 #include <linux/lockdep.h> 58 #include <linux/file.h> 59 #include <linux/tracehook.h> 60 #include <linux/psi.h> 61 #include <linux/seq_buf.h> 62 #include "internal.h" 63 #include <net/sock.h> 64 #include <net/ip.h> 65 #include "slab.h" 66 67 #include <linux/uaccess.h> 68 69 #include <trace/events/vmscan.h> 70 71 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 72 EXPORT_SYMBOL(memory_cgrp_subsys); 73 74 struct mem_cgroup *root_mem_cgroup __read_mostly; 75 76 /* Socket memory accounting disabled? */ 77 static bool cgroup_memory_nosocket; 78 79 /* Kernel memory accounting disabled? */ 80 static bool cgroup_memory_nokmem; 81 82 /* Whether the swap controller is active */ 83 #ifdef CONFIG_MEMCG_SWAP 84 bool cgroup_memory_noswap __read_mostly; 85 #else 86 #define cgroup_memory_noswap 1 87 #endif 88 89 #ifdef CONFIG_CGROUP_WRITEBACK 90 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); 91 #endif 92 93 /* Whether legacy memory+swap accounting is active */ 94 static bool do_memsw_account(void) 95 { 96 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; 97 } 98 99 #define THRESHOLDS_EVENTS_TARGET 128 100 #define SOFTLIMIT_EVENTS_TARGET 1024 101 102 /* 103 * Cgroups above their limits are maintained in a RB-Tree, independent of 104 * their hierarchy representation 105 */ 106 107 struct mem_cgroup_tree_per_node { 108 struct rb_root rb_root; 109 struct rb_node *rb_rightmost; 110 spinlock_t lock; 111 }; 112 113 struct mem_cgroup_tree { 114 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 115 }; 116 117 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 118 119 /* for OOM */ 120 struct mem_cgroup_eventfd_list { 121 struct list_head list; 122 struct eventfd_ctx *eventfd; 123 }; 124 125 /* 126 * cgroup_event represents events which userspace want to receive. 
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or when the cgroup is removed. This callback must be
	 * set if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
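
/*
 * Illustrative example (not in the original file): MEMFILE_PRIVATE()
 * packs a res_type into the upper 16 bits of cft->private and a
 * resource attribute into the lower 16 bits, so a single integer
 * identifies both. Assuming an attribute constant such as RES_LIMIT,
 * a legacy control file could be declared with
 *
 *	.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
 *
 * and its read/write handlers would recover the two halves with
 * MEMFILE_TYPE() and MEMFILE_ATTR().
 */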

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

static inline bool should_force_charge(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	struct mem_cgroup *memcg;
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * As a result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	spin_lock_irqsave(&css_set_lock, flags);
	memcg = obj_cgroup_memcg(objcg);
	if (nr_pages)
		__memcg_kmem_uncharge(memcg, nr_pages);
	list_del(&objcg->list);
	mem_cgroup_put(memcg);
	spin_unlock_irqrestore(&css_set_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&css_set_lock);

	/* Move active objcg to the parent's list */
	xchg(&objcg->memcg, parent);
	css_get(&parent->css);
	list_add(&objcg->list, &parent->objcg_list);

	/* Move already reparented objcgs to the parent's list */
	list_for_each_entry(iter, &memcg->objcg_list, list) {
		css_get(&parent->css);
		xchg(&iter->memcg, parent);
		css_put(&memcg->css);
	}
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&css_set_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * This will be used as a shrinker list's index.
 * The main reason for not using cgroup id for this:
 * this works better in sparse environments, where we have a lot of memcgs
 * but only a few are kmem-limited. Also, if we had, for instance, 200
 * memcgs and only the 200th were kmem-limited, we would need a 200-entry
 * array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids.
It 351 * will double each time we have to increase it. 352 */ 353 static DEFINE_IDA(memcg_cache_ida); 354 int memcg_nr_cache_ids; 355 356 /* Protects memcg_nr_cache_ids */ 357 static DECLARE_RWSEM(memcg_cache_ids_sem); 358 359 void memcg_get_cache_ids(void) 360 { 361 down_read(&memcg_cache_ids_sem); 362 } 363 364 void memcg_put_cache_ids(void) 365 { 366 up_read(&memcg_cache_ids_sem); 367 } 368 369 /* 370 * MIN_SIZE is different than 1, because we would like to avoid going through 371 * the alloc/free process all the time. In a small machine, 4 kmem-limited 372 * cgroups is a reasonable guess. In the future, it could be a parameter or 373 * tunable, but that is strictly not necessary. 374 * 375 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 376 * this constant directly from cgroup, but it is understandable that this is 377 * better kept as an internal representation in cgroup.c. In any case, the 378 * cgrp_id space is not getting any smaller, and we don't have to necessarily 379 * increase ours as well if it increases. 380 */ 381 #define MEMCG_CACHES_MIN_SIZE 4 382 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 383 384 /* 385 * A lot of the calls to the cache allocation functions are expected to be 386 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are 387 * conditional to this static branch, we'll have to allow modules that does 388 * kmem_cache_alloc and the such to see this symbol as well 389 */ 390 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); 391 EXPORT_SYMBOL(memcg_kmem_enabled_key); 392 #endif 393 394 static int memcg_shrinker_map_size; 395 static DEFINE_MUTEX(memcg_shrinker_map_mutex); 396 397 static void memcg_free_shrinker_map_rcu(struct rcu_head *head) 398 { 399 kvfree(container_of(head, struct memcg_shrinker_map, rcu)); 400 } 401 402 static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, 403 int size, int old_size) 404 { 405 struct memcg_shrinker_map *new, *old; 406 int nid; 407 408 lockdep_assert_held(&memcg_shrinker_map_mutex); 409 410 for_each_node(nid) { 411 old = rcu_dereference_protected( 412 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); 413 /* Not yet online memcg */ 414 if (!old) 415 return 0; 416 417 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); 418 if (!new) 419 return -ENOMEM; 420 421 /* Set all old bits, clear all new bits */ 422 memset(new->map, (int)0xff, old_size); 423 memset((void *)new->map + old_size, 0, size - old_size); 424 425 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); 426 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); 427 } 428 429 return 0; 430 } 431 432 static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) 433 { 434 struct mem_cgroup_per_node *pn; 435 struct memcg_shrinker_map *map; 436 int nid; 437 438 if (mem_cgroup_is_root(memcg)) 439 return; 440 441 for_each_node(nid) { 442 pn = mem_cgroup_nodeinfo(memcg, nid); 443 map = rcu_dereference_protected(pn->shrinker_map, true); 444 if (map) 445 kvfree(map); 446 rcu_assign_pointer(pn->shrinker_map, NULL); 447 } 448 } 449 450 static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) 451 { 452 struct memcg_shrinker_map *map; 453 int nid, size, ret = 0; 454 455 if (mem_cgroup_is_root(memcg)) 456 return 0; 457 458 mutex_lock(&memcg_shrinker_map_mutex); 459 size = memcg_shrinker_map_size; 460 for_each_node(nid) { 461 map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); 462 if (!map) { 463 memcg_free_shrinker_maps(memcg); 464 ret = -ENOMEM; 465 break; 466 } 467 
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); 468 } 469 mutex_unlock(&memcg_shrinker_map_mutex); 470 471 return ret; 472 } 473 474 int memcg_expand_shrinker_maps(int new_id) 475 { 476 int size, old_size, ret = 0; 477 struct mem_cgroup *memcg; 478 479 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); 480 old_size = memcg_shrinker_map_size; 481 if (size <= old_size) 482 return 0; 483 484 mutex_lock(&memcg_shrinker_map_mutex); 485 if (!root_mem_cgroup) 486 goto unlock; 487 488 for_each_mem_cgroup(memcg) { 489 if (mem_cgroup_is_root(memcg)) 490 continue; 491 ret = memcg_expand_one_shrinker_map(memcg, size, old_size); 492 if (ret) { 493 mem_cgroup_iter_break(NULL, memcg); 494 goto unlock; 495 } 496 } 497 unlock: 498 if (!ret) 499 memcg_shrinker_map_size = size; 500 mutex_unlock(&memcg_shrinker_map_mutex); 501 return ret; 502 } 503 504 void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) 505 { 506 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { 507 struct memcg_shrinker_map *map; 508 509 rcu_read_lock(); 510 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); 511 /* Pairs with smp mb in shrink_slab() */ 512 smp_mb__before_atomic(); 513 set_bit(shrinker_id, map->map); 514 rcu_read_unlock(); 515 } 516 } 517 518 /** 519 * mem_cgroup_css_from_page - css of the memcg associated with a page 520 * @page: page of interest 521 * 522 * If memcg is bound to the default hierarchy, css of the memcg associated 523 * with @page is returned. The returned css remains associated with @page 524 * until it is released. 525 * 526 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup 527 * is returned. 528 */ 529 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) 530 { 531 struct mem_cgroup *memcg; 532 533 memcg = page->mem_cgroup; 534 535 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 536 memcg = root_mem_cgroup; 537 538 return &memcg->css; 539 } 540 541 /** 542 * page_cgroup_ino - return inode number of the memcg a page is charged to 543 * @page: the page 544 * 545 * Look up the closest online ancestor of the memory cgroup @page is charged to 546 * and return its inode number or 0 if @page is not charged to any cgroup. It 547 * is safe to call this function without holding a reference to @page. 548 * 549 * Note, this function is inherently racy, because there is nothing to prevent 550 * the cgroup inode from getting torn down and potentially reallocated a moment 551 * after page_cgroup_ino() returns, so it only should be used by callers that 552 * do not care (such as procfs interfaces). 553 */ 554 ino_t page_cgroup_ino(struct page *page) 555 { 556 struct mem_cgroup *memcg; 557 unsigned long ino = 0; 558 559 rcu_read_lock(); 560 memcg = page->mem_cgroup; 561 562 /* 563 * The lowest bit set means that memcg isn't a valid 564 * memcg pointer, but a obj_cgroups pointer. 565 * In this case the page is shared and doesn't belong 566 * to any specific memory cgroup. 
567 */ 568 if ((unsigned long) memcg & 0x1UL) 569 memcg = NULL; 570 571 while (memcg && !(memcg->css.flags & CSS_ONLINE)) 572 memcg = parent_mem_cgroup(memcg); 573 if (memcg) 574 ino = cgroup_ino(memcg->css.cgroup); 575 rcu_read_unlock(); 576 return ino; 577 } 578 579 static struct mem_cgroup_per_node * 580 mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) 581 { 582 int nid = page_to_nid(page); 583 584 return memcg->nodeinfo[nid]; 585 } 586 587 static struct mem_cgroup_tree_per_node * 588 soft_limit_tree_node(int nid) 589 { 590 return soft_limit_tree.rb_tree_per_node[nid]; 591 } 592 593 static struct mem_cgroup_tree_per_node * 594 soft_limit_tree_from_page(struct page *page) 595 { 596 int nid = page_to_nid(page); 597 598 return soft_limit_tree.rb_tree_per_node[nid]; 599 } 600 601 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, 602 struct mem_cgroup_tree_per_node *mctz, 603 unsigned long new_usage_in_excess) 604 { 605 struct rb_node **p = &mctz->rb_root.rb_node; 606 struct rb_node *parent = NULL; 607 struct mem_cgroup_per_node *mz_node; 608 bool rightmost = true; 609 610 if (mz->on_tree) 611 return; 612 613 mz->usage_in_excess = new_usage_in_excess; 614 if (!mz->usage_in_excess) 615 return; 616 while (*p) { 617 parent = *p; 618 mz_node = rb_entry(parent, struct mem_cgroup_per_node, 619 tree_node); 620 if (mz->usage_in_excess < mz_node->usage_in_excess) { 621 p = &(*p)->rb_left; 622 rightmost = false; 623 } 624 625 /* 626 * We can't avoid mem cgroups that are over their soft 627 * limit by the same amount 628 */ 629 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 630 p = &(*p)->rb_right; 631 } 632 633 if (rightmost) 634 mctz->rb_rightmost = &mz->tree_node; 635 636 rb_link_node(&mz->tree_node, parent, p); 637 rb_insert_color(&mz->tree_node, &mctz->rb_root); 638 mz->on_tree = true; 639 } 640 641 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 642 struct mem_cgroup_tree_per_node *mctz) 643 { 644 if (!mz->on_tree) 645 return; 646 647 if (&mz->tree_node == mctz->rb_rightmost) 648 mctz->rb_rightmost = rb_prev(&mz->tree_node); 649 650 rb_erase(&mz->tree_node, &mctz->rb_root); 651 mz->on_tree = false; 652 } 653 654 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 655 struct mem_cgroup_tree_per_node *mctz) 656 { 657 unsigned long flags; 658 659 spin_lock_irqsave(&mctz->lock, flags); 660 __mem_cgroup_remove_exceeded(mz, mctz); 661 spin_unlock_irqrestore(&mctz->lock, flags); 662 } 663 664 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 665 { 666 unsigned long nr_pages = page_counter_read(&memcg->memory); 667 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 668 unsigned long excess = 0; 669 670 if (nr_pages > soft_limit) 671 excess = nr_pages - soft_limit; 672 673 return excess; 674 } 675 676 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 677 { 678 unsigned long excess; 679 struct mem_cgroup_per_node *mz; 680 struct mem_cgroup_tree_per_node *mctz; 681 682 mctz = soft_limit_tree_from_page(page); 683 if (!mctz) 684 return; 685 /* 686 * Necessary to update all ancestors when hierarchy is used. 687 * because their event counter is not touched. 688 */ 689 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 690 mz = mem_cgroup_page_nodeinfo(memcg, page); 691 excess = soft_limit_excess(memcg); 692 /* 693 * We have to update the tree if mz is on RB-tree or 694 * mem is over its softlimit. 
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}
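
/*
 * Illustrative note (not in the original file): the per-node soft limit
 * tree above is keyed by usage_in_excess, and rb_rightmost tracks the
 * entry with the largest excess. Soft limit reclaim therefore always
 * picks the memcg that is furthest over its soft limit and re-inserts
 * it with an updated key once some of its pages have been reclaimed.
 */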

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	long x, threshold = MEMCG_CHARGE_BATCH;

	if (mem_cgroup_disabled())
		return;

	if (memcg_stat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
	if (unlikely(abs(x) > threshold)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmstats[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
}

static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
	struct mem_cgroup *parent;

	parent = parent_mem_cgroup(pn->memcg);
	if (!parent)
		return NULL;
	return mem_cgroup_nodeinfo(parent, nid);
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	long x, threshold = MEMCG_CHARGE_BATCH;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/* Update memcg */
	__mod_memcg_state(memcg, idx, val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);

	if (vmstat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
	if (unlikely(abs(x) > threshold)) {
		pg_data_t *pgdat = lruvec_pgdat(lruvec);
		struct mem_cgroup_per_node *pi;

		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
			atomic_long_add(x, &pi->lruvec_stat[idx]);
		x = 0;
	}
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled())
		__mod_memcg_lruvec_state(lruvec, idx, val);
}

void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);

	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg || memcg == root_mem_cgroup) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

void mod_memcg_obj_state(void *p, int idx, int val)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);
	if (memcg)
		mod_memcg_state(memcg, idx, val);
	rcu_read_unlock();
}
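
/*
 * Illustrative note (not in the original file): the updaters above
 * accumulate deltas in per-cpu counters and only fold them into the
 * atomic hierarchical counters once the pending delta exceeds
 * MEMCG_CHARGE_BATCH (scaled by PAGE_SIZE for byte-sized items).
 * Readers of the hierarchical counters can therefore lag the true
 * value by up to one batch per CPU.
 */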

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	unsigned long x;

	if (mem_cgroup_disabled())
		return;

	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->events[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmevents[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
}

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	return atomic_long_read(&memcg->vmevents[event]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_local->events[event], cpu);
	return x;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
	}
}
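
/*
 * Illustrative note (not in the original file): nr_page_events counts
 * charged and uncharged pages per CPU, and the targets act as per-CPU
 * watermarks. With THRESHOLDS_EVENTS_TARGET == 128 and
 * SOFTLIMIT_EVENTS_TARGET == 1024, memcg_check_events() re-evaluates
 * the userspace thresholds roughly once every 128 page events on a
 * CPU, and the soft limit tree roughly once every 1024.
 */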

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtains a reference on mm->memcg and returns it if successful. Otherwise
 * root_mem_cgroup is returned. However, if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Obtains a reference on page->memcg and returns it if successful. Otherwise
 * root_mem_cgroup is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	/* Page should not get uncharged and freed memcg under us. */
	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
		memcg = root_mem_cgroup;
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);

/**
 * If current->active_memcg is non-NULL, do not fall back to current->mm->memcg.
 */
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	if (unlikely(current->active_memcg)) {
		struct mem_cgroup *memcg;

		rcu_read_lock();
		/* current->active_memcg must hold a ref. */
		if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
			memcg = root_mem_cgroup;
		else
			memcg = current->active_memcg;
		rcu_read_unlock();
		return memcg;
	}
	return get_mem_cgroup_from_mm(current->mm);
}
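
/*
 * Illustrative note (not in the original file): the get_mem_cgroup_from_*()
 * helpers above return with a css reference held (taken via css_tryget()),
 * so callers are expected to release it, e.g. with css_put(&memcg->css),
 * once they are done with the returned memcg.
 */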

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter;

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference. The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(from, nid);
		iter = &mz->iter;
		cmpxchg(&iter->position, dead_memcg, NULL);
	}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from the cgroup root separately.
	 */
	if (last != root_mem_cgroup)
		__invalidate_reclaim_iterators(root_mem_cgroup,
					       dead_memcg);
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
1270 * 1271 * This function must not be called for the root memory cgroup. 1272 */ 1273 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, 1274 int (*fn)(struct task_struct *, void *), void *arg) 1275 { 1276 struct mem_cgroup *iter; 1277 int ret = 0; 1278 1279 BUG_ON(memcg == root_mem_cgroup); 1280 1281 for_each_mem_cgroup_tree(iter, memcg) { 1282 struct css_task_iter it; 1283 struct task_struct *task; 1284 1285 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); 1286 while (!ret && (task = css_task_iter_next(&it))) 1287 ret = fn(task, arg); 1288 css_task_iter_end(&it); 1289 if (ret) { 1290 mem_cgroup_iter_break(memcg, iter); 1291 break; 1292 } 1293 } 1294 return ret; 1295 } 1296 1297 /** 1298 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 1299 * @page: the page 1300 * @pgdat: pgdat of the page 1301 * 1302 * This function relies on page->mem_cgroup being stable - see the 1303 * access rules in commit_charge(). 1304 */ 1305 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) 1306 { 1307 struct mem_cgroup_per_node *mz; 1308 struct mem_cgroup *memcg; 1309 struct lruvec *lruvec; 1310 1311 if (mem_cgroup_disabled()) { 1312 lruvec = &pgdat->__lruvec; 1313 goto out; 1314 } 1315 1316 memcg = page->mem_cgroup; 1317 /* 1318 * Swapcache readahead pages are added to the LRU - and 1319 * possibly migrated - before they are charged. 1320 */ 1321 if (!memcg) 1322 memcg = root_mem_cgroup; 1323 1324 mz = mem_cgroup_page_nodeinfo(memcg, page); 1325 lruvec = &mz->lruvec; 1326 out: 1327 /* 1328 * Since a node can be onlined after the mem_cgroup was created, 1329 * we have to be prepared to initialize lruvec->zone here; 1330 * and if offlined then reonlined, we need to reinitialize it. 1331 */ 1332 if (unlikely(lruvec->pgdat != pgdat)) 1333 lruvec->pgdat = pgdat; 1334 return lruvec; 1335 } 1336 1337 /** 1338 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1339 * @lruvec: mem_cgroup per zone lru vector 1340 * @lru: index of lru list the page is sitting on 1341 * @zid: zone id of the accounted pages 1342 * @nr_pages: positive when adding or negative when removing 1343 * 1344 * This function must be called under lru_lock, just before a page is added 1345 * to or just after a page is removed from an lru list (that ordering being 1346 * so as to allow it to check that lru_size 0 is consistent with list_empty). 1347 */ 1348 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1349 int zid, int nr_pages) 1350 { 1351 struct mem_cgroup_per_node *mz; 1352 unsigned long *lru_size; 1353 long size; 1354 1355 if (mem_cgroup_disabled()) 1356 return; 1357 1358 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 1359 lru_size = &mz->lru_zone_size[zid][lru]; 1360 1361 if (nr_pages < 0) 1362 *lru_size += nr_pages; 1363 1364 size = *lru_size; 1365 if (WARN_ONCE(size < 0, 1366 "%s(%p, %d, %d): lru_size %ld\n", 1367 __func__, lruvec, lru, nr_pages, size)) { 1368 VM_BUG_ON(1); 1369 *lru_size = 0; 1370 } 1371 1372 if (nr_pages > 0) 1373 *lru_size += nr_pages; 1374 } 1375 1376 /** 1377 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1378 * @memcg: the memory cgroup 1379 * 1380 * Returns the maximum amount of memory @mem can be charged with, in 1381 * pages. 
1382 */ 1383 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1384 { 1385 unsigned long margin = 0; 1386 unsigned long count; 1387 unsigned long limit; 1388 1389 count = page_counter_read(&memcg->memory); 1390 limit = READ_ONCE(memcg->memory.max); 1391 if (count < limit) 1392 margin = limit - count; 1393 1394 if (do_memsw_account()) { 1395 count = page_counter_read(&memcg->memsw); 1396 limit = READ_ONCE(memcg->memsw.max); 1397 if (count < limit) 1398 margin = min(margin, limit - count); 1399 else 1400 margin = 0; 1401 } 1402 1403 return margin; 1404 } 1405 1406 /* 1407 * A routine for checking "mem" is under move_account() or not. 1408 * 1409 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1410 * moving cgroups. This is for waiting at high-memory pressure 1411 * caused by "move". 1412 */ 1413 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1414 { 1415 struct mem_cgroup *from; 1416 struct mem_cgroup *to; 1417 bool ret = false; 1418 /* 1419 * Unlike task_move routines, we access mc.to, mc.from not under 1420 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1421 */ 1422 spin_lock(&mc.lock); 1423 from = mc.from; 1424 to = mc.to; 1425 if (!from) 1426 goto unlock; 1427 1428 ret = mem_cgroup_is_descendant(from, memcg) || 1429 mem_cgroup_is_descendant(to, memcg); 1430 unlock: 1431 spin_unlock(&mc.lock); 1432 return ret; 1433 } 1434 1435 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1436 { 1437 if (mc.moving_task && current != mc.moving_task) { 1438 if (mem_cgroup_under_move(memcg)) { 1439 DEFINE_WAIT(wait); 1440 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1441 /* moving charge context might have finished. */ 1442 if (mc.moving_task) 1443 schedule(); 1444 finish_wait(&mc.waitq, &wait); 1445 return true; 1446 } 1447 } 1448 return false; 1449 } 1450 1451 struct memory_stat { 1452 const char *name; 1453 unsigned int ratio; 1454 unsigned int idx; 1455 }; 1456 1457 static struct memory_stat memory_stats[] = { 1458 { "anon", PAGE_SIZE, NR_ANON_MAPPED }, 1459 { "file", PAGE_SIZE, NR_FILE_PAGES }, 1460 { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, 1461 { "percpu", 1, MEMCG_PERCPU_B }, 1462 { "sock", PAGE_SIZE, MEMCG_SOCK }, 1463 { "shmem", PAGE_SIZE, NR_SHMEM }, 1464 { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED }, 1465 { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, 1466 { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, 1467 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1468 /* 1469 * The ratio will be initialized in memory_stats_init(). Because 1470 * on some architectures, the macro of HPAGE_PMD_SIZE is not 1471 * constant(e.g. powerpc). 1472 */ 1473 { "anon_thp", 0, NR_ANON_THPS }, 1474 #endif 1475 { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, 1476 { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, 1477 { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE }, 1478 { "active_file", PAGE_SIZE, NR_ACTIVE_FILE }, 1479 { "unevictable", PAGE_SIZE, NR_UNEVICTABLE }, 1480 1481 /* 1482 * Note: The slab_reclaimable and slab_unreclaimable must be 1483 * together and slab_reclaimable must be in front. 
1484 */ 1485 { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B }, 1486 { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B }, 1487 1488 /* The memory events */ 1489 { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON }, 1490 { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE }, 1491 { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON }, 1492 { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE }, 1493 { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON }, 1494 { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE }, 1495 { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, 1496 }; 1497 1498 static int __init memory_stats_init(void) 1499 { 1500 int i; 1501 1502 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 1503 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1504 if (memory_stats[i].idx == NR_ANON_THPS) 1505 memory_stats[i].ratio = HPAGE_PMD_SIZE; 1506 #endif 1507 VM_BUG_ON(!memory_stats[i].ratio); 1508 VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); 1509 } 1510 1511 return 0; 1512 } 1513 pure_initcall(memory_stats_init); 1514 1515 static char *memory_stat_format(struct mem_cgroup *memcg) 1516 { 1517 struct seq_buf s; 1518 int i; 1519 1520 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); 1521 if (!s.buffer) 1522 return NULL; 1523 1524 /* 1525 * Provide statistics on the state of the memory subsystem as 1526 * well as cumulative event counters that show past behavior. 1527 * 1528 * This list is ordered following a combination of these gradients: 1529 * 1) generic big picture -> specifics and details 1530 * 2) reflecting userspace activity -> reflecting kernel heuristics 1531 * 1532 * Current memory state: 1533 */ 1534 1535 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 1536 u64 size; 1537 1538 size = memcg_page_state(memcg, memory_stats[i].idx); 1539 size *= memory_stats[i].ratio; 1540 seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); 1541 1542 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { 1543 size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + 1544 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); 1545 seq_buf_printf(&s, "slab %llu\n", size); 1546 } 1547 } 1548 1549 /* Accumulated memory events */ 1550 1551 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT), 1552 memcg_events(memcg, PGFAULT)); 1553 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), 1554 memcg_events(memcg, PGMAJFAULT)); 1555 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL), 1556 memcg_events(memcg, PGREFILL)); 1557 seq_buf_printf(&s, "pgscan %lu\n", 1558 memcg_events(memcg, PGSCAN_KSWAPD) + 1559 memcg_events(memcg, PGSCAN_DIRECT)); 1560 seq_buf_printf(&s, "pgsteal %lu\n", 1561 memcg_events(memcg, PGSTEAL_KSWAPD) + 1562 memcg_events(memcg, PGSTEAL_DIRECT)); 1563 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE), 1564 memcg_events(memcg, PGACTIVATE)); 1565 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE), 1566 memcg_events(memcg, PGDEACTIVATE)); 1567 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE), 1568 memcg_events(memcg, PGLAZYFREE)); 1569 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED), 1570 memcg_events(memcg, PGLAZYFREED)); 1571 1572 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1573 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), 1574 memcg_events(memcg, THP_FAULT_ALLOC)); 1575 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC), 1576 memcg_events(memcg, THP_COLLAPSE_ALLOC)); 1577 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1578 1579 /* The above should easily fit into one page */ 1580 
WARN_ON_ONCE(seq_buf_has_overflowed(&s)); 1581 1582 return s.buffer; 1583 } 1584 1585 #define K(x) ((x) << (PAGE_SHIFT-10)) 1586 /** 1587 * mem_cgroup_print_oom_context: Print OOM information relevant to 1588 * memory controller. 1589 * @memcg: The memory cgroup that went over limit 1590 * @p: Task that is going to be killed 1591 * 1592 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1593 * enabled 1594 */ 1595 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) 1596 { 1597 rcu_read_lock(); 1598 1599 if (memcg) { 1600 pr_cont(",oom_memcg="); 1601 pr_cont_cgroup_path(memcg->css.cgroup); 1602 } else 1603 pr_cont(",global_oom"); 1604 if (p) { 1605 pr_cont(",task_memcg="); 1606 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1607 } 1608 rcu_read_unlock(); 1609 } 1610 1611 /** 1612 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to 1613 * memory controller. 1614 * @memcg: The memory cgroup that went over limit 1615 */ 1616 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) 1617 { 1618 char *buf; 1619 1620 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1621 K((u64)page_counter_read(&memcg->memory)), 1622 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); 1623 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 1624 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", 1625 K((u64)page_counter_read(&memcg->swap)), 1626 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); 1627 else { 1628 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1629 K((u64)page_counter_read(&memcg->memsw)), 1630 K((u64)memcg->memsw.max), memcg->memsw.failcnt); 1631 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1632 K((u64)page_counter_read(&memcg->kmem)), 1633 K((u64)memcg->kmem.max), memcg->kmem.failcnt); 1634 } 1635 1636 pr_info("Memory cgroup stats for "); 1637 pr_cont_cgroup_path(memcg->css.cgroup); 1638 pr_cont(":"); 1639 buf = memory_stat_format(memcg); 1640 if (!buf) 1641 return; 1642 pr_info("%s", buf); 1643 kfree(buf); 1644 } 1645 1646 /* 1647 * Return the memory (and swap, if configured) limit for a memcg. 1648 */ 1649 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) 1650 { 1651 unsigned long max = READ_ONCE(memcg->memory.max); 1652 1653 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 1654 if (mem_cgroup_swappiness(memcg)) 1655 max += min(READ_ONCE(memcg->swap.max), 1656 (unsigned long)total_swap_pages); 1657 } else { /* v1 */ 1658 if (mem_cgroup_swappiness(memcg)) { 1659 /* Calculate swap excess capacity from memsw limit */ 1660 unsigned long swap = READ_ONCE(memcg->memsw.max) - max; 1661 1662 max += min(swap, (unsigned long)total_swap_pages); 1663 } 1664 } 1665 return max; 1666 } 1667 1668 unsigned long mem_cgroup_size(struct mem_cgroup *memcg) 1669 { 1670 return page_counter_read(&memcg->memory); 1671 } 1672 1673 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1674 int order) 1675 { 1676 struct oom_control oc = { 1677 .zonelist = NULL, 1678 .nodemask = NULL, 1679 .memcg = memcg, 1680 .gfp_mask = gfp_mask, 1681 .order = order, 1682 }; 1683 bool ret = true; 1684 1685 if (mutex_lock_killable(&oom_lock)) 1686 return true; 1687 1688 if (mem_cgroup_margin(memcg) >= (1 << order)) 1689 goto unlock; 1690 1691 /* 1692 * A few threads which were not waiting at mutex_lock_killable() can 1693 * fail to bail out. Therefore, check again after holding oom_lock. 
	 */
	ret = should_force_charge() || out_of_memory(&oc);

unlock:
	mutex_unlock(&oom_lock);
	return ret;
}

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we
				 * don't reclaim too much, nor so small that
				 * we keep coming back to reclaim from this
				 * cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
					pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * This subtree of our hierarchy is already locked,
			 * so we cannot take the lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree, so we have
		 * to clean up what we set up, up to the failing subtree.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom.
1831 */ 1832 spin_lock(&memcg_oom_lock); 1833 for_each_mem_cgroup_tree(iter, memcg) 1834 if (iter->under_oom > 0) 1835 iter->under_oom--; 1836 spin_unlock(&memcg_oom_lock); 1837 } 1838 1839 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1840 1841 struct oom_wait_info { 1842 struct mem_cgroup *memcg; 1843 wait_queue_entry_t wait; 1844 }; 1845 1846 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1847 unsigned mode, int sync, void *arg) 1848 { 1849 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1850 struct mem_cgroup *oom_wait_memcg; 1851 struct oom_wait_info *oom_wait_info; 1852 1853 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1854 oom_wait_memcg = oom_wait_info->memcg; 1855 1856 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1857 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1858 return 0; 1859 return autoremove_wake_function(wait, mode, sync, arg); 1860 } 1861 1862 static void memcg_oom_recover(struct mem_cgroup *memcg) 1863 { 1864 /* 1865 * For the following lockless ->under_oom test, the only required 1866 * guarantee is that it must see the state asserted by an OOM when 1867 * this function is called as a result of userland actions 1868 * triggered by the notification of the OOM. This is trivially 1869 * achieved by invoking mem_cgroup_mark_under_oom() before 1870 * triggering notification. 1871 */ 1872 if (memcg && memcg->under_oom) 1873 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1874 } 1875 1876 enum oom_status { 1877 OOM_SUCCESS, 1878 OOM_FAILED, 1879 OOM_ASYNC, 1880 OOM_SKIPPED 1881 }; 1882 1883 static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1884 { 1885 enum oom_status ret; 1886 bool locked; 1887 1888 if (order > PAGE_ALLOC_COSTLY_ORDER) 1889 return OOM_SKIPPED; 1890 1891 memcg_memory_event(memcg, MEMCG_OOM); 1892 1893 /* 1894 * We are in the middle of the charge context here, so we 1895 * don't want to block when potentially sitting on a callstack 1896 * that holds all kinds of filesystem and mm locks. 1897 * 1898 * cgroup1 allows disabling the OOM killer and waiting for outside 1899 * handling until the charge can succeed; remember the context and put 1900 * the task to sleep at the end of the page fault when all locks are 1901 * released. 1902 * 1903 * On the other hand, in-kernel OOM killer allows for an async victim 1904 * memory reclaim (oom_reaper) and that means that we are not solely 1905 * relying on the oom victim to make a forward progress and we can 1906 * invoke the oom killer here. 1907 * 1908 * Please note that mem_cgroup_out_of_memory might fail to find a 1909 * victim and then we have to bail out from the charge path. 
	 */
	if (memcg->oom_kill_disable) {
		if (!current->in_user_fault)
			return OOM_SKIPPED;
		css_get(&memcg->css);
		current->memcg_in_oom = memcg;
		current->memcg_oom_gfp_mask = mask;
		current->memcg_oom_order = order;

		return OOM_ASYNC;
	}

	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);
	if (mem_cgroup_out_of_memory(memcg, mask, order))
		ret = OOM_SUCCESS;
	else
		ret = OOM_FAILED;

	if (locked)
		mem_cgroup_oom_unlock(memcg);

	return ret;
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation. Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea; instead we remember an OOM state
 * in the task, and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	if (locked && !memcg->oom_kill_disable) {
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
					 current->memcg_oom_order);
	} else {
		schedule();
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}

	if (locked) {
		mem_cgroup_oom_unlock(memcg);
		/*
		 * There is no guarantee that an OOM-lock contender
		 * sees the wakeups triggered by the OOM kill
		 * uncharges. Wake any sleepers explicitly.
		 */
		memcg_oom_recover(memcg);
	}
cleanup:
	current->memcg_in_oom = NULL;
	css_put(&memcg->css);
	return true;
}

/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all OOM-killable tasks belonging to it.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
2020 */ 2021 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 2022 struct mem_cgroup *oom_domain) 2023 { 2024 struct mem_cgroup *oom_group = NULL; 2025 struct mem_cgroup *memcg; 2026 2027 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2028 return NULL; 2029 2030 if (!oom_domain) 2031 oom_domain = root_mem_cgroup; 2032 2033 rcu_read_lock(); 2034 2035 memcg = mem_cgroup_from_task(victim); 2036 if (memcg == root_mem_cgroup) 2037 goto out; 2038 2039 /* 2040 * If the victim task has been asynchronously moved to a different 2041 * memory cgroup, we might end up killing tasks outside oom_domain. 2042 * In this case it's better to ignore memory.group.oom. 2043 */ 2044 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) 2045 goto out; 2046 2047 /* 2048 * Traverse the memory cgroup hierarchy from the victim task's 2049 * cgroup up to the OOMing cgroup (or root) to find the 2050 * highest-level memory cgroup with oom.group set. 2051 */ 2052 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 2053 if (memcg->oom_group) 2054 oom_group = memcg; 2055 2056 if (memcg == oom_domain) 2057 break; 2058 } 2059 2060 if (oom_group) 2061 css_get(&oom_group->css); 2062 out: 2063 rcu_read_unlock(); 2064 2065 return oom_group; 2066 } 2067 2068 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 2069 { 2070 pr_info("Tasks in "); 2071 pr_cont_cgroup_path(memcg->css.cgroup); 2072 pr_cont(" are going to be killed due to memory.oom.group set\n"); 2073 } 2074 2075 /** 2076 * lock_page_memcg - lock a page->mem_cgroup binding 2077 * @page: the page 2078 * 2079 * This function protects unlocked LRU pages from being moved to 2080 * another cgroup. 2081 * 2082 * It ensures lifetime of the returned memcg. Caller is responsible 2083 * for the lifetime of the page; __unlock_page_memcg() is available 2084 * when @page might get freed inside the locked section. 2085 */ 2086 struct mem_cgroup *lock_page_memcg(struct page *page) 2087 { 2088 struct page *head = compound_head(page); /* rmap on tail pages */ 2089 struct mem_cgroup *memcg; 2090 unsigned long flags; 2091 2092 /* 2093 * The RCU lock is held throughout the transaction. The fast 2094 * path can get away without acquiring the memcg->move_lock 2095 * because page moving starts with an RCU grace period. 2096 * 2097 * The RCU lock also protects the memcg from being freed when 2098 * the page state that is going to change is the only thing 2099 * preventing the page itself from being freed. E.g. writeback 2100 * doesn't hold a page reference and relies on PG_writeback to 2101 * keep off truncation, migration and so forth. 2102 */ 2103 rcu_read_lock(); 2104 2105 if (mem_cgroup_disabled()) 2106 return NULL; 2107 again: 2108 memcg = head->mem_cgroup; 2109 if (unlikely(!memcg)) 2110 return NULL; 2111 2112 if (atomic_read(&memcg->moving_account) <= 0) 2113 return memcg; 2114 2115 spin_lock_irqsave(&memcg->move_lock, flags); 2116 if (memcg != head->mem_cgroup) { 2117 spin_unlock_irqrestore(&memcg->move_lock, flags); 2118 goto again; 2119 } 2120 2121 /* 2122 * When charge migration first begins, we can have locked and 2123 * unlocked page stat updates happening concurrently. Track 2124 * the task who has the lock for unlock_page_memcg(). 2125 */ 2126 memcg->move_lock_task = current; 2127 memcg->move_lock_flags = flags; 2128 2129 return memcg; 2130 } 2131 EXPORT_SYMBOL(lock_page_memcg); 2132 2133 /** 2134 * __unlock_page_memcg - unlock and unpin a memcg 2135 * @memcg: the memcg 2136 * 2137 * Unlock and unpin a memcg returned by lock_page_memcg(). 
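 *
 * A sketch of the intended pairing, for a caller whose page state
 * update might drop the last reference to the page (illustrative
 * only):
 *
 *	memcg = lock_page_memcg(page);
 *	... update page state that may free the page ...
 *	__unlock_page_memcg(memcg);
 *
 * Callers that know @page stays alive across the section can simply
 * use unlock_page_memcg(page) instead.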
2138 */ 2139 void __unlock_page_memcg(struct mem_cgroup *memcg) 2140 { 2141 if (memcg && memcg->move_lock_task == current) { 2142 unsigned long flags = memcg->move_lock_flags; 2143 2144 memcg->move_lock_task = NULL; 2145 memcg->move_lock_flags = 0; 2146 2147 spin_unlock_irqrestore(&memcg->move_lock, flags); 2148 } 2149 2150 rcu_read_unlock(); 2151 } 2152 2153 /** 2154 * unlock_page_memcg - unlock a page->mem_cgroup binding 2155 * @page: the page 2156 */ 2157 void unlock_page_memcg(struct page *page) 2158 { 2159 struct page *head = compound_head(page); 2160 2161 __unlock_page_memcg(head->mem_cgroup); 2162 } 2163 EXPORT_SYMBOL(unlock_page_memcg); 2164 2165 struct memcg_stock_pcp { 2166 struct mem_cgroup *cached; /* this never be root cgroup */ 2167 unsigned int nr_pages; 2168 2169 #ifdef CONFIG_MEMCG_KMEM 2170 struct obj_cgroup *cached_objcg; 2171 unsigned int nr_bytes; 2172 #endif 2173 2174 struct work_struct work; 2175 unsigned long flags; 2176 #define FLUSHING_CACHED_CHARGE 0 2177 }; 2178 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2179 static DEFINE_MUTEX(percpu_charge_mutex); 2180 2181 #ifdef CONFIG_MEMCG_KMEM 2182 static void drain_obj_stock(struct memcg_stock_pcp *stock); 2183 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2184 struct mem_cgroup *root_memcg); 2185 2186 #else 2187 static inline void drain_obj_stock(struct memcg_stock_pcp *stock) 2188 { 2189 } 2190 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2191 struct mem_cgroup *root_memcg) 2192 { 2193 return false; 2194 } 2195 #endif 2196 2197 /** 2198 * consume_stock: Try to consume stocked charge on this cpu. 2199 * @memcg: memcg to consume from. 2200 * @nr_pages: how many pages to charge. 2201 * 2202 * The charges will only happen if @memcg matches the current cpu's memcg 2203 * stock, and at least @nr_pages are available in that stock. Failure to 2204 * service an allocation will refill the stock. 2205 * 2206 * returns true if successful, false otherwise. 2207 */ 2208 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2209 { 2210 struct memcg_stock_pcp *stock; 2211 unsigned long flags; 2212 bool ret = false; 2213 2214 if (nr_pages > MEMCG_CHARGE_BATCH) 2215 return ret; 2216 2217 local_irq_save(flags); 2218 2219 stock = this_cpu_ptr(&memcg_stock); 2220 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2221 stock->nr_pages -= nr_pages; 2222 ret = true; 2223 } 2224 2225 local_irq_restore(flags); 2226 2227 return ret; 2228 } 2229 2230 /* 2231 * Returns stocks cached in percpu and reset cached information. 2232 */ 2233 static void drain_stock(struct memcg_stock_pcp *stock) 2234 { 2235 struct mem_cgroup *old = stock->cached; 2236 2237 if (!old) 2238 return; 2239 2240 if (stock->nr_pages) { 2241 page_counter_uncharge(&old->memory, stock->nr_pages); 2242 if (do_memsw_account()) 2243 page_counter_uncharge(&old->memsw, stock->nr_pages); 2244 stock->nr_pages = 0; 2245 } 2246 2247 css_put(&old->css); 2248 stock->cached = NULL; 2249 } 2250 2251 static void drain_local_stock(struct work_struct *dummy) 2252 { 2253 struct memcg_stock_pcp *stock; 2254 unsigned long flags; 2255 2256 /* 2257 * The only protection from memory hotplug vs. 
drain_stock races is 2258 * that we always operate on local CPU stock here with IRQ disabled 2259 */ 2260 local_irq_save(flags); 2261 2262 stock = this_cpu_ptr(&memcg_stock); 2263 drain_obj_stock(stock); 2264 drain_stock(stock); 2265 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2266 2267 local_irq_restore(flags); 2268 } 2269 2270 /* 2271 * Cache charges(val) to local per_cpu area. 2272 * This will be consumed by consume_stock() function, later. 2273 */ 2274 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2275 { 2276 struct memcg_stock_pcp *stock; 2277 unsigned long flags; 2278 2279 local_irq_save(flags); 2280 2281 stock = this_cpu_ptr(&memcg_stock); 2282 if (stock->cached != memcg) { /* reset if necessary */ 2283 drain_stock(stock); 2284 css_get(&memcg->css); 2285 stock->cached = memcg; 2286 } 2287 stock->nr_pages += nr_pages; 2288 2289 if (stock->nr_pages > MEMCG_CHARGE_BATCH) 2290 drain_stock(stock); 2291 2292 local_irq_restore(flags); 2293 } 2294 2295 /* 2296 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2297 * of the hierarchy under it. 2298 */ 2299 static void drain_all_stock(struct mem_cgroup *root_memcg) 2300 { 2301 int cpu, curcpu; 2302 2303 /* If someone's already draining, avoid adding running more workers. */ 2304 if (!mutex_trylock(&percpu_charge_mutex)) 2305 return; 2306 /* 2307 * Notify other cpus that system-wide "drain" is running 2308 * We do not care about races with the cpu hotplug because cpu down 2309 * as well as workers from this path always operate on the local 2310 * per-cpu data. CPU up doesn't touch memcg_stock at all. 2311 */ 2312 curcpu = get_cpu(); 2313 for_each_online_cpu(cpu) { 2314 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2315 struct mem_cgroup *memcg; 2316 bool flush = false; 2317 2318 rcu_read_lock(); 2319 memcg = stock->cached; 2320 if (memcg && stock->nr_pages && 2321 mem_cgroup_is_descendant(memcg, root_memcg)) 2322 flush = true; 2323 if (obj_stock_flush_required(stock, root_memcg)) 2324 flush = true; 2325 rcu_read_unlock(); 2326 2327 if (flush && 2328 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2329 if (cpu == curcpu) 2330 drain_local_stock(&stock->work); 2331 else 2332 schedule_work_on(cpu, &stock->work); 2333 } 2334 } 2335 put_cpu(); 2336 mutex_unlock(&percpu_charge_mutex); 2337 } 2338 2339 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2340 { 2341 struct memcg_stock_pcp *stock; 2342 struct mem_cgroup *memcg, *mi; 2343 2344 stock = &per_cpu(memcg_stock, cpu); 2345 drain_stock(stock); 2346 2347 for_each_mem_cgroup(memcg) { 2348 int i; 2349 2350 for (i = 0; i < MEMCG_NR_STAT; i++) { 2351 int nid; 2352 long x; 2353 2354 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); 2355 if (x) 2356 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 2357 atomic_long_add(x, &memcg->vmstats[i]); 2358 2359 if (i >= NR_VM_NODE_STAT_ITEMS) 2360 continue; 2361 2362 for_each_node(nid) { 2363 struct mem_cgroup_per_node *pn; 2364 2365 pn = mem_cgroup_nodeinfo(memcg, nid); 2366 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); 2367 if (x) 2368 do { 2369 atomic_long_add(x, &pn->lruvec_stat[i]); 2370 } while ((pn = parent_nodeinfo(pn, nid))); 2371 } 2372 } 2373 2374 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { 2375 long x; 2376 2377 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); 2378 if (x) 2379 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 2380 atomic_long_add(x, &memcg->vmevents[i]); 2381 } 2382 } 2383 2384 return 0; 2385 } 2386 2387 static unsigned long 
reclaim_high(struct mem_cgroup *memcg,
	     unsigned int nr_pages,
	     gfp_t gfp_mask)
{
	unsigned long nr_reclaimed = 0;

	do {
		unsigned long pflags;

		if (page_counter_read(&memcg->memory) <=
		    READ_ONCE(memcg->memory.high))
			continue;

		memcg_memory_event(memcg, MEMCG_HIGH);

		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							     gfp_mask, true);
		psi_memstall_leave(&pflags);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return nr_reclaimed;
}

static void high_work_func(struct work_struct *work)
{
	struct mem_cgroup *memcg;

	memcg = container_of(work, struct mem_cgroup, high_work);
	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}

/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)

/*
 * When calculating the delay, we use these on either side of the
 * exponentiation to maintain precision and scale to a reasonable number of
 * jiffies (see the table below).
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
 *   to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially. For
 * example, with a high of 100 megabytes:
 *
 * +-------+------------------------+
 * | usage | time to allocate in ms |
 * +-------+------------------------+
 * | 100M  |                      0 |
 * | 101M  |                      6 |
 * | 102M  |                     25 |
 * | 103M  |                     57 |
 * | 104M  |                    102 |
 * | 105M  |                    159 |
 * | 106M  |                    230 |
 * | 107M  |                    313 |
 * | 108M  |                    409 |
 * | 109M  |                    518 |
 * | 110M  |                    639 |
 * | 111M  |                    774 |
 * | 112M  |                    921 |
 * | 113M  |                   1081 |
 * | 114M  |                   1254 |
 * | 115M  |                   1439 |
 * | 116M  |                   1638 |
 * | 117M  |                   1849 |
 * | 118M  |                   2000 |
 * | 119M  |                   2000 |
 * | 120M  |                   2000 |
 * +-------+------------------------+
 */
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14

static u64 calculate_overage(unsigned long usage, unsigned long high)
{
	u64 overage;

	if (usage <= high)
		return 0;

	/*
	 * Prevent division by 0 in overage calculation by acting as if
	 * it was a threshold of 1 page
	 */
	high = max(high, 1UL);

	overage = usage - high;
	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
	return div64_u64(overage, high);
}

static u64 mem_find_max_overage(struct mem_cgroup *memcg)
{
	u64 overage, max_overage = 0;

	do {
		overage = calculate_overage(page_counter_read(&memcg->memory),
					    READ_ONCE(memcg->memory.high));
		max_overage = max(overage, max_overage);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return max_overage;
}

static u64 swap_find_max_overage(struct mem_cgroup *memcg)
{
	u64 overage, max_overage = 0;

	do {
		overage = calculate_overage(page_counter_read(&memcg->swap),
					    READ_ONCE(memcg->swap.high));
		if (overage)
			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
		max_overage = max(overage, max_overage);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return max_overage;
}

/*
 * Get the number of jiffies that we should penalise a mischievous cgroup which
 * is exceeding its memory.high by checking both it and its ancestors.
 */
static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
					  unsigned int nr_pages,
					  u64 max_overage)
{
	unsigned long penalty_jiffies;

	if (!max_overage)
		return 0;

	/*
	 * We use overage compared to memory.high to calculate the number of
	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
	 * fairly lenient on small overages, and increasingly harsh when the
	 * memcg in question makes it clear that it has no intention of
	 * stopping its crazy behaviour, so we exponentially increase the
	 * delay based on overage amount.
	 */
	penalty_jiffies = max_overage * max_overage * HZ;
	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;

	/*
	 * Factor in the task's own contribution to the overage, such that four
	 * N-sized allocations are throttled approximately the same as one
	 * 4N-sized allocation.
	 *
	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
	 * larger the current charge batch is than that.
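	 *
	 * As a worked example (a sketch, numbers rounded): with memory.high
	 * at 100M and usage at 101M, calculate_overage() returns roughly
	 * (1M / 100M) << MEMCG_DELAY_PRECISION_SHIFT ~= 10485. Squaring
	 * that, multiplying by HZ and shifting right by the two shifts
	 * (34 bits in total) yields about 6ms worth of jiffies - the 101M
	 * row of the table further up. The return statement below then
	 * scales the penalty by the size of this charge batch relative to
	 * MEMCG_CHARGE_BATCH.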
2553 */ 2554 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2555 } 2556 2557 /* 2558 * Scheduled by try_charge() to be executed from the userland return path 2559 * and reclaims memory over the high limit. 2560 */ 2561 void mem_cgroup_handle_over_high(void) 2562 { 2563 unsigned long penalty_jiffies; 2564 unsigned long pflags; 2565 unsigned long nr_reclaimed; 2566 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2567 int nr_retries = MAX_RECLAIM_RETRIES; 2568 struct mem_cgroup *memcg; 2569 bool in_retry = false; 2570 2571 if (likely(!nr_pages)) 2572 return; 2573 2574 memcg = get_mem_cgroup_from_mm(current->mm); 2575 current->memcg_nr_pages_over_high = 0; 2576 2577 retry_reclaim: 2578 /* 2579 * The allocating task should reclaim at least the batch size, but for 2580 * subsequent retries we only want to do what's necessary to prevent oom 2581 * or breaching resource isolation. 2582 * 2583 * This is distinct from memory.max or page allocator behaviour because 2584 * memory.high is currently batched, whereas memory.max and the page 2585 * allocator run every time an allocation is made. 2586 */ 2587 nr_reclaimed = reclaim_high(memcg, 2588 in_retry ? SWAP_CLUSTER_MAX : nr_pages, 2589 GFP_KERNEL); 2590 2591 /* 2592 * memory.high is breached and reclaim is unable to keep up. Throttle 2593 * allocators proactively to slow down excessive growth. 2594 */ 2595 penalty_jiffies = calculate_high_delay(memcg, nr_pages, 2596 mem_find_max_overage(memcg)); 2597 2598 penalty_jiffies += calculate_high_delay(memcg, nr_pages, 2599 swap_find_max_overage(memcg)); 2600 2601 /* 2602 * Clamp the max delay per usermode return so as to still keep the 2603 * application moving forwards and also permit diagnostics, albeit 2604 * extremely slowly. 2605 */ 2606 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 2607 2608 /* 2609 * Don't sleep if the amount of jiffies this memcg owes us is so low 2610 * that it's not even worth doing, in an attempt to be nice to those who 2611 * go only a small amount over their memory.high value and maybe haven't 2612 * been aggressively reclaimed enough yet. 2613 */ 2614 if (penalty_jiffies <= HZ / 100) 2615 goto out; 2616 2617 /* 2618 * If reclaim is making forward progress but we're still over 2619 * memory.high, we want to encourage that rather than doing allocator 2620 * throttling. 2621 */ 2622 if (nr_reclaimed || nr_retries--) { 2623 in_retry = true; 2624 goto retry_reclaim; 2625 } 2626 2627 /* 2628 * If we exit early, we're guaranteed to die (since 2629 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 2630 * need to account for any ill-begotten jiffies to pay them off later. 
2631 */ 2632 psi_memstall_enter(&pflags); 2633 schedule_timeout_killable(penalty_jiffies); 2634 psi_memstall_leave(&pflags); 2635 2636 out: 2637 css_put(&memcg->css); 2638 } 2639 2640 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2641 unsigned int nr_pages) 2642 { 2643 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2644 int nr_retries = MAX_RECLAIM_RETRIES; 2645 struct mem_cgroup *mem_over_limit; 2646 struct page_counter *counter; 2647 enum oom_status oom_status; 2648 unsigned long nr_reclaimed; 2649 bool may_swap = true; 2650 bool drained = false; 2651 unsigned long pflags; 2652 2653 if (mem_cgroup_is_root(memcg)) 2654 return 0; 2655 retry: 2656 if (consume_stock(memcg, nr_pages)) 2657 return 0; 2658 2659 if (!do_memsw_account() || 2660 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2661 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2662 goto done_restock; 2663 if (do_memsw_account()) 2664 page_counter_uncharge(&memcg->memsw, batch); 2665 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2666 } else { 2667 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2668 may_swap = false; 2669 } 2670 2671 if (batch > nr_pages) { 2672 batch = nr_pages; 2673 goto retry; 2674 } 2675 2676 /* 2677 * Memcg doesn't have a dedicated reserve for atomic 2678 * allocations. But like the global atomic pool, we need to 2679 * put the burden of reclaim on regular allocation requests 2680 * and let these go through as privileged allocations. 2681 */ 2682 if (gfp_mask & __GFP_ATOMIC) 2683 goto force; 2684 2685 /* 2686 * Unlike in global OOM situations, memcg is not in a physical 2687 * memory shortage. Allow dying and OOM-killed tasks to 2688 * bypass the last charges so that they can exit quickly and 2689 * free their memory. 2690 */ 2691 if (unlikely(should_force_charge())) 2692 goto force; 2693 2694 /* 2695 * Prevent unbounded recursion when reclaim operations need to 2696 * allocate memory. This might exceed the limits temporarily, 2697 * but we prefer facilitating memory reclaim and getting back 2698 * under the limit over triggering OOM kills in these cases. 2699 */ 2700 if (unlikely(current->flags & PF_MEMALLOC)) 2701 goto force; 2702 2703 if (unlikely(task_in_memcg_oom(current))) 2704 goto nomem; 2705 2706 if (!gfpflags_allow_blocking(gfp_mask)) 2707 goto nomem; 2708 2709 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2710 2711 psi_memstall_enter(&pflags); 2712 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2713 gfp_mask, may_swap); 2714 psi_memstall_leave(&pflags); 2715 2716 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2717 goto retry; 2718 2719 if (!drained) { 2720 drain_all_stock(mem_over_limit); 2721 drained = true; 2722 goto retry; 2723 } 2724 2725 if (gfp_mask & __GFP_NORETRY) 2726 goto nomem; 2727 /* 2728 * Even though the limit is exceeded at this point, reclaim 2729 * may have been able to free some pages. Retry the charge 2730 * before killing the task. 2731 * 2732 * Only for regular pages, though: huge pages are rather 2733 * unlikely to succeed so close to the limit, and we fall back 2734 * to regular pages anyway in case of failure. 2735 */ 2736 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2737 goto retry; 2738 /* 2739 * At task move, charge accounts can be doubly counted. So, it's 2740 * better to wait until the end of task_move if something is going on. 
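	 *
	 * (Roughly speaking, mem_cgroup_wait_acct_move() below sleeps on
	 * mc.waitq until the in-flight move has finished and then lets us
	 * retry the charge; see the move charge code for the exact
	 * conditions.)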
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		goto retry;

	if (nr_retries--)
		goto retry;

	if (gfp_mask & __GFP_RETRY_MAYFAIL)
		goto nomem;

	if (gfp_mask & __GFP_NOFAIL)
		goto force;

	if (fatal_signal_pending(current))
		goto force;

	/*
	 * Keep retrying as long as the memcg OOM killer is able to make
	 * forward progress, or bypass the charge if the OOM killer
	 * couldn't make any progress.
	 */
	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
				    get_order(nr_pages * PAGE_SIZE));
	switch (oom_status) {
	case OOM_SUCCESS:
		nr_retries = MAX_RECLAIM_RETRIES;
		goto retry;
	case OOM_FAILED:
		goto force;
	default:
		goto nomem;
	}
nomem:
	if (!(gfp_mask & __GFP_NOFAIL))
		return -ENOMEM;
force:
	/*
	 * The allocation either can't fail or will lead to more memory
	 * being freed very soon. Allow memory usage to go over the limit
	 * temporarily by force charging it.
	 */
	page_counter_charge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_charge(&memcg->memsw, nr_pages);

	return 0;

done_restock:
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);

	/*
	 * If the hierarchy is above the normal consumption range, schedule
	 * reclaim on returning to userland. We can perform reclaim here
	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
	 * GFP_KERNEL can consistently be used during reclaim. @memcg is
	 * not recorded as it most likely matches current's and won't
	 * change in the meantime. As high limit is checked again before
	 * reclaim, the cost of mismatch is negligible.
	 */
	do {
		bool mem_high, swap_high;

		mem_high = page_counter_read(&memcg->memory) >
			READ_ONCE(memcg->memory.high);
		swap_high = page_counter_read(&memcg->swap) >
			READ_ONCE(memcg->swap.high);

		/* Don't bother a random interrupted task */
		if (in_interrupt()) {
			if (mem_high) {
				schedule_work(&memcg->high_work);
				break;
			}
			continue;
		}

		if (mem_high || swap_high) {
			/*
			 * The allocating tasks in this cgroup will need to do
			 * reclaim or be throttled to prevent further growth
			 * of the memory or swap footprints.
			 *
			 * Target some best-effort fairness between the tasks,
			 * and distribute reclaim work and delay penalties
			 * based on how much each task is actually allocating.
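			 *
			 * Note that no reclaim is done here: we only record
			 * the debt in current->memcg_nr_pages_over_high and
			 * request a resume notification;
			 * mem_cgroup_handle_over_high() above then performs
			 * the reclaim and, if necessary, the throttling on
			 * the way back to userspace.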
2827 */ 2828 current->memcg_nr_pages_over_high += batch; 2829 set_notify_resume(current); 2830 break; 2831 } 2832 } while ((memcg = parent_mem_cgroup(memcg))); 2833 2834 return 0; 2835 } 2836 2837 #if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) 2838 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2839 { 2840 if (mem_cgroup_is_root(memcg)) 2841 return; 2842 2843 page_counter_uncharge(&memcg->memory, nr_pages); 2844 if (do_memsw_account()) 2845 page_counter_uncharge(&memcg->memsw, nr_pages); 2846 } 2847 #endif 2848 2849 static void commit_charge(struct page *page, struct mem_cgroup *memcg) 2850 { 2851 VM_BUG_ON_PAGE(page->mem_cgroup, page); 2852 /* 2853 * Any of the following ensures page->mem_cgroup stability: 2854 * 2855 * - the page lock 2856 * - LRU isolation 2857 * - lock_page_memcg() 2858 * - exclusive reference 2859 */ 2860 page->mem_cgroup = memcg; 2861 } 2862 2863 #ifdef CONFIG_MEMCG_KMEM 2864 int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, 2865 gfp_t gfp) 2866 { 2867 unsigned int objects = objs_per_slab_page(s, page); 2868 void *vec; 2869 2870 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 2871 page_to_nid(page)); 2872 if (!vec) 2873 return -ENOMEM; 2874 2875 if (cmpxchg(&page->obj_cgroups, NULL, 2876 (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) 2877 kfree(vec); 2878 else 2879 kmemleak_not_leak(vec); 2880 2881 return 0; 2882 } 2883 2884 /* 2885 * Returns a pointer to the memory cgroup to which the kernel object is charged. 2886 * 2887 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 2888 * cgroup_mutex, etc. 2889 */ 2890 struct mem_cgroup *mem_cgroup_from_obj(void *p) 2891 { 2892 struct page *page; 2893 2894 if (mem_cgroup_disabled()) 2895 return NULL; 2896 2897 page = virt_to_head_page(p); 2898 2899 /* 2900 * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer 2901 * or a pointer to obj_cgroup vector. In the latter case the lowest 2902 * bit of the pointer is set. 2903 * The page->mem_cgroup pointer can be asynchronously changed 2904 * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed 2905 * from a valid memcg pointer to objcg vector or back. 2906 */ 2907 if (!page->mem_cgroup) 2908 return NULL; 2909 2910 /* 2911 * Slab objects are accounted individually, not per-page. 2912 * Memcg membership data for each individual object is saved in 2913 * the page->obj_cgroups. 
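	 *
	 * To illustrate the encoding (informal summary of the rules
	 * described above):
	 *
	 *	page->mem_cgroup == NULL	page is not charged
	 *	lowest bit clear		pointer to struct mem_cgroup
	 *	lowest bit set			obj_cgroup vector, stored as
	 *					(struct obj_cgroup **)vec | 0x1UL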
2914 */ 2915 if (page_has_obj_cgroups(page)) { 2916 struct obj_cgroup *objcg; 2917 unsigned int off; 2918 2919 off = obj_to_index(page->slab_cache, page, p); 2920 objcg = page_obj_cgroups(page)[off]; 2921 if (objcg) 2922 return obj_cgroup_memcg(objcg); 2923 2924 return NULL; 2925 } 2926 2927 /* All other pages use page->mem_cgroup */ 2928 return page->mem_cgroup; 2929 } 2930 2931 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) 2932 { 2933 struct obj_cgroup *objcg = NULL; 2934 struct mem_cgroup *memcg; 2935 2936 if (unlikely(!current->mm && !current->active_memcg)) 2937 return NULL; 2938 2939 rcu_read_lock(); 2940 if (unlikely(current->active_memcg)) 2941 memcg = rcu_dereference(current->active_memcg); 2942 else 2943 memcg = mem_cgroup_from_task(current); 2944 2945 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 2946 objcg = rcu_dereference(memcg->objcg); 2947 if (objcg && obj_cgroup_tryget(objcg)) 2948 break; 2949 } 2950 rcu_read_unlock(); 2951 2952 return objcg; 2953 } 2954 2955 static int memcg_alloc_cache_id(void) 2956 { 2957 int id, size; 2958 int err; 2959 2960 id = ida_simple_get(&memcg_cache_ida, 2961 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2962 if (id < 0) 2963 return id; 2964 2965 if (id < memcg_nr_cache_ids) 2966 return id; 2967 2968 /* 2969 * There's no space for the new id in memcg_caches arrays, 2970 * so we have to grow them. 2971 */ 2972 down_write(&memcg_cache_ids_sem); 2973 2974 size = 2 * (id + 1); 2975 if (size < MEMCG_CACHES_MIN_SIZE) 2976 size = MEMCG_CACHES_MIN_SIZE; 2977 else if (size > MEMCG_CACHES_MAX_SIZE) 2978 size = MEMCG_CACHES_MAX_SIZE; 2979 2980 err = memcg_update_all_list_lrus(size); 2981 if (!err) 2982 memcg_nr_cache_ids = size; 2983 2984 up_write(&memcg_cache_ids_sem); 2985 2986 if (err) { 2987 ida_simple_remove(&memcg_cache_ida, id); 2988 return err; 2989 } 2990 return id; 2991 } 2992 2993 static void memcg_free_cache_id(int id) 2994 { 2995 ida_simple_remove(&memcg_cache_ida, id); 2996 } 2997 2998 /** 2999 * __memcg_kmem_charge: charge a number of kernel pages to a memcg 3000 * @memcg: memory cgroup to charge 3001 * @gfp: reclaim mode 3002 * @nr_pages: number of pages to charge 3003 * 3004 * Returns 0 on success, an error code on failure. 3005 */ 3006 int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, 3007 unsigned int nr_pages) 3008 { 3009 struct page_counter *counter; 3010 int ret; 3011 3012 ret = try_charge(memcg, gfp, nr_pages); 3013 if (ret) 3014 return ret; 3015 3016 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && 3017 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { 3018 3019 /* 3020 * Enforce __GFP_NOFAIL allocation because callers are not 3021 * prepared to see failures and likely do not have any failure 3022 * handling code. 
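		 *
		 * (Note that this dedicated kmem counter is only enforced on
		 * the legacy cgroup1 hierarchy - hence the
		 * !cgroup_subsys_on_dfl() check above; on the default
		 * hierarchy kernel memory is charged to the unified memory
		 * counter only.)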
3023 */ 3024 if (gfp & __GFP_NOFAIL) { 3025 page_counter_charge(&memcg->kmem, nr_pages); 3026 return 0; 3027 } 3028 cancel_charge(memcg, nr_pages); 3029 return -ENOMEM; 3030 } 3031 return 0; 3032 } 3033 3034 /** 3035 * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg 3036 * @memcg: memcg to uncharge 3037 * @nr_pages: number of pages to uncharge 3038 */ 3039 void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) 3040 { 3041 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 3042 page_counter_uncharge(&memcg->kmem, nr_pages); 3043 3044 page_counter_uncharge(&memcg->memory, nr_pages); 3045 if (do_memsw_account()) 3046 page_counter_uncharge(&memcg->memsw, nr_pages); 3047 } 3048 3049 /** 3050 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 3051 * @page: page to charge 3052 * @gfp: reclaim mode 3053 * @order: allocation order 3054 * 3055 * Returns 0 on success, an error code on failure. 3056 */ 3057 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 3058 { 3059 struct mem_cgroup *memcg; 3060 int ret = 0; 3061 3062 if (memcg_kmem_bypass()) 3063 return 0; 3064 3065 memcg = get_mem_cgroup_from_current(); 3066 if (!mem_cgroup_is_root(memcg)) { 3067 ret = __memcg_kmem_charge(memcg, gfp, 1 << order); 3068 if (!ret) { 3069 page->mem_cgroup = memcg; 3070 __SetPageKmemcg(page); 3071 return 0; 3072 } 3073 } 3074 css_put(&memcg->css); 3075 return ret; 3076 } 3077 3078 /** 3079 * __memcg_kmem_uncharge_page: uncharge a kmem page 3080 * @page: page to uncharge 3081 * @order: allocation order 3082 */ 3083 void __memcg_kmem_uncharge_page(struct page *page, int order) 3084 { 3085 struct mem_cgroup *memcg = page->mem_cgroup; 3086 unsigned int nr_pages = 1 << order; 3087 3088 if (!memcg) 3089 return; 3090 3091 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3092 __memcg_kmem_uncharge(memcg, nr_pages); 3093 page->mem_cgroup = NULL; 3094 css_put(&memcg->css); 3095 3096 /* slab pages do not have PageKmemcg flag set */ 3097 if (PageKmemcg(page)) 3098 __ClearPageKmemcg(page); 3099 } 3100 3101 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3102 { 3103 struct memcg_stock_pcp *stock; 3104 unsigned long flags; 3105 bool ret = false; 3106 3107 local_irq_save(flags); 3108 3109 stock = this_cpu_ptr(&memcg_stock); 3110 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { 3111 stock->nr_bytes -= nr_bytes; 3112 ret = true; 3113 } 3114 3115 local_irq_restore(flags); 3116 3117 return ret; 3118 } 3119 3120 static void drain_obj_stock(struct memcg_stock_pcp *stock) 3121 { 3122 struct obj_cgroup *old = stock->cached_objcg; 3123 3124 if (!old) 3125 return; 3126 3127 if (stock->nr_bytes) { 3128 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3129 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 3130 3131 if (nr_pages) { 3132 rcu_read_lock(); 3133 __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages); 3134 rcu_read_unlock(); 3135 } 3136 3137 /* 3138 * The leftover is flushed to the centralized per-memcg value. 3139 * On the next attempt to refill obj stock it will be moved 3140 * to a per-cpu stock (probably, on an other CPU), see 3141 * refill_obj_stock(). 3142 * 3143 * How often it's flushed is a trade-off between the memory 3144 * limit enforcement accuracy and potential CPU contention, 3145 * so it might be changed in the future. 
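		 *
		 * For example, assuming 4K pages: if the stock holds
		 * PAGE_SIZE + 512 bytes, one full page is uncharged via
		 * __memcg_kmem_uncharge() above and the remaining 512 bytes
		 * are parked in old->nr_charged_bytes below.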
3146 */ 3147 atomic_add(nr_bytes, &old->nr_charged_bytes); 3148 stock->nr_bytes = 0; 3149 } 3150 3151 obj_cgroup_put(old); 3152 stock->cached_objcg = NULL; 3153 } 3154 3155 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 3156 struct mem_cgroup *root_memcg) 3157 { 3158 struct mem_cgroup *memcg; 3159 3160 if (stock->cached_objcg) { 3161 memcg = obj_cgroup_memcg(stock->cached_objcg); 3162 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3163 return true; 3164 } 3165 3166 return false; 3167 } 3168 3169 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3170 { 3171 struct memcg_stock_pcp *stock; 3172 unsigned long flags; 3173 3174 local_irq_save(flags); 3175 3176 stock = this_cpu_ptr(&memcg_stock); 3177 if (stock->cached_objcg != objcg) { /* reset if necessary */ 3178 drain_obj_stock(stock); 3179 obj_cgroup_get(objcg); 3180 stock->cached_objcg = objcg; 3181 stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0); 3182 } 3183 stock->nr_bytes += nr_bytes; 3184 3185 if (stock->nr_bytes > PAGE_SIZE) 3186 drain_obj_stock(stock); 3187 3188 local_irq_restore(flags); 3189 } 3190 3191 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 3192 { 3193 struct mem_cgroup *memcg; 3194 unsigned int nr_pages, nr_bytes; 3195 int ret; 3196 3197 if (consume_obj_stock(objcg, size)) 3198 return 0; 3199 3200 /* 3201 * In theory, memcg->nr_charged_bytes can have enough 3202 * pre-charged bytes to satisfy the allocation. However, 3203 * flushing memcg->nr_charged_bytes requires two atomic 3204 * operations, and memcg->nr_charged_bytes can't be big, 3205 * so it's better to ignore it and try grab some new pages. 3206 * memcg->nr_charged_bytes will be flushed in 3207 * refill_obj_stock(), called from this function or 3208 * independently later. 3209 */ 3210 rcu_read_lock(); 3211 memcg = obj_cgroup_memcg(objcg); 3212 css_get(&memcg->css); 3213 rcu_read_unlock(); 3214 3215 nr_pages = size >> PAGE_SHIFT; 3216 nr_bytes = size & (PAGE_SIZE - 1); 3217 3218 if (nr_bytes) 3219 nr_pages += 1; 3220 3221 ret = __memcg_kmem_charge(memcg, gfp, nr_pages); 3222 if (!ret && nr_bytes) 3223 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); 3224 3225 css_put(&memcg->css); 3226 return ret; 3227 } 3228 3229 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 3230 { 3231 refill_obj_stock(objcg, size); 3232 } 3233 3234 #endif /* CONFIG_MEMCG_KMEM */ 3235 3236 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3237 3238 /* 3239 * Because tail pages are not marked as "used", set it. We're under 3240 * pgdat->lru_lock and migration entries setup in all page mappings. 3241 */ 3242 void mem_cgroup_split_huge_fixup(struct page *head) 3243 { 3244 struct mem_cgroup *memcg = head->mem_cgroup; 3245 int i; 3246 3247 if (mem_cgroup_disabled()) 3248 return; 3249 3250 for (i = 1; i < HPAGE_PMD_NR; i++) { 3251 css_get(&memcg->css); 3252 head[i].mem_cgroup = memcg; 3253 } 3254 } 3255 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3256 3257 #ifdef CONFIG_MEMCG_SWAP 3258 /** 3259 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3260 * @entry: swap entry to be moved 3261 * @from: mem_cgroup which the entry is moved from 3262 * @to: mem_cgroup which the entry is moved to 3263 * 3264 * It succeeds only when the swap_cgroup's record for this entry is the same 3265 * as the mem_cgroup's id of @from. 3266 * 3267 * Returns 0 on success, -EINVAL on failure. 
3268 * 3269 * The caller must have charged to @to, IOW, called page_counter_charge() about 3270 * both res and memsw, and called css_get(). 3271 */ 3272 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3273 struct mem_cgroup *from, struct mem_cgroup *to) 3274 { 3275 unsigned short old_id, new_id; 3276 3277 old_id = mem_cgroup_id(from); 3278 new_id = mem_cgroup_id(to); 3279 3280 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3281 mod_memcg_state(from, MEMCG_SWAP, -1); 3282 mod_memcg_state(to, MEMCG_SWAP, 1); 3283 return 0; 3284 } 3285 return -EINVAL; 3286 } 3287 #else 3288 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3289 struct mem_cgroup *from, struct mem_cgroup *to) 3290 { 3291 return -EINVAL; 3292 } 3293 #endif 3294 3295 static DEFINE_MUTEX(memcg_max_mutex); 3296 3297 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 3298 unsigned long max, bool memsw) 3299 { 3300 bool enlarge = false; 3301 bool drained = false; 3302 int ret; 3303 bool limits_invariant; 3304 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 3305 3306 do { 3307 if (signal_pending(current)) { 3308 ret = -EINTR; 3309 break; 3310 } 3311 3312 mutex_lock(&memcg_max_mutex); 3313 /* 3314 * Make sure that the new limit (memsw or memory limit) doesn't 3315 * break our basic invariant rule memory.max <= memsw.max. 3316 */ 3317 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 3318 max <= memcg->memsw.max; 3319 if (!limits_invariant) { 3320 mutex_unlock(&memcg_max_mutex); 3321 ret = -EINVAL; 3322 break; 3323 } 3324 if (max > counter->max) 3325 enlarge = true; 3326 ret = page_counter_set_max(counter, max); 3327 mutex_unlock(&memcg_max_mutex); 3328 3329 if (!ret) 3330 break; 3331 3332 if (!drained) { 3333 drain_all_stock(memcg); 3334 drained = true; 3335 continue; 3336 } 3337 3338 if (!try_to_free_mem_cgroup_pages(memcg, 1, 3339 GFP_KERNEL, !memsw)) { 3340 ret = -EBUSY; 3341 break; 3342 } 3343 } while (true); 3344 3345 if (!ret && enlarge) 3346 memcg_oom_recover(memcg); 3347 3348 return ret; 3349 } 3350 3351 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 3352 gfp_t gfp_mask, 3353 unsigned long *total_scanned) 3354 { 3355 unsigned long nr_reclaimed = 0; 3356 struct mem_cgroup_per_node *mz, *next_mz = NULL; 3357 unsigned long reclaimed; 3358 int loop = 0; 3359 struct mem_cgroup_tree_per_node *mctz; 3360 unsigned long excess; 3361 unsigned long nr_scanned; 3362 3363 if (order > 0) 3364 return 0; 3365 3366 mctz = soft_limit_tree_node(pgdat->node_id); 3367 3368 /* 3369 * Do not even bother to check the largest node if the root 3370 * is empty. Do it lockless to prevent lock bouncing. Races 3371 * are acceptable as soft limit is best effort anyway. 
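	 *
	 * (The per-node tree is ordered by how far each memcg exceeds its
	 * soft limit, so mem_cgroup_largest_soft_limit_node() below simply
	 * picks the rightmost entry, i.e. the worst offender.)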
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run a while, especially if memcgs continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure.
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		nr_scanned = 0;
		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, &nr_scanned);
		nr_reclaimed += reclaimed;
		*total_scanned += nr_scanned;
		spin_lock_irq(&mctz->lock);
		__mem_cgroup_remove_exceeded(mz, mctz);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup.
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0 simply because, due
		 * to priority, we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
		    (next_mz == NULL ||
		     loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}

/*
 * Test whether @memcg has children, dead or alive. Note that this
 * function doesn't care whether @memcg has use_hierarchy enabled and
 * returns %true if there are child csses according to the cgroup
 * hierarchy. Testing use_hierarchy is the caller's responsibility.
 */
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
	bool ret;

	rcu_read_lock();
	ret = css_next_child(NULL, &memcg->css);
	rcu_read_unlock();
	return ret;
}

/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
	int nr_retries = MAX_RECLAIM_RETRIES;

	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();

	drain_all_stock(memcg);

	/* try to free all pages in this cgroup */
	while (nr_retries && page_counter_read(&memcg->memory)) {
		int progress;

		if (signal_pending(current))
			return -EINTR;

		progress = try_to_free_mem_cgroup_pages(memcg, 1,
							GFP_KERNEL, true);
		if (!progress) {
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(BLK_RW_ASYNC, HZ/10);
		}
	}

	return 0;
}

static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

	if (mem_cgroup_is_root(memcg))
		return -EINVAL;
	return mem_cgroup_force_empty(memcg) ?: nbytes;
}

static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return mem_cgroup_from_css(css)->use_hierarchy;
}

static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, u64 val)
{
	int retval = 0;
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);

	if (memcg->use_hierarchy == val)
		return 0;

	/*
	 * If parent's use_hierarchy is set, we can't make any modifications
	 * in the child subtrees. If it is unset, then the change can
	 * occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_memcg is NULL; we allow the value to
	 * be set if there are no children.
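	 *
	 * For example, "echo 1 > memory.use_hierarchy" only succeeds while
	 * this cgroup has no children and either it is the root cgroup or
	 * its parent does not have use_hierarchy set itself.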
3518 */ 3519 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3520 (val == 1 || val == 0)) { 3521 if (!memcg_has_children(memcg)) 3522 memcg->use_hierarchy = val; 3523 else 3524 retval = -EBUSY; 3525 } else 3526 retval = -EINVAL; 3527 3528 return retval; 3529 } 3530 3531 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3532 { 3533 unsigned long val; 3534 3535 if (mem_cgroup_is_root(memcg)) { 3536 val = memcg_page_state(memcg, NR_FILE_PAGES) + 3537 memcg_page_state(memcg, NR_ANON_MAPPED); 3538 if (swap) 3539 val += memcg_page_state(memcg, MEMCG_SWAP); 3540 } else { 3541 if (!swap) 3542 val = page_counter_read(&memcg->memory); 3543 else 3544 val = page_counter_read(&memcg->memsw); 3545 } 3546 return val; 3547 } 3548 3549 enum { 3550 RES_USAGE, 3551 RES_LIMIT, 3552 RES_MAX_USAGE, 3553 RES_FAILCNT, 3554 RES_SOFT_LIMIT, 3555 }; 3556 3557 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3558 struct cftype *cft) 3559 { 3560 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3561 struct page_counter *counter; 3562 3563 switch (MEMFILE_TYPE(cft->private)) { 3564 case _MEM: 3565 counter = &memcg->memory; 3566 break; 3567 case _MEMSWAP: 3568 counter = &memcg->memsw; 3569 break; 3570 case _KMEM: 3571 counter = &memcg->kmem; 3572 break; 3573 case _TCP: 3574 counter = &memcg->tcpmem; 3575 break; 3576 default: 3577 BUG(); 3578 } 3579 3580 switch (MEMFILE_ATTR(cft->private)) { 3581 case RES_USAGE: 3582 if (counter == &memcg->memory) 3583 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3584 if (counter == &memcg->memsw) 3585 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3586 return (u64)page_counter_read(counter) * PAGE_SIZE; 3587 case RES_LIMIT: 3588 return (u64)counter->max * PAGE_SIZE; 3589 case RES_MAX_USAGE: 3590 return (u64)counter->watermark * PAGE_SIZE; 3591 case RES_FAILCNT: 3592 return counter->failcnt; 3593 case RES_SOFT_LIMIT: 3594 return (u64)memcg->soft_limit * PAGE_SIZE; 3595 default: 3596 BUG(); 3597 } 3598 } 3599 3600 static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) 3601 { 3602 unsigned long stat[MEMCG_NR_STAT] = {0}; 3603 struct mem_cgroup *mi; 3604 int node, cpu, i; 3605 3606 for_each_online_cpu(cpu) 3607 for (i = 0; i < MEMCG_NR_STAT; i++) 3608 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); 3609 3610 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 3611 for (i = 0; i < MEMCG_NR_STAT; i++) 3612 atomic_long_add(stat[i], &mi->vmstats[i]); 3613 3614 for_each_node(node) { 3615 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 3616 struct mem_cgroup_per_node *pi; 3617 3618 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 3619 stat[i] = 0; 3620 3621 for_each_online_cpu(cpu) 3622 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 3623 stat[i] += per_cpu( 3624 pn->lruvec_stat_cpu->count[i], cpu); 3625 3626 for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) 3627 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 3628 atomic_long_add(stat[i], &pi->lruvec_stat[i]); 3629 } 3630 } 3631 3632 static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) 3633 { 3634 unsigned long events[NR_VM_EVENT_ITEMS]; 3635 struct mem_cgroup *mi; 3636 int cpu, i; 3637 3638 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3639 events[i] = 0; 3640 3641 for_each_online_cpu(cpu) 3642 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3643 events[i] += per_cpu(memcg->vmstats_percpu->events[i], 3644 cpu); 3645 3646 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 3647 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3648 atomic_long_add(events[i], &mi->vmevents[i]); 
3649 } 3650 3651 #ifdef CONFIG_MEMCG_KMEM 3652 static int memcg_online_kmem(struct mem_cgroup *memcg) 3653 { 3654 struct obj_cgroup *objcg; 3655 int memcg_id; 3656 3657 if (cgroup_memory_nokmem) 3658 return 0; 3659 3660 BUG_ON(memcg->kmemcg_id >= 0); 3661 BUG_ON(memcg->kmem_state); 3662 3663 memcg_id = memcg_alloc_cache_id(); 3664 if (memcg_id < 0) 3665 return memcg_id; 3666 3667 objcg = obj_cgroup_alloc(); 3668 if (!objcg) { 3669 memcg_free_cache_id(memcg_id); 3670 return -ENOMEM; 3671 } 3672 objcg->memcg = memcg; 3673 rcu_assign_pointer(memcg->objcg, objcg); 3674 3675 static_branch_enable(&memcg_kmem_enabled_key); 3676 3677 /* 3678 * A memory cgroup is considered kmem-online as soon as it gets 3679 * kmemcg_id. Setting the id after enabling static branching will 3680 * guarantee no one starts accounting before all call sites are 3681 * patched. 3682 */ 3683 memcg->kmemcg_id = memcg_id; 3684 memcg->kmem_state = KMEM_ONLINE; 3685 3686 return 0; 3687 } 3688 3689 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3690 { 3691 struct cgroup_subsys_state *css; 3692 struct mem_cgroup *parent, *child; 3693 int kmemcg_id; 3694 3695 if (memcg->kmem_state != KMEM_ONLINE) 3696 return; 3697 3698 memcg->kmem_state = KMEM_ALLOCATED; 3699 3700 parent = parent_mem_cgroup(memcg); 3701 if (!parent) 3702 parent = root_mem_cgroup; 3703 3704 memcg_reparent_objcgs(memcg, parent); 3705 3706 kmemcg_id = memcg->kmemcg_id; 3707 BUG_ON(kmemcg_id < 0); 3708 3709 /* 3710 * Change kmemcg_id of this cgroup and all its descendants to the 3711 * parent's id, and then move all entries from this cgroup's list_lrus 3712 * to ones of the parent. After we have finished, all list_lrus 3713 * corresponding to this cgroup are guaranteed to remain empty. The 3714 * ordering is imposed by list_lru_node->lock taken by 3715 * memcg_drain_all_list_lrus(). 3716 */ 3717 rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ 3718 css_for_each_descendant_pre(css, &memcg->css) { 3719 child = mem_cgroup_from_css(css); 3720 BUG_ON(child->kmemcg_id != kmemcg_id); 3721 child->kmemcg_id = parent->kmemcg_id; 3722 if (!memcg->use_hierarchy) 3723 break; 3724 } 3725 rcu_read_unlock(); 3726 3727 memcg_drain_all_list_lrus(kmemcg_id, parent); 3728 3729 memcg_free_cache_id(kmemcg_id); 3730 } 3731 3732 static void memcg_free_kmem(struct mem_cgroup *memcg) 3733 { 3734 /* css_alloc() failed, offlining didn't happen */ 3735 if (unlikely(memcg->kmem_state == KMEM_ONLINE)) 3736 memcg_offline_kmem(memcg); 3737 } 3738 #else 3739 static int memcg_online_kmem(struct mem_cgroup *memcg) 3740 { 3741 return 0; 3742 } 3743 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3744 { 3745 } 3746 static void memcg_free_kmem(struct mem_cgroup *memcg) 3747 { 3748 } 3749 #endif /* CONFIG_MEMCG_KMEM */ 3750 3751 static int memcg_update_kmem_max(struct mem_cgroup *memcg, 3752 unsigned long max) 3753 { 3754 int ret; 3755 3756 mutex_lock(&memcg_max_mutex); 3757 ret = page_counter_set_max(&memcg->kmem, max); 3758 mutex_unlock(&memcg_max_mutex); 3759 return ret; 3760 } 3761 3762 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3763 { 3764 int ret; 3765 3766 mutex_lock(&memcg_max_mutex); 3767 3768 ret = page_counter_set_max(&memcg->tcpmem, max); 3769 if (ret) 3770 goto out; 3771 3772 if (!memcg->tcpmem_active) { 3773 /* 3774 * The active flag needs to be written after the static_key 3775 * update. This is what guarantees that the socket activation 3776 * function is the last one to run. 
See mem_cgroup_sk_alloc() 3777 * for details, and note that we don't mark any socket as 3778 * belonging to this memcg until that flag is up. 3779 * 3780 * We need to do this, because static_keys will span multiple 3781 * sites, but we can't control their order. If we mark a socket 3782 * as accounted, but the accounting functions are not patched in 3783 * yet, we'll lose accounting. 3784 * 3785 * We never race with the readers in mem_cgroup_sk_alloc(), 3786 * because when this value change, the code to process it is not 3787 * patched in yet. 3788 */ 3789 static_branch_inc(&memcg_sockets_enabled_key); 3790 memcg->tcpmem_active = true; 3791 } 3792 out: 3793 mutex_unlock(&memcg_max_mutex); 3794 return ret; 3795 } 3796 3797 /* 3798 * The user of this function is... 3799 * RES_LIMIT. 3800 */ 3801 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3802 char *buf, size_t nbytes, loff_t off) 3803 { 3804 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3805 unsigned long nr_pages; 3806 int ret; 3807 3808 buf = strstrip(buf); 3809 ret = page_counter_memparse(buf, "-1", &nr_pages); 3810 if (ret) 3811 return ret; 3812 3813 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3814 case RES_LIMIT: 3815 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3816 ret = -EINVAL; 3817 break; 3818 } 3819 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3820 case _MEM: 3821 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3822 break; 3823 case _MEMSWAP: 3824 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3825 break; 3826 case _KMEM: 3827 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " 3828 "Please report your usecase to linux-mm@kvack.org if you " 3829 "depend on this functionality.\n"); 3830 ret = memcg_update_kmem_max(memcg, nr_pages); 3831 break; 3832 case _TCP: 3833 ret = memcg_update_tcp_max(memcg, nr_pages); 3834 break; 3835 } 3836 break; 3837 case RES_SOFT_LIMIT: 3838 memcg->soft_limit = nr_pages; 3839 ret = 0; 3840 break; 3841 } 3842 return ret ?: nbytes; 3843 } 3844 3845 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3846 size_t nbytes, loff_t off) 3847 { 3848 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3849 struct page_counter *counter; 3850 3851 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3852 case _MEM: 3853 counter = &memcg->memory; 3854 break; 3855 case _MEMSWAP: 3856 counter = &memcg->memsw; 3857 break; 3858 case _KMEM: 3859 counter = &memcg->kmem; 3860 break; 3861 case _TCP: 3862 counter = &memcg->tcpmem; 3863 break; 3864 default: 3865 BUG(); 3866 } 3867 3868 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3869 case RES_MAX_USAGE: 3870 page_counter_reset_watermark(counter); 3871 break; 3872 case RES_FAILCNT: 3873 counter->failcnt = 0; 3874 break; 3875 default: 3876 BUG(); 3877 } 3878 3879 return nbytes; 3880 } 3881 3882 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3883 struct cftype *cft) 3884 { 3885 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3886 } 3887 3888 #ifdef CONFIG_MMU 3889 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3890 struct cftype *cft, u64 val) 3891 { 3892 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3893 3894 if (val & ~MOVE_MASK) 3895 return -EINVAL; 3896 3897 /* 3898 * No kind of locking is needed in here, because ->can_attach() will 3899 * check this value once in the beginning of the process, and then carry 3900 * on with stale data. 
This means that changes to this value will only 3901 * affect task migrations starting after the change. 3902 */ 3903 memcg->move_charge_at_immigrate = val; 3904 return 0; 3905 } 3906 #else 3907 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3908 struct cftype *cft, u64 val) 3909 { 3910 return -ENOSYS; 3911 } 3912 #endif 3913 3914 #ifdef CONFIG_NUMA 3915 3916 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3917 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3918 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3919 3920 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3921 int nid, unsigned int lru_mask, bool tree) 3922 { 3923 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 3924 unsigned long nr = 0; 3925 enum lru_list lru; 3926 3927 VM_BUG_ON((unsigned)nid >= nr_node_ids); 3928 3929 for_each_lru(lru) { 3930 if (!(BIT(lru) & lru_mask)) 3931 continue; 3932 if (tree) 3933 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 3934 else 3935 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 3936 } 3937 return nr; 3938 } 3939 3940 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 3941 unsigned int lru_mask, 3942 bool tree) 3943 { 3944 unsigned long nr = 0; 3945 enum lru_list lru; 3946 3947 for_each_lru(lru) { 3948 if (!(BIT(lru) & lru_mask)) 3949 continue; 3950 if (tree) 3951 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 3952 else 3953 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 3954 } 3955 return nr; 3956 } 3957 3958 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3959 { 3960 struct numa_stat { 3961 const char *name; 3962 unsigned int lru_mask; 3963 }; 3964 3965 static const struct numa_stat stats[] = { 3966 { "total", LRU_ALL }, 3967 { "file", LRU_ALL_FILE }, 3968 { "anon", LRU_ALL_ANON }, 3969 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3970 }; 3971 const struct numa_stat *stat; 3972 int nid; 3973 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3974 3975 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3976 seq_printf(m, "%s=%lu", stat->name, 3977 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 3978 false)); 3979 for_each_node_state(nid, N_MEMORY) 3980 seq_printf(m, " N%d=%lu", nid, 3981 mem_cgroup_node_nr_lru_pages(memcg, nid, 3982 stat->lru_mask, false)); 3983 seq_putc(m, '\n'); 3984 } 3985 3986 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3987 3988 seq_printf(m, "hierarchical_%s=%lu", stat->name, 3989 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 3990 true)); 3991 for_each_node_state(nid, N_MEMORY) 3992 seq_printf(m, " N%d=%lu", nid, 3993 mem_cgroup_node_nr_lru_pages(memcg, nid, 3994 stat->lru_mask, true)); 3995 seq_putc(m, '\n'); 3996 } 3997 3998 return 0; 3999 } 4000 #endif /* CONFIG_NUMA */ 4001 4002 static const unsigned int memcg1_stats[] = { 4003 NR_FILE_PAGES, 4004 NR_ANON_MAPPED, 4005 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4006 NR_ANON_THPS, 4007 #endif 4008 NR_SHMEM, 4009 NR_FILE_MAPPED, 4010 NR_FILE_DIRTY, 4011 NR_WRITEBACK, 4012 MEMCG_SWAP, 4013 }; 4014 4015 static const char *const memcg1_stat_names[] = { 4016 "cache", 4017 "rss", 4018 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4019 "rss_huge", 4020 #endif 4021 "shmem", 4022 "mapped_file", 4023 "dirty", 4024 "writeback", 4025 "swap", 4026 }; 4027 4028 /* Universal VM events cgroup1 shows, original sort order */ 4029 static const unsigned int memcg1_events[] = { 4030 PGPGIN, 4031 PGPGOUT, 4032 PGFAULT, 4033 PGMAJFAULT, 4034 }; 4035 4036 static int 
memcg_stat_show(struct seq_file *m, void *v) 4037 { 4038 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 4039 unsigned long memory, memsw; 4040 struct mem_cgroup *mi; 4041 unsigned int i; 4042 4043 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 4044 4045 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4046 unsigned long nr; 4047 4048 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4049 continue; 4050 nr = memcg_page_state_local(memcg, memcg1_stats[i]); 4051 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4052 if (memcg1_stats[i] == NR_ANON_THPS) 4053 nr *= HPAGE_PMD_NR; 4054 #endif 4055 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); 4056 } 4057 4058 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4059 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), 4060 memcg_events_local(memcg, memcg1_events[i])); 4061 4062 for (i = 0; i < NR_LRU_LISTS; i++) 4063 seq_printf(m, "%s %lu\n", lru_list_name(i), 4064 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 4065 PAGE_SIZE); 4066 4067 /* Hierarchical information */ 4068 memory = memsw = PAGE_COUNTER_MAX; 4069 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4070 memory = min(memory, READ_ONCE(mi->memory.max)); 4071 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 4072 } 4073 seq_printf(m, "hierarchical_memory_limit %llu\n", 4074 (u64)memory * PAGE_SIZE); 4075 if (do_memsw_account()) 4076 seq_printf(m, "hierarchical_memsw_limit %llu\n", 4077 (u64)memsw * PAGE_SIZE); 4078 4079 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4080 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4081 continue; 4082 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 4083 (u64)memcg_page_state(memcg, memcg1_stats[i]) * 4084 PAGE_SIZE); 4085 } 4086 4087 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4088 seq_printf(m, "total_%s %llu\n", 4089 vm_event_name(memcg1_events[i]), 4090 (u64)memcg_events(memcg, memcg1_events[i])); 4091 4092 for (i = 0; i < NR_LRU_LISTS; i++) 4093 seq_printf(m, "total_%s %llu\n", lru_list_name(i), 4094 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 4095 PAGE_SIZE); 4096 4097 #ifdef CONFIG_DEBUG_VM 4098 { 4099 pg_data_t *pgdat; 4100 struct mem_cgroup_per_node *mz; 4101 unsigned long anon_cost = 0; 4102 unsigned long file_cost = 0; 4103 4104 for_each_online_pgdat(pgdat) { 4105 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); 4106 4107 anon_cost += mz->lruvec.anon_cost; 4108 file_cost += mz->lruvec.file_cost; 4109 } 4110 seq_printf(m, "anon_cost %lu\n", anon_cost); 4111 seq_printf(m, "file_cost %lu\n", file_cost); 4112 } 4113 #endif 4114 4115 return 0; 4116 } 4117 4118 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4119 struct cftype *cft) 4120 { 4121 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4122 4123 return mem_cgroup_swappiness(memcg); 4124 } 4125 4126 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4127 struct cftype *cft, u64 val) 4128 { 4129 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4130 4131 if (val > 100) 4132 return -EINVAL; 4133 4134 if (css->parent) 4135 memcg->swappiness = val; 4136 else 4137 vm_swappiness = val; 4138 4139 return 0; 4140 } 4141 4142 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4143 { 4144 struct mem_cgroup_threshold_ary *t; 4145 unsigned long usage; 4146 int i; 4147 4148 rcu_read_lock(); 4149 if (!swap) 4150 t = rcu_dereference(memcg->thresholds.primary); 4151 else 4152 t = rcu_dereference(memcg->memsw_thresholds.primary); 4153 4154 if (!t) 4155 goto unlock; 4156 
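	/*
	 * Snapshot the current memory (or memory+swap) usage; the two scans
	 * below compare the registered thresholds against this value and
	 * signal the eventfd of every threshold crossed, in either
	 * direction, since the last call.
	 */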
4157 usage = mem_cgroup_usage(memcg, swap); 4158 4159 /* 4160 * current_threshold points to threshold just below or equal to usage. 4161 * If it's not true, a threshold was crossed after last 4162 * call of __mem_cgroup_threshold(). 4163 */ 4164 i = t->current_threshold; 4165 4166 /* 4167 * Iterate backward over array of thresholds starting from 4168 * current_threshold and check if a threshold is crossed. 4169 * If none of thresholds below usage is crossed, we read 4170 * only one element of the array here. 4171 */ 4172 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4173 eventfd_signal(t->entries[i].eventfd, 1); 4174 4175 /* i = current_threshold + 1 */ 4176 i++; 4177 4178 /* 4179 * Iterate forward over array of thresholds starting from 4180 * current_threshold+1 and check if a threshold is crossed. 4181 * If none of thresholds above usage is crossed, we read 4182 * only one element of the array here. 4183 */ 4184 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4185 eventfd_signal(t->entries[i].eventfd, 1); 4186 4187 /* Update current_threshold */ 4188 t->current_threshold = i - 1; 4189 unlock: 4190 rcu_read_unlock(); 4191 } 4192 4193 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4194 { 4195 while (memcg) { 4196 __mem_cgroup_threshold(memcg, false); 4197 if (do_memsw_account()) 4198 __mem_cgroup_threshold(memcg, true); 4199 4200 memcg = parent_mem_cgroup(memcg); 4201 } 4202 } 4203 4204 static int compare_thresholds(const void *a, const void *b) 4205 { 4206 const struct mem_cgroup_threshold *_a = a; 4207 const struct mem_cgroup_threshold *_b = b; 4208 4209 if (_a->threshold > _b->threshold) 4210 return 1; 4211 4212 if (_a->threshold < _b->threshold) 4213 return -1; 4214 4215 return 0; 4216 } 4217 4218 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4219 { 4220 struct mem_cgroup_eventfd_list *ev; 4221 4222 spin_lock(&memcg_oom_lock); 4223 4224 list_for_each_entry(ev, &memcg->oom_notify, list) 4225 eventfd_signal(ev->eventfd, 1); 4226 4227 spin_unlock(&memcg_oom_lock); 4228 return 0; 4229 } 4230 4231 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4232 { 4233 struct mem_cgroup *iter; 4234 4235 for_each_mem_cgroup_tree(iter, memcg) 4236 mem_cgroup_oom_notify_cb(iter); 4237 } 4238 4239 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4240 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4241 { 4242 struct mem_cgroup_thresholds *thresholds; 4243 struct mem_cgroup_threshold_ary *new; 4244 unsigned long threshold; 4245 unsigned long usage; 4246 int i, size, ret; 4247 4248 ret = page_counter_memparse(args, "-1", &threshold); 4249 if (ret) 4250 return ret; 4251 4252 mutex_lock(&memcg->thresholds_lock); 4253 4254 if (type == _MEM) { 4255 thresholds = &memcg->thresholds; 4256 usage = mem_cgroup_usage(memcg, false); 4257 } else if (type == _MEMSWAP) { 4258 thresholds = &memcg->memsw_thresholds; 4259 usage = mem_cgroup_usage(memcg, true); 4260 } else 4261 BUG(); 4262 4263 /* Check if a threshold crossed before adding a new one */ 4264 if (thresholds->primary) 4265 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4266 4267 size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; 4268 4269 /* Allocate memory for new array of thresholds */ 4270 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 4271 if (!new) { 4272 ret = -ENOMEM; 4273 goto unlock; 4274 } 4275 new->size = size; 4276 4277 /* Copy thresholds (if any) to new array */ 4278 if (thresholds->primary) 4279 memcpy(new->entries, thresholds->primary->entries, 4280 flex_array_size(new, entries, size - 1)); 4281 4282 /* Add new threshold */ 4283 new->entries[size - 1].eventfd = eventfd; 4284 new->entries[size - 1].threshold = threshold; 4285 4286 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4287 sort(new->entries, size, sizeof(*new->entries), 4288 compare_thresholds, NULL); 4289 4290 /* Find current threshold */ 4291 new->current_threshold = -1; 4292 for (i = 0; i < size; i++) { 4293 if (new->entries[i].threshold <= usage) { 4294 /* 4295 * new->current_threshold will not be used until 4296 * rcu_assign_pointer(), so it's safe to increment 4297 * it here. 4298 */ 4299 ++new->current_threshold; 4300 } else 4301 break; 4302 } 4303 4304 /* Free old spare buffer and save old primary buffer as spare */ 4305 kfree(thresholds->spare); 4306 thresholds->spare = thresholds->primary; 4307 4308 rcu_assign_pointer(thresholds->primary, new); 4309 4310 /* To be sure that nobody uses thresholds */ 4311 synchronize_rcu(); 4312 4313 unlock: 4314 mutex_unlock(&memcg->thresholds_lock); 4315 4316 return ret; 4317 } 4318 4319 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4320 struct eventfd_ctx *eventfd, const char *args) 4321 { 4322 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4323 } 4324 4325 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4326 struct eventfd_ctx *eventfd, const char *args) 4327 { 4328 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4329 } 4330 4331 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4332 struct eventfd_ctx *eventfd, enum res_type type) 4333 { 4334 struct mem_cgroup_thresholds *thresholds; 4335 struct mem_cgroup_threshold_ary *new; 4336 unsigned long usage; 4337 int i, j, size, entries; 4338 4339 mutex_lock(&memcg->thresholds_lock); 4340 4341 if (type == _MEM) { 4342 thresholds = &memcg->thresholds; 4343 usage = mem_cgroup_usage(memcg, false); 4344 } else if (type == _MEMSWAP) { 4345 thresholds = &memcg->memsw_thresholds; 4346 usage = mem_cgroup_usage(memcg, true); 4347 } else 4348 BUG(); 4349 4350 if (!thresholds->primary) 4351 goto unlock; 4352 4353 /* Check if a threshold crossed before removing */ 4354 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4355 4356 /* Calculate new number of threshold */ 4357 size = entries = 0; 4358 for (i = 0; i < thresholds->primary->size; i++) { 4359 if (thresholds->primary->entries[i].eventfd != eventfd) 4360 size++; 4361 else 4362 entries++; 4363 } 4364 4365 new = thresholds->spare; 4366 4367 /* If no items related to eventfd have been cleared, nothing to do */ 4368 if (!entries) 4369 goto unlock; 4370 4371 /* Set thresholds array to NULL if we don't have thresholds */ 4372 if (!size) { 4373 kfree(new); 4374 new = NULL; 4375 goto swap_buffers; 4376 } 4377 4378 new->size = size; 4379 4380 /* Copy thresholds and find current threshold */ 4381 new->current_threshold = -1; 4382 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4383 if (thresholds->primary->entries[i].eventfd == eventfd) 4384 continue; 4385 4386 new->entries[j] = thresholds->primary->entries[i]; 4387 if 
(new->entries[j].threshold <= usage) { 4388 /* 4389 * new->current_threshold will not be used 4390 * until rcu_assign_pointer(), so it's safe to increment 4391 * it here. 4392 */ 4393 ++new->current_threshold; 4394 } 4395 j++; 4396 } 4397 4398 swap_buffers: 4399 /* Swap primary and spare array */ 4400 thresholds->spare = thresholds->primary; 4401 4402 rcu_assign_pointer(thresholds->primary, new); 4403 4404 /* To be sure that nobody uses thresholds */ 4405 synchronize_rcu(); 4406 4407 /* If all events are unregistered, free the spare array */ 4408 if (!new) { 4409 kfree(thresholds->spare); 4410 thresholds->spare = NULL; 4411 } 4412 unlock: 4413 mutex_unlock(&memcg->thresholds_lock); 4414 } 4415 4416 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4417 struct eventfd_ctx *eventfd) 4418 { 4419 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4420 } 4421 4422 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4423 struct eventfd_ctx *eventfd) 4424 { 4425 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4426 } 4427 4428 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4429 struct eventfd_ctx *eventfd, const char *args) 4430 { 4431 struct mem_cgroup_eventfd_list *event; 4432 4433 event = kmalloc(sizeof(*event), GFP_KERNEL); 4434 if (!event) 4435 return -ENOMEM; 4436 4437 spin_lock(&memcg_oom_lock); 4438 4439 event->eventfd = eventfd; 4440 list_add(&event->list, &memcg->oom_notify); 4441 4442 /* already in OOM ? */ 4443 if (memcg->under_oom) 4444 eventfd_signal(eventfd, 1); 4445 spin_unlock(&memcg_oom_lock); 4446 4447 return 0; 4448 } 4449 4450 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4451 struct eventfd_ctx *eventfd) 4452 { 4453 struct mem_cgroup_eventfd_list *ev, *tmp; 4454 4455 spin_lock(&memcg_oom_lock); 4456 4457 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4458 if (ev->eventfd == eventfd) { 4459 list_del(&ev->list); 4460 kfree(ev); 4461 } 4462 } 4463 4464 spin_unlock(&memcg_oom_lock); 4465 } 4466 4467 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4468 { 4469 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4470 4471 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4472 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4473 seq_printf(sf, "oom_kill %lu\n", 4474 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4475 return 0; 4476 } 4477 4478 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4479 struct cftype *cft, u64 val) 4480 { 4481 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4482 4483 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4484 if (!css->parent || !((val == 0) || (val == 1))) 4485 return -EINVAL; 4486 4487 memcg->oom_kill_disable = val; 4488 if (!val) 4489 memcg_oom_recover(memcg); 4490 4491 return 0; 4492 } 4493 4494 #ifdef CONFIG_CGROUP_WRITEBACK 4495 4496 #include <trace/events/writeback.h> 4497 4498 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4499 { 4500 return wb_domain_init(&memcg->cgwb_domain, gfp); 4501 } 4502 4503 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4504 { 4505 wb_domain_exit(&memcg->cgwb_domain); 4506 } 4507 4508 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4509 { 4510 wb_domain_size_changed(&memcg->cgwb_domain); 4511 } 4512 4513 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4514 { 4515 struct mem_cgroup *memcg = 
mem_cgroup_from_css(wb->memcg_css); 4516 4517 if (!memcg->css.parent) 4518 return NULL; 4519 4520 return &memcg->cgwb_domain; 4521 } 4522 4523 /* 4524 * idx can be of type enum memcg_stat_item or node_stat_item. 4525 * Keep in sync with memcg_exact_page(). 4526 */ 4527 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) 4528 { 4529 long x = atomic_long_read(&memcg->vmstats[idx]); 4530 int cpu; 4531 4532 for_each_online_cpu(cpu) 4533 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; 4534 if (x < 0) 4535 x = 0; 4536 return x; 4537 } 4538 4539 /** 4540 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4541 * @wb: bdi_writeback in question 4542 * @pfilepages: out parameter for number of file pages 4543 * @pheadroom: out parameter for number of allocatable pages according to memcg 4544 * @pdirty: out parameter for number of dirty pages 4545 * @pwriteback: out parameter for number of pages under writeback 4546 * 4547 * Determine the numbers of file, headroom, dirty, and writeback pages in 4548 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4549 * is a bit more involved. 4550 * 4551 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4552 * headroom is calculated as the lowest headroom of itself and the 4553 * ancestors. Note that this doesn't consider the actual amount of 4554 * available memory in the system. The caller should further cap 4555 * *@pheadroom accordingly. 4556 */ 4557 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4558 unsigned long *pheadroom, unsigned long *pdirty, 4559 unsigned long *pwriteback) 4560 { 4561 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4562 struct mem_cgroup *parent; 4563 4564 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); 4565 4566 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); 4567 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + 4568 memcg_exact_page_state(memcg, NR_ACTIVE_FILE); 4569 *pheadroom = PAGE_COUNTER_MAX; 4570 4571 while ((parent = parent_mem_cgroup(memcg))) { 4572 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 4573 READ_ONCE(memcg->memory.high)); 4574 unsigned long used = page_counter_read(&memcg->memory); 4575 4576 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4577 memcg = parent; 4578 } 4579 } 4580 4581 /* 4582 * Foreign dirty flushing 4583 * 4584 * There's an inherent mismatch between memcg and writeback. The former 4585 * trackes ownership per-page while the latter per-inode. This was a 4586 * deliberate design decision because honoring per-page ownership in the 4587 * writeback path is complicated, may lead to higher CPU and IO overheads 4588 * and deemed unnecessary given that write-sharing an inode across 4589 * different cgroups isn't a common use-case. 4590 * 4591 * Combined with inode majority-writer ownership switching, this works well 4592 * enough in most cases but there are some pathological cases. For 4593 * example, let's say there are two cgroups A and B which keep writing to 4594 * different but confined parts of the same inode. B owns the inode and 4595 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4596 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4597 * triggering background writeback. A will be slowed down without a way to 4598 * make writeback of the dirty pages happen. 
4599 * 4600 * Conditions like the above can lead to a cgroup getting repeatedly and 4601 * severely throttled after making some progress after each 4602 * dirty_expire_interval while the underlying IO device is almost 4603 * completely idle. 4604 * 4605 * Solving this problem completely requires matching the ownership tracking 4606 * granularities between memcg and writeback in either direction. However, 4607 * the more egregious behaviors can be avoided by simply remembering the 4608 * most recent foreign dirtying events and initiating remote flushes on 4609 * them when local writeback isn't enough to keep the memory clean enough. 4610 * 4611 * The following two functions implement such a mechanism. When a foreign 4612 * page - a page whose memcg and writeback ownerships don't match - is 4613 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4614 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4615 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4616 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4617 * foreign bdi_writebacks which haven't expired. Both the numbers of 4618 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4619 * limited to MEMCG_CGWB_FRN_CNT. 4620 * 4621 * The mechanism only remembers IDs and doesn't hold any object references. 4622 * As being wrong occasionally doesn't matter, updates and accesses to the 4623 * records are lockless and racy. 4624 */ 4625 void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, 4626 struct bdi_writeback *wb) 4627 { 4628 struct mem_cgroup *memcg = page->mem_cgroup; 4629 struct memcg_cgwb_frn *frn; 4630 u64 now = get_jiffies_64(); 4631 u64 oldest_at = now; 4632 int oldest = -1; 4633 int i; 4634 4635 trace_track_foreign_dirty(page, wb); 4636 4637 /* 4638 * Pick the slot to use. If there is already a slot for @wb, keep 4639 * using it. If not, replace the oldest one which isn't being 4640 * written out. 4641 */ 4642 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4643 frn = &memcg->cgwb_frn[i]; 4644 if (frn->bdi_id == wb->bdi->id && 4645 frn->memcg_id == wb->memcg_css->id) 4646 break; 4647 if (time_before64(frn->at, oldest_at) && 4648 atomic_read(&frn->done.cnt) == 1) { 4649 oldest = i; 4650 oldest_at = frn->at; 4651 } 4652 } 4653 4654 if (i < MEMCG_CGWB_FRN_CNT) { 4655 /* 4656 * Re-using an existing one. Update timestamp lazily to 4657 * avoid making the cacheline hot. We want them to be 4658 * reasonably up-to-date and significantly shorter than 4659 * dirty_expire_interval as that's what expires the record. 4660 * Use the shorter of 1s and dirty_expire_interval / 8. 
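 * For example, with the default dirty_expire_interval of 30 seconds
 * (3000 centisecs), dirty_expire_interval * 10 is 30000 msecs, an eighth
 * of which is ~3.75s, so the min_t() below caps update_intv at HZ (1s).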
4661 */ 4662 unsigned long update_intv = 4663 min_t(unsigned long, HZ, 4664 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4665 4666 if (time_before64(frn->at, now - update_intv)) 4667 frn->at = now; 4668 } else if (oldest >= 0) { 4669 /* replace the oldest free one */ 4670 frn = &memcg->cgwb_frn[oldest]; 4671 frn->bdi_id = wb->bdi->id; 4672 frn->memcg_id = wb->memcg_css->id; 4673 frn->at = now; 4674 } 4675 } 4676 4677 /* issue foreign writeback flushes for recorded foreign dirtying events */ 4678 void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4679 { 4680 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4681 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4682 u64 now = jiffies_64; 4683 int i; 4684 4685 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4686 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4687 4688 /* 4689 * If the record is older than dirty_expire_interval, 4690 * writeback on it has already started. No need to kick it 4691 * off again. Also, don't start a new one if there's 4692 * already one in flight. 4693 */ 4694 if (time_after64(frn->at, now - intv) && 4695 atomic_read(&frn->done.cnt) == 1) { 4696 frn->at = 0; 4697 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4698 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, 4699 WB_REASON_FOREIGN_FLUSH, 4700 &frn->done); 4701 } 4702 } 4703 } 4704 4705 #else /* CONFIG_CGROUP_WRITEBACK */ 4706 4707 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4708 { 4709 return 0; 4710 } 4711 4712 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4713 { 4714 } 4715 4716 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4717 { 4718 } 4719 4720 #endif /* CONFIG_CGROUP_WRITEBACK */ 4721 4722 /* 4723 * DO NOT USE IN NEW FILES. 4724 * 4725 * "cgroup.event_control" implementation. 4726 * 4727 * This is way over-engineered. It tries to support fully configurable 4728 * events for each user. Such level of flexibility is completely 4729 * unnecessary especially in the light of the planned unified hierarchy. 4730 * 4731 * Please deprecate this and replace with something simpler if at all 4732 * possible. 4733 */ 4734 4735 /* 4736 * Unregister event and free resources. 4737 * 4738 * Gets called from workqueue. 4739 */ 4740 static void memcg_event_remove(struct work_struct *work) 4741 { 4742 struct mem_cgroup_event *event = 4743 container_of(work, struct mem_cgroup_event, remove); 4744 struct mem_cgroup *memcg = event->memcg; 4745 4746 remove_wait_queue(event->wqh, &event->wait); 4747 4748 event->unregister_event(memcg, event->eventfd); 4749 4750 /* Notify userspace the event is going away. */ 4751 eventfd_signal(event->eventfd, 1); 4752 4753 eventfd_ctx_put(event->eventfd); 4754 kfree(event); 4755 css_put(&memcg->css); 4756 } 4757 4758 /* 4759 * Gets called on EPOLLHUP on eventfd when user closes it. 4760 * 4761 * Called with wqh->lock held and interrupts disabled. 4762 */ 4763 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4764 int sync, void *key) 4765 { 4766 struct mem_cgroup_event *event = 4767 container_of(wait, struct mem_cgroup_event, wait); 4768 struct mem_cgroup *memcg = event->memcg; 4769 __poll_t flags = key_to_poll(key); 4770 4771 if (flags & EPOLLHUP) { 4772 /* 4773 * If the event has been detached at cgroup removal, we 4774 * can simply return knowing the other side will cleanup 4775 * for us. 
4776 * 4777 * We can't race against event freeing since the other 4778 * side will require wqh->lock via remove_wait_queue(), 4779 * which we hold. 4780 */ 4781 spin_lock(&memcg->event_list_lock); 4782 if (!list_empty(&event->list)) { 4783 list_del_init(&event->list); 4784 /* 4785 * We are in atomic context, but cgroup_event_remove() 4786 * may sleep, so we have to call it in workqueue. 4787 */ 4788 schedule_work(&event->remove); 4789 } 4790 spin_unlock(&memcg->event_list_lock); 4791 } 4792 4793 return 0; 4794 } 4795 4796 static void memcg_event_ptable_queue_proc(struct file *file, 4797 wait_queue_head_t *wqh, poll_table *pt) 4798 { 4799 struct mem_cgroup_event *event = 4800 container_of(pt, struct mem_cgroup_event, pt); 4801 4802 event->wqh = wqh; 4803 add_wait_queue(wqh, &event->wait); 4804 } 4805 4806 /* 4807 * DO NOT USE IN NEW FILES. 4808 * 4809 * Parse input and register new cgroup event handler. 4810 * 4811 * Input must be in format '<event_fd> <control_fd> <args>'. 4812 * Interpretation of args is defined by control file implementation. 4813 */ 4814 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4815 char *buf, size_t nbytes, loff_t off) 4816 { 4817 struct cgroup_subsys_state *css = of_css(of); 4818 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4819 struct mem_cgroup_event *event; 4820 struct cgroup_subsys_state *cfile_css; 4821 unsigned int efd, cfd; 4822 struct fd efile; 4823 struct fd cfile; 4824 const char *name; 4825 char *endp; 4826 int ret; 4827 4828 buf = strstrip(buf); 4829 4830 efd = simple_strtoul(buf, &endp, 10); 4831 if (*endp != ' ') 4832 return -EINVAL; 4833 buf = endp + 1; 4834 4835 cfd = simple_strtoul(buf, &endp, 10); 4836 if ((*endp != ' ') && (*endp != '\0')) 4837 return -EINVAL; 4838 buf = endp + 1; 4839 4840 event = kzalloc(sizeof(*event), GFP_KERNEL); 4841 if (!event) 4842 return -ENOMEM; 4843 4844 event->memcg = memcg; 4845 INIT_LIST_HEAD(&event->list); 4846 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4847 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4848 INIT_WORK(&event->remove, memcg_event_remove); 4849 4850 efile = fdget(efd); 4851 if (!efile.file) { 4852 ret = -EBADF; 4853 goto out_kfree; 4854 } 4855 4856 event->eventfd = eventfd_ctx_fileget(efile.file); 4857 if (IS_ERR(event->eventfd)) { 4858 ret = PTR_ERR(event->eventfd); 4859 goto out_put_efile; 4860 } 4861 4862 cfile = fdget(cfd); 4863 if (!cfile.file) { 4864 ret = -EBADF; 4865 goto out_put_eventfd; 4866 } 4867 4868 /* the process need read permission on control file */ 4869 /* AV: shouldn't we check that it's been opened for read instead? */ 4870 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4871 if (ret < 0) 4872 goto out_put_cfile; 4873 4874 /* 4875 * Determine the event callbacks and set them in @event. This used 4876 * to be done via struct cftype but cgroup core no longer knows 4877 * about these events. The following is crude but the whole thing 4878 * is for compatibility anyway. 4879 * 4880 * DO NOT ADD NEW FILES. 
4881 */ 4882 name = cfile.file->f_path.dentry->d_name.name; 4883 4884 if (!strcmp(name, "memory.usage_in_bytes")) { 4885 event->register_event = mem_cgroup_usage_register_event; 4886 event->unregister_event = mem_cgroup_usage_unregister_event; 4887 } else if (!strcmp(name, "memory.oom_control")) { 4888 event->register_event = mem_cgroup_oom_register_event; 4889 event->unregister_event = mem_cgroup_oom_unregister_event; 4890 } else if (!strcmp(name, "memory.pressure_level")) { 4891 event->register_event = vmpressure_register_event; 4892 event->unregister_event = vmpressure_unregister_event; 4893 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4894 event->register_event = memsw_cgroup_usage_register_event; 4895 event->unregister_event = memsw_cgroup_usage_unregister_event; 4896 } else { 4897 ret = -EINVAL; 4898 goto out_put_cfile; 4899 } 4900 4901 /* 4902 * Verify @cfile should belong to @css. Also, remaining events are 4903 * automatically removed on cgroup destruction but the removal is 4904 * asynchronous, so take an extra ref on @css. 4905 */ 4906 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 4907 &memory_cgrp_subsys); 4908 ret = -EINVAL; 4909 if (IS_ERR(cfile_css)) 4910 goto out_put_cfile; 4911 if (cfile_css != css) { 4912 css_put(cfile_css); 4913 goto out_put_cfile; 4914 } 4915 4916 ret = event->register_event(memcg, event->eventfd, buf); 4917 if (ret) 4918 goto out_put_css; 4919 4920 vfs_poll(efile.file, &event->pt); 4921 4922 spin_lock(&memcg->event_list_lock); 4923 list_add(&event->list, &memcg->event_list); 4924 spin_unlock(&memcg->event_list_lock); 4925 4926 fdput(cfile); 4927 fdput(efile); 4928 4929 return nbytes; 4930 4931 out_put_css: 4932 css_put(css); 4933 out_put_cfile: 4934 fdput(cfile); 4935 out_put_eventfd: 4936 eventfd_ctx_put(event->eventfd); 4937 out_put_efile: 4938 fdput(efile); 4939 out_kfree: 4940 kfree(event); 4941 4942 return ret; 4943 } 4944 4945 static struct cftype mem_cgroup_legacy_files[] = { 4946 { 4947 .name = "usage_in_bytes", 4948 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4949 .read_u64 = mem_cgroup_read_u64, 4950 }, 4951 { 4952 .name = "max_usage_in_bytes", 4953 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4954 .write = mem_cgroup_reset, 4955 .read_u64 = mem_cgroup_read_u64, 4956 }, 4957 { 4958 .name = "limit_in_bytes", 4959 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4960 .write = mem_cgroup_write, 4961 .read_u64 = mem_cgroup_read_u64, 4962 }, 4963 { 4964 .name = "soft_limit_in_bytes", 4965 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4966 .write = mem_cgroup_write, 4967 .read_u64 = mem_cgroup_read_u64, 4968 }, 4969 { 4970 .name = "failcnt", 4971 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4972 .write = mem_cgroup_reset, 4973 .read_u64 = mem_cgroup_read_u64, 4974 }, 4975 { 4976 .name = "stat", 4977 .seq_show = memcg_stat_show, 4978 }, 4979 { 4980 .name = "force_empty", 4981 .write = mem_cgroup_force_empty_write, 4982 }, 4983 { 4984 .name = "use_hierarchy", 4985 .write_u64 = mem_cgroup_hierarchy_write, 4986 .read_u64 = mem_cgroup_hierarchy_read, 4987 }, 4988 { 4989 .name = "cgroup.event_control", /* XXX: for compat */ 4990 .write = memcg_write_event_control, 4991 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 4992 }, 4993 { 4994 .name = "swappiness", 4995 .read_u64 = mem_cgroup_swappiness_read, 4996 .write_u64 = mem_cgroup_swappiness_write, 4997 }, 4998 { 4999 .name = "move_charge_at_immigrate", 5000 .read_u64 = mem_cgroup_move_charge_read, 5001 .write_u64 = mem_cgroup_move_charge_write, 5002 
}, 5003 { 5004 .name = "oom_control", 5005 .seq_show = mem_cgroup_oom_control_read, 5006 .write_u64 = mem_cgroup_oom_control_write, 5007 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5008 }, 5009 { 5010 .name = "pressure_level", 5011 }, 5012 #ifdef CONFIG_NUMA 5013 { 5014 .name = "numa_stat", 5015 .seq_show = memcg_numa_stat_show, 5016 }, 5017 #endif 5018 { 5019 .name = "kmem.limit_in_bytes", 5020 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5021 .write = mem_cgroup_write, 5022 .read_u64 = mem_cgroup_read_u64, 5023 }, 5024 { 5025 .name = "kmem.usage_in_bytes", 5026 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5027 .read_u64 = mem_cgroup_read_u64, 5028 }, 5029 { 5030 .name = "kmem.failcnt", 5031 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5032 .write = mem_cgroup_reset, 5033 .read_u64 = mem_cgroup_read_u64, 5034 }, 5035 { 5036 .name = "kmem.max_usage_in_bytes", 5037 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5038 .write = mem_cgroup_reset, 5039 .read_u64 = mem_cgroup_read_u64, 5040 }, 5041 #if defined(CONFIG_MEMCG_KMEM) && \ 5042 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 5043 { 5044 .name = "kmem.slabinfo", 5045 .seq_show = memcg_slab_show, 5046 }, 5047 #endif 5048 { 5049 .name = "kmem.tcp.limit_in_bytes", 5050 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 5051 .write = mem_cgroup_write, 5052 .read_u64 = mem_cgroup_read_u64, 5053 }, 5054 { 5055 .name = "kmem.tcp.usage_in_bytes", 5056 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 5057 .read_u64 = mem_cgroup_read_u64, 5058 }, 5059 { 5060 .name = "kmem.tcp.failcnt", 5061 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 5062 .write = mem_cgroup_reset, 5063 .read_u64 = mem_cgroup_read_u64, 5064 }, 5065 { 5066 .name = "kmem.tcp.max_usage_in_bytes", 5067 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 5068 .write = mem_cgroup_reset, 5069 .read_u64 = mem_cgroup_read_u64, 5070 }, 5071 { }, /* terminate */ 5072 }; 5073 5074 /* 5075 * Private memory cgroup IDR 5076 * 5077 * Swap-out records and page cache shadow entries need to store memcg 5078 * references in constrained space, so we maintain an ID space that is 5079 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 5080 * memory-controlled cgroups to 64k. 5081 * 5082 * However, there usually are many references to the offline CSS after 5083 * the cgroup has been destroyed, such as page cache or reclaimable 5084 * slab objects, that don't need to hang on to the ID. We want to keep 5085 * those dead CSS from occupying IDs, or we might quickly exhaust the 5086 * relatively small ID space and prevent the creation of new cgroups 5087 * even when there are much fewer than 64k cgroups - possibly none. 5088 * 5089 * Maintain a private 16-bit ID space for memcg, and allow the ID to 5090 * be freed and recycled when it's no longer needed, which is usually 5091 * when the CSS is offlined. 5092 * 5093 * The only exception to that are records of swapped out tmpfs/shmem 5094 * pages that need to be attributed to live ancestors on swapin. But 5095 * those references are manageable from userspace. 
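 *
 * A minimal sketch of how such an ID is typically turned back into a
 * pinned memcg (an illustrative pattern, not a helper defined here; "id"
 * is assumed to come from a swap record or shadow entry):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget_online(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();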
5096 */ 5097 5098 static DEFINE_IDR(mem_cgroup_idr); 5099 5100 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 5101 { 5102 if (memcg->id.id > 0) { 5103 idr_remove(&mem_cgroup_idr, memcg->id.id); 5104 memcg->id.id = 0; 5105 } 5106 } 5107 5108 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 5109 unsigned int n) 5110 { 5111 refcount_add(n, &memcg->id.ref); 5112 } 5113 5114 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 5115 { 5116 if (refcount_sub_and_test(n, &memcg->id.ref)) { 5117 mem_cgroup_id_remove(memcg); 5118 5119 /* Memcg ID pins CSS */ 5120 css_put(&memcg->css); 5121 } 5122 } 5123 5124 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 5125 { 5126 mem_cgroup_id_put_many(memcg, 1); 5127 } 5128 5129 /** 5130 * mem_cgroup_from_id - look up a memcg from a memcg id 5131 * @id: the memcg id to look up 5132 * 5133 * Caller must hold rcu_read_lock(). 5134 */ 5135 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 5136 { 5137 WARN_ON_ONCE(!rcu_read_lock_held()); 5138 return idr_find(&mem_cgroup_idr, id); 5139 } 5140 5141 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5142 { 5143 struct mem_cgroup_per_node *pn; 5144 int tmp = node; 5145 /* 5146 * This routine is called against possible nodes. 5147 * But it's BUG to call kmalloc() against offline node. 5148 * 5149 * TODO: this routine can waste much memory for nodes which will 5150 * never be onlined. It's better to use memory hotplug callback 5151 * function. 5152 */ 5153 if (!node_state(node, N_NORMAL_MEMORY)) 5154 tmp = -1; 5155 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 5156 if (!pn) 5157 return 1; 5158 5159 pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, 5160 GFP_KERNEL_ACCOUNT); 5161 if (!pn->lruvec_stat_local) { 5162 kfree(pn); 5163 return 1; 5164 } 5165 5166 pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat, 5167 GFP_KERNEL_ACCOUNT); 5168 if (!pn->lruvec_stat_cpu) { 5169 free_percpu(pn->lruvec_stat_local); 5170 kfree(pn); 5171 return 1; 5172 } 5173 5174 lruvec_init(&pn->lruvec); 5175 pn->usage_in_excess = 0; 5176 pn->on_tree = false; 5177 pn->memcg = memcg; 5178 5179 memcg->nodeinfo[node] = pn; 5180 return 0; 5181 } 5182 5183 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5184 { 5185 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 5186 5187 if (!pn) 5188 return; 5189 5190 free_percpu(pn->lruvec_stat_cpu); 5191 free_percpu(pn->lruvec_stat_local); 5192 kfree(pn); 5193 } 5194 5195 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5196 { 5197 int node; 5198 5199 for_each_node(node) 5200 free_mem_cgroup_per_node_info(memcg, node); 5201 free_percpu(memcg->vmstats_percpu); 5202 free_percpu(memcg->vmstats_local); 5203 kfree(memcg); 5204 } 5205 5206 static void mem_cgroup_free(struct mem_cgroup *memcg) 5207 { 5208 memcg_wb_domain_exit(memcg); 5209 /* 5210 * Flush percpu vmstats and vmevents to guarantee the value correctness 5211 * on parent's and all ancestor levels. 
5212 */ 5213 memcg_flush_percpu_vmstats(memcg); 5214 memcg_flush_percpu_vmevents(memcg); 5215 __mem_cgroup_free(memcg); 5216 } 5217 5218 static struct mem_cgroup *mem_cgroup_alloc(void) 5219 { 5220 struct mem_cgroup *memcg; 5221 unsigned int size; 5222 int node; 5223 int __maybe_unused i; 5224 long error = -ENOMEM; 5225 5226 size = sizeof(struct mem_cgroup); 5227 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 5228 5229 memcg = kzalloc(size, GFP_KERNEL); 5230 if (!memcg) 5231 return ERR_PTR(error); 5232 5233 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 5234 1, MEM_CGROUP_ID_MAX, 5235 GFP_KERNEL); 5236 if (memcg->id.id < 0) { 5237 error = memcg->id.id; 5238 goto fail; 5239 } 5240 5241 memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5242 GFP_KERNEL_ACCOUNT); 5243 if (!memcg->vmstats_local) 5244 goto fail; 5245 5246 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5247 GFP_KERNEL_ACCOUNT); 5248 if (!memcg->vmstats_percpu) 5249 goto fail; 5250 5251 for_each_node(node) 5252 if (alloc_mem_cgroup_per_node_info(memcg, node)) 5253 goto fail; 5254 5255 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 5256 goto fail; 5257 5258 INIT_WORK(&memcg->high_work, high_work_func); 5259 INIT_LIST_HEAD(&memcg->oom_notify); 5260 mutex_init(&memcg->thresholds_lock); 5261 spin_lock_init(&memcg->move_lock); 5262 vmpressure_init(&memcg->vmpressure); 5263 INIT_LIST_HEAD(&memcg->event_list); 5264 spin_lock_init(&memcg->event_list_lock); 5265 memcg->socket_pressure = jiffies; 5266 #ifdef CONFIG_MEMCG_KMEM 5267 memcg->kmemcg_id = -1; 5268 INIT_LIST_HEAD(&memcg->objcg_list); 5269 #endif 5270 #ifdef CONFIG_CGROUP_WRITEBACK 5271 INIT_LIST_HEAD(&memcg->cgwb_list); 5272 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5273 memcg->cgwb_frn[i].done = 5274 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5275 #endif 5276 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5277 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 5278 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 5279 memcg->deferred_split_queue.split_queue_len = 0; 5280 #endif 5281 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5282 return memcg; 5283 fail: 5284 mem_cgroup_id_remove(memcg); 5285 __mem_cgroup_free(memcg); 5286 return ERR_PTR(error); 5287 } 5288 5289 static struct cgroup_subsys_state * __ref 5290 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5291 { 5292 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5293 struct mem_cgroup *memcg; 5294 long error = -ENOMEM; 5295 5296 memalloc_use_memcg(parent); 5297 memcg = mem_cgroup_alloc(); 5298 memalloc_unuse_memcg(); 5299 if (IS_ERR(memcg)) 5300 return ERR_CAST(memcg); 5301 5302 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5303 memcg->soft_limit = PAGE_COUNTER_MAX; 5304 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5305 if (parent) { 5306 memcg->swappiness = mem_cgroup_swappiness(parent); 5307 memcg->oom_kill_disable = parent->oom_kill_disable; 5308 } 5309 if (parent && parent->use_hierarchy) { 5310 memcg->use_hierarchy = true; 5311 page_counter_init(&memcg->memory, &parent->memory); 5312 page_counter_init(&memcg->swap, &parent->swap); 5313 page_counter_init(&memcg->kmem, &parent->kmem); 5314 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 5315 } else { 5316 page_counter_init(&memcg->memory, NULL); 5317 page_counter_init(&memcg->swap, NULL); 5318 page_counter_init(&memcg->kmem, NULL); 5319 page_counter_init(&memcg->tcpmem, NULL); 5320 /* 5321 * Deeper hierachy with use_hierarchy == false doesn't make 5322 * 
much sense so let cgroup subsystem know about this 5323 * unfortunate state in our controller. 5324 */ 5325 if (parent != root_mem_cgroup) 5326 memory_cgrp_subsys.broken_hierarchy = true; 5327 } 5328 5329 /* The following stuff does not apply to the root */ 5330 if (!parent) { 5331 root_mem_cgroup = memcg; 5332 return &memcg->css; 5333 } 5334 5335 error = memcg_online_kmem(memcg); 5336 if (error) 5337 goto fail; 5338 5339 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5340 static_branch_inc(&memcg_sockets_enabled_key); 5341 5342 return &memcg->css; 5343 fail: 5344 mem_cgroup_id_remove(memcg); 5345 mem_cgroup_free(memcg); 5346 return ERR_PTR(error); 5347 } 5348 5349 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 5350 { 5351 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5352 5353 /* 5354 * A memcg must be visible for memcg_expand_shrinker_maps() 5355 * by the time the maps are allocated. So, we allocate maps 5356 * here, when for_each_mem_cgroup() can't skip it. 5357 */ 5358 if (memcg_alloc_shrinker_maps(memcg)) { 5359 mem_cgroup_id_remove(memcg); 5360 return -ENOMEM; 5361 } 5362 5363 /* Online state pins memcg ID, memcg ID pins CSS */ 5364 refcount_set(&memcg->id.ref, 1); 5365 css_get(css); 5366 return 0; 5367 } 5368 5369 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5370 { 5371 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5372 struct mem_cgroup_event *event, *tmp; 5373 5374 /* 5375 * Unregister events and notify userspace. 5376 * Notify userspace about cgroup removing only after rmdir of cgroup 5377 * directory to avoid race between userspace and kernelspace. 5378 */ 5379 spin_lock(&memcg->event_list_lock); 5380 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5381 list_del_init(&event->list); 5382 schedule_work(&event->remove); 5383 } 5384 spin_unlock(&memcg->event_list_lock); 5385 5386 page_counter_set_min(&memcg->memory, 0); 5387 page_counter_set_low(&memcg->memory, 0); 5388 5389 memcg_offline_kmem(memcg); 5390 wb_memcg_offline(memcg); 5391 5392 drain_all_stock(memcg); 5393 5394 mem_cgroup_id_put(memcg); 5395 } 5396 5397 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5398 { 5399 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5400 5401 invalidate_reclaim_iterators(memcg); 5402 } 5403 5404 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5405 { 5406 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5407 int __maybe_unused i; 5408 5409 #ifdef CONFIG_CGROUP_WRITEBACK 5410 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5411 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5412 #endif 5413 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5414 static_branch_dec(&memcg_sockets_enabled_key); 5415 5416 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5417 static_branch_dec(&memcg_sockets_enabled_key); 5418 5419 vmpressure_cleanup(&memcg->vmpressure); 5420 cancel_work_sync(&memcg->high_work); 5421 mem_cgroup_remove_from_trees(memcg); 5422 memcg_free_shrinker_maps(memcg); 5423 memcg_free_kmem(memcg); 5424 mem_cgroup_free(memcg); 5425 } 5426 5427 /** 5428 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5429 * @css: the target css 5430 * 5431 * Reset the states of the mem_cgroup associated with @css. This is 5432 * invoked when the userland requests disabling on the default hierarchy 5433 * but the memcg is pinned through dependency. 
The memcg should stop 5434 * applying policies and should revert to the vanilla state as it may be 5435 * made visible again. 5436 * 5437 * The current implementation only resets the essential configurations. 5438 * This needs to be expanded to cover all the visible parts. 5439 */ 5440 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5441 { 5442 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5443 5444 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5445 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5446 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5447 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5448 page_counter_set_min(&memcg->memory, 0); 5449 page_counter_set_low(&memcg->memory, 0); 5450 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5451 memcg->soft_limit = PAGE_COUNTER_MAX; 5452 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5453 memcg_wb_domain_size_changed(memcg); 5454 } 5455 5456 #ifdef CONFIG_MMU 5457 /* Handlers for move charge at task migration. */ 5458 static int mem_cgroup_do_precharge(unsigned long count) 5459 { 5460 int ret; 5461 5462 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5463 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5464 if (!ret) { 5465 mc.precharge += count; 5466 return ret; 5467 } 5468 5469 /* Try charges one by one with reclaim, but do not retry */ 5470 while (count--) { 5471 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5472 if (ret) 5473 return ret; 5474 mc.precharge++; 5475 cond_resched(); 5476 } 5477 return 0; 5478 } 5479 5480 union mc_target { 5481 struct page *page; 5482 swp_entry_t ent; 5483 }; 5484 5485 enum mc_target_type { 5486 MC_TARGET_NONE = 0, 5487 MC_TARGET_PAGE, 5488 MC_TARGET_SWAP, 5489 MC_TARGET_DEVICE, 5490 }; 5491 5492 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5493 unsigned long addr, pte_t ptent) 5494 { 5495 struct page *page = vm_normal_page(vma, addr, ptent); 5496 5497 if (!page || !page_mapped(page)) 5498 return NULL; 5499 if (PageAnon(page)) { 5500 if (!(mc.flags & MOVE_ANON)) 5501 return NULL; 5502 } else { 5503 if (!(mc.flags & MOVE_FILE)) 5504 return NULL; 5505 } 5506 if (!get_page_unless_zero(page)) 5507 return NULL; 5508 5509 return page; 5510 } 5511 5512 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5513 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5514 pte_t ptent, swp_entry_t *entry) 5515 { 5516 struct page *page = NULL; 5517 swp_entry_t ent = pte_to_swp_entry(ptent); 5518 5519 if (!(mc.flags & MOVE_ANON)) 5520 return NULL; 5521 5522 /* 5523 * Handle MEMORY_DEVICE_PRIVATE pages, which are ZONE_DEVICE pages belonging 5524 * to a device; because they are not accessible by the CPU, they are stored 5525 * as special swap entries in the CPU page table. 5526 */ 5527 if (is_device_private_entry(ent)) { 5528 page = device_private_entry_to_page(ent); 5529 /* 5530 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has 5531 * a refcount of 1 when free (unlike a normal page). 5532 */ 5533 if (!page_ref_add_unless(page, 1, 1)) 5534 return NULL; 5535 return page; 5536 } 5537 5538 if (non_swap_entry(ent)) 5539 return NULL; 5540 5541 /* 5542 * Because lookup_swap_cache() updates some statistics counters, 5543 * we call find_get_page() with swapper_space directly. 
5544 */ 5545 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5546 entry->val = ent.val; 5547 5548 return page; 5549 } 5550 #else 5551 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5552 pte_t ptent, swp_entry_t *entry) 5553 { 5554 return NULL; 5555 } 5556 #endif 5557 5558 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5559 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5560 { 5561 if (!vma->vm_file) /* anonymous vma */ 5562 return NULL; 5563 if (!(mc.flags & MOVE_FILE)) 5564 return NULL; 5565 5566 /* page is moved even if it's not RSS of this task(page-faulted). */ 5567 /* shmem/tmpfs may report page out on swap: account for that too. */ 5568 return find_get_incore_page(vma->vm_file->f_mapping, 5569 linear_page_index(vma, addr)); 5570 } 5571 5572 /** 5573 * mem_cgroup_move_account - move account of the page 5574 * @page: the page 5575 * @compound: charge the page as compound or small page 5576 * @from: mem_cgroup which the page is moved from. 5577 * @to: mem_cgroup which the page is moved to. @from != @to. 5578 * 5579 * The caller must make sure the page is not on LRU (isolate_page() is useful.) 5580 * 5581 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5582 * from old cgroup. 5583 */ 5584 static int mem_cgroup_move_account(struct page *page, 5585 bool compound, 5586 struct mem_cgroup *from, 5587 struct mem_cgroup *to) 5588 { 5589 struct lruvec *from_vec, *to_vec; 5590 struct pglist_data *pgdat; 5591 unsigned int nr_pages = compound ? thp_nr_pages(page) : 1; 5592 int ret; 5593 5594 VM_BUG_ON(from == to); 5595 VM_BUG_ON_PAGE(PageLRU(page), page); 5596 VM_BUG_ON(compound && !PageTransHuge(page)); 5597 5598 /* 5599 * Prevent mem_cgroup_migrate() from looking at 5600 * page->mem_cgroup of its source page while we change it. 5601 */ 5602 ret = -EBUSY; 5603 if (!trylock_page(page)) 5604 goto out; 5605 5606 ret = -EINVAL; 5607 if (page->mem_cgroup != from) 5608 goto out_unlock; 5609 5610 pgdat = page_pgdat(page); 5611 from_vec = mem_cgroup_lruvec(from, pgdat); 5612 to_vec = mem_cgroup_lruvec(to, pgdat); 5613 5614 lock_page_memcg(page); 5615 5616 if (PageAnon(page)) { 5617 if (page_mapped(page)) { 5618 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 5619 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 5620 if (PageTransHuge(page)) { 5621 __mod_lruvec_state(from_vec, NR_ANON_THPS, 5622 -nr_pages); 5623 __mod_lruvec_state(to_vec, NR_ANON_THPS, 5624 nr_pages); 5625 } 5626 5627 } 5628 } else { 5629 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 5630 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 5631 5632 if (PageSwapBacked(page)) { 5633 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 5634 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 5635 } 5636 5637 if (page_mapped(page)) { 5638 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 5639 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 5640 } 5641 5642 if (PageDirty(page)) { 5643 struct address_space *mapping = page_mapping(page); 5644 5645 if (mapping_can_writeback(mapping)) { 5646 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 5647 -nr_pages); 5648 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 5649 nr_pages); 5650 } 5651 } 5652 } 5653 5654 if (PageWriteback(page)) { 5655 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 5656 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 5657 } 5658 5659 /* 5660 * All state has been migrated, let's switch to the new memcg. 
5661 * 5662 * It is safe to change page->mem_cgroup here because the page 5663 * is referenced, charged, isolated, and locked: we can't race 5664 * with (un)charging, migration, LRU putback, or anything else 5665 * that would rely on a stable page->mem_cgroup. 5666 * 5667 * Note that lock_page_memcg is a memcg lock, not a page lock, 5668 * to save space. As soon as we switch page->mem_cgroup to a 5669 * new memcg that isn't locked, the above state can change 5670 * concurrently again. Make sure we're truly done with it. 5671 */ 5672 smp_mb(); 5673 5674 css_get(&to->css); 5675 css_put(&from->css); 5676 5677 page->mem_cgroup = to; 5678 5679 __unlock_page_memcg(from); 5680 5681 ret = 0; 5682 5683 local_irq_disable(); 5684 mem_cgroup_charge_statistics(to, page, nr_pages); 5685 memcg_check_events(to, page); 5686 mem_cgroup_charge_statistics(from, page, -nr_pages); 5687 memcg_check_events(from, page); 5688 local_irq_enable(); 5689 out_unlock: 5690 unlock_page(page); 5691 out: 5692 return ret; 5693 } 5694 5695 /** 5696 * get_mctgt_type - get target type of moving charge 5697 * @vma: the vma the pte to be checked belongs to 5698 * @addr: the address corresponding to the pte to be checked 5699 * @ptent: the pte to be checked 5700 * @target: the pointer where the target page or swap entry will be stored (can be NULL) 5701 * 5702 * Returns 5703 * 0 (MC_TARGET_NONE): if the pte is not a target for move charge. 5704 * 1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5705 * move charge. If @target is not NULL, the page is stored in target->page 5706 * with an extra refcount taken (callers should handle it). 5707 * 2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5708 * target for charge migration. If @target is not NULL, the entry is stored 5709 * in target->ent. 5710 * 3 (MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PRIVATE 5711 * (a ZONE_DEVICE page, and thus not on the LRU). 5712 * For now such a page is charged like a regular page would be, as for all 5713 * intents and purposes it is just special memory taking the place of a 5714 * regular page. 5715 * 5716 * See Documentation/vm/hmm.rst and include/linux/hmm.h 5717 * 5718 * Called with pte lock held. 5719 */ 5720 5721 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5722 unsigned long addr, pte_t ptent, union mc_target *target) 5723 { 5724 struct page *page = NULL; 5725 enum mc_target_type ret = MC_TARGET_NONE; 5726 swp_entry_t ent = { .val = 0 }; 5727 5728 if (pte_present(ptent)) 5729 page = mc_handle_present_pte(vma, addr, ptent); 5730 else if (is_swap_pte(ptent)) 5731 page = mc_handle_swap_pte(vma, ptent, &ent); 5732 else if (pte_none(ptent)) 5733 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5734 5735 if (!page && !ent.val) 5736 return ret; 5737 if (page) { 5738 /* 5739 * Do only a loose check w/o serialization. 5740 * mem_cgroup_move_account() checks whether the page is valid or 5741 * not under LRU exclusion. 5742 */ 5743 if (page->mem_cgroup == mc.from) { 5744 ret = MC_TARGET_PAGE; 5745 if (is_device_private_page(page)) 5746 ret = MC_TARGET_DEVICE; 5747 if (target) 5748 target->page = page; 5749 } 5750 if (!ret || !target) 5751 put_page(page); 5752 } 5753 /* 5754 * There is a swap entry and the page doesn't exist or isn't charged. 5755 * But we cannot move a tail-page in a THP. 
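 * (lookup_swap_cgroup_id() below returns the memcg ID recorded for this
 * entry in the swap cgroup map, so the entry is only a move target while
 * it is still charged to mc.from.)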
5756 */ 5757 if (ent.val && !ret && (!page || !PageTransCompound(page)) && 5758 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5759 ret = MC_TARGET_SWAP; 5760 if (target) 5761 target->ent = ent; 5762 } 5763 return ret; 5764 } 5765 5766 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5767 /* 5768 * We don't consider PMD mapped swapping or file mapped pages because THP does 5769 * not support them for now. 5770 * Caller should make sure that pmd_trans_huge(pmd) is true. 5771 */ 5772 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5773 unsigned long addr, pmd_t pmd, union mc_target *target) 5774 { 5775 struct page *page = NULL; 5776 enum mc_target_type ret = MC_TARGET_NONE; 5777 5778 if (unlikely(is_swap_pmd(pmd))) { 5779 VM_BUG_ON(thp_migration_supported() && 5780 !is_pmd_migration_entry(pmd)); 5781 return ret; 5782 } 5783 page = pmd_page(pmd); 5784 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5785 if (!(mc.flags & MOVE_ANON)) 5786 return ret; 5787 if (page->mem_cgroup == mc.from) { 5788 ret = MC_TARGET_PAGE; 5789 if (target) { 5790 get_page(page); 5791 target->page = page; 5792 } 5793 } 5794 return ret; 5795 } 5796 #else 5797 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5798 unsigned long addr, pmd_t pmd, union mc_target *target) 5799 { 5800 return MC_TARGET_NONE; 5801 } 5802 #endif 5803 5804 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5805 unsigned long addr, unsigned long end, 5806 struct mm_walk *walk) 5807 { 5808 struct vm_area_struct *vma = walk->vma; 5809 pte_t *pte; 5810 spinlock_t *ptl; 5811 5812 ptl = pmd_trans_huge_lock(pmd, vma); 5813 if (ptl) { 5814 /* 5815 * Note their can not be MC_TARGET_DEVICE for now as we do not 5816 * support transparent huge page with MEMORY_DEVICE_PRIVATE but 5817 * this might change. 5818 */ 5819 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5820 mc.precharge += HPAGE_PMD_NR; 5821 spin_unlock(ptl); 5822 return 0; 5823 } 5824 5825 if (pmd_trans_unstable(pmd)) 5826 return 0; 5827 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5828 for (; addr != end; pte++, addr += PAGE_SIZE) 5829 if (get_mctgt_type(vma, addr, *pte, NULL)) 5830 mc.precharge++; /* increment precharge temporarily */ 5831 pte_unmap_unlock(pte - 1, ptl); 5832 cond_resched(); 5833 5834 return 0; 5835 } 5836 5837 static const struct mm_walk_ops precharge_walk_ops = { 5838 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5839 }; 5840 5841 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5842 { 5843 unsigned long precharge; 5844 5845 mmap_read_lock(mm); 5846 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); 5847 mmap_read_unlock(mm); 5848 5849 precharge = mc.precharge; 5850 mc.precharge = 0; 5851 5852 return precharge; 5853 } 5854 5855 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5856 { 5857 unsigned long precharge = mem_cgroup_count_precharge(mm); 5858 5859 VM_BUG_ON(mc.moving_task); 5860 mc.moving_task = current; 5861 return mem_cgroup_do_precharge(precharge); 5862 } 5863 5864 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. 
*/ 5865 static void __mem_cgroup_clear_mc(void) 5866 { 5867 struct mem_cgroup *from = mc.from; 5868 struct mem_cgroup *to = mc.to; 5869 5870 /* we must uncharge all the leftover precharges from mc.to */ 5871 if (mc.precharge) { 5872 cancel_charge(mc.to, mc.precharge); 5873 mc.precharge = 0; 5874 } 5875 /* 5876 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5877 * we must uncharge here. 5878 */ 5879 if (mc.moved_charge) { 5880 cancel_charge(mc.from, mc.moved_charge); 5881 mc.moved_charge = 0; 5882 } 5883 /* we must fixup refcnts and charges */ 5884 if (mc.moved_swap) { 5885 /* uncharge swap account from the old cgroup */ 5886 if (!mem_cgroup_is_root(mc.from)) 5887 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5888 5889 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 5890 5891 /* 5892 * we charged both to->memory and to->memsw, so we 5893 * should uncharge to->memory. 5894 */ 5895 if (!mem_cgroup_is_root(mc.to)) 5896 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5897 5898 mc.moved_swap = 0; 5899 } 5900 memcg_oom_recover(from); 5901 memcg_oom_recover(to); 5902 wake_up_all(&mc.waitq); 5903 } 5904 5905 static void mem_cgroup_clear_mc(void) 5906 { 5907 struct mm_struct *mm = mc.mm; 5908 5909 /* 5910 * we must clear moving_task before waking up waiters at the end of 5911 * task migration. 5912 */ 5913 mc.moving_task = NULL; 5914 __mem_cgroup_clear_mc(); 5915 spin_lock(&mc.lock); 5916 mc.from = NULL; 5917 mc.to = NULL; 5918 mc.mm = NULL; 5919 spin_unlock(&mc.lock); 5920 5921 mmput(mm); 5922 } 5923 5924 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 5925 { 5926 struct cgroup_subsys_state *css; 5927 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 5928 struct mem_cgroup *from; 5929 struct task_struct *leader, *p; 5930 struct mm_struct *mm; 5931 unsigned long move_flags; 5932 int ret = 0; 5933 5934 /* charge immigration isn't supported on the default hierarchy */ 5935 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5936 return 0; 5937 5938 /* 5939 * Multi-process migrations only happen on the default hierarchy 5940 * where charge immigration is not used. Perform charge 5941 * immigration if @tset contains a leader and whine if there are 5942 * multiple. 5943 */ 5944 p = NULL; 5945 cgroup_taskset_for_each_leader(leader, css, tset) { 5946 WARN_ON_ONCE(p); 5947 p = leader; 5948 memcg = mem_cgroup_from_css(css); 5949 } 5950 if (!p) 5951 return 0; 5952 5953 /* 5954 * We are now commited to this value whatever it is. Changes in this 5955 * tunable will only affect upcoming migrations, not the current one. 5956 * So we need to save it, and keep it going. 
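 *
 * (For example, on the legacy hierarchy an administrator who earlier wrote
 * MOVE_ANON | MOVE_FILE, i.e. 3, to the destination cgroup's
 * memory.move_charge_at_immigrate gets both anonymous and file charges
 * moved by this migration; move_flags below is that saved snapshot.)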
5957 */ 5958 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 5959 if (!move_flags) 5960 return 0; 5961 5962 from = mem_cgroup_from_task(p); 5963 5964 VM_BUG_ON(from == memcg); 5965 5966 mm = get_task_mm(p); 5967 if (!mm) 5968 return 0; 5969 /* We move charges only when we move a owner of the mm */ 5970 if (mm->owner == p) { 5971 VM_BUG_ON(mc.from); 5972 VM_BUG_ON(mc.to); 5973 VM_BUG_ON(mc.precharge); 5974 VM_BUG_ON(mc.moved_charge); 5975 VM_BUG_ON(mc.moved_swap); 5976 5977 spin_lock(&mc.lock); 5978 mc.mm = mm; 5979 mc.from = from; 5980 mc.to = memcg; 5981 mc.flags = move_flags; 5982 spin_unlock(&mc.lock); 5983 /* We set mc.moving_task later */ 5984 5985 ret = mem_cgroup_precharge_mc(mm); 5986 if (ret) 5987 mem_cgroup_clear_mc(); 5988 } else { 5989 mmput(mm); 5990 } 5991 return ret; 5992 } 5993 5994 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 5995 { 5996 if (mc.to) 5997 mem_cgroup_clear_mc(); 5998 } 5999 6000 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6001 unsigned long addr, unsigned long end, 6002 struct mm_walk *walk) 6003 { 6004 int ret = 0; 6005 struct vm_area_struct *vma = walk->vma; 6006 pte_t *pte; 6007 spinlock_t *ptl; 6008 enum mc_target_type target_type; 6009 union mc_target target; 6010 struct page *page; 6011 6012 ptl = pmd_trans_huge_lock(pmd, vma); 6013 if (ptl) { 6014 if (mc.precharge < HPAGE_PMD_NR) { 6015 spin_unlock(ptl); 6016 return 0; 6017 } 6018 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6019 if (target_type == MC_TARGET_PAGE) { 6020 page = target.page; 6021 if (!isolate_lru_page(page)) { 6022 if (!mem_cgroup_move_account(page, true, 6023 mc.from, mc.to)) { 6024 mc.precharge -= HPAGE_PMD_NR; 6025 mc.moved_charge += HPAGE_PMD_NR; 6026 } 6027 putback_lru_page(page); 6028 } 6029 put_page(page); 6030 } else if (target_type == MC_TARGET_DEVICE) { 6031 page = target.page; 6032 if (!mem_cgroup_move_account(page, true, 6033 mc.from, mc.to)) { 6034 mc.precharge -= HPAGE_PMD_NR; 6035 mc.moved_charge += HPAGE_PMD_NR; 6036 } 6037 put_page(page); 6038 } 6039 spin_unlock(ptl); 6040 return 0; 6041 } 6042 6043 if (pmd_trans_unstable(pmd)) 6044 return 0; 6045 retry: 6046 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6047 for (; addr != end; addr += PAGE_SIZE) { 6048 pte_t ptent = *(pte++); 6049 bool device = false; 6050 swp_entry_t ent; 6051 6052 if (!mc.precharge) 6053 break; 6054 6055 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6056 case MC_TARGET_DEVICE: 6057 device = true; 6058 fallthrough; 6059 case MC_TARGET_PAGE: 6060 page = target.page; 6061 /* 6062 * We can have a part of the split pmd here. Moving it 6063 * can be done but it would be too convoluted so simply 6064 * ignore such a partial THP and keep it in original 6065 * memcg. There should be somebody mapping the head. 6066 */ 6067 if (PageTransCompound(page)) 6068 goto put; 6069 if (!device && isolate_lru_page(page)) 6070 goto put; 6071 if (!mem_cgroup_move_account(page, false, 6072 mc.from, mc.to)) { 6073 mc.precharge--; 6074 /* we uncharge from mc.from later. */ 6075 mc.moved_charge++; 6076 } 6077 if (!device) 6078 putback_lru_page(page); 6079 put: /* get_mctgt_type() gets the page */ 6080 put_page(page); 6081 break; 6082 case MC_TARGET_SWAP: 6083 ent = target.ent; 6084 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 6085 mc.precharge--; 6086 mem_cgroup_id_get_many(mc.to, 1); 6087 /* we fixup other refcnts and charges later. 
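			 * The matching uncharge of mc.from's memsw counter
			 * and the duplicate memory charge on mc.to are
			 * dropped in __mem_cgroup_clear_mc() via
			 * mc.moved_swap.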
*/ 6088 mc.moved_swap++; 6089 } 6090 break; 6091 default: 6092 break; 6093 } 6094 } 6095 pte_unmap_unlock(pte - 1, ptl); 6096 cond_resched(); 6097 6098 if (addr != end) { 6099 /* 6100 * We have consumed all precharges we got in can_attach(). 6101 * We try charge one by one, but don't do any additional 6102 * charges to mc.to if we have failed in charge once in attach() 6103 * phase. 6104 */ 6105 ret = mem_cgroup_do_precharge(1); 6106 if (!ret) 6107 goto retry; 6108 } 6109 6110 return ret; 6111 } 6112 6113 static const struct mm_walk_ops charge_walk_ops = { 6114 .pmd_entry = mem_cgroup_move_charge_pte_range, 6115 }; 6116 6117 static void mem_cgroup_move_charge(void) 6118 { 6119 lru_add_drain_all(); 6120 /* 6121 * Signal lock_page_memcg() to take the memcg's move_lock 6122 * while we're moving its pages to another memcg. Then wait 6123 * for already started RCU-only updates to finish. 6124 */ 6125 atomic_inc(&mc.from->moving_account); 6126 synchronize_rcu(); 6127 retry: 6128 if (unlikely(!mmap_read_trylock(mc.mm))) { 6129 /* 6130 * Someone who are holding the mmap_lock might be waiting in 6131 * waitq. So we cancel all extra charges, wake up all waiters, 6132 * and retry. Because we cancel precharges, we might not be able 6133 * to move enough charges, but moving charge is a best-effort 6134 * feature anyway, so it wouldn't be a big problem. 6135 */ 6136 __mem_cgroup_clear_mc(); 6137 cond_resched(); 6138 goto retry; 6139 } 6140 /* 6141 * When we have consumed all precharges and failed in doing 6142 * additional charge, the page walk just aborts. 6143 */ 6144 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, 6145 NULL); 6146 6147 mmap_read_unlock(mc.mm); 6148 atomic_dec(&mc.from->moving_account); 6149 } 6150 6151 static void mem_cgroup_move_task(void) 6152 { 6153 if (mc.to) { 6154 mem_cgroup_move_charge(); 6155 mem_cgroup_clear_mc(); 6156 } 6157 } 6158 #else /* !CONFIG_MMU */ 6159 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6160 { 6161 return 0; 6162 } 6163 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6164 { 6165 } 6166 static void mem_cgroup_move_task(void) 6167 { 6168 } 6169 #endif 6170 6171 /* 6172 * Cgroup retains root cgroups across [un]mount cycles making it necessary 6173 * to verify whether we're attached to the default hierarchy on each mount 6174 * attempt. 6175 */ 6176 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 6177 { 6178 /* 6179 * use_hierarchy is forced on the default hierarchy. cgroup core 6180 * guarantees that @root doesn't have any children, so turning it 6181 * on for the root memcg is enough. 
6182 */ 6183 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6184 root_mem_cgroup->use_hierarchy = true; 6185 else 6186 root_mem_cgroup->use_hierarchy = false; 6187 } 6188 6189 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 6190 { 6191 if (value == PAGE_COUNTER_MAX) 6192 seq_puts(m, "max\n"); 6193 else 6194 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 6195 6196 return 0; 6197 } 6198 6199 static u64 memory_current_read(struct cgroup_subsys_state *css, 6200 struct cftype *cft) 6201 { 6202 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6203 6204 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 6205 } 6206 6207 static int memory_min_show(struct seq_file *m, void *v) 6208 { 6209 return seq_puts_memcg_tunable(m, 6210 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 6211 } 6212 6213 static ssize_t memory_min_write(struct kernfs_open_file *of, 6214 char *buf, size_t nbytes, loff_t off) 6215 { 6216 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6217 unsigned long min; 6218 int err; 6219 6220 buf = strstrip(buf); 6221 err = page_counter_memparse(buf, "max", &min); 6222 if (err) 6223 return err; 6224 6225 page_counter_set_min(&memcg->memory, min); 6226 6227 return nbytes; 6228 } 6229 6230 static int memory_low_show(struct seq_file *m, void *v) 6231 { 6232 return seq_puts_memcg_tunable(m, 6233 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 6234 } 6235 6236 static ssize_t memory_low_write(struct kernfs_open_file *of, 6237 char *buf, size_t nbytes, loff_t off) 6238 { 6239 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6240 unsigned long low; 6241 int err; 6242 6243 buf = strstrip(buf); 6244 err = page_counter_memparse(buf, "max", &low); 6245 if (err) 6246 return err; 6247 6248 page_counter_set_low(&memcg->memory, low); 6249 6250 return nbytes; 6251 } 6252 6253 static int memory_high_show(struct seq_file *m, void *v) 6254 { 6255 return seq_puts_memcg_tunable(m, 6256 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 6257 } 6258 6259 static ssize_t memory_high_write(struct kernfs_open_file *of, 6260 char *buf, size_t nbytes, loff_t off) 6261 { 6262 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6263 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6264 bool drained = false; 6265 unsigned long high; 6266 int err; 6267 6268 buf = strstrip(buf); 6269 err = page_counter_memparse(buf, "max", &high); 6270 if (err) 6271 return err; 6272 6273 for (;;) { 6274 unsigned long nr_pages = page_counter_read(&memcg->memory); 6275 unsigned long reclaimed; 6276 6277 if (nr_pages <= high) 6278 break; 6279 6280 if (signal_pending(current)) 6281 break; 6282 6283 if (!drained) { 6284 drain_all_stock(memcg); 6285 drained = true; 6286 continue; 6287 } 6288 6289 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 6290 GFP_KERNEL, true); 6291 6292 if (!reclaimed && !nr_retries--) 6293 break; 6294 } 6295 6296 page_counter_set_high(&memcg->memory, high); 6297 6298 memcg_wb_domain_size_changed(memcg); 6299 6300 return nbytes; 6301 } 6302 6303 static int memory_max_show(struct seq_file *m, void *v) 6304 { 6305 return seq_puts_memcg_tunable(m, 6306 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 6307 } 6308 6309 static ssize_t memory_max_write(struct kernfs_open_file *of, 6310 char *buf, size_t nbytes, loff_t off) 6311 { 6312 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6313 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 6314 bool drained = false; 6315 unsigned long max; 6316 int err; 6317 6318 buf = strstrip(buf); 6319 err = 
page_counter_memparse(buf, "max", &max); 6320 if (err) 6321 return err; 6322 6323 xchg(&memcg->memory.max, max); 6324 6325 for (;;) { 6326 unsigned long nr_pages = page_counter_read(&memcg->memory); 6327 6328 if (nr_pages <= max) 6329 break; 6330 6331 if (signal_pending(current)) 6332 break; 6333 6334 if (!drained) { 6335 drain_all_stock(memcg); 6336 drained = true; 6337 continue; 6338 } 6339 6340 if (nr_reclaims) { 6341 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 6342 GFP_KERNEL, true)) 6343 nr_reclaims--; 6344 continue; 6345 } 6346 6347 memcg_memory_event(memcg, MEMCG_OOM); 6348 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 6349 break; 6350 } 6351 6352 memcg_wb_domain_size_changed(memcg); 6353 return nbytes; 6354 } 6355 6356 static void __memory_events_show(struct seq_file *m, atomic_long_t *events) 6357 { 6358 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 6359 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 6360 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 6361 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 6362 seq_printf(m, "oom_kill %lu\n", 6363 atomic_long_read(&events[MEMCG_OOM_KILL])); 6364 } 6365 6366 static int memory_events_show(struct seq_file *m, void *v) 6367 { 6368 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6369 6370 __memory_events_show(m, memcg->memory_events); 6371 return 0; 6372 } 6373 6374 static int memory_events_local_show(struct seq_file *m, void *v) 6375 { 6376 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6377 6378 __memory_events_show(m, memcg->memory_events_local); 6379 return 0; 6380 } 6381 6382 static int memory_stat_show(struct seq_file *m, void *v) 6383 { 6384 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6385 char *buf; 6386 6387 buf = memory_stat_format(memcg); 6388 if (!buf) 6389 return -ENOMEM; 6390 seq_puts(m, buf); 6391 kfree(buf); 6392 return 0; 6393 } 6394 6395 #ifdef CONFIG_NUMA 6396 static int memory_numa_stat_show(struct seq_file *m, void *v) 6397 { 6398 int i; 6399 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6400 6401 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 6402 int nid; 6403 6404 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) 6405 continue; 6406 6407 seq_printf(m, "%s", memory_stats[i].name); 6408 for_each_node_state(nid, N_MEMORY) { 6409 u64 size; 6410 struct lruvec *lruvec; 6411 6412 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 6413 size = lruvec_page_state(lruvec, memory_stats[i].idx); 6414 size *= memory_stats[i].ratio; 6415 seq_printf(m, " N%d=%llu", nid, size); 6416 } 6417 seq_putc(m, '\n'); 6418 } 6419 6420 return 0; 6421 } 6422 #endif 6423 6424 static int memory_oom_group_show(struct seq_file *m, void *v) 6425 { 6426 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6427 6428 seq_printf(m, "%d\n", memcg->oom_group); 6429 6430 return 0; 6431 } 6432 6433 static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 6434 char *buf, size_t nbytes, loff_t off) 6435 { 6436 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6437 int ret, oom_group; 6438 6439 buf = strstrip(buf); 6440 if (!buf) 6441 return -EINVAL; 6442 6443 ret = kstrtoint(buf, 0, &oom_group); 6444 if (ret) 6445 return ret; 6446 6447 if (oom_group != 0 && oom_group != 1) 6448 return -EINVAL; 6449 6450 memcg->oom_group = oom_group; 6451 6452 return nbytes; 6453 } 6454 6455 static struct cftype memory_files[] = { 6456 { 6457 .name = "current", 6458 .flags = CFTYPE_NOT_ON_ROOT, 6459 .read_u64 = memory_current_read, 6460 }, 
6461 { 6462 .name = "min", 6463 .flags = CFTYPE_NOT_ON_ROOT, 6464 .seq_show = memory_min_show, 6465 .write = memory_min_write, 6466 }, 6467 { 6468 .name = "low", 6469 .flags = CFTYPE_NOT_ON_ROOT, 6470 .seq_show = memory_low_show, 6471 .write = memory_low_write, 6472 }, 6473 { 6474 .name = "high", 6475 .flags = CFTYPE_NOT_ON_ROOT, 6476 .seq_show = memory_high_show, 6477 .write = memory_high_write, 6478 }, 6479 { 6480 .name = "max", 6481 .flags = CFTYPE_NOT_ON_ROOT, 6482 .seq_show = memory_max_show, 6483 .write = memory_max_write, 6484 }, 6485 { 6486 .name = "events", 6487 .flags = CFTYPE_NOT_ON_ROOT, 6488 .file_offset = offsetof(struct mem_cgroup, events_file), 6489 .seq_show = memory_events_show, 6490 }, 6491 { 6492 .name = "events.local", 6493 .flags = CFTYPE_NOT_ON_ROOT, 6494 .file_offset = offsetof(struct mem_cgroup, events_local_file), 6495 .seq_show = memory_events_local_show, 6496 }, 6497 { 6498 .name = "stat", 6499 .seq_show = memory_stat_show, 6500 }, 6501 #ifdef CONFIG_NUMA 6502 { 6503 .name = "numa_stat", 6504 .seq_show = memory_numa_stat_show, 6505 }, 6506 #endif 6507 { 6508 .name = "oom.group", 6509 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 6510 .seq_show = memory_oom_group_show, 6511 .write = memory_oom_group_write, 6512 }, 6513 { } /* terminate */ 6514 }; 6515 6516 struct cgroup_subsys memory_cgrp_subsys = { 6517 .css_alloc = mem_cgroup_css_alloc, 6518 .css_online = mem_cgroup_css_online, 6519 .css_offline = mem_cgroup_css_offline, 6520 .css_released = mem_cgroup_css_released, 6521 .css_free = mem_cgroup_css_free, 6522 .css_reset = mem_cgroup_css_reset, 6523 .can_attach = mem_cgroup_can_attach, 6524 .cancel_attach = mem_cgroup_cancel_attach, 6525 .post_attach = mem_cgroup_move_task, 6526 .bind = mem_cgroup_bind, 6527 .dfl_cftypes = memory_files, 6528 .legacy_cftypes = mem_cgroup_legacy_files, 6529 .early_init = 0, 6530 }; 6531 6532 /* 6533 * This function calculates an individual cgroup's effective 6534 * protection which is derived from its own memory.min/low, its 6535 * parent's and siblings' settings, as well as the actual memory 6536 * distribution in the tree. 6537 * 6538 * The following rules apply to the effective protection values: 6539 * 6540 * 1. At the first level of reclaim, effective protection is equal to 6541 * the declared protection in memory.min and memory.low. 6542 * 6543 * 2. To enable safe delegation of the protection configuration, at 6544 * subsequent levels the effective protection is capped to the 6545 * parent's effective protection. 6546 * 6547 * 3. To make complex and dynamic subtrees easier to configure, the 6548 * user is allowed to overcommit the declared protection at a given 6549 * level. If that is the case, the parent's effective protection is 6550 * distributed to the children in proportion to how much protection 6551 * they have declared and how much of it they are utilizing. 6552 * 6553 * This makes distribution proportional, but also work-conserving: 6554 * if one cgroup claims much more protection than it uses memory, 6555 * the unused remainder is available to its siblings. 6556 * 6557 * 4. Conversely, when the declared protection is undercommitted at a 6558 * given level, the distribution of the larger parental protection 6559 * budget is NOT proportional. A cgroup's protection from a sibling 6560 * is capped to its own memory.min/low setting. 6561 * 6562 * 5. 
However, to allow protecting recursive subtrees from each other
 *    without having to declare each individual cgroup's fixed share
 *    of the ancestor's claim to protection, any unutilized -
 *    "floating" - protection from up the tree is distributed in
 *    proportion to each cgroup's *usage*. This makes the protection
 *    neutral wrt sibling cgroups and lets them compete freely over
 *    the shared parental protection budget, but it protects the
 *    subtree as a whole from neighboring subtrees.
 *
 * Note that 4. and 5. are not in conflict: 4. is about protecting
 * against immediate siblings whereas 5. is about protecting against
 * neighboring subtrees.
 */
static unsigned long effective_protection(unsigned long usage,
					  unsigned long parent_usage,
					  unsigned long setting,
					  unsigned long parent_effective,
					  unsigned long siblings_protected)
{
	unsigned long protected;
	unsigned long ep;

	protected = min(usage, setting);
	/*
	 * If all cgroups at this level combined claim and use more
	 * protection than what the parent affords them, distribute
	 * shares in proportion to utilization.
	 *
	 * We are using actual utilization rather than the statically
	 * claimed protection in order to be work-conserving: claimed
	 * but unused protection is available to siblings that would
	 * otherwise get a smaller chunk than what they claimed.
	 */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;

	/*
	 * Ok, utilized protection of all children is within what the
	 * parent affords them, so we know whatever this child claims
	 * and utilizes is effectively protected.
	 *
	 * If there is unprotected usage beyond this value, reclaim
	 * will apply pressure in proportion to that amount.
	 *
	 * If there is unutilized protection, the cgroup will be fully
	 * shielded from reclaim, but we do return a smaller value for
	 * protection than what the group could enjoy in theory. This
	 * is okay. With the overcommit distribution above, effective
	 * protection is always dependent on how memory is actually
	 * consumed among the siblings anyway.
	 */
	ep = protected;

	/*
	 * If the children aren't claiming (all of) the protection
	 * afforded to them by the parent, distribute the remainder in
	 * proportion to the (unprotected) memory of each cgroup. That
	 * way, cgroups that aren't explicitly prioritized wrt each
	 * other compete freely over the allowance, but they are
	 * collectively protected from neighboring trees.
	 *
	 * We're using unprotected memory for the weight so that if
	 * some cgroups DO claim explicit protection, we don't protect
	 * the same bytes twice.
	 *
	 * Check both usage and parent_usage against the respective
	 * protected values. One should imply the other, but they
	 * aren't read atomically - make sure the division is sane.
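	 *
	 * A worked example with made-up numbers: if recursive protection
	 * is enabled and parent_effective = 100 pages, siblings_protected
	 * = 40, parent_usage = 200, usage = 80 and protected = 20, this
	 * cgroup picks up (100 - 40) * (80 - 20) / (200 - 40) = 22 extra
	 * pages of "floating" protection on top of ep.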
	 */
	if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
		return ep;
	if (parent_effective > siblings_protected &&
	    parent_usage > siblings_protected &&
	    usage > protected) {
		unsigned long unclaimed;

		unclaimed = parent_effective - siblings_protected;
		unclaimed *= usage - protected;
		unclaimed /= parent_usage - siblings_protected;

		ep += unclaimed;
	}

	return ep;
}

/**
 * mem_cgroup_calculate_protection - calculate effective memory protection
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 * of a top-down tree iteration, not for isolated queries.
 */
void mem_cgroup_calculate_protection(struct mem_cgroup *root,
				     struct mem_cgroup *memcg)
{
	unsigned long usage, parent_usage;
	struct mem_cgroup *parent;

	if (mem_cgroup_disabled())
		return;

	if (!root)
		root = root_mem_cgroup;

	/*
	 * Effective values of the reclaim targets are ignored so they
	 * can be stale. Have a look at mem_cgroup_protection for more
	 * details.
	 * TODO: calculation should be more robust so that we do not need
	 * that special casing.
	 */
	if (memcg == root)
		return;

	usage = page_counter_read(&memcg->memory);
	if (!usage)
		return;

	parent = parent_mem_cgroup(memcg);
	/* No parent means a non-hierarchical mode on v1 memcg */
	if (!parent)
		return;

	if (parent == root) {
		memcg->memory.emin = READ_ONCE(memcg->memory.min);
		memcg->memory.elow = READ_ONCE(memcg->memory.low);
		return;
	}

	parent_usage = page_counter_read(&parent->memory);

	WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
			READ_ONCE(memcg->memory.min),
			READ_ONCE(parent->memory.emin),
			atomic_long_read(&parent->memory.children_min_usage)));

	WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
			READ_ONCE(memcg->memory.low),
			READ_ONCE(parent->memory.elow),
			atomic_long_read(&parent->memory.children_low_usage)));
}

/**
 * mem_cgroup_charge - charge a newly allocated page to a cgroup
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success. Otherwise, an error code is returned.
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
	unsigned int nr_pages = thp_nr_pages(page);
	struct mem_cgroup *memcg = NULL;
	int ret = 0;

	if (mem_cgroup_disabled())
		goto out;

	if (PageSwapCache(page)) {
		swp_entry_t ent = { .val = page_private(page), };
		unsigned short id;

		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible. shmem_unuse() encounters
		 * already charged pages, too. page->mem_cgroup is protected
		 * by the page lock, which serializes swap cache removal, which
		 * in turn serializes uncharging.
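		 * The compound_head() check below relies on exactly that:
		 * if the head page already carries a memcg, an earlier swap
		 * fault (or shmem_unuse()) won the race and this charge
		 * attempt can bail out without doing anything.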
6736 */ 6737 VM_BUG_ON_PAGE(!PageLocked(page), page); 6738 if (compound_head(page)->mem_cgroup) 6739 goto out; 6740 6741 id = lookup_swap_cgroup_id(ent); 6742 rcu_read_lock(); 6743 memcg = mem_cgroup_from_id(id); 6744 if (memcg && !css_tryget_online(&memcg->css)) 6745 memcg = NULL; 6746 rcu_read_unlock(); 6747 } 6748 6749 if (!memcg) 6750 memcg = get_mem_cgroup_from_mm(mm); 6751 6752 ret = try_charge(memcg, gfp_mask, nr_pages); 6753 if (ret) 6754 goto out_put; 6755 6756 css_get(&memcg->css); 6757 commit_charge(page, memcg); 6758 6759 local_irq_disable(); 6760 mem_cgroup_charge_statistics(memcg, page, nr_pages); 6761 memcg_check_events(memcg, page); 6762 local_irq_enable(); 6763 6764 if (PageSwapCache(page)) { 6765 swp_entry_t entry = { .val = page_private(page) }; 6766 /* 6767 * The swap entry might not get freed for a long time, 6768 * let's not wait for it. The page already received a 6769 * memory+swap charge, drop the swap entry duplicate. 6770 */ 6771 mem_cgroup_uncharge_swap(entry, nr_pages); 6772 } 6773 6774 out_put: 6775 css_put(&memcg->css); 6776 out: 6777 return ret; 6778 } 6779 6780 struct uncharge_gather { 6781 struct mem_cgroup *memcg; 6782 unsigned long nr_pages; 6783 unsigned long pgpgout; 6784 unsigned long nr_kmem; 6785 struct page *dummy_page; 6786 }; 6787 6788 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 6789 { 6790 memset(ug, 0, sizeof(*ug)); 6791 } 6792 6793 static void uncharge_batch(const struct uncharge_gather *ug) 6794 { 6795 unsigned long flags; 6796 6797 if (!mem_cgroup_is_root(ug->memcg)) { 6798 page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); 6799 if (do_memsw_account()) 6800 page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); 6801 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) 6802 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); 6803 memcg_oom_recover(ug->memcg); 6804 } 6805 6806 local_irq_save(flags); 6807 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 6808 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); 6809 memcg_check_events(ug->memcg, ug->dummy_page); 6810 local_irq_restore(flags); 6811 6812 /* drop reference from uncharge_page */ 6813 css_put(&ug->memcg->css); 6814 } 6815 6816 static void uncharge_page(struct page *page, struct uncharge_gather *ug) 6817 { 6818 unsigned long nr_pages; 6819 6820 VM_BUG_ON_PAGE(PageLRU(page), page); 6821 6822 if (!page->mem_cgroup) 6823 return; 6824 6825 /* 6826 * Nobody should be changing or seriously looking at 6827 * page->mem_cgroup at this point, we have fully 6828 * exclusive access to the page. 6829 */ 6830 6831 if (ug->memcg != page->mem_cgroup) { 6832 if (ug->memcg) { 6833 uncharge_batch(ug); 6834 uncharge_gather_clear(ug); 6835 } 6836 ug->memcg = page->mem_cgroup; 6837 6838 /* pairs with css_put in uncharge_batch */ 6839 css_get(&ug->memcg->css); 6840 } 6841 6842 nr_pages = compound_nr(page); 6843 ug->nr_pages += nr_pages; 6844 6845 if (!PageKmemcg(page)) { 6846 ug->pgpgout++; 6847 } else { 6848 ug->nr_kmem += nr_pages; 6849 __ClearPageKmemcg(page); 6850 } 6851 6852 ug->dummy_page = page; 6853 page->mem_cgroup = NULL; 6854 css_put(&ug->memcg->css); 6855 } 6856 6857 static void uncharge_list(struct list_head *page_list) 6858 { 6859 struct uncharge_gather ug; 6860 struct list_head *next; 6861 6862 uncharge_gather_clear(&ug); 6863 6864 /* 6865 * Note that the list can be a single page->lru; hence the 6866 * do-while loop instead of a simple list_for_each_entry(). 
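	 * If the "list" is really a lone page's own ->lru, a
	 * list_for_each_entry() walk would treat that entry as the list
	 * head and visit nothing; processing the first node before
	 * testing for termination avoids that.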
6867 */ 6868 next = page_list->next; 6869 do { 6870 struct page *page; 6871 6872 page = list_entry(next, struct page, lru); 6873 next = page->lru.next; 6874 6875 uncharge_page(page, &ug); 6876 } while (next != page_list); 6877 6878 if (ug.memcg) 6879 uncharge_batch(&ug); 6880 } 6881 6882 /** 6883 * mem_cgroup_uncharge - uncharge a page 6884 * @page: page to uncharge 6885 * 6886 * Uncharge a page previously charged with mem_cgroup_charge(). 6887 */ 6888 void mem_cgroup_uncharge(struct page *page) 6889 { 6890 struct uncharge_gather ug; 6891 6892 if (mem_cgroup_disabled()) 6893 return; 6894 6895 /* Don't touch page->lru of any random page, pre-check: */ 6896 if (!page->mem_cgroup) 6897 return; 6898 6899 uncharge_gather_clear(&ug); 6900 uncharge_page(page, &ug); 6901 uncharge_batch(&ug); 6902 } 6903 6904 /** 6905 * mem_cgroup_uncharge_list - uncharge a list of page 6906 * @page_list: list of pages to uncharge 6907 * 6908 * Uncharge a list of pages previously charged with 6909 * mem_cgroup_charge(). 6910 */ 6911 void mem_cgroup_uncharge_list(struct list_head *page_list) 6912 { 6913 if (mem_cgroup_disabled()) 6914 return; 6915 6916 if (!list_empty(page_list)) 6917 uncharge_list(page_list); 6918 } 6919 6920 /** 6921 * mem_cgroup_migrate - charge a page's replacement 6922 * @oldpage: currently circulating page 6923 * @newpage: replacement page 6924 * 6925 * Charge @newpage as a replacement page for @oldpage. @oldpage will 6926 * be uncharged upon free. 6927 * 6928 * Both pages must be locked, @newpage->mapping must be set up. 6929 */ 6930 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) 6931 { 6932 struct mem_cgroup *memcg; 6933 unsigned int nr_pages; 6934 unsigned long flags; 6935 6936 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6937 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6938 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6939 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6940 newpage); 6941 6942 if (mem_cgroup_disabled()) 6943 return; 6944 6945 /* Page cache replacement: new page already charged? */ 6946 if (newpage->mem_cgroup) 6947 return; 6948 6949 /* Swapcache readahead pages can get replaced before being charged */ 6950 memcg = oldpage->mem_cgroup; 6951 if (!memcg) 6952 return; 6953 6954 /* Force-charge the new page. The old one will be freed soon */ 6955 nr_pages = thp_nr_pages(newpage); 6956 6957 page_counter_charge(&memcg->memory, nr_pages); 6958 if (do_memsw_account()) 6959 page_counter_charge(&memcg->memsw, nr_pages); 6960 6961 css_get(&memcg->css); 6962 commit_charge(newpage, memcg); 6963 6964 local_irq_save(flags); 6965 mem_cgroup_charge_statistics(memcg, newpage, nr_pages); 6966 memcg_check_events(memcg, newpage); 6967 local_irq_restore(flags); 6968 } 6969 6970 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 6971 EXPORT_SYMBOL(memcg_sockets_enabled_key); 6972 6973 void mem_cgroup_sk_alloc(struct sock *sk) 6974 { 6975 struct mem_cgroup *memcg; 6976 6977 if (!mem_cgroup_sockets_enabled) 6978 return; 6979 6980 /* Do not associate the sock with unrelated interrupted task's memcg. 
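	 * In interrupt context "current" is whatever task happened to be
	 * running when the interrupt arrived, so charging its memcg for
	 * this socket would be arbitrary; such sockets are simply left
	 * without a sk_memcg association.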
*/ 6981 if (in_interrupt()) 6982 return; 6983 6984 rcu_read_lock(); 6985 memcg = mem_cgroup_from_task(current); 6986 if (memcg == root_mem_cgroup) 6987 goto out; 6988 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 6989 goto out; 6990 if (css_tryget(&memcg->css)) 6991 sk->sk_memcg = memcg; 6992 out: 6993 rcu_read_unlock(); 6994 } 6995 6996 void mem_cgroup_sk_free(struct sock *sk) 6997 { 6998 if (sk->sk_memcg) 6999 css_put(&sk->sk_memcg->css); 7000 } 7001 7002 /** 7003 * mem_cgroup_charge_skmem - charge socket memory 7004 * @memcg: memcg to charge 7005 * @nr_pages: number of pages to charge 7006 * 7007 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 7008 * @memcg's configured limit, %false if the charge had to be forced. 7009 */ 7010 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7011 { 7012 gfp_t gfp_mask = GFP_KERNEL; 7013 7014 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7015 struct page_counter *fail; 7016 7017 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 7018 memcg->tcpmem_pressure = 0; 7019 return true; 7020 } 7021 page_counter_charge(&memcg->tcpmem, nr_pages); 7022 memcg->tcpmem_pressure = 1; 7023 return false; 7024 } 7025 7026 /* Don't block in the packet receive path */ 7027 if (in_softirq()) 7028 gfp_mask = GFP_NOWAIT; 7029 7030 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 7031 7032 if (try_charge(memcg, gfp_mask, nr_pages) == 0) 7033 return true; 7034 7035 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); 7036 return false; 7037 } 7038 7039 /** 7040 * mem_cgroup_uncharge_skmem - uncharge socket memory 7041 * @memcg: memcg to uncharge 7042 * @nr_pages: number of pages to uncharge 7043 */ 7044 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7045 { 7046 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7047 page_counter_uncharge(&memcg->tcpmem, nr_pages); 7048 return; 7049 } 7050 7051 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 7052 7053 refill_stock(memcg, nr_pages); 7054 } 7055 7056 static int __init cgroup_memory(char *s) 7057 { 7058 char *token; 7059 7060 while ((token = strsep(&s, ",")) != NULL) { 7061 if (!*token) 7062 continue; 7063 if (!strcmp(token, "nosocket")) 7064 cgroup_memory_nosocket = true; 7065 if (!strcmp(token, "nokmem")) 7066 cgroup_memory_nokmem = true; 7067 } 7068 return 0; 7069 } 7070 __setup("cgroup.memory=", cgroup_memory); 7071 7072 /* 7073 * subsys_initcall() for memory controller. 7074 * 7075 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 7076 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 7077 * basically everything that doesn't depend on a specific mem_cgroup structure 7078 * should be initialized from here. 7079 */ 7080 static int __init mem_cgroup_init(void) 7081 { 7082 int cpu, node; 7083 7084 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 7085 memcg_hotplug_cpu_dead); 7086 7087 for_each_possible_cpu(cpu) 7088 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 7089 drain_local_stock); 7090 7091 for_each_node(node) { 7092 struct mem_cgroup_tree_per_node *rtpn; 7093 7094 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 7095 node_online(node) ? 
node : NUMA_NO_NODE); 7096 7097 rtpn->rb_root = RB_ROOT; 7098 rtpn->rb_rightmost = NULL; 7099 spin_lock_init(&rtpn->lock); 7100 soft_limit_tree.rb_tree_per_node[node] = rtpn; 7101 } 7102 7103 return 0; 7104 } 7105 subsys_initcall(mem_cgroup_init); 7106 7107 #ifdef CONFIG_MEMCG_SWAP 7108 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 7109 { 7110 while (!refcount_inc_not_zero(&memcg->id.ref)) { 7111 /* 7112 * The root cgroup cannot be destroyed, so it's refcount must 7113 * always be >= 1. 7114 */ 7115 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { 7116 VM_BUG_ON(1); 7117 break; 7118 } 7119 memcg = parent_mem_cgroup(memcg); 7120 if (!memcg) 7121 memcg = root_mem_cgroup; 7122 } 7123 return memcg; 7124 } 7125 7126 /** 7127 * mem_cgroup_swapout - transfer a memsw charge to swap 7128 * @page: page whose memsw charge to transfer 7129 * @entry: swap entry to move the charge to 7130 * 7131 * Transfer the memsw charge of @page to @entry. 7132 */ 7133 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 7134 { 7135 struct mem_cgroup *memcg, *swap_memcg; 7136 unsigned int nr_entries; 7137 unsigned short oldid; 7138 7139 VM_BUG_ON_PAGE(PageLRU(page), page); 7140 VM_BUG_ON_PAGE(page_count(page), page); 7141 7142 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7143 return; 7144 7145 memcg = page->mem_cgroup; 7146 7147 /* Readahead page, never charged */ 7148 if (!memcg) 7149 return; 7150 7151 /* 7152 * In case the memcg owning these pages has been offlined and doesn't 7153 * have an ID allocated to it anymore, charge the closest online 7154 * ancestor for the swap instead and transfer the memory+swap charge. 7155 */ 7156 swap_memcg = mem_cgroup_id_get_online(memcg); 7157 nr_entries = thp_nr_pages(page); 7158 /* Get references for the tail pages, too */ 7159 if (nr_entries > 1) 7160 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7161 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7162 nr_entries); 7163 VM_BUG_ON_PAGE(oldid, page); 7164 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7165 7166 page->mem_cgroup = NULL; 7167 7168 if (!mem_cgroup_is_root(memcg)) 7169 page_counter_uncharge(&memcg->memory, nr_entries); 7170 7171 if (!cgroup_memory_noswap && memcg != swap_memcg) { 7172 if (!mem_cgroup_is_root(swap_memcg)) 7173 page_counter_charge(&swap_memcg->memsw, nr_entries); 7174 page_counter_uncharge(&memcg->memsw, nr_entries); 7175 } 7176 7177 /* 7178 * Interrupts should be disabled here because the caller holds the 7179 * i_pages lock which is taken with interrupts-off. It is 7180 * important here to have the interrupts disabled because it is the 7181 * only synchronisation we have for updating the per-CPU variables. 7182 */ 7183 VM_BUG_ON(!irqs_disabled()); 7184 mem_cgroup_charge_statistics(memcg, page, -nr_entries); 7185 memcg_check_events(memcg, page); 7186 7187 css_put(&memcg->css); 7188 } 7189 7190 /** 7191 * mem_cgroup_try_charge_swap - try charging swap space for a page 7192 * @page: page being added to swap 7193 * @entry: swap entry to charge 7194 * 7195 * Try to charge @page's memcg for the swap space at @entry. 7196 * 7197 * Returns 0 on success, -ENOMEM on failure. 
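 *
 * On the legacy (cgroup1) hierarchy this function is a no-op that
 * returns 0: swap is accounted there through the combined memsw
 * counter, charged together with memory, rather than through a
 * separate swap counter.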
7198 */ 7199 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) 7200 { 7201 unsigned int nr_pages = thp_nr_pages(page); 7202 struct page_counter *counter; 7203 struct mem_cgroup *memcg; 7204 unsigned short oldid; 7205 7206 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7207 return 0; 7208 7209 memcg = page->mem_cgroup; 7210 7211 /* Readahead page, never charged */ 7212 if (!memcg) 7213 return 0; 7214 7215 if (!entry.val) { 7216 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7217 return 0; 7218 } 7219 7220 memcg = mem_cgroup_id_get_online(memcg); 7221 7222 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && 7223 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 7224 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 7225 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7226 mem_cgroup_id_put(memcg); 7227 return -ENOMEM; 7228 } 7229 7230 /* Get references for the tail pages, too */ 7231 if (nr_pages > 1) 7232 mem_cgroup_id_get_many(memcg, nr_pages - 1); 7233 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 7234 VM_BUG_ON_PAGE(oldid, page); 7235 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 7236 7237 return 0; 7238 } 7239 7240 /** 7241 * mem_cgroup_uncharge_swap - uncharge swap space 7242 * @entry: swap entry to uncharge 7243 * @nr_pages: the amount of swap space to uncharge 7244 */ 7245 void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 7246 { 7247 struct mem_cgroup *memcg; 7248 unsigned short id; 7249 7250 id = swap_cgroup_record(entry, 0, nr_pages); 7251 rcu_read_lock(); 7252 memcg = mem_cgroup_from_id(id); 7253 if (memcg) { 7254 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { 7255 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7256 page_counter_uncharge(&memcg->swap, nr_pages); 7257 else 7258 page_counter_uncharge(&memcg->memsw, nr_pages); 7259 } 7260 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 7261 mem_cgroup_id_put_many(memcg, nr_pages); 7262 } 7263 rcu_read_unlock(); 7264 } 7265 7266 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 7267 { 7268 long nr_swap_pages = get_nr_swap_pages(); 7269 7270 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7271 return nr_swap_pages; 7272 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 7273 nr_swap_pages = min_t(long, nr_swap_pages, 7274 READ_ONCE(memcg->swap.max) - 7275 page_counter_read(&memcg->swap)); 7276 return nr_swap_pages; 7277 } 7278 7279 bool mem_cgroup_swap_full(struct page *page) 7280 { 7281 struct mem_cgroup *memcg; 7282 7283 VM_BUG_ON_PAGE(!PageLocked(page), page); 7284 7285 if (vm_swap_full()) 7286 return true; 7287 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7288 return false; 7289 7290 memcg = page->mem_cgroup; 7291 if (!memcg) 7292 return false; 7293 7294 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 7295 unsigned long usage = page_counter_read(&memcg->swap); 7296 7297 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 7298 usage * 2 >= READ_ONCE(memcg->swap.max)) 7299 return true; 7300 } 7301 7302 return false; 7303 } 7304 7305 static int __init setup_swap_account(char *s) 7306 { 7307 if (!strcmp(s, "1")) 7308 cgroup_memory_noswap = 0; 7309 else if (!strcmp(s, "0")) 7310 cgroup_memory_noswap = 1; 7311 return 1; 7312 } 7313 __setup("swapaccount=", setup_swap_account); 7314 7315 static u64 swap_current_read(struct cgroup_subsys_state *css, 7316 struct cftype *cft) 7317 { 7318 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 7319 7320 return 
(u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 7321 } 7322 7323 static int swap_high_show(struct seq_file *m, void *v) 7324 { 7325 return seq_puts_memcg_tunable(m, 7326 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 7327 } 7328 7329 static ssize_t swap_high_write(struct kernfs_open_file *of, 7330 char *buf, size_t nbytes, loff_t off) 7331 { 7332 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7333 unsigned long high; 7334 int err; 7335 7336 buf = strstrip(buf); 7337 err = page_counter_memparse(buf, "max", &high); 7338 if (err) 7339 return err; 7340 7341 page_counter_set_high(&memcg->swap, high); 7342 7343 return nbytes; 7344 } 7345 7346 static int swap_max_show(struct seq_file *m, void *v) 7347 { 7348 return seq_puts_memcg_tunable(m, 7349 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 7350 } 7351 7352 static ssize_t swap_max_write(struct kernfs_open_file *of, 7353 char *buf, size_t nbytes, loff_t off) 7354 { 7355 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7356 unsigned long max; 7357 int err; 7358 7359 buf = strstrip(buf); 7360 err = page_counter_memparse(buf, "max", &max); 7361 if (err) 7362 return err; 7363 7364 xchg(&memcg->swap.max, max); 7365 7366 return nbytes; 7367 } 7368 7369 static int swap_events_show(struct seq_file *m, void *v) 7370 { 7371 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 7372 7373 seq_printf(m, "high %lu\n", 7374 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 7375 seq_printf(m, "max %lu\n", 7376 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 7377 seq_printf(m, "fail %lu\n", 7378 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 7379 7380 return 0; 7381 } 7382 7383 static struct cftype swap_files[] = { 7384 { 7385 .name = "swap.current", 7386 .flags = CFTYPE_NOT_ON_ROOT, 7387 .read_u64 = swap_current_read, 7388 }, 7389 { 7390 .name = "swap.high", 7391 .flags = CFTYPE_NOT_ON_ROOT, 7392 .seq_show = swap_high_show, 7393 .write = swap_high_write, 7394 }, 7395 { 7396 .name = "swap.max", 7397 .flags = CFTYPE_NOT_ON_ROOT, 7398 .seq_show = swap_max_show, 7399 .write = swap_max_write, 7400 }, 7401 { 7402 .name = "swap.events", 7403 .flags = CFTYPE_NOT_ON_ROOT, 7404 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 7405 .seq_show = swap_events_show, 7406 }, 7407 { } /* terminate */ 7408 }; 7409 7410 static struct cftype memsw_files[] = { 7411 { 7412 .name = "memsw.usage_in_bytes", 7413 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 7414 .read_u64 = mem_cgroup_read_u64, 7415 }, 7416 { 7417 .name = "memsw.max_usage_in_bytes", 7418 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 7419 .write = mem_cgroup_reset, 7420 .read_u64 = mem_cgroup_read_u64, 7421 }, 7422 { 7423 .name = "memsw.limit_in_bytes", 7424 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 7425 .write = mem_cgroup_write, 7426 .read_u64 = mem_cgroup_read_u64, 7427 }, 7428 { 7429 .name = "memsw.failcnt", 7430 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 7431 .write = mem_cgroup_reset, 7432 .read_u64 = mem_cgroup_read_u64, 7433 }, 7434 { }, /* terminate */ 7435 }; 7436 7437 /* 7438 * If mem_cgroup_swap_init() is implemented as a subsys_initcall() 7439 * instead of a core_initcall(), this could mean cgroup_memory_noswap still 7440 * remains set to false even when memcg is disabled via "cgroup_disable=memory" 7441 * boot parameter. This may result in premature OOPS inside 7442 * mem_cgroup_get_nr_swap_pages() function in corner cases. 
7443 */ 7444 static int __init mem_cgroup_swap_init(void) 7445 { 7446 /* No memory control -> no swap control */ 7447 if (mem_cgroup_disabled()) 7448 cgroup_memory_noswap = true; 7449 7450 if (cgroup_memory_noswap) 7451 return 0; 7452 7453 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 7454 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 7455 7456 return 0; 7457 } 7458 core_initcall(mem_cgroup_swap_init); 7459 7460 #endif /* CONFIG_MEMCG_SWAP */ 7461