// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * Per memcg lru locking
 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
#include "swap.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal. This callback must be set if
	 * you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t lock; /* for from, to */
	struct mm_struct *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
	return container_of(vmpr, struct mem_cgroup, vmpressure);
}

#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);

bool mem_cgroup_kmem_disabled(void)
{
	return cgroup_memory_nokmem;
}

static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages);

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we do release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * In the result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);

	spin_lock_irqsave(&objcg_lock, flags);
	list_del(&objcg->list);
	spin_unlock_irqrestore(&objcg_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&objcg_lock);

	/* 1) Ready to reparent active objcg. */
	list_add(&objcg->list, &memcg->objcg_list);
	/* 2) Reparent active objcg and already reparented objcgs to parent. */
	list_for_each_entry(iter, &memcg->objcg_list, list)
		WRITE_ONCE(iter->memcg, parent);
	/* 3) Move already reparented objcgs to the parent's list */
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&objcg_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif

/**
 * mem_cgroup_css_from_folio - css of the memcg associated with a folio
 * @folio: folio of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @folio is returned. The returned css remains associated with @folio
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = page_memcg_check(page);

	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
				   tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	if (lru_gen_enabled()) {
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
		return;
	}

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree.rb_tree_per_node[nid];
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * memcg and lruvec stats flushing
 *
 * Many codepaths leading to stats update or read are performance sensitive and
 * adding stats flushing in such codepaths is not desirable. So, to optimize the
 * flushing the kernel does:
 *
 * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
 *    the rstat update tree grow unbounded.
 *
 * 2) Flush the stats synchronously on the reader side only when there are more
 *    than (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
 *    can leave the stats out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus)
 *    events, it is only for 2 seconds due to (1).
 */
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
static DEFINE_SPINLOCK(stats_flush_lock);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
static u64 flush_next_time;

#define FLUSH_TIME (2UL*HZ)

/*
 * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
 * not rely on this as part of an acquired spinlock_t lock. These functions are
 * never used in hardirq context on PREEMPT_RT and therefore disabling preemption
 * is sufficient.
 */
static void memcg_stats_lock(void)
{
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
}

static void __memcg_stats_lock(void)
{
	preempt_disable_nested();
}

static void memcg_stats_unlock(void)
{
	preempt_enable_nested();
}

static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
	unsigned int x;

	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());

	x = __this_cpu_add_return(stats_updates, abs(val));
	if (x > MEMCG_CHARGE_BATCH) {
		/*
		 * If stats_flush_threshold exceeds the threshold
		 * (>num_online_cpus()), cgroup stats update will be triggered
		 * in __mem_cgroup_flush_stats(). Increasing this var further
		 * is redundant and simply adds overhead in atomic update.
		 */
		if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
			atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
		__this_cpu_write(stats_updates, 0);
	}
}

static void __mem_cgroup_flush_stats(void)
{
	unsigned long flag;

	if (!spin_trylock_irqsave(&stats_flush_lock, flag))
		return;

	flush_next_time = jiffies_64 + 2*FLUSH_TIME;
	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
	atomic_set(&stats_flush_threshold, 0);
	spin_unlock_irqrestore(&stats_flush_lock, flag);
}

void mem_cgroup_flush_stats(void)
{
	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
		__mem_cgroup_flush_stats();
}

void mem_cgroup_flush_stats_delayed(void)
{
	if (time_after64(jiffies_64, flush_next_time))
		mem_cgroup_flush_stats();
}

static void flush_memcg_stats_dwork(struct work_struct *w)
{
	__mem_cgroup_flush_stats();
	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}

/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
	PGPGIN,
	PGPGOUT,
	PGSCAN_KSWAPD,
	PGSCAN_DIRECT,
	PGSCAN_KHUGEPAGED,
	PGSTEAL_KSWAPD,
	PGSTEAL_DIRECT,
	PGSTEAL_KHUGEPAGED,
	PGFAULT,
	PGMAJFAULT,
	PGREFILL,
	PGACTIVATE,
	PGDEACTIVATE,
	PGLAZYFREE,
	PGLAZYFREED,
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	ZSWPIN,
	ZSWPOUT,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	THP_FAULT_ALLOC,
	THP_COLLAPSE_ALLOC,
#endif
};

#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;

static void init_memcg_events(void)
{
	int i;

	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
		mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
}

static inline int memcg_events_index(enum vm_event_item idx)
{
	return mem_cgroup_events_index[idx] - 1;
}

struct memcg_vmstats_percpu {
	/* Local (CPU and cgroup) page state & events */
	long state[MEMCG_NR_STAT];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Delta calculation for lockless upward propagation */
	long state_prev[MEMCG_NR_STAT];
	unsigned long events_prev[NR_MEMCG_EVENTS];

	/* Cgroup1: threshold notifications & softlimit tree updates */
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct memcg_vmstats {
	/* Aggregated (CPU and subtree) page state & events */
	long state[MEMCG_NR_STAT];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Pending child counts during tree propagation */
	long state_pending[MEMCG_NR_STAT];
	unsigned long events_pending[NR_MEMCG_EVENTS];
};

unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
	long x = READ_ONCE(memcg->vmstats->state[idx]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	if (mem_cgroup_disabled())
		return;

	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
	memcg_rstat_updated(memcg, val);
}

/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/*
	 * The callers from rmap rely on disabled preemption because they never
	 * update their counter from in-interrupt context. For these counters
	 * we check that the update is never performed from an interrupt
	 * context, while other callers need to have interrupts disabled.
	 */
	__memcg_stats_lock();
	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
		switch (idx) {
		case NR_ANON_MAPPED:
		case NR_FILE_MAPPED:
		case NR_ANON_THPS:
		case NR_SHMEM_PMDMAPPED:
		case NR_FILE_PMDMAPPED:
			WARN_ON_ONCE(!in_task());
			break;
		default:
			VM_WARN_ON_IRQS_ENABLED();
		}
	}

	/* Update memcg */
	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);

	memcg_rstat_updated(memcg, val);
	memcg_stats_unlock();
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled())
		__mod_memcg_lruvec_state(lruvec, idx, val);
}

void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
			     int val)
{
	struct page *head = compound_head(page); /* rmap on tail pages */
	struct mem_cgroup *memcg;
	pg_data_t *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = page_memcg(head);
	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg) {
		rcu_read_unlock();
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mod_lruvec_page_state);

void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_slab_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	int index = memcg_events_index(idx);

	if (mem_cgroup_disabled() || index < 0)
		return;

	memcg_stats_lock();
	__this_cpu_add(memcg->vmstats_percpu->events[index], count);
	memcg_rstat_updated(memcg, count);
	memcg_stats_unlock();
}

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	int index = memcg_events_index(event);

	if (index < 0)
		return 0;
	return READ_ONCE(memcg->vmstats->events[index]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	long x = 0;
	int cpu;
	int index = memcg_events_index(event);

	if (index < 0)
		return 0;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
	return x;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, int nid)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, nid);
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static __always_inline struct mem_cgroup *active_memcg(void)
{
	if (!in_task())
		return this_cpu_read(int_active_memcg);
	else
		return current->active_memcg;
}

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. If mm
 * is NULL, then the memcg is chosen as follows:
 * 1) The active memcg, if set.
 * 2) current->mm->memcg, if available
 * 3) root memcg
 * If mem_cgroup is disabled, NULL is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	/*
	 * Page cache insertions can happen without an
	 * actual mm context, e.g. during disk probing
	 * on boot, loopback IO, acct() writes etc.
	 *
	 * No need to css_get on root memcg as the reference
	 * counting is disabled on the root level in the
	 * cgroup core. See CSS_NO_REF.
	 */
	if (unlikely(!mm)) {
		memcg = active_memcg();
		if (unlikely(memcg)) {
			/* remote memcg must hold a ref */
			css_get(&memcg->css);
			return memcg;
		}
		mm = current->mm;
		if (unlikely(!mm))
			return root_mem_cgroup;
	}

	rcu_read_lock();
	do {
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			memcg = root_mem_cgroup;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

static __always_inline bool memcg_kmem_bypass(void)
{
	/* Allow remote memcg charging from any context. */
	if (unlikely(active_memcg()))
		return false;

	/* Memcg to charge can't be determined. */
	if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
		return true;

	return false;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = root->nodeinfo[reclaim->pgdat->node_id];
		iter = &mz->iter;

		/*
		 * On start, join the current reclaim iteration cycle.
		 * Exit when a concurrent walker completes it.
		 */
		if (!prev)
			reclaim->generation = iter->generation;
		else if (reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	} else if (prev) {
		pos = prev;
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference. The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		if (css == &root->css || css_tryget(css)) {
			memcg = mem_cgroup_from_css(css);
			break;
		}
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
	}

out_unlock:
	rcu_read_unlock();
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = from->nodeinfo[nid];
		iter = &mz->iter;
		cmpxchg(&iter->position, dead_memcg, NULL);
	}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (!mem_cgroup_is_root(last))
		__invalidate_reclaim_iterators(root_mem_cgroup,
					       dead_memcg);
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(mem_cgroup_is_root(memcg));

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}

#ifdef CONFIG_DEBUG_VM
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	memcg = folio_memcg(folio);

	if (!memcg)
		VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
	else
		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
#endif

/**
 * folio_lruvec_lock - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
1290 * 1291 * These functions are safe to use under any of the following conditions: 1292 * - folio locked 1293 * - folio_test_lru false 1294 * - folio_memcg_lock() 1295 * - folio frozen (refcount of 0) 1296 * 1297 * Return: The lruvec this folio is on with its lock held. 1298 */ 1299 struct lruvec *folio_lruvec_lock(struct folio *folio) 1300 { 1301 struct lruvec *lruvec = folio_lruvec(folio); 1302 1303 spin_lock(&lruvec->lru_lock); 1304 lruvec_memcg_debug(lruvec, folio); 1305 1306 return lruvec; 1307 } 1308 1309 /** 1310 * folio_lruvec_lock_irq - Lock the lruvec for a folio. 1311 * @folio: Pointer to the folio. 1312 * 1313 * These functions are safe to use under any of the following conditions: 1314 * - folio locked 1315 * - folio_test_lru false 1316 * - folio_memcg_lock() 1317 * - folio frozen (refcount of 0) 1318 * 1319 * Return: The lruvec this folio is on with its lock held and interrupts 1320 * disabled. 1321 */ 1322 struct lruvec *folio_lruvec_lock_irq(struct folio *folio) 1323 { 1324 struct lruvec *lruvec = folio_lruvec(folio); 1325 1326 spin_lock_irq(&lruvec->lru_lock); 1327 lruvec_memcg_debug(lruvec, folio); 1328 1329 return lruvec; 1330 } 1331 1332 /** 1333 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio. 1334 * @folio: Pointer to the folio. 1335 * @flags: Pointer to irqsave flags. 1336 * 1337 * These functions are safe to use under any of the following conditions: 1338 * - folio locked 1339 * - folio_test_lru false 1340 * - folio_memcg_lock() 1341 * - folio frozen (refcount of 0) 1342 * 1343 * Return: The lruvec this folio is on with its lock held and interrupts 1344 * disabled. 1345 */ 1346 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, 1347 unsigned long *flags) 1348 { 1349 struct lruvec *lruvec = folio_lruvec(folio); 1350 1351 spin_lock_irqsave(&lruvec->lru_lock, *flags); 1352 lruvec_memcg_debug(lruvec, folio); 1353 1354 return lruvec; 1355 } 1356 1357 /** 1358 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1359 * @lruvec: mem_cgroup per zone lru vector 1360 * @lru: index of lru list the page is sitting on 1361 * @zid: zone id of the accounted pages 1362 * @nr_pages: positive when adding or negative when removing 1363 * 1364 * This function must be called under lru_lock, just before a page is added 1365 * to or just after a page is removed from an lru list. 1366 */ 1367 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1368 int zid, int nr_pages) 1369 { 1370 struct mem_cgroup_per_node *mz; 1371 unsigned long *lru_size; 1372 long size; 1373 1374 if (mem_cgroup_disabled()) 1375 return; 1376 1377 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 1378 lru_size = &mz->lru_zone_size[zid][lru]; 1379 1380 if (nr_pages < 0) 1381 *lru_size += nr_pages; 1382 1383 size = *lru_size; 1384 if (WARN_ONCE(size < 0, 1385 "%s(%p, %d, %d): lru_size %ld\n", 1386 __func__, lruvec, lru, nr_pages, size)) { 1387 VM_BUG_ON(1); 1388 *lru_size = 0; 1389 } 1390 1391 if (nr_pages > 0) 1392 *lru_size += nr_pages; 1393 } 1394 1395 /** 1396 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1397 * @memcg: the memory cgroup 1398 * 1399 * Returns the maximum amount of memory @mem can be charged with, in 1400 * pages. 
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.max);
	if (count < limit)
		margin = limit - count;

	if (do_memsw_account()) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.max);
		if (count < limit)
			margin = min(margin, limit - count);
		else
			margin = 0;
	}

	return margin;
}

/*
 * A routine for checking whether "mem" is under move_account() or not.
 *
 * Checks whether a cgroup is mc.from or mc.to, or under the hierarchy of
 * the moving cgroups. This is for waiting at high memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_is_descendant(from, memcg) ||
		mem_cgroup_is_descendant(to, memcg);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

struct memory_stat {
	const char *name;
	unsigned int idx;
};

static const struct memory_stat memory_stats[] = {
	{ "anon", NR_ANON_MAPPED },
	{ "file", NR_FILE_PAGES },
	{ "kernel", MEMCG_KMEM },
	{ "kernel_stack", NR_KERNEL_STACK_KB },
	{ "pagetables", NR_PAGETABLE },
	{ "sec_pagetables", NR_SECONDARY_PAGETABLE },
	{ "percpu", MEMCG_PERCPU_B },
	{ "sock", MEMCG_SOCK },
	{ "vmalloc", MEMCG_VMALLOC },
	{ "shmem", NR_SHMEM },
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	{ "zswap", MEMCG_ZSWAP_B },
	{ "zswapped", MEMCG_ZSWAPPED },
#endif
	{ "file_mapped", NR_FILE_MAPPED },
	{ "file_dirty", NR_FILE_DIRTY },
	{ "file_writeback", NR_WRITEBACK },
#ifdef CONFIG_SWAP
	{ "swapcached", NR_SWAPCACHE },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{ "anon_thp", NR_ANON_THPS },
	{ "file_thp", NR_FILE_THPS },
	{ "shmem_thp", NR_SHMEM_THPS },
#endif
	{ "inactive_anon", NR_INACTIVE_ANON },
	{ "active_anon", NR_ACTIVE_ANON },
	{ "inactive_file", NR_INACTIVE_FILE },
	{ "active_file", NR_ACTIVE_FILE },
	{ "unevictable", NR_UNEVICTABLE },
	{ "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
	{ "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },

	/* The memory events */
	{ "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
	{ "workingset_refault_file", WORKINGSET_REFAULT_FILE },
	{ "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
	{ "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
	{ "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
	{ "workingset_restore_file", WORKINGSET_RESTORE_FILE },
	{ "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
};

/* Translate stat items to the correct unit for memory.stat output */
static int memcg_page_state_unit(int item)
{
	switch (item) {
	case MEMCG_PERCPU_B:
	case MEMCG_ZSWAP_B:
	case NR_SLAB_RECLAIMABLE_B:
	case NR_SLAB_UNRECLAIMABLE_B:
	case WORKINGSET_REFAULT_ANON:
	case WORKINGSET_REFAULT_FILE:
	case WORKINGSET_ACTIVATE_ANON:
	case WORKINGSET_ACTIVATE_FILE:
	case WORKINGSET_RESTORE_ANON:
	case WORKINGSET_RESTORE_FILE:
	case WORKINGSET_NODERECLAIM:
		return 1;
	case NR_KERNEL_STACK_KB:
		return SZ_1K;
	default:
		return PAGE_SIZE;
	}
}

static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
						    int item)
{
	return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}

static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
{
	struct seq_buf s;
	int i;

	seq_buf_init(&s, buf, bufsize);

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
	mem_cgroup_flush_stats();

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		u64 size;

		size = memcg_page_state_output(memcg, memory_stats[i].idx);
		seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);

		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
			size += memcg_page_state_output(memcg,
							NR_SLAB_RECLAIMABLE_B);
			seq_buf_printf(&s, "slab %llu\n", size);
		}
	}

	/* Accumulated memory events */
	seq_buf_printf(&s, "pgscan %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD) +
		       memcg_events(memcg, PGSCAN_DIRECT) +
		       memcg_events(memcg, PGSCAN_KHUGEPAGED));
	seq_buf_printf(&s, "pgsteal %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD) +
		       memcg_events(memcg, PGSTEAL_DIRECT) +
		       memcg_events(memcg, PGSTEAL_KHUGEPAGED));

	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
		if (memcg_vm_event_stat[i] == PGPGIN ||
		    memcg_vm_event_stat[i] == PGPGOUT)
			continue;

		seq_buf_printf(&s, "%s %lu\n",
			       vm_event_name(memcg_vm_event_stat[i]),
			       memcg_events(memcg, memcg_vm_event_stat[i]));
	}

	/* The above should easily fit into one page */
	WARN_ON_ONCE(seq_buf_has_overflowed(&s));
}

#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
	rcu_read_lock();

	if (memcg) {
		pr_cont(",oom_memcg=");
		pr_cont_cgroup_path(memcg->css.cgroup);
	} else
		pr_cont(",global_oom");
	if (p) {
		pr_cont(",task_memcg=");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	}
	rcu_read_unlock();
}

/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
	/* Use static buffer, for the caller is holding oom_lock. */
	static char buf[PAGE_SIZE];

	lockdep_assert_held(&oom_lock);

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->swap)),
			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
	else {
		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->memsw)),
			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->kmem)),
			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
	}

	pr_info("Memory cgroup stats for ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(":");
	memory_stat_format(memcg, buf, sizeof(buf));
	pr_info("%s", buf);
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
	unsigned long max = READ_ONCE(memcg->memory.max);

	if (do_memsw_account()) {
		if (mem_cgroup_swappiness(memcg)) {
			/* Calculate swap excess capacity from memsw limit */
			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;

			max += min(swap, (unsigned long)total_swap_pages);
		}
	} else {
		if (mem_cgroup_swappiness(memcg))
			max += min(READ_ONCE(memcg->swap.max),
				   (unsigned long)total_swap_pages);
	}
	return max;
}

unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
	return page_counter_read(&memcg->memory);
}

static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = memcg,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	bool ret = true;

	if (mutex_lock_killable(&oom_lock))
		return true;

	if (mem_cgroup_margin(memcg) >= (1 << order))
		goto unlock;

	/*
	 * A few threads which were not waiting at mutex_lock_killable() can
	 * fail to bail out. Therefore, check again after holding oom_lock.
	 */
	ret = task_is_dying() || out_of_memory(&oc);

unlock:
	mutex_unlock(&oom_lock);
	return ret;
}

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not so excessive that we
				 * reclaim too much, nor so little that we
				 * keep coming back to reclaim from this
				 * cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
						pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up to the failing subtree.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom.
	 */
	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
			iter->under_oom--;
	spin_unlock(&memcg_oom_lock);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_entry_t wait;
};

static int memcg_oom_wake_function(wait_queue_entry_t *wait,
				   unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM. This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
	if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

/*
 * Returns true if successfully killed one or more processes. Though in some
 * corner cases it can return true even without killing any process.
 */
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked, ret;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return false;

	memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, in-kernel OOM killer allows for an async victim
	 * memory reclaim (oom_reaper) and that means that we are not solely
	 * relying on the oom victim to make a forward progress and we can
	 * invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
1925 */ 1926 if (memcg->oom_kill_disable) { 1927 if (current->in_user_fault) { 1928 css_get(&memcg->css); 1929 current->memcg_in_oom = memcg; 1930 current->memcg_oom_gfp_mask = mask; 1931 current->memcg_oom_order = order; 1932 } 1933 return false; 1934 } 1935 1936 mem_cgroup_mark_under_oom(memcg); 1937 1938 locked = mem_cgroup_oom_trylock(memcg); 1939 1940 if (locked) 1941 mem_cgroup_oom_notify(memcg); 1942 1943 mem_cgroup_unmark_under_oom(memcg); 1944 ret = mem_cgroup_out_of_memory(memcg, mask, order); 1945 1946 if (locked) 1947 mem_cgroup_oom_unlock(memcg); 1948 1949 return ret; 1950 } 1951 1952 /** 1953 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1954 * @handle: actually kill/wait or just clean up the OOM state 1955 * 1956 * This has to be called at the end of a page fault if the memcg OOM 1957 * handler was enabled. 1958 * 1959 * Memcg supports userspace OOM handling where failed allocations must 1960 * sleep on a waitqueue until the userspace task resolves the 1961 * situation. Sleeping directly in the charge context with all kinds 1962 * of locks held is not a good idea, instead we remember an OOM state 1963 * in the task and mem_cgroup_oom_synchronize() has to be called at 1964 * the end of the page fault to complete the OOM handling. 1965 * 1966 * Returns %true if an ongoing memcg OOM situation was detected and 1967 * completed, %false otherwise. 1968 */ 1969 bool mem_cgroup_oom_synchronize(bool handle) 1970 { 1971 struct mem_cgroup *memcg = current->memcg_in_oom; 1972 struct oom_wait_info owait; 1973 bool locked; 1974 1975 /* OOM is global, do not handle */ 1976 if (!memcg) 1977 return false; 1978 1979 if (!handle) 1980 goto cleanup; 1981 1982 owait.memcg = memcg; 1983 owait.wait.flags = 0; 1984 owait.wait.func = memcg_oom_wake_function; 1985 owait.wait.private = current; 1986 INIT_LIST_HEAD(&owait.wait.entry); 1987 1988 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1989 mem_cgroup_mark_under_oom(memcg); 1990 1991 locked = mem_cgroup_oom_trylock(memcg); 1992 1993 if (locked) 1994 mem_cgroup_oom_notify(memcg); 1995 1996 if (locked && !memcg->oom_kill_disable) { 1997 mem_cgroup_unmark_under_oom(memcg); 1998 finish_wait(&memcg_oom_waitq, &owait.wait); 1999 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, 2000 current->memcg_oom_order); 2001 } else { 2002 schedule(); 2003 mem_cgroup_unmark_under_oom(memcg); 2004 finish_wait(&memcg_oom_waitq, &owait.wait); 2005 } 2006 2007 if (locked) { 2008 mem_cgroup_oom_unlock(memcg); 2009 /* 2010 * There is no guarantee that an OOM-lock contender 2011 * sees the wakeups triggered by the OOM kill 2012 * uncharges. Wake any sleepers explicitly. 2013 */ 2014 memcg_oom_recover(memcg); 2015 } 2016 cleanup: 2017 current->memcg_in_oom = NULL; 2018 css_put(&memcg->css); 2019 return true; 2020 } 2021 2022 /** 2023 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM 2024 * @victim: task to be killed by the OOM killer 2025 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM 2026 * 2027 * Returns a pointer to a memory cgroup, which has to be cleaned up 2028 * by killing all belonging OOM-killable tasks. 2029 * 2030 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. 
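 *
 * A minimal usage sketch (the kill step is illustrative only):
 *
 *	memcg = mem_cgroup_get_oom_group(victim, oom_domain);
 *	if (memcg) {
 *		mem_cgroup_print_oom_group(memcg);
 *		...kill all OOM-killable tasks belonging to memcg...
 *		mem_cgroup_put(memcg);
 *	}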
2031 */ 2032 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 2033 struct mem_cgroup *oom_domain) 2034 { 2035 struct mem_cgroup *oom_group = NULL; 2036 struct mem_cgroup *memcg; 2037 2038 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2039 return NULL; 2040 2041 if (!oom_domain) 2042 oom_domain = root_mem_cgroup; 2043 2044 rcu_read_lock(); 2045 2046 memcg = mem_cgroup_from_task(victim); 2047 if (mem_cgroup_is_root(memcg)) 2048 goto out; 2049 2050 /* 2051 * If the victim task has been asynchronously moved to a different 2052 * memory cgroup, we might end up killing tasks outside oom_domain. 2053 * In this case it's better to ignore memory.group.oom. 2054 */ 2055 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) 2056 goto out; 2057 2058 /* 2059 * Traverse the memory cgroup hierarchy from the victim task's 2060 * cgroup up to the OOMing cgroup (or root) to find the 2061 * highest-level memory cgroup with oom.group set. 2062 */ 2063 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 2064 if (memcg->oom_group) 2065 oom_group = memcg; 2066 2067 if (memcg == oom_domain) 2068 break; 2069 } 2070 2071 if (oom_group) 2072 css_get(&oom_group->css); 2073 out: 2074 rcu_read_unlock(); 2075 2076 return oom_group; 2077 } 2078 2079 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 2080 { 2081 pr_info("Tasks in "); 2082 pr_cont_cgroup_path(memcg->css.cgroup); 2083 pr_cont(" are going to be killed due to memory.oom.group set\n"); 2084 } 2085 2086 /** 2087 * folio_memcg_lock - Bind a folio to its memcg. 2088 * @folio: The folio. 2089 * 2090 * This function prevents unlocked LRU folios from being moved to 2091 * another cgroup. 2092 * 2093 * It ensures lifetime of the bound memcg. The caller is responsible 2094 * for the lifetime of the folio. 2095 */ 2096 void folio_memcg_lock(struct folio *folio) 2097 { 2098 struct mem_cgroup *memcg; 2099 unsigned long flags; 2100 2101 /* 2102 * The RCU lock is held throughout the transaction. The fast 2103 * path can get away without acquiring the memcg->move_lock 2104 * because page moving starts with an RCU grace period. 2105 */ 2106 rcu_read_lock(); 2107 2108 if (mem_cgroup_disabled()) 2109 return; 2110 again: 2111 memcg = folio_memcg(folio); 2112 if (unlikely(!memcg)) 2113 return; 2114 2115 #ifdef CONFIG_PROVE_LOCKING 2116 local_irq_save(flags); 2117 might_lock(&memcg->move_lock); 2118 local_irq_restore(flags); 2119 #endif 2120 2121 if (atomic_read(&memcg->moving_account) <= 0) 2122 return; 2123 2124 spin_lock_irqsave(&memcg->move_lock, flags); 2125 if (memcg != folio_memcg(folio)) { 2126 spin_unlock_irqrestore(&memcg->move_lock, flags); 2127 goto again; 2128 } 2129 2130 /* 2131 * When charge migration first begins, we can have multiple 2132 * critical sections holding the fast-path RCU lock and one 2133 * holding the slowpath move_lock. Track the task who has the 2134 * move_lock for unlock_page_memcg(). 
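 * The matching folio_memcg_unlock()/__folio_memcg_unlock() below only
 * drops move_lock when move_lock_task == current; fast-path lockers
 * just release the RCU read lock.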
2135 */ 2136 memcg->move_lock_task = current; 2137 memcg->move_lock_flags = flags; 2138 } 2139 2140 void lock_page_memcg(struct page *page) 2141 { 2142 folio_memcg_lock(page_folio(page)); 2143 } 2144 2145 static void __folio_memcg_unlock(struct mem_cgroup *memcg) 2146 { 2147 if (memcg && memcg->move_lock_task == current) { 2148 unsigned long flags = memcg->move_lock_flags; 2149 2150 memcg->move_lock_task = NULL; 2151 memcg->move_lock_flags = 0; 2152 2153 spin_unlock_irqrestore(&memcg->move_lock, flags); 2154 } 2155 2156 rcu_read_unlock(); 2157 } 2158 2159 /** 2160 * folio_memcg_unlock - Release the binding between a folio and its memcg. 2161 * @folio: The folio. 2162 * 2163 * This releases the binding created by folio_memcg_lock(). This does 2164 * not change the accounting of this folio to its memcg, but it does 2165 * permit others to change it. 2166 */ 2167 void folio_memcg_unlock(struct folio *folio) 2168 { 2169 __folio_memcg_unlock(folio_memcg(folio)); 2170 } 2171 2172 void unlock_page_memcg(struct page *page) 2173 { 2174 folio_memcg_unlock(page_folio(page)); 2175 } 2176 2177 struct memcg_stock_pcp { 2178 local_lock_t stock_lock; 2179 struct mem_cgroup *cached; /* this never be root cgroup */ 2180 unsigned int nr_pages; 2181 2182 #ifdef CONFIG_MEMCG_KMEM 2183 struct obj_cgroup *cached_objcg; 2184 struct pglist_data *cached_pgdat; 2185 unsigned int nr_bytes; 2186 int nr_slab_reclaimable_b; 2187 int nr_slab_unreclaimable_b; 2188 #endif 2189 2190 struct work_struct work; 2191 unsigned long flags; 2192 #define FLUSHING_CACHED_CHARGE 0 2193 }; 2194 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { 2195 .stock_lock = INIT_LOCAL_LOCK(stock_lock), 2196 }; 2197 static DEFINE_MUTEX(percpu_charge_mutex); 2198 2199 #ifdef CONFIG_MEMCG_KMEM 2200 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); 2201 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2202 struct mem_cgroup *root_memcg); 2203 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); 2204 2205 #else 2206 static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) 2207 { 2208 return NULL; 2209 } 2210 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2211 struct mem_cgroup *root_memcg) 2212 { 2213 return false; 2214 } 2215 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2216 { 2217 } 2218 #endif 2219 2220 /** 2221 * consume_stock: Try to consume stocked charge on this cpu. 2222 * @memcg: memcg to consume from. 2223 * @nr_pages: how many pages to charge. 2224 * 2225 * The charges will only happen if @memcg matches the current cpu's memcg 2226 * stock, and at least @nr_pages are available in that stock. Failure to 2227 * service an allocation will refill the stock. 2228 * 2229 * returns true if successful, false otherwise. 2230 */ 2231 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2232 { 2233 struct memcg_stock_pcp *stock; 2234 unsigned long flags; 2235 bool ret = false; 2236 2237 if (nr_pages > MEMCG_CHARGE_BATCH) 2238 return ret; 2239 2240 local_lock_irqsave(&memcg_stock.stock_lock, flags); 2241 2242 stock = this_cpu_ptr(&memcg_stock); 2243 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2244 stock->nr_pages -= nr_pages; 2245 ret = true; 2246 } 2247 2248 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2249 2250 return ret; 2251 } 2252 2253 /* 2254 * Returns stocks cached in percpu and reset cached information. 
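 * Callers either hold memcg_stock.stock_lock on the local CPU or, as in
 * memcg_hotplug_cpu_dead(), have exclusive access to an offlined CPU's stock.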
2255 */ 2256 static void drain_stock(struct memcg_stock_pcp *stock) 2257 { 2258 struct mem_cgroup *old = stock->cached; 2259 2260 if (!old) 2261 return; 2262 2263 if (stock->nr_pages) { 2264 page_counter_uncharge(&old->memory, stock->nr_pages); 2265 if (do_memsw_account()) 2266 page_counter_uncharge(&old->memsw, stock->nr_pages); 2267 stock->nr_pages = 0; 2268 } 2269 2270 css_put(&old->css); 2271 stock->cached = NULL; 2272 } 2273 2274 static void drain_local_stock(struct work_struct *dummy) 2275 { 2276 struct memcg_stock_pcp *stock; 2277 struct obj_cgroup *old = NULL; 2278 unsigned long flags; 2279 2280 /* 2281 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. 2282 * drain_stock races is that we always operate on local CPU stock 2283 * here with IRQ disabled 2284 */ 2285 local_lock_irqsave(&memcg_stock.stock_lock, flags); 2286 2287 stock = this_cpu_ptr(&memcg_stock); 2288 old = drain_obj_stock(stock); 2289 drain_stock(stock); 2290 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2291 2292 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2293 if (old) 2294 obj_cgroup_put(old); 2295 } 2296 2297 /* 2298 * Cache charges(val) to local per_cpu area. 2299 * This will be consumed by consume_stock() function, later. 2300 */ 2301 static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2302 { 2303 struct memcg_stock_pcp *stock; 2304 2305 stock = this_cpu_ptr(&memcg_stock); 2306 if (stock->cached != memcg) { /* reset if necessary */ 2307 drain_stock(stock); 2308 css_get(&memcg->css); 2309 stock->cached = memcg; 2310 } 2311 stock->nr_pages += nr_pages; 2312 2313 if (stock->nr_pages > MEMCG_CHARGE_BATCH) 2314 drain_stock(stock); 2315 } 2316 2317 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2318 { 2319 unsigned long flags; 2320 2321 local_lock_irqsave(&memcg_stock.stock_lock, flags); 2322 __refill_stock(memcg, nr_pages); 2323 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2324 } 2325 2326 /* 2327 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2328 * of the hierarchy under it. 2329 */ 2330 static void drain_all_stock(struct mem_cgroup *root_memcg) 2331 { 2332 int cpu, curcpu; 2333 2334 /* If someone's already draining, avoid adding running more workers. */ 2335 if (!mutex_trylock(&percpu_charge_mutex)) 2336 return; 2337 /* 2338 * Notify other cpus that system-wide "drain" is running 2339 * We do not care about races with the cpu hotplug because cpu down 2340 * as well as workers from this path always operate on the local 2341 * per-cpu data. CPU up doesn't touch memcg_stock at all. 
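 * Hence the migrate_disable()/smp_processor_id() pair below: the local
 * CPU's stock is drained synchronously, while remote CPUs are drained
 * via schedule_work_on().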
2342 */ 2343 migrate_disable(); 2344 curcpu = smp_processor_id(); 2345 for_each_online_cpu(cpu) { 2346 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2347 struct mem_cgroup *memcg; 2348 bool flush = false; 2349 2350 rcu_read_lock(); 2351 memcg = stock->cached; 2352 if (memcg && stock->nr_pages && 2353 mem_cgroup_is_descendant(memcg, root_memcg)) 2354 flush = true; 2355 else if (obj_stock_flush_required(stock, root_memcg)) 2356 flush = true; 2357 rcu_read_unlock(); 2358 2359 if (flush && 2360 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2361 if (cpu == curcpu) 2362 drain_local_stock(&stock->work); 2363 else 2364 schedule_work_on(cpu, &stock->work); 2365 } 2366 } 2367 migrate_enable(); 2368 mutex_unlock(&percpu_charge_mutex); 2369 } 2370 2371 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2372 { 2373 struct memcg_stock_pcp *stock; 2374 2375 stock = &per_cpu(memcg_stock, cpu); 2376 drain_stock(stock); 2377 2378 return 0; 2379 } 2380 2381 static unsigned long reclaim_high(struct mem_cgroup *memcg, 2382 unsigned int nr_pages, 2383 gfp_t gfp_mask) 2384 { 2385 unsigned long nr_reclaimed = 0; 2386 2387 do { 2388 unsigned long pflags; 2389 2390 if (page_counter_read(&memcg->memory) <= 2391 READ_ONCE(memcg->memory.high)) 2392 continue; 2393 2394 memcg_memory_event(memcg, MEMCG_HIGH); 2395 2396 psi_memstall_enter(&pflags); 2397 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, 2398 gfp_mask, 2399 MEMCG_RECLAIM_MAY_SWAP); 2400 psi_memstall_leave(&pflags); 2401 } while ((memcg = parent_mem_cgroup(memcg)) && 2402 !mem_cgroup_is_root(memcg)); 2403 2404 return nr_reclaimed; 2405 } 2406 2407 static void high_work_func(struct work_struct *work) 2408 { 2409 struct mem_cgroup *memcg; 2410 2411 memcg = container_of(work, struct mem_cgroup, high_work); 2412 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); 2413 } 2414 2415 /* 2416 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is 2417 * enough to still cause a significant slowdown in most cases, while still 2418 * allowing diagnostics and tracing to proceed without becoming stuck. 2419 */ 2420 #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) 2421 2422 /* 2423 * When calculating the delay, we use these either side of the exponentiation to 2424 * maintain precision and scale to a reasonable number of jiffies (see the table 2425 * below. 2426 * 2427 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the 2428 * overage ratio to a delay. 2429 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the 2430 * proposed penalty in order to reduce to a reasonable number of jiffies, and 2431 * to produce a reasonable delay curve. 2432 * 2433 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a 2434 * reasonable delay curve compared to precision-adjusted overage, not 2435 * penalising heavily at first, but still making sure that growth beyond the 2436 * limit penalises misbehaviour cgroups by slowing them down exponentially. 
For 2437 * example, with a high of 100 megabytes: 2438 * 2439 * +-------+------------------------+ 2440 * | usage | time to allocate in ms | 2441 * +-------+------------------------+ 2442 * | 100M | 0 | 2443 * | 101M | 6 | 2444 * | 102M | 25 | 2445 * | 103M | 57 | 2446 * | 104M | 102 | 2447 * | 105M | 159 | 2448 * | 106M | 230 | 2449 * | 107M | 313 | 2450 * | 108M | 409 | 2451 * | 109M | 518 | 2452 * | 110M | 639 | 2453 * | 111M | 774 | 2454 * | 112M | 921 | 2455 * | 113M | 1081 | 2456 * | 114M | 1254 | 2457 * | 115M | 1439 | 2458 * | 116M | 1638 | 2459 * | 117M | 1849 | 2460 * | 118M | 2000 | 2461 * | 119M | 2000 | 2462 * | 120M | 2000 | 2463 * +-------+------------------------+ 2464 */ 2465 #define MEMCG_DELAY_PRECISION_SHIFT 20 2466 #define MEMCG_DELAY_SCALING_SHIFT 14 2467 2468 static u64 calculate_overage(unsigned long usage, unsigned long high) 2469 { 2470 u64 overage; 2471 2472 if (usage <= high) 2473 return 0; 2474 2475 /* 2476 * Prevent division by 0 in overage calculation by acting as if 2477 * it was a threshold of 1 page 2478 */ 2479 high = max(high, 1UL); 2480 2481 overage = usage - high; 2482 overage <<= MEMCG_DELAY_PRECISION_SHIFT; 2483 return div64_u64(overage, high); 2484 } 2485 2486 static u64 mem_find_max_overage(struct mem_cgroup *memcg) 2487 { 2488 u64 overage, max_overage = 0; 2489 2490 do { 2491 overage = calculate_overage(page_counter_read(&memcg->memory), 2492 READ_ONCE(memcg->memory.high)); 2493 max_overage = max(overage, max_overage); 2494 } while ((memcg = parent_mem_cgroup(memcg)) && 2495 !mem_cgroup_is_root(memcg)); 2496 2497 return max_overage; 2498 } 2499 2500 static u64 swap_find_max_overage(struct mem_cgroup *memcg) 2501 { 2502 u64 overage, max_overage = 0; 2503 2504 do { 2505 overage = calculate_overage(page_counter_read(&memcg->swap), 2506 READ_ONCE(memcg->swap.high)); 2507 if (overage) 2508 memcg_memory_event(memcg, MEMCG_SWAP_HIGH); 2509 max_overage = max(overage, max_overage); 2510 } while ((memcg = parent_mem_cgroup(memcg)) && 2511 !mem_cgroup_is_root(memcg)); 2512 2513 return max_overage; 2514 } 2515 2516 /* 2517 * Get the number of jiffies that we should penalise a mischievous cgroup which 2518 * is exceeding its memory.high by checking both it and its ancestors. 2519 */ 2520 static unsigned long calculate_high_delay(struct mem_cgroup *memcg, 2521 unsigned int nr_pages, 2522 u64 max_overage) 2523 { 2524 unsigned long penalty_jiffies; 2525 2526 if (!max_overage) 2527 return 0; 2528 2529 /* 2530 * We use overage compared to memory.high to calculate the number of 2531 * jiffies to sleep (penalty_jiffies). Ideally this value should be 2532 * fairly lenient on small overages, and increasingly harsh when the 2533 * memcg in question makes it clear that it has no intention of stopping 2534 * its crazy behaviour, so we exponentially increase the delay based on 2535 * overage amount. 2536 */ 2537 penalty_jiffies = max_overage * max_overage * HZ; 2538 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; 2539 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; 2540 2541 /* 2542 * Factor in the task's own contribution to the overage, such that four 2543 * N-sized allocations are throttled approximately the same as one 2544 * 4N-sized allocation. 2545 * 2546 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 2547 * larger the current charge patch is than that. 
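 *
 * As a rough worked example (assuming HZ=1000 so jiffies line up with the
 * milliseconds in the table above): at 110M usage against a 100M high
 * limit, max_overage is about 0.1 << MEMCG_DELAY_PRECISION_SHIFT, i.e.
 * ~104857, and the calculation above gives
 * 104857 * 104857 * 1000 >> 34 ~= 639 jiffies, which is then scaled by
 * nr_pages / MEMCG_CHARGE_BATCH below.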
2548 */ 2549 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2550 } 2551 2552 /* 2553 * Scheduled by try_charge() to be executed from the userland return path 2554 * and reclaims memory over the high limit. 2555 */ 2556 void mem_cgroup_handle_over_high(void) 2557 { 2558 unsigned long penalty_jiffies; 2559 unsigned long pflags; 2560 unsigned long nr_reclaimed; 2561 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2562 int nr_retries = MAX_RECLAIM_RETRIES; 2563 struct mem_cgroup *memcg; 2564 bool in_retry = false; 2565 2566 if (likely(!nr_pages)) 2567 return; 2568 2569 memcg = get_mem_cgroup_from_mm(current->mm); 2570 current->memcg_nr_pages_over_high = 0; 2571 2572 retry_reclaim: 2573 /* 2574 * The allocating task should reclaim at least the batch size, but for 2575 * subsequent retries we only want to do what's necessary to prevent oom 2576 * or breaching resource isolation. 2577 * 2578 * This is distinct from memory.max or page allocator behaviour because 2579 * memory.high is currently batched, whereas memory.max and the page 2580 * allocator run every time an allocation is made. 2581 */ 2582 nr_reclaimed = reclaim_high(memcg, 2583 in_retry ? SWAP_CLUSTER_MAX : nr_pages, 2584 GFP_KERNEL); 2585 2586 /* 2587 * memory.high is breached and reclaim is unable to keep up. Throttle 2588 * allocators proactively to slow down excessive growth. 2589 */ 2590 penalty_jiffies = calculate_high_delay(memcg, nr_pages, 2591 mem_find_max_overage(memcg)); 2592 2593 penalty_jiffies += calculate_high_delay(memcg, nr_pages, 2594 swap_find_max_overage(memcg)); 2595 2596 /* 2597 * Clamp the max delay per usermode return so as to still keep the 2598 * application moving forwards and also permit diagnostics, albeit 2599 * extremely slowly. 2600 */ 2601 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 2602 2603 /* 2604 * Don't sleep if the amount of jiffies this memcg owes us is so low 2605 * that it's not even worth doing, in an attempt to be nice to those who 2606 * go only a small amount over their memory.high value and maybe haven't 2607 * been aggressively reclaimed enough yet. 2608 */ 2609 if (penalty_jiffies <= HZ / 100) 2610 goto out; 2611 2612 /* 2613 * If reclaim is making forward progress but we're still over 2614 * memory.high, we want to encourage that rather than doing allocator 2615 * throttling. 2616 */ 2617 if (nr_reclaimed || nr_retries--) { 2618 in_retry = true; 2619 goto retry_reclaim; 2620 } 2621 2622 /* 2623 * If we exit early, we're guaranteed to die (since 2624 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 2625 * need to account for any ill-begotten jiffies to pay them off later. 
2626 */ 2627 psi_memstall_enter(&pflags); 2628 schedule_timeout_killable(penalty_jiffies); 2629 psi_memstall_leave(&pflags); 2630 2631 out: 2632 css_put(&memcg->css); 2633 } 2634 2635 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, 2636 unsigned int nr_pages) 2637 { 2638 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2639 int nr_retries = MAX_RECLAIM_RETRIES; 2640 struct mem_cgroup *mem_over_limit; 2641 struct page_counter *counter; 2642 unsigned long nr_reclaimed; 2643 bool passed_oom = false; 2644 unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; 2645 bool drained = false; 2646 bool raised_max_event = false; 2647 unsigned long pflags; 2648 2649 retry: 2650 if (consume_stock(memcg, nr_pages)) 2651 return 0; 2652 2653 if (!do_memsw_account() || 2654 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2655 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2656 goto done_restock; 2657 if (do_memsw_account()) 2658 page_counter_uncharge(&memcg->memsw, batch); 2659 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2660 } else { 2661 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2662 reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; 2663 } 2664 2665 if (batch > nr_pages) { 2666 batch = nr_pages; 2667 goto retry; 2668 } 2669 2670 /* 2671 * Prevent unbounded recursion when reclaim operations need to 2672 * allocate memory. This might exceed the limits temporarily, 2673 * but we prefer facilitating memory reclaim and getting back 2674 * under the limit over triggering OOM kills in these cases. 2675 */ 2676 if (unlikely(current->flags & PF_MEMALLOC)) 2677 goto force; 2678 2679 if (unlikely(task_in_memcg_oom(current))) 2680 goto nomem; 2681 2682 if (!gfpflags_allow_blocking(gfp_mask)) 2683 goto nomem; 2684 2685 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2686 raised_max_event = true; 2687 2688 psi_memstall_enter(&pflags); 2689 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2690 gfp_mask, reclaim_options); 2691 psi_memstall_leave(&pflags); 2692 2693 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2694 goto retry; 2695 2696 if (!drained) { 2697 drain_all_stock(mem_over_limit); 2698 drained = true; 2699 goto retry; 2700 } 2701 2702 if (gfp_mask & __GFP_NORETRY) 2703 goto nomem; 2704 /* 2705 * Even though the limit is exceeded at this point, reclaim 2706 * may have been able to free some pages. Retry the charge 2707 * before killing the task. 2708 * 2709 * Only for regular pages, though: huge pages are rather 2710 * unlikely to succeed so close to the limit, and we fall back 2711 * to regular pages anyway in case of failure. 2712 */ 2713 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2714 goto retry; 2715 /* 2716 * At task move, charge accounts can be doubly counted. So, it's 2717 * better to wait until the end of task_move if something is going on. 2718 */ 2719 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2720 goto retry; 2721 2722 if (nr_retries--) 2723 goto retry; 2724 2725 if (gfp_mask & __GFP_RETRY_MAYFAIL) 2726 goto nomem; 2727 2728 /* Avoid endless loop for tasks bypassed by the oom killer */ 2729 if (passed_oom && task_is_dying()) 2730 goto nomem; 2731 2732 /* 2733 * keep retrying as long as the memcg oom killer is able to make 2734 * a forward progress or bypass the charge if the oom killer 2735 * couldn't make any progress. 
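 * A successful mem_cgroup_oom() resets nr_retries and loops back to
 * retry below; on failure we fall through to the nomem path.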
2736 */ 2737 if (mem_cgroup_oom(mem_over_limit, gfp_mask, 2738 get_order(nr_pages * PAGE_SIZE))) { 2739 passed_oom = true; 2740 nr_retries = MAX_RECLAIM_RETRIES; 2741 goto retry; 2742 } 2743 nomem: 2744 /* 2745 * Memcg doesn't have a dedicated reserve for atomic 2746 * allocations. But like the global atomic pool, we need to 2747 * put the burden of reclaim on regular allocation requests 2748 * and let these go through as privileged allocations. 2749 */ 2750 if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) 2751 return -ENOMEM; 2752 force: 2753 /* 2754 * If the allocation has to be enforced, don't forget to raise 2755 * a MEMCG_MAX event. 2756 */ 2757 if (!raised_max_event) 2758 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2759 2760 /* 2761 * The allocation either can't fail or will lead to more memory 2762 * being freed very soon. Allow memory usage go over the limit 2763 * temporarily by force charging it. 2764 */ 2765 page_counter_charge(&memcg->memory, nr_pages); 2766 if (do_memsw_account()) 2767 page_counter_charge(&memcg->memsw, nr_pages); 2768 2769 return 0; 2770 2771 done_restock: 2772 if (batch > nr_pages) 2773 refill_stock(memcg, batch - nr_pages); 2774 2775 /* 2776 * If the hierarchy is above the normal consumption range, schedule 2777 * reclaim on returning to userland. We can perform reclaim here 2778 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2779 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2780 * not recorded as it most likely matches current's and won't 2781 * change in the meantime. As high limit is checked again before 2782 * reclaim, the cost of mismatch is negligible. 2783 */ 2784 do { 2785 bool mem_high, swap_high; 2786 2787 mem_high = page_counter_read(&memcg->memory) > 2788 READ_ONCE(memcg->memory.high); 2789 swap_high = page_counter_read(&memcg->swap) > 2790 READ_ONCE(memcg->swap.high); 2791 2792 /* Don't bother a random interrupted task */ 2793 if (!in_task()) { 2794 if (mem_high) { 2795 schedule_work(&memcg->high_work); 2796 break; 2797 } 2798 continue; 2799 } 2800 2801 if (mem_high || swap_high) { 2802 /* 2803 * The allocating tasks in this cgroup will need to do 2804 * reclaim or be throttled to prevent further growth 2805 * of the memory or swap footprints. 2806 * 2807 * Target some best-effort fairness between the tasks, 2808 * and distribute reclaim work and delay penalties 2809 * based on how much each task is actually allocating. 
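 * The actual throttling happens later, in mem_cgroup_handle_over_high(),
 * on the return-to-userspace path armed by set_notify_resume() below.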
2810 */ 2811 current->memcg_nr_pages_over_high += batch; 2812 set_notify_resume(current); 2813 break; 2814 } 2815 } while ((memcg = parent_mem_cgroup(memcg))); 2816 2817 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && 2818 !(current->flags & PF_MEMALLOC) && 2819 gfpflags_allow_blocking(gfp_mask)) { 2820 mem_cgroup_handle_over_high(); 2821 } 2822 return 0; 2823 } 2824 2825 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2826 unsigned int nr_pages) 2827 { 2828 if (mem_cgroup_is_root(memcg)) 2829 return 0; 2830 2831 return try_charge_memcg(memcg, gfp_mask, nr_pages); 2832 } 2833 2834 static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2835 { 2836 if (mem_cgroup_is_root(memcg)) 2837 return; 2838 2839 page_counter_uncharge(&memcg->memory, nr_pages); 2840 if (do_memsw_account()) 2841 page_counter_uncharge(&memcg->memsw, nr_pages); 2842 } 2843 2844 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) 2845 { 2846 VM_BUG_ON_FOLIO(folio_memcg(folio), folio); 2847 /* 2848 * Any of the following ensures page's memcg stability: 2849 * 2850 * - the page lock 2851 * - LRU isolation 2852 * - lock_page_memcg() 2853 * - exclusive reference 2854 * - mem_cgroup_trylock_pages() 2855 */ 2856 folio->memcg_data = (unsigned long)memcg; 2857 } 2858 2859 #ifdef CONFIG_MEMCG_KMEM 2860 /* 2861 * The allocated objcg pointers array is not accounted directly. 2862 * Moreover, it should not come from DMA buffer and is not readily 2863 * reclaimable. So those GFP bits should be masked off. 2864 */ 2865 #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) 2866 2867 /* 2868 * mod_objcg_mlstate() may be called with irq enabled, so 2869 * mod_memcg_lruvec_state() should be used. 2870 */ 2871 static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, 2872 struct pglist_data *pgdat, 2873 enum node_stat_item idx, int nr) 2874 { 2875 struct mem_cgroup *memcg; 2876 struct lruvec *lruvec; 2877 2878 rcu_read_lock(); 2879 memcg = obj_cgroup_memcg(objcg); 2880 lruvec = mem_cgroup_lruvec(memcg, pgdat); 2881 mod_memcg_lruvec_state(lruvec, idx, nr); 2882 rcu_read_unlock(); 2883 } 2884 2885 int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, 2886 gfp_t gfp, bool new_slab) 2887 { 2888 unsigned int objects = objs_per_slab(s, slab); 2889 unsigned long memcg_data; 2890 void *vec; 2891 2892 gfp &= ~OBJCGS_CLEAR_MASK; 2893 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 2894 slab_nid(slab)); 2895 if (!vec) 2896 return -ENOMEM; 2897 2898 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; 2899 if (new_slab) { 2900 /* 2901 * If the slab is brand new and nobody can yet access its 2902 * memcg_data, no synchronization is required and memcg_data can 2903 * be simply assigned. 2904 */ 2905 slab->memcg_data = memcg_data; 2906 } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { 2907 /* 2908 * If the slab is already in use, somebody can allocate and 2909 * assign obj_cgroups in parallel. In this case the existing 2910 * objcg vector should be reused. 2911 */ 2912 kfree(vec); 2913 return 0; 2914 } 2915 2916 kmemleak_not_leak(vec); 2917 return 0; 2918 } 2919 2920 static __always_inline 2921 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) 2922 { 2923 /* 2924 * Slab objects are accounted individually, not per-page. 2925 * Memcg membership data for each individual object is saved in 2926 * slab->memcg_data. 
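 * For a slab folio, obj_to_index() below maps the object pointer to its
 * slot, and slab_objcgs(slab)[off] yields the owning objcg, if any.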
2927 */ 2928 if (folio_test_slab(folio)) { 2929 struct obj_cgroup **objcgs; 2930 struct slab *slab; 2931 unsigned int off; 2932 2933 slab = folio_slab(folio); 2934 objcgs = slab_objcgs(slab); 2935 if (!objcgs) 2936 return NULL; 2937 2938 off = obj_to_index(slab->slab_cache, slab, p); 2939 if (objcgs[off]) 2940 return obj_cgroup_memcg(objcgs[off]); 2941 2942 return NULL; 2943 } 2944 2945 /* 2946 * folio_memcg_check() is used here, because in theory we can encounter 2947 * a folio where the slab flag has been cleared already, but 2948 * slab->memcg_data has not been freed yet 2949 * folio_memcg_check() will guarantee that a proper memory 2950 * cgroup pointer or NULL will be returned. 2951 */ 2952 return folio_memcg_check(folio); 2953 } 2954 2955 /* 2956 * Returns a pointer to the memory cgroup to which the kernel object is charged. 2957 * 2958 * A passed kernel object can be a slab object, vmalloc object or a generic 2959 * kernel page, so different mechanisms for getting the memory cgroup pointer 2960 * should be used. 2961 * 2962 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller 2963 * can not know for sure how the kernel object is implemented. 2964 * mem_cgroup_from_obj() can be safely used in such cases. 2965 * 2966 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 2967 * cgroup_mutex, etc. 2968 */ 2969 struct mem_cgroup *mem_cgroup_from_obj(void *p) 2970 { 2971 struct folio *folio; 2972 2973 if (mem_cgroup_disabled()) 2974 return NULL; 2975 2976 if (unlikely(is_vmalloc_addr(p))) 2977 folio = page_folio(vmalloc_to_page(p)); 2978 else 2979 folio = virt_to_folio(p); 2980 2981 return mem_cgroup_from_obj_folio(folio, p); 2982 } 2983 2984 /* 2985 * Returns a pointer to the memory cgroup to which the kernel object is charged. 2986 * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects, 2987 * allocated using vmalloc(). 2988 * 2989 * A passed kernel object must be a slab object or a generic kernel page. 2990 * 2991 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 2992 * cgroup_mutex, etc. 
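 *
 * A minimal usage sketch (illustrative only):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_slab_obj(object);
 *	if (memcg)
 *		...inspect memcg state...
 *	rcu_read_unlock();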
2993 */ 2994 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) 2995 { 2996 if (mem_cgroup_disabled()) 2997 return NULL; 2998 2999 return mem_cgroup_from_obj_folio(virt_to_folio(p), p); 3000 } 3001 3002 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) 3003 { 3004 struct obj_cgroup *objcg = NULL; 3005 3006 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { 3007 objcg = rcu_dereference(memcg->objcg); 3008 if (objcg && obj_cgroup_tryget(objcg)) 3009 break; 3010 objcg = NULL; 3011 } 3012 return objcg; 3013 } 3014 3015 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) 3016 { 3017 struct obj_cgroup *objcg = NULL; 3018 struct mem_cgroup *memcg; 3019 3020 if (memcg_kmem_bypass()) 3021 return NULL; 3022 3023 rcu_read_lock(); 3024 if (unlikely(active_memcg())) 3025 memcg = active_memcg(); 3026 else 3027 memcg = mem_cgroup_from_task(current); 3028 objcg = __get_obj_cgroup_from_memcg(memcg); 3029 rcu_read_unlock(); 3030 return objcg; 3031 } 3032 3033 struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) 3034 { 3035 struct obj_cgroup *objcg; 3036 3037 if (!memcg_kmem_enabled()) 3038 return NULL; 3039 3040 if (PageMemcgKmem(page)) { 3041 objcg = __folio_objcg(page_folio(page)); 3042 obj_cgroup_get(objcg); 3043 } else { 3044 struct mem_cgroup *memcg; 3045 3046 rcu_read_lock(); 3047 memcg = __folio_memcg(page_folio(page)); 3048 if (memcg) 3049 objcg = __get_obj_cgroup_from_memcg(memcg); 3050 else 3051 objcg = NULL; 3052 rcu_read_unlock(); 3053 } 3054 return objcg; 3055 } 3056 3057 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) 3058 { 3059 mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); 3060 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 3061 if (nr_pages > 0) 3062 page_counter_charge(&memcg->kmem, nr_pages); 3063 else 3064 page_counter_uncharge(&memcg->kmem, -nr_pages); 3065 } 3066 } 3067 3068 3069 /* 3070 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg 3071 * @objcg: object cgroup to uncharge 3072 * @nr_pages: number of pages to uncharge 3073 */ 3074 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, 3075 unsigned int nr_pages) 3076 { 3077 struct mem_cgroup *memcg; 3078 3079 memcg = get_mem_cgroup_from_objcg(objcg); 3080 3081 memcg_account_kmem(memcg, -nr_pages); 3082 refill_stock(memcg, nr_pages); 3083 3084 css_put(&memcg->css); 3085 } 3086 3087 /* 3088 * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg 3089 * @objcg: object cgroup to charge 3090 * @gfp: reclaim mode 3091 * @nr_pages: number of pages to charge 3092 * 3093 * Returns 0 on success, an error code on failure. 3094 */ 3095 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, 3096 unsigned int nr_pages) 3097 { 3098 struct mem_cgroup *memcg; 3099 int ret; 3100 3101 memcg = get_mem_cgroup_from_objcg(objcg); 3102 3103 ret = try_charge_memcg(memcg, gfp, nr_pages); 3104 if (ret) 3105 goto out; 3106 3107 memcg_account_kmem(memcg, nr_pages); 3108 out: 3109 css_put(&memcg->css); 3110 3111 return ret; 3112 } 3113 3114 /** 3115 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 3116 * @page: page to charge 3117 * @gfp: reclaim mode 3118 * @order: allocation order 3119 * 3120 * Returns 0 on success, an error code on failure. 
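 *
 * A successful charge is undone with __memcg_kmem_uncharge_page(page, order),
 * defined below.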
3121 */ 3122 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 3123 { 3124 struct obj_cgroup *objcg; 3125 int ret = 0; 3126 3127 objcg = get_obj_cgroup_from_current(); 3128 if (objcg) { 3129 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); 3130 if (!ret) { 3131 page->memcg_data = (unsigned long)objcg | 3132 MEMCG_DATA_KMEM; 3133 return 0; 3134 } 3135 obj_cgroup_put(objcg); 3136 } 3137 return ret; 3138 } 3139 3140 /** 3141 * __memcg_kmem_uncharge_page: uncharge a kmem page 3142 * @page: page to uncharge 3143 * @order: allocation order 3144 */ 3145 void __memcg_kmem_uncharge_page(struct page *page, int order) 3146 { 3147 struct folio *folio = page_folio(page); 3148 struct obj_cgroup *objcg; 3149 unsigned int nr_pages = 1 << order; 3150 3151 if (!folio_memcg_kmem(folio)) 3152 return; 3153 3154 objcg = __folio_objcg(folio); 3155 obj_cgroup_uncharge_pages(objcg, nr_pages); 3156 folio->memcg_data = 0; 3157 obj_cgroup_put(objcg); 3158 } 3159 3160 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, 3161 enum node_stat_item idx, int nr) 3162 { 3163 struct memcg_stock_pcp *stock; 3164 struct obj_cgroup *old = NULL; 3165 unsigned long flags; 3166 int *bytes; 3167 3168 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3169 stock = this_cpu_ptr(&memcg_stock); 3170 3171 /* 3172 * Save vmstat data in stock and skip vmstat array update unless 3173 * accumulating over a page of vmstat data or when pgdat or idx 3174 * changes. 3175 */ 3176 if (stock->cached_objcg != objcg) { 3177 old = drain_obj_stock(stock); 3178 obj_cgroup_get(objcg); 3179 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3180 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3181 stock->cached_objcg = objcg; 3182 stock->cached_pgdat = pgdat; 3183 } else if (stock->cached_pgdat != pgdat) { 3184 /* Flush the existing cached vmstat data */ 3185 struct pglist_data *oldpg = stock->cached_pgdat; 3186 3187 if (stock->nr_slab_reclaimable_b) { 3188 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, 3189 stock->nr_slab_reclaimable_b); 3190 stock->nr_slab_reclaimable_b = 0; 3191 } 3192 if (stock->nr_slab_unreclaimable_b) { 3193 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, 3194 stock->nr_slab_unreclaimable_b); 3195 stock->nr_slab_unreclaimable_b = 0; 3196 } 3197 stock->cached_pgdat = pgdat; 3198 } 3199 3200 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b 3201 : &stock->nr_slab_unreclaimable_b; 3202 /* 3203 * Even for large object >= PAGE_SIZE, the vmstat data will still be 3204 * cached locally at least once before pushing it out. 
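 * In other words, updates accumulate in *bytes and are pushed out via
 * mod_objcg_mlstate() only once the accumulated value exceeds PAGE_SIZE,
 * or when the cached objcg/pgdat changes (handled above).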
3205 */ 3206 if (!*bytes) { 3207 *bytes = nr; 3208 nr = 0; 3209 } else { 3210 *bytes += nr; 3211 if (abs(*bytes) > PAGE_SIZE) { 3212 nr = *bytes; 3213 *bytes = 0; 3214 } else { 3215 nr = 0; 3216 } 3217 } 3218 if (nr) 3219 mod_objcg_mlstate(objcg, pgdat, idx, nr); 3220 3221 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3222 if (old) 3223 obj_cgroup_put(old); 3224 } 3225 3226 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3227 { 3228 struct memcg_stock_pcp *stock; 3229 unsigned long flags; 3230 bool ret = false; 3231 3232 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3233 3234 stock = this_cpu_ptr(&memcg_stock); 3235 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { 3236 stock->nr_bytes -= nr_bytes; 3237 ret = true; 3238 } 3239 3240 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3241 3242 return ret; 3243 } 3244 3245 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) 3246 { 3247 struct obj_cgroup *old = stock->cached_objcg; 3248 3249 if (!old) 3250 return NULL; 3251 3252 if (stock->nr_bytes) { 3253 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3254 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 3255 3256 if (nr_pages) { 3257 struct mem_cgroup *memcg; 3258 3259 memcg = get_mem_cgroup_from_objcg(old); 3260 3261 memcg_account_kmem(memcg, -nr_pages); 3262 __refill_stock(memcg, nr_pages); 3263 3264 css_put(&memcg->css); 3265 } 3266 3267 /* 3268 * The leftover is flushed to the centralized per-memcg value. 3269 * On the next attempt to refill obj stock it will be moved 3270 * to a per-cpu stock (probably, on an other CPU), see 3271 * refill_obj_stock(). 3272 * 3273 * How often it's flushed is a trade-off between the memory 3274 * limit enforcement accuracy and potential CPU contention, 3275 * so it might be changed in the future. 3276 */ 3277 atomic_add(nr_bytes, &old->nr_charged_bytes); 3278 stock->nr_bytes = 0; 3279 } 3280 3281 /* 3282 * Flush the vmstat data in current stock 3283 */ 3284 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { 3285 if (stock->nr_slab_reclaimable_b) { 3286 mod_objcg_mlstate(old, stock->cached_pgdat, 3287 NR_SLAB_RECLAIMABLE_B, 3288 stock->nr_slab_reclaimable_b); 3289 stock->nr_slab_reclaimable_b = 0; 3290 } 3291 if (stock->nr_slab_unreclaimable_b) { 3292 mod_objcg_mlstate(old, stock->cached_pgdat, 3293 NR_SLAB_UNRECLAIMABLE_B, 3294 stock->nr_slab_unreclaimable_b); 3295 stock->nr_slab_unreclaimable_b = 0; 3296 } 3297 stock->cached_pgdat = NULL; 3298 } 3299 3300 stock->cached_objcg = NULL; 3301 /* 3302 * The `old' objects needs to be released by the caller via 3303 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
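 * Callers such as drain_local_stock(), mod_objcg_state() and
 * refill_obj_stock() do exactly that after dropping the local lock.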
3304 */ 3305 return old; 3306 } 3307 3308 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 3309 struct mem_cgroup *root_memcg) 3310 { 3311 struct mem_cgroup *memcg; 3312 3313 if (stock->cached_objcg) { 3314 memcg = obj_cgroup_memcg(stock->cached_objcg); 3315 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3316 return true; 3317 } 3318 3319 return false; 3320 } 3321 3322 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 3323 bool allow_uncharge) 3324 { 3325 struct memcg_stock_pcp *stock; 3326 struct obj_cgroup *old = NULL; 3327 unsigned long flags; 3328 unsigned int nr_pages = 0; 3329 3330 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3331 3332 stock = this_cpu_ptr(&memcg_stock); 3333 if (stock->cached_objcg != objcg) { /* reset if necessary */ 3334 old = drain_obj_stock(stock); 3335 obj_cgroup_get(objcg); 3336 stock->cached_objcg = objcg; 3337 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3338 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3339 allow_uncharge = true; /* Allow uncharge when objcg changes */ 3340 } 3341 stock->nr_bytes += nr_bytes; 3342 3343 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { 3344 nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3345 stock->nr_bytes &= (PAGE_SIZE - 1); 3346 } 3347 3348 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3349 if (old) 3350 obj_cgroup_put(old); 3351 3352 if (nr_pages) 3353 obj_cgroup_uncharge_pages(objcg, nr_pages); 3354 } 3355 3356 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 3357 { 3358 unsigned int nr_pages, nr_bytes; 3359 int ret; 3360 3361 if (consume_obj_stock(objcg, size)) 3362 return 0; 3363 3364 /* 3365 * In theory, objcg->nr_charged_bytes can have enough 3366 * pre-charged bytes to satisfy the allocation. However, 3367 * flushing objcg->nr_charged_bytes requires two atomic 3368 * operations, and objcg->nr_charged_bytes can't be big. 3369 * The shared objcg->nr_charged_bytes can also become a 3370 * performance bottleneck if all tasks of the same memcg are 3371 * trying to update it. So it's better to ignore it and try 3372 * grab some new pages. The stock's nr_bytes will be flushed to 3373 * objcg->nr_charged_bytes later on when objcg changes. 3374 * 3375 * The stock's nr_bytes may contain enough pre-charged bytes 3376 * to allow one less page from being charged, but we can't rely 3377 * on the pre-charged bytes not being changed outside of 3378 * consume_obj_stock() or refill_obj_stock(). So ignore those 3379 * pre-charged bytes as well when charging pages. To avoid a 3380 * page uncharge right after a page charge, we set the 3381 * allow_uncharge flag to false when calling refill_obj_stock() 3382 * to temporarily allow the pre-charged bytes to exceed the page 3383 * size limit. The maximum reachable value of the pre-charged 3384 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data 3385 * race. 3386 */ 3387 nr_pages = size >> PAGE_SHIFT; 3388 nr_bytes = size & (PAGE_SIZE - 1); 3389 3390 if (nr_bytes) 3391 nr_pages += 1; 3392 3393 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); 3394 if (!ret && nr_bytes) 3395 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); 3396 3397 return ret; 3398 } 3399 3400 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 3401 { 3402 refill_obj_stock(objcg, size, true); 3403 } 3404 3405 #endif /* CONFIG_MEMCG_KMEM */ 3406 3407 /* 3408 * Because page_memcg(head) is not set on tails, set it now. 
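 * Each tail page inherits the head's memcg_data, and the nr - 1 extra
 * references are taken on the objcg (for kmem folios) or the memcg css
 * accordingly.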
3409 */ 3410 void split_page_memcg(struct page *head, unsigned int nr) 3411 { 3412 struct folio *folio = page_folio(head); 3413 struct mem_cgroup *memcg = folio_memcg(folio); 3414 int i; 3415 3416 if (mem_cgroup_disabled() || !memcg) 3417 return; 3418 3419 for (i = 1; i < nr; i++) 3420 folio_page(folio, i)->memcg_data = folio->memcg_data; 3421 3422 if (folio_memcg_kmem(folio)) 3423 obj_cgroup_get_many(__folio_objcg(folio), nr - 1); 3424 else 3425 css_get_many(&memcg->css, nr - 1); 3426 } 3427 3428 #ifdef CONFIG_SWAP 3429 /** 3430 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3431 * @entry: swap entry to be moved 3432 * @from: mem_cgroup which the entry is moved from 3433 * @to: mem_cgroup which the entry is moved to 3434 * 3435 * It succeeds only when the swap_cgroup's record for this entry is the same 3436 * as the mem_cgroup's id of @from. 3437 * 3438 * Returns 0 on success, -EINVAL on failure. 3439 * 3440 * The caller must have charged to @to, IOW, called page_counter_charge() about 3441 * both res and memsw, and called css_get(). 3442 */ 3443 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3444 struct mem_cgroup *from, struct mem_cgroup *to) 3445 { 3446 unsigned short old_id, new_id; 3447 3448 old_id = mem_cgroup_id(from); 3449 new_id = mem_cgroup_id(to); 3450 3451 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3452 mod_memcg_state(from, MEMCG_SWAP, -1); 3453 mod_memcg_state(to, MEMCG_SWAP, 1); 3454 return 0; 3455 } 3456 return -EINVAL; 3457 } 3458 #else 3459 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3460 struct mem_cgroup *from, struct mem_cgroup *to) 3461 { 3462 return -EINVAL; 3463 } 3464 #endif 3465 3466 static DEFINE_MUTEX(memcg_max_mutex); 3467 3468 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 3469 unsigned long max, bool memsw) 3470 { 3471 bool enlarge = false; 3472 bool drained = false; 3473 int ret; 3474 bool limits_invariant; 3475 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 3476 3477 do { 3478 if (signal_pending(current)) { 3479 ret = -EINTR; 3480 break; 3481 } 3482 3483 mutex_lock(&memcg_max_mutex); 3484 /* 3485 * Make sure that the new limit (memsw or memory limit) doesn't 3486 * break our basic invariant rule memory.max <= memsw.max. 3487 */ 3488 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 3489 max <= memcg->memsw.max; 3490 if (!limits_invariant) { 3491 mutex_unlock(&memcg_max_mutex); 3492 ret = -EINVAL; 3493 break; 3494 } 3495 if (max > counter->max) 3496 enlarge = true; 3497 ret = page_counter_set_max(counter, max); 3498 mutex_unlock(&memcg_max_mutex); 3499 3500 if (!ret) 3501 break; 3502 3503 if (!drained) { 3504 drain_all_stock(memcg); 3505 drained = true; 3506 continue; 3507 } 3508 3509 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3510 memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { 3511 ret = -EBUSY; 3512 break; 3513 } 3514 } while (true); 3515 3516 if (!ret && enlarge) 3517 memcg_oom_recover(memcg); 3518 3519 return ret; 3520 } 3521 3522 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 3523 gfp_t gfp_mask, 3524 unsigned long *total_scanned) 3525 { 3526 unsigned long nr_reclaimed = 0; 3527 struct mem_cgroup_per_node *mz, *next_mz = NULL; 3528 unsigned long reclaimed; 3529 int loop = 0; 3530 struct mem_cgroup_tree_per_node *mctz; 3531 unsigned long excess; 3532 3533 if (lru_gen_enabled()) 3534 return 0; 3535 3536 if (order > 0) 3537 return 0; 3538 3539 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; 3540 3541 /* 3542 * Do not even bother to check the largest node if the root 3543 * is empty. Do it lockless to prevent lock bouncing. Races 3544 * are acceptable as soft limit is best effort anyway. 3545 */ 3546 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 3547 return 0; 3548 3549 /* 3550 * This loop can run a while, specially if mem_cgroup's continuously 3551 * keep exceeding their soft limit and putting the system under 3552 * pressure 3553 */ 3554 do { 3555 if (next_mz) 3556 mz = next_mz; 3557 else 3558 mz = mem_cgroup_largest_soft_limit_node(mctz); 3559 if (!mz) 3560 break; 3561 3562 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 3563 gfp_mask, total_scanned); 3564 nr_reclaimed += reclaimed; 3565 spin_lock_irq(&mctz->lock); 3566 3567 /* 3568 * If we failed to reclaim anything from this memory cgroup 3569 * it is time to move on to the next cgroup 3570 */ 3571 next_mz = NULL; 3572 if (!reclaimed) 3573 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 3574 3575 excess = soft_limit_excess(mz->memcg); 3576 /* 3577 * One school of thought says that we should not add 3578 * back the node to the tree if reclaim returns 0. 3579 * But our reclaim could return 0, simply because due 3580 * to priority we are exposing a smaller subset of 3581 * memory to reclaim from. Consider this as a longer 3582 * term TODO. 3583 */ 3584 /* If excess == 0, no tree ops */ 3585 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3586 spin_unlock_irq(&mctz->lock); 3587 css_put(&mz->memcg->css); 3588 loop++; 3589 /* 3590 * Could not reclaim anything and there are no more 3591 * mem cgroups to try or we seem to be looping without 3592 * reclaiming anything. 3593 */ 3594 if (!nr_reclaimed && 3595 (next_mz == NULL || 3596 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3597 break; 3598 } while (!nr_reclaimed); 3599 if (next_mz) 3600 css_put(&next_mz->memcg->css); 3601 return nr_reclaimed; 3602 } 3603 3604 /* 3605 * Reclaims as many pages from the given memcg as possible. 3606 * 3607 * Caller is responsible for holding css reference for memcg. 
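 * This backs the cgroup1 force_empty interface; see
 * mem_cgroup_force_empty_write() below.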
3608 */ 3609 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3610 { 3611 int nr_retries = MAX_RECLAIM_RETRIES; 3612 3613 /* we call try-to-free pages for make this cgroup empty */ 3614 lru_add_drain_all(); 3615 3616 drain_all_stock(memcg); 3617 3618 /* try to free all pages in this cgroup */ 3619 while (nr_retries && page_counter_read(&memcg->memory)) { 3620 if (signal_pending(current)) 3621 return -EINTR; 3622 3623 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3624 MEMCG_RECLAIM_MAY_SWAP)) 3625 nr_retries--; 3626 } 3627 3628 return 0; 3629 } 3630 3631 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3632 char *buf, size_t nbytes, 3633 loff_t off) 3634 { 3635 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3636 3637 if (mem_cgroup_is_root(memcg)) 3638 return -EINVAL; 3639 return mem_cgroup_force_empty(memcg) ?: nbytes; 3640 } 3641 3642 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3643 struct cftype *cft) 3644 { 3645 return 1; 3646 } 3647 3648 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3649 struct cftype *cft, u64 val) 3650 { 3651 if (val == 1) 3652 return 0; 3653 3654 pr_warn_once("Non-hierarchical mode is deprecated. " 3655 "Please report your usecase to linux-mm@kvack.org if you " 3656 "depend on this functionality.\n"); 3657 3658 return -EINVAL; 3659 } 3660 3661 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3662 { 3663 unsigned long val; 3664 3665 if (mem_cgroup_is_root(memcg)) { 3666 mem_cgroup_flush_stats(); 3667 val = memcg_page_state(memcg, NR_FILE_PAGES) + 3668 memcg_page_state(memcg, NR_ANON_MAPPED); 3669 if (swap) 3670 val += memcg_page_state(memcg, MEMCG_SWAP); 3671 } else { 3672 if (!swap) 3673 val = page_counter_read(&memcg->memory); 3674 else 3675 val = page_counter_read(&memcg->memsw); 3676 } 3677 return val; 3678 } 3679 3680 enum { 3681 RES_USAGE, 3682 RES_LIMIT, 3683 RES_MAX_USAGE, 3684 RES_FAILCNT, 3685 RES_SOFT_LIMIT, 3686 }; 3687 3688 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3689 struct cftype *cft) 3690 { 3691 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3692 struct page_counter *counter; 3693 3694 switch (MEMFILE_TYPE(cft->private)) { 3695 case _MEM: 3696 counter = &memcg->memory; 3697 break; 3698 case _MEMSWAP: 3699 counter = &memcg->memsw; 3700 break; 3701 case _KMEM: 3702 counter = &memcg->kmem; 3703 break; 3704 case _TCP: 3705 counter = &memcg->tcpmem; 3706 break; 3707 default: 3708 BUG(); 3709 } 3710 3711 switch (MEMFILE_ATTR(cft->private)) { 3712 case RES_USAGE: 3713 if (counter == &memcg->memory) 3714 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3715 if (counter == &memcg->memsw) 3716 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3717 return (u64)page_counter_read(counter) * PAGE_SIZE; 3718 case RES_LIMIT: 3719 return (u64)counter->max * PAGE_SIZE; 3720 case RES_MAX_USAGE: 3721 return (u64)counter->watermark * PAGE_SIZE; 3722 case RES_FAILCNT: 3723 return counter->failcnt; 3724 case RES_SOFT_LIMIT: 3725 return (u64)memcg->soft_limit * PAGE_SIZE; 3726 default: 3727 BUG(); 3728 } 3729 } 3730 3731 #ifdef CONFIG_MEMCG_KMEM 3732 static int memcg_online_kmem(struct mem_cgroup *memcg) 3733 { 3734 struct obj_cgroup *objcg; 3735 3736 if (mem_cgroup_kmem_disabled()) 3737 return 0; 3738 3739 if (unlikely(mem_cgroup_is_root(memcg))) 3740 return 0; 3741 3742 objcg = obj_cgroup_alloc(); 3743 if (!objcg) 3744 return -ENOMEM; 3745 3746 objcg->memcg = memcg; 3747 
rcu_assign_pointer(memcg->objcg, objcg); 3748 3749 static_branch_enable(&memcg_kmem_enabled_key); 3750 3751 memcg->kmemcg_id = memcg->id.id; 3752 3753 return 0; 3754 } 3755 3756 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3757 { 3758 struct mem_cgroup *parent; 3759 3760 if (mem_cgroup_kmem_disabled()) 3761 return; 3762 3763 if (unlikely(mem_cgroup_is_root(memcg))) 3764 return; 3765 3766 parent = parent_mem_cgroup(memcg); 3767 if (!parent) 3768 parent = root_mem_cgroup; 3769 3770 memcg_reparent_objcgs(memcg, parent); 3771 3772 /* 3773 * After we have finished memcg_reparent_objcgs(), all list_lrus 3774 * corresponding to this cgroup are guaranteed to remain empty. 3775 * The ordering is imposed by list_lru_node->lock taken by 3776 * memcg_reparent_list_lrus(). 3777 */ 3778 memcg_reparent_list_lrus(memcg, parent); 3779 } 3780 #else 3781 static int memcg_online_kmem(struct mem_cgroup *memcg) 3782 { 3783 return 0; 3784 } 3785 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3786 { 3787 } 3788 #endif /* CONFIG_MEMCG_KMEM */ 3789 3790 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3791 { 3792 int ret; 3793 3794 mutex_lock(&memcg_max_mutex); 3795 3796 ret = page_counter_set_max(&memcg->tcpmem, max); 3797 if (ret) 3798 goto out; 3799 3800 if (!memcg->tcpmem_active) { 3801 /* 3802 * The active flag needs to be written after the static_key 3803 * update. This is what guarantees that the socket activation 3804 * function is the last one to run. See mem_cgroup_sk_alloc() 3805 * for details, and note that we don't mark any socket as 3806 * belonging to this memcg until that flag is up. 3807 * 3808 * We need to do this, because static_keys will span multiple 3809 * sites, but we can't control their order. If we mark a socket 3810 * as accounted, but the accounting functions are not patched in 3811 * yet, we'll lose accounting. 3812 * 3813 * We never race with the readers in mem_cgroup_sk_alloc(), 3814 * because when this value change, the code to process it is not 3815 * patched in yet. 3816 */ 3817 static_branch_inc(&memcg_sockets_enabled_key); 3818 memcg->tcpmem_active = true; 3819 } 3820 out: 3821 mutex_unlock(&memcg_max_mutex); 3822 return ret; 3823 } 3824 3825 /* 3826 * The user of this function is... 3827 * RES_LIMIT. 3828 */ 3829 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3830 char *buf, size_t nbytes, loff_t off) 3831 { 3832 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3833 unsigned long nr_pages; 3834 int ret; 3835 3836 buf = strstrip(buf); 3837 ret = page_counter_memparse(buf, "-1", &nr_pages); 3838 if (ret) 3839 return ret; 3840 3841 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3842 case RES_LIMIT: 3843 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3844 ret = -EINVAL; 3845 break; 3846 } 3847 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3848 case _MEM: 3849 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3850 break; 3851 case _MEMSWAP: 3852 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3853 break; 3854 case _KMEM: 3855 /* kmem.limit_in_bytes is deprecated. 
*/ 3856 ret = -EOPNOTSUPP; 3857 break; 3858 case _TCP: 3859 ret = memcg_update_tcp_max(memcg, nr_pages); 3860 break; 3861 } 3862 break; 3863 case RES_SOFT_LIMIT: 3864 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 3865 ret = -EOPNOTSUPP; 3866 } else { 3867 memcg->soft_limit = nr_pages; 3868 ret = 0; 3869 } 3870 break; 3871 } 3872 return ret ?: nbytes; 3873 } 3874 3875 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3876 size_t nbytes, loff_t off) 3877 { 3878 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3879 struct page_counter *counter; 3880 3881 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3882 case _MEM: 3883 counter = &memcg->memory; 3884 break; 3885 case _MEMSWAP: 3886 counter = &memcg->memsw; 3887 break; 3888 case _KMEM: 3889 counter = &memcg->kmem; 3890 break; 3891 case _TCP: 3892 counter = &memcg->tcpmem; 3893 break; 3894 default: 3895 BUG(); 3896 } 3897 3898 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3899 case RES_MAX_USAGE: 3900 page_counter_reset_watermark(counter); 3901 break; 3902 case RES_FAILCNT: 3903 counter->failcnt = 0; 3904 break; 3905 default: 3906 BUG(); 3907 } 3908 3909 return nbytes; 3910 } 3911 3912 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3913 struct cftype *cft) 3914 { 3915 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3916 } 3917 3918 #ifdef CONFIG_MMU 3919 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3920 struct cftype *cft, u64 val) 3921 { 3922 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3923 3924 pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " 3925 "Please report your usecase to linux-mm@kvack.org if you " 3926 "depend on this functionality.\n"); 3927 3928 if (val & ~MOVE_MASK) 3929 return -EINVAL; 3930 3931 /* 3932 * No kind of locking is needed in here, because ->can_attach() will 3933 * check this value once in the beginning of the process, and then carry 3934 * on with stale data. This means that changes to this value will only 3935 * affect task migrations starting after the change. 
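 *
 * Purely as an illustrative userspace sketch (the cgroup mount point and
 * group name are assumptions, error handling is omitted, and <fcntl.h>
 * plus <unistd.h> are needed): enabling the moving of both anonymous and
 * file charges means writing MOVE_ANON | MOVE_FILE, i.e. the value 3,
 * before migrating the task:
 *
 *	int fd = open("/sys/fs/cgroup/memory/dst/memory.move_charge_at_immigrate",
 *		      O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "3", 1);
 *		close(fd);
 *	}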
3936 */ 3937 memcg->move_charge_at_immigrate = val; 3938 return 0; 3939 } 3940 #else 3941 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3942 struct cftype *cft, u64 val) 3943 { 3944 return -ENOSYS; 3945 } 3946 #endif 3947 3948 #ifdef CONFIG_NUMA 3949 3950 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3951 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3952 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3953 3954 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3955 int nid, unsigned int lru_mask, bool tree) 3956 { 3957 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 3958 unsigned long nr = 0; 3959 enum lru_list lru; 3960 3961 VM_BUG_ON((unsigned)nid >= nr_node_ids); 3962 3963 for_each_lru(lru) { 3964 if (!(BIT(lru) & lru_mask)) 3965 continue; 3966 if (tree) 3967 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 3968 else 3969 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 3970 } 3971 return nr; 3972 } 3973 3974 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 3975 unsigned int lru_mask, 3976 bool tree) 3977 { 3978 unsigned long nr = 0; 3979 enum lru_list lru; 3980 3981 for_each_lru(lru) { 3982 if (!(BIT(lru) & lru_mask)) 3983 continue; 3984 if (tree) 3985 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 3986 else 3987 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 3988 } 3989 return nr; 3990 } 3991 3992 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3993 { 3994 struct numa_stat { 3995 const char *name; 3996 unsigned int lru_mask; 3997 }; 3998 3999 static const struct numa_stat stats[] = { 4000 { "total", LRU_ALL }, 4001 { "file", LRU_ALL_FILE }, 4002 { "anon", LRU_ALL_ANON }, 4003 { "unevictable", BIT(LRU_UNEVICTABLE) }, 4004 }; 4005 const struct numa_stat *stat; 4006 int nid; 4007 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 4008 4009 mem_cgroup_flush_stats(); 4010 4011 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4012 seq_printf(m, "%s=%lu", stat->name, 4013 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4014 false)); 4015 for_each_node_state(nid, N_MEMORY) 4016 seq_printf(m, " N%d=%lu", nid, 4017 mem_cgroup_node_nr_lru_pages(memcg, nid, 4018 stat->lru_mask, false)); 4019 seq_putc(m, '\n'); 4020 } 4021 4022 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4023 4024 seq_printf(m, "hierarchical_%s=%lu", stat->name, 4025 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4026 true)); 4027 for_each_node_state(nid, N_MEMORY) 4028 seq_printf(m, " N%d=%lu", nid, 4029 mem_cgroup_node_nr_lru_pages(memcg, nid, 4030 stat->lru_mask, true)); 4031 seq_putc(m, '\n'); 4032 } 4033 4034 return 0; 4035 } 4036 #endif /* CONFIG_NUMA */ 4037 4038 static const unsigned int memcg1_stats[] = { 4039 NR_FILE_PAGES, 4040 NR_ANON_MAPPED, 4041 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4042 NR_ANON_THPS, 4043 #endif 4044 NR_SHMEM, 4045 NR_FILE_MAPPED, 4046 NR_FILE_DIRTY, 4047 NR_WRITEBACK, 4048 WORKINGSET_REFAULT_ANON, 4049 WORKINGSET_REFAULT_FILE, 4050 MEMCG_SWAP, 4051 }; 4052 4053 static const char *const memcg1_stat_names[] = { 4054 "cache", 4055 "rss", 4056 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4057 "rss_huge", 4058 #endif 4059 "shmem", 4060 "mapped_file", 4061 "dirty", 4062 "writeback", 4063 "workingset_refault_anon", 4064 "workingset_refault_file", 4065 "swap", 4066 }; 4067 4068 /* Universal VM events cgroup1 shows, original sort order */ 4069 static const unsigned int memcg1_events[] = { 4070 PGPGIN, 4071 PGPGOUT, 4072 
PGFAULT, 4073 PGMAJFAULT, 4074 }; 4075 4076 static int memcg_stat_show(struct seq_file *m, void *v) 4077 { 4078 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 4079 unsigned long memory, memsw; 4080 struct mem_cgroup *mi; 4081 unsigned int i; 4082 4083 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 4084 4085 mem_cgroup_flush_stats(); 4086 4087 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4088 unsigned long nr; 4089 4090 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4091 continue; 4092 nr = memcg_page_state_local(memcg, memcg1_stats[i]); 4093 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], 4094 nr * memcg_page_state_unit(memcg1_stats[i])); 4095 } 4096 4097 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4098 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), 4099 memcg_events_local(memcg, memcg1_events[i])); 4100 4101 for (i = 0; i < NR_LRU_LISTS; i++) 4102 seq_printf(m, "%s %lu\n", lru_list_name(i), 4103 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 4104 PAGE_SIZE); 4105 4106 /* Hierarchical information */ 4107 memory = memsw = PAGE_COUNTER_MAX; 4108 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4109 memory = min(memory, READ_ONCE(mi->memory.max)); 4110 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 4111 } 4112 seq_printf(m, "hierarchical_memory_limit %llu\n", 4113 (u64)memory * PAGE_SIZE); 4114 if (do_memsw_account()) 4115 seq_printf(m, "hierarchical_memsw_limit %llu\n", 4116 (u64)memsw * PAGE_SIZE); 4117 4118 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4119 unsigned long nr; 4120 4121 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4122 continue; 4123 nr = memcg_page_state(memcg, memcg1_stats[i]); 4124 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 4125 (u64)nr * memcg_page_state_unit(memcg1_stats[i])); 4126 } 4127 4128 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4129 seq_printf(m, "total_%s %llu\n", 4130 vm_event_name(memcg1_events[i]), 4131 (u64)memcg_events(memcg, memcg1_events[i])); 4132 4133 for (i = 0; i < NR_LRU_LISTS; i++) 4134 seq_printf(m, "total_%s %llu\n", lru_list_name(i), 4135 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 4136 PAGE_SIZE); 4137 4138 #ifdef CONFIG_DEBUG_VM 4139 { 4140 pg_data_t *pgdat; 4141 struct mem_cgroup_per_node *mz; 4142 unsigned long anon_cost = 0; 4143 unsigned long file_cost = 0; 4144 4145 for_each_online_pgdat(pgdat) { 4146 mz = memcg->nodeinfo[pgdat->node_id]; 4147 4148 anon_cost += mz->lruvec.anon_cost; 4149 file_cost += mz->lruvec.file_cost; 4150 } 4151 seq_printf(m, "anon_cost %lu\n", anon_cost); 4152 seq_printf(m, "file_cost %lu\n", file_cost); 4153 } 4154 #endif 4155 4156 return 0; 4157 } 4158 4159 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4160 struct cftype *cft) 4161 { 4162 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4163 4164 return mem_cgroup_swappiness(memcg); 4165 } 4166 4167 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4168 struct cftype *cft, u64 val) 4169 { 4170 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4171 4172 if (val > 200) 4173 return -EINVAL; 4174 4175 if (!mem_cgroup_is_root(memcg)) 4176 memcg->swappiness = val; 4177 else 4178 vm_swappiness = val; 4179 4180 return 0; 4181 } 4182 4183 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4184 { 4185 struct mem_cgroup_threshold_ary *t; 4186 unsigned long usage; 4187 int i; 4188 4189 rcu_read_lock(); 4190 if (!swap) 4191 t = rcu_dereference(memcg->thresholds.primary); 4192 else 4193 t = 
rcu_dereference(memcg->memsw_thresholds.primary); 4194 4195 if (!t) 4196 goto unlock; 4197 4198 usage = mem_cgroup_usage(memcg, swap); 4199 4200 /* 4201 * current_threshold points to threshold just below or equal to usage. 4202 * If it's not true, a threshold was crossed after last 4203 * call of __mem_cgroup_threshold(). 4204 */ 4205 i = t->current_threshold; 4206 4207 /* 4208 * Iterate backward over array of thresholds starting from 4209 * current_threshold and check if a threshold is crossed. 4210 * If none of thresholds below usage is crossed, we read 4211 * only one element of the array here. 4212 */ 4213 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4214 eventfd_signal(t->entries[i].eventfd, 1); 4215 4216 /* i = current_threshold + 1 */ 4217 i++; 4218 4219 /* 4220 * Iterate forward over array of thresholds starting from 4221 * current_threshold+1 and check if a threshold is crossed. 4222 * If none of thresholds above usage is crossed, we read 4223 * only one element of the array here. 4224 */ 4225 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4226 eventfd_signal(t->entries[i].eventfd, 1); 4227 4228 /* Update current_threshold */ 4229 t->current_threshold = i - 1; 4230 unlock: 4231 rcu_read_unlock(); 4232 } 4233 4234 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4235 { 4236 while (memcg) { 4237 __mem_cgroup_threshold(memcg, false); 4238 if (do_memsw_account()) 4239 __mem_cgroup_threshold(memcg, true); 4240 4241 memcg = parent_mem_cgroup(memcg); 4242 } 4243 } 4244 4245 static int compare_thresholds(const void *a, const void *b) 4246 { 4247 const struct mem_cgroup_threshold *_a = a; 4248 const struct mem_cgroup_threshold *_b = b; 4249 4250 if (_a->threshold > _b->threshold) 4251 return 1; 4252 4253 if (_a->threshold < _b->threshold) 4254 return -1; 4255 4256 return 0; 4257 } 4258 4259 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4260 { 4261 struct mem_cgroup_eventfd_list *ev; 4262 4263 spin_lock(&memcg_oom_lock); 4264 4265 list_for_each_entry(ev, &memcg->oom_notify, list) 4266 eventfd_signal(ev->eventfd, 1); 4267 4268 spin_unlock(&memcg_oom_lock); 4269 return 0; 4270 } 4271 4272 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4273 { 4274 struct mem_cgroup *iter; 4275 4276 for_each_mem_cgroup_tree(iter, memcg) 4277 mem_cgroup_oom_notify_cb(iter); 4278 } 4279 4280 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4281 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4282 { 4283 struct mem_cgroup_thresholds *thresholds; 4284 struct mem_cgroup_threshold_ary *new; 4285 unsigned long threshold; 4286 unsigned long usage; 4287 int i, size, ret; 4288 4289 ret = page_counter_memparse(args, "-1", &threshold); 4290 if (ret) 4291 return ret; 4292 4293 mutex_lock(&memcg->thresholds_lock); 4294 4295 if (type == _MEM) { 4296 thresholds = &memcg->thresholds; 4297 usage = mem_cgroup_usage(memcg, false); 4298 } else if (type == _MEMSWAP) { 4299 thresholds = &memcg->memsw_thresholds; 4300 usage = mem_cgroup_usage(memcg, true); 4301 } else 4302 BUG(); 4303 4304 /* Check if a threshold crossed before adding a new one */ 4305 if (thresholds->primary) 4306 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4307 4308 size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; 4309 4310 /* Allocate memory for new array of thresholds */ 4311 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 4312 if (!new) { 4313 ret = -ENOMEM; 4314 goto unlock; 4315 } 4316 new->size = size; 4317 4318 /* Copy thresholds (if any) to new array */ 4319 if (thresholds->primary) 4320 memcpy(new->entries, thresholds->primary->entries, 4321 flex_array_size(new, entries, size - 1)); 4322 4323 /* Add new threshold */ 4324 new->entries[size - 1].eventfd = eventfd; 4325 new->entries[size - 1].threshold = threshold; 4326 4327 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4328 sort(new->entries, size, sizeof(*new->entries), 4329 compare_thresholds, NULL); 4330 4331 /* Find current threshold */ 4332 new->current_threshold = -1; 4333 for (i = 0; i < size; i++) { 4334 if (new->entries[i].threshold <= usage) { 4335 /* 4336 * new->current_threshold will not be used until 4337 * rcu_assign_pointer(), so it's safe to increment 4338 * it here. 4339 */ 4340 ++new->current_threshold; 4341 } else 4342 break; 4343 } 4344 4345 /* Free old spare buffer and save old primary buffer as spare */ 4346 kfree(thresholds->spare); 4347 thresholds->spare = thresholds->primary; 4348 4349 rcu_assign_pointer(thresholds->primary, new); 4350 4351 /* To be sure that nobody uses thresholds */ 4352 synchronize_rcu(); 4353 4354 unlock: 4355 mutex_unlock(&memcg->thresholds_lock); 4356 4357 return ret; 4358 } 4359 4360 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4361 struct eventfd_ctx *eventfd, const char *args) 4362 { 4363 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4364 } 4365 4366 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4367 struct eventfd_ctx *eventfd, const char *args) 4368 { 4369 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4370 } 4371 4372 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4373 struct eventfd_ctx *eventfd, enum res_type type) 4374 { 4375 struct mem_cgroup_thresholds *thresholds; 4376 struct mem_cgroup_threshold_ary *new; 4377 unsigned long usage; 4378 int i, j, size, entries; 4379 4380 mutex_lock(&memcg->thresholds_lock); 4381 4382 if (type == _MEM) { 4383 thresholds = &memcg->thresholds; 4384 usage = mem_cgroup_usage(memcg, false); 4385 } else if (type == _MEMSWAP) { 4386 thresholds = &memcg->memsw_thresholds; 4387 usage = mem_cgroup_usage(memcg, true); 4388 } else 4389 BUG(); 4390 4391 if (!thresholds->primary) 4392 goto unlock; 4393 4394 /* Check if a threshold crossed before removing */ 4395 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4396 4397 /* Calculate new number of threshold */ 4398 size = entries = 0; 4399 for (i = 0; i < thresholds->primary->size; i++) { 4400 if (thresholds->primary->entries[i].eventfd != eventfd) 4401 size++; 4402 else 4403 entries++; 4404 } 4405 4406 new = thresholds->spare; 4407 4408 /* If no items related to eventfd have been cleared, nothing to do */ 4409 if (!entries) 4410 goto unlock; 4411 4412 /* Set thresholds array to NULL if we don't have thresholds */ 4413 if (!size) { 4414 kfree(new); 4415 new = NULL; 4416 goto swap_buffers; 4417 } 4418 4419 new->size = size; 4420 4421 /* Copy thresholds and find current threshold */ 4422 new->current_threshold = -1; 4423 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4424 if (thresholds->primary->entries[i].eventfd == eventfd) 4425 continue; 4426 4427 new->entries[j] = thresholds->primary->entries[i]; 4428 if 
(new->entries[j].threshold <= usage) { 4429 /* 4430 * new->current_threshold will not be used 4431 * until rcu_assign_pointer(), so it's safe to increment 4432 * it here. 4433 */ 4434 ++new->current_threshold; 4435 } 4436 j++; 4437 } 4438 4439 swap_buffers: 4440 /* Swap primary and spare array */ 4441 thresholds->spare = thresholds->primary; 4442 4443 rcu_assign_pointer(thresholds->primary, new); 4444 4445 /* To be sure that nobody uses thresholds */ 4446 synchronize_rcu(); 4447 4448 /* If all events are unregistered, free the spare array */ 4449 if (!new) { 4450 kfree(thresholds->spare); 4451 thresholds->spare = NULL; 4452 } 4453 unlock: 4454 mutex_unlock(&memcg->thresholds_lock); 4455 } 4456 4457 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4458 struct eventfd_ctx *eventfd) 4459 { 4460 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4461 } 4462 4463 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4464 struct eventfd_ctx *eventfd) 4465 { 4466 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4467 } 4468 4469 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4470 struct eventfd_ctx *eventfd, const char *args) 4471 { 4472 struct mem_cgroup_eventfd_list *event; 4473 4474 event = kmalloc(sizeof(*event), GFP_KERNEL); 4475 if (!event) 4476 return -ENOMEM; 4477 4478 spin_lock(&memcg_oom_lock); 4479 4480 event->eventfd = eventfd; 4481 list_add(&event->list, &memcg->oom_notify); 4482 4483 /* already in OOM ? */ 4484 if (memcg->under_oom) 4485 eventfd_signal(eventfd, 1); 4486 spin_unlock(&memcg_oom_lock); 4487 4488 return 0; 4489 } 4490 4491 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4492 struct eventfd_ctx *eventfd) 4493 { 4494 struct mem_cgroup_eventfd_list *ev, *tmp; 4495 4496 spin_lock(&memcg_oom_lock); 4497 4498 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4499 if (ev->eventfd == eventfd) { 4500 list_del(&ev->list); 4501 kfree(ev); 4502 } 4503 } 4504 4505 spin_unlock(&memcg_oom_lock); 4506 } 4507 4508 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4509 { 4510 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4511 4512 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4513 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4514 seq_printf(sf, "oom_kill %lu\n", 4515 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4516 return 0; 4517 } 4518 4519 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4520 struct cftype *cft, u64 val) 4521 { 4522 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4523 4524 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4525 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 4526 return -EINVAL; 4527 4528 memcg->oom_kill_disable = val; 4529 if (!val) 4530 memcg_oom_recover(memcg); 4531 4532 return 0; 4533 } 4534 4535 #ifdef CONFIG_CGROUP_WRITEBACK 4536 4537 #include <trace/events/writeback.h> 4538 4539 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4540 { 4541 return wb_domain_init(&memcg->cgwb_domain, gfp); 4542 } 4543 4544 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4545 { 4546 wb_domain_exit(&memcg->cgwb_domain); 4547 } 4548 4549 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4550 { 4551 wb_domain_size_changed(&memcg->cgwb_domain); 4552 } 4553 4554 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4555 { 4556 struct mem_cgroup *memcg = 
mem_cgroup_from_css(wb->memcg_css); 4557 4558 if (!memcg->css.parent) 4559 return NULL; 4560 4561 return &memcg->cgwb_domain; 4562 } 4563 4564 /** 4565 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4566 * @wb: bdi_writeback in question 4567 * @pfilepages: out parameter for number of file pages 4568 * @pheadroom: out parameter for number of allocatable pages according to memcg 4569 * @pdirty: out parameter for number of dirty pages 4570 * @pwriteback: out parameter for number of pages under writeback 4571 * 4572 * Determine the numbers of file, headroom, dirty, and writeback pages in 4573 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4574 * is a bit more involved. 4575 * 4576 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4577 * headroom is calculated as the lowest headroom of itself and the 4578 * ancestors. Note that this doesn't consider the actual amount of 4579 * available memory in the system. The caller should further cap 4580 * *@pheadroom accordingly. 4581 */ 4582 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4583 unsigned long *pheadroom, unsigned long *pdirty, 4584 unsigned long *pwriteback) 4585 { 4586 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4587 struct mem_cgroup *parent; 4588 4589 mem_cgroup_flush_stats(); 4590 4591 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); 4592 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); 4593 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + 4594 memcg_page_state(memcg, NR_ACTIVE_FILE); 4595 4596 *pheadroom = PAGE_COUNTER_MAX; 4597 while ((parent = parent_mem_cgroup(memcg))) { 4598 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 4599 READ_ONCE(memcg->memory.high)); 4600 unsigned long used = page_counter_read(&memcg->memory); 4601 4602 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4603 memcg = parent; 4604 } 4605 } 4606 4607 /* 4608 * Foreign dirty flushing 4609 * 4610 * There's an inherent mismatch between memcg and writeback. The former 4611 * tracks ownership per-page while the latter per-inode. This was a 4612 * deliberate design decision because honoring per-page ownership in the 4613 * writeback path is complicated, may lead to higher CPU and IO overheads 4614 * and deemed unnecessary given that write-sharing an inode across 4615 * different cgroups isn't a common use-case. 4616 * 4617 * Combined with inode majority-writer ownership switching, this works well 4618 * enough in most cases but there are some pathological cases. For 4619 * example, let's say there are two cgroups A and B which keep writing to 4620 * different but confined parts of the same inode. B owns the inode and 4621 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4622 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4623 * triggering background writeback. A will be slowed down without a way to 4624 * make writeback of the dirty pages happen. 4625 * 4626 * Conditions like the above can lead to a cgroup getting repeatedly and 4627 * severely throttled after making some progress after each 4628 * dirty_expire_interval while the underlying IO device is almost 4629 * completely idle. 4630 * 4631 * Solving this problem completely requires matching the ownership tracking 4632 * granularities between memcg and writeback in either direction. 
However, 4633 * the more egregious behaviors can be avoided by simply remembering the 4634 * most recent foreign dirtying events and initiating remote flushes on 4635 * them when local writeback isn't enough to keep the memory clean enough. 4636 * 4637 * The following two functions implement such mechanism. When a foreign 4638 * page - a page whose memcg and writeback ownerships don't match - is 4639 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4640 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4641 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4642 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4643 * foreign bdi_writebacks which haven't expired. Both the numbers of 4644 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4645 * limited to MEMCG_CGWB_FRN_CNT. 4646 * 4647 * The mechanism only remembers IDs and doesn't hold any object references. 4648 * As being wrong occasionally doesn't matter, updates and accesses to the 4649 * records are lockless and racy. 4650 */ 4651 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, 4652 struct bdi_writeback *wb) 4653 { 4654 struct mem_cgroup *memcg = folio_memcg(folio); 4655 struct memcg_cgwb_frn *frn; 4656 u64 now = get_jiffies_64(); 4657 u64 oldest_at = now; 4658 int oldest = -1; 4659 int i; 4660 4661 trace_track_foreign_dirty(folio, wb); 4662 4663 /* 4664 * Pick the slot to use. If there is already a slot for @wb, keep 4665 * using it. If not replace the oldest one which isn't being 4666 * written out. 4667 */ 4668 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4669 frn = &memcg->cgwb_frn[i]; 4670 if (frn->bdi_id == wb->bdi->id && 4671 frn->memcg_id == wb->memcg_css->id) 4672 break; 4673 if (time_before64(frn->at, oldest_at) && 4674 atomic_read(&frn->done.cnt) == 1) { 4675 oldest = i; 4676 oldest_at = frn->at; 4677 } 4678 } 4679 4680 if (i < MEMCG_CGWB_FRN_CNT) { 4681 /* 4682 * Re-using an existing one. Update timestamp lazily to 4683 * avoid making the cacheline hot. We want them to be 4684 * reasonably up-to-date and significantly shorter than 4685 * dirty_expire_interval as that's what expires the record. 4686 * Use the shorter of 1s and dirty_expire_interval / 8. 4687 */ 4688 unsigned long update_intv = 4689 min_t(unsigned long, HZ, 4690 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4691 4692 if (time_before64(frn->at, now - update_intv)) 4693 frn->at = now; 4694 } else if (oldest >= 0) { 4695 /* replace the oldest free one */ 4696 frn = &memcg->cgwb_frn[oldest]; 4697 frn->bdi_id = wb->bdi->id; 4698 frn->memcg_id = wb->memcg_css->id; 4699 frn->at = now; 4700 } 4701 } 4702 4703 /* issue foreign writeback flushes for recorded foreign dirtying events */ 4704 void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4705 { 4706 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4707 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4708 u64 now = jiffies_64; 4709 int i; 4710 4711 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4712 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4713 4714 /* 4715 * If the record is older than dirty_expire_interval, 4716 * writeback on it has already started. No need to kick it 4717 * off again. Also, don't start a new one if there's 4718 * already one in flight. 
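		 * In the check below, time_after64(frn->at, now - intv) keeps
		 * only records dirtied within the last dirty_expire_interval,
		 * and done.cnt == 1 means no foreign writeback queued from
		 * this slot is still in flight.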
4719 */ 4720 if (time_after64(frn->at, now - intv) && 4721 atomic_read(&frn->done.cnt) == 1) { 4722 frn->at = 0; 4723 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4724 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 4725 WB_REASON_FOREIGN_FLUSH, 4726 &frn->done); 4727 } 4728 } 4729 } 4730 4731 #else /* CONFIG_CGROUP_WRITEBACK */ 4732 4733 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4734 { 4735 return 0; 4736 } 4737 4738 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4739 { 4740 } 4741 4742 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4743 { 4744 } 4745 4746 #endif /* CONFIG_CGROUP_WRITEBACK */ 4747 4748 /* 4749 * DO NOT USE IN NEW FILES. 4750 * 4751 * "cgroup.event_control" implementation. 4752 * 4753 * This is way over-engineered. It tries to support fully configurable 4754 * events for each user. Such level of flexibility is completely 4755 * unnecessary especially in the light of the planned unified hierarchy. 4756 * 4757 * Please deprecate this and replace with something simpler if at all 4758 * possible. 4759 */ 4760 4761 /* 4762 * Unregister event and free resources. 4763 * 4764 * Gets called from workqueue. 4765 */ 4766 static void memcg_event_remove(struct work_struct *work) 4767 { 4768 struct mem_cgroup_event *event = 4769 container_of(work, struct mem_cgroup_event, remove); 4770 struct mem_cgroup *memcg = event->memcg; 4771 4772 remove_wait_queue(event->wqh, &event->wait); 4773 4774 event->unregister_event(memcg, event->eventfd); 4775 4776 /* Notify userspace the event is going away. */ 4777 eventfd_signal(event->eventfd, 1); 4778 4779 eventfd_ctx_put(event->eventfd); 4780 kfree(event); 4781 css_put(&memcg->css); 4782 } 4783 4784 /* 4785 * Gets called on EPOLLHUP on eventfd when user closes it. 4786 * 4787 * Called with wqh->lock held and interrupts disabled. 4788 */ 4789 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4790 int sync, void *key) 4791 { 4792 struct mem_cgroup_event *event = 4793 container_of(wait, struct mem_cgroup_event, wait); 4794 struct mem_cgroup *memcg = event->memcg; 4795 __poll_t flags = key_to_poll(key); 4796 4797 if (flags & EPOLLHUP) { 4798 /* 4799 * If the event has been detached at cgroup removal, we 4800 * can simply return knowing the other side will cleanup 4801 * for us. 4802 * 4803 * We can't race against event freeing since the other 4804 * side will require wqh->lock via remove_wait_queue(), 4805 * which we hold. 4806 */ 4807 spin_lock(&memcg->event_list_lock); 4808 if (!list_empty(&event->list)) { 4809 list_del_init(&event->list); 4810 /* 4811 * We are in atomic context, but cgroup_event_remove() 4812 * may sleep, so we have to call it in workqueue. 4813 */ 4814 schedule_work(&event->remove); 4815 } 4816 spin_unlock(&memcg->event_list_lock); 4817 } 4818 4819 return 0; 4820 } 4821 4822 static void memcg_event_ptable_queue_proc(struct file *file, 4823 wait_queue_head_t *wqh, poll_table *pt) 4824 { 4825 struct mem_cgroup_event *event = 4826 container_of(pt, struct mem_cgroup_event, pt); 4827 4828 event->wqh = wqh; 4829 add_wait_queue(wqh, &event->wait); 4830 } 4831 4832 /* 4833 * DO NOT USE IN NEW FILES. 4834 * 4835 * Parse input and register new cgroup event handler. 4836 * 4837 * Input must be in format '<event_fd> <control_fd> <args>'. 4838 * Interpretation of args is defined by control file implementation. 
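 *
 * For illustration only, a userspace client could arm a 64M usage threshold
 * on memory.usage_in_bytes roughly as follows; the cgroup path is an
 * assumption and error handling is omitted:
 *
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/eventfd.h>
 *
 *	int main(void)
 *	{
 *		int efd = eventfd(0, 0);
 *		int usage_fd = open("/sys/fs/cgroup/memory/grp/memory.usage_in_bytes",
 *				    O_RDONLY);
 *		int ctrl_fd = open("/sys/fs/cgroup/memory/grp/cgroup.event_control",
 *				   O_WRONLY);
 *		char cmd[64];
 *		uint64_t ticks;
 *
 *		snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, usage_fd, 64ULL << 20);
 *		write(ctrl_fd, cmd, strlen(cmd));
 *		read(efd, &ticks, sizeof(ticks));
 *		return 0;
 *	}
 *
 * The final read() blocks until the registered threshold is crossed.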
4839 */ 4840 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4841 char *buf, size_t nbytes, loff_t off) 4842 { 4843 struct cgroup_subsys_state *css = of_css(of); 4844 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4845 struct mem_cgroup_event *event; 4846 struct cgroup_subsys_state *cfile_css; 4847 unsigned int efd, cfd; 4848 struct fd efile; 4849 struct fd cfile; 4850 struct dentry *cdentry; 4851 const char *name; 4852 char *endp; 4853 int ret; 4854 4855 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 4856 return -EOPNOTSUPP; 4857 4858 buf = strstrip(buf); 4859 4860 efd = simple_strtoul(buf, &endp, 10); 4861 if (*endp != ' ') 4862 return -EINVAL; 4863 buf = endp + 1; 4864 4865 cfd = simple_strtoul(buf, &endp, 10); 4866 if ((*endp != ' ') && (*endp != '\0')) 4867 return -EINVAL; 4868 buf = endp + 1; 4869 4870 event = kzalloc(sizeof(*event), GFP_KERNEL); 4871 if (!event) 4872 return -ENOMEM; 4873 4874 event->memcg = memcg; 4875 INIT_LIST_HEAD(&event->list); 4876 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4877 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4878 INIT_WORK(&event->remove, memcg_event_remove); 4879 4880 efile = fdget(efd); 4881 if (!efile.file) { 4882 ret = -EBADF; 4883 goto out_kfree; 4884 } 4885 4886 event->eventfd = eventfd_ctx_fileget(efile.file); 4887 if (IS_ERR(event->eventfd)) { 4888 ret = PTR_ERR(event->eventfd); 4889 goto out_put_efile; 4890 } 4891 4892 cfile = fdget(cfd); 4893 if (!cfile.file) { 4894 ret = -EBADF; 4895 goto out_put_eventfd; 4896 } 4897 4898 /* the process need read permission on control file */ 4899 /* AV: shouldn't we check that it's been opened for read instead? */ 4900 ret = file_permission(cfile.file, MAY_READ); 4901 if (ret < 0) 4902 goto out_put_cfile; 4903 4904 /* 4905 * The control file must be a regular cgroup1 file. As a regular cgroup 4906 * file can't be renamed, it's safe to access its name afterwards. 4907 */ 4908 cdentry = cfile.file->f_path.dentry; 4909 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 4910 ret = -EINVAL; 4911 goto out_put_cfile; 4912 } 4913 4914 /* 4915 * Determine the event callbacks and set them in @event. This used 4916 * to be done via struct cftype but cgroup core no longer knows 4917 * about these events. The following is crude but the whole thing 4918 * is for compatibility anyway. 4919 * 4920 * DO NOT ADD NEW FILES. 4921 */ 4922 name = cdentry->d_name.name; 4923 4924 if (!strcmp(name, "memory.usage_in_bytes")) { 4925 event->register_event = mem_cgroup_usage_register_event; 4926 event->unregister_event = mem_cgroup_usage_unregister_event; 4927 } else if (!strcmp(name, "memory.oom_control")) { 4928 event->register_event = mem_cgroup_oom_register_event; 4929 event->unregister_event = mem_cgroup_oom_unregister_event; 4930 } else if (!strcmp(name, "memory.pressure_level")) { 4931 event->register_event = vmpressure_register_event; 4932 event->unregister_event = vmpressure_unregister_event; 4933 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4934 event->register_event = memsw_cgroup_usage_register_event; 4935 event->unregister_event = memsw_cgroup_usage_unregister_event; 4936 } else { 4937 ret = -EINVAL; 4938 goto out_put_cfile; 4939 } 4940 4941 /* 4942 * Verify @cfile should belong to @css. Also, remaining events are 4943 * automatically removed on cgroup destruction but the removal is 4944 * asynchronous, so take an extra ref on @css. 
4945 */ 4946 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 4947 &memory_cgrp_subsys); 4948 ret = -EINVAL; 4949 if (IS_ERR(cfile_css)) 4950 goto out_put_cfile; 4951 if (cfile_css != css) { 4952 css_put(cfile_css); 4953 goto out_put_cfile; 4954 } 4955 4956 ret = event->register_event(memcg, event->eventfd, buf); 4957 if (ret) 4958 goto out_put_css; 4959 4960 vfs_poll(efile.file, &event->pt); 4961 4962 spin_lock_irq(&memcg->event_list_lock); 4963 list_add(&event->list, &memcg->event_list); 4964 spin_unlock_irq(&memcg->event_list_lock); 4965 4966 fdput(cfile); 4967 fdput(efile); 4968 4969 return nbytes; 4970 4971 out_put_css: 4972 css_put(css); 4973 out_put_cfile: 4974 fdput(cfile); 4975 out_put_eventfd: 4976 eventfd_ctx_put(event->eventfd); 4977 out_put_efile: 4978 fdput(efile); 4979 out_kfree: 4980 kfree(event); 4981 4982 return ret; 4983 } 4984 4985 #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 4986 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 4987 { 4988 /* 4989 * Deprecated. 4990 * Please, take a look at tools/cgroup/memcg_slabinfo.py . 4991 */ 4992 return 0; 4993 } 4994 #endif 4995 4996 static struct cftype mem_cgroup_legacy_files[] = { 4997 { 4998 .name = "usage_in_bytes", 4999 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5000 .read_u64 = mem_cgroup_read_u64, 5001 }, 5002 { 5003 .name = "max_usage_in_bytes", 5004 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5005 .write = mem_cgroup_reset, 5006 .read_u64 = mem_cgroup_read_u64, 5007 }, 5008 { 5009 .name = "limit_in_bytes", 5010 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5011 .write = mem_cgroup_write, 5012 .read_u64 = mem_cgroup_read_u64, 5013 }, 5014 { 5015 .name = "soft_limit_in_bytes", 5016 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5017 .write = mem_cgroup_write, 5018 .read_u64 = mem_cgroup_read_u64, 5019 }, 5020 { 5021 .name = "failcnt", 5022 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5023 .write = mem_cgroup_reset, 5024 .read_u64 = mem_cgroup_read_u64, 5025 }, 5026 { 5027 .name = "stat", 5028 .seq_show = memcg_stat_show, 5029 }, 5030 { 5031 .name = "force_empty", 5032 .write = mem_cgroup_force_empty_write, 5033 }, 5034 { 5035 .name = "use_hierarchy", 5036 .write_u64 = mem_cgroup_hierarchy_write, 5037 .read_u64 = mem_cgroup_hierarchy_read, 5038 }, 5039 { 5040 .name = "cgroup.event_control", /* XXX: for compat */ 5041 .write = memcg_write_event_control, 5042 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 5043 }, 5044 { 5045 .name = "swappiness", 5046 .read_u64 = mem_cgroup_swappiness_read, 5047 .write_u64 = mem_cgroup_swappiness_write, 5048 }, 5049 { 5050 .name = "move_charge_at_immigrate", 5051 .read_u64 = mem_cgroup_move_charge_read, 5052 .write_u64 = mem_cgroup_move_charge_write, 5053 }, 5054 { 5055 .name = "oom_control", 5056 .seq_show = mem_cgroup_oom_control_read, 5057 .write_u64 = mem_cgroup_oom_control_write, 5058 }, 5059 { 5060 .name = "pressure_level", 5061 }, 5062 #ifdef CONFIG_NUMA 5063 { 5064 .name = "numa_stat", 5065 .seq_show = memcg_numa_stat_show, 5066 }, 5067 #endif 5068 { 5069 .name = "kmem.limit_in_bytes", 5070 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5071 .write = mem_cgroup_write, 5072 .read_u64 = mem_cgroup_read_u64, 5073 }, 5074 { 5075 .name = "kmem.usage_in_bytes", 5076 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5077 .read_u64 = mem_cgroup_read_u64, 5078 }, 5079 { 5080 .name = "kmem.failcnt", 5081 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5082 .write = mem_cgroup_reset, 5083 .read_u64 = mem_cgroup_read_u64, 
5084 }, 5085 { 5086 .name = "kmem.max_usage_in_bytes", 5087 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5088 .write = mem_cgroup_reset, 5089 .read_u64 = mem_cgroup_read_u64, 5090 }, 5091 #if defined(CONFIG_MEMCG_KMEM) && \ 5092 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 5093 { 5094 .name = "kmem.slabinfo", 5095 .seq_show = mem_cgroup_slab_show, 5096 }, 5097 #endif 5098 { 5099 .name = "kmem.tcp.limit_in_bytes", 5100 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 5101 .write = mem_cgroup_write, 5102 .read_u64 = mem_cgroup_read_u64, 5103 }, 5104 { 5105 .name = "kmem.tcp.usage_in_bytes", 5106 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 5107 .read_u64 = mem_cgroup_read_u64, 5108 }, 5109 { 5110 .name = "kmem.tcp.failcnt", 5111 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 5112 .write = mem_cgroup_reset, 5113 .read_u64 = mem_cgroup_read_u64, 5114 }, 5115 { 5116 .name = "kmem.tcp.max_usage_in_bytes", 5117 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 5118 .write = mem_cgroup_reset, 5119 .read_u64 = mem_cgroup_read_u64, 5120 }, 5121 { }, /* terminate */ 5122 }; 5123 5124 /* 5125 * Private memory cgroup IDR 5126 * 5127 * Swap-out records and page cache shadow entries need to store memcg 5128 * references in constrained space, so we maintain an ID space that is 5129 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 5130 * memory-controlled cgroups to 64k. 5131 * 5132 * However, there usually are many references to the offline CSS after 5133 * the cgroup has been destroyed, such as page cache or reclaimable 5134 * slab objects, that don't need to hang on to the ID. We want to keep 5135 * those dead CSS from occupying IDs, or we might quickly exhaust the 5136 * relatively small ID space and prevent the creation of new cgroups 5137 * even when there are much fewer than 64k cgroups - possibly none. 5138 * 5139 * Maintain a private 16-bit ID space for memcg, and allow the ID to 5140 * be freed and recycled when it's no longer needed, which is usually 5141 * when the CSS is offlined. 5142 * 5143 * The only exception to that are records of swapped out tmpfs/shmem 5144 * pages that need to be attributed to live ancestors on swapin. But 5145 * those references are manageable from userspace. 5146 */ 5147 5148 static DEFINE_IDR(mem_cgroup_idr); 5149 5150 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 5151 { 5152 if (memcg->id.id > 0) { 5153 idr_remove(&mem_cgroup_idr, memcg->id.id); 5154 memcg->id.id = 0; 5155 } 5156 } 5157 5158 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 5159 unsigned int n) 5160 { 5161 refcount_add(n, &memcg->id.ref); 5162 } 5163 5164 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 5165 { 5166 if (refcount_sub_and_test(n, &memcg->id.ref)) { 5167 mem_cgroup_id_remove(memcg); 5168 5169 /* Memcg ID pins CSS */ 5170 css_put(&memcg->css); 5171 } 5172 } 5173 5174 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 5175 { 5176 mem_cgroup_id_put_many(memcg, 1); 5177 } 5178 5179 /** 5180 * mem_cgroup_from_id - look up a memcg from a memcg id 5181 * @id: the memcg id to look up 5182 * 5183 * Caller must hold rcu_read_lock(). 
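 *
 * A minimal sketch of the expected calling pattern (illustrative only; a
 * caller that wants to keep using the memcg after dropping the RCU read
 * lock must also pin it, e.g. with css_tryget()):
 *
 *	struct mem_cgroup *memcg;
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();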
5184 */ 5185 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 5186 { 5187 WARN_ON_ONCE(!rcu_read_lock_held()); 5188 return idr_find(&mem_cgroup_idr, id); 5189 } 5190 5191 #ifdef CONFIG_SHRINKER_DEBUG 5192 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) 5193 { 5194 struct cgroup *cgrp; 5195 struct cgroup_subsys_state *css; 5196 struct mem_cgroup *memcg; 5197 5198 cgrp = cgroup_get_from_id(ino); 5199 if (IS_ERR(cgrp)) 5200 return ERR_CAST(cgrp); 5201 5202 css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); 5203 if (css) 5204 memcg = container_of(css, struct mem_cgroup, css); 5205 else 5206 memcg = ERR_PTR(-ENOENT); 5207 5208 cgroup_put(cgrp); 5209 5210 return memcg; 5211 } 5212 #endif 5213 5214 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5215 { 5216 struct mem_cgroup_per_node *pn; 5217 5218 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); 5219 if (!pn) 5220 return 1; 5221 5222 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, 5223 GFP_KERNEL_ACCOUNT); 5224 if (!pn->lruvec_stats_percpu) { 5225 kfree(pn); 5226 return 1; 5227 } 5228 5229 lruvec_init(&pn->lruvec); 5230 pn->memcg = memcg; 5231 5232 memcg->nodeinfo[node] = pn; 5233 return 0; 5234 } 5235 5236 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5237 { 5238 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 5239 5240 if (!pn) 5241 return; 5242 5243 free_percpu(pn->lruvec_stats_percpu); 5244 kfree(pn); 5245 } 5246 5247 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5248 { 5249 int node; 5250 5251 for_each_node(node) 5252 free_mem_cgroup_per_node_info(memcg, node); 5253 kfree(memcg->vmstats); 5254 free_percpu(memcg->vmstats_percpu); 5255 kfree(memcg); 5256 } 5257 5258 static void mem_cgroup_free(struct mem_cgroup *memcg) 5259 { 5260 lru_gen_exit_memcg(memcg); 5261 memcg_wb_domain_exit(memcg); 5262 __mem_cgroup_free(memcg); 5263 } 5264 5265 static struct mem_cgroup *mem_cgroup_alloc(void) 5266 { 5267 struct mem_cgroup *memcg; 5268 int node; 5269 int __maybe_unused i; 5270 long error = -ENOMEM; 5271 5272 memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); 5273 if (!memcg) 5274 return ERR_PTR(error); 5275 5276 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 5277 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); 5278 if (memcg->id.id < 0) { 5279 error = memcg->id.id; 5280 goto fail; 5281 } 5282 5283 memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); 5284 if (!memcg->vmstats) 5285 goto fail; 5286 5287 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5288 GFP_KERNEL_ACCOUNT); 5289 if (!memcg->vmstats_percpu) 5290 goto fail; 5291 5292 for_each_node(node) 5293 if (alloc_mem_cgroup_per_node_info(memcg, node)) 5294 goto fail; 5295 5296 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 5297 goto fail; 5298 5299 INIT_WORK(&memcg->high_work, high_work_func); 5300 INIT_LIST_HEAD(&memcg->oom_notify); 5301 mutex_init(&memcg->thresholds_lock); 5302 spin_lock_init(&memcg->move_lock); 5303 vmpressure_init(&memcg->vmpressure); 5304 INIT_LIST_HEAD(&memcg->event_list); 5305 spin_lock_init(&memcg->event_list_lock); 5306 memcg->socket_pressure = jiffies; 5307 #ifdef CONFIG_MEMCG_KMEM 5308 memcg->kmemcg_id = -1; 5309 INIT_LIST_HEAD(&memcg->objcg_list); 5310 #endif 5311 #ifdef CONFIG_CGROUP_WRITEBACK 5312 INIT_LIST_HEAD(&memcg->cgwb_list); 5313 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5314 memcg->cgwb_frn[i].done = 5315 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5316 #endif 5317 #ifdef 
CONFIG_TRANSPARENT_HUGEPAGE 5318 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 5319 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 5320 memcg->deferred_split_queue.split_queue_len = 0; 5321 #endif 5322 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5323 lru_gen_init_memcg(memcg); 5324 return memcg; 5325 fail: 5326 mem_cgroup_id_remove(memcg); 5327 __mem_cgroup_free(memcg); 5328 return ERR_PTR(error); 5329 } 5330 5331 static struct cgroup_subsys_state * __ref 5332 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5333 { 5334 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5335 struct mem_cgroup *memcg, *old_memcg; 5336 5337 old_memcg = set_active_memcg(parent); 5338 memcg = mem_cgroup_alloc(); 5339 set_active_memcg(old_memcg); 5340 if (IS_ERR(memcg)) 5341 return ERR_CAST(memcg); 5342 5343 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5344 memcg->soft_limit = PAGE_COUNTER_MAX; 5345 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 5346 memcg->zswap_max = PAGE_COUNTER_MAX; 5347 #endif 5348 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5349 if (parent) { 5350 memcg->swappiness = mem_cgroup_swappiness(parent); 5351 memcg->oom_kill_disable = parent->oom_kill_disable; 5352 5353 page_counter_init(&memcg->memory, &parent->memory); 5354 page_counter_init(&memcg->swap, &parent->swap); 5355 page_counter_init(&memcg->kmem, &parent->kmem); 5356 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 5357 } else { 5358 init_memcg_events(); 5359 page_counter_init(&memcg->memory, NULL); 5360 page_counter_init(&memcg->swap, NULL); 5361 page_counter_init(&memcg->kmem, NULL); 5362 page_counter_init(&memcg->tcpmem, NULL); 5363 5364 root_mem_cgroup = memcg; 5365 return &memcg->css; 5366 } 5367 5368 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5369 static_branch_inc(&memcg_sockets_enabled_key); 5370 5371 return &memcg->css; 5372 } 5373 5374 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 5375 { 5376 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5377 5378 if (memcg_online_kmem(memcg)) 5379 goto remove_id; 5380 5381 /* 5382 * A memcg must be visible for expand_shrinker_info() 5383 * by the time the maps are allocated. So, we allocate maps 5384 * here, when for_each_mem_cgroup() can't skip it. 5385 */ 5386 if (alloc_shrinker_info(memcg)) 5387 goto offline_kmem; 5388 5389 /* Online state pins memcg ID, memcg ID pins CSS */ 5390 refcount_set(&memcg->id.ref, 1); 5391 css_get(css); 5392 5393 if (unlikely(mem_cgroup_is_root(memcg))) 5394 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 5395 2UL*HZ); 5396 lru_gen_online_memcg(memcg); 5397 return 0; 5398 offline_kmem: 5399 memcg_offline_kmem(memcg); 5400 remove_id: 5401 mem_cgroup_id_remove(memcg); 5402 return -ENOMEM; 5403 } 5404 5405 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5406 { 5407 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5408 struct mem_cgroup_event *event, *tmp; 5409 5410 /* 5411 * Unregister events and notify userspace. 5412 * Notify userspace about cgroup removing only after rmdir of cgroup 5413 * directory to avoid race between userspace and kernelspace. 
5414 */ 5415 spin_lock_irq(&memcg->event_list_lock); 5416 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5417 list_del_init(&event->list); 5418 schedule_work(&event->remove); 5419 } 5420 spin_unlock_irq(&memcg->event_list_lock); 5421 5422 page_counter_set_min(&memcg->memory, 0); 5423 page_counter_set_low(&memcg->memory, 0); 5424 5425 memcg_offline_kmem(memcg); 5426 reparent_shrinker_deferred(memcg); 5427 wb_memcg_offline(memcg); 5428 lru_gen_offline_memcg(memcg); 5429 5430 drain_all_stock(memcg); 5431 5432 mem_cgroup_id_put(memcg); 5433 } 5434 5435 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5436 { 5437 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5438 5439 invalidate_reclaim_iterators(memcg); 5440 lru_gen_release_memcg(memcg); 5441 } 5442 5443 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5444 { 5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5446 int __maybe_unused i; 5447 5448 #ifdef CONFIG_CGROUP_WRITEBACK 5449 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5450 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5451 #endif 5452 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5453 static_branch_dec(&memcg_sockets_enabled_key); 5454 5455 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5456 static_branch_dec(&memcg_sockets_enabled_key); 5457 5458 vmpressure_cleanup(&memcg->vmpressure); 5459 cancel_work_sync(&memcg->high_work); 5460 mem_cgroup_remove_from_trees(memcg); 5461 free_shrinker_info(memcg); 5462 mem_cgroup_free(memcg); 5463 } 5464 5465 /** 5466 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5467 * @css: the target css 5468 * 5469 * Reset the states of the mem_cgroup associated with @css. This is 5470 * invoked when the userland requests disabling on the default hierarchy 5471 * but the memcg is pinned through dependency. The memcg should stop 5472 * applying policies and should revert to the vanilla state as it may be 5473 * made visible again. 5474 * 5475 * The current implementation only resets the essential configurations. 5476 * This needs to be expanded to cover all the visible parts. 5477 */ 5478 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5479 { 5480 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5481 5482 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5483 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5484 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5485 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5486 page_counter_set_min(&memcg->memory, 0); 5487 page_counter_set_low(&memcg->memory, 0); 5488 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5489 memcg->soft_limit = PAGE_COUNTER_MAX; 5490 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5491 memcg_wb_domain_size_changed(memcg); 5492 } 5493 5494 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) 5495 { 5496 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5497 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5498 struct memcg_vmstats_percpu *statc; 5499 long delta, v; 5500 int i, nid; 5501 5502 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); 5503 5504 for (i = 0; i < MEMCG_NR_STAT; i++) { 5505 /* 5506 * Collect the aggregated propagation counts of groups 5507 * below us. We're in a per-cpu loop here and this is 5508 * a global counter, so the first cycle will get them. 
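		 *
		 * Worked example with made-up numbers: if the children left
		 * state_pending[i] == 3 for us and this CPU's counter moved
		 * from 10 to 14 since the previous flush, delta ends up
		 * 3 + 4 = 7; it is added to our own vmstats->state[i] and
		 * queued in the parent's state_pending[i] for the parent's
		 * own flush.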
5509 */ 5510 delta = memcg->vmstats->state_pending[i]; 5511 if (delta) 5512 memcg->vmstats->state_pending[i] = 0; 5513 5514 /* Add CPU changes on this level since the last flush */ 5515 v = READ_ONCE(statc->state[i]); 5516 if (v != statc->state_prev[i]) { 5517 delta += v - statc->state_prev[i]; 5518 statc->state_prev[i] = v; 5519 } 5520 5521 if (!delta) 5522 continue; 5523 5524 /* Aggregate counts on this level and propagate upwards */ 5525 memcg->vmstats->state[i] += delta; 5526 if (parent) 5527 parent->vmstats->state_pending[i] += delta; 5528 } 5529 5530 for (i = 0; i < NR_MEMCG_EVENTS; i++) { 5531 delta = memcg->vmstats->events_pending[i]; 5532 if (delta) 5533 memcg->vmstats->events_pending[i] = 0; 5534 5535 v = READ_ONCE(statc->events[i]); 5536 if (v != statc->events_prev[i]) { 5537 delta += v - statc->events_prev[i]; 5538 statc->events_prev[i] = v; 5539 } 5540 5541 if (!delta) 5542 continue; 5543 5544 memcg->vmstats->events[i] += delta; 5545 if (parent) 5546 parent->vmstats->events_pending[i] += delta; 5547 } 5548 5549 for_each_node_state(nid, N_MEMORY) { 5550 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; 5551 struct mem_cgroup_per_node *ppn = NULL; 5552 struct lruvec_stats_percpu *lstatc; 5553 5554 if (parent) 5555 ppn = parent->nodeinfo[nid]; 5556 5557 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); 5558 5559 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { 5560 delta = pn->lruvec_stats.state_pending[i]; 5561 if (delta) 5562 pn->lruvec_stats.state_pending[i] = 0; 5563 5564 v = READ_ONCE(lstatc->state[i]); 5565 if (v != lstatc->state_prev[i]) { 5566 delta += v - lstatc->state_prev[i]; 5567 lstatc->state_prev[i] = v; 5568 } 5569 5570 if (!delta) 5571 continue; 5572 5573 pn->lruvec_stats.state[i] += delta; 5574 if (ppn) 5575 ppn->lruvec_stats.state_pending[i] += delta; 5576 } 5577 } 5578 } 5579 5580 #ifdef CONFIG_MMU 5581 /* Handlers for move charge at task migration. 
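 *
 * Rough flow, sketching the machinery that follows: the required number of
 * pages is precharged to the destination memcg via mem_cgroup_do_precharge(),
 * the task's page tables are walked and matching pages are moved one by one
 * with mem_cgroup_move_account(), and whatever precharge is left over is
 * dropped when the move finishes.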
*/ 5582 static int mem_cgroup_do_precharge(unsigned long count) 5583 { 5584 int ret; 5585 5586 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5587 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5588 if (!ret) { 5589 mc.precharge += count; 5590 return ret; 5591 } 5592 5593 /* Try charges one by one with reclaim, but do not retry */ 5594 while (count--) { 5595 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5596 if (ret) 5597 return ret; 5598 mc.precharge++; 5599 cond_resched(); 5600 } 5601 return 0; 5602 } 5603 5604 union mc_target { 5605 struct page *page; 5606 swp_entry_t ent; 5607 }; 5608 5609 enum mc_target_type { 5610 MC_TARGET_NONE = 0, 5611 MC_TARGET_PAGE, 5612 MC_TARGET_SWAP, 5613 MC_TARGET_DEVICE, 5614 }; 5615 5616 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5617 unsigned long addr, pte_t ptent) 5618 { 5619 struct page *page = vm_normal_page(vma, addr, ptent); 5620 5621 if (!page || !page_mapped(page)) 5622 return NULL; 5623 if (PageAnon(page)) { 5624 if (!(mc.flags & MOVE_ANON)) 5625 return NULL; 5626 } else { 5627 if (!(mc.flags & MOVE_FILE)) 5628 return NULL; 5629 } 5630 if (!get_page_unless_zero(page)) 5631 return NULL; 5632 5633 return page; 5634 } 5635 5636 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5637 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5638 pte_t ptent, swp_entry_t *entry) 5639 { 5640 struct page *page = NULL; 5641 swp_entry_t ent = pte_to_swp_entry(ptent); 5642 5643 if (!(mc.flags & MOVE_ANON)) 5644 return NULL; 5645 5646 /* 5647 * Handle device private pages that are not accessible by the CPU, but 5648 * stored as special swap entries in the page table. 5649 */ 5650 if (is_device_private_entry(ent)) { 5651 page = pfn_swap_entry_to_page(ent); 5652 if (!get_page_unless_zero(page)) 5653 return NULL; 5654 return page; 5655 } 5656 5657 if (non_swap_entry(ent)) 5658 return NULL; 5659 5660 /* 5661 * Because swap_cache_get_folio() updates some statistics counter, 5662 * we call find_get_page() with swapper_space directly. 5663 */ 5664 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5665 entry->val = ent.val; 5666 5667 return page; 5668 } 5669 #else 5670 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5671 pte_t ptent, swp_entry_t *entry) 5672 { 5673 return NULL; 5674 } 5675 #endif 5676 5677 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5678 unsigned long addr, pte_t ptent) 5679 { 5680 unsigned long index; 5681 struct folio *folio; 5682 5683 if (!vma->vm_file) /* anonymous vma */ 5684 return NULL; 5685 if (!(mc.flags & MOVE_FILE)) 5686 return NULL; 5687 5688 /* folio is moved even if it's not RSS of this task(page-faulted). */ 5689 /* shmem/tmpfs may report page out on swap: account for that too. */ 5690 index = linear_page_index(vma, addr); 5691 folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); 5692 if (!folio) 5693 return NULL; 5694 return folio_file_page(folio, index); 5695 } 5696 5697 /** 5698 * mem_cgroup_move_account - move account of the page 5699 * @page: the page 5700 * @compound: charge the page as compound or small page 5701 * @from: mem_cgroup which the page is moved from. 5702 * @to: mem_cgroup which the page is moved to. @from != @to. 5703 * 5704 * The page must be locked and not on the LRU. 5705 * 5706 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5707 * from old cgroup. 
5708 */ 5709 static int mem_cgroup_move_account(struct page *page, 5710 bool compound, 5711 struct mem_cgroup *from, 5712 struct mem_cgroup *to) 5713 { 5714 struct folio *folio = page_folio(page); 5715 struct lruvec *from_vec, *to_vec; 5716 struct pglist_data *pgdat; 5717 unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; 5718 int nid, ret; 5719 5720 VM_BUG_ON(from == to); 5721 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 5722 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 5723 VM_BUG_ON(compound && !folio_test_large(folio)); 5724 5725 ret = -EINVAL; 5726 if (folio_memcg(folio) != from) 5727 goto out; 5728 5729 pgdat = folio_pgdat(folio); 5730 from_vec = mem_cgroup_lruvec(from, pgdat); 5731 to_vec = mem_cgroup_lruvec(to, pgdat); 5732 5733 folio_memcg_lock(folio); 5734 5735 if (folio_test_anon(folio)) { 5736 if (folio_mapped(folio)) { 5737 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 5738 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 5739 if (folio_test_transhuge(folio)) { 5740 __mod_lruvec_state(from_vec, NR_ANON_THPS, 5741 -nr_pages); 5742 __mod_lruvec_state(to_vec, NR_ANON_THPS, 5743 nr_pages); 5744 } 5745 } 5746 } else { 5747 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 5748 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 5749 5750 if (folio_test_swapbacked(folio)) { 5751 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 5752 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 5753 } 5754 5755 if (folio_mapped(folio)) { 5756 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 5757 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 5758 } 5759 5760 if (folio_test_dirty(folio)) { 5761 struct address_space *mapping = folio_mapping(folio); 5762 5763 if (mapping_can_writeback(mapping)) { 5764 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 5765 -nr_pages); 5766 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 5767 nr_pages); 5768 } 5769 } 5770 } 5771 5772 #ifdef CONFIG_SWAP 5773 if (folio_test_swapcache(folio)) { 5774 __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); 5775 __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); 5776 } 5777 #endif 5778 if (folio_test_writeback(folio)) { 5779 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 5780 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 5781 } 5782 5783 /* 5784 * All state has been migrated, let's switch to the new memcg. 5785 * 5786 * It is safe to change page's memcg here because the page 5787 * is referenced, charged, isolated, and locked: we can't race 5788 * with (un)charging, migration, LRU putback, or anything else 5789 * that would rely on a stable page's memory cgroup. 5790 * 5791 * Note that lock_page_memcg is a memcg lock, not a page lock, 5792 * to save space. As soon as we switch page's memory cgroup to a 5793 * new memcg that isn't locked, the above state can change 5794 * concurrently again. Make sure we're truly done with it. 
	 */
	smp_mb();

	css_get(&to->css);
	css_put(&from->css);

	folio->memcg_data = (unsigned long)to;

	__folio_memcg_unlock(from);

	ret = 0;
	nid = folio_nid(folio);

	local_irq_disable();
	mem_cgroup_charge_statistics(to, nr_pages);
	memcg_check_events(to, nid);
	mem_cgroup_charge_statistics(from, -nr_pages);
	memcg_check_events(from, nid);
	local_irq_enable();
out:
	return ret;
}

/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer where the target page or swap entry will be stored
 *	(can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in target->page
 *     with an extra refcount taken (callers should handle it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is
 *     stored in target->ent.
 *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is device memory
 *     and thus not on the LRU. For now such a page is charged like a regular
 *     page would be, as for all intents and purposes it is just special
 *     memory taking the place of a regular page.
 *
 *     See Documentation/vm/hmm.txt and include/linux/hmm.h
 *
 * Called with pte lock held.
 */

static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;
	swp_entry_t ent = { .val = 0 };

	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (pte_none_mostly(ptent))
		/*
		 * PTE markers should be treated as a none pte here, separated
		 * from other swap handling below.
		 */
		page = mc_handle_file_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, ptent, &ent);

	if (target && page) {
		if (!trylock_page(page)) {
			put_page(page);
			return ret;
		}
		/*
		 * page_mapped() must be stable during the move. This
		 * pte is locked, so if it's present, the page cannot
		 * become unmapped. If it isn't, we have only partial
		 * control over the mapped state: the page lock will
		 * prevent new faults against pagecache and swapcache,
		 * so an unmapped page cannot become mapped. However,
		 * if the page is already mapped elsewhere, it can
		 * unmap, and there is nothing we can do about it.
		 * Alas, skip moving the page in this case.
		 */
		if (!pte_present(ptent) && page_mapped(page)) {
			unlock_page(page);
			put_page(page);
			return ret;
		}
	}

	if (!page && !ent.val)
		return ret;
	if (page) {
		/*
		 * Do only a loose check w/o serialization.
		 * mem_cgroup_move_account() checks the page is valid or
		 * not under LRU exclusion.
		 */
		if (page_memcg(page) == mc.from) {
			ret = MC_TARGET_PAGE;
			if (is_device_private_page(page) ||
			    is_device_coherent_page(page))
				ret = MC_TARGET_DEVICE;
			if (target)
				target->page = page;
		}
		if (!ret || !target) {
			if (target)
				unlock_page(page);
			put_page(page);
		}
	}
	/*
	 * There is a swap entry and the page doesn't exist or isn't charged.
	 * But we cannot move a tail page of a THP.
	 */
	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because THP does
 * not support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 */
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;

	if (unlikely(is_swap_pmd(pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
			  !is_pmd_migration_entry(pmd));
		return ret;
	}
	page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
	if (!(mc.flags & MOVE_ANON))
		return ret;
	if (page_memcg(page) == mc.from) {
		ret = MC_TARGET_PAGE;
		if (target) {
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				return MC_TARGET_NONE;
			}
			target->page = page;
		}
	}
	return ret;
}
#else
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	return MC_TARGET_NONE;
}
#endif

static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/*
		 * Note there cannot be MC_TARGET_DEVICE for now as we do not
		 * support transparent huge pages with MEMORY_DEVICE_PRIVATE,
		 * but this might change.
5976 */ 5977 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5978 mc.precharge += HPAGE_PMD_NR; 5979 spin_unlock(ptl); 5980 return 0; 5981 } 5982 5983 if (pmd_trans_unstable(pmd)) 5984 return 0; 5985 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5986 for (; addr != end; pte++, addr += PAGE_SIZE) 5987 if (get_mctgt_type(vma, addr, *pte, NULL)) 5988 mc.precharge++; /* increment precharge temporarily */ 5989 pte_unmap_unlock(pte - 1, ptl); 5990 cond_resched(); 5991 5992 return 0; 5993 } 5994 5995 static const struct mm_walk_ops precharge_walk_ops = { 5996 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5997 }; 5998 5999 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 6000 { 6001 unsigned long precharge; 6002 6003 mmap_read_lock(mm); 6004 walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); 6005 mmap_read_unlock(mm); 6006 6007 precharge = mc.precharge; 6008 mc.precharge = 0; 6009 6010 return precharge; 6011 } 6012 6013 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 6014 { 6015 unsigned long precharge = mem_cgroup_count_precharge(mm); 6016 6017 VM_BUG_ON(mc.moving_task); 6018 mc.moving_task = current; 6019 return mem_cgroup_do_precharge(precharge); 6020 } 6021 6022 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 6023 static void __mem_cgroup_clear_mc(void) 6024 { 6025 struct mem_cgroup *from = mc.from; 6026 struct mem_cgroup *to = mc.to; 6027 6028 /* we must uncharge all the leftover precharges from mc.to */ 6029 if (mc.precharge) { 6030 cancel_charge(mc.to, mc.precharge); 6031 mc.precharge = 0; 6032 } 6033 /* 6034 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 6035 * we must uncharge here. 6036 */ 6037 if (mc.moved_charge) { 6038 cancel_charge(mc.from, mc.moved_charge); 6039 mc.moved_charge = 0; 6040 } 6041 /* we must fixup refcnts and charges */ 6042 if (mc.moved_swap) { 6043 /* uncharge swap account from the old cgroup */ 6044 if (!mem_cgroup_is_root(mc.from)) 6045 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 6046 6047 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 6048 6049 /* 6050 * we charged both to->memory and to->memsw, so we 6051 * should uncharge to->memory. 6052 */ 6053 if (!mem_cgroup_is_root(mc.to)) 6054 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 6055 6056 mc.moved_swap = 0; 6057 } 6058 memcg_oom_recover(from); 6059 memcg_oom_recover(to); 6060 wake_up_all(&mc.waitq); 6061 } 6062 6063 static void mem_cgroup_clear_mc(void) 6064 { 6065 struct mm_struct *mm = mc.mm; 6066 6067 /* 6068 * we must clear moving_task before waking up waiters at the end of 6069 * task migration. 6070 */ 6071 mc.moving_task = NULL; 6072 __mem_cgroup_clear_mc(); 6073 spin_lock(&mc.lock); 6074 mc.from = NULL; 6075 mc.to = NULL; 6076 mc.mm = NULL; 6077 spin_unlock(&mc.lock); 6078 6079 mmput(mm); 6080 } 6081 6082 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6083 { 6084 struct cgroup_subsys_state *css; 6085 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 6086 struct mem_cgroup *from; 6087 struct task_struct *leader, *p; 6088 struct mm_struct *mm; 6089 unsigned long move_flags; 6090 int ret = 0; 6091 6092 /* charge immigration isn't supported on the default hierarchy */ 6093 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6094 return 0; 6095 6096 /* 6097 * Multi-process migrations only happen on the default hierarchy 6098 * where charge immigration is not used. 
Perform charge 6099 * immigration if @tset contains a leader and whine if there are 6100 * multiple. 6101 */ 6102 p = NULL; 6103 cgroup_taskset_for_each_leader(leader, css, tset) { 6104 WARN_ON_ONCE(p); 6105 p = leader; 6106 memcg = mem_cgroup_from_css(css); 6107 } 6108 if (!p) 6109 return 0; 6110 6111 /* 6112 * We are now committed to this value whatever it is. Changes in this 6113 * tunable will only affect upcoming migrations, not the current one. 6114 * So we need to save it, and keep it going. 6115 */ 6116 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 6117 if (!move_flags) 6118 return 0; 6119 6120 from = mem_cgroup_from_task(p); 6121 6122 VM_BUG_ON(from == memcg); 6123 6124 mm = get_task_mm(p); 6125 if (!mm) 6126 return 0; 6127 /* We move charges only when we move a owner of the mm */ 6128 if (mm->owner == p) { 6129 VM_BUG_ON(mc.from); 6130 VM_BUG_ON(mc.to); 6131 VM_BUG_ON(mc.precharge); 6132 VM_BUG_ON(mc.moved_charge); 6133 VM_BUG_ON(mc.moved_swap); 6134 6135 spin_lock(&mc.lock); 6136 mc.mm = mm; 6137 mc.from = from; 6138 mc.to = memcg; 6139 mc.flags = move_flags; 6140 spin_unlock(&mc.lock); 6141 /* We set mc.moving_task later */ 6142 6143 ret = mem_cgroup_precharge_mc(mm); 6144 if (ret) 6145 mem_cgroup_clear_mc(); 6146 } else { 6147 mmput(mm); 6148 } 6149 return ret; 6150 } 6151 6152 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6153 { 6154 if (mc.to) 6155 mem_cgroup_clear_mc(); 6156 } 6157 6158 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6159 unsigned long addr, unsigned long end, 6160 struct mm_walk *walk) 6161 { 6162 int ret = 0; 6163 struct vm_area_struct *vma = walk->vma; 6164 pte_t *pte; 6165 spinlock_t *ptl; 6166 enum mc_target_type target_type; 6167 union mc_target target; 6168 struct page *page; 6169 6170 ptl = pmd_trans_huge_lock(pmd, vma); 6171 if (ptl) { 6172 if (mc.precharge < HPAGE_PMD_NR) { 6173 spin_unlock(ptl); 6174 return 0; 6175 } 6176 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6177 if (target_type == MC_TARGET_PAGE) { 6178 page = target.page; 6179 if (!isolate_lru_page(page)) { 6180 if (!mem_cgroup_move_account(page, true, 6181 mc.from, mc.to)) { 6182 mc.precharge -= HPAGE_PMD_NR; 6183 mc.moved_charge += HPAGE_PMD_NR; 6184 } 6185 putback_lru_page(page); 6186 } 6187 unlock_page(page); 6188 put_page(page); 6189 } else if (target_type == MC_TARGET_DEVICE) { 6190 page = target.page; 6191 if (!mem_cgroup_move_account(page, true, 6192 mc.from, mc.to)) { 6193 mc.precharge -= HPAGE_PMD_NR; 6194 mc.moved_charge += HPAGE_PMD_NR; 6195 } 6196 unlock_page(page); 6197 put_page(page); 6198 } 6199 spin_unlock(ptl); 6200 return 0; 6201 } 6202 6203 if (pmd_trans_unstable(pmd)) 6204 return 0; 6205 retry: 6206 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6207 for (; addr != end; addr += PAGE_SIZE) { 6208 pte_t ptent = *(pte++); 6209 bool device = false; 6210 swp_entry_t ent; 6211 6212 if (!mc.precharge) 6213 break; 6214 6215 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6216 case MC_TARGET_DEVICE: 6217 device = true; 6218 fallthrough; 6219 case MC_TARGET_PAGE: 6220 page = target.page; 6221 /* 6222 * We can have a part of the split pmd here. Moving it 6223 * can be done but it would be too convoluted so simply 6224 * ignore such a partial THP and keep it in original 6225 * memcg. There should be somebody mapping the head. 
			 */
			if (PageTransCompound(page))
				goto put;
			if (!device && isolate_lru_page(page))
				goto put;
			if (!mem_cgroup_move_account(page, false,
						mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			if (!device)
				putback_lru_page(page);
put:			/* get_mctgt_type() gets & locks the page */
			unlock_page(page);
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				mem_cgroup_id_get_many(mc.to, 1);
				/* we fix up other refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charging one by one, but don't do any additional
		 * charges to mc.to if we have already failed a charge in the
		 * attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static const struct mm_walk_ops charge_walk_ops = {
	.pmd_entry = mem_cgroup_move_charge_pte_range,
};

static void mem_cgroup_move_charge(void)
{
	lru_add_drain_all();
	/*
	 * Signal lock_page_memcg() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();
retry:
	if (unlikely(!mmap_read_trylock(mc.mm))) {
		/*
		 * Someone who is holding the mmap_lock might be waiting in
		 * the waitq. So we cancel all extra charges, wake up all
		 * waiters, and retry. Because we cancel precharges, we might
		 * not be able to move enough charges, but moving charge is a
		 * best-effort feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	/*
	 * When we have consumed all precharges and failed to do an
	 * additional charge, the page walk just aborts.
6304 */ 6305 walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); 6306 mmap_read_unlock(mc.mm); 6307 atomic_dec(&mc.from->moving_account); 6308 } 6309 6310 static void mem_cgroup_move_task(void) 6311 { 6312 if (mc.to) { 6313 mem_cgroup_move_charge(); 6314 mem_cgroup_clear_mc(); 6315 } 6316 } 6317 #else /* !CONFIG_MMU */ 6318 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6319 { 6320 return 0; 6321 } 6322 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6323 { 6324 } 6325 static void mem_cgroup_move_task(void) 6326 { 6327 } 6328 #endif 6329 6330 #ifdef CONFIG_LRU_GEN 6331 static void mem_cgroup_attach(struct cgroup_taskset *tset) 6332 { 6333 struct task_struct *task; 6334 struct cgroup_subsys_state *css; 6335 6336 /* find the first leader if there is any */ 6337 cgroup_taskset_for_each_leader(task, css, tset) 6338 break; 6339 6340 if (!task) 6341 return; 6342 6343 task_lock(task); 6344 if (task->mm && READ_ONCE(task->mm->owner) == task) 6345 lru_gen_migrate_mm(task->mm); 6346 task_unlock(task); 6347 } 6348 #else 6349 static void mem_cgroup_attach(struct cgroup_taskset *tset) 6350 { 6351 } 6352 #endif /* CONFIG_LRU_GEN */ 6353 6354 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 6355 { 6356 if (value == PAGE_COUNTER_MAX) 6357 seq_puts(m, "max\n"); 6358 else 6359 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 6360 6361 return 0; 6362 } 6363 6364 static u64 memory_current_read(struct cgroup_subsys_state *css, 6365 struct cftype *cft) 6366 { 6367 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6368 6369 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 6370 } 6371 6372 static u64 memory_peak_read(struct cgroup_subsys_state *css, 6373 struct cftype *cft) 6374 { 6375 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6376 6377 return (u64)memcg->memory.watermark * PAGE_SIZE; 6378 } 6379 6380 static int memory_min_show(struct seq_file *m, void *v) 6381 { 6382 return seq_puts_memcg_tunable(m, 6383 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 6384 } 6385 6386 static ssize_t memory_min_write(struct kernfs_open_file *of, 6387 char *buf, size_t nbytes, loff_t off) 6388 { 6389 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6390 unsigned long min; 6391 int err; 6392 6393 buf = strstrip(buf); 6394 err = page_counter_memparse(buf, "max", &min); 6395 if (err) 6396 return err; 6397 6398 page_counter_set_min(&memcg->memory, min); 6399 6400 return nbytes; 6401 } 6402 6403 static int memory_low_show(struct seq_file *m, void *v) 6404 { 6405 return seq_puts_memcg_tunable(m, 6406 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 6407 } 6408 6409 static ssize_t memory_low_write(struct kernfs_open_file *of, 6410 char *buf, size_t nbytes, loff_t off) 6411 { 6412 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6413 unsigned long low; 6414 int err; 6415 6416 buf = strstrip(buf); 6417 err = page_counter_memparse(buf, "max", &low); 6418 if (err) 6419 return err; 6420 6421 page_counter_set_low(&memcg->memory, low); 6422 6423 return nbytes; 6424 } 6425 6426 static int memory_high_show(struct seq_file *m, void *v) 6427 { 6428 return seq_puts_memcg_tunable(m, 6429 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 6430 } 6431 6432 static ssize_t memory_high_write(struct kernfs_open_file *of, 6433 char *buf, size_t nbytes, loff_t off) 6434 { 6435 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6436 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6437 bool drained = false; 6438 unsigned long high; 6439 int 
err; 6440 6441 buf = strstrip(buf); 6442 err = page_counter_memparse(buf, "max", &high); 6443 if (err) 6444 return err; 6445 6446 page_counter_set_high(&memcg->memory, high); 6447 6448 for (;;) { 6449 unsigned long nr_pages = page_counter_read(&memcg->memory); 6450 unsigned long reclaimed; 6451 6452 if (nr_pages <= high) 6453 break; 6454 6455 if (signal_pending(current)) 6456 break; 6457 6458 if (!drained) { 6459 drain_all_stock(memcg); 6460 drained = true; 6461 continue; 6462 } 6463 6464 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 6465 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); 6466 6467 if (!reclaimed && !nr_retries--) 6468 break; 6469 } 6470 6471 memcg_wb_domain_size_changed(memcg); 6472 return nbytes; 6473 } 6474 6475 static int memory_max_show(struct seq_file *m, void *v) 6476 { 6477 return seq_puts_memcg_tunable(m, 6478 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 6479 } 6480 6481 static ssize_t memory_max_write(struct kernfs_open_file *of, 6482 char *buf, size_t nbytes, loff_t off) 6483 { 6484 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6485 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 6486 bool drained = false; 6487 unsigned long max; 6488 int err; 6489 6490 buf = strstrip(buf); 6491 err = page_counter_memparse(buf, "max", &max); 6492 if (err) 6493 return err; 6494 6495 xchg(&memcg->memory.max, max); 6496 6497 for (;;) { 6498 unsigned long nr_pages = page_counter_read(&memcg->memory); 6499 6500 if (nr_pages <= max) 6501 break; 6502 6503 if (signal_pending(current)) 6504 break; 6505 6506 if (!drained) { 6507 drain_all_stock(memcg); 6508 drained = true; 6509 continue; 6510 } 6511 6512 if (nr_reclaims) { 6513 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 6514 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) 6515 nr_reclaims--; 6516 continue; 6517 } 6518 6519 memcg_memory_event(memcg, MEMCG_OOM); 6520 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 6521 break; 6522 } 6523 6524 memcg_wb_domain_size_changed(memcg); 6525 return nbytes; 6526 } 6527 6528 static void __memory_events_show(struct seq_file *m, atomic_long_t *events) 6529 { 6530 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 6531 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 6532 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 6533 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 6534 seq_printf(m, "oom_kill %lu\n", 6535 atomic_long_read(&events[MEMCG_OOM_KILL])); 6536 seq_printf(m, "oom_group_kill %lu\n", 6537 atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); 6538 } 6539 6540 static int memory_events_show(struct seq_file *m, void *v) 6541 { 6542 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6543 6544 __memory_events_show(m, memcg->memory_events); 6545 return 0; 6546 } 6547 6548 static int memory_events_local_show(struct seq_file *m, void *v) 6549 { 6550 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6551 6552 __memory_events_show(m, memcg->memory_events_local); 6553 return 0; 6554 } 6555 6556 static int memory_stat_show(struct seq_file *m, void *v) 6557 { 6558 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6559 char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 6560 6561 if (!buf) 6562 return -ENOMEM; 6563 memory_stat_format(memcg, buf, PAGE_SIZE); 6564 seq_puts(m, buf); 6565 kfree(buf); 6566 return 0; 6567 } 6568 6569 #ifdef CONFIG_NUMA 6570 static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, 6571 int item) 6572 { 6573 return lruvec_page_state(lruvec, item) * 
memcg_page_state_unit(item); 6574 } 6575 6576 static int memory_numa_stat_show(struct seq_file *m, void *v) 6577 { 6578 int i; 6579 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6580 6581 mem_cgroup_flush_stats(); 6582 6583 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 6584 int nid; 6585 6586 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) 6587 continue; 6588 6589 seq_printf(m, "%s", memory_stats[i].name); 6590 for_each_node_state(nid, N_MEMORY) { 6591 u64 size; 6592 struct lruvec *lruvec; 6593 6594 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 6595 size = lruvec_page_state_output(lruvec, 6596 memory_stats[i].idx); 6597 seq_printf(m, " N%d=%llu", nid, size); 6598 } 6599 seq_putc(m, '\n'); 6600 } 6601 6602 return 0; 6603 } 6604 #endif 6605 6606 static int memory_oom_group_show(struct seq_file *m, void *v) 6607 { 6608 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6609 6610 seq_printf(m, "%d\n", memcg->oom_group); 6611 6612 return 0; 6613 } 6614 6615 static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 6616 char *buf, size_t nbytes, loff_t off) 6617 { 6618 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6619 int ret, oom_group; 6620 6621 buf = strstrip(buf); 6622 if (!buf) 6623 return -EINVAL; 6624 6625 ret = kstrtoint(buf, 0, &oom_group); 6626 if (ret) 6627 return ret; 6628 6629 if (oom_group != 0 && oom_group != 1) 6630 return -EINVAL; 6631 6632 memcg->oom_group = oom_group; 6633 6634 return nbytes; 6635 } 6636 6637 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, 6638 size_t nbytes, loff_t off) 6639 { 6640 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6641 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6642 unsigned long nr_to_reclaim, nr_reclaimed = 0; 6643 unsigned int reclaim_options; 6644 int err; 6645 6646 buf = strstrip(buf); 6647 err = page_counter_memparse(buf, "", &nr_to_reclaim); 6648 if (err) 6649 return err; 6650 6651 reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; 6652 while (nr_reclaimed < nr_to_reclaim) { 6653 unsigned long reclaimed; 6654 6655 if (signal_pending(current)) 6656 return -EINTR; 6657 6658 /* 6659 * This is the final attempt, drain percpu lru caches in the 6660 * hope of introducing more evictable pages for 6661 * try_to_free_mem_cgroup_pages(). 
6662 */ 6663 if (!nr_retries) 6664 lru_add_drain_all(); 6665 6666 reclaimed = try_to_free_mem_cgroup_pages(memcg, 6667 nr_to_reclaim - nr_reclaimed, 6668 GFP_KERNEL, reclaim_options); 6669 6670 if (!reclaimed && !nr_retries--) 6671 return -EAGAIN; 6672 6673 nr_reclaimed += reclaimed; 6674 } 6675 6676 return nbytes; 6677 } 6678 6679 static struct cftype memory_files[] = { 6680 { 6681 .name = "current", 6682 .flags = CFTYPE_NOT_ON_ROOT, 6683 .read_u64 = memory_current_read, 6684 }, 6685 { 6686 .name = "peak", 6687 .flags = CFTYPE_NOT_ON_ROOT, 6688 .read_u64 = memory_peak_read, 6689 }, 6690 { 6691 .name = "min", 6692 .flags = CFTYPE_NOT_ON_ROOT, 6693 .seq_show = memory_min_show, 6694 .write = memory_min_write, 6695 }, 6696 { 6697 .name = "low", 6698 .flags = CFTYPE_NOT_ON_ROOT, 6699 .seq_show = memory_low_show, 6700 .write = memory_low_write, 6701 }, 6702 { 6703 .name = "high", 6704 .flags = CFTYPE_NOT_ON_ROOT, 6705 .seq_show = memory_high_show, 6706 .write = memory_high_write, 6707 }, 6708 { 6709 .name = "max", 6710 .flags = CFTYPE_NOT_ON_ROOT, 6711 .seq_show = memory_max_show, 6712 .write = memory_max_write, 6713 }, 6714 { 6715 .name = "events", 6716 .flags = CFTYPE_NOT_ON_ROOT, 6717 .file_offset = offsetof(struct mem_cgroup, events_file), 6718 .seq_show = memory_events_show, 6719 }, 6720 { 6721 .name = "events.local", 6722 .flags = CFTYPE_NOT_ON_ROOT, 6723 .file_offset = offsetof(struct mem_cgroup, events_local_file), 6724 .seq_show = memory_events_local_show, 6725 }, 6726 { 6727 .name = "stat", 6728 .seq_show = memory_stat_show, 6729 }, 6730 #ifdef CONFIG_NUMA 6731 { 6732 .name = "numa_stat", 6733 .seq_show = memory_numa_stat_show, 6734 }, 6735 #endif 6736 { 6737 .name = "oom.group", 6738 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 6739 .seq_show = memory_oom_group_show, 6740 .write = memory_oom_group_write, 6741 }, 6742 { 6743 .name = "reclaim", 6744 .flags = CFTYPE_NS_DELEGATABLE, 6745 .write = memory_reclaim, 6746 }, 6747 { } /* terminate */ 6748 }; 6749 6750 struct cgroup_subsys memory_cgrp_subsys = { 6751 .css_alloc = mem_cgroup_css_alloc, 6752 .css_online = mem_cgroup_css_online, 6753 .css_offline = mem_cgroup_css_offline, 6754 .css_released = mem_cgroup_css_released, 6755 .css_free = mem_cgroup_css_free, 6756 .css_reset = mem_cgroup_css_reset, 6757 .css_rstat_flush = mem_cgroup_css_rstat_flush, 6758 .can_attach = mem_cgroup_can_attach, 6759 .attach = mem_cgroup_attach, 6760 .cancel_attach = mem_cgroup_cancel_attach, 6761 .post_attach = mem_cgroup_move_task, 6762 .dfl_cftypes = memory_files, 6763 .legacy_cftypes = mem_cgroup_legacy_files, 6764 .early_init = 0, 6765 }; 6766 6767 /* 6768 * This function calculates an individual cgroup's effective 6769 * protection which is derived from its own memory.min/low, its 6770 * parent's and siblings' settings, as well as the actual memory 6771 * distribution in the tree. 6772 * 6773 * The following rules apply to the effective protection values: 6774 * 6775 * 1. At the first level of reclaim, effective protection is equal to 6776 * the declared protection in memory.min and memory.low. 6777 * 6778 * 2. To enable safe delegation of the protection configuration, at 6779 * subsequent levels the effective protection is capped to the 6780 * parent's effective protection. 6781 * 6782 * 3. To make complex and dynamic subtrees easier to configure, the 6783 * user is allowed to overcommit the declared protection at a given 6784 * level. 
If that is the case, the parent's effective protection is 6785 * distributed to the children in proportion to how much protection 6786 * they have declared and how much of it they are utilizing. 6787 * 6788 * This makes distribution proportional, but also work-conserving: 6789 * if one cgroup claims much more protection than it uses memory, 6790 * the unused remainder is available to its siblings. 6791 * 6792 * 4. Conversely, when the declared protection is undercommitted at a 6793 * given level, the distribution of the larger parental protection 6794 * budget is NOT proportional. A cgroup's protection from a sibling 6795 * is capped to its own memory.min/low setting. 6796 * 6797 * 5. However, to allow protecting recursive subtrees from each other 6798 * without having to declare each individual cgroup's fixed share 6799 * of the ancestor's claim to protection, any unutilized - 6800 * "floating" - protection from up the tree is distributed in 6801 * proportion to each cgroup's *usage*. This makes the protection 6802 * neutral wrt sibling cgroups and lets them compete freely over 6803 * the shared parental protection budget, but it protects the 6804 * subtree as a whole from neighboring subtrees. 6805 * 6806 * Note that 4. and 5. are not in conflict: 4. is about protecting 6807 * against immediate siblings whereas 5. is about protecting against 6808 * neighboring subtrees. 6809 */ 6810 static unsigned long effective_protection(unsigned long usage, 6811 unsigned long parent_usage, 6812 unsigned long setting, 6813 unsigned long parent_effective, 6814 unsigned long siblings_protected) 6815 { 6816 unsigned long protected; 6817 unsigned long ep; 6818 6819 protected = min(usage, setting); 6820 /* 6821 * If all cgroups at this level combined claim and use more 6822 * protection then what the parent affords them, distribute 6823 * shares in proportion to utilization. 6824 * 6825 * We are using actual utilization rather than the statically 6826 * claimed protection in order to be work-conserving: claimed 6827 * but unused protection is available to siblings that would 6828 * otherwise get a smaller chunk than what they claimed. 6829 */ 6830 if (siblings_protected > parent_effective) 6831 return protected * parent_effective / siblings_protected; 6832 6833 /* 6834 * Ok, utilized protection of all children is within what the 6835 * parent affords them, so we know whatever this child claims 6836 * and utilizes is effectively protected. 6837 * 6838 * If there is unprotected usage beyond this value, reclaim 6839 * will apply pressure in proportion to that amount. 6840 * 6841 * If there is unutilized protection, the cgroup will be fully 6842 * shielded from reclaim, but we do return a smaller value for 6843 * protection than what the group could enjoy in theory. This 6844 * is okay. With the overcommit distribution above, effective 6845 * protection is always dependent on how memory is actually 6846 * consumed among the siblings anyway. 6847 */ 6848 ep = protected; 6849 6850 /* 6851 * If the children aren't claiming (all of) the protection 6852 * afforded to them by the parent, distribute the remainder in 6853 * proportion to the (unprotected) memory of each cgroup. That 6854 * way, cgroups that aren't explicitly prioritized wrt each 6855 * other compete freely over the allowance, but they are 6856 * collectively protected from neighboring trees. 
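	 *
	 * A purely illustrative example (numbers invented for this comment,
	 * not taken from the code): with recursive protection distribution
	 * enabled, parent_effective = 10G, siblings_protected = 4G and
	 * parent_usage = 8G, a child using 3G of which 1G is declared
	 * protection ends up with 1G + (10G - 4G) * (3G - 1G) / (8G - 4G)
	 * = 4G of effective protection once the floating remainder is
	 * distributed below.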
6857 * 6858 * We're using unprotected memory for the weight so that if 6859 * some cgroups DO claim explicit protection, we don't protect 6860 * the same bytes twice. 6861 * 6862 * Check both usage and parent_usage against the respective 6863 * protected values. One should imply the other, but they 6864 * aren't read atomically - make sure the division is sane. 6865 */ 6866 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) 6867 return ep; 6868 if (parent_effective > siblings_protected && 6869 parent_usage > siblings_protected && 6870 usage > protected) { 6871 unsigned long unclaimed; 6872 6873 unclaimed = parent_effective - siblings_protected; 6874 unclaimed *= usage - protected; 6875 unclaimed /= parent_usage - siblings_protected; 6876 6877 ep += unclaimed; 6878 } 6879 6880 return ep; 6881 } 6882 6883 /** 6884 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range 6885 * @root: the top ancestor of the sub-tree being checked 6886 * @memcg: the memory cgroup to check 6887 * 6888 * WARNING: This function is not stateless! It can only be used as part 6889 * of a top-down tree iteration, not for isolated queries. 6890 */ 6891 void mem_cgroup_calculate_protection(struct mem_cgroup *root, 6892 struct mem_cgroup *memcg) 6893 { 6894 unsigned long usage, parent_usage; 6895 struct mem_cgroup *parent; 6896 6897 if (mem_cgroup_disabled()) 6898 return; 6899 6900 if (!root) 6901 root = root_mem_cgroup; 6902 6903 /* 6904 * Effective values of the reclaim targets are ignored so they 6905 * can be stale. Have a look at mem_cgroup_protection for more 6906 * details. 6907 * TODO: calculation should be more robust so that we do not need 6908 * that special casing. 6909 */ 6910 if (memcg == root) 6911 return; 6912 6913 usage = page_counter_read(&memcg->memory); 6914 if (!usage) 6915 return; 6916 6917 parent = parent_mem_cgroup(memcg); 6918 6919 if (parent == root) { 6920 memcg->memory.emin = READ_ONCE(memcg->memory.min); 6921 memcg->memory.elow = READ_ONCE(memcg->memory.low); 6922 return; 6923 } 6924 6925 parent_usage = page_counter_read(&parent->memory); 6926 6927 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 6928 READ_ONCE(memcg->memory.min), 6929 READ_ONCE(parent->memory.emin), 6930 atomic_long_read(&parent->memory.children_min_usage))); 6931 6932 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 6933 READ_ONCE(memcg->memory.low), 6934 READ_ONCE(parent->memory.elow), 6935 atomic_long_read(&parent->memory.children_low_usage))); 6936 } 6937 6938 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, 6939 gfp_t gfp) 6940 { 6941 long nr_pages = folio_nr_pages(folio); 6942 int ret; 6943 6944 ret = try_charge(memcg, gfp, nr_pages); 6945 if (ret) 6946 goto out; 6947 6948 css_get(&memcg->css); 6949 commit_charge(folio, memcg); 6950 6951 local_irq_disable(); 6952 mem_cgroup_charge_statistics(memcg, nr_pages); 6953 memcg_check_events(memcg, folio_nid(folio)); 6954 local_irq_enable(); 6955 out: 6956 return ret; 6957 } 6958 6959 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) 6960 { 6961 struct mem_cgroup *memcg; 6962 int ret; 6963 6964 memcg = get_mem_cgroup_from_mm(mm); 6965 ret = charge_memcg(folio, memcg, gfp); 6966 css_put(&memcg->css); 6967 6968 return ret; 6969 } 6970 6971 /** 6972 * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. 6973 * @folio: folio to charge. 
6974 * @mm: mm context of the victim 6975 * @gfp: reclaim mode 6976 * @entry: swap entry for which the folio is allocated 6977 * 6978 * This function charges a folio allocated for swapin. Please call this before 6979 * adding the folio to the swapcache. 6980 * 6981 * Returns 0 on success. Otherwise, an error code is returned. 6982 */ 6983 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, 6984 gfp_t gfp, swp_entry_t entry) 6985 { 6986 struct mem_cgroup *memcg; 6987 unsigned short id; 6988 int ret; 6989 6990 if (mem_cgroup_disabled()) 6991 return 0; 6992 6993 id = lookup_swap_cgroup_id(entry); 6994 rcu_read_lock(); 6995 memcg = mem_cgroup_from_id(id); 6996 if (!memcg || !css_tryget_online(&memcg->css)) 6997 memcg = get_mem_cgroup_from_mm(mm); 6998 rcu_read_unlock(); 6999 7000 ret = charge_memcg(folio, memcg, gfp); 7001 7002 css_put(&memcg->css); 7003 return ret; 7004 } 7005 7006 /* 7007 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot 7008 * @entry: swap entry for which the page is charged 7009 * 7010 * Call this function after successfully adding the charged page to swapcache. 7011 * 7012 * Note: This function assumes the page for which swap slot is being uncharged 7013 * is order 0 page. 7014 */ 7015 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) 7016 { 7017 /* 7018 * Cgroup1's unified memory+swap counter has been charged with the 7019 * new swapcache page, finish the transfer by uncharging the swap 7020 * slot. The swap slot would also get uncharged when it dies, but 7021 * it can stick around indefinitely and we'd count the page twice 7022 * the entire time. 7023 * 7024 * Cgroup2 has separate resource counters for memory and swap, 7025 * so this is a non-issue here. Memory and swap charge lifetimes 7026 * correspond 1:1 to page and swap slot lifetimes: we charge the 7027 * page to memory here, and uncharge swap when the slot is freed. 7028 */ 7029 if (!mem_cgroup_disabled() && do_memsw_account()) { 7030 /* 7031 * The swap entry might not get freed for a long time, 7032 * let's not wait for it. The page already received a 7033 * memory+swap charge, drop the swap entry duplicate. 
7034 */ 7035 mem_cgroup_uncharge_swap(entry, 1); 7036 } 7037 } 7038 7039 struct uncharge_gather { 7040 struct mem_cgroup *memcg; 7041 unsigned long nr_memory; 7042 unsigned long pgpgout; 7043 unsigned long nr_kmem; 7044 int nid; 7045 }; 7046 7047 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 7048 { 7049 memset(ug, 0, sizeof(*ug)); 7050 } 7051 7052 static void uncharge_batch(const struct uncharge_gather *ug) 7053 { 7054 unsigned long flags; 7055 7056 if (ug->nr_memory) { 7057 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); 7058 if (do_memsw_account()) 7059 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); 7060 if (ug->nr_kmem) 7061 memcg_account_kmem(ug->memcg, -ug->nr_kmem); 7062 memcg_oom_recover(ug->memcg); 7063 } 7064 7065 local_irq_save(flags); 7066 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 7067 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); 7068 memcg_check_events(ug->memcg, ug->nid); 7069 local_irq_restore(flags); 7070 7071 /* drop reference from uncharge_folio */ 7072 css_put(&ug->memcg->css); 7073 } 7074 7075 static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) 7076 { 7077 long nr_pages; 7078 struct mem_cgroup *memcg; 7079 struct obj_cgroup *objcg; 7080 7081 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 7082 7083 /* 7084 * Nobody should be changing or seriously looking at 7085 * folio memcg or objcg at this point, we have fully 7086 * exclusive access to the folio. 7087 */ 7088 if (folio_memcg_kmem(folio)) { 7089 objcg = __folio_objcg(folio); 7090 /* 7091 * This get matches the put at the end of the function and 7092 * kmem pages do not hold memcg references anymore. 7093 */ 7094 memcg = get_mem_cgroup_from_objcg(objcg); 7095 } else { 7096 memcg = __folio_memcg(folio); 7097 } 7098 7099 if (!memcg) 7100 return; 7101 7102 if (ug->memcg != memcg) { 7103 if (ug->memcg) { 7104 uncharge_batch(ug); 7105 uncharge_gather_clear(ug); 7106 } 7107 ug->memcg = memcg; 7108 ug->nid = folio_nid(folio); 7109 7110 /* pairs with css_put in uncharge_batch */ 7111 css_get(&memcg->css); 7112 } 7113 7114 nr_pages = folio_nr_pages(folio); 7115 7116 if (folio_memcg_kmem(folio)) { 7117 ug->nr_memory += nr_pages; 7118 ug->nr_kmem += nr_pages; 7119 7120 folio->memcg_data = 0; 7121 obj_cgroup_put(objcg); 7122 } else { 7123 /* LRU pages aren't accounted at the root level */ 7124 if (!mem_cgroup_is_root(memcg)) 7125 ug->nr_memory += nr_pages; 7126 ug->pgpgout++; 7127 7128 folio->memcg_data = 0; 7129 } 7130 7131 css_put(&memcg->css); 7132 } 7133 7134 void __mem_cgroup_uncharge(struct folio *folio) 7135 { 7136 struct uncharge_gather ug; 7137 7138 /* Don't touch folio->lru of any random page, pre-check: */ 7139 if (!folio_memcg(folio)) 7140 return; 7141 7142 uncharge_gather_clear(&ug); 7143 uncharge_folio(folio, &ug); 7144 uncharge_batch(&ug); 7145 } 7146 7147 /** 7148 * __mem_cgroup_uncharge_list - uncharge a list of page 7149 * @page_list: list of pages to uncharge 7150 * 7151 * Uncharge a list of pages previously charged with 7152 * __mem_cgroup_charge(). 7153 */ 7154 void __mem_cgroup_uncharge_list(struct list_head *page_list) 7155 { 7156 struct uncharge_gather ug; 7157 struct folio *folio; 7158 7159 uncharge_gather_clear(&ug); 7160 list_for_each_entry(folio, page_list, lru) 7161 uncharge_folio(folio, &ug); 7162 if (ug.memcg) 7163 uncharge_batch(&ug); 7164 } 7165 7166 /** 7167 * mem_cgroup_migrate - Charge a folio's replacement. 7168 * @old: Currently circulating folio. 7169 * @new: Replacement folio. 
7170 * 7171 * Charge @new as a replacement folio for @old. @old will 7172 * be uncharged upon free. 7173 * 7174 * Both folios must be locked, @new->mapping must be set up. 7175 */ 7176 void mem_cgroup_migrate(struct folio *old, struct folio *new) 7177 { 7178 struct mem_cgroup *memcg; 7179 long nr_pages = folio_nr_pages(new); 7180 unsigned long flags; 7181 7182 VM_BUG_ON_FOLIO(!folio_test_locked(old), old); 7183 VM_BUG_ON_FOLIO(!folio_test_locked(new), new); 7184 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); 7185 VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new); 7186 7187 if (mem_cgroup_disabled()) 7188 return; 7189 7190 /* Page cache replacement: new folio already charged? */ 7191 if (folio_memcg(new)) 7192 return; 7193 7194 memcg = folio_memcg(old); 7195 VM_WARN_ON_ONCE_FOLIO(!memcg, old); 7196 if (!memcg) 7197 return; 7198 7199 /* Force-charge the new page. The old one will be freed soon */ 7200 if (!mem_cgroup_is_root(memcg)) { 7201 page_counter_charge(&memcg->memory, nr_pages); 7202 if (do_memsw_account()) 7203 page_counter_charge(&memcg->memsw, nr_pages); 7204 } 7205 7206 css_get(&memcg->css); 7207 commit_charge(new, memcg); 7208 7209 local_irq_save(flags); 7210 mem_cgroup_charge_statistics(memcg, nr_pages); 7211 memcg_check_events(memcg, folio_nid(new)); 7212 local_irq_restore(flags); 7213 } 7214 7215 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 7216 EXPORT_SYMBOL(memcg_sockets_enabled_key); 7217 7218 void mem_cgroup_sk_alloc(struct sock *sk) 7219 { 7220 struct mem_cgroup *memcg; 7221 7222 if (!mem_cgroup_sockets_enabled) 7223 return; 7224 7225 /* Do not associate the sock with unrelated interrupted task's memcg. */ 7226 if (!in_task()) 7227 return; 7228 7229 rcu_read_lock(); 7230 memcg = mem_cgroup_from_task(current); 7231 if (mem_cgroup_is_root(memcg)) 7232 goto out; 7233 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 7234 goto out; 7235 if (css_tryget(&memcg->css)) 7236 sk->sk_memcg = memcg; 7237 out: 7238 rcu_read_unlock(); 7239 } 7240 7241 void mem_cgroup_sk_free(struct sock *sk) 7242 { 7243 if (sk->sk_memcg) 7244 css_put(&sk->sk_memcg->css); 7245 } 7246 7247 /** 7248 * mem_cgroup_charge_skmem - charge socket memory 7249 * @memcg: memcg to charge 7250 * @nr_pages: number of pages to charge 7251 * @gfp_mask: reclaim mode 7252 * 7253 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 7254 * @memcg's configured limit, %false if it doesn't. 
7255 */ 7256 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 7257 gfp_t gfp_mask) 7258 { 7259 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7260 struct page_counter *fail; 7261 7262 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 7263 memcg->tcpmem_pressure = 0; 7264 return true; 7265 } 7266 memcg->tcpmem_pressure = 1; 7267 if (gfp_mask & __GFP_NOFAIL) { 7268 page_counter_charge(&memcg->tcpmem, nr_pages); 7269 return true; 7270 } 7271 return false; 7272 } 7273 7274 if (try_charge(memcg, gfp_mask, nr_pages) == 0) { 7275 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 7276 return true; 7277 } 7278 7279 return false; 7280 } 7281 7282 /** 7283 * mem_cgroup_uncharge_skmem - uncharge socket memory 7284 * @memcg: memcg to uncharge 7285 * @nr_pages: number of pages to uncharge 7286 */ 7287 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7288 { 7289 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7290 page_counter_uncharge(&memcg->tcpmem, nr_pages); 7291 return; 7292 } 7293 7294 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 7295 7296 refill_stock(memcg, nr_pages); 7297 } 7298 7299 static int __init cgroup_memory(char *s) 7300 { 7301 char *token; 7302 7303 while ((token = strsep(&s, ",")) != NULL) { 7304 if (!*token) 7305 continue; 7306 if (!strcmp(token, "nosocket")) 7307 cgroup_memory_nosocket = true; 7308 if (!strcmp(token, "nokmem")) 7309 cgroup_memory_nokmem = true; 7310 } 7311 return 1; 7312 } 7313 __setup("cgroup.memory=", cgroup_memory); 7314 7315 /* 7316 * subsys_initcall() for memory controller. 7317 * 7318 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 7319 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 7320 * basically everything that doesn't depend on a specific mem_cgroup structure 7321 * should be initialized from here. 7322 */ 7323 static int __init mem_cgroup_init(void) 7324 { 7325 int cpu, node; 7326 7327 /* 7328 * Currently s32 type (can refer to struct batched_lruvec_stat) is 7329 * used for per-memcg-per-cpu caching of per-node statistics. In order 7330 * to work fine, we should make sure that the overfill threshold can't 7331 * exceed S32_MAX / PAGE_SIZE. 7332 */ 7333 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); 7334 7335 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 7336 memcg_hotplug_cpu_dead); 7337 7338 for_each_possible_cpu(cpu) 7339 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 7340 drain_local_stock); 7341 7342 for_each_node(node) { 7343 struct mem_cgroup_tree_per_node *rtpn; 7344 7345 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 7346 node_online(node) ? node : NUMA_NO_NODE); 7347 7348 rtpn->rb_root = RB_ROOT; 7349 rtpn->rb_rightmost = NULL; 7350 spin_lock_init(&rtpn->lock); 7351 soft_limit_tree.rb_tree_per_node[node] = rtpn; 7352 } 7353 7354 return 0; 7355 } 7356 subsys_initcall(mem_cgroup_init); 7357 7358 #ifdef CONFIG_SWAP 7359 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 7360 { 7361 while (!refcount_inc_not_zero(&memcg->id.ref)) { 7362 /* 7363 * The root cgroup cannot be destroyed, so it's refcount must 7364 * always be >= 1. 
7365 */ 7366 if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) { 7367 VM_BUG_ON(1); 7368 break; 7369 } 7370 memcg = parent_mem_cgroup(memcg); 7371 if (!memcg) 7372 memcg = root_mem_cgroup; 7373 } 7374 return memcg; 7375 } 7376 7377 /** 7378 * mem_cgroup_swapout - transfer a memsw charge to swap 7379 * @folio: folio whose memsw charge to transfer 7380 * @entry: swap entry to move the charge to 7381 * 7382 * Transfer the memsw charge of @folio to @entry. 7383 */ 7384 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) 7385 { 7386 struct mem_cgroup *memcg, *swap_memcg; 7387 unsigned int nr_entries; 7388 unsigned short oldid; 7389 7390 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 7391 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 7392 7393 if (mem_cgroup_disabled()) 7394 return; 7395 7396 if (!do_memsw_account()) 7397 return; 7398 7399 memcg = folio_memcg(folio); 7400 7401 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 7402 if (!memcg) 7403 return; 7404 7405 /* 7406 * In case the memcg owning these pages has been offlined and doesn't 7407 * have an ID allocated to it anymore, charge the closest online 7408 * ancestor for the swap instead and transfer the memory+swap charge. 7409 */ 7410 swap_memcg = mem_cgroup_id_get_online(memcg); 7411 nr_entries = folio_nr_pages(folio); 7412 /* Get references for the tail pages, too */ 7413 if (nr_entries > 1) 7414 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7415 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7416 nr_entries); 7417 VM_BUG_ON_FOLIO(oldid, folio); 7418 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7419 7420 folio->memcg_data = 0; 7421 7422 if (!mem_cgroup_is_root(memcg)) 7423 page_counter_uncharge(&memcg->memory, nr_entries); 7424 7425 if (memcg != swap_memcg) { 7426 if (!mem_cgroup_is_root(swap_memcg)) 7427 page_counter_charge(&swap_memcg->memsw, nr_entries); 7428 page_counter_uncharge(&memcg->memsw, nr_entries); 7429 } 7430 7431 /* 7432 * Interrupts should be disabled here because the caller holds the 7433 * i_pages lock which is taken with interrupts-off. It is 7434 * important here to have the interrupts disabled because it is the 7435 * only synchronisation we have for updating the per-CPU variables. 7436 */ 7437 memcg_stats_lock(); 7438 mem_cgroup_charge_statistics(memcg, -nr_entries); 7439 memcg_stats_unlock(); 7440 memcg_check_events(memcg, folio_nid(folio)); 7441 7442 css_put(&memcg->css); 7443 } 7444 7445 /** 7446 * __mem_cgroup_try_charge_swap - try charging swap space for a folio 7447 * @folio: folio being added to swap 7448 * @entry: swap entry to charge 7449 * 7450 * Try to charge @folio's memcg for the swap space at @entry. 7451 * 7452 * Returns 0 on success, -ENOMEM on failure. 
7453 */ 7454 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) 7455 { 7456 unsigned int nr_pages = folio_nr_pages(folio); 7457 struct page_counter *counter; 7458 struct mem_cgroup *memcg; 7459 unsigned short oldid; 7460 7461 if (do_memsw_account()) 7462 return 0; 7463 7464 memcg = folio_memcg(folio); 7465 7466 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 7467 if (!memcg) 7468 return 0; 7469 7470 if (!entry.val) { 7471 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7472 return 0; 7473 } 7474 7475 memcg = mem_cgroup_id_get_online(memcg); 7476 7477 if (!mem_cgroup_is_root(memcg) && 7478 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 7479 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 7480 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7481 mem_cgroup_id_put(memcg); 7482 return -ENOMEM; 7483 } 7484 7485 /* Get references for the tail pages, too */ 7486 if (nr_pages > 1) 7487 mem_cgroup_id_get_many(memcg, nr_pages - 1); 7488 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 7489 VM_BUG_ON_FOLIO(oldid, folio); 7490 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 7491 7492 return 0; 7493 } 7494 7495 /** 7496 * __mem_cgroup_uncharge_swap - uncharge swap space 7497 * @entry: swap entry to uncharge 7498 * @nr_pages: the amount of swap space to uncharge 7499 */ 7500 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 7501 { 7502 struct mem_cgroup *memcg; 7503 unsigned short id; 7504 7505 if (mem_cgroup_disabled()) 7506 return; 7507 7508 id = swap_cgroup_record(entry, 0, nr_pages); 7509 rcu_read_lock(); 7510 memcg = mem_cgroup_from_id(id); 7511 if (memcg) { 7512 if (!mem_cgroup_is_root(memcg)) { 7513 if (do_memsw_account()) 7514 page_counter_uncharge(&memcg->memsw, nr_pages); 7515 else 7516 page_counter_uncharge(&memcg->swap, nr_pages); 7517 } 7518 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 7519 mem_cgroup_id_put_many(memcg, nr_pages); 7520 } 7521 rcu_read_unlock(); 7522 } 7523 7524 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 7525 { 7526 long nr_swap_pages = get_nr_swap_pages(); 7527 7528 if (mem_cgroup_disabled() || do_memsw_account()) 7529 return nr_swap_pages; 7530 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) 7531 nr_swap_pages = min_t(long, nr_swap_pages, 7532 READ_ONCE(memcg->swap.max) - 7533 page_counter_read(&memcg->swap)); 7534 return nr_swap_pages; 7535 } 7536 7537 bool mem_cgroup_swap_full(struct folio *folio) 7538 { 7539 struct mem_cgroup *memcg; 7540 7541 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 7542 7543 if (vm_swap_full()) 7544 return true; 7545 if (do_memsw_account()) 7546 return false; 7547 7548 memcg = folio_memcg(folio); 7549 if (!memcg) 7550 return false; 7551 7552 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { 7553 unsigned long usage = page_counter_read(&memcg->swap); 7554 7555 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 7556 usage * 2 >= READ_ONCE(memcg->swap.max)) 7557 return true; 7558 } 7559 7560 return false; 7561 } 7562 7563 static int __init setup_swap_account(char *s) 7564 { 7565 pr_warn_once("The swapaccount= commandline option is deprecated. 
" 7566 "Please report your usecase to linux-mm@kvack.org if you " 7567 "depend on this functionality.\n"); 7568 return 1; 7569 } 7570 __setup("swapaccount=", setup_swap_account); 7571 7572 static u64 swap_current_read(struct cgroup_subsys_state *css, 7573 struct cftype *cft) 7574 { 7575 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 7576 7577 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 7578 } 7579 7580 static int swap_high_show(struct seq_file *m, void *v) 7581 { 7582 return seq_puts_memcg_tunable(m, 7583 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 7584 } 7585 7586 static ssize_t swap_high_write(struct kernfs_open_file *of, 7587 char *buf, size_t nbytes, loff_t off) 7588 { 7589 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7590 unsigned long high; 7591 int err; 7592 7593 buf = strstrip(buf); 7594 err = page_counter_memparse(buf, "max", &high); 7595 if (err) 7596 return err; 7597 7598 page_counter_set_high(&memcg->swap, high); 7599 7600 return nbytes; 7601 } 7602 7603 static int swap_max_show(struct seq_file *m, void *v) 7604 { 7605 return seq_puts_memcg_tunable(m, 7606 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 7607 } 7608 7609 static ssize_t swap_max_write(struct kernfs_open_file *of, 7610 char *buf, size_t nbytes, loff_t off) 7611 { 7612 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7613 unsigned long max; 7614 int err; 7615 7616 buf = strstrip(buf); 7617 err = page_counter_memparse(buf, "max", &max); 7618 if (err) 7619 return err; 7620 7621 xchg(&memcg->swap.max, max); 7622 7623 return nbytes; 7624 } 7625 7626 static int swap_events_show(struct seq_file *m, void *v) 7627 { 7628 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 7629 7630 seq_printf(m, "high %lu\n", 7631 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 7632 seq_printf(m, "max %lu\n", 7633 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 7634 seq_printf(m, "fail %lu\n", 7635 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 7636 7637 return 0; 7638 } 7639 7640 static struct cftype swap_files[] = { 7641 { 7642 .name = "swap.current", 7643 .flags = CFTYPE_NOT_ON_ROOT, 7644 .read_u64 = swap_current_read, 7645 }, 7646 { 7647 .name = "swap.high", 7648 .flags = CFTYPE_NOT_ON_ROOT, 7649 .seq_show = swap_high_show, 7650 .write = swap_high_write, 7651 }, 7652 { 7653 .name = "swap.max", 7654 .flags = CFTYPE_NOT_ON_ROOT, 7655 .seq_show = swap_max_show, 7656 .write = swap_max_write, 7657 }, 7658 { 7659 .name = "swap.events", 7660 .flags = CFTYPE_NOT_ON_ROOT, 7661 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 7662 .seq_show = swap_events_show, 7663 }, 7664 { } /* terminate */ 7665 }; 7666 7667 static struct cftype memsw_files[] = { 7668 { 7669 .name = "memsw.usage_in_bytes", 7670 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 7671 .read_u64 = mem_cgroup_read_u64, 7672 }, 7673 { 7674 .name = "memsw.max_usage_in_bytes", 7675 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 7676 .write = mem_cgroup_reset, 7677 .read_u64 = mem_cgroup_read_u64, 7678 }, 7679 { 7680 .name = "memsw.limit_in_bytes", 7681 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 7682 .write = mem_cgroup_write, 7683 .read_u64 = mem_cgroup_read_u64, 7684 }, 7685 { 7686 .name = "memsw.failcnt", 7687 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 7688 .write = mem_cgroup_reset, 7689 .read_u64 = mem_cgroup_read_u64, 7690 }, 7691 { }, /* terminate */ 7692 }; 7693 7694 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 7695 /** 7696 * obj_cgroup_may_zswap - check if 
this cgroup can zswap
 * @objcg: the object cgroup
 *
 * Check if the hierarchical zswap limit has been reached.
 *
 * This doesn't check for specific headroom, and it is not atomic
 * either. But with zswap, the size of the allocation is only known
 * once compression has occurred, and this optimistic pre-check avoids
 * spending cycles on compression when there is already no room left
 * or zswap is disabled altogether somewhere in the hierarchy.
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg, *original_memcg;
	bool ret = true;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;

	original_memcg = get_mem_cgroup_from_objcg(objcg);
	for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
	     memcg = parent_mem_cgroup(memcg)) {
		unsigned long max = READ_ONCE(memcg->zswap_max);
		unsigned long pages;

		if (max == PAGE_COUNTER_MAX)
			continue;
		if (max == 0) {
			ret = false;
			break;
		}

		cgroup_rstat_flush(memcg->css.cgroup);
		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
		if (pages < max)
			continue;
		ret = false;
		break;
	}
	mem_cgroup_put(original_memcg);
	return ret;
}

/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
 */
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));

	/* PF_MEMALLOC context, charging must succeed */
	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
		VM_WARN_ON_ONCE(1);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
	rcu_read_unlock();
}

/**
 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * Uncharges zswap memory on page-in.
7773 */ 7774 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) 7775 { 7776 struct mem_cgroup *memcg; 7777 7778 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7779 return; 7780 7781 obj_cgroup_uncharge(objcg, size); 7782 7783 rcu_read_lock(); 7784 memcg = obj_cgroup_memcg(objcg); 7785 mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); 7786 mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); 7787 rcu_read_unlock(); 7788 } 7789 7790 static u64 zswap_current_read(struct cgroup_subsys_state *css, 7791 struct cftype *cft) 7792 { 7793 cgroup_rstat_flush(css->cgroup); 7794 return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); 7795 } 7796 7797 static int zswap_max_show(struct seq_file *m, void *v) 7798 { 7799 return seq_puts_memcg_tunable(m, 7800 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max)); 7801 } 7802 7803 static ssize_t zswap_max_write(struct kernfs_open_file *of, 7804 char *buf, size_t nbytes, loff_t off) 7805 { 7806 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7807 unsigned long max; 7808 int err; 7809 7810 buf = strstrip(buf); 7811 err = page_counter_memparse(buf, "max", &max); 7812 if (err) 7813 return err; 7814 7815 xchg(&memcg->zswap_max, max); 7816 7817 return nbytes; 7818 } 7819 7820 static struct cftype zswap_files[] = { 7821 { 7822 .name = "zswap.current", 7823 .flags = CFTYPE_NOT_ON_ROOT, 7824 .read_u64 = zswap_current_read, 7825 }, 7826 { 7827 .name = "zswap.max", 7828 .flags = CFTYPE_NOT_ON_ROOT, 7829 .seq_show = zswap_max_show, 7830 .write = zswap_max_write, 7831 }, 7832 { } /* terminate */ 7833 }; 7834 #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ 7835 7836 static int __init mem_cgroup_swap_init(void) 7837 { 7838 if (mem_cgroup_disabled()) 7839 return 0; 7840 7841 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 7842 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 7843 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 7844 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files)); 7845 #endif 7846 return 0; 7847 } 7848 subsys_initcall(mem_cgroup_swap_init); 7849 7850 #endif /* CONFIG_SWAP */ 7851