1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* memcontrol.c - Memory Controller 3 * 4 * Copyright IBM Corporation, 2007 5 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 6 * 7 * Copyright 2007 OpenVZ SWsoft Inc 8 * Author: Pavel Emelianov <xemul@openvz.org> 9 * 10 * Memory thresholds 11 * Copyright (C) 2009 Nokia Corporation 12 * Author: Kirill A. Shutemov 13 * 14 * Kernel Memory Controller 15 * Copyright (C) 2012 Parallels Inc. and Google Inc. 16 * Authors: Glauber Costa and Suleiman Souhlal 17 * 18 * Native page reclaim 19 * Charge lifetime sanitation 20 * Lockless page tracking & accounting 21 * Unified hierarchy configuration model 22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner 23 * 24 * Per memcg lru locking 25 * Copyright (C) 2020 Alibaba, Inc, Alex Shi 26 */ 27 28 #include <linux/page_counter.h> 29 #include <linux/memcontrol.h> 30 #include <linux/cgroup.h> 31 #include <linux/pagewalk.h> 32 #include <linux/sched/mm.h> 33 #include <linux/shmem_fs.h> 34 #include <linux/hugetlb.h> 35 #include <linux/pagemap.h> 36 #include <linux/vm_event_item.h> 37 #include <linux/smp.h> 38 #include <linux/page-flags.h> 39 #include <linux/backing-dev.h> 40 #include <linux/bit_spinlock.h> 41 #include <linux/rcupdate.h> 42 #include <linux/limits.h> 43 #include <linux/export.h> 44 #include <linux/mutex.h> 45 #include <linux/rbtree.h> 46 #include <linux/slab.h> 47 #include <linux/swap.h> 48 #include <linux/swapops.h> 49 #include <linux/spinlock.h> 50 #include <linux/eventfd.h> 51 #include <linux/poll.h> 52 #include <linux/sort.h> 53 #include <linux/fs.h> 54 #include <linux/seq_file.h> 55 #include <linux/vmpressure.h> 56 #include <linux/memremap.h> 57 #include <linux/mm_inline.h> 58 #include <linux/swap_cgroup.h> 59 #include <linux/cpu.h> 60 #include <linux/oom.h> 61 #include <linux/lockdep.h> 62 #include <linux/file.h> 63 #include <linux/resume_user_mode.h> 64 #include <linux/psi.h> 65 #include <linux/seq_buf.h> 66 #include <linux/sched/isolation.h> 67 #include "internal.h" 68 #include <net/sock.h> 69 #include <net/ip.h> 70 #include "slab.h" 71 #include "swap.h" 72 73 #include <linux/uaccess.h> 74 75 #include <trace/events/vmscan.h> 76 77 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 78 EXPORT_SYMBOL(memory_cgrp_subsys); 79 80 struct mem_cgroup *root_mem_cgroup __read_mostly; 81 82 /* Active memory cgroup to use from an interrupt context */ 83 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); 84 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); 85 86 /* Socket memory accounting disabled? */ 87 static bool cgroup_memory_nosocket __ro_after_init; 88 89 /* Kernel memory accounting disabled? */ 90 static bool cgroup_memory_nokmem __ro_after_init; 91 92 /* BPF memory accounting disabled? 
*/ 93 static bool cgroup_memory_nobpf __ro_after_init; 94 95 #ifdef CONFIG_CGROUP_WRITEBACK 96 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); 97 #endif 98 99 /* Whether legacy memory+swap accounting is active */ 100 static bool do_memsw_account(void) 101 { 102 return !cgroup_subsys_on_dfl(memory_cgrp_subsys); 103 } 104 105 #define THRESHOLDS_EVENTS_TARGET 128 106 #define SOFTLIMIT_EVENTS_TARGET 1024 107 108 /* 109 * Cgroups above their limits are maintained in a RB-Tree, independent of 110 * their hierarchy representation 111 */ 112 113 struct mem_cgroup_tree_per_node { 114 struct rb_root rb_root; 115 struct rb_node *rb_rightmost; 116 spinlock_t lock; 117 }; 118 119 struct mem_cgroup_tree { 120 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 121 }; 122 123 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 124 125 /* for OOM */ 126 struct mem_cgroup_eventfd_list { 127 struct list_head list; 128 struct eventfd_ctx *eventfd; 129 }; 130 131 /* 132 * cgroup_event represents events which userspace want to receive. 133 */ 134 struct mem_cgroup_event { 135 /* 136 * memcg which the event belongs to. 137 */ 138 struct mem_cgroup *memcg; 139 /* 140 * eventfd to signal userspace about the event. 141 */ 142 struct eventfd_ctx *eventfd; 143 /* 144 * Each of these stored in a list by the cgroup. 145 */ 146 struct list_head list; 147 /* 148 * register_event() callback will be used to add new userspace 149 * waiter for changes related to this event. Use eventfd_signal() 150 * on eventfd to send notification to userspace. 151 */ 152 int (*register_event)(struct mem_cgroup *memcg, 153 struct eventfd_ctx *eventfd, const char *args); 154 /* 155 * unregister_event() callback will be called when userspace closes 156 * the eventfd or on cgroup removing. This callback must be set, 157 * if you want provide notification functionality. 158 */ 159 void (*unregister_event)(struct mem_cgroup *memcg, 160 struct eventfd_ctx *eventfd); 161 /* 162 * All fields below needed to unregister event when 163 * userspace closes eventfd. 164 */ 165 poll_table pt; 166 wait_queue_head_t *wqh; 167 wait_queue_entry_t wait; 168 struct work_struct remove; 169 }; 170 171 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 172 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 173 174 /* Stuffs for move charges at task migration. */ 175 /* 176 * Types of charges to be moved. 177 */ 178 #define MOVE_ANON 0x1U 179 #define MOVE_FILE 0x2U 180 #define MOVE_MASK (MOVE_ANON | MOVE_FILE) 181 182 /* "mc" and its members are protected by cgroup_mutex */ 183 static struct move_charge_struct { 184 spinlock_t lock; /* for from, to */ 185 struct mm_struct *mm; 186 struct mem_cgroup *from; 187 struct mem_cgroup *to; 188 unsigned long flags; 189 unsigned long precharge; 190 unsigned long moved_charge; 191 unsigned long moved_swap; 192 struct task_struct *moving_task; /* a task moving charges */ 193 wait_queue_head_t waitq; /* a waitq for other context */ 194 } mc = { 195 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 196 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 197 }; 198 199 /* 200 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft 201 * limit reclaim to prevent infinite loops, if they ever occur. 
202 */ 203 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 204 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 205 206 /* for encoding cft->private value on file */ 207 enum res_type { 208 _MEM, 209 _MEMSWAP, 210 _KMEM, 211 _TCP, 212 }; 213 214 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 215 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 216 #define MEMFILE_ATTR(val) ((val) & 0xffff) 217 218 /* 219 * Iteration constructs for visiting all cgroups (under a tree). If 220 * loops are exited prematurely (break), mem_cgroup_iter_break() must 221 * be used for reference counting. 222 */ 223 #define for_each_mem_cgroup_tree(iter, root) \ 224 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 225 iter != NULL; \ 226 iter = mem_cgroup_iter(root, iter, NULL)) 227 228 #define for_each_mem_cgroup(iter) \ 229 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 230 iter != NULL; \ 231 iter = mem_cgroup_iter(NULL, iter, NULL)) 232 233 static inline bool task_is_dying(void) 234 { 235 return tsk_is_oom_victim(current) || fatal_signal_pending(current) || 236 (current->flags & PF_EXITING); 237 } 238 239 /* Some nice accessors for the vmpressure. */ 240 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 241 { 242 if (!memcg) 243 memcg = root_mem_cgroup; 244 return &memcg->vmpressure; 245 } 246 247 struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) 248 { 249 return container_of(vmpr, struct mem_cgroup, vmpressure); 250 } 251 252 #ifdef CONFIG_MEMCG_KMEM 253 static DEFINE_SPINLOCK(objcg_lock); 254 255 bool mem_cgroup_kmem_disabled(void) 256 { 257 return cgroup_memory_nokmem; 258 } 259 260 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, 261 unsigned int nr_pages); 262 263 static void obj_cgroup_release(struct percpu_ref *ref) 264 { 265 struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); 266 unsigned int nr_bytes; 267 unsigned int nr_pages; 268 unsigned long flags; 269 270 /* 271 * At this point all allocated objects are freed, and 272 * objcg->nr_charged_bytes can't have an arbitrary byte value. 273 * However, it can be PAGE_SIZE or (x * PAGE_SIZE). 274 * 275 * The following sequence can lead to it: 276 * 1) CPU0: objcg == stock->cached_objcg 277 * 2) CPU1: we do a small allocation (e.g. 92 bytes), 278 * PAGE_SIZE bytes are charged 279 * 3) CPU1: a process from another memcg is allocating something, 280 * the stock if flushed, 281 * objcg->nr_charged_bytes = PAGE_SIZE - 92 282 * 5) CPU0: we do release this object, 283 * 92 bytes are added to stock->nr_bytes 284 * 6) CPU0: stock is flushed, 285 * 92 bytes are added to objcg->nr_charged_bytes 286 * 287 * In the result, nr_charged_bytes == PAGE_SIZE. 288 * This page will be uncharged in obj_cgroup_release(). 
289 */ 290 nr_bytes = atomic_read(&objcg->nr_charged_bytes); 291 WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); 292 nr_pages = nr_bytes >> PAGE_SHIFT; 293 294 if (nr_pages) 295 obj_cgroup_uncharge_pages(objcg, nr_pages); 296 297 spin_lock_irqsave(&objcg_lock, flags); 298 list_del(&objcg->list); 299 spin_unlock_irqrestore(&objcg_lock, flags); 300 301 percpu_ref_exit(ref); 302 kfree_rcu(objcg, rcu); 303 } 304 305 static struct obj_cgroup *obj_cgroup_alloc(void) 306 { 307 struct obj_cgroup *objcg; 308 int ret; 309 310 objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL); 311 if (!objcg) 312 return NULL; 313 314 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0, 315 GFP_KERNEL); 316 if (ret) { 317 kfree(objcg); 318 return NULL; 319 } 320 INIT_LIST_HEAD(&objcg->list); 321 return objcg; 322 } 323 324 static void memcg_reparent_objcgs(struct mem_cgroup *memcg, 325 struct mem_cgroup *parent) 326 { 327 struct obj_cgroup *objcg, *iter; 328 329 objcg = rcu_replace_pointer(memcg->objcg, NULL, true); 330 331 spin_lock_irq(&objcg_lock); 332 333 /* 1) Ready to reparent active objcg. */ 334 list_add(&objcg->list, &memcg->objcg_list); 335 /* 2) Reparent active objcg and already reparented objcgs to parent. */ 336 list_for_each_entry(iter, &memcg->objcg_list, list) 337 WRITE_ONCE(iter->memcg, parent); 338 /* 3) Move already reparented objcgs to the parent's list */ 339 list_splice(&memcg->objcg_list, &parent->objcg_list); 340 341 spin_unlock_irq(&objcg_lock); 342 343 percpu_ref_kill(&objcg->refcnt); 344 } 345 346 /* 347 * A lot of the calls to the cache allocation functions are expected to be 348 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are 349 * conditional to this static branch, we'll have to allow modules that does 350 * kmem_cache_alloc and the such to see this symbol as well 351 */ 352 DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key); 353 EXPORT_SYMBOL(memcg_kmem_online_key); 354 355 DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); 356 EXPORT_SYMBOL(memcg_bpf_enabled_key); 357 #endif 358 359 /** 360 * mem_cgroup_css_from_folio - css of the memcg associated with a folio 361 * @folio: folio of interest 362 * 363 * If memcg is bound to the default hierarchy, css of the memcg associated 364 * with @folio is returned. The returned css remains associated with @folio 365 * until it is released. 366 * 367 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup 368 * is returned. 369 */ 370 struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) 371 { 372 struct mem_cgroup *memcg = folio_memcg(folio); 373 374 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 375 memcg = root_mem_cgroup; 376 377 return &memcg->css; 378 } 379 380 /** 381 * page_cgroup_ino - return inode number of the memcg a page is charged to 382 * @page: the page 383 * 384 * Look up the closest online ancestor of the memory cgroup @page is charged to 385 * and return its inode number or 0 if @page is not charged to any cgroup. It 386 * is safe to call this function without holding a reference to @page. 387 * 388 * Note, this function is inherently racy, because there is nothing to prevent 389 * the cgroup inode from getting torn down and potentially reallocated a moment 390 * after page_cgroup_ino() returns, so it only should be used by callers that 391 * do not care (such as procfs interfaces). 
392 */ 393 ino_t page_cgroup_ino(struct page *page) 394 { 395 struct mem_cgroup *memcg; 396 unsigned long ino = 0; 397 398 rcu_read_lock(); 399 /* page_folio() is racy here, but the entire function is racy anyway */ 400 memcg = folio_memcg_check(page_folio(page)); 401 402 while (memcg && !(memcg->css.flags & CSS_ONLINE)) 403 memcg = parent_mem_cgroup(memcg); 404 if (memcg) 405 ino = cgroup_ino(memcg->css.cgroup); 406 rcu_read_unlock(); 407 return ino; 408 } 409 410 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, 411 struct mem_cgroup_tree_per_node *mctz, 412 unsigned long new_usage_in_excess) 413 { 414 struct rb_node **p = &mctz->rb_root.rb_node; 415 struct rb_node *parent = NULL; 416 struct mem_cgroup_per_node *mz_node; 417 bool rightmost = true; 418 419 if (mz->on_tree) 420 return; 421 422 mz->usage_in_excess = new_usage_in_excess; 423 if (!mz->usage_in_excess) 424 return; 425 while (*p) { 426 parent = *p; 427 mz_node = rb_entry(parent, struct mem_cgroup_per_node, 428 tree_node); 429 if (mz->usage_in_excess < mz_node->usage_in_excess) { 430 p = &(*p)->rb_left; 431 rightmost = false; 432 } else { 433 p = &(*p)->rb_right; 434 } 435 } 436 437 if (rightmost) 438 mctz->rb_rightmost = &mz->tree_node; 439 440 rb_link_node(&mz->tree_node, parent, p); 441 rb_insert_color(&mz->tree_node, &mctz->rb_root); 442 mz->on_tree = true; 443 } 444 445 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 446 struct mem_cgroup_tree_per_node *mctz) 447 { 448 if (!mz->on_tree) 449 return; 450 451 if (&mz->tree_node == mctz->rb_rightmost) 452 mctz->rb_rightmost = rb_prev(&mz->tree_node); 453 454 rb_erase(&mz->tree_node, &mctz->rb_root); 455 mz->on_tree = false; 456 } 457 458 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 459 struct mem_cgroup_tree_per_node *mctz) 460 { 461 unsigned long flags; 462 463 spin_lock_irqsave(&mctz->lock, flags); 464 __mem_cgroup_remove_exceeded(mz, mctz); 465 spin_unlock_irqrestore(&mctz->lock, flags); 466 } 467 468 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 469 { 470 unsigned long nr_pages = page_counter_read(&memcg->memory); 471 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 472 unsigned long excess = 0; 473 474 if (nr_pages > soft_limit) 475 excess = nr_pages - soft_limit; 476 477 return excess; 478 } 479 480 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) 481 { 482 unsigned long excess; 483 struct mem_cgroup_per_node *mz; 484 struct mem_cgroup_tree_per_node *mctz; 485 486 if (lru_gen_enabled()) { 487 if (soft_limit_excess(memcg)) 488 lru_gen_soft_reclaim(memcg, nid); 489 return; 490 } 491 492 mctz = soft_limit_tree.rb_tree_per_node[nid]; 493 if (!mctz) 494 return; 495 /* 496 * Necessary to update all ancestors when hierarchy is used. 497 * because their event counter is not touched. 498 */ 499 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 500 mz = memcg->nodeinfo[nid]; 501 excess = soft_limit_excess(memcg); 502 /* 503 * We have to update the tree if mz is on RB-tree or 504 * mem is over its softlimit. 505 */ 506 if (excess || mz->on_tree) { 507 unsigned long flags; 508 509 spin_lock_irqsave(&mctz->lock, flags); 510 /* if on-tree, remove it */ 511 if (mz->on_tree) 512 __mem_cgroup_remove_exceeded(mz, mctz); 513 /* 514 * Insert again. mz->usage_in_excess will be updated. 515 * If excess is 0, no tree ops. 
516 */ 517 __mem_cgroup_insert_exceeded(mz, mctz, excess); 518 spin_unlock_irqrestore(&mctz->lock, flags); 519 } 520 } 521 } 522 523 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 524 { 525 struct mem_cgroup_tree_per_node *mctz; 526 struct mem_cgroup_per_node *mz; 527 int nid; 528 529 for_each_node(nid) { 530 mz = memcg->nodeinfo[nid]; 531 mctz = soft_limit_tree.rb_tree_per_node[nid]; 532 if (mctz) 533 mem_cgroup_remove_exceeded(mz, mctz); 534 } 535 } 536 537 static struct mem_cgroup_per_node * 538 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 539 { 540 struct mem_cgroup_per_node *mz; 541 542 retry: 543 mz = NULL; 544 if (!mctz->rb_rightmost) 545 goto done; /* Nothing to reclaim from */ 546 547 mz = rb_entry(mctz->rb_rightmost, 548 struct mem_cgroup_per_node, tree_node); 549 /* 550 * Remove the node now but someone else can add it back, 551 * we will to add it back at the end of reclaim to its correct 552 * position in the tree. 553 */ 554 __mem_cgroup_remove_exceeded(mz, mctz); 555 if (!soft_limit_excess(mz->memcg) || 556 !css_tryget(&mz->memcg->css)) 557 goto retry; 558 done: 559 return mz; 560 } 561 562 static struct mem_cgroup_per_node * 563 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 564 { 565 struct mem_cgroup_per_node *mz; 566 567 spin_lock_irq(&mctz->lock); 568 mz = __mem_cgroup_largest_soft_limit_node(mctz); 569 spin_unlock_irq(&mctz->lock); 570 return mz; 571 } 572 573 /* 574 * memcg and lruvec stats flushing 575 * 576 * Many codepaths leading to stats update or read are performance sensitive and 577 * adding stats flushing in such codepaths is not desirable. So, to optimize the 578 * flushing the kernel does: 579 * 580 * 1) Periodically and asynchronously flush the stats every 2 seconds to not let 581 * rstat update tree grow unbounded. 582 * 583 * 2) Flush the stats synchronously on reader side only when there are more than 584 * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization 585 * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but 586 * only for 2 seconds due to (1). 587 */ 588 static void flush_memcg_stats_dwork(struct work_struct *w); 589 static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); 590 static DEFINE_PER_CPU(unsigned int, stats_updates); 591 static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); 592 static atomic_t stats_flush_threshold = ATOMIC_INIT(0); 593 static u64 flush_next_time; 594 595 #define FLUSH_TIME (2UL*HZ) 596 597 /* 598 * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can 599 * not rely on this as part of an acquired spinlock_t lock. These functions are 600 * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion 601 * is sufficient. 
602 */ 603 static void memcg_stats_lock(void) 604 { 605 preempt_disable_nested(); 606 VM_WARN_ON_IRQS_ENABLED(); 607 } 608 609 static void __memcg_stats_lock(void) 610 { 611 preempt_disable_nested(); 612 } 613 614 static void memcg_stats_unlock(void) 615 { 616 preempt_enable_nested(); 617 } 618 619 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) 620 { 621 unsigned int x; 622 623 if (!val) 624 return; 625 626 cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); 627 628 x = __this_cpu_add_return(stats_updates, abs(val)); 629 if (x > MEMCG_CHARGE_BATCH) { 630 /* 631 * If stats_flush_threshold exceeds the threshold 632 * (>num_online_cpus()), cgroup stats update will be triggered 633 * in __mem_cgroup_flush_stats(). Increasing this var further 634 * is redundant and simply adds overhead in atomic update. 635 */ 636 if (atomic_read(&stats_flush_threshold) <= num_online_cpus()) 637 atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); 638 __this_cpu_write(stats_updates, 0); 639 } 640 } 641 642 static void do_flush_stats(void) 643 { 644 /* 645 * We always flush the entire tree, so concurrent flushers can just 646 * skip. This avoids a thundering herd problem on the rstat global lock 647 * from memcg flushers (e.g. reclaim, refault, etc). 648 */ 649 if (atomic_read(&stats_flush_ongoing) || 650 atomic_xchg(&stats_flush_ongoing, 1)) 651 return; 652 653 WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME); 654 655 cgroup_rstat_flush(root_mem_cgroup->css.cgroup); 656 657 atomic_set(&stats_flush_threshold, 0); 658 atomic_set(&stats_flush_ongoing, 0); 659 } 660 661 void mem_cgroup_flush_stats(void) 662 { 663 if (atomic_read(&stats_flush_threshold) > num_online_cpus()) 664 do_flush_stats(); 665 } 666 667 void mem_cgroup_flush_stats_ratelimited(void) 668 { 669 if (time_after64(jiffies_64, READ_ONCE(flush_next_time))) 670 mem_cgroup_flush_stats(); 671 } 672 673 static void flush_memcg_stats_dwork(struct work_struct *w) 674 { 675 /* 676 * Always flush here so that flushing in latency-sensitive paths is 677 * as cheap as possible. 
678 */ 679 do_flush_stats(); 680 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); 681 } 682 683 /* Subset of vm_event_item to report for memcg event stats */ 684 static const unsigned int memcg_vm_event_stat[] = { 685 PGPGIN, 686 PGPGOUT, 687 PGSCAN_KSWAPD, 688 PGSCAN_DIRECT, 689 PGSCAN_KHUGEPAGED, 690 PGSTEAL_KSWAPD, 691 PGSTEAL_DIRECT, 692 PGSTEAL_KHUGEPAGED, 693 PGFAULT, 694 PGMAJFAULT, 695 PGREFILL, 696 PGACTIVATE, 697 PGDEACTIVATE, 698 PGLAZYFREE, 699 PGLAZYFREED, 700 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 701 ZSWPIN, 702 ZSWPOUT, 703 #endif 704 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 705 THP_FAULT_ALLOC, 706 THP_COLLAPSE_ALLOC, 707 #endif 708 }; 709 710 #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) 711 static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; 712 713 static void init_memcg_events(void) 714 { 715 int i; 716 717 for (i = 0; i < NR_MEMCG_EVENTS; ++i) 718 mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; 719 } 720 721 static inline int memcg_events_index(enum vm_event_item idx) 722 { 723 return mem_cgroup_events_index[idx] - 1; 724 } 725 726 struct memcg_vmstats_percpu { 727 /* Local (CPU and cgroup) page state & events */ 728 long state[MEMCG_NR_STAT]; 729 unsigned long events[NR_MEMCG_EVENTS]; 730 731 /* Delta calculation for lockless upward propagation */ 732 long state_prev[MEMCG_NR_STAT]; 733 unsigned long events_prev[NR_MEMCG_EVENTS]; 734 735 /* Cgroup1: threshold notifications & softlimit tree updates */ 736 unsigned long nr_page_events; 737 unsigned long targets[MEM_CGROUP_NTARGETS]; 738 }; 739 740 struct memcg_vmstats { 741 /* Aggregated (CPU and subtree) page state & events */ 742 long state[MEMCG_NR_STAT]; 743 unsigned long events[NR_MEMCG_EVENTS]; 744 745 /* Pending child counts during tree propagation */ 746 long state_pending[MEMCG_NR_STAT]; 747 unsigned long events_pending[NR_MEMCG_EVENTS]; 748 }; 749 750 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) 751 { 752 long x = READ_ONCE(memcg->vmstats->state[idx]); 753 #ifdef CONFIG_SMP 754 if (x < 0) 755 x = 0; 756 #endif 757 return x; 758 } 759 760 /** 761 * __mod_memcg_state - update cgroup memory statistics 762 * @memcg: the memory cgroup 763 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item 764 * @val: delta to add to the counter, can be negative 765 */ 766 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) 767 { 768 if (mem_cgroup_disabled()) 769 return; 770 771 __this_cpu_add(memcg->vmstats_percpu->state[idx], val); 772 memcg_rstat_updated(memcg, val); 773 } 774 775 /* idx can be of type enum memcg_stat_item or node_stat_item. */ 776 static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) 777 { 778 long x = 0; 779 int cpu; 780 781 for_each_possible_cpu(cpu) 782 x += per_cpu(memcg->vmstats_percpu->state[idx], cpu); 783 #ifdef CONFIG_SMP 784 if (x < 0) 785 x = 0; 786 #endif 787 return x; 788 } 789 790 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, 791 int val) 792 { 793 struct mem_cgroup_per_node *pn; 794 struct mem_cgroup *memcg; 795 796 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 797 memcg = pn->memcg; 798 799 /* 800 * The caller from rmap relay on disabled preemption becase they never 801 * update their counter from in-interrupt context. For these two 802 * counters we check that the update is never performed from an 803 * interrupt context while other caller need to have disabled interrupt. 
804 */ 805 __memcg_stats_lock(); 806 if (IS_ENABLED(CONFIG_DEBUG_VM)) { 807 switch (idx) { 808 case NR_ANON_MAPPED: 809 case NR_FILE_MAPPED: 810 case NR_ANON_THPS: 811 case NR_SHMEM_PMDMAPPED: 812 case NR_FILE_PMDMAPPED: 813 WARN_ON_ONCE(!in_task()); 814 break; 815 default: 816 VM_WARN_ON_IRQS_ENABLED(); 817 } 818 } 819 820 /* Update memcg */ 821 __this_cpu_add(memcg->vmstats_percpu->state[idx], val); 822 823 /* Update lruvec */ 824 __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); 825 826 memcg_rstat_updated(memcg, val); 827 memcg_stats_unlock(); 828 } 829 830 /** 831 * __mod_lruvec_state - update lruvec memory statistics 832 * @lruvec: the lruvec 833 * @idx: the stat item 834 * @val: delta to add to the counter, can be negative 835 * 836 * The lruvec is the intersection of the NUMA node and a cgroup. This 837 * function updates the all three counters that are affected by a 838 * change of state at this level: per-node, per-cgroup, per-lruvec. 839 */ 840 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, 841 int val) 842 { 843 /* Update node */ 844 __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); 845 846 /* Update memcg and lruvec */ 847 if (!mem_cgroup_disabled()) 848 __mod_memcg_lruvec_state(lruvec, idx, val); 849 } 850 851 void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, 852 int val) 853 { 854 struct page *head = compound_head(page); /* rmap on tail pages */ 855 struct mem_cgroup *memcg; 856 pg_data_t *pgdat = page_pgdat(page); 857 struct lruvec *lruvec; 858 859 rcu_read_lock(); 860 memcg = page_memcg(head); 861 /* Untracked pages have no memcg, no lruvec. Update only the node */ 862 if (!memcg) { 863 rcu_read_unlock(); 864 __mod_node_page_state(pgdat, idx, val); 865 return; 866 } 867 868 lruvec = mem_cgroup_lruvec(memcg, pgdat); 869 __mod_lruvec_state(lruvec, idx, val); 870 rcu_read_unlock(); 871 } 872 EXPORT_SYMBOL(__mod_lruvec_page_state); 873 874 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) 875 { 876 pg_data_t *pgdat = page_pgdat(virt_to_page(p)); 877 struct mem_cgroup *memcg; 878 struct lruvec *lruvec; 879 880 rcu_read_lock(); 881 memcg = mem_cgroup_from_slab_obj(p); 882 883 /* 884 * Untracked pages have no memcg, no lruvec. Update only the 885 * node. If we reparent the slab objects to the root memcg, 886 * when we free the slab object, we need to update the per-memcg 887 * vmstats to keep it correct for the root memcg. 
888 */ 889 if (!memcg) { 890 __mod_node_page_state(pgdat, idx, val); 891 } else { 892 lruvec = mem_cgroup_lruvec(memcg, pgdat); 893 __mod_lruvec_state(lruvec, idx, val); 894 } 895 rcu_read_unlock(); 896 } 897 898 /** 899 * __count_memcg_events - account VM events in a cgroup 900 * @memcg: the memory cgroup 901 * @idx: the event item 902 * @count: the number of events that occurred 903 */ 904 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, 905 unsigned long count) 906 { 907 int index = memcg_events_index(idx); 908 909 if (mem_cgroup_disabled() || index < 0) 910 return; 911 912 memcg_stats_lock(); 913 __this_cpu_add(memcg->vmstats_percpu->events[index], count); 914 memcg_rstat_updated(memcg, count); 915 memcg_stats_unlock(); 916 } 917 918 static unsigned long memcg_events(struct mem_cgroup *memcg, int event) 919 { 920 int index = memcg_events_index(event); 921 922 if (index < 0) 923 return 0; 924 return READ_ONCE(memcg->vmstats->events[index]); 925 } 926 927 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) 928 { 929 long x = 0; 930 int cpu; 931 int index = memcg_events_index(event); 932 933 if (index < 0) 934 return 0; 935 936 for_each_possible_cpu(cpu) 937 x += per_cpu(memcg->vmstats_percpu->events[index], cpu); 938 return x; 939 } 940 941 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 942 int nr_pages) 943 { 944 /* pagein of a big page is an event. So, ignore page size */ 945 if (nr_pages > 0) 946 __count_memcg_events(memcg, PGPGIN, 1); 947 else { 948 __count_memcg_events(memcg, PGPGOUT, 1); 949 nr_pages = -nr_pages; /* for event */ 950 } 951 952 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); 953 } 954 955 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 956 enum mem_cgroup_events_target target) 957 { 958 unsigned long val, next; 959 960 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); 961 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); 962 /* from time_after() in jiffies.h */ 963 if ((long)(next - val) < 0) { 964 switch (target) { 965 case MEM_CGROUP_TARGET_THRESH: 966 next = val + THRESHOLDS_EVENTS_TARGET; 967 break; 968 case MEM_CGROUP_TARGET_SOFTLIMIT: 969 next = val + SOFTLIMIT_EVENTS_TARGET; 970 break; 971 default: 972 break; 973 } 974 __this_cpu_write(memcg->vmstats_percpu->targets[target], next); 975 return true; 976 } 977 return false; 978 } 979 980 /* 981 * Check events in order. 982 * 983 */ 984 static void memcg_check_events(struct mem_cgroup *memcg, int nid) 985 { 986 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 987 return; 988 989 /* threshold event is triggered in finer grain than soft limit */ 990 if (unlikely(mem_cgroup_event_ratelimit(memcg, 991 MEM_CGROUP_TARGET_THRESH))) { 992 bool do_softlimit; 993 994 do_softlimit = mem_cgroup_event_ratelimit(memcg, 995 MEM_CGROUP_TARGET_SOFTLIMIT); 996 mem_cgroup_threshold(memcg); 997 if (unlikely(do_softlimit)) 998 mem_cgroup_update_tree(memcg, nid); 999 } 1000 } 1001 1002 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1003 { 1004 /* 1005 * mm_update_next_owner() may clear mm->owner to NULL 1006 * if it races with swapoff, page migration, etc. 1007 * So this can be called with p == NULL. 
1008 */ 1009 if (unlikely(!p)) 1010 return NULL; 1011 1012 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 1013 } 1014 EXPORT_SYMBOL(mem_cgroup_from_task); 1015 1016 static __always_inline struct mem_cgroup *active_memcg(void) 1017 { 1018 if (!in_task()) 1019 return this_cpu_read(int_active_memcg); 1020 else 1021 return current->active_memcg; 1022 } 1023 1024 /** 1025 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. 1026 * @mm: mm from which memcg should be extracted. It can be NULL. 1027 * 1028 * Obtain a reference on mm->memcg and returns it if successful. If mm 1029 * is NULL, then the memcg is chosen as follows: 1030 * 1) The active memcg, if set. 1031 * 2) current->mm->memcg, if available 1032 * 3) root memcg 1033 * If mem_cgroup is disabled, NULL is returned. 1034 */ 1035 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 1036 { 1037 struct mem_cgroup *memcg; 1038 1039 if (mem_cgroup_disabled()) 1040 return NULL; 1041 1042 /* 1043 * Page cache insertions can happen without an 1044 * actual mm context, e.g. during disk probing 1045 * on boot, loopback IO, acct() writes etc. 1046 * 1047 * No need to css_get on root memcg as the reference 1048 * counting is disabled on the root level in the 1049 * cgroup core. See CSS_NO_REF. 1050 */ 1051 if (unlikely(!mm)) { 1052 memcg = active_memcg(); 1053 if (unlikely(memcg)) { 1054 /* remote memcg must hold a ref */ 1055 css_get(&memcg->css); 1056 return memcg; 1057 } 1058 mm = current->mm; 1059 if (unlikely(!mm)) 1060 return root_mem_cgroup; 1061 } 1062 1063 rcu_read_lock(); 1064 do { 1065 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1066 if (unlikely(!memcg)) 1067 memcg = root_mem_cgroup; 1068 } while (!css_tryget(&memcg->css)); 1069 rcu_read_unlock(); 1070 return memcg; 1071 } 1072 EXPORT_SYMBOL(get_mem_cgroup_from_mm); 1073 1074 static __always_inline bool memcg_kmem_bypass(void) 1075 { 1076 /* Allow remote memcg charging from any context. */ 1077 if (unlikely(active_memcg())) 1078 return false; 1079 1080 /* Memcg to charge can't be determined. */ 1081 if (!in_task() || !current->mm || (current->flags & PF_KTHREAD)) 1082 return true; 1083 1084 return false; 1085 } 1086 1087 /** 1088 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1089 * @root: hierarchy root 1090 * @prev: previously returned memcg, NULL on first invocation 1091 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1092 * 1093 * Returns references to children of the hierarchy below @root, or 1094 * @root itself, or %NULL after a full round-trip. 1095 * 1096 * Caller must pass the return value in @prev on subsequent 1097 * invocations for reference counting, or use mem_cgroup_iter_break() 1098 * to cancel a hierarchy walk before the round-trip is complete. 1099 * 1100 * Reclaimers can specify a node in @reclaim to divide up the memcgs 1101 * in the hierarchy among all concurrent reclaimers operating on the 1102 * same node. 
1103 */ 1104 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1105 struct mem_cgroup *prev, 1106 struct mem_cgroup_reclaim_cookie *reclaim) 1107 { 1108 struct mem_cgroup_reclaim_iter *iter; 1109 struct cgroup_subsys_state *css = NULL; 1110 struct mem_cgroup *memcg = NULL; 1111 struct mem_cgroup *pos = NULL; 1112 1113 if (mem_cgroup_disabled()) 1114 return NULL; 1115 1116 if (!root) 1117 root = root_mem_cgroup; 1118 1119 rcu_read_lock(); 1120 1121 if (reclaim) { 1122 struct mem_cgroup_per_node *mz; 1123 1124 mz = root->nodeinfo[reclaim->pgdat->node_id]; 1125 iter = &mz->iter; 1126 1127 /* 1128 * On start, join the current reclaim iteration cycle. 1129 * Exit when a concurrent walker completes it. 1130 */ 1131 if (!prev) 1132 reclaim->generation = iter->generation; 1133 else if (reclaim->generation != iter->generation) 1134 goto out_unlock; 1135 1136 while (1) { 1137 pos = READ_ONCE(iter->position); 1138 if (!pos || css_tryget(&pos->css)) 1139 break; 1140 /* 1141 * css reference reached zero, so iter->position will 1142 * be cleared by ->css_released. However, we should not 1143 * rely on this happening soon, because ->css_released 1144 * is called from a work queue, and by busy-waiting we 1145 * might block it. So we clear iter->position right 1146 * away. 1147 */ 1148 (void)cmpxchg(&iter->position, pos, NULL); 1149 } 1150 } else if (prev) { 1151 pos = prev; 1152 } 1153 1154 if (pos) 1155 css = &pos->css; 1156 1157 for (;;) { 1158 css = css_next_descendant_pre(css, &root->css); 1159 if (!css) { 1160 /* 1161 * Reclaimers share the hierarchy walk, and a 1162 * new one might jump in right at the end of 1163 * the hierarchy - make sure they see at least 1164 * one group and restart from the beginning. 1165 */ 1166 if (!prev) 1167 continue; 1168 break; 1169 } 1170 1171 /* 1172 * Verify the css and acquire a reference. The root 1173 * is provided by the caller, so we know it's alive 1174 * and kicking, and don't take an extra reference. 1175 */ 1176 if (css == &root->css || css_tryget(css)) { 1177 memcg = mem_cgroup_from_css(css); 1178 break; 1179 } 1180 } 1181 1182 if (reclaim) { 1183 /* 1184 * The position could have already been updated by a competing 1185 * thread, so check that the value hasn't changed since we read 1186 * it to avoid reclaiming from the same cgroup twice. 
1187 */ 1188 (void)cmpxchg(&iter->position, pos, memcg); 1189 1190 if (pos) 1191 css_put(&pos->css); 1192 1193 if (!memcg) 1194 iter->generation++; 1195 } 1196 1197 out_unlock: 1198 rcu_read_unlock(); 1199 if (prev && prev != root) 1200 css_put(&prev->css); 1201 1202 return memcg; 1203 } 1204 1205 /** 1206 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1207 * @root: hierarchy root 1208 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1209 */ 1210 void mem_cgroup_iter_break(struct mem_cgroup *root, 1211 struct mem_cgroup *prev) 1212 { 1213 if (!root) 1214 root = root_mem_cgroup; 1215 if (prev && prev != root) 1216 css_put(&prev->css); 1217 } 1218 1219 static void __invalidate_reclaim_iterators(struct mem_cgroup *from, 1220 struct mem_cgroup *dead_memcg) 1221 { 1222 struct mem_cgroup_reclaim_iter *iter; 1223 struct mem_cgroup_per_node *mz; 1224 int nid; 1225 1226 for_each_node(nid) { 1227 mz = from->nodeinfo[nid]; 1228 iter = &mz->iter; 1229 cmpxchg(&iter->position, dead_memcg, NULL); 1230 } 1231 } 1232 1233 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) 1234 { 1235 struct mem_cgroup *memcg = dead_memcg; 1236 struct mem_cgroup *last; 1237 1238 do { 1239 __invalidate_reclaim_iterators(memcg, dead_memcg); 1240 last = memcg; 1241 } while ((memcg = parent_mem_cgroup(memcg))); 1242 1243 /* 1244 * When cgroup1 non-hierarchy mode is used, 1245 * parent_mem_cgroup() does not walk all the way up to the 1246 * cgroup root (root_mem_cgroup). So we have to handle 1247 * dead_memcg from cgroup root separately. 1248 */ 1249 if (!mem_cgroup_is_root(last)) 1250 __invalidate_reclaim_iterators(root_mem_cgroup, 1251 dead_memcg); 1252 } 1253 1254 /** 1255 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy 1256 * @memcg: hierarchy root 1257 * @fn: function to call for each task 1258 * @arg: argument passed to @fn 1259 * 1260 * This function iterates over tasks attached to @memcg or to any of its 1261 * descendants and calls @fn for each task. If @fn returns a non-zero 1262 * value, the function breaks the iteration loop. Otherwise, it will iterate 1263 * over all tasks and return 0. 1264 * 1265 * This function must not be called for the root memory cgroup. 1266 */ 1267 void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, 1268 int (*fn)(struct task_struct *, void *), void *arg) 1269 { 1270 struct mem_cgroup *iter; 1271 int ret = 0; 1272 1273 BUG_ON(mem_cgroup_is_root(memcg)); 1274 1275 for_each_mem_cgroup_tree(iter, memcg) { 1276 struct css_task_iter it; 1277 struct task_struct *task; 1278 1279 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); 1280 while (!ret && (task = css_task_iter_next(&it))) 1281 ret = fn(task, arg); 1282 css_task_iter_end(&it); 1283 if (ret) { 1284 mem_cgroup_iter_break(memcg, iter); 1285 break; 1286 } 1287 } 1288 } 1289 1290 #ifdef CONFIG_DEBUG_VM 1291 void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) 1292 { 1293 struct mem_cgroup *memcg; 1294 1295 if (mem_cgroup_disabled()) 1296 return; 1297 1298 memcg = folio_memcg(folio); 1299 1300 if (!memcg) 1301 VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); 1302 else 1303 VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); 1304 } 1305 #endif 1306 1307 /** 1308 * folio_lruvec_lock - Lock the lruvec for a folio. 1309 * @folio: Pointer to the folio. 
1310 * 1311 * These functions are safe to use under any of the following conditions: 1312 * - folio locked 1313 * - folio_test_lru false 1314 * - folio_memcg_lock() 1315 * - folio frozen (refcount of 0) 1316 * 1317 * Return: The lruvec this folio is on with its lock held. 1318 */ 1319 struct lruvec *folio_lruvec_lock(struct folio *folio) 1320 { 1321 struct lruvec *lruvec = folio_lruvec(folio); 1322 1323 spin_lock(&lruvec->lru_lock); 1324 lruvec_memcg_debug(lruvec, folio); 1325 1326 return lruvec; 1327 } 1328 1329 /** 1330 * folio_lruvec_lock_irq - Lock the lruvec for a folio. 1331 * @folio: Pointer to the folio. 1332 * 1333 * These functions are safe to use under any of the following conditions: 1334 * - folio locked 1335 * - folio_test_lru false 1336 * - folio_memcg_lock() 1337 * - folio frozen (refcount of 0) 1338 * 1339 * Return: The lruvec this folio is on with its lock held and interrupts 1340 * disabled. 1341 */ 1342 struct lruvec *folio_lruvec_lock_irq(struct folio *folio) 1343 { 1344 struct lruvec *lruvec = folio_lruvec(folio); 1345 1346 spin_lock_irq(&lruvec->lru_lock); 1347 lruvec_memcg_debug(lruvec, folio); 1348 1349 return lruvec; 1350 } 1351 1352 /** 1353 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio. 1354 * @folio: Pointer to the folio. 1355 * @flags: Pointer to irqsave flags. 1356 * 1357 * These functions are safe to use under any of the following conditions: 1358 * - folio locked 1359 * - folio_test_lru false 1360 * - folio_memcg_lock() 1361 * - folio frozen (refcount of 0) 1362 * 1363 * Return: The lruvec this folio is on with its lock held and interrupts 1364 * disabled. 1365 */ 1366 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, 1367 unsigned long *flags) 1368 { 1369 struct lruvec *lruvec = folio_lruvec(folio); 1370 1371 spin_lock_irqsave(&lruvec->lru_lock, *flags); 1372 lruvec_memcg_debug(lruvec, folio); 1373 1374 return lruvec; 1375 } 1376 1377 /** 1378 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1379 * @lruvec: mem_cgroup per zone lru vector 1380 * @lru: index of lru list the page is sitting on 1381 * @zid: zone id of the accounted pages 1382 * @nr_pages: positive when adding or negative when removing 1383 * 1384 * This function must be called under lru_lock, just before a page is added 1385 * to or just after a page is removed from an lru list. 1386 */ 1387 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1388 int zid, int nr_pages) 1389 { 1390 struct mem_cgroup_per_node *mz; 1391 unsigned long *lru_size; 1392 long size; 1393 1394 if (mem_cgroup_disabled()) 1395 return; 1396 1397 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 1398 lru_size = &mz->lru_zone_size[zid][lru]; 1399 1400 if (nr_pages < 0) 1401 *lru_size += nr_pages; 1402 1403 size = *lru_size; 1404 if (WARN_ONCE(size < 0, 1405 "%s(%p, %d, %d): lru_size %ld\n", 1406 __func__, lruvec, lru, nr_pages, size)) { 1407 VM_BUG_ON(1); 1408 *lru_size = 0; 1409 } 1410 1411 if (nr_pages > 0) 1412 *lru_size += nr_pages; 1413 } 1414 1415 /** 1416 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1417 * @memcg: the memory cgroup 1418 * 1419 * Returns the maximum amount of memory @mem can be charged with, in 1420 * pages. 
1421 */ 1422 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1423 { 1424 unsigned long margin = 0; 1425 unsigned long count; 1426 unsigned long limit; 1427 1428 count = page_counter_read(&memcg->memory); 1429 limit = READ_ONCE(memcg->memory.max); 1430 if (count < limit) 1431 margin = limit - count; 1432 1433 if (do_memsw_account()) { 1434 count = page_counter_read(&memcg->memsw); 1435 limit = READ_ONCE(memcg->memsw.max); 1436 if (count < limit) 1437 margin = min(margin, limit - count); 1438 else 1439 margin = 0; 1440 } 1441 1442 return margin; 1443 } 1444 1445 /* 1446 * A routine for checking "mem" is under move_account() or not. 1447 * 1448 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1449 * moving cgroups. This is for waiting at high-memory pressure 1450 * caused by "move". 1451 */ 1452 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1453 { 1454 struct mem_cgroup *from; 1455 struct mem_cgroup *to; 1456 bool ret = false; 1457 /* 1458 * Unlike task_move routines, we access mc.to, mc.from not under 1459 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1460 */ 1461 spin_lock(&mc.lock); 1462 from = mc.from; 1463 to = mc.to; 1464 if (!from) 1465 goto unlock; 1466 1467 ret = mem_cgroup_is_descendant(from, memcg) || 1468 mem_cgroup_is_descendant(to, memcg); 1469 unlock: 1470 spin_unlock(&mc.lock); 1471 return ret; 1472 } 1473 1474 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1475 { 1476 if (mc.moving_task && current != mc.moving_task) { 1477 if (mem_cgroup_under_move(memcg)) { 1478 DEFINE_WAIT(wait); 1479 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1480 /* moving charge context might have finished. */ 1481 if (mc.moving_task) 1482 schedule(); 1483 finish_wait(&mc.waitq, &wait); 1484 return true; 1485 } 1486 } 1487 return false; 1488 } 1489 1490 struct memory_stat { 1491 const char *name; 1492 unsigned int idx; 1493 }; 1494 1495 static const struct memory_stat memory_stats[] = { 1496 { "anon", NR_ANON_MAPPED }, 1497 { "file", NR_FILE_PAGES }, 1498 { "kernel", MEMCG_KMEM }, 1499 { "kernel_stack", NR_KERNEL_STACK_KB }, 1500 { "pagetables", NR_PAGETABLE }, 1501 { "sec_pagetables", NR_SECONDARY_PAGETABLE }, 1502 { "percpu", MEMCG_PERCPU_B }, 1503 { "sock", MEMCG_SOCK }, 1504 { "vmalloc", MEMCG_VMALLOC }, 1505 { "shmem", NR_SHMEM }, 1506 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 1507 { "zswap", MEMCG_ZSWAP_B }, 1508 { "zswapped", MEMCG_ZSWAPPED }, 1509 #endif 1510 { "file_mapped", NR_FILE_MAPPED }, 1511 { "file_dirty", NR_FILE_DIRTY }, 1512 { "file_writeback", NR_WRITEBACK }, 1513 #ifdef CONFIG_SWAP 1514 { "swapcached", NR_SWAPCACHE }, 1515 #endif 1516 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1517 { "anon_thp", NR_ANON_THPS }, 1518 { "file_thp", NR_FILE_THPS }, 1519 { "shmem_thp", NR_SHMEM_THPS }, 1520 #endif 1521 { "inactive_anon", NR_INACTIVE_ANON }, 1522 { "active_anon", NR_ACTIVE_ANON }, 1523 { "inactive_file", NR_INACTIVE_FILE }, 1524 { "active_file", NR_ACTIVE_FILE }, 1525 { "unevictable", NR_UNEVICTABLE }, 1526 { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B }, 1527 { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B }, 1528 1529 /* The memory events */ 1530 { "workingset_refault_anon", WORKINGSET_REFAULT_ANON }, 1531 { "workingset_refault_file", WORKINGSET_REFAULT_FILE }, 1532 { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON }, 1533 { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE }, 1534 { "workingset_restore_anon", WORKINGSET_RESTORE_ANON }, 1535 { "workingset_restore_file", 
WORKINGSET_RESTORE_FILE }, 1536 { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, 1537 }; 1538 1539 /* Translate stat items to the correct unit for memory.stat output */ 1540 static int memcg_page_state_unit(int item) 1541 { 1542 switch (item) { 1543 case MEMCG_PERCPU_B: 1544 case MEMCG_ZSWAP_B: 1545 case NR_SLAB_RECLAIMABLE_B: 1546 case NR_SLAB_UNRECLAIMABLE_B: 1547 case WORKINGSET_REFAULT_ANON: 1548 case WORKINGSET_REFAULT_FILE: 1549 case WORKINGSET_ACTIVATE_ANON: 1550 case WORKINGSET_ACTIVATE_FILE: 1551 case WORKINGSET_RESTORE_ANON: 1552 case WORKINGSET_RESTORE_FILE: 1553 case WORKINGSET_NODERECLAIM: 1554 return 1; 1555 case NR_KERNEL_STACK_KB: 1556 return SZ_1K; 1557 default: 1558 return PAGE_SIZE; 1559 } 1560 } 1561 1562 static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, 1563 int item) 1564 { 1565 return memcg_page_state(memcg, item) * memcg_page_state_unit(item); 1566 } 1567 1568 static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 1569 { 1570 int i; 1571 1572 /* 1573 * Provide statistics on the state of the memory subsystem as 1574 * well as cumulative event counters that show past behavior. 1575 * 1576 * This list is ordered following a combination of these gradients: 1577 * 1) generic big picture -> specifics and details 1578 * 2) reflecting userspace activity -> reflecting kernel heuristics 1579 * 1580 * Current memory state: 1581 */ 1582 mem_cgroup_flush_stats(); 1583 1584 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 1585 u64 size; 1586 1587 size = memcg_page_state_output(memcg, memory_stats[i].idx); 1588 seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size); 1589 1590 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { 1591 size += memcg_page_state_output(memcg, 1592 NR_SLAB_RECLAIMABLE_B); 1593 seq_buf_printf(s, "slab %llu\n", size); 1594 } 1595 } 1596 1597 /* Accumulated memory events */ 1598 seq_buf_printf(s, "pgscan %lu\n", 1599 memcg_events(memcg, PGSCAN_KSWAPD) + 1600 memcg_events(memcg, PGSCAN_DIRECT) + 1601 memcg_events(memcg, PGSCAN_KHUGEPAGED)); 1602 seq_buf_printf(s, "pgsteal %lu\n", 1603 memcg_events(memcg, PGSTEAL_KSWAPD) + 1604 memcg_events(memcg, PGSTEAL_DIRECT) + 1605 memcg_events(memcg, PGSTEAL_KHUGEPAGED)); 1606 1607 for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { 1608 if (memcg_vm_event_stat[i] == PGPGIN || 1609 memcg_vm_event_stat[i] == PGPGOUT) 1610 continue; 1611 1612 seq_buf_printf(s, "%s %lu\n", 1613 vm_event_name(memcg_vm_event_stat[i]), 1614 memcg_events(memcg, memcg_vm_event_stat[i])); 1615 } 1616 1617 /* The above should easily fit into one page */ 1618 WARN_ON_ONCE(seq_buf_has_overflowed(s)); 1619 } 1620 1621 static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); 1622 1623 static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 1624 { 1625 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 1626 memcg_stat_format(memcg, s); 1627 else 1628 memcg1_stat_format(memcg, s); 1629 WARN_ON_ONCE(seq_buf_has_overflowed(s)); 1630 } 1631 1632 /** 1633 * mem_cgroup_print_oom_context: Print OOM information relevant to 1634 * memory controller. 
1635 * @memcg: The memory cgroup that went over limit 1636 * @p: Task that is going to be killed 1637 * 1638 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1639 * enabled 1640 */ 1641 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) 1642 { 1643 rcu_read_lock(); 1644 1645 if (memcg) { 1646 pr_cont(",oom_memcg="); 1647 pr_cont_cgroup_path(memcg->css.cgroup); 1648 } else 1649 pr_cont(",global_oom"); 1650 if (p) { 1651 pr_cont(",task_memcg="); 1652 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1653 } 1654 rcu_read_unlock(); 1655 } 1656 1657 /** 1658 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to 1659 * memory controller. 1660 * @memcg: The memory cgroup that went over limit 1661 */ 1662 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) 1663 { 1664 /* Use static buffer, for the caller is holding oom_lock. */ 1665 static char buf[PAGE_SIZE]; 1666 struct seq_buf s; 1667 1668 lockdep_assert_held(&oom_lock); 1669 1670 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1671 K((u64)page_counter_read(&memcg->memory)), 1672 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); 1673 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 1674 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", 1675 K((u64)page_counter_read(&memcg->swap)), 1676 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); 1677 else { 1678 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1679 K((u64)page_counter_read(&memcg->memsw)), 1680 K((u64)memcg->memsw.max), memcg->memsw.failcnt); 1681 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1682 K((u64)page_counter_read(&memcg->kmem)), 1683 K((u64)memcg->kmem.max), memcg->kmem.failcnt); 1684 } 1685 1686 pr_info("Memory cgroup stats for "); 1687 pr_cont_cgroup_path(memcg->css.cgroup); 1688 pr_cont(":"); 1689 seq_buf_init(&s, buf, sizeof(buf)); 1690 memory_stat_format(memcg, &s); 1691 seq_buf_do_printk(&s, KERN_INFO); 1692 } 1693 1694 /* 1695 * Return the memory (and swap, if configured) limit for a memcg. 1696 */ 1697 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) 1698 { 1699 unsigned long max = READ_ONCE(memcg->memory.max); 1700 1701 if (do_memsw_account()) { 1702 if (mem_cgroup_swappiness(memcg)) { 1703 /* Calculate swap excess capacity from memsw limit */ 1704 unsigned long swap = READ_ONCE(memcg->memsw.max) - max; 1705 1706 max += min(swap, (unsigned long)total_swap_pages); 1707 } 1708 } else { 1709 if (mem_cgroup_swappiness(memcg)) 1710 max += min(READ_ONCE(memcg->swap.max), 1711 (unsigned long)total_swap_pages); 1712 } 1713 return max; 1714 } 1715 1716 unsigned long mem_cgroup_size(struct mem_cgroup *memcg) 1717 { 1718 return page_counter_read(&memcg->memory); 1719 } 1720 1721 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1722 int order) 1723 { 1724 struct oom_control oc = { 1725 .zonelist = NULL, 1726 .nodemask = NULL, 1727 .memcg = memcg, 1728 .gfp_mask = gfp_mask, 1729 .order = order, 1730 }; 1731 bool ret = true; 1732 1733 if (mutex_lock_killable(&oom_lock)) 1734 return true; 1735 1736 if (mem_cgroup_margin(memcg) >= (1 << order)) 1737 goto unlock; 1738 1739 /* 1740 * A few threads which were not waiting at mutex_lock_killable() can 1741 * fail to bail out. Therefore, check again after holding oom_lock. 
1742 */ 1743 ret = task_is_dying() || out_of_memory(&oc); 1744 1745 unlock: 1746 mutex_unlock(&oom_lock); 1747 return ret; 1748 } 1749 1750 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1751 pg_data_t *pgdat, 1752 gfp_t gfp_mask, 1753 unsigned long *total_scanned) 1754 { 1755 struct mem_cgroup *victim = NULL; 1756 int total = 0; 1757 int loop = 0; 1758 unsigned long excess; 1759 unsigned long nr_scanned; 1760 struct mem_cgroup_reclaim_cookie reclaim = { 1761 .pgdat = pgdat, 1762 }; 1763 1764 excess = soft_limit_excess(root_memcg); 1765 1766 while (1) { 1767 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1768 if (!victim) { 1769 loop++; 1770 if (loop >= 2) { 1771 /* 1772 * If we have not been able to reclaim 1773 * anything, it might because there are 1774 * no reclaimable pages under this hierarchy 1775 */ 1776 if (!total) 1777 break; 1778 /* 1779 * We want to do more targeted reclaim. 1780 * excess >> 2 is not to excessive so as to 1781 * reclaim too much, nor too less that we keep 1782 * coming back to reclaim from this cgroup 1783 */ 1784 if (total >= (excess >> 2) || 1785 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1786 break; 1787 } 1788 continue; 1789 } 1790 total += mem_cgroup_shrink_node(victim, gfp_mask, false, 1791 pgdat, &nr_scanned); 1792 *total_scanned += nr_scanned; 1793 if (!soft_limit_excess(root_memcg)) 1794 break; 1795 } 1796 mem_cgroup_iter_break(root_memcg, victim); 1797 return total; 1798 } 1799 1800 #ifdef CONFIG_LOCKDEP 1801 static struct lockdep_map memcg_oom_lock_dep_map = { 1802 .name = "memcg_oom_lock", 1803 }; 1804 #endif 1805 1806 static DEFINE_SPINLOCK(memcg_oom_lock); 1807 1808 /* 1809 * Check OOM-Killer is already running under our hierarchy. 1810 * If someone is running, return false. 1811 */ 1812 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1813 { 1814 struct mem_cgroup *iter, *failed = NULL; 1815 1816 spin_lock(&memcg_oom_lock); 1817 1818 for_each_mem_cgroup_tree(iter, memcg) { 1819 if (iter->oom_lock) { 1820 /* 1821 * this subtree of our hierarchy is already locked 1822 * so we cannot give a lock. 1823 */ 1824 failed = iter; 1825 mem_cgroup_iter_break(memcg, iter); 1826 break; 1827 } else 1828 iter->oom_lock = true; 1829 } 1830 1831 if (failed) { 1832 /* 1833 * OK, we failed to lock the whole subtree so we have 1834 * to clean up what we set up to the failing subtree 1835 */ 1836 for_each_mem_cgroup_tree(iter, memcg) { 1837 if (iter == failed) { 1838 mem_cgroup_iter_break(memcg, iter); 1839 break; 1840 } 1841 iter->oom_lock = false; 1842 } 1843 } else 1844 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1845 1846 spin_unlock(&memcg_oom_lock); 1847 1848 return !failed; 1849 } 1850 1851 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1852 { 1853 struct mem_cgroup *iter; 1854 1855 spin_lock(&memcg_oom_lock); 1856 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); 1857 for_each_mem_cgroup_tree(iter, memcg) 1858 iter->oom_lock = false; 1859 spin_unlock(&memcg_oom_lock); 1860 } 1861 1862 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1863 { 1864 struct mem_cgroup *iter; 1865 1866 spin_lock(&memcg_oom_lock); 1867 for_each_mem_cgroup_tree(iter, memcg) 1868 iter->under_oom++; 1869 spin_unlock(&memcg_oom_lock); 1870 } 1871 1872 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1873 { 1874 struct mem_cgroup *iter; 1875 1876 /* 1877 * Be careful about under_oom underflows because a child memcg 1878 * could have been added after mem_cgroup_mark_under_oom. 
1879 */ 1880 spin_lock(&memcg_oom_lock); 1881 for_each_mem_cgroup_tree(iter, memcg) 1882 if (iter->under_oom > 0) 1883 iter->under_oom--; 1884 spin_unlock(&memcg_oom_lock); 1885 } 1886 1887 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1888 1889 struct oom_wait_info { 1890 struct mem_cgroup *memcg; 1891 wait_queue_entry_t wait; 1892 }; 1893 1894 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1895 unsigned mode, int sync, void *arg) 1896 { 1897 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1898 struct mem_cgroup *oom_wait_memcg; 1899 struct oom_wait_info *oom_wait_info; 1900 1901 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1902 oom_wait_memcg = oom_wait_info->memcg; 1903 1904 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1905 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1906 return 0; 1907 return autoremove_wake_function(wait, mode, sync, arg); 1908 } 1909 1910 static void memcg_oom_recover(struct mem_cgroup *memcg) 1911 { 1912 /* 1913 * For the following lockless ->under_oom test, the only required 1914 * guarantee is that it must see the state asserted by an OOM when 1915 * this function is called as a result of userland actions 1916 * triggered by the notification of the OOM. This is trivially 1917 * achieved by invoking mem_cgroup_mark_under_oom() before 1918 * triggering notification. 1919 */ 1920 if (memcg && memcg->under_oom) 1921 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1922 } 1923 1924 /* 1925 * Returns true if successfully killed one or more processes. Though in some 1926 * corner cases it can return true even without killing any process. 1927 */ 1928 static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1929 { 1930 bool locked, ret; 1931 1932 if (order > PAGE_ALLOC_COSTLY_ORDER) 1933 return false; 1934 1935 memcg_memory_event(memcg, MEMCG_OOM); 1936 1937 /* 1938 * We are in the middle of the charge context here, so we 1939 * don't want to block when potentially sitting on a callstack 1940 * that holds all kinds of filesystem and mm locks. 1941 * 1942 * cgroup1 allows disabling the OOM killer and waiting for outside 1943 * handling until the charge can succeed; remember the context and put 1944 * the task to sleep at the end of the page fault when all locks are 1945 * released. 1946 * 1947 * On the other hand, in-kernel OOM killer allows for an async victim 1948 * memory reclaim (oom_reaper) and that means that we are not solely 1949 * relying on the oom victim to make a forward progress and we can 1950 * invoke the oom killer here. 1951 * 1952 * Please note that mem_cgroup_out_of_memory might fail to find a 1953 * victim and then we have to bail out from the charge path. 
1954 */ 1955 if (READ_ONCE(memcg->oom_kill_disable)) { 1956 if (current->in_user_fault) { 1957 css_get(&memcg->css); 1958 current->memcg_in_oom = memcg; 1959 current->memcg_oom_gfp_mask = mask; 1960 current->memcg_oom_order = order; 1961 } 1962 return false; 1963 } 1964 1965 mem_cgroup_mark_under_oom(memcg); 1966 1967 locked = mem_cgroup_oom_trylock(memcg); 1968 1969 if (locked) 1970 mem_cgroup_oom_notify(memcg); 1971 1972 mem_cgroup_unmark_under_oom(memcg); 1973 ret = mem_cgroup_out_of_memory(memcg, mask, order); 1974 1975 if (locked) 1976 mem_cgroup_oom_unlock(memcg); 1977 1978 return ret; 1979 } 1980 1981 /** 1982 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1983 * @handle: actually kill/wait or just clean up the OOM state 1984 * 1985 * This has to be called at the end of a page fault if the memcg OOM 1986 * handler was enabled. 1987 * 1988 * Memcg supports userspace OOM handling where failed allocations must 1989 * sleep on a waitqueue until the userspace task resolves the 1990 * situation. Sleeping directly in the charge context with all kinds 1991 * of locks held is not a good idea, instead we remember an OOM state 1992 * in the task and mem_cgroup_oom_synchronize() has to be called at 1993 * the end of the page fault to complete the OOM handling. 1994 * 1995 * Returns %true if an ongoing memcg OOM situation was detected and 1996 * completed, %false otherwise. 1997 */ 1998 bool mem_cgroup_oom_synchronize(bool handle) 1999 { 2000 struct mem_cgroup *memcg = current->memcg_in_oom; 2001 struct oom_wait_info owait; 2002 bool locked; 2003 2004 /* OOM is global, do not handle */ 2005 if (!memcg) 2006 return false; 2007 2008 if (!handle) 2009 goto cleanup; 2010 2011 owait.memcg = memcg; 2012 owait.wait.flags = 0; 2013 owait.wait.func = memcg_oom_wake_function; 2014 owait.wait.private = current; 2015 INIT_LIST_HEAD(&owait.wait.entry); 2016 2017 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2018 mem_cgroup_mark_under_oom(memcg); 2019 2020 locked = mem_cgroup_oom_trylock(memcg); 2021 2022 if (locked) 2023 mem_cgroup_oom_notify(memcg); 2024 2025 schedule(); 2026 mem_cgroup_unmark_under_oom(memcg); 2027 finish_wait(&memcg_oom_waitq, &owait.wait); 2028 2029 if (locked) 2030 mem_cgroup_oom_unlock(memcg); 2031 cleanup: 2032 current->memcg_in_oom = NULL; 2033 css_put(&memcg->css); 2034 return true; 2035 } 2036 2037 /** 2038 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM 2039 * @victim: task to be killed by the OOM killer 2040 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM 2041 * 2042 * Returns a pointer to a memory cgroup, which has to be cleaned up 2043 * by killing all belonging OOM-killable tasks. 2044 * 2045 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. 2046 */ 2047 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 2048 struct mem_cgroup *oom_domain) 2049 { 2050 struct mem_cgroup *oom_group = NULL; 2051 struct mem_cgroup *memcg; 2052 2053 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2054 return NULL; 2055 2056 if (!oom_domain) 2057 oom_domain = root_mem_cgroup; 2058 2059 rcu_read_lock(); 2060 2061 memcg = mem_cgroup_from_task(victim); 2062 if (mem_cgroup_is_root(memcg)) 2063 goto out; 2064 2065 /* 2066 * If the victim task has been asynchronously moved to a different 2067 * memory cgroup, we might end up killing tasks outside oom_domain. 2068 * In this case it's better to ignore memory.group.oom. 
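 *
 * (The descendant check below catches that race: if the victim's
 * memcg is no longer inside oom_domain, bail out and let only the
 * victim itself be killed.)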
2069 */ 2070 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) 2071 goto out; 2072 2073 /* 2074 * Traverse the memory cgroup hierarchy from the victim task's 2075 * cgroup up to the OOMing cgroup (or root) to find the 2076 * highest-level memory cgroup with oom.group set. 2077 */ 2078 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 2079 if (READ_ONCE(memcg->oom_group)) 2080 oom_group = memcg; 2081 2082 if (memcg == oom_domain) 2083 break; 2084 } 2085 2086 if (oom_group) 2087 css_get(&oom_group->css); 2088 out: 2089 rcu_read_unlock(); 2090 2091 return oom_group; 2092 } 2093 2094 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 2095 { 2096 pr_info("Tasks in "); 2097 pr_cont_cgroup_path(memcg->css.cgroup); 2098 pr_cont(" are going to be killed due to memory.oom.group set\n"); 2099 } 2100 2101 /** 2102 * folio_memcg_lock - Bind a folio to its memcg. 2103 * @folio: The folio. 2104 * 2105 * This function prevents unlocked LRU folios from being moved to 2106 * another cgroup. 2107 * 2108 * It ensures lifetime of the bound memcg. The caller is responsible 2109 * for the lifetime of the folio. 2110 */ 2111 void folio_memcg_lock(struct folio *folio) 2112 { 2113 struct mem_cgroup *memcg; 2114 unsigned long flags; 2115 2116 /* 2117 * The RCU lock is held throughout the transaction. The fast 2118 * path can get away without acquiring the memcg->move_lock 2119 * because page moving starts with an RCU grace period. 2120 */ 2121 rcu_read_lock(); 2122 2123 if (mem_cgroup_disabled()) 2124 return; 2125 again: 2126 memcg = folio_memcg(folio); 2127 if (unlikely(!memcg)) 2128 return; 2129 2130 #ifdef CONFIG_PROVE_LOCKING 2131 local_irq_save(flags); 2132 might_lock(&memcg->move_lock); 2133 local_irq_restore(flags); 2134 #endif 2135 2136 if (atomic_read(&memcg->moving_account) <= 0) 2137 return; 2138 2139 spin_lock_irqsave(&memcg->move_lock, flags); 2140 if (memcg != folio_memcg(folio)) { 2141 spin_unlock_irqrestore(&memcg->move_lock, flags); 2142 goto again; 2143 } 2144 2145 /* 2146 * When charge migration first begins, we can have multiple 2147 * critical sections holding the fast-path RCU lock and one 2148 * holding the slowpath move_lock. Track the task who has the 2149 * move_lock for folio_memcg_unlock(). 2150 */ 2151 memcg->move_lock_task = current; 2152 memcg->move_lock_flags = flags; 2153 } 2154 2155 static void __folio_memcg_unlock(struct mem_cgroup *memcg) 2156 { 2157 if (memcg && memcg->move_lock_task == current) { 2158 unsigned long flags = memcg->move_lock_flags; 2159 2160 memcg->move_lock_task = NULL; 2161 memcg->move_lock_flags = 0; 2162 2163 spin_unlock_irqrestore(&memcg->move_lock, flags); 2164 } 2165 2166 rcu_read_unlock(); 2167 } 2168 2169 /** 2170 * folio_memcg_unlock - Release the binding between a folio and its memcg. 2171 * @folio: The folio. 2172 * 2173 * This releases the binding created by folio_memcg_lock(). This does 2174 * not change the accounting of this folio to its memcg, but it does 2175 * permit others to change it. 
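 *
 * A typical (illustrative) pairing looks like:
 *
 *	folio_memcg_lock(folio);
 *	... update memcg-based state for the folio ...
 *	folio_memcg_unlock(folio);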
2176 */ 2177 void folio_memcg_unlock(struct folio *folio) 2178 { 2179 __folio_memcg_unlock(folio_memcg(folio)); 2180 } 2181 2182 struct memcg_stock_pcp { 2183 local_lock_t stock_lock; 2184 struct mem_cgroup *cached; /* this never be root cgroup */ 2185 unsigned int nr_pages; 2186 2187 #ifdef CONFIG_MEMCG_KMEM 2188 struct obj_cgroup *cached_objcg; 2189 struct pglist_data *cached_pgdat; 2190 unsigned int nr_bytes; 2191 int nr_slab_reclaimable_b; 2192 int nr_slab_unreclaimable_b; 2193 #endif 2194 2195 struct work_struct work; 2196 unsigned long flags; 2197 #define FLUSHING_CACHED_CHARGE 0 2198 }; 2199 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { 2200 .stock_lock = INIT_LOCAL_LOCK(stock_lock), 2201 }; 2202 static DEFINE_MUTEX(percpu_charge_mutex); 2203 2204 #ifdef CONFIG_MEMCG_KMEM 2205 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); 2206 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2207 struct mem_cgroup *root_memcg); 2208 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); 2209 2210 #else 2211 static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) 2212 { 2213 return NULL; 2214 } 2215 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2216 struct mem_cgroup *root_memcg) 2217 { 2218 return false; 2219 } 2220 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2221 { 2222 } 2223 #endif 2224 2225 /** 2226 * consume_stock: Try to consume stocked charge on this cpu. 2227 * @memcg: memcg to consume from. 2228 * @nr_pages: how many pages to charge. 2229 * 2230 * The charges will only happen if @memcg matches the current cpu's memcg 2231 * stock, and at least @nr_pages are available in that stock. Failure to 2232 * service an allocation will refill the stock. 2233 * 2234 * returns true if successful, false otherwise. 2235 */ 2236 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2237 { 2238 struct memcg_stock_pcp *stock; 2239 unsigned long flags; 2240 bool ret = false; 2241 2242 if (nr_pages > MEMCG_CHARGE_BATCH) 2243 return ret; 2244 2245 local_lock_irqsave(&memcg_stock.stock_lock, flags); 2246 2247 stock = this_cpu_ptr(&memcg_stock); 2248 if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) { 2249 stock->nr_pages -= nr_pages; 2250 ret = true; 2251 } 2252 2253 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2254 2255 return ret; 2256 } 2257 2258 /* 2259 * Returns stocks cached in percpu and reset cached information. 2260 */ 2261 static void drain_stock(struct memcg_stock_pcp *stock) 2262 { 2263 struct mem_cgroup *old = READ_ONCE(stock->cached); 2264 2265 if (!old) 2266 return; 2267 2268 if (stock->nr_pages) { 2269 page_counter_uncharge(&old->memory, stock->nr_pages); 2270 if (do_memsw_account()) 2271 page_counter_uncharge(&old->memsw, stock->nr_pages); 2272 stock->nr_pages = 0; 2273 } 2274 2275 css_put(&old->css); 2276 WRITE_ONCE(stock->cached, NULL); 2277 } 2278 2279 static void drain_local_stock(struct work_struct *dummy) 2280 { 2281 struct memcg_stock_pcp *stock; 2282 struct obj_cgroup *old = NULL; 2283 unsigned long flags; 2284 2285 /* 2286 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. 
2287 * drain_stock races is that we always operate on local CPU stock 2288 * here with IRQ disabled 2289 */ 2290 local_lock_irqsave(&memcg_stock.stock_lock, flags); 2291 2292 stock = this_cpu_ptr(&memcg_stock); 2293 old = drain_obj_stock(stock); 2294 drain_stock(stock); 2295 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2296 2297 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2298 if (old) 2299 obj_cgroup_put(old); 2300 } 2301 2302 /* 2303 * Cache charges(val) to local per_cpu area. 2304 * This will be consumed by consume_stock() function, later. 2305 */ 2306 static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2307 { 2308 struct memcg_stock_pcp *stock; 2309 2310 stock = this_cpu_ptr(&memcg_stock); 2311 if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ 2312 drain_stock(stock); 2313 css_get(&memcg->css); 2314 WRITE_ONCE(stock->cached, memcg); 2315 } 2316 stock->nr_pages += nr_pages; 2317 2318 if (stock->nr_pages > MEMCG_CHARGE_BATCH) 2319 drain_stock(stock); 2320 } 2321 2322 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2323 { 2324 unsigned long flags; 2325 2326 local_lock_irqsave(&memcg_stock.stock_lock, flags); 2327 __refill_stock(memcg, nr_pages); 2328 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2329 } 2330 2331 /* 2332 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2333 * of the hierarchy under it. 2334 */ 2335 static void drain_all_stock(struct mem_cgroup *root_memcg) 2336 { 2337 int cpu, curcpu; 2338 2339 /* If someone's already draining, avoid adding running more workers. */ 2340 if (!mutex_trylock(&percpu_charge_mutex)) 2341 return; 2342 /* 2343 * Notify other cpus that system-wide "drain" is running 2344 * We do not care about races with the cpu hotplug because cpu down 2345 * as well as workers from this path always operate on the local 2346 * per-cpu data. CPU up doesn't touch memcg_stock at all. 
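 *
 * migrate_disable() below only pins us to a CPU so that the local
 * stock can be drained synchronously; remote CPUs are drained via
 * schedule_work_on(), skipping isolated CPUs.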
2347 */ 2348 migrate_disable(); 2349 curcpu = smp_processor_id(); 2350 for_each_online_cpu(cpu) { 2351 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2352 struct mem_cgroup *memcg; 2353 bool flush = false; 2354 2355 rcu_read_lock(); 2356 memcg = READ_ONCE(stock->cached); 2357 if (memcg && stock->nr_pages && 2358 mem_cgroup_is_descendant(memcg, root_memcg)) 2359 flush = true; 2360 else if (obj_stock_flush_required(stock, root_memcg)) 2361 flush = true; 2362 rcu_read_unlock(); 2363 2364 if (flush && 2365 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2366 if (cpu == curcpu) 2367 drain_local_stock(&stock->work); 2368 else if (!cpu_is_isolated(cpu)) 2369 schedule_work_on(cpu, &stock->work); 2370 } 2371 } 2372 migrate_enable(); 2373 mutex_unlock(&percpu_charge_mutex); 2374 } 2375 2376 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2377 { 2378 struct memcg_stock_pcp *stock; 2379 2380 stock = &per_cpu(memcg_stock, cpu); 2381 drain_stock(stock); 2382 2383 return 0; 2384 } 2385 2386 static unsigned long reclaim_high(struct mem_cgroup *memcg, 2387 unsigned int nr_pages, 2388 gfp_t gfp_mask) 2389 { 2390 unsigned long nr_reclaimed = 0; 2391 2392 do { 2393 unsigned long pflags; 2394 2395 if (page_counter_read(&memcg->memory) <= 2396 READ_ONCE(memcg->memory.high)) 2397 continue; 2398 2399 memcg_memory_event(memcg, MEMCG_HIGH); 2400 2401 psi_memstall_enter(&pflags); 2402 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, 2403 gfp_mask, 2404 MEMCG_RECLAIM_MAY_SWAP); 2405 psi_memstall_leave(&pflags); 2406 } while ((memcg = parent_mem_cgroup(memcg)) && 2407 !mem_cgroup_is_root(memcg)); 2408 2409 return nr_reclaimed; 2410 } 2411 2412 static void high_work_func(struct work_struct *work) 2413 { 2414 struct mem_cgroup *memcg; 2415 2416 memcg = container_of(work, struct mem_cgroup, high_work); 2417 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); 2418 } 2419 2420 /* 2421 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is 2422 * enough to still cause a significant slowdown in most cases, while still 2423 * allowing diagnostics and tracing to proceed without becoming stuck. 2424 */ 2425 #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) 2426 2427 /* 2428 * When calculating the delay, we use these either side of the exponentiation to 2429 * maintain precision and scale to a reasonable number of jiffies (see the table 2430 * below. 2431 * 2432 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the 2433 * overage ratio to a delay. 2434 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the 2435 * proposed penalty in order to reduce to a reasonable number of jiffies, and 2436 * to produce a reasonable delay curve. 2437 * 2438 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a 2439 * reasonable delay curve compared to precision-adjusted overage, not 2440 * penalising heavily at first, but still making sure that growth beyond the 2441 * limit penalises misbehaviour cgroups by slowing them down exponentially. 
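 * Roughly, calculate_high_delay() below computes
 *
 *	penalty = (overage^2 * HZ) >> (MEMCG_DELAY_PRECISION_SHIFT +
 *				       MEMCG_DELAY_SCALING_SHIFT)
 *
 * where overage is the precision-shifted ratio (usage - high) / high,
 * and the result is then scaled by nr_pages / MEMCG_CHARGE_BATCH and
 * clamped to MEMCG_MAX_HIGH_DELAY_JIFFIES.
 *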
For 2442 * example, with a high of 100 megabytes: 2443 * 2444 * +-------+------------------------+ 2445 * | usage | time to allocate in ms | 2446 * +-------+------------------------+ 2447 * | 100M | 0 | 2448 * | 101M | 6 | 2449 * | 102M | 25 | 2450 * | 103M | 57 | 2451 * | 104M | 102 | 2452 * | 105M | 159 | 2453 * | 106M | 230 | 2454 * | 107M | 313 | 2455 * | 108M | 409 | 2456 * | 109M | 518 | 2457 * | 110M | 639 | 2458 * | 111M | 774 | 2459 * | 112M | 921 | 2460 * | 113M | 1081 | 2461 * | 114M | 1254 | 2462 * | 115M | 1439 | 2463 * | 116M | 1638 | 2464 * | 117M | 1849 | 2465 * | 118M | 2000 | 2466 * | 119M | 2000 | 2467 * | 120M | 2000 | 2468 * +-------+------------------------+ 2469 */ 2470 #define MEMCG_DELAY_PRECISION_SHIFT 20 2471 #define MEMCG_DELAY_SCALING_SHIFT 14 2472 2473 static u64 calculate_overage(unsigned long usage, unsigned long high) 2474 { 2475 u64 overage; 2476 2477 if (usage <= high) 2478 return 0; 2479 2480 /* 2481 * Prevent division by 0 in overage calculation by acting as if 2482 * it was a threshold of 1 page 2483 */ 2484 high = max(high, 1UL); 2485 2486 overage = usage - high; 2487 overage <<= MEMCG_DELAY_PRECISION_SHIFT; 2488 return div64_u64(overage, high); 2489 } 2490 2491 static u64 mem_find_max_overage(struct mem_cgroup *memcg) 2492 { 2493 u64 overage, max_overage = 0; 2494 2495 do { 2496 overage = calculate_overage(page_counter_read(&memcg->memory), 2497 READ_ONCE(memcg->memory.high)); 2498 max_overage = max(overage, max_overage); 2499 } while ((memcg = parent_mem_cgroup(memcg)) && 2500 !mem_cgroup_is_root(memcg)); 2501 2502 return max_overage; 2503 } 2504 2505 static u64 swap_find_max_overage(struct mem_cgroup *memcg) 2506 { 2507 u64 overage, max_overage = 0; 2508 2509 do { 2510 overage = calculate_overage(page_counter_read(&memcg->swap), 2511 READ_ONCE(memcg->swap.high)); 2512 if (overage) 2513 memcg_memory_event(memcg, MEMCG_SWAP_HIGH); 2514 max_overage = max(overage, max_overage); 2515 } while ((memcg = parent_mem_cgroup(memcg)) && 2516 !mem_cgroup_is_root(memcg)); 2517 2518 return max_overage; 2519 } 2520 2521 /* 2522 * Get the number of jiffies that we should penalise a mischievous cgroup which 2523 * is exceeding its memory.high by checking both it and its ancestors. 2524 */ 2525 static unsigned long calculate_high_delay(struct mem_cgroup *memcg, 2526 unsigned int nr_pages, 2527 u64 max_overage) 2528 { 2529 unsigned long penalty_jiffies; 2530 2531 if (!max_overage) 2532 return 0; 2533 2534 /* 2535 * We use overage compared to memory.high to calculate the number of 2536 * jiffies to sleep (penalty_jiffies). Ideally this value should be 2537 * fairly lenient on small overages, and increasingly harsh when the 2538 * memcg in question makes it clear that it has no intention of stopping 2539 * its crazy behaviour, so we exponentially increase the delay based on 2540 * overage amount. 2541 */ 2542 penalty_jiffies = max_overage * max_overage * HZ; 2543 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; 2544 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; 2545 2546 /* 2547 * Factor in the task's own contribution to the overage, such that four 2548 * N-sized allocations are throttled approximately the same as one 2549 * 4N-sized allocation. 2550 * 2551 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 2552 * larger the current charge patch is than that. 
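 *
 * E.g. a task whose batch is a quarter of MEMCG_CHARGE_BATCH pays a
 * quarter of the computed penalty.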
2553 */ 2554 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2555 } 2556 2557 /* 2558 * Scheduled by try_charge() to be executed from the userland return path 2559 * and reclaims memory over the high limit. 2560 */ 2561 void mem_cgroup_handle_over_high(void) 2562 { 2563 unsigned long penalty_jiffies; 2564 unsigned long pflags; 2565 unsigned long nr_reclaimed; 2566 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2567 int nr_retries = MAX_RECLAIM_RETRIES; 2568 struct mem_cgroup *memcg; 2569 bool in_retry = false; 2570 2571 if (likely(!nr_pages)) 2572 return; 2573 2574 memcg = get_mem_cgroup_from_mm(current->mm); 2575 current->memcg_nr_pages_over_high = 0; 2576 2577 retry_reclaim: 2578 /* 2579 * The allocating task should reclaim at least the batch size, but for 2580 * subsequent retries we only want to do what's necessary to prevent oom 2581 * or breaching resource isolation. 2582 * 2583 * This is distinct from memory.max or page allocator behaviour because 2584 * memory.high is currently batched, whereas memory.max and the page 2585 * allocator run every time an allocation is made. 2586 */ 2587 nr_reclaimed = reclaim_high(memcg, 2588 in_retry ? SWAP_CLUSTER_MAX : nr_pages, 2589 GFP_KERNEL); 2590 2591 /* 2592 * memory.high is breached and reclaim is unable to keep up. Throttle 2593 * allocators proactively to slow down excessive growth. 2594 */ 2595 penalty_jiffies = calculate_high_delay(memcg, nr_pages, 2596 mem_find_max_overage(memcg)); 2597 2598 penalty_jiffies += calculate_high_delay(memcg, nr_pages, 2599 swap_find_max_overage(memcg)); 2600 2601 /* 2602 * Clamp the max delay per usermode return so as to still keep the 2603 * application moving forwards and also permit diagnostics, albeit 2604 * extremely slowly. 2605 */ 2606 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 2607 2608 /* 2609 * Don't sleep if the amount of jiffies this memcg owes us is so low 2610 * that it's not even worth doing, in an attempt to be nice to those who 2611 * go only a small amount over their memory.high value and maybe haven't 2612 * been aggressively reclaimed enough yet. 2613 */ 2614 if (penalty_jiffies <= HZ / 100) 2615 goto out; 2616 2617 /* 2618 * If reclaim is making forward progress but we're still over 2619 * memory.high, we want to encourage that rather than doing allocator 2620 * throttling. 2621 */ 2622 if (nr_reclaimed || nr_retries--) { 2623 in_retry = true; 2624 goto retry_reclaim; 2625 } 2626 2627 /* 2628 * If we exit early, we're guaranteed to die (since 2629 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 2630 * need to account for any ill-begotten jiffies to pay them off later. 
2631 */ 2632 psi_memstall_enter(&pflags); 2633 schedule_timeout_killable(penalty_jiffies); 2634 psi_memstall_leave(&pflags); 2635 2636 out: 2637 css_put(&memcg->css); 2638 } 2639 2640 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, 2641 unsigned int nr_pages) 2642 { 2643 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2644 int nr_retries = MAX_RECLAIM_RETRIES; 2645 struct mem_cgroup *mem_over_limit; 2646 struct page_counter *counter; 2647 unsigned long nr_reclaimed; 2648 bool passed_oom = false; 2649 unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; 2650 bool drained = false; 2651 bool raised_max_event = false; 2652 unsigned long pflags; 2653 2654 retry: 2655 if (consume_stock(memcg, nr_pages)) 2656 return 0; 2657 2658 if (!do_memsw_account() || 2659 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2660 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2661 goto done_restock; 2662 if (do_memsw_account()) 2663 page_counter_uncharge(&memcg->memsw, batch); 2664 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2665 } else { 2666 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2667 reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; 2668 } 2669 2670 if (batch > nr_pages) { 2671 batch = nr_pages; 2672 goto retry; 2673 } 2674 2675 /* 2676 * Prevent unbounded recursion when reclaim operations need to 2677 * allocate memory. This might exceed the limits temporarily, 2678 * but we prefer facilitating memory reclaim and getting back 2679 * under the limit over triggering OOM kills in these cases. 2680 */ 2681 if (unlikely(current->flags & PF_MEMALLOC)) 2682 goto force; 2683 2684 if (unlikely(task_in_memcg_oom(current))) 2685 goto nomem; 2686 2687 if (!gfpflags_allow_blocking(gfp_mask)) 2688 goto nomem; 2689 2690 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2691 raised_max_event = true; 2692 2693 psi_memstall_enter(&pflags); 2694 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2695 gfp_mask, reclaim_options); 2696 psi_memstall_leave(&pflags); 2697 2698 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2699 goto retry; 2700 2701 if (!drained) { 2702 drain_all_stock(mem_over_limit); 2703 drained = true; 2704 goto retry; 2705 } 2706 2707 if (gfp_mask & __GFP_NORETRY) 2708 goto nomem; 2709 /* 2710 * Even though the limit is exceeded at this point, reclaim 2711 * may have been able to free some pages. Retry the charge 2712 * before killing the task. 2713 * 2714 * Only for regular pages, though: huge pages are rather 2715 * unlikely to succeed so close to the limit, and we fall back 2716 * to regular pages anyway in case of failure. 2717 */ 2718 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2719 goto retry; 2720 /* 2721 * At task move, charge accounts can be doubly counted. So, it's 2722 * better to wait until the end of task_move if something is going on. 2723 */ 2724 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2725 goto retry; 2726 2727 if (nr_retries--) 2728 goto retry; 2729 2730 if (gfp_mask & __GFP_RETRY_MAYFAIL) 2731 goto nomem; 2732 2733 /* Avoid endless loop for tasks bypassed by the oom killer */ 2734 if (passed_oom && task_is_dying()) 2735 goto nomem; 2736 2737 /* 2738 * keep retrying as long as the memcg oom killer is able to make 2739 * a forward progress or bypass the charge if the oom killer 2740 * couldn't make any progress. 
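 *
 * mem_cgroup_oom() returning true means a victim was killed (or the
 * charging task is itself dying), so memory should become available
 * shortly; reset the retry budget and attempt the charge again.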
2741 */ 2742 if (mem_cgroup_oom(mem_over_limit, gfp_mask, 2743 get_order(nr_pages * PAGE_SIZE))) { 2744 passed_oom = true; 2745 nr_retries = MAX_RECLAIM_RETRIES; 2746 goto retry; 2747 } 2748 nomem: 2749 /* 2750 * Memcg doesn't have a dedicated reserve for atomic 2751 * allocations. But like the global atomic pool, we need to 2752 * put the burden of reclaim on regular allocation requests 2753 * and let these go through as privileged allocations. 2754 */ 2755 if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) 2756 return -ENOMEM; 2757 force: 2758 /* 2759 * If the allocation has to be enforced, don't forget to raise 2760 * a MEMCG_MAX event. 2761 */ 2762 if (!raised_max_event) 2763 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2764 2765 /* 2766 * The allocation either can't fail or will lead to more memory 2767 * being freed very soon. Allow memory usage go over the limit 2768 * temporarily by force charging it. 2769 */ 2770 page_counter_charge(&memcg->memory, nr_pages); 2771 if (do_memsw_account()) 2772 page_counter_charge(&memcg->memsw, nr_pages); 2773 2774 return 0; 2775 2776 done_restock: 2777 if (batch > nr_pages) 2778 refill_stock(memcg, batch - nr_pages); 2779 2780 /* 2781 * If the hierarchy is above the normal consumption range, schedule 2782 * reclaim on returning to userland. We can perform reclaim here 2783 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2784 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2785 * not recorded as it most likely matches current's and won't 2786 * change in the meantime. As high limit is checked again before 2787 * reclaim, the cost of mismatch is negligible. 2788 */ 2789 do { 2790 bool mem_high, swap_high; 2791 2792 mem_high = page_counter_read(&memcg->memory) > 2793 READ_ONCE(memcg->memory.high); 2794 swap_high = page_counter_read(&memcg->swap) > 2795 READ_ONCE(memcg->swap.high); 2796 2797 /* Don't bother a random interrupted task */ 2798 if (!in_task()) { 2799 if (mem_high) { 2800 schedule_work(&memcg->high_work); 2801 break; 2802 } 2803 continue; 2804 } 2805 2806 if (mem_high || swap_high) { 2807 /* 2808 * The allocating tasks in this cgroup will need to do 2809 * reclaim or be throttled to prevent further growth 2810 * of the memory or swap footprints. 2811 * 2812 * Target some best-effort fairness between the tasks, 2813 * and distribute reclaim work and delay penalties 2814 * based on how much each task is actually allocating. 
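 *
 * Hence the over-high debt is accumulated per task in
 * memcg_nr_pages_over_high and paid off on the next return to
 * userspace via set_notify_resume().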
2815 */ 2816 current->memcg_nr_pages_over_high += batch; 2817 set_notify_resume(current); 2818 break; 2819 } 2820 } while ((memcg = parent_mem_cgroup(memcg))); 2821 2822 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && 2823 !(current->flags & PF_MEMALLOC) && 2824 gfpflags_allow_blocking(gfp_mask)) { 2825 mem_cgroup_handle_over_high(); 2826 } 2827 return 0; 2828 } 2829 2830 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2831 unsigned int nr_pages) 2832 { 2833 if (mem_cgroup_is_root(memcg)) 2834 return 0; 2835 2836 return try_charge_memcg(memcg, gfp_mask, nr_pages); 2837 } 2838 2839 static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2840 { 2841 if (mem_cgroup_is_root(memcg)) 2842 return; 2843 2844 page_counter_uncharge(&memcg->memory, nr_pages); 2845 if (do_memsw_account()) 2846 page_counter_uncharge(&memcg->memsw, nr_pages); 2847 } 2848 2849 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) 2850 { 2851 VM_BUG_ON_FOLIO(folio_memcg(folio), folio); 2852 /* 2853 * Any of the following ensures page's memcg stability: 2854 * 2855 * - the page lock 2856 * - LRU isolation 2857 * - folio_memcg_lock() 2858 * - exclusive reference 2859 * - mem_cgroup_trylock_pages() 2860 */ 2861 folio->memcg_data = (unsigned long)memcg; 2862 } 2863 2864 #ifdef CONFIG_MEMCG_KMEM 2865 /* 2866 * The allocated objcg pointers array is not accounted directly. 2867 * Moreover, it should not come from DMA buffer and is not readily 2868 * reclaimable. So those GFP bits should be masked off. 2869 */ 2870 #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) 2871 2872 /* 2873 * mod_objcg_mlstate() may be called with irq enabled, so 2874 * mod_memcg_lruvec_state() should be used. 2875 */ 2876 static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, 2877 struct pglist_data *pgdat, 2878 enum node_stat_item idx, int nr) 2879 { 2880 struct mem_cgroup *memcg; 2881 struct lruvec *lruvec; 2882 2883 rcu_read_lock(); 2884 memcg = obj_cgroup_memcg(objcg); 2885 lruvec = mem_cgroup_lruvec(memcg, pgdat); 2886 mod_memcg_lruvec_state(lruvec, idx, nr); 2887 rcu_read_unlock(); 2888 } 2889 2890 int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, 2891 gfp_t gfp, bool new_slab) 2892 { 2893 unsigned int objects = objs_per_slab(s, slab); 2894 unsigned long memcg_data; 2895 void *vec; 2896 2897 gfp &= ~OBJCGS_CLEAR_MASK; 2898 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 2899 slab_nid(slab)); 2900 if (!vec) 2901 return -ENOMEM; 2902 2903 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; 2904 if (new_slab) { 2905 /* 2906 * If the slab is brand new and nobody can yet access its 2907 * memcg_data, no synchronization is required and memcg_data can 2908 * be simply assigned. 2909 */ 2910 slab->memcg_data = memcg_data; 2911 } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { 2912 /* 2913 * If the slab is already in use, somebody can allocate and 2914 * assign obj_cgroups in parallel. In this case the existing 2915 * objcg vector should be reused. 2916 */ 2917 kfree(vec); 2918 return 0; 2919 } 2920 2921 kmemleak_not_leak(vec); 2922 return 0; 2923 } 2924 2925 static __always_inline 2926 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) 2927 { 2928 /* 2929 * Slab objects are accounted individually, not per-page. 2930 * Memcg membership data for each individual object is saved in 2931 * slab->memcg_data. 
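 * It points to a per-slab array of obj_cgroup pointers, indexed by
 * the object's position within the slab (see obj_to_index() below).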
2932 */ 2933 if (folio_test_slab(folio)) { 2934 struct obj_cgroup **objcgs; 2935 struct slab *slab; 2936 unsigned int off; 2937 2938 slab = folio_slab(folio); 2939 objcgs = slab_objcgs(slab); 2940 if (!objcgs) 2941 return NULL; 2942 2943 off = obj_to_index(slab->slab_cache, slab, p); 2944 if (objcgs[off]) 2945 return obj_cgroup_memcg(objcgs[off]); 2946 2947 return NULL; 2948 } 2949 2950 /* 2951 * folio_memcg_check() is used here, because in theory we can encounter 2952 * a folio where the slab flag has been cleared already, but 2953 * slab->memcg_data has not been freed yet 2954 * folio_memcg_check() will guarantee that a proper memory 2955 * cgroup pointer or NULL will be returned. 2956 */ 2957 return folio_memcg_check(folio); 2958 } 2959 2960 /* 2961 * Returns a pointer to the memory cgroup to which the kernel object is charged. 2962 * 2963 * A passed kernel object can be a slab object, vmalloc object or a generic 2964 * kernel page, so different mechanisms for getting the memory cgroup pointer 2965 * should be used. 2966 * 2967 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller 2968 * can not know for sure how the kernel object is implemented. 2969 * mem_cgroup_from_obj() can be safely used in such cases. 2970 * 2971 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 2972 * cgroup_mutex, etc. 2973 */ 2974 struct mem_cgroup *mem_cgroup_from_obj(void *p) 2975 { 2976 struct folio *folio; 2977 2978 if (mem_cgroup_disabled()) 2979 return NULL; 2980 2981 if (unlikely(is_vmalloc_addr(p))) 2982 folio = page_folio(vmalloc_to_page(p)); 2983 else 2984 folio = virt_to_folio(p); 2985 2986 return mem_cgroup_from_obj_folio(folio, p); 2987 } 2988 2989 /* 2990 * Returns a pointer to the memory cgroup to which the kernel object is charged. 2991 * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects, 2992 * allocated using vmalloc(). 2993 * 2994 * A passed kernel object must be a slab object or a generic kernel page. 2995 * 2996 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 2997 * cgroup_mutex, etc. 
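 *
 * A minimal (illustrative) usage sketch:
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_slab_obj(p);
 *	if (memcg)
 *		... inspect memcg under the RCU read lock ...
 *	rcu_read_unlock();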
2998 */ 2999 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) 3000 { 3001 if (mem_cgroup_disabled()) 3002 return NULL; 3003 3004 return mem_cgroup_from_obj_folio(virt_to_folio(p), p); 3005 } 3006 3007 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) 3008 { 3009 struct obj_cgroup *objcg = NULL; 3010 3011 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { 3012 objcg = rcu_dereference(memcg->objcg); 3013 if (objcg && obj_cgroup_tryget(objcg)) 3014 break; 3015 objcg = NULL; 3016 } 3017 return objcg; 3018 } 3019 3020 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) 3021 { 3022 struct obj_cgroup *objcg = NULL; 3023 struct mem_cgroup *memcg; 3024 3025 if (memcg_kmem_bypass()) 3026 return NULL; 3027 3028 rcu_read_lock(); 3029 if (unlikely(active_memcg())) 3030 memcg = active_memcg(); 3031 else 3032 memcg = mem_cgroup_from_task(current); 3033 objcg = __get_obj_cgroup_from_memcg(memcg); 3034 rcu_read_unlock(); 3035 return objcg; 3036 } 3037 3038 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) 3039 { 3040 struct obj_cgroup *objcg; 3041 3042 if (!memcg_kmem_online()) 3043 return NULL; 3044 3045 if (folio_memcg_kmem(folio)) { 3046 objcg = __folio_objcg(folio); 3047 obj_cgroup_get(objcg); 3048 } else { 3049 struct mem_cgroup *memcg; 3050 3051 rcu_read_lock(); 3052 memcg = __folio_memcg(folio); 3053 if (memcg) 3054 objcg = __get_obj_cgroup_from_memcg(memcg); 3055 else 3056 objcg = NULL; 3057 rcu_read_unlock(); 3058 } 3059 return objcg; 3060 } 3061 3062 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) 3063 { 3064 mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); 3065 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 3066 if (nr_pages > 0) 3067 page_counter_charge(&memcg->kmem, nr_pages); 3068 else 3069 page_counter_uncharge(&memcg->kmem, -nr_pages); 3070 } 3071 } 3072 3073 3074 /* 3075 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg 3076 * @objcg: object cgroup to uncharge 3077 * @nr_pages: number of pages to uncharge 3078 */ 3079 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, 3080 unsigned int nr_pages) 3081 { 3082 struct mem_cgroup *memcg; 3083 3084 memcg = get_mem_cgroup_from_objcg(objcg); 3085 3086 memcg_account_kmem(memcg, -nr_pages); 3087 refill_stock(memcg, nr_pages); 3088 3089 css_put(&memcg->css); 3090 } 3091 3092 /* 3093 * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg 3094 * @objcg: object cgroup to charge 3095 * @gfp: reclaim mode 3096 * @nr_pages: number of pages to charge 3097 * 3098 * Returns 0 on success, an error code on failure. 3099 */ 3100 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, 3101 unsigned int nr_pages) 3102 { 3103 struct mem_cgroup *memcg; 3104 int ret; 3105 3106 memcg = get_mem_cgroup_from_objcg(objcg); 3107 3108 ret = try_charge_memcg(memcg, gfp, nr_pages); 3109 if (ret) 3110 goto out; 3111 3112 memcg_account_kmem(memcg, nr_pages); 3113 out: 3114 css_put(&memcg->css); 3115 3116 return ret; 3117 } 3118 3119 /** 3120 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 3121 * @page: page to charge 3122 * @gfp: reclaim mode 3123 * @order: allocation order 3124 * 3125 * Returns 0 on success, an error code on failure. 
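 *
 * On success, the objcg reference obtained from
 * get_obj_cgroup_from_current() is stashed in page->memcg_data with
 * MEMCG_DATA_KMEM set and is dropped again by
 * __memcg_kmem_uncharge_page().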
3126 */ 3127 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 3128 { 3129 struct obj_cgroup *objcg; 3130 int ret = 0; 3131 3132 objcg = get_obj_cgroup_from_current(); 3133 if (objcg) { 3134 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); 3135 if (!ret) { 3136 page->memcg_data = (unsigned long)objcg | 3137 MEMCG_DATA_KMEM; 3138 return 0; 3139 } 3140 obj_cgroup_put(objcg); 3141 } 3142 return ret; 3143 } 3144 3145 /** 3146 * __memcg_kmem_uncharge_page: uncharge a kmem page 3147 * @page: page to uncharge 3148 * @order: allocation order 3149 */ 3150 void __memcg_kmem_uncharge_page(struct page *page, int order) 3151 { 3152 struct folio *folio = page_folio(page); 3153 struct obj_cgroup *objcg; 3154 unsigned int nr_pages = 1 << order; 3155 3156 if (!folio_memcg_kmem(folio)) 3157 return; 3158 3159 objcg = __folio_objcg(folio); 3160 obj_cgroup_uncharge_pages(objcg, nr_pages); 3161 folio->memcg_data = 0; 3162 obj_cgroup_put(objcg); 3163 } 3164 3165 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, 3166 enum node_stat_item idx, int nr) 3167 { 3168 struct memcg_stock_pcp *stock; 3169 struct obj_cgroup *old = NULL; 3170 unsigned long flags; 3171 int *bytes; 3172 3173 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3174 stock = this_cpu_ptr(&memcg_stock); 3175 3176 /* 3177 * Save vmstat data in stock and skip vmstat array update unless 3178 * accumulating over a page of vmstat data or when pgdat or idx 3179 * changes. 3180 */ 3181 if (READ_ONCE(stock->cached_objcg) != objcg) { 3182 old = drain_obj_stock(stock); 3183 obj_cgroup_get(objcg); 3184 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3185 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3186 WRITE_ONCE(stock->cached_objcg, objcg); 3187 stock->cached_pgdat = pgdat; 3188 } else if (stock->cached_pgdat != pgdat) { 3189 /* Flush the existing cached vmstat data */ 3190 struct pglist_data *oldpg = stock->cached_pgdat; 3191 3192 if (stock->nr_slab_reclaimable_b) { 3193 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, 3194 stock->nr_slab_reclaimable_b); 3195 stock->nr_slab_reclaimable_b = 0; 3196 } 3197 if (stock->nr_slab_unreclaimable_b) { 3198 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, 3199 stock->nr_slab_unreclaimable_b); 3200 stock->nr_slab_unreclaimable_b = 0; 3201 } 3202 stock->cached_pgdat = pgdat; 3203 } 3204 3205 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b 3206 : &stock->nr_slab_unreclaimable_b; 3207 /* 3208 * Even for large object >= PAGE_SIZE, the vmstat data will still be 3209 * cached locally at least once before pushing it out. 
3210 */ 3211 if (!*bytes) { 3212 *bytes = nr; 3213 nr = 0; 3214 } else { 3215 *bytes += nr; 3216 if (abs(*bytes) > PAGE_SIZE) { 3217 nr = *bytes; 3218 *bytes = 0; 3219 } else { 3220 nr = 0; 3221 } 3222 } 3223 if (nr) 3224 mod_objcg_mlstate(objcg, pgdat, idx, nr); 3225 3226 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3227 if (old) 3228 obj_cgroup_put(old); 3229 } 3230 3231 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3232 { 3233 struct memcg_stock_pcp *stock; 3234 unsigned long flags; 3235 bool ret = false; 3236 3237 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3238 3239 stock = this_cpu_ptr(&memcg_stock); 3240 if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { 3241 stock->nr_bytes -= nr_bytes; 3242 ret = true; 3243 } 3244 3245 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3246 3247 return ret; 3248 } 3249 3250 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) 3251 { 3252 struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); 3253 3254 if (!old) 3255 return NULL; 3256 3257 if (stock->nr_bytes) { 3258 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3259 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 3260 3261 if (nr_pages) { 3262 struct mem_cgroup *memcg; 3263 3264 memcg = get_mem_cgroup_from_objcg(old); 3265 3266 memcg_account_kmem(memcg, -nr_pages); 3267 __refill_stock(memcg, nr_pages); 3268 3269 css_put(&memcg->css); 3270 } 3271 3272 /* 3273 * The leftover is flushed to the centralized per-memcg value. 3274 * On the next attempt to refill obj stock it will be moved 3275 * to a per-cpu stock (probably, on an other CPU), see 3276 * refill_obj_stock(). 3277 * 3278 * How often it's flushed is a trade-off between the memory 3279 * limit enforcement accuracy and potential CPU contention, 3280 * so it might be changed in the future. 3281 */ 3282 atomic_add(nr_bytes, &old->nr_charged_bytes); 3283 stock->nr_bytes = 0; 3284 } 3285 3286 /* 3287 * Flush the vmstat data in current stock 3288 */ 3289 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { 3290 if (stock->nr_slab_reclaimable_b) { 3291 mod_objcg_mlstate(old, stock->cached_pgdat, 3292 NR_SLAB_RECLAIMABLE_B, 3293 stock->nr_slab_reclaimable_b); 3294 stock->nr_slab_reclaimable_b = 0; 3295 } 3296 if (stock->nr_slab_unreclaimable_b) { 3297 mod_objcg_mlstate(old, stock->cached_pgdat, 3298 NR_SLAB_UNRECLAIMABLE_B, 3299 stock->nr_slab_unreclaimable_b); 3300 stock->nr_slab_unreclaimable_b = 0; 3301 } 3302 stock->cached_pgdat = NULL; 3303 } 3304 3305 WRITE_ONCE(stock->cached_objcg, NULL); 3306 /* 3307 * The `old' objects needs to be released by the caller via 3308 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
3309 */ 3310 return old; 3311 } 3312 3313 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 3314 struct mem_cgroup *root_memcg) 3315 { 3316 struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); 3317 struct mem_cgroup *memcg; 3318 3319 if (objcg) { 3320 memcg = obj_cgroup_memcg(objcg); 3321 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3322 return true; 3323 } 3324 3325 return false; 3326 } 3327 3328 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 3329 bool allow_uncharge) 3330 { 3331 struct memcg_stock_pcp *stock; 3332 struct obj_cgroup *old = NULL; 3333 unsigned long flags; 3334 unsigned int nr_pages = 0; 3335 3336 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3337 3338 stock = this_cpu_ptr(&memcg_stock); 3339 if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ 3340 old = drain_obj_stock(stock); 3341 obj_cgroup_get(objcg); 3342 WRITE_ONCE(stock->cached_objcg, objcg); 3343 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3344 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3345 allow_uncharge = true; /* Allow uncharge when objcg changes */ 3346 } 3347 stock->nr_bytes += nr_bytes; 3348 3349 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { 3350 nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3351 stock->nr_bytes &= (PAGE_SIZE - 1); 3352 } 3353 3354 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3355 if (old) 3356 obj_cgroup_put(old); 3357 3358 if (nr_pages) 3359 obj_cgroup_uncharge_pages(objcg, nr_pages); 3360 } 3361 3362 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 3363 { 3364 unsigned int nr_pages, nr_bytes; 3365 int ret; 3366 3367 if (consume_obj_stock(objcg, size)) 3368 return 0; 3369 3370 /* 3371 * In theory, objcg->nr_charged_bytes can have enough 3372 * pre-charged bytes to satisfy the allocation. However, 3373 * flushing objcg->nr_charged_bytes requires two atomic 3374 * operations, and objcg->nr_charged_bytes can't be big. 3375 * The shared objcg->nr_charged_bytes can also become a 3376 * performance bottleneck if all tasks of the same memcg are 3377 * trying to update it. So it's better to ignore it and try 3378 * grab some new pages. The stock's nr_bytes will be flushed to 3379 * objcg->nr_charged_bytes later on when objcg changes. 3380 * 3381 * The stock's nr_bytes may contain enough pre-charged bytes 3382 * to allow one less page from being charged, but we can't rely 3383 * on the pre-charged bytes not being changed outside of 3384 * consume_obj_stock() or refill_obj_stock(). So ignore those 3385 * pre-charged bytes as well when charging pages. To avoid a 3386 * page uncharge right after a page charge, we set the 3387 * allow_uncharge flag to false when calling refill_obj_stock() 3388 * to temporarily allow the pre-charged bytes to exceed the page 3389 * size limit. The maximum reachable value of the pre-charged 3390 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data 3391 * race. 3392 */ 3393 nr_pages = size >> PAGE_SHIFT; 3394 nr_bytes = size & (PAGE_SIZE - 1); 3395 3396 if (nr_bytes) 3397 nr_pages += 1; 3398 3399 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); 3400 if (!ret && nr_bytes) 3401 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); 3402 3403 return ret; 3404 } 3405 3406 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 3407 { 3408 refill_obj_stock(objcg, size, true); 3409 } 3410 3411 #endif /* CONFIG_MEMCG_KMEM */ 3412 3413 /* 3414 * Because page_memcg(head) is not set on tails, set it now. 
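 * Each tail page inherits the head's memcg_data, and one extra css or
 * objcg reference per tail is taken to match.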
3415 */ 3416 void split_page_memcg(struct page *head, unsigned int nr) 3417 { 3418 struct folio *folio = page_folio(head); 3419 struct mem_cgroup *memcg = folio_memcg(folio); 3420 int i; 3421 3422 if (mem_cgroup_disabled() || !memcg) 3423 return; 3424 3425 for (i = 1; i < nr; i++) 3426 folio_page(folio, i)->memcg_data = folio->memcg_data; 3427 3428 if (folio_memcg_kmem(folio)) 3429 obj_cgroup_get_many(__folio_objcg(folio), nr - 1); 3430 else 3431 css_get_many(&memcg->css, nr - 1); 3432 } 3433 3434 #ifdef CONFIG_SWAP 3435 /** 3436 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3437 * @entry: swap entry to be moved 3438 * @from: mem_cgroup which the entry is moved from 3439 * @to: mem_cgroup which the entry is moved to 3440 * 3441 * It succeeds only when the swap_cgroup's record for this entry is the same 3442 * as the mem_cgroup's id of @from. 3443 * 3444 * Returns 0 on success, -EINVAL on failure. 3445 * 3446 * The caller must have charged to @to, IOW, called page_counter_charge() about 3447 * both res and memsw, and called css_get(). 3448 */ 3449 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3450 struct mem_cgroup *from, struct mem_cgroup *to) 3451 { 3452 unsigned short old_id, new_id; 3453 3454 old_id = mem_cgroup_id(from); 3455 new_id = mem_cgroup_id(to); 3456 3457 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3458 mod_memcg_state(from, MEMCG_SWAP, -1); 3459 mod_memcg_state(to, MEMCG_SWAP, 1); 3460 return 0; 3461 } 3462 return -EINVAL; 3463 } 3464 #else 3465 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3466 struct mem_cgroup *from, struct mem_cgroup *to) 3467 { 3468 return -EINVAL; 3469 } 3470 #endif 3471 3472 static DEFINE_MUTEX(memcg_max_mutex); 3473 3474 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 3475 unsigned long max, bool memsw) 3476 { 3477 bool enlarge = false; 3478 bool drained = false; 3479 int ret; 3480 bool limits_invariant; 3481 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 3482 3483 do { 3484 if (signal_pending(current)) { 3485 ret = -EINTR; 3486 break; 3487 } 3488 3489 mutex_lock(&memcg_max_mutex); 3490 /* 3491 * Make sure that the new limit (memsw or memory limit) doesn't 3492 * break our basic invariant rule memory.max <= memsw.max. 3493 */ 3494 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 3495 max <= memcg->memsw.max; 3496 if (!limits_invariant) { 3497 mutex_unlock(&memcg_max_mutex); 3498 ret = -EINVAL; 3499 break; 3500 } 3501 if (max > counter->max) 3502 enlarge = true; 3503 ret = page_counter_set_max(counter, max); 3504 mutex_unlock(&memcg_max_mutex); 3505 3506 if (!ret) 3507 break; 3508 3509 if (!drained) { 3510 drain_all_stock(memcg); 3511 drained = true; 3512 continue; 3513 } 3514 3515 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3516 memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { 3517 ret = -EBUSY; 3518 break; 3519 } 3520 } while (true); 3521 3522 if (!ret && enlarge) 3523 memcg_oom_recover(memcg); 3524 3525 return ret; 3526 } 3527 3528 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 3529 gfp_t gfp_mask, 3530 unsigned long *total_scanned) 3531 { 3532 unsigned long nr_reclaimed = 0; 3533 struct mem_cgroup_per_node *mz, *next_mz = NULL; 3534 unsigned long reclaimed; 3535 int loop = 0; 3536 struct mem_cgroup_tree_per_node *mctz; 3537 unsigned long excess; 3538 3539 if (lru_gen_enabled()) 3540 return 0; 3541 3542 if (order > 0) 3543 return 0; 3544 3545 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; 3546 3547 /* 3548 * Do not even bother to check the largest node if the root 3549 * is empty. Do it lockless to prevent lock bouncing. Races 3550 * are acceptable as soft limit is best effort anyway. 3551 */ 3552 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 3553 return 0; 3554 3555 /* 3556 * This loop can run a while, specially if mem_cgroup's continuously 3557 * keep exceeding their soft limit and putting the system under 3558 * pressure 3559 */ 3560 do { 3561 if (next_mz) 3562 mz = next_mz; 3563 else 3564 mz = mem_cgroup_largest_soft_limit_node(mctz); 3565 if (!mz) 3566 break; 3567 3568 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 3569 gfp_mask, total_scanned); 3570 nr_reclaimed += reclaimed; 3571 spin_lock_irq(&mctz->lock); 3572 3573 /* 3574 * If we failed to reclaim anything from this memory cgroup 3575 * it is time to move on to the next cgroup 3576 */ 3577 next_mz = NULL; 3578 if (!reclaimed) 3579 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 3580 3581 excess = soft_limit_excess(mz->memcg); 3582 /* 3583 * One school of thought says that we should not add 3584 * back the node to the tree if reclaim returns 0. 3585 * But our reclaim could return 0, simply because due 3586 * to priority we are exposing a smaller subset of 3587 * memory to reclaim from. Consider this as a longer 3588 * term TODO. 3589 */ 3590 /* If excess == 0, no tree ops */ 3591 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3592 spin_unlock_irq(&mctz->lock); 3593 css_put(&mz->memcg->css); 3594 loop++; 3595 /* 3596 * Could not reclaim anything and there are no more 3597 * mem cgroups to try or we seem to be looping without 3598 * reclaiming anything. 3599 */ 3600 if (!nr_reclaimed && 3601 (next_mz == NULL || 3602 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3603 break; 3604 } while (!nr_reclaimed); 3605 if (next_mz) 3606 css_put(&next_mz->memcg->css); 3607 return nr_reclaimed; 3608 } 3609 3610 /* 3611 * Reclaims as many pages from the given memcg as possible. 3612 * 3613 * Caller is responsible for holding css reference for memcg. 
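 *
 * This backs the cgroup1 memory.force_empty knob: writing to that file
 * runs the reclaim loop below until usage drops to zero, the retries
 * are exhausted or a signal is pending.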
3614 */ 3615 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3616 { 3617 int nr_retries = MAX_RECLAIM_RETRIES; 3618 3619 /* we call try-to-free pages for make this cgroup empty */ 3620 lru_add_drain_all(); 3621 3622 drain_all_stock(memcg); 3623 3624 /* try to free all pages in this cgroup */ 3625 while (nr_retries && page_counter_read(&memcg->memory)) { 3626 if (signal_pending(current)) 3627 return -EINTR; 3628 3629 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3630 MEMCG_RECLAIM_MAY_SWAP)) 3631 nr_retries--; 3632 } 3633 3634 return 0; 3635 } 3636 3637 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3638 char *buf, size_t nbytes, 3639 loff_t off) 3640 { 3641 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3642 3643 if (mem_cgroup_is_root(memcg)) 3644 return -EINVAL; 3645 return mem_cgroup_force_empty(memcg) ?: nbytes; 3646 } 3647 3648 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3649 struct cftype *cft) 3650 { 3651 return 1; 3652 } 3653 3654 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3655 struct cftype *cft, u64 val) 3656 { 3657 if (val == 1) 3658 return 0; 3659 3660 pr_warn_once("Non-hierarchical mode is deprecated. " 3661 "Please report your usecase to linux-mm@kvack.org if you " 3662 "depend on this functionality.\n"); 3663 3664 return -EINVAL; 3665 } 3666 3667 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3668 { 3669 unsigned long val; 3670 3671 if (mem_cgroup_is_root(memcg)) { 3672 /* 3673 * Approximate root's usage from global state. This isn't 3674 * perfect, but the root usage was always an approximation. 3675 */ 3676 val = global_node_page_state(NR_FILE_PAGES) + 3677 global_node_page_state(NR_ANON_MAPPED); 3678 if (swap) 3679 val += total_swap_pages - get_nr_swap_pages(); 3680 } else { 3681 if (!swap) 3682 val = page_counter_read(&memcg->memory); 3683 else 3684 val = page_counter_read(&memcg->memsw); 3685 } 3686 return val; 3687 } 3688 3689 enum { 3690 RES_USAGE, 3691 RES_LIMIT, 3692 RES_MAX_USAGE, 3693 RES_FAILCNT, 3694 RES_SOFT_LIMIT, 3695 }; 3696 3697 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3698 struct cftype *cft) 3699 { 3700 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3701 struct page_counter *counter; 3702 3703 switch (MEMFILE_TYPE(cft->private)) { 3704 case _MEM: 3705 counter = &memcg->memory; 3706 break; 3707 case _MEMSWAP: 3708 counter = &memcg->memsw; 3709 break; 3710 case _KMEM: 3711 counter = &memcg->kmem; 3712 break; 3713 case _TCP: 3714 counter = &memcg->tcpmem; 3715 break; 3716 default: 3717 BUG(); 3718 } 3719 3720 switch (MEMFILE_ATTR(cft->private)) { 3721 case RES_USAGE: 3722 if (counter == &memcg->memory) 3723 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3724 if (counter == &memcg->memsw) 3725 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3726 return (u64)page_counter_read(counter) * PAGE_SIZE; 3727 case RES_LIMIT: 3728 return (u64)counter->max * PAGE_SIZE; 3729 case RES_MAX_USAGE: 3730 return (u64)counter->watermark * PAGE_SIZE; 3731 case RES_FAILCNT: 3732 return counter->failcnt; 3733 case RES_SOFT_LIMIT: 3734 return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE; 3735 default: 3736 BUG(); 3737 } 3738 } 3739 3740 /* 3741 * This function doesn't do anything useful. Its only job is to provide a read 3742 * handler for a file so that cgroup_file_mode() will add read permissions. 
3743 */ 3744 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m, 3745 __always_unused void *v) 3746 { 3747 return -EINVAL; 3748 } 3749 3750 #ifdef CONFIG_MEMCG_KMEM 3751 static int memcg_online_kmem(struct mem_cgroup *memcg) 3752 { 3753 struct obj_cgroup *objcg; 3754 3755 if (mem_cgroup_kmem_disabled()) 3756 return 0; 3757 3758 if (unlikely(mem_cgroup_is_root(memcg))) 3759 return 0; 3760 3761 objcg = obj_cgroup_alloc(); 3762 if (!objcg) 3763 return -ENOMEM; 3764 3765 objcg->memcg = memcg; 3766 rcu_assign_pointer(memcg->objcg, objcg); 3767 3768 static_branch_enable(&memcg_kmem_online_key); 3769 3770 memcg->kmemcg_id = memcg->id.id; 3771 3772 return 0; 3773 } 3774 3775 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3776 { 3777 struct mem_cgroup *parent; 3778 3779 if (mem_cgroup_kmem_disabled()) 3780 return; 3781 3782 if (unlikely(mem_cgroup_is_root(memcg))) 3783 return; 3784 3785 parent = parent_mem_cgroup(memcg); 3786 if (!parent) 3787 parent = root_mem_cgroup; 3788 3789 memcg_reparent_objcgs(memcg, parent); 3790 3791 /* 3792 * After we have finished memcg_reparent_objcgs(), all list_lrus 3793 * corresponding to this cgroup are guaranteed to remain empty. 3794 * The ordering is imposed by list_lru_node->lock taken by 3795 * memcg_reparent_list_lrus(). 3796 */ 3797 memcg_reparent_list_lrus(memcg, parent); 3798 } 3799 #else 3800 static int memcg_online_kmem(struct mem_cgroup *memcg) 3801 { 3802 return 0; 3803 } 3804 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3805 { 3806 } 3807 #endif /* CONFIG_MEMCG_KMEM */ 3808 3809 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3810 { 3811 int ret; 3812 3813 mutex_lock(&memcg_max_mutex); 3814 3815 ret = page_counter_set_max(&memcg->tcpmem, max); 3816 if (ret) 3817 goto out; 3818 3819 if (!memcg->tcpmem_active) { 3820 /* 3821 * The active flag needs to be written after the static_key 3822 * update. This is what guarantees that the socket activation 3823 * function is the last one to run. See mem_cgroup_sk_alloc() 3824 * for details, and note that we don't mark any socket as 3825 * belonging to this memcg until that flag is up. 3826 * 3827 * We need to do this, because static_keys will span multiple 3828 * sites, but we can't control their order. If we mark a socket 3829 * as accounted, but the accounting functions are not patched in 3830 * yet, we'll lose accounting. 3831 * 3832 * We never race with the readers in mem_cgroup_sk_alloc(), 3833 * because when this value change, the code to process it is not 3834 * patched in yet. 3835 */ 3836 static_branch_inc(&memcg_sockets_enabled_key); 3837 memcg->tcpmem_active = true; 3838 } 3839 out: 3840 mutex_unlock(&memcg_max_mutex); 3841 return ret; 3842 } 3843 3844 /* 3845 * The user of this function is... 3846 * RES_LIMIT. 
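 * That is, the cgroup1 limit files (memory.limit_in_bytes,
 * memory.memsw.limit_in_bytes, memory.kmem.tcp.limit_in_bytes) and
 * memory.soft_limit_in_bytes.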
3847 */ 3848 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3849 char *buf, size_t nbytes, loff_t off) 3850 { 3851 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3852 unsigned long nr_pages; 3853 int ret; 3854 3855 buf = strstrip(buf); 3856 ret = page_counter_memparse(buf, "-1", &nr_pages); 3857 if (ret) 3858 return ret; 3859 3860 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3861 case RES_LIMIT: 3862 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3863 ret = -EINVAL; 3864 break; 3865 } 3866 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3867 case _MEM: 3868 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3869 break; 3870 case _MEMSWAP: 3871 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3872 break; 3873 case _TCP: 3874 ret = memcg_update_tcp_max(memcg, nr_pages); 3875 break; 3876 } 3877 break; 3878 case RES_SOFT_LIMIT: 3879 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 3880 ret = -EOPNOTSUPP; 3881 } else { 3882 WRITE_ONCE(memcg->soft_limit, nr_pages); 3883 ret = 0; 3884 } 3885 break; 3886 } 3887 return ret ?: nbytes; 3888 } 3889 3890 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3891 size_t nbytes, loff_t off) 3892 { 3893 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3894 struct page_counter *counter; 3895 3896 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3897 case _MEM: 3898 counter = &memcg->memory; 3899 break; 3900 case _MEMSWAP: 3901 counter = &memcg->memsw; 3902 break; 3903 case _KMEM: 3904 counter = &memcg->kmem; 3905 break; 3906 case _TCP: 3907 counter = &memcg->tcpmem; 3908 break; 3909 default: 3910 BUG(); 3911 } 3912 3913 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3914 case RES_MAX_USAGE: 3915 page_counter_reset_watermark(counter); 3916 break; 3917 case RES_FAILCNT: 3918 counter->failcnt = 0; 3919 break; 3920 default: 3921 BUG(); 3922 } 3923 3924 return nbytes; 3925 } 3926 3927 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3928 struct cftype *cft) 3929 { 3930 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3931 } 3932 3933 #ifdef CONFIG_MMU 3934 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3935 struct cftype *cft, u64 val) 3936 { 3937 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3938 3939 pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " 3940 "Please report your usecase to linux-mm@kvack.org if you " 3941 "depend on this functionality.\n"); 3942 3943 if (val & ~MOVE_MASK) 3944 return -EINVAL; 3945 3946 /* 3947 * No kind of locking is needed in here, because ->can_attach() will 3948 * check this value once in the beginning of the process, and then carry 3949 * on with stale data. This means that changes to this value will only 3950 * affect task migrations starting after the change. 
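 *
 * (Userspace enables this per cgroup1 memcg by writing a MOVE_ANON /
 * MOVE_FILE bitmask to memory.move_charge_at_immigrate before moving
 * a task in; the feature itself is deprecated, see the warning above.)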
3951 */ 3952 memcg->move_charge_at_immigrate = val; 3953 return 0; 3954 } 3955 #else 3956 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3957 struct cftype *cft, u64 val) 3958 { 3959 return -ENOSYS; 3960 } 3961 #endif 3962 3963 #ifdef CONFIG_NUMA 3964 3965 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3966 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3967 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3968 3969 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3970 int nid, unsigned int lru_mask, bool tree) 3971 { 3972 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 3973 unsigned long nr = 0; 3974 enum lru_list lru; 3975 3976 VM_BUG_ON((unsigned)nid >= nr_node_ids); 3977 3978 for_each_lru(lru) { 3979 if (!(BIT(lru) & lru_mask)) 3980 continue; 3981 if (tree) 3982 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 3983 else 3984 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 3985 } 3986 return nr; 3987 } 3988 3989 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 3990 unsigned int lru_mask, 3991 bool tree) 3992 { 3993 unsigned long nr = 0; 3994 enum lru_list lru; 3995 3996 for_each_lru(lru) { 3997 if (!(BIT(lru) & lru_mask)) 3998 continue; 3999 if (tree) 4000 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 4001 else 4002 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 4003 } 4004 return nr; 4005 } 4006 4007 static int memcg_numa_stat_show(struct seq_file *m, void *v) 4008 { 4009 struct numa_stat { 4010 const char *name; 4011 unsigned int lru_mask; 4012 }; 4013 4014 static const struct numa_stat stats[] = { 4015 { "total", LRU_ALL }, 4016 { "file", LRU_ALL_FILE }, 4017 { "anon", LRU_ALL_ANON }, 4018 { "unevictable", BIT(LRU_UNEVICTABLE) }, 4019 }; 4020 const struct numa_stat *stat; 4021 int nid; 4022 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 4023 4024 mem_cgroup_flush_stats(); 4025 4026 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4027 seq_printf(m, "%s=%lu", stat->name, 4028 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4029 false)); 4030 for_each_node_state(nid, N_MEMORY) 4031 seq_printf(m, " N%d=%lu", nid, 4032 mem_cgroup_node_nr_lru_pages(memcg, nid, 4033 stat->lru_mask, false)); 4034 seq_putc(m, '\n'); 4035 } 4036 4037 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4038 4039 seq_printf(m, "hierarchical_%s=%lu", stat->name, 4040 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4041 true)); 4042 for_each_node_state(nid, N_MEMORY) 4043 seq_printf(m, " N%d=%lu", nid, 4044 mem_cgroup_node_nr_lru_pages(memcg, nid, 4045 stat->lru_mask, true)); 4046 seq_putc(m, '\n'); 4047 } 4048 4049 return 0; 4050 } 4051 #endif /* CONFIG_NUMA */ 4052 4053 static const unsigned int memcg1_stats[] = { 4054 NR_FILE_PAGES, 4055 NR_ANON_MAPPED, 4056 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4057 NR_ANON_THPS, 4058 #endif 4059 NR_SHMEM, 4060 NR_FILE_MAPPED, 4061 NR_FILE_DIRTY, 4062 NR_WRITEBACK, 4063 WORKINGSET_REFAULT_ANON, 4064 WORKINGSET_REFAULT_FILE, 4065 MEMCG_SWAP, 4066 }; 4067 4068 static const char *const memcg1_stat_names[] = { 4069 "cache", 4070 "rss", 4071 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4072 "rss_huge", 4073 #endif 4074 "shmem", 4075 "mapped_file", 4076 "dirty", 4077 "writeback", 4078 "workingset_refault_anon", 4079 "workingset_refault_file", 4080 "swap", 4081 }; 4082 4083 /* Universal VM events cgroup1 shows, original sort order */ 4084 static const unsigned int memcg1_events[] = { 4085 PGPGIN, 4086 PGPGOUT, 4087 
PGFAULT, 4088 PGMAJFAULT, 4089 }; 4090 4091 static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 4092 { 4093 unsigned long memory, memsw; 4094 struct mem_cgroup *mi; 4095 unsigned int i; 4096 4097 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 4098 4099 mem_cgroup_flush_stats(); 4100 4101 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4102 unsigned long nr; 4103 4104 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4105 continue; 4106 nr = memcg_page_state_local(memcg, memcg1_stats[i]); 4107 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], 4108 nr * memcg_page_state_unit(memcg1_stats[i])); 4109 } 4110 4111 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4112 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), 4113 memcg_events_local(memcg, memcg1_events[i])); 4114 4115 for (i = 0; i < NR_LRU_LISTS; i++) 4116 seq_buf_printf(s, "%s %lu\n", lru_list_name(i), 4117 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 4118 PAGE_SIZE); 4119 4120 /* Hierarchical information */ 4121 memory = memsw = PAGE_COUNTER_MAX; 4122 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4123 memory = min(memory, READ_ONCE(mi->memory.max)); 4124 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 4125 } 4126 seq_buf_printf(s, "hierarchical_memory_limit %llu\n", 4127 (u64)memory * PAGE_SIZE); 4128 if (do_memsw_account()) 4129 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", 4130 (u64)memsw * PAGE_SIZE); 4131 4132 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4133 unsigned long nr; 4134 4135 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4136 continue; 4137 nr = memcg_page_state(memcg, memcg1_stats[i]); 4138 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], 4139 (u64)nr * memcg_page_state_unit(memcg1_stats[i])); 4140 } 4141 4142 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4143 seq_buf_printf(s, "total_%s %llu\n", 4144 vm_event_name(memcg1_events[i]), 4145 (u64)memcg_events(memcg, memcg1_events[i])); 4146 4147 for (i = 0; i < NR_LRU_LISTS; i++) 4148 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), 4149 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 4150 PAGE_SIZE); 4151 4152 #ifdef CONFIG_DEBUG_VM 4153 { 4154 pg_data_t *pgdat; 4155 struct mem_cgroup_per_node *mz; 4156 unsigned long anon_cost = 0; 4157 unsigned long file_cost = 0; 4158 4159 for_each_online_pgdat(pgdat) { 4160 mz = memcg->nodeinfo[pgdat->node_id]; 4161 4162 anon_cost += mz->lruvec.anon_cost; 4163 file_cost += mz->lruvec.file_cost; 4164 } 4165 seq_buf_printf(s, "anon_cost %lu\n", anon_cost); 4166 seq_buf_printf(s, "file_cost %lu\n", file_cost); 4167 } 4168 #endif 4169 } 4170 4171 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4172 struct cftype *cft) 4173 { 4174 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4175 4176 return mem_cgroup_swappiness(memcg); 4177 } 4178 4179 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4180 struct cftype *cft, u64 val) 4181 { 4182 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4183 4184 if (val > 200) 4185 return -EINVAL; 4186 4187 if (!mem_cgroup_is_root(memcg)) 4188 WRITE_ONCE(memcg->swappiness, val); 4189 else 4190 WRITE_ONCE(vm_swappiness, val); 4191 4192 return 0; 4193 } 4194 4195 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4196 { 4197 struct mem_cgroup_threshold_ary *t; 4198 unsigned long usage; 4199 int i; 4200 4201 rcu_read_lock(); 4202 if (!swap) 4203 t = rcu_dereference(memcg->thresholds.primary); 4204 else 4205 t = 
rcu_dereference(memcg->memsw_thresholds.primary); 4206 4207 if (!t) 4208 goto unlock; 4209 4210 usage = mem_cgroup_usage(memcg, swap); 4211 4212 /* 4213 * current_threshold points to threshold just below or equal to usage. 4214 * If it's not true, a threshold was crossed after last 4215 * call of __mem_cgroup_threshold(). 4216 */ 4217 i = t->current_threshold; 4218 4219 /* 4220 * Iterate backward over array of thresholds starting from 4221 * current_threshold and check if a threshold is crossed. 4222 * If none of thresholds below usage is crossed, we read 4223 * only one element of the array here. 4224 */ 4225 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4226 eventfd_signal(t->entries[i].eventfd, 1); 4227 4228 /* i = current_threshold + 1 */ 4229 i++; 4230 4231 /* 4232 * Iterate forward over array of thresholds starting from 4233 * current_threshold+1 and check if a threshold is crossed. 4234 * If none of thresholds above usage is crossed, we read 4235 * only one element of the array here. 4236 */ 4237 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4238 eventfd_signal(t->entries[i].eventfd, 1); 4239 4240 /* Update current_threshold */ 4241 t->current_threshold = i - 1; 4242 unlock: 4243 rcu_read_unlock(); 4244 } 4245 4246 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4247 { 4248 while (memcg) { 4249 __mem_cgroup_threshold(memcg, false); 4250 if (do_memsw_account()) 4251 __mem_cgroup_threshold(memcg, true); 4252 4253 memcg = parent_mem_cgroup(memcg); 4254 } 4255 } 4256 4257 static int compare_thresholds(const void *a, const void *b) 4258 { 4259 const struct mem_cgroup_threshold *_a = a; 4260 const struct mem_cgroup_threshold *_b = b; 4261 4262 if (_a->threshold > _b->threshold) 4263 return 1; 4264 4265 if (_a->threshold < _b->threshold) 4266 return -1; 4267 4268 return 0; 4269 } 4270 4271 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4272 { 4273 struct mem_cgroup_eventfd_list *ev; 4274 4275 spin_lock(&memcg_oom_lock); 4276 4277 list_for_each_entry(ev, &memcg->oom_notify, list) 4278 eventfd_signal(ev->eventfd, 1); 4279 4280 spin_unlock(&memcg_oom_lock); 4281 return 0; 4282 } 4283 4284 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4285 { 4286 struct mem_cgroup *iter; 4287 4288 for_each_mem_cgroup_tree(iter, memcg) 4289 mem_cgroup_oom_notify_cb(iter); 4290 } 4291 4292 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4293 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4294 { 4295 struct mem_cgroup_thresholds *thresholds; 4296 struct mem_cgroup_threshold_ary *new; 4297 unsigned long threshold; 4298 unsigned long usage; 4299 int i, size, ret; 4300 4301 ret = page_counter_memparse(args, "-1", &threshold); 4302 if (ret) 4303 return ret; 4304 4305 mutex_lock(&memcg->thresholds_lock); 4306 4307 if (type == _MEM) { 4308 thresholds = &memcg->thresholds; 4309 usage = mem_cgroup_usage(memcg, false); 4310 } else if (type == _MEMSWAP) { 4311 thresholds = &memcg->memsw_thresholds; 4312 usage = mem_cgroup_usage(memcg, true); 4313 } else 4314 BUG(); 4315 4316 /* Check if a threshold crossed before adding a new one */ 4317 if (thresholds->primary) 4318 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4319 4320 size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; 4321 4322 /* Allocate memory for new array of thresholds */ 4323 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 4324 if (!new) { 4325 ret = -ENOMEM; 4326 goto unlock; 4327 } 4328 new->size = size; 4329 4330 /* Copy thresholds (if any) to new array */ 4331 if (thresholds->primary) 4332 memcpy(new->entries, thresholds->primary->entries, 4333 flex_array_size(new, entries, size - 1)); 4334 4335 /* Add new threshold */ 4336 new->entries[size - 1].eventfd = eventfd; 4337 new->entries[size - 1].threshold = threshold; 4338 4339 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4340 sort(new->entries, size, sizeof(*new->entries), 4341 compare_thresholds, NULL); 4342 4343 /* Find current threshold */ 4344 new->current_threshold = -1; 4345 for (i = 0; i < size; i++) { 4346 if (new->entries[i].threshold <= usage) { 4347 /* 4348 * new->current_threshold will not be used until 4349 * rcu_assign_pointer(), so it's safe to increment 4350 * it here. 4351 */ 4352 ++new->current_threshold; 4353 } else 4354 break; 4355 } 4356 4357 /* Free old spare buffer and save old primary buffer as spare */ 4358 kfree(thresholds->spare); 4359 thresholds->spare = thresholds->primary; 4360 4361 rcu_assign_pointer(thresholds->primary, new); 4362 4363 /* To be sure that nobody uses thresholds */ 4364 synchronize_rcu(); 4365 4366 unlock: 4367 mutex_unlock(&memcg->thresholds_lock); 4368 4369 return ret; 4370 } 4371 4372 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4373 struct eventfd_ctx *eventfd, const char *args) 4374 { 4375 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4376 } 4377 4378 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4379 struct eventfd_ctx *eventfd, const char *args) 4380 { 4381 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4382 } 4383 4384 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4385 struct eventfd_ctx *eventfd, enum res_type type) 4386 { 4387 struct mem_cgroup_thresholds *thresholds; 4388 struct mem_cgroup_threshold_ary *new; 4389 unsigned long usage; 4390 int i, j, size, entries; 4391 4392 mutex_lock(&memcg->thresholds_lock); 4393 4394 if (type == _MEM) { 4395 thresholds = &memcg->thresholds; 4396 usage = mem_cgroup_usage(memcg, false); 4397 } else if (type == _MEMSWAP) { 4398 thresholds = &memcg->memsw_thresholds; 4399 usage = mem_cgroup_usage(memcg, true); 4400 } else 4401 BUG(); 4402 4403 if (!thresholds->primary) 4404 goto unlock; 4405 4406 /* Check if a threshold crossed before removing */ 4407 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4408 4409 /* Calculate new number of threshold */ 4410 size = entries = 0; 4411 for (i = 0; i < thresholds->primary->size; i++) { 4412 if (thresholds->primary->entries[i].eventfd != eventfd) 4413 size++; 4414 else 4415 entries++; 4416 } 4417 4418 new = thresholds->spare; 4419 4420 /* If no items related to eventfd have been cleared, nothing to do */ 4421 if (!entries) 4422 goto unlock; 4423 4424 /* Set thresholds array to NULL if we don't have thresholds */ 4425 if (!size) { 4426 kfree(new); 4427 new = NULL; 4428 goto swap_buffers; 4429 } 4430 4431 new->size = size; 4432 4433 /* Copy thresholds and find current threshold */ 4434 new->current_threshold = -1; 4435 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4436 if (thresholds->primary->entries[i].eventfd == eventfd) 4437 continue; 4438 4439 new->entries[j] = thresholds->primary->entries[i]; 4440 if 
(new->entries[j].threshold <= usage) { 4441 /* 4442 * new->current_threshold will not be used 4443 * until rcu_assign_pointer(), so it's safe to increment 4444 * it here. 4445 */ 4446 ++new->current_threshold; 4447 } 4448 j++; 4449 } 4450 4451 swap_buffers: 4452 /* Swap primary and spare array */ 4453 thresholds->spare = thresholds->primary; 4454 4455 rcu_assign_pointer(thresholds->primary, new); 4456 4457 /* To be sure that nobody uses thresholds */ 4458 synchronize_rcu(); 4459 4460 /* If all events are unregistered, free the spare array */ 4461 if (!new) { 4462 kfree(thresholds->spare); 4463 thresholds->spare = NULL; 4464 } 4465 unlock: 4466 mutex_unlock(&memcg->thresholds_lock); 4467 } 4468 4469 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4470 struct eventfd_ctx *eventfd) 4471 { 4472 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4473 } 4474 4475 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4476 struct eventfd_ctx *eventfd) 4477 { 4478 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4479 } 4480 4481 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4482 struct eventfd_ctx *eventfd, const char *args) 4483 { 4484 struct mem_cgroup_eventfd_list *event; 4485 4486 event = kmalloc(sizeof(*event), GFP_KERNEL); 4487 if (!event) 4488 return -ENOMEM; 4489 4490 spin_lock(&memcg_oom_lock); 4491 4492 event->eventfd = eventfd; 4493 list_add(&event->list, &memcg->oom_notify); 4494 4495 /* already in OOM ? */ 4496 if (memcg->under_oom) 4497 eventfd_signal(eventfd, 1); 4498 spin_unlock(&memcg_oom_lock); 4499 4500 return 0; 4501 } 4502 4503 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4504 struct eventfd_ctx *eventfd) 4505 { 4506 struct mem_cgroup_eventfd_list *ev, *tmp; 4507 4508 spin_lock(&memcg_oom_lock); 4509 4510 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4511 if (ev->eventfd == eventfd) { 4512 list_del(&ev->list); 4513 kfree(ev); 4514 } 4515 } 4516 4517 spin_unlock(&memcg_oom_lock); 4518 } 4519 4520 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4521 { 4522 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4523 4524 seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable)); 4525 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4526 seq_printf(sf, "oom_kill %lu\n", 4527 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4528 return 0; 4529 } 4530 4531 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4532 struct cftype *cft, u64 val) 4533 { 4534 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4535 4536 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4537 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 4538 return -EINVAL; 4539 4540 WRITE_ONCE(memcg->oom_kill_disable, val); 4541 if (!val) 4542 memcg_oom_recover(memcg); 4543 4544 return 0; 4545 } 4546 4547 #ifdef CONFIG_CGROUP_WRITEBACK 4548 4549 #include <trace/events/writeback.h> 4550 4551 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4552 { 4553 return wb_domain_init(&memcg->cgwb_domain, gfp); 4554 } 4555 4556 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4557 { 4558 wb_domain_exit(&memcg->cgwb_domain); 4559 } 4560 4561 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4562 { 4563 wb_domain_size_changed(&memcg->cgwb_domain); 4564 } 4565 4566 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4567 { 4568 struct 
mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4569 4570 if (!memcg->css.parent) 4571 return NULL; 4572 4573 return &memcg->cgwb_domain; 4574 } 4575 4576 /** 4577 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4578 * @wb: bdi_writeback in question 4579 * @pfilepages: out parameter for number of file pages 4580 * @pheadroom: out parameter for number of allocatable pages according to memcg 4581 * @pdirty: out parameter for number of dirty pages 4582 * @pwriteback: out parameter for number of pages under writeback 4583 * 4584 * Determine the numbers of file, headroom, dirty, and writeback pages in 4585 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4586 * is a bit more involved. 4587 * 4588 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4589 * headroom is calculated as the lowest headroom of itself and the 4590 * ancestors. Note that this doesn't consider the actual amount of 4591 * available memory in the system. The caller should further cap 4592 * *@pheadroom accordingly. 4593 */ 4594 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4595 unsigned long *pheadroom, unsigned long *pdirty, 4596 unsigned long *pwriteback) 4597 { 4598 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4599 struct mem_cgroup *parent; 4600 4601 mem_cgroup_flush_stats(); 4602 4603 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); 4604 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); 4605 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + 4606 memcg_page_state(memcg, NR_ACTIVE_FILE); 4607 4608 *pheadroom = PAGE_COUNTER_MAX; 4609 while ((parent = parent_mem_cgroup(memcg))) { 4610 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 4611 READ_ONCE(memcg->memory.high)); 4612 unsigned long used = page_counter_read(&memcg->memory); 4613 4614 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4615 memcg = parent; 4616 } 4617 } 4618 4619 /* 4620 * Foreign dirty flushing 4621 * 4622 * There's an inherent mismatch between memcg and writeback. The former 4623 * tracks ownership per-page while the latter per-inode. This was a 4624 * deliberate design decision because honoring per-page ownership in the 4625 * writeback path is complicated, may lead to higher CPU and IO overheads 4626 * and deemed unnecessary given that write-sharing an inode across 4627 * different cgroups isn't a common use-case. 4628 * 4629 * Combined with inode majority-writer ownership switching, this works well 4630 * enough in most cases but there are some pathological cases. For 4631 * example, let's say there are two cgroups A and B which keep writing to 4632 * different but confined parts of the same inode. B owns the inode and 4633 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4634 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4635 * triggering background writeback. A will be slowed down without a way to 4636 * make writeback of the dirty pages happen. 4637 * 4638 * Conditions like the above can lead to a cgroup getting repeatedly and 4639 * severely throttled after making some progress after each 4640 * dirty_expire_interval while the underlying IO device is almost 4641 * completely idle. 4642 * 4643 * Solving this problem completely requires matching the ownership tracking 4644 * granularities between memcg and writeback in either direction. 
However, 4645 * the more egregious behaviors can be avoided by simply remembering the 4646 * most recent foreign dirtying events and initiating remote flushes on 4647 * them when local writeback isn't enough to keep the memory clean enough. 4648 * 4649 * The following two functions implement such mechanism. When a foreign 4650 * page - a page whose memcg and writeback ownerships don't match - is 4651 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4652 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4653 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4654 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4655 * foreign bdi_writebacks which haven't expired. Both the numbers of 4656 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4657 * limited to MEMCG_CGWB_FRN_CNT. 4658 * 4659 * The mechanism only remembers IDs and doesn't hold any object references. 4660 * As being wrong occasionally doesn't matter, updates and accesses to the 4661 * records are lockless and racy. 4662 */ 4663 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, 4664 struct bdi_writeback *wb) 4665 { 4666 struct mem_cgroup *memcg = folio_memcg(folio); 4667 struct memcg_cgwb_frn *frn; 4668 u64 now = get_jiffies_64(); 4669 u64 oldest_at = now; 4670 int oldest = -1; 4671 int i; 4672 4673 trace_track_foreign_dirty(folio, wb); 4674 4675 /* 4676 * Pick the slot to use. If there is already a slot for @wb, keep 4677 * using it. If not replace the oldest one which isn't being 4678 * written out. 4679 */ 4680 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4681 frn = &memcg->cgwb_frn[i]; 4682 if (frn->bdi_id == wb->bdi->id && 4683 frn->memcg_id == wb->memcg_css->id) 4684 break; 4685 if (time_before64(frn->at, oldest_at) && 4686 atomic_read(&frn->done.cnt) == 1) { 4687 oldest = i; 4688 oldest_at = frn->at; 4689 } 4690 } 4691 4692 if (i < MEMCG_CGWB_FRN_CNT) { 4693 /* 4694 * Re-using an existing one. Update timestamp lazily to 4695 * avoid making the cacheline hot. We want them to be 4696 * reasonably up-to-date and significantly shorter than 4697 * dirty_expire_interval as that's what expires the record. 4698 * Use the shorter of 1s and dirty_expire_interval / 8. 4699 */ 4700 unsigned long update_intv = 4701 min_t(unsigned long, HZ, 4702 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4703 4704 if (time_before64(frn->at, now - update_intv)) 4705 frn->at = now; 4706 } else if (oldest >= 0) { 4707 /* replace the oldest free one */ 4708 frn = &memcg->cgwb_frn[oldest]; 4709 frn->bdi_id = wb->bdi->id; 4710 frn->memcg_id = wb->memcg_css->id; 4711 frn->at = now; 4712 } 4713 } 4714 4715 /* issue foreign writeback flushes for recorded foreign dirtying events */ 4716 void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4717 { 4718 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4719 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4720 u64 now = jiffies_64; 4721 int i; 4722 4723 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4724 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4725 4726 /* 4727 * If the record is older than dirty_expire_interval, 4728 * writeback on it has already started. No need to kick it 4729 * off again. Also, don't start a new one if there's 4730 * already one in flight. 
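 *
 * (Editorial note, assuming the default vm.dirty_expire_centisecs of
 * 3000:) intv above then corresponds to 30 seconds' worth of jiffies,
 * so a record is flushed only if it was dirtied within the last 30s
 * and no flush for it is already in flight (the done.cnt check just
 * below).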
4731 */ 4732 if (time_after64(frn->at, now - intv) && 4733 atomic_read(&frn->done.cnt) == 1) { 4734 frn->at = 0; 4735 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4736 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 4737 WB_REASON_FOREIGN_FLUSH, 4738 &frn->done); 4739 } 4740 } 4741 } 4742 4743 #else /* CONFIG_CGROUP_WRITEBACK */ 4744 4745 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4746 { 4747 return 0; 4748 } 4749 4750 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4751 { 4752 } 4753 4754 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4755 { 4756 } 4757 4758 #endif /* CONFIG_CGROUP_WRITEBACK */ 4759 4760 /* 4761 * DO NOT USE IN NEW FILES. 4762 * 4763 * "cgroup.event_control" implementation. 4764 * 4765 * This is way over-engineered. It tries to support fully configurable 4766 * events for each user. Such level of flexibility is completely 4767 * unnecessary especially in the light of the planned unified hierarchy. 4768 * 4769 * Please deprecate this and replace with something simpler if at all 4770 * possible. 4771 */ 4772 4773 /* 4774 * Unregister event and free resources. 4775 * 4776 * Gets called from workqueue. 4777 */ 4778 static void memcg_event_remove(struct work_struct *work) 4779 { 4780 struct mem_cgroup_event *event = 4781 container_of(work, struct mem_cgroup_event, remove); 4782 struct mem_cgroup *memcg = event->memcg; 4783 4784 remove_wait_queue(event->wqh, &event->wait); 4785 4786 event->unregister_event(memcg, event->eventfd); 4787 4788 /* Notify userspace the event is going away. */ 4789 eventfd_signal(event->eventfd, 1); 4790 4791 eventfd_ctx_put(event->eventfd); 4792 kfree(event); 4793 css_put(&memcg->css); 4794 } 4795 4796 /* 4797 * Gets called on EPOLLHUP on eventfd when user closes it. 4798 * 4799 * Called with wqh->lock held and interrupts disabled. 4800 */ 4801 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4802 int sync, void *key) 4803 { 4804 struct mem_cgroup_event *event = 4805 container_of(wait, struct mem_cgroup_event, wait); 4806 struct mem_cgroup *memcg = event->memcg; 4807 __poll_t flags = key_to_poll(key); 4808 4809 if (flags & EPOLLHUP) { 4810 /* 4811 * If the event has been detached at cgroup removal, we 4812 * can simply return knowing the other side will cleanup 4813 * for us. 4814 * 4815 * We can't race against event freeing since the other 4816 * side will require wqh->lock via remove_wait_queue(), 4817 * which we hold. 4818 */ 4819 spin_lock(&memcg->event_list_lock); 4820 if (!list_empty(&event->list)) { 4821 list_del_init(&event->list); 4822 /* 4823 * We are in atomic context, but cgroup_event_remove() 4824 * may sleep, so we have to call it in workqueue. 4825 */ 4826 schedule_work(&event->remove); 4827 } 4828 spin_unlock(&memcg->event_list_lock); 4829 } 4830 4831 return 0; 4832 } 4833 4834 static void memcg_event_ptable_queue_proc(struct file *file, 4835 wait_queue_head_t *wqh, poll_table *pt) 4836 { 4837 struct mem_cgroup_event *event = 4838 container_of(pt, struct mem_cgroup_event, pt); 4839 4840 event->wqh = wqh; 4841 add_wait_queue(wqh, &event->wait); 4842 } 4843 4844 /* 4845 * DO NOT USE IN NEW FILES. 4846 * 4847 * Parse input and register new cgroup event handler. 4848 * 4849 * Input must be in format '<event_fd> <control_fd> <args>'. 4850 * Interpretation of args is defined by control file implementation. 
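 *
 * For illustration only (an editorial sketch, not kernel code): a cgroup1
 * user could arm a usage threshold roughly like this, assuming the
 * memory controller is mounted at /sys/fs/cgroup/memory/foo:
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ctl = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
 *		       O_WRONLY);
 *	char line[64];
 *	uint64_t count;
 *
 *	snprintf(line, sizeof(line), "%d %d %llu", efd, cfd, 64ULL << 20);
 *	write(ctl, line, strlen(line));	   // "<event_fd> <control_fd> <args>"
 *	read(efd, &count, sizeof(count));  // blocks until the threshold fires
 *
 * The "<args>" tail is passed unchanged to the register_event() callback
 * chosen below, e.g. mem_cgroup_usage_register_event().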
4851 */ 4852 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4853 char *buf, size_t nbytes, loff_t off) 4854 { 4855 struct cgroup_subsys_state *css = of_css(of); 4856 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4857 struct mem_cgroup_event *event; 4858 struct cgroup_subsys_state *cfile_css; 4859 unsigned int efd, cfd; 4860 struct fd efile; 4861 struct fd cfile; 4862 struct dentry *cdentry; 4863 const char *name; 4864 char *endp; 4865 int ret; 4866 4867 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 4868 return -EOPNOTSUPP; 4869 4870 buf = strstrip(buf); 4871 4872 efd = simple_strtoul(buf, &endp, 10); 4873 if (*endp != ' ') 4874 return -EINVAL; 4875 buf = endp + 1; 4876 4877 cfd = simple_strtoul(buf, &endp, 10); 4878 if ((*endp != ' ') && (*endp != '\0')) 4879 return -EINVAL; 4880 buf = endp + 1; 4881 4882 event = kzalloc(sizeof(*event), GFP_KERNEL); 4883 if (!event) 4884 return -ENOMEM; 4885 4886 event->memcg = memcg; 4887 INIT_LIST_HEAD(&event->list); 4888 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4889 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4890 INIT_WORK(&event->remove, memcg_event_remove); 4891 4892 efile = fdget(efd); 4893 if (!efile.file) { 4894 ret = -EBADF; 4895 goto out_kfree; 4896 } 4897 4898 event->eventfd = eventfd_ctx_fileget(efile.file); 4899 if (IS_ERR(event->eventfd)) { 4900 ret = PTR_ERR(event->eventfd); 4901 goto out_put_efile; 4902 } 4903 4904 cfile = fdget(cfd); 4905 if (!cfile.file) { 4906 ret = -EBADF; 4907 goto out_put_eventfd; 4908 } 4909 4910 /* the process need read permission on control file */ 4911 /* AV: shouldn't we check that it's been opened for read instead? */ 4912 ret = file_permission(cfile.file, MAY_READ); 4913 if (ret < 0) 4914 goto out_put_cfile; 4915 4916 /* 4917 * The control file must be a regular cgroup1 file. As a regular cgroup 4918 * file can't be renamed, it's safe to access its name afterwards. 4919 */ 4920 cdentry = cfile.file->f_path.dentry; 4921 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 4922 ret = -EINVAL; 4923 goto out_put_cfile; 4924 } 4925 4926 /* 4927 * Determine the event callbacks and set them in @event. This used 4928 * to be done via struct cftype but cgroup core no longer knows 4929 * about these events. The following is crude but the whole thing 4930 * is for compatibility anyway. 4931 * 4932 * DO NOT ADD NEW FILES. 4933 */ 4934 name = cdentry->d_name.name; 4935 4936 if (!strcmp(name, "memory.usage_in_bytes")) { 4937 event->register_event = mem_cgroup_usage_register_event; 4938 event->unregister_event = mem_cgroup_usage_unregister_event; 4939 } else if (!strcmp(name, "memory.oom_control")) { 4940 event->register_event = mem_cgroup_oom_register_event; 4941 event->unregister_event = mem_cgroup_oom_unregister_event; 4942 } else if (!strcmp(name, "memory.pressure_level")) { 4943 event->register_event = vmpressure_register_event; 4944 event->unregister_event = vmpressure_unregister_event; 4945 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4946 event->register_event = memsw_cgroup_usage_register_event; 4947 event->unregister_event = memsw_cgroup_usage_unregister_event; 4948 } else { 4949 ret = -EINVAL; 4950 goto out_put_cfile; 4951 } 4952 4953 /* 4954 * Verify @cfile should belong to @css. Also, remaining events are 4955 * automatically removed on cgroup destruction but the removal is 4956 * asynchronous, so take an extra ref on @css. 
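 *
 * (Editorial note: the reference obtained through
 * css_tryget_online_from_dir() below is the one eventually dropped by
 * css_put() in memcg_event_remove(), or on the out_put_css error path.)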
4957 */ 4958 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 4959 &memory_cgrp_subsys); 4960 ret = -EINVAL; 4961 if (IS_ERR(cfile_css)) 4962 goto out_put_cfile; 4963 if (cfile_css != css) { 4964 css_put(cfile_css); 4965 goto out_put_cfile; 4966 } 4967 4968 ret = event->register_event(memcg, event->eventfd, buf); 4969 if (ret) 4970 goto out_put_css; 4971 4972 vfs_poll(efile.file, &event->pt); 4973 4974 spin_lock_irq(&memcg->event_list_lock); 4975 list_add(&event->list, &memcg->event_list); 4976 spin_unlock_irq(&memcg->event_list_lock); 4977 4978 fdput(cfile); 4979 fdput(efile); 4980 4981 return nbytes; 4982 4983 out_put_css: 4984 css_put(css); 4985 out_put_cfile: 4986 fdput(cfile); 4987 out_put_eventfd: 4988 eventfd_ctx_put(event->eventfd); 4989 out_put_efile: 4990 fdput(efile); 4991 out_kfree: 4992 kfree(event); 4993 4994 return ret; 4995 } 4996 4997 #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 4998 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 4999 { 5000 /* 5001 * Deprecated. 5002 * Please, take a look at tools/cgroup/memcg_slabinfo.py . 5003 */ 5004 return 0; 5005 } 5006 #endif 5007 5008 static int memory_stat_show(struct seq_file *m, void *v); 5009 5010 static struct cftype mem_cgroup_legacy_files[] = { 5011 { 5012 .name = "usage_in_bytes", 5013 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5014 .read_u64 = mem_cgroup_read_u64, 5015 }, 5016 { 5017 .name = "max_usage_in_bytes", 5018 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5019 .write = mem_cgroup_reset, 5020 .read_u64 = mem_cgroup_read_u64, 5021 }, 5022 { 5023 .name = "limit_in_bytes", 5024 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5025 .write = mem_cgroup_write, 5026 .read_u64 = mem_cgroup_read_u64, 5027 }, 5028 { 5029 .name = "soft_limit_in_bytes", 5030 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5031 .write = mem_cgroup_write, 5032 .read_u64 = mem_cgroup_read_u64, 5033 }, 5034 { 5035 .name = "failcnt", 5036 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5037 .write = mem_cgroup_reset, 5038 .read_u64 = mem_cgroup_read_u64, 5039 }, 5040 { 5041 .name = "stat", 5042 .seq_show = memory_stat_show, 5043 }, 5044 { 5045 .name = "force_empty", 5046 .write = mem_cgroup_force_empty_write, 5047 }, 5048 { 5049 .name = "use_hierarchy", 5050 .write_u64 = mem_cgroup_hierarchy_write, 5051 .read_u64 = mem_cgroup_hierarchy_read, 5052 }, 5053 { 5054 .name = "cgroup.event_control", /* XXX: for compat */ 5055 .write = memcg_write_event_control, 5056 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 5057 }, 5058 { 5059 .name = "swappiness", 5060 .read_u64 = mem_cgroup_swappiness_read, 5061 .write_u64 = mem_cgroup_swappiness_write, 5062 }, 5063 { 5064 .name = "move_charge_at_immigrate", 5065 .read_u64 = mem_cgroup_move_charge_read, 5066 .write_u64 = mem_cgroup_move_charge_write, 5067 }, 5068 { 5069 .name = "oom_control", 5070 .seq_show = mem_cgroup_oom_control_read, 5071 .write_u64 = mem_cgroup_oom_control_write, 5072 }, 5073 { 5074 .name = "pressure_level", 5075 .seq_show = mem_cgroup_dummy_seq_show, 5076 }, 5077 #ifdef CONFIG_NUMA 5078 { 5079 .name = "numa_stat", 5080 .seq_show = memcg_numa_stat_show, 5081 }, 5082 #endif 5083 { 5084 .name = "kmem.usage_in_bytes", 5085 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5086 .read_u64 = mem_cgroup_read_u64, 5087 }, 5088 { 5089 .name = "kmem.failcnt", 5090 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5091 .write = mem_cgroup_reset, 5092 .read_u64 = mem_cgroup_read_u64, 5093 }, 5094 { 5095 .name = "kmem.max_usage_in_bytes", 5096 
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5097 .write = mem_cgroup_reset, 5098 .read_u64 = mem_cgroup_read_u64, 5099 }, 5100 #if defined(CONFIG_MEMCG_KMEM) && \ 5101 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 5102 { 5103 .name = "kmem.slabinfo", 5104 .seq_show = mem_cgroup_slab_show, 5105 }, 5106 #endif 5107 { 5108 .name = "kmem.tcp.limit_in_bytes", 5109 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 5110 .write = mem_cgroup_write, 5111 .read_u64 = mem_cgroup_read_u64, 5112 }, 5113 { 5114 .name = "kmem.tcp.usage_in_bytes", 5115 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 5116 .read_u64 = mem_cgroup_read_u64, 5117 }, 5118 { 5119 .name = "kmem.tcp.failcnt", 5120 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 5121 .write = mem_cgroup_reset, 5122 .read_u64 = mem_cgroup_read_u64, 5123 }, 5124 { 5125 .name = "kmem.tcp.max_usage_in_bytes", 5126 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 5127 .write = mem_cgroup_reset, 5128 .read_u64 = mem_cgroup_read_u64, 5129 }, 5130 { }, /* terminate */ 5131 }; 5132 5133 /* 5134 * Private memory cgroup IDR 5135 * 5136 * Swap-out records and page cache shadow entries need to store memcg 5137 * references in constrained space, so we maintain an ID space that is 5138 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 5139 * memory-controlled cgroups to 64k. 5140 * 5141 * However, there usually are many references to the offline CSS after 5142 * the cgroup has been destroyed, such as page cache or reclaimable 5143 * slab objects, that don't need to hang on to the ID. We want to keep 5144 * those dead CSS from occupying IDs, or we might quickly exhaust the 5145 * relatively small ID space and prevent the creation of new cgroups 5146 * even when there are much fewer than 64k cgroups - possibly none. 5147 * 5148 * Maintain a private 16-bit ID space for memcg, and allow the ID to 5149 * be freed and recycled when it's no longer needed, which is usually 5150 * when the CSS is offlined. 5151 * 5152 * The only exception to that are records of swapped out tmpfs/shmem 5153 * pages that need to be attributed to live ancestors on swapin. But 5154 * those references are manageable from userspace. 5155 */ 5156 5157 #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) 5158 static DEFINE_IDR(mem_cgroup_idr); 5159 5160 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 5161 { 5162 if (memcg->id.id > 0) { 5163 idr_remove(&mem_cgroup_idr, memcg->id.id); 5164 memcg->id.id = 0; 5165 } 5166 } 5167 5168 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 5169 unsigned int n) 5170 { 5171 refcount_add(n, &memcg->id.ref); 5172 } 5173 5174 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 5175 { 5176 if (refcount_sub_and_test(n, &memcg->id.ref)) { 5177 mem_cgroup_id_remove(memcg); 5178 5179 /* Memcg ID pins CSS */ 5180 css_put(&memcg->css); 5181 } 5182 } 5183 5184 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 5185 { 5186 mem_cgroup_id_put_many(memcg, 1); 5187 } 5188 5189 /** 5190 * mem_cgroup_from_id - look up a memcg from a memcg id 5191 * @id: the memcg id to look up 5192 * 5193 * Caller must hold rcu_read_lock(). 
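 *
 * A minimal usage sketch (editorial, not lifted from any particular
 * caller; assumes the caller wants to pin the memcg beyond the RCU
 * section):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget_online(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();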
5194 */ 5195 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 5196 { 5197 WARN_ON_ONCE(!rcu_read_lock_held()); 5198 return idr_find(&mem_cgroup_idr, id); 5199 } 5200 5201 #ifdef CONFIG_SHRINKER_DEBUG 5202 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) 5203 { 5204 struct cgroup *cgrp; 5205 struct cgroup_subsys_state *css; 5206 struct mem_cgroup *memcg; 5207 5208 cgrp = cgroup_get_from_id(ino); 5209 if (IS_ERR(cgrp)) 5210 return ERR_CAST(cgrp); 5211 5212 css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); 5213 if (css) 5214 memcg = container_of(css, struct mem_cgroup, css); 5215 else 5216 memcg = ERR_PTR(-ENOENT); 5217 5218 cgroup_put(cgrp); 5219 5220 return memcg; 5221 } 5222 #endif 5223 5224 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5225 { 5226 struct mem_cgroup_per_node *pn; 5227 5228 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); 5229 if (!pn) 5230 return 1; 5231 5232 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, 5233 GFP_KERNEL_ACCOUNT); 5234 if (!pn->lruvec_stats_percpu) { 5235 kfree(pn); 5236 return 1; 5237 } 5238 5239 lruvec_init(&pn->lruvec); 5240 pn->memcg = memcg; 5241 5242 memcg->nodeinfo[node] = pn; 5243 return 0; 5244 } 5245 5246 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5247 { 5248 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 5249 5250 if (!pn) 5251 return; 5252 5253 free_percpu(pn->lruvec_stats_percpu); 5254 kfree(pn); 5255 } 5256 5257 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5258 { 5259 int node; 5260 5261 for_each_node(node) 5262 free_mem_cgroup_per_node_info(memcg, node); 5263 kfree(memcg->vmstats); 5264 free_percpu(memcg->vmstats_percpu); 5265 kfree(memcg); 5266 } 5267 5268 static void mem_cgroup_free(struct mem_cgroup *memcg) 5269 { 5270 lru_gen_exit_memcg(memcg); 5271 memcg_wb_domain_exit(memcg); 5272 __mem_cgroup_free(memcg); 5273 } 5274 5275 static struct mem_cgroup *mem_cgroup_alloc(void) 5276 { 5277 struct mem_cgroup *memcg; 5278 int node; 5279 int __maybe_unused i; 5280 long error = -ENOMEM; 5281 5282 memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); 5283 if (!memcg) 5284 return ERR_PTR(error); 5285 5286 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 5287 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); 5288 if (memcg->id.id < 0) { 5289 error = memcg->id.id; 5290 goto fail; 5291 } 5292 5293 memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); 5294 if (!memcg->vmstats) 5295 goto fail; 5296 5297 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5298 GFP_KERNEL_ACCOUNT); 5299 if (!memcg->vmstats_percpu) 5300 goto fail; 5301 5302 for_each_node(node) 5303 if (alloc_mem_cgroup_per_node_info(memcg, node)) 5304 goto fail; 5305 5306 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 5307 goto fail; 5308 5309 INIT_WORK(&memcg->high_work, high_work_func); 5310 INIT_LIST_HEAD(&memcg->oom_notify); 5311 mutex_init(&memcg->thresholds_lock); 5312 spin_lock_init(&memcg->move_lock); 5313 vmpressure_init(&memcg->vmpressure); 5314 INIT_LIST_HEAD(&memcg->event_list); 5315 spin_lock_init(&memcg->event_list_lock); 5316 memcg->socket_pressure = jiffies; 5317 #ifdef CONFIG_MEMCG_KMEM 5318 memcg->kmemcg_id = -1; 5319 INIT_LIST_HEAD(&memcg->objcg_list); 5320 #endif 5321 #ifdef CONFIG_CGROUP_WRITEBACK 5322 INIT_LIST_HEAD(&memcg->cgwb_list); 5323 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5324 memcg->cgwb_frn[i].done = 5325 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5326 #endif 5327 #ifdef 
CONFIG_TRANSPARENT_HUGEPAGE 5328 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 5329 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 5330 memcg->deferred_split_queue.split_queue_len = 0; 5331 #endif 5332 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5333 lru_gen_init_memcg(memcg); 5334 return memcg; 5335 fail: 5336 mem_cgroup_id_remove(memcg); 5337 __mem_cgroup_free(memcg); 5338 return ERR_PTR(error); 5339 } 5340 5341 static struct cgroup_subsys_state * __ref 5342 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5343 { 5344 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5345 struct mem_cgroup *memcg, *old_memcg; 5346 5347 old_memcg = set_active_memcg(parent); 5348 memcg = mem_cgroup_alloc(); 5349 set_active_memcg(old_memcg); 5350 if (IS_ERR(memcg)) 5351 return ERR_CAST(memcg); 5352 5353 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5354 WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); 5355 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 5356 memcg->zswap_max = PAGE_COUNTER_MAX; 5357 #endif 5358 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5359 if (parent) { 5360 WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); 5361 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable)); 5362 5363 page_counter_init(&memcg->memory, &parent->memory); 5364 page_counter_init(&memcg->swap, &parent->swap); 5365 page_counter_init(&memcg->kmem, &parent->kmem); 5366 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 5367 } else { 5368 init_memcg_events(); 5369 page_counter_init(&memcg->memory, NULL); 5370 page_counter_init(&memcg->swap, NULL); 5371 page_counter_init(&memcg->kmem, NULL); 5372 page_counter_init(&memcg->tcpmem, NULL); 5373 5374 root_mem_cgroup = memcg; 5375 return &memcg->css; 5376 } 5377 5378 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5379 static_branch_inc(&memcg_sockets_enabled_key); 5380 5381 #if defined(CONFIG_MEMCG_KMEM) 5382 if (!cgroup_memory_nobpf) 5383 static_branch_inc(&memcg_bpf_enabled_key); 5384 #endif 5385 5386 return &memcg->css; 5387 } 5388 5389 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 5390 { 5391 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5392 5393 if (memcg_online_kmem(memcg)) 5394 goto remove_id; 5395 5396 /* 5397 * A memcg must be visible for expand_shrinker_info() 5398 * by the time the maps are allocated. So, we allocate maps 5399 * here, when for_each_mem_cgroup() can't skip it. 5400 */ 5401 if (alloc_shrinker_info(memcg)) 5402 goto offline_kmem; 5403 5404 /* Online state pins memcg ID, memcg ID pins CSS */ 5405 refcount_set(&memcg->id.ref, 1); 5406 css_get(css); 5407 5408 if (unlikely(mem_cgroup_is_root(memcg))) 5409 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 5410 FLUSH_TIME); 5411 lru_gen_online_memcg(memcg); 5412 return 0; 5413 offline_kmem: 5414 memcg_offline_kmem(memcg); 5415 remove_id: 5416 mem_cgroup_id_remove(memcg); 5417 return -ENOMEM; 5418 } 5419 5420 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5421 { 5422 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5423 struct mem_cgroup_event *event, *tmp; 5424 5425 /* 5426 * Unregister events and notify userspace. 5427 * Notify userspace about cgroup removing only after rmdir of cgroup 5428 * directory to avoid race between userspace and kernelspace. 
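 *
 * (Editorial note: the actual unregister_event() call and the final
 * eventfd_signal() happen later in memcg_event_remove(), which is
 * scheduled per event just below.)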
5429 */ 5430 spin_lock_irq(&memcg->event_list_lock); 5431 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5432 list_del_init(&event->list); 5433 schedule_work(&event->remove); 5434 } 5435 spin_unlock_irq(&memcg->event_list_lock); 5436 5437 page_counter_set_min(&memcg->memory, 0); 5438 page_counter_set_low(&memcg->memory, 0); 5439 5440 memcg_offline_kmem(memcg); 5441 reparent_shrinker_deferred(memcg); 5442 wb_memcg_offline(memcg); 5443 lru_gen_offline_memcg(memcg); 5444 5445 drain_all_stock(memcg); 5446 5447 mem_cgroup_id_put(memcg); 5448 } 5449 5450 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5451 { 5452 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5453 5454 invalidate_reclaim_iterators(memcg); 5455 lru_gen_release_memcg(memcg); 5456 } 5457 5458 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5459 { 5460 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5461 int __maybe_unused i; 5462 5463 #ifdef CONFIG_CGROUP_WRITEBACK 5464 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5465 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5466 #endif 5467 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5468 static_branch_dec(&memcg_sockets_enabled_key); 5469 5470 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5471 static_branch_dec(&memcg_sockets_enabled_key); 5472 5473 #if defined(CONFIG_MEMCG_KMEM) 5474 if (!cgroup_memory_nobpf) 5475 static_branch_dec(&memcg_bpf_enabled_key); 5476 #endif 5477 5478 vmpressure_cleanup(&memcg->vmpressure); 5479 cancel_work_sync(&memcg->high_work); 5480 mem_cgroup_remove_from_trees(memcg); 5481 free_shrinker_info(memcg); 5482 mem_cgroup_free(memcg); 5483 } 5484 5485 /** 5486 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5487 * @css: the target css 5488 * 5489 * Reset the states of the mem_cgroup associated with @css. This is 5490 * invoked when the userland requests disabling on the default hierarchy 5491 * but the memcg is pinned through dependency. The memcg should stop 5492 * applying policies and should revert to the vanilla state as it may be 5493 * made visible again. 5494 * 5495 * The current implementation only resets the essential configurations. 5496 * This needs to be expanded to cover all the visible parts. 5497 */ 5498 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5499 { 5500 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5501 5502 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5503 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5504 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5505 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5506 page_counter_set_min(&memcg->memory, 0); 5507 page_counter_set_low(&memcg->memory, 0); 5508 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5509 WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); 5510 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5511 memcg_wb_domain_size_changed(memcg); 5512 } 5513 5514 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) 5515 { 5516 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5517 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5518 struct memcg_vmstats_percpu *statc; 5519 long delta, v; 5520 int i, nid; 5521 5522 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); 5523 5524 for (i = 0; i < MEMCG_NR_STAT; i++) { 5525 /* 5526 * Collect the aggregated propagation counts of groups 5527 * below us. 
We're in a per-cpu loop here and this is 5528 * a global counter, so the first cycle will get them. 5529 */ 5530 delta = memcg->vmstats->state_pending[i]; 5531 if (delta) 5532 memcg->vmstats->state_pending[i] = 0; 5533 5534 /* Add CPU changes on this level since the last flush */ 5535 v = READ_ONCE(statc->state[i]); 5536 if (v != statc->state_prev[i]) { 5537 delta += v - statc->state_prev[i]; 5538 statc->state_prev[i] = v; 5539 } 5540 5541 if (!delta) 5542 continue; 5543 5544 /* Aggregate counts on this level and propagate upwards */ 5545 memcg->vmstats->state[i] += delta; 5546 if (parent) 5547 parent->vmstats->state_pending[i] += delta; 5548 } 5549 5550 for (i = 0; i < NR_MEMCG_EVENTS; i++) { 5551 delta = memcg->vmstats->events_pending[i]; 5552 if (delta) 5553 memcg->vmstats->events_pending[i] = 0; 5554 5555 v = READ_ONCE(statc->events[i]); 5556 if (v != statc->events_prev[i]) { 5557 delta += v - statc->events_prev[i]; 5558 statc->events_prev[i] = v; 5559 } 5560 5561 if (!delta) 5562 continue; 5563 5564 memcg->vmstats->events[i] += delta; 5565 if (parent) 5566 parent->vmstats->events_pending[i] += delta; 5567 } 5568 5569 for_each_node_state(nid, N_MEMORY) { 5570 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; 5571 struct mem_cgroup_per_node *ppn = NULL; 5572 struct lruvec_stats_percpu *lstatc; 5573 5574 if (parent) 5575 ppn = parent->nodeinfo[nid]; 5576 5577 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); 5578 5579 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { 5580 delta = pn->lruvec_stats.state_pending[i]; 5581 if (delta) 5582 pn->lruvec_stats.state_pending[i] = 0; 5583 5584 v = READ_ONCE(lstatc->state[i]); 5585 if (v != lstatc->state_prev[i]) { 5586 delta += v - lstatc->state_prev[i]; 5587 lstatc->state_prev[i] = v; 5588 } 5589 5590 if (!delta) 5591 continue; 5592 5593 pn->lruvec_stats.state[i] += delta; 5594 if (ppn) 5595 ppn->lruvec_stats.state_pending[i] += delta; 5596 } 5597 } 5598 } 5599 5600 #ifdef CONFIG_MMU 5601 /* Handlers for move charge at task migration. 
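 *
 * Rough flow, as an editorial sketch pieced together from the code below
 * (cgroup1 only): mem_cgroup_can_attach() fills in "mc" and calls
 * mem_cgroup_precharge_mc(), which walks the mm with precharge_walk_ops
 * and pre-charges mc.to for every movable pte; the charges are then moved
 * at attach time, and mem_cgroup_clear_mc() returns any leftover
 * precharge and wakes up waiters on mc.waitq.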
*/ 5602 static int mem_cgroup_do_precharge(unsigned long count) 5603 { 5604 int ret; 5605 5606 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5607 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5608 if (!ret) { 5609 mc.precharge += count; 5610 return ret; 5611 } 5612 5613 /* Try charges one by one with reclaim, but do not retry */ 5614 while (count--) { 5615 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5616 if (ret) 5617 return ret; 5618 mc.precharge++; 5619 cond_resched(); 5620 } 5621 return 0; 5622 } 5623 5624 union mc_target { 5625 struct page *page; 5626 swp_entry_t ent; 5627 }; 5628 5629 enum mc_target_type { 5630 MC_TARGET_NONE = 0, 5631 MC_TARGET_PAGE, 5632 MC_TARGET_SWAP, 5633 MC_TARGET_DEVICE, 5634 }; 5635 5636 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5637 unsigned long addr, pte_t ptent) 5638 { 5639 struct page *page = vm_normal_page(vma, addr, ptent); 5640 5641 if (!page) 5642 return NULL; 5643 if (PageAnon(page)) { 5644 if (!(mc.flags & MOVE_ANON)) 5645 return NULL; 5646 } else { 5647 if (!(mc.flags & MOVE_FILE)) 5648 return NULL; 5649 } 5650 get_page(page); 5651 5652 return page; 5653 } 5654 5655 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5656 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5657 pte_t ptent, swp_entry_t *entry) 5658 { 5659 struct page *page = NULL; 5660 swp_entry_t ent = pte_to_swp_entry(ptent); 5661 5662 if (!(mc.flags & MOVE_ANON)) 5663 return NULL; 5664 5665 /* 5666 * Handle device private pages that are not accessible by the CPU, but 5667 * stored as special swap entries in the page table. 5668 */ 5669 if (is_device_private_entry(ent)) { 5670 page = pfn_swap_entry_to_page(ent); 5671 if (!get_page_unless_zero(page)) 5672 return NULL; 5673 return page; 5674 } 5675 5676 if (non_swap_entry(ent)) 5677 return NULL; 5678 5679 /* 5680 * Because swap_cache_get_folio() updates some statistics counter, 5681 * we call find_get_page() with swapper_space directly. 5682 */ 5683 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5684 entry->val = ent.val; 5685 5686 return page; 5687 } 5688 #else 5689 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5690 pte_t ptent, swp_entry_t *entry) 5691 { 5692 return NULL; 5693 } 5694 #endif 5695 5696 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5697 unsigned long addr, pte_t ptent) 5698 { 5699 unsigned long index; 5700 struct folio *folio; 5701 5702 if (!vma->vm_file) /* anonymous vma */ 5703 return NULL; 5704 if (!(mc.flags & MOVE_FILE)) 5705 return NULL; 5706 5707 /* folio is moved even if it's not RSS of this task(page-faulted). */ 5708 /* shmem/tmpfs may report page out on swap: account for that too. */ 5709 index = linear_page_index(vma, addr); 5710 folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); 5711 if (IS_ERR(folio)) 5712 return NULL; 5713 return folio_file_page(folio, index); 5714 } 5715 5716 /** 5717 * mem_cgroup_move_account - move account of the page 5718 * @page: the page 5719 * @compound: charge the page as compound or small page 5720 * @from: mem_cgroup which the page is moved from. 5721 * @to: mem_cgroup which the page is moved to. @from != @to. 5722 * 5723 * The page must be locked and not on the LRU. 5724 * 5725 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5726 * from old cgroup. 
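 *
 * Return: 0 on success, -EINVAL if the page is no longer charged to @from.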
5727 */ 5728 static int mem_cgroup_move_account(struct page *page, 5729 bool compound, 5730 struct mem_cgroup *from, 5731 struct mem_cgroup *to) 5732 { 5733 struct folio *folio = page_folio(page); 5734 struct lruvec *from_vec, *to_vec; 5735 struct pglist_data *pgdat; 5736 unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; 5737 int nid, ret; 5738 5739 VM_BUG_ON(from == to); 5740 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 5741 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 5742 VM_BUG_ON(compound && !folio_test_large(folio)); 5743 5744 ret = -EINVAL; 5745 if (folio_memcg(folio) != from) 5746 goto out; 5747 5748 pgdat = folio_pgdat(folio); 5749 from_vec = mem_cgroup_lruvec(from, pgdat); 5750 to_vec = mem_cgroup_lruvec(to, pgdat); 5751 5752 folio_memcg_lock(folio); 5753 5754 if (folio_test_anon(folio)) { 5755 if (folio_mapped(folio)) { 5756 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 5757 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 5758 if (folio_test_transhuge(folio)) { 5759 __mod_lruvec_state(from_vec, NR_ANON_THPS, 5760 -nr_pages); 5761 __mod_lruvec_state(to_vec, NR_ANON_THPS, 5762 nr_pages); 5763 } 5764 } 5765 } else { 5766 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 5767 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 5768 5769 if (folio_test_swapbacked(folio)) { 5770 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 5771 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 5772 } 5773 5774 if (folio_mapped(folio)) { 5775 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 5776 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 5777 } 5778 5779 if (folio_test_dirty(folio)) { 5780 struct address_space *mapping = folio_mapping(folio); 5781 5782 if (mapping_can_writeback(mapping)) { 5783 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 5784 -nr_pages); 5785 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 5786 nr_pages); 5787 } 5788 } 5789 } 5790 5791 #ifdef CONFIG_SWAP 5792 if (folio_test_swapcache(folio)) { 5793 __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); 5794 __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); 5795 } 5796 #endif 5797 if (folio_test_writeback(folio)) { 5798 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 5799 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 5800 } 5801 5802 /* 5803 * All state has been migrated, let's switch to the new memcg. 5804 * 5805 * It is safe to change page's memcg here because the page 5806 * is referenced, charged, isolated, and locked: we can't race 5807 * with (un)charging, migration, LRU putback, or anything else 5808 * that would rely on a stable page's memory cgroup. 5809 * 5810 * Note that folio_memcg_lock is a memcg lock, not a page lock, 5811 * to save space. As soon as we switch page's memory cgroup to a 5812 * new memcg that isn't locked, the above state can change 5813 * concurrently again. Make sure we're truly done with it. 
5814 */ 5815 smp_mb(); 5816 5817 css_get(&to->css); 5818 css_put(&from->css); 5819 5820 folio->memcg_data = (unsigned long)to; 5821 5822 __folio_memcg_unlock(from); 5823 5824 ret = 0; 5825 nid = folio_nid(folio); 5826 5827 local_irq_disable(); 5828 mem_cgroup_charge_statistics(to, nr_pages); 5829 memcg_check_events(to, nid); 5830 mem_cgroup_charge_statistics(from, -nr_pages); 5831 memcg_check_events(from, nid); 5832 local_irq_enable(); 5833 out: 5834 return ret; 5835 } 5836 5837 /** 5838 * get_mctgt_type - get target type of moving charge 5839 * @vma: the vma to which the pte to be checked belongs 5840 * @addr: the address corresponding to the pte to be checked 5841 * @ptent: the pte to be checked 5842 * @target: the pointer where the target page or swap entry will be stored (can be NULL) 5843 * 5844 * Returns 5845 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5846 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5847 * move charge. if @target is not NULL, the page is stored in target->page 5848 * with an extra refcount taken (callers should handle it). 5849 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5850 * target for charge migration. if @target is not NULL, the entry is stored 5851 * in target->ent. 5852 * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and 5853 * thus not on the lru. 5854 * For now such a page is charged like a regular page would be, as for all 5855 * intents and purposes it is just special memory taking the place of a 5856 * regular page. 5857 * 5858 * See Documentation/mm/hmm.rst and include/linux/hmm.h 5859 * 5860 * Called with pte lock held. 5861 */ 5862 5863 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5864 unsigned long addr, pte_t ptent, union mc_target *target) 5865 { 5866 struct page *page = NULL; 5867 enum mc_target_type ret = MC_TARGET_NONE; 5868 swp_entry_t ent = { .val = 0 }; 5869 5870 if (pte_present(ptent)) 5871 page = mc_handle_present_pte(vma, addr, ptent); 5872 else if (pte_none_mostly(ptent)) 5873 /* 5874 * PTE markers should be treated as a none pte here, separated 5875 * from other swap handling below. 5876 */ 5877 page = mc_handle_file_pte(vma, addr, ptent); 5878 else if (is_swap_pte(ptent)) 5879 page = mc_handle_swap_pte(vma, ptent, &ent); 5880 5881 if (target && page) { 5882 if (!trylock_page(page)) { 5883 put_page(page); 5884 return ret; 5885 } 5886 /* 5887 * page_mapped() must be stable during the move. This 5888 * pte is locked, so if it's present, the page cannot 5889 * become unmapped. If it isn't, we have only partial 5890 * control over the mapped state: the page lock will 5891 * prevent new faults against pagecache and swapcache, 5892 * so an unmapped page cannot become mapped. However, 5893 * if the page is already mapped elsewhere, it can 5894 * unmap, and there is nothing we can do about it. 5895 * Alas, skip moving the page in this case. 5896 */ 5897 if (!pte_present(ptent) && page_mapped(page)) { 5898 unlock_page(page); 5899 put_page(page); 5900 return ret; 5901 } 5902 } 5903 5904 if (!page && !ent.val) 5905 return ret; 5906 if (page) { 5907 /* 5908 * Only do a loose check, without serialization. 5909 * mem_cgroup_move_account() checks the page is valid or 5910 * not under LRU exclusion.
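 * (Editor's note: the serialized re-check is the folio_memcg(folio) !=
 * from test done early in mem_cgroup_move_account(), with the page
 * locked and off the LRU.)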
		 */
		if (page_memcg(page) == mc.from) {
			ret = MC_TARGET_PAGE;
			if (is_device_private_page(page) ||
			    is_device_coherent_page(page))
				ret = MC_TARGET_DEVICE;
			if (target)
				target->page = page;
		}
		if (!ret || !target) {
			if (target)
				unlock_page(page);
			put_page(page);
		}
	}
	/*
	 * There is a swap entry and a page doesn't exist or isn't charged.
	 * But we cannot move a tail-page in a THP.
	 */
	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because THP does
 * not support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 */
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;

	if (unlikely(is_swap_pmd(pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
			  !is_pmd_migration_entry(pmd));
		return ret;
	}
	page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
	if (!(mc.flags & MOVE_ANON))
		return ret;
	if (page_memcg(page) == mc.from) {
		ret = MC_TARGET_PAGE;
		if (target) {
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				return MC_TARGET_NONE;
			}
			target->page = page;
		}
	}
	return ret;
}
#else
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	return MC_TARGET_NONE;
}
#endif

static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/*
		 * Note there cannot be MC_TARGET_DEVICE for now as we do not
		 * support transparent huge pages with MEMORY_DEVICE_PRIVATE,
		 * but this might change.
5995 */ 5996 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5997 mc.precharge += HPAGE_PMD_NR; 5998 spin_unlock(ptl); 5999 return 0; 6000 } 6001 6002 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6003 if (!pte) 6004 return 0; 6005 for (; addr != end; pte++, addr += PAGE_SIZE) 6006 if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) 6007 mc.precharge++; /* increment precharge temporarily */ 6008 pte_unmap_unlock(pte - 1, ptl); 6009 cond_resched(); 6010 6011 return 0; 6012 } 6013 6014 static const struct mm_walk_ops precharge_walk_ops = { 6015 .pmd_entry = mem_cgroup_count_precharge_pte_range, 6016 }; 6017 6018 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 6019 { 6020 unsigned long precharge; 6021 6022 mmap_read_lock(mm); 6023 walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); 6024 mmap_read_unlock(mm); 6025 6026 precharge = mc.precharge; 6027 mc.precharge = 0; 6028 6029 return precharge; 6030 } 6031 6032 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 6033 { 6034 unsigned long precharge = mem_cgroup_count_precharge(mm); 6035 6036 VM_BUG_ON(mc.moving_task); 6037 mc.moving_task = current; 6038 return mem_cgroup_do_precharge(precharge); 6039 } 6040 6041 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 6042 static void __mem_cgroup_clear_mc(void) 6043 { 6044 struct mem_cgroup *from = mc.from; 6045 struct mem_cgroup *to = mc.to; 6046 6047 /* we must uncharge all the leftover precharges from mc.to */ 6048 if (mc.precharge) { 6049 cancel_charge(mc.to, mc.precharge); 6050 mc.precharge = 0; 6051 } 6052 /* 6053 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 6054 * we must uncharge here. 6055 */ 6056 if (mc.moved_charge) { 6057 cancel_charge(mc.from, mc.moved_charge); 6058 mc.moved_charge = 0; 6059 } 6060 /* we must fixup refcnts and charges */ 6061 if (mc.moved_swap) { 6062 /* uncharge swap account from the old cgroup */ 6063 if (!mem_cgroup_is_root(mc.from)) 6064 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 6065 6066 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 6067 6068 /* 6069 * we charged both to->memory and to->memsw, so we 6070 * should uncharge to->memory. 6071 */ 6072 if (!mem_cgroup_is_root(mc.to)) 6073 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 6074 6075 mc.moved_swap = 0; 6076 } 6077 memcg_oom_recover(from); 6078 memcg_oom_recover(to); 6079 wake_up_all(&mc.waitq); 6080 } 6081 6082 static void mem_cgroup_clear_mc(void) 6083 { 6084 struct mm_struct *mm = mc.mm; 6085 6086 /* 6087 * we must clear moving_task before waking up waiters at the end of 6088 * task migration. 6089 */ 6090 mc.moving_task = NULL; 6091 __mem_cgroup_clear_mc(); 6092 spin_lock(&mc.lock); 6093 mc.from = NULL; 6094 mc.to = NULL; 6095 mc.mm = NULL; 6096 spin_unlock(&mc.lock); 6097 6098 mmput(mm); 6099 } 6100 6101 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6102 { 6103 struct cgroup_subsys_state *css; 6104 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 6105 struct mem_cgroup *from; 6106 struct task_struct *leader, *p; 6107 struct mm_struct *mm; 6108 unsigned long move_flags; 6109 int ret = 0; 6110 6111 /* charge immigration isn't supported on the default hierarchy */ 6112 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6113 return 0; 6114 6115 /* 6116 * Multi-process migrations only happen on the default hierarchy 6117 * where charge immigration is not used. 
Perform charge 6118 * immigration if @tset contains a leader and whine if there are 6119 * multiple. 6120 */ 6121 p = NULL; 6122 cgroup_taskset_for_each_leader(leader, css, tset) { 6123 WARN_ON_ONCE(p); 6124 p = leader; 6125 memcg = mem_cgroup_from_css(css); 6126 } 6127 if (!p) 6128 return 0; 6129 6130 /* 6131 * We are now committed to this value whatever it is. Changes in this 6132 * tunable will only affect upcoming migrations, not the current one. 6133 * So we need to save it, and keep it going. 6134 */ 6135 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 6136 if (!move_flags) 6137 return 0; 6138 6139 from = mem_cgroup_from_task(p); 6140 6141 VM_BUG_ON(from == memcg); 6142 6143 mm = get_task_mm(p); 6144 if (!mm) 6145 return 0; 6146 /* We move charges only when we move a owner of the mm */ 6147 if (mm->owner == p) { 6148 VM_BUG_ON(mc.from); 6149 VM_BUG_ON(mc.to); 6150 VM_BUG_ON(mc.precharge); 6151 VM_BUG_ON(mc.moved_charge); 6152 VM_BUG_ON(mc.moved_swap); 6153 6154 spin_lock(&mc.lock); 6155 mc.mm = mm; 6156 mc.from = from; 6157 mc.to = memcg; 6158 mc.flags = move_flags; 6159 spin_unlock(&mc.lock); 6160 /* We set mc.moving_task later */ 6161 6162 ret = mem_cgroup_precharge_mc(mm); 6163 if (ret) 6164 mem_cgroup_clear_mc(); 6165 } else { 6166 mmput(mm); 6167 } 6168 return ret; 6169 } 6170 6171 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6172 { 6173 if (mc.to) 6174 mem_cgroup_clear_mc(); 6175 } 6176 6177 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6178 unsigned long addr, unsigned long end, 6179 struct mm_walk *walk) 6180 { 6181 int ret = 0; 6182 struct vm_area_struct *vma = walk->vma; 6183 pte_t *pte; 6184 spinlock_t *ptl; 6185 enum mc_target_type target_type; 6186 union mc_target target; 6187 struct page *page; 6188 6189 ptl = pmd_trans_huge_lock(pmd, vma); 6190 if (ptl) { 6191 if (mc.precharge < HPAGE_PMD_NR) { 6192 spin_unlock(ptl); 6193 return 0; 6194 } 6195 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6196 if (target_type == MC_TARGET_PAGE) { 6197 page = target.page; 6198 if (isolate_lru_page(page)) { 6199 if (!mem_cgroup_move_account(page, true, 6200 mc.from, mc.to)) { 6201 mc.precharge -= HPAGE_PMD_NR; 6202 mc.moved_charge += HPAGE_PMD_NR; 6203 } 6204 putback_lru_page(page); 6205 } 6206 unlock_page(page); 6207 put_page(page); 6208 } else if (target_type == MC_TARGET_DEVICE) { 6209 page = target.page; 6210 if (!mem_cgroup_move_account(page, true, 6211 mc.from, mc.to)) { 6212 mc.precharge -= HPAGE_PMD_NR; 6213 mc.moved_charge += HPAGE_PMD_NR; 6214 } 6215 unlock_page(page); 6216 put_page(page); 6217 } 6218 spin_unlock(ptl); 6219 return 0; 6220 } 6221 6222 retry: 6223 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6224 if (!pte) 6225 return 0; 6226 for (; addr != end; addr += PAGE_SIZE) { 6227 pte_t ptent = ptep_get(pte++); 6228 bool device = false; 6229 swp_entry_t ent; 6230 6231 if (!mc.precharge) 6232 break; 6233 6234 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6235 case MC_TARGET_DEVICE: 6236 device = true; 6237 fallthrough; 6238 case MC_TARGET_PAGE: 6239 page = target.page; 6240 /* 6241 * We can have a part of the split pmd here. Moving it 6242 * can be done but it would be too convoluted so simply 6243 * ignore such a partial THP and keep it in original 6244 * memcg. There should be somebody mapping the head. 
			 */
			if (PageTransCompound(page))
				goto put;
			if (!device && !isolate_lru_page(page))
				goto put;
			if (!mem_cgroup_move_account(page, false,
						mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			if (!device)
				putback_lru_page(page);
put:			/* get_mctgt_type() gets & locks the page */
			unlock_page(page);
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				mem_cgroup_id_get_many(mc.to, 1);
				/* we fixup other refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try to charge one by one, but don't do any additional
		 * charges to mc.to if we have already failed to charge once
		 * in the attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static const struct mm_walk_ops charge_walk_ops = {
	.pmd_entry = mem_cgroup_move_charge_pte_range,
};

static void mem_cgroup_move_charge(void)
{
	lru_add_drain_all();
	/*
	 * Signal folio_memcg_lock() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();
retry:
	if (unlikely(!mmap_read_trylock(mc.mm))) {
		/*
		 * Someone holding the mmap_lock may be waiting on our
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be
		 * able to move enough charges, but moving charge is a
		 * best-effort feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	/*
	 * When we have consumed all precharges and fail to do an additional
	 * charge, the page walk below just aborts.
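	 * Any precharge left over at that point is not lost: it is cancelled
	 * back to mc.to in mem_cgroup_clear_mc() once the move has finished.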
6323 */ 6324 walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); 6325 mmap_read_unlock(mc.mm); 6326 atomic_dec(&mc.from->moving_account); 6327 } 6328 6329 static void mem_cgroup_move_task(void) 6330 { 6331 if (mc.to) { 6332 mem_cgroup_move_charge(); 6333 mem_cgroup_clear_mc(); 6334 } 6335 } 6336 #else /* !CONFIG_MMU */ 6337 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6338 { 6339 return 0; 6340 } 6341 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6342 { 6343 } 6344 static void mem_cgroup_move_task(void) 6345 { 6346 } 6347 #endif 6348 6349 #ifdef CONFIG_LRU_GEN 6350 static void mem_cgroup_attach(struct cgroup_taskset *tset) 6351 { 6352 struct task_struct *task; 6353 struct cgroup_subsys_state *css; 6354 6355 /* find the first leader if there is any */ 6356 cgroup_taskset_for_each_leader(task, css, tset) 6357 break; 6358 6359 if (!task) 6360 return; 6361 6362 task_lock(task); 6363 if (task->mm && READ_ONCE(task->mm->owner) == task) 6364 lru_gen_migrate_mm(task->mm); 6365 task_unlock(task); 6366 } 6367 #else 6368 static void mem_cgroup_attach(struct cgroup_taskset *tset) 6369 { 6370 } 6371 #endif /* CONFIG_LRU_GEN */ 6372 6373 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 6374 { 6375 if (value == PAGE_COUNTER_MAX) 6376 seq_puts(m, "max\n"); 6377 else 6378 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 6379 6380 return 0; 6381 } 6382 6383 static u64 memory_current_read(struct cgroup_subsys_state *css, 6384 struct cftype *cft) 6385 { 6386 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6387 6388 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 6389 } 6390 6391 static u64 memory_peak_read(struct cgroup_subsys_state *css, 6392 struct cftype *cft) 6393 { 6394 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6395 6396 return (u64)memcg->memory.watermark * PAGE_SIZE; 6397 } 6398 6399 static int memory_min_show(struct seq_file *m, void *v) 6400 { 6401 return seq_puts_memcg_tunable(m, 6402 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 6403 } 6404 6405 static ssize_t memory_min_write(struct kernfs_open_file *of, 6406 char *buf, size_t nbytes, loff_t off) 6407 { 6408 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6409 unsigned long min; 6410 int err; 6411 6412 buf = strstrip(buf); 6413 err = page_counter_memparse(buf, "max", &min); 6414 if (err) 6415 return err; 6416 6417 page_counter_set_min(&memcg->memory, min); 6418 6419 return nbytes; 6420 } 6421 6422 static int memory_low_show(struct seq_file *m, void *v) 6423 { 6424 return seq_puts_memcg_tunable(m, 6425 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 6426 } 6427 6428 static ssize_t memory_low_write(struct kernfs_open_file *of, 6429 char *buf, size_t nbytes, loff_t off) 6430 { 6431 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6432 unsigned long low; 6433 int err; 6434 6435 buf = strstrip(buf); 6436 err = page_counter_memparse(buf, "max", &low); 6437 if (err) 6438 return err; 6439 6440 page_counter_set_low(&memcg->memory, low); 6441 6442 return nbytes; 6443 } 6444 6445 static int memory_high_show(struct seq_file *m, void *v) 6446 { 6447 return seq_puts_memcg_tunable(m, 6448 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 6449 } 6450 6451 static ssize_t memory_high_write(struct kernfs_open_file *of, 6452 char *buf, size_t nbytes, loff_t off) 6453 { 6454 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6455 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6456 bool drained = false; 6457 unsigned long high; 6458 int 
err; 6459 6460 buf = strstrip(buf); 6461 err = page_counter_memparse(buf, "max", &high); 6462 if (err) 6463 return err; 6464 6465 page_counter_set_high(&memcg->memory, high); 6466 6467 for (;;) { 6468 unsigned long nr_pages = page_counter_read(&memcg->memory); 6469 unsigned long reclaimed; 6470 6471 if (nr_pages <= high) 6472 break; 6473 6474 if (signal_pending(current)) 6475 break; 6476 6477 if (!drained) { 6478 drain_all_stock(memcg); 6479 drained = true; 6480 continue; 6481 } 6482 6483 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 6484 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); 6485 6486 if (!reclaimed && !nr_retries--) 6487 break; 6488 } 6489 6490 memcg_wb_domain_size_changed(memcg); 6491 return nbytes; 6492 } 6493 6494 static int memory_max_show(struct seq_file *m, void *v) 6495 { 6496 return seq_puts_memcg_tunable(m, 6497 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 6498 } 6499 6500 static ssize_t memory_max_write(struct kernfs_open_file *of, 6501 char *buf, size_t nbytes, loff_t off) 6502 { 6503 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6504 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 6505 bool drained = false; 6506 unsigned long max; 6507 int err; 6508 6509 buf = strstrip(buf); 6510 err = page_counter_memparse(buf, "max", &max); 6511 if (err) 6512 return err; 6513 6514 xchg(&memcg->memory.max, max); 6515 6516 for (;;) { 6517 unsigned long nr_pages = page_counter_read(&memcg->memory); 6518 6519 if (nr_pages <= max) 6520 break; 6521 6522 if (signal_pending(current)) 6523 break; 6524 6525 if (!drained) { 6526 drain_all_stock(memcg); 6527 drained = true; 6528 continue; 6529 } 6530 6531 if (nr_reclaims) { 6532 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 6533 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) 6534 nr_reclaims--; 6535 continue; 6536 } 6537 6538 memcg_memory_event(memcg, MEMCG_OOM); 6539 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 6540 break; 6541 } 6542 6543 memcg_wb_domain_size_changed(memcg); 6544 return nbytes; 6545 } 6546 6547 static void __memory_events_show(struct seq_file *m, atomic_long_t *events) 6548 { 6549 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 6550 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 6551 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 6552 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 6553 seq_printf(m, "oom_kill %lu\n", 6554 atomic_long_read(&events[MEMCG_OOM_KILL])); 6555 seq_printf(m, "oom_group_kill %lu\n", 6556 atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); 6557 } 6558 6559 static int memory_events_show(struct seq_file *m, void *v) 6560 { 6561 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6562 6563 __memory_events_show(m, memcg->memory_events); 6564 return 0; 6565 } 6566 6567 static int memory_events_local_show(struct seq_file *m, void *v) 6568 { 6569 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6570 6571 __memory_events_show(m, memcg->memory_events_local); 6572 return 0; 6573 } 6574 6575 static int memory_stat_show(struct seq_file *m, void *v) 6576 { 6577 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6578 char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 6579 struct seq_buf s; 6580 6581 if (!buf) 6582 return -ENOMEM; 6583 seq_buf_init(&s, buf, PAGE_SIZE); 6584 memory_stat_format(memcg, &s); 6585 seq_puts(m, buf); 6586 kfree(buf); 6587 return 0; 6588 } 6589 6590 #ifdef CONFIG_NUMA 6591 static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, 6592 int item) 6593 { 6594 return 
lruvec_page_state(lruvec, item) * memcg_page_state_unit(item); 6595 } 6596 6597 static int memory_numa_stat_show(struct seq_file *m, void *v) 6598 { 6599 int i; 6600 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6601 6602 mem_cgroup_flush_stats(); 6603 6604 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 6605 int nid; 6606 6607 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) 6608 continue; 6609 6610 seq_printf(m, "%s", memory_stats[i].name); 6611 for_each_node_state(nid, N_MEMORY) { 6612 u64 size; 6613 struct lruvec *lruvec; 6614 6615 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 6616 size = lruvec_page_state_output(lruvec, 6617 memory_stats[i].idx); 6618 seq_printf(m, " N%d=%llu", nid, size); 6619 } 6620 seq_putc(m, '\n'); 6621 } 6622 6623 return 0; 6624 } 6625 #endif 6626 6627 static int memory_oom_group_show(struct seq_file *m, void *v) 6628 { 6629 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6630 6631 seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group)); 6632 6633 return 0; 6634 } 6635 6636 static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 6637 char *buf, size_t nbytes, loff_t off) 6638 { 6639 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6640 int ret, oom_group; 6641 6642 buf = strstrip(buf); 6643 if (!buf) 6644 return -EINVAL; 6645 6646 ret = kstrtoint(buf, 0, &oom_group); 6647 if (ret) 6648 return ret; 6649 6650 if (oom_group != 0 && oom_group != 1) 6651 return -EINVAL; 6652 6653 WRITE_ONCE(memcg->oom_group, oom_group); 6654 6655 return nbytes; 6656 } 6657 6658 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, 6659 size_t nbytes, loff_t off) 6660 { 6661 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6662 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6663 unsigned long nr_to_reclaim, nr_reclaimed = 0; 6664 unsigned int reclaim_options; 6665 int err; 6666 6667 buf = strstrip(buf); 6668 err = page_counter_memparse(buf, "", &nr_to_reclaim); 6669 if (err) 6670 return err; 6671 6672 reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; 6673 while (nr_reclaimed < nr_to_reclaim) { 6674 unsigned long reclaimed; 6675 6676 if (signal_pending(current)) 6677 return -EINTR; 6678 6679 /* 6680 * This is the final attempt, drain percpu lru caches in the 6681 * hope of introducing more evictable pages for 6682 * try_to_free_mem_cgroup_pages(). 
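 *
 * (Usage illustration: this loop backs the cgroup's memory.reclaim file,
 * so e.g. "echo 256M > memory.reclaim" requests 256MiB of proactive
 * reclaim. If the target cannot be met within the retry budget the write
 * fails with -EAGAIN, and a pending signal aborts it with -EINTR, as the
 * code below shows.)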
6683 */ 6684 if (!nr_retries) 6685 lru_add_drain_all(); 6686 6687 reclaimed = try_to_free_mem_cgroup_pages(memcg, 6688 min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX), 6689 GFP_KERNEL, reclaim_options); 6690 6691 if (!reclaimed && !nr_retries--) 6692 return -EAGAIN; 6693 6694 nr_reclaimed += reclaimed; 6695 } 6696 6697 return nbytes; 6698 } 6699 6700 static struct cftype memory_files[] = { 6701 { 6702 .name = "current", 6703 .flags = CFTYPE_NOT_ON_ROOT, 6704 .read_u64 = memory_current_read, 6705 }, 6706 { 6707 .name = "peak", 6708 .flags = CFTYPE_NOT_ON_ROOT, 6709 .read_u64 = memory_peak_read, 6710 }, 6711 { 6712 .name = "min", 6713 .flags = CFTYPE_NOT_ON_ROOT, 6714 .seq_show = memory_min_show, 6715 .write = memory_min_write, 6716 }, 6717 { 6718 .name = "low", 6719 .flags = CFTYPE_NOT_ON_ROOT, 6720 .seq_show = memory_low_show, 6721 .write = memory_low_write, 6722 }, 6723 { 6724 .name = "high", 6725 .flags = CFTYPE_NOT_ON_ROOT, 6726 .seq_show = memory_high_show, 6727 .write = memory_high_write, 6728 }, 6729 { 6730 .name = "max", 6731 .flags = CFTYPE_NOT_ON_ROOT, 6732 .seq_show = memory_max_show, 6733 .write = memory_max_write, 6734 }, 6735 { 6736 .name = "events", 6737 .flags = CFTYPE_NOT_ON_ROOT, 6738 .file_offset = offsetof(struct mem_cgroup, events_file), 6739 .seq_show = memory_events_show, 6740 }, 6741 { 6742 .name = "events.local", 6743 .flags = CFTYPE_NOT_ON_ROOT, 6744 .file_offset = offsetof(struct mem_cgroup, events_local_file), 6745 .seq_show = memory_events_local_show, 6746 }, 6747 { 6748 .name = "stat", 6749 .seq_show = memory_stat_show, 6750 }, 6751 #ifdef CONFIG_NUMA 6752 { 6753 .name = "numa_stat", 6754 .seq_show = memory_numa_stat_show, 6755 }, 6756 #endif 6757 { 6758 .name = "oom.group", 6759 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 6760 .seq_show = memory_oom_group_show, 6761 .write = memory_oom_group_write, 6762 }, 6763 { 6764 .name = "reclaim", 6765 .flags = CFTYPE_NS_DELEGATABLE, 6766 .write = memory_reclaim, 6767 }, 6768 { } /* terminate */ 6769 }; 6770 6771 struct cgroup_subsys memory_cgrp_subsys = { 6772 .css_alloc = mem_cgroup_css_alloc, 6773 .css_online = mem_cgroup_css_online, 6774 .css_offline = mem_cgroup_css_offline, 6775 .css_released = mem_cgroup_css_released, 6776 .css_free = mem_cgroup_css_free, 6777 .css_reset = mem_cgroup_css_reset, 6778 .css_rstat_flush = mem_cgroup_css_rstat_flush, 6779 .can_attach = mem_cgroup_can_attach, 6780 .attach = mem_cgroup_attach, 6781 .cancel_attach = mem_cgroup_cancel_attach, 6782 .post_attach = mem_cgroup_move_task, 6783 .dfl_cftypes = memory_files, 6784 .legacy_cftypes = mem_cgroup_legacy_files, 6785 .early_init = 0, 6786 }; 6787 6788 /* 6789 * This function calculates an individual cgroup's effective 6790 * protection which is derived from its own memory.min/low, its 6791 * parent's and siblings' settings, as well as the actual memory 6792 * distribution in the tree. 6793 * 6794 * The following rules apply to the effective protection values: 6795 * 6796 * 1. At the first level of reclaim, effective protection is equal to 6797 * the declared protection in memory.min and memory.low. 6798 * 6799 * 2. To enable safe delegation of the protection configuration, at 6800 * subsequent levels the effective protection is capped to the 6801 * parent's effective protection. 6802 * 6803 * 3. To make complex and dynamic subtrees easier to configure, the 6804 * user is allowed to overcommit the declared protection at a given 6805 * level. 
If that is the case, the parent's effective protection is 6806 * distributed to the children in proportion to how much protection 6807 * they have declared and how much of it they are utilizing. 6808 * 6809 * This makes distribution proportional, but also work-conserving: 6810 * if one cgroup claims much more protection than it uses memory, 6811 * the unused remainder is available to its siblings. 6812 * 6813 * 4. Conversely, when the declared protection is undercommitted at a 6814 * given level, the distribution of the larger parental protection 6815 * budget is NOT proportional. A cgroup's protection from a sibling 6816 * is capped to its own memory.min/low setting. 6817 * 6818 * 5. However, to allow protecting recursive subtrees from each other 6819 * without having to declare each individual cgroup's fixed share 6820 * of the ancestor's claim to protection, any unutilized - 6821 * "floating" - protection from up the tree is distributed in 6822 * proportion to each cgroup's *usage*. This makes the protection 6823 * neutral wrt sibling cgroups and lets them compete freely over 6824 * the shared parental protection budget, but it protects the 6825 * subtree as a whole from neighboring subtrees. 6826 * 6827 * Note that 4. and 5. are not in conflict: 4. is about protecting 6828 * against immediate siblings whereas 5. is about protecting against 6829 * neighboring subtrees. 6830 */ 6831 static unsigned long effective_protection(unsigned long usage, 6832 unsigned long parent_usage, 6833 unsigned long setting, 6834 unsigned long parent_effective, 6835 unsigned long siblings_protected) 6836 { 6837 unsigned long protected; 6838 unsigned long ep; 6839 6840 protected = min(usage, setting); 6841 /* 6842 * If all cgroups at this level combined claim and use more 6843 * protection than what the parent affords them, distribute 6844 * shares in proportion to utilization. 6845 * 6846 * We are using actual utilization rather than the statically 6847 * claimed protection in order to be work-conserving: claimed 6848 * but unused protection is available to siblings that would 6849 * otherwise get a smaller chunk than what they claimed. 6850 */ 6851 if (siblings_protected > parent_effective) 6852 return protected * parent_effective / siblings_protected; 6853 6854 /* 6855 * Ok, utilized protection of all children is within what the 6856 * parent affords them, so we know whatever this child claims 6857 * and utilizes is effectively protected. 6858 * 6859 * If there is unprotected usage beyond this value, reclaim 6860 * will apply pressure in proportion to that amount. 6861 * 6862 * If there is unutilized protection, the cgroup will be fully 6863 * shielded from reclaim, but we do return a smaller value for 6864 * protection than what the group could enjoy in theory. This 6865 * is okay. With the overcommit distribution above, effective 6866 * protection is always dependent on how memory is actually 6867 * consumed among the siblings anyway. 6868 */ 6869 ep = protected; 6870 6871 /* 6872 * If the children aren't claiming (all of) the protection 6873 * afforded to them by the parent, distribute the remainder in 6874 * proportion to the (unprotected) memory of each cgroup. That 6875 * way, cgroups that aren't explicitly prioritized wrt each 6876 * other compete freely over the allowance, but they are 6877 * collectively protected from neighboring trees. 
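 *
 * A quick numeric sketch with illustrative values: if the parent has 10G
 * of effective protection, its children explicitly claim and use only 4G
 * of it, the parent's total usage is 8G, and this child uses 3G of which
 * 1G is its own claimed protection, then the 6G of "floating" protection
 * is split over the 4G of unprotected usage, and this child picks up
 * 6G * 2G / 4G = 3G on top of its 1G claim.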
6878 * 6879 * We're using unprotected memory for the weight so that if 6880 * some cgroups DO claim explicit protection, we don't protect 6881 * the same bytes twice. 6882 * 6883 * Check both usage and parent_usage against the respective 6884 * protected values. One should imply the other, but they 6885 * aren't read atomically - make sure the division is sane. 6886 */ 6887 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) 6888 return ep; 6889 if (parent_effective > siblings_protected && 6890 parent_usage > siblings_protected && 6891 usage > protected) { 6892 unsigned long unclaimed; 6893 6894 unclaimed = parent_effective - siblings_protected; 6895 unclaimed *= usage - protected; 6896 unclaimed /= parent_usage - siblings_protected; 6897 6898 ep += unclaimed; 6899 } 6900 6901 return ep; 6902 } 6903 6904 /** 6905 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range 6906 * @root: the top ancestor of the sub-tree being checked 6907 * @memcg: the memory cgroup to check 6908 * 6909 * WARNING: This function is not stateless! It can only be used as part 6910 * of a top-down tree iteration, not for isolated queries. 6911 */ 6912 void mem_cgroup_calculate_protection(struct mem_cgroup *root, 6913 struct mem_cgroup *memcg) 6914 { 6915 unsigned long usage, parent_usage; 6916 struct mem_cgroup *parent; 6917 6918 if (mem_cgroup_disabled()) 6919 return; 6920 6921 if (!root) 6922 root = root_mem_cgroup; 6923 6924 /* 6925 * Effective values of the reclaim targets are ignored so they 6926 * can be stale. Have a look at mem_cgroup_protection for more 6927 * details. 6928 * TODO: calculation should be more robust so that we do not need 6929 * that special casing. 6930 */ 6931 if (memcg == root) 6932 return; 6933 6934 usage = page_counter_read(&memcg->memory); 6935 if (!usage) 6936 return; 6937 6938 parent = parent_mem_cgroup(memcg); 6939 6940 if (parent == root) { 6941 memcg->memory.emin = READ_ONCE(memcg->memory.min); 6942 memcg->memory.elow = READ_ONCE(memcg->memory.low); 6943 return; 6944 } 6945 6946 parent_usage = page_counter_read(&parent->memory); 6947 6948 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 6949 READ_ONCE(memcg->memory.min), 6950 READ_ONCE(parent->memory.emin), 6951 atomic_long_read(&parent->memory.children_min_usage))); 6952 6953 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 6954 READ_ONCE(memcg->memory.low), 6955 READ_ONCE(parent->memory.elow), 6956 atomic_long_read(&parent->memory.children_low_usage))); 6957 } 6958 6959 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, 6960 gfp_t gfp) 6961 { 6962 long nr_pages = folio_nr_pages(folio); 6963 int ret; 6964 6965 ret = try_charge(memcg, gfp, nr_pages); 6966 if (ret) 6967 goto out; 6968 6969 css_get(&memcg->css); 6970 commit_charge(folio, memcg); 6971 6972 local_irq_disable(); 6973 mem_cgroup_charge_statistics(memcg, nr_pages); 6974 memcg_check_events(memcg, folio_nid(folio)); 6975 local_irq_enable(); 6976 out: 6977 return ret; 6978 } 6979 6980 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) 6981 { 6982 struct mem_cgroup *memcg; 6983 int ret; 6984 6985 memcg = get_mem_cgroup_from_mm(mm); 6986 ret = charge_memcg(folio, memcg, gfp); 6987 css_put(&memcg->css); 6988 6989 return ret; 6990 } 6991 6992 /** 6993 * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. 6994 * @folio: folio to charge. 
6995 * @mm: mm context of the victim 6996 * @gfp: reclaim mode 6997 * @entry: swap entry for which the folio is allocated 6998 * 6999 * This function charges a folio allocated for swapin. Please call this before 7000 * adding the folio to the swapcache. 7001 * 7002 * Returns 0 on success. Otherwise, an error code is returned. 7003 */ 7004 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, 7005 gfp_t gfp, swp_entry_t entry) 7006 { 7007 struct mem_cgroup *memcg; 7008 unsigned short id; 7009 int ret; 7010 7011 if (mem_cgroup_disabled()) 7012 return 0; 7013 7014 id = lookup_swap_cgroup_id(entry); 7015 rcu_read_lock(); 7016 memcg = mem_cgroup_from_id(id); 7017 if (!memcg || !css_tryget_online(&memcg->css)) 7018 memcg = get_mem_cgroup_from_mm(mm); 7019 rcu_read_unlock(); 7020 7021 ret = charge_memcg(folio, memcg, gfp); 7022 7023 css_put(&memcg->css); 7024 return ret; 7025 } 7026 7027 /* 7028 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot 7029 * @entry: swap entry for which the page is charged 7030 * 7031 * Call this function after successfully adding the charged page to swapcache. 7032 * 7033 * Note: This function assumes the page for which swap slot is being uncharged 7034 * is order 0 page. 7035 */ 7036 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) 7037 { 7038 /* 7039 * Cgroup1's unified memory+swap counter has been charged with the 7040 * new swapcache page, finish the transfer by uncharging the swap 7041 * slot. The swap slot would also get uncharged when it dies, but 7042 * it can stick around indefinitely and we'd count the page twice 7043 * the entire time. 7044 * 7045 * Cgroup2 has separate resource counters for memory and swap, 7046 * so this is a non-issue here. Memory and swap charge lifetimes 7047 * correspond 1:1 to page and swap slot lifetimes: we charge the 7048 * page to memory here, and uncharge swap when the slot is freed. 7049 */ 7050 if (!mem_cgroup_disabled() && do_memsw_account()) { 7051 /* 7052 * The swap entry might not get freed for a long time, 7053 * let's not wait for it. The page already received a 7054 * memory+swap charge, drop the swap entry duplicate. 
7055 */ 7056 mem_cgroup_uncharge_swap(entry, 1); 7057 } 7058 } 7059 7060 struct uncharge_gather { 7061 struct mem_cgroup *memcg; 7062 unsigned long nr_memory; 7063 unsigned long pgpgout; 7064 unsigned long nr_kmem; 7065 int nid; 7066 }; 7067 7068 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 7069 { 7070 memset(ug, 0, sizeof(*ug)); 7071 } 7072 7073 static void uncharge_batch(const struct uncharge_gather *ug) 7074 { 7075 unsigned long flags; 7076 7077 if (ug->nr_memory) { 7078 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); 7079 if (do_memsw_account()) 7080 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); 7081 if (ug->nr_kmem) 7082 memcg_account_kmem(ug->memcg, -ug->nr_kmem); 7083 memcg_oom_recover(ug->memcg); 7084 } 7085 7086 local_irq_save(flags); 7087 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 7088 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); 7089 memcg_check_events(ug->memcg, ug->nid); 7090 local_irq_restore(flags); 7091 7092 /* drop reference from uncharge_folio */ 7093 css_put(&ug->memcg->css); 7094 } 7095 7096 static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) 7097 { 7098 long nr_pages; 7099 struct mem_cgroup *memcg; 7100 struct obj_cgroup *objcg; 7101 7102 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 7103 7104 /* 7105 * Nobody should be changing or seriously looking at 7106 * folio memcg or objcg at this point, we have fully 7107 * exclusive access to the folio. 7108 */ 7109 if (folio_memcg_kmem(folio)) { 7110 objcg = __folio_objcg(folio); 7111 /* 7112 * This get matches the put at the end of the function and 7113 * kmem pages do not hold memcg references anymore. 7114 */ 7115 memcg = get_mem_cgroup_from_objcg(objcg); 7116 } else { 7117 memcg = __folio_memcg(folio); 7118 } 7119 7120 if (!memcg) 7121 return; 7122 7123 if (ug->memcg != memcg) { 7124 if (ug->memcg) { 7125 uncharge_batch(ug); 7126 uncharge_gather_clear(ug); 7127 } 7128 ug->memcg = memcg; 7129 ug->nid = folio_nid(folio); 7130 7131 /* pairs with css_put in uncharge_batch */ 7132 css_get(&memcg->css); 7133 } 7134 7135 nr_pages = folio_nr_pages(folio); 7136 7137 if (folio_memcg_kmem(folio)) { 7138 ug->nr_memory += nr_pages; 7139 ug->nr_kmem += nr_pages; 7140 7141 folio->memcg_data = 0; 7142 obj_cgroup_put(objcg); 7143 } else { 7144 /* LRU pages aren't accounted at the root level */ 7145 if (!mem_cgroup_is_root(memcg)) 7146 ug->nr_memory += nr_pages; 7147 ug->pgpgout++; 7148 7149 folio->memcg_data = 0; 7150 } 7151 7152 css_put(&memcg->css); 7153 } 7154 7155 void __mem_cgroup_uncharge(struct folio *folio) 7156 { 7157 struct uncharge_gather ug; 7158 7159 /* Don't touch folio->lru of any random page, pre-check: */ 7160 if (!folio_memcg(folio)) 7161 return; 7162 7163 uncharge_gather_clear(&ug); 7164 uncharge_folio(folio, &ug); 7165 uncharge_batch(&ug); 7166 } 7167 7168 /** 7169 * __mem_cgroup_uncharge_list - uncharge a list of page 7170 * @page_list: list of pages to uncharge 7171 * 7172 * Uncharge a list of pages previously charged with 7173 * __mem_cgroup_charge(). 7174 */ 7175 void __mem_cgroup_uncharge_list(struct list_head *page_list) 7176 { 7177 struct uncharge_gather ug; 7178 struct folio *folio; 7179 7180 uncharge_gather_clear(&ug); 7181 list_for_each_entry(folio, page_list, lru) 7182 uncharge_folio(folio, &ug); 7183 if (ug.memcg) 7184 uncharge_batch(&ug); 7185 } 7186 7187 /** 7188 * mem_cgroup_migrate - Charge a folio's replacement. 7189 * @old: Currently circulating folio. 7190 * @new: Replacement folio. 
7191 * 7192 * Charge @new as a replacement folio for @old. @old will 7193 * be uncharged upon free. 7194 * 7195 * Both folios must be locked, @new->mapping must be set up. 7196 */ 7197 void mem_cgroup_migrate(struct folio *old, struct folio *new) 7198 { 7199 struct mem_cgroup *memcg; 7200 long nr_pages = folio_nr_pages(new); 7201 unsigned long flags; 7202 7203 VM_BUG_ON_FOLIO(!folio_test_locked(old), old); 7204 VM_BUG_ON_FOLIO(!folio_test_locked(new), new); 7205 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); 7206 VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new); 7207 7208 if (mem_cgroup_disabled()) 7209 return; 7210 7211 /* Page cache replacement: new folio already charged? */ 7212 if (folio_memcg(new)) 7213 return; 7214 7215 memcg = folio_memcg(old); 7216 VM_WARN_ON_ONCE_FOLIO(!memcg, old); 7217 if (!memcg) 7218 return; 7219 7220 /* Force-charge the new page. The old one will be freed soon */ 7221 if (!mem_cgroup_is_root(memcg)) { 7222 page_counter_charge(&memcg->memory, nr_pages); 7223 if (do_memsw_account()) 7224 page_counter_charge(&memcg->memsw, nr_pages); 7225 } 7226 7227 css_get(&memcg->css); 7228 commit_charge(new, memcg); 7229 7230 local_irq_save(flags); 7231 mem_cgroup_charge_statistics(memcg, nr_pages); 7232 memcg_check_events(memcg, folio_nid(new)); 7233 local_irq_restore(flags); 7234 } 7235 7236 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 7237 EXPORT_SYMBOL(memcg_sockets_enabled_key); 7238 7239 void mem_cgroup_sk_alloc(struct sock *sk) 7240 { 7241 struct mem_cgroup *memcg; 7242 7243 if (!mem_cgroup_sockets_enabled) 7244 return; 7245 7246 /* Do not associate the sock with unrelated interrupted task's memcg. */ 7247 if (!in_task()) 7248 return; 7249 7250 rcu_read_lock(); 7251 memcg = mem_cgroup_from_task(current); 7252 if (mem_cgroup_is_root(memcg)) 7253 goto out; 7254 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 7255 goto out; 7256 if (css_tryget(&memcg->css)) 7257 sk->sk_memcg = memcg; 7258 out: 7259 rcu_read_unlock(); 7260 } 7261 7262 void mem_cgroup_sk_free(struct sock *sk) 7263 { 7264 if (sk->sk_memcg) 7265 css_put(&sk->sk_memcg->css); 7266 } 7267 7268 /** 7269 * mem_cgroup_charge_skmem - charge socket memory 7270 * @memcg: memcg to charge 7271 * @nr_pages: number of pages to charge 7272 * @gfp_mask: reclaim mode 7273 * 7274 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 7275 * @memcg's configured limit, %false if it doesn't. 
7276 */ 7277 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 7278 gfp_t gfp_mask) 7279 { 7280 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7281 struct page_counter *fail; 7282 7283 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 7284 memcg->tcpmem_pressure = 0; 7285 return true; 7286 } 7287 memcg->tcpmem_pressure = 1; 7288 if (gfp_mask & __GFP_NOFAIL) { 7289 page_counter_charge(&memcg->tcpmem, nr_pages); 7290 return true; 7291 } 7292 return false; 7293 } 7294 7295 if (try_charge(memcg, gfp_mask, nr_pages) == 0) { 7296 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 7297 return true; 7298 } 7299 7300 return false; 7301 } 7302 7303 /** 7304 * mem_cgroup_uncharge_skmem - uncharge socket memory 7305 * @memcg: memcg to uncharge 7306 * @nr_pages: number of pages to uncharge 7307 */ 7308 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7309 { 7310 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7311 page_counter_uncharge(&memcg->tcpmem, nr_pages); 7312 return; 7313 } 7314 7315 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 7316 7317 refill_stock(memcg, nr_pages); 7318 } 7319 7320 static int __init cgroup_memory(char *s) 7321 { 7322 char *token; 7323 7324 while ((token = strsep(&s, ",")) != NULL) { 7325 if (!*token) 7326 continue; 7327 if (!strcmp(token, "nosocket")) 7328 cgroup_memory_nosocket = true; 7329 if (!strcmp(token, "nokmem")) 7330 cgroup_memory_nokmem = true; 7331 if (!strcmp(token, "nobpf")) 7332 cgroup_memory_nobpf = true; 7333 } 7334 return 1; 7335 } 7336 __setup("cgroup.memory=", cgroup_memory); 7337 7338 /* 7339 * subsys_initcall() for memory controller. 7340 * 7341 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 7342 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 7343 * basically everything that doesn't depend on a specific mem_cgroup structure 7344 * should be initialized from here. 7345 */ 7346 static int __init mem_cgroup_init(void) 7347 { 7348 int cpu, node; 7349 7350 /* 7351 * Currently s32 type (can refer to struct batched_lruvec_stat) is 7352 * used for per-memcg-per-cpu caching of per-node statistics. In order 7353 * to work fine, we should make sure that the overfill threshold can't 7354 * exceed S32_MAX / PAGE_SIZE. 7355 */ 7356 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); 7357 7358 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 7359 memcg_hotplug_cpu_dead); 7360 7361 for_each_possible_cpu(cpu) 7362 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 7363 drain_local_stock); 7364 7365 for_each_node(node) { 7366 struct mem_cgroup_tree_per_node *rtpn; 7367 7368 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); 7369 7370 rtpn->rb_root = RB_ROOT; 7371 rtpn->rb_rightmost = NULL; 7372 spin_lock_init(&rtpn->lock); 7373 soft_limit_tree.rb_tree_per_node[node] = rtpn; 7374 } 7375 7376 return 0; 7377 } 7378 subsys_initcall(mem_cgroup_init); 7379 7380 #ifdef CONFIG_SWAP 7381 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 7382 { 7383 while (!refcount_inc_not_zero(&memcg->id.ref)) { 7384 /* 7385 * The root cgroup cannot be destroyed, so it's refcount must 7386 * always be >= 1. 
7387 */ 7388 if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) { 7389 VM_BUG_ON(1); 7390 break; 7391 } 7392 memcg = parent_mem_cgroup(memcg); 7393 if (!memcg) 7394 memcg = root_mem_cgroup; 7395 } 7396 return memcg; 7397 } 7398 7399 /** 7400 * mem_cgroup_swapout - transfer a memsw charge to swap 7401 * @folio: folio whose memsw charge to transfer 7402 * @entry: swap entry to move the charge to 7403 * 7404 * Transfer the memsw charge of @folio to @entry. 7405 */ 7406 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) 7407 { 7408 struct mem_cgroup *memcg, *swap_memcg; 7409 unsigned int nr_entries; 7410 unsigned short oldid; 7411 7412 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 7413 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 7414 7415 if (mem_cgroup_disabled()) 7416 return; 7417 7418 if (!do_memsw_account()) 7419 return; 7420 7421 memcg = folio_memcg(folio); 7422 7423 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 7424 if (!memcg) 7425 return; 7426 7427 /* 7428 * In case the memcg owning these pages has been offlined and doesn't 7429 * have an ID allocated to it anymore, charge the closest online 7430 * ancestor for the swap instead and transfer the memory+swap charge. 7431 */ 7432 swap_memcg = mem_cgroup_id_get_online(memcg); 7433 nr_entries = folio_nr_pages(folio); 7434 /* Get references for the tail pages, too */ 7435 if (nr_entries > 1) 7436 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7437 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7438 nr_entries); 7439 VM_BUG_ON_FOLIO(oldid, folio); 7440 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7441 7442 folio->memcg_data = 0; 7443 7444 if (!mem_cgroup_is_root(memcg)) 7445 page_counter_uncharge(&memcg->memory, nr_entries); 7446 7447 if (memcg != swap_memcg) { 7448 if (!mem_cgroup_is_root(swap_memcg)) 7449 page_counter_charge(&swap_memcg->memsw, nr_entries); 7450 page_counter_uncharge(&memcg->memsw, nr_entries); 7451 } 7452 7453 /* 7454 * Interrupts should be disabled here because the caller holds the 7455 * i_pages lock which is taken with interrupts-off. It is 7456 * important here to have the interrupts disabled because it is the 7457 * only synchronisation we have for updating the per-CPU variables. 7458 */ 7459 memcg_stats_lock(); 7460 mem_cgroup_charge_statistics(memcg, -nr_entries); 7461 memcg_stats_unlock(); 7462 memcg_check_events(memcg, folio_nid(folio)); 7463 7464 css_put(&memcg->css); 7465 } 7466 7467 /** 7468 * __mem_cgroup_try_charge_swap - try charging swap space for a folio 7469 * @folio: folio being added to swap 7470 * @entry: swap entry to charge 7471 * 7472 * Try to charge @folio's memcg for the swap space at @entry. 7473 * 7474 * Returns 0 on success, -ENOMEM on failure. 
7475 */ 7476 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) 7477 { 7478 unsigned int nr_pages = folio_nr_pages(folio); 7479 struct page_counter *counter; 7480 struct mem_cgroup *memcg; 7481 unsigned short oldid; 7482 7483 if (do_memsw_account()) 7484 return 0; 7485 7486 memcg = folio_memcg(folio); 7487 7488 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 7489 if (!memcg) 7490 return 0; 7491 7492 if (!entry.val) { 7493 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7494 return 0; 7495 } 7496 7497 memcg = mem_cgroup_id_get_online(memcg); 7498 7499 if (!mem_cgroup_is_root(memcg) && 7500 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 7501 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 7502 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7503 mem_cgroup_id_put(memcg); 7504 return -ENOMEM; 7505 } 7506 7507 /* Get references for the tail pages, too */ 7508 if (nr_pages > 1) 7509 mem_cgroup_id_get_many(memcg, nr_pages - 1); 7510 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 7511 VM_BUG_ON_FOLIO(oldid, folio); 7512 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 7513 7514 return 0; 7515 } 7516 7517 /** 7518 * __mem_cgroup_uncharge_swap - uncharge swap space 7519 * @entry: swap entry to uncharge 7520 * @nr_pages: the amount of swap space to uncharge 7521 */ 7522 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 7523 { 7524 struct mem_cgroup *memcg; 7525 unsigned short id; 7526 7527 if (mem_cgroup_disabled()) 7528 return; 7529 7530 id = swap_cgroup_record(entry, 0, nr_pages); 7531 rcu_read_lock(); 7532 memcg = mem_cgroup_from_id(id); 7533 if (memcg) { 7534 if (!mem_cgroup_is_root(memcg)) { 7535 if (do_memsw_account()) 7536 page_counter_uncharge(&memcg->memsw, nr_pages); 7537 else 7538 page_counter_uncharge(&memcg->swap, nr_pages); 7539 } 7540 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 7541 mem_cgroup_id_put_many(memcg, nr_pages); 7542 } 7543 rcu_read_unlock(); 7544 } 7545 7546 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 7547 { 7548 long nr_swap_pages = get_nr_swap_pages(); 7549 7550 if (mem_cgroup_disabled() || do_memsw_account()) 7551 return nr_swap_pages; 7552 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) 7553 nr_swap_pages = min_t(long, nr_swap_pages, 7554 READ_ONCE(memcg->swap.max) - 7555 page_counter_read(&memcg->swap)); 7556 return nr_swap_pages; 7557 } 7558 7559 bool mem_cgroup_swap_full(struct folio *folio) 7560 { 7561 struct mem_cgroup *memcg; 7562 7563 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 7564 7565 if (vm_swap_full()) 7566 return true; 7567 if (do_memsw_account()) 7568 return false; 7569 7570 memcg = folio_memcg(folio); 7571 if (!memcg) 7572 return false; 7573 7574 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { 7575 unsigned long usage = page_counter_read(&memcg->swap); 7576 7577 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 7578 usage * 2 >= READ_ONCE(memcg->swap.max)) 7579 return true; 7580 } 7581 7582 return false; 7583 } 7584 7585 static int __init setup_swap_account(char *s) 7586 { 7587 pr_warn_once("The swapaccount= commandline option is deprecated. 
" 7588 "Please report your usecase to linux-mm@kvack.org if you " 7589 "depend on this functionality.\n"); 7590 return 1; 7591 } 7592 __setup("swapaccount=", setup_swap_account); 7593 7594 static u64 swap_current_read(struct cgroup_subsys_state *css, 7595 struct cftype *cft) 7596 { 7597 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 7598 7599 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 7600 } 7601 7602 static u64 swap_peak_read(struct cgroup_subsys_state *css, 7603 struct cftype *cft) 7604 { 7605 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 7606 7607 return (u64)memcg->swap.watermark * PAGE_SIZE; 7608 } 7609 7610 static int swap_high_show(struct seq_file *m, void *v) 7611 { 7612 return seq_puts_memcg_tunable(m, 7613 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 7614 } 7615 7616 static ssize_t swap_high_write(struct kernfs_open_file *of, 7617 char *buf, size_t nbytes, loff_t off) 7618 { 7619 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7620 unsigned long high; 7621 int err; 7622 7623 buf = strstrip(buf); 7624 err = page_counter_memparse(buf, "max", &high); 7625 if (err) 7626 return err; 7627 7628 page_counter_set_high(&memcg->swap, high); 7629 7630 return nbytes; 7631 } 7632 7633 static int swap_max_show(struct seq_file *m, void *v) 7634 { 7635 return seq_puts_memcg_tunable(m, 7636 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 7637 } 7638 7639 static ssize_t swap_max_write(struct kernfs_open_file *of, 7640 char *buf, size_t nbytes, loff_t off) 7641 { 7642 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7643 unsigned long max; 7644 int err; 7645 7646 buf = strstrip(buf); 7647 err = page_counter_memparse(buf, "max", &max); 7648 if (err) 7649 return err; 7650 7651 xchg(&memcg->swap.max, max); 7652 7653 return nbytes; 7654 } 7655 7656 static int swap_events_show(struct seq_file *m, void *v) 7657 { 7658 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 7659 7660 seq_printf(m, "high %lu\n", 7661 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 7662 seq_printf(m, "max %lu\n", 7663 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 7664 seq_printf(m, "fail %lu\n", 7665 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 7666 7667 return 0; 7668 } 7669 7670 static struct cftype swap_files[] = { 7671 { 7672 .name = "swap.current", 7673 .flags = CFTYPE_NOT_ON_ROOT, 7674 .read_u64 = swap_current_read, 7675 }, 7676 { 7677 .name = "swap.high", 7678 .flags = CFTYPE_NOT_ON_ROOT, 7679 .seq_show = swap_high_show, 7680 .write = swap_high_write, 7681 }, 7682 { 7683 .name = "swap.max", 7684 .flags = CFTYPE_NOT_ON_ROOT, 7685 .seq_show = swap_max_show, 7686 .write = swap_max_write, 7687 }, 7688 { 7689 .name = "swap.peak", 7690 .flags = CFTYPE_NOT_ON_ROOT, 7691 .read_u64 = swap_peak_read, 7692 }, 7693 { 7694 .name = "swap.events", 7695 .flags = CFTYPE_NOT_ON_ROOT, 7696 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 7697 .seq_show = swap_events_show, 7698 }, 7699 { } /* terminate */ 7700 }; 7701 7702 static struct cftype memsw_files[] = { 7703 { 7704 .name = "memsw.usage_in_bytes", 7705 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 7706 .read_u64 = mem_cgroup_read_u64, 7707 }, 7708 { 7709 .name = "memsw.max_usage_in_bytes", 7710 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 7711 .write = mem_cgroup_reset, 7712 .read_u64 = mem_cgroup_read_u64, 7713 }, 7714 { 7715 .name = "memsw.limit_in_bytes", 7716 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 7717 .write = mem_cgroup_write, 7718 .read_u64 = 
mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
/**
 * obj_cgroup_may_zswap - check if this cgroup can zswap
 * @objcg: the object cgroup
 *
 * Check if the hierarchical zswap limit has been reached.
 *
 * This doesn't check for specific headroom, and it is not atomic
 * either. But with zswap, the size of the allocation is only known
 * once compression has occurred, and this optimistic pre-check avoids
 * spending cycles on compression when there is already no room left
 * or zswap is disabled altogether somewhere in the hierarchy.
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg, *original_memcg;
	bool ret = true;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;

	original_memcg = get_mem_cgroup_from_objcg(objcg);
	for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
	     memcg = parent_mem_cgroup(memcg)) {
		unsigned long max = READ_ONCE(memcg->zswap_max);
		unsigned long pages;

		if (max == PAGE_COUNTER_MAX)
			continue;
		if (max == 0) {
			ret = false;
			break;
		}

		cgroup_rstat_flush(memcg->css.cgroup);
		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
		if (pages < max)
			continue;
		ret = false;
		break;
	}
	mem_cgroup_put(original_memcg);
	return ret;
}

/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
 */
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));

	/* PF_MEMALLOC context, charging must succeed */
	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
		VM_WARN_ON_ONCE(1);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
	rcu_read_unlock();
}

/**
 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * Uncharges zswap memory on page in.
7808 */ 7809 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) 7810 { 7811 struct mem_cgroup *memcg; 7812 7813 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7814 return; 7815 7816 obj_cgroup_uncharge(objcg, size); 7817 7818 rcu_read_lock(); 7819 memcg = obj_cgroup_memcg(objcg); 7820 mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); 7821 mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); 7822 rcu_read_unlock(); 7823 } 7824 7825 static u64 zswap_current_read(struct cgroup_subsys_state *css, 7826 struct cftype *cft) 7827 { 7828 cgroup_rstat_flush(css->cgroup); 7829 return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); 7830 } 7831 7832 static int zswap_max_show(struct seq_file *m, void *v) 7833 { 7834 return seq_puts_memcg_tunable(m, 7835 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max)); 7836 } 7837 7838 static ssize_t zswap_max_write(struct kernfs_open_file *of, 7839 char *buf, size_t nbytes, loff_t off) 7840 { 7841 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7842 unsigned long max; 7843 int err; 7844 7845 buf = strstrip(buf); 7846 err = page_counter_memparse(buf, "max", &max); 7847 if (err) 7848 return err; 7849 7850 xchg(&memcg->zswap_max, max); 7851 7852 return nbytes; 7853 } 7854 7855 static struct cftype zswap_files[] = { 7856 { 7857 .name = "zswap.current", 7858 .flags = CFTYPE_NOT_ON_ROOT, 7859 .read_u64 = zswap_current_read, 7860 }, 7861 { 7862 .name = "zswap.max", 7863 .flags = CFTYPE_NOT_ON_ROOT, 7864 .seq_show = zswap_max_show, 7865 .write = zswap_max_write, 7866 }, 7867 { } /* terminate */ 7868 }; 7869 #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ 7870 7871 static int __init mem_cgroup_swap_init(void) 7872 { 7873 if (mem_cgroup_disabled()) 7874 return 0; 7875 7876 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 7877 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 7878 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 7879 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files)); 7880 #endif 7881 return 0; 7882 } 7883 subsys_initcall(mem_cgroup_swap_init); 7884 7885 #endif /* CONFIG_SWAP */ 7886
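
/*
 * Usage illustration for the swap and zswap interfaces registered above
 * (file names as they appear under cgroup v2, with the "memory." prefix
 * added by the cgroup core):
 *
 *   echo 2G  > /sys/fs/cgroup/<group>/memory.swap.max
 *   echo 1G  > /sys/fs/cgroup/<group>/memory.swap.high
 *   echo 64M > /sys/fs/cgroup/<group>/memory.zswap.max
 *
 * Writing "max" resets a limit to unlimited, matching the "max" keyword
 * handling of page_counter_memparse() in the write handlers above.
 */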