/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

#define MEM_CGROUP_RECLAIM_RETRIES	5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"dirty",
	"writeback",
	"swap",
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or when the cgroup is removed. This callback must be
	 * set if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

#ifndef CONFIG_SLOB
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using the cgroup id for this:
 * this works better in sparse environments, where we have a lot of memcgs,
 * but only a few kmem-limited. Or also, if we have, for instance, 200
 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
 * 200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c.
 * In any case, the cgrp_id space is not getting any smaller, and we don't
 * necessarily have to increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);

struct workqueue_struct *memcg_kmem_cache_wq;

#endif /* !CONFIG_SLOB */

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it should only be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = READ_ONCE(page->mem_cgroup);
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Return the page count for a single (non-recursive) @memcg.
 *
 * Implementation note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use thresholds and do periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * reading cost and precision of the value, so we could implement a similar
 * periodic synchronization for memcg's counters.
 *
 * But this _read() function is currently used for the user interface. The
 * user accounts memory usage by memory cgroup and _always_ requires an exact
 * value for that accounting. Even if we provided a quick-and-fuzzy read, we
 * would still have to visit all online cpus and sum up. So, for now, the
 * extra synchronization is not implemented (it is only implemented for cpu
 * hotplug).
 *
 * If kernel-internal users appear that can make use of a not-exact value,
 * and reading all cpu values becomes a performance bottleneck in some common
 * workload, thresholds and synchronization as in vmstat[] should be
 * implemented.
 */
static unsigned long
mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	/* Per-cpu values can be negative, use a signed accumulator */
	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
	/*
	 * Summing races with updates, so val may be negative.  Avoid exposing
	 * transient negative values.
	 */
	if (val < 0)
		val = 0;
	return val;
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool compound, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);
	}

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}

unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
					   int nid, unsigned int lru_mask)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
	unsigned long nr = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		nr += mem_cgroup_get_lru_size(lruvec, lru);
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
					     unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter[reclaim->priority];

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;
	int i;

	while ((memcg = parent_mem_cgroup(memcg))) {
		for_each_node(nid) {
			mz = mem_cgroup_nodeinfo(memcg, nid);
			for (i = 0; i <= DEF_PRIORITY; i++) {
				iter = &mz->iter[i];
				cmpxchg(&iter->position,
					dead_memcg, NULL);
			}
		}
	}
}

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(memcg == root_mem_cgroup);

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @pgdat: pgdat of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &pgdat->lruvec;
		goto out;
	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;

	mz = mem_cgroup_page_nodeinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->pgdat here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->pgdat != pgdat))
		lruvec->pgdat = pgdat;
	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list (that ordering being
 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	if (nr_pages < 0)
		*lru_size += nr_pages;

	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		*lru_size = 0;
	}

	if (nr_pages > 0)
		*lru_size += nr_pages;
}

bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
	struct mem_cgroup *task_memcg;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		task_memcg = get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		task_memcg = mem_cgroup_from_task(task);
		css_get(&task_memcg->css);
		rcu_read_unlock();
	}
	ret = mem_cgroup_is_descendant(task_memcg, memcg);
	css_put(&task_memcg->css);
	return ret;
}

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.limit);
	if (count < limit)
		margin = limit - count;

	if (do_memsw_account()) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.limit);
		if (count <= limit)
			margin = min(margin, limit - count);
		else
			margin = 0;
	}

	return margin;
}

/*
 * A routine for checking whether "mem" is under move_account() or not.
 *
 * Checks whether a cgroup is mc.from, mc.to, or under the hierarchy of
 * the moving cgroups. This is for waiting at the high memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_is_descendant(from, memcg) ||
		mem_cgroup_is_descendant(to, memcg);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct mem_cgroup *iter;
	unsigned int i;

	rcu_read_lock();

	if (p) {
		pr_info("Task in ");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
		pr_cont(" killed as a result of limit of ");
	} else {
		pr_info("Memory limit reached of cgroup ");
	}

	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont("\n");

	rcu_read_unlock();

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)memcg->memory.limit), memcg->memory.failcnt);
	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memsw)),
		K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->kmem)),
		K((u64)memcg->kmem.limit), memcg->kmem.failcnt);

	for_each_mem_cgroup_tree(iter, memcg) {
		pr_info("Memory cgroup stats for ");
		pr_cont_cgroup_path(iter->css.cgroup);
		pr_cont(":");

		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
				continue;
			pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
				K(mem_cgroup_read_stat(iter, i)));
		}

		for (i = 0; i < NR_LRU_LISTS; i++)
			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));

		pr_cont("\n");
	}
}

/*
 * This function returns the number of memcgs under the hierarchy tree.
 * Returns 1 (self count) if there are no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		num++;
	return num;
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	unsigned long limit;

	limit = memcg->memory.limit;
	if (mem_cgroup_swappiness(memcg)) {
		unsigned long memsw_limit;
		unsigned long swap_limit;

		memsw_limit = memcg->memsw.limit;
		swap_limit = memcg->swap.limit;
		swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
		limit = min(limit + swap_limit, memsw_limit);
	}
	return limit;
}

static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = memcg,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	bool ret;

	mutex_lock(&oom_lock);
	ret = out_of_memory(&oc);
	mutex_unlock(&oom_lock);
	return ret;
}

#if MAX_NUMNODES > 1

/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap: specify true here if the user wants file only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
		int nid, bool noswap)
{
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
		return true;
	if (noswap || !total_swap_pages)
		return false;
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
		return true;
	return false;

}

/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist. So update the list loosely once per 10 secs.
 *
 */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	memcg->scan_nodes = node_states[N_MEMORY];

	for_each_node_mask(nid, node_states[N_MEMORY]) {

		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
			node_clear(nid, memcg->scan_nodes);
	}

	atomic_set(&memcg->numainfo_events, 0);
	atomic_set(&memcg->numainfo_updating, 0);
}

/*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is OK. Considering
 * memory reclaim from the current node, there are pros and cons.
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used. So, it may make the LRU bad. And if several threads
 * hit limits, they will see contention on a node. But freeing from a remote
 * node means more costs for memory reclaim because of memory latency.
 *
 * Now, we use round-robin. A better algorithm is welcome.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

	node = next_node_in(node, memcg->scan_nodes);
	/*
	 * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
	 * last time it really checked all the LRUs due to rate limiting.
	 * Fallback to the current node in that case for simplicity.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	memcg->last_scanned_node = node;
	return node;
}
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
#endif

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
		.priority = 0,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we
				 * don't reclaim too much, nor so little
				 * that we keep coming back to reclaim from
				 * this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
					pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up on the way to the failing
		 * subtree
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
	 */
	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
			iter->under_oom--;
	spin_unlock(&memcg_oom_lock);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_t	wait;
};

static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM.  This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
	if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	if (!current->memcg_may_oom)
		return;
	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * Also, the caller may handle a failed allocation gracefully
	 * (like optional page cache readahead) and so an OOM killer
	 * invocation might not even be necessary.
	 *
	 * That's why we don't do anything here except remember the
	 * OOM context and then deal with it at the end of the page
	 * fault when the stack is unwound, the locks are released,
	 * and when we know whether the fault was overall successful.
	 */
	css_get(&memcg->css);
	current->memcg_in_oom = memcg;
	current->memcg_oom_gfp_mask = mask;
	current->memcg_oom_order = order;
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	if (locked && !memcg->oom_kill_disable) {
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
					 current->memcg_oom_order);
	} else {
		schedule();
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}

	if (locked) {
		mem_cgroup_oom_unlock(memcg);
		/*
		 * There is no guarantee that an OOM-lock contender
		 * sees the wakeups triggered by the OOM kill
		 * uncharges.  Wake any sleepers explicitly.
		 */
		memcg_oom_recover(memcg);
	}
cleanup:
	current->memcg_in_oom = NULL;
	css_put(&memcg->css);
	return true;
}

/**
 * lock_page_memcg - lock a page->mem_cgroup binding
 * @page: the page
 *
 * This function protects unlocked LRU pages from being moved to
 * another cgroup and stabilizes their page->mem_cgroup binding.
 */
void lock_page_memcg(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;

	/*
	 * The RCU lock is held throughout the transaction.  The fast
	 * path can get away without acquiring the memcg->move_lock
	 * because page moving starts with an RCU grace period.
	 */
	rcu_read_lock();

	if (mem_cgroup_disabled())
		return;
again:
	memcg = page->mem_cgroup;
	if (unlikely(!memcg))
		return;

	if (atomic_read(&memcg->moving_account) <= 0)
		return;

	spin_lock_irqsave(&memcg->move_lock, flags);
	if (memcg != page->mem_cgroup) {
		spin_unlock_irqrestore(&memcg->move_lock, flags);
		goto again;
	}

	/*
	 * When charge migration first begins, we can have locked and
	 * unlocked page stat updates happening concurrently.  Track
	 * the task who has the lock for unlock_page_memcg().
	 */
	memcg->move_lock_task = current;
	memcg->move_lock_flags = flags;

	return;
}
EXPORT_SYMBOL(lock_page_memcg);

/**
 * unlock_page_memcg - unlock a page->mem_cgroup binding
 * @page: the page
 */
void unlock_page_memcg(struct page *page)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (memcg && memcg->move_lock_task == current) {
		unsigned long flags = memcg->move_lock_flags;

		memcg->move_lock_task = NULL;
		memcg->move_lock_flags = 0;

		spin_unlock_irqrestore(&memcg->move_lock, flags);
	}

	rcu_read_unlock();
}
EXPORT_SYMBOL(unlock_page_memcg);

/*
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: bigger numbers may be necessary on big iron.
 */
#define CHARGE_BATCH	32U
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	unsigned int nr_pages;
	struct work_struct work;
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE	0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	bool ret = false;

	if (nr_pages > CHARGE_BATCH)
		return ret;

	local_irq_save(flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
		stock->nr_pages -= nr_pages;
		ret = true;
	}

	local_irq_restore(flags);

	return ret;
}

/*
 * Return the stocked charges to the page counters and reset the cached
 * information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->nr_pages) {
		page_counter_uncharge(&old->memory, stock->nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&old->memsw, stock->nr_pages);
		css_put_many(&old->css, stock->nr_pages);
		stock->nr_pages = 0;
	}
	stock->cached = NULL;
}

static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;

	local_irq_save(flags);

	stock = this_cpu_ptr(&memcg_stock);
	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

	local_irq_restore(flags);
}

/*
 * Cache charges (val) in the local per-cpu area.
 * They will be consumed by consume_stock() later.
 */
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;

	local_irq_save(flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (stock->cached != memcg) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = memcg;
	}
	stock->nr_pages += nr_pages;

	local_irq_restore(flags);
}

/*
 * Drains all per-CPU charge caches for the given root_memcg and the
 * subtree of the hierarchy under it.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid running more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	get_online_cpus();
	curcpu = get_cpu();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;

		memcg = stock->cached;
		if (!memcg || !stock->nr_pages)
			continue;
		if (!mem_cgroup_is_descendant(memcg, root_memcg))
			continue;
		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else
				schedule_work_on(cpu, &stock->work);
		}
	}
	put_cpu();
	put_online_cpus();
	mutex_unlock(&percpu_charge_mutex);
}

static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
	struct memcg_stock_pcp *stock;

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return 0;
}

static void reclaim_high(struct mem_cgroup *memcg,
			 unsigned int nr_pages,
			 gfp_t gfp_mask)
{
	do {
		if (page_counter_read(&memcg->memory) <= memcg->high)
			continue;
		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
	} while ((memcg = parent_mem_cgroup(memcg)));
}

static void high_work_func(struct work_struct *work)
{
	struct mem_cgroup *memcg;

	memcg = container_of(work, struct mem_cgroup, high_work);
	reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
}

/*
 * Scheduled by try_charge() to be executed from the userland return path
 * and reclaims memory over the high limit.
 */
void mem_cgroup_handle_over_high(void)
{
	unsigned int nr_pages = current->memcg_nr_pages_over_high;
	struct mem_cgroup *memcg;

	if (likely(!nr_pages))
		return;

	memcg = get_mem_cgroup_from_mm(current->mm);
	reclaim_high(memcg, nr_pages, GFP_KERNEL);
	css_put(&memcg->css);
	current->memcg_nr_pages_over_high = 0;
}

static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		      unsigned int nr_pages)
{
	unsigned int batch = max(CHARGE_BATCH, nr_pages);
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *mem_over_limit;
	struct page_counter *counter;
	unsigned long nr_reclaimed;
	bool may_swap = true;
	bool drained = false;

	if (mem_cgroup_is_root(memcg))
		return 0;
retry:
	if (consume_stock(memcg, nr_pages))
		return 0;

	if (!do_memsw_account() ||
	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
		if (page_counter_try_charge(&memcg->memory, batch, &counter))
			goto done_restock;
		if (do_memsw_account())
			page_counter_uncharge(&memcg->memsw, batch);
		mem_over_limit = mem_cgroup_from_counter(counter, memory);
	} else {
		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
		may_swap = false;
	}

	if (batch > nr_pages) {
		batch = nr_pages;
		goto retry;
	}

	/*
	 * Unlike in global OOM situations, memcg is not in a physical
	 * memory shortage.  Allow dying and OOM-killed tasks to
	 * bypass the last charges so that they can exit quickly and
	 * free their memory.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
		     fatal_signal_pending(current) ||
		     current->flags & PF_EXITING))
		goto force;

	/*
	 * Prevent unbounded recursion when reclaim operations need to
	 * allocate memory. This might exceed the limits temporarily,
	 * but we prefer facilitating memory reclaim and getting back
	 * under the limit over triggering OOM kills in these cases.
	 */
	if (unlikely(current->flags & PF_MEMALLOC))
		goto force;

	if (unlikely(task_in_memcg_oom(current)))
		goto nomem;

	if (!gfpflags_allow_blocking(gfp_mask))
		goto nomem;

	mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);

	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, may_swap);

	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;

	if (!drained) {
		drain_all_stock(mem_over_limit);
		drained = true;
		goto retry;
	}

	if (gfp_mask & __GFP_NORETRY)
		goto nomem;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
		goto retry;
	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
1961 */ 1962 if (mem_cgroup_wait_acct_move(mem_over_limit)) 1963 goto retry; 1964 1965 if (nr_retries--) 1966 goto retry; 1967 1968 if (gfp_mask & __GFP_NOFAIL) 1969 goto force; 1970 1971 if (fatal_signal_pending(current)) 1972 goto force; 1973 1974 mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); 1975 1976 mem_cgroup_oom(mem_over_limit, gfp_mask, 1977 get_order(nr_pages * PAGE_SIZE)); 1978 nomem: 1979 if (!(gfp_mask & __GFP_NOFAIL)) 1980 return -ENOMEM; 1981 force: 1982 /* 1983 * The allocation either can't fail or will lead to more memory 1984 * being freed very soon. Allow memory usage go over the limit 1985 * temporarily by force charging it. 1986 */ 1987 page_counter_charge(&memcg->memory, nr_pages); 1988 if (do_memsw_account()) 1989 page_counter_charge(&memcg->memsw, nr_pages); 1990 css_get_many(&memcg->css, nr_pages); 1991 1992 return 0; 1993 1994 done_restock: 1995 css_get_many(&memcg->css, batch); 1996 if (batch > nr_pages) 1997 refill_stock(memcg, batch - nr_pages); 1998 1999 /* 2000 * If the hierarchy is above the normal consumption range, schedule 2001 * reclaim on returning to userland. We can perform reclaim here 2002 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2003 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2004 * not recorded as it most likely matches current's and won't 2005 * change in the meantime. As high limit is checked again before 2006 * reclaim, the cost of mismatch is negligible. 2007 */ 2008 do { 2009 if (page_counter_read(&memcg->memory) > memcg->high) { 2010 /* Don't bother a random interrupted task */ 2011 if (in_interrupt()) { 2012 schedule_work(&memcg->high_work); 2013 break; 2014 } 2015 current->memcg_nr_pages_over_high += batch; 2016 set_notify_resume(current); 2017 break; 2018 } 2019 } while ((memcg = parent_mem_cgroup(memcg))); 2020 2021 return 0; 2022 } 2023 2024 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2025 { 2026 if (mem_cgroup_is_root(memcg)) 2027 return; 2028 2029 page_counter_uncharge(&memcg->memory, nr_pages); 2030 if (do_memsw_account()) 2031 page_counter_uncharge(&memcg->memsw, nr_pages); 2032 2033 css_put_many(&memcg->css, nr_pages); 2034 } 2035 2036 static void lock_page_lru(struct page *page, int *isolated) 2037 { 2038 struct zone *zone = page_zone(page); 2039 2040 spin_lock_irq(zone_lru_lock(zone)); 2041 if (PageLRU(page)) { 2042 struct lruvec *lruvec; 2043 2044 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 2045 ClearPageLRU(page); 2046 del_page_from_lru_list(page, lruvec, page_lru(page)); 2047 *isolated = 1; 2048 } else 2049 *isolated = 0; 2050 } 2051 2052 static void unlock_page_lru(struct page *page, int isolated) 2053 { 2054 struct zone *zone = page_zone(page); 2055 2056 if (isolated) { 2057 struct lruvec *lruvec; 2058 2059 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 2060 VM_BUG_ON_PAGE(PageLRU(page), page); 2061 SetPageLRU(page); 2062 add_page_to_lru_list(page, lruvec, page_lru(page)); 2063 } 2064 spin_unlock_irq(zone_lru_lock(zone)); 2065 } 2066 2067 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2068 bool lrucare) 2069 { 2070 int isolated; 2071 2072 VM_BUG_ON_PAGE(page->mem_cgroup, page); 2073 2074 /* 2075 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2076 * may already be on some other mem_cgroup's LRU. Take care of it. 
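 *
 * With @lrucare the sequence below is, in effect:
 *
 *	lock_page_lru(page, &isolated);
 *		take zone_lru_lock and pull the page off its current
 *		lruvec, if it is on one
 *	page->mem_cgroup = memcg;
 *	unlock_page_lru(page, isolated);
 *		put it back, now resolving to the new memcg's lruvec,
 *		and drop the lock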
2077 */ 2078 if (lrucare) 2079 lock_page_lru(page, &isolated); 2080 2081 /* 2082 * Nobody should be changing or seriously looking at 2083 * page->mem_cgroup at this point: 2084 * 2085 * - the page is uncharged 2086 * 2087 * - the page is off-LRU 2088 * 2089 * - an anonymous fault has exclusive page access, except for 2090 * a locked page table 2091 * 2092 * - a page cache insertion, a swapin fault, or a migration 2093 * have the page locked 2094 */ 2095 page->mem_cgroup = memcg; 2096 2097 if (lrucare) 2098 unlock_page_lru(page, isolated); 2099 } 2100 2101 #ifndef CONFIG_SLOB 2102 static int memcg_alloc_cache_id(void) 2103 { 2104 int id, size; 2105 int err; 2106 2107 id = ida_simple_get(&memcg_cache_ida, 2108 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2109 if (id < 0) 2110 return id; 2111 2112 if (id < memcg_nr_cache_ids) 2113 return id; 2114 2115 /* 2116 * There's no space for the new id in memcg_caches arrays, 2117 * so we have to grow them. 2118 */ 2119 down_write(&memcg_cache_ids_sem); 2120 2121 size = 2 * (id + 1); 2122 if (size < MEMCG_CACHES_MIN_SIZE) 2123 size = MEMCG_CACHES_MIN_SIZE; 2124 else if (size > MEMCG_CACHES_MAX_SIZE) 2125 size = MEMCG_CACHES_MAX_SIZE; 2126 2127 err = memcg_update_all_caches(size); 2128 if (!err) 2129 err = memcg_update_all_list_lrus(size); 2130 if (!err) 2131 memcg_nr_cache_ids = size; 2132 2133 up_write(&memcg_cache_ids_sem); 2134 2135 if (err) { 2136 ida_simple_remove(&memcg_cache_ida, id); 2137 return err; 2138 } 2139 return id; 2140 } 2141 2142 static void memcg_free_cache_id(int id) 2143 { 2144 ida_simple_remove(&memcg_cache_ida, id); 2145 } 2146 2147 struct memcg_kmem_cache_create_work { 2148 struct mem_cgroup *memcg; 2149 struct kmem_cache *cachep; 2150 struct work_struct work; 2151 }; 2152 2153 static void memcg_kmem_cache_create_func(struct work_struct *w) 2154 { 2155 struct memcg_kmem_cache_create_work *cw = 2156 container_of(w, struct memcg_kmem_cache_create_work, work); 2157 struct mem_cgroup *memcg = cw->memcg; 2158 struct kmem_cache *cachep = cw->cachep; 2159 2160 memcg_create_kmem_cache(memcg, cachep); 2161 2162 css_put(&memcg->css); 2163 kfree(cw); 2164 } 2165 2166 /* 2167 * Enqueue the creation of a per-memcg kmem_cache. 2168 */ 2169 static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2170 struct kmem_cache *cachep) 2171 { 2172 struct memcg_kmem_cache_create_work *cw; 2173 2174 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2175 if (!cw) 2176 return; 2177 2178 css_get(&memcg->css); 2179 2180 cw->memcg = memcg; 2181 cw->cachep = cachep; 2182 INIT_WORK(&cw->work, memcg_kmem_cache_create_func); 2183 2184 queue_work(memcg_kmem_cache_wq, &cw->work); 2185 } 2186 2187 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2188 struct kmem_cache *cachep) 2189 { 2190 /* 2191 * We need to stop accounting when we kmalloc, because if the 2192 * corresponding kmalloc cache is not yet created, the first allocation 2193 * in __memcg_schedule_kmem_cache_create will recurse. 2194 * 2195 * However, it is better to enclose the whole function. Depending on 2196 * the debugging options enabled, INIT_WORK(), for instance, can 2197 * trigger an allocation. This too, will make us recurse. Because at 2198 * this point we can't allow ourselves back into memcg_kmem_get_cache, 2199 * the safest choice is to do it like this, wrapping the whole function. 
2200 */ 2201 current->memcg_kmem_skip_account = 1; 2202 __memcg_schedule_kmem_cache_create(memcg, cachep); 2203 current->memcg_kmem_skip_account = 0; 2204 } 2205 2206 static inline bool memcg_kmem_bypass(void) 2207 { 2208 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) 2209 return true; 2210 return false; 2211 } 2212 2213 /** 2214 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation 2215 * @cachep: the original global kmem cache 2216 * 2217 * Return the kmem_cache we're supposed to use for a slab allocation. 2218 * We try to use the current memcg's version of the cache. 2219 * 2220 * If the cache does not exist yet, if we are the first user of it, we 2221 * create it asynchronously in a workqueue and let the current allocation 2222 * go through with the original cache. 2223 * 2224 * This function takes a reference to the cache it returns to assure it 2225 * won't get destroyed while we are working with it. Once the caller is 2226 * done with it, memcg_kmem_put_cache() must be called to release the 2227 * reference. 2228 */ 2229 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) 2230 { 2231 struct mem_cgroup *memcg; 2232 struct kmem_cache *memcg_cachep; 2233 int kmemcg_id; 2234 2235 VM_BUG_ON(!is_root_cache(cachep)); 2236 2237 if (memcg_kmem_bypass()) 2238 return cachep; 2239 2240 if (current->memcg_kmem_skip_account) 2241 return cachep; 2242 2243 memcg = get_mem_cgroup_from_mm(current->mm); 2244 kmemcg_id = READ_ONCE(memcg->kmemcg_id); 2245 if (kmemcg_id < 0) 2246 goto out; 2247 2248 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); 2249 if (likely(memcg_cachep)) 2250 return memcg_cachep; 2251 2252 /* 2253 * If we are in a safe context (can wait, and not in interrupt 2254 * context), we could be be predictable and return right away. 2255 * This would guarantee that the allocation being performed 2256 * already belongs in the new cache. 2257 * 2258 * However, there are some clashes that can arrive from locking. 2259 * For instance, because we acquire the slab_mutex while doing 2260 * memcg_create_kmem_cache, this means no further allocation 2261 * could happen with the slab_mutex held. So it's better to 2262 * defer everything. 2263 */ 2264 memcg_schedule_kmem_cache_create(memcg, cachep); 2265 out: 2266 css_put(&memcg->css); 2267 return cachep; 2268 } 2269 2270 /** 2271 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache 2272 * @cachep: the cache returned by memcg_kmem_get_cache 2273 */ 2274 void memcg_kmem_put_cache(struct kmem_cache *cachep) 2275 { 2276 if (!is_root_cache(cachep)) 2277 css_put(&cachep->memcg_params.memcg->css); 2278 } 2279 2280 /** 2281 * memcg_kmem_charge: charge a kmem page 2282 * @page: page to charge 2283 * @gfp: reclaim mode 2284 * @order: allocation order 2285 * @memcg: memory cgroup to charge 2286 * 2287 * Returns 0 on success, an error code on failure. 
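 *
 * A caller sketch only (memcg_kmem_charge() just below follows this
 * pattern for the current task's memcg):
 *
 *	ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
 *	if (ret)
 *		return ret;
 *	... the page stays accounted until memcg_kmem_uncharge(page, order)
 *	    on the free path ...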
2288 */ 2289 int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, 2290 struct mem_cgroup *memcg) 2291 { 2292 unsigned int nr_pages = 1 << order; 2293 struct page_counter *counter; 2294 int ret; 2295 2296 ret = try_charge(memcg, gfp, nr_pages); 2297 if (ret) 2298 return ret; 2299 2300 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && 2301 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { 2302 cancel_charge(memcg, nr_pages); 2303 return -ENOMEM; 2304 } 2305 2306 page->mem_cgroup = memcg; 2307 2308 return 0; 2309 } 2310 2311 /** 2312 * memcg_kmem_charge: charge a kmem page to the current memory cgroup 2313 * @page: page to charge 2314 * @gfp: reclaim mode 2315 * @order: allocation order 2316 * 2317 * Returns 0 on success, an error code on failure. 2318 */ 2319 int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 2320 { 2321 struct mem_cgroup *memcg; 2322 int ret = 0; 2323 2324 if (memcg_kmem_bypass()) 2325 return 0; 2326 2327 memcg = get_mem_cgroup_from_mm(current->mm); 2328 if (!mem_cgroup_is_root(memcg)) { 2329 ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); 2330 if (!ret) 2331 __SetPageKmemcg(page); 2332 } 2333 css_put(&memcg->css); 2334 return ret; 2335 } 2336 /** 2337 * memcg_kmem_uncharge: uncharge a kmem page 2338 * @page: page to uncharge 2339 * @order: allocation order 2340 */ 2341 void memcg_kmem_uncharge(struct page *page, int order) 2342 { 2343 struct mem_cgroup *memcg = page->mem_cgroup; 2344 unsigned int nr_pages = 1 << order; 2345 2346 if (!memcg) 2347 return; 2348 2349 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2350 2351 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2352 page_counter_uncharge(&memcg->kmem, nr_pages); 2353 2354 page_counter_uncharge(&memcg->memory, nr_pages); 2355 if (do_memsw_account()) 2356 page_counter_uncharge(&memcg->memsw, nr_pages); 2357 2358 page->mem_cgroup = NULL; 2359 2360 /* slab pages do not have PageKmemcg flag set */ 2361 if (PageKmemcg(page)) 2362 __ClearPageKmemcg(page); 2363 2364 css_put_many(&memcg->css, nr_pages); 2365 } 2366 #endif /* !CONFIG_SLOB */ 2367 2368 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2369 2370 /* 2371 * Because tail pages are not marked as "used", set it. We're under 2372 * zone_lru_lock and migration entries setup in all page mappings. 2373 */ 2374 void mem_cgroup_split_huge_fixup(struct page *head) 2375 { 2376 int i; 2377 2378 if (mem_cgroup_disabled()) 2379 return; 2380 2381 for (i = 1; i < HPAGE_PMD_NR; i++) 2382 head[i].mem_cgroup = head->mem_cgroup; 2383 2384 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 2385 HPAGE_PMD_NR); 2386 } 2387 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2388 2389 #ifdef CONFIG_MEMCG_SWAP 2390 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 2391 bool charge) 2392 { 2393 int val = (charge) ? 1 : -1; 2394 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 2395 } 2396 2397 /** 2398 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2399 * @entry: swap entry to be moved 2400 * @from: mem_cgroup which the entry is moved from 2401 * @to: mem_cgroup which the entry is moved to 2402 * 2403 * It succeeds only when the swap_cgroup's record for this entry is the same 2404 * as the mem_cgroup's id of @from. 2405 * 2406 * Returns 0 on success, -EINVAL on failure. 2407 * 2408 * The caller must have charged to @to, IOW, called page_counter_charge() about 2409 * both res and memsw, and called css_get(). 
2410 */ 2411 static int mem_cgroup_move_swap_account(swp_entry_t entry, 2412 struct mem_cgroup *from, struct mem_cgroup *to) 2413 { 2414 unsigned short old_id, new_id; 2415 2416 old_id = mem_cgroup_id(from); 2417 new_id = mem_cgroup_id(to); 2418 2419 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2420 mem_cgroup_swap_statistics(from, false); 2421 mem_cgroup_swap_statistics(to, true); 2422 return 0; 2423 } 2424 return -EINVAL; 2425 } 2426 #else 2427 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2428 struct mem_cgroup *from, struct mem_cgroup *to) 2429 { 2430 return -EINVAL; 2431 } 2432 #endif 2433 2434 static DEFINE_MUTEX(memcg_limit_mutex); 2435 2436 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2437 unsigned long limit) 2438 { 2439 unsigned long curusage; 2440 unsigned long oldusage; 2441 bool enlarge = false; 2442 int retry_count; 2443 int ret; 2444 2445 /* 2446 * For keeping hierarchical_reclaim simple, how long we should retry 2447 * is depends on callers. We set our retry-count to be function 2448 * of # of children which we should visit in this loop. 2449 */ 2450 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 2451 mem_cgroup_count_children(memcg); 2452 2453 oldusage = page_counter_read(&memcg->memory); 2454 2455 do { 2456 if (signal_pending(current)) { 2457 ret = -EINTR; 2458 break; 2459 } 2460 2461 mutex_lock(&memcg_limit_mutex); 2462 if (limit > memcg->memsw.limit) { 2463 mutex_unlock(&memcg_limit_mutex); 2464 ret = -EINVAL; 2465 break; 2466 } 2467 if (limit > memcg->memory.limit) 2468 enlarge = true; 2469 ret = page_counter_limit(&memcg->memory, limit); 2470 mutex_unlock(&memcg_limit_mutex); 2471 2472 if (!ret) 2473 break; 2474 2475 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 2476 2477 curusage = page_counter_read(&memcg->memory); 2478 /* Usage is reduced ? */ 2479 if (curusage >= oldusage) 2480 retry_count--; 2481 else 2482 oldusage = curusage; 2483 } while (retry_count); 2484 2485 if (!ret && enlarge) 2486 memcg_oom_recover(memcg); 2487 2488 return ret; 2489 } 2490 2491 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2492 unsigned long limit) 2493 { 2494 unsigned long curusage; 2495 unsigned long oldusage; 2496 bool enlarge = false; 2497 int retry_count; 2498 int ret; 2499 2500 /* see mem_cgroup_resize_res_limit */ 2501 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 2502 mem_cgroup_count_children(memcg); 2503 2504 oldusage = page_counter_read(&memcg->memsw); 2505 2506 do { 2507 if (signal_pending(current)) { 2508 ret = -EINTR; 2509 break; 2510 } 2511 2512 mutex_lock(&memcg_limit_mutex); 2513 if (limit < memcg->memory.limit) { 2514 mutex_unlock(&memcg_limit_mutex); 2515 ret = -EINVAL; 2516 break; 2517 } 2518 if (limit > memcg->memsw.limit) 2519 enlarge = true; 2520 ret = page_counter_limit(&memcg->memsw, limit); 2521 mutex_unlock(&memcg_limit_mutex); 2522 2523 if (!ret) 2524 break; 2525 2526 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 2527 2528 curusage = page_counter_read(&memcg->memsw); 2529 /* Usage is reduced ? 
*/ 2530 if (curusage >= oldusage) 2531 retry_count--; 2532 else 2533 oldusage = curusage; 2534 } while (retry_count); 2535 2536 if (!ret && enlarge) 2537 memcg_oom_recover(memcg); 2538 2539 return ret; 2540 } 2541 2542 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 2543 gfp_t gfp_mask, 2544 unsigned long *total_scanned) 2545 { 2546 unsigned long nr_reclaimed = 0; 2547 struct mem_cgroup_per_node *mz, *next_mz = NULL; 2548 unsigned long reclaimed; 2549 int loop = 0; 2550 struct mem_cgroup_tree_per_node *mctz; 2551 unsigned long excess; 2552 unsigned long nr_scanned; 2553 2554 if (order > 0) 2555 return 0; 2556 2557 mctz = soft_limit_tree_node(pgdat->node_id); 2558 2559 /* 2560 * Do not even bother to check the largest node if the root 2561 * is empty. Do it lockless to prevent lock bouncing. Races 2562 * are acceptable as soft limit is best effort anyway. 2563 */ 2564 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 2565 return 0; 2566 2567 /* 2568 * This loop can run a while, specially if mem_cgroup's continuously 2569 * keep exceeding their soft limit and putting the system under 2570 * pressure 2571 */ 2572 do { 2573 if (next_mz) 2574 mz = next_mz; 2575 else 2576 mz = mem_cgroup_largest_soft_limit_node(mctz); 2577 if (!mz) 2578 break; 2579 2580 nr_scanned = 0; 2581 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 2582 gfp_mask, &nr_scanned); 2583 nr_reclaimed += reclaimed; 2584 *total_scanned += nr_scanned; 2585 spin_lock_irq(&mctz->lock); 2586 __mem_cgroup_remove_exceeded(mz, mctz); 2587 2588 /* 2589 * If we failed to reclaim anything from this memory cgroup 2590 * it is time to move on to the next cgroup 2591 */ 2592 next_mz = NULL; 2593 if (!reclaimed) 2594 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 2595 2596 excess = soft_limit_excess(mz->memcg); 2597 /* 2598 * One school of thought says that we should not add 2599 * back the node to the tree if reclaim returns 0. 2600 * But our reclaim could return 0, simply because due 2601 * to priority we are exposing a smaller subset of 2602 * memory to reclaim from. Consider this as a longer 2603 * term TODO. 2604 */ 2605 /* If excess == 0, no tree ops */ 2606 __mem_cgroup_insert_exceeded(mz, mctz, excess); 2607 spin_unlock_irq(&mctz->lock); 2608 css_put(&mz->memcg->css); 2609 loop++; 2610 /* 2611 * Could not reclaim anything and there are no more 2612 * mem cgroups to try or we seem to be looping without 2613 * reclaiming anything. 2614 */ 2615 if (!nr_reclaimed && 2616 (next_mz == NULL || 2617 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2618 break; 2619 } while (!nr_reclaimed); 2620 if (next_mz) 2621 css_put(&next_mz->memcg->css); 2622 return nr_reclaimed; 2623 } 2624 2625 /* 2626 * Test whether @memcg has children, dead or alive. Note that this 2627 * function doesn't care whether @memcg has use_hierarchy enabled and 2628 * returns %true if there are child csses according to the cgroup 2629 * hierarchy. Testing use_hierarchy is the caller's responsiblity. 2630 */ 2631 static inline bool memcg_has_children(struct mem_cgroup *memcg) 2632 { 2633 bool ret; 2634 2635 rcu_read_lock(); 2636 ret = css_next_child(NULL, &memcg->css); 2637 rcu_read_unlock(); 2638 return ret; 2639 } 2640 2641 /* 2642 * Reclaims as many pages from the given memcg as possible. 2643 * 2644 * Caller is responsible for holding css reference for memcg. 
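 *
 * This backs the legacy "memory.force_empty" knob: writes to that file
 * land in mem_cgroup_force_empty_write() below, which rejects the root
 * cgroup and then calls this.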
2645 */ 2646 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 2647 { 2648 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2649 2650 /* we call try-to-free pages for make this cgroup empty */ 2651 lru_add_drain_all(); 2652 /* try to free all pages in this cgroup */ 2653 while (nr_retries && page_counter_read(&memcg->memory)) { 2654 int progress; 2655 2656 if (signal_pending(current)) 2657 return -EINTR; 2658 2659 progress = try_to_free_mem_cgroup_pages(memcg, 1, 2660 GFP_KERNEL, true); 2661 if (!progress) { 2662 nr_retries--; 2663 /* maybe some writeback is necessary */ 2664 congestion_wait(BLK_RW_ASYNC, HZ/10); 2665 } 2666 2667 } 2668 2669 return 0; 2670 } 2671 2672 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 2673 char *buf, size_t nbytes, 2674 loff_t off) 2675 { 2676 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 2677 2678 if (mem_cgroup_is_root(memcg)) 2679 return -EINVAL; 2680 return mem_cgroup_force_empty(memcg) ?: nbytes; 2681 } 2682 2683 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 2684 struct cftype *cft) 2685 { 2686 return mem_cgroup_from_css(css)->use_hierarchy; 2687 } 2688 2689 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 2690 struct cftype *cft, u64 val) 2691 { 2692 int retval = 0; 2693 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2694 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 2695 2696 if (memcg->use_hierarchy == val) 2697 return 0; 2698 2699 /* 2700 * If parent's use_hierarchy is set, we can't make any modifications 2701 * in the child subtrees. If it is unset, then the change can 2702 * occur, provided the current cgroup has no children. 2703 * 2704 * For the root cgroup, parent_mem is NULL, we allow value to be 2705 * set if there are no children. 
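 *
 * Summarizing the checks below (a value other than 0 or 1 is also
 * rejected with -EINVAL):
 *
 *	parent uses hierarchy?	has children?	result
 *	yes			-		-EINVAL
 *	no			yes		-EBUSY
 *	no			no		use_hierarchy = val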
2706 */ 2707 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 2708 (val == 1 || val == 0)) { 2709 if (!memcg_has_children(memcg)) 2710 memcg->use_hierarchy = val; 2711 else 2712 retval = -EBUSY; 2713 } else 2714 retval = -EINVAL; 2715 2716 return retval; 2717 } 2718 2719 static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) 2720 { 2721 struct mem_cgroup *iter; 2722 int i; 2723 2724 memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); 2725 2726 for_each_mem_cgroup_tree(iter, memcg) { 2727 for (i = 0; i < MEMCG_NR_STAT; i++) 2728 stat[i] += mem_cgroup_read_stat(iter, i); 2729 } 2730 } 2731 2732 static void tree_events(struct mem_cgroup *memcg, unsigned long *events) 2733 { 2734 struct mem_cgroup *iter; 2735 int i; 2736 2737 memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); 2738 2739 for_each_mem_cgroup_tree(iter, memcg) { 2740 for (i = 0; i < MEMCG_NR_EVENTS; i++) 2741 events[i] += mem_cgroup_read_events(iter, i); 2742 } 2743 } 2744 2745 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 2746 { 2747 unsigned long val = 0; 2748 2749 if (mem_cgroup_is_root(memcg)) { 2750 struct mem_cgroup *iter; 2751 2752 for_each_mem_cgroup_tree(iter, memcg) { 2753 val += mem_cgroup_read_stat(iter, 2754 MEM_CGROUP_STAT_CACHE); 2755 val += mem_cgroup_read_stat(iter, 2756 MEM_CGROUP_STAT_RSS); 2757 if (swap) 2758 val += mem_cgroup_read_stat(iter, 2759 MEM_CGROUP_STAT_SWAP); 2760 } 2761 } else { 2762 if (!swap) 2763 val = page_counter_read(&memcg->memory); 2764 else 2765 val = page_counter_read(&memcg->memsw); 2766 } 2767 return val; 2768 } 2769 2770 enum { 2771 RES_USAGE, 2772 RES_LIMIT, 2773 RES_MAX_USAGE, 2774 RES_FAILCNT, 2775 RES_SOFT_LIMIT, 2776 }; 2777 2778 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 2779 struct cftype *cft) 2780 { 2781 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2782 struct page_counter *counter; 2783 2784 switch (MEMFILE_TYPE(cft->private)) { 2785 case _MEM: 2786 counter = &memcg->memory; 2787 break; 2788 case _MEMSWAP: 2789 counter = &memcg->memsw; 2790 break; 2791 case _KMEM: 2792 counter = &memcg->kmem; 2793 break; 2794 case _TCP: 2795 counter = &memcg->tcpmem; 2796 break; 2797 default: 2798 BUG(); 2799 } 2800 2801 switch (MEMFILE_ATTR(cft->private)) { 2802 case RES_USAGE: 2803 if (counter == &memcg->memory) 2804 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 2805 if (counter == &memcg->memsw) 2806 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 2807 return (u64)page_counter_read(counter) * PAGE_SIZE; 2808 case RES_LIMIT: 2809 return (u64)counter->limit * PAGE_SIZE; 2810 case RES_MAX_USAGE: 2811 return (u64)counter->watermark * PAGE_SIZE; 2812 case RES_FAILCNT: 2813 return counter->failcnt; 2814 case RES_SOFT_LIMIT: 2815 return (u64)memcg->soft_limit * PAGE_SIZE; 2816 default: 2817 BUG(); 2818 } 2819 } 2820 2821 #ifndef CONFIG_SLOB 2822 static int memcg_online_kmem(struct mem_cgroup *memcg) 2823 { 2824 int memcg_id; 2825 2826 if (cgroup_memory_nokmem) 2827 return 0; 2828 2829 BUG_ON(memcg->kmemcg_id >= 0); 2830 BUG_ON(memcg->kmem_state); 2831 2832 memcg_id = memcg_alloc_cache_id(); 2833 if (memcg_id < 0) 2834 return memcg_id; 2835 2836 static_branch_inc(&memcg_kmem_enabled_key); 2837 /* 2838 * A memory cgroup is considered kmem-online as soon as it gets 2839 * kmemcg_id. Setting the id after enabling static branching will 2840 * guarantee no one starts accounting before all call sites are 2841 * patched. 
2842 */ 2843 memcg->kmemcg_id = memcg_id; 2844 memcg->kmem_state = KMEM_ONLINE; 2845 INIT_LIST_HEAD(&memcg->kmem_caches); 2846 2847 return 0; 2848 } 2849 2850 static void memcg_offline_kmem(struct mem_cgroup *memcg) 2851 { 2852 struct cgroup_subsys_state *css; 2853 struct mem_cgroup *parent, *child; 2854 int kmemcg_id; 2855 2856 if (memcg->kmem_state != KMEM_ONLINE) 2857 return; 2858 /* 2859 * Clear the online state before clearing memcg_caches array 2860 * entries. The slab_mutex in memcg_deactivate_kmem_caches() 2861 * guarantees that no cache will be created for this cgroup 2862 * after we are done (see memcg_create_kmem_cache()). 2863 */ 2864 memcg->kmem_state = KMEM_ALLOCATED; 2865 2866 memcg_deactivate_kmem_caches(memcg); 2867 2868 kmemcg_id = memcg->kmemcg_id; 2869 BUG_ON(kmemcg_id < 0); 2870 2871 parent = parent_mem_cgroup(memcg); 2872 if (!parent) 2873 parent = root_mem_cgroup; 2874 2875 /* 2876 * Change kmemcg_id of this cgroup and all its descendants to the 2877 * parent's id, and then move all entries from this cgroup's list_lrus 2878 * to ones of the parent. After we have finished, all list_lrus 2879 * corresponding to this cgroup are guaranteed to remain empty. The 2880 * ordering is imposed by list_lru_node->lock taken by 2881 * memcg_drain_all_list_lrus(). 2882 */ 2883 rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ 2884 css_for_each_descendant_pre(css, &memcg->css) { 2885 child = mem_cgroup_from_css(css); 2886 BUG_ON(child->kmemcg_id != kmemcg_id); 2887 child->kmemcg_id = parent->kmemcg_id; 2888 if (!memcg->use_hierarchy) 2889 break; 2890 } 2891 rcu_read_unlock(); 2892 2893 memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); 2894 2895 memcg_free_cache_id(kmemcg_id); 2896 } 2897 2898 static void memcg_free_kmem(struct mem_cgroup *memcg) 2899 { 2900 /* css_alloc() failed, offlining didn't happen */ 2901 if (unlikely(memcg->kmem_state == KMEM_ONLINE)) 2902 memcg_offline_kmem(memcg); 2903 2904 if (memcg->kmem_state == KMEM_ALLOCATED) { 2905 memcg_destroy_kmem_caches(memcg); 2906 static_branch_dec(&memcg_kmem_enabled_key); 2907 WARN_ON(page_counter_read(&memcg->kmem)); 2908 } 2909 } 2910 #else 2911 static int memcg_online_kmem(struct mem_cgroup *memcg) 2912 { 2913 return 0; 2914 } 2915 static void memcg_offline_kmem(struct mem_cgroup *memcg) 2916 { 2917 } 2918 static void memcg_free_kmem(struct mem_cgroup *memcg) 2919 { 2920 } 2921 #endif /* !CONFIG_SLOB */ 2922 2923 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 2924 unsigned long limit) 2925 { 2926 int ret; 2927 2928 mutex_lock(&memcg_limit_mutex); 2929 ret = page_counter_limit(&memcg->kmem, limit); 2930 mutex_unlock(&memcg_limit_mutex); 2931 return ret; 2932 } 2933 2934 static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) 2935 { 2936 int ret; 2937 2938 mutex_lock(&memcg_limit_mutex); 2939 2940 ret = page_counter_limit(&memcg->tcpmem, limit); 2941 if (ret) 2942 goto out; 2943 2944 if (!memcg->tcpmem_active) { 2945 /* 2946 * The active flag needs to be written after the static_key 2947 * update. This is what guarantees that the socket activation 2948 * function is the last one to run. See mem_cgroup_sk_alloc() 2949 * for details, and note that we don't mark any socket as 2950 * belonging to this memcg until that flag is up. 2951 * 2952 * We need to do this, because static_keys will span multiple 2953 * sites, but we can't control their order. 
If we mark a socket 2954 * as accounted, but the accounting functions are not patched in 2955 * yet, we'll lose accounting. 2956 * 2957 * We never race with the readers in mem_cgroup_sk_alloc(), 2958 * because when this value change, the code to process it is not 2959 * patched in yet. 2960 */ 2961 static_branch_inc(&memcg_sockets_enabled_key); 2962 memcg->tcpmem_active = true; 2963 } 2964 out: 2965 mutex_unlock(&memcg_limit_mutex); 2966 return ret; 2967 } 2968 2969 /* 2970 * The user of this function is... 2971 * RES_LIMIT. 2972 */ 2973 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 2974 char *buf, size_t nbytes, loff_t off) 2975 { 2976 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 2977 unsigned long nr_pages; 2978 int ret; 2979 2980 buf = strstrip(buf); 2981 ret = page_counter_memparse(buf, "-1", &nr_pages); 2982 if (ret) 2983 return ret; 2984 2985 switch (MEMFILE_ATTR(of_cft(of)->private)) { 2986 case RES_LIMIT: 2987 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 2988 ret = -EINVAL; 2989 break; 2990 } 2991 switch (MEMFILE_TYPE(of_cft(of)->private)) { 2992 case _MEM: 2993 ret = mem_cgroup_resize_limit(memcg, nr_pages); 2994 break; 2995 case _MEMSWAP: 2996 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); 2997 break; 2998 case _KMEM: 2999 ret = memcg_update_kmem_limit(memcg, nr_pages); 3000 break; 3001 case _TCP: 3002 ret = memcg_update_tcp_limit(memcg, nr_pages); 3003 break; 3004 } 3005 break; 3006 case RES_SOFT_LIMIT: 3007 memcg->soft_limit = nr_pages; 3008 ret = 0; 3009 break; 3010 } 3011 return ret ?: nbytes; 3012 } 3013 3014 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3015 size_t nbytes, loff_t off) 3016 { 3017 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3018 struct page_counter *counter; 3019 3020 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3021 case _MEM: 3022 counter = &memcg->memory; 3023 break; 3024 case _MEMSWAP: 3025 counter = &memcg->memsw; 3026 break; 3027 case _KMEM: 3028 counter = &memcg->kmem; 3029 break; 3030 case _TCP: 3031 counter = &memcg->tcpmem; 3032 break; 3033 default: 3034 BUG(); 3035 } 3036 3037 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3038 case RES_MAX_USAGE: 3039 page_counter_reset_watermark(counter); 3040 break; 3041 case RES_FAILCNT: 3042 counter->failcnt = 0; 3043 break; 3044 default: 3045 BUG(); 3046 } 3047 3048 return nbytes; 3049 } 3050 3051 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3052 struct cftype *cft) 3053 { 3054 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3055 } 3056 3057 #ifdef CONFIG_MMU 3058 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3059 struct cftype *cft, u64 val) 3060 { 3061 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3062 3063 if (val & ~MOVE_MASK) 3064 return -EINVAL; 3065 3066 /* 3067 * No kind of locking is needed in here, because ->can_attach() will 3068 * check this value once in the beginning of the process, and then carry 3069 * on with stale data. This means that changes to this value will only 3070 * affect task migrations starting after the change. 
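 *
 * Example: with the MOVE_ANON / MOVE_FILE bits defined earlier in this
 * file, writing 3 to memory.move_charge_at_immigrate requests that both
 * anonymous and file pages follow a task when it is moved into this
 * cgroup; writing 0 (the default) disables charge moving.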
3071 */ 3072 memcg->move_charge_at_immigrate = val; 3073 return 0; 3074 } 3075 #else 3076 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3077 struct cftype *cft, u64 val) 3078 { 3079 return -ENOSYS; 3080 } 3081 #endif 3082 3083 #ifdef CONFIG_NUMA 3084 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3085 { 3086 struct numa_stat { 3087 const char *name; 3088 unsigned int lru_mask; 3089 }; 3090 3091 static const struct numa_stat stats[] = { 3092 { "total", LRU_ALL }, 3093 { "file", LRU_ALL_FILE }, 3094 { "anon", LRU_ALL_ANON }, 3095 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3096 }; 3097 const struct numa_stat *stat; 3098 int nid; 3099 unsigned long nr; 3100 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3101 3102 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3103 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 3104 seq_printf(m, "%s=%lu", stat->name, nr); 3105 for_each_node_state(nid, N_MEMORY) { 3106 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 3107 stat->lru_mask); 3108 seq_printf(m, " N%d=%lu", nid, nr); 3109 } 3110 seq_putc(m, '\n'); 3111 } 3112 3113 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3114 struct mem_cgroup *iter; 3115 3116 nr = 0; 3117 for_each_mem_cgroup_tree(iter, memcg) 3118 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 3119 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 3120 for_each_node_state(nid, N_MEMORY) { 3121 nr = 0; 3122 for_each_mem_cgroup_tree(iter, memcg) 3123 nr += mem_cgroup_node_nr_lru_pages( 3124 iter, nid, stat->lru_mask); 3125 seq_printf(m, " N%d=%lu", nid, nr); 3126 } 3127 seq_putc(m, '\n'); 3128 } 3129 3130 return 0; 3131 } 3132 #endif /* CONFIG_NUMA */ 3133 3134 static int memcg_stat_show(struct seq_file *m, void *v) 3135 { 3136 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3137 unsigned long memory, memsw; 3138 struct mem_cgroup *mi; 3139 unsigned int i; 3140 3141 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != 3142 MEM_CGROUP_STAT_NSTATS); 3143 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != 3144 MEM_CGROUP_EVENTS_NSTATS); 3145 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3146 3147 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3148 if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) 3149 continue; 3150 seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], 3151 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 3152 } 3153 3154 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 3155 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 3156 mem_cgroup_read_events(memcg, i)); 3157 3158 for (i = 0; i < NR_LRU_LISTS; i++) 3159 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 3160 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3161 3162 /* Hierarchical information */ 3163 memory = memsw = PAGE_COUNTER_MAX; 3164 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 3165 memory = min(memory, mi->memory.limit); 3166 memsw = min(memsw, mi->memsw.limit); 3167 } 3168 seq_printf(m, "hierarchical_memory_limit %llu\n", 3169 (u64)memory * PAGE_SIZE); 3170 if (do_memsw_account()) 3171 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3172 (u64)memsw * PAGE_SIZE); 3173 3174 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3175 unsigned long long val = 0; 3176 3177 if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) 3178 continue; 3179 for_each_mem_cgroup_tree(mi, memcg) 3180 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 3181 seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val); 3182 } 3183 3184 for (i = 0; i < 
MEM_CGROUP_EVENTS_NSTATS; i++) { 3185 unsigned long long val = 0; 3186 3187 for_each_mem_cgroup_tree(mi, memcg) 3188 val += mem_cgroup_read_events(mi, i); 3189 seq_printf(m, "total_%s %llu\n", 3190 mem_cgroup_events_names[i], val); 3191 } 3192 3193 for (i = 0; i < NR_LRU_LISTS; i++) { 3194 unsigned long long val = 0; 3195 3196 for_each_mem_cgroup_tree(mi, memcg) 3197 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 3198 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 3199 } 3200 3201 #ifdef CONFIG_DEBUG_VM 3202 { 3203 pg_data_t *pgdat; 3204 struct mem_cgroup_per_node *mz; 3205 struct zone_reclaim_stat *rstat; 3206 unsigned long recent_rotated[2] = {0, 0}; 3207 unsigned long recent_scanned[2] = {0, 0}; 3208 3209 for_each_online_pgdat(pgdat) { 3210 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); 3211 rstat = &mz->lruvec.reclaim_stat; 3212 3213 recent_rotated[0] += rstat->recent_rotated[0]; 3214 recent_rotated[1] += rstat->recent_rotated[1]; 3215 recent_scanned[0] += rstat->recent_scanned[0]; 3216 recent_scanned[1] += rstat->recent_scanned[1]; 3217 } 3218 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 3219 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 3220 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 3221 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 3222 } 3223 #endif 3224 3225 return 0; 3226 } 3227 3228 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 3229 struct cftype *cft) 3230 { 3231 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3232 3233 return mem_cgroup_swappiness(memcg); 3234 } 3235 3236 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 3237 struct cftype *cft, u64 val) 3238 { 3239 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3240 3241 if (val > 100) 3242 return -EINVAL; 3243 3244 if (css->parent) 3245 memcg->swappiness = val; 3246 else 3247 vm_swappiness = val; 3248 3249 return 0; 3250 } 3251 3252 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3253 { 3254 struct mem_cgroup_threshold_ary *t; 3255 unsigned long usage; 3256 int i; 3257 3258 rcu_read_lock(); 3259 if (!swap) 3260 t = rcu_dereference(memcg->thresholds.primary); 3261 else 3262 t = rcu_dereference(memcg->memsw_thresholds.primary); 3263 3264 if (!t) 3265 goto unlock; 3266 3267 usage = mem_cgroup_usage(memcg, swap); 3268 3269 /* 3270 * current_threshold points to threshold just below or equal to usage. 3271 * If it's not true, a threshold was crossed after last 3272 * call of __mem_cgroup_threshold(). 3273 */ 3274 i = t->current_threshold; 3275 3276 /* 3277 * Iterate backward over array of thresholds starting from 3278 * current_threshold and check if a threshold is crossed. 3279 * If none of thresholds below usage is crossed, we read 3280 * only one element of the array here. 3281 */ 3282 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3283 eventfd_signal(t->entries[i].eventfd, 1); 3284 3285 /* i = current_threshold + 1 */ 3286 i++; 3287 3288 /* 3289 * Iterate forward over array of thresholds starting from 3290 * current_threshold+1 and check if a threshold is crossed. 3291 * If none of thresholds above usage is crossed, we read 3292 * only one element of the array here. 
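 *
 * Worked example (made-up numbers): thresholds at 4M, 8M and 16M, with
 * current_threshold pointing at 8M.  If usage grew to 20M, the backward
 * scan above signals nothing, the forward scan signals the 16M entry,
 * and current_threshold ends up at 16M.  If usage instead dropped to
 * 3M, the backward scan signals 8M and then 4M, the forward scan
 * signals nothing, and current_threshold ends up at -1.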
3293 */ 3294 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3295 eventfd_signal(t->entries[i].eventfd, 1); 3296 3297 /* Update current_threshold */ 3298 t->current_threshold = i - 1; 3299 unlock: 3300 rcu_read_unlock(); 3301 } 3302 3303 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3304 { 3305 while (memcg) { 3306 __mem_cgroup_threshold(memcg, false); 3307 if (do_memsw_account()) 3308 __mem_cgroup_threshold(memcg, true); 3309 3310 memcg = parent_mem_cgroup(memcg); 3311 } 3312 } 3313 3314 static int compare_thresholds(const void *a, const void *b) 3315 { 3316 const struct mem_cgroup_threshold *_a = a; 3317 const struct mem_cgroup_threshold *_b = b; 3318 3319 if (_a->threshold > _b->threshold) 3320 return 1; 3321 3322 if (_a->threshold < _b->threshold) 3323 return -1; 3324 3325 return 0; 3326 } 3327 3328 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 3329 { 3330 struct mem_cgroup_eventfd_list *ev; 3331 3332 spin_lock(&memcg_oom_lock); 3333 3334 list_for_each_entry(ev, &memcg->oom_notify, list) 3335 eventfd_signal(ev->eventfd, 1); 3336 3337 spin_unlock(&memcg_oom_lock); 3338 return 0; 3339 } 3340 3341 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 3342 { 3343 struct mem_cgroup *iter; 3344 3345 for_each_mem_cgroup_tree(iter, memcg) 3346 mem_cgroup_oom_notify_cb(iter); 3347 } 3348 3349 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3350 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 3351 { 3352 struct mem_cgroup_thresholds *thresholds; 3353 struct mem_cgroup_threshold_ary *new; 3354 unsigned long threshold; 3355 unsigned long usage; 3356 int i, size, ret; 3357 3358 ret = page_counter_memparse(args, "-1", &threshold); 3359 if (ret) 3360 return ret; 3361 3362 mutex_lock(&memcg->thresholds_lock); 3363 3364 if (type == _MEM) { 3365 thresholds = &memcg->thresholds; 3366 usage = mem_cgroup_usage(memcg, false); 3367 } else if (type == _MEMSWAP) { 3368 thresholds = &memcg->memsw_thresholds; 3369 usage = mem_cgroup_usage(memcg, true); 3370 } else 3371 BUG(); 3372 3373 /* Check if a threshold crossed before adding a new one */ 3374 if (thresholds->primary) 3375 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3376 3377 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3378 3379 /* Allocate memory for new array of thresholds */ 3380 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3381 GFP_KERNEL); 3382 if (!new) { 3383 ret = -ENOMEM; 3384 goto unlock; 3385 } 3386 new->size = size; 3387 3388 /* Copy thresholds (if any) to new array */ 3389 if (thresholds->primary) { 3390 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3391 sizeof(struct mem_cgroup_threshold)); 3392 } 3393 3394 /* Add new threshold */ 3395 new->entries[size - 1].eventfd = eventfd; 3396 new->entries[size - 1].threshold = threshold; 3397 3398 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3399 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3400 compare_thresholds, NULL); 3401 3402 /* Find current threshold */ 3403 new->current_threshold = -1; 3404 for (i = 0; i < size; i++) { 3405 if (new->entries[i].threshold <= usage) { 3406 /* 3407 * new->current_threshold will not be used until 3408 * rcu_assign_pointer(), so it's safe to increment 3409 * it here. 
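 *
 * (The swap below is a plain RCU publish: the old primary array is
 * parked in ->spare so a later unregister can reuse it without
 * allocating, rcu_assign_pointer() makes the new array visible, and
 * synchronize_rcu() ensures no reader still uses the old one before it
 * can be reused or freed.)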
3410 */ 3411 ++new->current_threshold; 3412 } else 3413 break; 3414 } 3415 3416 /* Free old spare buffer and save old primary buffer as spare */ 3417 kfree(thresholds->spare); 3418 thresholds->spare = thresholds->primary; 3419 3420 rcu_assign_pointer(thresholds->primary, new); 3421 3422 /* To be sure that nobody uses thresholds */ 3423 synchronize_rcu(); 3424 3425 unlock: 3426 mutex_unlock(&memcg->thresholds_lock); 3427 3428 return ret; 3429 } 3430 3431 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3432 struct eventfd_ctx *eventfd, const char *args) 3433 { 3434 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 3435 } 3436 3437 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 3438 struct eventfd_ctx *eventfd, const char *args) 3439 { 3440 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 3441 } 3442 3443 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3444 struct eventfd_ctx *eventfd, enum res_type type) 3445 { 3446 struct mem_cgroup_thresholds *thresholds; 3447 struct mem_cgroup_threshold_ary *new; 3448 unsigned long usage; 3449 int i, j, size; 3450 3451 mutex_lock(&memcg->thresholds_lock); 3452 3453 if (type == _MEM) { 3454 thresholds = &memcg->thresholds; 3455 usage = mem_cgroup_usage(memcg, false); 3456 } else if (type == _MEMSWAP) { 3457 thresholds = &memcg->memsw_thresholds; 3458 usage = mem_cgroup_usage(memcg, true); 3459 } else 3460 BUG(); 3461 3462 if (!thresholds->primary) 3463 goto unlock; 3464 3465 /* Check if a threshold crossed before removing */ 3466 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3467 3468 /* Calculate new number of threshold */ 3469 size = 0; 3470 for (i = 0; i < thresholds->primary->size; i++) { 3471 if (thresholds->primary->entries[i].eventfd != eventfd) 3472 size++; 3473 } 3474 3475 new = thresholds->spare; 3476 3477 /* Set thresholds array to NULL if we don't have thresholds */ 3478 if (!size) { 3479 kfree(new); 3480 new = NULL; 3481 goto swap_buffers; 3482 } 3483 3484 new->size = size; 3485 3486 /* Copy thresholds and find current threshold */ 3487 new->current_threshold = -1; 3488 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 3489 if (thresholds->primary->entries[i].eventfd == eventfd) 3490 continue; 3491 3492 new->entries[j] = thresholds->primary->entries[i]; 3493 if (new->entries[j].threshold <= usage) { 3494 /* 3495 * new->current_threshold will not be used 3496 * until rcu_assign_pointer(), so it's safe to increment 3497 * it here. 
3498 */ 3499 ++new->current_threshold; 3500 } 3501 j++; 3502 } 3503 3504 swap_buffers: 3505 /* Swap primary and spare array */ 3506 thresholds->spare = thresholds->primary; 3507 3508 rcu_assign_pointer(thresholds->primary, new); 3509 3510 /* To be sure that nobody uses thresholds */ 3511 synchronize_rcu(); 3512 3513 /* If all events are unregistered, free the spare array */ 3514 if (!new) { 3515 kfree(thresholds->spare); 3516 thresholds->spare = NULL; 3517 } 3518 unlock: 3519 mutex_unlock(&memcg->thresholds_lock); 3520 } 3521 3522 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3523 struct eventfd_ctx *eventfd) 3524 { 3525 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 3526 } 3527 3528 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3529 struct eventfd_ctx *eventfd) 3530 { 3531 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 3532 } 3533 3534 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 3535 struct eventfd_ctx *eventfd, const char *args) 3536 { 3537 struct mem_cgroup_eventfd_list *event; 3538 3539 event = kmalloc(sizeof(*event), GFP_KERNEL); 3540 if (!event) 3541 return -ENOMEM; 3542 3543 spin_lock(&memcg_oom_lock); 3544 3545 event->eventfd = eventfd; 3546 list_add(&event->list, &memcg->oom_notify); 3547 3548 /* already in OOM ? */ 3549 if (memcg->under_oom) 3550 eventfd_signal(eventfd, 1); 3551 spin_unlock(&memcg_oom_lock); 3552 3553 return 0; 3554 } 3555 3556 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 3557 struct eventfd_ctx *eventfd) 3558 { 3559 struct mem_cgroup_eventfd_list *ev, *tmp; 3560 3561 spin_lock(&memcg_oom_lock); 3562 3563 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 3564 if (ev->eventfd == eventfd) { 3565 list_del(&ev->list); 3566 kfree(ev); 3567 } 3568 } 3569 3570 spin_unlock(&memcg_oom_lock); 3571 } 3572 3573 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 3574 { 3575 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3576 3577 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 3578 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 3579 return 0; 3580 } 3581 3582 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 3583 struct cftype *cft, u64 val) 3584 { 3585 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3586 3587 /* cannot set to root cgroup and only 0 and 1 are allowed */ 3588 if (!css->parent || !((val == 0) || (val == 1))) 3589 return -EINVAL; 3590 3591 memcg->oom_kill_disable = val; 3592 if (!val) 3593 memcg_oom_recover(memcg); 3594 3595 return 0; 3596 } 3597 3598 #ifdef CONFIG_CGROUP_WRITEBACK 3599 3600 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) 3601 { 3602 return &memcg->cgwb_list; 3603 } 3604 3605 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 3606 { 3607 return wb_domain_init(&memcg->cgwb_domain, gfp); 3608 } 3609 3610 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 3611 { 3612 wb_domain_exit(&memcg->cgwb_domain); 3613 } 3614 3615 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 3616 { 3617 wb_domain_size_changed(&memcg->cgwb_domain); 3618 } 3619 3620 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 3621 { 3622 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3623 3624 if (!memcg->css.parent) 3625 return NULL; 3626 3627 return &memcg->cgwb_domain; 3628 } 3629 3630 /** 3631 * mem_cgroup_wb_stats - retrieve writeback related stats 
from its memcg 3632 * @wb: bdi_writeback in question 3633 * @pfilepages: out parameter for number of file pages 3634 * @pheadroom: out parameter for number of allocatable pages according to memcg 3635 * @pdirty: out parameter for number of dirty pages 3636 * @pwriteback: out parameter for number of pages under writeback 3637 * 3638 * Determine the numbers of file, headroom, dirty, and writeback pages in 3639 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 3640 * is a bit more involved. 3641 * 3642 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 3643 * headroom is calculated as the lowest headroom of itself and the 3644 * ancestors. Note that this doesn't consider the actual amount of 3645 * available memory in the system. The caller should further cap 3646 * *@pheadroom accordingly. 3647 */ 3648 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 3649 unsigned long *pheadroom, unsigned long *pdirty, 3650 unsigned long *pwriteback) 3651 { 3652 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3653 struct mem_cgroup *parent; 3654 3655 *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); 3656 3657 /* this should eventually include NR_UNSTABLE_NFS */ 3658 *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 3659 *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | 3660 (1 << LRU_ACTIVE_FILE)); 3661 *pheadroom = PAGE_COUNTER_MAX; 3662 3663 while ((parent = parent_mem_cgroup(memcg))) { 3664 unsigned long ceiling = min(memcg->memory.limit, memcg->high); 3665 unsigned long used = page_counter_read(&memcg->memory); 3666 3667 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 3668 memcg = parent; 3669 } 3670 } 3671 3672 #else /* CONFIG_CGROUP_WRITEBACK */ 3673 3674 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 3675 { 3676 return 0; 3677 } 3678 3679 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 3680 { 3681 } 3682 3683 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 3684 { 3685 } 3686 3687 #endif /* CONFIG_CGROUP_WRITEBACK */ 3688 3689 /* 3690 * DO NOT USE IN NEW FILES. 3691 * 3692 * "cgroup.event_control" implementation. 3693 * 3694 * This is way over-engineered. It tries to support fully configurable 3695 * events for each user. Such level of flexibility is completely 3696 * unnecessary especially in the light of the planned unified hierarchy. 3697 * 3698 * Please deprecate this and replace with something simpler if at all 3699 * possible. 3700 */ 3701 3702 /* 3703 * Unregister event and free resources. 3704 * 3705 * Gets called from workqueue. 3706 */ 3707 static void memcg_event_remove(struct work_struct *work) 3708 { 3709 struct mem_cgroup_event *event = 3710 container_of(work, struct mem_cgroup_event, remove); 3711 struct mem_cgroup *memcg = event->memcg; 3712 3713 remove_wait_queue(event->wqh, &event->wait); 3714 3715 event->unregister_event(memcg, event->eventfd); 3716 3717 /* Notify userspace the event is going away. */ 3718 eventfd_signal(event->eventfd, 1); 3719 3720 eventfd_ctx_put(event->eventfd); 3721 kfree(event); 3722 css_put(&memcg->css); 3723 } 3724 3725 /* 3726 * Gets called on POLLHUP on eventfd when user closes it. 3727 * 3728 * Called with wqh->lock held and interrupts disabled. 
3729 */ 3730 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 3731 int sync, void *key) 3732 { 3733 struct mem_cgroup_event *event = 3734 container_of(wait, struct mem_cgroup_event, wait); 3735 struct mem_cgroup *memcg = event->memcg; 3736 unsigned long flags = (unsigned long)key; 3737 3738 if (flags & POLLHUP) { 3739 /* 3740 * If the event has been detached at cgroup removal, we 3741 * can simply return knowing the other side will cleanup 3742 * for us. 3743 * 3744 * We can't race against event freeing since the other 3745 * side will require wqh->lock via remove_wait_queue(), 3746 * which we hold. 3747 */ 3748 spin_lock(&memcg->event_list_lock); 3749 if (!list_empty(&event->list)) { 3750 list_del_init(&event->list); 3751 /* 3752 * We are in atomic context, but cgroup_event_remove() 3753 * may sleep, so we have to call it in workqueue. 3754 */ 3755 schedule_work(&event->remove); 3756 } 3757 spin_unlock(&memcg->event_list_lock); 3758 } 3759 3760 return 0; 3761 } 3762 3763 static void memcg_event_ptable_queue_proc(struct file *file, 3764 wait_queue_head_t *wqh, poll_table *pt) 3765 { 3766 struct mem_cgroup_event *event = 3767 container_of(pt, struct mem_cgroup_event, pt); 3768 3769 event->wqh = wqh; 3770 add_wait_queue(wqh, &event->wait); 3771 } 3772 3773 /* 3774 * DO NOT USE IN NEW FILES. 3775 * 3776 * Parse input and register new cgroup event handler. 3777 * 3778 * Input must be in format '<event_fd> <control_fd> <args>'. 3779 * Interpretation of args is defined by control file implementation. 3780 */ 3781 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 3782 char *buf, size_t nbytes, loff_t off) 3783 { 3784 struct cgroup_subsys_state *css = of_css(of); 3785 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3786 struct mem_cgroup_event *event; 3787 struct cgroup_subsys_state *cfile_css; 3788 unsigned int efd, cfd; 3789 struct fd efile; 3790 struct fd cfile; 3791 const char *name; 3792 char *endp; 3793 int ret; 3794 3795 buf = strstrip(buf); 3796 3797 efd = simple_strtoul(buf, &endp, 10); 3798 if (*endp != ' ') 3799 return -EINVAL; 3800 buf = endp + 1; 3801 3802 cfd = simple_strtoul(buf, &endp, 10); 3803 if ((*endp != ' ') && (*endp != '\0')) 3804 return -EINVAL; 3805 buf = endp + 1; 3806 3807 event = kzalloc(sizeof(*event), GFP_KERNEL); 3808 if (!event) 3809 return -ENOMEM; 3810 3811 event->memcg = memcg; 3812 INIT_LIST_HEAD(&event->list); 3813 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 3814 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 3815 INIT_WORK(&event->remove, memcg_event_remove); 3816 3817 efile = fdget(efd); 3818 if (!efile.file) { 3819 ret = -EBADF; 3820 goto out_kfree; 3821 } 3822 3823 event->eventfd = eventfd_ctx_fileget(efile.file); 3824 if (IS_ERR(event->eventfd)) { 3825 ret = PTR_ERR(event->eventfd); 3826 goto out_put_efile; 3827 } 3828 3829 cfile = fdget(cfd); 3830 if (!cfile.file) { 3831 ret = -EBADF; 3832 goto out_put_eventfd; 3833 } 3834 3835 /* the process need read permission on control file */ 3836 /* AV: shouldn't we check that it's been opened for read instead? */ 3837 ret = inode_permission(file_inode(cfile.file), MAY_READ); 3838 if (ret < 0) 3839 goto out_put_cfile; 3840 3841 /* 3842 * Determine the event callbacks and set them in @event. This used 3843 * to be done via struct cftype but cgroup core no longer knows 3844 * about these events. The following is crude but the whole thing 3845 * is for compatibility anyway. 3846 * 3847 * DO NOT ADD NEW FILES. 
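 *
 * Usage sketch (legacy hierarchy only): to be notified when usage
 * crosses 50M, userspace opens an eventfd plus this cgroup's
 * memory.usage_in_bytes, writes "<eventfd> <usage fd> 52428800" to
 * cgroup.event_control, and then poll()s the eventfd.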
3848 */ 3849 name = cfile.file->f_path.dentry->d_name.name; 3850 3851 if (!strcmp(name, "memory.usage_in_bytes")) { 3852 event->register_event = mem_cgroup_usage_register_event; 3853 event->unregister_event = mem_cgroup_usage_unregister_event; 3854 } else if (!strcmp(name, "memory.oom_control")) { 3855 event->register_event = mem_cgroup_oom_register_event; 3856 event->unregister_event = mem_cgroup_oom_unregister_event; 3857 } else if (!strcmp(name, "memory.pressure_level")) { 3858 event->register_event = vmpressure_register_event; 3859 event->unregister_event = vmpressure_unregister_event; 3860 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 3861 event->register_event = memsw_cgroup_usage_register_event; 3862 event->unregister_event = memsw_cgroup_usage_unregister_event; 3863 } else { 3864 ret = -EINVAL; 3865 goto out_put_cfile; 3866 } 3867 3868 /* 3869 * Verify @cfile should belong to @css. Also, remaining events are 3870 * automatically removed on cgroup destruction but the removal is 3871 * asynchronous, so take an extra ref on @css. 3872 */ 3873 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 3874 &memory_cgrp_subsys); 3875 ret = -EINVAL; 3876 if (IS_ERR(cfile_css)) 3877 goto out_put_cfile; 3878 if (cfile_css != css) { 3879 css_put(cfile_css); 3880 goto out_put_cfile; 3881 } 3882 3883 ret = event->register_event(memcg, event->eventfd, buf); 3884 if (ret) 3885 goto out_put_css; 3886 3887 efile.file->f_op->poll(efile.file, &event->pt); 3888 3889 spin_lock(&memcg->event_list_lock); 3890 list_add(&event->list, &memcg->event_list); 3891 spin_unlock(&memcg->event_list_lock); 3892 3893 fdput(cfile); 3894 fdput(efile); 3895 3896 return nbytes; 3897 3898 out_put_css: 3899 css_put(css); 3900 out_put_cfile: 3901 fdput(cfile); 3902 out_put_eventfd: 3903 eventfd_ctx_put(event->eventfd); 3904 out_put_efile: 3905 fdput(efile); 3906 out_kfree: 3907 kfree(event); 3908 3909 return ret; 3910 } 3911 3912 static struct cftype mem_cgroup_legacy_files[] = { 3913 { 3914 .name = "usage_in_bytes", 3915 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3916 .read_u64 = mem_cgroup_read_u64, 3917 }, 3918 { 3919 .name = "max_usage_in_bytes", 3920 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 3921 .write = mem_cgroup_reset, 3922 .read_u64 = mem_cgroup_read_u64, 3923 }, 3924 { 3925 .name = "limit_in_bytes", 3926 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 3927 .write = mem_cgroup_write, 3928 .read_u64 = mem_cgroup_read_u64, 3929 }, 3930 { 3931 .name = "soft_limit_in_bytes", 3932 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 3933 .write = mem_cgroup_write, 3934 .read_u64 = mem_cgroup_read_u64, 3935 }, 3936 { 3937 .name = "failcnt", 3938 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 3939 .write = mem_cgroup_reset, 3940 .read_u64 = mem_cgroup_read_u64, 3941 }, 3942 { 3943 .name = "stat", 3944 .seq_show = memcg_stat_show, 3945 }, 3946 { 3947 .name = "force_empty", 3948 .write = mem_cgroup_force_empty_write, 3949 }, 3950 { 3951 .name = "use_hierarchy", 3952 .write_u64 = mem_cgroup_hierarchy_write, 3953 .read_u64 = mem_cgroup_hierarchy_read, 3954 }, 3955 { 3956 .name = "cgroup.event_control", /* XXX: for compat */ 3957 .write = memcg_write_event_control, 3958 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 3959 }, 3960 { 3961 .name = "swappiness", 3962 .read_u64 = mem_cgroup_swappiness_read, 3963 .write_u64 = mem_cgroup_swappiness_write, 3964 }, 3965 { 3966 .name = "move_charge_at_immigrate", 3967 .read_u64 = mem_cgroup_move_charge_read, 3968 .write_u64 = 
mem_cgroup_move_charge_write, 3969 }, 3970 { 3971 .name = "oom_control", 3972 .seq_show = mem_cgroup_oom_control_read, 3973 .write_u64 = mem_cgroup_oom_control_write, 3974 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 3975 }, 3976 { 3977 .name = "pressure_level", 3978 }, 3979 #ifdef CONFIG_NUMA 3980 { 3981 .name = "numa_stat", 3982 .seq_show = memcg_numa_stat_show, 3983 }, 3984 #endif 3985 { 3986 .name = "kmem.limit_in_bytes", 3987 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 3988 .write = mem_cgroup_write, 3989 .read_u64 = mem_cgroup_read_u64, 3990 }, 3991 { 3992 .name = "kmem.usage_in_bytes", 3993 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 3994 .read_u64 = mem_cgroup_read_u64, 3995 }, 3996 { 3997 .name = "kmem.failcnt", 3998 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 3999 .write = mem_cgroup_reset, 4000 .read_u64 = mem_cgroup_read_u64, 4001 }, 4002 { 4003 .name = "kmem.max_usage_in_bytes", 4004 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4005 .write = mem_cgroup_reset, 4006 .read_u64 = mem_cgroup_read_u64, 4007 }, 4008 #ifdef CONFIG_SLABINFO 4009 { 4010 .name = "kmem.slabinfo", 4011 .seq_start = memcg_slab_start, 4012 .seq_next = memcg_slab_next, 4013 .seq_stop = memcg_slab_stop, 4014 .seq_show = memcg_slab_show, 4015 }, 4016 #endif 4017 { 4018 .name = "kmem.tcp.limit_in_bytes", 4019 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 4020 .write = mem_cgroup_write, 4021 .read_u64 = mem_cgroup_read_u64, 4022 }, 4023 { 4024 .name = "kmem.tcp.usage_in_bytes", 4025 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 4026 .read_u64 = mem_cgroup_read_u64, 4027 }, 4028 { 4029 .name = "kmem.tcp.failcnt", 4030 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 4031 .write = mem_cgroup_reset, 4032 .read_u64 = mem_cgroup_read_u64, 4033 }, 4034 { 4035 .name = "kmem.tcp.max_usage_in_bytes", 4036 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 4037 .write = mem_cgroup_reset, 4038 .read_u64 = mem_cgroup_read_u64, 4039 }, 4040 { }, /* terminate */ 4041 }; 4042 4043 /* 4044 * Private memory cgroup IDR 4045 * 4046 * Swap-out records and page cache shadow entries need to store memcg 4047 * references in constrained space, so we maintain an ID space that is 4048 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 4049 * memory-controlled cgroups to 64k. 4050 * 4051 * However, there usually are many references to the offline CSS after 4052 * the cgroup has been destroyed, such as page cache or reclaimable 4053 * slab objects, that don't need to hang on to the ID. We want to keep 4054 * those dead CSS from occupying IDs, or we might quickly exhaust the 4055 * relatively small ID space and prevent the creation of new cgroups 4056 * even when there are far fewer than 64k cgroups - possibly none. 4057 * 4058 * Maintain a private 16-bit ID space for memcg, and allow the ID to 4059 * be freed and recycled when it's no longer needed, which is usually 4060 * when the CSS is offlined. 4061 * 4062 * The only exception to that are records of swapped out tmpfs/shmem 4063 * pages that need to be attributed to live ancestors on swapin. But 4064 * those references are manageable from userspace.
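 *
 * For example, mem_cgroup_swapout() below records mem_cgroup_id(memcg)
 * in the swap cgroup map, and the swap-in charge path in
 * mem_cgroup_try_charge() resolves the owner again with
 * mem_cgroup_from_id() under rcu_read_lock().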
4065 */ 4066 4067 static DEFINE_IDR(mem_cgroup_idr); 4068 4069 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) 4070 { 4071 VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); 4072 atomic_add(n, &memcg->id.ref); 4073 } 4074 4075 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 4076 { 4077 VM_BUG_ON(atomic_read(&memcg->id.ref) < n); 4078 if (atomic_sub_and_test(n, &memcg->id.ref)) { 4079 idr_remove(&mem_cgroup_idr, memcg->id.id); 4080 memcg->id.id = 0; 4081 4082 /* Memcg ID pins CSS */ 4083 css_put(&memcg->css); 4084 } 4085 } 4086 4087 static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) 4088 { 4089 mem_cgroup_id_get_many(memcg, 1); 4090 } 4091 4092 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 4093 { 4094 mem_cgroup_id_put_many(memcg, 1); 4095 } 4096 4097 /** 4098 * mem_cgroup_from_id - look up a memcg from a memcg id 4099 * @id: the memcg id to look up 4100 * 4101 * Caller must hold rcu_read_lock(). 4102 */ 4103 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 4104 { 4105 WARN_ON_ONCE(!rcu_read_lock_held()); 4106 return idr_find(&mem_cgroup_idr, id); 4107 } 4108 4109 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 4110 { 4111 struct mem_cgroup_per_node *pn; 4112 int tmp = node; 4113 /* 4114 * This routine is called against possible nodes. 4115 * But it's BUG to call kmalloc() against offline node. 4116 * 4117 * TODO: this routine can waste much memory for nodes which will 4118 * never be onlined. It's better to use memory hotplug callback 4119 * function. 4120 */ 4121 if (!node_state(node, N_NORMAL_MEMORY)) 4122 tmp = -1; 4123 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4124 if (!pn) 4125 return 1; 4126 4127 lruvec_init(&pn->lruvec); 4128 pn->usage_in_excess = 0; 4129 pn->on_tree = false; 4130 pn->memcg = memcg; 4131 4132 memcg->nodeinfo[node] = pn; 4133 return 0; 4134 } 4135 4136 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 4137 { 4138 kfree(memcg->nodeinfo[node]); 4139 } 4140 4141 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4142 { 4143 int node; 4144 4145 for_each_node(node) 4146 free_mem_cgroup_per_node_info(memcg, node); 4147 free_percpu(memcg->stat); 4148 kfree(memcg); 4149 } 4150 4151 static void mem_cgroup_free(struct mem_cgroup *memcg) 4152 { 4153 memcg_wb_domain_exit(memcg); 4154 __mem_cgroup_free(memcg); 4155 } 4156 4157 static struct mem_cgroup *mem_cgroup_alloc(void) 4158 { 4159 struct mem_cgroup *memcg; 4160 size_t size; 4161 int node; 4162 4163 size = sizeof(struct mem_cgroup); 4164 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 4165 4166 memcg = kzalloc(size, GFP_KERNEL); 4167 if (!memcg) 4168 return NULL; 4169 4170 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 4171 1, MEM_CGROUP_ID_MAX, 4172 GFP_KERNEL); 4173 if (memcg->id.id < 0) 4174 goto fail; 4175 4176 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4177 if (!memcg->stat) 4178 goto fail; 4179 4180 for_each_node(node) 4181 if (alloc_mem_cgroup_per_node_info(memcg, node)) 4182 goto fail; 4183 4184 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 4185 goto fail; 4186 4187 INIT_WORK(&memcg->high_work, high_work_func); 4188 memcg->last_scanned_node = MAX_NUMNODES; 4189 INIT_LIST_HEAD(&memcg->oom_notify); 4190 mutex_init(&memcg->thresholds_lock); 4191 spin_lock_init(&memcg->move_lock); 4192 vmpressure_init(&memcg->vmpressure); 4193 INIT_LIST_HEAD(&memcg->event_list); 4194 spin_lock_init(&memcg->event_list_lock); 4195 memcg->socket_pressure = 
jiffies; 4196 #ifndef CONFIG_SLOB 4197 memcg->kmemcg_id = -1; 4198 #endif 4199 #ifdef CONFIG_CGROUP_WRITEBACK 4200 INIT_LIST_HEAD(&memcg->cgwb_list); 4201 #endif 4202 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 4203 return memcg; 4204 fail: 4205 if (memcg->id.id > 0) 4206 idr_remove(&mem_cgroup_idr, memcg->id.id); 4207 __mem_cgroup_free(memcg); 4208 return NULL; 4209 } 4210 4211 static struct cgroup_subsys_state * __ref 4212 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4213 { 4214 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 4215 struct mem_cgroup *memcg; 4216 long error = -ENOMEM; 4217 4218 memcg = mem_cgroup_alloc(); 4219 if (!memcg) 4220 return ERR_PTR(error); 4221 4222 memcg->high = PAGE_COUNTER_MAX; 4223 memcg->soft_limit = PAGE_COUNTER_MAX; 4224 if (parent) { 4225 memcg->swappiness = mem_cgroup_swappiness(parent); 4226 memcg->oom_kill_disable = parent->oom_kill_disable; 4227 } 4228 if (parent && parent->use_hierarchy) { 4229 memcg->use_hierarchy = true; 4230 page_counter_init(&memcg->memory, &parent->memory); 4231 page_counter_init(&memcg->swap, &parent->swap); 4232 page_counter_init(&memcg->memsw, &parent->memsw); 4233 page_counter_init(&memcg->kmem, &parent->kmem); 4234 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 4235 } else { 4236 page_counter_init(&memcg->memory, NULL); 4237 page_counter_init(&memcg->swap, NULL); 4238 page_counter_init(&memcg->memsw, NULL); 4239 page_counter_init(&memcg->kmem, NULL); 4240 page_counter_init(&memcg->tcpmem, NULL); 4241 /* 4242 * Deeper hierarchy with use_hierarchy == false doesn't make 4243 * much sense so let cgroup subsystem know about this 4244 * unfortunate state in our controller. 4245 */ 4246 if (parent != root_mem_cgroup) 4247 memory_cgrp_subsys.broken_hierarchy = true; 4248 } 4249 4250 /* The following stuff does not apply to the root */ 4251 if (!parent) { 4252 root_mem_cgroup = memcg; 4253 return &memcg->css; 4254 } 4255 4256 error = memcg_online_kmem(memcg); 4257 if (error) 4258 goto fail; 4259 4260 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 4261 static_branch_inc(&memcg_sockets_enabled_key); 4262 4263 return &memcg->css; 4264 fail: 4265 mem_cgroup_free(memcg); 4266 return ERR_PTR(-ENOMEM); 4267 } 4268 4269 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 4270 { 4271 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4272 4273 /* Online state pins memcg ID, memcg ID pins CSS */ 4274 atomic_set(&memcg->id.ref, 1); 4275 css_get(css); 4276 return 0; 4277 } 4278 4279 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 4280 { 4281 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4282 struct mem_cgroup_event *event, *tmp; 4283 4284 /* 4285 * Unregister events and notify userspace. 4286 * Notify userspace about cgroup removal only after rmdir of the cgroup 4287 * directory to avoid races between userspace and kernel space. 
4288 */ 4289 spin_lock(&memcg->event_list_lock); 4290 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 4291 list_del_init(&event->list); 4292 schedule_work(&event->remove); 4293 } 4294 spin_unlock(&memcg->event_list_lock); 4295 4296 memcg_offline_kmem(memcg); 4297 wb_memcg_offline(memcg); 4298 4299 mem_cgroup_id_put(memcg); 4300 } 4301 4302 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 4303 { 4304 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4305 4306 invalidate_reclaim_iterators(memcg); 4307 } 4308 4309 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4310 { 4311 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4312 4313 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 4314 static_branch_dec(&memcg_sockets_enabled_key); 4315 4316 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 4317 static_branch_dec(&memcg_sockets_enabled_key); 4318 4319 vmpressure_cleanup(&memcg->vmpressure); 4320 cancel_work_sync(&memcg->high_work); 4321 mem_cgroup_remove_from_trees(memcg); 4322 memcg_free_kmem(memcg); 4323 mem_cgroup_free(memcg); 4324 } 4325 4326 /** 4327 * mem_cgroup_css_reset - reset the states of a mem_cgroup 4328 * @css: the target css 4329 * 4330 * Reset the states of the mem_cgroup associated with @css. This is 4331 * invoked when the userland requests disabling on the default hierarchy 4332 * but the memcg is pinned through dependency. The memcg should stop 4333 * applying policies and should revert to the vanilla state as it may be 4334 * made visible again. 4335 * 4336 * The current implementation only resets the essential configurations. 4337 * This needs to be expanded to cover all the visible parts. 4338 */ 4339 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 4340 { 4341 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4342 4343 page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); 4344 page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); 4345 page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); 4346 page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); 4347 page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); 4348 memcg->low = 0; 4349 memcg->high = PAGE_COUNTER_MAX; 4350 memcg->soft_limit = PAGE_COUNTER_MAX; 4351 memcg_wb_domain_size_changed(memcg); 4352 } 4353 4354 #ifdef CONFIG_MMU 4355 /* Handlers for move charge at task migration. 
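 *
 * The flow is: ->can_attach() precharges the pages to be moved via
 * mem_cgroup_precharge_mc(), ->post_attach() walks the mm and moves the
 * charges in mem_cgroup_move_task(), and ->cancel_attach() (or the final
 * mem_cgroup_clear_mc()) drops whatever precharge is left over.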
*/ 4356 static int mem_cgroup_do_precharge(unsigned long count) 4357 { 4358 int ret; 4359 4360 /* Try a single bulk charge without reclaim first, kswapd may wake */ 4361 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 4362 if (!ret) { 4363 mc.precharge += count; 4364 return ret; 4365 } 4366 4367 /* Try charges one by one with reclaim, but do not retry */ 4368 while (count--) { 4369 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 4370 if (ret) 4371 return ret; 4372 mc.precharge++; 4373 cond_resched(); 4374 } 4375 return 0; 4376 } 4377 4378 union mc_target { 4379 struct page *page; 4380 swp_entry_t ent; 4381 }; 4382 4383 enum mc_target_type { 4384 MC_TARGET_NONE = 0, 4385 MC_TARGET_PAGE, 4386 MC_TARGET_SWAP, 4387 }; 4388 4389 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4390 unsigned long addr, pte_t ptent) 4391 { 4392 struct page *page = vm_normal_page(vma, addr, ptent); 4393 4394 if (!page || !page_mapped(page)) 4395 return NULL; 4396 if (PageAnon(page)) { 4397 if (!(mc.flags & MOVE_ANON)) 4398 return NULL; 4399 } else { 4400 if (!(mc.flags & MOVE_FILE)) 4401 return NULL; 4402 } 4403 if (!get_page_unless_zero(page)) 4404 return NULL; 4405 4406 return page; 4407 } 4408 4409 #ifdef CONFIG_SWAP 4410 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4411 pte_t ptent, swp_entry_t *entry) 4412 { 4413 struct page *page = NULL; 4414 swp_entry_t ent = pte_to_swp_entry(ptent); 4415 4416 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) 4417 return NULL; 4418 /* 4419 * Because lookup_swap_cache() updates some statistics counter, 4420 * we call find_get_page() with swapper_space directly. 4421 */ 4422 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 4423 if (do_memsw_account()) 4424 entry->val = ent.val; 4425 4426 return page; 4427 } 4428 #else 4429 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4430 pte_t ptent, swp_entry_t *entry) 4431 { 4432 return NULL; 4433 } 4434 #endif 4435 4436 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4437 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4438 { 4439 struct page *page = NULL; 4440 struct address_space *mapping; 4441 pgoff_t pgoff; 4442 4443 if (!vma->vm_file) /* anonymous vma */ 4444 return NULL; 4445 if (!(mc.flags & MOVE_FILE)) 4446 return NULL; 4447 4448 mapping = vma->vm_file->f_mapping; 4449 pgoff = linear_page_index(vma, addr); 4450 4451 /* page is moved even if it's not RSS of this task(page-faulted). */ 4452 #ifdef CONFIG_SWAP 4453 /* shmem/tmpfs may report page out on swap: account for that too. */ 4454 if (shmem_mapping(mapping)) { 4455 page = find_get_entry(mapping, pgoff); 4456 if (radix_tree_exceptional_entry(page)) { 4457 swp_entry_t swp = radix_to_swp_entry(page); 4458 if (do_memsw_account()) 4459 *entry = swp; 4460 page = find_get_page(swap_address_space(swp), 4461 swp_offset(swp)); 4462 } 4463 } else 4464 page = find_get_page(mapping, pgoff); 4465 #else 4466 page = find_get_page(mapping, pgoff); 4467 #endif 4468 return page; 4469 } 4470 4471 /** 4472 * mem_cgroup_move_account - move account of the page 4473 * @page: the page 4474 * @compound: charge the page as compound or small page 4475 * @from: mem_cgroup which the page is moved from. 4476 * @to: mem_cgroup which the page is moved to. @from != @to. 4477 * 4478 * The caller must make sure the page is not on LRU (isolate_page() is useful.) 4479 * 4480 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 4481 * from old cgroup. 
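 *
 * Returns 0 on success, -EBUSY if the page could not be locked for the
 * move, or -EINVAL if the page is not charged to @from.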
4482 */ 4483 static int mem_cgroup_move_account(struct page *page, 4484 bool compound, 4485 struct mem_cgroup *from, 4486 struct mem_cgroup *to) 4487 { 4488 unsigned long flags; 4489 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; 4490 int ret; 4491 bool anon; 4492 4493 VM_BUG_ON(from == to); 4494 VM_BUG_ON_PAGE(PageLRU(page), page); 4495 VM_BUG_ON(compound && !PageTransHuge(page)); 4496 4497 /* 4498 * Prevent mem_cgroup_migrate() from looking at 4499 * page->mem_cgroup of its source page while we change it. 4500 */ 4501 ret = -EBUSY; 4502 if (!trylock_page(page)) 4503 goto out; 4504 4505 ret = -EINVAL; 4506 if (page->mem_cgroup != from) 4507 goto out_unlock; 4508 4509 anon = PageAnon(page); 4510 4511 spin_lock_irqsave(&from->move_lock, flags); 4512 4513 if (!anon && page_mapped(page)) { 4514 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 4515 nr_pages); 4516 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 4517 nr_pages); 4518 } 4519 4520 /* 4521 * move_lock grabbed above and caller set from->moving_account, so 4522 * mem_cgroup_update_page_stat() will serialize updates to PageDirty. 4523 * So mapping should be stable for dirty pages. 4524 */ 4525 if (!anon && PageDirty(page)) { 4526 struct address_space *mapping = page_mapping(page); 4527 4528 if (mapping_cap_account_dirty(mapping)) { 4529 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], 4530 nr_pages); 4531 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], 4532 nr_pages); 4533 } 4534 } 4535 4536 if (PageWriteback(page)) { 4537 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 4538 nr_pages); 4539 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 4540 nr_pages); 4541 } 4542 4543 /* 4544 * It is safe to change page->mem_cgroup here because the page 4545 * is referenced, charged, and isolated - we can't race with 4546 * uncharging, charging, migration, or LRU putback. 4547 */ 4548 4549 /* caller should have done css_get */ 4550 page->mem_cgroup = to; 4551 spin_unlock_irqrestore(&from->move_lock, flags); 4552 4553 ret = 0; 4554 4555 local_irq_disable(); 4556 mem_cgroup_charge_statistics(to, page, compound, nr_pages); 4557 memcg_check_events(to, page); 4558 mem_cgroup_charge_statistics(from, page, compound, -nr_pages); 4559 memcg_check_events(from, page); 4560 local_irq_enable(); 4561 out_unlock: 4562 unlock_page(page); 4563 out: 4564 return ret; 4565 } 4566 4567 /** 4568 * get_mctgt_type - get target type of moving charge 4569 * @vma: the vma the pte to be checked belongs to 4570 * @addr: the address corresponding to the pte to be checked 4571 * @ptent: the pte to be checked 4572 * @target: the pointer where the target page or swap entry will be stored (can be NULL) 4573 * 4574 * Returns 4575 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4576 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4577 * move charge. If @target is not NULL, the page is stored in target->page 4578 * with an extra refcount taken (callers should handle it). 4579 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4580 * target for charge migration. If @target is not NULL, the entry is stored 4581 * in target->ent. 4582 * 4583 * Called with pte lock held. 
4584 */ 4585 4586 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 4587 unsigned long addr, pte_t ptent, union mc_target *target) 4588 { 4589 struct page *page = NULL; 4590 enum mc_target_type ret = MC_TARGET_NONE; 4591 swp_entry_t ent = { .val = 0 }; 4592 4593 if (pte_present(ptent)) 4594 page = mc_handle_present_pte(vma, addr, ptent); 4595 else if (is_swap_pte(ptent)) 4596 page = mc_handle_swap_pte(vma, ptent, &ent); 4597 else if (pte_none(ptent)) 4598 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4599 4600 if (!page && !ent.val) 4601 return ret; 4602 if (page) { 4603 /* 4604 * Do only loose check w/o serialization. 4605 * mem_cgroup_move_account() checks the page is valid or 4606 * not under LRU exclusion. 4607 */ 4608 if (page->mem_cgroup == mc.from) { 4609 ret = MC_TARGET_PAGE; 4610 if (target) 4611 target->page = page; 4612 } 4613 if (!ret || !target) 4614 put_page(page); 4615 } 4616 /* There is a swap entry and a page doesn't exist or isn't charged */ 4617 if (ent.val && !ret && 4618 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 4619 ret = MC_TARGET_SWAP; 4620 if (target) 4621 target->ent = ent; 4622 } 4623 return ret; 4624 } 4625 4626 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4627 /* 4628 * We don't consider swapping or file mapped pages because THP does not 4629 * support them for now. 4630 * Caller should make sure that pmd_trans_huge(pmd) is true. 4631 */ 4632 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4633 unsigned long addr, pmd_t pmd, union mc_target *target) 4634 { 4635 struct page *page = NULL; 4636 enum mc_target_type ret = MC_TARGET_NONE; 4637 4638 page = pmd_page(pmd); 4639 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 4640 if (!(mc.flags & MOVE_ANON)) 4641 return ret; 4642 if (page->mem_cgroup == mc.from) { 4643 ret = MC_TARGET_PAGE; 4644 if (target) { 4645 get_page(page); 4646 target->page = page; 4647 } 4648 } 4649 return ret; 4650 } 4651 #else 4652 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4653 unsigned long addr, pmd_t pmd, union mc_target *target) 4654 { 4655 return MC_TARGET_NONE; 4656 } 4657 #endif 4658 4659 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 4660 unsigned long addr, unsigned long end, 4661 struct mm_walk *walk) 4662 { 4663 struct vm_area_struct *vma = walk->vma; 4664 pte_t *pte; 4665 spinlock_t *ptl; 4666 4667 ptl = pmd_trans_huge_lock(pmd, vma); 4668 if (ptl) { 4669 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 4670 mc.precharge += HPAGE_PMD_NR; 4671 spin_unlock(ptl); 4672 return 0; 4673 } 4674 4675 if (pmd_trans_unstable(pmd)) 4676 return 0; 4677 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4678 for (; addr != end; pte++, addr += PAGE_SIZE) 4679 if (get_mctgt_type(vma, addr, *pte, NULL)) 4680 mc.precharge++; /* increment precharge temporarily */ 4681 pte_unmap_unlock(pte - 1, ptl); 4682 cond_resched(); 4683 4684 return 0; 4685 } 4686 4687 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4688 { 4689 unsigned long precharge; 4690 4691 struct mm_walk mem_cgroup_count_precharge_walk = { 4692 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4693 .mm = mm, 4694 }; 4695 down_read(&mm->mmap_sem); 4696 walk_page_range(0, mm->highest_vm_end, 4697 &mem_cgroup_count_precharge_walk); 4698 up_read(&mm->mmap_sem); 4699 4700 precharge = mc.precharge; 4701 mc.precharge = 0; 4702 4703 return precharge; 4704 } 4705 4706 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4707 { 4708 unsigned long 
precharge = mem_cgroup_count_precharge(mm); 4709 4710 VM_BUG_ON(mc.moving_task); 4711 mc.moving_task = current; 4712 return mem_cgroup_do_precharge(precharge); 4713 } 4714 4715 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 4716 static void __mem_cgroup_clear_mc(void) 4717 { 4718 struct mem_cgroup *from = mc.from; 4719 struct mem_cgroup *to = mc.to; 4720 4721 /* we must uncharge all the leftover precharges from mc.to */ 4722 if (mc.precharge) { 4723 cancel_charge(mc.to, mc.precharge); 4724 mc.precharge = 0; 4725 } 4726 /* 4727 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4728 * we must uncharge here. 4729 */ 4730 if (mc.moved_charge) { 4731 cancel_charge(mc.from, mc.moved_charge); 4732 mc.moved_charge = 0; 4733 } 4734 /* we must fixup refcnts and charges */ 4735 if (mc.moved_swap) { 4736 /* uncharge swap account from the old cgroup */ 4737 if (!mem_cgroup_is_root(mc.from)) 4738 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 4739 4740 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 4741 4742 /* 4743 * we charged both to->memory and to->memsw, so we 4744 * should uncharge to->memory. 4745 */ 4746 if (!mem_cgroup_is_root(mc.to)) 4747 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 4748 4749 mem_cgroup_id_get_many(mc.to, mc.moved_swap); 4750 css_put_many(&mc.to->css, mc.moved_swap); 4751 4752 mc.moved_swap = 0; 4753 } 4754 memcg_oom_recover(from); 4755 memcg_oom_recover(to); 4756 wake_up_all(&mc.waitq); 4757 } 4758 4759 static void mem_cgroup_clear_mc(void) 4760 { 4761 struct mm_struct *mm = mc.mm; 4762 4763 /* 4764 * we must clear moving_task before waking up waiters at the end of 4765 * task migration. 4766 */ 4767 mc.moving_task = NULL; 4768 __mem_cgroup_clear_mc(); 4769 spin_lock(&mc.lock); 4770 mc.from = NULL; 4771 mc.to = NULL; 4772 mc.mm = NULL; 4773 spin_unlock(&mc.lock); 4774 4775 mmput(mm); 4776 } 4777 4778 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 4779 { 4780 struct cgroup_subsys_state *css; 4781 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 4782 struct mem_cgroup *from; 4783 struct task_struct *leader, *p; 4784 struct mm_struct *mm; 4785 unsigned long move_flags; 4786 int ret = 0; 4787 4788 /* charge immigration isn't supported on the default hierarchy */ 4789 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 4790 return 0; 4791 4792 /* 4793 * Multi-process migrations only happen on the default hierarchy 4794 * where charge immigration is not used. Perform charge 4795 * immigration if @tset contains a leader and whine if there are 4796 * multiple. 4797 */ 4798 p = NULL; 4799 cgroup_taskset_for_each_leader(leader, css, tset) { 4800 WARN_ON_ONCE(p); 4801 p = leader; 4802 memcg = mem_cgroup_from_css(css); 4803 } 4804 if (!p) 4805 return 0; 4806 4807 /* 4808 * We are now committed to this value whatever it is. Changes in this 4809 * tunable will only affect upcoming migrations, not the current one. 4810 * So we need to save it, and keep it going. 
4811 */ 4812 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 4813 if (!move_flags) 4814 return 0; 4815 4816 from = mem_cgroup_from_task(p); 4817 4818 VM_BUG_ON(from == memcg); 4819 4820 mm = get_task_mm(p); 4821 if (!mm) 4822 return 0; 4823 /* We move charges only when we move a owner of the mm */ 4824 if (mm->owner == p) { 4825 VM_BUG_ON(mc.from); 4826 VM_BUG_ON(mc.to); 4827 VM_BUG_ON(mc.precharge); 4828 VM_BUG_ON(mc.moved_charge); 4829 VM_BUG_ON(mc.moved_swap); 4830 4831 spin_lock(&mc.lock); 4832 mc.mm = mm; 4833 mc.from = from; 4834 mc.to = memcg; 4835 mc.flags = move_flags; 4836 spin_unlock(&mc.lock); 4837 /* We set mc.moving_task later */ 4838 4839 ret = mem_cgroup_precharge_mc(mm); 4840 if (ret) 4841 mem_cgroup_clear_mc(); 4842 } else { 4843 mmput(mm); 4844 } 4845 return ret; 4846 } 4847 4848 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 4849 { 4850 if (mc.to) 4851 mem_cgroup_clear_mc(); 4852 } 4853 4854 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 4855 unsigned long addr, unsigned long end, 4856 struct mm_walk *walk) 4857 { 4858 int ret = 0; 4859 struct vm_area_struct *vma = walk->vma; 4860 pte_t *pte; 4861 spinlock_t *ptl; 4862 enum mc_target_type target_type; 4863 union mc_target target; 4864 struct page *page; 4865 4866 ptl = pmd_trans_huge_lock(pmd, vma); 4867 if (ptl) { 4868 if (mc.precharge < HPAGE_PMD_NR) { 4869 spin_unlock(ptl); 4870 return 0; 4871 } 4872 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 4873 if (target_type == MC_TARGET_PAGE) { 4874 page = target.page; 4875 if (!isolate_lru_page(page)) { 4876 if (!mem_cgroup_move_account(page, true, 4877 mc.from, mc.to)) { 4878 mc.precharge -= HPAGE_PMD_NR; 4879 mc.moved_charge += HPAGE_PMD_NR; 4880 } 4881 putback_lru_page(page); 4882 } 4883 put_page(page); 4884 } 4885 spin_unlock(ptl); 4886 return 0; 4887 } 4888 4889 if (pmd_trans_unstable(pmd)) 4890 return 0; 4891 retry: 4892 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4893 for (; addr != end; addr += PAGE_SIZE) { 4894 pte_t ptent = *(pte++); 4895 swp_entry_t ent; 4896 4897 if (!mc.precharge) 4898 break; 4899 4900 switch (get_mctgt_type(vma, addr, ptent, &target)) { 4901 case MC_TARGET_PAGE: 4902 page = target.page; 4903 /* 4904 * We can have a part of the split pmd here. Moving it 4905 * can be done but it would be too convoluted so simply 4906 * ignore such a partial THP and keep it in original 4907 * memcg. There should be somebody mapping the head. 4908 */ 4909 if (PageTransCompound(page)) 4910 goto put; 4911 if (isolate_lru_page(page)) 4912 goto put; 4913 if (!mem_cgroup_move_account(page, false, 4914 mc.from, mc.to)) { 4915 mc.precharge--; 4916 /* we uncharge from mc.from later. */ 4917 mc.moved_charge++; 4918 } 4919 putback_lru_page(page); 4920 put: /* get_mctgt_type() gets the page */ 4921 put_page(page); 4922 break; 4923 case MC_TARGET_SWAP: 4924 ent = target.ent; 4925 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 4926 mc.precharge--; 4927 /* we fixup refcnts and charges later. */ 4928 mc.moved_swap++; 4929 } 4930 break; 4931 default: 4932 break; 4933 } 4934 } 4935 pte_unmap_unlock(pte - 1, ptl); 4936 cond_resched(); 4937 4938 if (addr != end) { 4939 /* 4940 * We have consumed all precharges we got in can_attach(). 4941 * We try charge one by one, but don't do any additional 4942 * charges to mc.to if we have failed in charge once in attach() 4943 * phase. 
4944 */ 4945 ret = mem_cgroup_do_precharge(1); 4946 if (!ret) 4947 goto retry; 4948 } 4949 4950 return ret; 4951 } 4952 4953 static void mem_cgroup_move_charge(void) 4954 { 4955 struct mm_walk mem_cgroup_move_charge_walk = { 4956 .pmd_entry = mem_cgroup_move_charge_pte_range, 4957 .mm = mc.mm, 4958 }; 4959 4960 lru_add_drain_all(); 4961 /* 4962 * Signal lock_page_memcg() to take the memcg's move_lock 4963 * while we're moving its pages to another memcg. Then wait 4964 * for already started RCU-only updates to finish. 4965 */ 4966 atomic_inc(&mc.from->moving_account); 4967 synchronize_rcu(); 4968 retry: 4969 if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { 4970 /* 4971 * Someone who is holding the mmap_sem might be waiting on 4972 * our waitq. So we cancel all extra charges, wake up all waiters, 4973 * and retry. Because we cancel precharges, we might not be able 4974 * to move enough charges, but moving charge is a best-effort 4975 * feature anyway, so it wouldn't be a big problem. 4976 */ 4977 __mem_cgroup_clear_mc(); 4978 cond_resched(); 4979 goto retry; 4980 } 4981 /* 4982 * When we have consumed all precharges and failed to do an 4983 * additional charge, the page walk just aborts. 4984 */ 4985 walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); 4986 4987 up_read(&mc.mm->mmap_sem); 4988 atomic_dec(&mc.from->moving_account); 4989 } 4990 4991 static void mem_cgroup_move_task(void) 4992 { 4993 if (mc.to) { 4994 mem_cgroup_move_charge(); 4995 mem_cgroup_clear_mc(); 4996 } 4997 } 4998 #else /* !CONFIG_MMU */ 4999 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 5000 { 5001 return 0; 5002 } 5003 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 5004 { 5005 } 5006 static void mem_cgroup_move_task(void) 5007 { 5008 } 5009 #endif 5010 5011 /* 5012 * Cgroup retains root cgroups across [un]mount cycles, making it necessary 5013 * to verify whether we're attached to the default hierarchy on each mount 5014 * attempt. 5015 */ 5016 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 5017 { 5018 /* 5019 * use_hierarchy is forced on the default hierarchy. cgroup core 5020 * guarantees that @root doesn't have any children, so turning it 5021 * on for the root memcg is enough. 
5022 */ 5023 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5024 root_mem_cgroup->use_hierarchy = true; 5025 else 5026 root_mem_cgroup->use_hierarchy = false; 5027 } 5028 5029 static u64 memory_current_read(struct cgroup_subsys_state *css, 5030 struct cftype *cft) 5031 { 5032 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5033 5034 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 5035 } 5036 5037 static int memory_low_show(struct seq_file *m, void *v) 5038 { 5039 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5040 unsigned long low = READ_ONCE(memcg->low); 5041 5042 if (low == PAGE_COUNTER_MAX) 5043 seq_puts(m, "max\n"); 5044 else 5045 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); 5046 5047 return 0; 5048 } 5049 5050 static ssize_t memory_low_write(struct kernfs_open_file *of, 5051 char *buf, size_t nbytes, loff_t off) 5052 { 5053 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5054 unsigned long low; 5055 int err; 5056 5057 buf = strstrip(buf); 5058 err = page_counter_memparse(buf, "max", &low); 5059 if (err) 5060 return err; 5061 5062 memcg->low = low; 5063 5064 return nbytes; 5065 } 5066 5067 static int memory_high_show(struct seq_file *m, void *v) 5068 { 5069 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5070 unsigned long high = READ_ONCE(memcg->high); 5071 5072 if (high == PAGE_COUNTER_MAX) 5073 seq_puts(m, "max\n"); 5074 else 5075 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); 5076 5077 return 0; 5078 } 5079 5080 static ssize_t memory_high_write(struct kernfs_open_file *of, 5081 char *buf, size_t nbytes, loff_t off) 5082 { 5083 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5084 unsigned long nr_pages; 5085 unsigned long high; 5086 int err; 5087 5088 buf = strstrip(buf); 5089 err = page_counter_memparse(buf, "max", &high); 5090 if (err) 5091 return err; 5092 5093 memcg->high = high; 5094 5095 nr_pages = page_counter_read(&memcg->memory); 5096 if (nr_pages > high) 5097 try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 5098 GFP_KERNEL, true); 5099 5100 memcg_wb_domain_size_changed(memcg); 5101 return nbytes; 5102 } 5103 5104 static int memory_max_show(struct seq_file *m, void *v) 5105 { 5106 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5107 unsigned long max = READ_ONCE(memcg->memory.limit); 5108 5109 if (max == PAGE_COUNTER_MAX) 5110 seq_puts(m, "max\n"); 5111 else 5112 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); 5113 5114 return 0; 5115 } 5116 5117 static ssize_t memory_max_write(struct kernfs_open_file *of, 5118 char *buf, size_t nbytes, loff_t off) 5119 { 5120 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5121 unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; 5122 bool drained = false; 5123 unsigned long max; 5124 int err; 5125 5126 buf = strstrip(buf); 5127 err = page_counter_memparse(buf, "max", &max); 5128 if (err) 5129 return err; 5130 5131 xchg(&memcg->memory.limit, max); 5132 5133 for (;;) { 5134 unsigned long nr_pages = page_counter_read(&memcg->memory); 5135 5136 if (nr_pages <= max) 5137 break; 5138 5139 if (signal_pending(current)) { 5140 err = -EINTR; 5141 break; 5142 } 5143 5144 if (!drained) { 5145 drain_all_stock(memcg); 5146 drained = true; 5147 continue; 5148 } 5149 5150 if (nr_reclaims) { 5151 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 5152 GFP_KERNEL, true)) 5153 nr_reclaims--; 5154 continue; 5155 } 5156 5157 mem_cgroup_events(memcg, MEMCG_OOM, 1); 5158 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 5159 break; 5160 } 5161 5162 
memcg_wb_domain_size_changed(memcg); 5163 return nbytes; 5164 } 5165 5166 static int memory_events_show(struct seq_file *m, void *v) 5167 { 5168 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5169 5170 seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); 5171 seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); 5172 seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); 5173 seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); 5174 5175 return 0; 5176 } 5177 5178 static int memory_stat_show(struct seq_file *m, void *v) 5179 { 5180 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5181 unsigned long stat[MEMCG_NR_STAT]; 5182 unsigned long events[MEMCG_NR_EVENTS]; 5183 int i; 5184 5185 /* 5186 * Provide statistics on the state of the memory subsystem as 5187 * well as cumulative event counters that show past behavior. 5188 * 5189 * This list is ordered following a combination of these gradients: 5190 * 1) generic big picture -> specifics and details 5191 * 2) reflecting userspace activity -> reflecting kernel heuristics 5192 * 5193 * Current memory state: 5194 */ 5195 5196 tree_stat(memcg, stat); 5197 tree_events(memcg, events); 5198 5199 seq_printf(m, "anon %llu\n", 5200 (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); 5201 seq_printf(m, "file %llu\n", 5202 (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); 5203 seq_printf(m, "kernel_stack %llu\n", 5204 (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); 5205 seq_printf(m, "slab %llu\n", 5206 (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + 5207 stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); 5208 seq_printf(m, "sock %llu\n", 5209 (u64)stat[MEMCG_SOCK] * PAGE_SIZE); 5210 5211 seq_printf(m, "file_mapped %llu\n", 5212 (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); 5213 seq_printf(m, "file_dirty %llu\n", 5214 (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); 5215 seq_printf(m, "file_writeback %llu\n", 5216 (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); 5217 5218 for (i = 0; i < NR_LRU_LISTS; i++) { 5219 struct mem_cgroup *mi; 5220 unsigned long val = 0; 5221 5222 for_each_mem_cgroup_tree(mi, memcg) 5223 val += mem_cgroup_nr_lru_pages(mi, BIT(i)); 5224 seq_printf(m, "%s %llu\n", 5225 mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); 5226 } 5227 5228 seq_printf(m, "slab_reclaimable %llu\n", 5229 (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); 5230 seq_printf(m, "slab_unreclaimable %llu\n", 5231 (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); 5232 5233 /* Accumulated memory events */ 5234 5235 seq_printf(m, "pgfault %lu\n", 5236 events[MEM_CGROUP_EVENTS_PGFAULT]); 5237 seq_printf(m, "pgmajfault %lu\n", 5238 events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 5239 5240 return 0; 5241 } 5242 5243 static struct cftype memory_files[] = { 5244 { 5245 .name = "current", 5246 .flags = CFTYPE_NOT_ON_ROOT, 5247 .read_u64 = memory_current_read, 5248 }, 5249 { 5250 .name = "low", 5251 .flags = CFTYPE_NOT_ON_ROOT, 5252 .seq_show = memory_low_show, 5253 .write = memory_low_write, 5254 }, 5255 { 5256 .name = "high", 5257 .flags = CFTYPE_NOT_ON_ROOT, 5258 .seq_show = memory_high_show, 5259 .write = memory_high_write, 5260 }, 5261 { 5262 .name = "max", 5263 .flags = CFTYPE_NOT_ON_ROOT, 5264 .seq_show = memory_max_show, 5265 .write = memory_max_write, 5266 }, 5267 { 5268 .name = "events", 5269 .flags = CFTYPE_NOT_ON_ROOT, 5270 .file_offset = offsetof(struct mem_cgroup, events_file), 5271 .seq_show = memory_events_show, 5272 }, 5273 { 5274 .name = "stat", 5275 .flags = CFTYPE_NOT_ON_ROOT, 5276 .seq_show = 
memory_stat_show, 5277 }, 5278 { } /* terminate */ 5279 }; 5280 5281 struct cgroup_subsys memory_cgrp_subsys = { 5282 .css_alloc = mem_cgroup_css_alloc, 5283 .css_online = mem_cgroup_css_online, 5284 .css_offline = mem_cgroup_css_offline, 5285 .css_released = mem_cgroup_css_released, 5286 .css_free = mem_cgroup_css_free, 5287 .css_reset = mem_cgroup_css_reset, 5288 .can_attach = mem_cgroup_can_attach, 5289 .cancel_attach = mem_cgroup_cancel_attach, 5290 .post_attach = mem_cgroup_move_task, 5291 .bind = mem_cgroup_bind, 5292 .dfl_cftypes = memory_files, 5293 .legacy_cftypes = mem_cgroup_legacy_files, 5294 .early_init = 0, 5295 }; 5296 5297 /** 5298 * mem_cgroup_low - check if memory consumption is below the normal range 5299 * @root: the highest ancestor to consider 5300 * @memcg: the memory cgroup to check 5301 * 5302 * Returns %true if memory consumption of @memcg, and that of all 5303 * configurable ancestors up to @root, is below the normal range. 5304 */ 5305 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) 5306 { 5307 if (mem_cgroup_disabled()) 5308 return false; 5309 5310 /* 5311 * The toplevel group doesn't have a configurable range, so 5312 * it's never low when looked at directly, and it is not 5313 * considered an ancestor when assessing the hierarchy. 5314 */ 5315 5316 if (memcg == root_mem_cgroup) 5317 return false; 5318 5319 if (page_counter_read(&memcg->memory) >= memcg->low) 5320 return false; 5321 5322 while (memcg != root) { 5323 memcg = parent_mem_cgroup(memcg); 5324 5325 if (memcg == root_mem_cgroup) 5326 break; 5327 5328 if (page_counter_read(&memcg->memory) >= memcg->low) 5329 return false; 5330 } 5331 return true; 5332 } 5333 5334 /** 5335 * mem_cgroup_try_charge - try charging a page 5336 * @page: page to charge 5337 * @mm: mm context of the victim 5338 * @gfp_mask: reclaim mode 5339 * @memcgp: charged memcg return 5340 * @compound: charge the page as compound or small page 5341 * 5342 * Try to charge @page to the memcg that @mm belongs to, reclaiming 5343 * pages according to @gfp_mask if necessary. 5344 * 5345 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 5346 * Otherwise, an error code is returned. 5347 * 5348 * After page->mapping has been set up, the caller must finalize the 5349 * charge with mem_cgroup_commit_charge(). Or abort the transaction 5350 * with mem_cgroup_cancel_charge() in case page instantiation fails. 5351 */ 5352 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 5353 gfp_t gfp_mask, struct mem_cgroup **memcgp, 5354 bool compound) 5355 { 5356 struct mem_cgroup *memcg = NULL; 5357 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; 5358 int ret = 0; 5359 5360 if (mem_cgroup_disabled()) 5361 goto out; 5362 5363 if (PageSwapCache(page)) { 5364 /* 5365 * Every swap fault against a single page tries to charge the 5366 * page, bail as early as possible. shmem_unuse() encounters 5367 * already charged pages, too. The USED bit is protected by 5368 * the page lock, which serializes swap cache removal, which 5369 * in turn serializes uncharging. 
5370 */ 5371 VM_BUG_ON_PAGE(!PageLocked(page), page); 5372 if (page->mem_cgroup) 5373 goto out; 5374 5375 if (do_swap_account) { 5376 swp_entry_t ent = { .val = page_private(page), }; 5377 unsigned short id = lookup_swap_cgroup_id(ent); 5378 5379 rcu_read_lock(); 5380 memcg = mem_cgroup_from_id(id); 5381 if (memcg && !css_tryget_online(&memcg->css)) 5382 memcg = NULL; 5383 rcu_read_unlock(); 5384 } 5385 } 5386 5387 if (!memcg) 5388 memcg = get_mem_cgroup_from_mm(mm); 5389 5390 ret = try_charge(memcg, gfp_mask, nr_pages); 5391 5392 css_put(&memcg->css); 5393 out: 5394 *memcgp = memcg; 5395 return ret; 5396 } 5397 5398 /** 5399 * mem_cgroup_commit_charge - commit a page charge 5400 * @page: page to charge 5401 * @memcg: memcg to charge the page to 5402 * @lrucare: page might be on LRU already 5403 * @compound: charge the page as compound or small page 5404 * 5405 * Finalize a charge transaction started by mem_cgroup_try_charge(), 5406 * after page->mapping has been set up. This must happen atomically 5407 * as part of the page instantiation, i.e. under the page table lock 5408 * for anonymous pages, under the page lock for page and swap cache. 5409 * 5410 * In addition, the page must not be on the LRU during the commit, to 5411 * prevent racing with task migration. If it might be, use @lrucare. 5412 * 5413 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 5414 */ 5415 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 5416 bool lrucare, bool compound) 5417 { 5418 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; 5419 5420 VM_BUG_ON_PAGE(!page->mapping, page); 5421 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 5422 5423 if (mem_cgroup_disabled()) 5424 return; 5425 /* 5426 * Swap faults will attempt to charge the same page multiple 5427 * times. But reuse_swap_page() might have removed the page 5428 * from swapcache already, so we can't check PageSwapCache(). 5429 */ 5430 if (!memcg) 5431 return; 5432 5433 commit_charge(page, memcg, lrucare); 5434 5435 local_irq_disable(); 5436 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); 5437 memcg_check_events(memcg, page); 5438 local_irq_enable(); 5439 5440 if (do_memsw_account() && PageSwapCache(page)) { 5441 swp_entry_t entry = { .val = page_private(page) }; 5442 /* 5443 * The swap entry might not get freed for a long time, 5444 * let's not wait for it. The page already received a 5445 * memory+swap charge, drop the swap entry duplicate. 5446 */ 5447 mem_cgroup_uncharge_swap(entry); 5448 } 5449 } 5450 5451 /** 5452 * mem_cgroup_cancel_charge - cancel a page charge 5453 * @page: page to charge 5454 * @memcg: memcg to charge the page to 5455 * @compound: charge the page as compound or small page 5456 * 5457 * Cancel a charge transaction started by mem_cgroup_try_charge(). 5458 */ 5459 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, 5460 bool compound) 5461 { 5462 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; 5463 5464 if (mem_cgroup_disabled()) 5465 return; 5466 /* 5467 * Swap faults will attempt to charge the same page multiple 5468 * times. But reuse_swap_page() might have removed the page 5469 * from swapcache already, so we can't check PageSwapCache(). 
5470 */ 5471 if (!memcg) 5472 return; 5473 5474 cancel_charge(memcg, nr_pages); 5475 } 5476 5477 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5478 unsigned long nr_anon, unsigned long nr_file, 5479 unsigned long nr_huge, unsigned long nr_kmem, 5480 struct page *dummy_page) 5481 { 5482 unsigned long nr_pages = nr_anon + nr_file + nr_kmem; 5483 unsigned long flags; 5484 5485 if (!mem_cgroup_is_root(memcg)) { 5486 page_counter_uncharge(&memcg->memory, nr_pages); 5487 if (do_memsw_account()) 5488 page_counter_uncharge(&memcg->memsw, nr_pages); 5489 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem) 5490 page_counter_uncharge(&memcg->kmem, nr_kmem); 5491 memcg_oom_recover(memcg); 5492 } 5493 5494 local_irq_save(flags); 5495 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 5496 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 5497 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 5498 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 5499 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 5500 memcg_check_events(memcg, dummy_page); 5501 local_irq_restore(flags); 5502 5503 if (!mem_cgroup_is_root(memcg)) 5504 css_put_many(&memcg->css, nr_pages); 5505 } 5506 5507 static void uncharge_list(struct list_head *page_list) 5508 { 5509 struct mem_cgroup *memcg = NULL; 5510 unsigned long nr_anon = 0; 5511 unsigned long nr_file = 0; 5512 unsigned long nr_huge = 0; 5513 unsigned long nr_kmem = 0; 5514 unsigned long pgpgout = 0; 5515 struct list_head *next; 5516 struct page *page; 5517 5518 /* 5519 * Note that the list can be a single page->lru; hence the 5520 * do-while loop instead of a simple list_for_each_entry(). 5521 */ 5522 next = page_list->next; 5523 do { 5524 page = list_entry(next, struct page, lru); 5525 next = page->lru.next; 5526 5527 VM_BUG_ON_PAGE(PageLRU(page), page); 5528 VM_BUG_ON_PAGE(page_count(page), page); 5529 5530 if (!page->mem_cgroup) 5531 continue; 5532 5533 /* 5534 * Nobody should be changing or seriously looking at 5535 * page->mem_cgroup at this point, we have fully 5536 * exclusive access to the page. 5537 */ 5538 5539 if (memcg != page->mem_cgroup) { 5540 if (memcg) { 5541 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5542 nr_huge, nr_kmem, page); 5543 pgpgout = nr_anon = nr_file = 5544 nr_huge = nr_kmem = 0; 5545 } 5546 memcg = page->mem_cgroup; 5547 } 5548 5549 if (!PageKmemcg(page)) { 5550 unsigned int nr_pages = 1; 5551 5552 if (PageTransHuge(page)) { 5553 nr_pages <<= compound_order(page); 5554 nr_huge += nr_pages; 5555 } 5556 if (PageAnon(page)) 5557 nr_anon += nr_pages; 5558 else 5559 nr_file += nr_pages; 5560 pgpgout++; 5561 } else { 5562 nr_kmem += 1 << compound_order(page); 5563 __ClearPageKmemcg(page); 5564 } 5565 5566 page->mem_cgroup = NULL; 5567 } while (next != page_list); 5568 5569 if (memcg) 5570 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5571 nr_huge, nr_kmem, page); 5572 } 5573 5574 /** 5575 * mem_cgroup_uncharge - uncharge a page 5576 * @page: page to uncharge 5577 * 5578 * Uncharge a page previously charged with mem_cgroup_try_charge() and 5579 * mem_cgroup_commit_charge(). 
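 *
 * The page is expected to be unused at this point: uncharge_list()
 * asserts that it is off the LRU and has no remaining references.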
5580 */ 5581 void mem_cgroup_uncharge(struct page *page) 5582 { 5583 if (mem_cgroup_disabled()) 5584 return; 5585 5586 /* Don't touch page->lru of any random page, pre-check: */ 5587 if (!page->mem_cgroup) 5588 return; 5589 5590 INIT_LIST_HEAD(&page->lru); 5591 uncharge_list(&page->lru); 5592 } 5593 5594 /** 5595 * mem_cgroup_uncharge_list - uncharge a list of page 5596 * @page_list: list of pages to uncharge 5597 * 5598 * Uncharge a list of pages previously charged with 5599 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 5600 */ 5601 void mem_cgroup_uncharge_list(struct list_head *page_list) 5602 { 5603 if (mem_cgroup_disabled()) 5604 return; 5605 5606 if (!list_empty(page_list)) 5607 uncharge_list(page_list); 5608 } 5609 5610 /** 5611 * mem_cgroup_migrate - charge a page's replacement 5612 * @oldpage: currently circulating page 5613 * @newpage: replacement page 5614 * 5615 * Charge @newpage as a replacement page for @oldpage. @oldpage will 5616 * be uncharged upon free. 5617 * 5618 * Both pages must be locked, @newpage->mapping must be set up. 5619 */ 5620 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) 5621 { 5622 struct mem_cgroup *memcg; 5623 unsigned int nr_pages; 5624 bool compound; 5625 unsigned long flags; 5626 5627 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5628 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 5629 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 5630 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 5631 newpage); 5632 5633 if (mem_cgroup_disabled()) 5634 return; 5635 5636 /* Page cache replacement: new page already charged? */ 5637 if (newpage->mem_cgroup) 5638 return; 5639 5640 /* Swapcache readahead pages can get replaced before being charged */ 5641 memcg = oldpage->mem_cgroup; 5642 if (!memcg) 5643 return; 5644 5645 /* Force-charge the new page. The old one will be freed soon */ 5646 compound = PageTransHuge(newpage); 5647 nr_pages = compound ? hpage_nr_pages(newpage) : 1; 5648 5649 page_counter_charge(&memcg->memory, nr_pages); 5650 if (do_memsw_account()) 5651 page_counter_charge(&memcg->memsw, nr_pages); 5652 css_get_many(&memcg->css, nr_pages); 5653 5654 commit_charge(newpage, memcg, false); 5655 5656 local_irq_save(flags); 5657 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); 5658 memcg_check_events(memcg, newpage); 5659 local_irq_restore(flags); 5660 } 5661 5662 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 5663 EXPORT_SYMBOL(memcg_sockets_enabled_key); 5664 5665 void mem_cgroup_sk_alloc(struct sock *sk) 5666 { 5667 struct mem_cgroup *memcg; 5668 5669 if (!mem_cgroup_sockets_enabled) 5670 return; 5671 5672 /* 5673 * Socket cloning can throw us here with sk_memcg already 5674 * filled. It won't however, necessarily happen from 5675 * process context. So the test for root memcg given 5676 * the current task's memcg won't help us in this case. 5677 * 5678 * Respecting the original socket's memcg is a better 5679 * decision in this case. 
5680 */ 5681 if (sk->sk_memcg) { 5682 BUG_ON(mem_cgroup_is_root(sk->sk_memcg)); 5683 css_get(&sk->sk_memcg->css); 5684 return; 5685 } 5686 5687 rcu_read_lock(); 5688 memcg = mem_cgroup_from_task(current); 5689 if (memcg == root_mem_cgroup) 5690 goto out; 5691 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 5692 goto out; 5693 if (css_tryget_online(&memcg->css)) 5694 sk->sk_memcg = memcg; 5695 out: 5696 rcu_read_unlock(); 5697 } 5698 5699 void mem_cgroup_sk_free(struct sock *sk) 5700 { 5701 if (sk->sk_memcg) 5702 css_put(&sk->sk_memcg->css); 5703 } 5704 5705 /** 5706 * mem_cgroup_charge_skmem - charge socket memory 5707 * @memcg: memcg to charge 5708 * @nr_pages: number of pages to charge 5709 * 5710 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 5711 * @memcg's configured limit, %false if the charge had to be forced. 5712 */ 5713 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 5714 { 5715 gfp_t gfp_mask = GFP_KERNEL; 5716 5717 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 5718 struct page_counter *fail; 5719 5720 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 5721 memcg->tcpmem_pressure = 0; 5722 return true; 5723 } 5724 page_counter_charge(&memcg->tcpmem, nr_pages); 5725 memcg->tcpmem_pressure = 1; 5726 return false; 5727 } 5728 5729 /* Don't block in the packet receive path */ 5730 if (in_softirq()) 5731 gfp_mask = GFP_NOWAIT; 5732 5733 this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); 5734 5735 if (try_charge(memcg, gfp_mask, nr_pages) == 0) 5736 return true; 5737 5738 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); 5739 return false; 5740 } 5741 5742 /** 5743 * mem_cgroup_uncharge_skmem - uncharge socket memory 5744 * @memcg - memcg to uncharge 5745 * @nr_pages - number of pages to uncharge 5746 */ 5747 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 5748 { 5749 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 5750 page_counter_uncharge(&memcg->tcpmem, nr_pages); 5751 return; 5752 } 5753 5754 this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); 5755 5756 page_counter_uncharge(&memcg->memory, nr_pages); 5757 css_put_many(&memcg->css, nr_pages); 5758 } 5759 5760 static int __init cgroup_memory(char *s) 5761 { 5762 char *token; 5763 5764 while ((token = strsep(&s, ",")) != NULL) { 5765 if (!*token) 5766 continue; 5767 if (!strcmp(token, "nosocket")) 5768 cgroup_memory_nosocket = true; 5769 if (!strcmp(token, "nokmem")) 5770 cgroup_memory_nokmem = true; 5771 } 5772 return 0; 5773 } 5774 __setup("cgroup.memory=", cgroup_memory); 5775 5776 /* 5777 * subsys_initcall() for memory controller. 5778 * 5779 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 5780 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 5781 * basically everything that doesn't depend on a specific mem_cgroup structure 5782 * should be initialized from here. 5783 */ 5784 static int __init mem_cgroup_init(void) 5785 { 5786 int cpu, node; 5787 5788 #ifndef CONFIG_SLOB 5789 /* 5790 * Kmem cache creation is mostly done with the slab_mutex held, 5791 * so use a workqueue with limited concurrency to avoid stalling 5792 * all worker threads in case lots of cgroups are created and 5793 * destroyed simultaneously. 
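 *
 * (The alloc_workqueue() call below therefore limits max_active to 1.)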
5794 */ 5795 memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); 5796 BUG_ON(!memcg_kmem_cache_wq); 5797 #endif 5798 5799 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 5800 memcg_hotplug_cpu_dead); 5801 5802 for_each_possible_cpu(cpu) 5803 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 5804 drain_local_stock); 5805 5806 for_each_node(node) { 5807 struct mem_cgroup_tree_per_node *rtpn; 5808 5809 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 5810 node_online(node) ? node : NUMA_NO_NODE); 5811 5812 rtpn->rb_root = RB_ROOT; 5813 spin_lock_init(&rtpn->lock); 5814 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5815 } 5816 5817 return 0; 5818 } 5819 subsys_initcall(mem_cgroup_init); 5820 5821 #ifdef CONFIG_MEMCG_SWAP 5822 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 5823 { 5824 while (!atomic_inc_not_zero(&memcg->id.ref)) { 5825 /* 5826 * The root cgroup cannot be destroyed, so its refcount must 5827 * always be >= 1. 5828 */ 5829 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { 5830 VM_BUG_ON(1); 5831 break; 5832 } 5833 memcg = parent_mem_cgroup(memcg); 5834 if (!memcg) 5835 memcg = root_mem_cgroup; 5836 } 5837 return memcg; 5838 } 5839 5840 /** 5841 * mem_cgroup_swapout - transfer a memsw charge to swap 5842 * @page: page whose memsw charge to transfer 5843 * @entry: swap entry to move the charge to 5844 * 5845 * Transfer the memsw charge of @page to @entry. 5846 */ 5847 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5848 { 5849 struct mem_cgroup *memcg, *swap_memcg; 5850 unsigned short oldid; 5851 5852 VM_BUG_ON_PAGE(PageLRU(page), page); 5853 VM_BUG_ON_PAGE(page_count(page), page); 5854 5855 if (!do_memsw_account()) 5856 return; 5857 5858 memcg = page->mem_cgroup; 5859 5860 /* Readahead page, never charged */ 5861 if (!memcg) 5862 return; 5863 5864 /* 5865 * In case the memcg owning these pages has been offlined and doesn't 5866 * have an ID allocated to it anymore, charge the closest online 5867 * ancestor for the swap instead and transfer the memory+swap charge. 5868 */ 5869 swap_memcg = mem_cgroup_id_get_online(memcg); 5870 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); 5871 VM_BUG_ON_PAGE(oldid, page); 5872 mem_cgroup_swap_statistics(swap_memcg, true); 5873 5874 page->mem_cgroup = NULL; 5875 5876 if (!mem_cgroup_is_root(memcg)) 5877 page_counter_uncharge(&memcg->memory, 1); 5878 5879 if (memcg != swap_memcg) { 5880 if (!mem_cgroup_is_root(swap_memcg)) 5881 page_counter_charge(&swap_memcg->memsw, 1); 5882 page_counter_uncharge(&memcg->memsw, 1); 5883 } 5884 5885 /* 5886 * Interrupts should be disabled here because the caller holds the 5887 * mapping->tree_lock, which is taken with interrupts-off. It is 5888 * important here to have the interrupts disabled because it is the 5889 * only synchronisation we have for updating the per-CPU variables. 5890 */ 5891 VM_BUG_ON(!irqs_disabled()); 5892 mem_cgroup_charge_statistics(memcg, page, false, -1); 5893 memcg_check_events(memcg, page); 5894 5895 if (!mem_cgroup_is_root(memcg)) 5896 css_put(&memcg->css); 5897 } 5898 5899 /* 5900 * mem_cgroup_try_charge_swap - try charging a swap entry 5901 * @page: page being added to swap 5902 * @entry: swap entry to charge 5903 * 5904 * Try to charge @entry to the memcg that @page belongs to. 5905 * 5906 * Returns 0 on success, -ENOMEM on failure. 
/**
 * mem_cgroup_try_charge_swap - try charging a swap entry
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @entry to the memcg that @page belongs to.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
	struct mem_cgroup *memcg;
	struct page_counter *counter;
	unsigned short oldid;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
		return 0;

	memcg = page->mem_cgroup;

	/* Readahead page, never charged */
	if (!memcg)
		return 0;

	memcg = mem_cgroup_id_get_online(memcg);

	if (!mem_cgroup_is_root(memcg) &&
	    !page_counter_try_charge(&memcg->swap, 1, &counter)) {
		mem_cgroup_id_put(memcg);
		return -ENOMEM;
	}

	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
	VM_BUG_ON_PAGE(oldid, page);
	mem_cgroup_swap_statistics(memcg, true);

	return 0;
}

/**
 * mem_cgroup_uncharge_swap - uncharge a swap entry
 * @entry: swap entry to uncharge
 *
 * Drop the swap charge associated with @entry.
 */
void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	if (!do_swap_account)
		return;

	id = swap_cgroup_record(entry, 0);
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (memcg) {
		if (!mem_cgroup_is_root(memcg)) {
			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
				page_counter_uncharge(&memcg->swap, 1);
			else
				page_counter_uncharge(&memcg->memsw, 1);
		}
		mem_cgroup_swap_statistics(memcg, false);
		mem_cgroup_id_put(memcg);
	}
	rcu_read_unlock();
}

long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
	long nr_swap_pages = get_nr_swap_pages();

	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return nr_swap_pages;
	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
		nr_swap_pages = min_t(long, nr_swap_pages,
				      READ_ONCE(memcg->swap.limit) -
				      page_counter_read(&memcg->swap));
	return nr_swap_pages;
}

bool mem_cgroup_swap_full(struct page *page)
{
	struct mem_cgroup *memcg;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (vm_swap_full())
		return true;
	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return false;

	memcg = page->mem_cgroup;
	if (!memcg)
		return false;

	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
			return true;

	return false;
}
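/*
 * Illustrative sketch only, not a real reclaim-path caller: one way the two
 * helpers above could be combined when deciding whether an anonymous page is
 * worth pushing to swap for a given memcg. example_should_try_swap() is a
 * made-up name for this sketch; @page must be locked because
 * mem_cgroup_swap_full() asserts PageLocked().
 */
#if 0
static bool example_should_try_swap(struct mem_cgroup *memcg,
				    struct page *page)
{
	/* No swap headroom left anywhere in this memcg's hierarchy */
	if (mem_cgroup_get_nr_swap_pages(memcg) <= 0)
		return false;

	/* Swap usage has crossed half of swap.max somewhere up the tree */
	if (mem_cgroup_swap_full(page))
		return false;

	return true;
}
#endif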
/* to remember the boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata;
#endif

static int __init enable_swap_account(char *s)
{
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);

static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}

static int swap_max_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long max = READ_ONCE(memcg->swap.limit);

	if (max == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);

	return 0;
}

static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	mutex_lock(&memcg_limit_mutex);
	err = page_counter_limit(&memcg->swap, max);
	mutex_unlock(&memcg_limit_mutex);
	if (err)
		return err;

	return nbytes;
}

static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{ }	/* terminate */
};

static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

static int __init mem_cgroup_swap_init(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account) {
		do_swap_account = 1;
		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
					       swap_files));
		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
						  memsw_cgroup_files));
	}
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_MEMCG_SWAP */
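/*
 * Illustrative userspace sketch, not kernel code and not part of this file:
 * how the swap.max knob registered above might be set from a cgroup v2
 * hierarchy. The cgroup path below is an assumption for the example; the
 * file, backed by swap_max_write(), accepts a byte count or the literal
 * string "max".
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical cgroup; adjust the path to an existing group */
	int fd = open("/sys/fs/cgroup/example/memory.swap.max", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Cap swap for the group at 512MiB; writing "max" removes the cap */
	if (write(fd, "536870912\n", 10) != 10)
		perror("write");
	close(fd);
	return 0;
}
#endif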