memcontrol.c (b7c15a3ce6fea5da3aa836c897a78ac628467d54) memcontrol.c (5c041f5d1f23d3a172dd0db3215634c484b4acd6)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/* memcontrol.c - Memory Controller
3 *
4 * Copyright IBM Corporation, 2007
5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
6 *
7 * Copyright 2007 OpenVZ SWsoft Inc
8 * Author: Pavel Emelianov <xemul@openvz.org>

--- 53 unchanged lines hidden (view full) ---

62#include <linux/file.h>
63#include <linux/resume_user_mode.h>
64#include <linux/psi.h>
65#include <linux/seq_buf.h>
66#include "internal.h"
67#include <net/sock.h>
68#include <net/ip.h>
69#include "slab.h"
1// SPDX-License-Identifier: GPL-2.0-or-later
2/* memcontrol.c - Memory Controller
3 *
4 * Copyright IBM Corporation, 2007
5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
6 *
7 * Copyright 2007 OpenVZ SWsoft Inc
8 * Author: Pavel Emelianov <xemul@openvz.org>

--- 53 unchanged lines hidden (view full) ---

62#include <linux/file.h>
63#include <linux/resume_user_mode.h>
64#include <linux/psi.h>
65#include <linux/seq_buf.h>
66#include "internal.h"
67#include <net/sock.h>
68#include <net/ip.h>
69#include "slab.h"
70#include "swap.h"
70
71#include <linux/uaccess.h>
72
73#include <trace/events/vmscan.h>
74
75struct cgroup_subsys memory_cgrp_subsys __read_mostly;
76EXPORT_SYMBOL(memory_cgrp_subsys);
77

--- 6 unchanged lines hidden (view full) ---

84/* Socket memory accounting disabled? */
85static bool cgroup_memory_nosocket __ro_after_init;
86
87/* Kernel memory accounting disabled? */
88static bool cgroup_memory_nokmem __ro_after_init;
89
90/* Whether the swap controller is active */
91#ifdef CONFIG_MEMCG_SWAP
71
72#include <linux/uaccess.h>
73
74#include <trace/events/vmscan.h>
75
76struct cgroup_subsys memory_cgrp_subsys __read_mostly;
77EXPORT_SYMBOL(memory_cgrp_subsys);
78

--- 6 unchanged lines hidden (view full) ---

85/* Socket memory accounting disabled? */
86static bool cgroup_memory_nosocket __ro_after_init;
87
88/* Kernel memory accounting disabled? */
89static bool cgroup_memory_nokmem __ro_after_init;
90
91/* Whether the swap controller is active */
92#ifdef CONFIG_MEMCG_SWAP
92bool cgroup_memory_noswap __ro_after_init;
93static bool cgroup_memory_noswap __ro_after_init;
93#else
94#define cgroup_memory_noswap 1
95#endif
96
97#ifdef CONFIG_CGROUP_WRITEBACK
98static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
99#endif
100

--- 103 unchanged lines hidden (view full) ---

204 */
205#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
206#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
207
208/* for encoding cft->private value on file */
209enum res_type {
210 _MEM,
211 _MEMSWAP,
94#else
95#define cgroup_memory_noswap 1
96#endif
97
98#ifdef CONFIG_CGROUP_WRITEBACK
99static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
100#endif
101

--- 103 unchanged lines hidden (view full) ---

205 */
206#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
207#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
208
209/* for encoding cft->private value on file */
210enum res_type {
211 _MEM,
212 _MEMSWAP,
212 _OOM_TYPE,
213 _KMEM,
214 _TCP,
215};
216
217#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
218#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
219#define MEMFILE_ATTR(val) ((val) & 0xffff)
213 _KMEM,
214 _TCP,
215};
216
217#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
218#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
219#define MEMFILE_ATTR(val) ((val) & 0xffff)
220/* Used for OOM notifier */
221#define OOM_CONTROL (0)
222
223/*
224 * Iteration constructs for visiting all cgroups (under a tree). If
225 * loops are exited prematurely (break), mem_cgroup_iter_break() must
226 * be used for reference counting.
227 */
228#define for_each_mem_cgroup_tree(iter, root) \
229 for (iter = mem_cgroup_iter(root, NULL, NULL); \

--- 778 unchanged lines hidden (view full) ---

1008 struct mem_cgroup *pos = NULL;
1009
1010 if (mem_cgroup_disabled())
1011 return NULL;
1012
1013 if (!root)
1014 root = root_mem_cgroup;
1015
220
221/*
222 * Iteration constructs for visiting all cgroups (under a tree). If
223 * loops are exited prematurely (break), mem_cgroup_iter_break() must
224 * be used for reference counting.
225 */
226#define for_each_mem_cgroup_tree(iter, root) \
227 for (iter = mem_cgroup_iter(root, NULL, NULL); \

--- 778 unchanged lines hidden (view full) ---

1006 struct mem_cgroup *pos = NULL;
1007
1008 if (mem_cgroup_disabled())
1009 return NULL;
1010
1011 if (!root)
1012 root = root_mem_cgroup;
1013
1016 if (prev && !reclaim)
1017 pos = prev;
1018
1019 rcu_read_lock();
1020
1021 if (reclaim) {
1022 struct mem_cgroup_per_node *mz;
1023
1024 mz = root->nodeinfo[reclaim->pgdat->node_id];
1025 iter = &mz->iter;
1026
1014 rcu_read_lock();
1015
1016 if (reclaim) {
1017 struct mem_cgroup_per_node *mz;
1018
1019 mz = root->nodeinfo[reclaim->pgdat->node_id];
1020 iter = &mz->iter;
1021
1027 if (prev && reclaim->generation != iter->generation)
1022 /*
1023 * On start, join the current reclaim iteration cycle.
1024 * Exit when a concurrent walker completes it.
1025 */
1026 if (!prev)
1027 reclaim->generation = iter->generation;
1028 else if (reclaim->generation != iter->generation)
1028 goto out_unlock;
1029
1030 while (1) {
1031 pos = READ_ONCE(iter->position);
1032 if (!pos || css_tryget(&pos->css))
1033 break;
1034 /*
1035 * css reference reached zero, so iter->position will
1036 * be cleared by ->css_released. However, we should not
1037 * rely on this happening soon, because ->css_released
1038 * is called from a work queue, and by busy-waiting we
1039 * might block it. So we clear iter->position right
1040 * away.
1041 */
1042 (void)cmpxchg(&iter->position, pos, NULL);
1043 }
1029 goto out_unlock;
1030
1031 while (1) {
1032 pos = READ_ONCE(iter->position);
1033 if (!pos || css_tryget(&pos->css))
1034 break;
1035 /*
1036 * css reference reached zero, so iter->position will
1037 * be cleared by ->css_released. However, we should not
1038 * rely on this happening soon, because ->css_released
1039 * is called from a work queue, and by busy-waiting we
1040 * might block it. So we clear iter->position right
1041 * away.
1042 */
1043 (void)cmpxchg(&iter->position, pos, NULL);
1044 }
1045 } else if (prev) {
1046 pos = prev;
1044 }
1045
1046 if (pos)
1047 css = &pos->css;
1048
1049 for (;;) {
1050 css = css_next_descendant_pre(css, &root->css);
1051 if (!css) {

--- 8 unchanged lines hidden (view full) ---

1060 break;
1061 }
1062
1063 /*
1064 * Verify the css and acquire a reference. The root
1065 * is provided by the caller, so we know it's alive
1066 * and kicking, and don't take an extra reference.
1067 */
1047 }
1048
1049 if (pos)
1050 css = &pos->css;
1051
1052 for (;;) {
1053 css = css_next_descendant_pre(css, &root->css);
1054 if (!css) {

--- 8 unchanged lines hidden (view full) ---

1063 break;
1064 }
1065
1066 /*
1067 * Verify the css and acquire a reference. The root
1068 * is provided by the caller, so we know it's alive
1069 * and kicking, and don't take an extra reference.
1070 */
1068 memcg = mem_cgroup_from_css(css);
1069
1070 if (css == &root->css)
1071 if (css == &root->css || css_tryget(css)) {
1072 memcg = mem_cgroup_from_css(css);
1071 break;
1073 break;
1072
1073 if (css_tryget(css))
1074 break;
1075
1076 memcg = NULL;
1074 }
1077 }
1078
1079 if (reclaim) {
1080 /*
1081 * The position could have already been updated by a competing
1082 * thread, so check that the value hasn't changed since we read
1083 * it to avoid reclaiming from the same cgroup twice.
1084 */
1085 (void)cmpxchg(&iter->position, pos, memcg);
1086
1087 if (pos)
1088 css_put(&pos->css);
1089
1090 if (!memcg)
1091 iter->generation++;
1075 }
1076
1077 if (reclaim) {
1078 /*
1079 * The position could have already been updated by a competing
1080 * thread, so check that the value hasn't changed since we read
1081 * it to avoid reclaiming from the same cgroup twice.
1082 */
1083 (void)cmpxchg(&iter->position, pos, memcg);
1084
1085 if (pos)
1086 css_put(&pos->css);
1087
1088 if (!memcg)
1089 iter->generation++;
1092 else if (!prev)
1093 reclaim->generation = iter->generation;
1094 }
1095
1096out_unlock:
1097 rcu_read_unlock();
1098 if (prev && prev != root)
1099 css_put(&prev->css);
1100
1101 return memcg;

--- 2280 unchanged lines hidden (view full) ---

3382 unsigned long *total_scanned)
3383{
3384 unsigned long nr_reclaimed = 0;
3385 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3386 unsigned long reclaimed;
3387 int loop = 0;
3388 struct mem_cgroup_tree_per_node *mctz;
3389 unsigned long excess;
1090 }
1091
1092out_unlock:
1093 rcu_read_unlock();
1094 if (prev && prev != root)
1095 css_put(&prev->css);
1096
1097 return memcg;

--- 2280 unchanged lines hidden (view full) ---

3378 unsigned long *total_scanned)
3379{
3380 unsigned long nr_reclaimed = 0;
3381 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3382 unsigned long reclaimed;
3383 int loop = 0;
3384 struct mem_cgroup_tree_per_node *mctz;
3385 unsigned long excess;
3390 unsigned long nr_scanned;
3391
3392 if (order > 0)
3393 return 0;
3394
3395 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
3396
3397 /*
3398 * Do not even bother to check the largest node if the root

--- 11 unchanged lines hidden (view full) ---

3410 do {
3411 if (next_mz)
3412 mz = next_mz;
3413 else
3414 mz = mem_cgroup_largest_soft_limit_node(mctz);
3415 if (!mz)
3416 break;
3417
3386
3387 if (order > 0)
3388 return 0;
3389
3390 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
3391
3392 /*
3393 * Do not even bother to check the largest node if the root

--- 11 unchanged lines hidden (view full) ---

3405 do {
3406 if (next_mz)
3407 mz = next_mz;
3408 else
3409 mz = mem_cgroup_largest_soft_limit_node(mctz);
3410 if (!mz)
3411 break;
3412
3418 nr_scanned = 0;
3419 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3413 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3420 gfp_mask, &nr_scanned);
3414 gfp_mask, total_scanned);
3421 nr_reclaimed += reclaimed;
3415 nr_reclaimed += reclaimed;
3422 *total_scanned += nr_scanned;
3423 spin_lock_irq(&mctz->lock);
3416 spin_lock_irq(&mctz->lock);
3424 __mem_cgroup_remove_exceeded(mz, mctz);
3425
3426 /*
3427 * If we failed to reclaim anything from this memory cgroup
3428 * it is time to move on to the next cgroup
3429 */
3430 next_mz = NULL;
3431 if (!reclaimed)
3432 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

--- 1455 unchanged lines hidden (view full) ---

4888 .name = "move_charge_at_immigrate",
4889 .read_u64 = mem_cgroup_move_charge_read,
4890 .write_u64 = mem_cgroup_move_charge_write,
4891 },
4892 {
4893 .name = "oom_control",
4894 .seq_show = mem_cgroup_oom_control_read,
4895 .write_u64 = mem_cgroup_oom_control_write,
3417
3418 /*
3419 * If we failed to reclaim anything from this memory cgroup
3420 * it is time to move on to the next cgroup
3421 */
3422 next_mz = NULL;
3423 if (!reclaimed)
3424 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

--- 1455 unchanged lines hidden (view full) ---

4880 .name = "move_charge_at_immigrate",
4881 .read_u64 = mem_cgroup_move_charge_read,
4882 .write_u64 = mem_cgroup_move_charge_write,
4883 },
4884 {
4885 .name = "oom_control",
4886 .seq_show = mem_cgroup_oom_control_read,
4887 .write_u64 = mem_cgroup_oom_control_write,
4896 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4897 },
4898 {
4899 .name = "pressure_level",
4900 },
4901#ifdef CONFIG_NUMA
4902 {
4903 .name = "numa_stat",
4904 .seq_show = memcg_numa_stat_show,

--- 739 unchanged lines hidden (view full) ---

5644 unsigned long addr, pte_t ptent, union mc_target *target)
5645{
5646 struct page *page = NULL;
5647 enum mc_target_type ret = MC_TARGET_NONE;
5648 swp_entry_t ent = { .val = 0 };
5649
5650 if (pte_present(ptent))
5651 page = mc_handle_present_pte(vma, addr, ptent);
4888 },
4889 {
4890 .name = "pressure_level",
4891 },
4892#ifdef CONFIG_NUMA
4893 {
4894 .name = "numa_stat",
4895 .seq_show = memcg_numa_stat_show,

--- 739 unchanged lines hidden (view full) ---

5635 unsigned long addr, pte_t ptent, union mc_target *target)
5636{
5637 struct page *page = NULL;
5638 enum mc_target_type ret = MC_TARGET_NONE;
5639 swp_entry_t ent = { .val = 0 };
5640
5641 if (pte_present(ptent))
5642 page = mc_handle_present_pte(vma, addr, ptent);
5643 else if (pte_none_mostly(ptent))
5644 /*
5645 * PTE markers should be treated as a none pte here, separated
5646 * from other swap handling below.
5647 */
5648 page = mc_handle_file_pte(vma, addr, ptent);
5652 else if (is_swap_pte(ptent))
5653 page = mc_handle_swap_pte(vma, ptent, &ent);
5649 else if (is_swap_pte(ptent))
5650 page = mc_handle_swap_pte(vma, ptent, &ent);
5654 else if (pte_none(ptent))
5655 page = mc_handle_file_pte(vma, addr, ptent);
5656
5657 if (!page && !ent.val)
5658 return ret;
5659 if (page) {
5660 /*
5661 * Do only loose check w/o serialization.
5662 * mem_cgroup_move_account() checks the page is valid or
5663 * not under LRU exclusion.

--- 696 unchanged lines hidden (view full) ---

6360 if (oom_group != 0 && oom_group != 1)
6361 return -EINVAL;
6362
6363 memcg->oom_group = oom_group;
6364
6365 return nbytes;
6366}
6367
5651
5652 if (!page && !ent.val)
5653 return ret;
5654 if (page) {
5655 /*
5656 * Do only loose check w/o serialization.
5657 * mem_cgroup_move_account() checks the page is valid or
5658 * not under LRU exclusion.

--- 696 unchanged lines hidden (view full) ---

6355 if (oom_group != 0 && oom_group != 1)
6356 return -EINVAL;
6357
6358 memcg->oom_group = oom_group;
6359
6360 return nbytes;
6361}
6362
6363static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
6364 size_t nbytes, loff_t off)
6365{
6366 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6367 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6368 unsigned long nr_to_reclaim, nr_reclaimed = 0;
6369 int err;
6370
6371 buf = strstrip(buf);
6372 err = page_counter_memparse(buf, "", &nr_to_reclaim);
6373 if (err)
6374 return err;
6375
6376 while (nr_reclaimed < nr_to_reclaim) {
6377 unsigned long reclaimed;
6378
6379 if (signal_pending(current))
6380 return -EINTR;
6381
6382 /*
6383 * This is the final attempt, drain percpu lru caches in the
6384 * hope of introducing more evictable pages for
6385 * try_to_free_mem_cgroup_pages().
6386 */
6387 if (!nr_retries)
6388 lru_add_drain_all();
6389
6390 reclaimed = try_to_free_mem_cgroup_pages(memcg,
6391 nr_to_reclaim - nr_reclaimed,
6392 GFP_KERNEL, true);
6393
6394 if (!reclaimed && !nr_retries--)
6395 return -EAGAIN;
6396
6397 nr_reclaimed += reclaimed;
6398 }
6399
6400 return nbytes;
6401}
6402
6368static struct cftype memory_files[] = {
6369 {
6370 .name = "current",
6371 .flags = CFTYPE_NOT_ON_ROOT,
6372 .read_u64 = memory_current_read,
6373 },
6374 {
6375 .name = "min",

--- 42 unchanged lines hidden (view full) ---

6418 },
6419#endif
6420 {
6421 .name = "oom.group",
6422 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6423 .seq_show = memory_oom_group_show,
6424 .write = memory_oom_group_write,
6425 },
6403static struct cftype memory_files[] = {
6404 {
6405 .name = "current",
6406 .flags = CFTYPE_NOT_ON_ROOT,
6407 .read_u64 = memory_current_read,
6408 },
6409 {
6410 .name = "min",

--- 42 unchanged lines hidden (view full) ---

6453 },
6454#endif
6455 {
6456 .name = "oom.group",
6457 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6458 .seq_show = memory_oom_group_show,
6459 .write = memory_oom_group_write,
6460 },
6461 {
6462 .name = "reclaim",
6463 .flags = CFTYPE_NS_DELEGATABLE,
6464 .write = memory_reclaim,
6465 },
6426 { } /* terminate */
6427};
6428
6429struct cgroup_subsys memory_cgrp_subsys = {
6430 .css_alloc = mem_cgroup_css_alloc,
6431 .css_online = mem_cgroup_css_online,
6432 .css_offline = mem_cgroup_css_offline,
6433 .css_released = mem_cgroup_css_released,

--- 154 unchanged lines hidden (view full) ---

6588 if (memcg == root)
6589 return;
6590
6591 usage = page_counter_read(&memcg->memory);
6592 if (!usage)
6593 return;
6594
6595 parent = parent_mem_cgroup(memcg);
6466 { } /* terminate */
6467};
6468
6469struct cgroup_subsys memory_cgrp_subsys = {
6470 .css_alloc = mem_cgroup_css_alloc,
6471 .css_online = mem_cgroup_css_online,
6472 .css_offline = mem_cgroup_css_offline,
6473 .css_released = mem_cgroup_css_released,

--- 154 unchanged lines hidden (view full) ---

6628 if (memcg == root)
6629 return;
6630
6631 usage = page_counter_read(&memcg->memory);
6632 if (!usage)
6633 return;
6634
6635 parent = parent_mem_cgroup(memcg);
6596 /* No parent means a non-hierarchical mode on v1 memcg */
6597 if (!parent)
6598 return;
6599
6600 if (parent == root) {
6601 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6602 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6603 return;
6604 }
6605
6606 parent_usage = page_counter_read(&parent->memory);

--- 791 unchanged lines hidden ---
6636
6637 if (parent == root) {
6638 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6639 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6640 return;
6641 }
6642
6643 parent_usage = page_counter_read(&parent->memory);

--- 791 unchanged lines hidden ---