1 // SPDX-License-Identifier: GPL-2.0
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <unistd.h>
6 #include <stdio.h>
7 #include <signal.h>
8 #include <sys/sysinfo.h>
9 #include <string.h>
10 #include <sys/wait.h>
11 #include <sys/mman.h>
12 
13 #include "../kselftest.h"
14 #include "cgroup_util.h"
15 
/*
 * Read one unsigned decimal number from the start of the file at @path and
 * store it in @value.
 *
 * Returns 0 on success, -1 if the file cannot be opened or no number could
 * be parsed.
 */
static int read_int(const char *path, size_t *value)
{
	FILE *file;
	int ret = 0;

	file = fopen(path, "r");
	if (!file)
		return -1;
	/* %zu is the conversion that matches a size_t destination */
	if (fscanf(file, "%zu", value) != 1)
		ret = -1;
	fclose(file);
	return ret;
}
29 
/*
 * Write @value (in kB) to /proc/sys/vm/min_free_kbytes.
 *
 * Returns the number of characters written (a positive value) on success and
 * a negative value if the file cannot be opened or the write fails.
 */
static int set_min_free_kb(size_t value)
{
	FILE *file;
	int ret;

	file = fopen("/proc/sys/vm/min_free_kbytes", "w");
	if (!file)
		return -1;
	ret = fprintf(file, "%zu\n", value);
	/*
	 * The stream is buffered, so the data may only be handed to the
	 * kernel when the stream is flushed on close; a failure there means
	 * the value was not applied.
	 */
	if (fclose(file) != 0)
		ret = -1;
	return ret;
}
42 
/* Fetch the current /proc/sys/vm/min_free_kbytes value into @value. */
static int read_min_free_kb(size_t *value)
{
	static const char min_free_path[] = "/proc/sys/vm/min_free_kbytes";

	return read_int(min_free_path, value);
}
47 
/* Fetch the zswap debugfs "stored_pages" counter into @value. */
static int get_zswap_stored_pages(size_t *value)
{
	static const char stored_path[] = "/sys/kernel/debug/zswap/stored_pages";

	return read_int(stored_path, value);
}
52 
/* Fetch the zswap debugfs "written_back_pages" counter into @value. */
static int get_zswap_written_back_pages(size_t *value)
{
	static const char wb_path[] = "/sys/kernel/debug/zswap/written_back_pages";

	return read_int(wb_path, value);
}
57 
/*
 * Allocate a buffer whose size in bytes is carried in @arg (cast from
 * size_t), write one byte every 4095 bytes of it, then free it.
 *
 * @cgroup is unused. Returns 0 on success, -1 if the allocation fails.
 */
static int allocate_bytes(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	char *mem = malloc(size);

	if (!mem)
		return -1;
	/* size_t index: an int would overflow for sizes above INT_MAX */
	for (size_t off = 0; off < size; off += 4095)
		mem[off] = 'a';
	free(mem);
	return 0;
}
70 
71 /*
72  * When trying to store a memcg page in zswap, if the memcg hits its memory
73  * limit in zswap, writeback should not be triggered.
74  *
75  * This was fixed with commit 0bdf0efa180a("zswap: do not shrink if cgroup may
76  * not zswap"). Needs to be revised when a per memcg writeback mechanism is
77  * implemented.
78  */
79 static int test_no_invasive_cgroup_shrink(const char *root)
80 {
81 	size_t written_back_before, written_back_after;
82 	int ret = KSFT_FAIL;
83 	char *test_group;
84 
85 	/* Set up */
86 	test_group = cg_name(root, "no_shrink_test");
87 	if (!test_group)
88 		goto out;
89 	if (cg_create(test_group))
90 		goto out;
91 	if (cg_write(test_group, "memory.max", "1M"))
92 		goto out;
93 	if (cg_write(test_group, "memory.zswap.max", "10K"))
94 		goto out;
95 	if (get_zswap_written_back_pages(&written_back_before))
96 		goto out;
97 
98 	/* Allocate 10x memory.max to push memory into zswap */
99 	if (cg_run(test_group, allocate_bytes, (void *)MB(10)))
100 		goto out;
101 
102 	/* Verify that no writeback happened because of the memcg allocation */
103 	if (get_zswap_written_back_pages(&written_back_after))
104 		goto out;
105 	if (written_back_after == written_back_before)
106 		ret = KSFT_PASS;
107 out:
108 	cg_destroy(test_group);
109 	free(test_group);
110 	return ret;
111 }
112 
/*
 * Arguments exchanged between test_no_kmem_bypass() and its child.
 *
 * @target_alloc_bytes: number of bytes the child should allocate and touch.
 * @child_allocated:    flag written by the child once its allocation attempt
 *                      has finished (set on failure as well as on success).
 */
struct no_kmem_bypass_child_args {
	size_t target_alloc_bytes;
	size_t child_allocated;
};

/*
 * Child body: allocate values->target_alloc_bytes, write one byte every 4095
 * bytes of it, raise values->child_allocated and then block in pause() while
 * keeping the allocation alive.
 *
 * @cgroup is unused; @arg points to a struct no_kmem_bypass_child_args.
 * Returns -1 if the allocation fails, 0 if pause() ever returns.
 */
static int no_kmem_bypass_child(const char *cgroup, void *arg)
{
	struct no_kmem_bypass_child_args *values = arg;
	char *allocation;

	allocation = malloc(values->target_alloc_bytes);
	if (!allocation) {
		/* Still raise the flag so the waiter does not spin needlessly */
		values->child_allocated = true;
		return -1;
	}
	/* size_t index keeps the comparison with the byte count unsigned */
	for (size_t off = 0; off < values->target_alloc_bytes; off += 4095)
		allocation[off] = 'a';
	values->child_allocated = true;
	pause();
	free(allocation);
	return 0;
}
135 
136 /*
137  * When pages owned by a memcg are pushed to zswap by kswapd, they should be
138  * charged to that cgroup. This wasn't the case before commit
139  * cd08d80ecdac("mm: correctly charge compressed memory to its memcg").
140  *
141  * The test first allocates memory in a memcg, then raises min_free_kbytes to
142  * a very high value so that the allocation falls below low wm, then makes
143  * another allocation to trigger kswapd that should push the memcg-owned pages
144  * to zswap and verifies that the zswap pages are correctly charged.
145  *
146  * To be run on a VM with at most 4G of memory.
147  */
148 static int test_no_kmem_bypass(const char *root)
149 {
150 	size_t min_free_kb_high, min_free_kb_low, min_free_kb_original;
151 	struct no_kmem_bypass_child_args *values;
152 	size_t trigger_allocation_size;
153 	int wait_child_iteration = 0;
154 	long stored_pages_threshold;
155 	struct sysinfo sys_info;
156 	int ret = KSFT_FAIL;
157 	int child_status;
158 	char *test_group;
159 	pid_t child_pid;
160 
161 	/* Read sys info and compute test values accordingly */
162 	if (sysinfo(&sys_info) != 0)
163 		return KSFT_FAIL;
164 	if (sys_info.totalram > 5000000000)
165 		return KSFT_SKIP;
166 	values = mmap(0, sizeof(struct no_kmem_bypass_child_args), PROT_READ |
167 			PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
168 	if (values == MAP_FAILED)
169 		return KSFT_FAIL;
170 	if (read_min_free_kb(&min_free_kb_original))
171 		return KSFT_FAIL;
172 	min_free_kb_high = sys_info.totalram / 2000;
173 	min_free_kb_low = sys_info.totalram / 500000;
174 	values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) +
175 		sys_info.totalram * 5 / 100;
176 	stored_pages_threshold = sys_info.totalram / 5 / 4096;
177 	trigger_allocation_size = sys_info.totalram / 20;
178 
179 	/* Set up test memcg */
180 	if (cg_write(root, "cgroup.subtree_control", "+memory"))
181 		goto out;
182 	test_group = cg_name(root, "kmem_bypass_test");
183 	if (!test_group)
184 		goto out;
185 
186 	/* Spawn memcg child and wait for it to allocate */
187 	set_min_free_kb(min_free_kb_low);
188 	if (cg_create(test_group))
189 		goto out;
190 	values->child_allocated = false;
191 	child_pid = cg_run_nowait(test_group, no_kmem_bypass_child, values);
192 	if (child_pid < 0)
193 		goto out;
194 	while (!values->child_allocated && wait_child_iteration++ < 10000)
195 		usleep(1000);
196 
197 	/* Try to wakeup kswapd and let it push child memory to zswap */
198 	set_min_free_kb(min_free_kb_high);
199 	for (int i = 0; i < 20; i++) {
200 		size_t stored_pages;
201 		char *trigger_allocation = malloc(trigger_allocation_size);
202 
203 		if (!trigger_allocation)
204 			break;
205 		for (int i = 0; i < trigger_allocation_size; i += 4095)
206 			trigger_allocation[i] = 'b';
207 		usleep(100000);
208 		free(trigger_allocation);
209 		if (get_zswap_stored_pages(&stored_pages))
210 			break;
211 		if (stored_pages < 0)
212 			break;
213 		/* If memory was pushed to zswap, verify it belongs to memcg */
214 		if (stored_pages > stored_pages_threshold) {
215 			int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped ");
216 			int delta = stored_pages * 4096 - zswapped;
217 			int result_ok = delta < stored_pages * 4096 / 4;
218 
219 			ret = result_ok ? KSFT_PASS : KSFT_FAIL;
220 			break;
221 		}
222 	}
223 
224 	kill(child_pid, SIGTERM);
225 	waitpid(child_pid, &child_status, 0);
226 out:
227 	set_min_free_kb(min_free_kb_original);
228 	cg_destroy(test_group);
229 	free(test_group);
230 	return ret;
231 }
232 
233 #define T(x) { x, #x }
234 struct zswap_test {
235 	int (*fn)(const char *root);
236 	const char *name;
237 } tests[] = {
238 	T(test_no_kmem_bypass),
239 	T(test_no_invasive_cgroup_shrink),
240 };
241 #undef T
242 
/* Report whether /sys/module/zswap exists, i.e. zswap is available. */
static bool zswap_configured(void)
{
	if (access("/sys/module/zswap", F_OK) != 0)
		return false;
	return true;
}
247 
248 int main(int argc, char **argv)
249 {
250 	char root[PATH_MAX];
251 	int i, ret = EXIT_SUCCESS;
252 
253 	if (cg_find_unified_root(root, sizeof(root), NULL))
254 		ksft_exit_skip("cgroup v2 isn't mounted\n");
255 
256 	if (!zswap_configured())
257 		ksft_exit_skip("zswap isn't configured\n");
258 
259 	/*
260 	 * Check that memory controller is available:
261 	 * memory is listed in cgroup.controllers
262 	 */
263 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
264 		ksft_exit_skip("memory controller isn't available\n");
265 
266 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
267 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
268 			ksft_exit_skip("Failed to set memory controller\n");
269 
270 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
271 		switch (tests[i].fn(root)) {
272 		case KSFT_PASS:
273 			ksft_test_result_pass("%s\n", tests[i].name);
274 			break;
275 		case KSFT_SKIP:
276 			ksft_test_result_skip("%s\n", tests[i].name);
277 			break;
278 		default:
279 			ret = EXIT_FAILURE;
280 			ksft_test_result_fail("%s\n", tests[i].name);
281 			break;
282 		}
283 	}
284 
285 	return ret;
286 }
287