// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE

#include <linux/limits.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/wait.h>
#include <errno.h>
#include <sys/sysinfo.h>
#include <pthread.h>

#include "../kselftest.h"
#include "cgroup_util.h"


/*
 * Memory cgroup charging and vmstat data aggregation are performed using
 * percpu batches of 32 pages (see MEMCG_CHARGE_BATCH). So the maximum
 * discrepancy between charge and vmstat entries is the number of CPUs
 * multiplied by 32 pages multiplied by 2. A page size of 4096 bytes is
 * assumed.
 */
#define MAX_VMSTAT_ERROR (4096 * 32 * 2 * get_nprocs())

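/*
 * Allocate a bunch of negative dentries: stat() non-existent paths with
 * long, unique names. Each failed lookup leaves a negative dentry (and
 * therefore slab memory) behind. The number of lookups is passed via arg.
 */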
static int alloc_dcache(const char *cgroup, void *arg)
{
	unsigned long i;
	struct stat st;
	char buf[128];

	for (i = 0; i < (unsigned long)arg; i++) {
		snprintf(buf, sizeof(buf),
			"/something-non-existent-with-a-long-name-%64lu-%d",
			 i, getpid());
		stat(buf, &st);
	}

	return 0;
}

/*
 * This test allocates 100000 negative dentries with long names.
 * Then it checks that "slab" in memory.stat is larger than 1M.
 * Then it sets memory.high to 1M and checks that at least 1/2
 * of the slab memory has been reclaimed.
 */
static int test_kmem_basic(const char *root)
{
	int ret = KSFT_FAIL;
	char *cg = NULL;
	long slab0, slab1, current;

	cg = cg_name(root, "kmem_basic_test");
	if (!cg)
		goto cleanup;

	if (cg_create(cg))
		goto cleanup;

	if (cg_run(cg, alloc_dcache, (void *)100000))
		goto cleanup;

	slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
	if (slab0 < (1 << 20))
		goto cleanup;

	/* Setting memory.high below the current usage triggers reclaim. */
	cg_write(cg, "memory.high", "1M");
	slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
	if (slab1 <= 0)
		goto cleanup;

	current = cg_read_long(cg, "memory.current");
	if (current <= 0)
		goto cleanup;

	if (slab1 < slab0 / 2 && current < slab0 / 2)
		ret = KSFT_PASS;
cleanup:
	cg_destroy(cg);
	free(cg);

	return ret;
}

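/*
 * Thread body for alloc_kmem_smp(): create 100 negative dentries so that
 * slab allocations are spread across multiple CPUs.
 */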
static void *alloc_kmem_fn(void *arg)
{
	alloc_dcache(NULL, (void *)100);
	return NULL;
}

/*
 * Run 2 * number-of-CPUs threads, each allocating some slab memory, and
 * wait for all of them to finish.
 */
static int alloc_kmem_smp(const char *cgroup, void *arg)
{
	int nr_threads = 2 * get_nprocs();
	pthread_t *tinfo;
	unsigned long i;
	int ret = -1;

	tinfo = calloc(nr_threads, sizeof(pthread_t));
	if (tinfo == NULL)
		return -1;

	for (i = 0; i < nr_threads; i++) {
		if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
				   (void *)i)) {
			free(tinfo);
			return -1;
		}
	}

	for (i = 0; i < nr_threads; i++) {
		ret = pthread_join(tinfo[i], NULL);
		if (ret)
			break;
	}

	free(tinfo);
	return ret;
}

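/*
 * Sequentially create 'times' child cgroups of 'parent', run fn (with arg)
 * in each of them via cg_run(), and destroy each child afterwards.
 */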
static int cg_run_in_subcgroups(const char *parent,
				int (*fn)(const char *cgroup, void *arg),
				void *arg, int times)
{
	char *child;
	int i;

	for (i = 0; i < times; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			return -1;

		if (cg_create(child)) {
			cg_destroy(child);
			free(child);
			return -1;
		}

		if (cg_run(child, fn, arg)) {
			cg_destroy(child);
			free(child);
			return -1;
		}

		cg_destroy(child);
		free(child);
	}

	return 0;
}

/*
 * The test creates and destroys a large number of cgroups. In each cgroup it
 * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
 * threads. Then it checks the sanity of the numbers on the parent level:
 * the total size of the cgroups (memory.current) should roughly equal
 * anon + file + slab + kernel_stack from memory.stat.
 */
static int test_kmem_memcg_deletion(const char *root)
{
	long current, slab, anon, file, kernel_stack, sum;
	int ret = KSFT_FAIL;
	char *parent;

	parent = cg_name(root, "kmem_memcg_deletion_test");
	if (!parent)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
		goto cleanup;

	current = cg_read_long(parent, "memory.current");
	slab = cg_read_key_long(parent, "memory.stat", "slab ");
	anon = cg_read_key_long(parent, "memory.stat", "anon ");
	file = cg_read_key_long(parent, "memory.stat", "file ");
	kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack ");
	if (current < 0 || slab < 0 || anon < 0 || file < 0 ||
	    kernel_stack < 0)
		goto cleanup;

	sum = slab + anon + file + kernel_stack;
	if (labs(sum - current) < MAX_VMSTAT_ERROR) {
		ret = KSFT_PASS;
	} else {
		printf("memory.current = %ld\n", current);
		printf("slab + anon + file + kernel_stack = %ld\n", sum);
		printf("slab = %ld\n", slab);
		printf("anon = %ld\n", anon);
		printf("file = %ld\n", file);
		printf("kernel_stack = %ld\n", kernel_stack);
	}

cleanup:
	cg_destroy(parent);
	free(parent);

	return ret;
}

/*
 * The test reads the entire /proc/kpagecgroup. If the read completes
 * successfully (and the kernel doesn't panic), the test is treated as passed.
 */
static int test_kmem_proc_kpagecgroup(const char *root)
{
	unsigned long buf[128];
	int ret = KSFT_FAIL;
	ssize_t len;
	int fd;

	fd = open("/proc/kpagecgroup", O_RDONLY);
	if (fd < 0)
		return ret;

	do {
		len = read(fd, buf, sizeof(buf));
	} while (len > 0);

	if (len == 0)
		ret = KSFT_PASS;

	close(fd);
	return ret;
}

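/*
 * Thread body for spawn_1000_threads(): just sleep, so that the thread's
 * kernel stack stays allocated while the parent reads kernel_stack from
 * memory.stat.
 */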
static void *pthread_wait_fn(void *arg)
{
	sleep(100);
	return NULL;
}

static int spawn_1000_threads(const char *cgroup, void *arg)
{
	int nr_threads = 1000;
	pthread_t *tinfo;
	unsigned long i;
	long stack;
	int ret = -1;

	tinfo = calloc(nr_threads, sizeof(pthread_t));
	if (tinfo == NULL)
		return -1;

	for (i = 0; i < nr_threads; i++) {
		if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
				   (void *)i)) {
			free(tinfo);
			return -1;
		}
	}

	/* Expect at least one page (4096 bytes) of kernel stack per thread. */
	stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
	if (stack >= 4096 * 1000)
		ret = 0;

	free(tinfo);
	return ret;
}

/*
 * The test spawns a process which spawns 1000 threads. Then it checks
 * that the kernel_stack entry in memory.stat is at least 1000 pages large.
 */
static int test_kmem_kernel_stacks(const char *root)
{
	int ret = KSFT_FAIL;
	char *cg = NULL;

	cg = cg_name(root, "kmem_kernel_stacks_test");
	if (!cg)
		goto cleanup;

	if (cg_create(cg))
		goto cleanup;

	if (cg_run(cg, spawn_1000_threads, NULL))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(cg);
	free(cg);

	return ret;
}

/*
 * This test sequentially creates 30 child cgroups, allocates some
 * kernel memory in each of them, and deletes them. Then it checks
 * that the number of dying cgroups on the parent level is 0.
 */
static int test_kmem_dead_cgroups(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent;
	long dead;
	int i;

	parent = cg_name(root, "kmem_dead_cgroups_test");
	if (!parent)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
		goto cleanup;

	for (i = 0; i < 5; i++) {
		dead = cg_read_key_long(parent, "cgroup.stat",
					"nr_dying_descendants ");
		if (dead == 0) {
			ret = KSFT_PASS;
			break;
		}
		/*
		 * Reclaiming cgroups might take some time,
		 * let's wait a bit and repeat.
		 */
		sleep(1);
	}

cleanup:
	cg_destroy(parent);
	free(parent);

	return ret;
}

/*
 * This test creates a sub-tree with 1000 memory cgroups.
 * Then it checks that the memory.current on the parent level
 * is greater than 0 and approximately matches the percpu value
 * from memory.stat.
 */
static int test_percpu_basic(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long current, percpu;
	int i;

	parent = cg_name(root, "percpu_basic_test");
	if (!parent)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	for (i = 0; i < 1000; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			goto cleanup_children;

		if (cg_create(child))
			goto cleanup_children;

		free(child);
	}

	current = cg_read_long(parent, "memory.current");
	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");

	if (current > 0 && percpu > 0 && labs(current - percpu) <
	    MAX_VMSTAT_ERROR)
		ret = KSFT_PASS;
	else
		printf("memory.current %ld\npercpu %ld\n",
		       current, percpu);

cleanup_children:
	for (i = 0; i < 1000; i++) {
		child = cg_name_indexed(parent, "child", i);
		cg_destroy(child);
		free(child);
	}

cleanup:
	cg_destroy(parent);
	free(parent);

	return ret;
}

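/* Pair each test function with its name for the kselftest result output. */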
#define T(x) { x, #x }
struct kmem_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_kmem_basic),
	T(test_kmem_memcg_deletion),
	T(test_kmem_proc_kpagecgroup),
	T(test_kmem_kernel_stacks),
	T(test_kmem_dead_cgroups),
	T(test_percpu_basic),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root)))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}