1 // SPDX-License-Identifier: GPL-2.0
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <fcntl.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <sys/stat.h>
10 #include <sys/types.h>
11 #include <unistd.h>
12 #include <sys/wait.h>
13 #include <errno.h>
14 #include <sys/sysinfo.h>
15 #include <pthread.h>
16 
17 #include "../kselftest.h"
18 #include "cgroup_util.h"
19 
20 
/*
 * Tolerance, in bytes, used when comparing memory.current against
 * values derived from memory.stat.
 *
 * Charging is batched per cpu in units of 64 pages (see
 * MEMCG_CHARGE_BATCH), while memory.stat is exact, so the two views can
 * disagree by up to 64 pages for every cpu. A page is taken to be
 * 4096 bytes here.
 */
#define MAX_VMSTAT_ERROR (64 * 4096 * get_nprocs())
28 
29 
/*
 * Look up a series of path names that are not expected to exist.
 *
 * @cgroup is unused. @arg is not dereferenced: it carries the number of
 * lookups to perform, encoded as an integer. Every name embeds the loop
 * counter (padded to 64 characters) and the pid of the calling process.
 * The result of stat() is ignored on purpose; the callers in this file
 * describe these lookups as a way to create negative dentries.
 *
 * Always returns 0.
 */
static int alloc_dcache(const char *cgroup, void *arg)
{
	const unsigned long count = (unsigned long)arg;
	unsigned long n = 0;
	char path[128];
	struct stat sb;

	while (n < count) {
		snprintf(path, sizeof(path),
			"/something-non-existent-with-a-long-name-%64lu-%d",
			 n, getpid());
		stat(path, &sb);
		n++;
	}

	return 0;
}
45 
46 /*
47  * This test allocates 100000 of negative dentries with long names.
48  * Then it checks that "slab" in memory.stat is larger than 1M.
49  * Then it sets memory.high to 1M and checks that at least 1/2
50  * of slab memory has been reclaimed.
51  */
52 static int test_kmem_basic(const char *root)
53 {
54 	int ret = KSFT_FAIL;
55 	char *cg = NULL;
56 	long slab0, slab1, current;
57 
58 	cg = cg_name(root, "kmem_basic_test");
59 	if (!cg)
60 		goto cleanup;
61 
62 	if (cg_create(cg))
63 		goto cleanup;
64 
65 	if (cg_run(cg, alloc_dcache, (void *)100000))
66 		goto cleanup;
67 
68 	slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
69 	if (slab0 < (1 << 20))
70 		goto cleanup;
71 
72 	cg_write(cg, "memory.high", "1M");
73 
74 	/* wait for RCU freeing */
75 	sleep(1);
76 
77 	slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
78 	if (slab1 < 0)
79 		goto cleanup;
80 
81 	current = cg_read_long(cg, "memory.current");
82 	if (current < 0)
83 		goto cleanup;
84 
85 	if (slab1 < slab0 / 2 && current < slab0 / 2)
86 		ret = KSFT_PASS;
87 cleanup:
88 	cg_destroy(cg);
89 	free(cg);
90 
91 	return ret;
92 }
93 
/*
 * Thread body: performs alloc_dcache() with a count of 100.
 * @arg is ignored and the returned value is always NULL.
 */
static void *alloc_kmem_fn(void *arg)
{
	const unsigned long nr_lookups = 100;

	alloc_dcache(NULL, (void *)nr_lookups);
	return NULL;
}
99 
100 static int alloc_kmem_smp(const char *cgroup, void *arg)
101 {
102 	int nr_threads = 2 * get_nprocs();
103 	pthread_t *tinfo;
104 	unsigned long i;
105 	int ret = -1;
106 
107 	tinfo = calloc(nr_threads, sizeof(pthread_t));
108 	if (tinfo == NULL)
109 		return -1;
110 
111 	for (i = 0; i < nr_threads; i++) {
112 		if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
113 				   (void *)i)) {
114 			free(tinfo);
115 			return -1;
116 		}
117 	}
118 
119 	for (i = 0; i < nr_threads; i++) {
120 		ret = pthread_join(tinfo[i], NULL);
121 		if (ret)
122 			break;
123 	}
124 
125 	free(tinfo);
126 	return ret;
127 }
128 
/*
 * Run @fn in @times child cgroups of @parent, one after another.
 *
 * For every index in [0, @times) a child is named with
 * cg_name_indexed(parent, "child", index) and created, @fn is run in it
 * through cg_run() with @arg as its argument, and the child is destroyed
 * again before the next one is set up.
 *
 * Returns 0 if every iteration succeeded and -1 as soon as naming,
 * creating or running fails. The child of a failing iteration is still
 * destroyed and its name freed.
 */
static int cg_run_in_subcgroups(const char *parent,
				int (*fn)(const char *cgroup, void *arg),
				void *arg, int times)
{
	char *child;
	int failed;
	int i;

	for (i = 0; i < times; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			return -1;

		/*
		 * Forward the caller's argument. Handing NULL to a callback
		 * that reads a count from its argument, such as
		 * alloc_dcache(), would make it do no work at all.
		 */
		failed = cg_create(child) || cg_run(child, fn, arg);

		cg_destroy(child);
		free(child);

		if (failed)
			return -1;
	}

	return 0;
}
159 
160 /*
161  * The test creates and destroys a large number of cgroups. In each cgroup it
162  * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
163  * threads. Then it checks the sanity of numbers on the parent level:
164  * the total size of the cgroups should be roughly equal to
165  * anon + file + kernel + sock.
166  */
167 static int test_kmem_memcg_deletion(const char *root)
168 {
169 	long current, anon, file, kernel, sock, sum;
170 	int ret = KSFT_FAIL;
171 	char *parent;
172 
173 	parent = cg_name(root, "kmem_memcg_deletion_test");
174 	if (!parent)
175 		goto cleanup;
176 
177 	if (cg_create(parent))
178 		goto cleanup;
179 
180 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
181 		goto cleanup;
182 
183 	if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
184 		goto cleanup;
185 
186 	current = cg_read_long(parent, "memory.current");
187 	anon = cg_read_key_long(parent, "memory.stat", "anon ");
188 	file = cg_read_key_long(parent, "memory.stat", "file ");
189 	kernel = cg_read_key_long(parent, "memory.stat", "kernel ");
190 	sock = cg_read_key_long(parent, "memory.stat", "sock ");
191 	if (current < 0 || anon < 0 || file < 0 || kernel < 0 || sock < 0)
192 		goto cleanup;
193 
194 	sum = anon + file + kernel + sock;
195 	if (abs(sum - current) < MAX_VMSTAT_ERROR) {
196 		ret = KSFT_PASS;
197 	} else {
198 		printf("memory.current = %ld\n", current);
199 		printf("anon + file + kernel + sock = %ld\n", sum);
200 		printf("anon = %ld\n", anon);
201 		printf("file = %ld\n", file);
202 		printf("kernel = %ld\n", kernel);
203 		printf("sock = %ld\n", sock);
204 	}
205 
206 cleanup:
207 	cg_destroy(parent);
208 	free(parent);
209 
210 	return ret;
211 }
212 
213 /*
214  * The test reads the entire /proc/kpagecgroup. If the operation went
215  * successfully (and the kernel didn't panic), the test is treated as passed.
216  */
217 static int test_kmem_proc_kpagecgroup(const char *root)
218 {
219 	unsigned long buf[128];
220 	int ret = KSFT_FAIL;
221 	ssize_t len;
222 	int fd;
223 
224 	fd = open("/proc/kpagecgroup", O_RDONLY);
225 	if (fd < 0)
226 		return ret;
227 
228 	do {
229 		len = read(fd, buf, sizeof(buf));
230 	} while (len > 0);
231 
232 	if (len == 0)
233 		ret = KSFT_PASS;
234 
235 	close(fd);
236 	return ret;
237 }
238 
/*
 * Thread body that simply sleeps for 100 seconds so that the thread
 * stays around. @arg is ignored; the returned value is always NULL.
 */
static void *pthread_wait_fn(void *arg)
{
	const unsigned int seconds = 100;

	sleep(seconds);
	return NULL;
}
244 
245 static int spawn_1000_threads(const char *cgroup, void *arg)
246 {
247 	int nr_threads = 1000;
248 	pthread_t *tinfo;
249 	unsigned long i;
250 	long stack;
251 	int ret = -1;
252 
253 	tinfo = calloc(nr_threads, sizeof(pthread_t));
254 	if (tinfo == NULL)
255 		return -1;
256 
257 	for (i = 0; i < nr_threads; i++) {
258 		if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
259 				   (void *)i)) {
260 			free(tinfo);
261 			return(-1);
262 		}
263 	}
264 
265 	stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
266 	if (stack >= 4096 * 1000)
267 		ret = 0;
268 
269 	free(tinfo);
270 	return ret;
271 }
272 
273 /*
274  * The test spawns a process, which spawns 1000 threads. Then it checks
275  * that memory.stat's kernel_stack is at least 1000 pages large.
276  */
277 static int test_kmem_kernel_stacks(const char *root)
278 {
279 	int ret = KSFT_FAIL;
280 	char *cg = NULL;
281 
282 	cg = cg_name(root, "kmem_kernel_stacks_test");
283 	if (!cg)
284 		goto cleanup;
285 
286 	if (cg_create(cg))
287 		goto cleanup;
288 
289 	if (cg_run(cg, spawn_1000_threads, NULL))
290 		goto cleanup;
291 
292 	ret = KSFT_PASS;
293 cleanup:
294 	cg_destroy(cg);
295 	free(cg);
296 
297 	return ret;
298 }
299 
300 /*
301  * This test sequentionally creates 30 child cgroups, allocates some
302  * kernel memory in each of them, and deletes them. Then it checks
303  * that the number of dying cgroups on the parent level is 0.
304  */
305 static int test_kmem_dead_cgroups(const char *root)
306 {
307 	int ret = KSFT_FAIL;
308 	char *parent;
309 	long dead;
310 	int i;
311 
312 	parent = cg_name(root, "kmem_dead_cgroups_test");
313 	if (!parent)
314 		goto cleanup;
315 
316 	if (cg_create(parent))
317 		goto cleanup;
318 
319 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
320 		goto cleanup;
321 
322 	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
323 		goto cleanup;
324 
325 	for (i = 0; i < 5; i++) {
326 		dead = cg_read_key_long(parent, "cgroup.stat",
327 					"nr_dying_descendants ");
328 		if (dead == 0) {
329 			ret = KSFT_PASS;
330 			break;
331 		}
332 		/*
333 		 * Reclaiming cgroups might take some time,
334 		 * let's wait a bit and repeat.
335 		 */
336 		sleep(1);
337 	}
338 
339 cleanup:
340 	cg_destroy(parent);
341 	free(parent);
342 
343 	return ret;
344 }
345 
346 /*
347  * This test creates a sub-tree with 1000 memory cgroups.
348  * Then it checks that the memory.current on the parent level
349  * is greater than 0 and approximates matches the percpu value
350  * from memory.stat.
351  */
352 static int test_percpu_basic(const char *root)
353 {
354 	int ret = KSFT_FAIL;
355 	char *parent, *child;
356 	long current, percpu;
357 	int i;
358 
359 	parent = cg_name(root, "percpu_basic_test");
360 	if (!parent)
361 		goto cleanup;
362 
363 	if (cg_create(parent))
364 		goto cleanup;
365 
366 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
367 		goto cleanup;
368 
369 	for (i = 0; i < 1000; i++) {
370 		child = cg_name_indexed(parent, "child", i);
371 		if (!child)
372 			return -1;
373 
374 		if (cg_create(child))
375 			goto cleanup_children;
376 
377 		free(child);
378 	}
379 
380 	current = cg_read_long(parent, "memory.current");
381 	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
382 
383 	if (current > 0 && percpu > 0 && abs(current - percpu) <
384 	    MAX_VMSTAT_ERROR)
385 		ret = KSFT_PASS;
386 	else
387 		printf("memory.current %ld\npercpu %ld\n",
388 		       current, percpu);
389 
390 cleanup_children:
391 	for (i = 0; i < 1000; i++) {
392 		child = cg_name_indexed(parent, "child", i);
393 		cg_destroy(child);
394 		free(child);
395 	}
396 
397 cleanup:
398 	cg_destroy(parent);
399 	free(parent);
400 
401 	return ret;
402 }
403 
404 #define T(x) { x, #x }
405 struct kmem_test {
406 	int (*fn)(const char *root);
407 	const char *name;
408 } tests[] = {
409 	T(test_kmem_basic),
410 	T(test_kmem_memcg_deletion),
411 	T(test_kmem_proc_kpagecgroup),
412 	T(test_kmem_kernel_stacks),
413 	T(test_kmem_dead_cgroups),
414 	T(test_percpu_basic),
415 };
416 #undef T
417 
418 int main(int argc, char **argv)
419 {
420 	char root[PATH_MAX];
421 	int i, ret = EXIT_SUCCESS;
422 
423 	if (cg_find_unified_root(root, sizeof(root), NULL))
424 		ksft_exit_skip("cgroup v2 isn't mounted\n");
425 
426 	/*
427 	 * Check that memory controller is available:
428 	 * memory is listed in cgroup.controllers
429 	 */
430 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
431 		ksft_exit_skip("memory controller isn't available\n");
432 
433 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
434 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
435 			ksft_exit_skip("Failed to set memory controller\n");
436 
437 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
438 		switch (tests[i].fn(root)) {
439 		case KSFT_PASS:
440 			ksft_test_result_pass("%s\n", tests[i].name);
441 			break;
442 		case KSFT_SKIP:
443 			ksft_test_result_skip("%s\n", tests[i].name);
444 			break;
445 		default:
446 			ret = EXIT_FAILURE;
447 			ksft_test_result_fail("%s\n", tests[i].name);
448 			break;
449 		}
450 	}
451 
452 	return ret;
453 }
454