1 // SPDX-License-Identifier: GPL-2.0
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <fcntl.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <sys/stat.h>
10 #include <sys/types.h>
11 #include <unistd.h>
12 #include <sys/wait.h>
13 #include <errno.h>
14 #include <sys/sysinfo.h>
15 #include <pthread.h>
16 
17 #include "../kselftest.h"
18 #include "cgroup_util.h"
19 
20 
/*
 * Memory cgroup charging is performed using percpu batches 64 pages
 * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So
 * the maximum discrepancy between charge and vmstat entries is number
 * of cpus multiplied by 64 pages.
 *
 * NOTE(review): the 4096 below presumably stands for the page size —
 * confirm the tolerance on configurations with non-4K pages.
 */
#define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs())
28 
29 
/*
 * Populate the dentry cache with negative dentries by stat()ing a
 * number of non-existent files with long names.
 *
 * @cgroup: unused, present to match the cg_run() callback signature
 * @arg: number of lookups to perform, passed as a cast unsigned long
 *
 * Always returns 0.
 */
static int alloc_dcache(const char *cgroup, void *arg)
{
	unsigned long nr_lookups = (unsigned long)arg;
	unsigned long n;
	struct stat st;
	char path[128];

	for (n = 0; n < nr_lookups; n++) {
		/*
		 * The lookup is expected to fail; only the negative
		 * dentry it leaves behind matters, so the return value
		 * of stat() is deliberately ignored.
		 */
		snprintf(path, sizeof(path),
			 "/something-non-existent-with-a-long-name-%64lu-%d",
			 n, getpid());
		stat(path, &st);
	}

	return 0;
}
45 
46 /*
47  * This test allocates 100000 of negative dentries with long names.
48  * Then it checks that "slab" in memory.stat is larger than 1M.
49  * Then it sets memory.high to 1M and checks that at least 1/2
50  * of slab memory has been reclaimed.
51  */
52 static int test_kmem_basic(const char *root)
53 {
54 	int ret = KSFT_FAIL;
55 	char *cg = NULL;
56 	long slab0, slab1, current;
57 
58 	cg = cg_name(root, "kmem_basic_test");
59 	if (!cg)
60 		goto cleanup;
61 
62 	if (cg_create(cg))
63 		goto cleanup;
64 
65 	if (cg_run(cg, alloc_dcache, (void *)100000))
66 		goto cleanup;
67 
68 	slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
69 	if (slab0 < (1 << 20))
70 		goto cleanup;
71 
72 	cg_write(cg, "memory.high", "1M");
73 
74 	/* wait for RCU freeing */
75 	sleep(1);
76 
77 	slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
78 	if (slab1 <= 0)
79 		goto cleanup;
80 
81 	current = cg_read_long(cg, "memory.current");
82 	if (current <= 0)
83 		goto cleanup;
84 
85 	if (slab1 < slab0 / 2 && current < slab0 / 2)
86 		ret = KSFT_PASS;
87 cleanup:
88 	cg_destroy(cg);
89 	free(cg);
90 
91 	return ret;
92 }
93 
/* pthread entry point: create 100 negative dentries. @arg is unused. */
static void *alloc_kmem_fn(void *arg)
{
	(void)arg;

	alloc_dcache(NULL, (void *)100);
	return NULL;
}
99 
100 static int alloc_kmem_smp(const char *cgroup, void *arg)
101 {
102 	int nr_threads = 2 * get_nprocs();
103 	pthread_t *tinfo;
104 	unsigned long i;
105 	int ret = -1;
106 
107 	tinfo = calloc(nr_threads, sizeof(pthread_t));
108 	if (tinfo == NULL)
109 		return -1;
110 
111 	for (i = 0; i < nr_threads; i++) {
112 		if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
113 				   (void *)i)) {
114 			free(tinfo);
115 			return -1;
116 		}
117 	}
118 
119 	for (i = 0; i < nr_threads; i++) {
120 		ret = pthread_join(tinfo[i], NULL);
121 		if (ret)
122 			break;
123 	}
124 
125 	free(tinfo);
126 	return ret;
127 }
128 
/*
 * Create @times sequentially-indexed child cgroups under @parent, run
 * @fn with @arg in each of them via cg_run(), and destroy the child
 * again.
 *
 * Returns 0 on success, -1 on the first failure (already-processed
 * children have been destroyed and freed at that point).
 *
 * Fix: @arg used to be silently dropped (cg_run() was called with
 * NULL), so callers passing a real argument never had it delivered.
 */
static int cg_run_in_subcgroups(const char *parent,
				int (*fn)(const char *cgroup, void *arg),
				void *arg, int times)
{
	char *child;
	int i;

	for (i = 0; i < times; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			return -1;

		if (cg_create(child)) {
			cg_destroy(child);
			free(child);
			return -1;
		}

		if (cg_run(child, fn, arg)) {
			cg_destroy(child);
			free(child);
			return -1;
		}

		cg_destroy(child);
		free(child);
	}

	return 0;
}
159 
160 /*
161  * The test creates and destroys a large number of cgroups. In each cgroup it
162  * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
163  * threads. Then it checks the sanity of numbers on the parent level:
164  * the total size of the cgroups should be roughly equal to
165  * anon + file + slab + kernel_stack.
166  */
167 static int test_kmem_memcg_deletion(const char *root)
168 {
169 	long current, slab, anon, file, kernel_stack, pagetables, percpu, sock, sum;
170 	int ret = KSFT_FAIL;
171 	char *parent;
172 
173 	parent = cg_name(root, "kmem_memcg_deletion_test");
174 	if (!parent)
175 		goto cleanup;
176 
177 	if (cg_create(parent))
178 		goto cleanup;
179 
180 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
181 		goto cleanup;
182 
183 	if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
184 		goto cleanup;
185 
186 	current = cg_read_long(parent, "memory.current");
187 	slab = cg_read_key_long(parent, "memory.stat", "slab ");
188 	anon = cg_read_key_long(parent, "memory.stat", "anon ");
189 	file = cg_read_key_long(parent, "memory.stat", "file ");
190 	kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack ");
191 	pagetables = cg_read_key_long(parent, "memory.stat", "pagetables ");
192 	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
193 	sock = cg_read_key_long(parent, "memory.stat", "sock ");
194 	if (current < 0 || slab < 0 || anon < 0 || file < 0 ||
195 	    kernel_stack < 0 || pagetables < 0 || percpu < 0 || sock < 0)
196 		goto cleanup;
197 
198 	sum = slab + anon + file + kernel_stack + pagetables + percpu + sock;
199 	if (abs(sum - current) < MAX_VMSTAT_ERROR) {
200 		ret = KSFT_PASS;
201 	} else {
202 		printf("memory.current = %ld\n", current);
203 		printf("slab + anon + file + kernel_stack = %ld\n", sum);
204 		printf("slab = %ld\n", slab);
205 		printf("anon = %ld\n", anon);
206 		printf("file = %ld\n", file);
207 		printf("kernel_stack = %ld\n", kernel_stack);
208 		printf("pagetables = %ld\n", pagetables);
209 		printf("percpu = %ld\n", percpu);
210 		printf("sock = %ld\n", sock);
211 	}
212 
213 cleanup:
214 	cg_destroy(parent);
215 	free(parent);
216 
217 	return ret;
218 }
219 
220 /*
221  * The test reads the entire /proc/kpagecgroup. If the operation went
222  * successfully (and the kernel didn't panic), the test is treated as passed.
223  */
224 static int test_kmem_proc_kpagecgroup(const char *root)
225 {
226 	unsigned long buf[128];
227 	int ret = KSFT_FAIL;
228 	ssize_t len;
229 	int fd;
230 
231 	fd = open("/proc/kpagecgroup", O_RDONLY);
232 	if (fd < 0)
233 		return ret;
234 
235 	do {
236 		len = read(fd, buf, sizeof(buf));
237 	} while (len > 0);
238 
239 	if (len == 0)
240 		ret = KSFT_PASS;
241 
242 	close(fd);
243 	return ret;
244 }
245 
/*
 * pthread entry point: just block for a long time. The spawning
 * process is expected to exit well before the sleep finishes.
 */
static void *pthread_wait_fn(void *arg)
{
	(void)arg;

	sleep(100);
	return NULL;
}
251 
/*
 * Spawn 1000 sleeping threads in the current process, then check that
 * memory.stat::kernel_stack of @cgroup reports at least 4096 * 1000
 * bytes (presumably one 4K page of kernel stack per thread — confirm
 * on configurations with other page/stack sizes).
 *
 * The threads are deliberately never joined: they only sleep, and the
 * whole process (forked by cg_run()) exits right after the check,
 * tearing them down.
 *
 * Returns 0 on success, -1 on failure.
 */
static int spawn_1000_threads(const char *cgroup, void *arg)
{
	int nr_threads = 1000;
	pthread_t *tinfo;
	unsigned long i;
	long stack;
	int ret = -1;

	tinfo = calloc(nr_threads, sizeof(pthread_t));
	if (tinfo == NULL)
		return -1;

	for (i = 0; i < nr_threads; i++) {
		if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
				   (void *)i)) {
			free(tinfo);
			return(-1);
		}
	}

	/* Sampled while all the threads are still alive and sleeping. */
	stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
	if (stack >= 4096 * 1000)
		ret = 0;

	free(tinfo);
	return ret;
}
279 
280 /*
281  * The test spawns a process, which spawns 1000 threads. Then it checks
282  * that memory.stat's kernel_stack is at least 1000 pages large.
283  */
284 static int test_kmem_kernel_stacks(const char *root)
285 {
286 	int ret = KSFT_FAIL;
287 	char *cg = NULL;
288 
289 	cg = cg_name(root, "kmem_kernel_stacks_test");
290 	if (!cg)
291 		goto cleanup;
292 
293 	if (cg_create(cg))
294 		goto cleanup;
295 
296 	if (cg_run(cg, spawn_1000_threads, NULL))
297 		goto cleanup;
298 
299 	ret = KSFT_PASS;
300 cleanup:
301 	cg_destroy(cg);
302 	free(cg);
303 
304 	return ret;
305 }
306 
307 /*
308  * This test sequentionally creates 30 child cgroups, allocates some
309  * kernel memory in each of them, and deletes them. Then it checks
310  * that the number of dying cgroups on the parent level is 0.
311  */
312 static int test_kmem_dead_cgroups(const char *root)
313 {
314 	int ret = KSFT_FAIL;
315 	char *parent;
316 	long dead;
317 	int i;
318 
319 	parent = cg_name(root, "kmem_dead_cgroups_test");
320 	if (!parent)
321 		goto cleanup;
322 
323 	if (cg_create(parent))
324 		goto cleanup;
325 
326 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
327 		goto cleanup;
328 
329 	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
330 		goto cleanup;
331 
332 	for (i = 0; i < 5; i++) {
333 		dead = cg_read_key_long(parent, "cgroup.stat",
334 					"nr_dying_descendants ");
335 		if (dead == 0) {
336 			ret = KSFT_PASS;
337 			break;
338 		}
339 		/*
340 		 * Reclaiming cgroups might take some time,
341 		 * let's wait a bit and repeat.
342 		 */
343 		sleep(1);
344 	}
345 
346 cleanup:
347 	cg_destroy(parent);
348 	free(parent);
349 
350 	return ret;
351 }
352 
353 /*
354  * This test creates a sub-tree with 1000 memory cgroups.
355  * Then it checks that the memory.current on the parent level
356  * is greater than 0 and approximates matches the percpu value
357  * from memory.stat.
358  */
359 static int test_percpu_basic(const char *root)
360 {
361 	int ret = KSFT_FAIL;
362 	char *parent, *child;
363 	long current, percpu;
364 	int i;
365 
366 	parent = cg_name(root, "percpu_basic_test");
367 	if (!parent)
368 		goto cleanup;
369 
370 	if (cg_create(parent))
371 		goto cleanup;
372 
373 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
374 		goto cleanup;
375 
376 	for (i = 0; i < 1000; i++) {
377 		child = cg_name_indexed(parent, "child", i);
378 		if (!child)
379 			return -1;
380 
381 		if (cg_create(child))
382 			goto cleanup_children;
383 
384 		free(child);
385 	}
386 
387 	current = cg_read_long(parent, "memory.current");
388 	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
389 
390 	if (current > 0 && percpu > 0 && abs(current - percpu) <
391 	    MAX_VMSTAT_ERROR)
392 		ret = KSFT_PASS;
393 	else
394 		printf("memory.current %ld\npercpu %ld\n",
395 		       current, percpu);
396 
397 cleanup_children:
398 	for (i = 0; i < 1000; i++) {
399 		child = cg_name_indexed(parent, "child", i);
400 		cg_destroy(child);
401 		free(child);
402 	}
403 
404 cleanup:
405 	cg_destroy(parent);
406 	free(parent);
407 
408 	return ret;
409 }
410 
/* Pair each test function with its printable name for the results. */
#define T(x) { x, #x }
struct kmem_test {
	int (*fn)(const char *root);	/* test entry point, gets cgroup v2 root */
	const char *name;		/* stringified function name */
} tests[] = {
	T(test_kmem_basic),
	T(test_kmem_memcg_deletion),
	T(test_kmem_proc_kpagecgroup),
	T(test_kmem_kernel_stacks),
	T(test_kmem_dead_cgroups),
	T(test_percpu_basic),
};
#undef T
424 
425 int main(int argc, char **argv)
426 {
427 	char root[PATH_MAX];
428 	int i, ret = EXIT_SUCCESS;
429 
430 	if (cg_find_unified_root(root, sizeof(root)))
431 		ksft_exit_skip("cgroup v2 isn't mounted\n");
432 
433 	/*
434 	 * Check that memory controller is available:
435 	 * memory is listed in cgroup.controllers
436 	 */
437 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
438 		ksft_exit_skip("memory controller isn't available\n");
439 
440 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
441 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
442 			ksft_exit_skip("Failed to set memory controller\n");
443 
444 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
445 		switch (tests[i].fn(root)) {
446 		case KSFT_PASS:
447 			ksft_test_result_pass("%s\n", tests[i].name);
448 			break;
449 		case KSFT_SKIP:
450 			ksft_test_result_skip("%s\n", tests[i].name);
451 			break;
452 		default:
453 			ret = EXIT_FAILURE;
454 			ksft_test_result_fail("%s\n", tests[i].name);
455 			break;
456 		}
457 	}
458 
459 	return ret;
460 }
461