1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/socket.h>
14 #include <sys/wait.h>
15 #include <arpa/inet.h>
16 #include <netinet/in.h>
17 #include <netdb.h>
18 #include <errno.h>
19 #include <sys/mman.h>
20 
21 #include "../kselftest.h"
22 #include "cgroup_util.h"
23 
24 /*
25  * This test creates two nested cgroups with and without enabling
26  * the memory controller.
27  */
28 static int test_memcg_subtree_control(const char *root)
29 {
30 	char *parent, *child, *parent2 = NULL, *child2 = NULL;
31 	int ret = KSFT_FAIL;
32 	char buf[PAGE_SIZE];
33 
34 	/* Create two nested cgroups with the memory controller enabled */
35 	parent = cg_name(root, "memcg_test_0");
36 	child = cg_name(root, "memcg_test_0/memcg_test_1");
37 	if (!parent || !child)
38 		goto cleanup_free;
39 
40 	if (cg_create(parent))
41 		goto cleanup_free;
42 
43 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
44 		goto cleanup_parent;
45 
46 	if (cg_create(child))
47 		goto cleanup_parent;
48 
49 	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
50 		goto cleanup_child;
51 
52 	/* Create two nested cgroups without enabling memory controller */
53 	parent2 = cg_name(root, "memcg_test_1");
54 	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
55 	if (!parent2 || !child2)
56 		goto cleanup_free2;
57 
58 	if (cg_create(parent2))
59 		goto cleanup_free2;
60 
61 	if (cg_create(child2))
62 		goto cleanup_parent2;
63 
64 	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
65 		goto cleanup_all;
66 
67 	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
68 		goto cleanup_all;
69 
70 	ret = KSFT_PASS;
71 
72 cleanup_all:
73 	cg_destroy(child2);
74 cleanup_parent2:
75 	cg_destroy(parent2);
76 cleanup_free2:
77 	free(parent2);
78 	free(child2);
79 cleanup_child:
80 	cg_destroy(child);
81 cleanup_parent:
82 	cg_destroy(parent);
83 cleanup_free:
84 	free(parent);
85 	free(child);
86 
87 	return ret;
88 }
89 
90 static int alloc_anon_50M_check(const char *cgroup, void *arg)
91 {
92 	size_t size = MB(50);
93 	char *buf, *ptr;
94 	long anon, current;
95 	int ret = -1;
96 
97 	buf = malloc(size);
98 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
99 		*ptr = 0;
100 
101 	current = cg_read_long(cgroup, "memory.current");
102 	if (current < size)
103 		goto cleanup;
104 
105 	if (!values_close(size, current, 3))
106 		goto cleanup;
107 
108 	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
109 	if (anon < 0)
110 		goto cleanup;
111 
112 	if (!values_close(anon, current, 3))
113 		goto cleanup;
114 
115 	ret = 0;
116 cleanup:
117 	free(buf);
118 	return ret;
119 }
120 
/*
 * cg_run() callback: populate 50M of pagecache through a temporary file
 * and check that memory.current is at least that large and that the
 * "file" counter in memory.stat is close to memory.current.
 *
 * @cgroup: cgroup whose counters are inspected
 * @arg:    unused
 *
 * Returns 0 when all checks pass, -1 otherwise.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	/*
	 * Compare as signed: a negative value from cg_read_long() must not
	 * be promoted to a huge unsigned number and pass this check.
	 */
	if (current < (long)size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}
152 
153 /*
154  * This test create a memory cgroup, allocates
155  * some anonymous memory and some pagecache
156  * and check memory.current and some memory.stat values.
157  */
158 static int test_memcg_current(const char *root)
159 {
160 	int ret = KSFT_FAIL;
161 	long current;
162 	char *memcg;
163 
164 	memcg = cg_name(root, "memcg_test");
165 	if (!memcg)
166 		goto cleanup;
167 
168 	if (cg_create(memcg))
169 		goto cleanup;
170 
171 	current = cg_read_long(memcg, "memory.current");
172 	if (current != 0)
173 		goto cleanup;
174 
175 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
176 		goto cleanup;
177 
178 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
179 		goto cleanup;
180 
181 	ret = KSFT_PASS;
182 
183 cleanup:
184 	cg_destroy(memcg);
185 	free(memcg);
186 
187 	return ret;
188 }
189 
/*
 * cg_run() callback: populate 50M of pagecache through the file
 * descriptor smuggled in via @arg. Returns alloc_pagecache()'s result.
 */
static int alloc_pagecache_50M(const char *cgroup, void *arg)
{
	/* The caller packs the descriptor as (void *)(long)fd */
	int file_fd = (long)arg;
	int rc = alloc_pagecache(file_fd, MB(50));

	return rc;
}
196 
/*
 * cg_run_nowait() callback: populate 50M of pagecache through the file
 * descriptor passed in @arg, then stay alive (polling once a second)
 * for as long as the original parent process is still our parent.
 *
 * Returns -1 if the pagecache allocation fails, 0 once the parent is gone.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int file_fd = (long)arg;
	int parent_pid = getppid();

	if (alloc_pagecache(file_fd, MB(50)) != 0)
		return -1;

	for (;;) {
		/* getppid() changes once the original parent has exited */
		if (getppid() != parent_pid)
			break;
		sleep(1);
	}

	return 0;
}
210 
/*
 * cg_run_nowait() callback: run alloc_anon() with @arg, then keep the
 * process alive (polling once a second) until it is re-parented, i.e.
 * until the process that started it has gone away.
 *
 * Returns -1 if alloc_anon() fails, 0 once the parent is gone.
 */
static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int parent_pid = getppid();
	int rc = alloc_anon(cgroup, arg);

	if (rc)
		return -1;

	do {
		if (getppid() != parent_pid)
			return 0;
		sleep(1);
	} while (1);
}
223 
/*
 * Processes are killed asynchronously by the OOM killer, so poll
 * cgroup.procs until it reads back empty. Ten attempts are made with a
 * 100ms pause after each unsuccessful one.
 *
 * Returns 0 once cgroup.procs is empty, -1 if the attempts run out.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int tries_left = 10;

	while (tries_left-- > 0) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
	}

	return -1;
}
240 
/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 50M,  memory.max = 200M
 * A/B     memory.min = 50M,  memory.current = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 500M, memory.current = 0
 * A/B/F   memory.min = 0,    memory.current = 50M
 *
 * Usages are pagecache, but the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates a significant
 * memory pressure in it.
 *
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 33M
 * A/B/D  memory.current ~= 17M
 * A/B/E  memory.current ~= 0
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and
 * checks that memory.min protects pagecache even
 * in this case.
 *
 * In the code below parent[0] is A, parent[1] is A/B, parent[2] is A/G
 * and children[0..3] are A/B/C .. A/B/F.
 *
 * Returns KSFT_PASS, KSFT_SKIP (memory.min of a fresh cgroup does not
 * read as 0) or KSFT_FAIL.
 */
static int test_memcg_min(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	long c[4];
	int i, attempts;
	int fd;

	/* One temporary file backs the pagecache of all leaf cgroups */
	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	/*
	 * Any non-zero result skips the test; presumably this also covers
	 * cg_read_long() failing because memory.min is not available.
	 */
	if (cg_read_long(parent[0], "memory.min")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	/* No swap: reclaim cannot move anonymous memory out of the way */
	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		/* children[2] (A/B/E) is left without any usage */
		if (i == 2)
			continue;

		/* The return value (child pid or error) is not checked here */
		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[0], "memory.min", "50M"))
		goto cleanup;
	if (cg_write(parent[1], "memory.min", "50M"))
		goto cleanup;
	if (cg_write(children[0], "memory.min", "75M"))
		goto cleanup;
	if (cg_write(children[1], "memory.min", "25M"))
		goto cleanup;
	if (cg_write(children[2], "memory.min", "500M"))
		goto cleanup;
	if (cg_write(children[3], "memory.min", "0"))
		goto cleanup;

	/*
	 * The allocating children run asynchronously: poll once a second
	 * until A/B's usage is close to 150M, giving up after a handful of
	 * attempts. The test continues either way.
	 */
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	/* Memory pressure from A/G; this allocation must succeed */
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(33), 10))
		goto cleanup;

	if (!values_close(c[1], MB(17), 10))
		goto cleanup;

	if (!values_close(c[2], 0, 1))
		goto cleanup;

	/* This bigger allocation in A/G is expected to FAIL */
	if (!cg_run(parent[2], alloc_anon, (void *)MB(170)))
		goto cleanup;

	/* ... and A/B must still hold on to its ~50M */
	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	/* Destroy in reverse creation order: leaves first, then parents */
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
397 
/*
 * First, this test creates the following hierarchy:
 * A       memory.low = 50M,  memory.max = 200M
 * A/B     memory.low = 50M,  memory.current = 50M
 * A/B/C   memory.low = 75M,  memory.current = 50M
 * A/B/D   memory.low = 25M,  memory.current = 50M
 * A/B/E   memory.low = 500M, memory.current = 0
 * A/B/F   memory.low = 0,    memory.current = 50M
 *
 * Usages are pagecache.
 * Then it creates A/G and creates a significant
 * memory pressure in it.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 33M
 * A/B/D  memory.current ~= 17M
 * A/B/E  memory.current ~= 0
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available,
 * and checks low and oom events in memory.events.
 *
 * In the code below parent[0] is A, parent[1] is A/B, parent[2] is A/G
 * and children[0..3] are A/B/C .. A/B/F.
 *
 * Returns KSFT_PASS on success, KSFT_FAIL otherwise.
 */
static int test_memcg_low(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	long low, oom;
	long c[4];
	int i;
	int fd;

	/* One temporary file backs the pagecache of all leaf cgroups */
	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	/*
	 * A non-zero result here fails the test; note that test_memcg_min()
	 * reports KSFT_SKIP for the analogous memory.min check.
	 */
	if (cg_read_long(parent[0], "memory.low"))
		goto cleanup;

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	/* No swap: reclaim cannot move anonymous memory out of the way */
	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		/* children[2] (A/B/E) is left without any usage */
		if (i == 2)
			continue;

		/* Synchronous: the allocating process is gone when this returns */
		if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd))
			goto cleanup;
	}

	if (cg_write(parent[0], "memory.low", "50M"))
		goto cleanup;
	if (cg_write(parent[1], "memory.low", "50M"))
		goto cleanup;
	if (cg_write(children[0], "memory.low", "75M"))
		goto cleanup;
	if (cg_write(children[1], "memory.low", "25M"))
		goto cleanup;
	if (cg_write(children[2], "memory.low", "500M"))
		goto cleanup;
	if (cg_write(children[3], "memory.low", "0"))
		goto cleanup;

	/* Memory pressure from A/G; this allocation must succeed */
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(33), 10))
		goto cleanup;

	if (!values_close(c[1], MB(17), 10))
		goto cleanup;

	if (!values_close(c[2], 0, 1))
		goto cleanup;

	/*
	 * Unlike in test_memcg_min(), this bigger allocation is expected
	 * to succeed.
	 */
	if (cg_run(parent[2], alloc_anon, (void *)MB(166))) {
		fprintf(stderr,
			"memory.low prevents from allocating anon memory\n");
		goto cleanup;
	}

	/*
	 * No child may report oom events; children[0] and children[1] must
	 * report low events, children[2] and children[3] must report none.
	 */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i < 2 && low <= 0)
			goto cleanup;
		if (i >= 2 && low)
			goto cleanup;
	}

	ret = KSFT_PASS;

cleanup:
	/* Destroy in reverse creation order: leaves first, then parents */
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
556 
/*
 * cg_run() callback: populate 50M of pagecache through a temporary file
 * and check that memory.current ends up in the (29M, 30M] range.
 *
 * @cgroup: cgroup whose memory.current is inspected
 * @arg:    unused
 *
 * Returns 0 when the usage is within range, -1 otherwise.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	const size_t len = MB(50);
	int tmp_fd = get_temp_fd();
	int ret = -1;
	long usage;

	if (tmp_fd < 0)
		return -1;

	if (alloc_pagecache(tmp_fd, len) == 0) {
		usage = cg_read_long(cgroup, "memory.current");
		if (usage > MB(29) && usage <= MB(30))
			ret = 0;
	}

	close(tmp_fd);
	return ret;
}
582 
583 /*
584  * This test checks that memory.high limits the amount of
585  * memory which can be consumed by either anonymous memory
586  * or pagecache.
587  */
588 static int test_memcg_high(const char *root)
589 {
590 	int ret = KSFT_FAIL;
591 	char *memcg;
592 	long high;
593 
594 	memcg = cg_name(root, "memcg_test");
595 	if (!memcg)
596 		goto cleanup;
597 
598 	if (cg_create(memcg))
599 		goto cleanup;
600 
601 	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
602 		goto cleanup;
603 
604 	if (cg_write(memcg, "memory.swap.max", "0"))
605 		goto cleanup;
606 
607 	if (cg_write(memcg, "memory.high", "30M"))
608 		goto cleanup;
609 
610 	if (cg_run(memcg, alloc_anon, (void *)MB(100)))
611 		goto cleanup;
612 
613 	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
614 		goto cleanup;
615 
616 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
617 		goto cleanup;
618 
619 	high = cg_read_key_long(memcg, "memory.events", "high ");
620 	if (high <= 0)
621 		goto cleanup;
622 
623 	ret = KSFT_PASS;
624 
625 cleanup:
626 	cg_destroy(memcg);
627 	free(memcg);
628 
629 	return ret;
630 }
631 
/*
 * cg_run()/cg_run_nowait() callback: map an anonymous private region of
 * (size_t)@arg bytes, mlock() it and unmap it again.
 *
 * @cgroup: unused
 * @arg:    size of the mapping in bytes, packed into the pointer
 *
 * Returns -1 if mmap() fails, 0 otherwise. The result of mlock() is
 * deliberately not checked: the call is made on a best-effort basis.
 */
static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	/*
	 * Pass fd == -1 for the anonymous mapping: the fd is ignored on
	 * Linux, but portable code is expected to use -1 with MAP_ANON.
	 */
	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}
646 
647 /*
648  * This test checks that memory.high is able to throttle big single shot
649  * allocation i.e. large allocation within one kernel entry.
650  */
651 static int test_memcg_high_sync(const char *root)
652 {
653 	int ret = KSFT_FAIL, pid, fd = -1;
654 	char *memcg;
655 	long pre_high, pre_max;
656 	long post_high, post_max;
657 
658 	memcg = cg_name(root, "memcg_test");
659 	if (!memcg)
660 		goto cleanup;
661 
662 	if (cg_create(memcg))
663 		goto cleanup;
664 
665 	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
666 	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
667 	if (pre_high < 0 || pre_max < 0)
668 		goto cleanup;
669 
670 	if (cg_write(memcg, "memory.swap.max", "0"))
671 		goto cleanup;
672 
673 	if (cg_write(memcg, "memory.high", "30M"))
674 		goto cleanup;
675 
676 	if (cg_write(memcg, "memory.max", "140M"))
677 		goto cleanup;
678 
679 	fd = memcg_prepare_for_wait(memcg);
680 	if (fd < 0)
681 		goto cleanup;
682 
683 	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
684 	if (pid < 0)
685 		goto cleanup;
686 
687 	cg_wait_for(fd);
688 
689 	post_high = cg_read_key_long(memcg, "memory.events", "high ");
690 	post_max = cg_read_key_long(memcg, "memory.events", "max ");
691 	if (post_high < 0 || post_max < 0)
692 		goto cleanup;
693 
694 	if (pre_high == post_high || pre_max != post_max)
695 		goto cleanup;
696 
697 	ret = KSFT_PASS;
698 
699 cleanup:
700 	if (fd >= 0)
701 		close(fd);
702 	cg_destroy(memcg);
703 	free(memcg);
704 
705 	return ret;
706 }
707 
708 /*
709  * This test checks that memory.max limits the amount of
710  * memory which can be consumed by either anonymous memory
711  * or pagecache.
712  */
713 static int test_memcg_max(const char *root)
714 {
715 	int ret = KSFT_FAIL;
716 	char *memcg;
717 	long current, max;
718 
719 	memcg = cg_name(root, "memcg_test");
720 	if (!memcg)
721 		goto cleanup;
722 
723 	if (cg_create(memcg))
724 		goto cleanup;
725 
726 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
727 		goto cleanup;
728 
729 	if (cg_write(memcg, "memory.swap.max", "0"))
730 		goto cleanup;
731 
732 	if (cg_write(memcg, "memory.max", "30M"))
733 		goto cleanup;
734 
735 	/* Should be killed by OOM killer */
736 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
737 		goto cleanup;
738 
739 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
740 		goto cleanup;
741 
742 	current = cg_read_long(memcg, "memory.current");
743 	if (current > MB(30) || !current)
744 		goto cleanup;
745 
746 	max = cg_read_key_long(memcg, "memory.events", "max ");
747 	if (max <= 0)
748 		goto cleanup;
749 
750 	ret = KSFT_PASS;
751 
752 cleanup:
753 	cg_destroy(memcg);
754 	free(memcg);
755 
756 	return ret;
757 }
758 
759 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
760 {
761 	long mem_max = (long)arg;
762 	size_t size = MB(50);
763 	char *buf, *ptr;
764 	long mem_current, swap_current;
765 	int ret = -1;
766 
767 	buf = malloc(size);
768 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
769 		*ptr = 0;
770 
771 	mem_current = cg_read_long(cgroup, "memory.current");
772 	if (!mem_current || !values_close(mem_current, mem_max, 3))
773 		goto cleanup;
774 
775 	swap_current = cg_read_long(cgroup, "memory.swap.current");
776 	if (!swap_current ||
777 	    !values_close(mem_current + swap_current, size, 3))
778 		goto cleanup;
779 
780 	ret = 0;
781 cleanup:
782 	free(buf);
783 	return ret;
784 }
785 
786 /*
787  * This test checks that memory.swap.max limits the amount of
788  * anonymous memory which can be swapped out.
789  */
790 static int test_memcg_swap_max(const char *root)
791 {
792 	int ret = KSFT_FAIL;
793 	char *memcg;
794 	long max;
795 
796 	if (!is_swap_enabled())
797 		return KSFT_SKIP;
798 
799 	memcg = cg_name(root, "memcg_test");
800 	if (!memcg)
801 		goto cleanup;
802 
803 	if (cg_create(memcg))
804 		goto cleanup;
805 
806 	if (cg_read_long(memcg, "memory.swap.current")) {
807 		ret = KSFT_SKIP;
808 		goto cleanup;
809 	}
810 
811 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
812 		goto cleanup;
813 
814 	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
815 		goto cleanup;
816 
817 	if (cg_write(memcg, "memory.swap.max", "30M"))
818 		goto cleanup;
819 
820 	if (cg_write(memcg, "memory.max", "30M"))
821 		goto cleanup;
822 
823 	/* Should be killed by OOM killer */
824 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
825 		goto cleanup;
826 
827 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
828 		goto cleanup;
829 
830 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
831 		goto cleanup;
832 
833 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
834 		goto cleanup;
835 
836 	max = cg_read_key_long(memcg, "memory.events", "max ");
837 	if (max <= 0)
838 		goto cleanup;
839 
840 	ret = KSFT_PASS;
841 
842 cleanup:
843 	cg_destroy(memcg);
844 	free(memcg);
845 
846 	return ret;
847 }
848 
849 /*
850  * This test disables swapping and tries to allocate anonymous memory
851  * up to OOM. Then it checks for oom and oom_kill events in
852  * memory.events.
853  */
854 static int test_memcg_oom_events(const char *root)
855 {
856 	int ret = KSFT_FAIL;
857 	char *memcg;
858 
859 	memcg = cg_name(root, "memcg_test");
860 	if (!memcg)
861 		goto cleanup;
862 
863 	if (cg_create(memcg))
864 		goto cleanup;
865 
866 	if (cg_write(memcg, "memory.max", "30M"))
867 		goto cleanup;
868 
869 	if (cg_write(memcg, "memory.swap.max", "0"))
870 		goto cleanup;
871 
872 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
873 		goto cleanup;
874 
875 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
876 		goto cleanup;
877 
878 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
879 		goto cleanup;
880 
881 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
882 		goto cleanup;
883 
884 	ret = KSFT_PASS;
885 
886 cleanup:
887 	cg_destroy(memcg);
888 	free(memcg);
889 
890 	return ret;
891 }
892 
/* Arguments handed from test_memcg_sock() to the forked tcp_server(). */
struct tcp_server_args {
	/* TCP port to bind to, host byte order (tcp_server() applies htons()) */
	unsigned short port;
	/*
	 * pipe() pair for status reporting: tcp_server() closes ctl[0] and
	 * writes an int to ctl[1]; test_memcg_sock() reads it from ctl[0].
	 */
	int ctl[2];
};
897 
898 static int tcp_server(const char *cgroup, void *arg)
899 {
900 	struct tcp_server_args *srv_args = arg;
901 	struct sockaddr_in6 saddr = { 0 };
902 	socklen_t slen = sizeof(saddr);
903 	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
904 
905 	close(srv_args->ctl[0]);
906 	ctl_fd = srv_args->ctl[1];
907 
908 	saddr.sin6_family = AF_INET6;
909 	saddr.sin6_addr = in6addr_any;
910 	saddr.sin6_port = htons(srv_args->port);
911 
912 	sk = socket(AF_INET6, SOCK_STREAM, 0);
913 	if (sk < 0)
914 		return ret;
915 
916 	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
917 		goto cleanup;
918 
919 	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
920 		write(ctl_fd, &errno, sizeof(errno));
921 		goto cleanup;
922 	}
923 
924 	if (listen(sk, 1))
925 		goto cleanup;
926 
927 	ret = 0;
928 	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
929 		ret = -1;
930 		goto cleanup;
931 	}
932 
933 	client_sk = accept(sk, NULL, NULL);
934 	if (client_sk < 0)
935 		goto cleanup;
936 
937 	ret = -1;
938 	for (;;) {
939 		uint8_t buf[0x100000];
940 
941 		if (write(client_sk, buf, sizeof(buf)) <= 0) {
942 			if (errno == ECONNRESET)
943 				ret = 0;
944 			break;
945 		}
946 	}
947 
948 	close(client_sk);
949 
950 cleanup:
951 	close(sk);
952 	return ret;
953 }
954 
/*
 * Client side of the socket accounting test: connect to "localhost" on
 * @port and read 1M chunks from the server. After every read compare
 * the cgroup's memory.current with the "sock" counter in memory.stat.
 *
 * @cgroup: cgroup the server runs in
 * @port:   TCP port the server listens on (host byte order)
 *
 * Returns KSFT_PASS as soon as both values are within 10% of each
 * other; the getaddrinfo() error code if name resolution fails; and
 * KSFT_FAIL on any other failure or once the retries are used up.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;

	/*
	 * The port is unsigned: "%hd" would print ports above 32767 as
	 * negative numbers, which may not even fit into servport[].
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	/*
	 * getaddrinfo() left ret == 0. Reset it so that a socket() or
	 * connect() failure cannot be mistaken for success (KSFT_PASS is
	 * presumably 0 - confirm in kselftest.h).
	 */
	ret = KSFT_FAIL;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	if (connect(sk, ai->ai_addr, ai->ai_addrlen) < 0)
		goto close_sk;

	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* Socket memory is part of the total, never more than it */
		if (current < sock)
			goto close_sk;

		if (values_close(current, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1005 
1006 /*
1007  * This test checks socket memory accounting.
1008  * The test forks a TCP server listens on a random port between 1000
1009  * and 61000. Once it gets a client connection, it starts writing to
1010  * its socket.
1011  * The TCP client interleaves reads from the socket with check whether
1012  * memory.current and memory.stat.sock are similar.
1013  */
1014 static int test_memcg_sock(const char *root)
1015 {
1016 	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
1017 	unsigned short port;
1018 	char *memcg;
1019 
1020 	memcg = cg_name(root, "memcg_test");
1021 	if (!memcg)
1022 		goto cleanup;
1023 
1024 	if (cg_create(memcg))
1025 		goto cleanup;
1026 
1027 	while (bind_retries--) {
1028 		struct tcp_server_args args;
1029 
1030 		if (pipe(args.ctl))
1031 			goto cleanup;
1032 
1033 		port = args.port = 1000 + rand() % 60000;
1034 
1035 		pid = cg_run_nowait(memcg, tcp_server, &args);
1036 		if (pid < 0)
1037 			goto cleanup;
1038 
1039 		close(args.ctl[1]);
1040 		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
1041 			goto cleanup;
1042 		close(args.ctl[0]);
1043 
1044 		if (!err)
1045 			break;
1046 		if (err != EADDRINUSE)
1047 			goto cleanup;
1048 
1049 		waitpid(pid, NULL, 0);
1050 	}
1051 
1052 	if (err == EADDRINUSE) {
1053 		ret = KSFT_SKIP;
1054 		goto cleanup;
1055 	}
1056 
1057 	if (tcp_client(memcg, port) != KSFT_PASS)
1058 		goto cleanup;
1059 
1060 	waitpid(pid, &err, 0);
1061 	if (WEXITSTATUS(err))
1062 		goto cleanup;
1063 
1064 	if (cg_read_long(memcg, "memory.current") < 0)
1065 		goto cleanup;
1066 
1067 	if (cg_read_key_long(memcg, "memory.stat", "sock "))
1068 		goto cleanup;
1069 
1070 	ret = KSFT_PASS;
1071 
1072 cleanup:
1073 	cg_destroy(memcg);
1074 	free(memcg);
1075 
1076 	return ret;
1077 }
1078 
1079 /*
1080  * This test disables swapping and tries to allocate anonymous memory
1081  * up to OOM with memory.group.oom set. Then it checks that all
1082  * processes in the leaf (but not the parent) were killed.
1083  */
1084 static int test_memcg_oom_group_leaf_events(const char *root)
1085 {
1086 	int ret = KSFT_FAIL;
1087 	char *parent, *child;
1088 
1089 	parent = cg_name(root, "memcg_test_0");
1090 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1091 
1092 	if (!parent || !child)
1093 		goto cleanup;
1094 
1095 	if (cg_create(parent))
1096 		goto cleanup;
1097 
1098 	if (cg_create(child))
1099 		goto cleanup;
1100 
1101 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1102 		goto cleanup;
1103 
1104 	if (cg_write(child, "memory.max", "50M"))
1105 		goto cleanup;
1106 
1107 	if (cg_write(child, "memory.swap.max", "0"))
1108 		goto cleanup;
1109 
1110 	if (cg_write(child, "memory.oom.group", "1"))
1111 		goto cleanup;
1112 
1113 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1114 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1115 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1116 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1117 		goto cleanup;
1118 
1119 	if (cg_test_proc_killed(child))
1120 		goto cleanup;
1121 
1122 	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1123 		goto cleanup;
1124 
1125 	if (cg_read_key_long(parent, "memory.events", "oom_kill ") != 0)
1126 		goto cleanup;
1127 
1128 	ret = KSFT_PASS;
1129 
1130 cleanup:
1131 	if (child)
1132 		cg_destroy(child);
1133 	if (parent)
1134 		cg_destroy(parent);
1135 	free(child);
1136 	free(parent);
1137 
1138 	return ret;
1139 }
1140 
1141 /*
1142  * This test disables swapping and tries to allocate anonymous memory
1143  * up to OOM with memory.group.oom set. Then it checks that all
1144  * processes in the parent and leaf were killed.
1145  */
1146 static int test_memcg_oom_group_parent_events(const char *root)
1147 {
1148 	int ret = KSFT_FAIL;
1149 	char *parent, *child;
1150 
1151 	parent = cg_name(root, "memcg_test_0");
1152 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1153 
1154 	if (!parent || !child)
1155 		goto cleanup;
1156 
1157 	if (cg_create(parent))
1158 		goto cleanup;
1159 
1160 	if (cg_create(child))
1161 		goto cleanup;
1162 
1163 	if (cg_write(parent, "memory.max", "80M"))
1164 		goto cleanup;
1165 
1166 	if (cg_write(parent, "memory.swap.max", "0"))
1167 		goto cleanup;
1168 
1169 	if (cg_write(parent, "memory.oom.group", "1"))
1170 		goto cleanup;
1171 
1172 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1173 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1174 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1175 
1176 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1177 		goto cleanup;
1178 
1179 	if (cg_test_proc_killed(child))
1180 		goto cleanup;
1181 	if (cg_test_proc_killed(parent))
1182 		goto cleanup;
1183 
1184 	ret = KSFT_PASS;
1185 
1186 cleanup:
1187 	if (child)
1188 		cg_destroy(child);
1189 	if (parent)
1190 		cg_destroy(parent);
1191 	free(child);
1192 	free(parent);
1193 
1194 	return ret;
1195 }
1196 
1197 /*
1198  * This test disables swapping and tries to allocate anonymous memory
1199  * up to OOM with memory.group.oom set. Then it checks that all
1200  * processes were killed except those set with OOM_SCORE_ADJ_MIN
1201  */
1202 static int test_memcg_oom_group_score_events(const char *root)
1203 {
1204 	int ret = KSFT_FAIL;
1205 	char *memcg;
1206 	int safe_pid;
1207 
1208 	memcg = cg_name(root, "memcg_test_0");
1209 
1210 	if (!memcg)
1211 		goto cleanup;
1212 
1213 	if (cg_create(memcg))
1214 		goto cleanup;
1215 
1216 	if (cg_write(memcg, "memory.max", "50M"))
1217 		goto cleanup;
1218 
1219 	if (cg_write(memcg, "memory.swap.max", "0"))
1220 		goto cleanup;
1221 
1222 	if (cg_write(memcg, "memory.oom.group", "1"))
1223 		goto cleanup;
1224 
1225 	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1226 	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1227 		goto cleanup;
1228 
1229 	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1230 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1231 		goto cleanup;
1232 
1233 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1234 		goto cleanup;
1235 
1236 	if (kill(safe_pid, SIGKILL))
1237 		goto cleanup;
1238 
1239 	ret = KSFT_PASS;
1240 
1241 cleanup:
1242 	if (memcg)
1243 		cg_destroy(memcg);
1244 	free(memcg);
1245 
1246 	return ret;
1247 }
1248 
1249 
1250 #define T(x) { x, #x }
1251 struct memcg_test {
1252 	int (*fn)(const char *root);
1253 	const char *name;
1254 } tests[] = {
1255 	T(test_memcg_subtree_control),
1256 	T(test_memcg_current),
1257 	T(test_memcg_min),
1258 	T(test_memcg_low),
1259 	T(test_memcg_high),
1260 	T(test_memcg_high_sync),
1261 	T(test_memcg_max),
1262 	T(test_memcg_oom_events),
1263 	T(test_memcg_swap_max),
1264 	T(test_memcg_sock),
1265 	T(test_memcg_oom_group_leaf_events),
1266 	T(test_memcg_oom_group_parent_events),
1267 	T(test_memcg_oom_group_score_events),
1268 };
1269 #undef T
1270 
1271 int main(int argc, char **argv)
1272 {
1273 	char root[PATH_MAX];
1274 	int i, ret = EXIT_SUCCESS;
1275 
1276 	if (cg_find_unified_root(root, sizeof(root)))
1277 		ksft_exit_skip("cgroup v2 isn't mounted\n");
1278 
1279 	/*
1280 	 * Check that memory controller is available:
1281 	 * memory is listed in cgroup.controllers
1282 	 */
1283 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1284 		ksft_exit_skip("memory controller isn't available\n");
1285 
1286 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1287 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
1288 			ksft_exit_skip("Failed to set memory controller\n");
1289 
1290 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
1291 		switch (tests[i].fn(root)) {
1292 		case KSFT_PASS:
1293 			ksft_test_result_pass("%s\n", tests[i].name);
1294 			break;
1295 		case KSFT_SKIP:
1296 			ksft_test_result_skip("%s\n", tests[i].name);
1297 			break;
1298 		default:
1299 			ret = EXIT_FAILURE;
1300 			ksft_test_result_fail("%s\n", tests[i].name);
1301 			break;
1302 		}
1303 	}
1304 
1305 	return ret;
1306 }
1307