1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/socket.h>
14 #include <sys/wait.h>
15 #include <arpa/inet.h>
16 #include <netinet/in.h>
17 #include <netdb.h>
18 #include <errno.h>
19 #include <sys/mman.h>
20 
21 #include "../kselftest.h"
22 #include "cgroup_util.h"
23 
24 /*
25  * This test creates two nested cgroups with and without enabling
26  * the memory controller.
27  */
28 static int test_memcg_subtree_control(const char *root)
29 {
30 	char *parent, *child, *parent2 = NULL, *child2 = NULL;
31 	int ret = KSFT_FAIL;
32 	char buf[PAGE_SIZE];
33 
34 	/* Create two nested cgroups with the memory controller enabled */
35 	parent = cg_name(root, "memcg_test_0");
36 	child = cg_name(root, "memcg_test_0/memcg_test_1");
37 	if (!parent || !child)
38 		goto cleanup_free;
39 
40 	if (cg_create(parent))
41 		goto cleanup_free;
42 
43 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
44 		goto cleanup_parent;
45 
46 	if (cg_create(child))
47 		goto cleanup_parent;
48 
49 	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
50 		goto cleanup_child;
51 
52 	/* Create two nested cgroups without enabling memory controller */
53 	parent2 = cg_name(root, "memcg_test_1");
54 	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
55 	if (!parent2 || !child2)
56 		goto cleanup_free2;
57 
58 	if (cg_create(parent2))
59 		goto cleanup_free2;
60 
61 	if (cg_create(child2))
62 		goto cleanup_parent2;
63 
64 	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
65 		goto cleanup_all;
66 
67 	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
68 		goto cleanup_all;
69 
70 	ret = KSFT_PASS;
71 
72 cleanup_all:
73 	cg_destroy(child2);
74 cleanup_parent2:
75 	cg_destroy(parent2);
76 cleanup_free2:
77 	free(parent2);
78 	free(child2);
79 cleanup_child:
80 	cg_destroy(child);
81 cleanup_parent:
82 	cg_destroy(parent);
83 cleanup_free:
84 	free(parent);
85 	free(child);
86 
87 	return ret;
88 }
89 
90 static int alloc_anon_50M_check(const char *cgroup, void *arg)
91 {
92 	size_t size = MB(50);
93 	char *buf, *ptr;
94 	long anon, current;
95 	int ret = -1;
96 
97 	buf = malloc(size);
98 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
99 		*ptr = 0;
100 
101 	current = cg_read_long(cgroup, "memory.current");
102 	if (current < size)
103 		goto cleanup;
104 
105 	if (!values_close(size, current, 3))
106 		goto cleanup;
107 
108 	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
109 	if (anon < 0)
110 		goto cleanup;
111 
112 	if (!values_close(anon, current, 3))
113 		goto cleanup;
114 
115 	ret = 0;
116 cleanup:
117 	free(buf);
118 	return ret;
119 }
120 
121 static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
122 {
123 	size_t size = MB(50);
124 	int ret = -1;
125 	long current, file;
126 	int fd;
127 
128 	fd = get_temp_fd();
129 	if (fd < 0)
130 		return -1;
131 
132 	if (alloc_pagecache(fd, size))
133 		goto cleanup;
134 
135 	current = cg_read_long(cgroup, "memory.current");
136 	if (current < size)
137 		goto cleanup;
138 
139 	file = cg_read_key_long(cgroup, "memory.stat", "file ");
140 	if (file < 0)
141 		goto cleanup;
142 
143 	if (!values_close(file, current, 10))
144 		goto cleanup;
145 
146 	ret = 0;
147 
148 cleanup:
149 	close(fd);
150 	return ret;
151 }
152 
153 /*
154  * This test create a memory cgroup, allocates
155  * some anonymous memory and some pagecache
156  * and check memory.current and some memory.stat values.
157  */
158 static int test_memcg_current(const char *root)
159 {
160 	int ret = KSFT_FAIL;
161 	long current;
162 	char *memcg;
163 
164 	memcg = cg_name(root, "memcg_test");
165 	if (!memcg)
166 		goto cleanup;
167 
168 	if (cg_create(memcg))
169 		goto cleanup;
170 
171 	current = cg_read_long(memcg, "memory.current");
172 	if (current != 0)
173 		goto cleanup;
174 
175 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
176 		goto cleanup;
177 
178 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
179 		goto cleanup;
180 
181 	ret = KSFT_PASS;
182 
183 cleanup:
184 	cg_destroy(memcg);
185 	free(memcg);
186 
187 	return ret;
188 }
189 
190 static int alloc_pagecache_50M(const char *cgroup, void *arg)
191 {
192 	int fd = (long)arg;
193 
194 	return alloc_pagecache(fd, MB(50));
195 }
196 
197 static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
198 {
199 	int fd = (long)arg;
200 	int ppid = getppid();
201 
202 	if (alloc_pagecache(fd, MB(50)))
203 		return -1;
204 
205 	while (getppid() == ppid)
206 		sleep(1);
207 
208 	return 0;
209 }
210 
211 static int alloc_anon_noexit(const char *cgroup, void *arg)
212 {
213 	int ppid = getppid();
214 	size_t size = (unsigned long)arg;
215 	char *buf, *ptr;
216 
217 	buf = malloc(size);
218 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
219 		*ptr = 0;
220 
221 	while (getppid() == ppid)
222 		sleep(1);
223 
224 	free(buf);
225 	return 0;
226 }
227 
228 /*
229  * Wait until processes are killed asynchronously by the OOM killer
230  * If we exceed a timeout, fail.
231  */
232 static int cg_test_proc_killed(const char *cgroup)
233 {
234 	int limit;
235 
236 	for (limit = 10; limit > 0; limit--) {
237 		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
238 			return 0;
239 
240 		usleep(100000);
241 	}
242 	return -1;
243 }
244 
245 /*
246  * First, this test creates the following hierarchy:
247  * A       memory.min = 50M,  memory.max = 200M
248  * A/B     memory.min = 50M,  memory.current = 50M
249  * A/B/C   memory.min = 75M,  memory.current = 50M
250  * A/B/D   memory.min = 25M,  memory.current = 50M
251  * A/B/E   memory.min = 500M, memory.current = 0
252  * A/B/F   memory.min = 0,    memory.current = 50M
253  *
254  * Usages are pagecache, but the test keeps a running
255  * process in every leaf cgroup.
256  * Then it creates A/G and creates a significant
257  * memory pressure in it.
258  *
259  * A/B    memory.current ~= 50M
260  * A/B/C  memory.current ~= 33M
261  * A/B/D  memory.current ~= 17M
262  * A/B/E  memory.current ~= 0
263  *
264  * After that it tries to allocate more than there is
265  * unprotected memory in A available, and checks
266  * checks that memory.min protects pagecache even
267  * in this case.
268  */
269 static int test_memcg_min(const char *root)
270 {
271 	int ret = KSFT_FAIL;
272 	char *parent[3] = {NULL};
273 	char *children[4] = {NULL};
274 	long c[4];
275 	int i, attempts;
276 	int fd;
277 
278 	fd = get_temp_fd();
279 	if (fd < 0)
280 		goto cleanup;
281 
282 	parent[0] = cg_name(root, "memcg_test_0");
283 	if (!parent[0])
284 		goto cleanup;
285 
286 	parent[1] = cg_name(parent[0], "memcg_test_1");
287 	if (!parent[1])
288 		goto cleanup;
289 
290 	parent[2] = cg_name(parent[0], "memcg_test_2");
291 	if (!parent[2])
292 		goto cleanup;
293 
294 	if (cg_create(parent[0]))
295 		goto cleanup;
296 
297 	if (cg_read_long(parent[0], "memory.min")) {
298 		ret = KSFT_SKIP;
299 		goto cleanup;
300 	}
301 
302 	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
303 		goto cleanup;
304 
305 	if (cg_write(parent[0], "memory.max", "200M"))
306 		goto cleanup;
307 
308 	if (cg_write(parent[0], "memory.swap.max", "0"))
309 		goto cleanup;
310 
311 	if (cg_create(parent[1]))
312 		goto cleanup;
313 
314 	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
315 		goto cleanup;
316 
317 	if (cg_create(parent[2]))
318 		goto cleanup;
319 
320 	for (i = 0; i < ARRAY_SIZE(children); i++) {
321 		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
322 		if (!children[i])
323 			goto cleanup;
324 
325 		if (cg_create(children[i]))
326 			goto cleanup;
327 
328 		if (i == 2)
329 			continue;
330 
331 		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
332 			      (void *)(long)fd);
333 	}
334 
335 	if (cg_write(parent[0], "memory.min", "50M"))
336 		goto cleanup;
337 	if (cg_write(parent[1], "memory.min", "50M"))
338 		goto cleanup;
339 	if (cg_write(children[0], "memory.min", "75M"))
340 		goto cleanup;
341 	if (cg_write(children[1], "memory.min", "25M"))
342 		goto cleanup;
343 	if (cg_write(children[2], "memory.min", "500M"))
344 		goto cleanup;
345 	if (cg_write(children[3], "memory.min", "0"))
346 		goto cleanup;
347 
348 	attempts = 0;
349 	while (!values_close(cg_read_long(parent[1], "memory.current"),
350 			     MB(150), 3)) {
351 		if (attempts++ > 5)
352 			break;
353 		sleep(1);
354 	}
355 
356 	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
357 		goto cleanup;
358 
359 	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
360 		goto cleanup;
361 
362 	for (i = 0; i < ARRAY_SIZE(children); i++)
363 		c[i] = cg_read_long(children[i], "memory.current");
364 
365 	if (!values_close(c[0], MB(33), 10))
366 		goto cleanup;
367 
368 	if (!values_close(c[1], MB(17), 10))
369 		goto cleanup;
370 
371 	if (!values_close(c[2], 0, 1))
372 		goto cleanup;
373 
374 	if (!cg_run(parent[2], alloc_anon, (void *)MB(170)))
375 		goto cleanup;
376 
377 	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
378 		goto cleanup;
379 
380 	ret = KSFT_PASS;
381 
382 cleanup:
383 	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
384 		if (!children[i])
385 			continue;
386 
387 		cg_destroy(children[i]);
388 		free(children[i]);
389 	}
390 
391 	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
392 		if (!parent[i])
393 			continue;
394 
395 		cg_destroy(parent[i]);
396 		free(parent[i]);
397 	}
398 	close(fd);
399 	return ret;
400 }
401 
402 /*
403  * First, this test creates the following hierarchy:
404  * A       memory.low = 50M,  memory.max = 200M
405  * A/B     memory.low = 50M,  memory.current = 50M
406  * A/B/C   memory.low = 75M,  memory.current = 50M
407  * A/B/D   memory.low = 25M,  memory.current = 50M
408  * A/B/E   memory.low = 500M, memory.current = 0
409  * A/B/F   memory.low = 0,    memory.current = 50M
410  *
411  * Usages are pagecache.
412  * Then it creates A/G an creates a significant
413  * memory pressure in it.
414  *
415  * Then it checks actual memory usages and expects that:
416  * A/B    memory.current ~= 50M
417  * A/B/   memory.current ~= 33M
418  * A/B/D  memory.current ~= 17M
419  * A/B/E  memory.current ~= 0
420  *
421  * After that it tries to allocate more than there is
422  * unprotected memory in A available,
423  * and checks low and oom events in memory.events.
424  */
425 static int test_memcg_low(const char *root)
426 {
427 	int ret = KSFT_FAIL;
428 	char *parent[3] = {NULL};
429 	char *children[4] = {NULL};
430 	long low, oom;
431 	long c[4];
432 	int i;
433 	int fd;
434 
435 	fd = get_temp_fd();
436 	if (fd < 0)
437 		goto cleanup;
438 
439 	parent[0] = cg_name(root, "memcg_test_0");
440 	if (!parent[0])
441 		goto cleanup;
442 
443 	parent[1] = cg_name(parent[0], "memcg_test_1");
444 	if (!parent[1])
445 		goto cleanup;
446 
447 	parent[2] = cg_name(parent[0], "memcg_test_2");
448 	if (!parent[2])
449 		goto cleanup;
450 
451 	if (cg_create(parent[0]))
452 		goto cleanup;
453 
454 	if (cg_read_long(parent[0], "memory.low"))
455 		goto cleanup;
456 
457 	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
458 		goto cleanup;
459 
460 	if (cg_write(parent[0], "memory.max", "200M"))
461 		goto cleanup;
462 
463 	if (cg_write(parent[0], "memory.swap.max", "0"))
464 		goto cleanup;
465 
466 	if (cg_create(parent[1]))
467 		goto cleanup;
468 
469 	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
470 		goto cleanup;
471 
472 	if (cg_create(parent[2]))
473 		goto cleanup;
474 
475 	for (i = 0; i < ARRAY_SIZE(children); i++) {
476 		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
477 		if (!children[i])
478 			goto cleanup;
479 
480 		if (cg_create(children[i]))
481 			goto cleanup;
482 
483 		if (i == 2)
484 			continue;
485 
486 		if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd))
487 			goto cleanup;
488 	}
489 
490 	if (cg_write(parent[0], "memory.low", "50M"))
491 		goto cleanup;
492 	if (cg_write(parent[1], "memory.low", "50M"))
493 		goto cleanup;
494 	if (cg_write(children[0], "memory.low", "75M"))
495 		goto cleanup;
496 	if (cg_write(children[1], "memory.low", "25M"))
497 		goto cleanup;
498 	if (cg_write(children[2], "memory.low", "500M"))
499 		goto cleanup;
500 	if (cg_write(children[3], "memory.low", "0"))
501 		goto cleanup;
502 
503 	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
504 		goto cleanup;
505 
506 	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
507 		goto cleanup;
508 
509 	for (i = 0; i < ARRAY_SIZE(children); i++)
510 		c[i] = cg_read_long(children[i], "memory.current");
511 
512 	if (!values_close(c[0], MB(33), 10))
513 		goto cleanup;
514 
515 	if (!values_close(c[1], MB(17), 10))
516 		goto cleanup;
517 
518 	if (!values_close(c[2], 0, 1))
519 		goto cleanup;
520 
521 	if (cg_run(parent[2], alloc_anon, (void *)MB(166))) {
522 		fprintf(stderr,
523 			"memory.low prevents from allocating anon memory\n");
524 		goto cleanup;
525 	}
526 
527 	for (i = 0; i < ARRAY_SIZE(children); i++) {
528 		oom = cg_read_key_long(children[i], "memory.events", "oom ");
529 		low = cg_read_key_long(children[i], "memory.events", "low ");
530 
531 		if (oom)
532 			goto cleanup;
533 		if (i < 2 && low <= 0)
534 			goto cleanup;
535 		if (i >= 2 && low)
536 			goto cleanup;
537 	}
538 
539 	ret = KSFT_PASS;
540 
541 cleanup:
542 	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
543 		if (!children[i])
544 			continue;
545 
546 		cg_destroy(children[i]);
547 		free(children[i]);
548 	}
549 
550 	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
551 		if (!parent[i])
552 			continue;
553 
554 		cg_destroy(parent[i]);
555 		free(parent[i]);
556 	}
557 	close(fd);
558 	return ret;
559 }
560 
561 static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
562 {
563 	size_t size = MB(50);
564 	int ret = -1;
565 	long current;
566 	int fd;
567 
568 	fd = get_temp_fd();
569 	if (fd < 0)
570 		return -1;
571 
572 	if (alloc_pagecache(fd, size))
573 		goto cleanup;
574 
575 	current = cg_read_long(cgroup, "memory.current");
576 	if (current <= MB(29) || current > MB(30))
577 		goto cleanup;
578 
579 	ret = 0;
580 
581 cleanup:
582 	close(fd);
583 	return ret;
584 
585 }
586 
587 /*
588  * This test checks that memory.high limits the amount of
589  * memory which can be consumed by either anonymous memory
590  * or pagecache.
591  */
592 static int test_memcg_high(const char *root)
593 {
594 	int ret = KSFT_FAIL;
595 	char *memcg;
596 	long high;
597 
598 	memcg = cg_name(root, "memcg_test");
599 	if (!memcg)
600 		goto cleanup;
601 
602 	if (cg_create(memcg))
603 		goto cleanup;
604 
605 	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
606 		goto cleanup;
607 
608 	if (cg_write(memcg, "memory.swap.max", "0"))
609 		goto cleanup;
610 
611 	if (cg_write(memcg, "memory.high", "30M"))
612 		goto cleanup;
613 
614 	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
615 		goto cleanup;
616 
617 	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
618 		goto cleanup;
619 
620 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
621 		goto cleanup;
622 
623 	high = cg_read_key_long(memcg, "memory.events", "high ");
624 	if (high <= 0)
625 		goto cleanup;
626 
627 	ret = KSFT_PASS;
628 
629 cleanup:
630 	cg_destroy(memcg);
631 	free(memcg);
632 
633 	return ret;
634 }
635 
636 static int alloc_anon_mlock(const char *cgroup, void *arg)
637 {
638 	size_t size = (size_t)arg;
639 	void *buf;
640 
641 	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
642 		   0, 0);
643 	if (buf == MAP_FAILED)
644 		return -1;
645 
646 	mlock(buf, size);
647 	munmap(buf, size);
648 	return 0;
649 }
650 
651 /*
652  * This test checks that memory.high is able to throttle big single shot
653  * allocation i.e. large allocation within one kernel entry.
654  */
655 static int test_memcg_high_sync(const char *root)
656 {
657 	int ret = KSFT_FAIL, pid, fd = -1;
658 	char *memcg;
659 	long pre_high, pre_max;
660 	long post_high, post_max;
661 
662 	memcg = cg_name(root, "memcg_test");
663 	if (!memcg)
664 		goto cleanup;
665 
666 	if (cg_create(memcg))
667 		goto cleanup;
668 
669 	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
670 	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
671 	if (pre_high < 0 || pre_max < 0)
672 		goto cleanup;
673 
674 	if (cg_write(memcg, "memory.swap.max", "0"))
675 		goto cleanup;
676 
677 	if (cg_write(memcg, "memory.high", "30M"))
678 		goto cleanup;
679 
680 	if (cg_write(memcg, "memory.max", "140M"))
681 		goto cleanup;
682 
683 	fd = memcg_prepare_for_wait(memcg);
684 	if (fd < 0)
685 		goto cleanup;
686 
687 	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
688 	if (pid < 0)
689 		goto cleanup;
690 
691 	cg_wait_for(fd);
692 
693 	post_high = cg_read_key_long(memcg, "memory.events", "high ");
694 	post_max = cg_read_key_long(memcg, "memory.events", "max ");
695 	if (post_high < 0 || post_max < 0)
696 		goto cleanup;
697 
698 	if (pre_high == post_high || pre_max != post_max)
699 		goto cleanup;
700 
701 	ret = KSFT_PASS;
702 
703 cleanup:
704 	if (fd >= 0)
705 		close(fd);
706 	cg_destroy(memcg);
707 	free(memcg);
708 
709 	return ret;
710 }
711 
712 /*
713  * This test checks that memory.max limits the amount of
714  * memory which can be consumed by either anonymous memory
715  * or pagecache.
716  */
717 static int test_memcg_max(const char *root)
718 {
719 	int ret = KSFT_FAIL;
720 	char *memcg;
721 	long current, max;
722 
723 	memcg = cg_name(root, "memcg_test");
724 	if (!memcg)
725 		goto cleanup;
726 
727 	if (cg_create(memcg))
728 		goto cleanup;
729 
730 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
731 		goto cleanup;
732 
733 	if (cg_write(memcg, "memory.swap.max", "0"))
734 		goto cleanup;
735 
736 	if (cg_write(memcg, "memory.max", "30M"))
737 		goto cleanup;
738 
739 	/* Should be killed by OOM killer */
740 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
741 		goto cleanup;
742 
743 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
744 		goto cleanup;
745 
746 	current = cg_read_long(memcg, "memory.current");
747 	if (current > MB(30) || !current)
748 		goto cleanup;
749 
750 	max = cg_read_key_long(memcg, "memory.events", "max ");
751 	if (max <= 0)
752 		goto cleanup;
753 
754 	ret = KSFT_PASS;
755 
756 cleanup:
757 	cg_destroy(memcg);
758 	free(memcg);
759 
760 	return ret;
761 }
762 
763 /*
764  * This test checks that memory.reclaim reclaims the given
765  * amount of memory (from both anon and file, if possible).
766  */
767 static int test_memcg_reclaim(const char *root)
768 {
769 	int ret = KSFT_FAIL, fd, retries;
770 	char *memcg;
771 	long current, expected_usage, to_reclaim;
772 	char buf[64];
773 
774 	memcg = cg_name(root, "memcg_test");
775 	if (!memcg)
776 		goto cleanup;
777 
778 	if (cg_create(memcg))
779 		goto cleanup;
780 
781 	current = cg_read_long(memcg, "memory.current");
782 	if (current != 0)
783 		goto cleanup;
784 
785 	fd = get_temp_fd();
786 	if (fd < 0)
787 		goto cleanup;
788 
789 	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
790 
791 	/*
792 	 * If swap is enabled, try to reclaim from both anon and file, else try
793 	 * to reclaim from file only.
794 	 */
795 	if (is_swap_enabled()) {
796 		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
797 		expected_usage = MB(100);
798 	} else
799 		expected_usage = MB(50);
800 
801 	/*
802 	 * Wait until current usage reaches the expected usage (or we run out of
803 	 * retries).
804 	 */
805 	retries = 5;
806 	while (!values_close(cg_read_long(memcg, "memory.current"),
807 			    expected_usage, 10)) {
808 		if (retries--) {
809 			sleep(1);
810 			continue;
811 		} else {
812 			fprintf(stderr,
813 				"failed to allocate %ld for memcg reclaim test\n",
814 				expected_usage);
815 			goto cleanup;
816 		}
817 	}
818 
819 	/*
820 	 * Reclaim until current reaches 30M, this makes sure we hit both anon
821 	 * and file if swap is enabled.
822 	 */
823 	retries = 5;
824 	while (true) {
825 		int err;
826 
827 		current = cg_read_long(memcg, "memory.current");
828 		to_reclaim = current - MB(30);
829 
830 		/*
831 		 * We only keep looping if we get EAGAIN, which means we could
832 		 * not reclaim the full amount.
833 		 */
834 		if (to_reclaim <= 0)
835 			goto cleanup;
836 
837 
838 		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
839 		err = cg_write(memcg, "memory.reclaim", buf);
840 		if (!err) {
841 			/*
842 			 * If writing succeeds, then the written amount should have been
843 			 * fully reclaimed (and maybe more).
844 			 */
845 			current = cg_read_long(memcg, "memory.current");
846 			if (!values_close(current, MB(30), 3) && current > MB(30))
847 				goto cleanup;
848 			break;
849 		}
850 
851 		/* The kernel could not reclaim the full amount, try again. */
852 		if (err == -EAGAIN && retries--)
853 			continue;
854 
855 		/* We got an unexpected error or ran out of retries. */
856 		goto cleanup;
857 	}
858 
859 	ret = KSFT_PASS;
860 cleanup:
861 	cg_destroy(memcg);
862 	free(memcg);
863 	close(fd);
864 
865 	return ret;
866 }
867 
868 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
869 {
870 	long mem_max = (long)arg;
871 	size_t size = MB(50);
872 	char *buf, *ptr;
873 	long mem_current, swap_current;
874 	int ret = -1;
875 
876 	buf = malloc(size);
877 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
878 		*ptr = 0;
879 
880 	mem_current = cg_read_long(cgroup, "memory.current");
881 	if (!mem_current || !values_close(mem_current, mem_max, 3))
882 		goto cleanup;
883 
884 	swap_current = cg_read_long(cgroup, "memory.swap.current");
885 	if (!swap_current ||
886 	    !values_close(mem_current + swap_current, size, 3))
887 		goto cleanup;
888 
889 	ret = 0;
890 cleanup:
891 	free(buf);
892 	return ret;
893 }
894 
895 /*
896  * This test checks that memory.swap.max limits the amount of
897  * anonymous memory which can be swapped out.
898  */
899 static int test_memcg_swap_max(const char *root)
900 {
901 	int ret = KSFT_FAIL;
902 	char *memcg;
903 	long max;
904 
905 	if (!is_swap_enabled())
906 		return KSFT_SKIP;
907 
908 	memcg = cg_name(root, "memcg_test");
909 	if (!memcg)
910 		goto cleanup;
911 
912 	if (cg_create(memcg))
913 		goto cleanup;
914 
915 	if (cg_read_long(memcg, "memory.swap.current")) {
916 		ret = KSFT_SKIP;
917 		goto cleanup;
918 	}
919 
920 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
921 		goto cleanup;
922 
923 	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
924 		goto cleanup;
925 
926 	if (cg_write(memcg, "memory.swap.max", "30M"))
927 		goto cleanup;
928 
929 	if (cg_write(memcg, "memory.max", "30M"))
930 		goto cleanup;
931 
932 	/* Should be killed by OOM killer */
933 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
934 		goto cleanup;
935 
936 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
937 		goto cleanup;
938 
939 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
940 		goto cleanup;
941 
942 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
943 		goto cleanup;
944 
945 	max = cg_read_key_long(memcg, "memory.events", "max ");
946 	if (max <= 0)
947 		goto cleanup;
948 
949 	ret = KSFT_PASS;
950 
951 cleanup:
952 	cg_destroy(memcg);
953 	free(memcg);
954 
955 	return ret;
956 }
957 
958 /*
959  * This test disables swapping and tries to allocate anonymous memory
960  * up to OOM. Then it checks for oom and oom_kill events in
961  * memory.events.
962  */
963 static int test_memcg_oom_events(const char *root)
964 {
965 	int ret = KSFT_FAIL;
966 	char *memcg;
967 
968 	memcg = cg_name(root, "memcg_test");
969 	if (!memcg)
970 		goto cleanup;
971 
972 	if (cg_create(memcg))
973 		goto cleanup;
974 
975 	if (cg_write(memcg, "memory.max", "30M"))
976 		goto cleanup;
977 
978 	if (cg_write(memcg, "memory.swap.max", "0"))
979 		goto cleanup;
980 
981 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
982 		goto cleanup;
983 
984 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
985 		goto cleanup;
986 
987 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
988 		goto cleanup;
989 
990 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
991 		goto cleanup;
992 
993 	ret = KSFT_PASS;
994 
995 cleanup:
996 	cg_destroy(memcg);
997 	free(memcg);
998 
999 	return ret;
1000 }
1001 
1002 struct tcp_server_args {
1003 	unsigned short port;
1004 	int ctl[2];
1005 };
1006 
1007 static int tcp_server(const char *cgroup, void *arg)
1008 {
1009 	struct tcp_server_args *srv_args = arg;
1010 	struct sockaddr_in6 saddr = { 0 };
1011 	socklen_t slen = sizeof(saddr);
1012 	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
1013 
1014 	close(srv_args->ctl[0]);
1015 	ctl_fd = srv_args->ctl[1];
1016 
1017 	saddr.sin6_family = AF_INET6;
1018 	saddr.sin6_addr = in6addr_any;
1019 	saddr.sin6_port = htons(srv_args->port);
1020 
1021 	sk = socket(AF_INET6, SOCK_STREAM, 0);
1022 	if (sk < 0)
1023 		return ret;
1024 
1025 	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
1026 		goto cleanup;
1027 
1028 	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
1029 		write(ctl_fd, &errno, sizeof(errno));
1030 		goto cleanup;
1031 	}
1032 
1033 	if (listen(sk, 1))
1034 		goto cleanup;
1035 
1036 	ret = 0;
1037 	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
1038 		ret = -1;
1039 		goto cleanup;
1040 	}
1041 
1042 	client_sk = accept(sk, NULL, NULL);
1043 	if (client_sk < 0)
1044 		goto cleanup;
1045 
1046 	ret = -1;
1047 	for (;;) {
1048 		uint8_t buf[0x100000];
1049 
1050 		if (write(client_sk, buf, sizeof(buf)) <= 0) {
1051 			if (errno == ECONNRESET)
1052 				ret = 0;
1053 			break;
1054 		}
1055 	}
1056 
1057 	close(client_sk);
1058 
1059 cleanup:
1060 	close(sk);
1061 	return ret;
1062 }
1063 
1064 static int tcp_client(const char *cgroup, unsigned short port)
1065 {
1066 	const char server[] = "localhost";
1067 	struct addrinfo *ai;
1068 	char servport[6];
1069 	int retries = 0x10; /* nice round number */
1070 	int sk, ret;
1071 
1072 	snprintf(servport, sizeof(servport), "%hd", port);
1073 	ret = getaddrinfo(server, servport, NULL, &ai);
1074 	if (ret)
1075 		return ret;
1076 
1077 	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
1078 	if (sk < 0)
1079 		goto free_ainfo;
1080 
1081 	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
1082 	if (ret < 0)
1083 		goto close_sk;
1084 
1085 	ret = KSFT_FAIL;
1086 	while (retries--) {
1087 		uint8_t buf[0x100000];
1088 		long current, sock;
1089 
1090 		if (read(sk, buf, sizeof(buf)) <= 0)
1091 			goto close_sk;
1092 
1093 		current = cg_read_long(cgroup, "memory.current");
1094 		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");
1095 
1096 		if (current < 0 || sock < 0)
1097 			goto close_sk;
1098 
1099 		if (current < sock)
1100 			goto close_sk;
1101 
1102 		if (values_close(current, sock, 10)) {
1103 			ret = KSFT_PASS;
1104 			break;
1105 		}
1106 	}
1107 
1108 close_sk:
1109 	close(sk);
1110 free_ainfo:
1111 	freeaddrinfo(ai);
1112 	return ret;
1113 }
1114 
1115 /*
1116  * This test checks socket memory accounting.
1117  * The test forks a TCP server listens on a random port between 1000
1118  * and 61000. Once it gets a client connection, it starts writing to
1119  * its socket.
1120  * The TCP client interleaves reads from the socket with check whether
1121  * memory.current and memory.stat.sock are similar.
1122  */
1123 static int test_memcg_sock(const char *root)
1124 {
1125 	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
1126 	unsigned short port;
1127 	char *memcg;
1128 
1129 	memcg = cg_name(root, "memcg_test");
1130 	if (!memcg)
1131 		goto cleanup;
1132 
1133 	if (cg_create(memcg))
1134 		goto cleanup;
1135 
1136 	while (bind_retries--) {
1137 		struct tcp_server_args args;
1138 
1139 		if (pipe(args.ctl))
1140 			goto cleanup;
1141 
1142 		port = args.port = 1000 + rand() % 60000;
1143 
1144 		pid = cg_run_nowait(memcg, tcp_server, &args);
1145 		if (pid < 0)
1146 			goto cleanup;
1147 
1148 		close(args.ctl[1]);
1149 		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
1150 			goto cleanup;
1151 		close(args.ctl[0]);
1152 
1153 		if (!err)
1154 			break;
1155 		if (err != EADDRINUSE)
1156 			goto cleanup;
1157 
1158 		waitpid(pid, NULL, 0);
1159 	}
1160 
1161 	if (err == EADDRINUSE) {
1162 		ret = KSFT_SKIP;
1163 		goto cleanup;
1164 	}
1165 
1166 	if (tcp_client(memcg, port) != KSFT_PASS)
1167 		goto cleanup;
1168 
1169 	waitpid(pid, &err, 0);
1170 	if (WEXITSTATUS(err))
1171 		goto cleanup;
1172 
1173 	if (cg_read_long(memcg, "memory.current") < 0)
1174 		goto cleanup;
1175 
1176 	if (cg_read_key_long(memcg, "memory.stat", "sock "))
1177 		goto cleanup;
1178 
1179 	ret = KSFT_PASS;
1180 
1181 cleanup:
1182 	cg_destroy(memcg);
1183 	free(memcg);
1184 
1185 	return ret;
1186 }
1187 
1188 /*
1189  * This test disables swapping and tries to allocate anonymous memory
1190  * up to OOM with memory.group.oom set. Then it checks that all
1191  * processes in the leaf were killed. It also checks that oom_events
1192  * were propagated to the parent level.
1193  */
1194 static int test_memcg_oom_group_leaf_events(const char *root)
1195 {
1196 	int ret = KSFT_FAIL;
1197 	char *parent, *child;
1198 
1199 	parent = cg_name(root, "memcg_test_0");
1200 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1201 
1202 	if (!parent || !child)
1203 		goto cleanup;
1204 
1205 	if (cg_create(parent))
1206 		goto cleanup;
1207 
1208 	if (cg_create(child))
1209 		goto cleanup;
1210 
1211 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1212 		goto cleanup;
1213 
1214 	if (cg_write(child, "memory.max", "50M"))
1215 		goto cleanup;
1216 
1217 	if (cg_write(child, "memory.swap.max", "0"))
1218 		goto cleanup;
1219 
1220 	if (cg_write(child, "memory.oom.group", "1"))
1221 		goto cleanup;
1222 
1223 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1224 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1225 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1226 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1227 		goto cleanup;
1228 
1229 	if (cg_test_proc_killed(child))
1230 		goto cleanup;
1231 
1232 	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1233 		goto cleanup;
1234 
1235 	if (cg_read_key_long(parent, "memory.events", "oom_kill ") <= 0)
1236 		goto cleanup;
1237 
1238 	ret = KSFT_PASS;
1239 
1240 cleanup:
1241 	if (child)
1242 		cg_destroy(child);
1243 	if (parent)
1244 		cg_destroy(parent);
1245 	free(child);
1246 	free(parent);
1247 
1248 	return ret;
1249 }
1250 
1251 /*
1252  * This test disables swapping and tries to allocate anonymous memory
1253  * up to OOM with memory.group.oom set. Then it checks that all
1254  * processes in the parent and leaf were killed.
1255  */
1256 static int test_memcg_oom_group_parent_events(const char *root)
1257 {
1258 	int ret = KSFT_FAIL;
1259 	char *parent, *child;
1260 
1261 	parent = cg_name(root, "memcg_test_0");
1262 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1263 
1264 	if (!parent || !child)
1265 		goto cleanup;
1266 
1267 	if (cg_create(parent))
1268 		goto cleanup;
1269 
1270 	if (cg_create(child))
1271 		goto cleanup;
1272 
1273 	if (cg_write(parent, "memory.max", "80M"))
1274 		goto cleanup;
1275 
1276 	if (cg_write(parent, "memory.swap.max", "0"))
1277 		goto cleanup;
1278 
1279 	if (cg_write(parent, "memory.oom.group", "1"))
1280 		goto cleanup;
1281 
1282 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1283 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1284 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1285 
1286 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1287 		goto cleanup;
1288 
1289 	if (cg_test_proc_killed(child))
1290 		goto cleanup;
1291 	if (cg_test_proc_killed(parent))
1292 		goto cleanup;
1293 
1294 	ret = KSFT_PASS;
1295 
1296 cleanup:
1297 	if (child)
1298 		cg_destroy(child);
1299 	if (parent)
1300 		cg_destroy(parent);
1301 	free(child);
1302 	free(parent);
1303 
1304 	return ret;
1305 }
1306 
1307 /*
1308  * This test disables swapping and tries to allocate anonymous memory
1309  * up to OOM with memory.group.oom set. Then it checks that all
1310  * processes were killed except those set with OOM_SCORE_ADJ_MIN
1311  */
1312 static int test_memcg_oom_group_score_events(const char *root)
1313 {
1314 	int ret = KSFT_FAIL;
1315 	char *memcg;
1316 	int safe_pid;
1317 
1318 	memcg = cg_name(root, "memcg_test_0");
1319 
1320 	if (!memcg)
1321 		goto cleanup;
1322 
1323 	if (cg_create(memcg))
1324 		goto cleanup;
1325 
1326 	if (cg_write(memcg, "memory.max", "50M"))
1327 		goto cleanup;
1328 
1329 	if (cg_write(memcg, "memory.swap.max", "0"))
1330 		goto cleanup;
1331 
1332 	if (cg_write(memcg, "memory.oom.group", "1"))
1333 		goto cleanup;
1334 
1335 	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1336 	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1337 		goto cleanup;
1338 
1339 	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1340 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1341 		goto cleanup;
1342 
1343 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1344 		goto cleanup;
1345 
1346 	if (kill(safe_pid, SIGKILL))
1347 		goto cleanup;
1348 
1349 	ret = KSFT_PASS;
1350 
1351 cleanup:
1352 	if (memcg)
1353 		cg_destroy(memcg);
1354 	free(memcg);
1355 
1356 	return ret;
1357 }
1358 
1359 
1360 #define T(x) { x, #x }
1361 struct memcg_test {
1362 	int (*fn)(const char *root);
1363 	const char *name;
1364 } tests[] = {
1365 	T(test_memcg_subtree_control),
1366 	T(test_memcg_current),
1367 	T(test_memcg_min),
1368 	T(test_memcg_low),
1369 	T(test_memcg_high),
1370 	T(test_memcg_high_sync),
1371 	T(test_memcg_max),
1372 	T(test_memcg_reclaim),
1373 	T(test_memcg_oom_events),
1374 	T(test_memcg_swap_max),
1375 	T(test_memcg_sock),
1376 	T(test_memcg_oom_group_leaf_events),
1377 	T(test_memcg_oom_group_parent_events),
1378 	T(test_memcg_oom_group_score_events),
1379 };
1380 #undef T
1381 
1382 int main(int argc, char **argv)
1383 {
1384 	char root[PATH_MAX];
1385 	int i, ret = EXIT_SUCCESS;
1386 
1387 	if (cg_find_unified_root(root, sizeof(root)))
1388 		ksft_exit_skip("cgroup v2 isn't mounted\n");
1389 
1390 	/*
1391 	 * Check that memory controller is available:
1392 	 * memory is listed in cgroup.controllers
1393 	 */
1394 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1395 		ksft_exit_skip("memory controller isn't available\n");
1396 
1397 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1398 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
1399 			ksft_exit_skip("Failed to set memory controller\n");
1400 
1401 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
1402 		switch (tests[i].fn(root)) {
1403 		case KSFT_PASS:
1404 			ksft_test_result_pass("%s\n", tests[i].name);
1405 			break;
1406 		case KSFT_SKIP:
1407 			ksft_test_result_skip("%s\n", tests[i].name);
1408 			break;
1409 		default:
1410 			ret = EXIT_FAILURE;
1411 			ksft_test_result_fail("%s\n", tests[i].name);
1412 			break;
1413 		}
1414 	}
1415 
1416 	return ret;
1417 }
1418