/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE

#include <linux/limits.h>
#include <linux/oom.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
#include <sys/mman.h>

#include "../kselftest.h"
#include "cgroup_util.h"

static bool has_localevents;
static bool has_recursiveprot;
/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}
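
/*
 * For reference, the manual equivalent of the check above from a shell
 * (a sketch, assuming cgroup2 is mounted at /sys/fs/cgroup):
 *
 *   mkdir /sys/fs/cgroup/parent
 *   echo "+memory" > /sys/fs/cgroup/parent/cgroup.subtree_control
 *   mkdir /sys/fs/cgroup/parent/child
 *   grep memory /sys/fs/cgroup/parent/child/cgroup.controllers
 *
 * Without the subtree_control write, the child's cgroup.controllers
 * file would not list "memory".
 */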

static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	char *buf, *ptr;
	long anon, current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	current = cg_read_long(cgroup, "memory.current");
	if (current < (long)size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}
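
/*
 * Note on tolerances: values_close(a, b, err) (from cgroup_util.c)
 * compares the two values with a tolerance expressed in percent, so the
 * "3" above allows roughly a 3% deviation between the allocation size
 * and the charged memory, leaving headroom for kernel-internal overhead
 * charged to the same cgroup.
 */
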
123 
124 static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
125 {
126 	size_t size = MB(50);
127 	int ret = -1;
128 	long current, file;
129 	int fd;
130 
131 	fd = get_temp_fd();
132 	if (fd < 0)
133 		return -1;
134 
135 	if (alloc_pagecache(fd, size))
136 		goto cleanup;
137 
138 	current = cg_read_long(cgroup, "memory.current");
139 	if (current < size)
140 		goto cleanup;
141 
142 	file = cg_read_key_long(cgroup, "memory.stat", "file ");
143 	if (file < 0)
144 		goto cleanup;
145 
146 	if (!values_close(file, current, 10))
147 		goto cleanup;
148 
149 	ret = 0;
150 
151 cleanup:
152 	close(fd);
153 	return ret;
154 }
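
/*
 * The trailing space in keys like "anon " and "file " above matters:
 * cg_read_key_long() (see cgroup_util.c) locates the key in memory.stat
 * with a substring search, and the space keeps "anon " from also
 * matching longer stat names such as "anon_thp".
 */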

/*
 * This test creates a memory cgroup, allocates
 * some anonymous memory and some pagecache,
 * and checks memory.current and some memory.stat values.
 */
static int test_memcg_current(const char *root)
{
	int ret = KSFT_FAIL;
	long current;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	while (getppid() == ppid)
		sleep(1);

	return 0;
}

static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int ppid = getppid();
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	while (getppid() == ppid)
		sleep(1);

	free(buf);
	return 0;
}
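
/*
 * The *_noexit helpers above deliberately keep running: the child spins
 * until its original parent exits (detected via getppid() changing on
 * reparenting), so the charged memory stays accounted to the cgroup for
 * the duration of the test. Any survivors are killed later when
 * cg_destroy() tears the cgroup down.
 */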

/*
 * Wait until processes are killed asynchronously by the OOM killer.
 * If we exceed a 1-second timeout (10 polls, 100ms apart), fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int limit;

	for (limit = 10; limit > 0; limit--) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
	}
	return -1;
}

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and applies significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M
 * A/B/D  memory.current ~= 21M
 * A/B/E  memory.current ~= 0
 * A/B/F  memory.current  = 0
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 10))
		goto cleanup;

	if (!values_close(c[1], MB(21), 10))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevented the allocation of anon memory\n");
		goto cleanup;
	}

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		/*
		 * Child 2 has memory.low=0, but some low protection may still
		 * be distributed down from its parent with memory.low=50M if
		 * the memory_recursiveprot mount option is enabled. Ignore
		 * the low event count in this case.
		 */
		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i == ignore_low_events_index)
			continue;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;
	}

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
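
/*
 * A rough sanity check of the expected numbers above: B's 50M
 * protection is overcommitted by its children's protected usage
 * (min(75M, 50M) + min(25M, 50M) = 75M), so it is distributed
 * proportionally. A naive proportional split would give C about
 * 50M * 50/75 ~= 33M and D about 50M * 25/75 ~= 17M; the slightly
 * different targets of ~29M and ~21M used by the test come from
 * iterating the reclaim process, which is what the memcg_protection.m
 * model computes.
 */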

static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}

static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}

static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, high, max;
	int fd;

	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (!values_close(current, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}

/*
 * This test checks that memory.high is able to throttle a big single-shot
 * allocation, i.e. a large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
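
/*
 * memcg_prepare_for_wait()/cg_wait_for() (see cgroup_util.c) block
 * until the cgroup's memory.events file changes (an inotify-based wait,
 * as implemented there at the time of writing), so the test above
 * resumes as soon as the throttled allocation generates a "high" event
 * instead of polling the file in a loop.
 */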

/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL, fd = -1, retries;
	char *memcg;
	long current, expected_usage, to_reclaim;
	char buf[64];

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M, this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	retries = 5;
	while (true) {
		int err;

		current = cg_read_long(memcg, "memory.current");
		to_reclaim = current - MB(30);

		/*
		 * We only keep looping if we get EAGAIN, which means we could
		 * not reclaim the full amount.
		 */
		if (to_reclaim <= 0)
			goto cleanup;

		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
		err = cg_write(memcg, "memory.reclaim", buf);
		if (!err) {
			/*
			 * If writing succeeds, then the written amount should have been
			 * fully reclaimed (and maybe more).
			 */
			current = cg_read_long(memcg, "memory.current");
			if (!values_close(current, MB(30), 3) && current > MB(30))
				goto cleanup;
			break;
		}

		/* The kernel could not reclaim the full amount, try again. */
		if (err == -EAGAIN && retries--)
			continue;

		/* We got an unexpected error or ran out of retries. */
		goto cleanup;
	}

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	if (fd >= 0)
		close(fd);

	return ret;
}
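
/*
 * The memory.reclaim interface exercised above is the proactive reclaim
 * knob: writing a byte count asks the kernel to reclaim that much from
 * the cgroup, and the write fails with EAGAIN when the full amount
 * could not be reclaimed. From a shell (a sketch, assuming the cgroup
 * already has charged memory):
 *
 *   echo "10M" > memory.reclaim
 */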

static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	long mem_max = (long)arg;
	size_t size = MB(50);
	char *buf, *ptr;
	long mem_current, swap_current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	mem_current = cg_read_long(cgroup, "memory.current");
	if (!mem_current || !values_close(mem_current, mem_max, 3))
		goto cleanup;

	swap_current = cg_read_long(cgroup, "memory.swap.current");
	if (!swap_current ||
	    !values_close(mem_current + swap_current, size, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out.
 */
static int test_memcg_swap_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max;

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
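
/*
 * A worked example of the v2 split accounting exercised above: with
 * memory.max = 30M and memory.swap.max = 30M, a 50M anonymous
 * allocation can succeed (up to 30M resident plus the rest swapped
 * out), while 100M exceeds memory.max and memory.swap.max combined and
 * triggers the OOM killer. Unlike cgroup v1's memsw limit, v2 caps
 * swap separately from memory.
 */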

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};

static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}

static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;

	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		if (values_close(current, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}

/*
 * This test checks socket memory accounting.
 * The test forks a TCP server that listens on a random port between 1000
 * and 61000. Once it gets a client connection, it starts writing to
 * its socket.
 * The TCP client interleaves reads from the socket with checks that
 * memory.current and the "sock" entry in memory.stat stay close to
 * each other.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
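
/*
 * For reference, the "sock" key in memory.stat reports the amount of
 * memory used for network transmission buffers, which is why the test
 * expects it to track memory.current while the TCP stream is the only
 * consumer, and to drop back to zero once both ends have exited.
 */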

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom_kill
 * events were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its child groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}
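
/*
 * For reference, memory.oom.group is the cgroup v2 knob that makes the
 * OOM killer treat the cgroup as a single unit (a sketch, from a
 * shell):
 *
 *   echo 1 > memory.oom.group
 *
 * With it set, an OOM kill in the cgroup takes down all of its tasks
 * (except those with oom_score_adj set to -1000) instead of a single
 * victim, which is what this and the following two tests verify.
 */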

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the parent and leaf were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN.
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}

#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root)))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}