/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE

#include <linux/limits.h>
#include <linux/oom.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
#include <sys/mman.h>

#include "../kselftest.h"
#include "cgroup_util.h"

static bool has_localevents;
static bool has_recursiveprot;

/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}

static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	char *buf, *ptr;
	long anon, current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}
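	/*
	 * Touch every page: malloc() only reserves virtual address space,
	 * and only faulted-in pages are charged to the cgroup.
	 */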
	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test creates a memory cgroup, allocates
 * some anonymous memory and some pagecache,
 * and checks memory.current and some memory.stat values.
 */
static int test_memcg_current(const char *root)
{
	int ret = KSFT_FAIL;
	long current;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

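	/*
	 * Keep a running process in the cgroup: loop until the original
	 * parent (the test itself) exits and we get reparented.
	 */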
	while (getppid() == ppid)
		sleep(1);

	return 0;
}

static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int ppid = getppid();
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}
	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	while (getppid() == ppid)
		sleep(1);

	free(buf);
	return 0;
}

/*
 * Wait until processes are killed asynchronously by the OOM killer.
 * If we exceed a timeout, fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int limit;

	for (limit = 10; limit > 0; limit--) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
	}
	return -1;
}

static bool reclaim_until(const char *memcg, long goal);

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * All usages are page cache, and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and generates significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M
 * A/B/D  memory.current ~= 21M
 * A/B/E  memory.current ~= 0
 * A/B/F  memory.current  = 0
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than the remaining
 * unprotected memory in A, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 *
 * Then we try to reclaim from A/B/C using memory.reclaim until its
 * usage reaches 10M.
 * This makes sure that:
 * (a) We ignore the protection of the reclaim target memcg.
 * (b) The previously calculated emin value (~29M) should be dismissed.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

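		/*
		 * Run an allocator only in C, D and E (indexes 0-2); F
		 * (index 3) stays empty, so its huge protection value is
		 * never backed by actual usage.
		 */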
		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

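	/* Wait for the three 50M page cache allocations above to settle. */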
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

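	/*
	 * Generate memory pressure in A by allocating anon memory in the
	 * unprotected sibling cgroup (parent[2], A/G in the diagram
	 * above): 148M of anon plus ~150M of page cache exceed A's 200M
	 * limit.
	 */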
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 10))
		goto cleanup;

	if (!values_close(c[1], MB(21), 10))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents allocating anon memory\n");
		goto cleanup;
	}

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

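	/*
	 * Only C and D (indexes 0 and 1) were reclaimed below their
	 * memory.low and must report low events; E is unprotected and F
	 * has no usage, so they must not.
	 */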
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;
	}

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}

static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}

static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}

static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, high, max;
	int fd;

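	/*
	 * This helper is shared by the memory.high and memory.max tests:
	 * one of the two limits is expected to be set to 30M.
	 */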
	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (!values_close(current, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

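	/*
	 * memory.high is not a hard limit, so the allocation itself must
	 * succeed, but reclaim keeps page cache usage near 30M; the
	 * 50M-usage check below is therefore expected to fail.
	 */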
	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}

/*
 * This test checks that memory.high is able to throttle a big single-shot
 * allocation, i.e. a large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

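	/*
	 * Wait for the memory.events watch armed by
	 * memcg_prepare_for_wait() to fire (see cgroup_util.c), i.e. for
	 * the allocation above to trigger a high event.
	 */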
	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.
 *
 * This function will return false if the usage is already below the
 * goal.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 *
 * This function makes sure memory.reclaim is sane. It will return
 * false if memory.reclaim's error codes do not make sense, even if
 * the usage goal was satisfied.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	char buf[64];
	int retries, err;
	long current, to_reclaim;
	bool reclaimed = false;

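	/*
	 * Several iterations may be needed: memory.reclaim returns
	 * -EAGAIN when it cannot reclaim the full requested amount, in
	 * which case we recompute the remaining delta and retry.
	 */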
	for (retries = 5; retries > 0; retries--) {
		current = cg_read_long(memcg, "memory.current");

		if (current < goal || values_close(current, goal, 3))
			break;
		/* Did memory.reclaim return 0 incorrectly? */
		else if (reclaimed)
			return false;

		to_reclaim = current - goal;
		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
		err = cg_write(memcg, "memory.reclaim", buf);
		if (!err)
			reclaimed = true;
		else if (err != -EAGAIN)
			return false;
	}
	return reclaimed;
}

/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL, fd = -1, retries;
	char *memcg;
	long current, expected_usage;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else {
		expected_usage = MB(50);
	}

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M, this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	if (!reclaim_until(memcg, MB(30)))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	if (fd >= 0)
		close(fd);

	return ret;
}

static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	long mem_max = (long)arg;
	size_t size = MB(50);
	char *buf, *ptr;
	long mem_current, swap_current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}
	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	mem_current = cg_read_long(cgroup, "memory.current");
	if (!mem_current || !values_close(mem_current, mem_max, 3))
		goto cleanup;

	swap_current = cg_read_long(cgroup, "memory.swap.current");
	if (!swap_current ||
	    !values_close(mem_current + swap_current, size, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out.
 */
static int test_memcg_swap_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max;

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};
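
/*
 * Protocol on the ctl pipe: the server writes errno if bind() fails
 * (so the caller can retry with another port on EADDRINUSE) and 0 once
 * it is listening.
 */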

static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}

static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;

	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		if (values_close(current, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}

/*
 * This test checks socket memory accounting.
 * The test forks a TCP server that listens on a random port between 1000
 * and 61000. Once it gets a client connection, it starts writing to
 * its socket.
 * The TCP client interleaves reads from the socket with checks that
 * memory.current and memory.stat's sock counter stay close to each
 * other.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

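		/*
		 * Pick a pseudo-random port in [1000, 60999]; on
		 * EADDRINUSE the server is reaped and we retry with a
		 * new port, up to bind_retries times.
		 */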
		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the leaf were killed. It also checks that OOM events
 * were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

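	/*
	 * Park 60M in the parent and two small runners in the child, then
	 * trigger an OOM in the child with an oversized 100M allocation.
	 */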
	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the parent and leaf were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN.
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}

#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root)))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}