xref: /openbmc/linux/tools/testing/selftests/cgroup/test_memcontrol.c (revision e6b9d8eddb1772d99a676a906d42865293934edd)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/socket.h>
14 #include <sys/wait.h>
15 #include <arpa/inet.h>
16 #include <netinet/in.h>
17 #include <netdb.h>
18 #include <errno.h>
19 #include <sys/mman.h>
20 
21 #include "../kselftest.h"
22 #include "cgroup_util.h"
23 
24 static bool has_localevents;
25 static bool has_recursiveprot;
26 
27 /*
28  * This test creates two nested cgroups with and without enabling
29  * the memory controller.
30  */
31 static int test_memcg_subtree_control(const char *root)
32 {
33 	char *parent, *child, *parent2 = NULL, *child2 = NULL;
34 	int ret = KSFT_FAIL;
35 	char buf[PAGE_SIZE];
36 
37 	/* Create two nested cgroups with the memory controller enabled */
38 	parent = cg_name(root, "memcg_test_0");
39 	child = cg_name(root, "memcg_test_0/memcg_test_1");
40 	if (!parent || !child)
41 		goto cleanup_free;
42 
43 	if (cg_create(parent))
44 		goto cleanup_free;
45 
46 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
47 		goto cleanup_parent;
48 
49 	if (cg_create(child))
50 		goto cleanup_parent;
51 
52 	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
53 		goto cleanup_child;
54 
55 	/* Create two nested cgroups without enabling memory controller */
56 	parent2 = cg_name(root, "memcg_test_1");
57 	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
58 	if (!parent2 || !child2)
59 		goto cleanup_free2;
60 
61 	if (cg_create(parent2))
62 		goto cleanup_free2;
63 
64 	if (cg_create(child2))
65 		goto cleanup_parent2;
66 
67 	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
68 		goto cleanup_all;
69 
70 	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
71 		goto cleanup_all;
72 
73 	ret = KSFT_PASS;
74 
75 cleanup_all:
76 	cg_destroy(child2);
77 cleanup_parent2:
78 	cg_destroy(parent2);
79 cleanup_free2:
80 	free(parent2);
81 	free(child2);
82 cleanup_child:
83 	cg_destroy(child);
84 cleanup_parent:
85 	cg_destroy(parent);
86 cleanup_free:
87 	free(parent);
88 	free(child);
89 
90 	return ret;
91 }
92 
93 static int alloc_anon_50M_check(const char *cgroup, void *arg)
94 {
95 	size_t size = MB(50);
96 	char *buf, *ptr;
97 	long anon, current;
98 	int ret = -1;
99 
100 	buf = malloc(size);
101 	if (buf == NULL) {
102 		fprintf(stderr, "malloc() failed\n");
103 		return -1;
104 	}
105 
106 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
107 		*ptr = 0;
108 
109 	current = cg_read_long(cgroup, "memory.current");
110 	if (current < size)
111 		goto cleanup;
112 
113 	if (!values_close(size, current, 3))
114 		goto cleanup;
115 
116 	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
117 	if (anon < 0)
118 		goto cleanup;
119 
120 	if (!values_close(anon, current, 3))
121 		goto cleanup;
122 
123 	ret = 0;
124 cleanup:
125 	free(buf);
126 	return ret;
127 }
128 
/*
 * Callback for cg_run(): fill 50M of pagecache via alloc_pagecache() on
 * a descriptor obtained from get_temp_fd(), then compare memory.current
 * with the "file " entry of memory.stat.
 *
 * @cgroup: cgroup whose counters are read.
 * @arg:    unused.
 *
 * Returns 0 when the counters agree, -1 otherwise.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	long usage, file_bytes;
	int status = -1;
	int tmp_fd;

	tmp_fd = get_temp_fd();
	if (tmp_fd < 0)
		return -1;

	if (alloc_pagecache(tmp_fd, size) != 0)
		goto out;

	usage = cg_read_long(cgroup, "memory.current");
	if (usage < size)
		goto out;

	file_bytes = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file_bytes < 0)
		goto out;

	if (values_close(file_bytes, usage, 10))
		status = 0;

out:
	close(tmp_fd);
	return status;
}
160 
161 /*
162  * This test create a memory cgroup, allocates
163  * some anonymous memory and some pagecache
164  * and check memory.current and some memory.stat values.
165  */
166 static int test_memcg_current(const char *root)
167 {
168 	int ret = KSFT_FAIL;
169 	long current;
170 	char *memcg;
171 
172 	memcg = cg_name(root, "memcg_test");
173 	if (!memcg)
174 		goto cleanup;
175 
176 	if (cg_create(memcg))
177 		goto cleanup;
178 
179 	current = cg_read_long(memcg, "memory.current");
180 	if (current != 0)
181 		goto cleanup;
182 
183 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
184 		goto cleanup;
185 
186 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
187 		goto cleanup;
188 
189 	ret = KSFT_PASS;
190 
191 cleanup:
192 	cg_destroy(memcg);
193 	free(memcg);
194 
195 	return ret;
196 }
197 
/*
 * Callback for cg_run_nowait(): fill 50M of pagecache on the descriptor
 * passed through @arg, then keep the process alive until its parent pid
 * changes.
 *
 * Returns -1 if alloc_pagecache() fails, 0 once the parent has changed.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	const int parent_pid = getppid();
	const int fd = (long)arg;

	if (alloc_pagecache(fd, MB(50)) != 0)
		return -1;

	/* Poll once per second for a change of parent */
	for (;;) {
		if (getppid() != parent_pid)
			break;
		sleep(1);
	}

	return 0;
}
211 
212 static int alloc_anon_noexit(const char *cgroup, void *arg)
213 {
214 	int ppid = getppid();
215 	size_t size = (unsigned long)arg;
216 	char *buf, *ptr;
217 
218 	buf = malloc(size);
219 	if (buf == NULL) {
220 		fprintf(stderr, "malloc() failed\n");
221 		return -1;
222 	}
223 
224 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
225 		*ptr = 0;
226 
227 	while (getppid() == ppid)
228 		sleep(1);
229 
230 	free(buf);
231 	return 0;
232 }
233 
/*
 * Processes are killed asynchronously by the OOM killer, so poll
 * cgroup.procs until it reads back empty.
 *
 * Polls at most 10 times, sleeping 100ms after each unsuccessful poll.
 * Returns 0 as soon as the cgroup is empty, -1 on timeout.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int polls_left = 10;

	while (polls_left > 0) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
		polls_left--;
	}

	return -1;
}
250 
251 static bool reclaim_until(const char *memcg, long goal);
252 
253 /*
254  * First, this test creates the following hierarchy:
255  * A       memory.min = 0,    memory.max = 200M
256  * A/B     memory.min = 50M
257  * A/B/C   memory.min = 75M,  memory.current = 50M
258  * A/B/D   memory.min = 25M,  memory.current = 50M
259  * A/B/E   memory.min = 0,    memory.current = 50M
260  * A/B/F   memory.min = 500M, memory.current = 0
261  *
262  * (or memory.low if we test soft protection)
263  *
264  * Usages are pagecache and the test keeps a running
265  * process in every leaf cgroup.
266  * Then it creates A/G and creates a significant
267  * memory pressure in A.
268  *
269  * Then it checks actual memory usages and expects that:
270  * A/B    memory.current ~= 50M
271  * A/B/C  memory.current ~= 29M
272  * A/B/D  memory.current ~= 21M
273  * A/B/E  memory.current ~= 0
274  * A/B/F  memory.current  = 0
275  * (for origin of the numbers, see model in memcg_protection.m.)
276  *
277  * After that it tries to allocate more than there is
278  * unprotected memory in A available, and checks that:
279  * a) memory.min protects pagecache even in this case,
280  * b) memory.low allows reclaiming page cache with low events.
281  *
282  * Then we try to reclaim from A/B/C using memory.reclaim until its
283  * usage reaches 10M.
284  * This makes sure that:
285  * (a) We ignore the protection of the reclaim target memcg.
286  * (b) The previously calculated emin value (~29M) should be dismissed.
287  */
288 static int test_memcg_protection(const char *root, bool min)
289 {
290 	int ret = KSFT_FAIL, rc;
291 	char *parent[3] = {NULL};
292 	char *children[4] = {NULL};
293 	const char *attribute = min ? "memory.min" : "memory.low";
294 	long c[4];
295 	int i, attempts;
296 	int fd;
297 
298 	fd = get_temp_fd();
299 	if (fd < 0)
300 		goto cleanup;
301 
302 	parent[0] = cg_name(root, "memcg_test_0");
303 	if (!parent[0])
304 		goto cleanup;
305 
306 	parent[1] = cg_name(parent[0], "memcg_test_1");
307 	if (!parent[1])
308 		goto cleanup;
309 
310 	parent[2] = cg_name(parent[0], "memcg_test_2");
311 	if (!parent[2])
312 		goto cleanup;
313 
314 	if (cg_create(parent[0]))
315 		goto cleanup;
316 
317 	if (cg_read_long(parent[0], attribute)) {
318 		/* No memory.min on older kernels is fine */
319 		if (min)
320 			ret = KSFT_SKIP;
321 		goto cleanup;
322 	}
323 
324 	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
325 		goto cleanup;
326 
327 	if (cg_write(parent[0], "memory.max", "200M"))
328 		goto cleanup;
329 
330 	if (cg_write(parent[0], "memory.swap.max", "0"))
331 		goto cleanup;
332 
333 	if (cg_create(parent[1]))
334 		goto cleanup;
335 
336 	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
337 		goto cleanup;
338 
339 	if (cg_create(parent[2]))
340 		goto cleanup;
341 
342 	for (i = 0; i < ARRAY_SIZE(children); i++) {
343 		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
344 		if (!children[i])
345 			goto cleanup;
346 
347 		if (cg_create(children[i]))
348 			goto cleanup;
349 
350 		if (i > 2)
351 			continue;
352 
353 		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
354 			      (void *)(long)fd);
355 	}
356 
357 	if (cg_write(parent[1],   attribute, "50M"))
358 		goto cleanup;
359 	if (cg_write(children[0], attribute, "75M"))
360 		goto cleanup;
361 	if (cg_write(children[1], attribute, "25M"))
362 		goto cleanup;
363 	if (cg_write(children[2], attribute, "0"))
364 		goto cleanup;
365 	if (cg_write(children[3], attribute, "500M"))
366 		goto cleanup;
367 
368 	attempts = 0;
369 	while (!values_close(cg_read_long(parent[1], "memory.current"),
370 			     MB(150), 3)) {
371 		if (attempts++ > 5)
372 			break;
373 		sleep(1);
374 	}
375 
376 	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
377 		goto cleanup;
378 
379 	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
380 		goto cleanup;
381 
382 	for (i = 0; i < ARRAY_SIZE(children); i++)
383 		c[i] = cg_read_long(children[i], "memory.current");
384 
385 	if (!values_close(c[0], MB(29), 10))
386 		goto cleanup;
387 
388 	if (!values_close(c[1], MB(21), 10))
389 		goto cleanup;
390 
391 	if (c[3] != 0)
392 		goto cleanup;
393 
394 	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
395 	if (min && !rc)
396 		goto cleanup;
397 	else if (!min && rc) {
398 		fprintf(stderr,
399 			"memory.low prevents from allocating anon memory\n");
400 		goto cleanup;
401 	}
402 
403 	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
404 		goto cleanup;
405 
406 	if (!reclaim_until(children[0], MB(10)))
407 		goto cleanup;
408 
409 	if (min) {
410 		ret = KSFT_PASS;
411 		goto cleanup;
412 	}
413 
414 	for (i = 0; i < ARRAY_SIZE(children); i++) {
415 		int no_low_events_index = 1;
416 		long low, oom;
417 
418 		oom = cg_read_key_long(children[i], "memory.events", "oom ");
419 		low = cg_read_key_long(children[i], "memory.events", "low ");
420 
421 		if (oom)
422 			goto cleanup;
423 		if (i <= no_low_events_index && low <= 0)
424 			goto cleanup;
425 		if (i > no_low_events_index && low)
426 			goto cleanup;
427 
428 	}
429 
430 	ret = KSFT_PASS;
431 
432 cleanup:
433 	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
434 		if (!children[i])
435 			continue;
436 
437 		cg_destroy(children[i]);
438 		free(children[i]);
439 	}
440 
441 	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
442 		if (!parent[i])
443 			continue;
444 
445 		cg_destroy(parent[i]);
446 		free(parent[i]);
447 	}
448 	close(fd);
449 	return ret;
450 }
451 
/* Run the protection test against memory.min. */
static int test_memcg_min(const char *root)
{
	const bool use_min = true;

	return test_memcg_protection(root, use_min);
}
456 
/* Run the protection test against memory.low. */
static int test_memcg_low(const char *root)
{
	const bool use_min = false;

	return test_memcg_protection(root, use_min);
}
461 
/*
 * Callback for cg_run(): requires that either memory.high or memory.max
 * of @cgroup equals 30M, then tries to fill 50M of pagecache and checks
 * that memory.current ends up close to 30M.
 *
 * @arg is unused. Returns 0 on success, -1 otherwise.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	long high_limit, max_limit, usage;
	int status = -1;
	int tmp_fd;

	high_limit = cg_read_long(cgroup, "memory.high");
	max_limit = cg_read_long(cgroup, "memory.max");
	if (!(high_limit == MB(30) || max_limit == MB(30)))
		return -1;

	tmp_fd = get_temp_fd();
	if (tmp_fd < 0)
		return -1;

	if (alloc_pagecache(tmp_fd, size) != 0)
		goto out;

	usage = cg_read_long(cgroup, "memory.current");
	if (values_close(usage, MB(30), 5))
		status = 0;

out:
	close(tmp_fd);
	return status;
}
492 
493 /*
494  * This test checks that memory.high limits the amount of
495  * memory which can be consumed by either anonymous memory
496  * or pagecache.
497  */
498 static int test_memcg_high(const char *root)
499 {
500 	int ret = KSFT_FAIL;
501 	char *memcg;
502 	long high;
503 
504 	memcg = cg_name(root, "memcg_test");
505 	if (!memcg)
506 		goto cleanup;
507 
508 	if (cg_create(memcg))
509 		goto cleanup;
510 
511 	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
512 		goto cleanup;
513 
514 	if (cg_write(memcg, "memory.swap.max", "0"))
515 		goto cleanup;
516 
517 	if (cg_write(memcg, "memory.high", "30M"))
518 		goto cleanup;
519 
520 	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
521 		goto cleanup;
522 
523 	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
524 		goto cleanup;
525 
526 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
527 		goto cleanup;
528 
529 	high = cg_read_key_long(memcg, "memory.events", "high ");
530 	if (high <= 0)
531 		goto cleanup;
532 
533 	ret = KSFT_PASS;
534 
535 cleanup:
536 	cg_destroy(memcg);
537 	free(memcg);
538 
539 	return ret;
540 }
541 
/*
 * Callback for cg_run_nowait(): map an anonymous private region whose
 * size is encoded in @arg, try to mlock() it, and unmap it again.
 *
 * @cgroup: unused.
 * @arg:    size of the mapping in bytes, cast to a pointer.
 *
 * Returns -1 if the mapping cannot be created, 0 otherwise. The result
 * of mlock() is intentionally not part of the return value: callers
 * only care that the locking attempt was made.
 */
static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	/*
	 * Anonymous mappings have no backing file; pass -1 as the file
	 * descriptor, as portable code is expected to do, instead of 0
	 * (which is a valid descriptor).
	 */
	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}
556 
557 /*
558  * This test checks that memory.high is able to throttle big single shot
559  * allocation i.e. large allocation within one kernel entry.
560  */
561 static int test_memcg_high_sync(const char *root)
562 {
563 	int ret = KSFT_FAIL, pid, fd = -1;
564 	char *memcg;
565 	long pre_high, pre_max;
566 	long post_high, post_max;
567 
568 	memcg = cg_name(root, "memcg_test");
569 	if (!memcg)
570 		goto cleanup;
571 
572 	if (cg_create(memcg))
573 		goto cleanup;
574 
575 	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
576 	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
577 	if (pre_high < 0 || pre_max < 0)
578 		goto cleanup;
579 
580 	if (cg_write(memcg, "memory.swap.max", "0"))
581 		goto cleanup;
582 
583 	if (cg_write(memcg, "memory.high", "30M"))
584 		goto cleanup;
585 
586 	if (cg_write(memcg, "memory.max", "140M"))
587 		goto cleanup;
588 
589 	fd = memcg_prepare_for_wait(memcg);
590 	if (fd < 0)
591 		goto cleanup;
592 
593 	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
594 	if (pid < 0)
595 		goto cleanup;
596 
597 	cg_wait_for(fd);
598 
599 	post_high = cg_read_key_long(memcg, "memory.events", "high ");
600 	post_max = cg_read_key_long(memcg, "memory.events", "max ");
601 	if (post_high < 0 || post_max < 0)
602 		goto cleanup;
603 
604 	if (pre_high == post_high || pre_max != post_max)
605 		goto cleanup;
606 
607 	ret = KSFT_PASS;
608 
609 cleanup:
610 	if (fd >= 0)
611 		close(fd);
612 	cg_destroy(memcg);
613 	free(memcg);
614 
615 	return ret;
616 }
617 
618 /*
619  * This test checks that memory.max limits the amount of
620  * memory which can be consumed by either anonymous memory
621  * or pagecache.
622  */
623 static int test_memcg_max(const char *root)
624 {
625 	int ret = KSFT_FAIL;
626 	char *memcg;
627 	long current, max;
628 
629 	memcg = cg_name(root, "memcg_test");
630 	if (!memcg)
631 		goto cleanup;
632 
633 	if (cg_create(memcg))
634 		goto cleanup;
635 
636 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
637 		goto cleanup;
638 
639 	if (cg_write(memcg, "memory.swap.max", "0"))
640 		goto cleanup;
641 
642 	if (cg_write(memcg, "memory.max", "30M"))
643 		goto cleanup;
644 
645 	/* Should be killed by OOM killer */
646 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
647 		goto cleanup;
648 
649 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
650 		goto cleanup;
651 
652 	current = cg_read_long(memcg, "memory.current");
653 	if (current > MB(30) || !current)
654 		goto cleanup;
655 
656 	max = cg_read_key_long(memcg, "memory.events", "max ");
657 	if (max <= 0)
658 		goto cleanup;
659 
660 	ret = KSFT_PASS;
661 
662 cleanup:
663 	cg_destroy(memcg);
664 	free(memcg);
665 
666 	return ret;
667 }
668 
/*
 * Reclaim from @memcg by writing to memory.reclaim until
 * memory.current reaches @goal.
 *
 * Up to five rounds are made. Each round reads memory.current and
 * stops if it is below @goal or close to it; otherwise the remaining
 * difference is written to memory.reclaim.
 *
 * Returns true only if at least one write to memory.reclaim succeeded
 * before the loop ended. In particular the function returns false if:
 *  - the usage is already at or below the goal on entry,
 *  - a write fails with anything other than -EAGAIN,
 *  - a write reported success although the usage is still not close to
 *    the goal (memory.reclaim returned 0 incorrectly),
 *  - every write failed with -EAGAIN.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 */
static bool reclaim_until(const char *memcg, long goal)
{
	bool wrote_ok = false;
	int attempts_left = 5;
	char amount[64];

	while (attempts_left > 0) {
		long usage = cg_read_long(memcg, "memory.current");
		int rc;

		if (usage < goal || values_close(usage, goal, 3))
			break;

		/* Did memory.reclaim return 0 incorrectly? */
		if (wrote_ok)
			return false;

		snprintf(amount, sizeof(amount), "%ld", usage - goal);
		rc = cg_write(memcg, "memory.reclaim", amount);
		if (rc == 0)
			wrote_ok = true;
		else if (rc != -EAGAIN)
			return false;

		attempts_left--;
	}

	return wrote_ok;
}
710 
711 /*
712  * This test checks that memory.reclaim reclaims the given
713  * amount of memory (from both anon and file, if possible).
714  */
715 static int test_memcg_reclaim(const char *root)
716 {
717 	int ret = KSFT_FAIL, fd, retries;
718 	char *memcg;
719 	long current, expected_usage;
720 
721 	memcg = cg_name(root, "memcg_test");
722 	if (!memcg)
723 		goto cleanup;
724 
725 	if (cg_create(memcg))
726 		goto cleanup;
727 
728 	current = cg_read_long(memcg, "memory.current");
729 	if (current != 0)
730 		goto cleanup;
731 
732 	fd = get_temp_fd();
733 	if (fd < 0)
734 		goto cleanup;
735 
736 	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
737 
738 	/*
739 	 * If swap is enabled, try to reclaim from both anon and file, else try
740 	 * to reclaim from file only.
741 	 */
742 	if (is_swap_enabled()) {
743 		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
744 		expected_usage = MB(100);
745 	} else
746 		expected_usage = MB(50);
747 
748 	/*
749 	 * Wait until current usage reaches the expected usage (or we run out of
750 	 * retries).
751 	 */
752 	retries = 5;
753 	while (!values_close(cg_read_long(memcg, "memory.current"),
754 			    expected_usage, 10)) {
755 		if (retries--) {
756 			sleep(1);
757 			continue;
758 		} else {
759 			fprintf(stderr,
760 				"failed to allocate %ld for memcg reclaim test\n",
761 				expected_usage);
762 			goto cleanup;
763 		}
764 	}
765 
766 	/*
767 	 * Reclaim until current reaches 30M, this makes sure we hit both anon
768 	 * and file if swap is enabled.
769 	 */
770 	if (!reclaim_until(memcg, MB(30)))
771 		goto cleanup;
772 
773 	ret = KSFT_PASS;
774 cleanup:
775 	cg_destroy(memcg);
776 	free(memcg);
777 	close(fd);
778 
779 	return ret;
780 }
781 
782 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
783 {
784 	long mem_max = (long)arg;
785 	size_t size = MB(50);
786 	char *buf, *ptr;
787 	long mem_current, swap_current;
788 	int ret = -1;
789 
790 	buf = malloc(size);
791 	if (buf == NULL) {
792 		fprintf(stderr, "malloc() failed\n");
793 		return -1;
794 	}
795 
796 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
797 		*ptr = 0;
798 
799 	mem_current = cg_read_long(cgroup, "memory.current");
800 	if (!mem_current || !values_close(mem_current, mem_max, 3))
801 		goto cleanup;
802 
803 	swap_current = cg_read_long(cgroup, "memory.swap.current");
804 	if (!swap_current ||
805 	    !values_close(mem_current + swap_current, size, 3))
806 		goto cleanup;
807 
808 	ret = 0;
809 cleanup:
810 	free(buf);
811 	return ret;
812 }
813 
814 /*
815  * This test checks that memory.swap.max limits the amount of
816  * anonymous memory which can be swapped out.
817  */
818 static int test_memcg_swap_max(const char *root)
819 {
820 	int ret = KSFT_FAIL;
821 	char *memcg;
822 	long max;
823 
824 	if (!is_swap_enabled())
825 		return KSFT_SKIP;
826 
827 	memcg = cg_name(root, "memcg_test");
828 	if (!memcg)
829 		goto cleanup;
830 
831 	if (cg_create(memcg))
832 		goto cleanup;
833 
834 	if (cg_read_long(memcg, "memory.swap.current")) {
835 		ret = KSFT_SKIP;
836 		goto cleanup;
837 	}
838 
839 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
840 		goto cleanup;
841 
842 	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
843 		goto cleanup;
844 
845 	if (cg_write(memcg, "memory.swap.max", "30M"))
846 		goto cleanup;
847 
848 	if (cg_write(memcg, "memory.max", "30M"))
849 		goto cleanup;
850 
851 	/* Should be killed by OOM killer */
852 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
853 		goto cleanup;
854 
855 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
856 		goto cleanup;
857 
858 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
859 		goto cleanup;
860 
861 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
862 		goto cleanup;
863 
864 	max = cg_read_key_long(memcg, "memory.events", "max ");
865 	if (max <= 0)
866 		goto cleanup;
867 
868 	ret = KSFT_PASS;
869 
870 cleanup:
871 	cg_destroy(memcg);
872 	free(memcg);
873 
874 	return ret;
875 }
876 
877 /*
878  * This test disables swapping and tries to allocate anonymous memory
879  * up to OOM. Then it checks for oom and oom_kill events in
880  * memory.events.
881  */
882 static int test_memcg_oom_events(const char *root)
883 {
884 	int ret = KSFT_FAIL;
885 	char *memcg;
886 
887 	memcg = cg_name(root, "memcg_test");
888 	if (!memcg)
889 		goto cleanup;
890 
891 	if (cg_create(memcg))
892 		goto cleanup;
893 
894 	if (cg_write(memcg, "memory.max", "30M"))
895 		goto cleanup;
896 
897 	if (cg_write(memcg, "memory.swap.max", "0"))
898 		goto cleanup;
899 
900 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
901 		goto cleanup;
902 
903 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
904 		goto cleanup;
905 
906 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
907 		goto cleanup;
908 
909 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
910 		goto cleanup;
911 
912 	ret = KSFT_PASS;
913 
914 cleanup:
915 	cg_destroy(memcg);
916 	free(memcg);
917 
918 	return ret;
919 }
920 
/*
 * Arguments handed to tcp_server().
 *
 * @port: TCP port the server binds to (host byte order).
 * @ctl:  pipe used for the start-up handshake; the server closes ctl[0]
 *        and writes a single int to ctl[1]: 0 once it is listening, or
 *        the errno value of a failed bind().
 */
struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};

/*
 * Callback for cg_run_nowait(): an IPv6 TCP server that accepts one
 * connection and keeps writing 1M buffers to it until a write fails.
 *
 * @cgroup: unused.
 * @arg:    pointer to a struct tcp_server_args.
 *
 * Returns 0 only when the write loop ends with ECONNRESET, -1 on every
 * other failure, including a failed accept().
 */
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
	int ready = 0;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		/* Report the bind error so the parent can pick another port */
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	/*
	 * Tell the parent we are listening. A dedicated variable carries
	 * the handshake value so that ret stays -1 until the server has
	 * really succeeded; previously a failed accept() returned 0.
	 */
	if (write(ctl_fd, &ready, sizeof(ready)) != sizeof(ready))
		goto cleanup;

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	for (;;) {
		/* Content is irrelevant, only the amount of data matters */
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}
982 
/*
 * Connect to the TCP server on localhost:@port and interleave reads
 * from the socket with checks that memory.current and the "sock " entry
 * of memory.stat of @cgroup are close to each other.
 *
 * Up to 0x10 reads are made; the first round in which both values are
 * close ends the loop successfully.
 *
 * Returns KSFT_PASS on success. Any other value is a failure: the
 * getaddrinfo() error code, the connect() result, or KSFT_FAIL.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;

	/*
	 * The port is unsigned: with a signed conversion, ports above 32767
	 * would be printed as negative numbers (and truncated by the
	 * 6-byte buffer).
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	/*
	 * ret is 0 here, which equals KSFT_PASS; make sure a socket()
	 * failure is not reported as success.
	 */
	ret = KSFT_FAIL;
	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		if (values_close(current, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1030 
1031 /*
1032  * This test checks socket memory accounting.
1033  * The test forks a TCP server listens on a random port between 1000
1034  * and 61000. Once it gets a client connection, it starts writing to
1035  * its socket.
1036  * The TCP client interleaves reads from the socket with check whether
1037  * memory.current and memory.stat.sock are similar.
1038  */
1039 static int test_memcg_sock(const char *root)
1040 {
1041 	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
1042 	unsigned short port;
1043 	char *memcg;
1044 
1045 	memcg = cg_name(root, "memcg_test");
1046 	if (!memcg)
1047 		goto cleanup;
1048 
1049 	if (cg_create(memcg))
1050 		goto cleanup;
1051 
1052 	while (bind_retries--) {
1053 		struct tcp_server_args args;
1054 
1055 		if (pipe(args.ctl))
1056 			goto cleanup;
1057 
1058 		port = args.port = 1000 + rand() % 60000;
1059 
1060 		pid = cg_run_nowait(memcg, tcp_server, &args);
1061 		if (pid < 0)
1062 			goto cleanup;
1063 
1064 		close(args.ctl[1]);
1065 		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
1066 			goto cleanup;
1067 		close(args.ctl[0]);
1068 
1069 		if (!err)
1070 			break;
1071 		if (err != EADDRINUSE)
1072 			goto cleanup;
1073 
1074 		waitpid(pid, NULL, 0);
1075 	}
1076 
1077 	if (err == EADDRINUSE) {
1078 		ret = KSFT_SKIP;
1079 		goto cleanup;
1080 	}
1081 
1082 	if (tcp_client(memcg, port) != KSFT_PASS)
1083 		goto cleanup;
1084 
1085 	waitpid(pid, &err, 0);
1086 	if (WEXITSTATUS(err))
1087 		goto cleanup;
1088 
1089 	if (cg_read_long(memcg, "memory.current") < 0)
1090 		goto cleanup;
1091 
1092 	if (cg_read_key_long(memcg, "memory.stat", "sock "))
1093 		goto cleanup;
1094 
1095 	ret = KSFT_PASS;
1096 
1097 cleanup:
1098 	cg_destroy(memcg);
1099 	free(memcg);
1100 
1101 	return ret;
1102 }
1103 
1104 /*
1105  * This test disables swapping and tries to allocate anonymous memory
1106  * up to OOM with memory.group.oom set. Then it checks that all
1107  * processes in the leaf were killed. It also checks that oom_events
1108  * were propagated to the parent level.
1109  */
1110 static int test_memcg_oom_group_leaf_events(const char *root)
1111 {
1112 	int ret = KSFT_FAIL;
1113 	char *parent, *child;
1114 	long parent_oom_events;
1115 
1116 	parent = cg_name(root, "memcg_test_0");
1117 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1118 
1119 	if (!parent || !child)
1120 		goto cleanup;
1121 
1122 	if (cg_create(parent))
1123 		goto cleanup;
1124 
1125 	if (cg_create(child))
1126 		goto cleanup;
1127 
1128 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1129 		goto cleanup;
1130 
1131 	if (cg_write(child, "memory.max", "50M"))
1132 		goto cleanup;
1133 
1134 	if (cg_write(child, "memory.swap.max", "0"))
1135 		goto cleanup;
1136 
1137 	if (cg_write(child, "memory.oom.group", "1"))
1138 		goto cleanup;
1139 
1140 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1141 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1142 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1143 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1144 		goto cleanup;
1145 
1146 	if (cg_test_proc_killed(child))
1147 		goto cleanup;
1148 
1149 	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1150 		goto cleanup;
1151 
1152 	parent_oom_events = cg_read_key_long(
1153 			parent, "memory.events", "oom_kill ");
1154 	/*
1155 	 * If memory_localevents is not enabled (the default), the parent should
1156 	 * count OOM events in its children groups. Otherwise, it should not
1157 	 * have observed any events.
1158 	 */
1159 	if (has_localevents && parent_oom_events != 0)
1160 		goto cleanup;
1161 	else if (!has_localevents && parent_oom_events <= 0)
1162 		goto cleanup;
1163 
1164 	ret = KSFT_PASS;
1165 
1166 cleanup:
1167 	if (child)
1168 		cg_destroy(child);
1169 	if (parent)
1170 		cg_destroy(parent);
1171 	free(child);
1172 	free(parent);
1173 
1174 	return ret;
1175 }
1176 
1177 /*
1178  * This test disables swapping and tries to allocate anonymous memory
1179  * up to OOM with memory.group.oom set. Then it checks that all
1180  * processes in the parent and leaf were killed.
1181  */
1182 static int test_memcg_oom_group_parent_events(const char *root)
1183 {
1184 	int ret = KSFT_FAIL;
1185 	char *parent, *child;
1186 
1187 	parent = cg_name(root, "memcg_test_0");
1188 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1189 
1190 	if (!parent || !child)
1191 		goto cleanup;
1192 
1193 	if (cg_create(parent))
1194 		goto cleanup;
1195 
1196 	if (cg_create(child))
1197 		goto cleanup;
1198 
1199 	if (cg_write(parent, "memory.max", "80M"))
1200 		goto cleanup;
1201 
1202 	if (cg_write(parent, "memory.swap.max", "0"))
1203 		goto cleanup;
1204 
1205 	if (cg_write(parent, "memory.oom.group", "1"))
1206 		goto cleanup;
1207 
1208 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1209 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1210 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1211 
1212 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1213 		goto cleanup;
1214 
1215 	if (cg_test_proc_killed(child))
1216 		goto cleanup;
1217 	if (cg_test_proc_killed(parent))
1218 		goto cleanup;
1219 
1220 	ret = KSFT_PASS;
1221 
1222 cleanup:
1223 	if (child)
1224 		cg_destroy(child);
1225 	if (parent)
1226 		cg_destroy(parent);
1227 	free(child);
1228 	free(parent);
1229 
1230 	return ret;
1231 }
1232 
1233 /*
1234  * This test disables swapping and tries to allocate anonymous memory
1235  * up to OOM with memory.group.oom set. Then it checks that all
1236  * processes were killed except those set with OOM_SCORE_ADJ_MIN
1237  */
1238 static int test_memcg_oom_group_score_events(const char *root)
1239 {
1240 	int ret = KSFT_FAIL;
1241 	char *memcg;
1242 	int safe_pid;
1243 
1244 	memcg = cg_name(root, "memcg_test_0");
1245 
1246 	if (!memcg)
1247 		goto cleanup;
1248 
1249 	if (cg_create(memcg))
1250 		goto cleanup;
1251 
1252 	if (cg_write(memcg, "memory.max", "50M"))
1253 		goto cleanup;
1254 
1255 	if (cg_write(memcg, "memory.swap.max", "0"))
1256 		goto cleanup;
1257 
1258 	if (cg_write(memcg, "memory.oom.group", "1"))
1259 		goto cleanup;
1260 
1261 	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1262 	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1263 		goto cleanup;
1264 
1265 	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1266 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1267 		goto cleanup;
1268 
1269 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1270 		goto cleanup;
1271 
1272 	if (kill(safe_pid, SIGKILL))
1273 		goto cleanup;
1274 
1275 	ret = KSFT_PASS;
1276 
1277 cleanup:
1278 	if (memcg)
1279 		cg_destroy(memcg);
1280 	free(memcg);
1281 
1282 	return ret;
1283 }
1284 
1285 #define T(x) { x, #x }
1286 struct memcg_test {
1287 	int (*fn)(const char *root);
1288 	const char *name;
1289 } tests[] = {
1290 	T(test_memcg_subtree_control),
1291 	T(test_memcg_current),
1292 	T(test_memcg_min),
1293 	T(test_memcg_low),
1294 	T(test_memcg_high),
1295 	T(test_memcg_high_sync),
1296 	T(test_memcg_max),
1297 	T(test_memcg_reclaim),
1298 	T(test_memcg_oom_events),
1299 	T(test_memcg_swap_max),
1300 	T(test_memcg_sock),
1301 	T(test_memcg_oom_group_leaf_events),
1302 	T(test_memcg_oom_group_parent_events),
1303 	T(test_memcg_oom_group_score_events),
1304 };
1305 #undef T
1306 
1307 int main(int argc, char **argv)
1308 {
1309 	char root[PATH_MAX];
1310 	int i, proc_status, ret = EXIT_SUCCESS;
1311 
1312 	if (cg_find_unified_root(root, sizeof(root)))
1313 		ksft_exit_skip("cgroup v2 isn't mounted\n");
1314 
1315 	/*
1316 	 * Check that memory controller is available:
1317 	 * memory is listed in cgroup.controllers
1318 	 */
1319 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1320 		ksft_exit_skip("memory controller isn't available\n");
1321 
1322 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1323 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
1324 			ksft_exit_skip("Failed to set memory controller\n");
1325 
1326 	proc_status = proc_mount_contains("memory_recursiveprot");
1327 	if (proc_status < 0)
1328 		ksft_exit_skip("Failed to query cgroup mount option\n");
1329 	has_recursiveprot = proc_status;
1330 
1331 	proc_status = proc_mount_contains("memory_localevents");
1332 	if (proc_status < 0)
1333 		ksft_exit_skip("Failed to query cgroup mount option\n");
1334 	has_localevents = proc_status;
1335 
1336 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
1337 		switch (tests[i].fn(root)) {
1338 		case KSFT_PASS:
1339 			ksft_test_result_pass("%s\n", tests[i].name);
1340 			break;
1341 		case KSFT_SKIP:
1342 			ksft_test_result_skip("%s\n", tests[i].name);
1343 			break;
1344 		default:
1345 			ret = EXIT_FAILURE;
1346 			ksft_test_result_fail("%s\n", tests[i].name);
1347 			break;
1348 		}
1349 	}
1350 
1351 	return ret;
1352 }
1353