/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE

#include <linux/limits.h>
#include <linux/oom.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
#include <sys/mman.h>

#include "../kselftest.h"
#include "cgroup_util.h"

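/* Detected from the cgroup2 mount options; set up in main() */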
static bool has_localevents;
static bool has_recursiveprot;

/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling the memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}

static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	char *buf, *ptr;
	long anon, current;
	int ret = -1;

	buf = malloc(size);
	if (!buf)
		return -1;

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

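	/* memory.current should be close to the 50M allocation; allow a few percent of slack */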
	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test creates a memory cgroup, allocates some anonymous memory
 * and some pagecache, and checks memory.current and some memory.stat
 * values.
 */
static int test_memcg_current(const char *root)
{
	int ret = KSFT_FAIL;
	long current;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

static int alloc_pagecache_50M(const char *cgroup, void *arg)
{
	int fd = (long)arg;

	return alloc_pagecache(fd, MB(50));
}

static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

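	/*
	 * Linger until the parent exits: once it does, this child gets
	 * reparented and getppid() changes.
	 */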
	while (getppid() == ppid)
		sleep(1);

	return 0;
}

static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int ppid = getppid();
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (!buf)
		return -1;

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	while (getppid() == ppid)
		sleep(1);

	free(buf);
	return 0;
}

/*
 * Wait until processes are killed asynchronously by the OOM killer.
 * If we exceed a timeout, fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int limit;

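	/* Poll cgroup.procs for up to 10 * 100ms = 1 second */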
	for (limit = 10; limit > 0; limit--) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
	}
	return -1;
}

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 50M,  memory.max = 200M
 * A/B     memory.min = 50M,  memory.current = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * Usages are pagecache, but the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates significant
 * memory pressure in it.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 33M
 * A/B/D  memory.current ~= 17M
 * A/B/F  memory.current ~= 0
 *
 * After that it tries to allocate more memory than is
 * still unprotected in A, and checks that memory.min
 * protects the pagecache even in this case.
 */
static int test_memcg_min(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	long c[4];
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], "memory.min")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

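		/* Leave children[3] (F in the hierarchy above) empty */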
		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[0], "memory.min", "50M"))
		goto cleanup;
	if (cg_write(parent[1], "memory.min", "50M"))
		goto cleanup;
	if (cg_write(children[0], "memory.min", "75M"))
		goto cleanup;
	if (cg_write(children[1], "memory.min", "25M"))
		goto cleanup;
	if (cg_write(children[2], "memory.min", "0"))
		goto cleanup;
	if (cg_write(children[3], "memory.min", "500M"))
		goto cleanup;

	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

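	/*
	 * The 200M limit on A minus A/B's ~150M of pagecache leaves about
	 * 50M of headroom, so a 148M anon allocation in A/G has to force
	 * reclaim in A/B down towards its protected minimum.
	 */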
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(33), 10))
		goto cleanup;

	if (!values_close(c[1], MB(17), 10))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

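	/*
	 * 170M of anon memory doesn't fit under the 200M limit next to the
	 * 50M protected by memory.min in A/B, so this allocation is expected
	 * to fail rather than break the protection.
	 */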
	if (!cg_run(parent[2], alloc_anon, (void *)MB(170)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}

/*
 * First, this test creates the following hierarchy:
 * A       memory.low = 50M,  memory.max = 200M
 * A/B     memory.low = 50M,  memory.current = 50M
 * A/B/C   memory.low = 75M,  memory.current = 50M
 * A/B/D   memory.low = 25M,  memory.current = 50M
 * A/B/E   memory.low = 0,    memory.current = 50M
 * A/B/F   memory.low = 500M, memory.current = 0
 *
 * Usages are pagecache.
 * Then it creates A/G and creates significant
 * memory pressure in it.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 33M
 * A/B/D  memory.current ~= 17M
 * A/B/F  memory.current ~= 0
 *
 * After that it tries to allocate more memory than is
 * still unprotected in A, and checks low and oom events
 * in memory.events.
 */
static int test_memcg_low(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	long low, oom;
	long c[4];
	int i;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], "memory.low"))
		goto cleanup;

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		if (i > 2)
			continue;

		if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd))
			goto cleanup;
	}

	if (cg_write(parent[0], "memory.low", "50M"))
		goto cleanup;
	if (cg_write(parent[1], "memory.low", "50M"))
		goto cleanup;
	if (cg_write(children[0], "memory.low", "75M"))
		goto cleanup;
	if (cg_write(children[1], "memory.low", "25M"))
		goto cleanup;
	if (cg_write(children[2], "memory.low", "0"))
		goto cleanup;
	if (cg_write(children[3], "memory.low", "500M"))
		goto cleanup;

	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(33), 10))
		goto cleanup;

	if (!values_close(c[1], MB(17), 10))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

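	/*
	 * 166M of anon memory doesn't fit under the 200M limit together with
	 * B's 50M protected by memory.low, so reclaim has to break the low
	 * protection; unlike memory.min, the allocation itself must still
	 * succeed.
	 */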
	if (cg_run(parent[2], alloc_anon, (void *)MB(166))) {
		fprintf(stderr,
			"memory.low prevented the allocation of anon memory\n");
		goto cleanup;
	}

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int no_low_events_index = 1;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;
	}

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}

static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, high, max;
	int fd;

	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (!values_close(current, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

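	/*
	 * memory.high is not a hard limit: allocating a bit more than the
	 * 30M boundary must still succeed, with the excess reclaimed.
	 */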
	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	/* Anonymous mappings take no fd; pass -1 as POSIX prescribes */
	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}

/*
 * This test checks that memory.high is able to throttle a big single-shot
 * allocation, i.e. a large allocation made within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

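	/*
	 * memcg_prepare_for_wait() arms a watch on the cgroup's memory.events
	 * (see cgroup_util), so cg_wait_for() below returns once the cgroup
	 * records a new event.
	 */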
	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL, fd = -1, retries;
	char *memcg;
	long current, expected_usage, to_reclaim;
	char buf[64];

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else {
		expected_usage = MB(50);
	}

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M; this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	retries = 5;
	while (true) {
		int err;

		current = cg_read_long(memcg, "memory.current");
		to_reclaim = current - MB(30);

		/*
		 * We only keep looping if we get EAGAIN, which means we could
		 * not reclaim the full amount.
		 */
		if (to_reclaim <= 0)
			goto cleanup;

		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
		err = cg_write(memcg, "memory.reclaim", buf);
		if (!err) {
			/*
			 * If writing succeeds, then the written amount should have been
			 * fully reclaimed (and maybe more).
			 */
			current = cg_read_long(memcg, "memory.current");
			if (!values_close(current, MB(30), 3) && current > MB(30))
				goto cleanup;
			break;
		}

		/* The kernel could not reclaim the full amount, try again. */
		if (err == -EAGAIN && retries--)
			continue;

		/* We got an unexpected error or ran out of retries. */
		goto cleanup;
	}

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	if (fd >= 0)
		close(fd);

	return ret;
}

static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	long mem_max = (long)arg;
	size_t size = MB(50);
	char *buf, *ptr;
	long mem_current, swap_current;
	int ret = -1;

	buf = malloc(size);
	if (!buf)
		return -1;

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	mem_current = cg_read_long(cgroup, "memory.current");
	if (!mem_current || !values_close(mem_current, mem_max, 3))
		goto cleanup;

	swap_current = cg_read_long(cgroup, "memory.swap.current");
	if (!swap_current ||
	    !values_close(mem_current + swap_current, size, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out.
 */
static int test_memcg_swap_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max;

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};

static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

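	/* Stream data to the client; a connection reset marks a clean end */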
	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}

static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;

	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		if (values_close(current, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}

/*
 * This test checks socket memory accounting.
 * The test forks a TCP server that listens on a random port between 1000
 * and 61000. Once it gets a client connection, it starts writing to
 * its socket.
 * The TCP client interleaves reads from the socket with checks that
 * memory.current and memory.stat.sock stay close to each other.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

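	/* Every bind attempt hit a busy port: skip instead of failing */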
	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

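	/* With both ends gone, all socket memory should be uncharged */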
	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom_kill
 * events were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the parent and leaf were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN.
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

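	/* A task with OOM_SCORE_ADJ_MIN should survive the group OOM kill */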
	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}

#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root)))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}