1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #define _GNU_SOURCE 3 4 #include <linux/limits.h> 5 #include <linux/oom.h> 6 #include <fcntl.h> 7 #include <stdio.h> 8 #include <stdlib.h> 9 #include <string.h> 10 #include <sys/stat.h> 11 #include <sys/types.h> 12 #include <unistd.h> 13 #include <sys/socket.h> 14 #include <sys/wait.h> 15 #include <arpa/inet.h> 16 #include <netinet/in.h> 17 #include <netdb.h> 18 #include <errno.h> 19 #include <sys/mman.h> 20 21 #include "../kselftest.h" 22 #include "cgroup_util.h" 23 24 static bool has_localevents; 25 static bool has_recursiveprot; 26 27 /* 28 * This test creates two nested cgroups with and without enabling 29 * the memory controller. 30 */ 31 static int test_memcg_subtree_control(const char *root) 32 { 33 char *parent, *child, *parent2 = NULL, *child2 = NULL; 34 int ret = KSFT_FAIL; 35 char buf[PAGE_SIZE]; 36 37 /* Create two nested cgroups with the memory controller enabled */ 38 parent = cg_name(root, "memcg_test_0"); 39 child = cg_name(root, "memcg_test_0/memcg_test_1"); 40 if (!parent || !child) 41 goto cleanup_free; 42 43 if (cg_create(parent)) 44 goto cleanup_free; 45 46 if (cg_write(parent, "cgroup.subtree_control", "+memory")) 47 goto cleanup_parent; 48 49 if (cg_create(child)) 50 goto cleanup_parent; 51 52 if (cg_read_strstr(child, "cgroup.controllers", "memory")) 53 goto cleanup_child; 54 55 /* Create two nested cgroups without enabling memory controller */ 56 parent2 = cg_name(root, "memcg_test_1"); 57 child2 = cg_name(root, "memcg_test_1/memcg_test_1"); 58 if (!parent2 || !child2) 59 goto cleanup_free2; 60 61 if (cg_create(parent2)) 62 goto cleanup_free2; 63 64 if (cg_create(child2)) 65 goto cleanup_parent2; 66 67 if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf))) 68 goto cleanup_all; 69 70 if (!cg_read_strstr(child2, "cgroup.controllers", "memory")) 71 goto cleanup_all; 72 73 ret = KSFT_PASS; 74 75 cleanup_all: 76 cg_destroy(child2); 77 cleanup_parent2: 78 cg_destroy(parent2); 79 cleanup_free2: 80 free(parent2); 81 free(child2); 82 cleanup_child: 83 cg_destroy(child); 84 cleanup_parent: 85 cg_destroy(parent); 86 cleanup_free: 87 free(parent); 88 free(child); 89 90 return ret; 91 } 92 93 static int alloc_anon_50M_check(const char *cgroup, void *arg) 94 { 95 size_t size = MB(50); 96 char *buf, *ptr; 97 long anon, current; 98 int ret = -1; 99 100 buf = malloc(size); 101 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) 102 *ptr = 0; 103 104 current = cg_read_long(cgroup, "memory.current"); 105 if (current < size) 106 goto cleanup; 107 108 if (!values_close(size, current, 3)) 109 goto cleanup; 110 111 anon = cg_read_key_long(cgroup, "memory.stat", "anon "); 112 if (anon < 0) 113 goto cleanup; 114 115 if (!values_close(anon, current, 3)) 116 goto cleanup; 117 118 ret = 0; 119 cleanup: 120 free(buf); 121 return ret; 122 } 123 124 static int alloc_pagecache_50M_check(const char *cgroup, void *arg) 125 { 126 size_t size = MB(50); 127 int ret = -1; 128 long current, file; 129 int fd; 130 131 fd = get_temp_fd(); 132 if (fd < 0) 133 return -1; 134 135 if (alloc_pagecache(fd, size)) 136 goto cleanup; 137 138 current = cg_read_long(cgroup, "memory.current"); 139 if (current < size) 140 goto cleanup; 141 142 file = cg_read_key_long(cgroup, "memory.stat", "file "); 143 if (file < 0) 144 goto cleanup; 145 146 if (!values_close(file, current, 10)) 147 goto cleanup; 148 149 ret = 0; 150 151 cleanup: 152 close(fd); 153 return ret; 154 } 155 156 /* 157 * This test create a memory cgroup, allocates 158 * some anonymous memory and some pagecache 159 * and check memory.current and some memory.stat values. 160 */ 161 static int test_memcg_current(const char *root) 162 { 163 int ret = KSFT_FAIL; 164 long current; 165 char *memcg; 166 167 memcg = cg_name(root, "memcg_test"); 168 if (!memcg) 169 goto cleanup; 170 171 if (cg_create(memcg)) 172 goto cleanup; 173 174 current = cg_read_long(memcg, "memory.current"); 175 if (current != 0) 176 goto cleanup; 177 178 if (cg_run(memcg, alloc_anon_50M_check, NULL)) 179 goto cleanup; 180 181 if (cg_run(memcg, alloc_pagecache_50M_check, NULL)) 182 goto cleanup; 183 184 ret = KSFT_PASS; 185 186 cleanup: 187 cg_destroy(memcg); 188 free(memcg); 189 190 return ret; 191 } 192 193 static int alloc_pagecache_50M(const char *cgroup, void *arg) 194 { 195 int fd = (long)arg; 196 197 return alloc_pagecache(fd, MB(50)); 198 } 199 200 static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg) 201 { 202 int fd = (long)arg; 203 int ppid = getppid(); 204 205 if (alloc_pagecache(fd, MB(50))) 206 return -1; 207 208 while (getppid() == ppid) 209 sleep(1); 210 211 return 0; 212 } 213 214 static int alloc_anon_noexit(const char *cgroup, void *arg) 215 { 216 int ppid = getppid(); 217 size_t size = (unsigned long)arg; 218 char *buf, *ptr; 219 220 buf = malloc(size); 221 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) 222 *ptr = 0; 223 224 while (getppid() == ppid) 225 sleep(1); 226 227 free(buf); 228 return 0; 229 } 230 231 /* 232 * Wait until processes are killed asynchronously by the OOM killer 233 * If we exceed a timeout, fail. 234 */ 235 static int cg_test_proc_killed(const char *cgroup) 236 { 237 int limit; 238 239 for (limit = 10; limit > 0; limit--) { 240 if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0) 241 return 0; 242 243 usleep(100000); 244 } 245 return -1; 246 } 247 248 /* 249 * First, this test creates the following hierarchy: 250 * A memory.min = 50M, memory.max = 200M 251 * A/B memory.min = 50M, memory.current = 50M 252 * A/B/C memory.min = 75M, memory.current = 50M 253 * A/B/D memory.min = 25M, memory.current = 50M 254 * A/B/E memory.min = 0, memory.current = 50M 255 * A/B/F memory.min = 500M, memory.current = 0 256 * 257 * Usages are pagecache, but the test keeps a running 258 * process in every leaf cgroup. 259 * Then it creates A/G and creates a significant 260 * memory pressure in it. 261 * 262 * A/B memory.current ~= 50M 263 * A/B/C memory.current ~= 33M 264 * A/B/D memory.current ~= 17M 265 * A/B/F memory.current ~= 0 266 * 267 * After that it tries to allocate more than there is 268 * unprotected memory in A available, and checks 269 * checks that memory.min protects pagecache even 270 * in this case. 271 */ 272 static int test_memcg_min(const char *root) 273 { 274 int ret = KSFT_FAIL; 275 char *parent[3] = {NULL}; 276 char *children[4] = {NULL}; 277 long c[4]; 278 int i, attempts; 279 int fd; 280 281 fd = get_temp_fd(); 282 if (fd < 0) 283 goto cleanup; 284 285 parent[0] = cg_name(root, "memcg_test_0"); 286 if (!parent[0]) 287 goto cleanup; 288 289 parent[1] = cg_name(parent[0], "memcg_test_1"); 290 if (!parent[1]) 291 goto cleanup; 292 293 parent[2] = cg_name(parent[0], "memcg_test_2"); 294 if (!parent[2]) 295 goto cleanup; 296 297 if (cg_create(parent[0])) 298 goto cleanup; 299 300 if (cg_read_long(parent[0], "memory.min")) { 301 ret = KSFT_SKIP; 302 goto cleanup; 303 } 304 305 if (cg_write(parent[0], "cgroup.subtree_control", "+memory")) 306 goto cleanup; 307 308 if (cg_write(parent[0], "memory.max", "200M")) 309 goto cleanup; 310 311 if (cg_write(parent[0], "memory.swap.max", "0")) 312 goto cleanup; 313 314 if (cg_create(parent[1])) 315 goto cleanup; 316 317 if (cg_write(parent[1], "cgroup.subtree_control", "+memory")) 318 goto cleanup; 319 320 if (cg_create(parent[2])) 321 goto cleanup; 322 323 for (i = 0; i < ARRAY_SIZE(children); i++) { 324 children[i] = cg_name_indexed(parent[1], "child_memcg", i); 325 if (!children[i]) 326 goto cleanup; 327 328 if (cg_create(children[i])) 329 goto cleanup; 330 331 if (i > 2) 332 continue; 333 334 cg_run_nowait(children[i], alloc_pagecache_50M_noexit, 335 (void *)(long)fd); 336 } 337 338 if (cg_write(parent[0], "memory.min", "50M")) 339 goto cleanup; 340 if (cg_write(parent[1], "memory.min", "50M")) 341 goto cleanup; 342 if (cg_write(children[0], "memory.min", "75M")) 343 goto cleanup; 344 if (cg_write(children[1], "memory.min", "25M")) 345 goto cleanup; 346 if (cg_write(children[2], "memory.min", "0")) 347 goto cleanup; 348 if (cg_write(children[3], "memory.min", "500M")) 349 goto cleanup; 350 351 attempts = 0; 352 while (!values_close(cg_read_long(parent[1], "memory.current"), 353 MB(150), 3)) { 354 if (attempts++ > 5) 355 break; 356 sleep(1); 357 } 358 359 if (cg_run(parent[2], alloc_anon, (void *)MB(148))) 360 goto cleanup; 361 362 if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) 363 goto cleanup; 364 365 for (i = 0; i < ARRAY_SIZE(children); i++) 366 c[i] = cg_read_long(children[i], "memory.current"); 367 368 if (!values_close(c[0], MB(33), 10)) 369 goto cleanup; 370 371 if (!values_close(c[1], MB(17), 10)) 372 goto cleanup; 373 374 if (c[3] != 0) 375 goto cleanup; 376 377 if (!cg_run(parent[2], alloc_anon, (void *)MB(170))) 378 goto cleanup; 379 380 if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) 381 goto cleanup; 382 383 ret = KSFT_PASS; 384 385 cleanup: 386 for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) { 387 if (!children[i]) 388 continue; 389 390 cg_destroy(children[i]); 391 free(children[i]); 392 } 393 394 for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) { 395 if (!parent[i]) 396 continue; 397 398 cg_destroy(parent[i]); 399 free(parent[i]); 400 } 401 close(fd); 402 return ret; 403 } 404 405 /* 406 * First, this test creates the following hierarchy: 407 * A memory.low = 50M, memory.max = 200M 408 * A/B memory.low = 50M, memory.current = 50M 409 * A/B/C memory.low = 75M, memory.current = 50M 410 * A/B/D memory.low = 25M, memory.current = 50M 411 * A/B/E memory.low = 0, memory.current = 50M 412 * A/B/F memory.low = 500M, memory.current = 0 413 * 414 * Usages are pagecache. 415 * Then it creates A/G an creates a significant 416 * memory pressure in it. 417 * 418 * Then it checks actual memory usages and expects that: 419 * A/B memory.current ~= 50M 420 * A/B/ memory.current ~= 33M 421 * A/B/D memory.current ~= 17M 422 * A/B/F memory.current ~= 0 423 * 424 * After that it tries to allocate more than there is 425 * unprotected memory in A available, 426 * and checks low and oom events in memory.events. 427 */ 428 static int test_memcg_low(const char *root) 429 { 430 int ret = KSFT_FAIL; 431 char *parent[3] = {NULL}; 432 char *children[4] = {NULL}; 433 long low, oom; 434 long c[4]; 435 int i; 436 int fd; 437 438 fd = get_temp_fd(); 439 if (fd < 0) 440 goto cleanup; 441 442 parent[0] = cg_name(root, "memcg_test_0"); 443 if (!parent[0]) 444 goto cleanup; 445 446 parent[1] = cg_name(parent[0], "memcg_test_1"); 447 if (!parent[1]) 448 goto cleanup; 449 450 parent[2] = cg_name(parent[0], "memcg_test_2"); 451 if (!parent[2]) 452 goto cleanup; 453 454 if (cg_create(parent[0])) 455 goto cleanup; 456 457 if (cg_read_long(parent[0], "memory.low")) 458 goto cleanup; 459 460 if (cg_write(parent[0], "cgroup.subtree_control", "+memory")) 461 goto cleanup; 462 463 if (cg_write(parent[0], "memory.max", "200M")) 464 goto cleanup; 465 466 if (cg_write(parent[0], "memory.swap.max", "0")) 467 goto cleanup; 468 469 if (cg_create(parent[1])) 470 goto cleanup; 471 472 if (cg_write(parent[1], "cgroup.subtree_control", "+memory")) 473 goto cleanup; 474 475 if (cg_create(parent[2])) 476 goto cleanup; 477 478 for (i = 0; i < ARRAY_SIZE(children); i++) { 479 children[i] = cg_name_indexed(parent[1], "child_memcg", i); 480 if (!children[i]) 481 goto cleanup; 482 483 if (cg_create(children[i])) 484 goto cleanup; 485 486 if (i > 2) 487 continue; 488 489 if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd)) 490 goto cleanup; 491 } 492 493 if (cg_write(parent[0], "memory.low", "50M")) 494 goto cleanup; 495 if (cg_write(parent[1], "memory.low", "50M")) 496 goto cleanup; 497 if (cg_write(children[0], "memory.low", "75M")) 498 goto cleanup; 499 if (cg_write(children[1], "memory.low", "25M")) 500 goto cleanup; 501 if (cg_write(children[2], "memory.low", "0")) 502 goto cleanup; 503 if (cg_write(children[3], "memory.low", "500M")) 504 goto cleanup; 505 506 if (cg_run(parent[2], alloc_anon, (void *)MB(148))) 507 goto cleanup; 508 509 if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) 510 goto cleanup; 511 512 for (i = 0; i < ARRAY_SIZE(children); i++) 513 c[i] = cg_read_long(children[i], "memory.current"); 514 515 if (!values_close(c[0], MB(33), 10)) 516 goto cleanup; 517 518 if (!values_close(c[1], MB(17), 10)) 519 goto cleanup; 520 521 if (c[3] != 0) 522 goto cleanup; 523 524 if (cg_run(parent[2], alloc_anon, (void *)MB(166))) { 525 fprintf(stderr, 526 "memory.low prevents from allocating anon memory\n"); 527 goto cleanup; 528 } 529 530 for (i = 0; i < ARRAY_SIZE(children); i++) { 531 int no_low_events_index = has_recursiveprot ? 2 : 1; 532 533 oom = cg_read_key_long(children[i], "memory.events", "oom "); 534 low = cg_read_key_long(children[i], "memory.events", "low "); 535 536 if (oom) 537 goto cleanup; 538 if (i <= no_low_events_index && low <= 0) 539 goto cleanup; 540 if (i > no_low_events_index && low) 541 goto cleanup; 542 543 } 544 545 ret = KSFT_PASS; 546 547 cleanup: 548 for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) { 549 if (!children[i]) 550 continue; 551 552 cg_destroy(children[i]); 553 free(children[i]); 554 } 555 556 for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) { 557 if (!parent[i]) 558 continue; 559 560 cg_destroy(parent[i]); 561 free(parent[i]); 562 } 563 close(fd); 564 return ret; 565 } 566 567 static int alloc_pagecache_max_30M(const char *cgroup, void *arg) 568 { 569 size_t size = MB(50); 570 int ret = -1; 571 long current; 572 int fd; 573 574 fd = get_temp_fd(); 575 if (fd < 0) 576 return -1; 577 578 if (alloc_pagecache(fd, size)) 579 goto cleanup; 580 581 current = cg_read_long(cgroup, "memory.current"); 582 if (current <= MB(29) || current > MB(30)) 583 goto cleanup; 584 585 ret = 0; 586 587 cleanup: 588 close(fd); 589 return ret; 590 591 } 592 593 /* 594 * This test checks that memory.high limits the amount of 595 * memory which can be consumed by either anonymous memory 596 * or pagecache. 597 */ 598 static int test_memcg_high(const char *root) 599 { 600 int ret = KSFT_FAIL; 601 char *memcg; 602 long high; 603 604 memcg = cg_name(root, "memcg_test"); 605 if (!memcg) 606 goto cleanup; 607 608 if (cg_create(memcg)) 609 goto cleanup; 610 611 if (cg_read_strcmp(memcg, "memory.high", "max\n")) 612 goto cleanup; 613 614 if (cg_write(memcg, "memory.swap.max", "0")) 615 goto cleanup; 616 617 if (cg_write(memcg, "memory.high", "30M")) 618 goto cleanup; 619 620 if (cg_run(memcg, alloc_anon, (void *)MB(31))) 621 goto cleanup; 622 623 if (!cg_run(memcg, alloc_pagecache_50M_check, NULL)) 624 goto cleanup; 625 626 if (cg_run(memcg, alloc_pagecache_max_30M, NULL)) 627 goto cleanup; 628 629 high = cg_read_key_long(memcg, "memory.events", "high "); 630 if (high <= 0) 631 goto cleanup; 632 633 ret = KSFT_PASS; 634 635 cleanup: 636 cg_destroy(memcg); 637 free(memcg); 638 639 return ret; 640 } 641 642 static int alloc_anon_mlock(const char *cgroup, void *arg) 643 { 644 size_t size = (size_t)arg; 645 void *buf; 646 647 buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 648 0, 0); 649 if (buf == MAP_FAILED) 650 return -1; 651 652 mlock(buf, size); 653 munmap(buf, size); 654 return 0; 655 } 656 657 /* 658 * This test checks that memory.high is able to throttle big single shot 659 * allocation i.e. large allocation within one kernel entry. 660 */ 661 static int test_memcg_high_sync(const char *root) 662 { 663 int ret = KSFT_FAIL, pid, fd = -1; 664 char *memcg; 665 long pre_high, pre_max; 666 long post_high, post_max; 667 668 memcg = cg_name(root, "memcg_test"); 669 if (!memcg) 670 goto cleanup; 671 672 if (cg_create(memcg)) 673 goto cleanup; 674 675 pre_high = cg_read_key_long(memcg, "memory.events", "high "); 676 pre_max = cg_read_key_long(memcg, "memory.events", "max "); 677 if (pre_high < 0 || pre_max < 0) 678 goto cleanup; 679 680 if (cg_write(memcg, "memory.swap.max", "0")) 681 goto cleanup; 682 683 if (cg_write(memcg, "memory.high", "30M")) 684 goto cleanup; 685 686 if (cg_write(memcg, "memory.max", "140M")) 687 goto cleanup; 688 689 fd = memcg_prepare_for_wait(memcg); 690 if (fd < 0) 691 goto cleanup; 692 693 pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200)); 694 if (pid < 0) 695 goto cleanup; 696 697 cg_wait_for(fd); 698 699 post_high = cg_read_key_long(memcg, "memory.events", "high "); 700 post_max = cg_read_key_long(memcg, "memory.events", "max "); 701 if (post_high < 0 || post_max < 0) 702 goto cleanup; 703 704 if (pre_high == post_high || pre_max != post_max) 705 goto cleanup; 706 707 ret = KSFT_PASS; 708 709 cleanup: 710 if (fd >= 0) 711 close(fd); 712 cg_destroy(memcg); 713 free(memcg); 714 715 return ret; 716 } 717 718 /* 719 * This test checks that memory.max limits the amount of 720 * memory which can be consumed by either anonymous memory 721 * or pagecache. 722 */ 723 static int test_memcg_max(const char *root) 724 { 725 int ret = KSFT_FAIL; 726 char *memcg; 727 long current, max; 728 729 memcg = cg_name(root, "memcg_test"); 730 if (!memcg) 731 goto cleanup; 732 733 if (cg_create(memcg)) 734 goto cleanup; 735 736 if (cg_read_strcmp(memcg, "memory.max", "max\n")) 737 goto cleanup; 738 739 if (cg_write(memcg, "memory.swap.max", "0")) 740 goto cleanup; 741 742 if (cg_write(memcg, "memory.max", "30M")) 743 goto cleanup; 744 745 /* Should be killed by OOM killer */ 746 if (!cg_run(memcg, alloc_anon, (void *)MB(100))) 747 goto cleanup; 748 749 if (cg_run(memcg, alloc_pagecache_max_30M, NULL)) 750 goto cleanup; 751 752 current = cg_read_long(memcg, "memory.current"); 753 if (current > MB(30) || !current) 754 goto cleanup; 755 756 max = cg_read_key_long(memcg, "memory.events", "max "); 757 if (max <= 0) 758 goto cleanup; 759 760 ret = KSFT_PASS; 761 762 cleanup: 763 cg_destroy(memcg); 764 free(memcg); 765 766 return ret; 767 } 768 769 /* 770 * This test checks that memory.reclaim reclaims the given 771 * amount of memory (from both anon and file, if possible). 772 */ 773 static int test_memcg_reclaim(const char *root) 774 { 775 int ret = KSFT_FAIL, fd, retries; 776 char *memcg; 777 long current, expected_usage, to_reclaim; 778 char buf[64]; 779 780 memcg = cg_name(root, "memcg_test"); 781 if (!memcg) 782 goto cleanup; 783 784 if (cg_create(memcg)) 785 goto cleanup; 786 787 current = cg_read_long(memcg, "memory.current"); 788 if (current != 0) 789 goto cleanup; 790 791 fd = get_temp_fd(); 792 if (fd < 0) 793 goto cleanup; 794 795 cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd); 796 797 /* 798 * If swap is enabled, try to reclaim from both anon and file, else try 799 * to reclaim from file only. 800 */ 801 if (is_swap_enabled()) { 802 cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50)); 803 expected_usage = MB(100); 804 } else 805 expected_usage = MB(50); 806 807 /* 808 * Wait until current usage reaches the expected usage (or we run out of 809 * retries). 810 */ 811 retries = 5; 812 while (!values_close(cg_read_long(memcg, "memory.current"), 813 expected_usage, 10)) { 814 if (retries--) { 815 sleep(1); 816 continue; 817 } else { 818 fprintf(stderr, 819 "failed to allocate %ld for memcg reclaim test\n", 820 expected_usage); 821 goto cleanup; 822 } 823 } 824 825 /* 826 * Reclaim until current reaches 30M, this makes sure we hit both anon 827 * and file if swap is enabled. 828 */ 829 retries = 5; 830 while (true) { 831 int err; 832 833 current = cg_read_long(memcg, "memory.current"); 834 to_reclaim = current - MB(30); 835 836 /* 837 * We only keep looping if we get EAGAIN, which means we could 838 * not reclaim the full amount. 839 */ 840 if (to_reclaim <= 0) 841 goto cleanup; 842 843 844 snprintf(buf, sizeof(buf), "%ld", to_reclaim); 845 err = cg_write(memcg, "memory.reclaim", buf); 846 if (!err) { 847 /* 848 * If writing succeeds, then the written amount should have been 849 * fully reclaimed (and maybe more). 850 */ 851 current = cg_read_long(memcg, "memory.current"); 852 if (!values_close(current, MB(30), 3) && current > MB(30)) 853 goto cleanup; 854 break; 855 } 856 857 /* The kernel could not reclaim the full amount, try again. */ 858 if (err == -EAGAIN && retries--) 859 continue; 860 861 /* We got an unexpected error or ran out of retries. */ 862 goto cleanup; 863 } 864 865 ret = KSFT_PASS; 866 cleanup: 867 cg_destroy(memcg); 868 free(memcg); 869 close(fd); 870 871 return ret; 872 } 873 874 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) 875 { 876 long mem_max = (long)arg; 877 size_t size = MB(50); 878 char *buf, *ptr; 879 long mem_current, swap_current; 880 int ret = -1; 881 882 buf = malloc(size); 883 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) 884 *ptr = 0; 885 886 mem_current = cg_read_long(cgroup, "memory.current"); 887 if (!mem_current || !values_close(mem_current, mem_max, 3)) 888 goto cleanup; 889 890 swap_current = cg_read_long(cgroup, "memory.swap.current"); 891 if (!swap_current || 892 !values_close(mem_current + swap_current, size, 3)) 893 goto cleanup; 894 895 ret = 0; 896 cleanup: 897 free(buf); 898 return ret; 899 } 900 901 /* 902 * This test checks that memory.swap.max limits the amount of 903 * anonymous memory which can be swapped out. 904 */ 905 static int test_memcg_swap_max(const char *root) 906 { 907 int ret = KSFT_FAIL; 908 char *memcg; 909 long max; 910 911 if (!is_swap_enabled()) 912 return KSFT_SKIP; 913 914 memcg = cg_name(root, "memcg_test"); 915 if (!memcg) 916 goto cleanup; 917 918 if (cg_create(memcg)) 919 goto cleanup; 920 921 if (cg_read_long(memcg, "memory.swap.current")) { 922 ret = KSFT_SKIP; 923 goto cleanup; 924 } 925 926 if (cg_read_strcmp(memcg, "memory.max", "max\n")) 927 goto cleanup; 928 929 if (cg_read_strcmp(memcg, "memory.swap.max", "max\n")) 930 goto cleanup; 931 932 if (cg_write(memcg, "memory.swap.max", "30M")) 933 goto cleanup; 934 935 if (cg_write(memcg, "memory.max", "30M")) 936 goto cleanup; 937 938 /* Should be killed by OOM killer */ 939 if (!cg_run(memcg, alloc_anon, (void *)MB(100))) 940 goto cleanup; 941 942 if (cg_read_key_long(memcg, "memory.events", "oom ") != 1) 943 goto cleanup; 944 945 if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1) 946 goto cleanup; 947 948 if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30))) 949 goto cleanup; 950 951 max = cg_read_key_long(memcg, "memory.events", "max "); 952 if (max <= 0) 953 goto cleanup; 954 955 ret = KSFT_PASS; 956 957 cleanup: 958 cg_destroy(memcg); 959 free(memcg); 960 961 return ret; 962 } 963 964 /* 965 * This test disables swapping and tries to allocate anonymous memory 966 * up to OOM. Then it checks for oom and oom_kill events in 967 * memory.events. 968 */ 969 static int test_memcg_oom_events(const char *root) 970 { 971 int ret = KSFT_FAIL; 972 char *memcg; 973 974 memcg = cg_name(root, "memcg_test"); 975 if (!memcg) 976 goto cleanup; 977 978 if (cg_create(memcg)) 979 goto cleanup; 980 981 if (cg_write(memcg, "memory.max", "30M")) 982 goto cleanup; 983 984 if (cg_write(memcg, "memory.swap.max", "0")) 985 goto cleanup; 986 987 if (!cg_run(memcg, alloc_anon, (void *)MB(100))) 988 goto cleanup; 989 990 if (cg_read_strcmp(memcg, "cgroup.procs", "")) 991 goto cleanup; 992 993 if (cg_read_key_long(memcg, "memory.events", "oom ") != 1) 994 goto cleanup; 995 996 if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1) 997 goto cleanup; 998 999 ret = KSFT_PASS; 1000 1001 cleanup: 1002 cg_destroy(memcg); 1003 free(memcg); 1004 1005 return ret; 1006 } 1007 1008 struct tcp_server_args { 1009 unsigned short port; 1010 int ctl[2]; 1011 }; 1012 1013 static int tcp_server(const char *cgroup, void *arg) 1014 { 1015 struct tcp_server_args *srv_args = arg; 1016 struct sockaddr_in6 saddr = { 0 }; 1017 socklen_t slen = sizeof(saddr); 1018 int sk, client_sk, ctl_fd, yes = 1, ret = -1; 1019 1020 close(srv_args->ctl[0]); 1021 ctl_fd = srv_args->ctl[1]; 1022 1023 saddr.sin6_family = AF_INET6; 1024 saddr.sin6_addr = in6addr_any; 1025 saddr.sin6_port = htons(srv_args->port); 1026 1027 sk = socket(AF_INET6, SOCK_STREAM, 0); 1028 if (sk < 0) 1029 return ret; 1030 1031 if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) 1032 goto cleanup; 1033 1034 if (bind(sk, (struct sockaddr *)&saddr, slen)) { 1035 write(ctl_fd, &errno, sizeof(errno)); 1036 goto cleanup; 1037 } 1038 1039 if (listen(sk, 1)) 1040 goto cleanup; 1041 1042 ret = 0; 1043 if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { 1044 ret = -1; 1045 goto cleanup; 1046 } 1047 1048 client_sk = accept(sk, NULL, NULL); 1049 if (client_sk < 0) 1050 goto cleanup; 1051 1052 ret = -1; 1053 for (;;) { 1054 uint8_t buf[0x100000]; 1055 1056 if (write(client_sk, buf, sizeof(buf)) <= 0) { 1057 if (errno == ECONNRESET) 1058 ret = 0; 1059 break; 1060 } 1061 } 1062 1063 close(client_sk); 1064 1065 cleanup: 1066 close(sk); 1067 return ret; 1068 } 1069 1070 static int tcp_client(const char *cgroup, unsigned short port) 1071 { 1072 const char server[] = "localhost"; 1073 struct addrinfo *ai; 1074 char servport[6]; 1075 int retries = 0x10; /* nice round number */ 1076 int sk, ret; 1077 1078 snprintf(servport, sizeof(servport), "%hd", port); 1079 ret = getaddrinfo(server, servport, NULL, &ai); 1080 if (ret) 1081 return ret; 1082 1083 sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); 1084 if (sk < 0) 1085 goto free_ainfo; 1086 1087 ret = connect(sk, ai->ai_addr, ai->ai_addrlen); 1088 if (ret < 0) 1089 goto close_sk; 1090 1091 ret = KSFT_FAIL; 1092 while (retries--) { 1093 uint8_t buf[0x100000]; 1094 long current, sock; 1095 1096 if (read(sk, buf, sizeof(buf)) <= 0) 1097 goto close_sk; 1098 1099 current = cg_read_long(cgroup, "memory.current"); 1100 sock = cg_read_key_long(cgroup, "memory.stat", "sock "); 1101 1102 if (current < 0 || sock < 0) 1103 goto close_sk; 1104 1105 if (values_close(current, sock, 10)) { 1106 ret = KSFT_PASS; 1107 break; 1108 } 1109 } 1110 1111 close_sk: 1112 close(sk); 1113 free_ainfo: 1114 freeaddrinfo(ai); 1115 return ret; 1116 } 1117 1118 /* 1119 * This test checks socket memory accounting. 1120 * The test forks a TCP server listens on a random port between 1000 1121 * and 61000. Once it gets a client connection, it starts writing to 1122 * its socket. 1123 * The TCP client interleaves reads from the socket with check whether 1124 * memory.current and memory.stat.sock are similar. 1125 */ 1126 static int test_memcg_sock(const char *root) 1127 { 1128 int bind_retries = 5, ret = KSFT_FAIL, pid, err; 1129 unsigned short port; 1130 char *memcg; 1131 1132 memcg = cg_name(root, "memcg_test"); 1133 if (!memcg) 1134 goto cleanup; 1135 1136 if (cg_create(memcg)) 1137 goto cleanup; 1138 1139 while (bind_retries--) { 1140 struct tcp_server_args args; 1141 1142 if (pipe(args.ctl)) 1143 goto cleanup; 1144 1145 port = args.port = 1000 + rand() % 60000; 1146 1147 pid = cg_run_nowait(memcg, tcp_server, &args); 1148 if (pid < 0) 1149 goto cleanup; 1150 1151 close(args.ctl[1]); 1152 if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err)) 1153 goto cleanup; 1154 close(args.ctl[0]); 1155 1156 if (!err) 1157 break; 1158 if (err != EADDRINUSE) 1159 goto cleanup; 1160 1161 waitpid(pid, NULL, 0); 1162 } 1163 1164 if (err == EADDRINUSE) { 1165 ret = KSFT_SKIP; 1166 goto cleanup; 1167 } 1168 1169 if (tcp_client(memcg, port) != KSFT_PASS) 1170 goto cleanup; 1171 1172 waitpid(pid, &err, 0); 1173 if (WEXITSTATUS(err)) 1174 goto cleanup; 1175 1176 if (cg_read_long(memcg, "memory.current") < 0) 1177 goto cleanup; 1178 1179 if (cg_read_key_long(memcg, "memory.stat", "sock ")) 1180 goto cleanup; 1181 1182 ret = KSFT_PASS; 1183 1184 cleanup: 1185 cg_destroy(memcg); 1186 free(memcg); 1187 1188 return ret; 1189 } 1190 1191 /* 1192 * This test disables swapping and tries to allocate anonymous memory 1193 * up to OOM with memory.group.oom set. Then it checks that all 1194 * processes in the leaf were killed. It also checks that oom_events 1195 * were propagated to the parent level. 1196 */ 1197 static int test_memcg_oom_group_leaf_events(const char *root) 1198 { 1199 int ret = KSFT_FAIL; 1200 char *parent, *child; 1201 long parent_oom_events; 1202 1203 parent = cg_name(root, "memcg_test_0"); 1204 child = cg_name(root, "memcg_test_0/memcg_test_1"); 1205 1206 if (!parent || !child) 1207 goto cleanup; 1208 1209 if (cg_create(parent)) 1210 goto cleanup; 1211 1212 if (cg_create(child)) 1213 goto cleanup; 1214 1215 if (cg_write(parent, "cgroup.subtree_control", "+memory")) 1216 goto cleanup; 1217 1218 if (cg_write(child, "memory.max", "50M")) 1219 goto cleanup; 1220 1221 if (cg_write(child, "memory.swap.max", "0")) 1222 goto cleanup; 1223 1224 if (cg_write(child, "memory.oom.group", "1")) 1225 goto cleanup; 1226 1227 cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60)); 1228 cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1)); 1229 cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1)); 1230 if (!cg_run(child, alloc_anon, (void *)MB(100))) 1231 goto cleanup; 1232 1233 if (cg_test_proc_killed(child)) 1234 goto cleanup; 1235 1236 if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0) 1237 goto cleanup; 1238 1239 if (cg_read_key_long(parent, "memory.events", "oom_kill ") <= 0) 1240 goto cleanup; 1241 1242 ret = KSFT_PASS; 1243 1244 cleanup: 1245 if (child) 1246 cg_destroy(child); 1247 if (parent) 1248 cg_destroy(parent); 1249 free(child); 1250 free(parent); 1251 1252 return ret; 1253 } 1254 1255 /* 1256 * This test disables swapping and tries to allocate anonymous memory 1257 * up to OOM with memory.group.oom set. Then it checks that all 1258 * processes in the parent and leaf were killed. 1259 */ 1260 static int test_memcg_oom_group_parent_events(const char *root) 1261 { 1262 int ret = KSFT_FAIL; 1263 char *parent, *child; 1264 1265 parent = cg_name(root, "memcg_test_0"); 1266 child = cg_name(root, "memcg_test_0/memcg_test_1"); 1267 1268 if (!parent || !child) 1269 goto cleanup; 1270 1271 if (cg_create(parent)) 1272 goto cleanup; 1273 1274 if (cg_create(child)) 1275 goto cleanup; 1276 1277 if (cg_write(parent, "memory.max", "80M")) 1278 goto cleanup; 1279 1280 if (cg_write(parent, "memory.swap.max", "0")) 1281 goto cleanup; 1282 1283 if (cg_write(parent, "memory.oom.group", "1")) 1284 goto cleanup; 1285 1286 cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60)); 1287 cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1)); 1288 cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1)); 1289 1290 if (!cg_run(child, alloc_anon, (void *)MB(100))) 1291 goto cleanup; 1292 1293 if (cg_test_proc_killed(child)) 1294 goto cleanup; 1295 if (cg_test_proc_killed(parent)) 1296 goto cleanup; 1297 1298 ret = KSFT_PASS; 1299 1300 cleanup: 1301 if (child) 1302 cg_destroy(child); 1303 if (parent) 1304 cg_destroy(parent); 1305 free(child); 1306 free(parent); 1307 1308 return ret; 1309 } 1310 1311 /* 1312 * This test disables swapping and tries to allocate anonymous memory 1313 * up to OOM with memory.group.oom set. Then it checks that all 1314 * processes were killed except those set with OOM_SCORE_ADJ_MIN 1315 */ 1316 static int test_memcg_oom_group_score_events(const char *root) 1317 { 1318 int ret = KSFT_FAIL; 1319 char *memcg; 1320 int safe_pid; 1321 1322 memcg = cg_name(root, "memcg_test_0"); 1323 1324 if (!memcg) 1325 goto cleanup; 1326 1327 if (cg_create(memcg)) 1328 goto cleanup; 1329 1330 if (cg_write(memcg, "memory.max", "50M")) 1331 goto cleanup; 1332 1333 if (cg_write(memcg, "memory.swap.max", "0")) 1334 goto cleanup; 1335 1336 if (cg_write(memcg, "memory.oom.group", "1")) 1337 goto cleanup; 1338 1339 safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1)); 1340 if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN)) 1341 goto cleanup; 1342 1343 cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1)); 1344 if (!cg_run(memcg, alloc_anon, (void *)MB(100))) 1345 goto cleanup; 1346 1347 parent_oom_events = cg_read_key_long( 1348 parent, "memory.events", "oom_kill "); 1349 /* 1350 * If memory_localevents is not enabled (the default), the parent should 1351 * count OOM events in its children groups. Otherwise, it should not 1352 * have observed any events. 1353 */ 1354 if ((has_localevents && parent_oom_events == 0) || 1355 parent_oom_events > 0) 1356 ret = KSFT_PASS; 1357 1358 if (kill(safe_pid, SIGKILL)) 1359 goto cleanup; 1360 1361 cleanup: 1362 if (memcg) 1363 cg_destroy(memcg); 1364 free(memcg); 1365 1366 return ret; 1367 } 1368 1369 #define T(x) { x, #x } 1370 struct memcg_test { 1371 int (*fn)(const char *root); 1372 const char *name; 1373 } tests[] = { 1374 T(test_memcg_subtree_control), 1375 T(test_memcg_current), 1376 T(test_memcg_min), 1377 T(test_memcg_low), 1378 T(test_memcg_high), 1379 T(test_memcg_high_sync), 1380 T(test_memcg_max), 1381 T(test_memcg_reclaim), 1382 T(test_memcg_oom_events), 1383 T(test_memcg_swap_max), 1384 T(test_memcg_sock), 1385 T(test_memcg_oom_group_leaf_events), 1386 T(test_memcg_oom_group_parent_events), 1387 T(test_memcg_oom_group_score_events), 1388 }; 1389 #undef T 1390 1391 int main(int argc, char **argv) 1392 { 1393 char root[PATH_MAX]; 1394 int i, proc_status, ret = EXIT_SUCCESS; 1395 1396 if (cg_find_unified_root(root, sizeof(root))) 1397 ksft_exit_skip("cgroup v2 isn't mounted\n"); 1398 1399 /* 1400 * Check that memory controller is available: 1401 * memory is listed in cgroup.controllers 1402 */ 1403 if (cg_read_strstr(root, "cgroup.controllers", "memory")) 1404 ksft_exit_skip("memory controller isn't available\n"); 1405 1406 if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) 1407 if (cg_write(root, "cgroup.subtree_control", "+memory")) 1408 ksft_exit_skip("Failed to set memory controller\n"); 1409 1410 proc_status = proc_mount_contains("memory_recursiveprot"); 1411 if (proc_status < 0) 1412 ksft_exit_skip("Failed to query cgroup mount option\n"); 1413 has_recursiveprot = proc_status; 1414 1415 proc_status = proc_mount_contains("memory_localevents"); 1416 if (proc_status < 0) 1417 ksft_exit_skip("Failed to query cgroup mount option\n"); 1418 has_localevents = proc_status; 1419 1420 for (i = 0; i < ARRAY_SIZE(tests); i++) { 1421 switch (tests[i].fn(root)) { 1422 case KSFT_PASS: 1423 ksft_test_result_pass("%s\n", tests[i].name); 1424 break; 1425 case KSFT_SKIP: 1426 ksft_test_result_skip("%s\n", tests[i].name); 1427 break; 1428 default: 1429 ret = EXIT_FAILURE; 1430 ksft_test_result_fail("%s\n", tests[i].name); 1431 break; 1432 } 1433 } 1434 1435 return ret; 1436 } 1437