1 /* SPDX-License-Identifier: GPL-2.0 */ 2 3 #define _GNU_SOURCE 4 5 #include <errno.h> 6 #include <fcntl.h> 7 #include <linux/limits.h> 8 #include <poll.h> 9 #include <signal.h> 10 #include <stdio.h> 11 #include <stdlib.h> 12 #include <string.h> 13 #include <sys/inotify.h> 14 #include <sys/stat.h> 15 #include <sys/types.h> 16 #include <sys/wait.h> 17 #include <unistd.h> 18 19 #include "cgroup_util.h" 20 #include "../clone3/clone3_selftests.h" 21 22 static ssize_t read_text(const char *path, char *buf, size_t max_len) 23 { 24 ssize_t len; 25 int fd; 26 27 fd = open(path, O_RDONLY); 28 if (fd < 0) 29 return fd; 30 31 len = read(fd, buf, max_len - 1); 32 if (len < 0) 33 goto out; 34 35 buf[len] = 0; 36 out: 37 close(fd); 38 return len; 39 } 40 41 static ssize_t write_text(const char *path, char *buf, ssize_t len) 42 { 43 int fd; 44 45 fd = open(path, O_WRONLY | O_APPEND); 46 if (fd < 0) 47 return fd; 48 49 len = write(fd, buf, len); 50 if (len < 0) { 51 close(fd); 52 return len; 53 } 54 55 close(fd); 56 57 return len; 58 } 59 60 char *cg_name(const char *root, const char *name) 61 { 62 size_t len = strlen(root) + strlen(name) + 2; 63 char *ret = malloc(len); 64 65 snprintf(ret, len, "%s/%s", root, name); 66 67 return ret; 68 } 69 70 char *cg_name_indexed(const char *root, const char *name, int index) 71 { 72 size_t len = strlen(root) + strlen(name) + 10; 73 char *ret = malloc(len); 74 75 snprintf(ret, len, "%s/%s_%d", root, name, index); 76 77 return ret; 78 } 79 80 char *cg_control(const char *cgroup, const char *control) 81 { 82 size_t len = strlen(cgroup) + strlen(control) + 2; 83 char *ret = malloc(len); 84 85 snprintf(ret, len, "%s/%s", cgroup, control); 86 87 return ret; 88 } 89 90 int cg_read(const char *cgroup, const char *control, char *buf, size_t len) 91 { 92 char path[PATH_MAX]; 93 94 snprintf(path, sizeof(path), "%s/%s", cgroup, control); 95 96 if (read_text(path, buf, len) >= 0) 97 return 0; 98 99 return -1; 100 } 101 102 int cg_read_strcmp(const char *cgroup, const char *control, 103 const char *expected) 104 { 105 size_t size; 106 char *buf; 107 int ret; 108 109 /* Handle the case of comparing against empty string */ 110 if (!expected) 111 return -1; 112 else 113 size = strlen(expected) + 1; 114 115 buf = malloc(size); 116 if (!buf) 117 return -1; 118 119 if (cg_read(cgroup, control, buf, size)) { 120 free(buf); 121 return -1; 122 } 123 124 ret = strcmp(expected, buf); 125 free(buf); 126 return ret; 127 } 128 129 int cg_read_strstr(const char *cgroup, const char *control, const char *needle) 130 { 131 char buf[PAGE_SIZE]; 132 133 if (cg_read(cgroup, control, buf, sizeof(buf))) 134 return -1; 135 136 return strstr(buf, needle) ? 0 : -1; 137 } 138 139 long cg_read_long(const char *cgroup, const char *control) 140 { 141 char buf[128]; 142 143 if (cg_read(cgroup, control, buf, sizeof(buf))) 144 return -1; 145 146 return atol(buf); 147 } 148 149 long cg_read_key_long(const char *cgroup, const char *control, const char *key) 150 { 151 char buf[PAGE_SIZE]; 152 char *ptr; 153 154 if (cg_read(cgroup, control, buf, sizeof(buf))) 155 return -1; 156 157 ptr = strstr(buf, key); 158 if (!ptr) 159 return -1; 160 161 return atol(ptr + strlen(key)); 162 } 163 164 long cg_read_lc(const char *cgroup, const char *control) 165 { 166 char buf[PAGE_SIZE]; 167 const char delim[] = "\n"; 168 char *line; 169 long cnt = 0; 170 171 if (cg_read(cgroup, control, buf, sizeof(buf))) 172 return -1; 173 174 for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) 175 cnt++; 176 177 return cnt; 178 } 179 180 int cg_write(const char *cgroup, const char *control, char *buf) 181 { 182 char path[PATH_MAX]; 183 ssize_t len = strlen(buf); 184 185 snprintf(path, sizeof(path), "%s/%s", cgroup, control); 186 187 if (write_text(path, buf, len) == len) 188 return 0; 189 190 return -1; 191 } 192 193 int cg_find_unified_root(char *root, size_t len) 194 { 195 char buf[10 * PAGE_SIZE]; 196 char *fs, *mount, *type; 197 const char delim[] = "\n\t "; 198 199 if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0) 200 return -1; 201 202 /* 203 * Example: 204 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0 205 */ 206 for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) { 207 mount = strtok(NULL, delim); 208 type = strtok(NULL, delim); 209 strtok(NULL, delim); 210 strtok(NULL, delim); 211 strtok(NULL, delim); 212 213 if (strcmp(type, "cgroup2") == 0) { 214 strncpy(root, mount, len); 215 return 0; 216 } 217 } 218 219 return -1; 220 } 221 222 int cg_create(const char *cgroup) 223 { 224 return mkdir(cgroup, 0755); 225 } 226 227 int cg_wait_for_proc_count(const char *cgroup, int count) 228 { 229 char buf[10 * PAGE_SIZE] = {0}; 230 int attempts; 231 char *ptr; 232 233 for (attempts = 10; attempts >= 0; attempts--) { 234 int nr = 0; 235 236 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf))) 237 break; 238 239 for (ptr = buf; *ptr; ptr++) 240 if (*ptr == '\n') 241 nr++; 242 243 if (nr >= count) 244 return 0; 245 246 usleep(100000); 247 } 248 249 return -1; 250 } 251 252 int cg_killall(const char *cgroup) 253 { 254 char buf[PAGE_SIZE]; 255 char *ptr = buf; 256 257 /* If cgroup.kill exists use it. */ 258 if (!cg_write(cgroup, "cgroup.kill", "1")) 259 return 0; 260 261 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf))) 262 return -1; 263 264 while (ptr < buf + sizeof(buf)) { 265 int pid = strtol(ptr, &ptr, 10); 266 267 if (pid == 0) 268 break; 269 if (*ptr) 270 ptr++; 271 else 272 break; 273 if (kill(pid, SIGKILL)) 274 return -1; 275 } 276 277 return 0; 278 } 279 280 int cg_destroy(const char *cgroup) 281 { 282 int ret; 283 284 retry: 285 ret = rmdir(cgroup); 286 if (ret && errno == EBUSY) { 287 cg_killall(cgroup); 288 usleep(100); 289 goto retry; 290 } 291 292 if (ret && errno == ENOENT) 293 ret = 0; 294 295 return ret; 296 } 297 298 int cg_enter(const char *cgroup, int pid) 299 { 300 char pidbuf[64]; 301 302 snprintf(pidbuf, sizeof(pidbuf), "%d", pid); 303 return cg_write(cgroup, "cgroup.procs", pidbuf); 304 } 305 306 int cg_enter_current(const char *cgroup) 307 { 308 return cg_write(cgroup, "cgroup.procs", "0"); 309 } 310 311 int cg_enter_current_thread(const char *cgroup) 312 { 313 return cg_write(cgroup, "cgroup.threads", "0"); 314 } 315 316 int cg_run(const char *cgroup, 317 int (*fn)(const char *cgroup, void *arg), 318 void *arg) 319 { 320 int pid, retcode; 321 322 pid = fork(); 323 if (pid < 0) { 324 return pid; 325 } else if (pid == 0) { 326 char buf[64]; 327 328 snprintf(buf, sizeof(buf), "%d", getpid()); 329 if (cg_write(cgroup, "cgroup.procs", buf)) 330 exit(EXIT_FAILURE); 331 exit(fn(cgroup, arg)); 332 } else { 333 waitpid(pid, &retcode, 0); 334 if (WIFEXITED(retcode)) 335 return WEXITSTATUS(retcode); 336 else 337 return -1; 338 } 339 } 340 341 pid_t clone_into_cgroup(int cgroup_fd) 342 { 343 #ifdef CLONE_ARGS_SIZE_VER2 344 pid_t pid; 345 346 struct __clone_args args = { 347 .flags = CLONE_INTO_CGROUP, 348 .exit_signal = SIGCHLD, 349 .cgroup = cgroup_fd, 350 }; 351 352 pid = sys_clone3(&args, sizeof(struct __clone_args)); 353 /* 354 * Verify that this is a genuine test failure: 355 * ENOSYS -> clone3() not available 356 * E2BIG -> CLONE_INTO_CGROUP not available 357 */ 358 if (pid < 0 && (errno == ENOSYS || errno == E2BIG)) 359 goto pretend_enosys; 360 361 return pid; 362 363 pretend_enosys: 364 #endif 365 errno = ENOSYS; 366 return -ENOSYS; 367 } 368 369 int clone_reap(pid_t pid, int options) 370 { 371 int ret; 372 siginfo_t info = { 373 .si_signo = 0, 374 }; 375 376 again: 377 ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD); 378 if (ret < 0) { 379 if (errno == EINTR) 380 goto again; 381 return -1; 382 } 383 384 if (options & WEXITED) { 385 if (WIFEXITED(info.si_status)) 386 return WEXITSTATUS(info.si_status); 387 } 388 389 if (options & WSTOPPED) { 390 if (WIFSTOPPED(info.si_status)) 391 return WSTOPSIG(info.si_status); 392 } 393 394 if (options & WCONTINUED) { 395 if (WIFCONTINUED(info.si_status)) 396 return 0; 397 } 398 399 return -1; 400 } 401 402 int dirfd_open_opath(const char *dir) 403 { 404 return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 405 } 406 407 #define close_prot_errno(fd) \ 408 if (fd >= 0) { \ 409 int _e_ = errno; \ 410 close(fd); \ 411 errno = _e_; \ 412 } 413 414 static int clone_into_cgroup_run_nowait(const char *cgroup, 415 int (*fn)(const char *cgroup, void *arg), 416 void *arg) 417 { 418 int cgroup_fd; 419 pid_t pid; 420 421 cgroup_fd = dirfd_open_opath(cgroup); 422 if (cgroup_fd < 0) 423 return -1; 424 425 pid = clone_into_cgroup(cgroup_fd); 426 close_prot_errno(cgroup_fd); 427 if (pid == 0) 428 exit(fn(cgroup, arg)); 429 430 return pid; 431 } 432 433 int cg_run_nowait(const char *cgroup, 434 int (*fn)(const char *cgroup, void *arg), 435 void *arg) 436 { 437 int pid; 438 439 pid = clone_into_cgroup_run_nowait(cgroup, fn, arg); 440 if (pid > 0) 441 return pid; 442 443 /* Genuine test failure. */ 444 if (pid < 0 && errno != ENOSYS) 445 return -1; 446 447 pid = fork(); 448 if (pid == 0) { 449 char buf[64]; 450 451 snprintf(buf, sizeof(buf), "%d", getpid()); 452 if (cg_write(cgroup, "cgroup.procs", buf)) 453 exit(EXIT_FAILURE); 454 exit(fn(cgroup, arg)); 455 } 456 457 return pid; 458 } 459 460 int get_temp_fd(void) 461 { 462 return open(".", O_TMPFILE | O_RDWR | O_EXCL); 463 } 464 465 int alloc_pagecache(int fd, size_t size) 466 { 467 char buf[PAGE_SIZE]; 468 struct stat st; 469 int i; 470 471 if (fstat(fd, &st)) 472 goto cleanup; 473 474 size += st.st_size; 475 476 if (ftruncate(fd, size)) 477 goto cleanup; 478 479 for (i = 0; i < size; i += sizeof(buf)) 480 read(fd, buf, sizeof(buf)); 481 482 return 0; 483 484 cleanup: 485 return -1; 486 } 487 488 int alloc_anon(const char *cgroup, void *arg) 489 { 490 size_t size = (unsigned long)arg; 491 char *buf, *ptr; 492 493 buf = malloc(size); 494 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) 495 *ptr = 0; 496 497 free(buf); 498 return 0; 499 } 500 501 int is_swap_enabled(void) 502 { 503 char buf[PAGE_SIZE]; 504 const char delim[] = "\n"; 505 int cnt = 0; 506 char *line; 507 508 if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0) 509 return -1; 510 511 for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) 512 cnt++; 513 514 return cnt > 1; 515 } 516 517 int set_oom_adj_score(int pid, int score) 518 { 519 char path[PATH_MAX]; 520 int fd, len; 521 522 sprintf(path, "/proc/%d/oom_score_adj", pid); 523 524 fd = open(path, O_WRONLY | O_APPEND); 525 if (fd < 0) 526 return fd; 527 528 len = dprintf(fd, "%d", score); 529 if (len < 0) { 530 close(fd); 531 return len; 532 } 533 534 close(fd); 535 return 0; 536 } 537 538 ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) 539 { 540 char path[PATH_MAX]; 541 542 if (!pid) 543 snprintf(path, sizeof(path), "/proc/%s/%s", 544 thread ? "thread-self" : "self", item); 545 else 546 snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); 547 548 return read_text(path, buf, size); 549 } 550 551 int proc_read_strstr(int pid, bool thread, const char *item, const char *needle) 552 { 553 char buf[PAGE_SIZE]; 554 555 if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0) 556 return -1; 557 558 return strstr(buf, needle) ? 0 : -1; 559 } 560 561 int clone_into_cgroup_run_wait(const char *cgroup) 562 { 563 int cgroup_fd; 564 pid_t pid; 565 566 cgroup_fd = dirfd_open_opath(cgroup); 567 if (cgroup_fd < 0) 568 return -1; 569 570 pid = clone_into_cgroup(cgroup_fd); 571 close_prot_errno(cgroup_fd); 572 if (pid < 0) 573 return -1; 574 575 if (pid == 0) 576 exit(EXIT_SUCCESS); 577 578 /* 579 * We don't care whether this fails. We only care whether the initial 580 * clone succeeded. 581 */ 582 (void)clone_reap(pid, WEXITED); 583 return 0; 584 } 585 586 static int __prepare_for_wait(const char *cgroup, const char *filename) 587 { 588 int fd, ret = -1; 589 590 fd = inotify_init1(0); 591 if (fd == -1) 592 return fd; 593 594 ret = inotify_add_watch(fd, cg_control(cgroup, filename), IN_MODIFY); 595 if (ret == -1) { 596 close(fd); 597 fd = -1; 598 } 599 600 return fd; 601 } 602 603 int cg_prepare_for_wait(const char *cgroup) 604 { 605 return __prepare_for_wait(cgroup, "cgroup.events"); 606 } 607 608 int memcg_prepare_for_wait(const char *cgroup) 609 { 610 return __prepare_for_wait(cgroup, "memory.events"); 611 } 612 613 int cg_wait_for(int fd) 614 { 615 int ret = -1; 616 struct pollfd fds = { 617 .fd = fd, 618 .events = POLLIN, 619 }; 620 621 while (true) { 622 ret = poll(&fds, 1, 10000); 623 624 if (ret == -1) { 625 if (errno == EINTR) 626 continue; 627 628 break; 629 } 630 631 if (ret > 0 && fds.revents & POLLIN) { 632 ret = 0; 633 break; 634 } 635 } 636 637 return ret; 638 } 639