1 /* SPDX-License-Identifier: GPL-2.0 */
2 
3 #define _GNU_SOURCE
4 
5 #include <errno.h>
6 #include <fcntl.h>
7 #include <linux/limits.h>
8 #include <poll.h>
9 #include <signal.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <sys/inotify.h>
14 #include <sys/stat.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <unistd.h>
18 
19 #include "cgroup_util.h"
20 #include "../clone3/clone3_selftests.h"
21 
/*
 * Read up to max_len - 1 bytes from the file at @path into @buf and
 * NUL-terminate the result.
 *
 * Returns the number of bytes read on success, or -errno on failure.
 * A zero-sized buffer is rejected with -EINVAL because it has no room for
 * the terminating NUL.
 */
static ssize_t read_text(const char *path, char *buf, size_t max_len)
{
	ssize_t len;
	int fd, err;

	if (!max_len)
		return -EINVAL;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -errno;

	len = read(fd, buf, max_len - 1);
	/* Capture errno right away: close() below may overwrite it. */
	err = errno;

	if (len >= 0)
		buf[len] = 0;

	close(fd);
	return len < 0 ? -err : len;
}
40 
/*
 * Append @len bytes from @buf to the already existing file at @path.
 *
 * Returns the number of bytes written on success, or -errno on failure.
 */
static ssize_t write_text(const char *path, char *buf, ssize_t len)
{
	int fd, err;

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return -errno;

	len = write(fd, buf, len);
	/* Capture errno right away: close() below may overwrite it. */
	err = errno;

	close(fd);
	return len < 0 ? -err : len;
}
54 
/*
 * Build the string "<root>/<name>" in a freshly allocated buffer.
 *
 * Returns the new string, which the caller must free(), or NULL if the
 * allocation fails.
 */
char *cg_name(const char *root, const char *name)
{
	/* One extra byte for the '/' separator and one for the NUL. */
	size_t len = strlen(root) + strlen(name) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", root, name);

	return ret;
}
64 
/*
 * Build the string "<root>/<name>_<index>" in a freshly allocated buffer.
 *
 * The buffer is sized from the formatted length, so any int value of
 * @index (including negative or 10-digit values) fits without truncation.
 *
 * Returns the new string, which the caller must free(), or NULL if
 * formatting or the allocation fails.
 */
char *cg_name_indexed(const char *root, const char *name, int index)
{
	/* A NULL/0 destination makes snprintf() only report the length. */
	int need = snprintf(NULL, 0, "%s/%s_%d", root, name, index);
	size_t len;
	char *ret;

	if (need < 0)
		return NULL;

	len = (size_t)need + 1;
	ret = malloc(len);
	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s_%d", root, name, index);

	return ret;
}
74 
/*
 * Build the path "<cgroup>/<control>" of a control file in a freshly
 * allocated buffer.
 *
 * Returns the new string, which the caller must free(), or NULL if the
 * allocation fails.
 */
char *cg_control(const char *cgroup, const char *control)
{
	/* One extra byte for the '/' separator and one for the NUL. */
	size_t len = strlen(cgroup) + strlen(control) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", cgroup, control);

	return ret;
}
84 
/*
 * Read the file "<cgroup>/<control>" into @buf, a buffer of @len bytes.
 *
 * Returns 0 on success, or the negative value reported by read_text()
 * (-errno) on failure.
 */
int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
{
	char file[PATH_MAX];
	ssize_t nread;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);
	nread = read_text(file, buf, len);
	if (nread < 0)
		return nread;

	return 0;
}
96 
/*
 * Compare the contents of "<cgroup>/<control>" with @expected.
 *
 * The read buffer holds strlen(expected) + 1 bytes, so at most
 * strlen(expected) bytes of the file take part in the comparison.
 *
 * Returns the strcmp() result of @expected against the data read (0 when
 * they are equal), or -1 if @expected is NULL, the buffer cannot be
 * allocated, or the read fails.
 */
int cg_read_strcmp(const char *cgroup, const char *control,
		   const char *expected)
{
	size_t buf_len;
	char *data;
	int cmp = -1;

	/* Without an expected string there is nothing to compare against. */
	if (!expected)
		return -1;

	buf_len = strlen(expected) + 1;
	data = malloc(buf_len);
	if (!data)
		return -1;

	if (!cg_read(cgroup, control, data, buf_len))
		cmp = strcmp(expected, data);

	free(data);
	return cmp;
}
123 
124 int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
125 {
126 	char buf[PAGE_SIZE];
127 
128 	if (cg_read(cgroup, control, buf, sizeof(buf)))
129 		return -1;
130 
131 	return strstr(buf, needle) ? 0 : -1;
132 }
133 
/*
 * Read "<cgroup>/<control>" and convert its contents with atol().
 *
 * Returns the converted value, or -1 if the read fails.
 */
long cg_read_long(const char *cgroup, const char *control)
{
	char text[128];
	int err = cg_read(cgroup, control, text, sizeof(text));

	return err ? -1 : atol(text);
}
143 
144 long cg_read_key_long(const char *cgroup, const char *control, const char *key)
145 {
146 	char buf[PAGE_SIZE];
147 	char *ptr;
148 
149 	if (cg_read(cgroup, control, buf, sizeof(buf)))
150 		return -1;
151 
152 	ptr = strstr(buf, key);
153 	if (!ptr)
154 		return -1;
155 
156 	return atol(ptr + strlen(key));
157 }
158 
159 long cg_read_lc(const char *cgroup, const char *control)
160 {
161 	char buf[PAGE_SIZE];
162 	const char delim[] = "\n";
163 	char *line;
164 	long cnt = 0;
165 
166 	if (cg_read(cgroup, control, buf, sizeof(buf)))
167 		return -1;
168 
169 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
170 		cnt++;
171 
172 	return cnt;
173 }
174 
/*
 * Write the NUL-terminated string @buf to "<cgroup>/<control>".
 *
 * Returns 0 when the whole string was written. Otherwise the value from
 * write_text() is passed through: -errno on failure, or the byte count of
 * a short write.
 */
int cg_write(const char *cgroup, const char *control, char *buf)
{
	char file[PATH_MAX];
	ssize_t want, done;

	want = strlen(buf);
	snprintf(file, sizeof(file), "%s/%s", cgroup, control);

	done = write_text(file, buf, want);
	if (done == want)
		return 0;

	return done;
}
185 
186 int cg_find_unified_root(char *root, size_t len)
187 {
188 	char buf[10 * PAGE_SIZE];
189 	char *fs, *mount, *type;
190 	const char delim[] = "\n\t ";
191 
192 	if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
193 		return -1;
194 
195 	/*
196 	 * Example:
197 	 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0
198 	 */
199 	for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
200 		mount = strtok(NULL, delim);
201 		type = strtok(NULL, delim);
202 		strtok(NULL, delim);
203 		strtok(NULL, delim);
204 		strtok(NULL, delim);
205 
206 		if (strcmp(type, "cgroup2") == 0) {
207 			strncpy(root, mount, len);
208 			return 0;
209 		}
210 	}
211 
212 	return -1;
213 }
214 
/*
 * Create the directory @cgroup with mode 0755.
 *
 * Returns the mkdir() result: 0 on success, -1 with errno set on failure.
 */
int cg_create(const char *cgroup)
{
	const mode_t mode = 0755;

	return mkdir(cgroup, mode);
}
219 
220 int cg_wait_for_proc_count(const char *cgroup, int count)
221 {
222 	char buf[10 * PAGE_SIZE] = {0};
223 	int attempts;
224 	char *ptr;
225 
226 	for (attempts = 10; attempts >= 0; attempts--) {
227 		int nr = 0;
228 
229 		if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
230 			break;
231 
232 		for (ptr = buf; *ptr; ptr++)
233 			if (*ptr == '\n')
234 				nr++;
235 
236 		if (nr >= count)
237 			return 0;
238 
239 		usleep(100000);
240 	}
241 
242 	return -1;
243 }
244 
245 int cg_killall(const char *cgroup)
246 {
247 	char buf[PAGE_SIZE];
248 	char *ptr = buf;
249 
250 	/* If cgroup.kill exists use it. */
251 	if (!cg_write(cgroup, "cgroup.kill", "1"))
252 		return 0;
253 
254 	if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
255 		return -1;
256 
257 	while (ptr < buf + sizeof(buf)) {
258 		int pid = strtol(ptr, &ptr, 10);
259 
260 		if (pid == 0)
261 			break;
262 		if (*ptr)
263 			ptr++;
264 		else
265 			break;
266 		if (kill(pid, SIGKILL))
267 			return -1;
268 	}
269 
270 	return 0;
271 }
272 
/*
 * Remove the directory @cgroup.
 *
 * While rmdir() fails with EBUSY, the cgroup's processes are killed via
 * cg_killall() and the removal is retried after a 100 us sleep; there is
 * no upper bound on the number of retries. A directory that does not
 * exist (ENOENT) counts as success.
 *
 * Returns 0 on success, or the failing rmdir() result (-1) otherwise.
 */
int cg_destroy(const char *cgroup)
{
	int rc;

	for (;;) {
		rc = rmdir(cgroup);
		if (!rc || errno != EBUSY)
			break;

		cg_killall(cgroup);
		usleep(100);
	}

	if (rc && errno == ENOENT)
		rc = 0;

	return rc;
}
290 
/*
 * Write the decimal representation of @pid to "<cgroup>/cgroup.procs".
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter(const char *cgroup, int pid)
{
	char pid_str[64];

	snprintf(pid_str, sizeof(pid_str), "%d", pid);

	return cg_write(cgroup, "cgroup.procs", pid_str);
}
298 
/*
 * Write "0" to "<cgroup>/cgroup.procs".
 *
 * NOTE(review): going by the function name, "0" presumably stands for the
 * calling process - confirm against the cgroup v2 interface documentation.
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.procs", self);
}
303 
/*
 * Write "0" to "<cgroup>/cgroup.threads".
 *
 * NOTE(review): going by the function name, "0" presumably stands for the
 * calling thread - confirm against the cgroup v2 interface documentation.
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current_thread(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.threads", self);
}
308 
/*
 * Fork a child, move it into @cgroup and run @fn(cgroup, arg) in it, then
 * wait for the child to terminate.
 *
 * The child exits with EXIT_FAILURE if it cannot write its pid to
 * cgroup.procs, otherwise with the value returned by @fn.
 *
 * Returns the child's exit status if it exited normally, or a negative
 * value if fork() or waitpid() fails or the child did not exit normally.
 */
int cg_run(const char *cgroup,
	   int (*fn)(const char *cgroup, void *arg),
	   void *arg)
{
	int pid, retcode = 0;

	pid = fork();
	if (pid < 0)
		return pid;

	if (pid == 0) {
		char buf[64];

		snprintf(buf, sizeof(buf), "%d", getpid());
		if (cg_write(cgroup, "cgroup.procs", buf))
			exit(EXIT_FAILURE);
		exit(fn(cgroup, arg));
	}

	/*
	 * Retry when a signal interrupts the wait; any other failure means
	 * retcode was never filled in and must not be inspected.
	 */
	while (waitpid(pid, &retcode, 0) < 0) {
		if (errno != EINTR)
			return -1;
	}

	return WIFEXITED(retcode) ? WEXITSTATUS(retcode) : -1;
}
333 
/*
 * Create a child process directly inside the cgroup referred to by
 * @cgroup_fd, using sys_clone3() with CLONE_INTO_CGROUP.
 *
 * Returns the sys_clone3() result when the call is available. When
 * CLONE_ARGS_SIZE_VER2 is not defined at build time, or the call fails
 * with ENOSYS or E2BIG, errno is set to ENOSYS and -ENOSYS is returned so
 * callers can fall back to another method.
 */
pid_t clone_into_cgroup(int cgroup_fd)
{
#ifdef CLONE_ARGS_SIZE_VER2
	struct __clone_args args = {
		.flags = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup = cgroup_fd,
	};
	pid_t child = sys_clone3(&args, sizeof(args));

	/*
	 * Anything other than the two "not supported" errors is handed back
	 * unchanged:
	 * ENOSYS -> clone3() not available
	 * E2BIG  -> CLONE_INTO_CGROUP not available
	 */
	if (child >= 0 || (errno != ENOSYS && errno != E2BIG))
		return child;
#endif
	errno = ENOSYS;
	return -ENOSYS;
}
361 
362 int clone_reap(pid_t pid, int options)
363 {
364 	int ret;
365 	siginfo_t info = {
366 		.si_signo = 0,
367 	};
368 
369 again:
370 	ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
371 	if (ret < 0) {
372 		if (errno == EINTR)
373 			goto again;
374 		return -1;
375 	}
376 
377 	if (options & WEXITED) {
378 		if (WIFEXITED(info.si_status))
379 			return WEXITSTATUS(info.si_status);
380 	}
381 
382 	if (options & WSTOPPED) {
383 		if (WIFSTOPPED(info.si_status))
384 			return WSTOPSIG(info.si_status);
385 	}
386 
387 	if (options & WCONTINUED) {
388 		if (WIFCONTINUED(info.si_status))
389 			return 0;
390 	}
391 
392 	return -1;
393 }
394 
395 int dirfd_open_opath(const char *dir)
396 {
397 	return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
398 }
399 
/*
 * Close @fd if it is a valid descriptor (>= 0) while preserving the value
 * errno had before the call, so that close() cannot mask an earlier error
 * code.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and is safe inside unbraced if/else bodies; the argument is
 * evaluated exactly once.
 */
#define close_prot_errno(fd)                                                   \
	do {                                                                   \
		int _fd_ = (fd);                                               \
		if (_fd_ >= 0) {                                               \
			int _e_ = errno;                                       \
			close(_fd_);                                           \
			errno = _e_;                                           \
		}                                                              \
	} while (0)
406 
/*
 * Start a child directly inside @cgroup via clone_into_cgroup() and run
 * @fn(cgroup, arg) in it without waiting for it. The child exits with the
 * value returned by @fn.
 *
 * Returns the clone_into_cgroup() result in the parent (the child's pid on
 * success, negative on failure), or -1 if the cgroup directory cannot be
 * opened.
 */
static int clone_into_cgroup_run_nowait(const char *cgroup,
					int (*fn)(const char *cgroup, void *arg),
					void *arg)
{
	pid_t child;
	int cg_fd = dirfd_open_opath(cgroup);

	if (cg_fd < 0)
		return -1;

	child = clone_into_cgroup(cg_fd);
	/* errno from the clone attempt must survive the close. */
	close_prot_errno(cg_fd);

	if (child != 0)
		return child;

	exit(fn(cgroup, arg));
}
425 
/*
 * Run @fn(cgroup, arg) in a child process placed in @cgroup, without
 * waiting for the child.
 *
 * The child is first created with clone_into_cgroup_run_nowait(). If that
 * reports ENOSYS, the function falls back to fork() and has the child
 * write its own pid to cgroup.procs; that child exits with EXIT_FAILURE if
 * the write fails, otherwise with the value returned by @fn.
 *
 * Returns the child's pid on success, or a negative value on failure.
 */
int cg_run_nowait(const char *cgroup,
		  int (*fn)(const char *cgroup, void *arg),
		  void *arg)
{
	char pidbuf[64];
	int child;

	child = clone_into_cgroup_run_nowait(cgroup, fn, arg);
	if (child > 0)
		return child;

	/* Anything but ENOSYS is a genuine test failure. */
	if (child < 0 && errno != ENOSYS)
		return -1;

	child = fork();
	if (child != 0)
		return child;

	/* Only the forked child gets here. */
	snprintf(pidbuf, sizeof(pidbuf), "%d", getpid());
	if (cg_write(cgroup, "cgroup.procs", pidbuf))
		exit(EXIT_FAILURE);

	exit(fn(cgroup, arg));
}
452 
453 int get_temp_fd(void)
454 {
455 	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
456 }
457 
458 int alloc_pagecache(int fd, size_t size)
459 {
460 	char buf[PAGE_SIZE];
461 	struct stat st;
462 	int i;
463 
464 	if (fstat(fd, &st))
465 		goto cleanup;
466 
467 	size += st.st_size;
468 
469 	if (ftruncate(fd, size))
470 		goto cleanup;
471 
472 	for (i = 0; i < size; i += sizeof(buf))
473 		read(fd, buf, sizeof(buf));
474 
475 	return 0;
476 
477 cleanup:
478 	return -1;
479 }
480 
481 int alloc_anon(const char *cgroup, void *arg)
482 {
483 	size_t size = (unsigned long)arg;
484 	char *buf, *ptr;
485 
486 	buf = malloc(size);
487 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
488 		*ptr = 0;
489 
490 	free(buf);
491 	return 0;
492 }
493 
494 int is_swap_enabled(void)
495 {
496 	char buf[PAGE_SIZE];
497 	const char delim[] = "\n";
498 	int cnt = 0;
499 	char *line;
500 
501 	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
502 		return -1;
503 
504 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
505 		cnt++;
506 
507 	return cnt > 1;
508 }
509 
/*
 * Write @score as a decimal number to /proc/<pid>/oom_score_adj.
 *
 * Returns 0 on success, the negative open() result if the file cannot be
 * opened, or the negative dprintf() result if the write fails.
 */
int set_oom_adj_score(int pid, int score)
{
	char file[PATH_MAX];
	int fd, written;

	sprintf(file, "/proc/%d/oom_score_adj", pid);

	fd = open(file, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	written = dprintf(fd, "%d", score);
	close(fd);

	return written < 0 ? written : 0;
}
530 
/*
 * Read the procfs file @item of process @pid into @buf (@size bytes).
 *
 * When @pid is 0 the file is read from /proc/thread-self if @thread is
 * true, or from /proc/self otherwise; a non-zero @pid selects /proc/<pid>.
 *
 * Returns the number of bytes read on success, or -1 on failure.
 */
ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{
	char path[PATH_MAX];
	ssize_t ret;

	if (!pid)
		snprintf(path, sizeof(path), "/proc/%s/%s",
			 thread ? "thread-self" : "self", item);
	else
		snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);

	/*
	 * Keep the result in a signed variable: storing it in the unsigned
	 * @size would make the error check below always false.
	 */
	ret = read_text(path, buf, size);
	return ret < 0 ? -1 : ret;
}
544 
545 int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
546 {
547 	char buf[PAGE_SIZE];
548 
549 	if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0)
550 		return -1;
551 
552 	return strstr(buf, needle) ? 0 : -1;
553 }
554 
555 int clone_into_cgroup_run_wait(const char *cgroup)
556 {
557 	int cgroup_fd;
558 	pid_t pid;
559 
560 	cgroup_fd =  dirfd_open_opath(cgroup);
561 	if (cgroup_fd < 0)
562 		return -1;
563 
564 	pid = clone_into_cgroup(cgroup_fd);
565 	close_prot_errno(cgroup_fd);
566 	if (pid < 0)
567 		return -1;
568 
569 	if (pid == 0)
570 		exit(EXIT_SUCCESS);
571 
572 	/*
573 	 * We don't care whether this fails. We only care whether the initial
574 	 * clone succeeded.
575 	 */
576 	(void)clone_reap(pid, WEXITED);
577 	return 0;
578 }
579 
/*
 * Create an inotify instance that watches "<cgroup>/<filename>" for
 * IN_MODIFY events.
 *
 * Returns the inotify file descriptor on success, or -1 if the instance
 * cannot be created, the path cannot be allocated, or the watch cannot be
 * added.
 */
static int __prepare_for_wait(const char *cgroup, const char *filename)
{
	char *path;
	int fd, ret;

	fd = inotify_init1(0);
	if (fd == -1)
		return fd;

	path = cg_control(cgroup, filename);
	if (!path) {
		close(fd);
		return -1;
	}

	ret = inotify_add_watch(fd, path, IN_MODIFY);
	/* The path string is only needed for the call above; release it. */
	free(path);

	if (ret == -1) {
		close(fd);
		fd = -1;
	}

	return fd;
}
596 
/*
 * Set up an inotify watch on the "cgroup.events" file of @cgroup.
 *
 * Returns the __prepare_for_wait() result: an inotify file descriptor, or
 * -1 on failure.
 */
int cg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "cgroup.events";

	return __prepare_for_wait(cgroup, events_file);
}
601 
/*
 * Set up an inotify watch on the "memory.events" file of @cgroup.
 *
 * Returns the __prepare_for_wait() result: an inotify file descriptor, or
 * -1 on failure.
 */
int memcg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "memory.events";

	return __prepare_for_wait(cgroup, events_file);
}
606 
/*
 * Block until @fd becomes readable (POLLIN).
 *
 * poll() is called with a 10 second timeout in a loop; a timeout or an
 * interrupted call (EINTR) simply polls again, so the wait itself has no
 * upper bound.
 *
 * Returns 0 once POLLIN is reported, or -1 if poll() fails with an error
 * other than EINTR.
 */
int cg_wait_for(int fd)
{
	struct pollfd pfd = {
		.fd = fd,
		.events = POLLIN,
	};

	for (;;) {
		int ready = poll(&pfd, 1, 10000);

		if (ready > 0 && (pfd.revents & POLLIN))
			return 0;

		if (ready == -1 && errno != EINTR)
			return -1;
	}
}
633