1 /* SPDX-License-Identifier: GPL-2.0 */
2 
3 #define _GNU_SOURCE
4 
5 #include <errno.h>
6 #include <fcntl.h>
7 #include <linux/limits.h>
8 #include <poll.h>
9 #include <signal.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <sys/inotify.h>
14 #include <sys/stat.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <unistd.h>
18 
19 #include "cgroup_util.h"
20 #include "../clone3/clone3_selftests.h"
21 
/*
 * read_text() - read the beginning of a file into a NUL-terminated buffer.
 * @path:    file to open read-only.
 * @buf:     destination buffer.
 * @max_len: size of @buf in bytes; at most @max_len - 1 bytes are read so
 *           that the terminating NUL always fits.
 *
 * A single read() is issued, so any content past @max_len - 1 bytes is
 * not returned.
 *
 * Return: number of bytes read (NUL excluded) on success; a negative value
 * if @max_len is 0 or if the file cannot be opened or read.
 */
static ssize_t read_text(const char *path, char *buf, size_t max_len)
{
	ssize_t len;
	int fd;

	/* max_len - 1 below would wrap around to SIZE_MAX for a 0-byte buffer. */
	if (!max_len) {
		errno = EINVAL;
		return -1;
	}

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return fd;

	len = read(fd, buf, max_len - 1);
	if (len >= 0)
		buf[len] = 0;

	close(fd);
	return len;
}
40 
/*
 * write_text() - append a buffer to an existing file.
 * @path: file to open with O_WRONLY | O_APPEND.
 * @buf:  data to write.
 * @len:  number of bytes of @buf to write.
 *
 * A single write() is issued and the descriptor is closed before
 * returning, whether or not the write succeeded.
 *
 * Return: the write() result (bytes written, negative on error), or the
 * negative open() result if the file cannot be opened.
 */
static ssize_t write_text(const char *path, char *buf, ssize_t len)
{
	ssize_t written;
	int fd = open(path, O_WRONLY | O_APPEND);

	if (fd < 0)
		return fd;

	written = write(fd, buf, len);
	close(fd);

	return written;
}
59 
/*
 * cg_name() - build the path "<root>/<name>".
 * @root: parent directory.
 * @name: entry name appended after a '/'.
 *
 * Return: a malloc()ed string the caller must free(), or NULL if the
 * allocation fails.
 */
char *cg_name(const char *root, const char *name)
{
	/* +2: one byte for the '/' separator, one for the terminating NUL. */
	size_t len = strlen(root) + strlen(name) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", root, name);

	return ret;
}
69 
/*
 * cg_name_indexed() - build the path "<root>/<name>_<index>".
 * @root:  parent directory.
 * @name:  entry name appended after a '/'.
 * @index: number appended after an '_'.
 *
 * The buffer is sized from the formatted length, so every int value of
 * @index (including negative ones) is printed in full.
 *
 * Return: a malloc()ed string the caller must free(), or NULL if
 * formatting or allocation fails.
 */
char *cg_name_indexed(const char *root, const char *name, int index)
{
	/* A NULL/0 snprintf() only reports how many characters are needed. */
	int need = snprintf(NULL, 0, "%s/%s_%d", root, name, index);
	size_t len;
	char *ret;

	if (need < 0)
		return NULL;

	len = (size_t)need + 1;
	ret = malloc(len);
	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s_%d", root, name, index);

	return ret;
}
79 
/*
 * cg_control() - build the path "<cgroup>/<control>".
 * @cgroup:  cgroup directory.
 * @control: control file name appended after a '/'.
 *
 * Return: a malloc()ed string the caller must free(), or NULL if the
 * allocation fails.
 */
char *cg_control(const char *cgroup, const char *control)
{
	/* +2: one byte for the '/' separator, one for the terminating NUL. */
	size_t len = strlen(cgroup) + strlen(control) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", cgroup, control);

	return ret;
}
89 
/*
 * cg_read() - read "<cgroup>/<control>" into a caller-supplied buffer.
 * @cgroup:  cgroup directory.
 * @control: control file name.
 * @buf:     destination buffer, filled via read_text().
 * @len:     size of @buf in bytes.
 *
 * Return: 0 if read_text() reports a non-negative length, -1 otherwise.
 */
int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
{
	char file[PATH_MAX];
	ssize_t nread;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);
	nread = read_text(file, buf, len);

	return nread < 0 ? -1 : 0;
}
101 
/*
 * cg_read_strcmp() - compare the start of a control file with a string.
 * @cgroup:   cgroup directory.
 * @control:  control file name.
 * @expected: string to compare against; NULL is rejected.
 *
 * The read buffer is sized strlen(@expected) + 1, so only the first
 * strlen(@expected) bytes of the file take part in the comparison. Any
 * content after that prefix is ignored, and an empty @expected therefore
 * matches any readable file.
 *
 * Return: the strcmp() result (0 on match), or -1 if @expected is NULL,
 * the allocation fails or the file cannot be read.
 */
int cg_read_strcmp(const char *cgroup, const char *control,
		   const char *expected)
{
	size_t bufsz;
	char *content;
	int diff = -1;

	if (!expected)
		return -1;

	bufsz = strlen(expected) + 1;
	content = malloc(bufsz);
	if (!content)
		return -1;

	if (!cg_read(cgroup, control, content, bufsz))
		diff = strcmp(expected, content);

	free(content);
	return diff;
}
128 
129 int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
130 {
131 	char buf[PAGE_SIZE];
132 
133 	if (cg_read(cgroup, control, buf, sizeof(buf)))
134 		return -1;
135 
136 	return strstr(buf, needle) ? 0 : -1;
137 }
138 
/*
 * cg_read_long() - read a control file and parse it as a long.
 * @cgroup:  cgroup directory.
 * @control: control file name.
 *
 * Return: atol() of the file content, or -1 if the read fails. A stored
 * value of -1 cannot be told apart from a read failure.
 */
long cg_read_long(const char *cgroup, const char *control)
{
	char text[128];
	int err;

	err = cg_read(cgroup, control, text, sizeof(text));
	if (err)
		return -1;

	return atol(text);
}
148 
149 long cg_read_key_long(const char *cgroup, const char *control, const char *key)
150 {
151 	char buf[PAGE_SIZE];
152 	char *ptr;
153 
154 	if (cg_read(cgroup, control, buf, sizeof(buf)))
155 		return -1;
156 
157 	ptr = strstr(buf, key);
158 	if (!ptr)
159 		return -1;
160 
161 	return atol(ptr + strlen(key));
162 }
163 
164 long cg_read_lc(const char *cgroup, const char *control)
165 {
166 	char buf[PAGE_SIZE];
167 	const char delim[] = "\n";
168 	char *line;
169 	long cnt = 0;
170 
171 	if (cg_read(cgroup, control, buf, sizeof(buf)))
172 		return -1;
173 
174 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
175 		cnt++;
176 
177 	return cnt;
178 }
179 
/*
 * cg_write() - write a NUL-terminated string to "<cgroup>/<control>".
 * @cgroup:  cgroup directory.
 * @control: control file name.
 * @buf:     string to write (without its terminating NUL).
 *
 * Return: 0 if write_text() reports that the whole string was written,
 * -1 on a short write or any error.
 */
int cg_write(const char *cgroup, const char *control, char *buf)
{
	char file[PATH_MAX];
	ssize_t want = strlen(buf);
	ssize_t done;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);
	done = write_text(file, buf, want);

	return done == want ? 0 : -1;
}
192 
193 int cg_find_unified_root(char *root, size_t len)
194 {
195 	char buf[10 * PAGE_SIZE];
196 	char *fs, *mount, *type;
197 	const char delim[] = "\n\t ";
198 
199 	if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
200 		return -1;
201 
202 	/*
203 	 * Example:
204 	 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0
205 	 */
206 	for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
207 		mount = strtok(NULL, delim);
208 		type = strtok(NULL, delim);
209 		strtok(NULL, delim);
210 		strtok(NULL, delim);
211 		strtok(NULL, delim);
212 
213 		if (strcmp(type, "cgroup2") == 0) {
214 			strncpy(root, mount, len);
215 			return 0;
216 		}
217 	}
218 
219 	return -1;
220 }
221 
/*
 * cg_create() - create a cgroup directory.
 * @cgroup: path of the directory to create.
 *
 * Return: the mkdir() result, i.e. 0 on success and -1 on error.
 */
int cg_create(const char *cgroup)
{
	/* rwx for the owner, r-x for group and others. */
	const mode_t mode = 0755;
	int ret;

	ret = mkdir(cgroup, mode);
	return ret;
}
226 
227 int cg_wait_for_proc_count(const char *cgroup, int count)
228 {
229 	char buf[10 * PAGE_SIZE] = {0};
230 	int attempts;
231 	char *ptr;
232 
233 	for (attempts = 10; attempts >= 0; attempts--) {
234 		int nr = 0;
235 
236 		if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
237 			break;
238 
239 		for (ptr = buf; *ptr; ptr++)
240 			if (*ptr == '\n')
241 				nr++;
242 
243 		if (nr >= count)
244 			return 0;
245 
246 		usleep(100000);
247 	}
248 
249 	return -1;
250 }
251 
252 int cg_killall(const char *cgroup)
253 {
254 	char buf[PAGE_SIZE];
255 	char *ptr = buf;
256 
257 	/* If cgroup.kill exists use it. */
258 	if (!cg_write(cgroup, "cgroup.kill", "1"))
259 		return 0;
260 
261 	if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
262 		return -1;
263 
264 	while (ptr < buf + sizeof(buf)) {
265 		int pid = strtol(ptr, &ptr, 10);
266 
267 		if (pid == 0)
268 			break;
269 		if (*ptr)
270 			ptr++;
271 		else
272 			break;
273 		if (kill(pid, SIGKILL))
274 			return -1;
275 	}
276 
277 	return 0;
278 }
279 
/*
 * cg_destroy() - remove a cgroup directory, killing its processes if needed.
 * @cgroup: cgroup directory to remove.
 *
 * While rmdir() fails with EBUSY, cg_killall() is invoked and the removal
 * is retried after a 100 us pause; there is no upper bound on the number
 * of retries.
 *
 * Return: 0 if the directory was removed or did not exist (ENOENT),
 * otherwise the failing rmdir() result.
 */
int cg_destroy(const char *cgroup)
{
	int ret;

	for (;;) {
		ret = rmdir(cgroup);
		if (ret == 0 || errno != EBUSY)
			break;

		cg_killall(cgroup);
		usleep(100);
	}

	/* A cgroup that is already gone counts as destroyed. */
	if (ret != 0 && errno == ENOENT)
		return 0;

	return ret;
}
297 
/*
 * cg_enter() - move a process into a cgroup.
 * @cgroup: target cgroup directory.
 * @pid:    PID written, in decimal, to the cgroup's cgroup.procs file.
 *
 * Return: the cg_write() result (0 on success, -1 on failure).
 */
int cg_enter(const char *cgroup, int pid)
{
	char pid_text[64];
	int ret;

	snprintf(pid_text, sizeof(pid_text), "%d", pid);
	ret = cg_write(cgroup, "cgroup.procs", pid_text);

	return ret;
}
305 
/*
 * cg_enter_current() - write "0" to the cgroup's cgroup.procs file.
 * @cgroup: target cgroup directory.
 *
 * Presumably "0" designates the writing process itself - confirm against
 * the cgroup v2 documentation.
 *
 * Return: the cg_write() result (0 on success, -1 on failure).
 */
int cg_enter_current(const char *cgroup)
{
	int ret = cg_write(cgroup, "cgroup.procs", "0");

	return ret;
}
310 
/*
 * cg_enter_current_thread() - write "0" to the cgroup's cgroup.threads file.
 * @cgroup: target cgroup directory.
 *
 * Presumably "0" designates the writing thread itself - confirm against
 * the cgroup v2 documentation.
 *
 * Return: the cg_write() result (0 on success, -1 on failure).
 */
int cg_enter_current_thread(const char *cgroup)
{
	int ret = cg_write(cgroup, "cgroup.threads", "0");

	return ret;
}
315 
/*
 * cg_run() - run a function in a child process placed inside a cgroup.
 * @cgroup: cgroup the child writes its own PID to (cgroup.procs).
 * @fn:     function executed in the child; its return value becomes the
 *          child's exit status.
 * @arg:    opaque argument passed through to @fn.
 *
 * The child exits with EXIT_FAILURE if it cannot enter @cgroup. The parent
 * blocks until the child terminates.
 *
 * Return: the child's exit status if it exited normally; a negative value
 * if fork() or waitpid() fails or the child did not exit normally.
 */
int cg_run(const char *cgroup,
	   int (*fn)(const char *cgroup, void *arg),
	   void *arg)
{
	int pid, retcode;

	pid = fork();
	if (pid < 0)
		return pid;

	if (pid == 0) {
		char buf[64];

		snprintf(buf, sizeof(buf), "%d", getpid());
		if (cg_write(cgroup, "cgroup.procs", buf))
			exit(EXIT_FAILURE);
		exit(fn(cgroup, arg));
	}

	/*
	 * retcode is only meaningful once waitpid() has succeeded, so
	 * restart an interrupted wait and give up on any other error.
	 */
	while (waitpid(pid, &retcode, 0) < 0) {
		if (errno != EINTR)
			return -1;
	}

	if (WIFEXITED(retcode))
		return WEXITSTATUS(retcode);

	return -1;
}
340 
/*
 * clone_into_cgroup() - create a child directly inside a cgroup.
 * @cgroup_fd: descriptor of the target cgroup directory, passed as the
 *             .cgroup member of the clone3() arguments.
 *
 * When CLONE_ARGS_SIZE_VER2 is available, clone3() is called with
 * CLONE_INTO_CGROUP and SIGCHLD as exit signal. A failure with ENOSYS
 * (clone3() not available) or E2BIG (CLONE_INTO_CGROUP not available) is
 * reported like a build without CLONE_ARGS_SIZE_VER2: errno is set to
 * ENOSYS and -ENOSYS is returned.
 *
 * Return: the clone3() result (0 in the child, the child's PID in the
 * parent, negative on any other failure), or -ENOSYS as described above.
 */
pid_t clone_into_cgroup(int cgroup_fd)
{
#ifdef CLONE_ARGS_SIZE_VER2
	struct __clone_args args = {
		.flags = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup = cgroup_fd,
	};
	pid_t pid = sys_clone3(&args, sizeof(args));

	/* Success, or a failure other than "feature not supported". */
	if (pid >= 0 || (errno != ENOSYS && errno != E2BIG))
		return pid;
#endif
	errno = ENOSYS;
	return -ENOSYS;
}
368 
369 int clone_reap(pid_t pid, int options)
370 {
371 	int ret;
372 	siginfo_t info = {
373 		.si_signo = 0,
374 	};
375 
376 again:
377 	ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
378 	if (ret < 0) {
379 		if (errno == EINTR)
380 			goto again;
381 		return -1;
382 	}
383 
384 	if (options & WEXITED) {
385 		if (WIFEXITED(info.si_status))
386 			return WEXITSTATUS(info.si_status);
387 	}
388 
389 	if (options & WSTOPPED) {
390 		if (WIFSTOPPED(info.si_status))
391 			return WSTOPSIG(info.si_status);
392 	}
393 
394 	if (options & WCONTINUED) {
395 		if (WIFCONTINUED(info.si_status))
396 			return 0;
397 	}
398 
399 	return -1;
400 }
401 
402 int dirfd_open_opath(const char *dir)
403 {
404 	return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
405 }
406 
/*
 * close_prot_errno() - close @fd without disturbing errno.
 *
 * Negative descriptors are ignored. errno is saved before close() and
 * restored afterwards, so an error code set by an earlier call survives
 * the cleanup. Wrapped in do { } while (0) so the macro behaves as a
 * single statement, including inside if/else bodies.
 */
#define close_prot_errno(fd)                                                   \
	do {                                                                   \
		if ((fd) >= 0) {                                               \
			int _e_ = errno;                                       \
			close(fd);                                             \
			errno = _e_;                                           \
		}                                                              \
	} while (0)
413 
/*
 * clone_into_cgroup_run_nowait() - start @fn in a child cloned into @cgroup.
 * @cgroup: cgroup directory the child is created in.
 * @fn:     function run by the child; its return value is the exit status.
 * @arg:    opaque argument passed through to @fn.
 *
 * The cgroup directory descriptor is closed again in both parent and
 * child, with errno preserved so the clone result can still be inspected.
 * The parent does not wait for the child.
 *
 * Return: the child's PID in the parent; -1 if the cgroup directory cannot
 * be opened; otherwise the negative clone_into_cgroup() result.
 */
static int clone_into_cgroup_run_nowait(const char *cgroup,
					int (*fn)(const char *cgroup, void *arg),
					void *arg)
{
	pid_t child;
	int cg_fd = dirfd_open_opath(cgroup);

	if (cg_fd < 0)
		return -1;

	child = clone_into_cgroup(cg_fd);
	close_prot_errno(cg_fd);

	if (child != 0)
		return child;

	/* Child: run the payload and never return to the caller. */
	exit(fn(cgroup, arg));
}
432 
/*
 * cg_run_nowait() - start @fn in a child inside @cgroup without waiting.
 * @cgroup: cgroup the child runs in.
 * @fn:     function run by the child; its return value is the exit status.
 * @arg:    opaque argument passed through to @fn.
 *
 * Cloning directly into the cgroup is tried first. Only when that attempt
 * fails with errno set to ENOSYS does the function fall back to fork(),
 * with the child writing its own PID to cgroup.procs (and exiting with
 * EXIT_FAILURE if that write fails).
 *
 * Return: the child's PID on success; -1 if the clone attempt fails for
 * another reason; the negative fork() result if the fallback fails.
 */
int cg_run_nowait(const char *cgroup,
		  int (*fn)(const char *cgroup, void *arg),
		  void *arg)
{
	char pid_text[64];
	int pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);

	if (pid > 0)
		return pid;

	/* Genuine test failure. */
	if (pid < 0 && errno != ENOSYS)
		return -1;

	pid = fork();
	if (pid != 0)
		return pid;

	/* Child: enter the cgroup by hand, then run the payload. */
	snprintf(pid_text, sizeof(pid_text), "%d", getpid());
	if (cg_write(cgroup, "cgroup.procs", pid_text))
		exit(EXIT_FAILURE);
	exit(fn(cgroup, arg));
}
459 
460 int get_temp_fd(void)
461 {
462 	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
463 }
464 
465 int alloc_pagecache(int fd, size_t size)
466 {
467 	char buf[PAGE_SIZE];
468 	struct stat st;
469 	int i;
470 
471 	if (fstat(fd, &st))
472 		goto cleanup;
473 
474 	size += st.st_size;
475 
476 	if (ftruncate(fd, size))
477 		goto cleanup;
478 
479 	for (i = 0; i < size; i += sizeof(buf))
480 		read(fd, buf, sizeof(buf));
481 
482 	return 0;
483 
484 cleanup:
485 	return -1;
486 }
487 
488 int alloc_anon(const char *cgroup, void *arg)
489 {
490 	size_t size = (unsigned long)arg;
491 	char *buf, *ptr;
492 
493 	buf = malloc(size);
494 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
495 		*ptr = 0;
496 
497 	free(buf);
498 	return 0;
499 }
500 
501 int is_swap_enabled(void)
502 {
503 	char buf[PAGE_SIZE];
504 	const char delim[] = "\n";
505 	int cnt = 0;
506 	char *line;
507 
508 	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
509 		return -1;
510 
511 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
512 		cnt++;
513 
514 	return cnt > 1;
515 }
516 
/*
 * set_oom_adj_score() - write a value to /proc/<pid>/oom_score_adj.
 * @pid:   process whose file is written.
 * @score: value written in decimal.
 *
 * Return: 0 on success; the negative open() or dprintf() result on
 * failure.
 */
int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, written;
	int ret = 0;

	sprintf(path, "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	written = dprintf(fd, "%d", score);
	if (written < 0)
		ret = written;

	close(fd);
	return ret;
}
537 
/*
 * proc_read_text() - read a file from a process' /proc directory.
 * @pid:    process to inspect; 0 selects the caller's own entry.
 * @thread: with @pid == 0, read from /proc/thread-self instead of
 *          /proc/self; ignored for a non-zero @pid.
 * @item:   file name below the chosen /proc directory.
 * @buf:    destination buffer.
 * @size:   size of @buf in bytes.
 *
 * Return: the read_text() result for the resulting path.
 */
ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{
	char path[PATH_MAX];

	if (pid) {
		snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);
	} else {
		const char *self = thread ? "thread-self" : "self";

		snprintf(path, sizeof(path), "/proc/%s/%s", self, item);
	}

	return read_text(path, buf, size);
}
550 
551 int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
552 {
553 	char buf[PAGE_SIZE];
554 
555 	if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0)
556 		return -1;
557 
558 	return strstr(buf, needle) ? 0 : -1;
559 }
560 
561 int clone_into_cgroup_run_wait(const char *cgroup)
562 {
563 	int cgroup_fd;
564 	pid_t pid;
565 
566 	cgroup_fd =  dirfd_open_opath(cgroup);
567 	if (cgroup_fd < 0)
568 		return -1;
569 
570 	pid = clone_into_cgroup(cgroup_fd);
571 	close_prot_errno(cgroup_fd);
572 	if (pid < 0)
573 		return -1;
574 
575 	if (pid == 0)
576 		exit(EXIT_SUCCESS);
577 
578 	/*
579 	 * We don't care whether this fails. We only care whether the initial
580 	 * clone succeeded.
581 	 */
582 	(void)clone_reap(pid, WEXITED);
583 	return 0;
584 }
585 
/*
 * __prepare_for_wait() - set up an inotify watch on a cgroup control file.
 * @cgroup:   cgroup directory.
 * @filename: control file inside @cgroup to watch for IN_MODIFY events.
 *
 * Return: an inotify descriptor with the watch installed (the caller is
 * responsible for closing it), or -1 on failure.
 */
static int __prepare_for_wait(const char *cgroup, const char *filename)
{
	char *path;
	int fd, ret;

	fd = inotify_init1(0);
	if (fd == -1)
		return fd;

	/* cg_control() hands back a malloc()ed string that we own. */
	path = cg_control(cgroup, filename);
	if (!path) {
		close(fd);
		return -1;
	}

	ret = inotify_add_watch(fd, path, IN_MODIFY);
	/* The path is only needed to install the watch. */
	free(path);
	if (ret == -1) {
		close(fd);
		fd = -1;
	}

	return fd;
}
602 
/*
 * cg_prepare_for_wait() - watch a cgroup's cgroup.events file.
 * @cgroup: cgroup directory.
 *
 * Return: the descriptor from __prepare_for_wait(), or -1 on failure.
 */
int cg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "cgroup.events";

	return __prepare_for_wait(cgroup, events_file);
}
607 
/*
 * memcg_prepare_for_wait() - watch a cgroup's memory.events file.
 * @cgroup: cgroup directory.
 *
 * Return: the descriptor from __prepare_for_wait(), or -1 on failure.
 */
int memcg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "memory.events";

	return __prepare_for_wait(cgroup, events_file);
}
612 
/*
 * cg_wait_for() - block until a descriptor becomes readable.
 * @fd: descriptor to poll for POLLIN.
 *
 * poll() is called with a 10 second timeout and simply repeated on timeout
 * or EINTR, so the wait itself is unbounded.
 *
 * Return: 0 once POLLIN is reported; -1 if @fd is negative, if poll()
 * fails, or if it reports POLLERR, POLLHUP or POLLNVAL without POLLIN.
 */
int cg_wait_for(int fd)
{
	int ret = -1;
	struct pollfd fds = {
		.fd = fd,
		.events = POLLIN,
	};

	/* poll() ignores negative descriptors, so this would never return. */
	if (fd < 0)
		return -1;

	while (true) {
		ret = poll(&fds, 1, 10000);

		if (ret == -1) {
			if (errno == EINTR)
				continue;

			break;
		}

		/* Timed out without any event: keep waiting. */
		if (ret == 0)
			continue;

		if (fds.revents & POLLIN) {
			ret = 0;
			break;
		}

		/*
		 * An error condition without POLLIN makes poll() return
		 * immediately on every call; give up instead of spinning.
		 */
		if (fds.revents & (POLLERR | POLLHUP | POLLNVAL)) {
			ret = -1;
			break;
		}
	}

	return ret;
}
639