1 /* SPDX-License-Identifier: GPL-2.0 */
2 
3 #define _GNU_SOURCE
4 
5 #include <errno.h>
6 #include <fcntl.h>
7 #include <linux/limits.h>
8 #include <poll.h>
9 #include <signal.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <sys/inotify.h>
14 #include <sys/stat.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <unistd.h>
18 
19 #include "cgroup_util.h"
20 #include "../clone3/clone3_selftests.h"
21 
/*
 * Read up to max_len - 1 bytes from the file at @path into @buf and
 * NUL-terminate the result.
 *
 * Returns the number of bytes read on success, or -errno on failure
 * (-EINVAL when @max_len is 0, as there is no room for the terminator).
 */
static ssize_t read_text(const char *path, char *buf, size_t max_len)
{
	ssize_t len;
	int fd, err;

	/* max_len - 1 below would wrap around for an empty buffer. */
	if (!max_len)
		return -EINVAL;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -errno;

	len = read(fd, buf, max_len - 1);
	/* Capture errno now: close() below may overwrite it. */
	err = errno;

	if (len >= 0)
		buf[len] = 0;

	close(fd);
	return len < 0 ? -err : len;
}
40 
/*
 * Open the file at @path for appending and write @len bytes from @buf.
 *
 * Returns the number of bytes written on success, or -errno on failure.
 */
static ssize_t write_text(const char *path, char *buf, ssize_t len)
{
	int fd, err;

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return -errno;

	len = write(fd, buf, len);
	/* Capture errno now: close() below may overwrite it. */
	err = errno;

	close(fd);
	return len < 0 ? -err : len;
}
54 
/*
 * Build the path "<root>/<name>" in a newly allocated string.
 *
 * Returns the string, which the caller must free(), or NULL if the
 * allocation fails.
 */
char *cg_name(const char *root, const char *name)
{
	/* One byte for the '/' separator and one for the terminating NUL. */
	size_t len = strlen(root) + strlen(name) + 2;
	char *ret = malloc(len);

	if (ret)
		snprintf(ret, len, "%s/%s", root, name);

	return ret;
}
64 
/*
 * Build the path "<root>/<name>_<index>" in a newly allocated string.
 *
 * Returns the string, which the caller must free(), or NULL if the
 * string cannot be formatted or allocated.
 */
char *cg_name_indexed(const char *root, const char *name, int index)
{
	char *ret;
	int len;

	/*
	 * Let snprintf() compute the exact length so that any int value
	 * (up to 11 characters including the sign) fits without truncation.
	 */
	len = snprintf(NULL, 0, "%s/%s_%d", root, name, index);
	if (len < 0)
		return NULL;

	ret = malloc((size_t)len + 1);
	if (ret)
		snprintf(ret, (size_t)len + 1, "%s/%s_%d", root, name, index);

	return ret;
}
74 
/*
 * Build the path "<cgroup>/<control>" in a newly allocated string.
 *
 * Returns the string, which the caller must free(), or NULL if the
 * allocation fails.
 */
char *cg_control(const char *cgroup, const char *control)
{
	/* One byte for the '/' separator and one for the terminating NUL. */
	size_t len = strlen(cgroup) + strlen(control) + 2;
	char *ret = malloc(len);

	if (ret)
		snprintf(ret, len, "%s/%s", cgroup, control);

	return ret;
}
84 
/*
 * Read the file "<cgroup>/<control>" into @buf via read_text(), which
 * stores at most len - 1 bytes followed by a NUL terminator.
 *
 * Returns 0 on success, or -errno on failure.
 */
int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
{
	char file[PATH_MAX];
	ssize_t nread;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);

	nread = read_text(file, buf, len);
	if (nread < 0)
		return nread;

	return 0;
}
96 
/*
 * Compare the beginning of "<cgroup>/<control>" against @expected.
 *
 * The read buffer is sized strlen(expected) + 1, so at most
 * strlen(expected) bytes of the file take part in the comparison.
 *
 * Returns the strcmp() result (0 on a match), or -1 if @expected is NULL,
 * the buffer cannot be allocated, or the file cannot be read.
 */
int cg_read_strcmp(const char *cgroup, const char *control,
		   const char *expected)
{
	size_t bufsz;
	char *data;
	int diff = -1;

	/* A NULL expectation is reported as a mismatch. */
	if (!expected)
		return -1;

	bufsz = strlen(expected) + 1;
	data = malloc(bufsz);
	if (!data)
		return -1;

	if (!cg_read(cgroup, control, data, bufsz))
		diff = strcmp(expected, data);

	free(data);
	return diff;
}
123 
cg_read_strstr(const char * cgroup,const char * control,const char * needle)124 int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
125 {
126 	char buf[PAGE_SIZE];
127 
128 	if (cg_read(cgroup, control, buf, sizeof(buf)))
129 		return -1;
130 
131 	return strstr(buf, needle) ? 0 : -1;
132 }
133 
/*
 * Read "<cgroup>/<control>" and convert its contents with atol().
 *
 * Returns the converted value, or -1 if the file cannot be read (which
 * is indistinguishable from a file containing -1).
 */
long cg_read_long(const char *cgroup, const char *control)
{
	char text[128];
	int err = cg_read(cgroup, control, text, sizeof(text));

	return err ? -1 : atol(text);
}
143 
cg_read_key_long(const char * cgroup,const char * control,const char * key)144 long cg_read_key_long(const char *cgroup, const char *control, const char *key)
145 {
146 	char buf[PAGE_SIZE];
147 	char *ptr;
148 
149 	if (cg_read(cgroup, control, buf, sizeof(buf)))
150 		return -1;
151 
152 	ptr = strstr(buf, key);
153 	if (!ptr)
154 		return -1;
155 
156 	return atol(ptr + strlen(key));
157 }
158 
cg_read_lc(const char * cgroup,const char * control)159 long cg_read_lc(const char *cgroup, const char *control)
160 {
161 	char buf[PAGE_SIZE];
162 	const char delim[] = "\n";
163 	char *line;
164 	long cnt = 0;
165 
166 	if (cg_read(cgroup, control, buf, sizeof(buf)))
167 		return -1;
168 
169 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
170 		cnt++;
171 
172 	return cnt;
173 }
174 
/*
 * Write the NUL-terminated string @buf to "<cgroup>/<control>".
 *
 * Returns 0 if the whole string was written. Otherwise the write_text()
 * result is passed through: -errno on failure, or the (positive) number
 * of bytes written when the write was short.
 */
int cg_write(const char *cgroup, const char *control, char *buf)
{
	char file[PATH_MAX];
	ssize_t want = strlen(buf);
	ssize_t done;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);

	done = write_text(file, buf, want);
	if (done == want)
		return 0;

	return done;
}
185 
/*
 * Format @value as a decimal number and write it to "<cgroup>/<control>".
 *
 * Returns 0 on success, or a non-zero value if formatting or the
 * cg_write() call fails.
 */
int cg_write_numeric(const char *cgroup, const char *control, long value)
{
	char buf[64];
	int ret;

	/* @value is a signed long, so it must be formatted with %ld. */
	ret = snprintf(buf, sizeof(buf), "%ld", value);
	if (ret < 0)
		return ret;

	return cg_write(cgroup, control, buf);
}
197 
cg_find_unified_root(char * root,size_t len,bool * nsdelegate)198 int cg_find_unified_root(char *root, size_t len, bool *nsdelegate)
199 {
200 	char buf[10 * PAGE_SIZE];
201 	char *fs, *mount, *type, *options;
202 	const char delim[] = "\n\t ";
203 
204 	if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
205 		return -1;
206 
207 	/*
208 	 * Example:
209 	 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0
210 	 */
211 	for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
212 		mount = strtok(NULL, delim);
213 		type = strtok(NULL, delim);
214 		options = strtok(NULL, delim);
215 		strtok(NULL, delim);
216 		strtok(NULL, delim);
217 
218 		if (strcmp(type, "cgroup2") == 0) {
219 			strncpy(root, mount, len);
220 			if (nsdelegate)
221 				*nsdelegate = !!strstr(options, "nsdelegate");
222 			return 0;
223 		}
224 	}
225 
226 	return -1;
227 }
228 
/*
 * Create the directory @cgroup with mode 0755.
 *
 * Returns the mkdir() result: 0 on success, -1 on failure with errno set.
 */
int cg_create(const char *cgroup)
{
	const mode_t mode = 0755;

	return mkdir(cgroup, mode);
}
233 
cg_wait_for_proc_count(const char * cgroup,int count)234 int cg_wait_for_proc_count(const char *cgroup, int count)
235 {
236 	char buf[10 * PAGE_SIZE] = {0};
237 	int attempts;
238 	char *ptr;
239 
240 	for (attempts = 10; attempts >= 0; attempts--) {
241 		int nr = 0;
242 
243 		if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
244 			break;
245 
246 		for (ptr = buf; *ptr; ptr++)
247 			if (*ptr == '\n')
248 				nr++;
249 
250 		if (nr >= count)
251 			return 0;
252 
253 		usleep(100000);
254 	}
255 
256 	return -1;
257 }
258 
cg_killall(const char * cgroup)259 int cg_killall(const char *cgroup)
260 {
261 	char buf[PAGE_SIZE];
262 	char *ptr = buf;
263 
264 	/* If cgroup.kill exists use it. */
265 	if (!cg_write(cgroup, "cgroup.kill", "1"))
266 		return 0;
267 
268 	if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
269 		return -1;
270 
271 	while (ptr < buf + sizeof(buf)) {
272 		int pid = strtol(ptr, &ptr, 10);
273 
274 		if (pid == 0)
275 			break;
276 		if (*ptr)
277 			ptr++;
278 		else
279 			break;
280 		if (kill(pid, SIGKILL))
281 			return -1;
282 	}
283 
284 	return 0;
285 }
286 
/*
 * Remove the directory @cgroup.
 *
 * While rmdir() reports EBUSY, the processes in the cgroup are killed via
 * cg_killall() and the removal is retried after a 100us sleep; there is
 * no upper bound on the number of retries.
 *
 * Returns 0 on success, when @cgroup is NULL, or when the directory does
 * not exist; otherwise returns the failing rmdir() result.
 */
int cg_destroy(const char *cgroup)
{
	int rc;

	if (!cgroup)
		return 0;

	for (;;) {
		rc = rmdir(cgroup);
		if (!rc || errno != EBUSY)
			break;

		cg_killall(cgroup);
		usleep(100);
	}

	/* A cgroup that is already gone counts as destroyed. */
	if (rc && errno == ENOENT)
		rc = 0;

	return rc;
}
306 
/*
 * Write @pid, formatted as a decimal number, to "<cgroup>/cgroup.procs".
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter(const char *cgroup, int pid)
{
	char text[64];
	int rc;

	snprintf(text, sizeof(text), "%d", pid);
	rc = cg_write(cgroup, "cgroup.procs", text);

	return rc;
}
314 
/*
 * Write "0" to "<cgroup>/cgroup.procs"; presumably the kernel treats PID 0
 * as the writing process itself (confirm against the cgroup v2 docs).
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.procs", self);
}
319 
/*
 * Write "0" to "<cgroup>/cgroup.threads"; presumably the kernel treats
 * TID 0 as the writing thread itself (confirm against the cgroup v2 docs).
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current_thread(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.threads", self);
}
324 
/*
 * Fork a child, move it into @cgroup, run @fn(@cgroup, @arg) in it and
 * wait for it to finish.
 *
 * The child exits with EXIT_FAILURE if it cannot be moved into @cgroup,
 * otherwise with the return value of @fn.
 *
 * Returns the child's exit status, a negative value if fork() fails, or
 * -1 if the child cannot be waited for or did not exit normally.
 */
int cg_run(const char *cgroup,
	   int (*fn)(const char *cgroup, void *arg),
	   void *arg)
{
	int pid, retcode;

	pid = fork();
	if (pid < 0)
		return pid;

	if (pid == 0) {
		char buf[64];

		snprintf(buf, sizeof(buf), "%d", getpid());
		if (cg_write(cgroup, "cgroup.procs", buf))
			exit(EXIT_FAILURE);
		exit(fn(cgroup, arg));
	}

	/*
	 * retcode is only valid once waitpid() succeeds, so retry when a
	 * signal interrupts the wait and bail out on any other error.
	 */
	while (waitpid(pid, &retcode, 0) < 0) {
		if (errno != EINTR)
			return -1;
	}

	return WIFEXITED(retcode) ? WEXITSTATUS(retcode) : -1;
}
349 
/*
 * Create a child process directly inside the cgroup referred to by
 * @cgroup_fd, using clone3() with CLONE_INTO_CGROUP.
 *
 * Returns the sys_clone3() result when the call is usable. When
 * CLONE_ARGS_SIZE_VER2 is not defined at build time, or the call fails
 * with ENOSYS (clone3() not available) or E2BIG (CLONE_INTO_CGROUP not
 * available), errno is set to ENOSYS and -ENOSYS is returned instead.
 */
pid_t clone_into_cgroup(int cgroup_fd)
{
#ifdef CLONE_ARGS_SIZE_VER2
	struct __clone_args args = {
		.flags = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup = cgroup_fd,
	};
	pid_t pid;

	pid = sys_clone3(&args, sizeof(args));

	/* Anything but "feature missing" is reported to the caller as-is. */
	if (pid >= 0 || (errno != ENOSYS && errno != E2BIG))
		return pid;
#endif
	/* Pretend the system call does not exist. */
	errno = ENOSYS;
	return -ENOSYS;
}
377 
/*
 * Wait for a state change of child @pid with waitid(), retrying when the
 * call is interrupted by a signal.
 *
 * @options selects the state changes of interest (WEXITED, WSTOPPED,
 * WCONTINUED); __WALL and __WNOTHREAD are always added.
 *
 * waitid() reports the kind of event in si_code and the raw exit code or
 * signal number in si_status, so si_status must not be decoded with the
 * W*() wait-status macros.
 *
 * Returns the exit code for an exited child, the stop signal for a
 * stopped child, 0 for a continued child, and -1 on error or for any
 * other event (e.g. a child killed by a signal).
 */
int clone_reap(pid_t pid, int options)
{
	int ret;
	siginfo_t info = {
		.si_signo = 0,
	};

	do {
		ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
	} while (ret < 0 && errno == EINTR);

	if (ret < 0)
		return -1;

	if ((options & WEXITED) && info.si_code == CLD_EXITED)
		return info.si_status;

	if ((options & WSTOPPED) && info.si_code == CLD_STOPPED)
		return info.si_status;

	if ((options & WCONTINUED) && info.si_code == CLD_CONTINUED)
		return 0;

	return -1;
}
410 
/*
 * Open @dir with O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC.
 *
 * Returns the open() result: a file descriptor, or -1 with errno set.
 */
int dirfd_open_opath(const char *dir)
{
	const int flags = O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC;

	return open(dir, flags);
}
415 
/*
 * Close @fd if it is non-negative, preserving the value of errno across
 * the close() call.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe in unbraced if/else), and @fd is evaluated exactly once.
 */
#define close_prot_errno(fd)                                                   \
	do {                                                                   \
		int _cpe_fd_ = (fd);                                           \
		if (_cpe_fd_ >= 0) {                                           \
			int _e_ = errno;                                       \
			close(_cpe_fd_);                                       \
			errno = _e_;                                           \
		}                                                              \
	} while (0)
422 
/*
 * Clone a child directly into @cgroup and have it run @fn(@cgroup, @arg);
 * the child exits with the return value of @fn. The parent does not wait.
 *
 * Returns, in the parent, the clone_into_cgroup() result (the child's PID
 * or a negative value), or -1 if @cgroup cannot be opened. errno from the
 * clone attempt is preserved across closing the cgroup fd.
 */
static int clone_into_cgroup_run_nowait(const char *cgroup,
					int (*fn)(const char *cgroup, void *arg),
					void *arg)
{
	pid_t child;
	int cg_fd = dirfd_open_opath(cgroup);

	if (cg_fd < 0)
		return -1;

	child = clone_into_cgroup(cg_fd);
	/* Both parent and child drop their copy of the descriptor. */
	close_prot_errno(cg_fd);

	if (!child)
		exit(fn(cgroup, arg));

	return child;
}
441 
/*
 * Start a child process in @cgroup that runs @fn(@cgroup, @arg), without
 * waiting for it.
 *
 * Cloning directly into the cgroup is tried first. If that reports ENOSYS,
 * the function falls back to fork() plus writing the child's PID to
 * cgroup.procs from within the child.
 *
 * Returns the child's PID in the parent, -1 if cloning fails for a reason
 * other than ENOSYS, or the negative fork() result if the fallback fails.
 */
int cg_run_nowait(const char *cgroup,
		  int (*fn)(const char *cgroup, void *arg),
		  void *arg)
{
	char self[64];
	int child;

	child = clone_into_cgroup_run_nowait(cgroup, fn, arg);
	if (child > 0)
		return child;

	/* Genuine test failure. */
	if (child < 0 && errno != ENOSYS)
		return -1;

	child = fork();
	if (child != 0)
		return child;

	/* Child: join the cgroup, then hand over to the callback. */
	snprintf(self, sizeof(self), "%d", getpid());
	if (cg_write(cgroup, "cgroup.procs", self) != 0)
		exit(EXIT_FAILURE);

	exit(fn(cgroup, arg));
}
468 
/*
 * Open "." with O_TMPFILE | O_RDWR | O_EXCL.
 *
 * Returns the open() result: a file descriptor, or -1 with errno set.
 */
int get_temp_fd(void)
{
	const int flags = O_TMPFILE | O_RDWR | O_EXCL;

	return open(".", flags);
}
473 
alloc_pagecache(int fd,size_t size)474 int alloc_pagecache(int fd, size_t size)
475 {
476 	char buf[PAGE_SIZE];
477 	struct stat st;
478 	int i;
479 
480 	if (fstat(fd, &st))
481 		goto cleanup;
482 
483 	size += st.st_size;
484 
485 	if (ftruncate(fd, size))
486 		goto cleanup;
487 
488 	for (i = 0; i < size; i += sizeof(buf))
489 		read(fd, buf, sizeof(buf));
490 
491 	return 0;
492 
493 cleanup:
494 	return -1;
495 }
496 
alloc_anon(const char * cgroup,void * arg)497 int alloc_anon(const char *cgroup, void *arg)
498 {
499 	size_t size = (unsigned long)arg;
500 	char *buf, *ptr;
501 
502 	buf = malloc(size);
503 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
504 		*ptr = 0;
505 
506 	free(buf);
507 	return 0;
508 }
509 
is_swap_enabled(void)510 int is_swap_enabled(void)
511 {
512 	char buf[PAGE_SIZE];
513 	const char delim[] = "\n";
514 	int cnt = 0;
515 	char *line;
516 
517 	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
518 		return -1;
519 
520 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
521 		cnt++;
522 
523 	return cnt > 1;
524 }
525 
/*
 * Write @score, formatted as a decimal number, to
 * /proc/<pid>/oom_score_adj.
 *
 * Returns 0 on success, or the negative result of the failing open() or
 * dprintf() call.
 */
int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, written;

	sprintf(path, "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	written = dprintf(fd, "%d", score);
	close(fd);

	return written < 0 ? written : 0;
}
546 
proc_mount_contains(const char * option)547 int proc_mount_contains(const char *option)
548 {
549 	char buf[4 * PAGE_SIZE];
550 	ssize_t read;
551 
552 	read = read_text("/proc/mounts", buf, sizeof(buf));
553 	if (read < 0)
554 		return read;
555 
556 	return strstr(buf, option) != NULL;
557 }
558 
/*
 * Read /proc/<pid>/<item> into @buf (at most size - 1 bytes plus NUL).
 *
 * When @pid is 0, /proc/thread-self/<item> is read if @thread is true and
 * /proc/self/<item> otherwise.
 *
 * Returns the number of bytes read, or -1 on failure.
 */
ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{
	char file[PATH_MAX];
	ssize_t nread;

	if (pid) {
		snprintf(file, sizeof(file), "/proc/%d/%s", pid, item);
	} else {
		const char *self = thread ? "thread-self" : "self";

		snprintf(file, sizeof(file), "/proc/%s/%s", self, item);
	}

	nread = read_text(file, buf, size);
	if (nread < 0)
		return -1;

	return nread;
}
573 
proc_read_strstr(int pid,bool thread,const char * item,const char * needle)574 int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
575 {
576 	char buf[PAGE_SIZE];
577 
578 	if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0)
579 		return -1;
580 
581 	return strstr(buf, needle) ? 0 : -1;
582 }
583 
/*
 * Clone a child directly into @cgroup, let it exit immediately with
 * EXIT_SUCCESS, and reap it.
 *
 * Returns 0 if the clone succeeded, or -1 if @cgroup cannot be opened or
 * the clone fails.
 */
int clone_into_cgroup_run_wait(const char *cgroup)
{
	pid_t child;
	int cg_fd = dirfd_open_opath(cgroup);

	if (cg_fd < 0)
		return -1;

	child = clone_into_cgroup(cg_fd);
	close_prot_errno(cg_fd);

	if (child < 0)
		return -1;

	if (!child)
		exit(EXIT_SUCCESS);

	/*
	 * Only the success of the initial clone matters here, so the
	 * outcome of reaping the child is deliberately ignored.
	 */
	(void)clone_reap(child, WEXITED);
	return 0;
}
608 
/*
 * Create an inotify instance that watches "<cgroup>/<filename>" for
 * IN_MODIFY events.
 *
 * Returns the inotify file descriptor, or -1 if the instance cannot be
 * created, the path cannot be allocated, or the watch cannot be added.
 */
static int __prepare_for_wait(const char *cgroup, const char *filename)
{
	char *path;
	int fd, wd;

	fd = inotify_init1(0);
	if (fd == -1)
		return fd;

	path = cg_control(cgroup, filename);
	if (!path) {
		close(fd);
		return -1;
	}

	wd = inotify_add_watch(fd, path, IN_MODIFY);
	/* The path string is heap-allocated by cg_control() and ours to free. */
	free(path);

	if (wd == -1) {
		close(fd);
		fd = -1;
	}

	return fd;
}
625 
/*
 * Set up an inotify watch on "<cgroup>/cgroup.events".
 *
 * Returns the inotify file descriptor, or -1 on failure.
 */
int cg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "cgroup.events";

	return __prepare_for_wait(cgroup, events_file);
}
630 
/*
 * Set up an inotify watch on "<cgroup>/memory.events".
 *
 * Returns the inotify file descriptor, or -1 on failure.
 */
int memcg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "memory.events";

	return __prepare_for_wait(cgroup, events_file);
}
635 
/*
 * Block until @fd becomes readable.
 *
 * poll() is called with a 10 second timeout and is simply called again
 * when it times out, is interrupted by a signal, or reports an event
 * other than POLLIN, so the wait itself is unbounded.
 *
 * Returns 0 once POLLIN is reported, or -1 if poll() fails with an error
 * other than EINTR.
 */
int cg_wait_for(int fd)
{
	struct pollfd pfd = {
		.fd = fd,
		.events = POLLIN,
	};
	int rc;

	for (;;) {
		rc = poll(&pfd, 1, 10000);

		if (rc > 0 && (pfd.revents & POLLIN))
			return 0;

		if (rc == -1 && errno != EINTR)
			return -1;
	}
}
662