1 // SPDX-License-Identifier: GPL-2.0
2 
3 #define _GNU_SOURCE
4 #include <errno.h>
5 #include <fcntl.h>
6 #include <limits.h>
7 #include <linux/types.h>
8 #include <sched.h>
9 #include <signal.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <syscall.h>
14 #include <sys/prctl.h>
15 #include <sys/wait.h>
16 #include <unistd.h>
17 #include <sys/socket.h>
18 #include <sys/stat.h>
19 
20 #include "pidfd.h"
21 #include "../clone3/clone3_selftests.h"
22 #include "../kselftest_harness.h"
23 
/*
 * Slot numbers for the namespaces exercised by this test. The same value
 * selects an entry in ns_info[] and in every per-task namespace fd array of
 * the fixture.
 */
enum {
	PIDFD_NS_USER = 0,	/* "user" */
	PIDFD_NS_MNT,		/* "mnt" */
	PIDFD_NS_PID,		/* "pid" */
	PIDFD_NS_UTS,		/* "uts" */
	PIDFD_NS_IPC,		/* "ipc" */
	PIDFD_NS_NET,		/* "net" */
	PIDFD_NS_CGROUP,	/* "cgroup" */
	PIDFD_NS_PIDCLD,	/* "pid_for_children" */
	PIDFD_NS_TIME,		/* "time" */
	PIDFD_NS_MAX,		/* number of slots; not a namespace itself */
};
36 
37 const struct ns_info {
38 	const char *name;
39 	int flag;
40 } ns_info[] = {
41 	[PIDFD_NS_USER]   = { "user",             CLONE_NEWUSER,   },
42 	[PIDFD_NS_MNT]    = { "mnt",              CLONE_NEWNS,     },
43 	[PIDFD_NS_PID]    = { "pid",              CLONE_NEWPID,    },
44 	[PIDFD_NS_UTS]    = { "uts",              CLONE_NEWUTS,    },
45 	[PIDFD_NS_IPC]    = { "ipc",              CLONE_NEWIPC,    },
46 	[PIDFD_NS_NET]    = { "net",              CLONE_NEWNET,    },
47 	[PIDFD_NS_CGROUP] = { "cgroup",           CLONE_NEWCGROUP, },
48 	[PIDFD_NS_PIDCLD] = { "pid_for_children", 0,               },
49 	[PIDFD_NS_TIME]	  = { "time",             CLONE_NEWTIME,   },
50 };
51 
/*
 * Per-test state: the test process itself, one child that has already
 * exited, and two children that stay alive in their own namespaces.
 * Namespace fd arrays are indexed by PIDFD_NS_*; an entry is negative when
 * the corresponding namespace file could not be opened.
 */
FIXTURE(current_nsset)
{
	/* The test process: its pid, a pidfd for it, and its namespace fds. */
	pid_t pid;
	int pidfd;
	int nsfds[PIDFD_NS_MAX];

	/* Child that exits immediately after being created. */
	pid_t child_pid_exited;
	int child_pidfd_exited;

	/* First long-lived child and the namespace fds opened from its /proc entry. */
	pid_t child_pid1;
	int child_pidfd1;
	int child_nsfds1[PIDFD_NS_MAX];

	/* Second long-lived child, created the same way as the first. */
	pid_t child_pid2;
	int child_pidfd2;
	int child_nsfds2[PIDFD_NS_MAX];
};
69 
/*
 * Issue the raw waitid system call without a siginfo or rusage buffer and
 * hand back the value syscall() produced.
 */
static int sys_waitid(int which, pid_t pid, int options)
{
	long rc;

	rc = syscall(__NR_waitid, which, pid, NULL, options, NULL);

	return rc;
}
74 
75 pid_t create_child(int *pidfd, unsigned flags)
76 {
77 	struct __clone_args args = {
78 		.flags		= CLONE_PIDFD | flags,
79 		.exit_signal	= SIGCHLD,
80 		.pidfd		= ptr_to_u64(pidfd),
81 	};
82 
83 	return sys_clone3(&args, sizeof(struct clone_args));
84 }
85 
86 static bool switch_timens(void)
87 {
88 	int fd, ret;
89 
90 	if (unshare(CLONE_NEWTIME))
91 		return false;
92 
93 	fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
94 	if (fd < 0)
95 		return false;
96 
97 	ret = setns(fd, CLONE_NEWTIME);
98 	close(fd);
99 	return ret == 0;
100 }
101 
/*
 * read() that is restarted for as long as it fails with EINTR.
 *
 * Returns the result of the first read() call that either succeeded or
 * failed with an error other than EINTR.
 */
static ssize_t read_nointr(int fd, void *buf, size_t count)
{
	for (;;) {
		ssize_t nread = read(fd, buf, count);

		if (nread >= 0 || errno != EINTR)
			return nread;
	}
}
112 
/*
 * write() that is restarted for as long as it fails with EINTR.
 *
 * Returns the result of the first write() call that either succeeded or
 * failed with an error other than EINTR. Short writes are not retried.
 */
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
	for (;;) {
		ssize_t nwritten = write(fd, buf, count);

		if (nwritten >= 0 || errno != EINTR)
			return nwritten;
	}
}
123 
124 FIXTURE_SETUP(current_nsset)
125 {
126 	int i, proc_fd, ret;
127 	int ipc_sockets[2];
128 	char c;
129 
130 	for (i = 0; i < PIDFD_NS_MAX; i++) {
131 		self->nsfds[i]		= -EBADF;
132 		self->child_nsfds1[i]	= -EBADF;
133 		self->child_nsfds2[i]	= -EBADF;
134 	}
135 
136 	proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC);
137 	ASSERT_GE(proc_fd, 0) {
138 		TH_LOG("%m - Failed to open /proc/self/ns");
139 	}
140 
141 	self->pid = getpid();
142 	for (i = 0; i < PIDFD_NS_MAX; i++) {
143 		const struct ns_info *info = &ns_info[i];
144 		self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
145 		if (self->nsfds[i] < 0) {
146 			EXPECT_EQ(errno, ENOENT) {
147 				TH_LOG("%m - Failed to open %s namespace for process %d",
148 				       info->name, self->pid);
149 			}
150 		}
151 	}
152 
153 	self->pidfd = sys_pidfd_open(self->pid, 0);
154 	EXPECT_GT(self->pidfd, 0) {
155 		TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
156 	}
157 
158 	/* Create task that exits right away. */
159 	self->child_pid_exited = create_child(&self->child_pidfd_exited,
160 					      CLONE_NEWUSER | CLONE_NEWNET);
161 	EXPECT_GT(self->child_pid_exited, 0);
162 
163 	if (self->child_pid_exited == 0)
164 		_exit(EXIT_SUCCESS);
165 
166 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
167 
168 	self->pidfd = sys_pidfd_open(self->pid, 0);
169 	EXPECT_GE(self->pidfd, 0) {
170 		TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
171 	}
172 
173 	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
174 	EXPECT_EQ(ret, 0);
175 
176 	/* Create tasks that will be stopped. */
177 	self->child_pid1 = create_child(&self->child_pidfd1,
178 					CLONE_NEWUSER | CLONE_NEWNS |
179 					CLONE_NEWCGROUP | CLONE_NEWIPC |
180 					CLONE_NEWUTS | CLONE_NEWPID |
181 					CLONE_NEWNET);
182 	EXPECT_GE(self->child_pid1, 0);
183 
184 	if (self->child_pid1 == 0) {
185 		close(ipc_sockets[0]);
186 
187 		if (!switch_timens())
188 			_exit(EXIT_FAILURE);
189 
190 		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
191 			_exit(EXIT_FAILURE);
192 
193 		close(ipc_sockets[1]);
194 
195 		pause();
196 		_exit(EXIT_SUCCESS);
197 	}
198 
199 	close(ipc_sockets[1]);
200 	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
201 	close(ipc_sockets[0]);
202 
203 	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
204 	EXPECT_EQ(ret, 0);
205 
206 	self->child_pid2 = create_child(&self->child_pidfd2,
207 					CLONE_NEWUSER | CLONE_NEWNS |
208 					CLONE_NEWCGROUP | CLONE_NEWIPC |
209 					CLONE_NEWUTS | CLONE_NEWPID |
210 					CLONE_NEWNET);
211 	EXPECT_GE(self->child_pid2, 0);
212 
213 	if (self->child_pid2 == 0) {
214 		close(ipc_sockets[0]);
215 
216 		if (!switch_timens())
217 			_exit(EXIT_FAILURE);
218 
219 		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
220 			_exit(EXIT_FAILURE);
221 
222 		close(ipc_sockets[1]);
223 
224 		pause();
225 		_exit(EXIT_SUCCESS);
226 	}
227 
228 	close(ipc_sockets[1]);
229 	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
230 	close(ipc_sockets[0]);
231 
232 	for (i = 0; i < PIDFD_NS_MAX; i++) {
233 		char p[100];
234 
235 		const struct ns_info *info = &ns_info[i];
236 
237 		self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
238 		if (self->nsfds[i] < 0) {
239 			EXPECT_EQ(errno, ENOENT) {
240 				TH_LOG("%m - Failed to open %s namespace for process %d",
241 				       info->name, self->pid);
242 			}
243 		}
244 
245 		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
246 			       self->child_pid1, info->name);
247 		EXPECT_GT(ret, 0);
248 		EXPECT_LT(ret, sizeof(p));
249 
250 		self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC);
251 		if (self->child_nsfds1[i] < 0) {
252 			EXPECT_EQ(errno, ENOENT) {
253 				TH_LOG("%m - Failed to open %s namespace for process %d",
254 				       info->name, self->child_pid1);
255 			}
256 		}
257 
258 		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
259 			       self->child_pid2, info->name);
260 		EXPECT_GT(ret, 0);
261 		EXPECT_LT(ret, sizeof(p));
262 
263 		self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC);
264 		if (self->child_nsfds2[i] < 0) {
265 			EXPECT_EQ(errno, ENOENT) {
266 				TH_LOG("%m - Failed to open %s namespace for process %d",
267 				       info->name, self->child_pid1);
268 			}
269 		}
270 	}
271 
272 	close(proc_fd);
273 }
274 
275 FIXTURE_TEARDOWN(current_nsset)
276 {
277 	int i;
278 
279 	ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1,
280 					SIGKILL, NULL, 0), 0);
281 	ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2,
282 					SIGKILL, NULL, 0), 0);
283 
284 	for (i = 0; i < PIDFD_NS_MAX; i++) {
285 		if (self->nsfds[i] >= 0)
286 			close(self->nsfds[i]);
287 		if (self->child_nsfds1[i] >= 0)
288 			close(self->child_nsfds1[i]);
289 		if (self->child_nsfds2[i] >= 0)
290 			close(self->child_nsfds2[i]);
291 	}
292 
293 	if (self->child_pidfd1 >= 0)
294 		EXPECT_EQ(0, close(self->child_pidfd1));
295 	if (self->child_pidfd2 >= 0)
296 		EXPECT_EQ(0, close(self->child_pidfd2));
297 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
298 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
299 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
300 }
301 
/*
 * Open /proc/<pid>/ns/<ns> read-only with O_CLOEXEC.
 *
 * Returns the result of open(), or -EIO when the path could not be
 * formatted into the local buffer.
 */
static int preserve_ns(const int pid, const char *ns)
{
	char path[50];
	int len = snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns);

	if (len >= 0 && (size_t)len < sizeof(path))
		return open(path, O_RDONLY | O_CLOEXEC);

	return -EIO;
}
313 
/*
 * Check whether the namespace referred to by @ns_fd1 is the one process
 * @pid2 currently has for namespace type @ns, by comparing the device and
 * inode numbers of the two namespace files.
 *
 * Returns 1 when they match, 0 when they differ, and -1 when either file
 * could not be stat'ed or the namespace of @pid2 could not be opened.
 */
static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns)
{
	struct stat st_ref, st_task;
	int task_nsfd, rc;

	if (fstat(ns_fd1, &st_ref) < 0)
		return -1;

	task_nsfd = preserve_ns(pid2, ns);
	if (task_nsfd < 0)
		return -1;

	/* The descriptor is only needed for the stat; drop it right away. */
	rc = fstat(task_nsfd, &st_task);
	close(task_nsfd);
	if (rc < 0)
		return -1;

	if (st_ref.st_dev != st_task.st_dev)
		return 0;

	return st_ref.st_ino == st_task.st_ino ? 1 : 0;
}
341 
/*
 * Test that we can't pass garbage to the kernel: setns() on our own pidfd
 * must fail with EINVAL when the flags argument is empty, has every bit set,
 * or contains a flag that is not a namespace flag (CLONE_VM), even when that
 * flag is combined with a valid one.
 */
TEST_F(current_nsset, invalid_flags)
{
	/* No namespace selected at all. */
	ASSERT_NE(setns(self->pidfd, 0), 0);
	EXPECT_EQ(errno, EINVAL);

	/* Every bit set. */
	ASSERT_NE(setns(self->pidfd, -1), 0);
	EXPECT_EQ(errno, EINVAL);

	/* A clone flag that does not name a namespace. */
	ASSERT_NE(setns(self->pidfd, CLONE_VM), 0);
	EXPECT_EQ(errno, EINVAL);

	/* A valid namespace flag mixed with an invalid one. */
	ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0);
	EXPECT_EQ(errno, EINVAL);
}
357 
358 /* Test that we can't attach to a task that has already exited. */
359 TEST_F(current_nsset, pidfd_exited_child)
360 {
361 	int i;
362 	pid_t pid;
363 
364 	ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET),
365 		  0);
366 	EXPECT_EQ(errno, ESRCH);
367 
368 	pid = getpid();
369 	for (i = 0; i < PIDFD_NS_MAX; i++) {
370 		const struct ns_info *info = &ns_info[i];
371 		/* Verify that we haven't changed any namespaces. */
372 		if (self->nsfds[i] >= 0)
373 			ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1);
374 	}
375 }
376 
377 TEST_F(current_nsset, pidfd_incremental_setns)
378 {
379 	int i;
380 	pid_t pid;
381 
382 	pid = getpid();
383 	for (i = 0; i < PIDFD_NS_MAX; i++) {
384 		const struct ns_info *info = &ns_info[i];
385 		int nsfd;
386 
387 		if (self->child_nsfds1[i] < 0)
388 			continue;
389 
390 		if (info->flag) {
391 			ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) {
392 				TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d",
393 				       info->name, self->child_pid1,
394 				       self->child_pidfd1);
395 			}
396 		}
397 
398 		/* Verify that we have changed to the correct namespaces. */
399 		if (info->flag == CLONE_NEWPID)
400 			nsfd = self->nsfds[i];
401 		else
402 			nsfd = self->child_nsfds1[i];
403 		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
404 			TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d",
405 			       info->name, self->child_pid1,
406 			       self->child_pidfd1);
407 		}
408 		TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d",
409 		       info->name, self->child_pid1, self->child_pidfd1);
410 	}
411 }
412 
413 TEST_F(current_nsset, nsfd_incremental_setns)
414 {
415 	int i;
416 	pid_t pid;
417 
418 	pid = getpid();
419 	for (i = 0; i < PIDFD_NS_MAX; i++) {
420 		const struct ns_info *info = &ns_info[i];
421 		int nsfd;
422 
423 		if (self->child_nsfds1[i] < 0)
424 			continue;
425 
426 		if (info->flag) {
427 			ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) {
428 				TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d",
429 				       info->name, self->child_pid1,
430 				       self->child_nsfds1[i]);
431 			}
432 		}
433 
434 		/* Verify that we have changed to the correct namespaces. */
435 		if (info->flag == CLONE_NEWPID)
436 			nsfd = self->nsfds[i];
437 		else
438 			nsfd = self->child_nsfds1[i];
439 		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
440 			TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d",
441 			       info->name, self->child_pid1,
442 			       self->child_nsfds1[i]);
443 		}
444 		TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d",
445 		       info->name, self->child_pid1, self->child_nsfds1[i]);
446 	}
447 }
448 
449 TEST_F(current_nsset, pidfd_one_shot_setns)
450 {
451 	unsigned flags = 0;
452 	int i;
453 	pid_t pid;
454 
455 	for (i = 0; i < PIDFD_NS_MAX; i++) {
456 		const struct ns_info *info = &ns_info[i];
457 
458 		if (self->child_nsfds1[i] < 0)
459 			continue;
460 
461 		flags |= info->flag;
462 		TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
463 		       info->name, self->child_pid1);
464 	}
465 
466 	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
467 		TH_LOG("%m - Failed to setns to namespaces of %d",
468 		       self->child_pid1);
469 	}
470 
471 	pid = getpid();
472 	for (i = 0; i < PIDFD_NS_MAX; i++) {
473 		const struct ns_info *info = &ns_info[i];
474 		int nsfd;
475 
476 		if (self->child_nsfds1[i] < 0)
477 			continue;
478 
479 		/* Verify that we have changed to the correct namespaces. */
480 		if (info->flag == CLONE_NEWPID)
481 			nsfd = self->nsfds[i];
482 		else
483 			nsfd = self->child_nsfds1[i];
484 		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
485 			TH_LOG("setns failed to place us correctly into %s namespace of %d",
486 			       info->name, self->child_pid1);
487 		}
488 		TH_LOG("Managed to correctly setns to %s namespace of %d",
489 		       info->name, self->child_pid1);
490 	}
491 }
492 
493 TEST_F(current_nsset, no_foul_play)
494 {
495 	unsigned flags = 0;
496 	int i;
497 
498 	for (i = 0; i < PIDFD_NS_MAX; i++) {
499 		const struct ns_info *info = &ns_info[i];
500 
501 		if (self->child_nsfds1[i] < 0)
502 			continue;
503 
504 		flags |= info->flag;
505 		if (info->flag) /* No use logging pid_for_children. */
506 			TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
507 			       info->name, self->child_pid1);
508 	}
509 
510 	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
511 		TH_LOG("%m - Failed to setns to namespaces of %d vid pidfd %d",
512 		       self->child_pid1, self->child_pidfd1);
513 	}
514 
515 	/*
516 	 * Can't setns to a user namespace outside of our hierarchy since we
517 	 * don't have caps in there and didn't create it. That means that under
518 	 * no circumstances should we be able to setns to any of the other
519 	 * ones since they aren't owned by our user namespace.
520 	 */
521 	for (i = 0; i < PIDFD_NS_MAX; i++) {
522 		const struct ns_info *info = &ns_info[i];
523 
524 		if (self->child_nsfds2[i] < 0 || !info->flag)
525 			continue;
526 
527 		ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) {
528 			TH_LOG("Managed to setns to %s namespace of %d via pidfd %d",
529 			       info->name, self->child_pid2,
530 			       self->child_pidfd2);
531 		}
532 		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d",
533 		       info->name, self->child_pid2,
534 		       self->child_pidfd2);
535 
536 		ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) {
537 			TH_LOG("Managed to setns to %s namespace of %d via nsfd %d",
538 			       info->name, self->child_pid2,
539 			       self->child_nsfds2[i]);
540 		}
541 		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d",
542 		       info->name, self->child_pid2,
543 		       self->child_nsfds2[i]);
544 	}
545 }
546 
/*
 * setns() on a descriptor that is neither a namespace fd nor a pidfd (a
 * memfd here) must fail with EINVAL.
 */
TEST(setns_einval)
{
	int fd;

	/*
	 * Without a valid memfd the setns() call below would fail for an
	 * unrelated reason, so stop here. Any non-negative descriptor is
	 * acceptable, including 0.
	 */
	fd = sys_memfd_create("rostock", 0);
	ASSERT_GE(fd, 0);

	ASSERT_NE(setns(fd, 0), 0);
	EXPECT_EQ(errno, EINVAL);
	close(fd);
}
558 
559 TEST_HARNESS_MAIN
560