1 // SPDX-License-Identifier: GPL-2.0
2 
3 #define _GNU_SOURCE
4 #include <errno.h>
5 #include <fcntl.h>
6 #include <limits.h>
7 #include <linux/types.h>
8 #include <sched.h>
9 #include <signal.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <syscall.h>
14 #include <sys/prctl.h>
15 #include <sys/wait.h>
16 #include <unistd.h>
17 #include <sys/socket.h>
18 #include <sys/stat.h>
19 #include <linux/kcmp.h>
20 
21 #include "pidfd.h"
22 #include "../clone3/clone3_selftests.h"
23 #include "../kselftest_harness.h"
24 
/*
 * Indices used for the ns_info[] table and for the per-task namespace fd
 * arrays in the fixture. PIDFD_NS_MAX is the number of entries, not a
 * namespace of its own.
 */
enum {
	PIDFD_NS_USER = 0,
	PIDFD_NS_MNT,
	PIDFD_NS_PID,
	PIDFD_NS_UTS,
	PIDFD_NS_IPC,
	PIDFD_NS_NET,
	PIDFD_NS_CGROUP,
	PIDFD_NS_PIDCLD,
	PIDFD_NS_TIME,
	PIDFD_NS_MAX,
};
37 
38 const struct ns_info {
39 	const char *name;
40 	int flag;
41 } ns_info[] = {
42 	[PIDFD_NS_USER]   = { "user",             CLONE_NEWUSER,   },
43 	[PIDFD_NS_MNT]    = { "mnt",              CLONE_NEWNS,     },
44 	[PIDFD_NS_PID]    = { "pid",              CLONE_NEWPID,    },
45 	[PIDFD_NS_UTS]    = { "uts",              CLONE_NEWUTS,    },
46 	[PIDFD_NS_IPC]    = { "ipc",              CLONE_NEWIPC,    },
47 	[PIDFD_NS_NET]    = { "net",              CLONE_NEWNET,    },
48 	[PIDFD_NS_CGROUP] = { "cgroup",           CLONE_NEWCGROUP, },
49 	[PIDFD_NS_PIDCLD] = { "pid_for_children", 0,               },
50 	[PIDFD_NS_TIME]	  = { "time",             CLONE_NEWTIME,   },
51 };
52 
53 FIXTURE(current_nsset)
54 {
55 	pid_t pid;
56 	int pidfd;
57 	int nsfds[PIDFD_NS_MAX];
58 
59 	pid_t child_pid_exited;
60 	int child_pidfd_exited;
61 
62 	pid_t child_pid1;
63 	int child_pidfd1;
64 	int child_nsfds1[PIDFD_NS_MAX];
65 
66 	pid_t child_pid2;
67 	int child_pidfd2;
68 	int child_nsfds2[PIDFD_NS_MAX];
69 };
70 
/*
 * Thin wrapper around the raw waitid system call. Both the siginfo and the
 * rusage argument are passed as NULL, so callers only get the syscall's
 * return value.
 */
static int sys_waitid(int which, pid_t pid, int options)
{
	long rc;

	rc = syscall(__NR_waitid, which, pid, NULL, options, NULL);
	return rc;
}
75 
76 pid_t create_child(int *pidfd, unsigned flags)
77 {
78 	struct clone_args args = {
79 		.flags		= CLONE_PIDFD | flags,
80 		.exit_signal	= SIGCHLD,
81 		.pidfd		= ptr_to_u64(pidfd),
82 	};
83 
84 	return sys_clone3(&args, sizeof(struct clone_args));
85 }
86 
87 static bool switch_timens(void)
88 {
89 	int fd, ret;
90 
91 	if (unshare(CLONE_NEWTIME))
92 		return false;
93 
94 	fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
95 	if (fd < 0)
96 		return false;
97 
98 	ret = setns(fd, CLONE_NEWTIME);
99 	close(fd);
100 	return ret == 0;
101 }
102 
/*
 * read() that is transparently restarted whenever it fails with EINTR.
 * Any other result, including short reads and other errors, is returned
 * to the caller unchanged.
 */
static ssize_t read_nointr(int fd, void *buf, size_t count)
{
	for (;;) {
		ssize_t nread = read(fd, buf, count);

		if (nread >= 0 || errno != EINTR)
			return nread;
	}
}
113 
/*
 * write() that is transparently restarted whenever it fails with EINTR.
 * Any other result, including short writes and other errors, is returned
 * to the caller unchanged.
 */
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
	for (;;) {
		ssize_t nwritten = write(fd, buf, count);

		if (nwritten >= 0 || errno != EINTR)
			return nwritten;
	}
}
124 
125 FIXTURE_SETUP(current_nsset)
126 {
127 	int i, proc_fd, ret;
128 	int ipc_sockets[2];
129 	char c;
130 
131 	for (i = 0; i < PIDFD_NS_MAX; i++) {
132 		self->nsfds[i]		= -EBADF;
133 		self->child_nsfds1[i]	= -EBADF;
134 		self->child_nsfds2[i]	= -EBADF;
135 	}
136 
137 	proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC);
138 	ASSERT_GE(proc_fd, 0) {
139 		TH_LOG("%m - Failed to open /proc/self/ns");
140 	}
141 
142 	self->pid = getpid();
143 	for (i = 0; i < PIDFD_NS_MAX; i++) {
144 		const struct ns_info *info = &ns_info[i];
145 		self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
146 		if (self->nsfds[i] < 0) {
147 			EXPECT_EQ(errno, ENOENT) {
148 				TH_LOG("%m - Failed to open %s namespace for process %d",
149 				       info->name, self->pid);
150 			}
151 		}
152 	}
153 
154 	self->pidfd = sys_pidfd_open(self->pid, 0);
155 	EXPECT_GT(self->pidfd, 0) {
156 		TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
157 	}
158 
159 	/* Create task that exits right away. */
160 	self->child_pid_exited = create_child(&self->child_pidfd_exited,
161 					      CLONE_NEWUSER | CLONE_NEWNET);
162 	EXPECT_GT(self->child_pid_exited, 0);
163 
164 	if (self->child_pid_exited == 0)
165 		_exit(EXIT_SUCCESS);
166 
167 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
168 
169 	self->pidfd = sys_pidfd_open(self->pid, 0);
170 	EXPECT_GE(self->pidfd, 0) {
171 		TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
172 	}
173 
174 	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
175 	EXPECT_EQ(ret, 0);
176 
177 	/* Create tasks that will be stopped. */
178 	self->child_pid1 = create_child(&self->child_pidfd1,
179 					CLONE_NEWUSER | CLONE_NEWNS |
180 					CLONE_NEWCGROUP | CLONE_NEWIPC |
181 					CLONE_NEWUTS | CLONE_NEWPID |
182 					CLONE_NEWNET);
183 	EXPECT_GE(self->child_pid1, 0);
184 
185 	if (self->child_pid1 == 0) {
186 		close(ipc_sockets[0]);
187 
188 		if (!switch_timens())
189 			_exit(EXIT_FAILURE);
190 
191 		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
192 			_exit(EXIT_FAILURE);
193 
194 		close(ipc_sockets[1]);
195 
196 		pause();
197 		_exit(EXIT_SUCCESS);
198 	}
199 
200 	close(ipc_sockets[1]);
201 	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
202 	close(ipc_sockets[0]);
203 
204 	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
205 	EXPECT_EQ(ret, 0);
206 
207 	self->child_pid2 = create_child(&self->child_pidfd2,
208 					CLONE_NEWUSER | CLONE_NEWNS |
209 					CLONE_NEWCGROUP | CLONE_NEWIPC |
210 					CLONE_NEWUTS | CLONE_NEWPID |
211 					CLONE_NEWNET);
212 	EXPECT_GE(self->child_pid2, 0);
213 
214 	if (self->child_pid2 == 0) {
215 		close(ipc_sockets[0]);
216 
217 		if (!switch_timens())
218 			_exit(EXIT_FAILURE);
219 
220 		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
221 			_exit(EXIT_FAILURE);
222 
223 		close(ipc_sockets[1]);
224 
225 		pause();
226 		_exit(EXIT_SUCCESS);
227 	}
228 
229 	close(ipc_sockets[1]);
230 	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
231 	close(ipc_sockets[0]);
232 
233 	for (i = 0; i < PIDFD_NS_MAX; i++) {
234 		char p[100];
235 
236 		const struct ns_info *info = &ns_info[i];
237 
238 		self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
239 		if (self->nsfds[i] < 0) {
240 			EXPECT_EQ(errno, ENOENT) {
241 				TH_LOG("%m - Failed to open %s namespace for process %d",
242 				       info->name, self->pid);
243 			}
244 		}
245 
246 		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
247 			       self->child_pid1, info->name);
248 		EXPECT_GT(ret, 0);
249 		EXPECT_LT(ret, sizeof(p));
250 
251 		self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC);
252 		if (self->child_nsfds1[i] < 0) {
253 			EXPECT_EQ(errno, ENOENT) {
254 				TH_LOG("%m - Failed to open %s namespace for process %d",
255 				       info->name, self->child_pid1);
256 			}
257 		}
258 
259 		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
260 			       self->child_pid2, info->name);
261 		EXPECT_GT(ret, 0);
262 		EXPECT_LT(ret, sizeof(p));
263 
264 		self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC);
265 		if (self->child_nsfds2[i] < 0) {
266 			EXPECT_EQ(errno, ENOENT) {
267 				TH_LOG("%m - Failed to open %s namespace for process %d",
268 				       info->name, self->child_pid1);
269 			}
270 		}
271 	}
272 
273 	close(proc_fd);
274 }
275 
276 FIXTURE_TEARDOWN(current_nsset)
277 {
278 	int i;
279 
280 	ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1,
281 					SIGKILL, NULL, 0), 0);
282 	ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2,
283 					SIGKILL, NULL, 0), 0);
284 
285 	for (i = 0; i < PIDFD_NS_MAX; i++) {
286 		if (self->nsfds[i] >= 0)
287 			close(self->nsfds[i]);
288 		if (self->child_nsfds1[i] >= 0)
289 			close(self->child_nsfds1[i]);
290 		if (self->child_nsfds2[i] >= 0)
291 			close(self->child_nsfds2[i]);
292 	}
293 
294 	if (self->child_pidfd1 >= 0)
295 		EXPECT_EQ(0, close(self->child_pidfd1));
296 	if (self->child_pidfd2 >= 0)
297 		EXPECT_EQ(0, close(self->child_pidfd2));
298 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
299 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
300 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
301 }
302 
/*
 * Open /proc/<pid>/ns/<ns> read-only with O_CLOEXEC.
 *
 * Returns the open() result, or -EIO when the path could not be
 * formatted into the local buffer.
 */
static int preserve_ns(const int pid, const char *ns)
{
	char ns_path[50];
	int len = snprintf(ns_path, sizeof(ns_path), "/proc/%d/ns/%s", pid, ns);

	if (len < 0)
		return -EIO;

	/* Output was truncated. */
	if ((size_t)len >= sizeof(ns_path))
		return -EIO;

	return open(ns_path, O_RDONLY | O_CLOEXEC);
}
314 
/*
 * Check whether the namespace referred to by @ns_fd1 is the same one that
 * /proc/<pid2>/ns/<ns> refers to, by comparing device and inode numbers.
 *
 * Returns 1 if they match, 0 if they differ and -1 if either namespace
 * could not be opened or stat'ed.
 */
static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns)
{
	struct stat st_ref, st_other;
	int other_fd, err;

	if (fstat(ns_fd1, &st_ref) < 0)
		return -1;

	other_fd = preserve_ns(pid2, ns);
	if (other_fd < 0)
		return -1;

	/* The fd is only needed for the stat, so drop it right away. */
	err = fstat(other_fd, &st_other);
	close(other_fd);
	if (err < 0)
		return -1;

	return st_ref.st_dev == st_other.st_dev &&
	       st_ref.st_ino == st_other.st_ino;
}
342 
343 /* Test that we can't pass garbage to the kernel. */
344 TEST_F(current_nsset, invalid_flags)
345 {
346 	ASSERT_NE(setns(self->pidfd, 0), 0);
347 	EXPECT_EQ(errno, EINVAL);
348 
349 	ASSERT_NE(setns(self->pidfd, -1), 0);
350 	EXPECT_EQ(errno, EINVAL);
351 
352 	ASSERT_NE(setns(self->pidfd, CLONE_VM), 0);
353 	EXPECT_EQ(errno, EINVAL);
354 
355 	ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0);
356 	EXPECT_EQ(errno, EINVAL);
357 }
358 
359 /* Test that we can't attach to a task that has already exited. */
360 TEST_F(current_nsset, pidfd_exited_child)
361 {
362 	int i;
363 	pid_t pid;
364 
365 	ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET),
366 		  0);
367 	EXPECT_EQ(errno, ESRCH);
368 
369 	pid = getpid();
370 	for (i = 0; i < PIDFD_NS_MAX; i++) {
371 		const struct ns_info *info = &ns_info[i];
372 		/* Verify that we haven't changed any namespaces. */
373 		if (self->nsfds[i] >= 0)
374 			ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1);
375 	}
376 }
377 
378 TEST_F(current_nsset, pidfd_incremental_setns)
379 {
380 	int i;
381 	pid_t pid;
382 
383 	pid = getpid();
384 	for (i = 0; i < PIDFD_NS_MAX; i++) {
385 		const struct ns_info *info = &ns_info[i];
386 		int nsfd;
387 
388 		if (self->child_nsfds1[i] < 0)
389 			continue;
390 
391 		if (info->flag) {
392 			ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) {
393 				TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d",
394 				       info->name, self->child_pid1,
395 				       self->child_pidfd1);
396 			}
397 		}
398 
399 		/* Verify that we have changed to the correct namespaces. */
400 		if (info->flag == CLONE_NEWPID)
401 			nsfd = self->nsfds[i];
402 		else
403 			nsfd = self->child_nsfds1[i];
404 		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
405 			TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d",
406 			       info->name, self->child_pid1,
407 			       self->child_pidfd1);
408 		}
409 		TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d",
410 		       info->name, self->child_pid1, self->child_pidfd1);
411 	}
412 }
413 
414 TEST_F(current_nsset, nsfd_incremental_setns)
415 {
416 	int i;
417 	pid_t pid;
418 
419 	pid = getpid();
420 	for (i = 0; i < PIDFD_NS_MAX; i++) {
421 		const struct ns_info *info = &ns_info[i];
422 		int nsfd;
423 
424 		if (self->child_nsfds1[i] < 0)
425 			continue;
426 
427 		if (info->flag) {
428 			ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) {
429 				TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d",
430 				       info->name, self->child_pid1,
431 				       self->child_nsfds1[i]);
432 			}
433 		}
434 
435 		/* Verify that we have changed to the correct namespaces. */
436 		if (info->flag == CLONE_NEWPID)
437 			nsfd = self->nsfds[i];
438 		else
439 			nsfd = self->child_nsfds1[i];
440 		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
441 			TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d",
442 			       info->name, self->child_pid1,
443 			       self->child_nsfds1[i]);
444 		}
445 		TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d",
446 		       info->name, self->child_pid1, self->child_nsfds1[i]);
447 	}
448 }
449 
450 TEST_F(current_nsset, pidfd_one_shot_setns)
451 {
452 	unsigned flags = 0;
453 	int i;
454 	pid_t pid;
455 
456 	for (i = 0; i < PIDFD_NS_MAX; i++) {
457 		const struct ns_info *info = &ns_info[i];
458 
459 		if (self->child_nsfds1[i] < 0)
460 			continue;
461 
462 		flags |= info->flag;
463 		TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
464 		       info->name, self->child_pid1);
465 	}
466 
467 	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
468 		TH_LOG("%m - Failed to setns to namespaces of %d",
469 		       self->child_pid1);
470 	}
471 
472 	pid = getpid();
473 	for (i = 0; i < PIDFD_NS_MAX; i++) {
474 		const struct ns_info *info = &ns_info[i];
475 		int nsfd;
476 
477 		if (self->child_nsfds1[i] < 0)
478 			continue;
479 
480 		/* Verify that we have changed to the correct namespaces. */
481 		if (info->flag == CLONE_NEWPID)
482 			nsfd = self->nsfds[i];
483 		else
484 			nsfd = self->child_nsfds1[i];
485 		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
486 			TH_LOG("setns failed to place us correctly into %s namespace of %d",
487 			       info->name, self->child_pid1);
488 		}
489 		TH_LOG("Managed to correctly setns to %s namespace of %d",
490 		       info->name, self->child_pid1);
491 	}
492 }
493 
494 TEST_F(current_nsset, no_foul_play)
495 {
496 	unsigned flags = 0;
497 	int i;
498 
499 	for (i = 0; i < PIDFD_NS_MAX; i++) {
500 		const struct ns_info *info = &ns_info[i];
501 
502 		if (self->child_nsfds1[i] < 0)
503 			continue;
504 
505 		flags |= info->flag;
506 		if (info->flag) /* No use logging pid_for_children. */
507 			TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
508 			       info->name, self->child_pid1);
509 	}
510 
511 	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
512 		TH_LOG("%m - Failed to setns to namespaces of %d vid pidfd %d",
513 		       self->child_pid1, self->child_pidfd1);
514 	}
515 
516 	/*
517 	 * Can't setns to a user namespace outside of our hierarchy since we
518 	 * don't have caps in there and didn't create it. That means that under
519 	 * no circumstances should we be able to setns to any of the other
520 	 * ones since they aren't owned by our user namespace.
521 	 */
522 	for (i = 0; i < PIDFD_NS_MAX; i++) {
523 		const struct ns_info *info = &ns_info[i];
524 
525 		if (self->child_nsfds2[i] < 0 || !info->flag)
526 			continue;
527 
528 		ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) {
529 			TH_LOG("Managed to setns to %s namespace of %d via pidfd %d",
530 			       info->name, self->child_pid2,
531 			       self->child_pidfd2);
532 		}
533 		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d",
534 		       info->name, self->child_pid2,
535 		       self->child_pidfd2);
536 
537 		ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) {
538 			TH_LOG("Managed to setns to %s namespace of %d via nsfd %d",
539 			       info->name, self->child_pid2,
540 			       self->child_nsfds2[i]);
541 		}
542 		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d",
543 		       info->name, self->child_pid2,
544 		       self->child_nsfds2[i]);
545 	}
546 }
547 
/*
 * Test that setns() rejects a file descriptor that is neither a namespace
 * fd nor a pidfd (here: a memfd) with EINVAL.
 */
TEST(setns_einval)
{
	int fd;

	fd = sys_memfd_create("rostock", 0);
	/*
	 * Abort if no memfd could be created: continuing with an invalid
	 * descriptor would test the wrong thing. 0 is a valid descriptor.
	 */
	ASSERT_GE(fd, 0);

	ASSERT_NE(setns(fd, 0), 0);
	EXPECT_EQ(errno, EINVAL);
	close(fd);
}
559 
560 TEST_HARNESS_MAIN
561