1 // SPDX-License-Identifier: GPL-2.0
2 
3 #define _GNU_SOURCE
4 #include <errno.h>
5 #include <fcntl.h>
6 #include <limits.h>
7 #include <linux/types.h>
8 #include <sched.h>
9 #include <signal.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <syscall.h>
14 #include <sys/prctl.h>
15 #include <sys/wait.h>
16 #include <unistd.h>
17 #include <sys/socket.h>
18 #include <sys/stat.h>
19 #include <linux/kcmp.h>
20 
21 #include "pidfd.h"
22 #include "../clone3/clone3_selftests.h"
23 #include "../kselftest.h"
24 #include "../kselftest_harness.h"
25 
/*
 * Indices used for ns_info[] and for the per-task namespace fd arrays of the
 * fixture. PIDFD_NS_MAX is the number of entries, not a namespace type.
 */
enum {
	PIDFD_NS_USER = 0,
	PIDFD_NS_MNT,
	PIDFD_NS_PID,
	PIDFD_NS_UTS,
	PIDFD_NS_IPC,
	PIDFD_NS_NET,
	PIDFD_NS_CGROUP,
	PIDFD_NS_PIDCLD,
	PIDFD_NS_TIME,
	PIDFD_NS_MAX,
};
38 
39 const struct ns_info {
40 	const char *name;
41 	int flag;
42 } ns_info[] = {
43 	[PIDFD_NS_USER]   = { "user",             CLONE_NEWUSER,   },
44 	[PIDFD_NS_MNT]    = { "mnt",              CLONE_NEWNS,     },
45 	[PIDFD_NS_PID]    = { "pid",              CLONE_NEWPID,    },
46 	[PIDFD_NS_UTS]    = { "uts",              CLONE_NEWUTS,    },
47 	[PIDFD_NS_IPC]    = { "ipc",              CLONE_NEWIPC,    },
48 	[PIDFD_NS_NET]    = { "net",              CLONE_NEWNET,    },
49 	[PIDFD_NS_CGROUP] = { "cgroup",           CLONE_NEWCGROUP, },
50 	[PIDFD_NS_PIDCLD] = { "pid_for_children", 0,               },
51 	[PIDFD_NS_TIME]	  = { "time",             CLONE_NEWTIME,   },
52 };
53 
/*
 * State shared by the current_nsset tests; filled in by the fixture setup.
 * Namespace fd arrays are indexed by PIDFD_NS_*; a negative entry means the
 * namespace file could not be opened.
 */
FIXTURE(current_nsset)
{
	/* The test process itself: its pid, a pidfd for it, its namespace fds. */
	pid_t pid;
	int pidfd;
	int nsfds[PIDFD_NS_MAX];

	/* Child that exits immediately after being created. */
	pid_t child_pid_exited;
	int child_pidfd_exited;

	/* First child created in new namespaces; blocks in pause(). */
	pid_t child_pid1;
	int child_pidfd1;
	int child_nsfds1[PIDFD_NS_MAX];

	/* Second child created in its own new namespaces; blocks in pause(). */
	pid_t child_pid2;
	int child_pidfd2;
	int child_nsfds2[PIDFD_NS_MAX];
};
71 
/*
 * Raw waitid() system call without siginfo or rusage output (both are passed
 * as NULL). Returns the syscall() result narrowed to int.
 */
static int sys_waitid(int which, pid_t pid, int options)
{
	long rc;

	rc = syscall(__NR_waitid, which, pid, NULL, options, NULL);
	return rc;
}
76 
77 pid_t create_child(int *pidfd, unsigned flags)
78 {
79 	struct clone_args args = {
80 		.flags		= CLONE_PIDFD | flags,
81 		.exit_signal	= SIGCHLD,
82 		.pidfd		= ptr_to_u64(pidfd),
83 	};
84 
85 	return sys_clone3(&args, sizeof(struct clone_args));
86 }
87 
88 static bool switch_timens(void)
89 {
90 	int fd, ret;
91 
92 	if (unshare(CLONE_NEWTIME))
93 		return false;
94 
95 	fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
96 	if (fd < 0)
97 		return false;
98 
99 	ret = setns(fd, CLONE_NEWTIME);
100 	close(fd);
101 	return ret == 0;
102 }
103 
/*
 * read() that is restarted for as long as it fails with EINTR. Any other
 * outcome (data, end of file, or a different error) is returned unchanged.
 */
static ssize_t read_nointr(int fd, void *buf, size_t count)
{
	for (;;) {
		ssize_t nread = read(fd, buf, count);

		if (nread >= 0 || errno != EINTR)
			return nread;
	}
}
114 
/*
 * write() that is restarted for as long as it fails with EINTR. Any other
 * outcome (bytes written or a different error) is returned unchanged.
 */
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
	for (;;) {
		ssize_t nwritten = write(fd, buf, count);

		if (nwritten >= 0 || errno != EINTR)
			return nwritten;
	}
}
125 
126 FIXTURE_SETUP(current_nsset)
127 {
128 	int i, proc_fd, ret;
129 	int ipc_sockets[2];
130 	char c;
131 
132 	for (i = 0; i < PIDFD_NS_MAX; i++) {
133 		self->nsfds[i]		= -EBADF;
134 		self->child_nsfds1[i]	= -EBADF;
135 		self->child_nsfds2[i]	= -EBADF;
136 	}
137 
138 	proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC);
139 	ASSERT_GE(proc_fd, 0) {
140 		TH_LOG("%m - Failed to open /proc/self/ns");
141 	}
142 
143 	self->pid = getpid();
144 	for (i = 0; i < PIDFD_NS_MAX; i++) {
145 		const struct ns_info *info = &ns_info[i];
146 		self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
147 		if (self->nsfds[i] < 0) {
148 			EXPECT_EQ(errno, ENOENT) {
149 				TH_LOG("%m - Failed to open %s namespace for process %d",
150 				       info->name, self->pid);
151 			}
152 		}
153 	}
154 
155 	self->pidfd = sys_pidfd_open(self->pid, 0);
156 	EXPECT_GT(self->pidfd, 0) {
157 		TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
158 	}
159 
160 	/* Create task that exits right away. */
161 	self->child_pid_exited = create_child(&self->child_pidfd_exited,
162 					      CLONE_NEWUSER | CLONE_NEWNET);
163 	EXPECT_GT(self->child_pid_exited, 0);
164 
165 	if (self->child_pid_exited == 0)
166 		_exit(EXIT_SUCCESS);
167 
168 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
169 
170 	self->pidfd = sys_pidfd_open(self->pid, 0);
171 	EXPECT_GE(self->pidfd, 0) {
172 		TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
173 	}
174 
175 	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
176 	EXPECT_EQ(ret, 0);
177 
178 	/* Create tasks that will be stopped. */
179 	self->child_pid1 = create_child(&self->child_pidfd1,
180 					CLONE_NEWUSER | CLONE_NEWNS |
181 					CLONE_NEWCGROUP | CLONE_NEWIPC |
182 					CLONE_NEWUTS | CLONE_NEWPID |
183 					CLONE_NEWNET);
184 	EXPECT_GE(self->child_pid1, 0);
185 
186 	if (self->child_pid1 == 0) {
187 		close(ipc_sockets[0]);
188 
189 		if (!switch_timens())
190 			_exit(EXIT_FAILURE);
191 
192 		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
193 			_exit(EXIT_FAILURE);
194 
195 		close(ipc_sockets[1]);
196 
197 		pause();
198 		_exit(EXIT_SUCCESS);
199 	}
200 
201 	close(ipc_sockets[1]);
202 	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
203 	close(ipc_sockets[0]);
204 
205 	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
206 	EXPECT_EQ(ret, 0);
207 
208 	self->child_pid2 = create_child(&self->child_pidfd2,
209 					CLONE_NEWUSER | CLONE_NEWNS |
210 					CLONE_NEWCGROUP | CLONE_NEWIPC |
211 					CLONE_NEWUTS | CLONE_NEWPID |
212 					CLONE_NEWNET);
213 	EXPECT_GE(self->child_pid2, 0);
214 
215 	if (self->child_pid2 == 0) {
216 		close(ipc_sockets[0]);
217 
218 		if (!switch_timens())
219 			_exit(EXIT_FAILURE);
220 
221 		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
222 			_exit(EXIT_FAILURE);
223 
224 		close(ipc_sockets[1]);
225 
226 		pause();
227 		_exit(EXIT_SUCCESS);
228 	}
229 
230 	close(ipc_sockets[1]);
231 	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
232 	close(ipc_sockets[0]);
233 
234 	for (i = 0; i < PIDFD_NS_MAX; i++) {
235 		char p[100];
236 
237 		const struct ns_info *info = &ns_info[i];
238 
239 		self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
240 		if (self->nsfds[i] < 0) {
241 			EXPECT_EQ(errno, ENOENT) {
242 				TH_LOG("%m - Failed to open %s namespace for process %d",
243 				       info->name, self->pid);
244 			}
245 		}
246 
247 		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
248 			       self->child_pid1, info->name);
249 		EXPECT_GT(ret, 0);
250 		EXPECT_LT(ret, sizeof(p));
251 
252 		self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC);
253 		if (self->child_nsfds1[i] < 0) {
254 			EXPECT_EQ(errno, ENOENT) {
255 				TH_LOG("%m - Failed to open %s namespace for process %d",
256 				       info->name, self->child_pid1);
257 			}
258 		}
259 
260 		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
261 			       self->child_pid2, info->name);
262 		EXPECT_GT(ret, 0);
263 		EXPECT_LT(ret, sizeof(p));
264 
265 		self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC);
266 		if (self->child_nsfds2[i] < 0) {
267 			EXPECT_EQ(errno, ENOENT) {
268 				TH_LOG("%m - Failed to open %s namespace for process %d",
269 				       info->name, self->child_pid1);
270 			}
271 		}
272 	}
273 
274 	close(proc_fd);
275 }
276 
277 FIXTURE_TEARDOWN(current_nsset)
278 {
279 	int i;
280 
281 	ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1,
282 					SIGKILL, NULL, 0), 0);
283 	ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2,
284 					SIGKILL, NULL, 0), 0);
285 
286 	for (i = 0; i < PIDFD_NS_MAX; i++) {
287 		if (self->nsfds[i] >= 0)
288 			close(self->nsfds[i]);
289 		if (self->child_nsfds1[i] >= 0)
290 			close(self->child_nsfds1[i]);
291 		if (self->child_nsfds2[i] >= 0)
292 			close(self->child_nsfds2[i]);
293 	}
294 
295 	if (self->child_pidfd1 >= 0)
296 		EXPECT_EQ(0, close(self->child_pidfd1));
297 	if (self->child_pidfd2 >= 0)
298 		EXPECT_EQ(0, close(self->child_pidfd2));
299 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
300 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
301 	ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
302 }
303 
/*
 * Open /proc/<pid>/ns/<ns> read-only with O_CLOEXEC.
 *
 * Returns the open() result, or -EIO if the path could not be formatted or
 * does not fit into the local buffer.
 */
static int preserve_ns(const int pid, const char *ns)
{
	char path[50];
	int len = snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns);

	if (len < 0)
		return -EIO;
	if ((size_t)len >= sizeof(path))
		return -EIO;

	return open(path, O_RDONLY | O_CLOEXEC);
}
315 
/*
 * Compare the namespace behind @ns_fd1 with the @ns namespace of process
 * @pid2 by device and inode number.
 *
 * Returns 1 if both refer to the same namespace, 0 if they differ, and -1 if
 * either one could not be stat'ed or opened.
 */
static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns)
{
	struct stat st_ref, st_other;
	int other_fd, err;

	if (fstat(ns_fd1, &st_ref) < 0)
		return -1;

	other_fd = preserve_ns(pid2, ns);
	if (other_fd < 0)
		return -1;

	/* The fd is only needed for the stat; drop it before checking. */
	err = fstat(other_fd, &st_other);
	close(other_fd);
	if (err < 0)
		return -1;

	/* Same device and inode: same namespace (1); otherwise different (0). */
	return st_ref.st_dev == st_other.st_dev &&
	       st_ref.st_ino == st_other.st_ino;
}
343 
/*
 * Test that we can't pass garbage to the kernel: setns() on our own pidfd
 * must fail with EINVAL for each of the flag values below.
 */
TEST_F(current_nsset, invalid_flags)
{
	/* No flags at all. */
	ASSERT_NE(setns(self->pidfd, 0), 0);
	EXPECT_EQ(errno, EINVAL);

	/* Every bit set. */
	ASSERT_NE(setns(self->pidfd, -1), 0);
	EXPECT_EQ(errno, EINVAL);

	/* A clone flag that is not one of the CLONE_NEW* flags in ns_info[]. */
	ASSERT_NE(setns(self->pidfd, CLONE_VM), 0);
	EXPECT_EQ(errno, EINVAL);

	/* A CLONE_NEW* flag combined with such a flag. */
	ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0);
	EXPECT_EQ(errno, EINVAL);
}
359 
360 /* Test that we can't attach to a task that has already exited. */
361 TEST_F(current_nsset, pidfd_exited_child)
362 {
363 	int i;
364 	pid_t pid;
365 
366 	ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET),
367 		  0);
368 	EXPECT_EQ(errno, ESRCH);
369 
370 	pid = getpid();
371 	for (i = 0; i < PIDFD_NS_MAX; i++) {
372 		const struct ns_info *info = &ns_info[i];
373 		/* Verify that we haven't changed any namespaces. */
374 		if (self->nsfds[i] >= 0)
375 			ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1);
376 	}
377 }
378 
379 TEST_F(current_nsset, pidfd_incremental_setns)
380 {
381 	int i;
382 	pid_t pid;
383 
384 	pid = getpid();
385 	for (i = 0; i < PIDFD_NS_MAX; i++) {
386 		const struct ns_info *info = &ns_info[i];
387 		int nsfd;
388 
389 		if (self->child_nsfds1[i] < 0)
390 			continue;
391 
392 		if (info->flag) {
393 			ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) {
394 				TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d",
395 				       info->name, self->child_pid1,
396 				       self->child_pidfd1);
397 			}
398 		}
399 
400 		/* Verify that we have changed to the correct namespaces. */
401 		if (info->flag == CLONE_NEWPID)
402 			nsfd = self->nsfds[i];
403 		else
404 			nsfd = self->child_nsfds1[i];
405 		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
406 			TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d",
407 			       info->name, self->child_pid1,
408 			       self->child_pidfd1);
409 		}
410 		TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d",
411 		       info->name, self->child_pid1, self->child_pidfd1);
412 	}
413 }
414 
415 TEST_F(current_nsset, nsfd_incremental_setns)
416 {
417 	int i;
418 	pid_t pid;
419 
420 	pid = getpid();
421 	for (i = 0; i < PIDFD_NS_MAX; i++) {
422 		const struct ns_info *info = &ns_info[i];
423 		int nsfd;
424 
425 		if (self->child_nsfds1[i] < 0)
426 			continue;
427 
428 		if (info->flag) {
429 			ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) {
430 				TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d",
431 				       info->name, self->child_pid1,
432 				       self->child_nsfds1[i]);
433 			}
434 		}
435 
436 		/* Verify that we have changed to the correct namespaces. */
437 		if (info->flag == CLONE_NEWPID)
438 			nsfd = self->nsfds[i];
439 		else
440 			nsfd = self->child_nsfds1[i];
441 		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
442 			TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d",
443 			       info->name, self->child_pid1,
444 			       self->child_nsfds1[i]);
445 		}
446 		TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d",
447 		       info->name, self->child_pid1, self->child_nsfds1[i]);
448 	}
449 }
450 
451 TEST_F(current_nsset, pidfd_one_shot_setns)
452 {
453 	unsigned flags = 0;
454 	int i;
455 	pid_t pid;
456 
457 	for (i = 0; i < PIDFD_NS_MAX; i++) {
458 		const struct ns_info *info = &ns_info[i];
459 
460 		if (self->child_nsfds1[i] < 0)
461 			continue;
462 
463 		flags |= info->flag;
464 		TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
465 		       info->name, self->child_pid1);
466 	}
467 
468 	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
469 		TH_LOG("%m - Failed to setns to namespaces of %d",
470 		       self->child_pid1);
471 	}
472 
473 	pid = getpid();
474 	for (i = 0; i < PIDFD_NS_MAX; i++) {
475 		const struct ns_info *info = &ns_info[i];
476 		int nsfd;
477 
478 		if (self->child_nsfds1[i] < 0)
479 			continue;
480 
481 		/* Verify that we have changed to the correct namespaces. */
482 		if (info->flag == CLONE_NEWPID)
483 			nsfd = self->nsfds[i];
484 		else
485 			nsfd = self->child_nsfds1[i];
486 		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
487 			TH_LOG("setns failed to place us correctly into %s namespace of %d",
488 			       info->name, self->child_pid1);
489 		}
490 		TH_LOG("Managed to correctly setns to %s namespace of %d",
491 		       info->name, self->child_pid1);
492 	}
493 }
494 
495 TEST_F(current_nsset, no_foul_play)
496 {
497 	unsigned flags = 0;
498 	int i;
499 
500 	for (i = 0; i < PIDFD_NS_MAX; i++) {
501 		const struct ns_info *info = &ns_info[i];
502 
503 		if (self->child_nsfds1[i] < 0)
504 			continue;
505 
506 		flags |= info->flag;
507 		if (info->flag) /* No use logging pid_for_children. */
508 			TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
509 			       info->name, self->child_pid1);
510 	}
511 
512 	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
513 		TH_LOG("%m - Failed to setns to namespaces of %d vid pidfd %d",
514 		       self->child_pid1, self->child_pidfd1);
515 	}
516 
517 	/*
518 	 * Can't setns to a user namespace outside of our hierarchy since we
519 	 * don't have caps in there and didn't create it. That means that under
520 	 * no circumstances should we be able to setns to any of the other
521 	 * ones since they aren't owned by our user namespace.
522 	 */
523 	for (i = 0; i < PIDFD_NS_MAX; i++) {
524 		const struct ns_info *info = &ns_info[i];
525 
526 		if (self->child_nsfds2[i] < 0 || !info->flag)
527 			continue;
528 
529 		ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) {
530 			TH_LOG("Managed to setns to %s namespace of %d via pidfd %d",
531 			       info->name, self->child_pid2,
532 			       self->child_pidfd2);
533 		}
534 		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d",
535 		       info->name, self->child_pid2,
536 		       self->child_pidfd2);
537 
538 		ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) {
539 			TH_LOG("Managed to setns to %s namespace of %d via nsfd %d",
540 			       info->name, self->child_pid2,
541 			       self->child_nsfds2[i]);
542 		}
543 		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d",
544 		       info->name, self->child_pid2,
545 		       self->child_nsfds2[i]);
546 	}
547 }
548 
/*
 * setns() on a file descriptor that is neither a namespace fd nor a pidfd
 * (here: a memfd) must fail with EINVAL.
 */
TEST(setns_einval)
{
	int fd;

	fd = sys_memfd_create("rostock", 0);
	/* 0 is a valid descriptor; without one the checks below are moot. */
	ASSERT_GE(fd, 0);

	ASSERT_NE(setns(fd, 0), 0);
	EXPECT_EQ(errno, EINVAL);
	close(fd);
}
560 
561 TEST_HARNESS_MAIN
562