1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7 
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10 
/*
 * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
 * we need to use the kernel's siginfo.h file and trick glibc
 * into accepting it by pre-defining its __have_* markers.
 *
 * __GLIBC_PREREQ is only tested once it is known to exist: evaluating
 * an undefined function-like macro inside #if is a preprocessor error,
 * which would break the build on C libraries that do not provide it.
 */
#ifdef __GLIBC_PREREQ
# if !__GLIBC_PREREQ(2, 26)
#  include <asm/siginfo.h>
#  define __have_siginfo_t 1
#  define __have_sigval_t 1
#  define __have_sigevent_t 1
# endif
#endif
22 
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/user.h>
28 #include <linux/prctl.h>
29 #include <linux/ptrace.h>
30 #include <linux/seccomp.h>
31 #include <pthread.h>
32 #include <semaphore.h>
33 #include <signal.h>
34 #include <stddef.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <time.h>
38 #include <limits.h>
39 #include <linux/elf.h>
40 #include <sys/uio.h>
41 #include <sys/utsname.h>
42 #include <sys/fcntl.h>
43 #include <sys/mman.h>
44 #include <sys/times.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <linux/kcmp.h>
48 #include <sys/resource.h>
49 
50 #include <unistd.h>
51 #include <sys/syscall.h>
52 #include <poll.h>
53 
54 #include "../kselftest_harness.h"
55 #include "../clone3/clone3_selftests.h"
56 
/*
 * De-conflict with the selftests tree: when the harness does not supply
 * SKIP(), alias it to XFAIL() with the same arguments.
 */
#if !defined(SKIP)
# define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
#endif
61 
/*
 * Fallbacks for prctl() options and seccomp mode numbers. Each value is
 * defined here only when the included headers did not provide it.
 */
#if !defined(PR_SET_PTRACER)
# define PR_SET_PTRACER 0x59616d61
#endif

/* The SET/GET pair is keyed off PR_SET_NO_NEW_PRIVS alone. */
#if !defined(PR_SET_NO_NEW_PRIVS)
# define PR_SET_NO_NEW_PRIVS 38
# define PR_GET_NO_NEW_PRIVS 39
#endif

#if !defined(PR_SECCOMP_EXT)
# define PR_SECCOMP_EXT 43
#endif

#if !defined(SECCOMP_EXT_ACT)
# define SECCOMP_EXT_ACT 1
#endif

#if !defined(SECCOMP_EXT_ACT_TSYNC)
# define SECCOMP_EXT_ACT_TSYNC 1
#endif

#if !defined(SECCOMP_MODE_STRICT)
# define SECCOMP_MODE_STRICT 1
#endif

#if !defined(SECCOMP_MODE_FILTER)
# define SECCOMP_MODE_FILTER 2
#endif
90 
/*
 * Local definition of struct seccomp_data, provided only when
 * SECCOMP_RET_ALLOW is not already defined by the included headers.
 */
#if !defined(SECCOMP_RET_ALLOW)
struct seccomp_data {
	int nr;				/* syscall number; filters here compare it to __NR_* */
	__u32 arch;
	__u64 instruction_pointer;
	__u64 args[6];			/* syscall arguments, see syscall_arg() */
};
#endif
99 
/*
 * Fallback filter return actions, defined only when the included headers
 * lack them. Three independent guards cover three groups of values.
 */
#if !defined(SECCOMP_RET_KILL_PROCESS)
# define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
# define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
#endif

/* SECCOMP_RET_KILL alone guards KILL, TRAP, ERRNO, TRACE and ALLOW. */
#if !defined(SECCOMP_RET_KILL)
# define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
# define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
# define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
# define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
# define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
#endif

#if !defined(SECCOMP_RET_LOG)
# define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
#endif
114 
/*
 * Per-architecture number of the seccomp syscall, for headers that do not
 * define __NR_seccomp. Unknown architectures get a warning and 0xffff.
 */
#if !defined(__NR_seccomp)
# if defined(__i386__)
#  define __NR_seccomp 354
# elif defined(__x86_64__)
#  define __NR_seccomp 317
# elif defined(__arm__)
#  define __NR_seccomp 383
# elif defined(__aarch64__) || defined(__riscv)
#  define __NR_seccomp 277
# elif defined(__hppa__)
#  define __NR_seccomp 338
# elif defined(__powerpc__)
#  define __NR_seccomp 358
# elif defined(__s390__)
#  define __NR_seccomp 348
# else
#  warning "seccomp syscall number unknown for this architecture"
#  define __NR_seccomp 0xffff
# endif
#endif
137 
/* Operation numbers passed as the first argument of seccomp(). */
#if !defined(SECCOMP_SET_MODE_STRICT)
# define SECCOMP_SET_MODE_STRICT 0
#endif

#if !defined(SECCOMP_SET_MODE_FILTER)
# define SECCOMP_SET_MODE_FILTER 1
#endif

#if !defined(SECCOMP_GET_ACTION_AVAIL)
# define SECCOMP_GET_ACTION_AVAIL 2
#endif

#if !defined(SECCOMP_GET_NOTIF_SIZES)
# define SECCOMP_GET_NOTIF_SIZES 3
#endif

/* Filter flag bits (bits 0 through 2). */
#if !defined(SECCOMP_FILTER_FLAG_TSYNC)
# define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
#endif

#if !defined(SECCOMP_FILTER_FLAG_LOG)
# define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif

#if !defined(SECCOMP_FILTER_FLAG_SPEC_ALLOW)
# define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#endif
165 
/*
 * PTRACE_SECCOMP_GET_METADATA request number and its argument structure,
 * defined together when the request number is missing from the headers.
 */
#if !defined(PTRACE_SECCOMP_GET_METADATA)
# define PTRACE_SECCOMP_GET_METADATA	0x420d

struct seccomp_metadata {
	__u64 filter_off;       /* Input: which filter */
	__u64 flags;             /* Output: filter's flags */
};
#endif
174 
#if !defined(SECCOMP_FILTER_FLAG_NEW_LISTENER)
# define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
#endif

/*
 * User-notification support: the return action, the ioctl() encoding
 * helpers, the notification-fd ioctls and their structures. Everything
 * below is keyed off SECCOMP_RET_USER_NOTIF being undefined.
 */
#if !defined(SECCOMP_RET_USER_NOTIF)
# define SECCOMP_RET_USER_NOTIF 0x7fc00000U

# define SECCOMP_IOC_MAGIC		'!'
# define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
# define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
# define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
# define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)

/* Flags for seccomp notification fd ioctl. */
# define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
# define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
						struct seccomp_notif_resp)
# define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)

/* Argument type of SECCOMP_IOCTL_NOTIF_RECV. */
struct seccomp_notif {
	__u64 id;
	__u32 pid;
	__u32 flags;
	struct seccomp_data data;
};

/* Argument type of SECCOMP_IOCTL_NOTIF_SEND. */
struct seccomp_notif_resp {
	__u64 id;
	__s64 val;
	__s32 error;
	__u32 flags;
};

/* One __u16 size field per notification-related structure. */
struct seccomp_notif_sizes {
	__u16 seccomp_notif;
	__u16 seccomp_notif_resp;
	__u16 seccomp_data;
};
#endif
214 
/*
 * SECCOMP_IOCTL_NOTIF_ADDFD, its flag and its argument structure, defined
 * together when the ioctl number is missing from the headers.
 */
#if !defined(SECCOMP_IOCTL_NOTIF_ADDFD)
/* On success, the return value is the remote process's added fd number */
# define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3,	\
						struct seccomp_notif_addfd)

/* valid flags for seccomp_notif_addfd */
# define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */

struct seccomp_notif_addfd {
	__u64 id;
	__u32 flags;
	__u32 srcfd;
	__u32 newfd;
	__u32 newfd_flags;
};
#endif
231 
/*
 * Deliberately mis-sized variants of struct seccomp_notif_addfd, each with
 * its own ioctl number built on command 3.
 */

/* Smaller than the real structure: only the id plus four stray bytes. */
struct seccomp_notif_addfd_small {
	__u64 id;
	char weird[4];
};
#define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL	\
	SECCOMP_IOW(3, struct seccomp_notif_addfd_small)

/* Larger than the real structure: padded out by eight extra bytes. */
struct seccomp_notif_addfd_big {
	union {
		struct seccomp_notif_addfd addfd;
		char buf[sizeof(struct seccomp_notif_addfd) + 8];
	};
};
#define SECCOMP_IOCTL_NOTIF_ADDFD_BIG	\
	SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
247 
/* Entry/exit event message values, guarded by the ENTRY name alone. */
#if !defined(PTRACE_EVENTMSG_SYSCALL_ENTRY)
# define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
# define PTRACE_EVENTMSG_SYSCALL_EXIT	2
#endif

#if !defined(SECCOMP_USER_NOTIF_FLAG_CONTINUE)
# define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
#endif

#if !defined(SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
# define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
#endif
260 
#if !defined(seccomp)
/*
 * Wrapper around the raw seccomp syscall.
 *
 * errno is reset to 0 before the call. The value returned by syscall()
 * is handed back to the caller converted to int.
 */
int seccomp(unsigned int op, unsigned int flags, void *args)
{
	long rc;

	errno = 0;
	rc = syscall(__NR_seccomp, op, flags, args);

	return rc;
}
#endif
268 
/*
 * syscall_arg(n): byte offset inside struct seccomp_data used by the filters
 * to load 32 bits of argument n. On big-endian builds the offset is moved
 * forward by sizeof(__u32) within the 64-bit slot.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
# define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
#elif __BYTE_ORDER == __BIG_ENDIAN
# define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
#else
# error "wut? Unknown __BYTE_ORDER?!"
#endif
276 
/* Marker values handed back as thread exit status by the sibling helpers. */
#define SIBLING_EXIT_FAILURE	0xbadface	/* kill_thread(): survived the prctl() */
#define SIBLING_EXIT_NEWPRIVS	0xbadfeed
#define SIBLING_EXIT_UNKILLED	0xbadbeef	/* kill_thread(): asked not to die */
280 
/*
 * Issue kcmp(KCMP_FILE) on (pid1, fd1) and (pid2, fd2) and return its result.
 *
 * errno is cleared before the syscall. When the build has no __NR_kcmp the
 * function returns -1 with errno set to ENOSYS instead.
 */
static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
{
#ifndef __NR_kcmp
	errno = ENOSYS;
	return -1;
#else
	errno = 0;
	return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
#endif
}
291 
/*
 * Macro form of __filecmp() so that TH_LOG reports the caller's location.
 * A negative result with errno == ENOSYS is logged and turned into 0;
 * every other result is passed through unchanged.
 */
#define filecmp(pid1, pid2, fd1, fd2)	({		\
	int _ret = __filecmp(pid1, pid2, fd1, fd2);	\
							\
	if (_ret < 0 && errno == ENOSYS) {		\
		TH_LOG("kcmp() syscall missing (test is less accurate)");\
		_ret = 0;				\
	}						\
	_ret; })
304 
305 TEST(kcmp)
306 {
307 	int ret;
308 
309 	ret = __filecmp(getpid(), getpid(), 1, 1);
310 	EXPECT_EQ(ret, 0);
311 	if (ret != 0 && errno == ENOSYS)
312 		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_CHECKPOINT_RESTORE?)");
313 }
314 
315 TEST(mode_strict_support)
316 {
317 	long ret;
318 
319 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
320 	ASSERT_EQ(0, ret) {
321 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
322 	}
323 	syscall(__NR_exit, 0);
324 }
325 
326 TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
327 {
328 	long ret;
329 
330 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
331 	ASSERT_EQ(0, ret) {
332 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
333 	}
334 	syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
335 		NULL, NULL, NULL);
336 	EXPECT_FALSE(true) {
337 		TH_LOG("Unreachable!");
338 	}
339 }
340 
341 /* Note! This doesn't test no new privs behavior */
342 TEST(no_new_privs_support)
343 {
344 	long ret;
345 
346 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
347 	EXPECT_EQ(0, ret) {
348 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
349 	}
350 }
351 
352 /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
353 TEST(mode_filter_support)
354 {
355 	long ret;
356 
357 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
358 	ASSERT_EQ(0, ret) {
359 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
360 	}
361 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
362 	EXPECT_EQ(-1, ret);
363 	EXPECT_EQ(EFAULT, errno) {
364 		TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
365 	}
366 }
367 
368 TEST(mode_filter_without_nnp)
369 {
370 	struct sock_filter filter[] = {
371 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
372 	};
373 	struct sock_fprog prog = {
374 		.len = (unsigned short)ARRAY_SIZE(filter),
375 		.filter = filter,
376 	};
377 	long ret;
378 
379 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
380 	ASSERT_LE(0, ret) {
381 		TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
382 	}
383 	errno = 0;
384 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
385 	/* Succeeds with CAP_SYS_ADMIN, fails without */
386 	/* TODO(wad) check caps not euid */
387 	if (geteuid()) {
388 		EXPECT_EQ(-1, ret);
389 		EXPECT_EQ(EACCES, errno);
390 	} else {
391 		EXPECT_EQ(0, ret);
392 	}
393 }
394 
/* Upper bound on how many filters filter_chain_limits tries to stack. */
#define MAX_INSNS_PER_PATH (32 * 1024)
396 
397 TEST(filter_size_limits)
398 {
399 	int i;
400 	int count = BPF_MAXINSNS + 1;
401 	struct sock_filter allow[] = {
402 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
403 	};
404 	struct sock_filter *filter;
405 	struct sock_fprog prog = { };
406 	long ret;
407 
408 	filter = calloc(count, sizeof(*filter));
409 	ASSERT_NE(NULL, filter);
410 
411 	for (i = 0; i < count; i++)
412 		filter[i] = allow[0];
413 
414 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
415 	ASSERT_EQ(0, ret);
416 
417 	prog.filter = filter;
418 	prog.len = count;
419 
420 	/* Too many filter instructions in a single filter. */
421 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
422 	ASSERT_NE(0, ret) {
423 		TH_LOG("Installing %d insn filter was allowed", prog.len);
424 	}
425 
426 	/* One less is okay, though. */
427 	prog.len -= 1;
428 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
429 	ASSERT_EQ(0, ret) {
430 		TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
431 	}
432 }
433 
434 TEST(filter_chain_limits)
435 {
436 	int i;
437 	int count = BPF_MAXINSNS;
438 	struct sock_filter allow[] = {
439 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
440 	};
441 	struct sock_filter *filter;
442 	struct sock_fprog prog = { };
443 	long ret;
444 
445 	filter = calloc(count, sizeof(*filter));
446 	ASSERT_NE(NULL, filter);
447 
448 	for (i = 0; i < count; i++)
449 		filter[i] = allow[0];
450 
451 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
452 	ASSERT_EQ(0, ret);
453 
454 	prog.filter = filter;
455 	prog.len = 1;
456 
457 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
458 	ASSERT_EQ(0, ret);
459 
460 	prog.len = count;
461 
462 	/* Too many total filter instructions. */
463 	for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
464 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
465 		if (ret != 0)
466 			break;
467 	}
468 	ASSERT_NE(0, ret) {
469 		TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
470 		       i, count, i * (count + 4));
471 	}
472 }
473 
/* Once a filter is installed, a switch to strict mode must fail with EINVAL. */
TEST(mode_filter_cannot_move_to_strict)
{
	struct sock_filter allow_all[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog fprog = {
		.filter = allow_all,
		.len = (unsigned short)ARRAY_SIZE(allow_all),
	};
	long ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog, 0, 0);
	ASSERT_EQ(0, ret);

	/* Filter mode is active; the strict-mode request should be refused. */
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(EINVAL, errno);
}
495 
496 
/* PR_GET_SECCOMP must report 0 before a filter is installed and 2 after. */
TEST(mode_filter_get_seccomp)
{
	struct sock_filter allow_all[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog fprog = {
		.filter = allow_all,
		.len = (unsigned short)ARRAY_SIZE(allow_all),
	};
	long ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	ASSERT_EQ(0, ret);

	/* No seccomp mode yet. */
	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
	EXPECT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog, 0, 0);
	ASSERT_EQ(0, ret);

	/* Now expected to read back as 2. */
	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
	EXPECT_EQ(2, ret);
}
520 
521 
/* A single-instruction filter returning SECCOMP_RET_ALLOW must install. */
TEST(ALLOW_all)
{
	struct sock_filter allow_all[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog fprog = {
		.filter = allow_all,
		.len = (unsigned short)ARRAY_SIZE(allow_all),
	};
	long ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
	ASSERT_EQ(0, ret);
}
539 
/* A filter program with zero instructions must be rejected with EINVAL. */
TEST(empty_prog)
{
	struct sock_filter no_insns[] = {
	};
	struct sock_fprog fprog = {
		.filter = no_insns,
		.len = (unsigned short)ARRAY_SIZE(no_insns),
	};
	long ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(EINVAL, errno);
}
557 
/* With every syscall returning SECCOMP_RET_LOG, getppid() must still work. */
TEST(log_all)
{
	struct sock_filter log_everything[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
	};
	struct sock_fprog fprog = {
		.filter = log_everything,
		.len = (unsigned short)ARRAY_SIZE(log_everything),
	};
	/* Recorded before the filter goes in, as the reference value. */
	pid_t parent = getppid();
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
	ASSERT_EQ(0, ret);

	/* getppid() should succeed and be logged (no check for logging) */
	EXPECT_EQ(parent, syscall(__NR_getppid));
}
579 
580 TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
581 {
582 	struct sock_filter filter[] = {
583 		BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
584 	};
585 	struct sock_fprog prog = {
586 		.len = (unsigned short)ARRAY_SIZE(filter),
587 		.filter = filter,
588 	};
589 	long ret;
590 
591 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
592 	ASSERT_EQ(0, ret);
593 
594 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
595 	ASSERT_EQ(0, ret);
596 	EXPECT_EQ(0, syscall(__NR_getpid)) {
597 		TH_LOG("getpid() shouldn't ever return");
598 	}
599 }
600 
601 /* return code >= 0x80000000 is unused. */
602 TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
603 {
604 	struct sock_filter filter[] = {
605 		BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
606 	};
607 	struct sock_fprog prog = {
608 		.len = (unsigned short)ARRAY_SIZE(filter),
609 		.filter = filter,
610 	};
611 	long ret;
612 
613 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
614 	ASSERT_EQ(0, ret);
615 
616 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
617 	ASSERT_EQ(0, ret);
618 	EXPECT_EQ(0, syscall(__NR_getpid)) {
619 		TH_LOG("getpid() shouldn't ever return");
620 	}
621 }
622 
623 TEST_SIGNAL(KILL_all, SIGSYS)
624 {
625 	struct sock_filter filter[] = {
626 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
627 	};
628 	struct sock_fprog prog = {
629 		.len = (unsigned short)ARRAY_SIZE(filter),
630 		.filter = filter,
631 	};
632 	long ret;
633 
634 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
635 	ASSERT_EQ(0, ret);
636 
637 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
638 	ASSERT_EQ(0, ret);
639 }
640 
641 TEST_SIGNAL(KILL_one, SIGSYS)
642 {
643 	struct sock_filter filter[] = {
644 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
645 			offsetof(struct seccomp_data, nr)),
646 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
647 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
648 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
649 	};
650 	struct sock_fprog prog = {
651 		.len = (unsigned short)ARRAY_SIZE(filter),
652 		.filter = filter,
653 	};
654 	long ret;
655 	pid_t parent = getppid();
656 
657 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
658 	ASSERT_EQ(0, ret);
659 
660 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
661 	ASSERT_EQ(0, ret);
662 
663 	EXPECT_EQ(parent, syscall(__NR_getppid));
664 	/* getpid() should never return. */
665 	EXPECT_EQ(0, syscall(__NR_getpid));
666 }
667 
668 TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
669 {
670 	void *fatal_address;
671 	struct sock_filter filter[] = {
672 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
673 			offsetof(struct seccomp_data, nr)),
674 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
675 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
676 		/* Only both with lower 32-bit for now. */
677 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
678 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
679 			(unsigned long)&fatal_address, 0, 1),
680 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
681 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
682 	};
683 	struct sock_fprog prog = {
684 		.len = (unsigned short)ARRAY_SIZE(filter),
685 		.filter = filter,
686 	};
687 	long ret;
688 	pid_t parent = getppid();
689 	struct tms timebuf;
690 	clock_t clock = times(&timebuf);
691 
692 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
693 	ASSERT_EQ(0, ret);
694 
695 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
696 	ASSERT_EQ(0, ret);
697 
698 	EXPECT_EQ(parent, syscall(__NR_getppid));
699 	EXPECT_LE(clock, syscall(__NR_times, &timebuf));
700 	/* times() should never return. */
701 	EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
702 }
703 
704 TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
705 {
706 #ifndef __NR_mmap2
707 	int sysno = __NR_mmap;
708 #else
709 	int sysno = __NR_mmap2;
710 #endif
711 	struct sock_filter filter[] = {
712 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
713 			offsetof(struct seccomp_data, nr)),
714 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
715 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
716 		/* Only both with lower 32-bit for now. */
717 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
718 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
719 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
720 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
721 	};
722 	struct sock_fprog prog = {
723 		.len = (unsigned short)ARRAY_SIZE(filter),
724 		.filter = filter,
725 	};
726 	long ret;
727 	pid_t parent = getppid();
728 	int fd;
729 	void *map1, *map2;
730 	int page_size = sysconf(_SC_PAGESIZE);
731 
732 	ASSERT_LT(0, page_size);
733 
734 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
735 	ASSERT_EQ(0, ret);
736 
737 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
738 	ASSERT_EQ(0, ret);
739 
740 	fd = open("/dev/zero", O_RDONLY);
741 	ASSERT_NE(-1, fd);
742 
743 	EXPECT_EQ(parent, syscall(__NR_getppid));
744 	map1 = (void *)syscall(sysno,
745 		NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
746 	EXPECT_NE(MAP_FAILED, map1);
747 	/* mmap2() should never return. */
748 	map2 = (void *)syscall(sysno,
749 		 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
750 	EXPECT_EQ(MAP_FAILED, map2);
751 
752 	/* The test failed, so clean up the resources. */
753 	munmap(map1, page_size);
754 	munmap(map2, page_size);
755 	close(fd);
756 }
757 
758 /* This is a thread task to die via seccomp filter violation. */
759 void *kill_thread(void *data)
760 {
761 	bool die = (bool)data;
762 
763 	if (die) {
764 		prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
765 		return (void *)SIBLING_EXIT_FAILURE;
766 	}
767 
768 	return (void *)SIBLING_EXIT_UNKILLED;
769 }
770 
771 /* Prepare a thread that will kill itself or both of us. */
772 void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
773 {
774 	pthread_t thread;
775 	void *status;
776 	/* Kill only when calling __NR_prctl. */
777 	struct sock_filter filter_thread[] = {
778 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
779 			offsetof(struct seccomp_data, nr)),
780 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
781 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
782 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
783 	};
784 	struct sock_fprog prog_thread = {
785 		.len = (unsigned short)ARRAY_SIZE(filter_thread),
786 		.filter = filter_thread,
787 	};
788 	struct sock_filter filter_process[] = {
789 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
790 			offsetof(struct seccomp_data, nr)),
791 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
792 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS),
793 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
794 	};
795 	struct sock_fprog prog_process = {
796 		.len = (unsigned short)ARRAY_SIZE(filter_process),
797 		.filter = filter_process,
798 	};
799 
800 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
801 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
802 	}
803 
804 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
805 			     kill_process ? &prog_process : &prog_thread));
806 
807 	/*
808 	 * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
809 	 * flag cannot be downgraded by a new filter.
810 	 */
811 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
812 
813 	/* Start a thread that will exit immediately. */
814 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
815 	ASSERT_EQ(0, pthread_join(thread, &status));
816 	ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
817 
818 	/* Start a thread that will die immediately. */
819 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
820 	ASSERT_EQ(0, pthread_join(thread, &status));
821 	ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
822 
823 	/*
824 	 * If we get here, only the spawned thread died. Let the parent know
825 	 * the whole process didn't die (i.e. this thread, the spawner,
826 	 * stayed running).
827 	 */
828 	exit(42);
829 }
830 
831 TEST(KILL_thread)
832 {
833 	int status;
834 	pid_t child_pid;
835 
836 	child_pid = fork();
837 	ASSERT_LE(0, child_pid);
838 	if (child_pid == 0) {
839 		kill_thread_or_group(_metadata, false);
840 		_exit(38);
841 	}
842 
843 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
844 
845 	/* If only the thread was killed, we'll see exit 42. */
846 	ASSERT_TRUE(WIFEXITED(status));
847 	ASSERT_EQ(42, WEXITSTATUS(status));
848 }
849 
850 TEST(KILL_process)
851 {
852 	int status;
853 	pid_t child_pid;
854 
855 	child_pid = fork();
856 	ASSERT_LE(0, child_pid);
857 	if (child_pid == 0) {
858 		kill_thread_or_group(_metadata, true);
859 		_exit(38);
860 	}
861 
862 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
863 
864 	/* If the entire process was killed, we'll see SIGSYS. */
865 	ASSERT_TRUE(WIFSIGNALED(status));
866 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
867 }
868 
/*
 * A filter that loads from syscall_arg(6), one slot past args[5], must be
 * rejected with EINVAL.
 *
 * TODO(wad) add 64-bit versus 32-bit arg tests.
 */
TEST(arg_out_of_range)
{
	struct sock_filter past_last_arg[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog fprog = {
		.filter = past_last_arg,
		.len = (unsigned short)ARRAY_SIZE(past_last_arg),
	};
	long ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(EINVAL, errno);
}
889 
/*
 * Declare _read_filter_<name> and prog_<name>: a filter program that
 * returns SECCOMP_RET_ERRNO | _err for read() and allows everything else.
 */
#define ERRNO_FILTER(name, _err)					\
	struct sock_filter _read_filter_##name[] = {			\
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,				\
			offsetof(struct seccomp_data, nr)),		\
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),	\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | _err),	\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),		\
	};								\
	struct sock_fprog prog_##name = {				\
		.filter = _read_filter_##name,				\
		.len = (unsigned short)ARRAY_SIZE(_read_filter_##name),	\
	}
902 
903 /* Make sure basic errno values are correctly passed through a filter. */
904 TEST(ERRNO_valid)
905 {
906 	ERRNO_FILTER(valid, E2BIG);
907 	long ret;
908 	pid_t parent = getppid();
909 
910 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
911 	ASSERT_EQ(0, ret);
912 
913 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
914 	ASSERT_EQ(0, ret);
915 
916 	EXPECT_EQ(parent, syscall(__NR_getppid));
917 	EXPECT_EQ(-1, read(0, NULL, 0));
918 	EXPECT_EQ(E2BIG, errno);
919 }
920 
921 /* Make sure an errno of zero is correctly handled by the arch code. */
922 TEST(ERRNO_zero)
923 {
924 	ERRNO_FILTER(zero, 0);
925 	long ret;
926 	pid_t parent = getppid();
927 
928 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
929 	ASSERT_EQ(0, ret);
930 
931 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
932 	ASSERT_EQ(0, ret);
933 
934 	EXPECT_EQ(parent, syscall(__NR_getppid));
935 	/* "errno" of 0 is ok. */
936 	EXPECT_EQ(0, read(0, NULL, 0));
937 }
938 
939 /*
940  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
941  * This tests that the errno value gets capped correctly, fixed by
942  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
943  */
944 TEST(ERRNO_capped)
945 {
946 	ERRNO_FILTER(capped, 4096);
947 	long ret;
948 	pid_t parent = getppid();
949 
950 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
951 	ASSERT_EQ(0, ret);
952 
953 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
954 	ASSERT_EQ(0, ret);
955 
956 	EXPECT_EQ(parent, syscall(__NR_getppid));
957 	EXPECT_EQ(-1, read(0, NULL, 0));
958 	EXPECT_EQ(4095, errno);
959 }
960 
961 /*
962  * Filters are processed in reverse order: last applied is executed first.
963  * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
964  * SECCOMP_RET_DATA mask results will follow the most recently applied
965  * matching filter return (and not the lowest or highest value).
966  */
967 TEST(ERRNO_order)
968 {
969 	ERRNO_FILTER(first,  11);
970 	ERRNO_FILTER(second, 13);
971 	ERRNO_FILTER(third,  12);
972 	long ret;
973 	pid_t parent = getppid();
974 
975 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
976 	ASSERT_EQ(0, ret);
977 
978 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
979 	ASSERT_EQ(0, ret);
980 
981 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
982 	ASSERT_EQ(0, ret);
983 
984 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
985 	ASSERT_EQ(0, ret);
986 
987 	EXPECT_EQ(parent, syscall(__NR_getppid));
988 	EXPECT_EQ(-1, read(0, NULL, 0));
989 	EXPECT_EQ(12, errno);
990 }
991 
992 FIXTURE(TRAP) {
993 	struct sock_fprog prog;
994 };
995 
996 FIXTURE_SETUP(TRAP)
997 {
998 	struct sock_filter filter[] = {
999 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1000 			offsetof(struct seccomp_data, nr)),
1001 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1002 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1003 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1004 	};
1005 
1006 	memset(&self->prog, 0, sizeof(self->prog));
1007 	self->prog.filter = malloc(sizeof(filter));
1008 	ASSERT_NE(NULL, self->prog.filter);
1009 	memcpy(self->prog.filter, filter, sizeof(filter));
1010 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1011 }
1012 
1013 FIXTURE_TEARDOWN(TRAP)
1014 {
1015 	if (self->prog.filter)
1016 		free(self->prog.filter);
1017 }
1018 
1019 TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
1020 {
1021 	long ret;
1022 
1023 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1024 	ASSERT_EQ(0, ret);
1025 
1026 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1027 	ASSERT_EQ(0, ret);
1028 	syscall(__NR_getpid);
1029 }
1030 
1031 /* Ensure that SIGSYS overrides SIG_IGN */
1032 TEST_F_SIGNAL(TRAP, ign, SIGSYS)
1033 {
1034 	long ret;
1035 
1036 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1037 	ASSERT_EQ(0, ret);
1038 
1039 	signal(SIGSYS, SIG_IGN);
1040 
1041 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1042 	ASSERT_EQ(0, ret);
1043 	syscall(__NR_getpid);
1044 }
1045 
1046 static siginfo_t TRAP_info;
1047 static volatile int TRAP_nr;
1048 static void TRAP_action(int nr, siginfo_t *info, void *void_context)
1049 {
1050 	memcpy(&TRAP_info, info, sizeof(TRAP_info));
1051 	TRAP_nr = nr;
1052 }
1053 
1054 TEST_F(TRAP, handler)
1055 {
1056 	int ret, test;
1057 	struct sigaction act;
1058 	sigset_t mask;
1059 
1060 	memset(&act, 0, sizeof(act));
1061 	sigemptyset(&mask);
1062 	sigaddset(&mask, SIGSYS);
1063 
1064 	act.sa_sigaction = &TRAP_action;
1065 	act.sa_flags = SA_SIGINFO;
1066 	ret = sigaction(SIGSYS, &act, NULL);
1067 	ASSERT_EQ(0, ret) {
1068 		TH_LOG("sigaction failed");
1069 	}
1070 	ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
1071 	ASSERT_EQ(0, ret) {
1072 		TH_LOG("sigprocmask failed");
1073 	}
1074 
1075 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1076 	ASSERT_EQ(0, ret);
1077 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1078 	ASSERT_EQ(0, ret);
1079 	TRAP_nr = 0;
1080 	memset(&TRAP_info, 0, sizeof(TRAP_info));
1081 	/* Expect the registers to be rolled back. (nr = error) may vary
1082 	 * based on arch. */
1083 	ret = syscall(__NR_getpid);
1084 	/* Silence gcc warning about volatile. */
1085 	test = TRAP_nr;
1086 	EXPECT_EQ(SIGSYS, test);
1087 	struct local_sigsys {
1088 		void *_call_addr;	/* calling user insn */
1089 		int _syscall;		/* triggering system call number */
1090 		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
1091 	} *sigsys = (struct local_sigsys *)
1092 #ifdef si_syscall
1093 		&(TRAP_info.si_call_addr);
1094 #else
1095 		&TRAP_info.si_pid;
1096 #endif
1097 	EXPECT_EQ(__NR_getpid, sigsys->_syscall);
1098 	/* Make sure arch is non-zero. */
1099 	EXPECT_NE(0, sigsys->_arch);
1100 	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
1101 }
1102 
1103 FIXTURE(precedence) {
1104 	struct sock_fprog allow;
1105 	struct sock_fprog log;
1106 	struct sock_fprog trace;
1107 	struct sock_fprog error;
1108 	struct sock_fprog trap;
1109 	struct sock_fprog kill;
1110 };
1111 
1112 FIXTURE_SETUP(precedence)
1113 {
1114 	struct sock_filter allow_insns[] = {
1115 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1116 	};
1117 	struct sock_filter log_insns[] = {
1118 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1119 			offsetof(struct seccomp_data, nr)),
1120 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1121 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1122 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1123 	};
1124 	struct sock_filter trace_insns[] = {
1125 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1126 			offsetof(struct seccomp_data, nr)),
1127 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1128 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1129 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1130 	};
1131 	struct sock_filter error_insns[] = {
1132 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1133 			offsetof(struct seccomp_data, nr)),
1134 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1135 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1136 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1137 	};
1138 	struct sock_filter trap_insns[] = {
1139 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1140 			offsetof(struct seccomp_data, nr)),
1141 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1142 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1143 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1144 	};
1145 	struct sock_filter kill_insns[] = {
1146 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1147 			offsetof(struct seccomp_data, nr)),
1148 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1149 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1150 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1151 	};
1152 
1153 	memset(self, 0, sizeof(*self));
1154 #define FILTER_ALLOC(_x) \
1155 	self->_x.filter = malloc(sizeof(_x##_insns)); \
1156 	ASSERT_NE(NULL, self->_x.filter); \
1157 	memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1158 	self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1159 	FILTER_ALLOC(allow);
1160 	FILTER_ALLOC(log);
1161 	FILTER_ALLOC(trace);
1162 	FILTER_ALLOC(error);
1163 	FILTER_ALLOC(trap);
1164 	FILTER_ALLOC(kill);
1165 }
1166 
1167 FIXTURE_TEARDOWN(precedence)
1168 {
1169 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1170 	FILTER_FREE(allow);
1171 	FILTER_FREE(log);
1172 	FILTER_FREE(trace);
1173 	FILTER_FREE(error);
1174 	FILTER_FREE(trap);
1175 	FILTER_FREE(kill);
1176 }
1177 
1178 TEST_F(precedence, allow_ok)
1179 {
1180 	pid_t parent, res = 0;
1181 	long ret;
1182 
1183 	parent = getppid();
1184 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1185 	ASSERT_EQ(0, ret);
1186 
1187 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1188 	ASSERT_EQ(0, ret);
1189 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1190 	ASSERT_EQ(0, ret);
1191 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1192 	ASSERT_EQ(0, ret);
1193 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1194 	ASSERT_EQ(0, ret);
1195 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1196 	ASSERT_EQ(0, ret);
1197 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1198 	ASSERT_EQ(0, ret);
1199 	/* Should work just fine. */
1200 	res = syscall(__NR_getppid);
1201 	EXPECT_EQ(parent, res);
1202 }
1203 
1204 TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1205 {
1206 	pid_t parent, res = 0;
1207 	long ret;
1208 
1209 	parent = getppid();
1210 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1211 	ASSERT_EQ(0, ret);
1212 
1213 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1214 	ASSERT_EQ(0, ret);
1215 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1216 	ASSERT_EQ(0, ret);
1217 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1218 	ASSERT_EQ(0, ret);
1219 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1220 	ASSERT_EQ(0, ret);
1221 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1222 	ASSERT_EQ(0, ret);
1223 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1224 	ASSERT_EQ(0, ret);
1225 	/* Should work just fine. */
1226 	res = syscall(__NR_getppid);
1227 	EXPECT_EQ(parent, res);
1228 	/* getpid() should never return. */
1229 	res = syscall(__NR_getpid);
1230 	EXPECT_EQ(0, res);
1231 }
1232 
1233 TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1234 {
1235 	pid_t parent;
1236 	long ret;
1237 
1238 	parent = getppid();
1239 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1240 	ASSERT_EQ(0, ret);
1241 
1242 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1243 	ASSERT_EQ(0, ret);
1244 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1245 	ASSERT_EQ(0, ret);
1246 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1247 	ASSERT_EQ(0, ret);
1248 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1249 	ASSERT_EQ(0, ret);
1250 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1251 	ASSERT_EQ(0, ret);
1252 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1253 	ASSERT_EQ(0, ret);
1254 	/* Should work just fine. */
1255 	EXPECT_EQ(parent, syscall(__NR_getppid));
1256 	/* getpid() should never return. */
1257 	EXPECT_EQ(0, syscall(__NR_getpid));
1258 }
1259 
1260 TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1261 {
1262 	pid_t parent;
1263 	long ret;
1264 
1265 	parent = getppid();
1266 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1267 	ASSERT_EQ(0, ret);
1268 
1269 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1270 	ASSERT_EQ(0, ret);
1271 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1272 	ASSERT_EQ(0, ret);
1273 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1274 	ASSERT_EQ(0, ret);
1275 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1276 	ASSERT_EQ(0, ret);
1277 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1278 	ASSERT_EQ(0, ret);
1279 	/* Should work just fine. */
1280 	EXPECT_EQ(parent, syscall(__NR_getppid));
1281 	/* getpid() should never return. */
1282 	EXPECT_EQ(0, syscall(__NR_getpid));
1283 }
1284 
1285 TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1286 {
1287 	pid_t parent;
1288 	long ret;
1289 
1290 	parent = getppid();
1291 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1292 	ASSERT_EQ(0, ret);
1293 
1294 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1295 	ASSERT_EQ(0, ret);
1296 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1297 	ASSERT_EQ(0, ret);
1298 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1299 	ASSERT_EQ(0, ret);
1300 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1301 	ASSERT_EQ(0, ret);
1302 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1303 	ASSERT_EQ(0, ret);
1304 	/* Should work just fine. */
1305 	EXPECT_EQ(parent, syscall(__NR_getppid));
1306 	/* getpid() should never return. */
1307 	EXPECT_EQ(0, syscall(__NR_getpid));
1308 }
1309 
1310 TEST_F(precedence, errno_is_third)
1311 {
1312 	pid_t parent;
1313 	long ret;
1314 
1315 	parent = getppid();
1316 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1317 	ASSERT_EQ(0, ret);
1318 
1319 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1320 	ASSERT_EQ(0, ret);
1321 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1322 	ASSERT_EQ(0, ret);
1323 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1324 	ASSERT_EQ(0, ret);
1325 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1326 	ASSERT_EQ(0, ret);
1327 	/* Should work just fine. */
1328 	EXPECT_EQ(parent, syscall(__NR_getppid));
1329 	EXPECT_EQ(0, syscall(__NR_getpid));
1330 }
1331 
1332 TEST_F(precedence, errno_is_third_in_any_order)
1333 {
1334 	pid_t parent;
1335 	long ret;
1336 
1337 	parent = getppid();
1338 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1339 	ASSERT_EQ(0, ret);
1340 
1341 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1342 	ASSERT_EQ(0, ret);
1343 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1344 	ASSERT_EQ(0, ret);
1345 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1346 	ASSERT_EQ(0, ret);
1347 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1348 	ASSERT_EQ(0, ret);
1349 	/* Should work just fine. */
1350 	EXPECT_EQ(parent, syscall(__NR_getppid));
1351 	EXPECT_EQ(0, syscall(__NR_getpid));
1352 }
1353 
1354 TEST_F(precedence, trace_is_fourth)
1355 {
1356 	pid_t parent;
1357 	long ret;
1358 
1359 	parent = getppid();
1360 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1361 	ASSERT_EQ(0, ret);
1362 
1363 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1364 	ASSERT_EQ(0, ret);
1365 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1366 	ASSERT_EQ(0, ret);
1367 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1368 	ASSERT_EQ(0, ret);
1369 	/* Should work just fine. */
1370 	EXPECT_EQ(parent, syscall(__NR_getppid));
1371 	/* No ptracer */
1372 	EXPECT_EQ(-1, syscall(__NR_getpid));
1373 }
1374 
1375 TEST_F(precedence, trace_is_fourth_in_any_order)
1376 {
1377 	pid_t parent;
1378 	long ret;
1379 
1380 	parent = getppid();
1381 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1382 	ASSERT_EQ(0, ret);
1383 
1384 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1385 	ASSERT_EQ(0, ret);
1386 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1387 	ASSERT_EQ(0, ret);
1388 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1389 	ASSERT_EQ(0, ret);
1390 	/* Should work just fine. */
1391 	EXPECT_EQ(parent, syscall(__NR_getppid));
1392 	/* No ptracer */
1393 	EXPECT_EQ(-1, syscall(__NR_getpid));
1394 }
1395 
1396 TEST_F(precedence, log_is_fifth)
1397 {
1398 	pid_t mypid, parent;
1399 	long ret;
1400 
1401 	mypid = getpid();
1402 	parent = getppid();
1403 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1404 	ASSERT_EQ(0, ret);
1405 
1406 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1407 	ASSERT_EQ(0, ret);
1408 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1409 	ASSERT_EQ(0, ret);
1410 	/* Should work just fine. */
1411 	EXPECT_EQ(parent, syscall(__NR_getppid));
1412 	/* Should also work just fine */
1413 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1414 }
1415 
1416 TEST_F(precedence, log_is_fifth_in_any_order)
1417 {
1418 	pid_t mypid, parent;
1419 	long ret;
1420 
1421 	mypid = getpid();
1422 	parent = getppid();
1423 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1424 	ASSERT_EQ(0, ret);
1425 
1426 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1427 	ASSERT_EQ(0, ret);
1428 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1429 	ASSERT_EQ(0, ret);
1430 	/* Should work just fine. */
1431 	EXPECT_EQ(parent, syscall(__NR_getppid));
1432 	/* Should also work just fine */
1433 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1434 }
1435 
/* Fallback for headers that predate PTRACE_O_TRACESECCOMP. */
#ifndef PTRACE_O_TRACESECCOMP
#define PTRACE_O_TRACESECCOMP	0x00000080
#endif

/* Catch the Ubuntu 12.04 value error. */
#if PTRACE_EVENT_SECCOMP != 7
#undef PTRACE_EVENT_SECCOMP
#endif

#ifndef PTRACE_EVENT_SECCOMP
#define PTRACE_EVENT_SECCOMP 7
#endif

/*
 * True when the bits of a wait() status above bit 15 equal
 * PTRACE_EVENT_SECCOMP. The argument is parenthesized so that a compound
 * expression (e.g. "a | b") is shifted as a whole.
 */
#define IS_SECCOMP_EVENT(status) (((status) >> 16) == PTRACE_EVENT_SECCOMP)
/*
 * Run flag for the tracer loop in start_tracer(). It is written from the
 * SIGUSR1 handler below and polled from normal context, so it is declared
 * volatile sig_atomic_t, the only object type a handler may portably store
 * to.
 */
volatile sig_atomic_t tracer_running;

/* SIGUSR1 handler: ask the tracer loop to stop. @sig is unused. */
void tracer_stop(int sig)
{
	(void)sig;
	tracer_running = false;
}
1455 
/*
 * Callback type invoked by start_tracer() each time the tracee stops: it
 * receives the harness metadata, the tracee's pid, the wait() status of the
 * stop, and the opaque args pointer that was handed to start_tracer().
 */
typedef void tracer_func_t(struct __test_metadata *_metadata,
			   pid_t tracee, int status, void *args);
1458 
/*
 * Body of the tracer process: attach to @tracee, signal readiness through
 * @fd, then service tracee stops until told to quit.
 *
 * @_metadata:      harness state; its ->passed field becomes the exit code
 * @fd:             write end of the sync pipe; one byte is written and the
 *                  descriptor closed once the tracee has been resumed
 * @tracee:         pid to attach to
 * @tracer_func:    callback run on every stop of the tracee
 * @args:           opaque pointer forwarded to @tracer_func
 * @ptrace_syscall: true  -> PTRACE_O_TRACESYSGOOD + PTRACE_SYSCALL stops
 *                  false -> PTRACE_O_TRACESECCOMP + PTRACE_CONT
 *
 * Returns to the caller only when the tracee has exited or been killed;
 * when stopped via SIGUSR1 it exits the process directly.
 */
void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
	    tracer_func_t tracer_func, void *args, bool ptrace_syscall)
{
	int ret = -1;
	struct sigaction action = {
		.sa_handler = tracer_stop,
	};

	/* Allow external shutdown. */
	tracer_running = true;
	ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));

	/* Retry the attach until it succeeds or fails with EINVAL. */
	errno = 0;
	while (ret == -1 && errno != EINVAL)
		ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
	ASSERT_EQ(0, ret) {
		kill(tracee, SIGKILL);
	}
	/* Wait for attach stop */
	wait(NULL);

	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
						      PTRACE_O_TRACESYSGOOD :
						      PTRACE_O_TRACESECCOMP);
	/*
	 * NOTE(review): the message below names PTRACE_O_TRACESECCOMP even
	 * when PTRACE_O_TRACESYSGOOD was the option requested.
	 */
	ASSERT_EQ(0, ret) {
		TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
		kill(tracee, SIGKILL);
	}
	/* Resume the tracee from its attach stop. */
	ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
		     tracee, NULL, 0);
	ASSERT_EQ(0, ret);

	/* Unblock the tracee */
	ASSERT_EQ(1, write(fd, "A", 1));
	ASSERT_EQ(0, close(fd));

	/* Run until we're shut down. Must assert to stop execution. */
	while (tracer_running) {
		int status;

		/* Anything other than a tracee stop re-checks the run flag. */
		if (wait(&status) != tracee)
			continue;
		if (WIFSIGNALED(status) || WIFEXITED(status))
			/* Child is dead. Time to go. */
			return;

		/* Check if this is a seccomp event. */
		ASSERT_EQ(!ptrace_syscall, IS_SECCOMP_EVENT(status));

		tracer_func(_metadata, tracee, status, args);

		/* Let the tracee run to its next stop. */
		ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
			     tracee, NULL, 0);
		ASSERT_EQ(0, ret);
	}
	/* Directly report the status of our test harness results. */
	syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
}
1517 
1518 /* Common tracer setup/teardown functions. */
/* Deliberately empty signal handler; @num is ignored. */
void cont_handler(int num)
{
	(void)num;
}
/*
 * Fork a tracer process for the calling test process and wait until it
 * reports readiness over a pipe.
 *
 * @_metadata:      harness state, forwarded to start_tracer()
 * @func:           per-stop callback the tracer will run
 * @args:           opaque pointer forwarded to @func
 * @ptrace_syscall: forwarded to start_tracer()
 *
 * Returns the pid of the tracer process.
 */
pid_t setup_trace_fixture(struct __test_metadata *_metadata,
			  tracer_func_t func, void *args, bool ptrace_syscall)
{
	char sync;
	int pipefd[2];
	pid_t tracer_pid;
	pid_t tracee = getpid();

	/* Setup a pipe for clean synchronization. */
	ASSERT_EQ(0, pipe(pipefd));

	/* Fork a child which we'll promote to tracer */
	tracer_pid = fork();
	ASSERT_LE(0, tracer_pid);
	/* Runs after the fork, so both processes install the handler. */
	signal(SIGALRM, cont_handler);
	if (tracer_pid == 0) {
		/* Tracer side: keep only the write end of the pipe. */
		close(pipefd[0]);
		start_tracer(_metadata, pipefd[1], tracee, func, args,
			     ptrace_syscall);
		/* start_tracer() returned: leave without running the test. */
		syscall(__NR_exit, 0);
	}
	/* Tracee side: keep only the read end of the pipe. */
	close(pipefd[1]);
	/* Return value is not checked. */
	prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
	/*
	 * Block until the tracer writes its sync byte (or the pipe closes).
	 * NOTE(review): the result of read() is not checked.
	 */
	read(pipefd[0], &sync, 1);
	close(pipefd[0]);

	return tracer_pid;
}
1549 
1550 void teardown_trace_fixture(struct __test_metadata *_metadata,
1551 			    pid_t tracer)
1552 {
1553 	if (tracer) {
1554 		int status;
1555 		/*
1556 		 * Extract the exit code from the other process and
1557 		 * adopt it for ourselves in case its asserts failed.
1558 		 */
1559 		ASSERT_EQ(0, kill(tracer, SIGUSR1));
1560 		ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1561 		if (WEXITSTATUS(status))
1562 			_metadata->passed = 0;
1563 	}
1564 }
1565 
1566 /* "poke" tracer arguments and function. */
/* Argument block handed to tracer_poke() through the opaque args pointer. */
struct tracer_args_poke_t {
	/* Address in the tracee that tracer_poke() writes 0x1001 to. */
	unsigned long poke_addr;
};
1570 
1571 void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1572 		 void *args)
1573 {
1574 	int ret;
1575 	unsigned long msg;
1576 	struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1577 
1578 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1579 	EXPECT_EQ(0, ret);
1580 	/* If this fails, don't try to recover. */
1581 	ASSERT_EQ(0x1001, msg) {
1582 		kill(tracee, SIGKILL);
1583 	}
1584 	/*
1585 	 * Poke in the message.
1586 	 * Registers are not touched to try to keep this relatively arch
1587 	 * agnostic.
1588 	 */
1589 	ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1590 	EXPECT_EQ(0, ret);
1591 }
1592 
/* Fixture data for the TRACE_poke tests. */
FIXTURE(TRACE_poke) {
	struct sock_fprog prog;		/* filter that traces read() */
	pid_t tracer;			/* pid from setup_trace_fixture() */
	long poked;			/* word the tracer overwrites */
	struct tracer_args_poke_t tracer_args;	/* holds the address of poked */
};
1599 
1600 FIXTURE_SETUP(TRACE_poke)
1601 {
1602 	struct sock_filter filter[] = {
1603 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1604 			offsetof(struct seccomp_data, nr)),
1605 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1606 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1607 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1608 	};
1609 
1610 	self->poked = 0;
1611 	memset(&self->prog, 0, sizeof(self->prog));
1612 	self->prog.filter = malloc(sizeof(filter));
1613 	ASSERT_NE(NULL, self->prog.filter);
1614 	memcpy(self->prog.filter, filter, sizeof(filter));
1615 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1616 
1617 	/* Set up tracer args. */
1618 	self->tracer_args.poke_addr = (unsigned long)&self->poked;
1619 
1620 	/* Launch tracer. */
1621 	self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1622 					   &self->tracer_args, false);
1623 }
1624 
1625 FIXTURE_TEARDOWN(TRACE_poke)
1626 {
1627 	teardown_trace_fixture(_metadata, self->tracer);
1628 	if (self->prog.filter)
1629 		free(self->prog.filter);
1630 }
1631 
1632 TEST_F(TRACE_poke, read_has_side_effects)
1633 {
1634 	ssize_t ret;
1635 
1636 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1637 	ASSERT_EQ(0, ret);
1638 
1639 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1640 	ASSERT_EQ(0, ret);
1641 
1642 	EXPECT_EQ(0, self->poked);
1643 	ret = read(-1, NULL, 0);
1644 	EXPECT_EQ(-1, ret);
1645 	EXPECT_EQ(0x1001, self->poked);
1646 }
1647 
1648 TEST_F(TRACE_poke, getpid_runs_normally)
1649 {
1650 	long ret;
1651 
1652 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1653 	ASSERT_EQ(0, ret);
1654 
1655 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1656 	ASSERT_EQ(0, ret);
1657 
1658 	EXPECT_EQ(0, self->poked);
1659 	EXPECT_NE(0, syscall(__NR_getpid));
1660 	EXPECT_EQ(0, self->poked);
1661 }
1662 
/*
 * Per-architecture register access used by get_syscall()/change_syscall():
 *   ARCH_REGS    - type of the register set fetched from the tracee
 *   SYSCALL_NUM  - member holding the syscall number
 *   SYSCALL_RET  - member holding the syscall return value
 *   SYSCALL_NUM_RET_SHARE_REG - defined where the two members above are the
 *                  same register (s390, mips)
 * Unknown architectures fail the build.
 */
#if defined(__x86_64__)
# define ARCH_REGS	struct user_regs_struct
# define SYSCALL_NUM	orig_rax
# define SYSCALL_RET	rax
#elif defined(__i386__)
# define ARCH_REGS	struct user_regs_struct
# define SYSCALL_NUM	orig_eax
# define SYSCALL_RET	eax
#elif defined(__arm__)
# define ARCH_REGS	struct pt_regs
# define SYSCALL_NUM	ARM_r7
# define SYSCALL_RET	ARM_r0
#elif defined(__aarch64__)
# define ARCH_REGS	struct user_pt_regs
# define SYSCALL_NUM	regs[8]
# define SYSCALL_RET	regs[0]
#elif defined(__riscv) && __riscv_xlen == 64
# define ARCH_REGS	struct user_regs_struct
# define SYSCALL_NUM	a7
# define SYSCALL_RET	a0
#elif defined(__hppa__)
# define ARCH_REGS	struct user_regs_struct
# define SYSCALL_NUM	gr[20]
# define SYSCALL_RET	gr[28]
#elif defined(__powerpc__)
# define ARCH_REGS	struct pt_regs
# define SYSCALL_NUM	gpr[0]
# define SYSCALL_RET	gpr[3]
#elif defined(__s390__)
/* Number and return value live in the same register here. */
# define ARCH_REGS     s390_regs
# define SYSCALL_NUM   gprs[2]
# define SYSCALL_RET   gprs[2]
# define SYSCALL_NUM_RET_SHARE_REG
#elif defined(__mips__)
/*
 * SYSCALL_SYSCALL_NUM is consulted instead of SYSCALL_NUM when SYSCALL_NUM
 * equals __NR_O32_Linux (see get_syscall()/change_syscall()).
 */
# define ARCH_REGS	struct pt_regs
# define SYSCALL_NUM	regs[2]
# define SYSCALL_SYSCALL_NUM regs[4]
# define SYSCALL_RET	regs[2]
# define SYSCALL_NUM_RET_SHARE_REG
#else
# error "Do not know how to find your architecture's registers and syscalls"
#endif
1705 
/*
 * EXPECT_SYSCALL_RETURN(val, action): run @action and check its outcome.
 *
 * When the syscall return can't be changed (SYSCALL_NUM_RET_SHARE_REG), the
 * check is stubbed down to "action returned -1". Otherwise a negative @val
 * means "-1 with errno == -val" and any other @val is the expected return
 * value itself.
 */
#ifdef SYSCALL_NUM_RET_SHARE_REG
# define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
#else
# define EXPECT_SYSCALL_RETURN(val, action)		\
	do {						\
		errno = 0;				\
		/* (val) is parenthesized so compound arguments compare as a whole. */ \
		if ((val) < 0) {			\
			EXPECT_EQ(-1, action);		\
			EXPECT_EQ(-(val), errno);	\
		} else {				\
			EXPECT_EQ(val, action);		\
		}					\
	} while (0)
#endif
1721 
/*
 * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
 * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
 * When HAVE_GETREGS is not defined, get_syscall() and change_syscall() use
 * PTRACE_GETREGSET/PTRACE_SETREGSET with NT_PRSTATUS instead.
 */
#if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
#define HAVE_GETREGS
#endif
1728 
1729 /* Architecture-specific syscall fetching routine. */
1730 int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1731 {
1732 	ARCH_REGS regs;
1733 #ifdef HAVE_GETREGS
1734 	EXPECT_EQ(0, ptrace(PTRACE_GETREGS, tracee, 0, &regs)) {
1735 		TH_LOG("PTRACE_GETREGS failed");
1736 		return -1;
1737 	}
1738 #else
1739 	struct iovec iov;
1740 
1741 	iov.iov_base = &regs;
1742 	iov.iov_len = sizeof(regs);
1743 	EXPECT_EQ(0, ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov)) {
1744 		TH_LOG("PTRACE_GETREGSET failed");
1745 		return -1;
1746 	}
1747 #endif
1748 
1749 #if defined(__mips__)
1750 	if (regs.SYSCALL_NUM == __NR_O32_Linux)
1751 		return regs.SYSCALL_SYSCALL_NUM;
1752 #endif
1753 	return regs.SYSCALL_NUM;
1754 }
1755 
1756 /* Architecture-specific syscall changing routine. */
1757 void change_syscall(struct __test_metadata *_metadata,
1758 		    pid_t tracee, int syscall, int result)
1759 {
1760 	int ret;
1761 	ARCH_REGS regs;
1762 #ifdef HAVE_GETREGS
1763 	ret = ptrace(PTRACE_GETREGS, tracee, 0, &regs);
1764 #else
1765 	struct iovec iov;
1766 	iov.iov_base = &regs;
1767 	iov.iov_len = sizeof(regs);
1768 	ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov);
1769 #endif
1770 	EXPECT_EQ(0, ret) {}
1771 
1772 #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
1773 	defined(__s390__) || defined(__hppa__) || defined(__riscv)
1774 	{
1775 		regs.SYSCALL_NUM = syscall;
1776 	}
1777 #elif defined(__mips__)
1778 	{
1779 		if (regs.SYSCALL_NUM == __NR_O32_Linux)
1780 			regs.SYSCALL_SYSCALL_NUM = syscall;
1781 		else
1782 			regs.SYSCALL_NUM = syscall;
1783 	}
1784 
1785 #elif defined(__arm__)
1786 # ifndef PTRACE_SET_SYSCALL
1787 #  define PTRACE_SET_SYSCALL   23
1788 # endif
1789 	{
1790 		ret = ptrace(PTRACE_SET_SYSCALL, tracee, NULL, syscall);
1791 		EXPECT_EQ(0, ret);
1792 	}
1793 
1794 #elif defined(__aarch64__)
1795 # ifndef NT_ARM_SYSTEM_CALL
1796 #  define NT_ARM_SYSTEM_CALL 0x404
1797 # endif
1798 	{
1799 		iov.iov_base = &syscall;
1800 		iov.iov_len = sizeof(syscall);
1801 		ret = ptrace(PTRACE_SETREGSET, tracee, NT_ARM_SYSTEM_CALL,
1802 			     &iov);
1803 		EXPECT_EQ(0, ret);
1804 	}
1805 
1806 #else
1807 	ASSERT_EQ(1, 0) {
1808 		TH_LOG("How is the syscall changed on this architecture?");
1809 	}
1810 #endif
1811 
1812 	/* If syscall is skipped, change return value. */
1813 	if (syscall == -1)
1814 #ifdef SYSCALL_NUM_RET_SHARE_REG
1815 		TH_LOG("Can't modify syscall return on this architecture");
1816 #else
1817 		regs.SYSCALL_RET = result;
1818 #endif
1819 
1820 #ifdef HAVE_GETREGS
1821 	ret = ptrace(PTRACE_SETREGS, tracee, 0, &regs);
1822 #else
1823 	iov.iov_base = &regs;
1824 	iov.iov_len = sizeof(regs);
1825 	ret = ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &iov);
1826 #endif
1827 	EXPECT_EQ(0, ret);
1828 }
1829 
1830 void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
1831 		    int status, void *args)
1832 {
1833 	int ret;
1834 	unsigned long msg;
1835 
1836 	/* Make sure we got the right message. */
1837 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1838 	EXPECT_EQ(0, ret);
1839 
1840 	/* Validate and take action on expected syscalls. */
1841 	switch (msg) {
1842 	case 0x1002:
1843 		/* change getpid to getppid. */
1844 		EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
1845 		change_syscall(_metadata, tracee, __NR_getppid, 0);
1846 		break;
1847 	case 0x1003:
1848 		/* skip gettid with valid return code. */
1849 		EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
1850 		change_syscall(_metadata, tracee, -1, 45000);
1851 		break;
1852 	case 0x1004:
1853 		/* skip openat with error. */
1854 		EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
1855 		change_syscall(_metadata, tracee, -1, -ESRCH);
1856 		break;
1857 	case 0x1005:
1858 		/* do nothing (allow getppid) */
1859 		EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
1860 		break;
1861 	default:
1862 		EXPECT_EQ(0, msg) {
1863 			TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
1864 			kill(tracee, SIGKILL);
1865 		}
1866 	}
1867 
1868 }
1869 
1870 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
1871 		   int status, void *args)
1872 {
1873 	int ret, nr;
1874 	unsigned long msg;
1875 	static bool entry;
1876 
1877 	/*
1878 	 * The traditional way to tell PTRACE_SYSCALL entry/exit
1879 	 * is by counting.
1880 	 */
1881 	entry = !entry;
1882 
1883 	/* Make sure we got an appropriate message. */
1884 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1885 	EXPECT_EQ(0, ret);
1886 	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
1887 			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
1888 
1889 	if (!entry)
1890 		return;
1891 
1892 	nr = get_syscall(_metadata, tracee);
1893 
1894 	if (nr == __NR_getpid)
1895 		change_syscall(_metadata, tracee, __NR_getppid, 0);
1896 	if (nr == __NR_gettid)
1897 		change_syscall(_metadata, tracee, -1, 45000);
1898 	if (nr == __NR_openat)
1899 		change_syscall(_metadata, tracee, -1, -ESRCH);
1900 }
1901 
/*
 * Fixture data for the TRACE_syscall tests. FIXTURE_SETUP(TRACE_syscall)
 * records the tracer's pid and this process's tid, pid and parent pid.
 */
FIXTURE(TRACE_syscall) {
	/* NOTE(review): prog is not referenced by the setup shown here. */
	struct sock_fprog prog;
	pid_t tracer, mytid, mypid, parent;
};
1906 
/* Per-variant parameters for the TRACE_syscall fixture. */
FIXTURE_VARIANT(TRACE_syscall) {
	/*
	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
	 * ptrace (true).
	 */
	bool use_ptrace;
};
1916 
/* Variant "ptrace": run the TRACE_syscall tests with use_ptrace set. */
FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
	.use_ptrace = true,
};
1920 
/* Variant "seccomp": run the TRACE_syscall tests with use_ptrace clear. */
FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
	.use_ptrace = false,
};
1924 
1925 FIXTURE_SETUP(TRACE_syscall)
1926 {
1927 	struct sock_filter filter[] = {
1928 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1929 			offsetof(struct seccomp_data, nr)),
1930 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1931 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
1932 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
1933 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
1934 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
1935 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
1936 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
1937 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
1938 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1939 	};
1940 	struct sock_fprog prog = {
1941 		.len = (unsigned short)ARRAY_SIZE(filter),
1942 		.filter = filter,
1943 	};
1944 	long ret;
1945 
1946 	/* Prepare some testable syscall results. */
1947 	self->mytid = syscall(__NR_gettid);
1948 	ASSERT_GT(self->mytid, 0);
1949 	ASSERT_NE(self->mytid, 1) {
1950 		TH_LOG("Running this test as init is not supported. :)");
1951 	}
1952 
1953 	self->mypid = getpid();
1954 	ASSERT_GT(self->mypid, 0);
1955 	ASSERT_EQ(self->mytid, self->mypid);
1956 
1957 	self->parent = getppid();
1958 	ASSERT_GT(self->parent, 0);
1959 	ASSERT_NE(self->parent, self->mypid);
1960 
1961 	/* Launch tracer. */
1962 	self->tracer = setup_trace_fixture(_metadata,
1963 					   variant->use_ptrace ? tracer_ptrace
1964 							       : tracer_seccomp,
1965 					   NULL, variant->use_ptrace);
1966 
1967 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1968 	ASSERT_EQ(0, ret);
1969 
1970 	if (variant->use_ptrace)
1971 		return;
1972 
1973 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
1974 	ASSERT_EQ(0, ret);
1975 }
1976 
/* Stop the tracer started in setup and adopt its result. */
FIXTURE_TEARDOWN(TRACE_syscall)
{
	teardown_trace_fixture(_metadata, self->tracer);
}
1981 
/* Invalid (negative) syscall numbers must fail with -1 / ENOSYS. */
TEST(negative_ENOSYS)
{
	/*
	 * There should be no difference between an "internal" skip
	 * and userspace asking for syscall "-1".
	 */
	errno = 0;
	EXPECT_EQ(-1, syscall(-1));
	/* Expected value first, as everywhere else in this file. */
	EXPECT_EQ(ENOSYS, errno);
	/* And no difference for "still not valid but not -1". */
	errno = 0;
	EXPECT_EQ(-1, syscall(-101));
	EXPECT_EQ(ENOSYS, errno);
}
1996 
/* Re-run the standalone negative_ENOSYS checks with a tracer attached. */
TEST_F(TRACE_syscall, negative_ENOSYS)
{
	negative_ENOSYS(_metadata);
}
2001 
/* A syscall the tracer leaves alone must behave as if it were untraced. */
TEST_F(TRACE_syscall, syscall_allowed)
{
	/* getppid works as expected (no changes). */
	EXPECT_EQ(self->parent, syscall(__NR_getppid));
	EXPECT_NE(self->mypid, syscall(__NR_getppid));
}
2008 
/* The tracer rewrites getpid into getppid; the caller sees the parent pid. */
TEST_F(TRACE_syscall, syscall_redirected)
{
	/* getpid has been redirected to getppid as expected. */
	EXPECT_EQ(self->parent, syscall(__NR_getpid));
	EXPECT_NE(self->mypid, syscall(__NR_getpid));
}
2015 
/* A skipped syscall can be made to fail with a tracer-chosen errno. */
TEST_F(TRACE_syscall, syscall_errno)
{
	/* Tracer should skip the openat syscall, resulting in ESRCH. */
	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
}
2021 
/* A skipped syscall can be given a tracer-chosen successful return value. */
TEST_F(TRACE_syscall, syscall_faked)
{
	/* Tracer skips the gettid syscall and stores an altered return value. */
	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
}
2027 
2028 TEST_F(TRACE_syscall, skip_after)
2029 {
2030 	struct sock_filter filter[] = {
2031 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2032 			offsetof(struct seccomp_data, nr)),
2033 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2034 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
2035 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2036 	};
2037 	struct sock_fprog prog = {
2038 		.len = (unsigned short)ARRAY_SIZE(filter),
2039 		.filter = filter,
2040 	};
2041 	long ret;
2042 
2043 	/* Install additional "errno on getppid" filter. */
2044 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2045 	ASSERT_EQ(0, ret);
2046 
2047 	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
2048 	errno = 0;
2049 	EXPECT_EQ(-1, syscall(__NR_getpid));
2050 	EXPECT_EQ(EPERM, errno);
2051 }
2052 
2053 TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
2054 {
2055 	struct sock_filter filter[] = {
2056 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2057 			offsetof(struct seccomp_data, nr)),
2058 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2059 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2060 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2061 	};
2062 	struct sock_fprog prog = {
2063 		.len = (unsigned short)ARRAY_SIZE(filter),
2064 		.filter = filter,
2065 	};
2066 	long ret;
2067 
2068 	/* Install additional "death on getppid" filter. */
2069 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2070 	ASSERT_EQ(0, ret);
2071 
2072 	/* Tracer will redirect getpid to getppid, and we should die. */
2073 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2074 }
2075 
2076 TEST(seccomp_syscall)
2077 {
2078 	struct sock_filter filter[] = {
2079 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2080 	};
2081 	struct sock_fprog prog = {
2082 		.len = (unsigned short)ARRAY_SIZE(filter),
2083 		.filter = filter,
2084 	};
2085 	long ret;
2086 
2087 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2088 	ASSERT_EQ(0, ret) {
2089 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2090 	}
2091 
2092 	/* Reject insane operation. */
2093 	ret = seccomp(-1, 0, &prog);
2094 	ASSERT_NE(ENOSYS, errno) {
2095 		TH_LOG("Kernel does not support seccomp syscall!");
2096 	}
2097 	EXPECT_EQ(EINVAL, errno) {
2098 		TH_LOG("Did not reject crazy op value!");
2099 	}
2100 
2101 	/* Reject strict with flags or pointer. */
2102 	ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
2103 	EXPECT_EQ(EINVAL, errno) {
2104 		TH_LOG("Did not reject mode strict with flags!");
2105 	}
2106 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
2107 	EXPECT_EQ(EINVAL, errno) {
2108 		TH_LOG("Did not reject mode strict with uargs!");
2109 	}
2110 
2111 	/* Reject insane args for filter. */
2112 	ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
2113 	EXPECT_EQ(EINVAL, errno) {
2114 		TH_LOG("Did not reject crazy filter flags!");
2115 	}
2116 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
2117 	EXPECT_EQ(EFAULT, errno) {
2118 		TH_LOG("Did not reject NULL filter!");
2119 	}
2120 
2121 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2122 	EXPECT_EQ(0, errno) {
2123 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
2124 			strerror(errno));
2125 	}
2126 }
2127 
2128 TEST(seccomp_syscall_mode_lock)
2129 {
2130 	struct sock_filter filter[] = {
2131 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2132 	};
2133 	struct sock_fprog prog = {
2134 		.len = (unsigned short)ARRAY_SIZE(filter),
2135 		.filter = filter,
2136 	};
2137 	long ret;
2138 
2139 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2140 	ASSERT_EQ(0, ret) {
2141 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2142 	}
2143 
2144 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2145 	ASSERT_NE(ENOSYS, errno) {
2146 		TH_LOG("Kernel does not support seccomp syscall!");
2147 	}
2148 	EXPECT_EQ(0, ret) {
2149 		TH_LOG("Could not install filter!");
2150 	}
2151 
2152 	/* Make sure neither entry point will switch to strict. */
2153 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2154 	EXPECT_EQ(EINVAL, errno) {
2155 		TH_LOG("Switched to mode strict!");
2156 	}
2157 
2158 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2159 	EXPECT_EQ(EINVAL, errno) {
2160 		TH_LOG("Switched to mode strict!");
2161 	}
2162 }
2163 
2164 /*
2165  * Test detection of known and unknown filter flags. Userspace needs to be able
2166  * to check if a filter flag is supported by the current kernel and a good way
2167  * of doing that is by attempting to enter filter mode, with the flag bit in
2168  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2169  * that the flag is valid and EINVAL indicates that the flag is invalid.
2170  */
2171 TEST(detect_seccomp_filter_flags)
2172 {
2173 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2174 				 SECCOMP_FILTER_FLAG_LOG,
2175 				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2176 				 SECCOMP_FILTER_FLAG_NEW_LISTENER,
2177 				 SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2178 	unsigned int exclusive[] = {
2179 				SECCOMP_FILTER_FLAG_TSYNC,
2180 				SECCOMP_FILTER_FLAG_NEW_LISTENER };
2181 	unsigned int flag, all_flags, exclusive_mask;
2182 	int i;
2183 	long ret;
2184 
2185 	/* Test detection of individual known-good filter flags */
2186 	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2187 		int bits = 0;
2188 
2189 		flag = flags[i];
2190 		/* Make sure the flag is a single bit! */
2191 		while (flag) {
2192 			if (flag & 0x1)
2193 				bits ++;
2194 			flag >>= 1;
2195 		}
2196 		ASSERT_EQ(1, bits);
2197 		flag = flags[i];
2198 
2199 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2200 		ASSERT_NE(ENOSYS, errno) {
2201 			TH_LOG("Kernel does not support seccomp syscall!");
2202 		}
2203 		EXPECT_EQ(-1, ret);
2204 		EXPECT_EQ(EFAULT, errno) {
2205 			TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2206 			       flag);
2207 		}
2208 
2209 		all_flags |= flag;
2210 	}
2211 
2212 	/*
2213 	 * Test detection of all known-good filter flags combined. But
2214 	 * for the exclusive flags we need to mask them out and try them
2215 	 * individually for the "all flags" testing.
2216 	 */
2217 	exclusive_mask = 0;
2218 	for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2219 		exclusive_mask |= exclusive[i];
2220 	for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2221 		flag = all_flags & ~exclusive_mask;
2222 		flag |= exclusive[i];
2223 
2224 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2225 		EXPECT_EQ(-1, ret);
2226 		EXPECT_EQ(EFAULT, errno) {
2227 			TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2228 			       flag);
2229 		}
2230 	}
2231 
2232 	/* Test detection of an unknown filter flags, without exclusives. */
2233 	flag = -1;
2234 	flag &= ~exclusive_mask;
2235 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2236 	EXPECT_EQ(-1, ret);
2237 	EXPECT_EQ(EINVAL, errno) {
2238 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2239 		       flag);
2240 	}
2241 
2242 	/*
2243 	 * Test detection of an unknown filter flag that may simply need to be
2244 	 * added to this test
2245 	 */
2246 	flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2247 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2248 	EXPECT_EQ(-1, ret);
2249 	EXPECT_EQ(EINVAL, errno) {
2250 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2251 		       flag);
2252 	}
2253 }
2254 
2255 TEST(TSYNC_first)
2256 {
2257 	struct sock_filter filter[] = {
2258 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2259 	};
2260 	struct sock_fprog prog = {
2261 		.len = (unsigned short)ARRAY_SIZE(filter),
2262 		.filter = filter,
2263 	};
2264 	long ret;
2265 
2266 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2267 	ASSERT_EQ(0, ret) {
2268 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2269 	}
2270 
2271 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2272 		      &prog);
2273 	ASSERT_NE(ENOSYS, errno) {
2274 		TH_LOG("Kernel does not support seccomp syscall!");
2275 	}
2276 	EXPECT_EQ(0, ret) {
2277 		TH_LOG("Could not install initial filter with TSYNC!");
2278 	}
2279 }
2280 
/* Number of sibling thread slots kept by the TSYNC fixture. */
#define TSYNC_SIBLINGS 2
/*
 * State shared between a TSYNC test body and one sibling thread. The
 * fixture setup fills it in and the thread routine tsync_sibling() reads it.
 */
struct tsync_sibling {
	/* pthread handle; PTHREAD_JOIN() zeroes it after a successful join. */
	pthread_t tid;
	/* Kernel thread id, stored by the thread itself via gettid. */
	pid_t system_tid;
	/* Posted once by the thread after its optional filter install. */
	sem_t *started;
	/* Condition variable the thread waits on until the test broadcasts. */
	pthread_cond_t *cond;
	/* Mutex protecting the condition wait and num_waits. */
	pthread_mutex_t *mutex;
	/* Non-zero: the thread installs @prog on itself before posting. */
	int diverge;
	/* Number of condition wake-ups to consume before proceeding. */
	int num_waits;
	/* Filter program installed when @diverge is set. */
	struct sock_fprog *prog;
	/* Harness metadata of the owning test (set by fixture setup). */
	struct __test_metadata *metadata;
};
2293 
2294 /*
2295  * To avoid joining joined threads (which is not allowed by Bionic),
2296  * make sure we both successfully join and clear the tid to skip a
2297  * later join attempt during fixture teardown. Any remaining threads
2298  * will be directly killed during teardown.
2299  */
2300 #define PTHREAD_JOIN(tid, status)					\
2301 	do {								\
2302 		int _rc = pthread_join(tid, status);			\
2303 		if (_rc) {						\
2304 			TH_LOG("pthread_join of tid %u failed: %d\n",	\
2305 				(unsigned int)tid, _rc);		\
2306 		} else {						\
2307 			tid = 0;					\
2308 		}							\
2309 	} while (0)
2310 
/*
 * Per-test state for the TSYNC tests: two filter programs, the sibling
 * slots, and the synchronisation objects shared with the sibling threads.
 */
FIXTURE(TSYNC) {
	/* Heap copies of the allow-all filter and of the read-killing filter. */
	struct sock_fprog root_prog, apply_prog;
	struct tsync_sibling sibling[TSYNC_SIBLINGS];
	/* Posted by each sibling once it has started. */
	sem_t started;
	/* Broadcast by the test body to release waiting siblings. */
	pthread_cond_t cond;
	pthread_mutex_t mutex;
	/* Siblings seen so far on @started; bounds the teardown kill loop. */
	int sibling_count;
};
2319 
2320 FIXTURE_SETUP(TSYNC)
2321 {
2322 	struct sock_filter root_filter[] = {
2323 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2324 	};
2325 	struct sock_filter apply_filter[] = {
2326 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2327 			offsetof(struct seccomp_data, nr)),
2328 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2329 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2330 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2331 	};
2332 
2333 	memset(&self->root_prog, 0, sizeof(self->root_prog));
2334 	memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2335 	memset(&self->sibling, 0, sizeof(self->sibling));
2336 	self->root_prog.filter = malloc(sizeof(root_filter));
2337 	ASSERT_NE(NULL, self->root_prog.filter);
2338 	memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2339 	self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2340 
2341 	self->apply_prog.filter = malloc(sizeof(apply_filter));
2342 	ASSERT_NE(NULL, self->apply_prog.filter);
2343 	memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2344 	self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2345 
2346 	self->sibling_count = 0;
2347 	pthread_mutex_init(&self->mutex, NULL);
2348 	pthread_cond_init(&self->cond, NULL);
2349 	sem_init(&self->started, 0, 0);
2350 	self->sibling[0].tid = 0;
2351 	self->sibling[0].cond = &self->cond;
2352 	self->sibling[0].started = &self->started;
2353 	self->sibling[0].mutex = &self->mutex;
2354 	self->sibling[0].diverge = 0;
2355 	self->sibling[0].num_waits = 1;
2356 	self->sibling[0].prog = &self->root_prog;
2357 	self->sibling[0].metadata = _metadata;
2358 	self->sibling[1].tid = 0;
2359 	self->sibling[1].cond = &self->cond;
2360 	self->sibling[1].started = &self->started;
2361 	self->sibling[1].mutex = &self->mutex;
2362 	self->sibling[1].diverge = 0;
2363 	self->sibling[1].prog = &self->root_prog;
2364 	self->sibling[1].num_waits = 1;
2365 	self->sibling[1].metadata = _metadata;
2366 }
2367 
2368 FIXTURE_TEARDOWN(TSYNC)
2369 {
2370 	int sib = 0;
2371 
2372 	if (self->root_prog.filter)
2373 		free(self->root_prog.filter);
2374 	if (self->apply_prog.filter)
2375 		free(self->apply_prog.filter);
2376 
2377 	for ( ; sib < self->sibling_count; ++sib) {
2378 		struct tsync_sibling *s = &self->sibling[sib];
2379 
2380 		if (!s->tid)
2381 			continue;
2382 		/*
2383 		 * If a thread is still running, it may be stuck, so hit
2384 		 * it over the head really hard.
2385 		 */
2386 		pthread_kill(s->tid, 9);
2387 	}
2388 	pthread_mutex_destroy(&self->mutex);
2389 	pthread_cond_destroy(&self->cond);
2390 	sem_destroy(&self->started);
2391 }
2392 
2393 void *tsync_sibling(void *data)
2394 {
2395 	long ret = 0;
2396 	struct tsync_sibling *me = data;
2397 
2398 	me->system_tid = syscall(__NR_gettid);
2399 
2400 	pthread_mutex_lock(me->mutex);
2401 	if (me->diverge) {
2402 		/* Just re-apply the root prog to fork the tree */
2403 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2404 				me->prog, 0, 0);
2405 	}
2406 	sem_post(me->started);
2407 	/* Return outside of started so parent notices failures. */
2408 	if (ret) {
2409 		pthread_mutex_unlock(me->mutex);
2410 		return (void *)SIBLING_EXIT_FAILURE;
2411 	}
2412 	do {
2413 		pthread_cond_wait(me->cond, me->mutex);
2414 		me->num_waits = me->num_waits - 1;
2415 	} while (me->num_waits);
2416 	pthread_mutex_unlock(me->mutex);
2417 
2418 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2419 	if (!ret)
2420 		return (void *)SIBLING_EXIT_NEWPRIVS;
2421 	read(0, NULL, 0);
2422 	return (void *)SIBLING_EXIT_UNKILLED;
2423 }
2424 
2425 void tsync_start_sibling(struct tsync_sibling *sibling)
2426 {
2427 	pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2428 }
2429 
/*
 * Check that a sibling reports a failing prctl(): the filter installed here
 * answers every prctl() with EINVAL, so the diverging sibling cannot install
 * its own filter and must exit with SIBLING_EXIT_FAILURE.
 */
TEST_F(TSYNC, siblings_fail_prctl)
{
	long ret;
	void *status;
	/* prctl() -> ERRNO(EINVAL); every other syscall is allowed. */
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Check prctl failure detection by requesting sib 0 diverge. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	ASSERT_EQ(0, ret) {
		TH_LOG("setting filter failed");
	}

	/* Siblings are created only after the prctl-denying filter is in place. */
	self->sibling[0].diverge = 1;
	tsync_start_sibling(&self->sibling[0]);
	tsync_start_sibling(&self->sibling[1]);

	/* Wait until both siblings have posted the start semaphore. */
	while (self->sibling_count < TSYNC_SIBLINGS) {
		sem_wait(&self->started);
		self->sibling_count++;
	}

	/* Signal the threads to clean up*/
	pthread_mutex_lock(&self->mutex);
	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
		TH_LOG("cond broadcast non-zero");
	}
	pthread_mutex_unlock(&self->mutex);

	/* Ensure diverging sibling failed to call prctl. */
	PTHREAD_JOIN(self->sibling[0].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
	/* Sibling 1 did not diverge; it is expected to run to its normal exit. */
	PTHREAD_JOIN(self->sibling[1].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
}
2481 
2482 TEST_F(TSYNC, two_siblings_with_ancestor)
2483 {
2484 	long ret;
2485 	void *status;
2486 
2487 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2488 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2489 	}
2490 
2491 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2492 	ASSERT_NE(ENOSYS, errno) {
2493 		TH_LOG("Kernel does not support seccomp syscall!");
2494 	}
2495 	ASSERT_EQ(0, ret) {
2496 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2497 	}
2498 	tsync_start_sibling(&self->sibling[0]);
2499 	tsync_start_sibling(&self->sibling[1]);
2500 
2501 	while (self->sibling_count < TSYNC_SIBLINGS) {
2502 		sem_wait(&self->started);
2503 		self->sibling_count++;
2504 	}
2505 
2506 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2507 		      &self->apply_prog);
2508 	ASSERT_EQ(0, ret) {
2509 		TH_LOG("Could install filter on all threads!");
2510 	}
2511 	/* Tell the siblings to test the policy */
2512 	pthread_mutex_lock(&self->mutex);
2513 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2514 		TH_LOG("cond broadcast non-zero");
2515 	}
2516 	pthread_mutex_unlock(&self->mutex);
2517 	/* Ensure they are both killed and don't exit cleanly. */
2518 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2519 	EXPECT_EQ(0x0, (long)status);
2520 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2521 	EXPECT_EQ(0x0, (long)status);
2522 }
2523 
2524 TEST_F(TSYNC, two_sibling_want_nnp)
2525 {
2526 	void *status;
2527 
2528 	/* start siblings before any prctl() operations */
2529 	tsync_start_sibling(&self->sibling[0]);
2530 	tsync_start_sibling(&self->sibling[1]);
2531 	while (self->sibling_count < TSYNC_SIBLINGS) {
2532 		sem_wait(&self->started);
2533 		self->sibling_count++;
2534 	}
2535 
2536 	/* Tell the siblings to test no policy */
2537 	pthread_mutex_lock(&self->mutex);
2538 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2539 		TH_LOG("cond broadcast non-zero");
2540 	}
2541 	pthread_mutex_unlock(&self->mutex);
2542 
2543 	/* Ensure they are both upset about lacking nnp. */
2544 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2545 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2546 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2547 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2548 }
2549 
2550 TEST_F(TSYNC, two_siblings_with_no_filter)
2551 {
2552 	long ret;
2553 	void *status;
2554 
2555 	/* start siblings before any prctl() operations */
2556 	tsync_start_sibling(&self->sibling[0]);
2557 	tsync_start_sibling(&self->sibling[1]);
2558 	while (self->sibling_count < TSYNC_SIBLINGS) {
2559 		sem_wait(&self->started);
2560 		self->sibling_count++;
2561 	}
2562 
2563 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2564 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2565 	}
2566 
2567 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2568 		      &self->apply_prog);
2569 	ASSERT_NE(ENOSYS, errno) {
2570 		TH_LOG("Kernel does not support seccomp syscall!");
2571 	}
2572 	ASSERT_EQ(0, ret) {
2573 		TH_LOG("Could install filter on all threads!");
2574 	}
2575 
2576 	/* Tell the siblings to test the policy */
2577 	pthread_mutex_lock(&self->mutex);
2578 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2579 		TH_LOG("cond broadcast non-zero");
2580 	}
2581 	pthread_mutex_unlock(&self->mutex);
2582 
2583 	/* Ensure they are both killed and don't exit cleanly. */
2584 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2585 	EXPECT_EQ(0x0, (long)status);
2586 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2587 	EXPECT_EQ(0x0, (long)status);
2588 }
2589 
/*
 * TSYNC must fail when one sibling has diverged: sibling 0 re-applies the
 * root program on itself, and the later TSYNC install is asserted to return
 * that sibling's kernel tid. Since the apply filter is then not installed,
 * both siblings are expected to survive their read() probe.
 */
TEST_F(TSYNC, two_siblings_with_one_divergence)
{
	long ret;
	void *status;

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Install the root filter before any sibling is created. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
	}
	/* Sibling 0 installs its own extra filter in tsync_sibling(). */
	self->sibling[0].diverge = 1;
	tsync_start_sibling(&self->sibling[0]);
	tsync_start_sibling(&self->sibling[1]);

	/* Wait until both siblings have posted the start semaphore. */
	while (self->sibling_count < TSYNC_SIBLINGS) {
		sem_wait(&self->started);
		self->sibling_count++;
	}

	/* Without TSYNC_ESRCH the offending thread's tid is the return value. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
		      &self->apply_prog);
	ASSERT_EQ(self->sibling[0].system_tid, ret) {
		TH_LOG("Did not fail on diverged sibling.");
	}

	/* Wake the threads */
	pthread_mutex_lock(&self->mutex);
	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
		TH_LOG("cond broadcast non-zero");
	}
	pthread_mutex_unlock(&self->mutex);

	/* Ensure they are both unkilled. */
	PTHREAD_JOIN(self->sibling[0].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
	PTHREAD_JOIN(self->sibling[1].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
}
2634 
2635 TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2636 {
2637 	long ret, flags;
2638 	void *status;
2639 
2640 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2641 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2642 	}
2643 
2644 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2645 	ASSERT_NE(ENOSYS, errno) {
2646 		TH_LOG("Kernel does not support seccomp syscall!");
2647 	}
2648 	ASSERT_EQ(0, ret) {
2649 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2650 	}
2651 	self->sibling[0].diverge = 1;
2652 	tsync_start_sibling(&self->sibling[0]);
2653 	tsync_start_sibling(&self->sibling[1]);
2654 
2655 	while (self->sibling_count < TSYNC_SIBLINGS) {
2656 		sem_wait(&self->started);
2657 		self->sibling_count++;
2658 	}
2659 
2660 	flags = SECCOMP_FILTER_FLAG_TSYNC | \
2661 		SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2662 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2663 	ASSERT_EQ(ESRCH, errno) {
2664 		TH_LOG("Did not return ESRCH for diverged sibling.");
2665 	}
2666 	ASSERT_EQ(-1, ret) {
2667 		TH_LOG("Did not fail on diverged sibling.");
2668 	}
2669 
2670 	/* Wake the threads */
2671 	pthread_mutex_lock(&self->mutex);
2672 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2673 		TH_LOG("cond broadcast non-zero");
2674 	}
2675 	pthread_mutex_unlock(&self->mutex);
2676 
2677 	/* Ensure they are both unkilled. */
2678 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2679 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2680 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2681 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2682 }
2683 
/*
 * TSYNC with siblings that do not descend from the caller's filter tree.
 * The siblings are started before the main thread installs any filter;
 * sibling 0 additionally installs a filter of its own. The first TSYNC
 * attempt must fail on sibling 0. After that sibling has been released and
 * has fully exited, TSYNC must succeed for the remaining sibling, and once
 * more after that sibling is gone too.
 */
TEST_F(TSYNC, two_siblings_not_under_filter)
{
	long ret, sib;
	void *status;
	/* 100 ms polling interval for the task-death loops below. */
	struct timespec delay = { .tv_nsec = 100000000 };

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/*
	 * Sibling 0 will have its own seccomp policy
	 * and Sibling 1 will not be under seccomp at
	 * all. Sibling 1 will enter seccomp and 0
	 * will cause failure.
	 */
	self->sibling[0].diverge = 1;
	tsync_start_sibling(&self->sibling[0]);
	tsync_start_sibling(&self->sibling[1]);

	/* Wait until both siblings have posted the start semaphore. */
	while (self->sibling_count < TSYNC_SIBLINGS) {
		sem_wait(&self->started);
		self->sibling_count++;
	}

	/* Only now does the main thread get a filter of its own. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
	}

	/* First sync attempt: asserted to return sibling 0's kernel tid. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
		      &self->apply_prog);
	ASSERT_EQ(ret, self->sibling[0].system_tid) {
		TH_LOG("Did not fail on diverged sibling.");
	}
	/* Pick the sibling named by the return value as the one to retire first. */
	sib = 1;
	if (ret == self->sibling[0].system_tid)
		sib = 0;

	pthread_mutex_lock(&self->mutex);

	/* Increment the other siblings num_waits so we can clean up
	 * the one we just saw.
	 */
	self->sibling[!sib].num_waits += 1;

	/* Signal the thread to clean up*/
	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
		TH_LOG("cond broadcast non-zero");
	}
	pthread_mutex_unlock(&self->mutex);
	/* The apply filter is not installed yet, so this sibling survives. */
	PTHREAD_JOIN(self->sibling[sib].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
	/* Poll for actual task death. pthread_join doesn't guarantee it. */
	while (!kill(self->sibling[sib].system_tid, 0))
		nanosleep(&delay, NULL);
	/* Switch to the remaining sibling */
	sib = !sib;

	/* Second sync attempt: the diverged thread is gone, so it must work. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
		      &self->apply_prog);
	ASSERT_EQ(0, ret) {
		TH_LOG("Expected the remaining sibling to sync");
	};

	pthread_mutex_lock(&self->mutex);

	/* If remaining sibling didn't have a chance to wake up during
	 * the first broadcast, manually reduce the num_waits now.
	 */
	if (self->sibling[sib].num_waits > 1)
		self->sibling[sib].num_waits = 1;
	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
		TH_LOG("cond broadcast non-zero");
	}
	pthread_mutex_unlock(&self->mutex);
	/* This sibling now runs under the apply filter; a zero status is expected. */
	PTHREAD_JOIN(self->sibling[sib].tid, &status);
	EXPECT_EQ(0, (long)status);
	/* Poll for actual task death. pthread_join doesn't guarantee it. */
	while (!kill(self->sibling[sib].system_tid, 0))
		nanosleep(&delay, NULL);

	/* Third sync attempt with no sibling threads left. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
		      &self->apply_prog);
	ASSERT_EQ(0, ret);  /* just us chickens */
}
2773 
/*
 * Make sure restarted syscalls are seen directly as "restart_syscall".
 *
 * A forked child makes the parent its tracer, installs a filter that
 * reports nanosleep/clock_nanosleep and restart_syscall to the tracer, and
 * then sleeps. The parent interrupts the sleep with SIGSTOP/SIGCONT and
 * checks which syscall number the filter reports when the sleep resumes.
 * A pipe is used to step the child through its phases.
 */
TEST(syscall_restart)
{
	long ret;
	unsigned long msg;
	pid_t child_pid;
	int pipefd[2];
	int status;
	siginfo_t info = { };
	/*
	 * Filter outcome per syscall:
	 *   sigreturn (if defined), read, exit, rt_sigreturn, write -> ALLOW
	 *   nanosleep, clock_nanosleep -> TRACE with event data 0x100
	 *   restart_syscall            -> TRACE with event data 0x200
	 *   anything else              -> KILL
	 * The jump offsets are relative, so they stay valid whether or not
	 * the __NR_sigreturn line is compiled in.
	 */
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			 offsetof(struct seccomp_data, nr)),

#ifdef __NR_sigreturn
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
#endif
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),

		/* Allow __NR_write for easy logging. */
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		/* The nanosleep jump target. */
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
		/* The restart_syscall jump target. */
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
#if defined(__arm__)
	struct utsname utsbuf;
#endif

	ASSERT_EQ(0, pipe(pipefd));

	child_pid = fork();
	ASSERT_LE(0, child_pid);
	if (child_pid == 0) {
		/* Child uses EXPECT not ASSERT to deliver status correctly. */
		char buf = ' ';
		struct timespec timeout = { };

		/* Attach parent as tracer and stop. */
		EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
		EXPECT_EQ(0, raise(SIGSTOP));

		/* The child only reads from the pipe. */
		EXPECT_EQ(0, close(pipefd[1]));

		EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
			TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
		}

		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
		EXPECT_EQ(0, ret) {
			TH_LOG("Failed to install filter!");
		}

		/* First sync byte ('.') from the parent: start the sleep. */
		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
			TH_LOG("Failed to read() sync from parent");
		}
		EXPECT_EQ('.', buf) {
			TH_LOG("Failed to get sync data from read()");
		}

		/* Start nanosleep to be interrupted. */
		timeout.tv_sec = 1;
		errno = 0;
		EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
			TH_LOG("Call to nanosleep() failed (errno %d)", errno);
		}

		/* Read final sync from parent. */
		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
			TH_LOG("Failed final read() from parent");
		}
		EXPECT_EQ('!', buf) {
			TH_LOG("Failed to get final data from read()");
		}

		/* Directly report the status of our test harness results. */
		syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS
						     : EXIT_FAILURE);
	}
	/* Parent from here on; it only writes to the pipe. */
	EXPECT_EQ(0, close(pipefd[0]));

	/* Attach to child, setup options, and release. */
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
			    PTRACE_O_TRACESECCOMP));
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
	ASSERT_EQ(1, write(pipefd[1], ".", 1));

	/* Wait for nanosleep() to start. */
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
	/* 0x100 is the data value of the filter's nanosleep TRACE return. */
	ASSERT_EQ(0x100, msg);
	ret = get_syscall(_metadata, child_pid);
	EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);

	/* Might as well check siginfo for sanity while we're here. */
	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
	ASSERT_EQ(SIGTRAP, info.si_signo);
	ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
	EXPECT_EQ(0, info.si_errno);
	EXPECT_EQ(getuid(), info.si_uid);
	/* Verify signal delivery came from child (seccomp-triggered). */
	EXPECT_EQ(child_pid, info.si_pid);

	/* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
	ASSERT_EQ(0, kill(child_pid, SIGSTOP));
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
	/*
	 * There is no siginfo on SIGSTOP any more, so we can't verify
	 * signal delivery came from parent now (getpid() == info.si_pid).
	 * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
	 * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
	 */
	EXPECT_EQ(SIGSTOP, info.si_signo);

	/* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
	ASSERT_EQ(0, kill(child_pid, SIGCONT));
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(SIGCONT, WSTOPSIG(status));
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));

	/* Wait for restart_syscall() to start. */
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));

	/* 0x200 is the data value of the filter's restart_syscall TRACE return. */
	ASSERT_EQ(0x200, msg);
	ret = get_syscall(_metadata, child_pid);
#if defined(__arm__)
	/*
	 * FIXME:
	 * - native ARM registers do NOT expose true syscall.
	 * - compat ARM registers on ARM64 DO expose true syscall.
	 */
	ASSERT_EQ(0, uname(&utsbuf));
	if (strncmp(utsbuf.machine, "arm", 3) == 0) {
		EXPECT_EQ(__NR_nanosleep, ret);
	} else
#endif
	{
		EXPECT_EQ(__NR_restart_syscall, ret);
	}

	/* Write again to end test. */
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
	ASSERT_EQ(1, write(pipefd[1], "!", 1));
	EXPECT_EQ(0, close(pipefd[1]));

	/* A child killed by a signal or exiting non-zero fails the test. */
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	if (WIFSIGNALED(status) || WEXITSTATUS(status))
		_metadata->passed = 0;
}
2949 
/*
 * Exercise SECCOMP_FILTER_FLAG_LOG: it must be rejected together with
 * SECCOMP_SET_MODE_STRICT, accepted with SECCOMP_SET_MODE_FILTER, and a
 * killing filter installed with it must still kill. The test is declared
 * with TEST_SIGNAL(..., SIGSYS), i.e. it is expected to end by SIGSYS.
 */
TEST_SIGNAL(filter_flag_log, SIGSYS)
{
	struct sock_filter allow_filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	/* getpid() -> KILL; every other syscall is allowed. */
	struct sock_filter kill_filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog allow_prog = {
		.len = (unsigned short)ARRAY_SIZE(allow_filter),
		.filter = allow_filter,
	};
	struct sock_fprog kill_prog = {
		.len = (unsigned short)ARRAY_SIZE(kill_filter),
		.filter = kill_filter,
	};
	long ret;
	/* Taken before any filter exists; compared against getppid() below. */
	pid_t parent = getppid();

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	/* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
	ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
		      &allow_prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	EXPECT_NE(0, ret) {
		TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
	}
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
	}

	/* Verify that a simple, permissive filter can be added with no flags */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
	EXPECT_EQ(0, ret);

	/* See if the same filter can be added with the FILTER_FLAG_LOG flag */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
		      &allow_prog);
	ASSERT_NE(EINVAL, errno) {
		TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
	}
	EXPECT_EQ(0, ret);

	/* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
		      &kill_prog);
	EXPECT_EQ(0, ret);

	/* getppid() is not matched by kill_filter and must still work. */
	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* getpid() should never return. */
	EXPECT_EQ(0, syscall(__NR_getpid));
}
3010 
3011 TEST(get_action_avail)
3012 {
3013 	__u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3014 			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3015 			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3016 	__u32 unknown_action = 0x10000000U;
3017 	int i;
3018 	long ret;
3019 
3020 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3021 	ASSERT_NE(ENOSYS, errno) {
3022 		TH_LOG("Kernel does not support seccomp syscall!");
3023 	}
3024 	ASSERT_NE(EINVAL, errno) {
3025 		TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3026 	}
3027 	EXPECT_EQ(ret, 0);
3028 
3029 	for (i = 0; i < ARRAY_SIZE(actions); i++) {
3030 		ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3031 		EXPECT_EQ(ret, 0) {
3032 			TH_LOG("Expected action (0x%X) not available!",
3033 			       actions[i]);
3034 		}
3035 	}
3036 
3037 	/* Check that an unknown action is handled properly (EOPNOTSUPP) */
3038 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3039 	EXPECT_EQ(ret, -1);
3040 	EXPECT_EQ(errno, EOPNOTSUPP);
3041 }
3042 
/*
 * Check PTRACE_SECCOMP_GET_METADATA: a child installs two filters, the
 * first with SECCOMP_FILTER_FLAG_LOG and the second without flags; the
 * parent attaches with ptrace and verifies the flags reported for filter
 * offsets 0 and 1. Skipped when not run with an effective uid of 0 or when
 * the ptrace request fails with EINVAL.
 */
TEST(get_metadata)
{
	pid_t pid;
	int pipefd[2];
	char buf;
	struct seccomp_metadata md;
	long ret;

	/* Only real root can get metadata. */
	if (geteuid()) {
		SKIP(return, "get_metadata requires real root");
		return;
	}

	ASSERT_EQ(0, pipe(pipefd));

	pid = fork();
	ASSERT_GE(pid, 0);
	if (pid == 0) {
		struct sock_filter filter[] = {
			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		};
		struct sock_fprog prog = {
			.len = (unsigned short)ARRAY_SIZE(filter),
			.filter = filter,
		};

		/* one with log, one without */
		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
				     SECCOMP_FILTER_FLAG_LOG, &prog));
		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));

		/* Tell the parent that both filters are installed. */
		EXPECT_EQ(0, close(pipefd[0]));
		ASSERT_EQ(1, write(pipefd[1], "1", 1));
		ASSERT_EQ(0, close(pipefd[1]));

		/* Stay alive until the parent kills us. */
		while (1)
			sleep(100);
	}

	/* Parent: wait for the child's one-byte notification. */
	ASSERT_EQ(0, close(pipefd[1]));
	ASSERT_EQ(1, read(pipefd[0], &buf, 1));

	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
	ASSERT_EQ(pid, waitpid(pid, NULL, 0));

	/* Past here must not use ASSERT or child process is never killed. */

	/* Query offset 0; the test expects the LOG flag to be reported here. */
	md.filter_off = 0;
	errno = 0;
	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
	EXPECT_EQ(sizeof(md), ret) {
		if (errno == EINVAL)
			SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
	}

	EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
	EXPECT_EQ(md.filter_off, 0);

	/* Query offset 1; the test expects no flags to be reported here. */
	md.filter_off = 1;
	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
	EXPECT_EQ(sizeof(md), ret);
	EXPECT_EQ(md.flags, 0);
	EXPECT_EQ(md.filter_off, 1);

skip:
	/* The child loops forever, so it has to be killed explicitly. */
	ASSERT_EQ(0, kill(pid, SIGKILL));
}
3111 
3112 static int user_notif_syscall(int nr, unsigned int flags)
3113 {
3114 	struct sock_filter filter[] = {
3115 		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
3116 			offsetof(struct seccomp_data, nr)),
3117 		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
3118 		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
3119 		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
3120 	};
3121 
3122 	struct sock_fprog prog = {
3123 		.len = (unsigned short)ARRAY_SIZE(filter),
3124 		.filter = filter,
3125 	};
3126 
3127 	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
3128 }
3129 
3130 #define USER_NOTIF_MAGIC INT_MAX
3131 TEST(user_notification_basic)
3132 {
3133 	pid_t pid;
3134 	long ret;
3135 	int status, listener;
3136 	struct seccomp_notif req = {};
3137 	struct seccomp_notif_resp resp = {};
3138 	struct pollfd pollfd;
3139 
3140 	struct sock_filter filter[] = {
3141 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3142 	};
3143 	struct sock_fprog prog = {
3144 		.len = (unsigned short)ARRAY_SIZE(filter),
3145 		.filter = filter,
3146 	};
3147 
3148 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3149 	ASSERT_EQ(0, ret) {
3150 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3151 	}
3152 
3153 	pid = fork();
3154 	ASSERT_GE(pid, 0);
3155 
3156 	/* Check that we get -ENOSYS with no listener attached */
3157 	if (pid == 0) {
3158 		if (user_notif_syscall(__NR_getppid, 0) < 0)
3159 			exit(1);
3160 		ret = syscall(__NR_getppid);
3161 		exit(ret >= 0 || errno != ENOSYS);
3162 	}
3163 
3164 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3165 	EXPECT_EQ(true, WIFEXITED(status));
3166 	EXPECT_EQ(0, WEXITSTATUS(status));
3167 
3168 	/* Add some no-op filters for grins. */
3169 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3170 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3171 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3172 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3173 
3174 	/* Check that the basic notification machinery works */
3175 	listener = user_notif_syscall(__NR_getppid,
3176 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3177 	ASSERT_GE(listener, 0);
3178 
3179 	/* Installing a second listener in the chain should EBUSY */
3180 	EXPECT_EQ(user_notif_syscall(__NR_getppid,
3181 				     SECCOMP_FILTER_FLAG_NEW_LISTENER),
3182 		  -1);
3183 	EXPECT_EQ(errno, EBUSY);
3184 
3185 	pid = fork();
3186 	ASSERT_GE(pid, 0);
3187 
3188 	if (pid == 0) {
3189 		ret = syscall(__NR_getppid);
3190 		exit(ret != USER_NOTIF_MAGIC);
3191 	}
3192 
3193 	pollfd.fd = listener;
3194 	pollfd.events = POLLIN | POLLOUT;
3195 
3196 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3197 	EXPECT_EQ(pollfd.revents, POLLIN);
3198 
3199 	/* Test that we can't pass garbage to the kernel. */
3200 	memset(&req, 0, sizeof(req));
3201 	req.pid = -1;
3202 	errno = 0;
3203 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
3204 	EXPECT_EQ(-1, ret);
3205 	EXPECT_EQ(EINVAL, errno);
3206 
3207 	if (ret) {
3208 		req.pid = 0;
3209 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3210 	}
3211 
3212 	pollfd.fd = listener;
3213 	pollfd.events = POLLIN | POLLOUT;
3214 
3215 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3216 	EXPECT_EQ(pollfd.revents, POLLOUT);
3217 
3218 	EXPECT_EQ(req.data.nr,  __NR_getppid);
3219 
3220 	resp.id = req.id;
3221 	resp.error = 0;
3222 	resp.val = USER_NOTIF_MAGIC;
3223 
3224 	/* check that we make sure flags == 0 */
3225 	resp.flags = 1;
3226 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3227 	EXPECT_EQ(errno, EINVAL);
3228 
3229 	resp.flags = 0;
3230 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3231 
3232 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3233 	EXPECT_EQ(true, WIFEXITED(status));
3234 	EXPECT_EQ(0, WEXITSTATUS(status));
3235 }
3236 
3237 TEST(user_notification_with_tsync)
3238 {
3239 	int ret;
3240 	unsigned int flags;
3241 
3242 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3243 	ASSERT_EQ(0, ret) {
3244 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3245 	}
3246 
3247 	/* these were exclusive */
3248 	flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3249 		SECCOMP_FILTER_FLAG_TSYNC;
3250 	ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
3251 	ASSERT_EQ(EINVAL, errno);
3252 
3253 	/* but now they're not */
3254 	flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3255 	ret = user_notif_syscall(__NR_getppid, flags);
3256 	close(ret);
3257 	ASSERT_LE(0, ret);
3258 }
3259 
3260 TEST(user_notification_kill_in_middle)
3261 {
3262 	pid_t pid;
3263 	long ret;
3264 	int listener;
3265 	struct seccomp_notif req = {};
3266 	struct seccomp_notif_resp resp = {};
3267 
3268 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3269 	ASSERT_EQ(0, ret) {
3270 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3271 	}
3272 
3273 	listener = user_notif_syscall(__NR_getppid,
3274 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3275 	ASSERT_GE(listener, 0);
3276 
3277 	/*
3278 	 * Check that nothing bad happens when we kill the task in the middle
3279 	 * of a syscall.
3280 	 */
3281 	pid = fork();
3282 	ASSERT_GE(pid, 0);
3283 
3284 	if (pid == 0) {
3285 		ret = syscall(__NR_getppid);
3286 		exit(ret != USER_NOTIF_MAGIC);
3287 	}
3288 
3289 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3290 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
3291 
3292 	EXPECT_EQ(kill(pid, SIGKILL), 0);
3293 	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
3294 
3295 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
3296 
3297 	resp.id = req.id;
3298 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
3299 	EXPECT_EQ(ret, -1);
3300 	EXPECT_EQ(errno, ENOENT);
3301 }
3302 
/* Descriptor that signal_handler() writes its marker byte to. */
static int handled = -1;

/*
 * Signal handler: write a single 'c' to the descriptor stored in 'handled'
 * and report via perror() if that one-byte write does not complete.
 */
static void signal_handler(int signal)
{
	ssize_t written = write(handled, "c", 1);

	if (written == 1)
		return;

	perror("write from signal");
}
3310 
/*
 * Check what happens to a pending notification when the blocked task
 * receives a signal.
 *
 * The child installs a SIGUSR1 handler that writes one byte to a socket pair
 * and then calls gettid(), which the filter turns into a user notification.
 * The parent receives the notification, signals the child, waits for the
 * handler's byte, and then expects a response to the old id to fail with
 * ENOENT. A second notification is then received and answered with -512
 * (-ERESTARTSYS), which the child must observe as errno 512.
 */
TEST(user_notification_signal)
{
	pid_t pid;
	long ret;
	int status, listener, sk_pair[2];
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	char c;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Channel the child's signal handler uses to report that it ran. */
	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);

	listener = user_notif_syscall(__NR_gettid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		close(sk_pair[0]);
		/* signal_handler() writes to this end of the socket pair. */
		handled = sk_pair[1];
		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
			perror("signal");
			exit(1);
		}
		/*
		 * ERESTARTSYS behavior is a bit hard to test, because we need
		 * to rely on a signal that has not yet been handled. Let's at
		 * least check that the error code gets propagated through, and
		 * hope that it doesn't break when there is actually a signal :)
		 */
		ret = syscall(__NR_gettid);
		/* Exit 0 only if gettid() failed with errno 512. */
		exit(!(ret == -1 && errno == 512));
	}

	close(sk_pair[1]);

	/* Pick up the notification for the child's gettid(). */
	memset(&req, 0, sizeof(req));
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	EXPECT_EQ(kill(pid, SIGUSR1), 0);

	/*
	 * Make sure the signal really is delivered, which means we're not
	 * stuck in the user notification code any more and the notification
	 * should be dead.
	 */
	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);

	/* Answering the now-dead notification must fail with ENOENT. */
	resp.id = req.id;
	resp.error = -EPERM;
	resp.val = 0;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, ENOENT);

	/*
	 * A fresh notification is expected here; presumably the interrupted
	 * gettid() is restarted once the handler has run - TODO confirm.
	 */
	memset(&req, 0, sizeof(req));
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	resp.id = req.id;
	resp.error = -512; /* -ERESTARTSYS */
	resp.val = 0;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
3385 
3386 TEST(user_notification_closed_listener)
3387 {
3388 	pid_t pid;
3389 	long ret;
3390 	int status, listener;
3391 
3392 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3393 	ASSERT_EQ(0, ret) {
3394 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3395 	}
3396 
3397 	listener = user_notif_syscall(__NR_getppid,
3398 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3399 	ASSERT_GE(listener, 0);
3400 
3401 	/*
3402 	 * Check that we get an ENOSYS when the listener is closed.
3403 	 */
3404 	pid = fork();
3405 	ASSERT_GE(pid, 0);
3406 	if (pid == 0) {
3407 		close(listener);
3408 		ret = syscall(__NR_getppid);
3409 		exit(ret != -1 && errno != ENOSYS);
3410 	}
3411 
3412 	close(listener);
3413 
3414 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3415 	EXPECT_EQ(true, WIFEXITED(status));
3416 	EXPECT_EQ(0, WEXITSTATUS(status));
3417 }
3418 
3419 /*
3420  * Check that a pid in a child namespace still shows up as valid in ours.
3421  */
3422 TEST(user_notification_child_pid_ns)
3423 {
3424 	pid_t pid;
3425 	int status, listener;
3426 	struct seccomp_notif req = {};
3427 	struct seccomp_notif_resp resp = {};
3428 
3429 	ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
3430 		if (errno == EINVAL)
3431 			SKIP(return, "kernel missing CLONE_NEWUSER support");
3432 	};
3433 
3434 	listener = user_notif_syscall(__NR_getppid,
3435 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3436 	ASSERT_GE(listener, 0);
3437 
3438 	pid = fork();
3439 	ASSERT_GE(pid, 0);
3440 
3441 	if (pid == 0)
3442 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3443 
3444 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3445 	EXPECT_EQ(req.pid, pid);
3446 
3447 	resp.id = req.id;
3448 	resp.error = 0;
3449 	resp.val = USER_NOTIF_MAGIC;
3450 
3451 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3452 
3453 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3454 	EXPECT_EQ(true, WIFEXITED(status));
3455 	EXPECT_EQ(0, WEXITSTATUS(status));
3456 	close(listener);
3457 }
3458 
3459 /*
3460  * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3461  * invalid.
3462  */
3463 TEST(user_notification_sibling_pid_ns)
3464 {
3465 	pid_t pid, pid2;
3466 	int status, listener;
3467 	struct seccomp_notif req = {};
3468 	struct seccomp_notif_resp resp = {};
3469 
3470 	ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
3471 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3472 	}
3473 
3474 	listener = user_notif_syscall(__NR_getppid,
3475 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3476 	ASSERT_GE(listener, 0);
3477 
3478 	pid = fork();
3479 	ASSERT_GE(pid, 0);
3480 
3481 	if (pid == 0) {
3482 		ASSERT_EQ(unshare(CLONE_NEWPID), 0);
3483 
3484 		pid2 = fork();
3485 		ASSERT_GE(pid2, 0);
3486 
3487 		if (pid2 == 0)
3488 			exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3489 
3490 		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3491 		EXPECT_EQ(true, WIFEXITED(status));
3492 		EXPECT_EQ(0, WEXITSTATUS(status));
3493 		exit(WEXITSTATUS(status));
3494 	}
3495 
3496 	/* Create the sibling ns, and sibling in it. */
3497 	ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3498 		if (errno == EPERM)
3499 			SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3500 	}
3501 	ASSERT_EQ(errno, 0);
3502 
3503 	pid2 = fork();
3504 	ASSERT_GE(pid2, 0);
3505 
3506 	if (pid2 == 0) {
3507 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3508 		/*
3509 		 * The pid should be 0, i.e. the task is in some namespace that
3510 		 * we can't "see".
3511 		 */
3512 		EXPECT_EQ(req.pid, 0);
3513 
3514 		resp.id = req.id;
3515 		resp.error = 0;
3516 		resp.val = USER_NOTIF_MAGIC;
3517 
3518 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3519 		exit(0);
3520 	}
3521 
3522 	close(listener);
3523 
3524 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3525 	EXPECT_EQ(true, WIFEXITED(status));
3526 	EXPECT_EQ(0, WEXITSTATUS(status));
3527 
3528 	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3529 	EXPECT_EQ(true, WIFEXITED(status));
3530 	EXPECT_EQ(0, WEXITSTATUS(status));
3531 }
3532 
3533 TEST(user_notification_fault_recv)
3534 {
3535 	pid_t pid;
3536 	int status, listener;
3537 	struct seccomp_notif req = {};
3538 	struct seccomp_notif_resp resp = {};
3539 
3540 	ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
3541 
3542 	listener = user_notif_syscall(__NR_getppid,
3543 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3544 	ASSERT_GE(listener, 0);
3545 
3546 	pid = fork();
3547 	ASSERT_GE(pid, 0);
3548 
3549 	if (pid == 0)
3550 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3551 
3552 	/* Do a bad recv() */
3553 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
3554 	EXPECT_EQ(errno, EFAULT);
3555 
3556 	/* We should still be able to receive this notification, though. */
3557 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3558 	EXPECT_EQ(req.pid, pid);
3559 
3560 	resp.id = req.id;
3561 	resp.error = 0;
3562 	resp.val = USER_NOTIF_MAGIC;
3563 
3564 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3565 
3566 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3567 	EXPECT_EQ(true, WIFEXITED(status));
3568 	EXPECT_EQ(0, WEXITSTATUS(status));
3569 }
3570 
3571 TEST(seccomp_get_notif_sizes)
3572 {
3573 	struct seccomp_notif_sizes sizes;
3574 
3575 	ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
3576 	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
3577 	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
3578 }
3579 
/*
 * Check SECCOMP_USER_NOTIF_FLAG_CONTINUE on a notification for dup().
 *
 * The child dup()s one end of a pipe and compares the two descriptors with
 * filecmp(). The parent receives the dup() notification, verifies that a
 * CONTINUE response carrying a non-zero val or error is rejected with EINVAL,
 * and then sends a CONTINUE response with both fields zero.
 */
TEST(user_notification_continue)
{
	pid_t pid;
	long ret;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	struct pollfd pollfd;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		int dup_fd, pipe_fds[2];
		pid_t self;

		ASSERT_GE(pipe(pipe_fds), 0);

		/* This dup() is the syscall the filter hands to the listener. */
		dup_fd = dup(pipe_fds[0]);
		ASSERT_GE(dup_fd, 0);
		EXPECT_NE(pipe_fds[0], dup_fd);

		/*
		 * filecmp() is defined elsewhere in this file; presumably it
		 * returns 0 when both descriptors refer to the same open file
		 * - TODO confirm.
		 */
		self = getpid();
		ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
		exit(0);
	}

	/* A pending, not yet received notification is expected as POLLIN. */
	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLIN);

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	/* Once received, the notification is expected as POLLOUT. */
	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLOUT);

	EXPECT_EQ(req.data.nr, __NR_dup);

	resp.id = req.id;
	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;

	/*
	 * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
	 * args be set to 0.
	 */
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, EINVAL);

	resp.error = USER_NOTIF_MAGIC;
	resp.val = 0;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, EINVAL);

	/* With val and error both zero the CONTINUE response must succeed. */
	resp.error = 0;
	resp.val = 0;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
		if (errno == EINVAL)
			SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
	}

	/*
	 * NOTE(review): when the skip path is taken no response was accepted
	 * for the notification, so the child may still be blocked in dup()
	 * when waitpid() is called below - confirm this cannot hang.
	 */
skip:
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status)) {
		/*
		 * NOTE(review): nothing in the child above visibly exits with
		 * status 2; verify where that status would come from. The
		 * explicit return after SKIP(return, ...) looks redundant,
		 * presumably SKIP() already executes its first argument -
		 * confirm against the harness.
		 */
		if (WEXITSTATUS(status) == 2) {
			SKIP(return, "Kernel does not support kcmp() syscall");
			return;
		}
	}
}
3665 
3666 TEST(user_notification_filter_empty)
3667 {
3668 	pid_t pid;
3669 	long ret;
3670 	int status;
3671 	struct pollfd pollfd;
3672 	struct clone_args args = {
3673 		.flags = CLONE_FILES,
3674 		.exit_signal = SIGCHLD,
3675 	};
3676 
3677 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3678 	ASSERT_EQ(0, ret) {
3679 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3680 	}
3681 
3682 	pid = sys_clone3(&args, sizeof(args));
3683 	ASSERT_GE(pid, 0);
3684 
3685 	if (pid == 0) {
3686 		int listener;
3687 
3688 		listener = user_notif_syscall(__NR_mknod, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3689 		if (listener < 0)
3690 			_exit(EXIT_FAILURE);
3691 
3692 		if (dup2(listener, 200) != 200)
3693 			_exit(EXIT_FAILURE);
3694 
3695 		close(listener);
3696 
3697 		_exit(EXIT_SUCCESS);
3698 	}
3699 
3700 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3701 	EXPECT_EQ(true, WIFEXITED(status));
3702 	EXPECT_EQ(0, WEXITSTATUS(status));
3703 
3704 	/*
3705 	 * The seccomp filter has become unused so we should be notified once
3706 	 * the kernel gets around to cleaning up task struct.
3707 	 */
3708 	pollfd.fd = 200;
3709 	pollfd.events = POLLHUP;
3710 
3711 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3712 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3713 }
3714 
/*
 * Thread start routine that does no work: it ignores its argument and
 * returns NULL right away.
 */
static void *do_thread(void *data)
{
	void *result = NULL;

	return result;
}
3719 
3720 TEST(user_notification_filter_empty_threaded)
3721 {
3722 	pid_t pid;
3723 	long ret;
3724 	int status;
3725 	struct pollfd pollfd;
3726 	struct clone_args args = {
3727 		.flags = CLONE_FILES,
3728 		.exit_signal = SIGCHLD,
3729 	};
3730 
3731 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3732 	ASSERT_EQ(0, ret) {
3733 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3734 	}
3735 
3736 	pid = sys_clone3(&args, sizeof(args));
3737 	ASSERT_GE(pid, 0);
3738 
3739 	if (pid == 0) {
3740 		pid_t pid1, pid2;
3741 		int listener, status;
3742 		pthread_t thread;
3743 
3744 		listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3745 		if (listener < 0)
3746 			_exit(EXIT_FAILURE);
3747 
3748 		if (dup2(listener, 200) != 200)
3749 			_exit(EXIT_FAILURE);
3750 
3751 		close(listener);
3752 
3753 		pid1 = fork();
3754 		if (pid1 < 0)
3755 			_exit(EXIT_FAILURE);
3756 
3757 		if (pid1 == 0)
3758 			_exit(EXIT_SUCCESS);
3759 
3760 		pid2 = fork();
3761 		if (pid2 < 0)
3762 			_exit(EXIT_FAILURE);
3763 
3764 		if (pid2 == 0)
3765 			_exit(EXIT_SUCCESS);
3766 
3767 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
3768 		    pthread_join(thread, NULL))
3769 			_exit(EXIT_FAILURE);
3770 
3771 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
3772 		    pthread_join(thread, NULL))
3773 			_exit(EXIT_FAILURE);
3774 
3775 		if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
3776 		    WEXITSTATUS(status))
3777 			_exit(EXIT_FAILURE);
3778 
3779 		if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
3780 		    WEXITSTATUS(status))
3781 			_exit(EXIT_FAILURE);
3782 
3783 		exit(EXIT_SUCCESS);
3784 	}
3785 
3786 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3787 	EXPECT_EQ(true, WIFEXITED(status));
3788 	EXPECT_EQ(0, WEXITSTATUS(status));
3789 
3790 	/*
3791 	 * The seccomp filter has become unused so we should be notified once
3792 	 * the kernel gets around to cleaning up task struct.
3793 	 */
3794 	pollfd.fd = 200;
3795 	pollfd.events = POLLHUP;
3796 
3797 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3798 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3799 }
3800 
3801 TEST(user_notification_addfd)
3802 {
3803 	pid_t pid;
3804 	long ret;
3805 	int status, listener, memfd, fd;
3806 	struct seccomp_notif_addfd addfd = {};
3807 	struct seccomp_notif_addfd_small small = {};
3808 	struct seccomp_notif_addfd_big big = {};
3809 	struct seccomp_notif req = {};
3810 	struct seccomp_notif_resp resp = {};
3811 	/* 100 ms */
3812 	struct timespec delay = { .tv_nsec = 100000000 };
3813 
3814 	memfd = memfd_create("test", 0);
3815 	ASSERT_GE(memfd, 0);
3816 
3817 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3818 	ASSERT_EQ(0, ret) {
3819 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3820 	}
3821 
3822 	/* Check that the basic notification machinery works */
3823 	listener = user_notif_syscall(__NR_getppid,
3824 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3825 	ASSERT_GE(listener, 0);
3826 
3827 	pid = fork();
3828 	ASSERT_GE(pid, 0);
3829 
3830 	if (pid == 0) {
3831 		if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
3832 			exit(1);
3833 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3834 	}
3835 
3836 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3837 
3838 	addfd.srcfd = memfd;
3839 	addfd.newfd = 0;
3840 	addfd.id = req.id;
3841 	addfd.flags = 0x0;
3842 
3843 	/* Verify bad newfd_flags cannot be set */
3844 	addfd.newfd_flags = ~O_CLOEXEC;
3845 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
3846 	EXPECT_EQ(errno, EINVAL);
3847 	addfd.newfd_flags = O_CLOEXEC;
3848 
3849 	/* Verify bad flags cannot be set */
3850 	addfd.flags = 0xff;
3851 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
3852 	EXPECT_EQ(errno, EINVAL);
3853 	addfd.flags = 0;
3854 
3855 	/* Verify that remote_fd cannot be set without setting flags */
3856 	addfd.newfd = 1;
3857 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
3858 	EXPECT_EQ(errno, EINVAL);
3859 	addfd.newfd = 0;
3860 
3861 	/* Verify small size cannot be set */
3862 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
3863 	EXPECT_EQ(errno, EINVAL);
3864 
3865 	/* Verify we can't send bits filled in unknown buffer area */
3866 	memset(&big, 0xAA, sizeof(big));
3867 	big.addfd = addfd;
3868 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
3869 	EXPECT_EQ(errno, E2BIG);
3870 
3871 
3872 	/* Verify we can set an arbitrary remote fd */
3873 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
3874 	/*
3875 	 * The child has fds 0(stdin), 1(stdout), 2(stderr), 3(memfd),
3876 	 * 4(listener), so the newly allocated fd should be 5.
3877 	 */
3878 	EXPECT_EQ(fd, 5);
3879 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
3880 
3881 	/* Verify we can set an arbitrary remote fd with large size */
3882 	memset(&big, 0x0, sizeof(big));
3883 	big.addfd = addfd;
3884 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
3885 	EXPECT_EQ(fd, 6);
3886 
3887 	/* Verify we can set a specific remote fd */
3888 	addfd.newfd = 42;
3889 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
3890 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
3891 	EXPECT_EQ(fd, 42);
3892 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
3893 
3894 	/* Resume syscall */
3895 	resp.id = req.id;
3896 	resp.error = 0;
3897 	resp.val = USER_NOTIF_MAGIC;
3898 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3899 
3900 	/*
3901 	 * This sets the ID of the ADD FD to the last request plus 1. The
3902 	 * notification ID increments 1 per notification.
3903 	 */
3904 	addfd.id = req.id + 1;
3905 
3906 	/* This spins until the underlying notification is generated */
3907 	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
3908 	       errno != -EINPROGRESS)
3909 		nanosleep(&delay, NULL);
3910 
3911 	memset(&req, 0, sizeof(req));
3912 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3913 	ASSERT_EQ(addfd.id, req.id);
3914 
3915 	resp.id = req.id;
3916 	resp.error = 0;
3917 	resp.val = USER_NOTIF_MAGIC;
3918 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3919 
3920 	/* Wait for child to finish. */
3921 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3922 	EXPECT_EQ(true, WIFEXITED(status));
3923 	EXPECT_EQ(0, WEXITSTATUS(status));
3924 
3925 	close(memfd);
3926 }
3927 
3928 TEST(user_notification_addfd_rlimit)
3929 {
3930 	pid_t pid;
3931 	long ret;
3932 	int status, listener, memfd;
3933 	struct seccomp_notif_addfd addfd = {};
3934 	struct seccomp_notif req = {};
3935 	struct seccomp_notif_resp resp = {};
3936 	const struct rlimit lim = {
3937 		.rlim_cur	= 0,
3938 		.rlim_max	= 0,
3939 	};
3940 
3941 	memfd = memfd_create("test", 0);
3942 	ASSERT_GE(memfd, 0);
3943 
3944 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3945 	ASSERT_EQ(0, ret) {
3946 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3947 	}
3948 
3949 	/* Check that the basic notification machinery works */
3950 	listener = user_notif_syscall(__NR_getppid,
3951 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3952 	ASSERT_GE(listener, 0);
3953 
3954 	pid = fork();
3955 	ASSERT_GE(pid, 0);
3956 
3957 	if (pid == 0)
3958 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3959 
3960 
3961 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3962 
3963 	ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
3964 
3965 	addfd.srcfd = memfd;
3966 	addfd.newfd_flags = O_CLOEXEC;
3967 	addfd.newfd = 0;
3968 	addfd.id = req.id;
3969 	addfd.flags = 0;
3970 
3971 	/* Should probably spot check /proc/sys/fs/file-nr */
3972 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
3973 	EXPECT_EQ(errno, EMFILE);
3974 
3975 	addfd.newfd = 100;
3976 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
3977 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
3978 	EXPECT_EQ(errno, EBADF);
3979 
3980 	resp.id = req.id;
3981 	resp.error = 0;
3982 	resp.val = USER_NOTIF_MAGIC;
3983 
3984 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3985 
3986 	/* Wait for child to finish. */
3987 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3988 	EXPECT_EQ(true, WIFEXITED(status));
3989 	EXPECT_EQ(0, WEXITSTATUS(status));
3990 
3991 	close(memfd);
3992 }
3993 
3994 /*
3995  * TODO:
3996  * - expand NNP testing
3997  * - better arch-specific TRACE and TRAP handlers.
3998  * - endianness checking when appropriate
3999  * - 64-bit arg prodding
4000  * - arch value testing (x86 modes especially)
4001  * - verify that FILTER_FLAG_LOG filters generate log messages
4002  * - verify that RET_LOG generates log messages
4003  */
4004 
/*
 * NOTE(review): TEST_HARNESS_MAIN is not defined in this file; it presumably
 * comes from ../kselftest_harness.h and expands to the program entry point
 * that runs the TEST() cases declared above - confirm against the harness.
 */
TEST_HARNESS_MAIN
4006