xref: /openbmc/linux/tools/testing/selftests/seccomp/seccomp_bpf.c (revision 5fa1f7680f2728d62561db6d4a9282c4d21f2324)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7 
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10 
11 /*
12  * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
13  * we need to use the kernel's siginfo.h file and trick glibc
14  * into accepting it.
15  */
16 #if !__GLIBC_PREREQ(2, 26)
17 # include <asm/siginfo.h>
18 # define __have_siginfo_t 1
19 # define __have_sigval_t 1
20 # define __have_sigevent_t 1
21 #endif
22 
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/user.h>
28 #include <linux/prctl.h>
29 #include <linux/ptrace.h>
30 #include <linux/seccomp.h>
31 #include <pthread.h>
32 #include <semaphore.h>
33 #include <signal.h>
34 #include <stddef.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <time.h>
38 #include <limits.h>
39 #include <linux/elf.h>
40 #include <sys/uio.h>
41 #include <sys/utsname.h>
42 #include <sys/fcntl.h>
43 #include <sys/mman.h>
44 #include <sys/times.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <linux/kcmp.h>
48 #include <sys/resource.h>
49 
50 #include <unistd.h>
51 #include <sys/syscall.h>
52 #include <poll.h>
53 
54 #include "../kselftest_harness.h"
55 #include "../clone3/clone3_selftests.h"
56 
57 /* Attempt to de-conflict with the selftests tree. */
58 #ifndef SKIP
59 #define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
60 #endif
61 
62 #ifndef PR_SET_PTRACER
63 # define PR_SET_PTRACER 0x59616d61
64 #endif
65 
66 #ifndef PR_SET_NO_NEW_PRIVS
67 #define PR_SET_NO_NEW_PRIVS 38
68 #define PR_GET_NO_NEW_PRIVS 39
69 #endif
70 
71 #ifndef PR_SECCOMP_EXT
72 #define PR_SECCOMP_EXT 43
73 #endif
74 
75 #ifndef SECCOMP_EXT_ACT
76 #define SECCOMP_EXT_ACT 1
77 #endif
78 
79 #ifndef SECCOMP_EXT_ACT_TSYNC
80 #define SECCOMP_EXT_ACT_TSYNC 1
81 #endif
82 
83 #ifndef SECCOMP_MODE_STRICT
84 #define SECCOMP_MODE_STRICT 1
85 #endif
86 
87 #ifndef SECCOMP_MODE_FILTER
88 #define SECCOMP_MODE_FILTER 2
89 #endif
90 
91 #ifndef SECCOMP_RET_ALLOW
92 struct seccomp_data {
93 	int nr;
94 	__u32 arch;
95 	__u64 instruction_pointer;
96 	__u64 args[6];
97 };
98 #endif
99 
100 #ifndef SECCOMP_RET_KILL_PROCESS
101 #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
102 #define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
103 #endif
104 #ifndef SECCOMP_RET_KILL
105 #define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
106 #define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
107 #define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
108 #define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
109 #define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
110 #endif
111 #ifndef SECCOMP_RET_LOG
112 #define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
113 #endif
114 
115 #ifndef __NR_seccomp
116 # if defined(__i386__)
117 #  define __NR_seccomp 354
118 # elif defined(__x86_64__)
119 #  define __NR_seccomp 317
120 # elif defined(__arm__)
121 #  define __NR_seccomp 383
122 # elif defined(__aarch64__)
123 #  define __NR_seccomp 277
124 # elif defined(__riscv)
125 #  define __NR_seccomp 277
126 # elif defined(__csky__)
127 #  define __NR_seccomp 277
128 # elif defined(__hppa__)
129 #  define __NR_seccomp 338
130 # elif defined(__powerpc__)
131 #  define __NR_seccomp 358
132 # elif defined(__s390__)
133 #  define __NR_seccomp 348
134 # elif defined(__xtensa__)
135 #  define __NR_seccomp 337
136 # elif defined(__sh__)
137 #  define __NR_seccomp 372
138 # else
139 #  warning "seccomp syscall number unknown for this architecture"
140 #  define __NR_seccomp 0xffff
141 # endif
142 #endif
143 
144 #ifndef SECCOMP_SET_MODE_STRICT
145 #define SECCOMP_SET_MODE_STRICT 0
146 #endif
147 
148 #ifndef SECCOMP_SET_MODE_FILTER
149 #define SECCOMP_SET_MODE_FILTER 1
150 #endif
151 
152 #ifndef SECCOMP_GET_ACTION_AVAIL
153 #define SECCOMP_GET_ACTION_AVAIL 2
154 #endif
155 
156 #ifndef SECCOMP_GET_NOTIF_SIZES
157 #define SECCOMP_GET_NOTIF_SIZES 3
158 #endif
159 
160 #ifndef SECCOMP_FILTER_FLAG_TSYNC
161 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
162 #endif
163 
164 #ifndef SECCOMP_FILTER_FLAG_LOG
165 #define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
166 #endif
167 
168 #ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
169 #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
170 #endif
171 
172 #ifndef PTRACE_SECCOMP_GET_METADATA
173 #define PTRACE_SECCOMP_GET_METADATA	0x420d
174 
175 struct seccomp_metadata {
176 	__u64 filter_off;       /* Input: which filter */
177 	__u64 flags;             /* Output: filter's flags */
178 };
179 #endif
180 
181 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
182 #define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
183 #endif
184 
185 #ifndef SECCOMP_RET_USER_NOTIF
186 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U
187 
188 #define SECCOMP_IOC_MAGIC		'!'
189 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
190 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
191 #define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
192 #define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
193 
194 /* Flags for seccomp notification fd ioctl. */
195 #define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
196 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
197 						struct seccomp_notif_resp)
198 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)
199 
200 struct seccomp_notif {
201 	__u64 id;
202 	__u32 pid;
203 	__u32 flags;
204 	struct seccomp_data data;
205 };
206 
207 struct seccomp_notif_resp {
208 	__u64 id;
209 	__s64 val;
210 	__s32 error;
211 	__u32 flags;
212 };
213 
214 struct seccomp_notif_sizes {
215 	__u16 seccomp_notif;
216 	__u16 seccomp_notif_resp;
217 	__u16 seccomp_data;
218 };
219 #endif
220 
221 #ifndef SECCOMP_IOCTL_NOTIF_ADDFD
222 /* On success, the return value is the remote process's added fd number */
223 #define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3,	\
224 						struct seccomp_notif_addfd)
225 
226 /* valid flags for seccomp_notif_addfd */
227 #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
228 
229 struct seccomp_notif_addfd {
230 	__u64 id;
231 	__u32 flags;
232 	__u32 srcfd;
233 	__u32 newfd;
234 	__u32 newfd_flags;
235 };
236 #endif
237 
238 struct seccomp_notif_addfd_small {
239 	__u64 id;
240 	char weird[4];
241 };
242 #define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL	\
243 	SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
244 
245 struct seccomp_notif_addfd_big {
246 	union {
247 		struct seccomp_notif_addfd addfd;
248 		char buf[sizeof(struct seccomp_notif_addfd) + 8];
249 	};
250 };
251 #define SECCOMP_IOCTL_NOTIF_ADDFD_BIG	\
252 	SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
253 
254 #ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
255 #define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
256 #define PTRACE_EVENTMSG_SYSCALL_EXIT	2
257 #endif
258 
259 #ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
260 #define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
261 #endif
262 
263 #ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
264 #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
265 #endif
266 
#ifndef seccomp
/*
 * Minimal wrapper around the raw seccomp syscall, built only when no
 * `seccomp` macro is already defined. errno is reset before the call so
 * callers can rely on it after a failure.
 */
int seccomp(unsigned int op, unsigned int flags, void *args)
{
	long rc;

	errno = 0;
	rc = syscall(__NR_seccomp, op, flags, args);

	return rc;
}
#endif
274 
275 #if __BYTE_ORDER == __LITTLE_ENDIAN
276 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
277 #elif __BYTE_ORDER == __BIG_ENDIAN
278 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
279 #else
280 #error "wut? Unknown __BYTE_ORDER?!"
281 #endif
282 
283 #define SIBLING_EXIT_UNKILLED	0xbadbeef
284 #define SIBLING_EXIT_FAILURE	0xbadface
285 #define SIBLING_EXIT_NEWPRIVS	0xbadfeed
286 
287 static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
288 {
289 #ifdef __NR_kcmp
290 	errno = 0;
291 	return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
292 #else
293 	errno = ENOSYS;
294 	return -1;
295 #endif
296 }
297 
/*
 * Macro (not a function) so that TH_LOG reports the location where
 * filecmp() is used. A missing kcmp() syscall (ENOSYS) is logged and then
 * treated as "equal" (0); any other result is passed through unchanged.
 */
#define filecmp(pid1, pid2, fd1, fd2)	({		\
	int _ret = __filecmp(pid1, pid2, fd1, fd2);	\
							\
	if (_ret < 0 && errno == ENOSYS) {		\
		TH_LOG("kcmp() syscall missing (test is less accurate)");\
		_ret = 0;				\
	}						\
	_ret; })
310 
311 TEST(kcmp)
312 {
313 	int ret;
314 
315 	ret = __filecmp(getpid(), getpid(), 1, 1);
316 	EXPECT_EQ(ret, 0);
317 	if (ret != 0 && errno == ENOSYS)
318 		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_CHECKPOINT_RESTORE?)");
319 }
320 
321 TEST(mode_strict_support)
322 {
323 	long ret;
324 
325 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
326 	ASSERT_EQ(0, ret) {
327 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
328 	}
329 	syscall(__NR_exit, 0);
330 }
331 
332 TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
333 {
334 	long ret;
335 
336 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
337 	ASSERT_EQ(0, ret) {
338 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
339 	}
340 	syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
341 		NULL, NULL, NULL);
342 	EXPECT_FALSE(true) {
343 		TH_LOG("Unreachable!");
344 	}
345 }
346 
347 /* Note! This doesn't test no new privs behavior */
348 TEST(no_new_privs_support)
349 {
350 	long ret;
351 
352 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
353 	EXPECT_EQ(0, ret) {
354 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
355 	}
356 }
357 
358 /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
359 TEST(mode_filter_support)
360 {
361 	long ret;
362 
363 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
364 	ASSERT_EQ(0, ret) {
365 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
366 	}
367 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
368 	EXPECT_EQ(-1, ret);
369 	EXPECT_EQ(EFAULT, errno) {
370 		TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
371 	}
372 }
373 
374 TEST(mode_filter_without_nnp)
375 {
376 	struct sock_filter filter[] = {
377 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
378 	};
379 	struct sock_fprog prog = {
380 		.len = (unsigned short)ARRAY_SIZE(filter),
381 		.filter = filter,
382 	};
383 	long ret;
384 
385 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
386 	ASSERT_LE(0, ret) {
387 		TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
388 	}
389 	errno = 0;
390 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
391 	/* Succeeds with CAP_SYS_ADMIN, fails without */
392 	/* TODO(wad) check caps not euid */
393 	if (geteuid()) {
394 		EXPECT_EQ(-1, ret);
395 		EXPECT_EQ(EACCES, errno);
396 	} else {
397 		EXPECT_EQ(0, ret);
398 	}
399 }
400 
401 #define MAX_INSNS_PER_PATH 32768
402 
403 TEST(filter_size_limits)
404 {
405 	int i;
406 	int count = BPF_MAXINSNS + 1;
407 	struct sock_filter allow[] = {
408 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
409 	};
410 	struct sock_filter *filter;
411 	struct sock_fprog prog = { };
412 	long ret;
413 
414 	filter = calloc(count, sizeof(*filter));
415 	ASSERT_NE(NULL, filter);
416 
417 	for (i = 0; i < count; i++)
418 		filter[i] = allow[0];
419 
420 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
421 	ASSERT_EQ(0, ret);
422 
423 	prog.filter = filter;
424 	prog.len = count;
425 
426 	/* Too many filter instructions in a single filter. */
427 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
428 	ASSERT_NE(0, ret) {
429 		TH_LOG("Installing %d insn filter was allowed", prog.len);
430 	}
431 
432 	/* One less is okay, though. */
433 	prog.len -= 1;
434 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
435 	ASSERT_EQ(0, ret) {
436 		TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
437 	}
438 }
439 
440 TEST(filter_chain_limits)
441 {
442 	int i;
443 	int count = BPF_MAXINSNS;
444 	struct sock_filter allow[] = {
445 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
446 	};
447 	struct sock_filter *filter;
448 	struct sock_fprog prog = { };
449 	long ret;
450 
451 	filter = calloc(count, sizeof(*filter));
452 	ASSERT_NE(NULL, filter);
453 
454 	for (i = 0; i < count; i++)
455 		filter[i] = allow[0];
456 
457 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
458 	ASSERT_EQ(0, ret);
459 
460 	prog.filter = filter;
461 	prog.len = 1;
462 
463 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
464 	ASSERT_EQ(0, ret);
465 
466 	prog.len = count;
467 
468 	/* Too many total filter instructions. */
469 	for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
470 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
471 		if (ret != 0)
472 			break;
473 	}
474 	ASSERT_NE(0, ret) {
475 		TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
476 		       i, count, i * (count + 4));
477 	}
478 }
479 
480 TEST(mode_filter_cannot_move_to_strict)
481 {
482 	struct sock_filter filter[] = {
483 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
484 	};
485 	struct sock_fprog prog = {
486 		.len = (unsigned short)ARRAY_SIZE(filter),
487 		.filter = filter,
488 	};
489 	long ret;
490 
491 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
492 	ASSERT_EQ(0, ret);
493 
494 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
495 	ASSERT_EQ(0, ret);
496 
497 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
498 	EXPECT_EQ(-1, ret);
499 	EXPECT_EQ(EINVAL, errno);
500 }
501 
502 
503 TEST(mode_filter_get_seccomp)
504 {
505 	struct sock_filter filter[] = {
506 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
507 	};
508 	struct sock_fprog prog = {
509 		.len = (unsigned short)ARRAY_SIZE(filter),
510 		.filter = filter,
511 	};
512 	long ret;
513 
514 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
515 	ASSERT_EQ(0, ret);
516 
517 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
518 	EXPECT_EQ(0, ret);
519 
520 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
521 	ASSERT_EQ(0, ret);
522 
523 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
524 	EXPECT_EQ(2, ret);
525 }
526 
527 
528 TEST(ALLOW_all)
529 {
530 	struct sock_filter filter[] = {
531 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
532 	};
533 	struct sock_fprog prog = {
534 		.len = (unsigned short)ARRAY_SIZE(filter),
535 		.filter = filter,
536 	};
537 	long ret;
538 
539 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
540 	ASSERT_EQ(0, ret);
541 
542 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
543 	ASSERT_EQ(0, ret);
544 }
545 
546 TEST(empty_prog)
547 {
548 	struct sock_filter filter[] = {
549 	};
550 	struct sock_fprog prog = {
551 		.len = (unsigned short)ARRAY_SIZE(filter),
552 		.filter = filter,
553 	};
554 	long ret;
555 
556 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
557 	ASSERT_EQ(0, ret);
558 
559 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
560 	EXPECT_EQ(-1, ret);
561 	EXPECT_EQ(EINVAL, errno);
562 }
563 
564 TEST(log_all)
565 {
566 	struct sock_filter filter[] = {
567 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
568 	};
569 	struct sock_fprog prog = {
570 		.len = (unsigned short)ARRAY_SIZE(filter),
571 		.filter = filter,
572 	};
573 	long ret;
574 	pid_t parent = getppid();
575 
576 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
577 	ASSERT_EQ(0, ret);
578 
579 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
580 	ASSERT_EQ(0, ret);
581 
582 	/* getppid() should succeed and be logged (no check for logging) */
583 	EXPECT_EQ(parent, syscall(__NR_getppid));
584 }
585 
586 TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
587 {
588 	struct sock_filter filter[] = {
589 		BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
590 	};
591 	struct sock_fprog prog = {
592 		.len = (unsigned short)ARRAY_SIZE(filter),
593 		.filter = filter,
594 	};
595 	long ret;
596 
597 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
598 	ASSERT_EQ(0, ret);
599 
600 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
601 	ASSERT_EQ(0, ret);
602 	EXPECT_EQ(0, syscall(__NR_getpid)) {
603 		TH_LOG("getpid() shouldn't ever return");
604 	}
605 }
606 
607 /* return code >= 0x80000000 is unused. */
608 TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
609 {
610 	struct sock_filter filter[] = {
611 		BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
612 	};
613 	struct sock_fprog prog = {
614 		.len = (unsigned short)ARRAY_SIZE(filter),
615 		.filter = filter,
616 	};
617 	long ret;
618 
619 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
620 	ASSERT_EQ(0, ret);
621 
622 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
623 	ASSERT_EQ(0, ret);
624 	EXPECT_EQ(0, syscall(__NR_getpid)) {
625 		TH_LOG("getpid() shouldn't ever return");
626 	}
627 }
628 
629 TEST_SIGNAL(KILL_all, SIGSYS)
630 {
631 	struct sock_filter filter[] = {
632 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
633 	};
634 	struct sock_fprog prog = {
635 		.len = (unsigned short)ARRAY_SIZE(filter),
636 		.filter = filter,
637 	};
638 	long ret;
639 
640 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
641 	ASSERT_EQ(0, ret);
642 
643 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
644 	ASSERT_EQ(0, ret);
645 }
646 
647 TEST_SIGNAL(KILL_one, SIGSYS)
648 {
649 	struct sock_filter filter[] = {
650 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
651 			offsetof(struct seccomp_data, nr)),
652 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
653 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
654 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
655 	};
656 	struct sock_fprog prog = {
657 		.len = (unsigned short)ARRAY_SIZE(filter),
658 		.filter = filter,
659 	};
660 	long ret;
661 	pid_t parent = getppid();
662 
663 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
664 	ASSERT_EQ(0, ret);
665 
666 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
667 	ASSERT_EQ(0, ret);
668 
669 	EXPECT_EQ(parent, syscall(__NR_getppid));
670 	/* getpid() should never return. */
671 	EXPECT_EQ(0, syscall(__NR_getpid));
672 }
673 
674 TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
675 {
676 	void *fatal_address;
677 	struct sock_filter filter[] = {
678 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
679 			offsetof(struct seccomp_data, nr)),
680 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
681 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
682 		/* Only both with lower 32-bit for now. */
683 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
684 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
685 			(unsigned long)&fatal_address, 0, 1),
686 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
687 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
688 	};
689 	struct sock_fprog prog = {
690 		.len = (unsigned short)ARRAY_SIZE(filter),
691 		.filter = filter,
692 	};
693 	long ret;
694 	pid_t parent = getppid();
695 	struct tms timebuf;
696 	clock_t clock = times(&timebuf);
697 
698 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
699 	ASSERT_EQ(0, ret);
700 
701 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
702 	ASSERT_EQ(0, ret);
703 
704 	EXPECT_EQ(parent, syscall(__NR_getppid));
705 	EXPECT_LE(clock, syscall(__NR_times, &timebuf));
706 	/* times() should never return. */
707 	EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
708 }
709 
710 TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
711 {
712 #ifndef __NR_mmap2
713 	int sysno = __NR_mmap;
714 #else
715 	int sysno = __NR_mmap2;
716 #endif
717 	struct sock_filter filter[] = {
718 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
719 			offsetof(struct seccomp_data, nr)),
720 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
721 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
722 		/* Only both with lower 32-bit for now. */
723 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
724 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
725 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
726 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
727 	};
728 	struct sock_fprog prog = {
729 		.len = (unsigned short)ARRAY_SIZE(filter),
730 		.filter = filter,
731 	};
732 	long ret;
733 	pid_t parent = getppid();
734 	int fd;
735 	void *map1, *map2;
736 	int page_size = sysconf(_SC_PAGESIZE);
737 
738 	ASSERT_LT(0, page_size);
739 
740 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
741 	ASSERT_EQ(0, ret);
742 
743 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
744 	ASSERT_EQ(0, ret);
745 
746 	fd = open("/dev/zero", O_RDONLY);
747 	ASSERT_NE(-1, fd);
748 
749 	EXPECT_EQ(parent, syscall(__NR_getppid));
750 	map1 = (void *)syscall(sysno,
751 		NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
752 	EXPECT_NE(MAP_FAILED, map1);
753 	/* mmap2() should never return. */
754 	map2 = (void *)syscall(sysno,
755 		 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
756 	EXPECT_EQ(MAP_FAILED, map2);
757 
758 	/* The test failed, so clean up the resources. */
759 	munmap(map1, page_size);
760 	munmap(map2, page_size);
761 	close(fd);
762 }
763 
764 /* This is a thread task to die via seccomp filter violation. */
765 void *kill_thread(void *data)
766 {
767 	bool die = (bool)data;
768 
769 	if (die) {
770 		prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
771 		return (void *)SIBLING_EXIT_FAILURE;
772 	}
773 
774 	return (void *)SIBLING_EXIT_UNKILLED;
775 }
776 
777 /* Prepare a thread that will kill itself or both of us. */
778 void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
779 {
780 	pthread_t thread;
781 	void *status;
782 	/* Kill only when calling __NR_prctl. */
783 	struct sock_filter filter_thread[] = {
784 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
785 			offsetof(struct seccomp_data, nr)),
786 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
787 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
788 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
789 	};
790 	struct sock_fprog prog_thread = {
791 		.len = (unsigned short)ARRAY_SIZE(filter_thread),
792 		.filter = filter_thread,
793 	};
794 	struct sock_filter filter_process[] = {
795 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
796 			offsetof(struct seccomp_data, nr)),
797 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
798 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS),
799 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
800 	};
801 	struct sock_fprog prog_process = {
802 		.len = (unsigned short)ARRAY_SIZE(filter_process),
803 		.filter = filter_process,
804 	};
805 
806 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
807 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
808 	}
809 
810 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
811 			     kill_process ? &prog_process : &prog_thread));
812 
813 	/*
814 	 * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
815 	 * flag cannot be downgraded by a new filter.
816 	 */
817 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
818 
819 	/* Start a thread that will exit immediately. */
820 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
821 	ASSERT_EQ(0, pthread_join(thread, &status));
822 	ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
823 
824 	/* Start a thread that will die immediately. */
825 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
826 	ASSERT_EQ(0, pthread_join(thread, &status));
827 	ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
828 
829 	/*
830 	 * If we get here, only the spawned thread died. Let the parent know
831 	 * the whole process didn't die (i.e. this thread, the spawner,
832 	 * stayed running).
833 	 */
834 	exit(42);
835 }
836 
837 TEST(KILL_thread)
838 {
839 	int status;
840 	pid_t child_pid;
841 
842 	child_pid = fork();
843 	ASSERT_LE(0, child_pid);
844 	if (child_pid == 0) {
845 		kill_thread_or_group(_metadata, false);
846 		_exit(38);
847 	}
848 
849 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
850 
851 	/* If only the thread was killed, we'll see exit 42. */
852 	ASSERT_TRUE(WIFEXITED(status));
853 	ASSERT_EQ(42, WEXITSTATUS(status));
854 }
855 
856 TEST(KILL_process)
857 {
858 	int status;
859 	pid_t child_pid;
860 
861 	child_pid = fork();
862 	ASSERT_LE(0, child_pid);
863 	if (child_pid == 0) {
864 		kill_thread_or_group(_metadata, true);
865 		_exit(38);
866 	}
867 
868 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
869 
870 	/* If the entire process was killed, we'll see SIGSYS. */
871 	ASSERT_TRUE(WIFSIGNALED(status));
872 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
873 }
874 
875 /* TODO(wad) add 64-bit versus 32-bit arg tests. */
876 TEST(arg_out_of_range)
877 {
878 	struct sock_filter filter[] = {
879 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
880 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
881 	};
882 	struct sock_fprog prog = {
883 		.len = (unsigned short)ARRAY_SIZE(filter),
884 		.filter = filter,
885 	};
886 	long ret;
887 
888 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
889 	ASSERT_EQ(0, ret);
890 
891 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
892 	EXPECT_EQ(-1, ret);
893 	EXPECT_EQ(EINVAL, errno);
894 }
895 
/*
 * Declare a filter `_read_filter_<name>` and its program `prog_<name>`:
 * __NR_read returns SECCOMP_RET_ERRNO combined with _errno_val, every other
 * syscall is allowed.
 */
#define ERRNO_FILTER(name, _errno_val)					\
	struct sock_filter _read_filter_##name[] = {			\
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,				\
			offsetof(struct seccomp_data, nr)),		\
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),	\
		BPF_STMT(BPF_RET|BPF_K,					\
			SECCOMP_RET_ERRNO | (_errno_val)),		\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),		\
	};								\
	struct sock_fprog prog_##name = {				\
		.filter = _read_filter_##name,				\
		.len = (unsigned short)ARRAY_SIZE(_read_filter_##name),	\
	}
908 
909 /* Make sure basic errno values are correctly passed through a filter. */
910 TEST(ERRNO_valid)
911 {
912 	ERRNO_FILTER(valid, E2BIG);
913 	long ret;
914 	pid_t parent = getppid();
915 
916 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
917 	ASSERT_EQ(0, ret);
918 
919 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
920 	ASSERT_EQ(0, ret);
921 
922 	EXPECT_EQ(parent, syscall(__NR_getppid));
923 	EXPECT_EQ(-1, read(0, NULL, 0));
924 	EXPECT_EQ(E2BIG, errno);
925 }
926 
927 /* Make sure an errno of zero is correctly handled by the arch code. */
928 TEST(ERRNO_zero)
929 {
930 	ERRNO_FILTER(zero, 0);
931 	long ret;
932 	pid_t parent = getppid();
933 
934 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
935 	ASSERT_EQ(0, ret);
936 
937 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
938 	ASSERT_EQ(0, ret);
939 
940 	EXPECT_EQ(parent, syscall(__NR_getppid));
941 	/* "errno" of 0 is ok. */
942 	EXPECT_EQ(0, read(0, NULL, 0));
943 }
944 
945 /*
946  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
947  * This tests that the errno value gets capped correctly, fixed by
948  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
949  */
950 TEST(ERRNO_capped)
951 {
952 	ERRNO_FILTER(capped, 4096);
953 	long ret;
954 	pid_t parent = getppid();
955 
956 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
957 	ASSERT_EQ(0, ret);
958 
959 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
960 	ASSERT_EQ(0, ret);
961 
962 	EXPECT_EQ(parent, syscall(__NR_getppid));
963 	EXPECT_EQ(-1, read(0, NULL, 0));
964 	EXPECT_EQ(4095, errno);
965 }
966 
967 /*
968  * Filters are processed in reverse order: last applied is executed first.
969  * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
970  * SECCOMP_RET_DATA mask results will follow the most recently applied
971  * matching filter return (and not the lowest or highest value).
972  */
973 TEST(ERRNO_order)
974 {
975 	ERRNO_FILTER(first,  11);
976 	ERRNO_FILTER(second, 13);
977 	ERRNO_FILTER(third,  12);
978 	long ret;
979 	pid_t parent = getppid();
980 
981 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
982 	ASSERT_EQ(0, ret);
983 
984 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
985 	ASSERT_EQ(0, ret);
986 
987 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
988 	ASSERT_EQ(0, ret);
989 
990 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
991 	ASSERT_EQ(0, ret);
992 
993 	EXPECT_EQ(parent, syscall(__NR_getppid));
994 	EXPECT_EQ(-1, read(0, NULL, 0));
995 	EXPECT_EQ(12, errno);
996 }
997 
998 FIXTURE(TRAP) {
999 	struct sock_fprog prog;
1000 };
1001 
1002 FIXTURE_SETUP(TRAP)
1003 {
1004 	struct sock_filter filter[] = {
1005 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1006 			offsetof(struct seccomp_data, nr)),
1007 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1008 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1009 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1010 	};
1011 
1012 	memset(&self->prog, 0, sizeof(self->prog));
1013 	self->prog.filter = malloc(sizeof(filter));
1014 	ASSERT_NE(NULL, self->prog.filter);
1015 	memcpy(self->prog.filter, filter, sizeof(filter));
1016 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1017 }
1018 
1019 FIXTURE_TEARDOWN(TRAP)
1020 {
1021 	if (self->prog.filter)
1022 		free(self->prog.filter);
1023 }
1024 
1025 TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
1026 {
1027 	long ret;
1028 
1029 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1030 	ASSERT_EQ(0, ret);
1031 
1032 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1033 	ASSERT_EQ(0, ret);
1034 	syscall(__NR_getpid);
1035 }
1036 
1037 /* Ensure that SIGSYS overrides SIG_IGN */
1038 TEST_F_SIGNAL(TRAP, ign, SIGSYS)
1039 {
1040 	long ret;
1041 
1042 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1043 	ASSERT_EQ(0, ret);
1044 
1045 	signal(SIGSYS, SIG_IGN);
1046 
1047 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1048 	ASSERT_EQ(0, ret);
1049 	syscall(__NR_getpid);
1050 }
1051 
/* Last siginfo and signal number recorded by TRAP_action(). */
static siginfo_t TRAP_info;
static volatile int TRAP_nr;

/*
 * SA_SIGINFO-style handler: stores a copy of *info in TRAP_info and the
 * signal number in TRAP_nr. The context argument is not used.
 */
static void TRAP_action(int nr, siginfo_t *info, void *void_context)
{
	(void)void_context;

	memcpy(&TRAP_info, info, sizeof(*info));
	TRAP_nr = nr;
}
1059 
1060 TEST_F(TRAP, handler)
1061 {
1062 	int ret, test;
1063 	struct sigaction act;
1064 	sigset_t mask;
1065 
1066 	memset(&act, 0, sizeof(act));
1067 	sigemptyset(&mask);
1068 	sigaddset(&mask, SIGSYS);
1069 
1070 	act.sa_sigaction = &TRAP_action;
1071 	act.sa_flags = SA_SIGINFO;
1072 	ret = sigaction(SIGSYS, &act, NULL);
1073 	ASSERT_EQ(0, ret) {
1074 		TH_LOG("sigaction failed");
1075 	}
1076 	ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
1077 	ASSERT_EQ(0, ret) {
1078 		TH_LOG("sigprocmask failed");
1079 	}
1080 
1081 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1082 	ASSERT_EQ(0, ret);
1083 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1084 	ASSERT_EQ(0, ret);
1085 	TRAP_nr = 0;
1086 	memset(&TRAP_info, 0, sizeof(TRAP_info));
1087 	/* Expect the registers to be rolled back. (nr = error) may vary
1088 	 * based on arch. */
1089 	ret = syscall(__NR_getpid);
1090 	/* Silence gcc warning about volatile. */
1091 	test = TRAP_nr;
1092 	EXPECT_EQ(SIGSYS, test);
1093 	struct local_sigsys {
1094 		void *_call_addr;	/* calling user insn */
1095 		int _syscall;		/* triggering system call number */
1096 		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
1097 	} *sigsys = (struct local_sigsys *)
1098 #ifdef si_syscall
1099 		&(TRAP_info.si_call_addr);
1100 #else
1101 		&TRAP_info.si_pid;
1102 #endif
1103 	EXPECT_EQ(__NR_getpid, sigsys->_syscall);
1104 	/* Make sure arch is non-zero. */
1105 	EXPECT_NE(0, sigsys->_arch);
1106 	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
1107 }
1108 
1109 FIXTURE(precedence) {
1110 	struct sock_fprog allow;
1111 	struct sock_fprog log;
1112 	struct sock_fprog trace;
1113 	struct sock_fprog error;
1114 	struct sock_fprog trap;
1115 	struct sock_fprog kill;
1116 };
1117 
1118 FIXTURE_SETUP(precedence)
1119 {
1120 	struct sock_filter allow_insns[] = {
1121 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1122 	};
1123 	struct sock_filter log_insns[] = {
1124 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1125 			offsetof(struct seccomp_data, nr)),
1126 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1127 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1128 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1129 	};
1130 	struct sock_filter trace_insns[] = {
1131 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1132 			offsetof(struct seccomp_data, nr)),
1133 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1134 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1135 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1136 	};
1137 	struct sock_filter error_insns[] = {
1138 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1139 			offsetof(struct seccomp_data, nr)),
1140 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1141 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1142 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1143 	};
1144 	struct sock_filter trap_insns[] = {
1145 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1146 			offsetof(struct seccomp_data, nr)),
1147 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1148 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1149 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1150 	};
1151 	struct sock_filter kill_insns[] = {
1152 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1153 			offsetof(struct seccomp_data, nr)),
1154 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1155 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1156 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1157 	};
1158 
1159 	memset(self, 0, sizeof(*self));
1160 #define FILTER_ALLOC(_x) \
1161 	self->_x.filter = malloc(sizeof(_x##_insns)); \
1162 	ASSERT_NE(NULL, self->_x.filter); \
1163 	memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1164 	self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1165 	FILTER_ALLOC(allow);
1166 	FILTER_ALLOC(log);
1167 	FILTER_ALLOC(trace);
1168 	FILTER_ALLOC(error);
1169 	FILTER_ALLOC(trap);
1170 	FILTER_ALLOC(kill);
1171 }
1172 
1173 FIXTURE_TEARDOWN(precedence)
1174 {
1175 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1176 	FILTER_FREE(allow);
1177 	FILTER_FREE(log);
1178 	FILTER_FREE(trace);
1179 	FILTER_FREE(error);
1180 	FILTER_FREE(trap);
1181 	FILTER_FREE(kill);
1182 }
1183 
1184 TEST_F(precedence, allow_ok)
1185 {
1186 	pid_t parent, res = 0;
1187 	long ret;
1188 
1189 	parent = getppid();
1190 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1191 	ASSERT_EQ(0, ret);
1192 
1193 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1194 	ASSERT_EQ(0, ret);
1195 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1196 	ASSERT_EQ(0, ret);
1197 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1198 	ASSERT_EQ(0, ret);
1199 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1200 	ASSERT_EQ(0, ret);
1201 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1202 	ASSERT_EQ(0, ret);
1203 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1204 	ASSERT_EQ(0, ret);
1205 	/* Should work just fine. */
1206 	res = syscall(__NR_getppid);
1207 	EXPECT_EQ(parent, res);
1208 }
1209 
1210 TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1211 {
1212 	pid_t parent, res = 0;
1213 	long ret;
1214 
1215 	parent = getppid();
1216 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1217 	ASSERT_EQ(0, ret);
1218 
1219 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1220 	ASSERT_EQ(0, ret);
1221 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1222 	ASSERT_EQ(0, ret);
1223 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1224 	ASSERT_EQ(0, ret);
1225 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1226 	ASSERT_EQ(0, ret);
1227 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1228 	ASSERT_EQ(0, ret);
1229 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1230 	ASSERT_EQ(0, ret);
1231 	/* Should work just fine. */
1232 	res = syscall(__NR_getppid);
1233 	EXPECT_EQ(parent, res);
1234 	/* getpid() should never return. */
1235 	res = syscall(__NR_getpid);
1236 	EXPECT_EQ(0, res);
1237 }
1238 
1239 TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1240 {
1241 	pid_t parent;
1242 	long ret;
1243 
1244 	parent = getppid();
1245 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1246 	ASSERT_EQ(0, ret);
1247 
1248 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1249 	ASSERT_EQ(0, ret);
1250 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1251 	ASSERT_EQ(0, ret);
1252 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1253 	ASSERT_EQ(0, ret);
1254 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1255 	ASSERT_EQ(0, ret);
1256 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1257 	ASSERT_EQ(0, ret);
1258 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1259 	ASSERT_EQ(0, ret);
1260 	/* Should work just fine. */
1261 	EXPECT_EQ(parent, syscall(__NR_getppid));
1262 	/* getpid() should never return. */
1263 	EXPECT_EQ(0, syscall(__NR_getpid));
1264 }
1265 
1266 TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1267 {
1268 	pid_t parent;
1269 	long ret;
1270 
1271 	parent = getppid();
1272 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1273 	ASSERT_EQ(0, ret);
1274 
1275 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1276 	ASSERT_EQ(0, ret);
1277 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1278 	ASSERT_EQ(0, ret);
1279 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1280 	ASSERT_EQ(0, ret);
1281 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1282 	ASSERT_EQ(0, ret);
1283 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1284 	ASSERT_EQ(0, ret);
1285 	/* Should work just fine. */
1286 	EXPECT_EQ(parent, syscall(__NR_getppid));
1287 	/* getpid() should never return. */
1288 	EXPECT_EQ(0, syscall(__NR_getpid));
1289 }
1290 
1291 TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1292 {
1293 	pid_t parent;
1294 	long ret;
1295 
1296 	parent = getppid();
1297 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1298 	ASSERT_EQ(0, ret);
1299 
1300 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1301 	ASSERT_EQ(0, ret);
1302 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1303 	ASSERT_EQ(0, ret);
1304 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1305 	ASSERT_EQ(0, ret);
1306 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1307 	ASSERT_EQ(0, ret);
1308 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1309 	ASSERT_EQ(0, ret);
1310 	/* Should work just fine. */
1311 	EXPECT_EQ(parent, syscall(__NR_getppid));
1312 	/* getpid() should never return. */
1313 	EXPECT_EQ(0, syscall(__NR_getpid));
1314 }
1315 
1316 TEST_F(precedence, errno_is_third)
1317 {
1318 	pid_t parent;
1319 	long ret;
1320 
1321 	parent = getppid();
1322 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1323 	ASSERT_EQ(0, ret);
1324 
1325 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1326 	ASSERT_EQ(0, ret);
1327 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1328 	ASSERT_EQ(0, ret);
1329 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1330 	ASSERT_EQ(0, ret);
1331 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1332 	ASSERT_EQ(0, ret);
1333 	/* Should work just fine. */
1334 	EXPECT_EQ(parent, syscall(__NR_getppid));
1335 	EXPECT_EQ(0, syscall(__NR_getpid));
1336 }
1337 
1338 TEST_F(precedence, errno_is_third_in_any_order)
1339 {
1340 	pid_t parent;
1341 	long ret;
1342 
1343 	parent = getppid();
1344 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1345 	ASSERT_EQ(0, ret);
1346 
1347 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1348 	ASSERT_EQ(0, ret);
1349 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1350 	ASSERT_EQ(0, ret);
1351 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1352 	ASSERT_EQ(0, ret);
1353 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1354 	ASSERT_EQ(0, ret);
1355 	/* Should work just fine. */
1356 	EXPECT_EQ(parent, syscall(__NR_getppid));
1357 	EXPECT_EQ(0, syscall(__NR_getpid));
1358 }
1359 
1360 TEST_F(precedence, trace_is_fourth)
1361 {
1362 	pid_t parent;
1363 	long ret;
1364 
1365 	parent = getppid();
1366 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1367 	ASSERT_EQ(0, ret);
1368 
1369 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1370 	ASSERT_EQ(0, ret);
1371 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1372 	ASSERT_EQ(0, ret);
1373 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1374 	ASSERT_EQ(0, ret);
1375 	/* Should work just fine. */
1376 	EXPECT_EQ(parent, syscall(__NR_getppid));
1377 	/* No ptracer */
1378 	EXPECT_EQ(-1, syscall(__NR_getpid));
1379 }
1380 
1381 TEST_F(precedence, trace_is_fourth_in_any_order)
1382 {
1383 	pid_t parent;
1384 	long ret;
1385 
1386 	parent = getppid();
1387 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1388 	ASSERT_EQ(0, ret);
1389 
1390 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1391 	ASSERT_EQ(0, ret);
1392 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1393 	ASSERT_EQ(0, ret);
1394 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1395 	ASSERT_EQ(0, ret);
1396 	/* Should work just fine. */
1397 	EXPECT_EQ(parent, syscall(__NR_getppid));
1398 	/* No ptracer */
1399 	EXPECT_EQ(-1, syscall(__NR_getpid));
1400 }
1401 
1402 TEST_F(precedence, log_is_fifth)
1403 {
1404 	pid_t mypid, parent;
1405 	long ret;
1406 
1407 	mypid = getpid();
1408 	parent = getppid();
1409 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1410 	ASSERT_EQ(0, ret);
1411 
1412 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1413 	ASSERT_EQ(0, ret);
1414 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1415 	ASSERT_EQ(0, ret);
1416 	/* Should work just fine. */
1417 	EXPECT_EQ(parent, syscall(__NR_getppid));
1418 	/* Should also work just fine */
1419 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1420 }
1421 
1422 TEST_F(precedence, log_is_fifth_in_any_order)
1423 {
1424 	pid_t mypid, parent;
1425 	long ret;
1426 
1427 	mypid = getpid();
1428 	parent = getppid();
1429 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1430 	ASSERT_EQ(0, ret);
1431 
1432 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1433 	ASSERT_EQ(0, ret);
1434 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1435 	ASSERT_EQ(0, ret);
1436 	/* Should work just fine. */
1437 	EXPECT_EQ(parent, syscall(__NR_getppid));
1438 	/* Should also work just fine */
1439 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1440 }
1441 
/* Fallback for headers that do not provide the seccomp ptrace option. */
#ifndef PTRACE_O_TRACESECCOMP
#define PTRACE_O_TRACESECCOMP	0x00000080
#endif

/* Catch the Ubuntu 12.04 value error. */
#if PTRACE_EVENT_SECCOMP != 7
#undef PTRACE_EVENT_SECCOMP
#endif

#ifndef PTRACE_EVENT_SECCOMP
#define PTRACE_EVENT_SECCOMP 7
#endif

/*
 * True when bits 16 and up of a wait status equal PTRACE_EVENT_SECCOMP.
 * The argument is parenthesized so that expressions such as "a | b" are
 * shifted as a whole rather than having only their last operand shifted.
 */
#define IS_SECCOMP_EVENT(status) (((status) >> 16) == PTRACE_EVENT_SECCOMP)
/*
 * Run flag for the tracer loop in start_tracer(). It is cleared from a
 * signal handler, so it is declared volatile sig_atomic_t: the object type
 * C guarantees can be written from a handler and re-read afterwards by the
 * interrupted code.
 */
volatile sig_atomic_t tracer_running;

/* Signal handler: ask the tracer loop to stop. @sig is unused. */
void tracer_stop(int sig)
{
	tracer_running = false;
}
1461 
/*
 * Callback invoked by start_tracer() for each stop of the tracee that is
 * not an exit or a death by signal.
 *
 * @_metadata: test harness state, used by the EXPECT and ASSERT macros
 * @tracee:    pid of the traced process
 * @status:    wait status that reported the stop
 * @args:      opaque pointer handed through from the fixture setup
 */
typedef void tracer_func_t(struct __test_metadata *_metadata,
			   pid_t tracee, int status, void *args);
1464 
/*
 * Body of the tracer process: attach to @tracee, release it, then service
 * its stops until told to shut down.
 *
 * @_metadata:      test harness state
 * @fd:             write end of the sync pipe; one byte is written and the
 *                  descriptor closed once tracing is set up
 * @tracee:         pid to attach to
 * @tracer_func:    callback run for every reported stop
 * @args:           opaque argument forwarded to @tracer_func
 * @ptrace_syscall: true to trace with PTRACE_SYSCALL and
 *                  PTRACE_O_TRACESYSGOOD, false to trace seccomp events with
 *                  PTRACE_CONT and PTRACE_O_TRACESECCOMP
 *
 * Returns to the caller when the tracee exits or is killed. If the loop
 * ends because tracer_running was cleared, the process exits directly with
 * a status derived from _metadata->passed.
 */
void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
	    tracer_func_t tracer_func, void *args, bool ptrace_syscall)
{
	int ret = -1;
	struct sigaction action = {
		.sa_handler = tracer_stop,
	};

	/* Allow external shutdown. */
	tracer_running = true;
	ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));

	/*
	 * Keep retrying the attach until it succeeds or fails with EINVAL.
	 * NOTE(review): presumably this waits for the tracee to grant
	 * permission via PR_SET_PTRACER - confirm.
	 */
	errno = 0;
	while (ret == -1 && errno != EINVAL)
		ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
	ASSERT_EQ(0, ret) {
		kill(tracee, SIGKILL);
	}
	/* Wait for attach stop */
	wait(NULL);

	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
						      PTRACE_O_TRACESYSGOOD :
						      PTRACE_O_TRACESECCOMP);
	/* Note: the same message is logged for both option values. */
	ASSERT_EQ(0, ret) {
		TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
		kill(tracee, SIGKILL);
	}
	/* Resume the tracee from its attach stop. */
	ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
		     tracee, NULL, 0);
	ASSERT_EQ(0, ret);

	/* Unblock the tracee */
	ASSERT_EQ(1, write(fd, "A", 1));
	ASSERT_EQ(0, close(fd));

	/* Run until we're shut down. Must assert to stop execution. */
	while (tracer_running) {
		int status;

		/* Ignore anything wait() reports that is not the tracee. */
		if (wait(&status) != tracee)
			continue;
		if (WIFSIGNALED(status) || WIFEXITED(status))
			/* Child is dead. Time to go. */
			return;

		/* Check if this is a seccomp event. */
		ASSERT_EQ(!ptrace_syscall, IS_SECCOMP_EVENT(status));

		tracer_func(_metadata, tracee, status, args);

		/* Let the tracee run on to its next stop. */
		ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
			     tracee, NULL, 0);
		ASSERT_EQ(0, ret);
	}
	/* Directly report the status of our test harness results. */
	syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
}
1523 
1524 /* Common tracer setup/teardown functions. */
/*
 * Signal handler that deliberately does nothing; setup_trace_fixture()
 * installs it for SIGALRM.
 */
void cont_handler(int num)
{
	(void)num;
}
/*
 * Fork a tracer process for the calling process and wait until it is ready.
 *
 * @_metadata:      test harness state
 * @func:           callback the tracer runs on every stop of the tracee
 * @args:           opaque argument forwarded to @func
 * @ptrace_syscall: tracing mode, passed through to start_tracer()
 *
 * Returns the pid of the tracer process (in the parent only; the child
 * never returns from here).
 */
pid_t setup_trace_fixture(struct __test_metadata *_metadata,
			  tracer_func_t func, void *args, bool ptrace_syscall)
{
	char sync;
	int pipefd[2];
	pid_t tracer_pid;
	pid_t tracee = getpid();

	/* Setup a pipe for clean synchronization. */
	ASSERT_EQ(0, pipe(pipefd));

	/* Fork a child which we'll promote to tracer */
	tracer_pid = fork();
	ASSERT_LE(0, tracer_pid);
	/* Runs in both parent and child: SIGALRM gets a no-op handler. */
	signal(SIGALRM, cont_handler);
	if (tracer_pid == 0) {
		/* Child: keep only the write end and become the tracer. */
		close(pipefd[0]);
		start_tracer(_metadata, pipefd[1], tracee, func, args,
			     ptrace_syscall);
		/* Reached when start_tracer() returns (tracee is gone). */
		syscall(__NR_exit, 0);
	}
	/* Parent: keep only the read end. */
	close(pipefd[1]);
	/* Return value is not checked. */
	prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
	/*
	 * Block until the tracer writes its sync byte or closes the pipe.
	 * The result of the read is not checked.
	 */
	read(pipefd[0], &sync, 1);
	close(pipefd[0]);

	return tracer_pid;
}
1555 
1556 void teardown_trace_fixture(struct __test_metadata *_metadata,
1557 			    pid_t tracer)
1558 {
1559 	if (tracer) {
1560 		int status;
1561 		/*
1562 		 * Extract the exit code from the other process and
1563 		 * adopt it for ourselves in case its asserts failed.
1564 		 */
1565 		ASSERT_EQ(0, kill(tracer, SIGUSR1));
1566 		ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1567 		if (WEXITSTATUS(status))
1568 			_metadata->passed = 0;
1569 	}
1570 }
1571 
1572 /* "poke" tracer arguments and function. */
struct tracer_args_poke_t {
	/* Address in the tracee that tracer_poke() overwrites with 0x1001. */
	unsigned long poke_addr;
};
1576 
1577 void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1578 		 void *args)
1579 {
1580 	int ret;
1581 	unsigned long msg;
1582 	struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1583 
1584 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1585 	EXPECT_EQ(0, ret);
1586 	/* If this fails, don't try to recover. */
1587 	ASSERT_EQ(0x1001, msg) {
1588 		kill(tracee, SIGKILL);
1589 	}
1590 	/*
1591 	 * Poke in the message.
1592 	 * Registers are not touched to try to keep this relatively arch
1593 	 * agnostic.
1594 	 */
1595 	ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1596 	EXPECT_EQ(0, ret);
1597 }
1598 
/* State for the TRACE_poke tests. */
FIXTURE(TRACE_poke) {
	struct sock_fprog prog;		/* filter: SECCOMP_RET_TRACE on read */
	pid_t tracer;			/* pid of the forked tracer process */
	long poked;			/* word the tracer overwrites with 0x1001 */
	struct tracer_args_poke_t tracer_args;	/* carries the address of poked */
};
1605 
1606 FIXTURE_SETUP(TRACE_poke)
1607 {
1608 	struct sock_filter filter[] = {
1609 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1610 			offsetof(struct seccomp_data, nr)),
1611 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1612 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1613 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1614 	};
1615 
1616 	self->poked = 0;
1617 	memset(&self->prog, 0, sizeof(self->prog));
1618 	self->prog.filter = malloc(sizeof(filter));
1619 	ASSERT_NE(NULL, self->prog.filter);
1620 	memcpy(self->prog.filter, filter, sizeof(filter));
1621 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1622 
1623 	/* Set up tracer args. */
1624 	self->tracer_args.poke_addr = (unsigned long)&self->poked;
1625 
1626 	/* Launch tracer. */
1627 	self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1628 					   &self->tracer_args, false);
1629 }
1630 
1631 FIXTURE_TEARDOWN(TRACE_poke)
1632 {
1633 	teardown_trace_fixture(_metadata, self->tracer);
1634 	if (self->prog.filter)
1635 		free(self->prog.filter);
1636 }
1637 
/*
 * A read() issued under the fixture's filter must reach the tracer, which
 * overwrites self->poked with 0x1001.
 */
TEST_F(TRACE_poke, read_has_side_effects)
{
	ssize_t ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
	ASSERT_EQ(0, ret);

	/* Nothing has been poked before the filtered syscall runs. */
	EXPECT_EQ(0, self->poked);
	/* Invalid descriptor: the read itself is expected to fail. */
	ret = read(-1, NULL, 0);
	EXPECT_EQ(-1, ret);
	/* The tracer's write must be visible once read() has returned. */
	EXPECT_EQ(0x1001, self->poked);
}
1653 
/*
 * A syscall the fixture's filter does not single out (getpid) must run
 * normally and must not cause the tracer to touch self->poked.
 */
TEST_F(TRACE_poke, getpid_runs_normally)
{
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(0, self->poked);
	EXPECT_NE(0, syscall(__NR_getpid));
	/* Still untouched: the filter only traces read. */
	EXPECT_EQ(0, self->poked);
}
1668 
/*
 * Per-architecture register access used by get_syscall() and
 * change_syscall():
 *
 *   ARCH_REGS   - type of the register block fetched from the tracee
 *   SYSCALL_NUM - member of ARCH_REGS read or written as the syscall number
 *   SYSCALL_RET - member of ARCH_REGS written as the syscall return value
 *
 * SYSCALL_NUM_RET_SHARE_REG is defined where SYSCALL_NUM and SYSCALL_RET
 * name the same member (s390 and mips below).
 */
#if defined(__x86_64__)
# define ARCH_REGS	struct user_regs_struct
# define SYSCALL_NUM	orig_rax
# define SYSCALL_RET	rax
#elif defined(__i386__)
# define ARCH_REGS	struct user_regs_struct
# define SYSCALL_NUM	orig_eax
# define SYSCALL_RET	eax
#elif defined(__arm__)
# define ARCH_REGS	struct pt_regs
# define SYSCALL_NUM	ARM_r7
# define SYSCALL_RET	ARM_r0
#elif defined(__aarch64__)
# define ARCH_REGS	struct user_pt_regs
# define SYSCALL_NUM	regs[8]
# define SYSCALL_RET	regs[0]
#elif defined(__riscv) && __riscv_xlen == 64
# define ARCH_REGS	struct user_regs_struct
# define SYSCALL_NUM	a7
# define SYSCALL_RET	a0
#elif defined(__csky__)
# define ARCH_REGS	struct pt_regs
#if defined(__CSKYABIV2__)
# define SYSCALL_NUM	regs[3]
#else
# define SYSCALL_NUM	regs[9]
#endif
# define SYSCALL_RET	a0
#elif defined(__hppa__)
# define ARCH_REGS	struct user_regs_struct
# define SYSCALL_NUM	gr[20]
# define SYSCALL_RET	gr[28]
#elif defined(__powerpc__)
# define ARCH_REGS	struct pt_regs
# define SYSCALL_NUM	gpr[0]
# define SYSCALL_RET	gpr[3]
#elif defined(__s390__)
# define ARCH_REGS     s390_regs
# define SYSCALL_NUM   gprs[2]
# define SYSCALL_RET   gprs[2]
# define SYSCALL_NUM_RET_SHARE_REG
#elif defined(__mips__)
# define ARCH_REGS	struct pt_regs
# define SYSCALL_NUM	regs[2]
/* Consulted instead of SYSCALL_NUM when that holds __NR_O32_Linux. */
# define SYSCALL_SYSCALL_NUM regs[4]
# define SYSCALL_RET	regs[2]
# define SYSCALL_NUM_RET_SHARE_REG
#elif defined(__xtensa__)
# define ARCH_REGS	struct user_pt_regs
# define SYSCALL_NUM	syscall
/*
 * On xtensa syscall return value is in the register
 * a2 of the current window which is not fixed.
 */
#define SYSCALL_RET(reg) a[(reg).windowbase * 4 + 2]
#elif defined(__sh__)
# define ARCH_REGS	struct pt_regs
# define SYSCALL_NUM	gpr[3]
# define SYSCALL_RET	gpr[0]
#else
# error "Do not know how to find your architecture's registers and syscalls"
#endif
1731 
/*
 * EXPECT_SYSCALL_RETURN(val, action): run @action and check its outcome.
 *
 * When the syscall return can't be changed (SYSCALL_NUM_RET_SHARE_REG), the
 * check is stubbed out to "action returns -1". Otherwise a negative @val
 * means "action returns -1 with errno == -val", and any other @val means
 * "action returns val". errno is cleared before @action runs.
 */
#ifdef SYSCALL_NUM_RET_SHARE_REG
# define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
#else
# define EXPECT_SYSCALL_RETURN(val, action)		\
	do {						\
		errno = 0;				\
		/* Parenthesized so any expression works as val. */ \
		if ((val) < 0) {			\
			EXPECT_EQ(-1, action);		\
			EXPECT_EQ(-(val), errno);	\
		} else {				\
			EXPECT_EQ(val, action);		\
		}					\
	} while (0)
#endif
1747 
/*
 * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
 * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
 *
 * get_syscall() and change_syscall() test HAVE_GETREGS to choose between
 * these requests and PTRACE_GETREGSET/PTRACE_SETREGSET with NT_PRSTATUS.
 */
#if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
#define HAVE_GETREGS
#endif
1754 
/*
 * Architecture-specific syscall fetching routine.
 *
 * Reads the tracee's registers (PTRACE_GETREGS when HAVE_GETREGS is
 * defined, otherwise PTRACE_GETREGSET with NT_PRSTATUS) and returns the
 * SYSCALL_NUM member. If the registers cannot be read, the failure is
 * recorded through EXPECT_EQ and -1 is returned.
 */
int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
{
	ARCH_REGS regs;
#ifdef HAVE_GETREGS
	EXPECT_EQ(0, ptrace(PTRACE_GETREGS, tracee, 0, &regs)) {
		TH_LOG("PTRACE_GETREGS failed");
		return -1;
	}
#else
	struct iovec iov;

	iov.iov_base = &regs;
	iov.iov_len = sizeof(regs);
	EXPECT_EQ(0, ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov)) {
		TH_LOG("PTRACE_GETREGSET failed");
		return -1;
	}
#endif

#if defined(__mips__)
	/* When SYSCALL_NUM holds __NR_O32_Linux, report SYSCALL_SYSCALL_NUM. */
	if (regs.SYSCALL_NUM == __NR_O32_Linux)
		return regs.SYSCALL_SYSCALL_NUM;
#endif
	return regs.SYSCALL_NUM;
}
1781 
1782 /* Architecture-specific syscall changing routine. */
1783 void change_syscall(struct __test_metadata *_metadata,
1784 		    pid_t tracee, int syscall, int result)
1785 {
1786 	int ret;
1787 	ARCH_REGS regs;
1788 #ifdef HAVE_GETREGS
1789 	ret = ptrace(PTRACE_GETREGS, tracee, 0, &regs);
1790 #else
1791 	struct iovec iov;
1792 	iov.iov_base = &regs;
1793 	iov.iov_len = sizeof(regs);
1794 	ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov);
1795 #endif
1796 	EXPECT_EQ(0, ret) {}
1797 
1798 #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
1799 	defined(__s390__) || defined(__hppa__) || defined(__riscv) || \
1800 	defined(__xtensa__) || defined(__csky__) || defined(__sh__)
1801 	{
1802 		regs.SYSCALL_NUM = syscall;
1803 	}
1804 #elif defined(__mips__)
1805 	{
1806 		if (regs.SYSCALL_NUM == __NR_O32_Linux)
1807 			regs.SYSCALL_SYSCALL_NUM = syscall;
1808 		else
1809 			regs.SYSCALL_NUM = syscall;
1810 	}
1811 
1812 #elif defined(__arm__)
1813 # ifndef PTRACE_SET_SYSCALL
1814 #  define PTRACE_SET_SYSCALL   23
1815 # endif
1816 	{
1817 		ret = ptrace(PTRACE_SET_SYSCALL, tracee, NULL, syscall);
1818 		EXPECT_EQ(0, ret);
1819 	}
1820 
1821 #elif defined(__aarch64__)
1822 # ifndef NT_ARM_SYSTEM_CALL
1823 #  define NT_ARM_SYSTEM_CALL 0x404
1824 # endif
1825 	{
1826 		iov.iov_base = &syscall;
1827 		iov.iov_len = sizeof(syscall);
1828 		ret = ptrace(PTRACE_SETREGSET, tracee, NT_ARM_SYSTEM_CALL,
1829 			     &iov);
1830 		EXPECT_EQ(0, ret);
1831 	}
1832 
1833 #else
1834 	ASSERT_EQ(1, 0) {
1835 		TH_LOG("How is the syscall changed on this architecture?");
1836 	}
1837 #endif
1838 
1839 	/* If syscall is skipped, change return value. */
1840 	if (syscall == -1)
1841 #ifdef SYSCALL_NUM_RET_SHARE_REG
1842 		TH_LOG("Can't modify syscall return on this architecture");
1843 
1844 #elif defined(__xtensa__)
1845 		regs.SYSCALL_RET(regs) = result;
1846 #else
1847 		regs.SYSCALL_RET = result;
1848 #endif
1849 
1850 #ifdef HAVE_GETREGS
1851 	ret = ptrace(PTRACE_SETREGS, tracee, 0, &regs);
1852 #else
1853 	iov.iov_base = &regs;
1854 	iov.iov_len = sizeof(regs);
1855 	ret = ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &iov);
1856 #endif
1857 	EXPECT_EQ(0, ret);
1858 }
1859 
1860 void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
1861 		    int status, void *args)
1862 {
1863 	int ret;
1864 	unsigned long msg;
1865 
1866 	/* Make sure we got the right message. */
1867 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1868 	EXPECT_EQ(0, ret);
1869 
1870 	/* Validate and take action on expected syscalls. */
1871 	switch (msg) {
1872 	case 0x1002:
1873 		/* change getpid to getppid. */
1874 		EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
1875 		change_syscall(_metadata, tracee, __NR_getppid, 0);
1876 		break;
1877 	case 0x1003:
1878 		/* skip gettid with valid return code. */
1879 		EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
1880 		change_syscall(_metadata, tracee, -1, 45000);
1881 		break;
1882 	case 0x1004:
1883 		/* skip openat with error. */
1884 		EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
1885 		change_syscall(_metadata, tracee, -1, -ESRCH);
1886 		break;
1887 	case 0x1005:
1888 		/* do nothing (allow getppid) */
1889 		EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
1890 		break;
1891 	default:
1892 		EXPECT_EQ(0, msg) {
1893 			TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
1894 			kill(tracee, SIGKILL);
1895 		}
1896 	}
1897 
1898 }
1899 
1900 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
1901 		   int status, void *args)
1902 {
1903 	int ret, nr;
1904 	unsigned long msg;
1905 	static bool entry;
1906 
1907 	/*
1908 	 * The traditional way to tell PTRACE_SYSCALL entry/exit
1909 	 * is by counting.
1910 	 */
1911 	entry = !entry;
1912 
1913 	/* Make sure we got an appropriate message. */
1914 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1915 	EXPECT_EQ(0, ret);
1916 	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
1917 			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
1918 
1919 	if (!entry)
1920 		return;
1921 
1922 	nr = get_syscall(_metadata, tracee);
1923 
1924 	if (nr == __NR_getpid)
1925 		change_syscall(_metadata, tracee, __NR_getppid, 0);
1926 	if (nr == __NR_gettid)
1927 		change_syscall(_metadata, tracee, -1, 45000);
1928 	if (nr == __NR_openat)
1929 		change_syscall(_metadata, tracee, -1, -ESRCH);
1930 }
1931 
/* State for the TRACE_syscall tests. */
FIXTURE(TRACE_syscall) {
	/* NOTE(review): not referenced by the setup or tests in view. */
	struct sock_fprog prog;
	/* Tracer pid, plus this process's tid, pid and parent pid. */
	pid_t tracer, mytid, mypid, parent;
};
1936 
FIXTURE_VARIANT(TRACE_syscall) {
	/*
	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
	 * ptrace (true).
	 */
	bool use_ptrace;
};

/* Variant "ptrace": the tracer runs tracer_ptrace(); no filter is installed. */
FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
	.use_ptrace = true,
};

/* Variant "seccomp": the tracer runs tracer_seccomp() behind a filter. */
FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
	.use_ptrace = false,
};
1954 
1955 FIXTURE_SETUP(TRACE_syscall)
1956 {
1957 	struct sock_filter filter[] = {
1958 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1959 			offsetof(struct seccomp_data, nr)),
1960 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1961 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
1962 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
1963 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
1964 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
1965 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
1966 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
1967 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
1968 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1969 	};
1970 	struct sock_fprog prog = {
1971 		.len = (unsigned short)ARRAY_SIZE(filter),
1972 		.filter = filter,
1973 	};
1974 	long ret;
1975 
1976 	/* Prepare some testable syscall results. */
1977 	self->mytid = syscall(__NR_gettid);
1978 	ASSERT_GT(self->mytid, 0);
1979 	ASSERT_NE(self->mytid, 1) {
1980 		TH_LOG("Running this test as init is not supported. :)");
1981 	}
1982 
1983 	self->mypid = getpid();
1984 	ASSERT_GT(self->mypid, 0);
1985 	ASSERT_EQ(self->mytid, self->mypid);
1986 
1987 	self->parent = getppid();
1988 	ASSERT_GT(self->parent, 0);
1989 	ASSERT_NE(self->parent, self->mypid);
1990 
1991 	/* Launch tracer. */
1992 	self->tracer = setup_trace_fixture(_metadata,
1993 					   variant->use_ptrace ? tracer_ptrace
1994 							       : tracer_seccomp,
1995 					   NULL, variant->use_ptrace);
1996 
1997 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1998 	ASSERT_EQ(0, ret);
1999 
2000 	if (variant->use_ptrace)
2001 		return;
2002 
2003 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2004 	ASSERT_EQ(0, ret);
2005 }
2006 
/* Stop the tracer forked by the setup and adopt its result. */
FIXTURE_TEARDOWN(TRACE_syscall)
{
	teardown_trace_fixture(_metadata, self->tracer);
}
2011 
/*
 * Negative syscall numbers must fail with -1 and ENOSYS. Also invoked from
 * TEST_F(TRACE_syscall, negative_ENOSYS) to repeat the check under a tracer.
 */
TEST(negative_ENOSYS)
{
	/*
	 * There should be no difference between an "internal" skip
	 * and userspace asking for syscall "-1".
	 */
	errno = 0;
	EXPECT_EQ(-1, syscall(-1));
	/* Expected value first, matching the other errno checks in this file. */
	EXPECT_EQ(ENOSYS, errno);
	/* And no difference for "still not valid but not -1". */
	errno = 0;
	EXPECT_EQ(-1, syscall(-101));
	EXPECT_EQ(ENOSYS, errno);
}
2026 
/* Re-run the plain negative_ENOSYS checks with the fixture's tracer active. */
TEST_F(TRACE_syscall, negative_ENOSYS)
{
	negative_ENOSYS(_metadata);
}
2031 
/* getppid is left alone by the tracer and must return the real parent pid. */
TEST_F(TRACE_syscall, syscall_allowed)
{
	/* getppid works as expected (no changes). */
	EXPECT_EQ(self->parent, syscall(__NR_getppid));
	EXPECT_NE(self->mypid, syscall(__NR_getppid));
}
2038 
/* The tracer rewrites getpid into getppid, so getpid returns the parent pid. */
TEST_F(TRACE_syscall, syscall_redirected)
{
	/* getpid has been redirected to getppid as expected. */
	EXPECT_EQ(self->parent, syscall(__NR_getpid));
	EXPECT_NE(self->mypid, syscall(__NR_getpid));
}
2045 
/* The tracer skips openat and substitutes -ESRCH as its result. */
TEST_F(TRACE_syscall, syscall_errno)
{
	/* Tracer should skip the openat syscall, resulting in ESRCH. */
	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
}
2051 
/* The tracer skips gettid and substitutes 45000 as its result. */
TEST_F(TRACE_syscall, syscall_faked)
{
	/* Tracer skips the gettid syscall and stores an altered return value. */
	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
}
2057 
2058 TEST_F(TRACE_syscall, skip_after)
2059 {
2060 	struct sock_filter filter[] = {
2061 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2062 			offsetof(struct seccomp_data, nr)),
2063 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2064 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
2065 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2066 	};
2067 	struct sock_fprog prog = {
2068 		.len = (unsigned short)ARRAY_SIZE(filter),
2069 		.filter = filter,
2070 	};
2071 	long ret;
2072 
2073 	/* Install additional "errno on getppid" filter. */
2074 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2075 	ASSERT_EQ(0, ret);
2076 
2077 	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
2078 	errno = 0;
2079 	EXPECT_EQ(-1, syscall(__NR_getpid));
2080 	EXPECT_EQ(EPERM, errno);
2081 }
2082 
2083 TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
2084 {
2085 	struct sock_filter filter[] = {
2086 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2087 			offsetof(struct seccomp_data, nr)),
2088 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2089 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2090 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2091 	};
2092 	struct sock_fprog prog = {
2093 		.len = (unsigned short)ARRAY_SIZE(filter),
2094 		.filter = filter,
2095 	};
2096 	long ret;
2097 
2098 	/* Install additional "death on getppid" filter. */
2099 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2100 	ASSERT_EQ(0, ret);
2101 
2102 	/* Tracer will redirect getpid to getppid, and we should die. */
2103 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2104 }
2105 
/*
 * Argument validation of the seccomp() entry point: invalid operations,
 * strict mode with flags or a pointer, and filter mode with bad flags or a
 * NULL program must be rejected; a valid allow-all filter must be accepted.
 *
 * NOTE(review): every check below inspects errno rather than ret, and the
 * final check expects errno == 0 after a successful call. That only holds
 * if the seccomp() helper clears errno before each call - confirm against
 * its definition, which is outside this section.
 */
TEST(seccomp_syscall)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Reject insane operation. */
	ret = seccomp(-1, 0, &prog);
	/* ENOSYS here means the syscall is missing: no point continuing. */
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Did not reject crazy op value!");
	}

	/* Reject strict with flags or pointer. */
	ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Did not reject mode strict with flags!");
	}
	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Did not reject mode strict with uargs!");
	}

	/* Reject insane args for filter. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Did not reject crazy filter flags!");
	}
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
	EXPECT_EQ(EFAULT, errno) {
		TH_LOG("Did not reject NULL filter!");
	}

	/* Finally, a well-formed request must succeed. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
	EXPECT_EQ(0, errno) {
		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
			strerror(errno));
	}
}
2157 
2158 TEST(seccomp_syscall_mode_lock)
2159 {
2160 	struct sock_filter filter[] = {
2161 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2162 	};
2163 	struct sock_fprog prog = {
2164 		.len = (unsigned short)ARRAY_SIZE(filter),
2165 		.filter = filter,
2166 	};
2167 	long ret;
2168 
2169 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2170 	ASSERT_EQ(0, ret) {
2171 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2172 	}
2173 
2174 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2175 	ASSERT_NE(ENOSYS, errno) {
2176 		TH_LOG("Kernel does not support seccomp syscall!");
2177 	}
2178 	EXPECT_EQ(0, ret) {
2179 		TH_LOG("Could not install filter!");
2180 	}
2181 
2182 	/* Make sure neither entry point will switch to strict. */
2183 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2184 	EXPECT_EQ(EINVAL, errno) {
2185 		TH_LOG("Switched to mode strict!");
2186 	}
2187 
2188 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2189 	EXPECT_EQ(EINVAL, errno) {
2190 		TH_LOG("Switched to mode strict!");
2191 	}
2192 }
2193 
2194 /*
2195  * Test detection of known and unknown filter flags. Userspace needs to be able
2196  * to check if a filter flag is supported by the current kernel and a good way
2197  * of doing that is by attempting to enter filter mode, with the flag bit in
2198  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2199  * that the flag is valid and EINVAL indicates that the flag is invalid.
2200  */
2201 TEST(detect_seccomp_filter_flags)
2202 {
2203 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2204 				 SECCOMP_FILTER_FLAG_LOG,
2205 				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2206 				 SECCOMP_FILTER_FLAG_NEW_LISTENER,
2207 				 SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2208 	unsigned int exclusive[] = {
2209 				SECCOMP_FILTER_FLAG_TSYNC,
2210 				SECCOMP_FILTER_FLAG_NEW_LISTENER };
2211 	unsigned int flag, all_flags, exclusive_mask;
2212 	int i;
2213 	long ret;
2214 
2215 	/* Test detection of individual known-good filter flags */
2216 	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2217 		int bits = 0;
2218 
2219 		flag = flags[i];
2220 		/* Make sure the flag is a single bit! */
2221 		while (flag) {
2222 			if (flag & 0x1)
2223 				bits ++;
2224 			flag >>= 1;
2225 		}
2226 		ASSERT_EQ(1, bits);
2227 		flag = flags[i];
2228 
2229 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2230 		ASSERT_NE(ENOSYS, errno) {
2231 			TH_LOG("Kernel does not support seccomp syscall!");
2232 		}
2233 		EXPECT_EQ(-1, ret);
2234 		EXPECT_EQ(EFAULT, errno) {
2235 			TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2236 			       flag);
2237 		}
2238 
2239 		all_flags |= flag;
2240 	}
2241 
2242 	/*
2243 	 * Test detection of all known-good filter flags combined. But
2244 	 * for the exclusive flags we need to mask them out and try them
2245 	 * individually for the "all flags" testing.
2246 	 */
2247 	exclusive_mask = 0;
2248 	for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2249 		exclusive_mask |= exclusive[i];
2250 	for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2251 		flag = all_flags & ~exclusive_mask;
2252 		flag |= exclusive[i];
2253 
2254 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2255 		EXPECT_EQ(-1, ret);
2256 		EXPECT_EQ(EFAULT, errno) {
2257 			TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2258 			       flag);
2259 		}
2260 	}
2261 
2262 	/* Test detection of an unknown filter flags, without exclusives. */
2263 	flag = -1;
2264 	flag &= ~exclusive_mask;
2265 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2266 	EXPECT_EQ(-1, ret);
2267 	EXPECT_EQ(EINVAL, errno) {
2268 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2269 		       flag);
2270 	}
2271 
2272 	/*
2273 	 * Test detection of an unknown filter flag that may simply need to be
2274 	 * added to this test
2275 	 */
2276 	flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2277 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2278 	EXPECT_EQ(-1, ret);
2279 	EXPECT_EQ(EINVAL, errno) {
2280 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2281 		       flag);
2282 	}
2283 }
2284 
2285 TEST(TSYNC_first)
2286 {
2287 	struct sock_filter filter[] = {
2288 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2289 	};
2290 	struct sock_fprog prog = {
2291 		.len = (unsigned short)ARRAY_SIZE(filter),
2292 		.filter = filter,
2293 	};
2294 	long ret;
2295 
2296 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2297 	ASSERT_EQ(0, ret) {
2298 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2299 	}
2300 
2301 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2302 		      &prog);
2303 	ASSERT_NE(ENOSYS, errno) {
2304 		TH_LOG("Kernel does not support seccomp syscall!");
2305 	}
2306 	EXPECT_EQ(0, ret) {
2307 		TH_LOG("Could not install initial filter with TSYNC!");
2308 	}
2309 }
2310 
/* Number of sibling threads each TSYNC test starts. */
#define TSYNC_SIBLINGS 2

/*
 * Bookkeeping shared between a TSYNC test body and one tsync_sibling()
 * thread.
 */
struct tsync_sibling {
	/* pthread handle; PTHREAD_JOIN() zeroes it after a successful join. */
	pthread_t tid;
	/* Kernel thread id, stored by the thread itself via gettid. */
	pid_t system_tid;
	/* Posted once by the thread after its optional prctl() call. */
	sem_t *started;
	/* The thread waits on cond, under mutex, until it is broadcast. */
	pthread_cond_t *cond;
	pthread_mutex_t *mutex;
	/*
	 * diverge:   non-zero makes the thread install prog on itself.
	 * num_waits: how many condition wakeups the thread consumes before
	 *            it moves on.
	 */
	int diverge, num_waits;
	/* Filter program the thread installs when diverge is set. */
	struct sock_fprog *prog;
	/* Harness metadata of the owning test. */
	struct __test_metadata *metadata;
};
2323 
/*
 * Join @tid and, on success, clear it so that a later join attempt during
 * fixture teardown is skipped; joining already-joined threads is not allowed
 * by Bionic. When the join fails, @tid is left untouched and the error is
 * logged; any thread whose tid is still set is directly killed during
 * teardown.
 *
 * @tid:    modifiable lvalue of type pthread_t.
 * @status: passed through unchanged to pthread_join().
 *
 * The arguments are parenthesized so that arbitrary expressions expand
 * safely, the tid is printed at full width instead of being truncated to
 * unsigned int, and the message carries no trailing newline, like the other
 * TH_LOG() calls here.
 */
#define PTHREAD_JOIN(tid, status)					\
	do {								\
		int _rc = pthread_join((tid), (status));		\
		if (_rc) {						\
			TH_LOG("pthread_join of tid %lu failed: %d",	\
				(unsigned long)(tid), _rc);		\
		} else {						\
			(tid) = 0;					\
		}							\
	} while (0)
2340 
2341 FIXTURE(TSYNC) {
2342 	struct sock_fprog root_prog, apply_prog;
2343 	struct tsync_sibling sibling[TSYNC_SIBLINGS];
2344 	sem_t started;
2345 	pthread_cond_t cond;
2346 	pthread_mutex_t mutex;
2347 	int sibling_count;
2348 };
2349 
2350 FIXTURE_SETUP(TSYNC)
2351 {
2352 	struct sock_filter root_filter[] = {
2353 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2354 	};
2355 	struct sock_filter apply_filter[] = {
2356 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2357 			offsetof(struct seccomp_data, nr)),
2358 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2359 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2360 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2361 	};
2362 
2363 	memset(&self->root_prog, 0, sizeof(self->root_prog));
2364 	memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2365 	memset(&self->sibling, 0, sizeof(self->sibling));
2366 	self->root_prog.filter = malloc(sizeof(root_filter));
2367 	ASSERT_NE(NULL, self->root_prog.filter);
2368 	memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2369 	self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2370 
2371 	self->apply_prog.filter = malloc(sizeof(apply_filter));
2372 	ASSERT_NE(NULL, self->apply_prog.filter);
2373 	memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2374 	self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2375 
2376 	self->sibling_count = 0;
2377 	pthread_mutex_init(&self->mutex, NULL);
2378 	pthread_cond_init(&self->cond, NULL);
2379 	sem_init(&self->started, 0, 0);
2380 	self->sibling[0].tid = 0;
2381 	self->sibling[0].cond = &self->cond;
2382 	self->sibling[0].started = &self->started;
2383 	self->sibling[0].mutex = &self->mutex;
2384 	self->sibling[0].diverge = 0;
2385 	self->sibling[0].num_waits = 1;
2386 	self->sibling[0].prog = &self->root_prog;
2387 	self->sibling[0].metadata = _metadata;
2388 	self->sibling[1].tid = 0;
2389 	self->sibling[1].cond = &self->cond;
2390 	self->sibling[1].started = &self->started;
2391 	self->sibling[1].mutex = &self->mutex;
2392 	self->sibling[1].diverge = 0;
2393 	self->sibling[1].prog = &self->root_prog;
2394 	self->sibling[1].num_waits = 1;
2395 	self->sibling[1].metadata = _metadata;
2396 }
2397 
2398 FIXTURE_TEARDOWN(TSYNC)
2399 {
2400 	int sib = 0;
2401 
2402 	if (self->root_prog.filter)
2403 		free(self->root_prog.filter);
2404 	if (self->apply_prog.filter)
2405 		free(self->apply_prog.filter);
2406 
2407 	for ( ; sib < self->sibling_count; ++sib) {
2408 		struct tsync_sibling *s = &self->sibling[sib];
2409 
2410 		if (!s->tid)
2411 			continue;
2412 		/*
2413 		 * If a thread is still running, it may be stuck, so hit
2414 		 * it over the head really hard.
2415 		 */
2416 		pthread_kill(s->tid, 9);
2417 	}
2418 	pthread_mutex_destroy(&self->mutex);
2419 	pthread_cond_destroy(&self->cond);
2420 	sem_destroy(&self->started);
2421 }
2422 
2423 void *tsync_sibling(void *data)
2424 {
2425 	long ret = 0;
2426 	struct tsync_sibling *me = data;
2427 
2428 	me->system_tid = syscall(__NR_gettid);
2429 
2430 	pthread_mutex_lock(me->mutex);
2431 	if (me->diverge) {
2432 		/* Just re-apply the root prog to fork the tree */
2433 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2434 				me->prog, 0, 0);
2435 	}
2436 	sem_post(me->started);
2437 	/* Return outside of started so parent notices failures. */
2438 	if (ret) {
2439 		pthread_mutex_unlock(me->mutex);
2440 		return (void *)SIBLING_EXIT_FAILURE;
2441 	}
2442 	do {
2443 		pthread_cond_wait(me->cond, me->mutex);
2444 		me->num_waits = me->num_waits - 1;
2445 	} while (me->num_waits);
2446 	pthread_mutex_unlock(me->mutex);
2447 
2448 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2449 	if (!ret)
2450 		return (void *)SIBLING_EXIT_NEWPRIVS;
2451 	read(0, NULL, 0);
2452 	return (void *)SIBLING_EXIT_UNKILLED;
2453 }
2454 
2455 void tsync_start_sibling(struct tsync_sibling *sibling)
2456 {
2457 	pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2458 }
2459 
2460 TEST_F(TSYNC, siblings_fail_prctl)
2461 {
2462 	long ret;
2463 	void *status;
2464 	struct sock_filter filter[] = {
2465 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2466 			offsetof(struct seccomp_data, nr)),
2467 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
2468 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
2469 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2470 	};
2471 	struct sock_fprog prog = {
2472 		.len = (unsigned short)ARRAY_SIZE(filter),
2473 		.filter = filter,
2474 	};
2475 
2476 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2477 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2478 	}
2479 
2480 	/* Check prctl failure detection by requesting sib 0 diverge. */
2481 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2482 	ASSERT_NE(ENOSYS, errno) {
2483 		TH_LOG("Kernel does not support seccomp syscall!");
2484 	}
2485 	ASSERT_EQ(0, ret) {
2486 		TH_LOG("setting filter failed");
2487 	}
2488 
2489 	self->sibling[0].diverge = 1;
2490 	tsync_start_sibling(&self->sibling[0]);
2491 	tsync_start_sibling(&self->sibling[1]);
2492 
2493 	while (self->sibling_count < TSYNC_SIBLINGS) {
2494 		sem_wait(&self->started);
2495 		self->sibling_count++;
2496 	}
2497 
2498 	/* Signal the threads to clean up*/
2499 	pthread_mutex_lock(&self->mutex);
2500 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2501 		TH_LOG("cond broadcast non-zero");
2502 	}
2503 	pthread_mutex_unlock(&self->mutex);
2504 
2505 	/* Ensure diverging sibling failed to call prctl. */
2506 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2507 	EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
2508 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2509 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2510 }
2511 
2512 TEST_F(TSYNC, two_siblings_with_ancestor)
2513 {
2514 	long ret;
2515 	void *status;
2516 
2517 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2518 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2519 	}
2520 
2521 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2522 	ASSERT_NE(ENOSYS, errno) {
2523 		TH_LOG("Kernel does not support seccomp syscall!");
2524 	}
2525 	ASSERT_EQ(0, ret) {
2526 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2527 	}
2528 	tsync_start_sibling(&self->sibling[0]);
2529 	tsync_start_sibling(&self->sibling[1]);
2530 
2531 	while (self->sibling_count < TSYNC_SIBLINGS) {
2532 		sem_wait(&self->started);
2533 		self->sibling_count++;
2534 	}
2535 
2536 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2537 		      &self->apply_prog);
2538 	ASSERT_EQ(0, ret) {
2539 		TH_LOG("Could install filter on all threads!");
2540 	}
2541 	/* Tell the siblings to test the policy */
2542 	pthread_mutex_lock(&self->mutex);
2543 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2544 		TH_LOG("cond broadcast non-zero");
2545 	}
2546 	pthread_mutex_unlock(&self->mutex);
2547 	/* Ensure they are both killed and don't exit cleanly. */
2548 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2549 	EXPECT_EQ(0x0, (long)status);
2550 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2551 	EXPECT_EQ(0x0, (long)status);
2552 }
2553 
2554 TEST_F(TSYNC, two_sibling_want_nnp)
2555 {
2556 	void *status;
2557 
2558 	/* start siblings before any prctl() operations */
2559 	tsync_start_sibling(&self->sibling[0]);
2560 	tsync_start_sibling(&self->sibling[1]);
2561 	while (self->sibling_count < TSYNC_SIBLINGS) {
2562 		sem_wait(&self->started);
2563 		self->sibling_count++;
2564 	}
2565 
2566 	/* Tell the siblings to test no policy */
2567 	pthread_mutex_lock(&self->mutex);
2568 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2569 		TH_LOG("cond broadcast non-zero");
2570 	}
2571 	pthread_mutex_unlock(&self->mutex);
2572 
2573 	/* Ensure they are both upset about lacking nnp. */
2574 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2575 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2576 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2577 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2578 }
2579 
2580 TEST_F(TSYNC, two_siblings_with_no_filter)
2581 {
2582 	long ret;
2583 	void *status;
2584 
2585 	/* start siblings before any prctl() operations */
2586 	tsync_start_sibling(&self->sibling[0]);
2587 	tsync_start_sibling(&self->sibling[1]);
2588 	while (self->sibling_count < TSYNC_SIBLINGS) {
2589 		sem_wait(&self->started);
2590 		self->sibling_count++;
2591 	}
2592 
2593 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2594 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2595 	}
2596 
2597 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2598 		      &self->apply_prog);
2599 	ASSERT_NE(ENOSYS, errno) {
2600 		TH_LOG("Kernel does not support seccomp syscall!");
2601 	}
2602 	ASSERT_EQ(0, ret) {
2603 		TH_LOG("Could install filter on all threads!");
2604 	}
2605 
2606 	/* Tell the siblings to test the policy */
2607 	pthread_mutex_lock(&self->mutex);
2608 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2609 		TH_LOG("cond broadcast non-zero");
2610 	}
2611 	pthread_mutex_unlock(&self->mutex);
2612 
2613 	/* Ensure they are both killed and don't exit cleanly. */
2614 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2615 	EXPECT_EQ(0x0, (long)status);
2616 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2617 	EXPECT_EQ(0x0, (long)status);
2618 }
2619 
2620 TEST_F(TSYNC, two_siblings_with_one_divergence)
2621 {
2622 	long ret;
2623 	void *status;
2624 
2625 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2626 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2627 	}
2628 
2629 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2630 	ASSERT_NE(ENOSYS, errno) {
2631 		TH_LOG("Kernel does not support seccomp syscall!");
2632 	}
2633 	ASSERT_EQ(0, ret) {
2634 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2635 	}
2636 	self->sibling[0].diverge = 1;
2637 	tsync_start_sibling(&self->sibling[0]);
2638 	tsync_start_sibling(&self->sibling[1]);
2639 
2640 	while (self->sibling_count < TSYNC_SIBLINGS) {
2641 		sem_wait(&self->started);
2642 		self->sibling_count++;
2643 	}
2644 
2645 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2646 		      &self->apply_prog);
2647 	ASSERT_EQ(self->sibling[0].system_tid, ret) {
2648 		TH_LOG("Did not fail on diverged sibling.");
2649 	}
2650 
2651 	/* Wake the threads */
2652 	pthread_mutex_lock(&self->mutex);
2653 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2654 		TH_LOG("cond broadcast non-zero");
2655 	}
2656 	pthread_mutex_unlock(&self->mutex);
2657 
2658 	/* Ensure they are both unkilled. */
2659 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2660 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2661 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2662 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2663 }
2664 
2665 TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2666 {
2667 	long ret, flags;
2668 	void *status;
2669 
2670 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2671 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2672 	}
2673 
2674 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2675 	ASSERT_NE(ENOSYS, errno) {
2676 		TH_LOG("Kernel does not support seccomp syscall!");
2677 	}
2678 	ASSERT_EQ(0, ret) {
2679 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2680 	}
2681 	self->sibling[0].diverge = 1;
2682 	tsync_start_sibling(&self->sibling[0]);
2683 	tsync_start_sibling(&self->sibling[1]);
2684 
2685 	while (self->sibling_count < TSYNC_SIBLINGS) {
2686 		sem_wait(&self->started);
2687 		self->sibling_count++;
2688 	}
2689 
2690 	flags = SECCOMP_FILTER_FLAG_TSYNC | \
2691 		SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2692 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2693 	ASSERT_EQ(ESRCH, errno) {
2694 		TH_LOG("Did not return ESRCH for diverged sibling.");
2695 	}
2696 	ASSERT_EQ(-1, ret) {
2697 		TH_LOG("Did not fail on diverged sibling.");
2698 	}
2699 
2700 	/* Wake the threads */
2701 	pthread_mutex_lock(&self->mutex);
2702 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2703 		TH_LOG("cond broadcast non-zero");
2704 	}
2705 	pthread_mutex_unlock(&self->mutex);
2706 
2707 	/* Ensure they are both unkilled. */
2708 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2709 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2710 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2711 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2712 }
2713 
2714 TEST_F(TSYNC, two_siblings_not_under_filter)
2715 {
2716 	long ret, sib;
2717 	void *status;
2718 	struct timespec delay = { .tv_nsec = 100000000 };
2719 
2720 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2721 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2722 	}
2723 
2724 	/*
2725 	 * Sibling 0 will have its own seccomp policy
2726 	 * and Sibling 1 will not be under seccomp at
2727 	 * all. Sibling 1 will enter seccomp and 0
2728 	 * will cause failure.
2729 	 */
2730 	self->sibling[0].diverge = 1;
2731 	tsync_start_sibling(&self->sibling[0]);
2732 	tsync_start_sibling(&self->sibling[1]);
2733 
2734 	while (self->sibling_count < TSYNC_SIBLINGS) {
2735 		sem_wait(&self->started);
2736 		self->sibling_count++;
2737 	}
2738 
2739 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2740 	ASSERT_NE(ENOSYS, errno) {
2741 		TH_LOG("Kernel does not support seccomp syscall!");
2742 	}
2743 	ASSERT_EQ(0, ret) {
2744 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2745 	}
2746 
2747 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2748 		      &self->apply_prog);
2749 	ASSERT_EQ(ret, self->sibling[0].system_tid) {
2750 		TH_LOG("Did not fail on diverged sibling.");
2751 	}
2752 	sib = 1;
2753 	if (ret == self->sibling[0].system_tid)
2754 		sib = 0;
2755 
2756 	pthread_mutex_lock(&self->mutex);
2757 
2758 	/* Increment the other siblings num_waits so we can clean up
2759 	 * the one we just saw.
2760 	 */
2761 	self->sibling[!sib].num_waits += 1;
2762 
2763 	/* Signal the thread to clean up*/
2764 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2765 		TH_LOG("cond broadcast non-zero");
2766 	}
2767 	pthread_mutex_unlock(&self->mutex);
2768 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
2769 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2770 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2771 	while (!kill(self->sibling[sib].system_tid, 0))
2772 		nanosleep(&delay, NULL);
2773 	/* Switch to the remaining sibling */
2774 	sib = !sib;
2775 
2776 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2777 		      &self->apply_prog);
2778 	ASSERT_EQ(0, ret) {
2779 		TH_LOG("Expected the remaining sibling to sync");
2780 	};
2781 
2782 	pthread_mutex_lock(&self->mutex);
2783 
2784 	/* If remaining sibling didn't have a chance to wake up during
2785 	 * the first broadcast, manually reduce the num_waits now.
2786 	 */
2787 	if (self->sibling[sib].num_waits > 1)
2788 		self->sibling[sib].num_waits = 1;
2789 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2790 		TH_LOG("cond broadcast non-zero");
2791 	}
2792 	pthread_mutex_unlock(&self->mutex);
2793 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
2794 	EXPECT_EQ(0, (long)status);
2795 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2796 	while (!kill(self->sibling[sib].system_tid, 0))
2797 		nanosleep(&delay, NULL);
2798 
2799 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2800 		      &self->apply_prog);
2801 	ASSERT_EQ(0, ret);  /* just us chickens */
2802 }
2803 
2804 /* Make sure restarted syscalls are seen directly as "restart_syscall". */
2805 TEST(syscall_restart)
2806 {
2807 	long ret;
2808 	unsigned long msg;
2809 	pid_t child_pid;
2810 	int pipefd[2];
2811 	int status;
2812 	siginfo_t info = { };
2813 	struct sock_filter filter[] = {
2814 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2815 			 offsetof(struct seccomp_data, nr)),
2816 
2817 #ifdef __NR_sigreturn
2818 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
2819 #endif
2820 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
2821 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
2822 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
2823 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
2824 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
2825 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
2826 
2827 		/* Allow __NR_write for easy logging. */
2828 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
2829 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2830 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2831 		/* The nanosleep jump target. */
2832 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
2833 		/* The restart_syscall jump target. */
2834 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
2835 	};
2836 	struct sock_fprog prog = {
2837 		.len = (unsigned short)ARRAY_SIZE(filter),
2838 		.filter = filter,
2839 	};
2840 #if defined(__arm__)
2841 	struct utsname utsbuf;
2842 #endif
2843 
2844 	ASSERT_EQ(0, pipe(pipefd));
2845 
2846 	child_pid = fork();
2847 	ASSERT_LE(0, child_pid);
2848 	if (child_pid == 0) {
2849 		/* Child uses EXPECT not ASSERT to deliver status correctly. */
2850 		char buf = ' ';
2851 		struct timespec timeout = { };
2852 
2853 		/* Attach parent as tracer and stop. */
2854 		EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
2855 		EXPECT_EQ(0, raise(SIGSTOP));
2856 
2857 		EXPECT_EQ(0, close(pipefd[1]));
2858 
2859 		EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2860 			TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2861 		}
2862 
2863 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2864 		EXPECT_EQ(0, ret) {
2865 			TH_LOG("Failed to install filter!");
2866 		}
2867 
2868 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
2869 			TH_LOG("Failed to read() sync from parent");
2870 		}
2871 		EXPECT_EQ('.', buf) {
2872 			TH_LOG("Failed to get sync data from read()");
2873 		}
2874 
2875 		/* Start nanosleep to be interrupted. */
2876 		timeout.tv_sec = 1;
2877 		errno = 0;
2878 		EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
2879 			TH_LOG("Call to nanosleep() failed (errno %d)", errno);
2880 		}
2881 
2882 		/* Read final sync from parent. */
2883 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
2884 			TH_LOG("Failed final read() from parent");
2885 		}
2886 		EXPECT_EQ('!', buf) {
2887 			TH_LOG("Failed to get final data from read()");
2888 		}
2889 
2890 		/* Directly report the status of our test harness results. */
2891 		syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS
2892 						     : EXIT_FAILURE);
2893 	}
2894 	EXPECT_EQ(0, close(pipefd[0]));
2895 
2896 	/* Attach to child, setup options, and release. */
2897 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2898 	ASSERT_EQ(true, WIFSTOPPED(status));
2899 	ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
2900 			    PTRACE_O_TRACESECCOMP));
2901 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2902 	ASSERT_EQ(1, write(pipefd[1], ".", 1));
2903 
2904 	/* Wait for nanosleep() to start. */
2905 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2906 	ASSERT_EQ(true, WIFSTOPPED(status));
2907 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
2908 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
2909 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
2910 	ASSERT_EQ(0x100, msg);
2911 	ret = get_syscall(_metadata, child_pid);
2912 	EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
2913 
2914 	/* Might as well check siginfo for sanity while we're here. */
2915 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
2916 	ASSERT_EQ(SIGTRAP, info.si_signo);
2917 	ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
2918 	EXPECT_EQ(0, info.si_errno);
2919 	EXPECT_EQ(getuid(), info.si_uid);
2920 	/* Verify signal delivery came from child (seccomp-triggered). */
2921 	EXPECT_EQ(child_pid, info.si_pid);
2922 
2923 	/* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
2924 	ASSERT_EQ(0, kill(child_pid, SIGSTOP));
2925 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2926 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2927 	ASSERT_EQ(true, WIFSTOPPED(status));
2928 	ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
2929 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
2930 	/*
2931 	 * There is no siginfo on SIGSTOP any more, so we can't verify
2932 	 * signal delivery came from parent now (getpid() == info.si_pid).
2933 	 * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
2934 	 * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
2935 	 */
2936 	EXPECT_EQ(SIGSTOP, info.si_signo);
2937 
2938 	/* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
2939 	ASSERT_EQ(0, kill(child_pid, SIGCONT));
2940 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2941 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2942 	ASSERT_EQ(true, WIFSTOPPED(status));
2943 	ASSERT_EQ(SIGCONT, WSTOPSIG(status));
2944 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2945 
2946 	/* Wait for restart_syscall() to start. */
2947 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2948 	ASSERT_EQ(true, WIFSTOPPED(status));
2949 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
2950 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
2951 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
2952 
2953 	ASSERT_EQ(0x200, msg);
2954 	ret = get_syscall(_metadata, child_pid);
2955 #if defined(__arm__)
2956 	/*
2957 	 * FIXME:
2958 	 * - native ARM registers do NOT expose true syscall.
2959 	 * - compat ARM registers on ARM64 DO expose true syscall.
2960 	 */
2961 	ASSERT_EQ(0, uname(&utsbuf));
2962 	if (strncmp(utsbuf.machine, "arm", 3) == 0) {
2963 		EXPECT_EQ(__NR_nanosleep, ret);
2964 	} else
2965 #endif
2966 	{
2967 		EXPECT_EQ(__NR_restart_syscall, ret);
2968 	}
2969 
2970 	/* Write again to end test. */
2971 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2972 	ASSERT_EQ(1, write(pipefd[1], "!", 1));
2973 	EXPECT_EQ(0, close(pipefd[1]));
2974 
2975 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2976 	if (WIFSIGNALED(status) || WEXITSTATUS(status))
2977 		_metadata->passed = 0;
2978 }
2979 
2980 TEST_SIGNAL(filter_flag_log, SIGSYS)
2981 {
2982 	struct sock_filter allow_filter[] = {
2983 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2984 	};
2985 	struct sock_filter kill_filter[] = {
2986 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2987 			offsetof(struct seccomp_data, nr)),
2988 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
2989 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2990 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2991 	};
2992 	struct sock_fprog allow_prog = {
2993 		.len = (unsigned short)ARRAY_SIZE(allow_filter),
2994 		.filter = allow_filter,
2995 	};
2996 	struct sock_fprog kill_prog = {
2997 		.len = (unsigned short)ARRAY_SIZE(kill_filter),
2998 		.filter = kill_filter,
2999 	};
3000 	long ret;
3001 	pid_t parent = getppid();
3002 
3003 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3004 	ASSERT_EQ(0, ret);
3005 
3006 	/* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
3007 	ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
3008 		      &allow_prog);
3009 	ASSERT_NE(ENOSYS, errno) {
3010 		TH_LOG("Kernel does not support seccomp syscall!");
3011 	}
3012 	EXPECT_NE(0, ret) {
3013 		TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
3014 	}
3015 	EXPECT_EQ(EINVAL, errno) {
3016 		TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
3017 	}
3018 
3019 	/* Verify that a simple, permissive filter can be added with no flags */
3020 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
3021 	EXPECT_EQ(0, ret);
3022 
3023 	/* See if the same filter can be added with the FILTER_FLAG_LOG flag */
3024 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3025 		      &allow_prog);
3026 	ASSERT_NE(EINVAL, errno) {
3027 		TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
3028 	}
3029 	EXPECT_EQ(0, ret);
3030 
3031 	/* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
3032 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3033 		      &kill_prog);
3034 	EXPECT_EQ(0, ret);
3035 
3036 	EXPECT_EQ(parent, syscall(__NR_getppid));
3037 	/* getpid() should never return. */
3038 	EXPECT_EQ(0, syscall(__NR_getpid));
3039 }
3040 
3041 TEST(get_action_avail)
3042 {
3043 	__u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3044 			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3045 			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3046 	__u32 unknown_action = 0x10000000U;
3047 	int i;
3048 	long ret;
3049 
3050 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3051 	ASSERT_NE(ENOSYS, errno) {
3052 		TH_LOG("Kernel does not support seccomp syscall!");
3053 	}
3054 	ASSERT_NE(EINVAL, errno) {
3055 		TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3056 	}
3057 	EXPECT_EQ(ret, 0);
3058 
3059 	for (i = 0; i < ARRAY_SIZE(actions); i++) {
3060 		ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3061 		EXPECT_EQ(ret, 0) {
3062 			TH_LOG("Expected action (0x%X) not available!",
3063 			       actions[i]);
3064 		}
3065 	}
3066 
3067 	/* Check that an unknown action is handled properly (EOPNOTSUPP) */
3068 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3069 	EXPECT_EQ(ret, -1);
3070 	EXPECT_EQ(errno, EOPNOTSUPP);
3071 }
3072 
3073 TEST(get_metadata)
3074 {
3075 	pid_t pid;
3076 	int pipefd[2];
3077 	char buf;
3078 	struct seccomp_metadata md;
3079 	long ret;
3080 
3081 	/* Only real root can get metadata. */
3082 	if (geteuid()) {
3083 		SKIP(return, "get_metadata requires real root");
3084 		return;
3085 	}
3086 
3087 	ASSERT_EQ(0, pipe(pipefd));
3088 
3089 	pid = fork();
3090 	ASSERT_GE(pid, 0);
3091 	if (pid == 0) {
3092 		struct sock_filter filter[] = {
3093 			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3094 		};
3095 		struct sock_fprog prog = {
3096 			.len = (unsigned short)ARRAY_SIZE(filter),
3097 			.filter = filter,
3098 		};
3099 
3100 		/* one with log, one without */
3101 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
3102 				     SECCOMP_FILTER_FLAG_LOG, &prog));
3103 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
3104 
3105 		EXPECT_EQ(0, close(pipefd[0]));
3106 		ASSERT_EQ(1, write(pipefd[1], "1", 1));
3107 		ASSERT_EQ(0, close(pipefd[1]));
3108 
3109 		while (1)
3110 			sleep(100);
3111 	}
3112 
3113 	ASSERT_EQ(0, close(pipefd[1]));
3114 	ASSERT_EQ(1, read(pipefd[0], &buf, 1));
3115 
3116 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
3117 	ASSERT_EQ(pid, waitpid(pid, NULL, 0));
3118 
3119 	/* Past here must not use ASSERT or child process is never killed. */
3120 
3121 	md.filter_off = 0;
3122 	errno = 0;
3123 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3124 	EXPECT_EQ(sizeof(md), ret) {
3125 		if (errno == EINVAL)
3126 			SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
3127 	}
3128 
3129 	EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
3130 	EXPECT_EQ(md.filter_off, 0);
3131 
3132 	md.filter_off = 1;
3133 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3134 	EXPECT_EQ(sizeof(md), ret);
3135 	EXPECT_EQ(md.flags, 0);
3136 	EXPECT_EQ(md.filter_off, 1);
3137 
3138 skip:
3139 	ASSERT_EQ(0, kill(pid, SIGKILL));
3140 }
3141 
3142 static int user_notif_syscall(int nr, unsigned int flags)
3143 {
3144 	struct sock_filter filter[] = {
3145 		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
3146 			offsetof(struct seccomp_data, nr)),
3147 		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
3148 		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
3149 		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
3150 	};
3151 
3152 	struct sock_fprog prog = {
3153 		.len = (unsigned short)ARRAY_SIZE(filter),
3154 		.filter = filter,
3155 	};
3156 
3157 	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
3158 }
3159 
3160 #define USER_NOTIF_MAGIC INT_MAX
3161 TEST(user_notification_basic)
3162 {
3163 	pid_t pid;
3164 	long ret;
3165 	int status, listener;
3166 	struct seccomp_notif req = {};
3167 	struct seccomp_notif_resp resp = {};
3168 	struct pollfd pollfd;
3169 
3170 	struct sock_filter filter[] = {
3171 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3172 	};
3173 	struct sock_fprog prog = {
3174 		.len = (unsigned short)ARRAY_SIZE(filter),
3175 		.filter = filter,
3176 	};
3177 
3178 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3179 	ASSERT_EQ(0, ret) {
3180 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3181 	}
3182 
3183 	pid = fork();
3184 	ASSERT_GE(pid, 0);
3185 
3186 	/* Check that we get -ENOSYS with no listener attached */
3187 	if (pid == 0) {
3188 		if (user_notif_syscall(__NR_getppid, 0) < 0)
3189 			exit(1);
3190 		ret = syscall(__NR_getppid);
3191 		exit(ret >= 0 || errno != ENOSYS);
3192 	}
3193 
3194 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3195 	EXPECT_EQ(true, WIFEXITED(status));
3196 	EXPECT_EQ(0, WEXITSTATUS(status));
3197 
3198 	/* Add some no-op filters for grins. */
3199 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3200 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3201 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3202 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3203 
3204 	/* Check that the basic notification machinery works */
3205 	listener = user_notif_syscall(__NR_getppid,
3206 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3207 	ASSERT_GE(listener, 0);
3208 
3209 	/* Installing a second listener in the chain should EBUSY */
3210 	EXPECT_EQ(user_notif_syscall(__NR_getppid,
3211 				     SECCOMP_FILTER_FLAG_NEW_LISTENER),
3212 		  -1);
3213 	EXPECT_EQ(errno, EBUSY);
3214 
3215 	pid = fork();
3216 	ASSERT_GE(pid, 0);
3217 
3218 	if (pid == 0) {
3219 		ret = syscall(__NR_getppid);
3220 		exit(ret != USER_NOTIF_MAGIC);
3221 	}
3222 
3223 	pollfd.fd = listener;
3224 	pollfd.events = POLLIN | POLLOUT;
3225 
3226 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3227 	EXPECT_EQ(pollfd.revents, POLLIN);
3228 
3229 	/* Test that we can't pass garbage to the kernel. */
3230 	memset(&req, 0, sizeof(req));
3231 	req.pid = -1;
3232 	errno = 0;
3233 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
3234 	EXPECT_EQ(-1, ret);
3235 	EXPECT_EQ(EINVAL, errno);
3236 
3237 	if (ret) {
3238 		req.pid = 0;
3239 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3240 	}
3241 
3242 	pollfd.fd = listener;
3243 	pollfd.events = POLLIN | POLLOUT;
3244 
3245 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3246 	EXPECT_EQ(pollfd.revents, POLLOUT);
3247 
3248 	EXPECT_EQ(req.data.nr,  __NR_getppid);
3249 
3250 	resp.id = req.id;
3251 	resp.error = 0;
3252 	resp.val = USER_NOTIF_MAGIC;
3253 
3254 	/* check that we make sure flags == 0 */
3255 	resp.flags = 1;
3256 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3257 	EXPECT_EQ(errno, EINVAL);
3258 
3259 	resp.flags = 0;
3260 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3261 
3262 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3263 	EXPECT_EQ(true, WIFEXITED(status));
3264 	EXPECT_EQ(0, WEXITSTATUS(status));
3265 }
3266 
3267 TEST(user_notification_with_tsync)
3268 {
3269 	int ret;
3270 	unsigned int flags;
3271 
3272 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3273 	ASSERT_EQ(0, ret) {
3274 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3275 	}
3276 
3277 	/* these were exclusive */
3278 	flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3279 		SECCOMP_FILTER_FLAG_TSYNC;
3280 	ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
3281 	ASSERT_EQ(EINVAL, errno);
3282 
3283 	/* but now they're not */
3284 	flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3285 	ret = user_notif_syscall(__NR_getppid, flags);
3286 	close(ret);
3287 	ASSERT_LE(0, ret);
3288 }
3289 
3290 TEST(user_notification_kill_in_middle)
3291 {
3292 	pid_t pid;
3293 	long ret;
3294 	int listener;
3295 	struct seccomp_notif req = {};
3296 	struct seccomp_notif_resp resp = {};
3297 
3298 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3299 	ASSERT_EQ(0, ret) {
3300 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3301 	}
3302 
3303 	listener = user_notif_syscall(__NR_getppid,
3304 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3305 	ASSERT_GE(listener, 0);
3306 
3307 	/*
3308 	 * Check that nothing bad happens when we kill the task in the middle
3309 	 * of a syscall.
3310 	 */
3311 	pid = fork();
3312 	ASSERT_GE(pid, 0);
3313 
3314 	if (pid == 0) {
3315 		ret = syscall(__NR_getppid);
3316 		exit(ret != USER_NOTIF_MAGIC);
3317 	}
3318 
3319 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3320 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
3321 
3322 	EXPECT_EQ(kill(pid, SIGKILL), 0);
3323 	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
3324 
3325 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
3326 
3327 	resp.id = req.id;
3328 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
3329 	EXPECT_EQ(ret, -1);
3330 	EXPECT_EQ(errno, ENOENT);
3331 }
3332 
/* Descriptor the handler writes to; -1 until a test points it at a socket. */
static int handled = -1;

/*
 * Signal handler: write a single 'c' byte to the 'handled' descriptor so the
 * other end can observe that the signal was delivered.
 *
 * errno is saved and restored so the interrupted code never sees a value
 * clobbered by the handler. perror() is kept for the failure path only.
 * NOTE(review): perror() is not async-signal-safe; it is only reached when
 * the write itself fails.
 */
static void signal_handler(int signal)
{
	int saved_errno = errno;

	(void)signal;

	if (write(handled, "c", 1) != 1)
		perror("write from signal");

	errno = saved_errno;
}
3340 
3341 TEST(user_notification_signal)
3342 {
3343 	pid_t pid;
3344 	long ret;
3345 	int status, listener, sk_pair[2];
3346 	struct seccomp_notif req = {};
3347 	struct seccomp_notif_resp resp = {};
3348 	char c;
3349 
3350 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3351 	ASSERT_EQ(0, ret) {
3352 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3353 	}
3354 
3355 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
3356 
3357 	listener = user_notif_syscall(__NR_gettid,
3358 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3359 	ASSERT_GE(listener, 0);
3360 
3361 	pid = fork();
3362 	ASSERT_GE(pid, 0);
3363 
3364 	if (pid == 0) {
3365 		close(sk_pair[0]);
3366 		handled = sk_pair[1];
3367 		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
3368 			perror("signal");
3369 			exit(1);
3370 		}
3371 		/*
3372 		 * ERESTARTSYS behavior is a bit hard to test, because we need
3373 		 * to rely on a signal that has not yet been handled. Let's at
3374 		 * least check that the error code gets propagated through, and
3375 		 * hope that it doesn't break when there is actually a signal :)
3376 		 */
3377 		ret = syscall(__NR_gettid);
3378 		exit(!(ret == -1 && errno == 512));
3379 	}
3380 
3381 	close(sk_pair[1]);
3382 
3383 	memset(&req, 0, sizeof(req));
3384 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3385 
3386 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
3387 
3388 	/*
3389 	 * Make sure the signal really is delivered, which means we're not
3390 	 * stuck in the user notification code any more and the notification
3391 	 * should be dead.
3392 	 */
3393 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
3394 
3395 	resp.id = req.id;
3396 	resp.error = -EPERM;
3397 	resp.val = 0;
3398 
3399 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3400 	EXPECT_EQ(errno, ENOENT);
3401 
3402 	memset(&req, 0, sizeof(req));
3403 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3404 
3405 	resp.id = req.id;
3406 	resp.error = -512; /* -ERESTARTSYS */
3407 	resp.val = 0;
3408 
3409 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3410 
3411 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3412 	EXPECT_EQ(true, WIFEXITED(status));
3413 	EXPECT_EQ(0, WEXITSTATUS(status));
3414 }
3415 
3416 TEST(user_notification_closed_listener)
3417 {
3418 	pid_t pid;
3419 	long ret;
3420 	int status, listener;
3421 
3422 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3423 	ASSERT_EQ(0, ret) {
3424 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3425 	}
3426 
3427 	listener = user_notif_syscall(__NR_getppid,
3428 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3429 	ASSERT_GE(listener, 0);
3430 
3431 	/*
3432 	 * Check that we get an ENOSYS when the listener is closed.
3433 	 */
3434 	pid = fork();
3435 	ASSERT_GE(pid, 0);
3436 	if (pid == 0) {
3437 		close(listener);
3438 		ret = syscall(__NR_getppid);
3439 		exit(ret != -1 && errno != ENOSYS);
3440 	}
3441 
3442 	close(listener);
3443 
3444 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3445 	EXPECT_EQ(true, WIFEXITED(status));
3446 	EXPECT_EQ(0, WEXITSTATUS(status));
3447 }
3448 
3449 /*
3450  * Check that a pid in a child namespace still shows up as valid in ours.
3451  */
3452 TEST(user_notification_child_pid_ns)
3453 {
3454 	pid_t pid;
3455 	int status, listener;
3456 	struct seccomp_notif req = {};
3457 	struct seccomp_notif_resp resp = {};
3458 
3459 	ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
3460 		if (errno == EINVAL)
3461 			SKIP(return, "kernel missing CLONE_NEWUSER support");
3462 	};
3463 
3464 	listener = user_notif_syscall(__NR_getppid,
3465 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3466 	ASSERT_GE(listener, 0);
3467 
3468 	pid = fork();
3469 	ASSERT_GE(pid, 0);
3470 
3471 	if (pid == 0)
3472 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3473 
3474 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3475 	EXPECT_EQ(req.pid, pid);
3476 
3477 	resp.id = req.id;
3478 	resp.error = 0;
3479 	resp.val = USER_NOTIF_MAGIC;
3480 
3481 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3482 
3483 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3484 	EXPECT_EQ(true, WIFEXITED(status));
3485 	EXPECT_EQ(0, WEXITSTATUS(status));
3486 	close(listener);
3487 }
3488 
3489 /*
3490  * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3491  * invalid.
3492  */
3493 TEST(user_notification_sibling_pid_ns)
3494 {
3495 	pid_t pid, pid2;
3496 	int status, listener;
3497 	struct seccomp_notif req = {};
3498 	struct seccomp_notif_resp resp = {};
3499 
3500 	ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
3501 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3502 	}
3503 
3504 	listener = user_notif_syscall(__NR_getppid,
3505 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3506 	ASSERT_GE(listener, 0);
3507 
3508 	pid = fork();
3509 	ASSERT_GE(pid, 0);
3510 
3511 	if (pid == 0) {
3512 		ASSERT_EQ(unshare(CLONE_NEWPID), 0);
3513 
3514 		pid2 = fork();
3515 		ASSERT_GE(pid2, 0);
3516 
3517 		if (pid2 == 0)
3518 			exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3519 
3520 		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3521 		EXPECT_EQ(true, WIFEXITED(status));
3522 		EXPECT_EQ(0, WEXITSTATUS(status));
3523 		exit(WEXITSTATUS(status));
3524 	}
3525 
3526 	/* Create the sibling ns, and sibling in it. */
3527 	ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3528 		if (errno == EPERM)
3529 			SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3530 	}
3531 	ASSERT_EQ(errno, 0);
3532 
3533 	pid2 = fork();
3534 	ASSERT_GE(pid2, 0);
3535 
3536 	if (pid2 == 0) {
3537 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3538 		/*
3539 		 * The pid should be 0, i.e. the task is in some namespace that
3540 		 * we can't "see".
3541 		 */
3542 		EXPECT_EQ(req.pid, 0);
3543 
3544 		resp.id = req.id;
3545 		resp.error = 0;
3546 		resp.val = USER_NOTIF_MAGIC;
3547 
3548 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3549 		exit(0);
3550 	}
3551 
3552 	close(listener);
3553 
3554 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3555 	EXPECT_EQ(true, WIFEXITED(status));
3556 	EXPECT_EQ(0, WEXITSTATUS(status));
3557 
3558 	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3559 	EXPECT_EQ(true, WIFEXITED(status));
3560 	EXPECT_EQ(0, WEXITSTATUS(status));
3561 }
3562 
3563 TEST(user_notification_fault_recv)
3564 {
3565 	pid_t pid;
3566 	int status, listener;
3567 	struct seccomp_notif req = {};
3568 	struct seccomp_notif_resp resp = {};
3569 
3570 	ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
3571 
3572 	listener = user_notif_syscall(__NR_getppid,
3573 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3574 	ASSERT_GE(listener, 0);
3575 
3576 	pid = fork();
3577 	ASSERT_GE(pid, 0);
3578 
3579 	if (pid == 0)
3580 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3581 
3582 	/* Do a bad recv() */
3583 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
3584 	EXPECT_EQ(errno, EFAULT);
3585 
3586 	/* We should still be able to receive this notification, though. */
3587 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3588 	EXPECT_EQ(req.pid, pid);
3589 
3590 	resp.id = req.id;
3591 	resp.error = 0;
3592 	resp.val = USER_NOTIF_MAGIC;
3593 
3594 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3595 
3596 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3597 	EXPECT_EQ(true, WIFEXITED(status));
3598 	EXPECT_EQ(0, WEXITSTATUS(status));
3599 }
3600 
3601 TEST(seccomp_get_notif_sizes)
3602 {
3603 	struct seccomp_notif_sizes sizes;
3604 
3605 	ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
3606 	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
3607 	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
3608 }
3609 
3610 TEST(user_notification_continue)
3611 {
3612 	pid_t pid;
3613 	long ret;
3614 	int status, listener;
3615 	struct seccomp_notif req = {};
3616 	struct seccomp_notif_resp resp = {};
3617 	struct pollfd pollfd;
3618 
3619 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3620 	ASSERT_EQ(0, ret) {
3621 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3622 	}
3623 
3624 	listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3625 	ASSERT_GE(listener, 0);
3626 
3627 	pid = fork();
3628 	ASSERT_GE(pid, 0);
3629 
3630 	if (pid == 0) {
3631 		int dup_fd, pipe_fds[2];
3632 		pid_t self;
3633 
3634 		ASSERT_GE(pipe(pipe_fds), 0);
3635 
3636 		dup_fd = dup(pipe_fds[0]);
3637 		ASSERT_GE(dup_fd, 0);
3638 		EXPECT_NE(pipe_fds[0], dup_fd);
3639 
3640 		self = getpid();
3641 		ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
3642 		exit(0);
3643 	}
3644 
3645 	pollfd.fd = listener;
3646 	pollfd.events = POLLIN | POLLOUT;
3647 
3648 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3649 	EXPECT_EQ(pollfd.revents, POLLIN);
3650 
3651 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3652 
3653 	pollfd.fd = listener;
3654 	pollfd.events = POLLIN | POLLOUT;
3655 
3656 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3657 	EXPECT_EQ(pollfd.revents, POLLOUT);
3658 
3659 	EXPECT_EQ(req.data.nr, __NR_dup);
3660 
3661 	resp.id = req.id;
3662 	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
3663 
3664 	/*
3665 	 * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
3666 	 * args be set to 0.
3667 	 */
3668 	resp.error = 0;
3669 	resp.val = USER_NOTIF_MAGIC;
3670 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3671 	EXPECT_EQ(errno, EINVAL);
3672 
3673 	resp.error = USER_NOTIF_MAGIC;
3674 	resp.val = 0;
3675 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3676 	EXPECT_EQ(errno, EINVAL);
3677 
3678 	resp.error = 0;
3679 	resp.val = 0;
3680 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
3681 		if (errno == EINVAL)
3682 			SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
3683 	}
3684 
3685 skip:
3686 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3687 	EXPECT_EQ(true, WIFEXITED(status));
3688 	EXPECT_EQ(0, WEXITSTATUS(status)) {
3689 		if (WEXITSTATUS(status) == 2) {
3690 			SKIP(return, "Kernel does not support kcmp() syscall");
3691 			return;
3692 		}
3693 	}
3694 }
3695 
3696 TEST(user_notification_filter_empty)
3697 {
3698 	pid_t pid;
3699 	long ret;
3700 	int status;
3701 	struct pollfd pollfd;
3702 	struct clone_args args = {
3703 		.flags = CLONE_FILES,
3704 		.exit_signal = SIGCHLD,
3705 	};
3706 
3707 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3708 	ASSERT_EQ(0, ret) {
3709 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3710 	}
3711 
3712 	pid = sys_clone3(&args, sizeof(args));
3713 	ASSERT_GE(pid, 0);
3714 
3715 	if (pid == 0) {
3716 		int listener;
3717 
3718 		listener = user_notif_syscall(__NR_mknod, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3719 		if (listener < 0)
3720 			_exit(EXIT_FAILURE);
3721 
3722 		if (dup2(listener, 200) != 200)
3723 			_exit(EXIT_FAILURE);
3724 
3725 		close(listener);
3726 
3727 		_exit(EXIT_SUCCESS);
3728 	}
3729 
3730 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3731 	EXPECT_EQ(true, WIFEXITED(status));
3732 	EXPECT_EQ(0, WEXITSTATUS(status));
3733 
3734 	/*
3735 	 * The seccomp filter has become unused so we should be notified once
3736 	 * the kernel gets around to cleaning up task struct.
3737 	 */
3738 	pollfd.fd = 200;
3739 	pollfd.events = POLLHUP;
3740 
3741 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3742 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3743 }
3744 
/* Thread entry point that ignores its argument and returns NULL at once. */
static void *do_thread(void *data)
{
	void *result = NULL;

	return result;
}
3749 
3750 TEST(user_notification_filter_empty_threaded)
3751 {
3752 	pid_t pid;
3753 	long ret;
3754 	int status;
3755 	struct pollfd pollfd;
3756 	struct clone_args args = {
3757 		.flags = CLONE_FILES,
3758 		.exit_signal = SIGCHLD,
3759 	};
3760 
3761 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3762 	ASSERT_EQ(0, ret) {
3763 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3764 	}
3765 
3766 	pid = sys_clone3(&args, sizeof(args));
3767 	ASSERT_GE(pid, 0);
3768 
3769 	if (pid == 0) {
3770 		pid_t pid1, pid2;
3771 		int listener, status;
3772 		pthread_t thread;
3773 
3774 		listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3775 		if (listener < 0)
3776 			_exit(EXIT_FAILURE);
3777 
3778 		if (dup2(listener, 200) != 200)
3779 			_exit(EXIT_FAILURE);
3780 
3781 		close(listener);
3782 
3783 		pid1 = fork();
3784 		if (pid1 < 0)
3785 			_exit(EXIT_FAILURE);
3786 
3787 		if (pid1 == 0)
3788 			_exit(EXIT_SUCCESS);
3789 
3790 		pid2 = fork();
3791 		if (pid2 < 0)
3792 			_exit(EXIT_FAILURE);
3793 
3794 		if (pid2 == 0)
3795 			_exit(EXIT_SUCCESS);
3796 
3797 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
3798 		    pthread_join(thread, NULL))
3799 			_exit(EXIT_FAILURE);
3800 
3801 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
3802 		    pthread_join(thread, NULL))
3803 			_exit(EXIT_FAILURE);
3804 
3805 		if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
3806 		    WEXITSTATUS(status))
3807 			_exit(EXIT_FAILURE);
3808 
3809 		if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
3810 		    WEXITSTATUS(status))
3811 			_exit(EXIT_FAILURE);
3812 
3813 		exit(EXIT_SUCCESS);
3814 	}
3815 
3816 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3817 	EXPECT_EQ(true, WIFEXITED(status));
3818 	EXPECT_EQ(0, WEXITSTATUS(status));
3819 
3820 	/*
3821 	 * The seccomp filter has become unused so we should be notified once
3822 	 * the kernel gets around to cleaning up task struct.
3823 	 */
3824 	pollfd.fd = 200;
3825 	pollfd.events = POLLHUP;
3826 
3827 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3828 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3829 }
3830 
3831 TEST(user_notification_addfd)
3832 {
3833 	pid_t pid;
3834 	long ret;
3835 	int status, listener, memfd, fd;
3836 	struct seccomp_notif_addfd addfd = {};
3837 	struct seccomp_notif_addfd_small small = {};
3838 	struct seccomp_notif_addfd_big big = {};
3839 	struct seccomp_notif req = {};
3840 	struct seccomp_notif_resp resp = {};
3841 	/* 100 ms */
3842 	struct timespec delay = { .tv_nsec = 100000000 };
3843 
3844 	memfd = memfd_create("test", 0);
3845 	ASSERT_GE(memfd, 0);
3846 
3847 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3848 	ASSERT_EQ(0, ret) {
3849 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3850 	}
3851 
3852 	/* Check that the basic notification machinery works */
3853 	listener = user_notif_syscall(__NR_getppid,
3854 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3855 	ASSERT_GE(listener, 0);
3856 
3857 	pid = fork();
3858 	ASSERT_GE(pid, 0);
3859 
3860 	if (pid == 0) {
3861 		if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
3862 			exit(1);
3863 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3864 	}
3865 
3866 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3867 
3868 	addfd.srcfd = memfd;
3869 	addfd.newfd = 0;
3870 	addfd.id = req.id;
3871 	addfd.flags = 0x0;
3872 
3873 	/* Verify bad newfd_flags cannot be set */
3874 	addfd.newfd_flags = ~O_CLOEXEC;
3875 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
3876 	EXPECT_EQ(errno, EINVAL);
3877 	addfd.newfd_flags = O_CLOEXEC;
3878 
3879 	/* Verify bad flags cannot be set */
3880 	addfd.flags = 0xff;
3881 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
3882 	EXPECT_EQ(errno, EINVAL);
3883 	addfd.flags = 0;
3884 
3885 	/* Verify that remote_fd cannot be set without setting flags */
3886 	addfd.newfd = 1;
3887 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
3888 	EXPECT_EQ(errno, EINVAL);
3889 	addfd.newfd = 0;
3890 
3891 	/* Verify small size cannot be set */
3892 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
3893 	EXPECT_EQ(errno, EINVAL);
3894 
3895 	/* Verify we can't send bits filled in unknown buffer area */
3896 	memset(&big, 0xAA, sizeof(big));
3897 	big.addfd = addfd;
3898 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
3899 	EXPECT_EQ(errno, E2BIG);
3900 
3901 
3902 	/* Verify we can set an arbitrary remote fd */
3903 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
3904 	/*
3905 	 * The child has fds 0(stdin), 1(stdout), 2(stderr), 3(memfd),
3906 	 * 4(listener), so the newly allocated fd should be 5.
3907 	 */
3908 	EXPECT_EQ(fd, 5);
3909 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
3910 
3911 	/* Verify we can set an arbitrary remote fd with large size */
3912 	memset(&big, 0x0, sizeof(big));
3913 	big.addfd = addfd;
3914 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
3915 	EXPECT_EQ(fd, 6);
3916 
3917 	/* Verify we can set a specific remote fd */
3918 	addfd.newfd = 42;
3919 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
3920 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
3921 	EXPECT_EQ(fd, 42);
3922 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
3923 
3924 	/* Resume syscall */
3925 	resp.id = req.id;
3926 	resp.error = 0;
3927 	resp.val = USER_NOTIF_MAGIC;
3928 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3929 
3930 	/*
3931 	 * This sets the ID of the ADD FD to the last request plus 1. The
3932 	 * notification ID increments 1 per notification.
3933 	 */
3934 	addfd.id = req.id + 1;
3935 
3936 	/* This spins until the underlying notification is generated */
3937 	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
3938 	       errno != -EINPROGRESS)
3939 		nanosleep(&delay, NULL);
3940 
3941 	memset(&req, 0, sizeof(req));
3942 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3943 	ASSERT_EQ(addfd.id, req.id);
3944 
3945 	resp.id = req.id;
3946 	resp.error = 0;
3947 	resp.val = USER_NOTIF_MAGIC;
3948 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3949 
3950 	/* Wait for child to finish. */
3951 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3952 	EXPECT_EQ(true, WIFEXITED(status));
3953 	EXPECT_EQ(0, WEXITSTATUS(status));
3954 
3955 	close(memfd);
3956 }
3957 
3958 TEST(user_notification_addfd_rlimit)
3959 {
3960 	pid_t pid;
3961 	long ret;
3962 	int status, listener, memfd;
3963 	struct seccomp_notif_addfd addfd = {};
3964 	struct seccomp_notif req = {};
3965 	struct seccomp_notif_resp resp = {};
3966 	const struct rlimit lim = {
3967 		.rlim_cur	= 0,
3968 		.rlim_max	= 0,
3969 	};
3970 
3971 	memfd = memfd_create("test", 0);
3972 	ASSERT_GE(memfd, 0);
3973 
3974 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3975 	ASSERT_EQ(0, ret) {
3976 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3977 	}
3978 
3979 	/* Check that the basic notification machinery works */
3980 	listener = user_notif_syscall(__NR_getppid,
3981 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3982 	ASSERT_GE(listener, 0);
3983 
3984 	pid = fork();
3985 	ASSERT_GE(pid, 0);
3986 
3987 	if (pid == 0)
3988 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3989 
3990 
3991 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3992 
3993 	ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
3994 
3995 	addfd.srcfd = memfd;
3996 	addfd.newfd_flags = O_CLOEXEC;
3997 	addfd.newfd = 0;
3998 	addfd.id = req.id;
3999 	addfd.flags = 0;
4000 
4001 	/* Should probably spot check /proc/sys/fs/file-nr */
4002 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4003 	EXPECT_EQ(errno, EMFILE);
4004 
4005 	addfd.newfd = 100;
4006 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4007 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4008 	EXPECT_EQ(errno, EBADF);
4009 
4010 	resp.id = req.id;
4011 	resp.error = 0;
4012 	resp.val = USER_NOTIF_MAGIC;
4013 
4014 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4015 
4016 	/* Wait for child to finish. */
4017 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4018 	EXPECT_EQ(true, WIFEXITED(status));
4019 	EXPECT_EQ(0, WEXITSTATUS(status));
4020 
4021 	close(memfd);
4022 }
4023 
4024 /*
4025  * TODO:
4026  * - expand NNP testing
4027  * - better arch-specific TRACE and TRAP handlers.
4028  * - endianness checking when appropriate
4029  * - 64-bit arg prodding
4030  * - arch value testing (x86 modes especially)
4031  * - verify that FILTER_FLAG_LOG filters generate log messages
4032  * - verify that RET_LOG generates log messages
4033  */
4034 
4035 TEST_HARNESS_MAIN
4036