xref: /openbmc/linux/tools/testing/selftests/seccomp/seccomp_bpf.c (revision f8523d0e83613ab8d082cd504dc53a09fbba4889)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7 
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10 
11 /*
12  * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
13  * we need to use the kernel's siginfo.h file and trick glibc
14  * into accepting it.
15  */
16 #if !__GLIBC_PREREQ(2, 26)
17 # include <asm/siginfo.h>
18 # define __have_siginfo_t 1
19 # define __have_sigval_t 1
20 # define __have_sigevent_t 1
21 #endif
22 
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/user.h>
28 #include <linux/prctl.h>
29 #include <linux/ptrace.h>
30 #include <linux/seccomp.h>
31 #include <pthread.h>
32 #include <semaphore.h>
33 #include <signal.h>
34 #include <stddef.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <time.h>
38 #include <limits.h>
39 #include <linux/elf.h>
40 #include <sys/uio.h>
41 #include <sys/utsname.h>
42 #include <sys/fcntl.h>
43 #include <sys/mman.h>
44 #include <sys/times.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <linux/kcmp.h>
48 
49 #include <unistd.h>
50 #include <sys/syscall.h>
51 #include <poll.h>
52 
53 #include "../kselftest_harness.h"
54 
/*
 * Fallbacks for build environments whose headers lack these prctl() and
 * seccomp constants.  Each group is skipped when the headers already
 * supplied the guarding name.
 */
#if !defined(PR_SET_PTRACER)
# define PR_SET_PTRACER 0x59616d61
#endif

/* The GET value is defined together with, and guarded by, the SET value. */
#if !defined(PR_SET_NO_NEW_PRIVS)
# define PR_SET_NO_NEW_PRIVS 38
# define PR_GET_NO_NEW_PRIVS 39
#endif

#if !defined(PR_SECCOMP_EXT)
# define PR_SECCOMP_EXT 43
#endif

#if !defined(SECCOMP_EXT_ACT)
# define SECCOMP_EXT_ACT 1
#endif

#if !defined(SECCOMP_EXT_ACT_TSYNC)
# define SECCOMP_EXT_ACT_TSYNC 1
#endif

/* Modes handed to prctl(PR_SET_SECCOMP, ...) by the tests below. */
#if !defined(SECCOMP_MODE_STRICT)
# define SECCOMP_MODE_STRICT 1
#endif

#if !defined(SECCOMP_MODE_FILTER)
# define SECCOMP_MODE_FILTER 2
#endif
83 
/*
 * Local copy of struct seccomp_data, the record the BPF programs in this
 * file index into via offsetof().  It is only declared when the headers did
 * not define SECCOMP_RET_ALLOW, which is used here as the marker for
 * "seccomp definitions are missing".
 */
#if !defined(SECCOMP_RET_ALLOW)
struct seccomp_data {
	int nr;				/* loaded by filters to match a syscall number */
	__u32 arch;
	__u64 instruction_pointer;
	__u64 args[6];			/* loaded by filters through syscall_arg() */
};
#endif
92 
/*
 * Filter return actions, supplied for headers that predate them.  The three
 * groups are guarded independently.
 */
#if !defined(SECCOMP_RET_KILL_PROCESS)
# define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
# define SECCOMP_RET_KILL_THREAD  0x00000000U /* kill the thread */
#endif

#if !defined(SECCOMP_RET_KILL)
# define SECCOMP_RET_KILL  SECCOMP_RET_KILL_THREAD
# define SECCOMP_RET_TRAP  0x00030000U /* disallow and force a SIGSYS */
# define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
# define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */
# define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
#endif

#if !defined(SECCOMP_RET_LOG)
# define SECCOMP_RET_LOG   0x7ffc0000U /* allow after logging */
#endif
107 
/*
 * Per-architecture number of the seccomp syscall, for headers that do not
 * define __NR_seccomp.  Unlisted architectures get a build warning and the
 * placeholder value 0xffff.
 */
#if !defined(__NR_seccomp)
# if defined(__i386__)
#  define __NR_seccomp 354
# elif defined(__x86_64__)
#  define __NR_seccomp 317
# elif defined(__arm__)
#  define __NR_seccomp 383
# elif defined(__aarch64__) || defined(__riscv)
#  define __NR_seccomp 277
# elif defined(__hppa__)
#  define __NR_seccomp 338
# elif defined(__powerpc__)
#  define __NR_seccomp 358
# elif defined(__s390__)
#  define __NR_seccomp 348
# else
#  warning "seccomp syscall number unknown for this architecture"
#  define __NR_seccomp 0xffff
# endif
#endif
130 
/* Operation codes for the first argument of seccomp(). */
#if !defined(SECCOMP_SET_MODE_STRICT)
# define SECCOMP_SET_MODE_STRICT 0
#endif

#if !defined(SECCOMP_SET_MODE_FILTER)
# define SECCOMP_SET_MODE_FILTER 1
#endif

#if !defined(SECCOMP_GET_ACTION_AVAIL)
# define SECCOMP_GET_ACTION_AVAIL 2
#endif

#if !defined(SECCOMP_GET_NOTIF_SIZES)
# define SECCOMP_GET_NOTIF_SIZES 3
#endif

/* Filter flag bits; each is a distinct single bit. */
#if !defined(SECCOMP_FILTER_FLAG_TSYNC)
# define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
#endif

#if !defined(SECCOMP_FILTER_FLAG_LOG)
# define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif

#if !defined(SECCOMP_FILTER_FLAG_SPEC_ALLOW)
# define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#endif
158 
/*
 * ptrace request number and its argument structure, provided together when
 * the headers lack PTRACE_SECCOMP_GET_METADATA.
 */
#if !defined(PTRACE_SECCOMP_GET_METADATA)
# define PTRACE_SECCOMP_GET_METADATA 0x420d

struct seccomp_metadata {
	__u64 filter_off;	/* Input: which filter */
	__u64 flags;		/* Output: filter's flags */
};
#endif
167 
168 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
169 #define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
170 
171 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U
172 
173 #define SECCOMP_IOC_MAGIC		'!'
174 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
175 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
176 #define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
177 #define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
178 
179 /* Flags for seccomp notification fd ioctl. */
180 #define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
181 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
182 						struct seccomp_notif_resp)
183 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
184 
185 struct seccomp_notif {
186 	__u64 id;
187 	__u32 pid;
188 	__u32 flags;
189 	struct seccomp_data data;
190 };
191 
192 struct seccomp_notif_resp {
193 	__u64 id;
194 	__s64 val;
195 	__s32 error;
196 	__u32 flags;
197 };
198 
199 struct seccomp_notif_sizes {
200 	__u16 seccomp_notif;
201 	__u16 seccomp_notif_resp;
202 	__u16 seccomp_data;
203 };
204 #endif
205 
/* Entry/exit markers, defined as a pair when the ENTRY name is missing. */
#if !defined(PTRACE_EVENTMSG_SYSCALL_ENTRY)
# define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
# define PTRACE_EVENTMSG_SYSCALL_EXIT	2
#endif

#if !defined(SECCOMP_USER_NOTIF_FLAG_CONTINUE)
# define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
#endif

#if !defined(SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
# define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
#endif
218 
#ifndef seccomp
/*
 * seccomp - thin wrapper that issues the seccomp syscall via syscall().
 * @op:    operation code (SECCOMP_SET_MODE_* / SECCOMP_GET_*)
 * @flags: flags for the operation
 * @args:  operation-specific argument pointer
 *
 * errno is reset to 0 before the call.  Returns whatever syscall() returns,
 * converted to int.
 */
int seccomp(unsigned int op, unsigned int flags, void *args)
{
	long rc;

	errno = 0;
	rc = syscall(__NR_seccomp, op, flags, args);

	return rc;
}
#endif
226 
/*
 * syscall_arg(_n) - byte offset inside struct seccomp_data used by the
 * filters' 32-bit loads of argument _n.  On big-endian builds the offset is
 * advanced by sizeof(__u32) within the 64-bit slot; on little-endian builds
 * the slot's own offset is used.
 */
#if __BYTE_ORDER == __BIG_ENDIAN
# define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
#elif __BYTE_ORDER == __LITTLE_ENDIAN
# define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
#else
# error "wut? Unknown __BYTE_ORDER?!"
#endif
234 
/*
 * Sentinel values that sibling threads hand back through their thread
 * return pointer:
 *   UNKILLED - kill_thread() was not asked to die and returned normally
 *   FAILURE  - kill_thread() was asked to die but its prctl() returned
 *   NEWPRIVS - not referenced in this part of the file
 */
#define SIBLING_EXIT_UNKILLED 0xbadbeef
#define SIBLING_EXIT_FAILURE  0xbadface
#define SIBLING_EXIT_NEWPRIVS 0xbadfeed
238 
239 TEST(mode_strict_support)
240 {
241 	long ret;
242 
243 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
244 	ASSERT_EQ(0, ret) {
245 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
246 	}
247 	syscall(__NR_exit, 0);
248 }
249 
250 TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
251 {
252 	long ret;
253 
254 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
255 	ASSERT_EQ(0, ret) {
256 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
257 	}
258 	syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
259 		NULL, NULL, NULL);
260 	EXPECT_FALSE(true) {
261 		TH_LOG("Unreachable!");
262 	}
263 }
264 
265 /* Note! This doesn't test no new privs behavior */
266 TEST(no_new_privs_support)
267 {
268 	long ret;
269 
270 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
271 	EXPECT_EQ(0, ret) {
272 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
273 	}
274 }
275 
276 /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
277 TEST(mode_filter_support)
278 {
279 	long ret;
280 
281 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
282 	ASSERT_EQ(0, ret) {
283 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
284 	}
285 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
286 	EXPECT_EQ(-1, ret);
287 	EXPECT_EQ(EFAULT, errno) {
288 		TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
289 	}
290 }
291 
292 TEST(mode_filter_without_nnp)
293 {
294 	struct sock_filter filter[] = {
295 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
296 	};
297 	struct sock_fprog prog = {
298 		.len = (unsigned short)ARRAY_SIZE(filter),
299 		.filter = filter,
300 	};
301 	long ret;
302 
303 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
304 	ASSERT_LE(0, ret) {
305 		TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
306 	}
307 	errno = 0;
308 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
309 	/* Succeeds with CAP_SYS_ADMIN, fails without */
310 	/* TODO(wad) check caps not euid */
311 	if (geteuid()) {
312 		EXPECT_EQ(-1, ret);
313 		EXPECT_EQ(EACCES, errno);
314 	} else {
315 		EXPECT_EQ(0, ret);
316 	}
317 }
318 
/* Upper bound on install attempts made by the filter_chain_limits test. */
#define MAX_INSNS_PER_PATH 32768
320 
321 TEST(filter_size_limits)
322 {
323 	int i;
324 	int count = BPF_MAXINSNS + 1;
325 	struct sock_filter allow[] = {
326 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
327 	};
328 	struct sock_filter *filter;
329 	struct sock_fprog prog = { };
330 	long ret;
331 
332 	filter = calloc(count, sizeof(*filter));
333 	ASSERT_NE(NULL, filter);
334 
335 	for (i = 0; i < count; i++)
336 		filter[i] = allow[0];
337 
338 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
339 	ASSERT_EQ(0, ret);
340 
341 	prog.filter = filter;
342 	prog.len = count;
343 
344 	/* Too many filter instructions in a single filter. */
345 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
346 	ASSERT_NE(0, ret) {
347 		TH_LOG("Installing %d insn filter was allowed", prog.len);
348 	}
349 
350 	/* One less is okay, though. */
351 	prog.len -= 1;
352 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
353 	ASSERT_EQ(0, ret) {
354 		TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
355 	}
356 }
357 
358 TEST(filter_chain_limits)
359 {
360 	int i;
361 	int count = BPF_MAXINSNS;
362 	struct sock_filter allow[] = {
363 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
364 	};
365 	struct sock_filter *filter;
366 	struct sock_fprog prog = { };
367 	long ret;
368 
369 	filter = calloc(count, sizeof(*filter));
370 	ASSERT_NE(NULL, filter);
371 
372 	for (i = 0; i < count; i++)
373 		filter[i] = allow[0];
374 
375 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
376 	ASSERT_EQ(0, ret);
377 
378 	prog.filter = filter;
379 	prog.len = 1;
380 
381 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
382 	ASSERT_EQ(0, ret);
383 
384 	prog.len = count;
385 
386 	/* Too many total filter instructions. */
387 	for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
388 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
389 		if (ret != 0)
390 			break;
391 	}
392 	ASSERT_NE(0, ret) {
393 		TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
394 		       i, count, i * (count + 4));
395 	}
396 }
397 
398 TEST(mode_filter_cannot_move_to_strict)
399 {
400 	struct sock_filter filter[] = {
401 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
402 	};
403 	struct sock_fprog prog = {
404 		.len = (unsigned short)ARRAY_SIZE(filter),
405 		.filter = filter,
406 	};
407 	long ret;
408 
409 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
410 	ASSERT_EQ(0, ret);
411 
412 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
413 	ASSERT_EQ(0, ret);
414 
415 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
416 	EXPECT_EQ(-1, ret);
417 	EXPECT_EQ(EINVAL, errno);
418 }
419 
420 
421 TEST(mode_filter_get_seccomp)
422 {
423 	struct sock_filter filter[] = {
424 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
425 	};
426 	struct sock_fprog prog = {
427 		.len = (unsigned short)ARRAY_SIZE(filter),
428 		.filter = filter,
429 	};
430 	long ret;
431 
432 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
433 	ASSERT_EQ(0, ret);
434 
435 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
436 	EXPECT_EQ(0, ret);
437 
438 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
439 	ASSERT_EQ(0, ret);
440 
441 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
442 	EXPECT_EQ(2, ret);
443 }
444 
445 
446 TEST(ALLOW_all)
447 {
448 	struct sock_filter filter[] = {
449 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
450 	};
451 	struct sock_fprog prog = {
452 		.len = (unsigned short)ARRAY_SIZE(filter),
453 		.filter = filter,
454 	};
455 	long ret;
456 
457 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
458 	ASSERT_EQ(0, ret);
459 
460 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
461 	ASSERT_EQ(0, ret);
462 }
463 
464 TEST(empty_prog)
465 {
466 	struct sock_filter filter[] = {
467 	};
468 	struct sock_fprog prog = {
469 		.len = (unsigned short)ARRAY_SIZE(filter),
470 		.filter = filter,
471 	};
472 	long ret;
473 
474 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
475 	ASSERT_EQ(0, ret);
476 
477 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
478 	EXPECT_EQ(-1, ret);
479 	EXPECT_EQ(EINVAL, errno);
480 }
481 
482 TEST(log_all)
483 {
484 	struct sock_filter filter[] = {
485 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
486 	};
487 	struct sock_fprog prog = {
488 		.len = (unsigned short)ARRAY_SIZE(filter),
489 		.filter = filter,
490 	};
491 	long ret;
492 	pid_t parent = getppid();
493 
494 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
495 	ASSERT_EQ(0, ret);
496 
497 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
498 	ASSERT_EQ(0, ret);
499 
500 	/* getppid() should succeed and be logged (no check for logging) */
501 	EXPECT_EQ(parent, syscall(__NR_getppid));
502 }
503 
504 TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
505 {
506 	struct sock_filter filter[] = {
507 		BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
508 	};
509 	struct sock_fprog prog = {
510 		.len = (unsigned short)ARRAY_SIZE(filter),
511 		.filter = filter,
512 	};
513 	long ret;
514 
515 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
516 	ASSERT_EQ(0, ret);
517 
518 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
519 	ASSERT_EQ(0, ret);
520 	EXPECT_EQ(0, syscall(__NR_getpid)) {
521 		TH_LOG("getpid() shouldn't ever return");
522 	}
523 }
524 
525 /* return code >= 0x80000000 is unused. */
526 TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
527 {
528 	struct sock_filter filter[] = {
529 		BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
530 	};
531 	struct sock_fprog prog = {
532 		.len = (unsigned short)ARRAY_SIZE(filter),
533 		.filter = filter,
534 	};
535 	long ret;
536 
537 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
538 	ASSERT_EQ(0, ret);
539 
540 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
541 	ASSERT_EQ(0, ret);
542 	EXPECT_EQ(0, syscall(__NR_getpid)) {
543 		TH_LOG("getpid() shouldn't ever return");
544 	}
545 }
546 
547 TEST_SIGNAL(KILL_all, SIGSYS)
548 {
549 	struct sock_filter filter[] = {
550 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
551 	};
552 	struct sock_fprog prog = {
553 		.len = (unsigned short)ARRAY_SIZE(filter),
554 		.filter = filter,
555 	};
556 	long ret;
557 
558 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
559 	ASSERT_EQ(0, ret);
560 
561 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
562 	ASSERT_EQ(0, ret);
563 }
564 
565 TEST_SIGNAL(KILL_one, SIGSYS)
566 {
567 	struct sock_filter filter[] = {
568 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
569 			offsetof(struct seccomp_data, nr)),
570 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
571 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
572 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
573 	};
574 	struct sock_fprog prog = {
575 		.len = (unsigned short)ARRAY_SIZE(filter),
576 		.filter = filter,
577 	};
578 	long ret;
579 	pid_t parent = getppid();
580 
581 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
582 	ASSERT_EQ(0, ret);
583 
584 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
585 	ASSERT_EQ(0, ret);
586 
587 	EXPECT_EQ(parent, syscall(__NR_getppid));
588 	/* getpid() should never return. */
589 	EXPECT_EQ(0, syscall(__NR_getpid));
590 }
591 
592 TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
593 {
594 	void *fatal_address;
595 	struct sock_filter filter[] = {
596 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
597 			offsetof(struct seccomp_data, nr)),
598 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
599 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
600 		/* Only both with lower 32-bit for now. */
601 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
602 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
603 			(unsigned long)&fatal_address, 0, 1),
604 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
605 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
606 	};
607 	struct sock_fprog prog = {
608 		.len = (unsigned short)ARRAY_SIZE(filter),
609 		.filter = filter,
610 	};
611 	long ret;
612 	pid_t parent = getppid();
613 	struct tms timebuf;
614 	clock_t clock = times(&timebuf);
615 
616 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
617 	ASSERT_EQ(0, ret);
618 
619 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
620 	ASSERT_EQ(0, ret);
621 
622 	EXPECT_EQ(parent, syscall(__NR_getppid));
623 	EXPECT_LE(clock, syscall(__NR_times, &timebuf));
624 	/* times() should never return. */
625 	EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
626 }
627 
628 TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
629 {
630 #ifndef __NR_mmap2
631 	int sysno = __NR_mmap;
632 #else
633 	int sysno = __NR_mmap2;
634 #endif
635 	struct sock_filter filter[] = {
636 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
637 			offsetof(struct seccomp_data, nr)),
638 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
639 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
640 		/* Only both with lower 32-bit for now. */
641 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
642 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
643 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
644 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
645 	};
646 	struct sock_fprog prog = {
647 		.len = (unsigned short)ARRAY_SIZE(filter),
648 		.filter = filter,
649 	};
650 	long ret;
651 	pid_t parent = getppid();
652 	int fd;
653 	void *map1, *map2;
654 	int page_size = sysconf(_SC_PAGESIZE);
655 
656 	ASSERT_LT(0, page_size);
657 
658 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
659 	ASSERT_EQ(0, ret);
660 
661 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
662 	ASSERT_EQ(0, ret);
663 
664 	fd = open("/dev/zero", O_RDONLY);
665 	ASSERT_NE(-1, fd);
666 
667 	EXPECT_EQ(parent, syscall(__NR_getppid));
668 	map1 = (void *)syscall(sysno,
669 		NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
670 	EXPECT_NE(MAP_FAILED, map1);
671 	/* mmap2() should never return. */
672 	map2 = (void *)syscall(sysno,
673 		 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
674 	EXPECT_EQ(MAP_FAILED, map2);
675 
676 	/* The test failed, so clean up the resources. */
677 	munmap(map1, page_size);
678 	munmap(map2, page_size);
679 	close(fd);
680 }
681 
682 /* This is a thread task to die via seccomp filter violation. */
683 void *kill_thread(void *data)
684 {
685 	bool die = (bool)data;
686 
687 	if (die) {
688 		prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
689 		return (void *)SIBLING_EXIT_FAILURE;
690 	}
691 
692 	return (void *)SIBLING_EXIT_UNKILLED;
693 }
694 
695 /* Prepare a thread that will kill itself or both of us. */
696 void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
697 {
698 	pthread_t thread;
699 	void *status;
700 	/* Kill only when calling __NR_prctl. */
701 	struct sock_filter filter_thread[] = {
702 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
703 			offsetof(struct seccomp_data, nr)),
704 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
705 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
706 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
707 	};
708 	struct sock_fprog prog_thread = {
709 		.len = (unsigned short)ARRAY_SIZE(filter_thread),
710 		.filter = filter_thread,
711 	};
712 	struct sock_filter filter_process[] = {
713 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
714 			offsetof(struct seccomp_data, nr)),
715 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
716 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS),
717 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
718 	};
719 	struct sock_fprog prog_process = {
720 		.len = (unsigned short)ARRAY_SIZE(filter_process),
721 		.filter = filter_process,
722 	};
723 
724 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
725 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
726 	}
727 
728 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
729 			     kill_process ? &prog_process : &prog_thread));
730 
731 	/*
732 	 * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
733 	 * flag cannot be downgraded by a new filter.
734 	 */
735 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
736 
737 	/* Start a thread that will exit immediately. */
738 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
739 	ASSERT_EQ(0, pthread_join(thread, &status));
740 	ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
741 
742 	/* Start a thread that will die immediately. */
743 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
744 	ASSERT_EQ(0, pthread_join(thread, &status));
745 	ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
746 
747 	/*
748 	 * If we get here, only the spawned thread died. Let the parent know
749 	 * the whole process didn't die (i.e. this thread, the spawner,
750 	 * stayed running).
751 	 */
752 	exit(42);
753 }
754 
755 TEST(KILL_thread)
756 {
757 	int status;
758 	pid_t child_pid;
759 
760 	child_pid = fork();
761 	ASSERT_LE(0, child_pid);
762 	if (child_pid == 0) {
763 		kill_thread_or_group(_metadata, false);
764 		_exit(38);
765 	}
766 
767 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
768 
769 	/* If only the thread was killed, we'll see exit 42. */
770 	ASSERT_TRUE(WIFEXITED(status));
771 	ASSERT_EQ(42, WEXITSTATUS(status));
772 }
773 
774 TEST(KILL_process)
775 {
776 	int status;
777 	pid_t child_pid;
778 
779 	child_pid = fork();
780 	ASSERT_LE(0, child_pid);
781 	if (child_pid == 0) {
782 		kill_thread_or_group(_metadata, true);
783 		_exit(38);
784 	}
785 
786 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
787 
788 	/* If the entire process was killed, we'll see SIGSYS. */
789 	ASSERT_TRUE(WIFSIGNALED(status));
790 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
791 }
792 
793 /* TODO(wad) add 64-bit versus 32-bit arg tests. */
794 TEST(arg_out_of_range)
795 {
796 	struct sock_filter filter[] = {
797 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
798 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
799 	};
800 	struct sock_fprog prog = {
801 		.len = (unsigned short)ARRAY_SIZE(filter),
802 		.filter = filter,
803 	};
804 	long ret;
805 
806 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
807 	ASSERT_EQ(0, ret);
808 
809 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
810 	EXPECT_EQ(-1, ret);
811 	EXPECT_EQ(EINVAL, errno);
812 }
813 
/*
 * ERRNO_FILTER(name, errno_val) - declare a filter that fails read().
 * @name:      suffix for the generated identifiers
 * @errno_val: value OR-ed into SECCOMP_RET_ERRNO as the return data
 *
 * Expands to two declarations: the instruction array _read_filter_<name>
 * and the program prog_<name> that points at it.  The filter answers
 * __NR_read with SECCOMP_RET_ERRNO | errno_val and allows everything else.
 *
 * The second parameter is deliberately not called "errno": that identifier
 * belongs to <errno.h>, where it is a macro.  The argument is parenthesized
 * so that an expression can be passed safely.
 */
#define ERRNO_FILTER(name, errno_val)					\
	struct sock_filter _read_filter_##name[] = {			\
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,				\
			offsetof(struct seccomp_data, nr)),		\
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),	\
		BPF_STMT(BPF_RET|BPF_K,					\
			SECCOMP_RET_ERRNO | (errno_val)),		\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),		\
	};								\
	struct sock_fprog prog_##name = {				\
		.len = (unsigned short)ARRAY_SIZE(_read_filter_##name),	\
		.filter = _read_filter_##name,				\
	}
826 
827 /* Make sure basic errno values are correctly passed through a filter. */
828 TEST(ERRNO_valid)
829 {
830 	ERRNO_FILTER(valid, E2BIG);
831 	long ret;
832 	pid_t parent = getppid();
833 
834 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
835 	ASSERT_EQ(0, ret);
836 
837 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
838 	ASSERT_EQ(0, ret);
839 
840 	EXPECT_EQ(parent, syscall(__NR_getppid));
841 	EXPECT_EQ(-1, read(0, NULL, 0));
842 	EXPECT_EQ(E2BIG, errno);
843 }
844 
845 /* Make sure an errno of zero is correctly handled by the arch code. */
846 TEST(ERRNO_zero)
847 {
848 	ERRNO_FILTER(zero, 0);
849 	long ret;
850 	pid_t parent = getppid();
851 
852 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
853 	ASSERT_EQ(0, ret);
854 
855 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
856 	ASSERT_EQ(0, ret);
857 
858 	EXPECT_EQ(parent, syscall(__NR_getppid));
859 	/* "errno" of 0 is ok. */
860 	EXPECT_EQ(0, read(0, NULL, 0));
861 }
862 
863 /*
864  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
865  * This tests that the errno value gets capped correctly, fixed by
866  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
867  */
868 TEST(ERRNO_capped)
869 {
870 	ERRNO_FILTER(capped, 4096);
871 	long ret;
872 	pid_t parent = getppid();
873 
874 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
875 	ASSERT_EQ(0, ret);
876 
877 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
878 	ASSERT_EQ(0, ret);
879 
880 	EXPECT_EQ(parent, syscall(__NR_getppid));
881 	EXPECT_EQ(-1, read(0, NULL, 0));
882 	EXPECT_EQ(4095, errno);
883 }
884 
885 /*
886  * Filters are processed in reverse order: last applied is executed first.
887  * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
888  * SECCOMP_RET_DATA mask results will follow the most recently applied
889  * matching filter return (and not the lowest or highest value).
890  */
891 TEST(ERRNO_order)
892 {
893 	ERRNO_FILTER(first,  11);
894 	ERRNO_FILTER(second, 13);
895 	ERRNO_FILTER(third,  12);
896 	long ret;
897 	pid_t parent = getppid();
898 
899 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
900 	ASSERT_EQ(0, ret);
901 
902 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
903 	ASSERT_EQ(0, ret);
904 
905 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
906 	ASSERT_EQ(0, ret);
907 
908 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
909 	ASSERT_EQ(0, ret);
910 
911 	EXPECT_EQ(parent, syscall(__NR_getppid));
912 	EXPECT_EQ(-1, read(0, NULL, 0));
913 	EXPECT_EQ(12, errno);
914 }
915 
916 FIXTURE(TRAP) {
917 	struct sock_fprog prog;
918 };
919 
920 FIXTURE_SETUP(TRAP)
921 {
922 	struct sock_filter filter[] = {
923 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
924 			offsetof(struct seccomp_data, nr)),
925 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
926 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
927 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
928 	};
929 
930 	memset(&self->prog, 0, sizeof(self->prog));
931 	self->prog.filter = malloc(sizeof(filter));
932 	ASSERT_NE(NULL, self->prog.filter);
933 	memcpy(self->prog.filter, filter, sizeof(filter));
934 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
935 }
936 
937 FIXTURE_TEARDOWN(TRAP)
938 {
939 	if (self->prog.filter)
940 		free(self->prog.filter);
941 }
942 
943 TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
944 {
945 	long ret;
946 
947 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
948 	ASSERT_EQ(0, ret);
949 
950 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
951 	ASSERT_EQ(0, ret);
952 	syscall(__NR_getpid);
953 }
954 
955 /* Ensure that SIGSYS overrides SIG_IGN */
956 TEST_F_SIGNAL(TRAP, ign, SIGSYS)
957 {
958 	long ret;
959 
960 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
961 	ASSERT_EQ(0, ret);
962 
963 	signal(SIGSYS, SIG_IGN);
964 
965 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
966 	ASSERT_EQ(0, ret);
967 	syscall(__NR_getpid);
968 }
969 
970 static siginfo_t TRAP_info;
971 static volatile int TRAP_nr;
972 static void TRAP_action(int nr, siginfo_t *info, void *void_context)
973 {
974 	memcpy(&TRAP_info, info, sizeof(TRAP_info));
975 	TRAP_nr = nr;
976 }
977 
978 TEST_F(TRAP, handler)
979 {
980 	int ret, test;
981 	struct sigaction act;
982 	sigset_t mask;
983 
984 	memset(&act, 0, sizeof(act));
985 	sigemptyset(&mask);
986 	sigaddset(&mask, SIGSYS);
987 
988 	act.sa_sigaction = &TRAP_action;
989 	act.sa_flags = SA_SIGINFO;
990 	ret = sigaction(SIGSYS, &act, NULL);
991 	ASSERT_EQ(0, ret) {
992 		TH_LOG("sigaction failed");
993 	}
994 	ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
995 	ASSERT_EQ(0, ret) {
996 		TH_LOG("sigprocmask failed");
997 	}
998 
999 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1000 	ASSERT_EQ(0, ret);
1001 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1002 	ASSERT_EQ(0, ret);
1003 	TRAP_nr = 0;
1004 	memset(&TRAP_info, 0, sizeof(TRAP_info));
1005 	/* Expect the registers to be rolled back. (nr = error) may vary
1006 	 * based on arch. */
1007 	ret = syscall(__NR_getpid);
1008 	/* Silence gcc warning about volatile. */
1009 	test = TRAP_nr;
1010 	EXPECT_EQ(SIGSYS, test);
1011 	struct local_sigsys {
1012 		void *_call_addr;	/* calling user insn */
1013 		int _syscall;		/* triggering system call number */
1014 		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
1015 	} *sigsys = (struct local_sigsys *)
1016 #ifdef si_syscall
1017 		&(TRAP_info.si_call_addr);
1018 #else
1019 		&TRAP_info.si_pid;
1020 #endif
1021 	EXPECT_EQ(__NR_getpid, sigsys->_syscall);
1022 	/* Make sure arch is non-zero. */
1023 	EXPECT_NE(0, sigsys->_arch);
1024 	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
1025 }
1026 
1027 FIXTURE(precedence) {
1028 	struct sock_fprog allow;
1029 	struct sock_fprog log;
1030 	struct sock_fprog trace;
1031 	struct sock_fprog error;
1032 	struct sock_fprog trap;
1033 	struct sock_fprog kill;
1034 };
1035 
1036 FIXTURE_SETUP(precedence)
1037 {
1038 	struct sock_filter allow_insns[] = {
1039 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1040 	};
1041 	struct sock_filter log_insns[] = {
1042 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1043 			offsetof(struct seccomp_data, nr)),
1044 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1045 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1046 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1047 	};
1048 	struct sock_filter trace_insns[] = {
1049 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1050 			offsetof(struct seccomp_data, nr)),
1051 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1052 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1053 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1054 	};
1055 	struct sock_filter error_insns[] = {
1056 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1057 			offsetof(struct seccomp_data, nr)),
1058 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1059 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1060 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1061 	};
1062 	struct sock_filter trap_insns[] = {
1063 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1064 			offsetof(struct seccomp_data, nr)),
1065 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1066 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1067 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1068 	};
1069 	struct sock_filter kill_insns[] = {
1070 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1071 			offsetof(struct seccomp_data, nr)),
1072 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1073 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1074 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1075 	};
1076 
1077 	memset(self, 0, sizeof(*self));
1078 #define FILTER_ALLOC(_x) \
1079 	self->_x.filter = malloc(sizeof(_x##_insns)); \
1080 	ASSERT_NE(NULL, self->_x.filter); \
1081 	memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1082 	self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1083 	FILTER_ALLOC(allow);
1084 	FILTER_ALLOC(log);
1085 	FILTER_ALLOC(trace);
1086 	FILTER_ALLOC(error);
1087 	FILTER_ALLOC(trap);
1088 	FILTER_ALLOC(kill);
1089 }
1090 
1091 FIXTURE_TEARDOWN(precedence)
1092 {
1093 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1094 	FILTER_FREE(allow);
1095 	FILTER_FREE(log);
1096 	FILTER_FREE(trace);
1097 	FILTER_FREE(error);
1098 	FILTER_FREE(trap);
1099 	FILTER_FREE(kill);
1100 }
1101 
1102 TEST_F(precedence, allow_ok)
1103 {
1104 	pid_t parent, res = 0;
1105 	long ret;
1106 
1107 	parent = getppid();
1108 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1109 	ASSERT_EQ(0, ret);
1110 
1111 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1112 	ASSERT_EQ(0, ret);
1113 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1114 	ASSERT_EQ(0, ret);
1115 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1116 	ASSERT_EQ(0, ret);
1117 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1118 	ASSERT_EQ(0, ret);
1119 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1120 	ASSERT_EQ(0, ret);
1121 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1122 	ASSERT_EQ(0, ret);
1123 	/* Should work just fine. */
1124 	res = syscall(__NR_getppid);
1125 	EXPECT_EQ(parent, res);
1126 }
1127 
1128 TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1129 {
1130 	pid_t parent, res = 0;
1131 	long ret;
1132 
1133 	parent = getppid();
1134 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1135 	ASSERT_EQ(0, ret);
1136 
1137 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1138 	ASSERT_EQ(0, ret);
1139 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1140 	ASSERT_EQ(0, ret);
1141 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1142 	ASSERT_EQ(0, ret);
1143 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1144 	ASSERT_EQ(0, ret);
1145 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1146 	ASSERT_EQ(0, ret);
1147 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1148 	ASSERT_EQ(0, ret);
1149 	/* Should work just fine. */
1150 	res = syscall(__NR_getppid);
1151 	EXPECT_EQ(parent, res);
1152 	/* getpid() should never return. */
1153 	res = syscall(__NR_getpid);
1154 	EXPECT_EQ(0, res);
1155 }
1156 
1157 TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1158 {
1159 	pid_t parent;
1160 	long ret;
1161 
1162 	parent = getppid();
1163 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1164 	ASSERT_EQ(0, ret);
1165 
1166 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1167 	ASSERT_EQ(0, ret);
1168 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1169 	ASSERT_EQ(0, ret);
1170 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1171 	ASSERT_EQ(0, ret);
1172 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1173 	ASSERT_EQ(0, ret);
1174 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1175 	ASSERT_EQ(0, ret);
1176 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1177 	ASSERT_EQ(0, ret);
1178 	/* Should work just fine. */
1179 	EXPECT_EQ(parent, syscall(__NR_getppid));
1180 	/* getpid() should never return. */
1181 	EXPECT_EQ(0, syscall(__NR_getpid));
1182 }
1183 
1184 TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1185 {
1186 	pid_t parent;
1187 	long ret;
1188 
1189 	parent = getppid();
1190 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1191 	ASSERT_EQ(0, ret);
1192 
1193 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1194 	ASSERT_EQ(0, ret);
1195 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1196 	ASSERT_EQ(0, ret);
1197 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1198 	ASSERT_EQ(0, ret);
1199 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1200 	ASSERT_EQ(0, ret);
1201 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1202 	ASSERT_EQ(0, ret);
1203 	/* Should work just fine. */
1204 	EXPECT_EQ(parent, syscall(__NR_getppid));
1205 	/* getpid() should never return. */
1206 	EXPECT_EQ(0, syscall(__NR_getpid));
1207 }
1208 
1209 TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1210 {
1211 	pid_t parent;
1212 	long ret;
1213 
1214 	parent = getppid();
1215 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1216 	ASSERT_EQ(0, ret);
1217 
1218 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1219 	ASSERT_EQ(0, ret);
1220 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1221 	ASSERT_EQ(0, ret);
1222 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1223 	ASSERT_EQ(0, ret);
1224 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1225 	ASSERT_EQ(0, ret);
1226 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1227 	ASSERT_EQ(0, ret);
1228 	/* Should work just fine. */
1229 	EXPECT_EQ(parent, syscall(__NR_getppid));
1230 	/* getpid() should never return. */
1231 	EXPECT_EQ(0, syscall(__NR_getpid));
1232 }
1233 
1234 TEST_F(precedence, errno_is_third)
1235 {
1236 	pid_t parent;
1237 	long ret;
1238 
1239 	parent = getppid();
1240 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1241 	ASSERT_EQ(0, ret);
1242 
1243 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1244 	ASSERT_EQ(0, ret);
1245 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1246 	ASSERT_EQ(0, ret);
1247 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1248 	ASSERT_EQ(0, ret);
1249 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1250 	ASSERT_EQ(0, ret);
1251 	/* Should work just fine. */
1252 	EXPECT_EQ(parent, syscall(__NR_getppid));
1253 	EXPECT_EQ(0, syscall(__NR_getpid));
1254 }
1255 
1256 TEST_F(precedence, errno_is_third_in_any_order)
1257 {
1258 	pid_t parent;
1259 	long ret;
1260 
1261 	parent = getppid();
1262 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1263 	ASSERT_EQ(0, ret);
1264 
1265 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1266 	ASSERT_EQ(0, ret);
1267 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1268 	ASSERT_EQ(0, ret);
1269 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1270 	ASSERT_EQ(0, ret);
1271 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1272 	ASSERT_EQ(0, ret);
1273 	/* Should work just fine. */
1274 	EXPECT_EQ(parent, syscall(__NR_getppid));
1275 	EXPECT_EQ(0, syscall(__NR_getpid));
1276 }
1277 
1278 TEST_F(precedence, trace_is_fourth)
1279 {
1280 	pid_t parent;
1281 	long ret;
1282 
1283 	parent = getppid();
1284 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1285 	ASSERT_EQ(0, ret);
1286 
1287 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1288 	ASSERT_EQ(0, ret);
1289 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1290 	ASSERT_EQ(0, ret);
1291 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1292 	ASSERT_EQ(0, ret);
1293 	/* Should work just fine. */
1294 	EXPECT_EQ(parent, syscall(__NR_getppid));
1295 	/* No ptracer */
1296 	EXPECT_EQ(-1, syscall(__NR_getpid));
1297 }
1298 
1299 TEST_F(precedence, trace_is_fourth_in_any_order)
1300 {
1301 	pid_t parent;
1302 	long ret;
1303 
1304 	parent = getppid();
1305 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1306 	ASSERT_EQ(0, ret);
1307 
1308 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1309 	ASSERT_EQ(0, ret);
1310 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1311 	ASSERT_EQ(0, ret);
1312 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1313 	ASSERT_EQ(0, ret);
1314 	/* Should work just fine. */
1315 	EXPECT_EQ(parent, syscall(__NR_getppid));
1316 	/* No ptracer */
1317 	EXPECT_EQ(-1, syscall(__NR_getpid));
1318 }
1319 
1320 TEST_F(precedence, log_is_fifth)
1321 {
1322 	pid_t mypid, parent;
1323 	long ret;
1324 
1325 	mypid = getpid();
1326 	parent = getppid();
1327 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1328 	ASSERT_EQ(0, ret);
1329 
1330 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1331 	ASSERT_EQ(0, ret);
1332 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1333 	ASSERT_EQ(0, ret);
1334 	/* Should work just fine. */
1335 	EXPECT_EQ(parent, syscall(__NR_getppid));
1336 	/* Should also work just fine */
1337 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1338 }
1339 
1340 TEST_F(precedence, log_is_fifth_in_any_order)
1341 {
1342 	pid_t mypid, parent;
1343 	long ret;
1344 
1345 	mypid = getpid();
1346 	parent = getppid();
1347 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1348 	ASSERT_EQ(0, ret);
1349 
1350 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1351 	ASSERT_EQ(0, ret);
1352 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1353 	ASSERT_EQ(0, ret);
1354 	/* Should work just fine. */
1355 	EXPECT_EQ(parent, syscall(__NR_getppid));
1356 	/* Should also work just fine */
1357 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1358 }
1359 
#ifndef PTRACE_O_TRACESECCOMP
#define PTRACE_O_TRACESECCOMP	0x00000080
#endif

/* Catch the Ubuntu 12.04 value error. */
#if PTRACE_EVENT_SECCOMP != 7
#undef PTRACE_EVENT_SECCOMP
#endif

#ifndef PTRACE_EVENT_SECCOMP
#define PTRACE_EVENT_SECCOMP 7
#endif

/*
 * True when the bits of a wait status above bit 15 equal
 * PTRACE_EVENT_SECCOMP. The argument is parenthesized so that a compound
 * expression (e.g. "a & b") is shifted as a whole.
 */
#define IS_SECCOMP_EVENT(status) (((status) >> 16) == PTRACE_EVENT_SECCOMP)
/*
 * Run flag for the tracer loop in start_tracer(). It is cleared from a
 * signal handler, so it is a volatile sig_atomic_t: that is the object type
 * C guarantees can be safely written from a handler and re-read by the
 * interrupted code.
 */
volatile sig_atomic_t tracer_running;

/* Signal handler: request that the tracer loop stop. @sig is not used. */
void tracer_stop(int sig)
{
	(void)sig;
	tracer_running = false;
}
1379 
/*
 * Callback type that start_tracer() invokes for each tracee stop it
 * handles. It receives the harness metadata, the tracee pid, the wait
 * status of the stop, and the opaque args pointer given to start_tracer().
 */
1380 typedef void tracer_func_t(struct __test_metadata *_metadata,
1381 			   pid_t tracee, int status, void *args);
1382 
1383 void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
1384 	    tracer_func_t tracer_func, void *args, bool ptrace_syscall)
1385 {
1386 	int ret = -1;
1387 	struct sigaction action = {
1388 		.sa_handler = tracer_stop,
1389 	};
1390 
1391 	/* Allow external shutdown. */
1392 	tracer_running = true;
1393 	ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
1394 
1395 	errno = 0;
1396 	while (ret == -1 && errno != EINVAL)
1397 		ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
1398 	ASSERT_EQ(0, ret) {
1399 		kill(tracee, SIGKILL);
1400 	}
1401 	/* Wait for attach stop */
1402 	wait(NULL);
1403 
1404 	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
1405 						      PTRACE_O_TRACESYSGOOD :
1406 						      PTRACE_O_TRACESECCOMP);
1407 	ASSERT_EQ(0, ret) {
1408 		TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
1409 		kill(tracee, SIGKILL);
1410 	}
1411 	ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1412 		     tracee, NULL, 0);
1413 	ASSERT_EQ(0, ret);
1414 
1415 	/* Unblock the tracee */
1416 	ASSERT_EQ(1, write(fd, "A", 1));
1417 	ASSERT_EQ(0, close(fd));
1418 
1419 	/* Run until we're shut down. Must assert to stop execution. */
1420 	while (tracer_running) {
1421 		int status;
1422 
1423 		if (wait(&status) != tracee)
1424 			continue;
1425 		if (WIFSIGNALED(status) || WIFEXITED(status))
1426 			/* Child is dead. Time to go. */
1427 			return;
1428 
1429 		/* Check if this is a seccomp event. */
1430 		ASSERT_EQ(!ptrace_syscall, IS_SECCOMP_EVENT(status));
1431 
1432 		tracer_func(_metadata, tracee, status, args);
1433 
1434 		ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1435 			     tracee, NULL, 0);
1436 		ASSERT_EQ(0, ret);
1437 	}
1438 	/* Directly report the status of our test harness results. */
1439 	syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
1440 }
1441 
1442 /* Common tracer setup/teardown functions. */
/* Signal handler that intentionally does nothing; @num is ignored. */
void cont_handler(int num)
{
}
1445 pid_t setup_trace_fixture(struct __test_metadata *_metadata,
1446 			  tracer_func_t func, void *args, bool ptrace_syscall)
1447 {
1448 	char sync;
1449 	int pipefd[2];
1450 	pid_t tracer_pid;
1451 	pid_t tracee = getpid();
1452 
1453 	/* Setup a pipe for clean synchronization. */
1454 	ASSERT_EQ(0, pipe(pipefd));
1455 
1456 	/* Fork a child which we'll promote to tracer */
1457 	tracer_pid = fork();
1458 	ASSERT_LE(0, tracer_pid);
1459 	signal(SIGALRM, cont_handler);
1460 	if (tracer_pid == 0) {
1461 		close(pipefd[0]);
1462 		start_tracer(_metadata, pipefd[1], tracee, func, args,
1463 			     ptrace_syscall);
1464 		syscall(__NR_exit, 0);
1465 	}
1466 	close(pipefd[1]);
1467 	prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
1468 	read(pipefd[0], &sync, 1);
1469 	close(pipefd[0]);
1470 
1471 	return tracer_pid;
1472 }
1473 void teardown_trace_fixture(struct __test_metadata *_metadata,
1474 			    pid_t tracer)
1475 {
1476 	if (tracer) {
1477 		int status;
1478 		/*
1479 		 * Extract the exit code from the other process and
1480 		 * adopt it for ourselves in case its asserts failed.
1481 		 */
1482 		ASSERT_EQ(0, kill(tracer, SIGUSR1));
1483 		ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1484 		if (WEXITSTATUS(status))
1485 			_metadata->passed = 0;
1486 	}
1487 }
1488 
1489 /* "poke" tracer arguments and function. */
/* Argument block that tracer_poke() receives through its void *args. */
1490 struct tracer_args_poke_t {
	/* Address in the tracee that tracer_poke() writes with PTRACE_POKEDATA. */
1491 	unsigned long poke_addr;
1492 };
1493 
1494 void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1495 		 void *args)
1496 {
1497 	int ret;
1498 	unsigned long msg;
1499 	struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1500 
1501 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1502 	EXPECT_EQ(0, ret);
1503 	/* If this fails, don't try to recover. */
1504 	ASSERT_EQ(0x1001, msg) {
1505 		kill(tracee, SIGKILL);
1506 	}
1507 	/*
1508 	 * Poke in the message.
1509 	 * Registers are not touched to try to keep this relatively arch
1510 	 * agnostic.
1511 	 */
1512 	ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1513 	EXPECT_EQ(0, ret);
1514 }
1515 
/* Per-test state for the TRACE_poke tests. */
1516 FIXTURE(TRACE_poke) {
	/* Filter program built in the fixture setup and installed by the tests. */
1517 	struct sock_fprog prog;
	/* Pid returned by setup_trace_fixture(). */
1518 	pid_t tracer;
	/* Word whose address is handed to the tracer as poke_addr. */
1519 	long poked;
	/* Argument block passed to tracer_poke(). */
1520 	struct tracer_args_poke_t tracer_args;
1521 };
1522 
1523 FIXTURE_SETUP(TRACE_poke)
1524 {
1525 	struct sock_filter filter[] = {
1526 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1527 			offsetof(struct seccomp_data, nr)),
1528 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1529 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1530 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1531 	};
1532 
1533 	self->poked = 0;
1534 	memset(&self->prog, 0, sizeof(self->prog));
1535 	self->prog.filter = malloc(sizeof(filter));
1536 	ASSERT_NE(NULL, self->prog.filter);
1537 	memcpy(self->prog.filter, filter, sizeof(filter));
1538 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1539 
1540 	/* Set up tracer args. */
1541 	self->tracer_args.poke_addr = (unsigned long)&self->poked;
1542 
1543 	/* Launch tracer. */
1544 	self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1545 					   &self->tracer_args, false);
1546 }
1547 
1548 FIXTURE_TEARDOWN(TRACE_poke)
1549 {
1550 	teardown_trace_fixture(_metadata, self->tracer);
1551 	if (self->prog.filter)
1552 		free(self->prog.filter);
1553 }
1554 
1555 TEST_F(TRACE_poke, read_has_side_effects)
1556 {
1557 	ssize_t ret;
1558 
1559 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1560 	ASSERT_EQ(0, ret);
1561 
1562 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1563 	ASSERT_EQ(0, ret);
1564 
1565 	EXPECT_EQ(0, self->poked);
1566 	ret = read(-1, NULL, 0);
1567 	EXPECT_EQ(-1, ret);
1568 	EXPECT_EQ(0x1001, self->poked);
1569 }
1570 
1571 TEST_F(TRACE_poke, getpid_runs_normally)
1572 {
1573 	long ret;
1574 
1575 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1576 	ASSERT_EQ(0, ret);
1577 
1578 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1579 	ASSERT_EQ(0, ret);
1580 
1581 	EXPECT_EQ(0, self->poked);
1582 	EXPECT_NE(0, syscall(__NR_getpid));
1583 	EXPECT_EQ(0, self->poked);
1584 }
1585 
/*
 * Per-architecture register access used by get_syscall() and
 * change_syscall():
 *   ARCH_REGS   - type of the register block fetched from the tracee
 *   SYSCALL_NUM - member of ARCH_REGS read/written as the syscall number
 *   SYSCALL_RET - member of ARCH_REGS written as the syscall return value
 * The mips branch additionally defines SYSCALL_SYSCALL_NUM (used when
 * SYSCALL_NUM equals __NR_O32_Linux) and SYSCALL_NUM_RET_SHARE_REG, which
 * makes the return-value modification and its checks get stubbed out.
 * Unknown architectures fail the build.
 */
1586 #if defined(__x86_64__)
1587 # define ARCH_REGS	struct user_regs_struct
1588 # define SYSCALL_NUM	orig_rax
1589 # define SYSCALL_RET	rax
1590 #elif defined(__i386__)
1591 # define ARCH_REGS	struct user_regs_struct
1592 # define SYSCALL_NUM	orig_eax
1593 # define SYSCALL_RET	eax
1594 #elif defined(__arm__)
1595 # define ARCH_REGS	struct pt_regs
1596 # define SYSCALL_NUM	ARM_r7
1597 # define SYSCALL_RET	ARM_r0
1598 #elif defined(__aarch64__)
1599 # define ARCH_REGS	struct user_pt_regs
1600 # define SYSCALL_NUM	regs[8]
1601 # define SYSCALL_RET	regs[0]
1602 #elif defined(__riscv) && __riscv_xlen == 64
1603 # define ARCH_REGS	struct user_regs_struct
1604 # define SYSCALL_NUM	a7
1605 # define SYSCALL_RET	a0
1606 #elif defined(__hppa__)
1607 # define ARCH_REGS	struct user_regs_struct
1608 # define SYSCALL_NUM	gr[20]
1609 # define SYSCALL_RET	gr[28]
1610 #elif defined(__powerpc__)
1611 # define ARCH_REGS	struct pt_regs
1612 # define SYSCALL_NUM	gpr[0]
1613 # define SYSCALL_RET	gpr[3]
1614 #elif defined(__s390__)
1615 # define ARCH_REGS     s390_regs
1616 # define SYSCALL_NUM   gprs[2]
1617 # define SYSCALL_RET   gprs[2]
1618 #elif defined(__mips__)
1619 # define ARCH_REGS	struct pt_regs
1620 # define SYSCALL_NUM	regs[2]
1621 # define SYSCALL_SYSCALL_NUM regs[4]
1622 # define SYSCALL_RET	regs[2]
1623 # define SYSCALL_NUM_RET_SHARE_REG
1624 #else
1625 # error "Do not know how to find your architecture's registers and syscalls"
1626 #endif
1627 
/*
 * EXPECT_SYSCALL_RETURN(val, action): evaluate @action and check its result.
 * A negative @val means "-1 with errno == -val"; any other @val must be
 * returned as is.
 *
 * When the syscall return can't be changed, stub out the tests for it.
 */
#ifdef SYSCALL_NUM_RET_SHARE_REG
# define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
#else
/* (val) is parenthesized so that expression arguments compare as a whole. */
# define EXPECT_SYSCALL_RETURN(val, action)		\
	do {						\
		errno = 0;				\
		if ((val) < 0) {			\
			EXPECT_EQ(-1, action);		\
			EXPECT_EQ(-(val), errno);	\
		} else {				\
			EXPECT_EQ(val, action);		\
		}					\
	} while (0)
#endif
1643 
1644 /* Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
1645  * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
1646  */
/*
 * HAVE_GETREGS is tested by get_syscall() and change_syscall(); when it is
 * not defined they use PTRACE_GETREGSET/PTRACE_SETREGSET with NT_PRSTATUS.
 */
1647 #if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
1648 #define HAVE_GETREGS
1649 #endif
1650 
1651 /* Architecture-specific syscall fetching routine. */
1652 int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1653 {
1654 	ARCH_REGS regs;
1655 #ifdef HAVE_GETREGS
1656 	EXPECT_EQ(0, ptrace(PTRACE_GETREGS, tracee, 0, &regs)) {
1657 		TH_LOG("PTRACE_GETREGS failed");
1658 		return -1;
1659 	}
1660 #else
1661 	struct iovec iov;
1662 
1663 	iov.iov_base = &regs;
1664 	iov.iov_len = sizeof(regs);
1665 	EXPECT_EQ(0, ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov)) {
1666 		TH_LOG("PTRACE_GETREGSET failed");
1667 		return -1;
1668 	}
1669 #endif
1670 
1671 #if defined(__mips__)
1672 	if (regs.SYSCALL_NUM == __NR_O32_Linux)
1673 		return regs.SYSCALL_SYSCALL_NUM;
1674 #endif
1675 	return regs.SYSCALL_NUM;
1676 }
1677 
1678 /* Architecture-specific syscall changing routine. */
1679 void change_syscall(struct __test_metadata *_metadata,
1680 		    pid_t tracee, int syscall, int result)
1681 {
1682 	int ret;
1683 	ARCH_REGS regs;
1684 #ifdef HAVE_GETREGS
1685 	ret = ptrace(PTRACE_GETREGS, tracee, 0, &regs);
1686 #else
1687 	struct iovec iov;
1688 	iov.iov_base = &regs;
1689 	iov.iov_len = sizeof(regs);
1690 	ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov);
1691 #endif
1692 	EXPECT_EQ(0, ret) {}
1693 
1694 #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
1695 	defined(__s390__) || defined(__hppa__) || defined(__riscv)
1696 	{
1697 		regs.SYSCALL_NUM = syscall;
1698 	}
1699 #elif defined(__mips__)
1700 	{
1701 		if (regs.SYSCALL_NUM == __NR_O32_Linux)
1702 			regs.SYSCALL_SYSCALL_NUM = syscall;
1703 		else
1704 			regs.SYSCALL_NUM = syscall;
1705 	}
1706 
1707 #elif defined(__arm__)
1708 # ifndef PTRACE_SET_SYSCALL
1709 #  define PTRACE_SET_SYSCALL   23
1710 # endif
1711 	{
1712 		ret = ptrace(PTRACE_SET_SYSCALL, tracee, NULL, syscall);
1713 		EXPECT_EQ(0, ret);
1714 	}
1715 
1716 #elif defined(__aarch64__)
1717 # ifndef NT_ARM_SYSTEM_CALL
1718 #  define NT_ARM_SYSTEM_CALL 0x404
1719 # endif
1720 	{
1721 		iov.iov_base = &syscall;
1722 		iov.iov_len = sizeof(syscall);
1723 		ret = ptrace(PTRACE_SETREGSET, tracee, NT_ARM_SYSTEM_CALL,
1724 			     &iov);
1725 		EXPECT_EQ(0, ret);
1726 	}
1727 
1728 #else
1729 	ASSERT_EQ(1, 0) {
1730 		TH_LOG("How is the syscall changed on this architecture?");
1731 	}
1732 #endif
1733 
1734 	/* If syscall is skipped, change return value. */
1735 	if (syscall == -1)
1736 #ifdef SYSCALL_NUM_RET_SHARE_REG
1737 		TH_LOG("Can't modify syscall return on this architecture");
1738 #else
1739 		regs.SYSCALL_RET = result;
1740 #endif
1741 
1742 #ifdef HAVE_GETREGS
1743 	ret = ptrace(PTRACE_SETREGS, tracee, 0, &regs);
1744 #else
1745 	iov.iov_base = &regs;
1746 	iov.iov_len = sizeof(regs);
1747 	ret = ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &iov);
1748 #endif
1749 	EXPECT_EQ(0, ret);
1750 }
1751 
1752 void tracer_syscall(struct __test_metadata *_metadata, pid_t tracee,
1753 		    int status, void *args)
1754 {
1755 	int ret;
1756 	unsigned long msg;
1757 
1758 	/* Make sure we got the right message. */
1759 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1760 	EXPECT_EQ(0, ret);
1761 
1762 	/* Validate and take action on expected syscalls. */
1763 	switch (msg) {
1764 	case 0x1002:
1765 		/* change getpid to getppid. */
1766 		EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
1767 		change_syscall(_metadata, tracee, __NR_getppid, 0);
1768 		break;
1769 	case 0x1003:
1770 		/* skip gettid with valid return code. */
1771 		EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
1772 		change_syscall(_metadata, tracee, -1, 45000);
1773 		break;
1774 	case 0x1004:
1775 		/* skip openat with error. */
1776 		EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
1777 		change_syscall(_metadata, tracee, -1, -ESRCH);
1778 		break;
1779 	case 0x1005:
1780 		/* do nothing (allow getppid) */
1781 		EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
1782 		break;
1783 	default:
1784 		EXPECT_EQ(0, msg) {
1785 			TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
1786 			kill(tracee, SIGKILL);
1787 		}
1788 	}
1789 
1790 }
1791 
1792 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
1793 		   int status, void *args)
1794 {
1795 	int ret, nr;
1796 	unsigned long msg;
1797 	static bool entry;
1798 
1799 	/*
1800 	 * The traditional way to tell PTRACE_SYSCALL entry/exit
1801 	 * is by counting.
1802 	 */
1803 	entry = !entry;
1804 
1805 	/* Make sure we got an appropriate message. */
1806 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1807 	EXPECT_EQ(0, ret);
1808 	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
1809 			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
1810 
1811 	if (!entry)
1812 		return;
1813 
1814 	nr = get_syscall(_metadata, tracee);
1815 
1816 	if (nr == __NR_getpid)
1817 		change_syscall(_metadata, tracee, __NR_getppid, 0);
1818 	if (nr == __NR_gettid)
1819 		change_syscall(_metadata, tracee, -1, 45000);
1820 	if (nr == __NR_openat)
1821 		change_syscall(_metadata, tracee, -1, -ESRCH);
1822 }
1823 
/* Per-test state for the TRACE_syscall tests. */
1824 FIXTURE(TRACE_syscall) {
	/* Filter program built in the fixture setup and installed by the tests. */
1825 	struct sock_fprog prog;
	/*
	 * tracer: pid returned by setup_trace_fixture(); mytid, mypid and
	 * parent: gettid/getpid/getppid results recorded in the fixture setup.
	 */
1826 	pid_t tracer, mytid, mypid, parent;
1827 };
1828 
1829 FIXTURE_SETUP(TRACE_syscall)
1830 {
1831 	struct sock_filter filter[] = {
1832 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1833 			offsetof(struct seccomp_data, nr)),
1834 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1835 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
1836 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
1837 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
1838 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
1839 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
1840 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
1841 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
1842 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1843 	};
1844 
1845 	memset(&self->prog, 0, sizeof(self->prog));
1846 	self->prog.filter = malloc(sizeof(filter));
1847 	ASSERT_NE(NULL, self->prog.filter);
1848 	memcpy(self->prog.filter, filter, sizeof(filter));
1849 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1850 
1851 	/* Prepare some testable syscall results. */
1852 	self->mytid = syscall(__NR_gettid);
1853 	ASSERT_GT(self->mytid, 0);
1854 	ASSERT_NE(self->mytid, 1) {
1855 		TH_LOG("Running this test as init is not supported. :)");
1856 	}
1857 
1858 	self->mypid = getpid();
1859 	ASSERT_GT(self->mypid, 0);
1860 	ASSERT_EQ(self->mytid, self->mypid);
1861 
1862 	self->parent = getppid();
1863 	ASSERT_GT(self->parent, 0);
1864 	ASSERT_NE(self->parent, self->mypid);
1865 
1866 	/* Launch tracer. */
1867 	self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL,
1868 					   false);
1869 }
1870 
1871 FIXTURE_TEARDOWN(TRACE_syscall)
1872 {
1873 	teardown_trace_fixture(_metadata, self->tracer);
1874 	if (self->prog.filter)
1875 		free(self->prog.filter);
1876 }
1877 
1878 TEST_F(TRACE_syscall, ptrace_syscall_redirected)
1879 {
1880 	/* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
1881 	teardown_trace_fixture(_metadata, self->tracer);
1882 	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
1883 					   true);
1884 
1885 	/* Tracer will redirect getpid to getppid. */
1886 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
1887 }
1888 
1889 TEST_F(TRACE_syscall, ptrace_syscall_errno)
1890 {
1891 	/* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
1892 	teardown_trace_fixture(_metadata, self->tracer);
1893 	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
1894 					   true);
1895 
1896 	/* Tracer should skip the open syscall, resulting in ESRCH. */
1897 	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
1898 }
1899 
1900 TEST_F(TRACE_syscall, ptrace_syscall_faked)
1901 {
1902 	/* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
1903 	teardown_trace_fixture(_metadata, self->tracer);
1904 	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
1905 					   true);
1906 
1907 	/* Tracer should skip the gettid syscall, resulting fake pid. */
1908 	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
1909 }
1910 
1911 TEST_F(TRACE_syscall, syscall_allowed)
1912 {
1913 	long ret;
1914 
1915 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1916 	ASSERT_EQ(0, ret);
1917 
1918 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1919 	ASSERT_EQ(0, ret);
1920 
1921 	/* getppid works as expected (no changes). */
1922 	EXPECT_EQ(self->parent, syscall(__NR_getppid));
1923 	EXPECT_NE(self->mypid, syscall(__NR_getppid));
1924 }
1925 
1926 TEST_F(TRACE_syscall, syscall_redirected)
1927 {
1928 	long ret;
1929 
1930 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1931 	ASSERT_EQ(0, ret);
1932 
1933 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1934 	ASSERT_EQ(0, ret);
1935 
1936 	/* getpid has been redirected to getppid as expected. */
1937 	EXPECT_EQ(self->parent, syscall(__NR_getpid));
1938 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
1939 }
1940 
1941 TEST_F(TRACE_syscall, syscall_errno)
1942 {
1943 	long ret;
1944 
1945 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1946 	ASSERT_EQ(0, ret);
1947 
1948 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1949 	ASSERT_EQ(0, ret);
1950 
1951 	/* openat has been skipped and an errno return. */
1952 	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
1953 }
1954 
1955 TEST_F(TRACE_syscall, syscall_faked)
1956 {
1957 	long ret;
1958 
1959 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1960 	ASSERT_EQ(0, ret);
1961 
1962 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1963 	ASSERT_EQ(0, ret);
1964 
1965 	/* gettid has been skipped and an altered return value stored. */
1966 	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
1967 }
1968 
1969 TEST_F(TRACE_syscall, skip_after_RET_TRACE)
1970 {
1971 	struct sock_filter filter[] = {
1972 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1973 			offsetof(struct seccomp_data, nr)),
1974 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
1975 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
1976 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1977 	};
1978 	struct sock_fprog prog = {
1979 		.len = (unsigned short)ARRAY_SIZE(filter),
1980 		.filter = filter,
1981 	};
1982 	long ret;
1983 
1984 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1985 	ASSERT_EQ(0, ret);
1986 
1987 	/* Install fixture filter. */
1988 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1989 	ASSERT_EQ(0, ret);
1990 
1991 	/* Install "errno on getppid" filter. */
1992 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
1993 	ASSERT_EQ(0, ret);
1994 
1995 	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
1996 	errno = 0;
1997 	EXPECT_EQ(-1, syscall(__NR_getpid));
1998 	EXPECT_EQ(EPERM, errno);
1999 }
2000 
2001 TEST_F_SIGNAL(TRACE_syscall, kill_after_RET_TRACE, SIGSYS)
2002 {
2003 	struct sock_filter filter[] = {
2004 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2005 			offsetof(struct seccomp_data, nr)),
2006 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2007 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2008 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2009 	};
2010 	struct sock_fprog prog = {
2011 		.len = (unsigned short)ARRAY_SIZE(filter),
2012 		.filter = filter,
2013 	};
2014 	long ret;
2015 
2016 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2017 	ASSERT_EQ(0, ret);
2018 
2019 	/* Install fixture filter. */
2020 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
2021 	ASSERT_EQ(0, ret);
2022 
2023 	/* Install "death on getppid" filter. */
2024 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2025 	ASSERT_EQ(0, ret);
2026 
2027 	/* Tracer will redirect getpid to getppid, and we should die. */
2028 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2029 }
2030 
2031 TEST_F(TRACE_syscall, skip_after_ptrace)
2032 {
2033 	struct sock_filter filter[] = {
2034 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2035 			offsetof(struct seccomp_data, nr)),
2036 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2037 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
2038 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2039 	};
2040 	struct sock_fprog prog = {
2041 		.len = (unsigned short)ARRAY_SIZE(filter),
2042 		.filter = filter,
2043 	};
2044 	long ret;
2045 
2046 	/* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
2047 	teardown_trace_fixture(_metadata, self->tracer);
2048 	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
2049 					   true);
2050 
2051 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2052 	ASSERT_EQ(0, ret);
2053 
2054 	/* Install "errno on getppid" filter. */
2055 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2056 	ASSERT_EQ(0, ret);
2057 
2058 	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
2059 	EXPECT_EQ(-1, syscall(__NR_getpid));
2060 	EXPECT_EQ(EPERM, errno);
2061 }
2062 
2063 TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS)
2064 {
2065 	struct sock_filter filter[] = {
2066 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2067 			offsetof(struct seccomp_data, nr)),
2068 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2069 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2070 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2071 	};
2072 	struct sock_fprog prog = {
2073 		.len = (unsigned short)ARRAY_SIZE(filter),
2074 		.filter = filter,
2075 	};
2076 	long ret;
2077 
2078 	/* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
2079 	teardown_trace_fixture(_metadata, self->tracer);
2080 	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
2081 					   true);
2082 
2083 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2084 	ASSERT_EQ(0, ret);
2085 
2086 	/* Install "death on getppid" filter. */
2087 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2088 	ASSERT_EQ(0, ret);
2089 
2090 	/* Tracer will redirect getpid to getppid, and we should die. */
2091 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2092 }
2093 
2094 TEST(seccomp_syscall)
2095 {
2096 	struct sock_filter filter[] = {
2097 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2098 	};
2099 	struct sock_fprog prog = {
2100 		.len = (unsigned short)ARRAY_SIZE(filter),
2101 		.filter = filter,
2102 	};
2103 	long ret;
2104 
2105 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2106 	ASSERT_EQ(0, ret) {
2107 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2108 	}
2109 
2110 	/* Reject insane operation. */
2111 	ret = seccomp(-1, 0, &prog);
2112 	ASSERT_NE(ENOSYS, errno) {
2113 		TH_LOG("Kernel does not support seccomp syscall!");
2114 	}
2115 	EXPECT_EQ(EINVAL, errno) {
2116 		TH_LOG("Did not reject crazy op value!");
2117 	}
2118 
2119 	/* Reject strict with flags or pointer. */
2120 	ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
2121 	EXPECT_EQ(EINVAL, errno) {
2122 		TH_LOG("Did not reject mode strict with flags!");
2123 	}
2124 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
2125 	EXPECT_EQ(EINVAL, errno) {
2126 		TH_LOG("Did not reject mode strict with uargs!");
2127 	}
2128 
2129 	/* Reject insane args for filter. */
2130 	ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
2131 	EXPECT_EQ(EINVAL, errno) {
2132 		TH_LOG("Did not reject crazy filter flags!");
2133 	}
2134 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
2135 	EXPECT_EQ(EFAULT, errno) {
2136 		TH_LOG("Did not reject NULL filter!");
2137 	}
2138 
2139 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2140 	EXPECT_EQ(0, errno) {
2141 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
2142 			strerror(errno));
2143 	}
2144 }
2145 
2146 TEST(seccomp_syscall_mode_lock)
2147 {
2148 	struct sock_filter filter[] = {
2149 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2150 	};
2151 	struct sock_fprog prog = {
2152 		.len = (unsigned short)ARRAY_SIZE(filter),
2153 		.filter = filter,
2154 	};
2155 	long ret;
2156 
2157 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2158 	ASSERT_EQ(0, ret) {
2159 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2160 	}
2161 
2162 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2163 	ASSERT_NE(ENOSYS, errno) {
2164 		TH_LOG("Kernel does not support seccomp syscall!");
2165 	}
2166 	EXPECT_EQ(0, ret) {
2167 		TH_LOG("Could not install filter!");
2168 	}
2169 
2170 	/* Make sure neither entry point will switch to strict. */
2171 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2172 	EXPECT_EQ(EINVAL, errno) {
2173 		TH_LOG("Switched to mode strict!");
2174 	}
2175 
2176 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2177 	EXPECT_EQ(EINVAL, errno) {
2178 		TH_LOG("Switched to mode strict!");
2179 	}
2180 }
2181 
2182 /*
2183  * Test detection of known and unknown filter flags. Userspace needs to be able
2184  * to check if a filter flag is supported by the current kernel and a good way
2185  * of doing that is by attempting to enter filter mode, with the flag bit in
2186  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2187  * that the flag is valid and EINVAL indicates that the flag is invalid.
2188  */
2189 TEST(detect_seccomp_filter_flags)
2190 {
2191 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2192 				 SECCOMP_FILTER_FLAG_LOG,
2193 				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2194 				 SECCOMP_FILTER_FLAG_NEW_LISTENER,
2195 				 SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2196 	unsigned int exclusive[] = {
2197 				SECCOMP_FILTER_FLAG_TSYNC,
2198 				SECCOMP_FILTER_FLAG_NEW_LISTENER };
2199 	unsigned int flag, all_flags, exclusive_mask;
2200 	int i;
2201 	long ret;
2202 
2203 	/* Test detection of individual known-good filter flags */
2204 	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2205 		int bits = 0;
2206 
2207 		flag = flags[i];
2208 		/* Make sure the flag is a single bit! */
2209 		while (flag) {
2210 			if (flag & 0x1)
2211 				bits ++;
2212 			flag >>= 1;
2213 		}
2214 		ASSERT_EQ(1, bits);
2215 		flag = flags[i];
2216 
2217 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2218 		ASSERT_NE(ENOSYS, errno) {
2219 			TH_LOG("Kernel does not support seccomp syscall!");
2220 		}
2221 		EXPECT_EQ(-1, ret);
2222 		EXPECT_EQ(EFAULT, errno) {
2223 			TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2224 			       flag);
2225 		}
2226 
2227 		all_flags |= flag;
2228 	}
2229 
2230 	/*
2231 	 * Test detection of all known-good filter flags combined. But
2232 	 * for the exclusive flags we need to mask them out and try them
2233 	 * individually for the "all flags" testing.
2234 	 */
2235 	exclusive_mask = 0;
2236 	for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2237 		exclusive_mask |= exclusive[i];
2238 	for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2239 		flag = all_flags & ~exclusive_mask;
2240 		flag |= exclusive[i];
2241 
2242 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2243 		EXPECT_EQ(-1, ret);
2244 		EXPECT_EQ(EFAULT, errno) {
2245 			TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2246 			       flag);
2247 		}
2248 	}
2249 
2250 	/* Test detection of an unknown filter flags, without exclusives. */
2251 	flag = -1;
2252 	flag &= ~exclusive_mask;
2253 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2254 	EXPECT_EQ(-1, ret);
2255 	EXPECT_EQ(EINVAL, errno) {
2256 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2257 		       flag);
2258 	}
2259 
2260 	/*
2261 	 * Test detection of an unknown filter flag that may simply need to be
2262 	 * added to this test
2263 	 */
2264 	flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2265 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2266 	EXPECT_EQ(-1, ret);
2267 	EXPECT_EQ(EINVAL, errno) {
2268 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2269 		       flag);
2270 	}
2271 }
2272 
2273 TEST(TSYNC_first)
2274 {
2275 	struct sock_filter filter[] = {
2276 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2277 	};
2278 	struct sock_fprog prog = {
2279 		.len = (unsigned short)ARRAY_SIZE(filter),
2280 		.filter = filter,
2281 	};
2282 	long ret;
2283 
2284 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2285 	ASSERT_EQ(0, ret) {
2286 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2287 	}
2288 
2289 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2290 		      &prog);
2291 	ASSERT_NE(ENOSYS, errno) {
2292 		TH_LOG("Kernel does not support seccomp syscall!");
2293 	}
2294 	EXPECT_EQ(0, ret) {
2295 		TH_LOG("Could not install initial filter with TSYNC!");
2296 	}
2297 }
2298 
/* Number of sibling threads used by the TSYNC fixture. */
#define TSYNC_SIBLINGS 2

/* State shared between a TSYNC test and one of its sibling threads. */
struct tsync_sibling {
	/* pthread handle; PTHREAD_JOIN zeroes it after a successful join. */
	pthread_t tid;
	/* Kernel thread id, recorded by the sibling itself via gettid. */
	pid_t system_tid;
	/* Posted by the sibling once it has started (and maybe diverged). */
	sem_t *started;
	/* Condition variable the sibling waits on, guarded by @mutex. */
	pthread_cond_t *cond;
	pthread_mutex_t *mutex;
	/* Non-zero: the sibling installs @prog on itself before waiting. */
	int diverge;
	/* How many wake-ups on @cond the sibling consumes before going on. */
	int num_waits;
	/* Filter program applied when @diverge is set. */
	struct sock_fprog *prog;
	/* Harness metadata of the owning test. */
	struct __test_metadata *metadata;
};
2311 
/*
 * To avoid joining joined threads (which is not allowed by Bionic),
 * make sure we both successfully join and clear the tid to skip a
 * later join attempt during fixture teardown. Any remaining threads
 * will be directly killed during teardown.
 *
 * @tid must be a modifiable lvalue; it is evaluated more than once.
 * @status is passed straight through to pthread_join().
 *
 * The log format carries no trailing newline, matching every other
 * TH_LOG() call in this file.
 */
#define PTHREAD_JOIN(tid, status)					\
	do {								\
		int _rc = pthread_join((tid), (status));		\
		if (_rc) {						\
			TH_LOG("pthread_join of tid %u failed: %d",	\
				(unsigned int)(tid), _rc);		\
		} else {						\
			(tid) = 0;					\
		}							\
	} while (0)
2328 
2329 FIXTURE(TSYNC) {
2330 	struct sock_fprog root_prog, apply_prog;
2331 	struct tsync_sibling sibling[TSYNC_SIBLINGS];
2332 	sem_t started;
2333 	pthread_cond_t cond;
2334 	pthread_mutex_t mutex;
2335 	int sibling_count;
2336 };
2337 
2338 FIXTURE_SETUP(TSYNC)
2339 {
2340 	struct sock_filter root_filter[] = {
2341 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2342 	};
2343 	struct sock_filter apply_filter[] = {
2344 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2345 			offsetof(struct seccomp_data, nr)),
2346 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2347 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2348 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2349 	};
2350 
2351 	memset(&self->root_prog, 0, sizeof(self->root_prog));
2352 	memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2353 	memset(&self->sibling, 0, sizeof(self->sibling));
2354 	self->root_prog.filter = malloc(sizeof(root_filter));
2355 	ASSERT_NE(NULL, self->root_prog.filter);
2356 	memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2357 	self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2358 
2359 	self->apply_prog.filter = malloc(sizeof(apply_filter));
2360 	ASSERT_NE(NULL, self->apply_prog.filter);
2361 	memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2362 	self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2363 
2364 	self->sibling_count = 0;
2365 	pthread_mutex_init(&self->mutex, NULL);
2366 	pthread_cond_init(&self->cond, NULL);
2367 	sem_init(&self->started, 0, 0);
2368 	self->sibling[0].tid = 0;
2369 	self->sibling[0].cond = &self->cond;
2370 	self->sibling[0].started = &self->started;
2371 	self->sibling[0].mutex = &self->mutex;
2372 	self->sibling[0].diverge = 0;
2373 	self->sibling[0].num_waits = 1;
2374 	self->sibling[0].prog = &self->root_prog;
2375 	self->sibling[0].metadata = _metadata;
2376 	self->sibling[1].tid = 0;
2377 	self->sibling[1].cond = &self->cond;
2378 	self->sibling[1].started = &self->started;
2379 	self->sibling[1].mutex = &self->mutex;
2380 	self->sibling[1].diverge = 0;
2381 	self->sibling[1].prog = &self->root_prog;
2382 	self->sibling[1].num_waits = 1;
2383 	self->sibling[1].metadata = _metadata;
2384 }
2385 
2386 FIXTURE_TEARDOWN(TSYNC)
2387 {
2388 	int sib = 0;
2389 
2390 	if (self->root_prog.filter)
2391 		free(self->root_prog.filter);
2392 	if (self->apply_prog.filter)
2393 		free(self->apply_prog.filter);
2394 
2395 	for ( ; sib < self->sibling_count; ++sib) {
2396 		struct tsync_sibling *s = &self->sibling[sib];
2397 
2398 		if (!s->tid)
2399 			continue;
2400 		/*
2401 		 * If a thread is still running, it may be stuck, so hit
2402 		 * it over the head really hard.
2403 		 */
2404 		pthread_kill(s->tid, 9);
2405 	}
2406 	pthread_mutex_destroy(&self->mutex);
2407 	pthread_cond_destroy(&self->cond);
2408 	sem_destroy(&self->started);
2409 }
2410 
2411 void *tsync_sibling(void *data)
2412 {
2413 	long ret = 0;
2414 	struct tsync_sibling *me = data;
2415 
2416 	me->system_tid = syscall(__NR_gettid);
2417 
2418 	pthread_mutex_lock(me->mutex);
2419 	if (me->diverge) {
2420 		/* Just re-apply the root prog to fork the tree */
2421 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2422 				me->prog, 0, 0);
2423 	}
2424 	sem_post(me->started);
2425 	/* Return outside of started so parent notices failures. */
2426 	if (ret) {
2427 		pthread_mutex_unlock(me->mutex);
2428 		return (void *)SIBLING_EXIT_FAILURE;
2429 	}
2430 	do {
2431 		pthread_cond_wait(me->cond, me->mutex);
2432 		me->num_waits = me->num_waits - 1;
2433 	} while (me->num_waits);
2434 	pthread_mutex_unlock(me->mutex);
2435 
2436 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2437 	if (!ret)
2438 		return (void *)SIBLING_EXIT_NEWPRIVS;
2439 	read(0, NULL, 0);
2440 	return (void *)SIBLING_EXIT_UNKILLED;
2441 }
2442 
2443 void tsync_start_sibling(struct tsync_sibling *sibling)
2444 {
2445 	pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2446 }
2447 
2448 TEST_F(TSYNC, siblings_fail_prctl)
2449 {
2450 	long ret;
2451 	void *status;
2452 	struct sock_filter filter[] = {
2453 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2454 			offsetof(struct seccomp_data, nr)),
2455 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
2456 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
2457 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2458 	};
2459 	struct sock_fprog prog = {
2460 		.len = (unsigned short)ARRAY_SIZE(filter),
2461 		.filter = filter,
2462 	};
2463 
2464 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2465 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2466 	}
2467 
2468 	/* Check prctl failure detection by requesting sib 0 diverge. */
2469 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2470 	ASSERT_NE(ENOSYS, errno) {
2471 		TH_LOG("Kernel does not support seccomp syscall!");
2472 	}
2473 	ASSERT_EQ(0, ret) {
2474 		TH_LOG("setting filter failed");
2475 	}
2476 
2477 	self->sibling[0].diverge = 1;
2478 	tsync_start_sibling(&self->sibling[0]);
2479 	tsync_start_sibling(&self->sibling[1]);
2480 
2481 	while (self->sibling_count < TSYNC_SIBLINGS) {
2482 		sem_wait(&self->started);
2483 		self->sibling_count++;
2484 	}
2485 
2486 	/* Signal the threads to clean up*/
2487 	pthread_mutex_lock(&self->mutex);
2488 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2489 		TH_LOG("cond broadcast non-zero");
2490 	}
2491 	pthread_mutex_unlock(&self->mutex);
2492 
2493 	/* Ensure diverging sibling failed to call prctl. */
2494 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2495 	EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
2496 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2497 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2498 }
2499 
2500 TEST_F(TSYNC, two_siblings_with_ancestor)
2501 {
2502 	long ret;
2503 	void *status;
2504 
2505 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2506 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2507 	}
2508 
2509 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2510 	ASSERT_NE(ENOSYS, errno) {
2511 		TH_LOG("Kernel does not support seccomp syscall!");
2512 	}
2513 	ASSERT_EQ(0, ret) {
2514 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2515 	}
2516 	tsync_start_sibling(&self->sibling[0]);
2517 	tsync_start_sibling(&self->sibling[1]);
2518 
2519 	while (self->sibling_count < TSYNC_SIBLINGS) {
2520 		sem_wait(&self->started);
2521 		self->sibling_count++;
2522 	}
2523 
2524 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2525 		      &self->apply_prog);
2526 	ASSERT_EQ(0, ret) {
2527 		TH_LOG("Could install filter on all threads!");
2528 	}
2529 	/* Tell the siblings to test the policy */
2530 	pthread_mutex_lock(&self->mutex);
2531 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2532 		TH_LOG("cond broadcast non-zero");
2533 	}
2534 	pthread_mutex_unlock(&self->mutex);
2535 	/* Ensure they are both killed and don't exit cleanly. */
2536 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2537 	EXPECT_EQ(0x0, (long)status);
2538 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2539 	EXPECT_EQ(0x0, (long)status);
2540 }
2541 
2542 TEST_F(TSYNC, two_sibling_want_nnp)
2543 {
2544 	void *status;
2545 
2546 	/* start siblings before any prctl() operations */
2547 	tsync_start_sibling(&self->sibling[0]);
2548 	tsync_start_sibling(&self->sibling[1]);
2549 	while (self->sibling_count < TSYNC_SIBLINGS) {
2550 		sem_wait(&self->started);
2551 		self->sibling_count++;
2552 	}
2553 
2554 	/* Tell the siblings to test no policy */
2555 	pthread_mutex_lock(&self->mutex);
2556 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2557 		TH_LOG("cond broadcast non-zero");
2558 	}
2559 	pthread_mutex_unlock(&self->mutex);
2560 
2561 	/* Ensure they are both upset about lacking nnp. */
2562 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2563 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2564 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2565 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2566 }
2567 
2568 TEST_F(TSYNC, two_siblings_with_no_filter)
2569 {
2570 	long ret;
2571 	void *status;
2572 
2573 	/* start siblings before any prctl() operations */
2574 	tsync_start_sibling(&self->sibling[0]);
2575 	tsync_start_sibling(&self->sibling[1]);
2576 	while (self->sibling_count < TSYNC_SIBLINGS) {
2577 		sem_wait(&self->started);
2578 		self->sibling_count++;
2579 	}
2580 
2581 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2582 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2583 	}
2584 
2585 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2586 		      &self->apply_prog);
2587 	ASSERT_NE(ENOSYS, errno) {
2588 		TH_LOG("Kernel does not support seccomp syscall!");
2589 	}
2590 	ASSERT_EQ(0, ret) {
2591 		TH_LOG("Could install filter on all threads!");
2592 	}
2593 
2594 	/* Tell the siblings to test the policy */
2595 	pthread_mutex_lock(&self->mutex);
2596 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2597 		TH_LOG("cond broadcast non-zero");
2598 	}
2599 	pthread_mutex_unlock(&self->mutex);
2600 
2601 	/* Ensure they are both killed and don't exit cleanly. */
2602 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2603 	EXPECT_EQ(0x0, (long)status);
2604 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2605 	EXPECT_EQ(0x0, (long)status);
2606 }
2607 
2608 TEST_F(TSYNC, two_siblings_with_one_divergence)
2609 {
2610 	long ret;
2611 	void *status;
2612 
2613 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2614 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2615 	}
2616 
2617 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2618 	ASSERT_NE(ENOSYS, errno) {
2619 		TH_LOG("Kernel does not support seccomp syscall!");
2620 	}
2621 	ASSERT_EQ(0, ret) {
2622 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2623 	}
2624 	self->sibling[0].diverge = 1;
2625 	tsync_start_sibling(&self->sibling[0]);
2626 	tsync_start_sibling(&self->sibling[1]);
2627 
2628 	while (self->sibling_count < TSYNC_SIBLINGS) {
2629 		sem_wait(&self->started);
2630 		self->sibling_count++;
2631 	}
2632 
2633 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2634 		      &self->apply_prog);
2635 	ASSERT_EQ(self->sibling[0].system_tid, ret) {
2636 		TH_LOG("Did not fail on diverged sibling.");
2637 	}
2638 
2639 	/* Wake the threads */
2640 	pthread_mutex_lock(&self->mutex);
2641 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2642 		TH_LOG("cond broadcast non-zero");
2643 	}
2644 	pthread_mutex_unlock(&self->mutex);
2645 
2646 	/* Ensure they are both unkilled. */
2647 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2648 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2649 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2650 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2651 }
2652 
2653 TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2654 {
2655 	long ret, flags;
2656 	void *status;
2657 
2658 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2659 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2660 	}
2661 
2662 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2663 	ASSERT_NE(ENOSYS, errno) {
2664 		TH_LOG("Kernel does not support seccomp syscall!");
2665 	}
2666 	ASSERT_EQ(0, ret) {
2667 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2668 	}
2669 	self->sibling[0].diverge = 1;
2670 	tsync_start_sibling(&self->sibling[0]);
2671 	tsync_start_sibling(&self->sibling[1]);
2672 
2673 	while (self->sibling_count < TSYNC_SIBLINGS) {
2674 		sem_wait(&self->started);
2675 		self->sibling_count++;
2676 	}
2677 
2678 	flags = SECCOMP_FILTER_FLAG_TSYNC | \
2679 		SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2680 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2681 	ASSERT_EQ(ESRCH, errno) {
2682 		TH_LOG("Did not return ESRCH for diverged sibling.");
2683 	}
2684 	ASSERT_EQ(-1, ret) {
2685 		TH_LOG("Did not fail on diverged sibling.");
2686 	}
2687 
2688 	/* Wake the threads */
2689 	pthread_mutex_lock(&self->mutex);
2690 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2691 		TH_LOG("cond broadcast non-zero");
2692 	}
2693 	pthread_mutex_unlock(&self->mutex);
2694 
2695 	/* Ensure they are both unkilled. */
2696 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2697 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2698 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2699 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2700 }
2701 
2702 TEST_F(TSYNC, two_siblings_not_under_filter)
2703 {
2704 	long ret, sib;
2705 	void *status;
2706 	struct timespec delay = { .tv_nsec = 100000000 };
2707 
2708 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2709 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2710 	}
2711 
2712 	/*
2713 	 * Sibling 0 will have its own seccomp policy
2714 	 * and Sibling 1 will not be under seccomp at
2715 	 * all. Sibling 1 will enter seccomp and 0
2716 	 * will cause failure.
2717 	 */
2718 	self->sibling[0].diverge = 1;
2719 	tsync_start_sibling(&self->sibling[0]);
2720 	tsync_start_sibling(&self->sibling[1]);
2721 
2722 	while (self->sibling_count < TSYNC_SIBLINGS) {
2723 		sem_wait(&self->started);
2724 		self->sibling_count++;
2725 	}
2726 
2727 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2728 	ASSERT_NE(ENOSYS, errno) {
2729 		TH_LOG("Kernel does not support seccomp syscall!");
2730 	}
2731 	ASSERT_EQ(0, ret) {
2732 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2733 	}
2734 
2735 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2736 		      &self->apply_prog);
2737 	ASSERT_EQ(ret, self->sibling[0].system_tid) {
2738 		TH_LOG("Did not fail on diverged sibling.");
2739 	}
2740 	sib = 1;
2741 	if (ret == self->sibling[0].system_tid)
2742 		sib = 0;
2743 
2744 	pthread_mutex_lock(&self->mutex);
2745 
2746 	/* Increment the other siblings num_waits so we can clean up
2747 	 * the one we just saw.
2748 	 */
2749 	self->sibling[!sib].num_waits += 1;
2750 
2751 	/* Signal the thread to clean up*/
2752 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2753 		TH_LOG("cond broadcast non-zero");
2754 	}
2755 	pthread_mutex_unlock(&self->mutex);
2756 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
2757 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2758 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2759 	while (!kill(self->sibling[sib].system_tid, 0))
2760 		nanosleep(&delay, NULL);
2761 	/* Switch to the remaining sibling */
2762 	sib = !sib;
2763 
2764 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2765 		      &self->apply_prog);
2766 	ASSERT_EQ(0, ret) {
2767 		TH_LOG("Expected the remaining sibling to sync");
2768 	};
2769 
2770 	pthread_mutex_lock(&self->mutex);
2771 
2772 	/* If remaining sibling didn't have a chance to wake up during
2773 	 * the first broadcast, manually reduce the num_waits now.
2774 	 */
2775 	if (self->sibling[sib].num_waits > 1)
2776 		self->sibling[sib].num_waits = 1;
2777 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2778 		TH_LOG("cond broadcast non-zero");
2779 	}
2780 	pthread_mutex_unlock(&self->mutex);
2781 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
2782 	EXPECT_EQ(0, (long)status);
2783 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2784 	while (!kill(self->sibling[sib].system_tid, 0))
2785 		nanosleep(&delay, NULL);
2786 
2787 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2788 		      &self->apply_prog);
2789 	ASSERT_EQ(0, ret);  /* just us chickens */
2790 }
2791 
2792 /* Make sure restarted syscalls are seen directly as "restart_syscall". */
2793 TEST(syscall_restart)
2794 {
2795 	long ret;
2796 	unsigned long msg;
2797 	pid_t child_pid;
2798 	int pipefd[2];
2799 	int status;
2800 	siginfo_t info = { };
2801 	struct sock_filter filter[] = {
2802 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2803 			 offsetof(struct seccomp_data, nr)),
2804 
2805 #ifdef __NR_sigreturn
2806 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
2807 #endif
2808 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
2809 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
2810 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
2811 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
2812 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
2813 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
2814 
2815 		/* Allow __NR_write for easy logging. */
2816 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
2817 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2818 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2819 		/* The nanosleep jump target. */
2820 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
2821 		/* The restart_syscall jump target. */
2822 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
2823 	};
2824 	struct sock_fprog prog = {
2825 		.len = (unsigned short)ARRAY_SIZE(filter),
2826 		.filter = filter,
2827 	};
2828 #if defined(__arm__)
2829 	struct utsname utsbuf;
2830 #endif
2831 
2832 	ASSERT_EQ(0, pipe(pipefd));
2833 
2834 	child_pid = fork();
2835 	ASSERT_LE(0, child_pid);
2836 	if (child_pid == 0) {
2837 		/* Child uses EXPECT not ASSERT to deliver status correctly. */
2838 		char buf = ' ';
2839 		struct timespec timeout = { };
2840 
2841 		/* Attach parent as tracer and stop. */
2842 		EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
2843 		EXPECT_EQ(0, raise(SIGSTOP));
2844 
2845 		EXPECT_EQ(0, close(pipefd[1]));
2846 
2847 		EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2848 			TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2849 		}
2850 
2851 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2852 		EXPECT_EQ(0, ret) {
2853 			TH_LOG("Failed to install filter!");
2854 		}
2855 
2856 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
2857 			TH_LOG("Failed to read() sync from parent");
2858 		}
2859 		EXPECT_EQ('.', buf) {
2860 			TH_LOG("Failed to get sync data from read()");
2861 		}
2862 
2863 		/* Start nanosleep to be interrupted. */
2864 		timeout.tv_sec = 1;
2865 		errno = 0;
2866 		EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
2867 			TH_LOG("Call to nanosleep() failed (errno %d)", errno);
2868 		}
2869 
2870 		/* Read final sync from parent. */
2871 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
2872 			TH_LOG("Failed final read() from parent");
2873 		}
2874 		EXPECT_EQ('!', buf) {
2875 			TH_LOG("Failed to get final data from read()");
2876 		}
2877 
2878 		/* Directly report the status of our test harness results. */
2879 		syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS
2880 						     : EXIT_FAILURE);
2881 	}
2882 	EXPECT_EQ(0, close(pipefd[0]));
2883 
2884 	/* Attach to child, setup options, and release. */
2885 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2886 	ASSERT_EQ(true, WIFSTOPPED(status));
2887 	ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
2888 			    PTRACE_O_TRACESECCOMP));
2889 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2890 	ASSERT_EQ(1, write(pipefd[1], ".", 1));
2891 
2892 	/* Wait for nanosleep() to start. */
2893 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2894 	ASSERT_EQ(true, WIFSTOPPED(status));
2895 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
2896 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
2897 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
2898 	ASSERT_EQ(0x100, msg);
2899 	ret = get_syscall(_metadata, child_pid);
2900 	EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
2901 
2902 	/* Might as well check siginfo for sanity while we're here. */
2903 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
2904 	ASSERT_EQ(SIGTRAP, info.si_signo);
2905 	ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
2906 	EXPECT_EQ(0, info.si_errno);
2907 	EXPECT_EQ(getuid(), info.si_uid);
2908 	/* Verify signal delivery came from child (seccomp-triggered). */
2909 	EXPECT_EQ(child_pid, info.si_pid);
2910 
2911 	/* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
2912 	ASSERT_EQ(0, kill(child_pid, SIGSTOP));
2913 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2914 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2915 	ASSERT_EQ(true, WIFSTOPPED(status));
2916 	ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
2917 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
2918 	/*
2919 	 * There is no siginfo on SIGSTOP any more, so we can't verify
2920 	 * signal delivery came from parent now (getpid() == info.si_pid).
2921 	 * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
2922 	 * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
2923 	 */
2924 	EXPECT_EQ(SIGSTOP, info.si_signo);
2925 
2926 	/* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
2927 	ASSERT_EQ(0, kill(child_pid, SIGCONT));
2928 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2929 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2930 	ASSERT_EQ(true, WIFSTOPPED(status));
2931 	ASSERT_EQ(SIGCONT, WSTOPSIG(status));
2932 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2933 
2934 	/* Wait for restart_syscall() to start. */
2935 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2936 	ASSERT_EQ(true, WIFSTOPPED(status));
2937 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
2938 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
2939 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
2940 
2941 	ASSERT_EQ(0x200, msg);
2942 	ret = get_syscall(_metadata, child_pid);
2943 #if defined(__arm__)
2944 	/*
2945 	 * FIXME:
2946 	 * - native ARM registers do NOT expose true syscall.
2947 	 * - compat ARM registers on ARM64 DO expose true syscall.
2948 	 */
2949 	ASSERT_EQ(0, uname(&utsbuf));
2950 	if (strncmp(utsbuf.machine, "arm", 3) == 0) {
2951 		EXPECT_EQ(__NR_nanosleep, ret);
2952 	} else
2953 #endif
2954 	{
2955 		EXPECT_EQ(__NR_restart_syscall, ret);
2956 	}
2957 
2958 	/* Write again to end test. */
2959 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
2960 	ASSERT_EQ(1, write(pipefd[1], "!", 1));
2961 	EXPECT_EQ(0, close(pipefd[1]));
2962 
2963 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2964 	if (WIFSIGNALED(status) || WEXITSTATUS(status))
2965 		_metadata->passed = 0;
2966 }
2967 
2968 TEST_SIGNAL(filter_flag_log, SIGSYS)
2969 {
2970 	struct sock_filter allow_filter[] = {
2971 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2972 	};
2973 	struct sock_filter kill_filter[] = {
2974 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2975 			offsetof(struct seccomp_data, nr)),
2976 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
2977 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2978 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2979 	};
2980 	struct sock_fprog allow_prog = {
2981 		.len = (unsigned short)ARRAY_SIZE(allow_filter),
2982 		.filter = allow_filter,
2983 	};
2984 	struct sock_fprog kill_prog = {
2985 		.len = (unsigned short)ARRAY_SIZE(kill_filter),
2986 		.filter = kill_filter,
2987 	};
2988 	long ret;
2989 	pid_t parent = getppid();
2990 
2991 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2992 	ASSERT_EQ(0, ret);
2993 
2994 	/* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
2995 	ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
2996 		      &allow_prog);
2997 	ASSERT_NE(ENOSYS, errno) {
2998 		TH_LOG("Kernel does not support seccomp syscall!");
2999 	}
3000 	EXPECT_NE(0, ret) {
3001 		TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
3002 	}
3003 	EXPECT_EQ(EINVAL, errno) {
3004 		TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
3005 	}
3006 
3007 	/* Verify that a simple, permissive filter can be added with no flags */
3008 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
3009 	EXPECT_EQ(0, ret);
3010 
3011 	/* See if the same filter can be added with the FILTER_FLAG_LOG flag */
3012 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3013 		      &allow_prog);
3014 	ASSERT_NE(EINVAL, errno) {
3015 		TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
3016 	}
3017 	EXPECT_EQ(0, ret);
3018 
3019 	/* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
3020 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3021 		      &kill_prog);
3022 	EXPECT_EQ(0, ret);
3023 
3024 	EXPECT_EQ(parent, syscall(__NR_getppid));
3025 	/* getpid() should never return. */
3026 	EXPECT_EQ(0, syscall(__NR_getpid));
3027 }
3028 
3029 TEST(get_action_avail)
3030 {
3031 	__u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3032 			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3033 			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3034 	__u32 unknown_action = 0x10000000U;
3035 	int i;
3036 	long ret;
3037 
3038 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3039 	ASSERT_NE(ENOSYS, errno) {
3040 		TH_LOG("Kernel does not support seccomp syscall!");
3041 	}
3042 	ASSERT_NE(EINVAL, errno) {
3043 		TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3044 	}
3045 	EXPECT_EQ(ret, 0);
3046 
3047 	for (i = 0; i < ARRAY_SIZE(actions); i++) {
3048 		ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3049 		EXPECT_EQ(ret, 0) {
3050 			TH_LOG("Expected action (0x%X) not available!",
3051 			       actions[i]);
3052 		}
3053 	}
3054 
3055 	/* Check that an unknown action is handled properly (EOPNOTSUPP) */
3056 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3057 	EXPECT_EQ(ret, -1);
3058 	EXPECT_EQ(errno, EOPNOTSUPP);
3059 }
3060 
3061 TEST(get_metadata)
3062 {
3063 	pid_t pid;
3064 	int pipefd[2];
3065 	char buf;
3066 	struct seccomp_metadata md;
3067 	long ret;
3068 
3069 	/* Only real root can get metadata. */
3070 	if (geteuid()) {
3071 		XFAIL(return, "get_metadata requires real root");
3072 		return;
3073 	}
3074 
3075 	ASSERT_EQ(0, pipe(pipefd));
3076 
3077 	pid = fork();
3078 	ASSERT_GE(pid, 0);
3079 	if (pid == 0) {
3080 		struct sock_filter filter[] = {
3081 			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3082 		};
3083 		struct sock_fprog prog = {
3084 			.len = (unsigned short)ARRAY_SIZE(filter),
3085 			.filter = filter,
3086 		};
3087 
3088 		/* one with log, one without */
3089 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
3090 				     SECCOMP_FILTER_FLAG_LOG, &prog));
3091 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
3092 
3093 		EXPECT_EQ(0, close(pipefd[0]));
3094 		ASSERT_EQ(1, write(pipefd[1], "1", 1));
3095 		ASSERT_EQ(0, close(pipefd[1]));
3096 
3097 		while (1)
3098 			sleep(100);
3099 	}
3100 
3101 	ASSERT_EQ(0, close(pipefd[1]));
3102 	ASSERT_EQ(1, read(pipefd[0], &buf, 1));
3103 
3104 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
3105 	ASSERT_EQ(pid, waitpid(pid, NULL, 0));
3106 
3107 	/* Past here must not use ASSERT or child process is never killed. */
3108 
3109 	md.filter_off = 0;
3110 	errno = 0;
3111 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3112 	EXPECT_EQ(sizeof(md), ret) {
3113 		if (errno == EINVAL)
3114 			XFAIL(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
3115 	}
3116 
3117 	EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
3118 	EXPECT_EQ(md.filter_off, 0);
3119 
3120 	md.filter_off = 1;
3121 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3122 	EXPECT_EQ(sizeof(md), ret);
3123 	EXPECT_EQ(md.flags, 0);
3124 	EXPECT_EQ(md.filter_off, 1);
3125 
3126 skip:
3127 	ASSERT_EQ(0, kill(pid, SIGKILL));
3128 }
3129 
/*
 * Install a seccomp filter that returns SECCOMP_RET_USER_NOTIF for syscall
 * number @nr and SECCOMP_RET_ALLOW for everything else.
 *
 * @nr:    syscall number to trap.
 * @flags: flags passed straight through to SECCOMP_SET_MODE_FILTER.
 *
 * Returns whatever seccomp() returns for the installation; callers in this
 * file treat a non-negative value as the listener fd when
 * SECCOMP_FILTER_FLAG_NEW_LISTENER is requested.
 */
static int user_trap_syscall(int nr, unsigned int flags)
{
	struct sock_filter insns[] = {
		/* Load the syscall number. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* On a match fall through to USER_NOTIF, otherwise skip it. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog;

	prog.len = (unsigned short)ARRAY_SIZE(insns);
	prog.filter = insns;

	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
}
3147 
3148 #define USER_NOTIF_MAGIC INT_MAX
3149 TEST(user_notification_basic)
3150 {
3151 	pid_t pid;
3152 	long ret;
3153 	int status, listener;
3154 	struct seccomp_notif req = {};
3155 	struct seccomp_notif_resp resp = {};
3156 	struct pollfd pollfd;
3157 
3158 	struct sock_filter filter[] = {
3159 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3160 	};
3161 	struct sock_fprog prog = {
3162 		.len = (unsigned short)ARRAY_SIZE(filter),
3163 		.filter = filter,
3164 	};
3165 
3166 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3167 	ASSERT_EQ(0, ret) {
3168 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3169 	}
3170 
3171 	pid = fork();
3172 	ASSERT_GE(pid, 0);
3173 
3174 	/* Check that we get -ENOSYS with no listener attached */
3175 	if (pid == 0) {
3176 		if (user_trap_syscall(__NR_getppid, 0) < 0)
3177 			exit(1);
3178 		ret = syscall(__NR_getppid);
3179 		exit(ret >= 0 || errno != ENOSYS);
3180 	}
3181 
3182 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3183 	EXPECT_EQ(true, WIFEXITED(status));
3184 	EXPECT_EQ(0, WEXITSTATUS(status));
3185 
3186 	/* Add some no-op filters for grins. */
3187 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3188 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3189 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3190 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3191 
3192 	/* Check that the basic notification machinery works */
3193 	listener = user_trap_syscall(__NR_getppid,
3194 				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
3195 	ASSERT_GE(listener, 0);
3196 
3197 	/* Installing a second listener in the chain should EBUSY */
3198 	EXPECT_EQ(user_trap_syscall(__NR_getppid,
3199 				    SECCOMP_FILTER_FLAG_NEW_LISTENER),
3200 		  -1);
3201 	EXPECT_EQ(errno, EBUSY);
3202 
3203 	pid = fork();
3204 	ASSERT_GE(pid, 0);
3205 
3206 	if (pid == 0) {
3207 		ret = syscall(__NR_getppid);
3208 		exit(ret != USER_NOTIF_MAGIC);
3209 	}
3210 
3211 	pollfd.fd = listener;
3212 	pollfd.events = POLLIN | POLLOUT;
3213 
3214 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3215 	EXPECT_EQ(pollfd.revents, POLLIN);
3216 
3217 	/* Test that we can't pass garbage to the kernel. */
3218 	memset(&req, 0, sizeof(req));
3219 	req.pid = -1;
3220 	errno = 0;
3221 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
3222 	EXPECT_EQ(-1, ret);
3223 	EXPECT_EQ(EINVAL, errno);
3224 
3225 	if (ret) {
3226 		req.pid = 0;
3227 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3228 	}
3229 
3230 	pollfd.fd = listener;
3231 	pollfd.events = POLLIN | POLLOUT;
3232 
3233 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3234 	EXPECT_EQ(pollfd.revents, POLLOUT);
3235 
3236 	EXPECT_EQ(req.data.nr,  __NR_getppid);
3237 
3238 	resp.id = req.id;
3239 	resp.error = 0;
3240 	resp.val = USER_NOTIF_MAGIC;
3241 
3242 	/* check that we make sure flags == 0 */
3243 	resp.flags = 1;
3244 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3245 	EXPECT_EQ(errno, EINVAL);
3246 
3247 	resp.flags = 0;
3248 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3249 
3250 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3251 	EXPECT_EQ(true, WIFEXITED(status));
3252 	EXPECT_EQ(0, WEXITSTATUS(status));
3253 }
3254 
3255 TEST(user_notification_with_tsync)
3256 {
3257 	int ret;
3258 	unsigned int flags;
3259 
3260 	/* these were exclusive */
3261 	flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3262 		SECCOMP_FILTER_FLAG_TSYNC;
3263 	ASSERT_EQ(-1, user_trap_syscall(__NR_getppid, flags));
3264 	ASSERT_EQ(EINVAL, errno);
3265 
3266 	/* but now they're not */
3267 	flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3268 	ret = user_trap_syscall(__NR_getppid, flags);
3269 	close(ret);
3270 	ASSERT_LE(0, ret);
3271 }
3272 
3273 TEST(user_notification_kill_in_middle)
3274 {
3275 	pid_t pid;
3276 	long ret;
3277 	int listener;
3278 	struct seccomp_notif req = {};
3279 	struct seccomp_notif_resp resp = {};
3280 
3281 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3282 	ASSERT_EQ(0, ret) {
3283 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3284 	}
3285 
3286 	listener = user_trap_syscall(__NR_getppid,
3287 				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
3288 	ASSERT_GE(listener, 0);
3289 
3290 	/*
3291 	 * Check that nothing bad happens when we kill the task in the middle
3292 	 * of a syscall.
3293 	 */
3294 	pid = fork();
3295 	ASSERT_GE(pid, 0);
3296 
3297 	if (pid == 0) {
3298 		ret = syscall(__NR_getppid);
3299 		exit(ret != USER_NOTIF_MAGIC);
3300 	}
3301 
3302 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3303 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
3304 
3305 	EXPECT_EQ(kill(pid, SIGKILL), 0);
3306 	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
3307 
3308 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
3309 
3310 	resp.id = req.id;
3311 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
3312 	EXPECT_EQ(ret, -1);
3313 	EXPECT_EQ(errno, ENOENT);
3314 }
3315 
/* Descriptor signal_handler() writes to; stays -1 until a test assigns one. */
static int handled = -1;

/*
 * Signal handler: write a single 'c' byte to the "handled" descriptor so
 * that whoever reads the other end can tell the handler has run. A short or
 * failed write is reported through perror().
 */
static void signal_handler(int signal)
{
	ssize_t written = write(handled, "c", 1);

	if (written == 1)
		return;

	perror("write from signal");
}
3323 
3324 TEST(user_notification_signal)
3325 {
3326 	pid_t pid;
3327 	long ret;
3328 	int status, listener, sk_pair[2];
3329 	struct seccomp_notif req = {};
3330 	struct seccomp_notif_resp resp = {};
3331 	char c;
3332 
3333 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3334 	ASSERT_EQ(0, ret) {
3335 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3336 	}
3337 
3338 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
3339 
3340 	listener = user_trap_syscall(__NR_gettid,
3341 				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
3342 	ASSERT_GE(listener, 0);
3343 
3344 	pid = fork();
3345 	ASSERT_GE(pid, 0);
3346 
3347 	if (pid == 0) {
3348 		close(sk_pair[0]);
3349 		handled = sk_pair[1];
3350 		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
3351 			perror("signal");
3352 			exit(1);
3353 		}
3354 		/*
3355 		 * ERESTARTSYS behavior is a bit hard to test, because we need
3356 		 * to rely on a signal that has not yet been handled. Let's at
3357 		 * least check that the error code gets propagated through, and
3358 		 * hope that it doesn't break when there is actually a signal :)
3359 		 */
3360 		ret = syscall(__NR_gettid);
3361 		exit(!(ret == -1 && errno == 512));
3362 	}
3363 
3364 	close(sk_pair[1]);
3365 
3366 	memset(&req, 0, sizeof(req));
3367 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3368 
3369 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
3370 
3371 	/*
3372 	 * Make sure the signal really is delivered, which means we're not
3373 	 * stuck in the user notification code any more and the notification
3374 	 * should be dead.
3375 	 */
3376 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
3377 
3378 	resp.id = req.id;
3379 	resp.error = -EPERM;
3380 	resp.val = 0;
3381 
3382 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3383 	EXPECT_EQ(errno, ENOENT);
3384 
3385 	memset(&req, 0, sizeof(req));
3386 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3387 
3388 	resp.id = req.id;
3389 	resp.error = -512; /* -ERESTARTSYS */
3390 	resp.val = 0;
3391 
3392 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3393 
3394 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3395 	EXPECT_EQ(true, WIFEXITED(status));
3396 	EXPECT_EQ(0, WEXITSTATUS(status));
3397 }
3398 
3399 TEST(user_notification_closed_listener)
3400 {
3401 	pid_t pid;
3402 	long ret;
3403 	int status, listener;
3404 
3405 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3406 	ASSERT_EQ(0, ret) {
3407 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3408 	}
3409 
3410 	listener = user_trap_syscall(__NR_getppid,
3411 				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
3412 	ASSERT_GE(listener, 0);
3413 
3414 	/*
3415 	 * Check that we get an ENOSYS when the listener is closed.
3416 	 */
3417 	pid = fork();
3418 	ASSERT_GE(pid, 0);
3419 	if (pid == 0) {
3420 		close(listener);
3421 		ret = syscall(__NR_getppid);
3422 		exit(ret != -1 && errno != ENOSYS);
3423 	}
3424 
3425 	close(listener);
3426 
3427 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3428 	EXPECT_EQ(true, WIFEXITED(status));
3429 	EXPECT_EQ(0, WEXITSTATUS(status));
3430 }
3431 
3432 /*
3433  * Check that a pid in a child namespace still shows up as valid in ours.
3434  */
3435 TEST(user_notification_child_pid_ns)
3436 {
3437 	pid_t pid;
3438 	int status, listener;
3439 	struct seccomp_notif req = {};
3440 	struct seccomp_notif_resp resp = {};
3441 
3442 	ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0);
3443 
3444 	listener = user_trap_syscall(__NR_getppid,
3445 				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
3446 	ASSERT_GE(listener, 0);
3447 
3448 	pid = fork();
3449 	ASSERT_GE(pid, 0);
3450 
3451 	if (pid == 0)
3452 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3453 
3454 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3455 	EXPECT_EQ(req.pid, pid);
3456 
3457 	resp.id = req.id;
3458 	resp.error = 0;
3459 	resp.val = USER_NOTIF_MAGIC;
3460 
3461 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3462 
3463 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3464 	EXPECT_EQ(true, WIFEXITED(status));
3465 	EXPECT_EQ(0, WEXITSTATUS(status));
3466 	close(listener);
3467 }
3468 
3469 /*
3470  * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3471  * invalid.
3472  */
3473 TEST(user_notification_sibling_pid_ns)
3474 {
3475 	pid_t pid, pid2;
3476 	int status, listener;
3477 	struct seccomp_notif req = {};
3478 	struct seccomp_notif_resp resp = {};
3479 
3480 	ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
3481 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3482 	}
3483 
3484 	listener = user_trap_syscall(__NR_getppid,
3485 				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
3486 	ASSERT_GE(listener, 0);
3487 
3488 	pid = fork();
3489 	ASSERT_GE(pid, 0);
3490 
3491 	if (pid == 0) {
3492 		ASSERT_EQ(unshare(CLONE_NEWPID), 0);
3493 
3494 		pid2 = fork();
3495 		ASSERT_GE(pid2, 0);
3496 
3497 		if (pid2 == 0)
3498 			exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3499 
3500 		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3501 		EXPECT_EQ(true, WIFEXITED(status));
3502 		EXPECT_EQ(0, WEXITSTATUS(status));
3503 		exit(WEXITSTATUS(status));
3504 	}
3505 
3506 	/* Create the sibling ns, and sibling in it. */
3507 	ASSERT_EQ(unshare(CLONE_NEWPID), 0);
3508 	ASSERT_EQ(errno, 0);
3509 
3510 	pid2 = fork();
3511 	ASSERT_GE(pid2, 0);
3512 
3513 	if (pid2 == 0) {
3514 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3515 		/*
3516 		 * The pid should be 0, i.e. the task is in some namespace that
3517 		 * we can't "see".
3518 		 */
3519 		EXPECT_EQ(req.pid, 0);
3520 
3521 		resp.id = req.id;
3522 		resp.error = 0;
3523 		resp.val = USER_NOTIF_MAGIC;
3524 
3525 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3526 		exit(0);
3527 	}
3528 
3529 	close(listener);
3530 
3531 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3532 	EXPECT_EQ(true, WIFEXITED(status));
3533 	EXPECT_EQ(0, WEXITSTATUS(status));
3534 
3535 	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3536 	EXPECT_EQ(true, WIFEXITED(status));
3537 	EXPECT_EQ(0, WEXITSTATUS(status));
3538 }
3539 
3540 TEST(user_notification_fault_recv)
3541 {
3542 	pid_t pid;
3543 	int status, listener;
3544 	struct seccomp_notif req = {};
3545 	struct seccomp_notif_resp resp = {};
3546 
3547 	ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
3548 
3549 	listener = user_trap_syscall(__NR_getppid,
3550 				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
3551 	ASSERT_GE(listener, 0);
3552 
3553 	pid = fork();
3554 	ASSERT_GE(pid, 0);
3555 
3556 	if (pid == 0)
3557 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3558 
3559 	/* Do a bad recv() */
3560 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
3561 	EXPECT_EQ(errno, EFAULT);
3562 
3563 	/* We should still be able to receive this notification, though. */
3564 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3565 	EXPECT_EQ(req.pid, pid);
3566 
3567 	resp.id = req.id;
3568 	resp.error = 0;
3569 	resp.val = USER_NOTIF_MAGIC;
3570 
3571 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3572 
3573 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3574 	EXPECT_EQ(true, WIFEXITED(status));
3575 	EXPECT_EQ(0, WEXITSTATUS(status));
3576 }
3577 
/*
 * Check that the structure sizes reported by SECCOMP_GET_NOTIF_SIZES match
 * the sizes of struct seccomp_notif and struct seccomp_notif_resp that this
 * test was compiled against.
 */
TEST(seccomp_get_notif_sizes)
{
	struct seccomp_notif_sizes sizes;

	/* Nothing below is meaningful unless the query itself succeeds. */
	ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
}
3586 
/*
 * Compare descriptor @fd1 of process @pid1 with descriptor @fd2 of process
 * @pid2 using kcmp(KCMP_FILE).
 *
 * Returns the raw kcmp() result; the caller in this file treats any non-zero
 * value as "not the same file or not supported". When the build environment
 * has no __NR_kcmp, fails with -1 and errno set to ENOSYS.
 */
static int filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
{
#ifndef __NR_kcmp
	errno = ENOSYS;
	return -1;
#else
	long rc = syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);

	return rc;
#endif
}
3596 
3597 TEST(user_notification_continue)
3598 {
3599 	pid_t pid;
3600 	long ret;
3601 	int status, listener;
3602 	struct seccomp_notif req = {};
3603 	struct seccomp_notif_resp resp = {};
3604 	struct pollfd pollfd;
3605 
3606 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3607 	ASSERT_EQ(0, ret) {
3608 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3609 	}
3610 
3611 	listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3612 	ASSERT_GE(listener, 0);
3613 
3614 	pid = fork();
3615 	ASSERT_GE(pid, 0);
3616 
3617 	if (pid == 0) {
3618 		int dup_fd, pipe_fds[2];
3619 		pid_t self;
3620 
3621 		ret = pipe(pipe_fds);
3622 		if (ret < 0)
3623 			exit(1);
3624 
3625 		dup_fd = dup(pipe_fds[0]);
3626 		if (dup_fd < 0)
3627 			exit(1);
3628 
3629 		self = getpid();
3630 
3631 		ret = filecmp(self, self, pipe_fds[0], dup_fd);
3632 		if (ret)
3633 			exit(2);
3634 
3635 		exit(0);
3636 	}
3637 
3638 	pollfd.fd = listener;
3639 	pollfd.events = POLLIN | POLLOUT;
3640 
3641 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3642 	EXPECT_EQ(pollfd.revents, POLLIN);
3643 
3644 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3645 
3646 	pollfd.fd = listener;
3647 	pollfd.events = POLLIN | POLLOUT;
3648 
3649 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3650 	EXPECT_EQ(pollfd.revents, POLLOUT);
3651 
3652 	EXPECT_EQ(req.data.nr, __NR_dup);
3653 
3654 	resp.id = req.id;
3655 	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
3656 
3657 	/*
3658 	 * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
3659 	 * args be set to 0.
3660 	 */
3661 	resp.error = 0;
3662 	resp.val = USER_NOTIF_MAGIC;
3663 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3664 	EXPECT_EQ(errno, EINVAL);
3665 
3666 	resp.error = USER_NOTIF_MAGIC;
3667 	resp.val = 0;
3668 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3669 	EXPECT_EQ(errno, EINVAL);
3670 
3671 	resp.error = 0;
3672 	resp.val = 0;
3673 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
3674 		if (errno == EINVAL)
3675 			XFAIL(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
3676 	}
3677 
3678 skip:
3679 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3680 	EXPECT_EQ(true, WIFEXITED(status));
3681 	EXPECT_EQ(0, WEXITSTATUS(status)) {
3682 		if (WEXITSTATUS(status) == 2) {
3683 			XFAIL(return, "Kernel does not support kcmp() syscall");
3684 			return;
3685 		}
3686 	}
3687 }
3688 
3689 /*
3690  * TODO:
3691  * - add microbenchmarks
3692  * - expand NNP testing
3693  * - better arch-specific TRACE and TRAP handlers.
3694  * - endianness checking when appropriate
3695  * - 64-bit arg prodding
3696  * - arch value testing (x86 modes especially)
3697  * - verify that FILTER_FLAG_LOG filters generate log messages
3698  * - verify that RET_LOG generates log messages
3699  * - ...
3700  */
3701 
3702 TEST_HARNESS_MAIN
3703