1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7 
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10 
11 /*
12  * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
13  * we need to use the kernel's siginfo.h file and trick glibc
14  * into accepting it.
15  */
16 #if !__GLIBC_PREREQ(2, 26)
17 # include <asm/siginfo.h>
18 # define __have_siginfo_t 1
19 # define __have_sigval_t 1
20 # define __have_sigevent_t 1
21 #endif
22 
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/user.h>
28 #include <linux/prctl.h>
29 #include <linux/ptrace.h>
30 #include <linux/seccomp.h>
31 #include <pthread.h>
32 #include <semaphore.h>
33 #include <signal.h>
34 #include <stddef.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <time.h>
38 #include <limits.h>
39 #include <linux/elf.h>
40 #include <sys/uio.h>
41 #include <sys/utsname.h>
42 #include <sys/fcntl.h>
43 #include <sys/mman.h>
44 #include <sys/times.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <linux/kcmp.h>
48 #include <sys/resource.h>
49 #include <sys/capability.h>
50 
51 #include <unistd.h>
52 #include <sys/syscall.h>
53 #include <poll.h>
54 
55 #include "../kselftest_harness.h"
56 #include "../clone3/clone3_selftests.h"
57 
58 /* Attempt to de-conflict with the selftests tree. */
59 #ifndef SKIP
60 #define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
61 #endif
62 
63 #define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
64 
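/*
 * Fallback definitions: when building against older kernel or libc headers,
 * re-declare the seccomp/prctl/ptrace UAPI bits exercised below so the test
 * still compiles everywhere.
 */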
65 #ifndef PR_SET_PTRACER
66 # define PR_SET_PTRACER 0x59616d61
67 #endif
68 
69 #ifndef PR_SET_NO_NEW_PRIVS
70 #define PR_SET_NO_NEW_PRIVS 38
71 #define PR_GET_NO_NEW_PRIVS 39
72 #endif
73 
74 #ifndef PR_SECCOMP_EXT
75 #define PR_SECCOMP_EXT 43
76 #endif
77 
78 #ifndef SECCOMP_EXT_ACT
79 #define SECCOMP_EXT_ACT 1
80 #endif
81 
82 #ifndef SECCOMP_EXT_ACT_TSYNC
83 #define SECCOMP_EXT_ACT_TSYNC 1
84 #endif
85 
86 #ifndef SECCOMP_MODE_STRICT
87 #define SECCOMP_MODE_STRICT 1
88 #endif
89 
90 #ifndef SECCOMP_MODE_FILTER
91 #define SECCOMP_MODE_FILTER 2
92 #endif
93 
94 #ifndef SECCOMP_RET_ALLOW
95 struct seccomp_data {
96 	int nr;
97 	__u32 arch;
98 	__u64 instruction_pointer;
99 	__u64 args[6];
100 };
101 #endif
102 
103 #ifndef SECCOMP_RET_KILL_PROCESS
104 #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
105 #define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
106 #endif
107 #ifndef SECCOMP_RET_KILL
108 #define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
109 #define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
110 #define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
111 #define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
112 #define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
113 #endif
114 #ifndef SECCOMP_RET_LOG
115 #define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
116 #endif
117 
118 #ifndef __NR_seccomp
119 # if defined(__i386__)
120 #  define __NR_seccomp 354
121 # elif defined(__x86_64__)
122 #  define __NR_seccomp 317
123 # elif defined(__arm__)
124 #  define __NR_seccomp 383
125 # elif defined(__aarch64__)
126 #  define __NR_seccomp 277
127 # elif defined(__riscv)
128 #  define __NR_seccomp 277
129 # elif defined(__csky__)
130 #  define __NR_seccomp 277
131 # elif defined(__hppa__)
132 #  define __NR_seccomp 338
133 # elif defined(__powerpc__)
134 #  define __NR_seccomp 358
135 # elif defined(__s390__)
136 #  define __NR_seccomp 348
137 # elif defined(__xtensa__)
138 #  define __NR_seccomp 337
139 # elif defined(__sh__)
140 #  define __NR_seccomp 372
141 # elif defined(__mc68000__)
142 #  define __NR_seccomp 380
143 # else
144 #  warning "seccomp syscall number unknown for this architecture"
145 #  define __NR_seccomp 0xffff
146 # endif
147 #endif
148 
149 #ifndef SECCOMP_SET_MODE_STRICT
150 #define SECCOMP_SET_MODE_STRICT 0
151 #endif
152 
153 #ifndef SECCOMP_SET_MODE_FILTER
154 #define SECCOMP_SET_MODE_FILTER 1
155 #endif
156 
157 #ifndef SECCOMP_GET_ACTION_AVAIL
158 #define SECCOMP_GET_ACTION_AVAIL 2
159 #endif
160 
161 #ifndef SECCOMP_GET_NOTIF_SIZES
162 #define SECCOMP_GET_NOTIF_SIZES 3
163 #endif
164 
165 #ifndef SECCOMP_FILTER_FLAG_TSYNC
166 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
167 #endif
168 
169 #ifndef SECCOMP_FILTER_FLAG_LOG
170 #define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
171 #endif
172 
173 #ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
174 #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
175 #endif
176 
177 #ifndef PTRACE_SECCOMP_GET_METADATA
178 #define PTRACE_SECCOMP_GET_METADATA	0x420d
179 
180 struct seccomp_metadata {
181 	__u64 filter_off;       /* Input: which filter */
182 	__u64 flags;             /* Output: filter's flags */
183 };
184 #endif
185 
186 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
187 #define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
188 #endif
189 
190 #ifndef SECCOMP_RET_USER_NOTIF
191 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U
192 
193 #define SECCOMP_IOC_MAGIC		'!'
194 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
195 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
196 #define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
197 #define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
198 
199 /* Flags for seccomp notification fd ioctl. */
200 #define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
201 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
202 						struct seccomp_notif_resp)
203 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)
204 
205 struct seccomp_notif {
206 	__u64 id;
207 	__u32 pid;
208 	__u32 flags;
209 	struct seccomp_data data;
210 };
211 
212 struct seccomp_notif_resp {
213 	__u64 id;
214 	__s64 val;
215 	__s32 error;
216 	__u32 flags;
217 };
218 
219 struct seccomp_notif_sizes {
220 	__u16 seccomp_notif;
221 	__u16 seccomp_notif_resp;
222 	__u16 seccomp_data;
223 };
224 #endif
225 
226 #ifndef SECCOMP_IOCTL_NOTIF_ADDFD
227 /* On success, the return value is the remote process's added fd number */
228 #define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3,	\
229 						struct seccomp_notif_addfd)
230 
231 /* valid flags for seccomp_notif_addfd */
232 #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
233 
234 struct seccomp_notif_addfd {
235 	__u64 id;
236 	__u32 flags;
237 	__u32 srcfd;
238 	__u32 newfd;
239 	__u32 newfd_flags;
240 };
241 #endif
242 
243 #ifndef SECCOMP_ADDFD_FLAG_SEND
244 #define SECCOMP_ADDFD_FLAG_SEND	(1UL << 1) /* Addfd and return it, atomically */
245 #endif
246 
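/*
 * Deliberately mis-sized variants of the ADDFD ioctl argument, used to probe
 * how the kernel copes with user structures that are smaller or larger than
 * its native struct seccomp_notif_addfd.
 */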
247 struct seccomp_notif_addfd_small {
248 	__u64 id;
249 	char weird[4];
250 };
251 #define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL	\
252 	SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
253 
254 struct seccomp_notif_addfd_big {
255 	union {
256 		struct seccomp_notif_addfd addfd;
257 		char buf[sizeof(struct seccomp_notif_addfd) + 8];
258 	};
259 };
260 #define SECCOMP_IOCTL_NOTIF_ADDFD_BIG	\
261 	SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
262 
263 #ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
264 #define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
265 #define PTRACE_EVENTMSG_SYSCALL_EXIT	2
266 #endif
267 
268 #ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
269 #define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
270 #endif
271 
272 #ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
273 #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
274 #endif
275 
276 #ifndef SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV
277 #define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
278 #endif
279 
280 #ifndef seccomp
281 int seccomp(unsigned int op, unsigned int flags, void *args)
282 {
283 	errno = 0;
284 	return syscall(__NR_seccomp, op, flags, args);
285 }
286 #endif
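/*
 * The wrapper above clears errno first so callers can reliably inspect errno
 * afterward. Illustrative use (hypothetical locals f/p, mirroring how
 * kill_thread_or_group() below installs its filters):
 *
 *	struct sock_filter f[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW) };
 *	struct sock_fprog p = { .len = 1, .filter = f };
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	seccomp(SECCOMP_SET_MODE_FILTER, 0, &p);
 */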
287 
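/*
 * seccomp_data args are 64-bit but classic BPF loads are 32 bits wide, so
 * syscall_arg(n) resolves to the offset of the low 32 bits of argument n
 * (which sit 4 bytes into the __u64 on big-endian machines).
 */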
288 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
289 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
290 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
291 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
292 #else
293 #error "wut? Unknown __BYTE_ORDER__?!"
294 #endif
295 
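/* Magic exit values sibling threads use to report how they finished. */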
296 #define SIBLING_EXIT_UNKILLED	0xbadbeef
297 #define SIBLING_EXIT_FAILURE	0xbadface
298 #define SIBLING_EXIT_NEWPRIVS	0xbadfeed
299 
300 static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
301 {
302 #ifdef __NR_kcmp
303 	errno = 0;
304 	return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
305 #else
306 	errno = ENOSYS;
307 	return -1;
308 #endif
309 }
310 
/* Have TH_LOG report the actual location where filecmp() is used. */
/* Have TH_LOG report the actual location where filecmp() is used. */
312 #define filecmp(pid1, pid2, fd1, fd2)	({		\
313 	int _ret;					\
314 							\
315 	_ret = __filecmp(pid1, pid2, fd1, fd2);		\
316 	if (_ret != 0) {				\
317 		if (_ret < 0 && errno == ENOSYS) {	\
318 			TH_LOG("kcmp() syscall missing (test is less accurate)");\
319 			_ret = 0;			\
320 		}					\
321 	}						\
322 	_ret; })
323 
324 TEST(kcmp)
325 {
326 	int ret;
327 
328 	ret = __filecmp(getpid(), getpid(), 1, 1);
329 	EXPECT_EQ(ret, 0);
330 	if (ret != 0 && errno == ENOSYS)
331 		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
332 }
333 
334 TEST(mode_strict_support)
335 {
336 	long ret;
337 
338 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
339 	ASSERT_EQ(0, ret) {
340 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
341 	}
342 	syscall(__NR_exit, 0);
343 }
344 
345 TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
346 {
347 	long ret;
348 
349 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
350 	ASSERT_EQ(0, ret) {
351 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
352 	}
353 	syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
354 		NULL, NULL, NULL);
355 	EXPECT_FALSE(true) {
356 		TH_LOG("Unreachable!");
357 	}
358 }
359 
/* Note! This doesn't test the no_new_privs behavior itself. */
361 TEST(no_new_privs_support)
362 {
363 	long ret;
364 
365 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
366 	EXPECT_EQ(0, ret) {
367 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
368 	}
369 }
370 
371 /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
372 TEST(mode_filter_support)
373 {
374 	long ret;
375 
376 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
377 	ASSERT_EQ(0, ret) {
378 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
379 	}
380 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
381 	EXPECT_EQ(-1, ret);
382 	EXPECT_EQ(EFAULT, errno) {
383 		TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
384 	}
385 }
386 
387 TEST(mode_filter_without_nnp)
388 {
389 	struct sock_filter filter[] = {
390 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
391 	};
392 	struct sock_fprog prog = {
393 		.len = (unsigned short)ARRAY_SIZE(filter),
394 		.filter = filter,
395 	};
396 	long ret;
397 	cap_t cap = cap_get_proc();
398 	cap_flag_value_t is_cap_sys_admin = 0;
399 
400 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
401 	ASSERT_LE(0, ret) {
402 		TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
403 	}
404 	errno = 0;
405 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
406 	/* Succeeds with CAP_SYS_ADMIN, fails without */
407 	cap_get_flag(cap, CAP_SYS_ADMIN, CAP_EFFECTIVE, &is_cap_sys_admin);
408 	if (!is_cap_sys_admin) {
409 		EXPECT_EQ(-1, ret);
410 		EXPECT_EQ(EACCES, errno);
411 	} else {
412 		EXPECT_EQ(0, ret);
413 	}
414 }
415 
416 #define MAX_INSNS_PER_PATH 32768
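/*
 * Matches the kernel's cap on the combined length of all attached filters;
 * each attached filter also carries a per-filter overhead, reflected in the
 * "total with penalties" log in filter_chain_limits below.
 */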
417 
418 TEST(filter_size_limits)
419 {
420 	int i;
421 	int count = BPF_MAXINSNS + 1;
422 	struct sock_filter allow[] = {
423 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
424 	};
425 	struct sock_filter *filter;
426 	struct sock_fprog prog = { };
427 	long ret;
428 
429 	filter = calloc(count, sizeof(*filter));
430 	ASSERT_NE(NULL, filter);
431 
432 	for (i = 0; i < count; i++)
433 		filter[i] = allow[0];
434 
435 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
436 	ASSERT_EQ(0, ret);
437 
438 	prog.filter = filter;
439 	prog.len = count;
440 
441 	/* Too many filter instructions in a single filter. */
442 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
443 	ASSERT_NE(0, ret) {
444 		TH_LOG("Installing %d insn filter was allowed", prog.len);
445 	}
446 
447 	/* One less is okay, though. */
448 	prog.len -= 1;
449 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
450 	ASSERT_EQ(0, ret) {
451 		TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
452 	}
453 }
454 
455 TEST(filter_chain_limits)
456 {
457 	int i;
458 	int count = BPF_MAXINSNS;
459 	struct sock_filter allow[] = {
460 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
461 	};
462 	struct sock_filter *filter;
463 	struct sock_fprog prog = { };
464 	long ret;
465 
466 	filter = calloc(count, sizeof(*filter));
467 	ASSERT_NE(NULL, filter);
468 
469 	for (i = 0; i < count; i++)
470 		filter[i] = allow[0];
471 
472 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
473 	ASSERT_EQ(0, ret);
474 
475 	prog.filter = filter;
476 	prog.len = 1;
477 
478 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
479 	ASSERT_EQ(0, ret);
480 
481 	prog.len = count;
482 
483 	/* Too many total filter instructions. */
484 	for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
485 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
486 		if (ret != 0)
487 			break;
488 	}
489 	ASSERT_NE(0, ret) {
490 		TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
491 		       i, count, i * (count + 4));
492 	}
493 }
494 
495 TEST(mode_filter_cannot_move_to_strict)
496 {
497 	struct sock_filter filter[] = {
498 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
499 	};
500 	struct sock_fprog prog = {
501 		.len = (unsigned short)ARRAY_SIZE(filter),
502 		.filter = filter,
503 	};
504 	long ret;
505 
506 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
507 	ASSERT_EQ(0, ret);
508 
509 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
510 	ASSERT_EQ(0, ret);
511 
512 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
513 	EXPECT_EQ(-1, ret);
514 	EXPECT_EQ(EINVAL, errno);
515 }
516 
517 
518 TEST(mode_filter_get_seccomp)
519 {
520 	struct sock_filter filter[] = {
521 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
522 	};
523 	struct sock_fprog prog = {
524 		.len = (unsigned short)ARRAY_SIZE(filter),
525 		.filter = filter,
526 	};
527 	long ret;
528 
529 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
530 	ASSERT_EQ(0, ret);
531 
532 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
533 	EXPECT_EQ(0, ret);
534 
535 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
536 	ASSERT_EQ(0, ret);
537 
538 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
539 	EXPECT_EQ(2, ret);
540 }
541 
542 
543 TEST(ALLOW_all)
544 {
545 	struct sock_filter filter[] = {
546 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
547 	};
548 	struct sock_fprog prog = {
549 		.len = (unsigned short)ARRAY_SIZE(filter),
550 		.filter = filter,
551 	};
552 	long ret;
553 
554 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
555 	ASSERT_EQ(0, ret);
556 
557 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
558 	ASSERT_EQ(0, ret);
559 }
560 
561 TEST(empty_prog)
562 {
563 	struct sock_filter filter[] = {
564 	};
565 	struct sock_fprog prog = {
566 		.len = (unsigned short)ARRAY_SIZE(filter),
567 		.filter = filter,
568 	};
569 	long ret;
570 
571 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
572 	ASSERT_EQ(0, ret);
573 
574 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
575 	EXPECT_EQ(-1, ret);
576 	EXPECT_EQ(EINVAL, errno);
577 }
578 
579 TEST(log_all)
580 {
581 	struct sock_filter filter[] = {
582 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
583 	};
584 	struct sock_fprog prog = {
585 		.len = (unsigned short)ARRAY_SIZE(filter),
586 		.filter = filter,
587 	};
588 	long ret;
589 	pid_t parent = getppid();
590 
591 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
592 	ASSERT_EQ(0, ret);
593 
594 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
595 	ASSERT_EQ(0, ret);
596 
597 	/* getppid() should succeed and be logged (no check for logging) */
598 	EXPECT_EQ(parent, syscall(__NR_getppid));
599 }
600 
601 TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
602 {
603 	struct sock_filter filter[] = {
604 		BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
605 	};
606 	struct sock_fprog prog = {
607 		.len = (unsigned short)ARRAY_SIZE(filter),
608 		.filter = filter,
609 	};
610 	long ret;
611 
612 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
613 	ASSERT_EQ(0, ret);
614 
615 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
616 	ASSERT_EQ(0, ret);
617 	EXPECT_EQ(0, syscall(__NR_getpid)) {
618 		TH_LOG("getpid() shouldn't ever return");
619 	}
620 }
621 
/* Return codes >= 0x80000000 are unused. */
623 TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
624 {
625 	struct sock_filter filter[] = {
626 		BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
627 	};
628 	struct sock_fprog prog = {
629 		.len = (unsigned short)ARRAY_SIZE(filter),
630 		.filter = filter,
631 	};
632 	long ret;
633 
634 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
635 	ASSERT_EQ(0, ret);
636 
637 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
638 	ASSERT_EQ(0, ret);
639 	EXPECT_EQ(0, syscall(__NR_getpid)) {
640 		TH_LOG("getpid() shouldn't ever return");
641 	}
642 }
643 
644 TEST_SIGNAL(KILL_all, SIGSYS)
645 {
646 	struct sock_filter filter[] = {
647 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
648 	};
649 	struct sock_fprog prog = {
650 		.len = (unsigned short)ARRAY_SIZE(filter),
651 		.filter = filter,
652 	};
653 	long ret;
654 
655 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
656 	ASSERT_EQ(0, ret);
657 
658 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
659 	ASSERT_EQ(0, ret);
660 }
661 
662 TEST_SIGNAL(KILL_one, SIGSYS)
663 {
664 	struct sock_filter filter[] = {
665 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
666 			offsetof(struct seccomp_data, nr)),
667 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
668 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
669 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
670 	};
671 	struct sock_fprog prog = {
672 		.len = (unsigned short)ARRAY_SIZE(filter),
673 		.filter = filter,
674 	};
675 	long ret;
676 	pid_t parent = getppid();
677 
678 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
679 	ASSERT_EQ(0, ret);
680 
681 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
682 	ASSERT_EQ(0, ret);
683 
684 	EXPECT_EQ(parent, syscall(__NR_getppid));
685 	/* getpid() should never return. */
686 	EXPECT_EQ(0, syscall(__NR_getpid));
687 }
688 
689 TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
690 {
691 	void *fatal_address;
692 	struct sock_filter filter[] = {
693 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
694 			offsetof(struct seccomp_data, nr)),
695 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
696 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		/* Only bother with the lower 32 bits for now. */
698 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
699 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
700 			(unsigned long)&fatal_address, 0, 1),
701 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
702 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
703 	};
704 	struct sock_fprog prog = {
705 		.len = (unsigned short)ARRAY_SIZE(filter),
706 		.filter = filter,
707 	};
708 	long ret;
709 	pid_t parent = getppid();
710 	struct tms timebuf;
711 	clock_t clock = times(&timebuf);
712 
713 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
714 	ASSERT_EQ(0, ret);
715 
716 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
717 	ASSERT_EQ(0, ret);
718 
719 	EXPECT_EQ(parent, syscall(__NR_getppid));
720 	EXPECT_LE(clock, syscall(__NR_times, &timebuf));
721 	/* times() should never return. */
722 	EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
723 }
724 
725 TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
726 {
727 #ifndef __NR_mmap2
728 	int sysno = __NR_mmap;
729 #else
730 	int sysno = __NR_mmap2;
731 #endif
732 	struct sock_filter filter[] = {
733 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
734 			offsetof(struct seccomp_data, nr)),
735 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
736 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		/* Only bother with the lower 32 bits for now. */
738 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
739 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
740 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
741 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
742 	};
743 	struct sock_fprog prog = {
744 		.len = (unsigned short)ARRAY_SIZE(filter),
745 		.filter = filter,
746 	};
747 	long ret;
748 	pid_t parent = getppid();
749 	int fd;
750 	void *map1, *map2;
751 	int page_size = sysconf(_SC_PAGESIZE);
752 
753 	ASSERT_LT(0, page_size);
754 
755 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
756 	ASSERT_EQ(0, ret);
757 
758 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
759 	ASSERT_EQ(0, ret);
760 
761 	fd = open("/dev/zero", O_RDONLY);
762 	ASSERT_NE(-1, fd);
763 
764 	EXPECT_EQ(parent, syscall(__NR_getppid));
765 	map1 = (void *)syscall(sysno,
766 		NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
767 	EXPECT_NE(MAP_FAILED, map1);
768 	/* mmap2() should never return. */
769 	map2 = (void *)syscall(sysno,
770 		 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
771 	EXPECT_EQ(MAP_FAILED, map2);
772 
773 	/* The test failed, so clean up the resources. */
774 	munmap(map1, page_size);
775 	munmap(map2, page_size);
776 	close(fd);
777 }
778 
/* Thread body that, when asked to, dies via a seccomp filter violation. */
780 void *kill_thread(void *data)
781 {
782 	bool die = (bool)data;
783 
784 	if (die) {
785 		prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
786 		return (void *)SIBLING_EXIT_FAILURE;
787 	}
788 
789 	return (void *)SIBLING_EXIT_UNKILLED;
790 }
791 
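/* Which seccomp return action kill_thread_or_group() should install. */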
792 enum kill_t {
793 	KILL_THREAD,
794 	KILL_PROCESS,
795 	RET_UNKNOWN
796 };
797 
798 /* Prepare a thread that will kill itself or both of us. */
799 void kill_thread_or_group(struct __test_metadata *_metadata,
800 			  enum kill_t kill_how)
801 {
802 	pthread_t thread;
803 	void *status;
804 	/* Kill only when calling __NR_prctl. */
805 	struct sock_filter filter_thread[] = {
806 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
807 			offsetof(struct seccomp_data, nr)),
808 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
809 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
810 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
811 	};
812 	struct sock_fprog prog_thread = {
813 		.len = (unsigned short)ARRAY_SIZE(filter_thread),
814 		.filter = filter_thread,
815 	};
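	/* RET_UNKNOWN installs an action value no SECCOMP_RET_* constant defines. */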
816 	int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA;
817 	struct sock_filter filter_process[] = {
818 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
819 			offsetof(struct seccomp_data, nr)),
820 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
821 		BPF_STMT(BPF_RET|BPF_K, kill),
822 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
823 	};
824 	struct sock_fprog prog_process = {
825 		.len = (unsigned short)ARRAY_SIZE(filter_process),
826 		.filter = filter_process,
827 	};
828 
829 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
830 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
831 	}
832 
833 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
834 			     kill_how == KILL_THREAD ? &prog_thread
835 						     : &prog_process));
836 
	/*
	 * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
	 * action cannot be downgraded by a later filter.
	 */
841 	if (kill_how == KILL_PROCESS)
842 		ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
843 
844 	/* Start a thread that will exit immediately. */
845 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
846 	ASSERT_EQ(0, pthread_join(thread, &status));
847 	ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
848 
849 	/* Start a thread that will die immediately. */
850 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
851 	ASSERT_EQ(0, pthread_join(thread, &status));
852 	ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
853 
854 	/*
855 	 * If we get here, only the spawned thread died. Let the parent know
856 	 * the whole process didn't die (i.e. this thread, the spawner,
857 	 * stayed running).
858 	 */
859 	exit(42);
860 }
861 
862 TEST(KILL_thread)
863 {
864 	int status;
865 	pid_t child_pid;
866 
867 	child_pid = fork();
868 	ASSERT_LE(0, child_pid);
869 	if (child_pid == 0) {
870 		kill_thread_or_group(_metadata, KILL_THREAD);
871 		_exit(38);
872 	}
873 
874 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
875 
876 	/* If only the thread was killed, we'll see exit 42. */
877 	ASSERT_TRUE(WIFEXITED(status));
878 	ASSERT_EQ(42, WEXITSTATUS(status));
879 }
880 
881 TEST(KILL_process)
882 {
883 	int status;
884 	pid_t child_pid;
885 
886 	child_pid = fork();
887 	ASSERT_LE(0, child_pid);
888 	if (child_pid == 0) {
889 		kill_thread_or_group(_metadata, KILL_PROCESS);
890 		_exit(38);
891 	}
892 
893 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
894 
895 	/* If the entire process was killed, we'll see SIGSYS. */
896 	ASSERT_TRUE(WIFSIGNALED(status));
897 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
898 }
899 
900 TEST(KILL_unknown)
901 {
902 	int status;
903 	pid_t child_pid;
904 
905 	child_pid = fork();
906 	ASSERT_LE(0, child_pid);
907 	if (child_pid == 0) {
908 		kill_thread_or_group(_metadata, RET_UNKNOWN);
909 		_exit(38);
910 	}
911 
912 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
913 
914 	/* If the entire process was killed, we'll see SIGSYS. */
915 	EXPECT_TRUE(WIFSIGNALED(status)) {
916 		TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
917 	}
918 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
919 }
920 
921 /* TODO(wad) add 64-bit versus 32-bit arg tests. */
922 TEST(arg_out_of_range)
923 {
924 	struct sock_filter filter[] = {
925 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
926 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
927 	};
928 	struct sock_fprog prog = {
929 		.len = (unsigned short)ARRAY_SIZE(filter),
930 		.filter = filter,
931 	};
932 	long ret;
933 
934 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
935 	ASSERT_EQ(0, ret);
936 
937 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
938 	EXPECT_EQ(-1, ret);
939 	EXPECT_EQ(EINVAL, errno);
940 }
941 
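/*
 * Build a filter that returns the given errno for read(2) and allows all
 * other syscalls, along with a matching struct sock_fprog named prog_<name>.
 */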
942 #define ERRNO_FILTER(name, errno)					\
943 	struct sock_filter _read_filter_##name[] = {			\
944 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,				\
945 			offsetof(struct seccomp_data, nr)),		\
946 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),	\
947 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno),	\
948 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),		\
949 	};								\
950 	struct sock_fprog prog_##name = {				\
951 		.len = (unsigned short)ARRAY_SIZE(_read_filter_##name),	\
952 		.filter = _read_filter_##name,				\
953 	}
954 
955 /* Make sure basic errno values are correctly passed through a filter. */
956 TEST(ERRNO_valid)
957 {
958 	ERRNO_FILTER(valid, E2BIG);
959 	long ret;
960 	pid_t parent = getppid();
961 
962 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
963 	ASSERT_EQ(0, ret);
964 
965 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
966 	ASSERT_EQ(0, ret);
967 
968 	EXPECT_EQ(parent, syscall(__NR_getppid));
969 	EXPECT_EQ(-1, read(-1, NULL, 0));
970 	EXPECT_EQ(E2BIG, errno);
971 }
972 
973 /* Make sure an errno of zero is correctly handled by the arch code. */
974 TEST(ERRNO_zero)
975 {
976 	ERRNO_FILTER(zero, 0);
977 	long ret;
978 	pid_t parent = getppid();
979 
980 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
981 	ASSERT_EQ(0, ret);
982 
983 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
984 	ASSERT_EQ(0, ret);
985 
986 	EXPECT_EQ(parent, syscall(__NR_getppid));
987 	/* "errno" of 0 is ok. */
988 	EXPECT_EQ(0, read(-1, NULL, 0));
989 }
990 
991 /*
992  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
993  * This tests that the errno value gets capped correctly, fixed by
994  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
995  */
996 TEST(ERRNO_capped)
997 {
998 	ERRNO_FILTER(capped, 4096);
999 	long ret;
1000 	pid_t parent = getppid();
1001 
1002 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1003 	ASSERT_EQ(0, ret);
1004 
1005 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
1006 	ASSERT_EQ(0, ret);
1007 
1008 	EXPECT_EQ(parent, syscall(__NR_getppid));
1009 	EXPECT_EQ(-1, read(-1, NULL, 0));
1010 	EXPECT_EQ(4095, errno);
1011 }
1012 
/*
 * Filters are evaluated in reverse order of installation: the most recently
 * applied filter runs first. Precedence is decided only on the
 * SECCOMP_RET_ACTION mask, so when several filters return the same action,
 * the SECCOMP_RET_DATA reported is that of the most recently applied
 * matching filter (not the lowest or highest data value).
 */
1019 TEST(ERRNO_order)
1020 {
1021 	ERRNO_FILTER(first,  11);
1022 	ERRNO_FILTER(second, 13);
1023 	ERRNO_FILTER(third,  12);
1024 	long ret;
1025 	pid_t parent = getppid();
1026 
1027 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1028 	ASSERT_EQ(0, ret);
1029 
1030 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
1031 	ASSERT_EQ(0, ret);
1032 
1033 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
1034 	ASSERT_EQ(0, ret);
1035 
1036 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
1037 	ASSERT_EQ(0, ret);
1038 
1039 	EXPECT_EQ(parent, syscall(__NR_getppid));
1040 	EXPECT_EQ(-1, read(-1, NULL, 0));
1041 	EXPECT_EQ(12, errno);
1042 }
1043 
1044 FIXTURE(TRAP) {
1045 	struct sock_fprog prog;
1046 };
1047 
1048 FIXTURE_SETUP(TRAP)
1049 {
1050 	struct sock_filter filter[] = {
1051 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1052 			offsetof(struct seccomp_data, nr)),
1053 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1054 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1055 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1056 	};
1057 
1058 	memset(&self->prog, 0, sizeof(self->prog));
1059 	self->prog.filter = malloc(sizeof(filter));
1060 	ASSERT_NE(NULL, self->prog.filter);
1061 	memcpy(self->prog.filter, filter, sizeof(filter));
1062 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1063 }
1064 
1065 FIXTURE_TEARDOWN(TRAP)
1066 {
1067 	if (self->prog.filter)
1068 		free(self->prog.filter);
1069 }
1070 
1071 TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
1072 {
1073 	long ret;
1074 
1075 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1076 	ASSERT_EQ(0, ret);
1077 
1078 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1079 	ASSERT_EQ(0, ret);
1080 	syscall(__NR_getpid);
1081 }
1082 
1083 /* Ensure that SIGSYS overrides SIG_IGN */
1084 TEST_F_SIGNAL(TRAP, ign, SIGSYS)
1085 {
1086 	long ret;
1087 
1088 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1089 	ASSERT_EQ(0, ret);
1090 
1091 	signal(SIGSYS, SIG_IGN);
1092 
1093 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1094 	ASSERT_EQ(0, ret);
1095 	syscall(__NR_getpid);
1096 }
1097 
1098 static siginfo_t TRAP_info;
1099 static volatile int TRAP_nr;
1100 static void TRAP_action(int nr, siginfo_t *info, void *void_context)
1101 {
1102 	memcpy(&TRAP_info, info, sizeof(TRAP_info));
1103 	TRAP_nr = nr;
1104 }
1105 
1106 TEST_F(TRAP, handler)
1107 {
1108 	int ret, test;
1109 	struct sigaction act;
1110 	sigset_t mask;
1111 
1112 	memset(&act, 0, sizeof(act));
1113 	sigemptyset(&mask);
1114 	sigaddset(&mask, SIGSYS);
1115 
1116 	act.sa_sigaction = &TRAP_action;
1117 	act.sa_flags = SA_SIGINFO;
1118 	ret = sigaction(SIGSYS, &act, NULL);
1119 	ASSERT_EQ(0, ret) {
1120 		TH_LOG("sigaction failed");
1121 	}
1122 	ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
1123 	ASSERT_EQ(0, ret) {
1124 		TH_LOG("sigprocmask failed");
1125 	}
1126 
1127 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1128 	ASSERT_EQ(0, ret);
1129 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1130 	ASSERT_EQ(0, ret);
1131 	TRAP_nr = 0;
1132 	memset(&TRAP_info, 0, sizeof(TRAP_info));
	/*
	 * Expect the registers to be rolled back. (nr = error) may vary
	 * based on arch.
	 */
1135 	ret = syscall(__NR_getpid);
1136 	/* Silence gcc warning about volatile. */
1137 	test = TRAP_nr;
1138 	EXPECT_EQ(SIGSYS, test);
1139 	struct local_sigsys {
1140 		void *_call_addr;	/* calling user insn */
1141 		int _syscall;		/* triggering system call number */
1142 		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
1143 	} *sigsys = (struct local_sigsys *)
1144 #ifdef si_syscall
1145 		&(TRAP_info.si_call_addr);
1146 #else
1147 		&TRAP_info.si_pid;
1148 #endif
1149 	EXPECT_EQ(__NR_getpid, sigsys->_syscall);
1150 	/* Make sure arch is non-zero. */
1151 	EXPECT_NE(0, sigsys->_arch);
1152 	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
1153 }
1154 
1155 FIXTURE(precedence) {
1156 	struct sock_fprog allow;
1157 	struct sock_fprog log;
1158 	struct sock_fprog trace;
1159 	struct sock_fprog error;
1160 	struct sock_fprog trap;
1161 	struct sock_fprog kill;
1162 };
1163 
1164 FIXTURE_SETUP(precedence)
1165 {
1166 	struct sock_filter allow_insns[] = {
1167 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1168 	};
1169 	struct sock_filter log_insns[] = {
1170 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1171 			offsetof(struct seccomp_data, nr)),
1172 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1173 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1174 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1175 	};
1176 	struct sock_filter trace_insns[] = {
1177 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1178 			offsetof(struct seccomp_data, nr)),
1179 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1180 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1181 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1182 	};
1183 	struct sock_filter error_insns[] = {
1184 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1185 			offsetof(struct seccomp_data, nr)),
1186 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1187 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1188 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1189 	};
1190 	struct sock_filter trap_insns[] = {
1191 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1192 			offsetof(struct seccomp_data, nr)),
1193 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1194 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1195 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1196 	};
1197 	struct sock_filter kill_insns[] = {
1198 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1199 			offsetof(struct seccomp_data, nr)),
1200 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1201 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1202 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1203 	};
1204 
1205 	memset(self, 0, sizeof(*self));
1206 #define FILTER_ALLOC(_x) \
1207 	self->_x.filter = malloc(sizeof(_x##_insns)); \
1208 	ASSERT_NE(NULL, self->_x.filter); \
1209 	memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1210 	self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1211 	FILTER_ALLOC(allow);
1212 	FILTER_ALLOC(log);
1213 	FILTER_ALLOC(trace);
1214 	FILTER_ALLOC(error);
1215 	FILTER_ALLOC(trap);
1216 	FILTER_ALLOC(kill);
1217 }
1218 
1219 FIXTURE_TEARDOWN(precedence)
1220 {
1221 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1222 	FILTER_FREE(allow);
1223 	FILTER_FREE(log);
1224 	FILTER_FREE(trace);
1225 	FILTER_FREE(error);
1226 	FILTER_FREE(trap);
1227 	FILTER_FREE(kill);
1228 }
1229 
1230 TEST_F(precedence, allow_ok)
1231 {
1232 	pid_t parent, res = 0;
1233 	long ret;
1234 
1235 	parent = getppid();
1236 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1237 	ASSERT_EQ(0, ret);
1238 
1239 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1240 	ASSERT_EQ(0, ret);
1241 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1242 	ASSERT_EQ(0, ret);
1243 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1244 	ASSERT_EQ(0, ret);
1245 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1246 	ASSERT_EQ(0, ret);
1247 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1248 	ASSERT_EQ(0, ret);
1249 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1250 	ASSERT_EQ(0, ret);
1251 	/* Should work just fine. */
1252 	res = syscall(__NR_getppid);
1253 	EXPECT_EQ(parent, res);
1254 }
1255 
1256 TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1257 {
1258 	pid_t parent, res = 0;
1259 	long ret;
1260 
1261 	parent = getppid();
1262 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1263 	ASSERT_EQ(0, ret);
1264 
1265 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1266 	ASSERT_EQ(0, ret);
1267 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1268 	ASSERT_EQ(0, ret);
1269 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1270 	ASSERT_EQ(0, ret);
1271 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1272 	ASSERT_EQ(0, ret);
1273 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1274 	ASSERT_EQ(0, ret);
1275 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1276 	ASSERT_EQ(0, ret);
1277 	/* Should work just fine. */
1278 	res = syscall(__NR_getppid);
1279 	EXPECT_EQ(parent, res);
1280 	/* getpid() should never return. */
1281 	res = syscall(__NR_getpid);
1282 	EXPECT_EQ(0, res);
1283 }
1284 
1285 TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1286 {
1287 	pid_t parent;
1288 	long ret;
1289 
1290 	parent = getppid();
1291 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1292 	ASSERT_EQ(0, ret);
1293 
1294 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1295 	ASSERT_EQ(0, ret);
1296 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1297 	ASSERT_EQ(0, ret);
1298 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1299 	ASSERT_EQ(0, ret);
1300 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1301 	ASSERT_EQ(0, ret);
1302 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1303 	ASSERT_EQ(0, ret);
1304 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1305 	ASSERT_EQ(0, ret);
1306 	/* Should work just fine. */
1307 	EXPECT_EQ(parent, syscall(__NR_getppid));
1308 	/* getpid() should never return. */
1309 	EXPECT_EQ(0, syscall(__NR_getpid));
1310 }
1311 
1312 TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1313 {
1314 	pid_t parent;
1315 	long ret;
1316 
1317 	parent = getppid();
1318 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1319 	ASSERT_EQ(0, ret);
1320 
1321 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1322 	ASSERT_EQ(0, ret);
1323 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1324 	ASSERT_EQ(0, ret);
1325 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1326 	ASSERT_EQ(0, ret);
1327 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1328 	ASSERT_EQ(0, ret);
1329 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1330 	ASSERT_EQ(0, ret);
1331 	/* Should work just fine. */
1332 	EXPECT_EQ(parent, syscall(__NR_getppid));
1333 	/* getpid() should never return. */
1334 	EXPECT_EQ(0, syscall(__NR_getpid));
1335 }
1336 
1337 TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1338 {
1339 	pid_t parent;
1340 	long ret;
1341 
1342 	parent = getppid();
1343 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1344 	ASSERT_EQ(0, ret);
1345 
1346 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1347 	ASSERT_EQ(0, ret);
1348 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1349 	ASSERT_EQ(0, ret);
1350 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1351 	ASSERT_EQ(0, ret);
1352 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1353 	ASSERT_EQ(0, ret);
1354 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1355 	ASSERT_EQ(0, ret);
1356 	/* Should work just fine. */
1357 	EXPECT_EQ(parent, syscall(__NR_getppid));
1358 	/* getpid() should never return. */
1359 	EXPECT_EQ(0, syscall(__NR_getpid));
1360 }
1361 
1362 TEST_F(precedence, errno_is_third)
1363 {
1364 	pid_t parent;
1365 	long ret;
1366 
1367 	parent = getppid();
1368 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1369 	ASSERT_EQ(0, ret);
1370 
1371 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1372 	ASSERT_EQ(0, ret);
1373 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1374 	ASSERT_EQ(0, ret);
1375 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1376 	ASSERT_EQ(0, ret);
1377 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1378 	ASSERT_EQ(0, ret);
1379 	/* Should work just fine. */
1380 	EXPECT_EQ(parent, syscall(__NR_getppid));
1381 	EXPECT_EQ(0, syscall(__NR_getpid));
1382 }
1383 
1384 TEST_F(precedence, errno_is_third_in_any_order)
1385 {
1386 	pid_t parent;
1387 	long ret;
1388 
1389 	parent = getppid();
1390 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1391 	ASSERT_EQ(0, ret);
1392 
1393 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1394 	ASSERT_EQ(0, ret);
1395 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1396 	ASSERT_EQ(0, ret);
1397 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1398 	ASSERT_EQ(0, ret);
1399 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1400 	ASSERT_EQ(0, ret);
1401 	/* Should work just fine. */
1402 	EXPECT_EQ(parent, syscall(__NR_getppid));
1403 	EXPECT_EQ(0, syscall(__NR_getpid));
1404 }
1405 
1406 TEST_F(precedence, trace_is_fourth)
1407 {
1408 	pid_t parent;
1409 	long ret;
1410 
1411 	parent = getppid();
1412 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1413 	ASSERT_EQ(0, ret);
1414 
1415 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1416 	ASSERT_EQ(0, ret);
1417 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1418 	ASSERT_EQ(0, ret);
1419 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1420 	ASSERT_EQ(0, ret);
1421 	/* Should work just fine. */
1422 	EXPECT_EQ(parent, syscall(__NR_getppid));
1423 	/* No ptracer */
1424 	EXPECT_EQ(-1, syscall(__NR_getpid));
1425 }
1426 
1427 TEST_F(precedence, trace_is_fourth_in_any_order)
1428 {
1429 	pid_t parent;
1430 	long ret;
1431 
1432 	parent = getppid();
1433 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1434 	ASSERT_EQ(0, ret);
1435 
1436 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1437 	ASSERT_EQ(0, ret);
1438 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1439 	ASSERT_EQ(0, ret);
1440 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1441 	ASSERT_EQ(0, ret);
1442 	/* Should work just fine. */
1443 	EXPECT_EQ(parent, syscall(__NR_getppid));
1444 	/* No ptracer */
1445 	EXPECT_EQ(-1, syscall(__NR_getpid));
1446 }
1447 
1448 TEST_F(precedence, log_is_fifth)
1449 {
1450 	pid_t mypid, parent;
1451 	long ret;
1452 
1453 	mypid = getpid();
1454 	parent = getppid();
1455 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1456 	ASSERT_EQ(0, ret);
1457 
1458 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1459 	ASSERT_EQ(0, ret);
1460 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1461 	ASSERT_EQ(0, ret);
1462 	/* Should work just fine. */
1463 	EXPECT_EQ(parent, syscall(__NR_getppid));
1464 	/* Should also work just fine */
1465 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1466 }
1467 
1468 TEST_F(precedence, log_is_fifth_in_any_order)
1469 {
1470 	pid_t mypid, parent;
1471 	long ret;
1472 
1473 	mypid = getpid();
1474 	parent = getppid();
1475 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1476 	ASSERT_EQ(0, ret);
1477 
1478 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1479 	ASSERT_EQ(0, ret);
1480 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1481 	ASSERT_EQ(0, ret);
1482 	/* Should work just fine. */
1483 	EXPECT_EQ(parent, syscall(__NR_getppid));
1484 	/* Should also work just fine */
1485 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1486 }
1487 
1488 #ifndef PTRACE_O_TRACESECCOMP
1489 #define PTRACE_O_TRACESECCOMP	0x00000080
1490 #endif
1491 
1492 /* Catch the Ubuntu 12.04 value error. */
1493 #if PTRACE_EVENT_SECCOMP != 7
1494 #undef PTRACE_EVENT_SECCOMP
1495 #endif
1496 
1497 #ifndef PTRACE_EVENT_SECCOMP
1498 #define PTRACE_EVENT_SECCOMP 7
1499 #endif
1500 
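/* ptrace packs event codes into bits 16 and up of the wait status. */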
1501 #define PTRACE_EVENT_MASK(status) ((status) >> 16)
1502 bool tracer_running;
1503 void tracer_stop(int sig)
1504 {
1505 	tracer_running = false;
1506 }
1507 
1508 typedef void tracer_func_t(struct __test_metadata *_metadata,
1509 			   pid_t tracee, int status, void *args);
1510 
1511 void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
1512 	    tracer_func_t tracer_func, void *args, bool ptrace_syscall)
1513 {
1514 	int ret = -1;
1515 	struct sigaction action = {
1516 		.sa_handler = tracer_stop,
1517 	};
1518 
1519 	/* Allow external shutdown. */
1520 	tracer_running = true;
1521 	ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
1522 
1523 	errno = 0;
1524 	while (ret == -1 && errno != EINVAL)
1525 		ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
1526 	ASSERT_EQ(0, ret) {
1527 		kill(tracee, SIGKILL);
1528 	}
1529 	/* Wait for attach stop */
1530 	wait(NULL);
1531 
1532 	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
1533 						      PTRACE_O_TRACESYSGOOD :
1534 						      PTRACE_O_TRACESECCOMP);
1535 	ASSERT_EQ(0, ret) {
1536 		TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
1537 		kill(tracee, SIGKILL);
1538 	}
1539 	ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1540 		     tracee, NULL, 0);
1541 	ASSERT_EQ(0, ret);
1542 
1543 	/* Unblock the tracee */
1544 	ASSERT_EQ(1, write(fd, "A", 1));
1545 	ASSERT_EQ(0, close(fd));
1546 
1547 	/* Run until we're shut down. Must assert to stop execution. */
1548 	while (tracer_running) {
1549 		int status;
1550 
1551 		if (wait(&status) != tracee)
1552 			continue;
1553 
1554 		if (WIFSIGNALED(status)) {
1555 			/* Child caught a fatal signal. */
1556 			return;
1557 		}
1558 		if (WIFEXITED(status)) {
1559 			/* Child exited with code. */
1560 			return;
1561 		}
1562 
1563 		/* Check if we got an expected event. */
1564 		ASSERT_EQ(WIFCONTINUED(status), false);
1565 		ASSERT_EQ(WIFSTOPPED(status), true);
1566 		ASSERT_EQ(WSTOPSIG(status) & SIGTRAP, SIGTRAP) {
1567 			TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
1568 		}
1569 
1570 		tracer_func(_metadata, tracee, status, args);
1571 
1572 		ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1573 			     tracee, NULL, 0);
1574 		ASSERT_EQ(0, ret);
1575 	}
1576 	/* Directly report the status of our test harness results. */
1577 	syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
1578 }
1579 
1580 /* Common tracer setup/teardown functions. */
1581 void cont_handler(int num)
1582 { }
1583 pid_t setup_trace_fixture(struct __test_metadata *_metadata,
1584 			  tracer_func_t func, void *args, bool ptrace_syscall)
1585 {
1586 	char sync;
1587 	int pipefd[2];
1588 	pid_t tracer_pid;
1589 	pid_t tracee = getpid();
1590 
1591 	/* Setup a pipe for clean synchronization. */
1592 	ASSERT_EQ(0, pipe(pipefd));
1593 
1594 	/* Fork a child which we'll promote to tracer */
1595 	tracer_pid = fork();
1596 	ASSERT_LE(0, tracer_pid);
1597 	signal(SIGALRM, cont_handler);
1598 	if (tracer_pid == 0) {
1599 		close(pipefd[0]);
1600 		start_tracer(_metadata, pipefd[1], tracee, func, args,
1601 			     ptrace_syscall);
1602 		syscall(__NR_exit, 0);
1603 	}
1604 	close(pipefd[1]);
1605 	prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
1606 	read(pipefd[0], &sync, 1);
1607 	close(pipefd[0]);
1608 
1609 	return tracer_pid;
1610 }
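/*
 * Typical use (see the TRACE_poke fixture below): call setup_trace_fixture()
 * with a tracer_func_t from the tracee, run the traced syscalls, then hand
 * the returned pid to teardown_trace_fixture() to stop the tracer and adopt
 * its exit status.
 */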
1611 
1612 void teardown_trace_fixture(struct __test_metadata *_metadata,
1613 			    pid_t tracer)
1614 {
1615 	if (tracer) {
1616 		int status;
1617 		/*
1618 		 * Extract the exit code from the other process and
1619 		 * adopt it for ourselves in case its asserts failed.
1620 		 */
1621 		ASSERT_EQ(0, kill(tracer, SIGUSR1));
1622 		ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1623 		if (WEXITSTATUS(status))
1624 			_metadata->passed = 0;
1625 	}
1626 }
1627 
1628 /* "poke" tracer arguments and function. */
1629 struct tracer_args_poke_t {
1630 	unsigned long poke_addr;
1631 };
1632 
1633 void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1634 		 void *args)
1635 {
1636 	int ret;
1637 	unsigned long msg;
1638 	struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1639 
1640 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1641 	EXPECT_EQ(0, ret);
1642 	/* If this fails, don't try to recover. */
1643 	ASSERT_EQ(0x1001, msg) {
1644 		kill(tracee, SIGKILL);
1645 	}
1646 	/*
1647 	 * Poke in the message.
1648 	 * Registers are not touched to try to keep this relatively arch
1649 	 * agnostic.
1650 	 */
1651 	ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1652 	EXPECT_EQ(0, ret);
1653 }
1654 
1655 FIXTURE(TRACE_poke) {
1656 	struct sock_fprog prog;
1657 	pid_t tracer;
1658 	long poked;
1659 	struct tracer_args_poke_t tracer_args;
1660 };
1661 
1662 FIXTURE_SETUP(TRACE_poke)
1663 {
1664 	struct sock_filter filter[] = {
1665 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1666 			offsetof(struct seccomp_data, nr)),
1667 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1668 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1669 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1670 	};
1671 
1672 	self->poked = 0;
1673 	memset(&self->prog, 0, sizeof(self->prog));
1674 	self->prog.filter = malloc(sizeof(filter));
1675 	ASSERT_NE(NULL, self->prog.filter);
1676 	memcpy(self->prog.filter, filter, sizeof(filter));
1677 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1678 
1679 	/* Set up tracer args. */
1680 	self->tracer_args.poke_addr = (unsigned long)&self->poked;
1681 
1682 	/* Launch tracer. */
1683 	self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1684 					   &self->tracer_args, false);
1685 }
1686 
1687 FIXTURE_TEARDOWN(TRACE_poke)
1688 {
1689 	teardown_trace_fixture(_metadata, self->tracer);
1690 	if (self->prog.filter)
1691 		free(self->prog.filter);
1692 }
1693 
1694 TEST_F(TRACE_poke, read_has_side_effects)
1695 {
1696 	ssize_t ret;
1697 
1698 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1699 	ASSERT_EQ(0, ret);
1700 
1701 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1702 	ASSERT_EQ(0, ret);
1703 
1704 	EXPECT_EQ(0, self->poked);
1705 	ret = read(-1, NULL, 0);
1706 	EXPECT_EQ(-1, ret);
1707 	EXPECT_EQ(0x1001, self->poked);
1708 }
1709 
1710 TEST_F(TRACE_poke, getpid_runs_normally)
1711 {
1712 	long ret;
1713 
1714 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1715 	ASSERT_EQ(0, ret);
1716 
1717 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1718 	ASSERT_EQ(0, ret);
1719 
1720 	EXPECT_EQ(0, self->poked);
1721 	EXPECT_NE(0, syscall(__NR_getpid));
1722 	EXPECT_EQ(0, self->poked);
1723 }
1724 
1725 #if defined(__x86_64__)
1726 # define ARCH_REGS		struct user_regs_struct
1727 # define SYSCALL_NUM(_regs)	(_regs).orig_rax
1728 # define SYSCALL_RET(_regs)	(_regs).rax
1729 #elif defined(__i386__)
1730 # define ARCH_REGS		struct user_regs_struct
1731 # define SYSCALL_NUM(_regs)	(_regs).orig_eax
1732 # define SYSCALL_RET(_regs)	(_regs).eax
1733 #elif defined(__arm__)
1734 # define ARCH_REGS		struct pt_regs
1735 # define SYSCALL_NUM(_regs)	(_regs).ARM_r7
1736 # ifndef PTRACE_SET_SYSCALL
1737 #  define PTRACE_SET_SYSCALL   23
1738 # endif
1739 # define SYSCALL_NUM_SET(_regs, _nr)	\
1740 		EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
1741 # define SYSCALL_RET(_regs)	(_regs).ARM_r0
1742 #elif defined(__aarch64__)
1743 # define ARCH_REGS		struct user_pt_regs
1744 # define SYSCALL_NUM(_regs)	(_regs).regs[8]
1745 # ifndef NT_ARM_SYSTEM_CALL
1746 #  define NT_ARM_SYSTEM_CALL 0x404
1747 # endif
1748 # define SYSCALL_NUM_SET(_regs, _nr)				\
1749 	do {							\
1750 		struct iovec __v;				\
1751 		typeof(_nr) __nr = (_nr);			\
1752 		__v.iov_base = &__nr;				\
1753 		__v.iov_len = sizeof(__nr);			\
1754 		EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee,	\
1755 				    NT_ARM_SYSTEM_CALL, &__v));	\
1756 	} while (0)
1757 # define SYSCALL_RET(_regs)	(_regs).regs[0]
1758 #elif defined(__riscv) && __riscv_xlen == 64
1759 # define ARCH_REGS		struct user_regs_struct
1760 # define SYSCALL_NUM(_regs)	(_regs).a7
1761 # define SYSCALL_RET(_regs)	(_regs).a0
1762 #elif defined(__csky__)
1763 # define ARCH_REGS		struct pt_regs
1764 #  if defined(__CSKYABIV2__)
1765 #   define SYSCALL_NUM(_regs)	(_regs).regs[3]
1766 #  else
1767 #   define SYSCALL_NUM(_regs)	(_regs).regs[9]
1768 #  endif
1769 # define SYSCALL_RET(_regs)	(_regs).a0
1770 #elif defined(__hppa__)
1771 # define ARCH_REGS		struct user_regs_struct
1772 # define SYSCALL_NUM(_regs)	(_regs).gr[20]
1773 # define SYSCALL_RET(_regs)	(_regs).gr[28]
1774 #elif defined(__powerpc__)
1775 # define ARCH_REGS		struct pt_regs
1776 # define SYSCALL_NUM(_regs)	(_regs).gpr[0]
1777 # define SYSCALL_RET(_regs)	(_regs).gpr[3]
1778 # define SYSCALL_RET_SET(_regs, _val)				\
1779 	do {							\
1780 		typeof(_val) _result = (_val);			\
1781 		if ((_regs.trap & 0xfff0) == 0x3000) {		\
1782 			/*					\
1783 			 * scv 0 system call uses -ve result	\
1784 			 * for error, so no need to adjust.	\
1785 			 */					\
1786 			SYSCALL_RET(_regs) = _result;		\
1787 		} else {					\
1788 			/*					\
1789 			 * A syscall error is signaled by the	\
1790 			 * CR0 SO bit and the code is stored as	\
1791 			 * a positive value.			\
1792 			 */					\
1793 			if (_result < 0) {			\
1794 				SYSCALL_RET(_regs) = -_result;	\
1795 				(_regs).ccr |= 0x10000000;	\
1796 			} else {				\
1797 				SYSCALL_RET(_regs) = _result;	\
1798 				(_regs).ccr &= ~0x10000000;	\
1799 			}					\
1800 		}						\
1801 	} while (0)
1802 # define SYSCALL_RET_SET_ON_PTRACE_EXIT
1803 #elif defined(__s390__)
1804 # define ARCH_REGS		s390_regs
1805 # define SYSCALL_NUM(_regs)	(_regs).gprs[2]
1806 # define SYSCALL_RET_SET(_regs, _val)			\
1807 		TH_LOG("Can't modify syscall return on this architecture")
1808 #elif defined(__mips__)
1809 # include <asm/unistd_nr_n32.h>
1810 # include <asm/unistd_nr_n64.h>
1811 # include <asm/unistd_nr_o32.h>
1812 # define ARCH_REGS		struct pt_regs
1813 # define SYSCALL_NUM(_regs)				\
1814 	({						\
1815 		typeof((_regs).regs[2]) _nr;		\
1816 		if ((_regs).regs[2] == __NR_O32_Linux)	\
1817 			_nr = (_regs).regs[4];		\
1818 		else					\
1819 			_nr = (_regs).regs[2];		\
1820 		_nr;					\
1821 	})
1822 # define SYSCALL_NUM_SET(_regs, _nr)			\
1823 	do {						\
1824 		if ((_regs).regs[2] == __NR_O32_Linux)	\
1825 			(_regs).regs[4] = _nr;		\
1826 		else					\
1827 			(_regs).regs[2] = _nr;		\
1828 	} while (0)
1829 # define SYSCALL_RET_SET(_regs, _val)			\
1830 		TH_LOG("Can't modify syscall return on this architecture")
1831 #elif defined(__xtensa__)
1832 # define ARCH_REGS		struct user_pt_regs
1833 # define SYSCALL_NUM(_regs)	(_regs).syscall
/*
 * On xtensa the syscall return value is in register a2 of the current
 * register window, whose base is not fixed.
 */
# define SYSCALL_RET(_regs)	(_regs).a[(_regs).windowbase * 4 + 2]
1839 #elif defined(__sh__)
1840 # define ARCH_REGS		struct pt_regs
1841 # define SYSCALL_NUM(_regs)	(_regs).regs[3]
1842 # define SYSCALL_RET(_regs)	(_regs).regs[0]
1843 #elif defined(__mc68000__)
1844 # define ARCH_REGS		struct user_regs_struct
1845 # define SYSCALL_NUM(_regs)	(_regs).orig_d0
1846 # define SYSCALL_RET(_regs)	(_regs).d0
1847 #else
1848 # error "Do not know how to find your architecture's registers and syscalls"
1849 #endif
1850 
1851 /*
1852  * Most architectures can change the syscall by just updating the
1853  * associated register. This is the default if not defined above.
1854  */
1855 #ifndef SYSCALL_NUM_SET
1856 # define SYSCALL_NUM_SET(_regs, _nr)		\
1857 	do {					\
1858 		SYSCALL_NUM(_regs) = (_nr);	\
1859 	} while (0)
1860 #endif
1861 /*
1862  * Most architectures can change the syscall return value by just
1863  * writing to the SYSCALL_RET register. This is the default if not
1864  * defined above. If an architecture cannot set the return value
1865  * (for example when the syscall and return value register is
1866  * shared), report it with TH_LOG() in an arch-specific definition
1867  * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
1868  */
1869 #if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
1870 # error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
1871 #endif
1872 #ifndef SYSCALL_RET_SET
1873 # define SYSCALL_RET_SET(_regs, _val)		\
1874 	do {					\
1875 		SYSCALL_RET(_regs) = (_val);	\
1876 	} while (0)
1877 #endif
1878 
1879 /* When the syscall return can't be changed, stub out the tests for it. */
1880 #ifndef SYSCALL_RET
1881 # define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
1882 #else
1883 # define EXPECT_SYSCALL_RETURN(val, action)		\
1884 	do {						\
1885 		errno = 0;				\
1886 		if (val < 0) {				\
1887 			EXPECT_EQ(-1, action);		\
1888 			EXPECT_EQ(-(val), errno);	\
1889 		} else {				\
1890 			EXPECT_EQ(val, action);		\
1891 		}					\
1892 	} while (0)
1893 #endif
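/*
 * EXPECT_SYSCALL_RETURN() follows the raw syscall convention: a negative
 * "val" means the call should fail, returning -1 with errno set to -val;
 * otherwise "val" itself is the expected return value.
 */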
1894 
1895 /*
1896  * Some architectures (e.g. powerpc) can only set syscall
1897  * return values on syscall exit during ptrace.
1898  */
1899 const bool ptrace_entry_set_syscall_nr = true;
1900 const bool ptrace_entry_set_syscall_ret =
1901 #ifndef SYSCALL_RET_SET_ON_PTRACE_EXIT
1902 	true;
1903 #else
1904 	false;
1905 #endif
1906 
1907 /*
1908  * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
1909  * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
1910  */
1911 #if defined(__x86_64__) || defined(__i386__) || defined(__mips__) || defined(__mc68000__)
1912 # define ARCH_GETREGS(_regs)	ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
1913 # define ARCH_SETREGS(_regs)	ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
1914 #else
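/* Fall back to the generic regset interface (NT_PRSTATUS). */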
1915 # define ARCH_GETREGS(_regs)	({					\
1916 		struct iovec __v;					\
1917 		__v.iov_base = &(_regs);				\
1918 		__v.iov_len = sizeof(_regs);				\
1919 		ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v);	\
1920 	})
1921 # define ARCH_SETREGS(_regs)	({					\
1922 		struct iovec __v;					\
1923 		__v.iov_base = &(_regs);				\
1924 		__v.iov_len = sizeof(_regs);				\
1925 		ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v);	\
1926 	})
1927 #endif
1928 
1929 /* Architecture-specific syscall fetching routine. */
1930 int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1931 {
1932 	ARCH_REGS regs;
1933 
1934 	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1935 		return -1;
1936 	}
1937 
1938 	return SYSCALL_NUM(regs);
1939 }
1940 
1941 /* Architecture-specific syscall changing routine. */
1942 void __change_syscall(struct __test_metadata *_metadata,
1943 		    pid_t tracee, long *syscall, long *ret)
1944 {
1945 	ARCH_REGS orig, regs;
1946 
1947 	/* Do not get/set registers if we have nothing to do. */
1948 	if (!syscall && !ret)
1949 		return;
1950 
1951 	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1952 		return;
1953 	}
1954 	orig = regs;
1955 
1956 	if (syscall)
1957 		SYSCALL_NUM_SET(regs, *syscall);
1958 
1959 	if (ret)
1960 		SYSCALL_RET_SET(regs, *ret);
1961 
1962 	/* Flush any register changes made. */
1963 	if (memcmp(&orig, &regs, sizeof(orig)) != 0)
1964 		EXPECT_EQ(0, ARCH_SETREGS(regs));
1965 }
1966 
1967 /* Change only syscall number. */
1968 void change_syscall_nr(struct __test_metadata *_metadata,
1969 		       pid_t tracee, long syscall)
1970 {
1971 	__change_syscall(_metadata, tracee, &syscall, NULL);
1972 }
1973 
1974 /* Change syscall return value (and set syscall number to -1). */
1975 void change_syscall_ret(struct __test_metadata *_metadata,
1976 			pid_t tracee, long ret)
1977 {
1978 	long syscall = -1;
1979 
1980 	__change_syscall(_metadata, tracee, &syscall, &ret);
1981 }
1982 
1983 void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
1984 		    int status, void *args)
1985 {
1986 	int ret;
1987 	unsigned long msg;
1988 
1989 	EXPECT_EQ(PTRACE_EVENT_MASK(status), PTRACE_EVENT_SECCOMP) {
1990 		TH_LOG("Unexpected ptrace event: %d", PTRACE_EVENT_MASK(status));
1991 		return;
1992 	}
1993 
1994 	/* Make sure we got the right message. */
1995 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1996 	EXPECT_EQ(0, ret);
1997 
1998 	/* Validate and take action on expected syscalls. */
1999 	switch (msg) {
2000 	case 0x1002:
2001 		/* change getpid to getppid. */
2002 		EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
2003 		change_syscall_nr(_metadata, tracee, __NR_getppid);
2004 		break;
2005 	case 0x1003:
2006 		/* skip gettid with valid return code. */
2007 		EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
2008 		change_syscall_ret(_metadata, tracee, 45000);
2009 		break;
2010 	case 0x1004:
2011 		/* skip openat with error. */
2012 		EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
2013 		change_syscall_ret(_metadata, tracee, -ESRCH);
2014 		break;
2015 	case 0x1005:
2016 		/* do nothing (allow getppid) */
2017 		EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
2018 		break;
2019 	default:
2020 		EXPECT_EQ(0, msg) {
2021 			TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
2022 			kill(tracee, SIGKILL);
2023 		}
2024 	}
2025 
2026 }
2027 
2028 FIXTURE(TRACE_syscall) {
2029 	struct sock_fprog prog;
2030 	pid_t tracer, mytid, mypid, parent;
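	/* Saved at syscall entry by tracer_ptrace() for use at exit. */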
2031 	long syscall_nr;
2032 };
2033 
2034 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
2035 		   int status, void *args)
2036 {
2037 	int ret;
2038 	unsigned long msg;
2039 	static bool entry;
2040 	long syscall_nr_val, syscall_ret_val;
2041 	long *syscall_nr = NULL, *syscall_ret = NULL;
2042 	FIXTURE_DATA(TRACE_syscall) *self = args;
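	/*
	 * PTRACE_O_TRACESYSGOOD marks syscall stops with SIGTRAP | 0x80,
	 * distinguishing them from ordinary SIGTRAP stops; entry vs. exit
	 * is tracked by counting and cross-checked via PTRACE_GETEVENTMSG.
	 */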
2043 
2044 	EXPECT_EQ(WSTOPSIG(status) & 0x80, 0x80) {
2045 		TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
2046 		return;
2047 	}
2048 
2049 	/*
2050 	 * The traditional way to tell PTRACE_SYSCALL entry/exit
2051 	 * is by counting.
2052 	 */
2053 	entry = !entry;
2054 
2055 	/* Make sure we got an appropriate message. */
2056 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2057 	EXPECT_EQ(0, ret);
2058 	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
2059 			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
2060 
2061 	/*
2062 	 * Some architectures only support setting return values during
2063 	 * syscall exit under ptrace, and on exit the syscall number may
	 * no longer be available. Therefore, save the initial syscall
2065 	 * number here, so it can be examined during both entry and exit
2066 	 * phases.
2067 	 */
2068 	if (entry)
2069 		self->syscall_nr = get_syscall(_metadata, tracee);
2070 
2071 	/*
2072 	 * Depending on the architecture's syscall setting abilities, we
2073 	 * pick which things to set during this phase (entry or exit).
2074 	 */
2075 	if (entry == ptrace_entry_set_syscall_nr)
2076 		syscall_nr = &syscall_nr_val;
2077 	if (entry == ptrace_entry_set_syscall_ret)
2078 		syscall_ret = &syscall_ret_val;
2079 
2080 	/* Now handle the actual rewriting cases. */
2081 	switch (self->syscall_nr) {
2082 	case __NR_getpid:
2083 		syscall_nr_val = __NR_getppid;
2084 		/* Never change syscall return for this case. */
2085 		syscall_ret = NULL;
2086 		break;
2087 	case __NR_gettid:
2088 		syscall_nr_val = -1;
2089 		syscall_ret_val = 45000;
2090 		break;
2091 	case __NR_openat:
2092 		syscall_nr_val = -1;
2093 		syscall_ret_val = -ESRCH;
2094 		break;
2095 	default:
2096 		/* Unhandled, do nothing. */
2097 		return;
2098 	}
2099 
2100 	__change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
2101 }
2102 
2103 FIXTURE_VARIANT(TRACE_syscall) {
2104 	/*
2105 	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
2106 	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
	 * This indicates whether we should use SECCOMP_RET_TRACE (false),
	 * or plain ptrace (true).
2109 	 */
2110 	bool use_ptrace;
2111 };
2112 
2113 FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
2114 	.use_ptrace = true,
2115 };
2116 
2117 FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
2118 	.use_ptrace = false,
2119 };
2120 
2121 FIXTURE_SETUP(TRACE_syscall)
2122 {
2123 	struct sock_filter filter[] = {
2124 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2125 			offsetof(struct seccomp_data, nr)),
2126 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
2127 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
2128 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
2129 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
2130 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
2131 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
2132 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2133 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
2134 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2135 	};
2136 	struct sock_fprog prog = {
2137 		.len = (unsigned short)ARRAY_SIZE(filter),
2138 		.filter = filter,
2139 	};
2140 	long ret;
2141 
2142 	/* Prepare some testable syscall results. */
2143 	self->mytid = syscall(__NR_gettid);
2144 	ASSERT_GT(self->mytid, 0);
2145 	ASSERT_NE(self->mytid, 1) {
2146 		TH_LOG("Running this test as init is not supported. :)");
2147 	}
2148 
2149 	self->mypid = getpid();
2150 	ASSERT_GT(self->mypid, 0);
2151 	ASSERT_EQ(self->mytid, self->mypid);
2152 
2153 	self->parent = getppid();
2154 	ASSERT_GT(self->parent, 0);
2155 	ASSERT_NE(self->parent, self->mypid);
2156 
2157 	/* Launch tracer. */
2158 	self->tracer = setup_trace_fixture(_metadata,
2159 					   variant->use_ptrace ? tracer_ptrace
2160 							       : tracer_seccomp,
2161 					   self, variant->use_ptrace);
2162 
2163 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2164 	ASSERT_EQ(0, ret);
2165 
2166 	/* Do not install seccomp rewrite filters, as we'll use ptrace instead. */
2167 	if (variant->use_ptrace)
2168 		return;
2169 
2170 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2171 	ASSERT_EQ(0, ret);
2172 }
2173 
2174 FIXTURE_TEARDOWN(TRACE_syscall)
2175 {
2176 	teardown_trace_fixture(_metadata, self->tracer);
2177 }
2178 
2179 TEST(negative_ENOSYS)
2180 {
2181 	/*
2182 	 * There should be no difference between an "internal" skip
2183 	 * and userspace asking for syscall "-1".
2184 	 */
2185 	errno = 0;
2186 	EXPECT_EQ(-1, syscall(-1));
2187 	EXPECT_EQ(errno, ENOSYS);
2188 	/* And no difference for "still not valid but not -1". */
2189 	errno = 0;
2190 	EXPECT_EQ(-1, syscall(-101));
2191 	EXPECT_EQ(errno, ENOSYS);
2192 }
2193 
2194 TEST_F(TRACE_syscall, negative_ENOSYS)
2195 {
2196 	negative_ENOSYS(_metadata);
2197 }
2198 
2199 TEST_F(TRACE_syscall, syscall_allowed)
2200 {
2201 	/* getppid works as expected (no changes). */
2202 	EXPECT_EQ(self->parent, syscall(__NR_getppid));
2203 	EXPECT_NE(self->mypid, syscall(__NR_getppid));
2204 }
2205 
2206 TEST_F(TRACE_syscall, syscall_redirected)
2207 {
2208 	/* getpid has been redirected to getppid as expected. */
2209 	EXPECT_EQ(self->parent, syscall(__NR_getpid));
2210 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2211 }
2212 
2213 TEST_F(TRACE_syscall, syscall_errno)
2214 {
2215 	/* Tracer should skip the open syscall, resulting in ESRCH. */
2216 	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
2217 }
2218 
2219 TEST_F(TRACE_syscall, syscall_faked)
2220 {
	/* Tracer skips the gettid syscall and stores an altered return value. */
2222 	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
2223 }
2224 
2225 TEST_F_SIGNAL(TRACE_syscall, kill_immediate, SIGSYS)
2226 {
2227 	struct sock_filter filter[] = {
2228 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2229 			offsetof(struct seccomp_data, nr)),
2230 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mknodat, 0, 1),
2231 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
2232 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2233 	};
2234 	struct sock_fprog prog = {
2235 		.len = (unsigned short)ARRAY_SIZE(filter),
2236 		.filter = filter,
2237 	};
2238 	long ret;
2239 
2240 	/* Install "kill on mknodat" filter. */
2241 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2242 	ASSERT_EQ(0, ret);
2243 
2244 	/* This should immediately die with SIGSYS, regardless of tracer. */
2245 	EXPECT_EQ(-1, syscall(__NR_mknodat, -1, NULL, 0, 0));
2246 }
2247 
2248 TEST_F(TRACE_syscall, skip_after)
2249 {
2250 	struct sock_filter filter[] = {
2251 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2252 			offsetof(struct seccomp_data, nr)),
2253 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2254 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
2255 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2256 	};
2257 	struct sock_fprog prog = {
2258 		.len = (unsigned short)ARRAY_SIZE(filter),
2259 		.filter = filter,
2260 	};
2261 	long ret;
2262 
2263 	/* Install additional "errno on getppid" filter. */
2264 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2265 	ASSERT_EQ(0, ret);
2266 
2267 	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
2268 	errno = 0;
2269 	EXPECT_EQ(-1, syscall(__NR_getpid));
2270 	EXPECT_EQ(EPERM, errno);
2271 }
2272 
2273 TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
2274 {
2275 	struct sock_filter filter[] = {
2276 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2277 			offsetof(struct seccomp_data, nr)),
2278 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2279 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2280 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2281 	};
2282 	struct sock_fprog prog = {
2283 		.len = (unsigned short)ARRAY_SIZE(filter),
2284 		.filter = filter,
2285 	};
2286 	long ret;
2287 
2288 	/* Install additional "death on getppid" filter. */
2289 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2290 	ASSERT_EQ(0, ret);
2291 
2292 	/* Tracer will redirect getpid to getppid, and we should die. */
2293 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2294 }
2295 
2296 TEST(seccomp_syscall)
2297 {
2298 	struct sock_filter filter[] = {
2299 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2300 	};
2301 	struct sock_fprog prog = {
2302 		.len = (unsigned short)ARRAY_SIZE(filter),
2303 		.filter = filter,
2304 	};
2305 	long ret;
2306 
2307 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2308 	ASSERT_EQ(0, ret) {
2309 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2310 	}
2311 
2312 	/* Reject insane operation. */
2313 	ret = seccomp(-1, 0, &prog);
2314 	ASSERT_NE(ENOSYS, errno) {
2315 		TH_LOG("Kernel does not support seccomp syscall!");
2316 	}
2317 	EXPECT_EQ(EINVAL, errno) {
2318 		TH_LOG("Did not reject crazy op value!");
2319 	}
2320 
2321 	/* Reject strict with flags or pointer. */
2322 	ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
2323 	EXPECT_EQ(EINVAL, errno) {
2324 		TH_LOG("Did not reject mode strict with flags!");
2325 	}
2326 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
2327 	EXPECT_EQ(EINVAL, errno) {
2328 		TH_LOG("Did not reject mode strict with uargs!");
2329 	}
2330 
2331 	/* Reject insane args for filter. */
2332 	ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
2333 	EXPECT_EQ(EINVAL, errno) {
2334 		TH_LOG("Did not reject crazy filter flags!");
2335 	}
2336 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
2337 	EXPECT_EQ(EFAULT, errno) {
2338 		TH_LOG("Did not reject NULL filter!");
2339 	}
2340 
2341 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2342 	EXPECT_EQ(0, errno) {
2343 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
2344 			strerror(errno));
2345 	}
2346 }
2347 
2348 TEST(seccomp_syscall_mode_lock)
2349 {
2350 	struct sock_filter filter[] = {
2351 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2352 	};
2353 	struct sock_fprog prog = {
2354 		.len = (unsigned short)ARRAY_SIZE(filter),
2355 		.filter = filter,
2356 	};
2357 	long ret;
2358 
2359 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2360 	ASSERT_EQ(0, ret) {
2361 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2362 	}
2363 
2364 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2365 	ASSERT_NE(ENOSYS, errno) {
2366 		TH_LOG("Kernel does not support seccomp syscall!");
2367 	}
2368 	EXPECT_EQ(0, ret) {
2369 		TH_LOG("Could not install filter!");
2370 	}
2371 
2372 	/* Make sure neither entry point will switch to strict. */
2373 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2374 	EXPECT_EQ(EINVAL, errno) {
2375 		TH_LOG("Switched to mode strict!");
2376 	}
2377 
2378 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2379 	EXPECT_EQ(EINVAL, errno) {
2380 		TH_LOG("Switched to mode strict!");
2381 	}
2382 }
2383 
2384 /*
2385  * Test detection of known and unknown filter flags. Userspace needs to be able
2386  * to check if a filter flag is supported by the current kernel and a good way
2387  * of doing that is by attempting to enter filter mode, with the flag bit in
2388  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2389  * that the flag is valid and EINVAL indicates that the flag is invalid.
2390  */
2391 TEST(detect_seccomp_filter_flags)
2392 {
2393 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2394 				 SECCOMP_FILTER_FLAG_LOG,
2395 				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2396 				 SECCOMP_FILTER_FLAG_NEW_LISTENER,
2397 				 SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2398 	unsigned int exclusive[] = {
2399 				SECCOMP_FILTER_FLAG_TSYNC,
2400 				SECCOMP_FILTER_FLAG_NEW_LISTENER };
2401 	unsigned int flag, all_flags, exclusive_mask;
2402 	int i;
2403 	long ret;
2404 
2405 	/* Test detection of individual known-good filter flags */
2406 	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2407 		int bits = 0;
2408 
2409 		flag = flags[i];
2410 		/* Make sure the flag is a single bit! */
2411 		while (flag) {
2412 			if (flag & 0x1)
				bits++;
2414 			flag >>= 1;
2415 		}
2416 		ASSERT_EQ(1, bits);
2417 		flag = flags[i];
2418 
2419 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2420 		ASSERT_NE(ENOSYS, errno) {
2421 			TH_LOG("Kernel does not support seccomp syscall!");
2422 		}
2423 		EXPECT_EQ(-1, ret);
2424 		EXPECT_EQ(EFAULT, errno) {
2425 			TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2426 			       flag);
2427 		}
2428 
2429 		all_flags |= flag;
2430 	}
2431 
2432 	/*
2433 	 * Test detection of all known-good filter flags combined. But
2434 	 * for the exclusive flags we need to mask them out and try them
2435 	 * individually for the "all flags" testing.
2436 	 */
2437 	exclusive_mask = 0;
2438 	for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2439 		exclusive_mask |= exclusive[i];
2440 	for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2441 		flag = all_flags & ~exclusive_mask;
2442 		flag |= exclusive[i];
2443 
2444 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2445 		EXPECT_EQ(-1, ret);
2446 		EXPECT_EQ(EFAULT, errno) {
2447 			TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2448 			       flag);
2449 		}
2450 	}
2451 
	/* Test detection of an unknown filter flag, without exclusives. */
2453 	flag = -1;
2454 	flag &= ~exclusive_mask;
2455 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2456 	EXPECT_EQ(-1, ret);
2457 	EXPECT_EQ(EINVAL, errno) {
2458 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2459 		       flag);
2460 	}
2461 
2462 	/*
2463 	 * Test detection of an unknown filter flag that may simply need to be
2464 	 * added to this test
2465 	 */
2466 	flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2467 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2468 	EXPECT_EQ(-1, ret);
2469 	EXPECT_EQ(EINVAL, errno) {
2470 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2471 		       flag);
2472 	}
2473 }
2474 
2475 TEST(TSYNC_first)
2476 {
2477 	struct sock_filter filter[] = {
2478 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2479 	};
2480 	struct sock_fprog prog = {
2481 		.len = (unsigned short)ARRAY_SIZE(filter),
2482 		.filter = filter,
2483 	};
2484 	long ret;
2485 
2486 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2487 	ASSERT_EQ(0, ret) {
2488 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2489 	}
2490 
2491 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2492 		      &prog);
2493 	ASSERT_NE(ENOSYS, errno) {
2494 		TH_LOG("Kernel does not support seccomp syscall!");
2495 	}
2496 	EXPECT_EQ(0, ret) {
2497 		TH_LOG("Could not install initial filter with TSYNC!");
2498 	}
2499 }
2500 
2501 #define TSYNC_SIBLINGS 2
2502 struct tsync_sibling {
2503 	pthread_t tid;
2504 	pid_t system_tid;
2505 	sem_t *started;
2506 	pthread_cond_t *cond;
2507 	pthread_mutex_t *mutex;
2508 	int diverge;
2509 	int num_waits;
2510 	struct sock_fprog *prog;
2511 	struct __test_metadata *metadata;
2512 };
2513 
2514 /*
2515  * To avoid joining joined threads (which is not allowed by Bionic),
2516  * make sure we both successfully join and clear the tid to skip a
2517  * later join attempt during fixture teardown. Any remaining threads
2518  * will be directly killed during teardown.
2519  */
2520 #define PTHREAD_JOIN(tid, status)					\
2521 	do {								\
2522 		int _rc = pthread_join(tid, status);			\
2523 		if (_rc) {						\
2524 			TH_LOG("pthread_join of tid %u failed: %d\n",	\
2525 				(unsigned int)tid, _rc);		\
2526 		} else {						\
2527 			tid = 0;					\
2528 		}							\
2529 	} while (0)
2530 
2531 FIXTURE(TSYNC) {
2532 	struct sock_fprog root_prog, apply_prog;
2533 	struct tsync_sibling sibling[TSYNC_SIBLINGS];
2534 	sem_t started;
2535 	pthread_cond_t cond;
2536 	pthread_mutex_t mutex;
2537 	int sibling_count;
2538 };
2539 
2540 FIXTURE_SETUP(TSYNC)
2541 {
2542 	struct sock_filter root_filter[] = {
2543 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2544 	};
2545 	struct sock_filter apply_filter[] = {
2546 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2547 			offsetof(struct seccomp_data, nr)),
2548 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2549 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2550 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2551 	};
2552 
2553 	memset(&self->root_prog, 0, sizeof(self->root_prog));
2554 	memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2555 	memset(&self->sibling, 0, sizeof(self->sibling));
2556 	self->root_prog.filter = malloc(sizeof(root_filter));
2557 	ASSERT_NE(NULL, self->root_prog.filter);
2558 	memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2559 	self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2560 
2561 	self->apply_prog.filter = malloc(sizeof(apply_filter));
2562 	ASSERT_NE(NULL, self->apply_prog.filter);
2563 	memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2564 	self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2565 
2566 	self->sibling_count = 0;
2567 	pthread_mutex_init(&self->mutex, NULL);
2568 	pthread_cond_init(&self->cond, NULL);
2569 	sem_init(&self->started, 0, 0);
2570 	self->sibling[0].tid = 0;
2571 	self->sibling[0].cond = &self->cond;
2572 	self->sibling[0].started = &self->started;
2573 	self->sibling[0].mutex = &self->mutex;
2574 	self->sibling[0].diverge = 0;
2575 	self->sibling[0].num_waits = 1;
2576 	self->sibling[0].prog = &self->root_prog;
2577 	self->sibling[0].metadata = _metadata;
2578 	self->sibling[1].tid = 0;
2579 	self->sibling[1].cond = &self->cond;
2580 	self->sibling[1].started = &self->started;
2581 	self->sibling[1].mutex = &self->mutex;
2582 	self->sibling[1].diverge = 0;
2583 	self->sibling[1].prog = &self->root_prog;
2584 	self->sibling[1].num_waits = 1;
2585 	self->sibling[1].metadata = _metadata;
2586 }
2587 
2588 FIXTURE_TEARDOWN(TSYNC)
2589 {
2590 	int sib = 0;
2591 
2592 	if (self->root_prog.filter)
2593 		free(self->root_prog.filter);
2594 	if (self->apply_prog.filter)
2595 		free(self->apply_prog.filter);
2596 
2597 	for ( ; sib < self->sibling_count; ++sib) {
2598 		struct tsync_sibling *s = &self->sibling[sib];
2599 
2600 		if (!s->tid)
2601 			continue;
2602 		/*
2603 		 * If a thread is still running, it may be stuck, so hit
2604 		 * it over the head really hard.
2605 		 */
		pthread_kill(s->tid, SIGKILL);
2607 	}
2608 	pthread_mutex_destroy(&self->mutex);
2609 	pthread_cond_destroy(&self->cond);
2610 	sem_destroy(&self->started);
2611 }
2612 
2613 void *tsync_sibling(void *data)
2614 {
2615 	long ret = 0;
2616 	struct tsync_sibling *me = data;
2617 
2618 	me->system_tid = syscall(__NR_gettid);
2619 
2620 	pthread_mutex_lock(me->mutex);
2621 	if (me->diverge) {
2622 		/* Just re-apply the root prog to fork the tree */
2623 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2624 				me->prog, 0, 0);
2625 	}
2626 	sem_post(me->started);
2627 	/* Return outside of started so parent notices failures. */
2628 	if (ret) {
2629 		pthread_mutex_unlock(me->mutex);
2630 		return (void *)SIBLING_EXIT_FAILURE;
2631 	}
2632 	do {
2633 		pthread_cond_wait(me->cond, me->mutex);
2634 		me->num_waits = me->num_waits - 1;
2635 	} while (me->num_waits);
2636 	pthread_mutex_unlock(me->mutex);
2637 
2638 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2639 	if (!ret)
2640 		return (void *)SIBLING_EXIT_NEWPRIVS;
2641 	read(-1, NULL, 0);
2642 	return (void *)SIBLING_EXIT_UNKILLED;
2643 }
2644 
2645 void tsync_start_sibling(struct tsync_sibling *sibling)
2646 {
2647 	pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2648 }
2649 
2650 TEST_F(TSYNC, siblings_fail_prctl)
2651 {
2652 	long ret;
2653 	void *status;
2654 	struct sock_filter filter[] = {
2655 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2656 			offsetof(struct seccomp_data, nr)),
2657 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
2658 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
2659 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2660 	};
2661 	struct sock_fprog prog = {
2662 		.len = (unsigned short)ARRAY_SIZE(filter),
2663 		.filter = filter,
2664 	};
2665 
2666 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2667 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2668 	}
2669 
2670 	/* Check prctl failure detection by requesting sib 0 diverge. */
2671 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2672 	ASSERT_NE(ENOSYS, errno) {
2673 		TH_LOG("Kernel does not support seccomp syscall!");
2674 	}
2675 	ASSERT_EQ(0, ret) {
2676 		TH_LOG("setting filter failed");
2677 	}
2678 
2679 	self->sibling[0].diverge = 1;
2680 	tsync_start_sibling(&self->sibling[0]);
2681 	tsync_start_sibling(&self->sibling[1]);
2682 
2683 	while (self->sibling_count < TSYNC_SIBLINGS) {
2684 		sem_wait(&self->started);
2685 		self->sibling_count++;
2686 	}
2687 
	/* Signal the threads to clean up. */
2689 	pthread_mutex_lock(&self->mutex);
2690 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2691 		TH_LOG("cond broadcast non-zero");
2692 	}
2693 	pthread_mutex_unlock(&self->mutex);
2694 
2695 	/* Ensure diverging sibling failed to call prctl. */
2696 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2697 	EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
2698 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2699 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2700 }
2701 
2702 TEST_F(TSYNC, two_siblings_with_ancestor)
2703 {
2704 	long ret;
2705 	void *status;
2706 
2707 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2708 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2709 	}
2710 
2711 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2712 	ASSERT_NE(ENOSYS, errno) {
2713 		TH_LOG("Kernel does not support seccomp syscall!");
2714 	}
2715 	ASSERT_EQ(0, ret) {
2716 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2717 	}
2718 	tsync_start_sibling(&self->sibling[0]);
2719 	tsync_start_sibling(&self->sibling[1]);
2720 
2721 	while (self->sibling_count < TSYNC_SIBLINGS) {
2722 		sem_wait(&self->started);
2723 		self->sibling_count++;
2724 	}
2725 
2726 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2727 		      &self->apply_prog);
2728 	ASSERT_EQ(0, ret) {
2729 		TH_LOG("Could install filter on all threads!");
2730 	}
2731 	/* Tell the siblings to test the policy */
2732 	pthread_mutex_lock(&self->mutex);
2733 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2734 		TH_LOG("cond broadcast non-zero");
2735 	}
2736 	pthread_mutex_unlock(&self->mutex);
2737 	/* Ensure they are both killed and don't exit cleanly. */
2738 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2739 	EXPECT_EQ(0x0, (long)status);
2740 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2741 	EXPECT_EQ(0x0, (long)status);
2742 }
2743 
2744 TEST_F(TSYNC, two_sibling_want_nnp)
2745 {
2746 	void *status;
2747 
2748 	/* start siblings before any prctl() operations */
2749 	tsync_start_sibling(&self->sibling[0]);
2750 	tsync_start_sibling(&self->sibling[1]);
2751 	while (self->sibling_count < TSYNC_SIBLINGS) {
2752 		sem_wait(&self->started);
2753 		self->sibling_count++;
2754 	}
2755 
2756 	/* Tell the siblings to test no policy */
2757 	pthread_mutex_lock(&self->mutex);
2758 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2759 		TH_LOG("cond broadcast non-zero");
2760 	}
2761 	pthread_mutex_unlock(&self->mutex);
2762 
2763 	/* Ensure they are both upset about lacking nnp. */
2764 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2765 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2766 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2767 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2768 }
2769 
2770 TEST_F(TSYNC, two_siblings_with_no_filter)
2771 {
2772 	long ret;
2773 	void *status;
2774 
2775 	/* start siblings before any prctl() operations */
2776 	tsync_start_sibling(&self->sibling[0]);
2777 	tsync_start_sibling(&self->sibling[1]);
2778 	while (self->sibling_count < TSYNC_SIBLINGS) {
2779 		sem_wait(&self->started);
2780 		self->sibling_count++;
2781 	}
2782 
2783 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2784 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2785 	}
2786 
2787 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2788 		      &self->apply_prog);
2789 	ASSERT_NE(ENOSYS, errno) {
2790 		TH_LOG("Kernel does not support seccomp syscall!");
2791 	}
2792 	ASSERT_EQ(0, ret) {
2793 		TH_LOG("Could install filter on all threads!");
2794 	}
2795 
2796 	/* Tell the siblings to test the policy */
2797 	pthread_mutex_lock(&self->mutex);
2798 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2799 		TH_LOG("cond broadcast non-zero");
2800 	}
2801 	pthread_mutex_unlock(&self->mutex);
2802 
2803 	/* Ensure they are both killed and don't exit cleanly. */
2804 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2805 	EXPECT_EQ(0x0, (long)status);
2806 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2807 	EXPECT_EQ(0x0, (long)status);
2808 }
2809 
2810 TEST_F(TSYNC, two_siblings_with_one_divergence)
2811 {
2812 	long ret;
2813 	void *status;
2814 
2815 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2816 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2817 	}
2818 
2819 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2820 	ASSERT_NE(ENOSYS, errno) {
2821 		TH_LOG("Kernel does not support seccomp syscall!");
2822 	}
2823 	ASSERT_EQ(0, ret) {
2824 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2825 	}
2826 	self->sibling[0].diverge = 1;
2827 	tsync_start_sibling(&self->sibling[0]);
2828 	tsync_start_sibling(&self->sibling[1]);
2829 
2830 	while (self->sibling_count < TSYNC_SIBLINGS) {
2831 		sem_wait(&self->started);
2832 		self->sibling_count++;
2833 	}
2834 
2835 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2836 		      &self->apply_prog);
2837 	ASSERT_EQ(self->sibling[0].system_tid, ret) {
2838 		TH_LOG("Did not fail on diverged sibling.");
2839 	}
2840 
2841 	/* Wake the threads */
2842 	pthread_mutex_lock(&self->mutex);
2843 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2844 		TH_LOG("cond broadcast non-zero");
2845 	}
2846 	pthread_mutex_unlock(&self->mutex);
2847 
2848 	/* Ensure they are both unkilled. */
2849 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2850 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2851 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2852 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2853 }
2854 
2855 TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2856 {
2857 	long ret, flags;
2858 	void *status;
2859 
2860 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2861 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2862 	}
2863 
2864 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2865 	ASSERT_NE(ENOSYS, errno) {
2866 		TH_LOG("Kernel does not support seccomp syscall!");
2867 	}
2868 	ASSERT_EQ(0, ret) {
2869 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2870 	}
2871 	self->sibling[0].diverge = 1;
2872 	tsync_start_sibling(&self->sibling[0]);
2873 	tsync_start_sibling(&self->sibling[1]);
2874 
2875 	while (self->sibling_count < TSYNC_SIBLINGS) {
2876 		sem_wait(&self->started);
2877 		self->sibling_count++;
2878 	}
2879 
	flags = SECCOMP_FILTER_FLAG_TSYNC |
2881 		SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2882 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2883 	ASSERT_EQ(ESRCH, errno) {
2884 		TH_LOG("Did not return ESRCH for diverged sibling.");
2885 	}
2886 	ASSERT_EQ(-1, ret) {
2887 		TH_LOG("Did not fail on diverged sibling.");
2888 	}
2889 
2890 	/* Wake the threads */
2891 	pthread_mutex_lock(&self->mutex);
2892 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2893 		TH_LOG("cond broadcast non-zero");
2894 	}
2895 	pthread_mutex_unlock(&self->mutex);
2896 
2897 	/* Ensure they are both unkilled. */
2898 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2899 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2900 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2901 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2902 }
2903 
2904 TEST_F(TSYNC, two_siblings_not_under_filter)
2905 {
2906 	long ret, sib;
2907 	void *status;
2908 	struct timespec delay = { .tv_nsec = 100000000 };
2909 
2910 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2911 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2912 	}
2913 
2914 	/*
2915 	 * Sibling 0 will have its own seccomp policy
2916 	 * and Sibling 1 will not be under seccomp at
2917 	 * all. Sibling 1 will enter seccomp and 0
2918 	 * will cause failure.
2919 	 */
2920 	self->sibling[0].diverge = 1;
2921 	tsync_start_sibling(&self->sibling[0]);
2922 	tsync_start_sibling(&self->sibling[1]);
2923 
2924 	while (self->sibling_count < TSYNC_SIBLINGS) {
2925 		sem_wait(&self->started);
2926 		self->sibling_count++;
2927 	}
2928 
2929 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2930 	ASSERT_NE(ENOSYS, errno) {
2931 		TH_LOG("Kernel does not support seccomp syscall!");
2932 	}
2933 	ASSERT_EQ(0, ret) {
2934 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2935 	}
2936 
2937 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2938 		      &self->apply_prog);
2939 	ASSERT_EQ(ret, self->sibling[0].system_tid) {
2940 		TH_LOG("Did not fail on diverged sibling.");
2941 	}
2942 	sib = 1;
2943 	if (ret == self->sibling[0].system_tid)
2944 		sib = 0;
2945 
2946 	pthread_mutex_lock(&self->mutex);
2947 
	/*
	 * Increment the other sibling's num_waits so we can clean up
	 * the one we just saw.
	 */
2951 	self->sibling[!sib].num_waits += 1;
2952 
	/* Signal the thread to clean up. */
2954 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2955 		TH_LOG("cond broadcast non-zero");
2956 	}
2957 	pthread_mutex_unlock(&self->mutex);
2958 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
2959 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2960 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2961 	while (!kill(self->sibling[sib].system_tid, 0))
2962 		nanosleep(&delay, NULL);
2963 	/* Switch to the remaining sibling */
2964 	sib = !sib;
2965 
2966 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2967 		      &self->apply_prog);
2968 	ASSERT_EQ(0, ret) {
2969 		TH_LOG("Expected the remaining sibling to sync");
2970 	};
2971 
2972 	pthread_mutex_lock(&self->mutex);
2973 
	/*
	 * If the remaining sibling didn't have a chance to wake up during
	 * the first broadcast, manually reduce its num_waits now.
	 */
2977 	if (self->sibling[sib].num_waits > 1)
2978 		self->sibling[sib].num_waits = 1;
2979 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2980 		TH_LOG("cond broadcast non-zero");
2981 	}
2982 	pthread_mutex_unlock(&self->mutex);
2983 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
2984 	EXPECT_EQ(0, (long)status);
2985 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2986 	while (!kill(self->sibling[sib].system_tid, 0))
2987 		nanosleep(&delay, NULL);
2988 
2989 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2990 		      &self->apply_prog);
2991 	ASSERT_EQ(0, ret);  /* just us chickens */
2992 }
2993 
2994 /* Make sure restarted syscalls are seen directly as "restart_syscall". */
2995 TEST(syscall_restart)
2996 {
2997 	long ret;
2998 	unsigned long msg;
2999 	pid_t child_pid;
3000 	int pipefd[2];
3001 	int status;
3002 	siginfo_t info = { };
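	/*
	 * Trace nanosleep/clock_nanosleep (cookie 0x100) and restart_syscall
	 * (cookie 0x200), allow a few housekeeping syscalls, and kill
	 * everything else.
	 */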
3003 	struct sock_filter filter[] = {
3004 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3005 			 offsetof(struct seccomp_data, nr)),
3006 
3007 #ifdef __NR_sigreturn
3008 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
3009 #endif
3010 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
3011 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
3012 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
3013 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
3014 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
3015 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
3016 
3017 		/* Allow __NR_write for easy logging. */
3018 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
3019 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3020 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3021 		/* The nanosleep jump target. */
3022 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
3023 		/* The restart_syscall jump target. */
3024 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
3025 	};
3026 	struct sock_fprog prog = {
3027 		.len = (unsigned short)ARRAY_SIZE(filter),
3028 		.filter = filter,
3029 	};
3030 #if defined(__arm__)
3031 	struct utsname utsbuf;
3032 #endif
3033 
3034 	ASSERT_EQ(0, pipe(pipefd));
3035 
3036 	child_pid = fork();
3037 	ASSERT_LE(0, child_pid);
3038 	if (child_pid == 0) {
3039 		/* Child uses EXPECT not ASSERT to deliver status correctly. */
3040 		char buf = ' ';
3041 		struct timespec timeout = { };
3042 
3043 		/* Attach parent as tracer and stop. */
3044 		EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
3045 		EXPECT_EQ(0, raise(SIGSTOP));
3046 
3047 		EXPECT_EQ(0, close(pipefd[1]));
3048 
3049 		EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
3050 			TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3051 		}
3052 
3053 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
3054 		EXPECT_EQ(0, ret) {
3055 			TH_LOG("Failed to install filter!");
3056 		}
3057 
3058 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3059 			TH_LOG("Failed to read() sync from parent");
3060 		}
3061 		EXPECT_EQ('.', buf) {
3062 			TH_LOG("Failed to get sync data from read()");
3063 		}
3064 
3065 		/* Start nanosleep to be interrupted. */
3066 		timeout.tv_sec = 1;
3067 		errno = 0;
3068 		EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
3069 			TH_LOG("Call to nanosleep() failed (errno %d)", errno);
3070 		}
3071 
3072 		/* Read final sync from parent. */
3073 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3074 			TH_LOG("Failed final read() from parent");
3075 		}
3076 		EXPECT_EQ('!', buf) {
3077 			TH_LOG("Failed to get final data from read()");
3078 		}
3079 
3080 		/* Directly report the status of our test harness results. */
3081 		syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS
3082 						     : EXIT_FAILURE);
3083 	}
3084 	EXPECT_EQ(0, close(pipefd[0]));
3085 
3086 	/* Attach to child, setup options, and release. */
3087 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3088 	ASSERT_EQ(true, WIFSTOPPED(status));
3089 	ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
3090 			    PTRACE_O_TRACESECCOMP));
3091 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3092 	ASSERT_EQ(1, write(pipefd[1], ".", 1));
3093 
3094 	/* Wait for nanosleep() to start. */
3095 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3096 	ASSERT_EQ(true, WIFSTOPPED(status));
3097 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3098 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3099 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3100 	ASSERT_EQ(0x100, msg);
3101 	ret = get_syscall(_metadata, child_pid);
3102 	EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
3103 
3104 	/* Might as well check siginfo for sanity while we're here. */
3105 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3106 	ASSERT_EQ(SIGTRAP, info.si_signo);
3107 	ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
3108 	EXPECT_EQ(0, info.si_errno);
3109 	EXPECT_EQ(getuid(), info.si_uid);
3110 	/* Verify signal delivery came from child (seccomp-triggered). */
3111 	EXPECT_EQ(child_pid, info.si_pid);
3112 
3113 	/* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
3114 	ASSERT_EQ(0, kill(child_pid, SIGSTOP));
3115 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3116 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3117 	ASSERT_EQ(true, WIFSTOPPED(status));
3118 	ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
3119 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3120 	/*
3121 	 * There is no siginfo on SIGSTOP any more, so we can't verify
3122 	 * signal delivery came from parent now (getpid() == info.si_pid).
3123 	 * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
3124 	 * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
3125 	 */
3126 	EXPECT_EQ(SIGSTOP, info.si_signo);
3127 
3128 	/* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
3129 	ASSERT_EQ(0, kill(child_pid, SIGCONT));
3130 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3131 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3132 	ASSERT_EQ(true, WIFSTOPPED(status));
3133 	ASSERT_EQ(SIGCONT, WSTOPSIG(status));
3134 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3135 
3136 	/* Wait for restart_syscall() to start. */
3137 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3138 	ASSERT_EQ(true, WIFSTOPPED(status));
3139 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3140 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3141 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3142 
3143 	ASSERT_EQ(0x200, msg);
3144 	ret = get_syscall(_metadata, child_pid);
3145 #if defined(__arm__)
3146 	/*
3147 	 * FIXME:
3148 	 * - native ARM registers do NOT expose true syscall.
3149 	 * - compat ARM registers on ARM64 DO expose true syscall.
3150 	 */
3151 	ASSERT_EQ(0, uname(&utsbuf));
3152 	if (strncmp(utsbuf.machine, "arm", 3) == 0) {
3153 		EXPECT_EQ(__NR_nanosleep, ret);
3154 	} else
3155 #endif
3156 	{
3157 		EXPECT_EQ(__NR_restart_syscall, ret);
3158 	}
3159 
3160 	/* Write again to end test. */
3161 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3162 	ASSERT_EQ(1, write(pipefd[1], "!", 1));
3163 	EXPECT_EQ(0, close(pipefd[1]));
3164 
3165 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3166 	if (WIFSIGNALED(status) || WEXITSTATUS(status))
3167 		_metadata->passed = 0;
3168 }
3169 
3170 TEST_SIGNAL(filter_flag_log, SIGSYS)
3171 {
3172 	struct sock_filter allow_filter[] = {
3173 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3174 	};
3175 	struct sock_filter kill_filter[] = {
3176 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3177 			offsetof(struct seccomp_data, nr)),
3178 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
3179 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3180 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3181 	};
3182 	struct sock_fprog allow_prog = {
3183 		.len = (unsigned short)ARRAY_SIZE(allow_filter),
3184 		.filter = allow_filter,
3185 	};
3186 	struct sock_fprog kill_prog = {
3187 		.len = (unsigned short)ARRAY_SIZE(kill_filter),
3188 		.filter = kill_filter,
3189 	};
3190 	long ret;
3191 	pid_t parent = getppid();
3192 
3193 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3194 	ASSERT_EQ(0, ret);
3195 
3196 	/* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
3197 	ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
3198 		      &allow_prog);
3199 	ASSERT_NE(ENOSYS, errno) {
3200 		TH_LOG("Kernel does not support seccomp syscall!");
3201 	}
3202 	EXPECT_NE(0, ret) {
3203 		TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
3204 	}
3205 	EXPECT_EQ(EINVAL, errno) {
3206 		TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
3207 	}
3208 
3209 	/* Verify that a simple, permissive filter can be added with no flags */
3210 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
3211 	EXPECT_EQ(0, ret);
3212 
3213 	/* See if the same filter can be added with the FILTER_FLAG_LOG flag */
3214 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3215 		      &allow_prog);
3216 	ASSERT_NE(EINVAL, errno) {
3217 		TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
3218 	}
3219 	EXPECT_EQ(0, ret);
3220 
3221 	/* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
3222 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3223 		      &kill_prog);
3224 	EXPECT_EQ(0, ret);
3225 
3226 	EXPECT_EQ(parent, syscall(__NR_getppid));
3227 	/* getpid() should never return. */
3228 	EXPECT_EQ(0, syscall(__NR_getpid));
3229 }
3230 
3231 TEST(get_action_avail)
3232 {
3233 	__u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3234 			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3235 			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3236 	__u32 unknown_action = 0x10000000U;
3237 	int i;
3238 	long ret;
3239 
3240 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3241 	ASSERT_NE(ENOSYS, errno) {
3242 		TH_LOG("Kernel does not support seccomp syscall!");
3243 	}
3244 	ASSERT_NE(EINVAL, errno) {
3245 		TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3246 	}
3247 	EXPECT_EQ(ret, 0);
3248 
3249 	for (i = 0; i < ARRAY_SIZE(actions); i++) {
3250 		ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3251 		EXPECT_EQ(ret, 0) {
3252 			TH_LOG("Expected action (0x%X) not available!",
3253 			       actions[i]);
3254 		}
3255 	}
3256 
3257 	/* Check that an unknown action is handled properly (EOPNOTSUPP) */
3258 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3259 	EXPECT_EQ(ret, -1);
3260 	EXPECT_EQ(errno, EOPNOTSUPP);
3261 }
3262 
3263 TEST(get_metadata)
3264 {
3265 	pid_t pid;
3266 	int pipefd[2];
3267 	char buf;
3268 	struct seccomp_metadata md;
3269 	long ret;
3270 
3271 	/* Only real root can get metadata. */
3272 	if (geteuid()) {
3273 		SKIP(return, "get_metadata requires real root");
3274 		return;
3275 	}
3276 
3277 	ASSERT_EQ(0, pipe(pipefd));
3278 
3279 	pid = fork();
3280 	ASSERT_GE(pid, 0);
3281 	if (pid == 0) {
3282 		struct sock_filter filter[] = {
3283 			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3284 		};
3285 		struct sock_fprog prog = {
3286 			.len = (unsigned short)ARRAY_SIZE(filter),
3287 			.filter = filter,
3288 		};
3289 
3290 		/* one with log, one without */
3291 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
3292 				     SECCOMP_FILTER_FLAG_LOG, &prog));
3293 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
3294 
3295 		EXPECT_EQ(0, close(pipefd[0]));
3296 		ASSERT_EQ(1, write(pipefd[1], "1", 1));
3297 		ASSERT_EQ(0, close(pipefd[1]));
3298 
3299 		while (1)
3300 			sleep(100);
3301 	}
3302 
3303 	ASSERT_EQ(0, close(pipefd[1]));
3304 	ASSERT_EQ(1, read(pipefd[0], &buf, 1));
3305 
3306 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
3307 	ASSERT_EQ(pid, waitpid(pid, NULL, 0));
3308 
3309 	/* Past here must not use ASSERT or child process is never killed. */
3310 
3311 	md.filter_off = 0;
3312 	errno = 0;
3313 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3314 	EXPECT_EQ(sizeof(md), ret) {
3315 		if (errno == EINVAL)
3316 			SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
3317 	}
3318 
3319 	EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
3320 	EXPECT_EQ(md.filter_off, 0);
3321 
3322 	md.filter_off = 1;
3323 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3324 	EXPECT_EQ(sizeof(md), ret);
3325 	EXPECT_EQ(md.flags, 0);
3326 	EXPECT_EQ(md.filter_off, 1);
3327 
3328 skip:
3329 	ASSERT_EQ(0, kill(pid, SIGKILL));
3330 }
3331 
3332 static int user_notif_syscall(int nr, unsigned int flags)
3333 {
3334 	struct sock_filter filter[] = {
3335 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3336 			offsetof(struct seccomp_data, nr)),
3337 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
3338 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
3339 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3340 	};
3341 
3342 	struct sock_fprog prog = {
3343 		.len = (unsigned short)ARRAY_SIZE(filter),
3344 		.filter = filter,
3345 	};
3346 
3347 	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
3348 }
3349 
3350 #define USER_NOTIF_MAGIC INT_MAX
3351 TEST(user_notification_basic)
3352 {
3353 	pid_t pid;
3354 	long ret;
3355 	int status, listener;
3356 	struct seccomp_notif req = {};
3357 	struct seccomp_notif_resp resp = {};
3358 	struct pollfd pollfd;
3359 
3360 	struct sock_filter filter[] = {
3361 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3362 	};
3363 	struct sock_fprog prog = {
3364 		.len = (unsigned short)ARRAY_SIZE(filter),
3365 		.filter = filter,
3366 	};
3367 
3368 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3369 	ASSERT_EQ(0, ret) {
3370 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3371 	}
3372 
3373 	pid = fork();
3374 	ASSERT_GE(pid, 0);
3375 
3376 	/* Check that we get -ENOSYS with no listener attached */
3377 	if (pid == 0) {
3378 		if (user_notif_syscall(__NR_getppid, 0) < 0)
3379 			exit(1);
3380 		ret = syscall(__NR_getppid);
3381 		exit(ret >= 0 || errno != ENOSYS);
3382 	}
3383 
3384 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3385 	EXPECT_EQ(true, WIFEXITED(status));
3386 	EXPECT_EQ(0, WEXITSTATUS(status));
3387 
3388 	/* Add some no-op filters for grins. */
3389 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3390 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3391 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3392 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3393 
3394 	/* Check that the basic notification machinery works */
3395 	listener = user_notif_syscall(__NR_getppid,
3396 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3397 	ASSERT_GE(listener, 0);
3398 
3399 	/* Installing a second listener in the chain should EBUSY */
3400 	EXPECT_EQ(user_notif_syscall(__NR_getppid,
3401 				     SECCOMP_FILTER_FLAG_NEW_LISTENER),
3402 		  -1);
3403 	EXPECT_EQ(errno, EBUSY);
3404 
3405 	pid = fork();
3406 	ASSERT_GE(pid, 0);
3407 
3408 	if (pid == 0) {
3409 		ret = syscall(__NR_getppid);
3410 		exit(ret != USER_NOTIF_MAGIC);
3411 	}
3412 
3413 	pollfd.fd = listener;
3414 	pollfd.events = POLLIN | POLLOUT;
3415 
3416 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
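	/*
	 * POLLIN: a notification is pending. After it is received below,
	 * the second poll reports POLLOUT: a response may now be sent.
	 */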
3417 	EXPECT_EQ(pollfd.revents, POLLIN);
3418 
3419 	/* Test that we can't pass garbage to the kernel. */
3420 	memset(&req, 0, sizeof(req));
3421 	req.pid = -1;
3422 	errno = 0;
3423 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
3424 	EXPECT_EQ(-1, ret);
3425 	EXPECT_EQ(EINVAL, errno);
3426 
3427 	if (ret) {
3428 		req.pid = 0;
3429 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3430 	}
3431 
3432 	pollfd.fd = listener;
3433 	pollfd.events = POLLIN | POLLOUT;
3434 
3435 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3436 	EXPECT_EQ(pollfd.revents, POLLOUT);
3437 
3438 	EXPECT_EQ(req.data.nr,  __NR_getppid);
3439 
3440 	resp.id = req.id;
3441 	resp.error = 0;
3442 	resp.val = USER_NOTIF_MAGIC;
3443 
	/* Check that the kernel rejects a non-zero flags field. */
3445 	resp.flags = 1;
3446 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3447 	EXPECT_EQ(errno, EINVAL);
3448 
3449 	resp.flags = 0;
3450 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3451 
3452 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3453 	EXPECT_EQ(true, WIFEXITED(status));
3454 	EXPECT_EQ(0, WEXITSTATUS(status));
3455 }
3456 
3457 TEST(user_notification_with_tsync)
3458 {
3459 	int ret;
3460 	unsigned int flags;
3461 
3462 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3463 	ASSERT_EQ(0, ret) {
3464 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3465 	}
3466 
	/* NEW_LISTENER and TSYNC used to be mutually exclusive. */
3468 	flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3469 		SECCOMP_FILTER_FLAG_TSYNC;
3470 	ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
3471 	ASSERT_EQ(EINVAL, errno);
3472 
	/* With TSYNC_ESRCH they can be combined. */
3474 	flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3475 	ret = user_notif_syscall(__NR_getppid, flags);
3476 	close(ret);
3477 	ASSERT_LE(0, ret);
3478 }
3479 
3480 TEST(user_notification_kill_in_middle)
3481 {
3482 	pid_t pid;
3483 	long ret;
3484 	int listener;
3485 	struct seccomp_notif req = {};
3486 	struct seccomp_notif_resp resp = {};
3487 
3488 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3489 	ASSERT_EQ(0, ret) {
3490 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3491 	}
3492 
3493 	listener = user_notif_syscall(__NR_getppid,
3494 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3495 	ASSERT_GE(listener, 0);
3496 
3497 	/*
3498 	 * Check that nothing bad happens when we kill the task in the middle
3499 	 * of a syscall.
3500 	 */
3501 	pid = fork();
3502 	ASSERT_GE(pid, 0);
3503 
3504 	if (pid == 0) {
3505 		ret = syscall(__NR_getppid);
3506 		exit(ret != USER_NOTIF_MAGIC);
3507 	}
3508 
3509 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3510 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
3511 
3512 	EXPECT_EQ(kill(pid, SIGKILL), 0);
3513 	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
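	/*
	 * The task is gone, so its notification id is no longer valid and
	 * a late response must be rejected with ENOENT.
	 */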
3514 
3515 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
3516 
3517 	resp.id = req.id;
3518 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
3519 	EXPECT_EQ(ret, -1);
3520 	EXPECT_EQ(errno, ENOENT);
3521 }
3522 
3523 static int handled = -1;
3524 
3525 static void signal_handler(int signal)
3526 {
3527 	if (write(handled, "c", 1) != 1)
3528 		perror("write from signal");
3529 }
3530 
3531 TEST(user_notification_signal)
3532 {
3533 	pid_t pid;
3534 	long ret;
3535 	int status, listener, sk_pair[2];
3536 	struct seccomp_notif req = {};
3537 	struct seccomp_notif_resp resp = {};
3538 	char c;
3539 
3540 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3541 	ASSERT_EQ(0, ret) {
3542 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3543 	}
3544 
3545 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
3546 
3547 	listener = user_notif_syscall(__NR_gettid,
3548 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3549 	ASSERT_GE(listener, 0);
3550 
3551 	pid = fork();
3552 	ASSERT_GE(pid, 0);
3553 
3554 	if (pid == 0) {
3555 		close(sk_pair[0]);
3556 		handled = sk_pair[1];
3557 		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
3558 			perror("signal");
3559 			exit(1);
3560 		}
3561 		/*
3562 		 * ERESTARTSYS behavior is a bit hard to test, because we need
3563 		 * to rely on a signal that has not yet been handled. Let's at
3564 		 * least check that the error code gets propagated through, and
3565 		 * hope that it doesn't break when there is actually a signal :)
3566 		 */
3567 		ret = syscall(__NR_gettid);
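		/* 512 is ERESTARTSYS, injected by the parent below. */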
3568 		exit(!(ret == -1 && errno == 512));
3569 	}
3570 
3571 	close(sk_pair[1]);
3572 
3573 	memset(&req, 0, sizeof(req));
3574 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3575 
3576 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
3577 
3578 	/*
3579 	 * Make sure the signal really is delivered, which means we're not
3580 	 * stuck in the user notification code any more and the notification
3581 	 * should be dead.
3582 	 */
3583 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
3584 
3585 	resp.id = req.id;
3586 	resp.error = -EPERM;
3587 	resp.val = 0;
3588 
3589 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3590 	EXPECT_EQ(errno, ENOENT);
3591 
3592 	memset(&req, 0, sizeof(req));
3593 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3594 
3595 	resp.id = req.id;
3596 	resp.error = -512; /* -ERESTARTSYS */
3597 	resp.val = 0;
3598 
3599 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3600 
3601 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3602 	EXPECT_EQ(true, WIFEXITED(status));
3603 	EXPECT_EQ(0, WEXITSTATUS(status));
3604 }
3605 
3606 TEST(user_notification_closed_listener)
3607 {
3608 	pid_t pid;
3609 	long ret;
3610 	int status, listener;
3611 
3612 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3613 	ASSERT_EQ(0, ret) {
3614 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3615 	}
3616 
3617 	listener = user_notif_syscall(__NR_getppid,
3618 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3619 	ASSERT_GE(listener, 0);
3620 
3621 	/*
3622 	 * Check that we get an ENOSYS when the listener is closed.
3623 	 */
3624 	pid = fork();
3625 	ASSERT_GE(pid, 0);
3626 	if (pid == 0) {
3627 		close(listener);
3628 		ret = syscall(__NR_getppid);
		exit(ret != -1 || errno != ENOSYS);
3630 	}
3631 
3632 	close(listener);
3633 
3634 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3635 	EXPECT_EQ(true, WIFEXITED(status));
3636 	EXPECT_EQ(0, WEXITSTATUS(status));
3637 }
3638 
3639 /*
3640  * Check that a pid in a child namespace still shows up as valid in ours.
3641  */
3642 TEST(user_notification_child_pid_ns)
3643 {
3644 	pid_t pid;
3645 	int status, listener;
3646 	struct seccomp_notif req = {};
3647 	struct seccomp_notif_resp resp = {};
3648 
3649 	ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
3650 		if (errno == EINVAL)
3651 			SKIP(return, "kernel missing CLONE_NEWUSER support");
3652 	};
3653 
3654 	listener = user_notif_syscall(__NR_getppid,
3655 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3656 	ASSERT_GE(listener, 0);
3657 
3658 	pid = fork();
3659 	ASSERT_GE(pid, 0);
3660 
3661 	if (pid == 0)
3662 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3663 
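	/*
	 * Even though the child was forked into a new pid namespace, req.pid
	 * is reported in our namespace and should match fork()'s return value.
	 */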
3664 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3665 	EXPECT_EQ(req.pid, pid);
3666 
3667 	resp.id = req.id;
3668 	resp.error = 0;
3669 	resp.val = USER_NOTIF_MAGIC;
3670 
3671 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3672 
3673 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3674 	EXPECT_EQ(true, WIFEXITED(status));
3675 	EXPECT_EQ(0, WEXITSTATUS(status));
3676 	close(listener);
3677 }
3678 
3679 /*
3680  * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3681  * invalid.
3682  */
3683 TEST(user_notification_sibling_pid_ns)
3684 {
3685 	pid_t pid, pid2;
3686 	int status, listener;
3687 	struct seccomp_notif req = {};
3688 	struct seccomp_notif_resp resp = {};
3689 
3690 	ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
3691 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3692 	}
3693 
3694 	listener = user_notif_syscall(__NR_getppid,
3695 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3696 	ASSERT_GE(listener, 0);
3697 
3698 	pid = fork();
3699 	ASSERT_GE(pid, 0);
3700 
3701 	if (pid == 0) {
3702 		ASSERT_EQ(unshare(CLONE_NEWPID), 0);
3703 
3704 		pid2 = fork();
3705 		ASSERT_GE(pid2, 0);
3706 
3707 		if (pid2 == 0)
3708 			exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3709 
3710 		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3711 		EXPECT_EQ(true, WIFEXITED(status));
3712 		EXPECT_EQ(0, WEXITSTATUS(status));
3713 		exit(WEXITSTATUS(status));
3714 	}
3715 
3716 	/* Create the sibling ns, and sibling in it. */
3717 	ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3718 		if (errno == EPERM)
3719 			SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3720 	}
3721 	ASSERT_EQ(errno, 0);
3722 
3723 	pid2 = fork();
3724 	ASSERT_GE(pid2, 0);
3725 
3726 	if (pid2 == 0) {
3727 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3728 		/*
3729 		 * The pid should be 0, i.e. the task is in some namespace that
3730 		 * we can't "see".
3731 		 */
3732 		EXPECT_EQ(req.pid, 0);
3733 
3734 		resp.id = req.id;
3735 		resp.error = 0;
3736 		resp.val = USER_NOTIF_MAGIC;
3737 
3738 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3739 		exit(0);
3740 	}
3741 
3742 	close(listener);
3743 
3744 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3745 	EXPECT_EQ(true, WIFEXITED(status));
3746 	EXPECT_EQ(0, WEXITSTATUS(status));
3747 
3748 	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3749 	EXPECT_EQ(true, WIFEXITED(status));
3750 	EXPECT_EQ(0, WEXITSTATUS(status));
3751 }
3752 
3753 TEST(user_notification_fault_recv)
3754 {
3755 	pid_t pid;
3756 	int status, listener;
3757 	struct seccomp_notif req = {};
3758 	struct seccomp_notif_resp resp = {};
3759 
3760 	ASSERT_EQ(unshare(CLONE_NEWUSER), 0) {
3761 		if (errno == EINVAL)
3762 			SKIP(return, "kernel missing CLONE_NEWUSER support");
3763 	}
3764 
3765 	listener = user_notif_syscall(__NR_getppid,
3766 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3767 	ASSERT_GE(listener, 0);
3768 
3769 	pid = fork();
3770 	ASSERT_GE(pid, 0);
3771 
3772 	if (pid == 0)
3773 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3774 
3775 	/* Do a bad recv() */
3776 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
3777 	EXPECT_EQ(errno, EFAULT);
3778 
3779 	/* We should still be able to receive this notification, though. */
3780 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3781 	EXPECT_EQ(req.pid, pid);
3782 
3783 	resp.id = req.id;
3784 	resp.error = 0;
3785 	resp.val = USER_NOTIF_MAGIC;
3786 
3787 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3788 
3789 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3790 	EXPECT_EQ(true, WIFEXITED(status));
3791 	EXPECT_EQ(0, WEXITSTATUS(status));
3792 }
3793 
3794 TEST(seccomp_get_notif_sizes)
3795 {
3796 	struct seccomp_notif_sizes sizes;
3797 
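	/*
	 * SECCOMP_GET_NOTIF_SIZES reports the structure sizes the running
	 * kernel expects, so userspace can allocate matching buffers.
	 */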
3798 	ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
3799 	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
3800 	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
3801 }
3802 
3803 TEST(user_notification_continue)
3804 {
3805 	pid_t pid;
3806 	long ret;
3807 	int status, listener;
3808 	struct seccomp_notif req = {};
3809 	struct seccomp_notif_resp resp = {};
3810 	struct pollfd pollfd;
3811 
3812 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3813 	ASSERT_EQ(0, ret) {
3814 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3815 	}
3816 
3817 	listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3818 	ASSERT_GE(listener, 0);
3819 
3820 	pid = fork();
3821 	ASSERT_GE(pid, 0);
3822 
3823 	if (pid == 0) {
3824 		int dup_fd, pipe_fds[2];
3825 		pid_t self;
3826 
3827 		ASSERT_GE(pipe(pipe_fds), 0);
3828 
3829 		dup_fd = dup(pipe_fds[0]);
3830 		ASSERT_GE(dup_fd, 0);
3831 		EXPECT_NE(pipe_fds[0], dup_fd);
3832 
3833 		self = getpid();
3834 		ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
3835 		exit(0);
3836 	}
3837 
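	/*
	 * A pending notification makes the listener readable; once it has
	 * been received, the listener becomes writable until a response is
	 * sent.
	 */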
3838 	pollfd.fd = listener;
3839 	pollfd.events = POLLIN | POLLOUT;
3840 
3841 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3842 	EXPECT_EQ(pollfd.revents, POLLIN);
3843 
3844 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3845 
3846 	pollfd.fd = listener;
3847 	pollfd.events = POLLIN | POLLOUT;
3848 
3849 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3850 	EXPECT_EQ(pollfd.revents, POLLOUT);
3851 
3852 	EXPECT_EQ(req.data.nr, __NR_dup);
3853 
3854 	resp.id = req.id;
3855 	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
3856 
3857 	/*
	 * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE requires the
	 * other response fields (error and val) to be zero.
3860 	 */
3861 	resp.error = 0;
3862 	resp.val = USER_NOTIF_MAGIC;
3863 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3864 	EXPECT_EQ(errno, EINVAL);
3865 
3866 	resp.error = USER_NOTIF_MAGIC;
3867 	resp.val = 0;
3868 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3869 	EXPECT_EQ(errno, EINVAL);
3870 
3871 	resp.error = 0;
3872 	resp.val = 0;
3873 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
3874 		if (errno == EINVAL)
3875 			SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
3876 	}
3877 
3878 skip:
3879 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3880 	EXPECT_EQ(true, WIFEXITED(status));
3881 	EXPECT_EQ(0, WEXITSTATUS(status)) {
3882 		if (WEXITSTATUS(status) == 2) {
3883 			SKIP(return, "Kernel does not support kcmp() syscall");
3884 			return;
3885 		}
3886 	}
3887 }
3888 
3889 TEST(user_notification_filter_empty)
3890 {
3891 	pid_t pid;
3892 	long ret;
3893 	int status;
3894 	struct pollfd pollfd;
3895 	struct __clone_args args = {
3896 		.flags = CLONE_FILES,
3897 		.exit_signal = SIGCHLD,
3898 	};
3899 
3900 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3901 	ASSERT_EQ(0, ret) {
3902 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3903 	}
3904 
3905 	pid = sys_clone3(&args, sizeof(args));
3906 	ASSERT_GE(pid, 0);
3907 
3908 	if (pid == 0) {
3909 		int listener;
3910 
3911 		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3912 		if (listener < 0)
3913 			_exit(EXIT_FAILURE);
3914 
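		/*
		 * The child shares its fd table with the parent (CLONE_FILES),
		 * so parking the listener at fd 200 keeps it open for the
		 * parent's poll() after this child exits.
		 */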
3915 		if (dup2(listener, 200) != 200)
3916 			_exit(EXIT_FAILURE);
3917 
3918 		close(listener);
3919 
3920 		_exit(EXIT_SUCCESS);
3921 	}
3922 
3923 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3924 	EXPECT_EQ(true, WIFEXITED(status));
3925 	EXPECT_EQ(0, WEXITSTATUS(status));
3926 
3927 	/*
3928 	 * The seccomp filter has become unused so we should be notified once
	 * the kernel gets around to cleaning up the task struct.
3930 	 */
3931 	pollfd.fd = 200;
3932 	pollfd.events = POLLHUP;
3933 
3934 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3935 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3936 }
3937 
3938 static void *do_thread(void *data)
3939 {
3940 	return NULL;
3941 }
3942 
3943 TEST(user_notification_filter_empty_threaded)
3944 {
3945 	pid_t pid;
3946 	long ret;
3947 	int status;
3948 	struct pollfd pollfd;
3949 	struct __clone_args args = {
3950 		.flags = CLONE_FILES,
3951 		.exit_signal = SIGCHLD,
3952 	};
3953 
3954 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3955 	ASSERT_EQ(0, ret) {
3956 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3957 	}
3958 
3959 	pid = sys_clone3(&args, sizeof(args));
3960 	ASSERT_GE(pid, 0);
3961 
3962 	if (pid == 0) {
3963 		pid_t pid1, pid2;
3964 		int listener, status;
3965 		pthread_t thread;
3966 
3967 		listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3968 		if (listener < 0)
3969 			_exit(EXIT_FAILURE);
3970 
3971 		if (dup2(listener, 200) != 200)
3972 			_exit(EXIT_FAILURE);
3973 
3974 		close(listener);
3975 
3976 		pid1 = fork();
3977 		if (pid1 < 0)
3978 			_exit(EXIT_FAILURE);
3979 
3980 		if (pid1 == 0)
3981 			_exit(EXIT_SUCCESS);
3982 
3983 		pid2 = fork();
3984 		if (pid2 < 0)
3985 			_exit(EXIT_FAILURE);
3986 
3987 		if (pid2 == 0)
3988 			_exit(EXIT_SUCCESS);
3989 
3990 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
3991 		    pthread_join(thread, NULL))
3992 			_exit(EXIT_FAILURE);
3993 
3994 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
3995 		    pthread_join(thread, NULL))
3996 			_exit(EXIT_FAILURE);
3997 
3998 		if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
3999 		    WEXITSTATUS(status))
4000 			_exit(EXIT_FAILURE);
4001 
4002 		if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
4003 		    WEXITSTATUS(status))
4004 			_exit(EXIT_FAILURE);
4005 
4006 		exit(EXIT_SUCCESS);
4007 	}
4008 
4009 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4010 	EXPECT_EQ(true, WIFEXITED(status));
4011 	EXPECT_EQ(0, WEXITSTATUS(status));
4012 
4013 	/*
4014 	 * The seccomp filter has become unused so we should be notified once
	 * the kernel gets around to cleaning up the task struct.
4016 	 */
4017 	pollfd.fd = 200;
4018 	pollfd.events = POLLHUP;
4019 
4020 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
4021 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
4022 }
4023 
4024 TEST(user_notification_addfd)
4025 {
4026 	pid_t pid;
4027 	long ret;
4028 	int status, listener, memfd, fd, nextfd;
4029 	struct seccomp_notif_addfd addfd = {};
4030 	struct seccomp_notif_addfd_small small = {};
4031 	struct seccomp_notif_addfd_big big = {};
4032 	struct seccomp_notif req = {};
4033 	struct seccomp_notif_resp resp = {};
4034 	/* 100 ms */
4035 	struct timespec delay = { .tv_nsec = 100000000 };
4036 
4037 	/* There may be arbitrary already-open fds at test start. */
4038 	memfd = memfd_create("test", 0);
4039 	ASSERT_GE(memfd, 0);
4040 	nextfd = memfd + 1;
4041 
4042 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4043 	ASSERT_EQ(0, ret) {
4044 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4045 	}
4046 
	/* Check that the basic notification machinery works */
4049 	listener = user_notif_syscall(__NR_getppid,
4050 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4051 	ASSERT_EQ(listener, nextfd++);
4052 
4053 	pid = fork();
4054 	ASSERT_GE(pid, 0);
4055 
4056 	if (pid == 0) {
4057 		/* fds will be added and this value is expected */
4058 		if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
4059 			exit(1);
4060 
4061 		/* Atomic addfd+send is received here. Check it is a valid fd */
4062 		if (fcntl(syscall(__NR_getppid), F_GETFD) == -1)
4063 			exit(1);
4064 
4065 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4066 	}
4067 
4068 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4069 
4070 	addfd.srcfd = memfd;
4071 	addfd.newfd = 0;
4072 	addfd.id = req.id;
4073 	addfd.flags = 0x0;
4074 
4075 	/* Verify bad newfd_flags cannot be set */
4076 	addfd.newfd_flags = ~O_CLOEXEC;
4077 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4078 	EXPECT_EQ(errno, EINVAL);
4079 	addfd.newfd_flags = O_CLOEXEC;
4080 
4081 	/* Verify bad flags cannot be set */
4082 	addfd.flags = 0xff;
4083 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4084 	EXPECT_EQ(errno, EINVAL);
4085 	addfd.flags = 0;
4086 
4087 	/* Verify that remote_fd cannot be set without setting flags */
4088 	addfd.newfd = 1;
4089 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4090 	EXPECT_EQ(errno, EINVAL);
4091 	addfd.newfd = 0;
4092 
4093 	/* Verify small size cannot be set */
4094 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
4095 	EXPECT_EQ(errno, EINVAL);
4096 
4097 	/* Verify we can't send bits filled in unknown buffer area */
4098 	memset(&big, 0xAA, sizeof(big));
4099 	big.addfd = addfd;
4100 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
4101 	EXPECT_EQ(errno, E2BIG);
4102 
4103 
4104 	/* Verify we can set an arbitrary remote fd */
4105 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4106 	EXPECT_EQ(fd, nextfd++);
4107 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4108 
4109 	/* Verify we can set an arbitrary remote fd with large size */
4110 	memset(&big, 0x0, sizeof(big));
4111 	big.addfd = addfd;
4112 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
4113 	EXPECT_EQ(fd, nextfd++);
4114 
4115 	/* Verify we can set a specific remote fd */
4116 	addfd.newfd = 42;
4117 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4118 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4119 	EXPECT_EQ(fd, 42);
4120 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4121 
4122 	/* Resume syscall */
4123 	resp.id = req.id;
4124 	resp.error = 0;
4125 	resp.val = USER_NOTIF_MAGIC;
4126 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4127 
4128 	/*
4129 	 * This sets the ID of the ADD FD to the last request plus 1. The
	 * notification ID increments by 1 per notification.
4131 	 */
4132 	addfd.id = req.id + 1;
4133 
4134 	/* This spins until the underlying notification is generated */
	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) == -1 &&
	       errno != EINPROGRESS)
4137 		nanosleep(&delay, NULL);
4138 
4139 	memset(&req, 0, sizeof(req));
4140 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4141 	ASSERT_EQ(addfd.id, req.id);
4142 
4143 	/* Verify we can do an atomic addfd and send */
4144 	addfd.newfd = 0;
4145 	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4146 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4147 	/*
4148 	 * Child has earlier "low" fds and now 42, so we expect the next
4149 	 * lowest available fd to be assigned here.
4150 	 */
4151 	EXPECT_EQ(fd, nextfd++);
4152 	ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4153 
4154 	/*
4155 	 * This sets the ID of the ADD FD to the last request plus 1. The
	 * notification ID increments by 1 per notification.
4157 	 */
4158 	addfd.id = req.id + 1;
4159 
4160 	/* This spins until the underlying notification is generated */
	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) == -1 &&
	       errno != EINPROGRESS)
4163 		nanosleep(&delay, NULL);
4164 
4165 	memset(&req, 0, sizeof(req));
4166 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4167 	ASSERT_EQ(addfd.id, req.id);
4168 
4169 	resp.id = req.id;
4170 	resp.error = 0;
4171 	resp.val = USER_NOTIF_MAGIC;
4172 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4173 
4174 	/* Wait for child to finish. */
4175 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4176 	EXPECT_EQ(true, WIFEXITED(status));
4177 	EXPECT_EQ(0, WEXITSTATUS(status));
4178 
4179 	close(memfd);
4180 }
4181 
4182 TEST(user_notification_addfd_rlimit)
4183 {
4184 	pid_t pid;
4185 	long ret;
4186 	int status, listener, memfd;
4187 	struct seccomp_notif_addfd addfd = {};
4188 	struct seccomp_notif req = {};
4189 	struct seccomp_notif_resp resp = {};
4190 	const struct rlimit lim = {
4191 		.rlim_cur	= 0,
4192 		.rlim_max	= 0,
4193 	};
4194 
4195 	memfd = memfd_create("test", 0);
4196 	ASSERT_GE(memfd, 0);
4197 
4198 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4199 	ASSERT_EQ(0, ret) {
4200 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4201 	}
4202 
4203 	/* Check that the basic notification machinery works */
4204 	listener = user_notif_syscall(__NR_getppid,
4205 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4206 	ASSERT_GE(listener, 0);
4207 
4208 	pid = fork();
4209 	ASSERT_GE(pid, 0);
4210 
4211 	if (pid == 0)
4212 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4213 
4214 
4215 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4216 
4217 	ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
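	/* RLIMIT_NOFILE is now 0 in the target, so fd injection must fail. */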
4218 
4219 	addfd.srcfd = memfd;
4220 	addfd.newfd_flags = O_CLOEXEC;
4221 	addfd.newfd = 0;
4222 	addfd.id = req.id;
4223 	addfd.flags = 0;
4224 
4225 	/* Should probably spot check /proc/sys/fs/file-nr */
4226 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4227 	EXPECT_EQ(errno, EMFILE);
4228 
4229 	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4230 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4231 	EXPECT_EQ(errno, EMFILE);
4232 
4233 	addfd.newfd = 100;
4234 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4235 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4236 	EXPECT_EQ(errno, EBADF);
4237 
4238 	resp.id = req.id;
4239 	resp.error = 0;
4240 	resp.val = USER_NOTIF_MAGIC;
4241 
4242 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4243 
4244 	/* Wait for child to finish. */
4245 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4246 	EXPECT_EQ(true, WIFEXITED(status));
4247 	EXPECT_EQ(0, WEXITSTATUS(status));
4248 
4249 	close(memfd);
4250 }
4251 
4252 /* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */
4253 FIXTURE(O_SUSPEND_SECCOMP) {
4254 	pid_t pid;
4255 };
4256 
4257 FIXTURE_SETUP(O_SUSPEND_SECCOMP)
4258 {
4259 	ERRNO_FILTER(block_read, E2BIG);
4260 	cap_value_t cap_list[] = { CAP_SYS_ADMIN };
4261 	cap_t caps;
4262 
4263 	self->pid = 0;
4264 
4265 	/* make sure we don't have CAP_SYS_ADMIN */
4266 	caps = cap_get_proc();
4267 	ASSERT_NE(NULL, caps);
4268 	ASSERT_EQ(0, cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
4269 	ASSERT_EQ(0, cap_set_proc(caps));
4270 	cap_free(caps);
4271 
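	/*
	 * Install a filter (inherited by the forked child below); without
	 * CAP_SYS_ADMIN, the PTRACE_O_SUSPEND_SECCOMP attempts in the tests
	 * are expected to fail with EPERM.
	 */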
4272 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
4273 	ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_block_read));
4274 
4275 	self->pid = fork();
4276 	ASSERT_GE(self->pid, 0);
4277 
4278 	if (self->pid == 0) {
4279 		while (1)
4280 			pause();
4281 		_exit(127);
4282 	}
4283 }
4284 
4285 FIXTURE_TEARDOWN(O_SUSPEND_SECCOMP)
4286 {
4287 	if (self->pid)
4288 		kill(self->pid, SIGKILL);
4289 }
4290 
4291 TEST_F(O_SUSPEND_SECCOMP, setoptions)
4292 {
4293 	int wstatus;
4294 
4295 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, self->pid, NULL, 0));
4296 	ASSERT_EQ(self->pid, wait(&wstatus));
4297 	ASSERT_EQ(-1, ptrace(PTRACE_SETOPTIONS, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP));
4298 	if (errno == EINVAL)
4299 		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4300 	ASSERT_EQ(EPERM, errno);
4301 }
4302 
4303 TEST_F(O_SUSPEND_SECCOMP, seize)
4304 {
4305 	int ret;
4306 
4307 	ret = ptrace(PTRACE_SEIZE, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP);
4308 	ASSERT_EQ(-1, ret);
4309 	if (errno == EINVAL)
4310 		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4311 	ASSERT_EQ(EPERM, errno);
4312 }
4313 
4314 /*
 * get_nth - Get the nth space-separated entry in a file.
 *
 * Stores the entry (which the caller must free()) in *entry and returns its
 * length. Fails the test if the entry is zero-length.
4319  */
4320 static ssize_t get_nth(struct __test_metadata *_metadata, const char *path,
4321 		     const unsigned int position, char **entry)
4322 {
4323 	char *line = NULL;
4324 	unsigned int i;
4325 	ssize_t nread;
4326 	size_t len = 0;
4327 	FILE *f;
4328 
4329 	f = fopen(path, "r");
4330 	ASSERT_NE(f, NULL) {
4331 		TH_LOG("Could not open %s: %s", path, strerror(errno));
4332 	}
4333 
4334 	for (i = 0; i < position; i++) {
4335 		nread = getdelim(&line, &len, ' ', f);
4336 		ASSERT_GE(nread, 0) {
4337 			TH_LOG("Failed to read %d entry in file %s", i, path);
4338 		}
4339 	}
4340 	fclose(f);
4341 
4342 	ASSERT_GT(nread, 0) {
4343 		TH_LOG("Entry in file %s had zero length", path);
4344 	}
4345 
4346 	*entry = line;
4347 	return nread - 1;
4348 }
4349 
4350 /* For a given PID, get the task state (D, R, etc...) */
4351 static char get_proc_stat(struct __test_metadata *_metadata, pid_t pid)
4352 {
4353 	char proc_path[100] = {0};
4354 	char status;
4355 	char *line;
4356 
4357 	snprintf(proc_path, sizeof(proc_path), "/proc/%d/stat", pid);
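	/*
	 * The task state is the third whitespace-separated field of
	 * /proc/<pid>/stat (assuming the comm field contains no spaces, which
	 * holds for this test binary).
	 */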
4358 	ASSERT_EQ(get_nth(_metadata, proc_path, 3, &line), 1);
4359 
4360 	status = *line;
4361 	free(line);
4362 
4363 	return status;
4364 }
4365 
4366 TEST(user_notification_fifo)
4367 {
4368 	struct seccomp_notif_resp resp = {};
4369 	struct seccomp_notif req = {};
4370 	int i, status, listener;
4371 	pid_t pid, pids[3];
4372 	__u64 baseid;
4373 	long ret;
4374 	/* 100 ms */
4375 	struct timespec delay = { .tv_nsec = 100000000 };
4376 
4377 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4378 	ASSERT_EQ(0, ret) {
4379 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4380 	}
4381 
4382 	/* Setup a listener */
4383 	listener = user_notif_syscall(__NR_getppid,
4384 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4385 	ASSERT_GE(listener, 0);
4386 
4387 	pid = fork();
4388 	ASSERT_GE(pid, 0);
4389 
4390 	if (pid == 0) {
4391 		ret = syscall(__NR_getppid);
4392 		exit(ret != USER_NOTIF_MAGIC);
4393 	}
4394 
4395 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
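	/*
	 * Notification ids are handed out sequentially, so the children
	 * started below should generate baseid, baseid + 1, and so on.
	 */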
4396 	baseid = req.id + 1;
4397 
4398 	resp.id = req.id;
4399 	resp.error = 0;
4400 	resp.val = USER_NOTIF_MAGIC;
4401 
	/* Respond with flags == 0 (a plain reply) and make sure it is accepted */
4403 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4404 
4405 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4406 	EXPECT_EQ(true, WIFEXITED(status));
4407 	EXPECT_EQ(0, WEXITSTATUS(status));
4408 
4409 	/* Start children, and generate notifications */
4410 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4411 		pid = fork();
4412 		if (pid == 0) {
4413 			ret = syscall(__NR_getppid);
4414 			exit(ret != USER_NOTIF_MAGIC);
4415 		}
4416 		pids[i] = pid;
4417 	}
4418 
4419 	/* This spins until all of the children are sleeping */
4420 restart_wait:
4421 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4422 		if (get_proc_stat(_metadata, pids[i]) != 'S') {
4423 			nanosleep(&delay, NULL);
4424 			goto restart_wait;
4425 		}
4426 	}
4427 
4428 	/* Read the notifications in order (and respond) */
4429 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4430 		memset(&req, 0, sizeof(req));
4431 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4432 		EXPECT_EQ(req.id, baseid + i);
4433 		resp.id = req.id;
4434 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4435 	}
4436 
4437 	/* Make sure notifications were received */
4438 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4439 		EXPECT_EQ(waitpid(pids[i], &status, 0), pids[i]);
4440 		EXPECT_EQ(true, WIFEXITED(status));
4441 		EXPECT_EQ(0, WEXITSTATUS(status));
4442 	}
4443 }
4444 
4445 /* get_proc_syscall - Get the syscall in progress for a given pid
4446  *
 * Returns the current syscall number for a given process.
 * Returns -1 if the process is not in a syscall (i.e. running or blocked).
4449  */
4450 static long get_proc_syscall(struct __test_metadata *_metadata, int pid)
4451 {
4452 	char proc_path[100] = {0};
4453 	long ret = -1;
4454 	ssize_t nread;
4455 	char *line;
4456 
4457 	snprintf(proc_path, sizeof(proc_path), "/proc/%d/syscall", pid);
4458 	nread = get_nth(_metadata, proc_path, 1, &line);
4459 	ASSERT_GT(nread, 0);
4460 
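	/*
	 * The first field of /proc/<pid>/syscall is either the word "running",
	 * -1, or the current syscall number in decimal.
	 */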
4461 	if (!strncmp("running", line, MIN(7, nread)))
4462 		ret = strtol(line, NULL, 16);
4463 
4464 	free(line);
4465 	return ret;
4466 }
4467 
4468 /* Ensure non-fatal signals prior to receive are unmodified */
4469 TEST(user_notification_wait_killable_pre_notification)
4470 {
4471 	struct sigaction new_action = {
4472 		.sa_handler = signal_handler,
4473 	};
4474 	int listener, status, sk_pair[2];
4475 	pid_t pid;
4476 	long ret;
4477 	char c;
4478 	/* 100 ms */
4479 	struct timespec delay = { .tv_nsec = 100000000 };
4480 
4481 	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
4482 
4483 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4484 	ASSERT_EQ(0, ret)
4485 	{
4486 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4487 	}
4488 
4489 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
4490 
4491 	listener = user_notif_syscall(
4492 		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4493 				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4494 	ASSERT_GE(listener, 0);
4495 
4496 	/*
	 * Check that SIGUSR1 interrupts the blocked syscall prior to the
	 * notification being received. SIGUSR1 is wired up to a custom signal
	 * handler, and we make sure that handler gets called.
4500 	 */
4501 	pid = fork();
4502 	ASSERT_GE(pid, 0);
4503 
4504 	if (pid == 0) {
4505 		close(sk_pair[0]);
4506 		handled = sk_pair[1];
4507 
4508 		/* Setup the non-fatal sigaction without SA_RESTART */
4509 		if (sigaction(SIGUSR1, &new_action, NULL)) {
4510 			perror("sigaction");
4511 			exit(1);
4512 		}
4513 
4514 		ret = syscall(__NR_getppid);
4515 		/* Make sure we got a return from a signal interruption */
4516 		exit(ret != -1 || errno != EINTR);
4517 	}
4518 
4519 	/*
4520 	 * Make sure we've gotten to the seccomp user notification wait
4521 	 * from getppid prior to sending any signals
4522 	 */
4523 	while (get_proc_syscall(_metadata, pid) != __NR_getppid &&
4524 	       get_proc_stat(_metadata, pid) != 'S')
4525 		nanosleep(&delay, NULL);
4526 
4527 	/* Send non-fatal kill signal */
4528 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
4529 
4530 	/* wait for process to exit (exit checks for EINTR) */
4531 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4532 	EXPECT_EQ(true, WIFEXITED(status));
4533 	EXPECT_EQ(0, WEXITSTATUS(status));
4534 
4535 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
4536 }
4537 
4538 /* Ensure non-fatal signals after receive are blocked */
4539 TEST(user_notification_wait_killable)
4540 {
4541 	struct sigaction new_action = {
4542 		.sa_handler = signal_handler,
4543 	};
4544 	struct seccomp_notif_resp resp = {};
4545 	struct seccomp_notif req = {};
4546 	int listener, status, sk_pair[2];
4547 	pid_t pid;
4548 	long ret;
4549 	char c;
4550 	/* 100 ms */
4551 	struct timespec delay = { .tv_nsec = 100000000 };
4552 
4553 	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
4554 
4555 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4556 	ASSERT_EQ(0, ret)
4557 	{
4558 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4559 	}
4560 
4561 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
4562 
4563 	listener = user_notif_syscall(
4564 		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4565 				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4566 	ASSERT_GE(listener, 0);
4567 
4568 	pid = fork();
4569 	ASSERT_GE(pid, 0);
4570 
4571 	if (pid == 0) {
4572 		close(sk_pair[0]);
4573 		handled = sk_pair[1];
4574 
4575 		/* Setup the sigaction without SA_RESTART */
4576 		if (sigaction(SIGUSR1, &new_action, NULL)) {
4577 			perror("sigaction");
4578 			exit(1);
4579 		}
4580 
4581 		/* Make sure that the syscall is completed (no EINTR) */
4582 		ret = syscall(__NR_getppid);
4583 		exit(ret != USER_NOTIF_MAGIC);
4584 	}
4585 
4586 	/*
	 * Get the notification, to move the notifying process into an
	 * uninterruptible (TASK_KILLABLE) sleep.
4589 	 */
4590 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4591 	/* Send non-fatal kill signal */
4592 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
4593 
4594 	/*
	 * Make sure the task moves to TASK_KILLABLE by waiting for the
	 * D (uninterruptible sleep) state after it receives the non-fatal
	 * signal.
4597 	 */
4598 	while (get_proc_stat(_metadata, pid) != 'D')
4599 		nanosleep(&delay, NULL);
4600 
4601 	resp.id = req.id;
4602 	resp.val = USER_NOTIF_MAGIC;
4603 	/* Make sure the notification is found and able to be replied to */
4604 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4605 
4606 	/*
4607 	 * Make sure that the signal handler does get called once we're back in
4608 	 * userspace.
4609 	 */
4610 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
4611 	/* wait for process to exit (exit checks for USER_NOTIF_MAGIC) */
4612 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4613 	EXPECT_EQ(true, WIFEXITED(status));
4614 	EXPECT_EQ(0, WEXITSTATUS(status));
4615 }
4616 
4617 /* Ensure fatal signals after receive are not blocked */
4618 TEST(user_notification_wait_killable_fatal)
4619 {
4620 	struct seccomp_notif req = {};
4621 	int listener, status;
4622 	pid_t pid;
4623 	long ret;
4624 	/* 100 ms */
4625 	struct timespec delay = { .tv_nsec = 100000000 };
4626 
4627 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4628 	ASSERT_EQ(0, ret)
4629 	{
4630 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4631 	}
4632 
4633 	listener = user_notif_syscall(
4634 		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4635 				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4636 	ASSERT_GE(listener, 0);
4637 
4638 	pid = fork();
4639 	ASSERT_GE(pid, 0);
4640 
4641 	if (pid == 0) {
4642 		/* This should never complete as it should get a SIGTERM */
4643 		syscall(__NR_getppid);
4644 		exit(1);
4645 	}
4646 
4647 	while (get_proc_stat(_metadata, pid) != 'S')
4648 		nanosleep(&delay, NULL);
4649 
4650 	/*
	 * Get the notification, to move the notifying process into an
	 * uninterruptible (TASK_KILLABLE) sleep.
4653 	 */
4654 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4655 	/* Kill the process with a fatal signal */
4656 	EXPECT_EQ(kill(pid, SIGTERM), 0);
4657 
4658 	/*
4659 	 * Wait for the process to exit, and make sure the process terminated
4660 	 * due to the SIGTERM signal.
4661 	 */
4662 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4663 	EXPECT_EQ(true, WIFSIGNALED(status));
4664 	EXPECT_EQ(SIGTERM, WTERMSIG(status));
4665 }
4666 
4667 /*
4668  * TODO:
4669  * - expand NNP testing
4670  * - better arch-specific TRACE and TRAP handlers.
4671  * - endianness checking when appropriate
4672  * - 64-bit arg prodding
4673  * - arch value testing (x86 modes especially)
4674  * - verify that FILTER_FLAG_LOG filters generate log messages
4675  * - verify that RET_LOG generates log messages
4676  */
4677 
4678 TEST_HARNESS_MAIN
4679