xref: /openbmc/linux/kernel/seccomp.c (revision 6562c9ac)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * linux/kernel/seccomp.c
4  *
5  * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
6  *
7  * Copyright (C) 2012 Google, Inc.
8  * Will Drewry <wad@chromium.org>
9  *
10  * This defines a simple but solid secure-computing facility.
11  *
12  * Mode 1 uses a fixed list of allowed system calls.
13  * Mode 2 allows user-defined system call filters in the form
14  *        of Berkeley Packet Filters/Linux Socket Filters.
15  */
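/*
 * Illustrative only, not part of the kernel build: a minimal userspace
 * sketch of how the two modes above are entered, assuming the usual
 * <linux/seccomp.h> and <sys/prctl.h> UAPI definitions.
 *
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);		// Mode 1
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);			// required unless
 *								// CAP_SYS_ADMIN is held
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);	// Mode 2
 *
 * "prog" stands for a struct sock_fprog built by the caller.
 */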
16 #define pr_fmt(fmt) "seccomp: " fmt
17 
18 #include <linux/refcount.h>
19 #include <linux/audit.h>
20 #include <linux/compat.h>
21 #include <linux/coredump.h>
22 #include <linux/kmemleak.h>
23 #include <linux/nospec.h>
24 #include <linux/prctl.h>
25 #include <linux/sched.h>
26 #include <linux/sched/task_stack.h>
27 #include <linux/seccomp.h>
28 #include <linux/slab.h>
29 #include <linux/syscalls.h>
30 #include <linux/sysctl.h>
31 
32 /* Not exposed in headers: strictly internal use only. */
33 #define SECCOMP_MODE_DEAD	(SECCOMP_MODE_FILTER + 1)
34 
35 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
36 #include <asm/syscall.h>
37 #endif
38 
39 #ifdef CONFIG_SECCOMP_FILTER
40 #include <linux/file.h>
41 #include <linux/filter.h>
42 #include <linux/pid.h>
43 #include <linux/ptrace.h>
44 #include <linux/capability.h>
45 #include <linux/uaccess.h>
46 #include <linux/anon_inodes.h>
47 #include <linux/lockdep.h>
48 
49 /*
50  * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
51  * wrong direction flag in the ioctl number. This is the broken one,
52  * which the kernel needs to keep supporting until all userspaces stop
53  * using the wrong command number.
54  */
55 #define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR	SECCOMP_IOR(2, __u64)
56 
57 enum notify_state {
58 	SECCOMP_NOTIFY_INIT,
59 	SECCOMP_NOTIFY_SENT,
60 	SECCOMP_NOTIFY_REPLIED,
61 };
62 
63 struct seccomp_knotif {
64 	/* The task_struct of the task whose filter triggered the notification */
65 	struct task_struct *task;
66 
67 	/* The "cookie" for this request; this is unique for this filter. */
68 	u64 id;
69 
70 	/*
71 	 * The seccomp data. This pointer is valid the entire time this
72 	 * notification is active, since it comes from __seccomp_filter which
73 	 * eclipses the entire lifecycle here.
74 	 */
75 	const struct seccomp_data *data;
76 
77 	/*
78 	 * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
79 	 * struct seccomp_knotif is created and starts out in INIT. Once the
80 	 * handler reads the notification off of an FD, it transitions to SENT.
81 	 * If a signal is received the state transitions back to INIT and
82 	 * another message is sent. When the userspace handler replies, state
83 	 * transitions to REPLIED.
84 	 */
85 	enum notify_state state;
86 
87 	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
88 	int error;
89 	long val;
90 	u32 flags;
91 
92 	/*
93 	 * Signals when this has changed states, such as the listener
94 	 * dying, a new seccomp addfd message, or changing to REPLIED
95 	 */
96 	struct completion ready;
97 
98 	struct list_head list;
99 
100 	/* outstanding addfd requests */
101 	struct list_head addfd;
102 };
103 
104 /**
105  * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
106  *
107  * @file: A reference to the file to install in the other task
108  * @fd: The fd number to install it at. If the fd number is -1, it means the
109  *      installing process should allocate the fd as normal.
110  * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
111  *         is allowed.
112  * @ioctl_flags: The flags used for the seccomp_addfd ioctl.
113  * @ret: The return value of the installing process. It is set to the fd num
114  *       upon success (>= 0).
115  * @completion: Indicates that the installing process has completed fd
116  *              installation, or gone away (either due to successful
117  *              reply, or signal)
118  *
119  */
120 struct seccomp_kaddfd {
121 	struct file *file;
122 	int fd;
123 	unsigned int flags;
124 	__u32 ioctl_flags;
125 
126 	union {
127 		bool setfd;
128 		/* To only be set on reply */
129 		int ret;
130 	};
131 	struct completion completion;
132 	struct list_head list;
133 };
134 
135 /**
136  * struct notification - container for seccomp userspace notifications. Since
137  * most seccomp filters will not have notification listeners attached and this
138  * structure is fairly large, we store the notification-specific stuff in a
139  * separate structure.
140  *
141  * @request: A semaphore that users of this notification can wait on for
142  *           changes. Actual reads and writes are still controlled with
143  *           filter->notify_lock.
144  * @next_id: The id of the next request.
145  * @notifications: A list of struct seccomp_knotif elements.
146  */
147 struct notification {
148 	struct semaphore request;
149 	u64 next_id;
150 	struct list_head notifications;
151 };
152 
153 #ifdef SECCOMP_ARCH_NATIVE
154 /**
155  * struct action_cache - per-filter cache of seccomp actions per
156  * arch/syscall pair
157  *
158  * @allow_native: A bitmap where each bit represents whether the
159  *		  filter will always allow the syscall, for the
160  *		  native architecture.
161  * @allow_compat: A bitmap where each bit represents whether the
162  *		  filter will always allow the syscall, for the
163  *		  compat architecture.
164  */
165 struct action_cache {
166 	DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
167 #ifdef SECCOMP_ARCH_COMPAT
168 	DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
169 #endif
170 };
171 #else
172 struct action_cache { };
173 
174 static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
175 					     const struct seccomp_data *sd)
176 {
177 	return false;
178 }
179 
180 static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
181 {
182 }
183 #endif /* SECCOMP_ARCH_NATIVE */
184 
185 /**
186  * struct seccomp_filter - container for seccomp BPF programs
187  *
188  * @refs: Reference count to manage the object lifetime.
189  *	  A filter's reference count is incremented for each directly
190  *	  attached task, once for the dependent filter, and if
191  *	  requested for the user notifier. When @refs reaches zero,
192  *	  the filter can be freed.
193  * @users: A filter's @users count is incremented for each directly
194  *         attached task (filter installation, fork(), thread_sync),
195  *	   and once for the dependent filter (tracked in filter->prev).
196  *	   When it reaches zero it indicates that no direct or indirect
197  *	   users of that filter exist. No new tasks can get associated with
198  *	   this filter after reaching 0. The @users count is always smaller
199  *	   or equal to @refs. Hence, reaching 0 for @users does not mean
200  *	   the filter can be freed.
201  * @cache: cache of arch/syscall mappings to actions
202  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
203  * @wait_killable_recv: Put notifying process in killable state once the
204  *			notification is received by the userspace listener.
205  * @prev: points to a previously installed, or inherited, filter
206  * @prog: the BPF program to evaluate
207  * @notif: the struct that holds all notification related information
208  * @notify_lock: A lock for all notification-related accesses.
209  * @wqh: A wait queue for poll if a notifier is in use.
210  *
211  * seccomp_filter objects are organized in a tree linked via the @prev
212  * pointer.  For any task, it appears to be a singly-linked list starting
213  * with current->seccomp.filter, the most recently attached or inherited filter.
214  * However, multiple filters may share a @prev node, by way of fork(), which
215  * results in a unidirectional tree existing in memory.  This is similar to
216  * how namespaces work.
217  *
218  * seccomp_filter objects should never be modified after being attached
219  * to a task_struct (other than @refs).
220  */
221 struct seccomp_filter {
222 	refcount_t refs;
223 	refcount_t users;
224 	bool log;
225 	bool wait_killable_recv;
226 	struct action_cache cache;
227 	struct seccomp_filter *prev;
228 	struct bpf_prog *prog;
229 	struct notification *notif;
230 	struct mutex notify_lock;
231 	wait_queue_head_t wqh;
232 };
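/*
 * Illustrative only: the tree shape described above. If a task with filter A
 * forks and both tasks then attach their own filters, each task still walks a
 * simple list via @prev, but A is shared:
 *
 *	parent: B -> A -> NULL
 *	child:  C -> A -> NULL
 *
 * A stays allocated as long as either branch still reaches it via @prev.
 */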
233 
234 /* Limit any path through the tree to 256KB worth of instructions. */
235 #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
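/* That is 262144 bytes / 8-byte classic BPF instructions == 32768 insns. */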
236 
237 /*
238  * Endianness is explicitly ignored and left for BPF program authors to manage
239  * as per the specific architecture.
240  */
241 static void populate_seccomp_data(struct seccomp_data *sd)
242 {
243 	/*
244 	 * Instead of using current_pt_regs(), we're already doing the work
245 	 * to safely fetch "current", so just use "task" everywhere below.
246 	 */
247 	struct task_struct *task = current;
248 	struct pt_regs *regs = task_pt_regs(task);
249 	unsigned long args[6];
250 
251 	sd->nr = syscall_get_nr(task, regs);
252 	sd->arch = syscall_get_arch(task);
253 	syscall_get_arguments(task, regs, args);
254 	sd->args[0] = args[0];
255 	sd->args[1] = args[1];
256 	sd->args[2] = args[2];
257 	sd->args[3] = args[3];
258 	sd->args[4] = args[4];
259 	sd->args[5] = args[5];
260 	sd->instruction_pointer = KSTK_EIP(task);
261 }
262 
263 /**
264  *	seccomp_check_filter - verify seccomp filter code
265  *	@filter: filter to verify
266  *	@flen: length of filter
267  *
268  * Takes a previously checked filter (by bpf_check_classic) and
269  * redirects all filter code that loads struct sk_buff data
270  * and related data through seccomp_bpf_load.  It also
271  * enforces length and alignment checking of those loads.
272  *
273  * Returns 0 if the rule set is legal or -EINVAL if not.
274  */
275 static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
276 {
277 	int pc;
278 	for (pc = 0; pc < flen; pc++) {
279 		struct sock_filter *ftest = &filter[pc];
280 		u16 code = ftest->code;
281 		u32 k = ftest->k;
282 
283 		switch (code) {
284 		case BPF_LD | BPF_W | BPF_ABS:
285 			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
286 			/* 32-bit aligned and not out of bounds. */
287 			if (k >= sizeof(struct seccomp_data) || k & 3)
288 				return -EINVAL;
289 			continue;
290 		case BPF_LD | BPF_W | BPF_LEN:
291 			ftest->code = BPF_LD | BPF_IMM;
292 			ftest->k = sizeof(struct seccomp_data);
293 			continue;
294 		case BPF_LDX | BPF_W | BPF_LEN:
295 			ftest->code = BPF_LDX | BPF_IMM;
296 			ftest->k = sizeof(struct seccomp_data);
297 			continue;
298 		/* Explicitly include allowed calls. */
299 		case BPF_RET | BPF_K:
300 		case BPF_RET | BPF_A:
301 		case BPF_ALU | BPF_ADD | BPF_K:
302 		case BPF_ALU | BPF_ADD | BPF_X:
303 		case BPF_ALU | BPF_SUB | BPF_K:
304 		case BPF_ALU | BPF_SUB | BPF_X:
305 		case BPF_ALU | BPF_MUL | BPF_K:
306 		case BPF_ALU | BPF_MUL | BPF_X:
307 		case BPF_ALU | BPF_DIV | BPF_K:
308 		case BPF_ALU | BPF_DIV | BPF_X:
309 		case BPF_ALU | BPF_AND | BPF_K:
310 		case BPF_ALU | BPF_AND | BPF_X:
311 		case BPF_ALU | BPF_OR | BPF_K:
312 		case BPF_ALU | BPF_OR | BPF_X:
313 		case BPF_ALU | BPF_XOR | BPF_K:
314 		case BPF_ALU | BPF_XOR | BPF_X:
315 		case BPF_ALU | BPF_LSH | BPF_K:
316 		case BPF_ALU | BPF_LSH | BPF_X:
317 		case BPF_ALU | BPF_RSH | BPF_K:
318 		case BPF_ALU | BPF_RSH | BPF_X:
319 		case BPF_ALU | BPF_NEG:
320 		case BPF_LD | BPF_IMM:
321 		case BPF_LDX | BPF_IMM:
322 		case BPF_MISC | BPF_TAX:
323 		case BPF_MISC | BPF_TXA:
324 		case BPF_LD | BPF_MEM:
325 		case BPF_LDX | BPF_MEM:
326 		case BPF_ST:
327 		case BPF_STX:
328 		case BPF_JMP | BPF_JA:
329 		case BPF_JMP | BPF_JEQ | BPF_K:
330 		case BPF_JMP | BPF_JEQ | BPF_X:
331 		case BPF_JMP | BPF_JGE | BPF_K:
332 		case BPF_JMP | BPF_JGE | BPF_X:
333 		case BPF_JMP | BPF_JGT | BPF_K:
334 		case BPF_JMP | BPF_JGT | BPF_X:
335 		case BPF_JMP | BPF_JSET | BPF_K:
336 		case BPF_JMP | BPF_JSET | BPF_X:
337 			continue;
338 		default:
339 			return -EINVAL;
340 		}
341 	}
342 	return 0;
343 }
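/*
 * Illustrative only: the kind of classic BPF program this rewriting targets.
 * The absolute load of seccomp_data->nr below is exactly the instruction that
 * gets redirected away from sk_buff accesses (userspace sketch; names like
 * "insns" and "prog" are just placeholders):
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct seccomp_data, nr)),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *	};
 *	struct sock_fprog prog = {
 *		.len = sizeof(insns) / sizeof(insns[0]),
 *		.filter = insns,
 *	};
 */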
344 
345 #ifdef SECCOMP_ARCH_NATIVE
346 static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
347 						    size_t bitmap_size,
348 						    int syscall_nr)
349 {
350 	if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
351 		return false;
352 	syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
353 
354 	return test_bit(syscall_nr, bitmap);
355 }
356 
357 /**
358  * seccomp_cache_check_allow - lookup seccomp cache
359  * @sfilter: The seccomp filter
360  * @sd: The seccomp data to lookup the cache with
361  *
362  * Returns true if the seccomp_data is cached and allowed.
363  */
364 static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
365 					     const struct seccomp_data *sd)
366 {
367 	int syscall_nr = sd->nr;
368 	const struct action_cache *cache = &sfilter->cache;
369 
370 #ifndef SECCOMP_ARCH_COMPAT
371 	/* A native-only architecture doesn't need to check sd->arch. */
372 	return seccomp_cache_check_allow_bitmap(cache->allow_native,
373 						SECCOMP_ARCH_NATIVE_NR,
374 						syscall_nr);
375 #else
376 	if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
377 		return seccomp_cache_check_allow_bitmap(cache->allow_native,
378 							SECCOMP_ARCH_NATIVE_NR,
379 							syscall_nr);
380 	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
381 		return seccomp_cache_check_allow_bitmap(cache->allow_compat,
382 							SECCOMP_ARCH_COMPAT_NR,
383 							syscall_nr);
384 #endif /* SECCOMP_ARCH_COMPAT */
385 
386 	WARN_ON_ONCE(true);
387 	return false;
388 }
389 #endif /* SECCOMP_ARCH_NATIVE */
390 
391 /**
392  * seccomp_run_filters - evaluates all seccomp filters against @sd
393  * @sd: optional seccomp data to be passed to filters
394  * @match: stores struct seccomp_filter that resulted in the return value,
395  *         unless filter returned SECCOMP_RET_ALLOW, in which case it will
396  *         be unchanged.
397  *
398  * Returns valid seccomp BPF response codes.
399  */
400 #define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
401 static u32 seccomp_run_filters(const struct seccomp_data *sd,
402 			       struct seccomp_filter **match)
403 {
404 	u32 ret = SECCOMP_RET_ALLOW;
405 	/* Make sure cross-thread synced filter points somewhere sane. */
406 	struct seccomp_filter *f =
407 			READ_ONCE(current->seccomp.filter);
408 
409 	/* Ensure unexpected behavior doesn't result in failing open. */
410 	if (WARN_ON(f == NULL))
411 		return SECCOMP_RET_KILL_PROCESS;
412 
413 	if (seccomp_cache_check_allow(f, sd))
414 		return SECCOMP_RET_ALLOW;
415 
416 	/*
417 	 * All filters in the list are evaluated and the lowest BPF return
418 	 * value always takes priority (ignoring the DATA).
419 	 */
420 	for (; f; f = f->prev) {
421 		u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);
422 
423 		if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
424 			ret = cur_ret;
425 			*match = f;
426 		}
427 	}
428 	return ret;
429 }
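/*
 * Illustrative only: since the numerically smallest action wins, stacking
 * filters can only tighten the policy. For example, if an inherited filter
 * returns SECCOMP_RET_ERRNO for a syscall, a newly attached filter returning
 * SECCOMP_RET_ALLOW (the largest action value) cannot override it; the
 * syscall still fails with the errno.
 */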
430 #endif /* CONFIG_SECCOMP_FILTER */
431 
432 static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
433 {
434 	assert_spin_locked(&current->sighand->siglock);
435 
436 	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
437 		return false;
438 
439 	return true;
440 }
441 
442 void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
443 
444 static inline void seccomp_assign_mode(struct task_struct *task,
445 				       unsigned long seccomp_mode,
446 				       unsigned long flags)
447 {
448 	assert_spin_locked(&task->sighand->siglock);
449 
450 	task->seccomp.mode = seccomp_mode;
451 	/*
452 	 * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and
453 	 * filter) is set.
454 	 */
455 	smp_mb__before_atomic();
456 	/* Assume default seccomp processes want spec flaw mitigation. */
457 	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
458 		arch_seccomp_spec_mitigate(task);
459 	set_task_syscall_work(task, SECCOMP);
460 }
461 
462 #ifdef CONFIG_SECCOMP_FILTER
463 /* Returns 1 if the parent is an ancestor of the child. */
464 static int is_ancestor(struct seccomp_filter *parent,
465 		       struct seccomp_filter *child)
466 {
467 	/* NULL is the root ancestor. */
468 	if (parent == NULL)
469 		return 1;
470 	for (; child; child = child->prev)
471 		if (child == parent)
472 			return 1;
473 	return 0;
474 }
475 
476 /**
477  * seccomp_can_sync_threads: checks if all threads can be synchronized
478  *
479  * Expects sighand and cred_guard_mutex locks to be held.
480  *
481  * Returns 0 on success, -ve on error, or the pid of a thread which was
482  * either not in the correct seccomp mode or did not have an ancestral
483  * seccomp filter.
484  */
485 static inline pid_t seccomp_can_sync_threads(void)
486 {
487 	struct task_struct *thread, *caller;
488 
489 	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
490 	assert_spin_locked(&current->sighand->siglock);
491 
492 	/* Validate all threads being eligible for synchronization. */
493 	caller = current;
494 	for_each_thread(caller, thread) {
495 		pid_t failed;
496 
497 		/* Skip current, since it is initiating the sync. */
498 		if (thread == caller)
499 			continue;
500 
501 		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
502 		    (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
503 		     is_ancestor(thread->seccomp.filter,
504 				 caller->seccomp.filter)))
505 			continue;
506 
507 		/* Return the first thread that cannot be synchronized. */
508 		failed = task_pid_vnr(thread);
509 		/* If the pid cannot be resolved, then return -ESRCH */
510 		if (WARN_ON(failed == 0))
511 			failed = -ESRCH;
512 		return failed;
513 	}
514 
515 	return 0;
516 }
517 
518 static inline void seccomp_filter_free(struct seccomp_filter *filter)
519 {
520 	if (filter) {
521 		bpf_prog_destroy(filter->prog);
522 		kfree(filter);
523 	}
524 }
525 
526 static void __seccomp_filter_orphan(struct seccomp_filter *orig)
527 {
528 	while (orig && refcount_dec_and_test(&orig->users)) {
529 		if (waitqueue_active(&orig->wqh))
530 			wake_up_poll(&orig->wqh, EPOLLHUP);
531 		orig = orig->prev;
532 	}
533 }
534 
535 static void __put_seccomp_filter(struct seccomp_filter *orig)
536 {
537 	/* Clean up single-reference branches iteratively. */
538 	while (orig && refcount_dec_and_test(&orig->refs)) {
539 		struct seccomp_filter *freeme = orig;
540 		orig = orig->prev;
541 		seccomp_filter_free(freeme);
542 	}
543 }
544 
545 static void __seccomp_filter_release(struct seccomp_filter *orig)
546 {
547 	/* Notify about any unused filters in the task's former filter tree. */
548 	__seccomp_filter_orphan(orig);
549 	/* Finally drop all references to the task's former tree. */
550 	__put_seccomp_filter(orig);
551 }
552 
553 /**
554  * seccomp_filter_release - Detach the task from its filter tree,
555  *			    drop its reference count, and notify
556  *			    about unused filters
557  *
 * @tsk: task the filter should be released from.
 *
558  * This function should only be called when the task is exiting, as it
559  * detaches the task from its filter tree. Because of that, the READ_ONCE()
560  * and barriers that would normally be needed are not needed here.
561  */
562 void seccomp_filter_release(struct task_struct *tsk)
563 {
564 	struct seccomp_filter *orig = tsk->seccomp.filter;
565 
566 	/* We are effectively holding the siglock by not having any sighand. */
567 	WARN_ON(tsk->sighand != NULL);
568 
569 	/* Detach task from its filter tree. */
570 	tsk->seccomp.filter = NULL;
571 	__seccomp_filter_release(orig);
572 }
573 
574 /**
575  * seccomp_sync_threads: sets all threads to use current's filter
576  * @flags: flags to change filter behavior
 *
577  * Expects sighand and cred_guard_mutex locks to be held, and for
578  * seccomp_can_sync_threads() to have returned success already
579  * without dropping the locks.
580  *
581  */
582 static inline void seccomp_sync_threads(unsigned long flags)
583 {
584 	struct task_struct *thread, *caller;
585 
586 	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
587 	assert_spin_locked(&current->sighand->siglock);
588 
589 	/* Synchronize all threads. */
590 	caller = current;
591 	for_each_thread(caller, thread) {
592 		/* Skip current, since it needs no changes. */
593 		if (thread == caller)
594 			continue;
595 
596 		/* Get a task reference for the new leaf node. */
597 		get_seccomp_filter(caller);
598 
599 		/*
600 		 * Drop the task reference to the shared ancestor since
601 		 * current's path will hold a reference.  (This also
602 		 * allows a put before the assignment.)
603 		 */
604 		__seccomp_filter_release(thread->seccomp.filter);
605 
606 		/* Make our new filter tree visible. */
607 		smp_store_release(&thread->seccomp.filter,
608 				  caller->seccomp.filter);
609 		atomic_set(&thread->seccomp.filter_count,
610 			   atomic_read(&caller->seccomp.filter_count));
611 
612 		/*
613 		 * Don't let an unprivileged task work around
614 		 * the no_new_privs restriction by creating
615 		 * a thread that sets it up, enters seccomp,
616 		 * then dies.
617 		 */
618 		if (task_no_new_privs(caller))
619 			task_set_no_new_privs(thread);
620 
621 		/*
622 		 * Opt the other thread into seccomp if needed.
623 		 * As threads are considered to be trust-realm
624 		 * equivalent (see ptrace_may_access), it is safe to
625 		 * allow one thread to transition the other.
626 		 */
627 		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
628 			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
629 					    flags);
630 	}
631 }
632 
633 /**
634  * seccomp_prepare_filter: Prepares a seccomp filter for use.
635  * @fprog: BPF program to install
636  *
637  * Returns filter on success or an ERR_PTR on failure.
638  */
639 static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
640 {
641 	struct seccomp_filter *sfilter;
642 	int ret;
643 	const bool save_orig =
644 #if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
645 		true;
646 #else
647 		false;
648 #endif
649 
650 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
651 		return ERR_PTR(-EINVAL);
652 
653 	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
654 
655 	/*
656 	 * Installing a seccomp filter requires that the task has
657 	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
658 	 * This avoids scenarios where unprivileged tasks can affect the
659 	 * behavior of privileged children.
660 	 */
661 	if (!task_no_new_privs(current) &&
662 			!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
663 		return ERR_PTR(-EACCES);
664 
665 	/* Allocate a new seccomp_filter */
666 	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
667 	if (!sfilter)
668 		return ERR_PTR(-ENOMEM);
669 
670 	mutex_init(&sfilter->notify_lock);
671 	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
672 					seccomp_check_filter, save_orig);
673 	if (ret < 0) {
674 		kfree(sfilter);
675 		return ERR_PTR(ret);
676 	}
677 
678 	refcount_set(&sfilter->refs, 1);
679 	refcount_set(&sfilter->users, 1);
680 	init_waitqueue_head(&sfilter->wqh);
681 
682 	return sfilter;
683 }
684 
685 /**
686  * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
687  * @user_filter: pointer to the user data containing a sock_fprog.
688  *
689  * Returns the prepared filter on success or an ERR_PTR on failure.
690  */
691 static struct seccomp_filter *
692 seccomp_prepare_user_filter(const char __user *user_filter)
693 {
694 	struct sock_fprog fprog;
695 	struct seccomp_filter *filter = ERR_PTR(-EFAULT);
696 
697 #ifdef CONFIG_COMPAT
698 	if (in_compat_syscall()) {
699 		struct compat_sock_fprog fprog32;
700 		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
701 			goto out;
702 		fprog.len = fprog32.len;
703 		fprog.filter = compat_ptr(fprog32.filter);
704 	} else /* falls through to the if below. */
705 #endif
706 	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
707 		goto out;
708 	filter = seccomp_prepare_filter(&fprog);
709 out:
710 	return filter;
711 }
712 
713 #ifdef SECCOMP_ARCH_NATIVE
714 /**
715  * seccomp_is_const_allow - check if filter is constant allow with given data
716  * @fprog: The BPF programs
717  * @sd: The seccomp data to check against, only syscall number and arch
718  *      number are considered constant.
719  */
720 static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
721 				   struct seccomp_data *sd)
722 {
723 	unsigned int reg_value = 0;
724 	unsigned int pc;
725 	bool op_res;
726 
727 	if (WARN_ON_ONCE(!fprog))
728 		return false;
729 
730 	for (pc = 0; pc < fprog->len; pc++) {
731 		struct sock_filter *insn = &fprog->filter[pc];
732 		u16 code = insn->code;
733 		u32 k = insn->k;
734 
735 		switch (code) {
736 		case BPF_LD | BPF_W | BPF_ABS:
737 			switch (k) {
738 			case offsetof(struct seccomp_data, nr):
739 				reg_value = sd->nr;
740 				break;
741 			case offsetof(struct seccomp_data, arch):
742 				reg_value = sd->arch;
743 				break;
744 			default:
745 				/* can't optimize (non-constant value load) */
746 				return false;
747 			}
748 			break;
749 		case BPF_RET | BPF_K:
750 			/* reached return with constant values only, check allow */
751 			return k == SECCOMP_RET_ALLOW;
752 		case BPF_JMP | BPF_JA:
753 			pc += insn->k;
754 			break;
755 		case BPF_JMP | BPF_JEQ | BPF_K:
756 		case BPF_JMP | BPF_JGE | BPF_K:
757 		case BPF_JMP | BPF_JGT | BPF_K:
758 		case BPF_JMP | BPF_JSET | BPF_K:
759 			switch (BPF_OP(code)) {
760 			case BPF_JEQ:
761 				op_res = reg_value == k;
762 				break;
763 			case BPF_JGE:
764 				op_res = reg_value >= k;
765 				break;
766 			case BPF_JGT:
767 				op_res = reg_value > k;
768 				break;
769 			case BPF_JSET:
770 				op_res = !!(reg_value & k);
771 				break;
772 			default:
773 				/* can't optimize (unknown jump) */
774 				return false;
775 			}
776 
777 			pc += op_res ? insn->jt : insn->jf;
778 			break;
779 		case BPF_ALU | BPF_AND | BPF_K:
780 			reg_value &= k;
781 			break;
782 		default:
783 			/* can't optimize (unknown insn) */
784 			return false;
785 		}
786 	}
787 
788 	/* ran off the end of the filter?! */
789 	WARN_ON(1);
790 	return false;
791 }
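/*
 * Illustrative only: for the small allowlist sketched after
 * seccomp_check_filter() above, emulating with sd->nr == __NR_getpid ends at
 * the RET ERRNO instruction (not constant-allow), while every other syscall
 * number ends at RET ALLOW, so only those other syscalls may keep their
 * "always allowed" cache bit set.
 */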
792 
793 static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
794 					 void *bitmap, const void *bitmap_prev,
795 					 size_t bitmap_size, int arch)
796 {
797 	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
798 	struct seccomp_data sd;
799 	int nr;
800 
801 	if (bitmap_prev) {
802 		/* The new filter must be as restrictive as the last. */
803 		bitmap_copy(bitmap, bitmap_prev, bitmap_size);
804 	} else {
805 		/* Before any filters, all syscalls are always allowed. */
806 		bitmap_fill(bitmap, bitmap_size);
807 	}
808 
809 	for (nr = 0; nr < bitmap_size; nr++) {
810 		/* No bitmap change: not a cacheable action. */
811 		if (!test_bit(nr, bitmap))
812 			continue;
813 
814 		sd.nr = nr;
815 		sd.arch = arch;
816 
817 		/* No bitmap change: continue to always allow. */
818 		if (seccomp_is_const_allow(fprog, &sd))
819 			continue;
820 
821 		/*
822 		 * Not a cacheable action: always run filters.
823 		 * atomic clear_bit() not needed, filter not visible yet.
824 		 */
825 		__clear_bit(nr, bitmap);
826 	}
827 }
828 
829 /**
830  * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
831  * @sfilter: The seccomp filter
832  *
833  * The filter's action cache is populated in place; the function cannot fail.
834  */
835 static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
836 {
837 	struct action_cache *cache = &sfilter->cache;
838 	const struct action_cache *cache_prev =
839 		sfilter->prev ? &sfilter->prev->cache : NULL;
840 
841 	seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
842 				     cache_prev ? cache_prev->allow_native : NULL,
843 				     SECCOMP_ARCH_NATIVE_NR,
844 				     SECCOMP_ARCH_NATIVE);
845 
846 #ifdef SECCOMP_ARCH_COMPAT
847 	seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
848 				     cache_prev ? cache_prev->allow_compat : NULL,
849 				     SECCOMP_ARCH_COMPAT_NR,
850 				     SECCOMP_ARCH_COMPAT);
851 #endif /* SECCOMP_ARCH_COMPAT */
852 }
853 #endif /* SECCOMP_ARCH_NATIVE */
854 
855 /**
856  * seccomp_attach_filter: validate and attach filter
857  * @flags:  flags to change filter behavior
858  * @filter: seccomp filter to add to the current process
859  *
860  * Caller must be holding current->sighand->siglock lock.
861  *
862  * Returns 0 on success, -ve on error, or
863  *   - in TSYNC mode: the pid of a thread which was either not in the correct
864  *     seccomp mode or did not have an ancestral seccomp filter
865  *   - in NEW_LISTENER mode: the fd of the new listener
866  */
867 static long seccomp_attach_filter(unsigned int flags,
868 				  struct seccomp_filter *filter)
869 {
870 	unsigned long total_insns;
871 	struct seccomp_filter *walker;
872 
873 	assert_spin_locked(&current->sighand->siglock);
874 
875 	/* Validate resulting filter length. */
876 	total_insns = filter->prog->len;
877 	for (walker = current->seccomp.filter; walker; walker = walker->prev)
878 		total_insns += walker->prog->len + 4;  /* 4 instr penalty */
879 	if (total_insns > MAX_INSNS_PER_PATH)
880 		return -ENOMEM;
881 
882 	/* If thread sync has been requested, check that it is possible. */
883 	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
884 		int ret;
885 
886 		ret = seccomp_can_sync_threads();
887 		if (ret) {
888 			if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
889 				return -ESRCH;
890 			else
891 				return ret;
892 		}
893 	}
894 
895 	/* Set log flag, if present. */
896 	if (flags & SECCOMP_FILTER_FLAG_LOG)
897 		filter->log = true;
898 
899 	/* Set wait killable flag, if present. */
900 	if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
901 		filter->wait_killable_recv = true;
902 
903 	/*
904 	 * If there is an existing filter, make it the prev and don't drop its
905 	 * task reference.
906 	 */
907 	filter->prev = current->seccomp.filter;
908 	seccomp_cache_prepare(filter);
909 	current->seccomp.filter = filter;
910 	atomic_inc(&current->seccomp.filter_count);
911 
912 	/* Now that the new filter is in place, synchronize to all threads. */
913 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
914 		seccomp_sync_threads(flags);
915 
916 	return 0;
917 }
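/*
 * Illustrative only (userspace sketch): the flags handled above normally
 * arrive via the seccomp(2) syscall, e.g.
 *
 *	fd = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
 *		     SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
 *
 * where a non-negative return is the notification listener fd, while
 * SECCOMP_FILTER_FLAG_TSYNC instead applies the filter to every thread in
 * the caller's thread group.
 */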
918 
919 static void __get_seccomp_filter(struct seccomp_filter *filter)
920 {
921 	refcount_inc(&filter->refs);
922 }
923 
924 /* get_seccomp_filter - increments the reference count of the filter on @tsk */
925 void get_seccomp_filter(struct task_struct *tsk)
926 {
927 	struct seccomp_filter *orig = tsk->seccomp.filter;
928 	if (!orig)
929 		return;
930 	__get_seccomp_filter(orig);
931 	refcount_inc(&orig->users);
932 }
933 
934 #endif	/* CONFIG_SECCOMP_FILTER */
935 
936 /* For use with seccomp_actions_logged */
937 #define SECCOMP_LOG_KILL_PROCESS	(1 << 0)
938 #define SECCOMP_LOG_KILL_THREAD		(1 << 1)
939 #define SECCOMP_LOG_TRAP		(1 << 2)
940 #define SECCOMP_LOG_ERRNO		(1 << 3)
941 #define SECCOMP_LOG_TRACE		(1 << 4)
942 #define SECCOMP_LOG_LOG			(1 << 5)
943 #define SECCOMP_LOG_ALLOW		(1 << 6)
944 #define SECCOMP_LOG_USER_NOTIF		(1 << 7)
945 
946 static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
947 				    SECCOMP_LOG_KILL_THREAD  |
948 				    SECCOMP_LOG_TRAP  |
949 				    SECCOMP_LOG_ERRNO |
950 				    SECCOMP_LOG_USER_NOTIF |
951 				    SECCOMP_LOG_TRACE |
952 				    SECCOMP_LOG_LOG;
953 
954 static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
955 			       bool requested)
956 {
957 	bool log = false;
958 
959 	switch (action) {
960 	case SECCOMP_RET_ALLOW:
961 		break;
962 	case SECCOMP_RET_TRAP:
963 		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
964 		break;
965 	case SECCOMP_RET_ERRNO:
966 		log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
967 		break;
968 	case SECCOMP_RET_TRACE:
969 		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
970 		break;
971 	case SECCOMP_RET_USER_NOTIF:
972 		log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
973 		break;
974 	case SECCOMP_RET_LOG:
975 		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
976 		break;
977 	case SECCOMP_RET_KILL_THREAD:
978 		log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
979 		break;
980 	case SECCOMP_RET_KILL_PROCESS:
981 	default:
982 		log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
983 	}
984 
985 	/*
986 	 * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
987 	 * FILTER_FLAG_LOG bit was set. The admin has the ability to silence
988 	 * any action from being logged by removing the action name from the
989 	 * seccomp_actions_logged sysctl.
990 	 */
991 	if (!log)
992 		return;
993 
994 	audit_seccomp(syscall, signr, action);
995 }
996 
997 /*
998  * Secure computing mode 1 allows only read/write/exit/sigreturn.
999  * To be fully secure this must be combined with rlimit
1000  * to limit the stack allocations too.
1001  */
1002 static const int mode1_syscalls[] = {
1003 	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
1004 	-1, /* negative terminated */
1005 };
1006 
1007 static void __secure_computing_strict(int this_syscall)
1008 {
1009 	const int *allowed_syscalls = mode1_syscalls;
1010 #ifdef CONFIG_COMPAT
1011 	if (in_compat_syscall())
1012 		allowed_syscalls = get_compat_mode1_syscalls();
1013 #endif
1014 	do {
1015 		if (*allowed_syscalls == this_syscall)
1016 			return;
1017 	} while (*++allowed_syscalls != -1);
1018 
1019 #ifdef SECCOMP_DEBUG
1020 	dump_stack();
1021 #endif
1022 	current->seccomp.mode = SECCOMP_MODE_DEAD;
1023 	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
1024 	do_exit(SIGKILL);
1025 }
1026 
1027 #ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
1028 void secure_computing_strict(int this_syscall)
1029 {
1030 	int mode = current->seccomp.mode;
1031 
1032 	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
1033 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
1034 		return;
1035 
1036 	if (mode == SECCOMP_MODE_DISABLED)
1037 		return;
1038 	else if (mode == SECCOMP_MODE_STRICT)
1039 		__secure_computing_strict(this_syscall);
1040 	else
1041 		BUG();
1042 }
1043 #else
1044 
1045 #ifdef CONFIG_SECCOMP_FILTER
1046 static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
1047 {
1048 	/*
1049 	 * Note: overflow is ok here, the id just needs to be unique per
1050 	 * filter.
1051 	 */
1052 	lockdep_assert_held(&filter->notify_lock);
1053 	return filter->notif->next_id++;
1054 }
1055 
1056 static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n)
1057 {
1058 	int fd;
1059 
1060 	/*
1061 	 * Remove the notification, and reset the list pointers, indicating
1062 	 * that it has been handled.
1063 	 */
1064 	list_del_init(&addfd->list);
1065 	if (!addfd->setfd)
1066 		fd = receive_fd(addfd->file, addfd->flags);
1067 	else
1068 		fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
1069 	addfd->ret = fd;
1070 
1071 	if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) {
1072 		/* If we fail, reset and return an error to the notifier */
1073 		if (fd < 0) {
1074 			n->state = SECCOMP_NOTIFY_SENT;
1075 		} else {
1076 			/* Return the FD we just added */
1077 			n->flags = 0;
1078 			n->error = 0;
1079 			n->val = fd;
1080 		}
1081 	}
1082 
1083 	/*
1084 	 * Mark the notification as completed. From this point, addfd mem
1085 	 * might be invalidated and we can't safely read it anymore.
1086 	 */
1087 	complete(&addfd->completion);
1088 }
1089 
1090 static bool should_sleep_killable(struct seccomp_filter *match,
1091 				  struct seccomp_knotif *n)
1092 {
1093 	return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
1094 }
1095 
1096 static int seccomp_do_user_notification(int this_syscall,
1097 					struct seccomp_filter *match,
1098 					const struct seccomp_data *sd)
1099 {
1100 	int err;
1101 	u32 flags = 0;
1102 	long ret = 0;
1103 	struct seccomp_knotif n = {};
1104 	struct seccomp_kaddfd *addfd, *tmp;
1105 
1106 	mutex_lock(&match->notify_lock);
1107 	err = -ENOSYS;
1108 	if (!match->notif)
1109 		goto out;
1110 
1111 	n.task = current;
1112 	n.state = SECCOMP_NOTIFY_INIT;
1113 	n.data = sd;
1114 	n.id = seccomp_next_notify_id(match);
1115 	init_completion(&n.ready);
1116 	list_add_tail(&n.list, &match->notif->notifications);
1117 	INIT_LIST_HEAD(&n.addfd);
1118 
1119 	up(&match->notif->request);
1120 	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
1121 
1122 	/*
1123 	 * This is where we wait for a reply from userspace.
1124 	 */
1125 	do {
1126 		bool wait_killable = should_sleep_killable(match, &n);
1127 
1128 		mutex_unlock(&match->notify_lock);
1129 		if (wait_killable)
1130 			err = wait_for_completion_killable(&n.ready);
1131 		else
1132 			err = wait_for_completion_interruptible(&n.ready);
1133 		mutex_lock(&match->notify_lock);
1134 
1135 		if (err != 0) {
1136 			/*
1137 			 * Check to see if the notification got picked up and
1138 			 * whether we should switch to wait killable.
1139 			 */
1140 			if (!wait_killable && should_sleep_killable(match, &n))
1141 				continue;
1142 
1143 			goto interrupted;
1144 		}
1145 
1146 		addfd = list_first_entry_or_null(&n.addfd,
1147 						 struct seccomp_kaddfd, list);
1148 		/* Check if we were woken up by an addfd message */
1149 		if (addfd)
1150 			seccomp_handle_addfd(addfd, &n);
1151 
1152 	}  while (n.state != SECCOMP_NOTIFY_REPLIED);
1153 
1154 	ret = n.val;
1155 	err = n.error;
1156 	flags = n.flags;
1157 
1158 interrupted:
1159 	/* If there were any pending addfd calls, clear them out */
1160 	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
1161 		/* The process went away before we got a chance to handle it */
1162 		addfd->ret = -ESRCH;
1163 		list_del_init(&addfd->list);
1164 		complete(&addfd->completion);
1165 	}
1166 
1167 	/*
1168 	 * Note that it's possible the listener died in between the time when
1169 	 * we were notified of a response (or a signal) and when we were able to
1170 	 * re-acquire the lock, so only delete from the list if the
1171 	 * notification actually exists.
1172 	 *
1173 	 * Also note that this test is only valid because there's no way to
1174 	 * *reattach* to a notifier right now. If one is added, we'll need to
1175 	 * keep track of the notif itself and make sure they match here.
1176 	 */
1177 	if (match->notif)
1178 		list_del(&n.list);
1179 out:
1180 	mutex_unlock(&match->notify_lock);
1181 
1182 	/* Userspace requests to continue the syscall. */
1183 	if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
1184 		return 0;
1185 
1186 	syscall_set_return_value(current, current_pt_regs(),
1187 				 err, ret);
1188 	return -1;
1189 }
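/*
 * Illustrative only: a minimal userspace counterpart to the wait above,
 * using the listener fd obtained with SECCOMP_FILTER_FLAG_NEW_LISTENER.
 * Error handling and SECCOMP_IOCTL_NOTIF_ID_VALID re-checks are omitted;
 * "listener_fd" is a placeholder for the handler's own state.
 *
 *	struct seccomp_notif req = { 0 };
 *	struct seccomp_notif_resp resp = { 0 };
 *
 *	ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_RECV, &req);
 *	resp.id = req.id;
 *	resp.error = 0;
 *	resp.val = 0;
 *	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
 *	ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_SEND, &resp);
 */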
1190 
1191 static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
1192 			    const bool recheck_after_trace)
1193 {
1194 	u32 filter_ret, action;
1195 	struct seccomp_filter *match = NULL;
1196 	int data;
1197 	struct seccomp_data sd_local;
1198 
1199 	/*
1200 	 * Make sure that any changes to mode from another thread have
1201 	 * been seen after SYSCALL_WORK_SECCOMP was seen.
1202 	 */
1203 	smp_rmb();
1204 
1205 	if (!sd) {
1206 		populate_seccomp_data(&sd_local);
1207 		sd = &sd_local;
1208 	}
1209 
1210 	filter_ret = seccomp_run_filters(sd, &match);
1211 	data = filter_ret & SECCOMP_RET_DATA;
1212 	action = filter_ret & SECCOMP_RET_ACTION_FULL;
1213 
1214 	switch (action) {
1215 	case SECCOMP_RET_ERRNO:
1216 		/* Set low-order bits as an errno, capped at MAX_ERRNO. */
1217 		if (data > MAX_ERRNO)
1218 			data = MAX_ERRNO;
1219 		syscall_set_return_value(current, current_pt_regs(),
1220 					 -data, 0);
1221 		goto skip;
1222 
1223 	case SECCOMP_RET_TRAP:
1224 		/* Show the handler the original registers. */
1225 		syscall_rollback(current, current_pt_regs());
1226 		/* Let the filter pass back 16 bits of data. */
1227 		force_sig_seccomp(this_syscall, data, false);
1228 		goto skip;
1229 
1230 	case SECCOMP_RET_TRACE:
1231 		/* We've been put in this state by the ptracer already. */
1232 		if (recheck_after_trace)
1233 			return 0;
1234 
1235 		/* ENOSYS these calls if there is no tracer attached. */
1236 		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
1237 			syscall_set_return_value(current,
1238 						 current_pt_regs(),
1239 						 -ENOSYS, 0);
1240 			goto skip;
1241 		}
1242 
1243 		/* Allow the BPF to provide the event message */
1244 		ptrace_event(PTRACE_EVENT_SECCOMP, data);
1245 		/*
1246 		 * The delivery of a fatal signal during event
1247 		 * notification may silently skip tracer notification,
1248 		 * which could leave us with a potentially unmodified
1249 		 * syscall that the tracer would have liked to have
1250 		 * changed. Since the process is about to die, we just
1251 		 * force the syscall to be skipped and let the signal
1252 		 * kill the process and correctly handle any tracer exit
1253 		 * notifications.
1254 		 */
1255 		if (fatal_signal_pending(current))
1256 			goto skip;
1257 		/* Check if the tracer forced the syscall to be skipped. */
1258 		this_syscall = syscall_get_nr(current, current_pt_regs());
1259 		if (this_syscall < 0)
1260 			goto skip;
1261 
1262 		/*
1263 		 * Recheck the syscall, since it may have changed. This
1264 		 * intentionally uses a NULL struct seccomp_data to force
1265 		 * a reload of all registers. This does not goto skip since
1266 		 * a skip would have already been reported.
1267 		 */
1268 		if (__seccomp_filter(this_syscall, NULL, true))
1269 			return -1;
1270 
1271 		return 0;
1272 
1273 	case SECCOMP_RET_USER_NOTIF:
1274 		if (seccomp_do_user_notification(this_syscall, match, sd))
1275 			goto skip;
1276 
1277 		return 0;
1278 
1279 	case SECCOMP_RET_LOG:
1280 		seccomp_log(this_syscall, 0, action, true);
1281 		return 0;
1282 
1283 	case SECCOMP_RET_ALLOW:
1284 		/*
1285 		 * Note that the "match" filter will always be NULL for
1286 		 * this action since SECCOMP_RET_ALLOW is the starting
1287 		 * state in seccomp_run_filters().
1288 		 */
1289 		return 0;
1290 
1291 	case SECCOMP_RET_KILL_THREAD:
1292 	case SECCOMP_RET_KILL_PROCESS:
1293 	default:
1294 		current->seccomp.mode = SECCOMP_MODE_DEAD;
1295 		seccomp_log(this_syscall, SIGSYS, action, true);
1296 		/* Dump core only if this is the last remaining thread. */
1297 		if (action != SECCOMP_RET_KILL_THREAD ||
1298 		    (atomic_read(&current->signal->live) == 1)) {
1299 			/* Show the original registers in the dump. */
1300 			syscall_rollback(current, current_pt_regs());
1301 			/* Trigger a coredump with SIGSYS */
1302 			force_sig_seccomp(this_syscall, data, true);
1303 		} else {
1304 			do_exit(SIGSYS);
1305 		}
1306 		return -1; /* skip the syscall; go directly to signal handling */
1307 	}
1308 
1309 	unreachable();
1310 
1311 skip:
1312 	seccomp_log(this_syscall, 0, action, match ? match->log : false);
1313 	return -1;
1314 }
1315 #else
1316 static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
1317 			    const bool recheck_after_trace)
1318 {
1319 	BUG();
1320 
1321 	return -1;
1322 }
1323 #endif
1324 
1325 int __secure_computing(const struct seccomp_data *sd)
1326 {
1327 	int mode = current->seccomp.mode;
1328 	int this_syscall;
1329 
1330 	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
1331 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
1332 		return 0;
1333 
1334 	this_syscall = sd ? sd->nr :
1335 		syscall_get_nr(current, current_pt_regs());
1336 
1337 	switch (mode) {
1338 	case SECCOMP_MODE_STRICT:
1339 		__secure_computing_strict(this_syscall);  /* may call do_exit */
1340 		return 0;
1341 	case SECCOMP_MODE_FILTER:
1342 		return __seccomp_filter(this_syscall, sd, false);
1343 	/* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
1344 	case SECCOMP_MODE_DEAD:
1345 		WARN_ON_ONCE(1);
1346 		do_exit(SIGKILL);
1347 		return -1;
1348 	default:
1349 		BUG();
1350 	}
1351 }
1352 #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
1353 
1354 long prctl_get_seccomp(void)
1355 {
1356 	return current->seccomp.mode;
1357 }
1358 
1359 /**
1360  * seccomp_set_mode_strict: internal function for setting strict seccomp
1361  *
1362  * Once current->seccomp.mode is non-zero, it may not be changed.
1363  *
1364  * Returns 0 on success or -EINVAL on failure.
1365  */
1366 static long seccomp_set_mode_strict(void)
1367 {
1368 	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
1369 	long ret = -EINVAL;
1370 
1371 	spin_lock_irq(&current->sighand->siglock);
1372 
1373 	if (!seccomp_may_assign_mode(seccomp_mode))
1374 		goto out;
1375 
1376 #ifdef TIF_NOTSC
1377 	disable_TSC();
1378 #endif
1379 	seccomp_assign_mode(current, seccomp_mode, 0);
1380 	ret = 0;
1381 
1382 out:
1383 	spin_unlock_irq(&current->sighand->siglock);
1384 
1385 	return ret;
1386 }
1387 
1388 #ifdef CONFIG_SECCOMP_FILTER
1389 static void seccomp_notify_free(struct seccomp_filter *filter)
1390 {
1391 	kfree(filter->notif);
1392 	filter->notif = NULL;
1393 }
1394 
1395 static void seccomp_notify_detach(struct seccomp_filter *filter)
1396 {
1397 	struct seccomp_knotif *knotif;
1398 
1399 	if (!filter)
1400 		return;
1401 
1402 	mutex_lock(&filter->notify_lock);
1403 
1404 	/*
1405 	 * If this file is being closed because e.g. the task that owned it
1406 	 * died, let's wake everyone up who was waiting on us.
1407 	 */
1408 	list_for_each_entry(knotif, &filter->notif->notifications, list) {
1409 		if (knotif->state == SECCOMP_NOTIFY_REPLIED)
1410 			continue;
1411 
1412 		knotif->state = SECCOMP_NOTIFY_REPLIED;
1413 		knotif->error = -ENOSYS;
1414 		knotif->val = 0;
1415 
1416 		/*
1417 		 * We do not need to wake up any pending addfd messages, as
1418 		 * the notifier will do that for us, since this just looks
1419 		 * like a standard reply.
1420 		 */
1421 		complete(&knotif->ready);
1422 	}
1423 
1424 	seccomp_notify_free(filter);
1425 	mutex_unlock(&filter->notify_lock);
1426 }
1427 
1428 static int seccomp_notify_release(struct inode *inode, struct file *file)
1429 {
1430 	struct seccomp_filter *filter = file->private_data;
1431 
1432 	seccomp_notify_detach(filter);
1433 	__put_seccomp_filter(filter);
1434 	return 0;
1435 }
1436 
1437 /* must be called with notify_lock held */
1438 static inline struct seccomp_knotif *
1439 find_notification(struct seccomp_filter *filter, u64 id)
1440 {
1441 	struct seccomp_knotif *cur;
1442 
1443 	lockdep_assert_held(&filter->notify_lock);
1444 
1445 	list_for_each_entry(cur, &filter->notif->notifications, list) {
1446 		if (cur->id == id)
1447 			return cur;
1448 	}
1449 
1450 	return NULL;
1451 }
1452 
1453 
1454 static long seccomp_notify_recv(struct seccomp_filter *filter,
1455 				void __user *buf)
1456 {
1457 	struct seccomp_knotif *knotif = NULL, *cur;
1458 	struct seccomp_notif unotif;
1459 	ssize_t ret;
1460 
1461 	/* Verify that we're not given garbage to keep struct extensible. */
1462 	ret = check_zeroed_user(buf, sizeof(unotif));
1463 	if (ret < 0)
1464 		return ret;
1465 	if (!ret)
1466 		return -EINVAL;
1467 
1468 	memset(&unotif, 0, sizeof(unotif));
1469 
1470 	ret = down_interruptible(&filter->notif->request);
1471 	if (ret < 0)
1472 		return ret;
1473 
1474 	mutex_lock(&filter->notify_lock);
1475 	list_for_each_entry(cur, &filter->notif->notifications, list) {
1476 		if (cur->state == SECCOMP_NOTIFY_INIT) {
1477 			knotif = cur;
1478 			break;
1479 		}
1480 	}
1481 
1482 	/*
1483 	 * If we didn't find a notification, it could be that the task was
1484 	 * interrupted by a fatal signal between the time we were woken and
1485 	 * when we were able to acquire the notify_lock.
1486 	 */
1487 	if (!knotif) {
1488 		ret = -ENOENT;
1489 		goto out;
1490 	}
1491 
1492 	unotif.id = knotif->id;
1493 	unotif.pid = task_pid_vnr(knotif->task);
1494 	unotif.data = *(knotif->data);
1495 
1496 	knotif->state = SECCOMP_NOTIFY_SENT;
1497 	wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
1498 	ret = 0;
1499 out:
1500 	mutex_unlock(&filter->notify_lock);
1501 
1502 	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
1503 		ret = -EFAULT;
1504 
1505 		/*
1506 		 * Userspace screwed up. To make sure that we keep this
1507 		 * notification alive, let's reset it back to INIT. It
1508 		 * may have died when we released the lock, so we need to make
1509 		 * sure it's still around.
1510 		 */
1511 		mutex_lock(&filter->notify_lock);
1512 		knotif = find_notification(filter, unotif.id);
1513 		if (knotif) {
1514 			/* Reset the process to make sure it's not stuck */
1515 			if (should_sleep_killable(filter, knotif))
1516 				complete(&knotif->ready);
1517 			knotif->state = SECCOMP_NOTIFY_INIT;
1518 			up(&filter->notif->request);
1519 		}
1520 		mutex_unlock(&filter->notify_lock);
1521 	}
1522 
1523 	return ret;
1524 }
1525 
1526 static long seccomp_notify_send(struct seccomp_filter *filter,
1527 				void __user *buf)
1528 {
1529 	struct seccomp_notif_resp resp = {};
1530 	struct seccomp_knotif *knotif;
1531 	long ret;
1532 
1533 	if (copy_from_user(&resp, buf, sizeof(resp)))
1534 		return -EFAULT;
1535 
1536 	if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
1537 		return -EINVAL;
1538 
1539 	if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
1540 	    (resp.error || resp.val))
1541 		return -EINVAL;
1542 
1543 	ret = mutex_lock_interruptible(&filter->notify_lock);
1544 	if (ret < 0)
1545 		return ret;
1546 
1547 	knotif = find_notification(filter, resp.id);
1548 	if (!knotif) {
1549 		ret = -ENOENT;
1550 		goto out;
1551 	}
1552 
1553 	/* Allow exactly one reply. */
1554 	if (knotif->state != SECCOMP_NOTIFY_SENT) {
1555 		ret = -EINPROGRESS;
1556 		goto out;
1557 	}
1558 
1559 	ret = 0;
1560 	knotif->state = SECCOMP_NOTIFY_REPLIED;
1561 	knotif->error = resp.error;
1562 	knotif->val = resp.val;
1563 	knotif->flags = resp.flags;
1564 	complete(&knotif->ready);
1565 out:
1566 	mutex_unlock(&filter->notify_lock);
1567 	return ret;
1568 }
1569 
1570 static long seccomp_notify_id_valid(struct seccomp_filter *filter,
1571 				    void __user *buf)
1572 {
1573 	struct seccomp_knotif *knotif;
1574 	u64 id;
1575 	long ret;
1576 
1577 	if (copy_from_user(&id, buf, sizeof(id)))
1578 		return -EFAULT;
1579 
1580 	ret = mutex_lock_interruptible(&filter->notify_lock);
1581 	if (ret < 0)
1582 		return ret;
1583 
1584 	knotif = find_notification(filter, id);
1585 	if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
1586 		ret = 0;
1587 	else
1588 		ret = -ENOENT;
1589 
1590 	mutex_unlock(&filter->notify_lock);
1591 	return ret;
1592 }
1593 
1594 static long seccomp_notify_addfd(struct seccomp_filter *filter,
1595 				 struct seccomp_notif_addfd __user *uaddfd,
1596 				 unsigned int size)
1597 {
1598 	struct seccomp_notif_addfd addfd;
1599 	struct seccomp_knotif *knotif;
1600 	struct seccomp_kaddfd kaddfd;
1601 	int ret;
1602 
1603 	BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
1604 	BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
1605 
1606 	if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
1607 		return -EINVAL;
1608 
1609 	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
1610 	if (ret)
1611 		return ret;
1612 
1613 	if (addfd.newfd_flags & ~O_CLOEXEC)
1614 		return -EINVAL;
1615 
1616 	if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND))
1617 		return -EINVAL;
1618 
1619 	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
1620 		return -EINVAL;
1621 
1622 	kaddfd.file = fget(addfd.srcfd);
1623 	if (!kaddfd.file)
1624 		return -EBADF;
1625 
1626 	kaddfd.ioctl_flags = addfd.flags;
1627 	kaddfd.flags = addfd.newfd_flags;
1628 	kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD;
1629 	kaddfd.fd = addfd.newfd;
1630 	init_completion(&kaddfd.completion);
1631 
1632 	ret = mutex_lock_interruptible(&filter->notify_lock);
1633 	if (ret < 0)
1634 		goto out;
1635 
1636 	knotif = find_notification(filter, addfd.id);
1637 	if (!knotif) {
1638 		ret = -ENOENT;
1639 		goto out_unlock;
1640 	}
1641 
1642 	/*
1643 	 * We do not want to allow for FD injection to occur before the
1644 	 * notification has been picked up by a userspace handler, or after
1645 	 * the notification has been replied to.
1646 	 */
1647 	if (knotif->state != SECCOMP_NOTIFY_SENT) {
1648 		ret = -EINPROGRESS;
1649 		goto out_unlock;
1650 	}
1651 
1652 	if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) {
1653 		/*
1654 		 * Disallow queuing an atomic addfd + send reply while there are
1655 		 * some addfd requests still to process.
1656 		 *
1657 		 * There is no clear reason to support it, and disallowing it
1658 		 * keeps the loop on the other side straightforward.
1659 		 */
1660 		if (!list_empty(&knotif->addfd)) {
1661 			ret = -EBUSY;
1662 			goto out_unlock;
1663 		}
1664 
1665 		/* Allow exactly one reply */
1666 		knotif->state = SECCOMP_NOTIFY_REPLIED;
1667 	}
1668 
1669 	list_add(&kaddfd.list, &knotif->addfd);
1670 	complete(&knotif->ready);
1671 	mutex_unlock(&filter->notify_lock);
1672 
1673 	/* Now we wait for it to be processed or be interrupted */
1674 	ret = wait_for_completion_interruptible(&kaddfd.completion);
1675 	if (ret == 0) {
1676 		/*
1677 		 * We had a successful completion. The other side has already
1678 		 * removed us from the addfd queue, and
1679 		 * wait_for_completion_interruptible has a memory barrier upon
1680 		 * success that lets us read this value directly without
1681 		 * locking.
1682 		 */
1683 		ret = kaddfd.ret;
1684 		goto out;
1685 	}
1686 
1687 	mutex_lock(&filter->notify_lock);
1688 	/*
1689 	 * Even though we were woken up by a signal and not a successful
1690 	 * completion, a completion may have happened in the meantime.
1691 	 *
1692 	 * We need to check again if the addfd request has been handled,
1693 	 * and if not, we will remove it from the queue.
1694 	 */
1695 	if (list_empty(&kaddfd.list))
1696 		ret = kaddfd.ret;
1697 	else
1698 		list_del(&kaddfd.list);
1699 
1700 out_unlock:
1701 	mutex_unlock(&filter->notify_lock);
1702 out:
1703 	fput(kaddfd.file);
1704 
1705 	return ret;
1706 }
1707 
1708 static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
1709 				 unsigned long arg)
1710 {
1711 	struct seccomp_filter *filter = file->private_data;
1712 	void __user *buf = (void __user *)arg;
1713 
1714 	/* Fixed-size ioctls */
1715 	switch (cmd) {
1716 	case SECCOMP_IOCTL_NOTIF_RECV:
1717 		return seccomp_notify_recv(filter, buf);
1718 	case SECCOMP_IOCTL_NOTIF_SEND:
1719 		return seccomp_notify_send(filter, buf);
1720 	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
1721 	case SECCOMP_IOCTL_NOTIF_ID_VALID:
1722 		return seccomp_notify_id_valid(filter, buf);
1723 	}
1724 
1725 	/* Extensible Argument ioctls */
1726 #define EA_IOCTL(cmd)	((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
1727 	switch (EA_IOCTL(cmd)) {
1728 	case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
1729 		return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
1730 	default:
1731 		return -EINVAL;
1732 	}
1733 }
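/*
 * Illustrative only: the extensible-argument path above is how a handler
 * injects a file descriptor into the notifying task (sketch; "listener_fd",
 * "local_fd" and "req" are the handler's own state):
 *
 *	struct seccomp_notif_addfd addfd = {
 *		.id = req.id,
 *		.srcfd = local_fd,
 *		.newfd = 0,
 *		.newfd_flags = O_CLOEXEC,
 *		.flags = 0,
 *	};
 *	int remote_fd = ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
 *
 * On success the return value is the fd number allocated in the target task.
 */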
1734 
1735 static __poll_t seccomp_notify_poll(struct file *file,
1736 				    struct poll_table_struct *poll_tab)
1737 {
1738 	struct seccomp_filter *filter = file->private_data;
1739 	__poll_t ret = 0;
1740 	struct seccomp_knotif *cur;
1741 
1742 	poll_wait(file, &filter->wqh, poll_tab);
1743 
1744 	if (mutex_lock_interruptible(&filter->notify_lock) < 0)
1745 		return EPOLLERR;
1746 
1747 	list_for_each_entry(cur, &filter->notif->notifications, list) {
1748 		if (cur->state == SECCOMP_NOTIFY_INIT)
1749 			ret |= EPOLLIN | EPOLLRDNORM;
1750 		if (cur->state == SECCOMP_NOTIFY_SENT)
1751 			ret |= EPOLLOUT | EPOLLWRNORM;
1752 		if ((ret & EPOLLIN) && (ret & EPOLLOUT))
1753 			break;
1754 	}
1755 
1756 	mutex_unlock(&filter->notify_lock);
1757 
1758 	if (refcount_read(&filter->users) == 0)
1759 		ret |= EPOLLHUP;
1760 
1761 	return ret;
1762 }
1763 
1764 static const struct file_operations seccomp_notify_ops = {
1765 	.poll = seccomp_notify_poll,
1766 	.release = seccomp_notify_release,
1767 	.unlocked_ioctl = seccomp_notify_ioctl,
1768 	.compat_ioctl = seccomp_notify_ioctl,
1769 };
1770 
1771 static struct file *init_listener(struct seccomp_filter *filter)
1772 {
1773 	struct file *ret;
1774 
1775 	ret = ERR_PTR(-ENOMEM);
1776 	filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
1777 	if (!filter->notif)
1778 		goto out;
1779 
1780 	sema_init(&filter->notif->request, 0);
1781 	filter->notif->next_id = get_random_u64();
1782 	INIT_LIST_HEAD(&filter->notif->notifications);
1783 
1784 	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
1785 				 filter, O_RDWR);
1786 	if (IS_ERR(ret))
1787 		goto out_notif;
1788 
1789 	/* The file has a reference to it now */
1790 	__get_seccomp_filter(filter);
1791 
1792 out_notif:
1793 	if (IS_ERR(ret))
1794 		seccomp_notify_free(filter);
1795 out:
1796 	return ret;
1797 }
1798 
1799 /*
1800  * Does @new_child have a listener while an ancestor also has a listener?
1801  * If so, we'll want to reject this filter.
1802  * This only has to be tested for the current process, even in the TSYNC case,
1803  * because TSYNC installs @new_child with the same parent on all threads.
1804  * Note that @new_child is not hooked up to its parent at this point yet, so
1805  * we use current->seccomp.filter.
1806  */
1807 static bool has_duplicate_listener(struct seccomp_filter *new_child)
1808 {
1809 	struct seccomp_filter *cur;
1810 
1811 	/* must be protected against concurrent TSYNC */
1812 	lockdep_assert_held(&current->sighand->siglock);
1813 
1814 	if (!new_child->notif)
1815 		return false;
1816 	for (cur = current->seccomp.filter; cur; cur = cur->prev) {
1817 		if (cur->notif)
1818 			return true;
1819 	}
1820 
1821 	return false;
1822 }
1823 
1824 /**
1825  * seccomp_set_mode_filter: internal function for setting seccomp filter
1826  * @flags:  flags to change filter behavior
1827  * @filter: user pointer to the struct sock_fprog containing the filter
1828  *
1829  * This function may be called repeatedly to install additional filters.
1830  * Every filter successfully installed will be evaluated (in reverse order)
1831  * for each system call the task makes.
1832  *
1833  * Once current->seccomp.mode is non-zero, it may not be changed to another mode.
1834  *
1835  * Returns 0 on success, the listener fd when SECCOMP_FILTER_FLAG_NEW_LISTENER is set, or a negative errno on failure.
1836  */
1837 static long seccomp_set_mode_filter(unsigned int flags,
1838 				    const char __user *filter)
1839 {
1840 	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
1841 	struct seccomp_filter *prepared = NULL;
1842 	long ret = -EINVAL;
1843 	int listener = -1;
1844 	struct file *listener_f = NULL;
1845 
1846 	/* Validate flags. */
1847 	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
1848 		return -EINVAL;
1849 
1850 	/*
1851 	 * In the successful case, NEW_LISTENER returns the new listener fd.
1852 	 * But in the failure case, TSYNC returns the pid of the thread that could
1853 	 * not be synchronized. If you combine these two flags, there's no way to
1854 	 * tell whether something succeeded or failed. So, let's disallow this
1855 	 * combination if the user has not explicitly requested no errors from TSYNC.
1856 	 */
1857 	if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
1858 	    (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) &&
1859 	    ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
1860 		return -EINVAL;
1861 
1862 	/*
1863 	 * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV flag doesn't make sense
1864 	 * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
1865 	 */
1866 	if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
1867 	    ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
1868 		return -EINVAL;
1869 
1870 	/* Prepare the new filter before holding any locks. */
1871 	prepared = seccomp_prepare_user_filter(filter);
1872 	if (IS_ERR(prepared))
1873 		return PTR_ERR(prepared);
1874 
1875 	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
1876 		listener = get_unused_fd_flags(O_CLOEXEC);
1877 		if (listener < 0) {
1878 			ret = listener;
1879 			goto out_free;
1880 		}
1881 
1882 		listener_f = init_listener(prepared);
1883 		if (IS_ERR(listener_f)) {
1884 			put_unused_fd(listener);
1885 			ret = PTR_ERR(listener_f);
1886 			goto out_free;
1887 		}
1888 	}
1889 
1890 	/*
1891 	 * Make sure we cannot change seccomp or nnp state via TSYNC
1892 	 * while another thread is in the middle of calling exec.
1893 	 */
1894 	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
1895 	    mutex_lock_killable(&current->signal->cred_guard_mutex))
1896 		goto out_put_fd;
1897 
1898 	spin_lock_irq(&current->sighand->siglock);
1899 
1900 	if (!seccomp_may_assign_mode(seccomp_mode))
1901 		goto out;
1902 
1903 	if (has_duplicate_listener(prepared)) {
1904 		ret = -EBUSY;
1905 		goto out;
1906 	}
1907 
1908 	ret = seccomp_attach_filter(flags, prepared);
1909 	if (ret)
1910 		goto out;
1911 	/* Do not free the successfully attached filter. */
1912 	prepared = NULL;
1913 
1914 	seccomp_assign_mode(current, seccomp_mode, flags);
1915 out:
1916 	spin_unlock_irq(&current->sighand->siglock);
1917 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
1918 		mutex_unlock(&current->signal->cred_guard_mutex);
1919 out_put_fd:
1920 	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
1921 		if (ret) {
1922 			listener_f->private_data = NULL;
1923 			fput(listener_f);
1924 			put_unused_fd(listener);
1925 			seccomp_notify_detach(prepared);
1926 		} else {
1927 			fd_install(listener, listener_f);
1928 			ret = listener;
1929 		}
1930 	}
1931 out_free:
1932 	seccomp_filter_free(prepared);
1933 	return ret;
1934 }
1935 #else
1936 static inline long seccomp_set_mode_filter(unsigned int flags,
1937 					   const char __user *filter)
1938 {
1939 	return -EINVAL;
1940 }
1941 #endif
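/*
 * Illustration (userspace, error handling omitted): a minimal filter
 * installation through seccomp_set_mode_filter() above. This assumes the
 * caller sets no_new_privs (or has CAP_SYS_ADMIN) and issues the raw
 * syscall, since glibc provides no seccomp() wrapper:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),	// allow everything
 *	};
 *	struct sock_fprog prog = {
 *		.len = sizeof(insns) / sizeof(insns[0]),
 *		.filter = insns,
 *	};
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog);
 *
 * Repeating the call stacks additional filters; as noted in the kerneldoc,
 * every installed filter runs on each syscall, newest first.
 */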
1942 
1943 static long seccomp_get_action_avail(const char __user *uaction)
1944 {
1945 	u32 action;
1946 
1947 	if (copy_from_user(&action, uaction, sizeof(action)))
1948 		return -EFAULT;
1949 
1950 	switch (action) {
1951 	case SECCOMP_RET_KILL_PROCESS:
1952 	case SECCOMP_RET_KILL_THREAD:
1953 	case SECCOMP_RET_TRAP:
1954 	case SECCOMP_RET_ERRNO:
1955 	case SECCOMP_RET_USER_NOTIF:
1956 	case SECCOMP_RET_TRACE:
1957 	case SECCOMP_RET_LOG:
1958 	case SECCOMP_RET_ALLOW:
1959 		break;
1960 	default:
1961 		return -EOPNOTSUPP;
1962 	}
1963 
1964 	return 0;
1965 }
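/*
 * Illustration (userspace, error handling omitted): SECCOMP_GET_ACTION_AVAIL
 * lets a program probe for a return action before relying on it:
 *
 *	__u32 action = SECCOMP_RET_USER_NOTIF;
 *
 *	if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0)
 *		;	// this kernel understands SECCOMP_RET_USER_NOTIF
 *	else
 *		;	// errno == EOPNOTSUPP: fall back to another action
 */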
1966 
1967 static long seccomp_get_notif_sizes(void __user *usizes)
1968 {
1969 	struct seccomp_notif_sizes sizes = {
1970 		.seccomp_notif = sizeof(struct seccomp_notif),
1971 		.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
1972 		.seccomp_data = sizeof(struct seccomp_data),
1973 	};
1974 
1975 	if (copy_to_user(usizes, &sizes, sizeof(sizes)))
1976 		return -EFAULT;
1977 
1978 	return 0;
1979 }
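/*
 * Illustration (userspace, error handling omitted): a user-notification
 * supervisor should query the structure sizes the kernel was built with, so
 * it can detect a kernel whose seccomp_notif/seccomp_data differ from the
 * headers it was compiled against:
 *
 *	struct seccomp_notif_sizes sizes;
 *
 *	syscall(__NR_seccomp, SECCOMP_GET_NOTIF_SIZES, 0, &sizes);
 *	// sizes.seccomp_notif, sizes.seccomp_notif_resp and sizes.seccomp_data
 *	// now hold the kernel's sizeof() for the three structures.
 */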
1980 
1981 /* Common entry point for both prctl and syscall. */
1982 static long do_seccomp(unsigned int op, unsigned int flags,
1983 		       void __user *uargs)
1984 {
1985 	switch (op) {
1986 	case SECCOMP_SET_MODE_STRICT:
1987 		if (flags != 0 || uargs != NULL)
1988 			return -EINVAL;
1989 		return seccomp_set_mode_strict();
1990 	case SECCOMP_SET_MODE_FILTER:
1991 		return seccomp_set_mode_filter(flags, uargs);
1992 	case SECCOMP_GET_ACTION_AVAIL:
1993 		if (flags != 0)
1994 			return -EINVAL;
1995 
1996 		return seccomp_get_action_avail(uargs);
1997 	case SECCOMP_GET_NOTIF_SIZES:
1998 		if (flags != 0)
1999 			return -EINVAL;
2000 
2001 		return seccomp_get_notif_sizes(uargs);
2002 	default:
2003 		return -EINVAL;
2004 	}
2005 }
2006 
2007 SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
2008 			 void __user *, uargs)
2009 {
2010 	return do_seccomp(op, flags, uargs);
2011 }
2012 
2013 /**
2014  * prctl_set_seccomp: configures current->seccomp.mode
2015  * @seccomp_mode: requested mode to use
2016  * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
2017  *
2018  * Returns 0 on success or -EINVAL on failure.
2019  */
2020 long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
2021 {
2022 	unsigned int op;
2023 	void __user *uargs;
2024 
2025 	switch (seccomp_mode) {
2026 	case SECCOMP_MODE_STRICT:
2027 		op = SECCOMP_SET_MODE_STRICT;
2028 		/*
2029 		 * Setting strict mode through prctl always ignored filter,
2030 		 * so make sure it is always NULL here to pass the internal
2031 		 * check in do_seccomp().
2032 		 */
2033 		uargs = NULL;
2034 		break;
2035 	case SECCOMP_MODE_FILTER:
2036 		op = SECCOMP_SET_MODE_FILTER;
2037 		uargs = filter;
2038 		break;
2039 	default:
2040 		return -EINVAL;
2041 	}
2042 
2043 	/* prctl interface doesn't have flags, so they are always zero. */
2044 	return do_seccomp(op, 0, uargs);
2045 }
2046 
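/*
 * Illustration (userspace): the legacy prctl(2) entry point above maps onto
 * the same do_seccomp() operations:
 *
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
 *		// mode 1: only read(), write(), _exit() and sigreturn() remain
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 *		// same as seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), but with
 *		// no way to pass flags such as TSYNC or NEW_LISTENER
 */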
2047 #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
2048 static struct seccomp_filter *get_nth_filter(struct task_struct *task,
2049 					     unsigned long filter_off)
2050 {
2051 	struct seccomp_filter *orig, *filter;
2052 	unsigned long count;
2053 
2054 	/*
2055 	 * Note: this is only correct because the caller is expected to be the
2056 	 * (ptrace) tracer of the task; otherwise lock_task_sighand() is needed.
2057 	 */
2058 	spin_lock_irq(&task->sighand->siglock);
2059 
2060 	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
2061 		spin_unlock_irq(&task->sighand->siglock);
2062 		return ERR_PTR(-EINVAL);
2063 	}
2064 
2065 	orig = task->seccomp.filter;
2066 	__get_seccomp_filter(orig);
2067 	spin_unlock_irq(&task->sighand->siglock);
2068 
2069 	count = 0;
2070 	for (filter = orig; filter; filter = filter->prev)
2071 		count++;
2072 
2073 	if (filter_off >= count) {
2074 		filter = ERR_PTR(-ENOENT);
2075 		goto out;
2076 	}
2077 
2078 	count -= filter_off;
2079 	for (filter = orig; filter && count > 1; filter = filter->prev)
2080 		count--;
2081 
2082 	if (WARN_ON(count != 1 || !filter)) {
2083 		filter = ERR_PTR(-ENOENT);
2084 		goto out;
2085 	}
2086 
2087 	__get_seccomp_filter(filter);
2088 
2089 out:
2090 	__put_seccomp_filter(orig);
2091 	return filter;
2092 }
2093 
2094 long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
2095 			void __user *data)
2096 {
2097 	struct seccomp_filter *filter;
2098 	struct sock_fprog_kern *fprog;
2099 	long ret;
2100 
2101 	if (!capable(CAP_SYS_ADMIN) ||
2102 	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
2103 		return -EACCES;
2104 	}
2105 
2106 	filter = get_nth_filter(task, filter_off);
2107 	if (IS_ERR(filter))
2108 		return PTR_ERR(filter);
2109 
2110 	fprog = filter->prog->orig_prog;
2111 	if (!fprog) {
2112 		/* This must be a new non-cBPF filter, since we save
2113 		 * every cBPF filter's orig_prog above when
2114 		 * CONFIG_CHECKPOINT_RESTORE is enabled.
2115 		 */
2116 		ret = -EMEDIUMTYPE;
2117 		goto out;
2118 	}
2119 
2120 	ret = fprog->len;
2121 	if (!data)
2122 		goto out;
2123 
2124 	if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
2125 		ret = -EFAULT;
2126 
2127 out:
2128 	__put_seccomp_filter(filter);
2129 	return ret;
2130 }
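/*
 * Illustration (userspace, error handling omitted): seccomp_get_filter()
 * backs the PTRACE_SECCOMP_GET_FILTER request used by checkpoint/restore.
 * A tracer with CAP_SYS_ADMIN that is not itself confined by seccomp can
 * dump the cBPF program of the filter at a given index:
 *
 *	struct sock_filter *insns;
 *	long cnt;
 *
 *	cnt = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, NULL);	// insn count
 *	insns = calloc(cnt, sizeof(*insns));
 *	ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, insns);	// copy program
 */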
2131 
2132 long seccomp_get_metadata(struct task_struct *task,
2133 			  unsigned long size, void __user *data)
2134 {
2135 	long ret;
2136 	struct seccomp_filter *filter;
2137 	struct seccomp_metadata kmd = {};
2138 
2139 	if (!capable(CAP_SYS_ADMIN) ||
2140 	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
2141 		return -EACCES;
2142 	}
2143 
2144 	size = min_t(unsigned long, size, sizeof(kmd));
2145 
2146 	if (size < sizeof(kmd.filter_off))
2147 		return -EINVAL;
2148 
2149 	if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off)))
2150 		return -EFAULT;
2151 
2152 	filter = get_nth_filter(task, kmd.filter_off);
2153 	if (IS_ERR(filter))
2154 		return PTR_ERR(filter);
2155 
2156 	if (filter->log)
2157 		kmd.flags |= SECCOMP_FILTER_FLAG_LOG;
2158 
2159 	ret = size;
2160 	if (copy_to_user(data, &kmd, size))
2161 		ret = -EFAULT;
2162 
2163 	__put_seccomp_filter(filter);
2164 	return ret;
2165 }
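/*
 * Illustration (userspace, error handling omitted): seccomp_get_metadata()
 * backs PTRACE_SECCOMP_GET_METADATA. The tracer passes its
 * sizeof(struct seccomp_metadata) in addr and the buffer in data, with
 * filter_off filled in beforehand:
 *
 *	struct seccomp_metadata md = { .filter_off = 0 };
 *
 *	ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
 *	// md.flags now reports e.g. SECCOMP_FILTER_FLAG_LOG for that filter
 */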
2166 #endif
2167 
2168 #ifdef CONFIG_SYSCTL
2169 
2170 /* Human-readable action names for friendly sysctl interaction */
2171 #define SECCOMP_RET_KILL_PROCESS_NAME	"kill_process"
2172 #define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
2173 #define SECCOMP_RET_TRAP_NAME		"trap"
2174 #define SECCOMP_RET_ERRNO_NAME		"errno"
2175 #define SECCOMP_RET_USER_NOTIF_NAME	"user_notif"
2176 #define SECCOMP_RET_TRACE_NAME		"trace"
2177 #define SECCOMP_RET_LOG_NAME		"log"
2178 #define SECCOMP_RET_ALLOW_NAME		"allow"
2179 
2180 static const char seccomp_actions_avail[] =
2181 				SECCOMP_RET_KILL_PROCESS_NAME	" "
2182 				SECCOMP_RET_KILL_THREAD_NAME	" "
2183 				SECCOMP_RET_TRAP_NAME		" "
2184 				SECCOMP_RET_ERRNO_NAME		" "
2185 				SECCOMP_RET_USER_NOTIF_NAME     " "
2186 				SECCOMP_RET_TRACE_NAME		" "
2187 				SECCOMP_RET_LOG_NAME		" "
2188 				SECCOMP_RET_ALLOW_NAME;
2189 
2190 struct seccomp_log_name {
2191 	u32		log;
2192 	const char	*name;
2193 };
2194 
2195 static const struct seccomp_log_name seccomp_log_names[] = {
2196 	{ SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
2197 	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
2198 	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
2199 	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
2200 	{ SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
2201 	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
2202 	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
2203 	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
2204 	{ }
2205 };
2206 
2207 static bool seccomp_names_from_actions_logged(char *names, size_t size,
2208 					      u32 actions_logged,
2209 					      const char *sep)
2210 {
2211 	const struct seccomp_log_name *cur;
2212 	bool append_sep = false;
2213 
2214 	for (cur = seccomp_log_names; cur->name && size; cur++) {
2215 		ssize_t ret;
2216 
2217 		if (!(actions_logged & cur->log))
2218 			continue;
2219 
2220 		if (append_sep) {
2221 			ret = strscpy(names, sep, size);
2222 			if (ret < 0)
2223 				return false;
2224 
2225 			names += ret;
2226 			size -= ret;
2227 		} else
2228 			append_sep = true;
2229 
2230 		ret = strscpy(names, cur->name, size);
2231 		if (ret < 0)
2232 			return false;
2233 
2234 		names += ret;
2235 		size -= ret;
2236 	}
2237 
2238 	return true;
2239 }
2240 
2241 static bool seccomp_action_logged_from_name(u32 *action_logged,
2242 					    const char *name)
2243 {
2244 	const struct seccomp_log_name *cur;
2245 
2246 	for (cur = seccomp_log_names; cur->name; cur++) {
2247 		if (!strcmp(cur->name, name)) {
2248 			*action_logged = cur->log;
2249 			return true;
2250 		}
2251 	}
2252 
2253 	return false;
2254 }
2255 
2256 static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
2257 {
2258 	char *name;
2259 
2260 	*actions_logged = 0;
2261 	while ((name = strsep(&names, " ")) && *name) {
2262 		u32 action_logged = 0;
2263 
2264 		if (!seccomp_action_logged_from_name(&action_logged, name))
2265 			return false;
2266 
2267 		*actions_logged |= action_logged;
2268 	}
2269 
2270 	return true;
2271 }
2272 
2273 static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
2274 			       size_t *lenp, loff_t *ppos)
2275 {
2276 	char names[sizeof(seccomp_actions_avail)];
2277 	struct ctl_table table;
2278 
2279 	memset(names, 0, sizeof(names));
2280 
2281 	if (!seccomp_names_from_actions_logged(names, sizeof(names),
2282 					       seccomp_actions_logged, " "))
2283 		return -EINVAL;
2284 
2285 	table = *ro_table;
2286 	table.data = names;
2287 	table.maxlen = sizeof(names);
2288 	return proc_dostring(&table, 0, buffer, lenp, ppos);
2289 }
2290 
2291 static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
2292 				size_t *lenp, loff_t *ppos, u32 *actions_logged)
2293 {
2294 	char names[sizeof(seccomp_actions_avail)];
2295 	struct ctl_table table;
2296 	int ret;
2297 
2298 	if (!capable(CAP_SYS_ADMIN))
2299 		return -EPERM;
2300 
2301 	memset(names, 0, sizeof(names));
2302 
2303 	table = *ro_table;
2304 	table.data = names;
2305 	table.maxlen = sizeof(names);
2306 	ret = proc_dostring(&table, 1, buffer, lenp, ppos);
2307 	if (ret)
2308 		return ret;
2309 
2310 	if (!seccomp_actions_logged_from_names(actions_logged, table.data))
2311 		return -EINVAL;
2312 
2313 	if (*actions_logged & SECCOMP_LOG_ALLOW)
2314 		return -EINVAL;
2315 
2316 	seccomp_actions_logged = *actions_logged;
2317 	return 0;
2318 }
2319 
2320 static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
2321 				 int ret)
2322 {
2323 	char names[sizeof(seccomp_actions_avail)];
2324 	char old_names[sizeof(seccomp_actions_avail)];
2325 	const char *new = names;
2326 	const char *old = old_names;
2327 
2328 	if (!audit_enabled)
2329 		return;
2330 
2331 	memset(names, 0, sizeof(names));
2332 	memset(old_names, 0, sizeof(old_names));
2333 
2334 	if (ret)
2335 		new = "?";
2336 	else if (!actions_logged)
2337 		new = "(none)";
2338 	else if (!seccomp_names_from_actions_logged(names, sizeof(names),
2339 						    actions_logged, ","))
2340 		new = "?";
2341 
2342 	if (!old_actions_logged)
2343 		old = "(none)";
2344 	else if (!seccomp_names_from_actions_logged(old_names,
2345 						    sizeof(old_names),
2346 						    old_actions_logged, ","))
2347 		old = "?";
2348 
2349 	return audit_seccomp_actions_logged(new, old, !ret);
2350 }
2351 
2352 static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
2353 					  void *buffer, size_t *lenp,
2354 					  loff_t *ppos)
2355 {
2356 	int ret;
2357 
2358 	if (write) {
2359 		u32 actions_logged = 0;
2360 		u32 old_actions_logged = seccomp_actions_logged;
2361 
2362 		ret = write_actions_logged(ro_table, buffer, lenp, ppos,
2363 					   &actions_logged);
2364 		audit_actions_logged(actions_logged, old_actions_logged, ret);
2365 	} else
2366 		ret = read_actions_logged(ro_table, buffer, lenp, ppos);
2367 
2368 	return ret;
2369 }
2370 
2371 static struct ctl_path seccomp_sysctl_path[] = {
2372 	{ .procname = "kernel", },
2373 	{ .procname = "seccomp", },
2374 	{ }
2375 };
2376 
2377 static struct ctl_table seccomp_sysctl_table[] = {
2378 	{
2379 		.procname	= "actions_avail",
2380 		.data		= (void *) &seccomp_actions_avail,
2381 		.maxlen		= sizeof(seccomp_actions_avail),
2382 		.mode		= 0444,
2383 		.proc_handler	= proc_dostring,
2384 	},
2385 	{
2386 		.procname	= "actions_logged",
2387 		.mode		= 0644,
2388 		.proc_handler	= seccomp_actions_logged_handler,
2389 	},
2390 	{ }
2391 };
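/*
 * Illustration: with the path and table above, these knobs surface as
 * /proc/sys/kernel/seccomp/actions_avail (read-only) and
 * /proc/sys/kernel/seccomp/actions_logged (writable with CAP_SYS_ADMIN).
 * Reading the former from userspace, for example:
 *
 *	char buf[128] = { 0 };
 *	int fd = open("/proc/sys/kernel/seccomp/actions_avail", O_RDONLY);
 *
 *	read(fd, buf, sizeof(buf) - 1);
 *	// buf: "kill_process kill_thread trap errno user_notif trace log allow"
 *
 * Writing a space-separated subset of those names (any of them except
 * "allow") to actions_logged selects which actions are audit-logged.
 */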
2392 
2393 static int __init seccomp_sysctl_init(void)
2394 {
2395 	struct ctl_table_header *hdr;
2396 
2397 	hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
2398 	if (!hdr)
2399 		pr_warn("sysctl registration failed\n");
2400 	else
2401 		kmemleak_not_leak(hdr);
2402 
2403 	return 0;
2404 }
2405 
2406 device_initcall(seccomp_sysctl_init)
2407 
2408 #endif /* CONFIG_SYSCTL */
2409 
2410 #ifdef CONFIG_SECCOMP_CACHE_DEBUG
2411 /* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
2412 static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
2413 					const void *bitmap, size_t bitmap_size)
2414 {
2415 	int nr;
2416 
2417 	for (nr = 0; nr < bitmap_size; nr++) {
2418 		bool cached = test_bit(nr, bitmap);
2419 		char *status = cached ? "ALLOW" : "FILTER";
2420 
2421 		seq_printf(m, "%s %d %s\n", name, nr, status);
2422 	}
2423 }
2424 
2425 int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
2426 			   struct pid *pid, struct task_struct *task)
2427 {
2428 	struct seccomp_filter *f;
2429 	unsigned long flags;
2430 
2431 	/*
2432 	 * We don't want a sandboxed process to know what its seccomp
2433 	 * filters consist of.
2434 	 */
2435 	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
2436 		return -EACCES;
2437 
2438 	if (!lock_task_sighand(task, &flags))
2439 		return -ESRCH;
2440 
2441 	f = READ_ONCE(task->seccomp.filter);
2442 	if (!f) {
2443 		unlock_task_sighand(task, &flags);
2444 		return 0;
2445 	}
2446 
2447 	/* prevent filter from being freed while we are printing it */
2448 	__get_seccomp_filter(f);
2449 	unlock_task_sighand(task, &flags);
2450 
2451 	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
2452 				    f->cache.allow_native,
2453 				    SECCOMP_ARCH_NATIVE_NR);
2454 
2455 #ifdef SECCOMP_ARCH_COMPAT
2456 	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
2457 				    f->cache.allow_compat,
2458 				    SECCOMP_ARCH_COMPAT_NR);
2459 #endif /* SECCOMP_ARCH_COMPAT */
2460 
2461 	__put_seccomp_filter(f);
2462 	return 0;
2463 }
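/*
 * Illustration: the helpers above implement /proc/<pid>/seccomp_cache
 * (CONFIG_SECCOMP_CACHE_DEBUG), readable only with CAP_SYS_ADMIN in the
 * initial user namespace. Each line is "<arch> <nr> <ALLOW|FILTER>", e.g.
 * on x86-64:
 *
 *	x86_64 0 ALLOW
 *	x86_64 1 ALLOW
 *	x86_64 2 FILTER
 *
 * ALLOW means the per-filter bitmap lets that syscall number skip running
 * the attached filters entirely; FILTER means the programs still run.
 */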
2464 #endif /* CONFIG_SECCOMP_CACHE_DEBUG */
2465