xref: /openbmc/linux/kernel/seccomp.c (revision c0c45238fcf44b05c86f2f7d1dda136df7a83ff9)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * linux/kernel/seccomp.c
4  *
5  * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
6  *
7  * Copyright (C) 2012 Google, Inc.
8  * Will Drewry <wad@chromium.org>
9  *
10  * This defines a simple but solid secure-computing facility.
11  *
12  * Mode 1 uses a fixed list of allowed system calls.
13  * Mode 2 allows user-defined system call filters in the form
14  *        of Berkeley Packet Filters/Linux Socket Filters.
15  */
16 #define pr_fmt(fmt) "seccomp: " fmt
17 
18 #include <linux/refcount.h>
19 #include <linux/audit.h>
20 #include <linux/compat.h>
21 #include <linux/coredump.h>
22 #include <linux/kmemleak.h>
23 #include <linux/nospec.h>
24 #include <linux/prctl.h>
25 #include <linux/sched.h>
26 #include <linux/sched/task_stack.h>
27 #include <linux/seccomp.h>
28 #include <linux/slab.h>
29 #include <linux/syscalls.h>
30 #include <linux/sysctl.h>
31 
32 /* Not exposed in headers: strictly internal use only. */
33 #define SECCOMP_MODE_DEAD	(SECCOMP_MODE_FILTER + 1)
34 
35 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
36 #include <asm/syscall.h>
37 #endif
38 
39 #ifdef CONFIG_SECCOMP_FILTER
40 #include <linux/file.h>
41 #include <linux/filter.h>
42 #include <linux/pid.h>
43 #include <linux/ptrace.h>
44 #include <linux/capability.h>
45 #include <linux/uaccess.h>
46 #include <linux/anon_inodes.h>
47 #include <linux/lockdep.h>
48 
49 /*
50  * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
51  * wrong direction flag in the ioctl number. This is the broken one,
52  * which the kernel needs to keep supporting until all userspaces stop
53  * using the wrong command number.
54  */
55 #define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR	SECCOMP_IOR(2, __u64)
56 
57 enum notify_state {	/* lifecycle of a struct seccomp_knotif */
58 	SECCOMP_NOTIFY_INIT,	/* freshly created, or reset after a signal */
59 	SECCOMP_NOTIFY_SENT,	/* the handler has read it off of the FD */
60 	SECCOMP_NOTIFY_REPLIED,	/* the userspace handler has replied */
61 };
62 
63 struct seccomp_knotif {
64 	/* The task whose filter triggered the notification */
65 	struct task_struct *task;
66 
67 	/* The "cookie" for this request; this is unique for this filter. */
68 	u64 id;
69 
70 	/*
71 	 * The seccomp data. This pointer is valid the entire time this
72 	 * notification is active, since it comes from __seccomp_filter which
73 	 * eclipses the entire lifecycle here.
74 	 */
75 	const struct seccomp_data *data;
76 
77 	/*
78 	 * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
79 	 * struct seccomp_knotif is created and starts out in INIT. Once the
80 	 * handler reads the notification off of an FD, it transitions to SENT.
81 	 * If a signal is received the state transitions back to INIT and
82 	 * another message is sent. When the userspace handler replies, state
83 	 * transitions to REPLIED.
84 	 */
85 	enum notify_state state;
86 
87 	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
88 	int error;
89 	long val;
90 	u32 flags;
91 
92 	/*
93 	 * Signals when this has changed states, such as the listener
94 	 * dying, a new seccomp addfd message, or changing to REPLIED
95 	 */
96 	struct completion ready;
97 
98 	struct list_head list;	/* NOTE(review): presumably an entry in struct notification's @notifications list - confirm */
99 
100 	/* outstanding addfd requests */
101 	struct list_head addfd;
102 };
103 
104 /**
105  * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
106  *
107  * @file: A reference to the file to install in the other task
108  * @fd: The fd number to install it at. If the fd number is -1, it means the
109  *      installing process should allocate the fd as normal.
110  * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
111  *         is allowed.
112  * @ioctl_flags: The flags used for the seccomp_addfd ioctl.
113  * @ret: The return value of the installing process. It is set to the fd num
114  *       upon success (>= 0). Shares storage with @setfd.
115  * @completion: Indicates that the installing process has completed fd
116  *              installation, or gone away (either due to successful
117  *              reply, or signal)
118  *
119  */
120 struct seccomp_kaddfd {
121 	struct file *file;
122 	int fd;
123 	unsigned int flags;
124 	__u32 ioctl_flags;
125 
126 	union {	/* setfd and ret occupy the same storage */
127 		bool setfd;	/* NOTE(review): meaning not visible in this chunk - confirm against the ioctl handler */
128 		/* To only be set on reply */
129 		int ret;
130 	};
131 	struct completion completion;
132 	struct list_head list;	/* NOTE(review): presumably links into seccomp_knotif's addfd list - confirm */
133 };
134 
135 /**
136  * struct notification - container for seccomp userspace notifications. Since
137  * most seccomp filters will not have notification listeners attached and this
138  * structure is fairly large, we store the notification-specific stuff in a
139  * separate structure.
140  *
141  * @request: A semaphore that users of this notification can wait on for
142  *           changes. Actual reads and writes are still controlled with
143  *           filter->notify_lock.
144  * @next_id: The id of the next request.
145  * @notifications: A list of struct seccomp_knotif elements.
146  */
147 struct notification {	/* referenced from struct seccomp_filter through its notif pointer */
148 	struct semaphore request;
149 	u64 next_id;
150 	struct list_head notifications;
151 };
152 
153 #ifdef SECCOMP_ARCH_NATIVE
154 /**
155  * struct action_cache - per-filter cache of seccomp actions per
156  * arch/syscall pair
157  *
158  * @allow_native: A bitmap where each bit represents whether the
159  *		  filter will always allow the syscall, for the
160  *		  native architecture.
161  * @allow_compat: A bitmap where each bit represents whether the
162  *		  filter will always allow the syscall, for the
163  *		  compat architecture.
164  */
165 struct action_cache {
166 	DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
167 #ifdef SECCOMP_ARCH_COMPAT
168 	DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
169 #endif
170 };
171 #else
172 struct action_cache { };	/* empty placeholder when SECCOMP_ARCH_NATIVE is not defined */
173 
174 static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
175 					     const struct seccomp_data *sd)
176 {
177 	return false;	/* nothing is cached, so seccomp_run_filters() always runs the filters */
178 }
179 
180 static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
181 {	/* no bitmaps to build in this configuration */
182 }
183 #endif /* SECCOMP_ARCH_NATIVE */
184 
185 /**
186  * struct seccomp_filter - container for seccomp BPF programs
187  *
188  * @refs: Reference count to manage the object lifetime.
189  *	  A filter's reference count is incremented for each directly
190  *	  attached task, once for the dependent filter, and if
191  *	  requested for the user notifier. When @refs reaches zero,
192  *	  the filter can be freed.
193  * @users: A filter's @users count is incremented for each directly
194  *         attached task (filter installation, fork(), thread_sync),
195  *	   and once for the dependent filter (tracked in filter->prev).
196  *	   When it reaches zero it indicates that no direct or indirect
197  *	   users of that filter exist. No new tasks can get associated with
198  *	   this filter after reaching 0. The @users count is always smaller
199  *	   or equal to @refs. Hence, reaching 0 for @users does not mean
200  *	   the filter can be freed.
201  * @cache: cache of arch/syscall mappings to actions
202  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
203  * @prev: points to a previously installed, or inherited, filter
204  * @prog: the BPF program to evaluate
205  * @notif: the struct that holds all notification related information
206  * @notify_lock: A lock for all notification-related accesses.
207  * @wqh: A wait queue for poll if a notifier is in use.
208  *
209  * seccomp_filter objects are organized in a tree linked via the @prev
210  * pointer.  For any task, it appears to be a singly-linked list starting
211  * with current->seccomp.filter, the most recently attached or inherited filter.
212  * However, multiple filters may share a @prev node, by way of fork(), which
213  * results in a unidirectional tree existing in memory.  This is similar to
214  * how namespaces work.
215  *
216  * seccomp_filter objects should never be modified after being attached
217  * to a task_struct (other than the @refs and @users counters).
218  */
219 struct seccomp_filter {
220 	refcount_t refs;
221 	refcount_t users;
222 	bool log;
223 	struct action_cache cache;
224 	struct seccomp_filter *prev;
225 	struct bpf_prog *prog;
226 	struct notification *notif;
227 	struct mutex notify_lock;
228 	wait_queue_head_t wqh;
229 };
230 
231 /* Limit any path through the tree to 256KB worth of instructions. */
232 #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
233 
234 /*
235  * Endianness is explicitly ignored and left for BPF program authors to manage
236  * as per the specific architecture.
237  */
238 static void populate_seccomp_data(struct seccomp_data *sd)
239 {
240 	/*
241 	 * Instead of using current_pt_reg(), we're already doing the work
242 	 * to safely fetch "current", so just use "task" everywhere below.
243 	 */
244 	struct task_struct *task = current;
245 	struct pt_regs *regs = task_pt_regs(task);
246 	unsigned long args[6];
247 
248 	sd->nr = syscall_get_nr(task, regs);
249 	sd->arch = syscall_get_arch(task);
250 	syscall_get_arguments(task, regs, args);
251 	sd->args[0] = args[0];
252 	sd->args[1] = args[1];
253 	sd->args[2] = args[2];
254 	sd->args[3] = args[3];
255 	sd->args[4] = args[4];
256 	sd->args[5] = args[5];
257 	sd->instruction_pointer = KSTK_EIP(task);
258 }
259 
260 /**
261  *	seccomp_check_filter - verify seccomp filter code
262  *	@filter: filter to verify
263  *	@flen: length of filter
264  *
265  * Takes a previously checked filter (by bpf_check_classic) and
266  * redirects all filter code that loads struct sk_buff data
267  * and related data through seccomp_bpf_load.  It also
268  * enforces length and alignment checking of those loads.
269  *
270  * Returns 0 if the rule set is legal or -EINVAL if not.
271  */
272 static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
273 {
274 	int pc;
275 	for (pc = 0; pc < flen; pc++) {
276 		struct sock_filter *ftest = &filter[pc];
277 		u16 code = ftest->code;
278 		u32 k = ftest->k;
279 
280 		switch (code) {
281 		case BPF_LD | BPF_W | BPF_ABS:
282 			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
283 			/* 32-bit aligned and not out of bounds. */
284 			if (k >= sizeof(struct seccomp_data) || k & 3)
285 				return -EINVAL;
286 			continue;
287 		case BPF_LD | BPF_W | BPF_LEN:
288 			ftest->code = BPF_LD | BPF_IMM;
289 			ftest->k = sizeof(struct seccomp_data);
290 			continue;
291 		case BPF_LDX | BPF_W | BPF_LEN:
292 			ftest->code = BPF_LDX | BPF_IMM;
293 			ftest->k = sizeof(struct seccomp_data);
294 			continue;
295 		/* Explicitly include allowed calls. */
296 		case BPF_RET | BPF_K:
297 		case BPF_RET | BPF_A:
298 		case BPF_ALU | BPF_ADD | BPF_K:
299 		case BPF_ALU | BPF_ADD | BPF_X:
300 		case BPF_ALU | BPF_SUB | BPF_K:
301 		case BPF_ALU | BPF_SUB | BPF_X:
302 		case BPF_ALU | BPF_MUL | BPF_K:
303 		case BPF_ALU | BPF_MUL | BPF_X:
304 		case BPF_ALU | BPF_DIV | BPF_K:
305 		case BPF_ALU | BPF_DIV | BPF_X:
306 		case BPF_ALU | BPF_AND | BPF_K:
307 		case BPF_ALU | BPF_AND | BPF_X:
308 		case BPF_ALU | BPF_OR | BPF_K:
309 		case BPF_ALU | BPF_OR | BPF_X:
310 		case BPF_ALU | BPF_XOR | BPF_K:
311 		case BPF_ALU | BPF_XOR | BPF_X:
312 		case BPF_ALU | BPF_LSH | BPF_K:
313 		case BPF_ALU | BPF_LSH | BPF_X:
314 		case BPF_ALU | BPF_RSH | BPF_K:
315 		case BPF_ALU | BPF_RSH | BPF_X:
316 		case BPF_ALU | BPF_NEG:
317 		case BPF_LD | BPF_IMM:
318 		case BPF_LDX | BPF_IMM:
319 		case BPF_MISC | BPF_TAX:
320 		case BPF_MISC | BPF_TXA:
321 		case BPF_LD | BPF_MEM:
322 		case BPF_LDX | BPF_MEM:
323 		case BPF_ST:
324 		case BPF_STX:
325 		case BPF_JMP | BPF_JA:
326 		case BPF_JMP | BPF_JEQ | BPF_K:
327 		case BPF_JMP | BPF_JEQ | BPF_X:
328 		case BPF_JMP | BPF_JGE | BPF_K:
329 		case BPF_JMP | BPF_JGE | BPF_X:
330 		case BPF_JMP | BPF_JGT | BPF_K:
331 		case BPF_JMP | BPF_JGT | BPF_X:
332 		case BPF_JMP | BPF_JSET | BPF_K:
333 		case BPF_JMP | BPF_JSET | BPF_X:
334 			continue;
335 		default:
336 			return -EINVAL;
337 		}
338 	}
339 	return 0;
340 }
341 
342 #ifdef SECCOMP_ARCH_NATIVE
343 static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
344 						    size_t bitmap_size,
345 						    int syscall_nr)
346 {
347 	if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
348 		return false;
349 	syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
350 
351 	return test_bit(syscall_nr, bitmap);
352 }
353 
354 /**
355  * seccomp_cache_check_allow - lookup seccomp cache
356  * @sfilter: The seccomp filter
357  * @sd: The seccomp data to lookup the cache with
358  *
359  * Returns true if the seccomp_data is cached and allowed.
360  */
361 static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
362 					     const struct seccomp_data *sd)
363 {
364 	int syscall_nr = sd->nr;
365 	const struct action_cache *cache = &sfilter->cache;
366 
367 #ifndef SECCOMP_ARCH_COMPAT
368 	/* A native-only architecture doesn't need to check sd->arch. */
369 	return seccomp_cache_check_allow_bitmap(cache->allow_native,
370 						SECCOMP_ARCH_NATIVE_NR,
371 						syscall_nr);
372 #else
373 	if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
374 		return seccomp_cache_check_allow_bitmap(cache->allow_native,
375 							SECCOMP_ARCH_NATIVE_NR,
376 							syscall_nr);
377 	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
378 		return seccomp_cache_check_allow_bitmap(cache->allow_compat,
379 							SECCOMP_ARCH_COMPAT_NR,
380 							syscall_nr);
381 #endif /* SECCOMP_ARCH_COMPAT */
382 
383 	WARN_ON_ONCE(true);
384 	return false;
385 }
386 #endif /* SECCOMP_ARCH_NATIVE */
387 
388 /**
389  * seccomp_run_filters - evaluates all seccomp filters against @sd
390  * @sd: optional seccomp data to be passed to filters
391  * @match: stores struct seccomp_filter that resulted in the return value,
392  *         unless filter returned SECCOMP_RET_ALLOW, in which case it will
393  *         be unchanged.
394  *
395  * Returns valid seccomp BPF response codes.
396  */
397 #define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
398 static u32 seccomp_run_filters(const struct seccomp_data *sd,
399 			       struct seccomp_filter **match)
400 {
401 	u32 ret = SECCOMP_RET_ALLOW;
402 	/* Make sure cross-thread synced filter points somewhere sane. */
403 	struct seccomp_filter *f =
404 			READ_ONCE(current->seccomp.filter);
405 
406 	/* Ensure unexpected behavior doesn't result in failing open. */
407 	if (WARN_ON(f == NULL))
408 		return SECCOMP_RET_KILL_PROCESS;
409 
410 	if (seccomp_cache_check_allow(f, sd))
411 		return SECCOMP_RET_ALLOW;
412 
413 	/*
414 	 * All filters in the list are evaluated and the lowest BPF return
415 	 * value always takes priority (ignoring the DATA).
416 	 */
417 	for (; f; f = f->prev) {
418 		u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);
419 
420 		if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
421 			ret = cur_ret;
422 			*match = f;
423 		}
424 	}
425 	return ret;
426 }
427 #endif /* CONFIG_SECCOMP_FILTER */
428 
429 static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
430 {
431 	assert_spin_locked(&current->sighand->siglock);
432 
433 	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
434 		return false;
435 
436 	return true;
437 }
438 
439 void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
440 
441 static inline void seccomp_assign_mode(struct task_struct *task,
442 				       unsigned long seccomp_mode,
443 				       unsigned long flags)
444 {
445 	assert_spin_locked(&task->sighand->siglock);
446 
447 	task->seccomp.mode = seccomp_mode;
448 	/*
449 	 * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and
450 	 * filter) is set.
451 	 */
452 	smp_mb__before_atomic();
453 	/* Assume default seccomp processes want spec flaw mitigation. */
454 	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
455 		arch_seccomp_spec_mitigate(task);
456 	set_task_syscall_work(task, SECCOMP);
457 }
458 
459 #ifdef CONFIG_SECCOMP_FILTER
460 /* Returns 1 if the parent is an ancestor of the child. */
461 static int is_ancestor(struct seccomp_filter *parent,
462 		       struct seccomp_filter *child)
463 {
464 	/* NULL is the root ancestor. */
465 	if (parent == NULL)
466 		return 1;
467 	for (; child; child = child->prev)
468 		if (child == parent)
469 			return 1;
470 	return 0;
471 }
472 
473 /**
474  * seccomp_can_sync_threads: checks if all threads can be synchronized
475  *
476  * Expects sighand and cred_guard_mutex locks to be held.
477  *
478  * Returns 0 on success, -ve on error, or the pid of a thread which was
479  * either not in the correct seccomp mode or did not have an ancestral
480  * seccomp filter.
481  */
482 static inline pid_t seccomp_can_sync_threads(void)
483 {
484 	struct task_struct *thread, *caller;
485 
486 	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
487 	assert_spin_locked(&current->sighand->siglock);
488 
489 	/* Validate all threads being eligible for synchronization. */
490 	caller = current;
491 	for_each_thread(caller, thread) {
492 		pid_t failed;
493 
494 		/* Skip current, since it is initiating the sync. */
495 		if (thread == caller)
496 			continue;
497 
498 		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
499 		    (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
500 		     is_ancestor(thread->seccomp.filter,
501 				 caller->seccomp.filter)))
502 			continue;
503 
504 		/* Return the first thread that cannot be synchronized. */
505 		failed = task_pid_vnr(thread);
506 		/* If the pid cannot be resolved, then return -ESRCH */
507 		if (WARN_ON(failed == 0))
508 			failed = -ESRCH;
509 		return failed;
510 	}
511 
512 	return 0;
513 }
514 
515 static inline void seccomp_filter_free(struct seccomp_filter *filter)
516 {
517 	if (filter) {
518 		bpf_prog_destroy(filter->prog);
519 		kfree(filter);
520 	}
521 }
522 
523 static void __seccomp_filter_orphan(struct seccomp_filter *orig)
524 {
525 	while (orig && refcount_dec_and_test(&orig->users)) {
526 		if (waitqueue_active(&orig->wqh))
527 			wake_up_poll(&orig->wqh, EPOLLHUP);
528 		orig = orig->prev;
529 	}
530 }
531 
532 static void __put_seccomp_filter(struct seccomp_filter *orig)
533 {
534 	/* Clean up single-reference branches iteratively. */
535 	while (orig && refcount_dec_and_test(&orig->refs)) {
536 		struct seccomp_filter *freeme = orig;
537 		orig = orig->prev;
538 		seccomp_filter_free(freeme);
539 	}
540 }
541 
/*
 * Give up one task's hold on the filter tree rooted at @orig: first the
 * @users side (which wakes pollers of now-unused filters), then the @refs
 * side (which may free filters).
 */
static void __seccomp_filter_release(struct seccomp_filter *orig)
{
	/* Step 1: notify about filters that just lost their last user. */
	__seccomp_filter_orphan(orig);

	/* Step 2: drop the references, freeing whatever is no longer held. */
	__put_seccomp_filter(orig);
}
549 
550 /**
551  * seccomp_filter_release - Detach the task from its filter tree,
552  *			    drop its reference count, and notify
553  *			    about unused filters
554  *
555  * This function should only be called when the task is exiting as
556  * it detaches it from its filter tree. As such, READ_ONCE() and
557  * barriers are not needed here, as would normally be needed.
558  */
559 void seccomp_filter_release(struct task_struct *tsk)
560 {
561 	struct seccomp_filter *orig = tsk->seccomp.filter;
562 
563 	/* We are effectively holding the siglock by not having any sighand. */
564 	WARN_ON(tsk->sighand != NULL);
565 
566 	/* Detach task from its filter tree. */
567 	tsk->seccomp.filter = NULL;
568 	__seccomp_filter_release(orig);
569 }
570 
571 /**
572  * seccomp_sync_threads: sets all threads to use current's filter
573  *
574  * Expects sighand and cred_guard_mutex locks to be held, and for
575  * seccomp_can_sync_threads() to have returned success already
576  * without dropping the locks.
577  *
578  */
579 static inline void seccomp_sync_threads(unsigned long flags)
580 {
581 	struct task_struct *thread, *caller;
582 
583 	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
584 	assert_spin_locked(&current->sighand->siglock);
585 
586 	/* Synchronize all threads. */
587 	caller = current;
588 	for_each_thread(caller, thread) {
589 		/* Skip current, since it needs no changes. */
590 		if (thread == caller)
591 			continue;
592 
593 		/* Get a task reference for the new leaf node. */
594 		get_seccomp_filter(caller);
595 
596 		/*
597 		 * Drop the task reference to the shared ancestor since
598 		 * current's path will hold a reference.  (This also
599 		 * allows a put before the assignment.)
600 		 */
601 		__seccomp_filter_release(thread->seccomp.filter);
602 
603 		/* Make our new filter tree visible. */
604 		smp_store_release(&thread->seccomp.filter,
605 				  caller->seccomp.filter);
606 		atomic_set(&thread->seccomp.filter_count,
607 			   atomic_read(&caller->seccomp.filter_count));
608 
609 		/*
610 		 * Don't let an unprivileged task work around
611 		 * the no_new_privs restriction by creating
612 		 * a thread that sets it up, enters seccomp,
613 		 * then dies.
614 		 */
615 		if (task_no_new_privs(caller))
616 			task_set_no_new_privs(thread);
617 
618 		/*
619 		 * Opt the other thread into seccomp if needed.
620 		 * As threads are considered to be trust-realm
621 		 * equivalent (see ptrace_may_access), it is safe to
622 		 * allow one thread to transition the other.
623 		 */
624 		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
625 			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
626 					    flags);
627 	}
628 }
629 
630 /**
631  * seccomp_prepare_filter: Prepares a seccomp filter for use.
632  * @fprog: BPF program to install
633  *
634  * Returns filter on success or an ERR_PTR on failure.
635  */
636 static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
637 {
638 	struct seccomp_filter *sfilter;
639 	int ret;
640 	const bool save_orig =
641 #if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
642 		true;
643 #else
644 		false;
645 #endif
646 
647 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
648 		return ERR_PTR(-EINVAL);
649 
650 	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
651 
652 	/*
653 	 * Installing a seccomp filter requires that the task has
654 	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
655 	 * This avoids scenarios where unprivileged tasks can affect the
656 	 * behavior of privileged children.
657 	 */
658 	if (!task_no_new_privs(current) &&
659 			!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
660 		return ERR_PTR(-EACCES);
661 
662 	/* Allocate a new seccomp_filter */
663 	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
664 	if (!sfilter)
665 		return ERR_PTR(-ENOMEM);
666 
667 	mutex_init(&sfilter->notify_lock);
668 	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
669 					seccomp_check_filter, save_orig);
670 	if (ret < 0) {
671 		kfree(sfilter);
672 		return ERR_PTR(ret);
673 	}
674 
675 	refcount_set(&sfilter->refs, 1);
676 	refcount_set(&sfilter->users, 1);
677 	init_waitqueue_head(&sfilter->wqh);
678 
679 	return sfilter;
680 }
681 
682 /**
683  * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
684  * @user_filter: pointer to the user data containing a sock_fprog.
685  *
686  * Returns 0 on success and non-zero otherwise.
687  */
688 static struct seccomp_filter *
689 seccomp_prepare_user_filter(const char __user *user_filter)
690 {
691 	struct sock_fprog fprog;
692 	struct seccomp_filter *filter = ERR_PTR(-EFAULT);
693 
694 #ifdef CONFIG_COMPAT
695 	if (in_compat_syscall()) {
696 		struct compat_sock_fprog fprog32;
697 		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
698 			goto out;
699 		fprog.len = fprog32.len;
700 		fprog.filter = compat_ptr(fprog32.filter);
701 	} else /* falls through to the if below. */
702 #endif
703 	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
704 		goto out;
705 	filter = seccomp_prepare_filter(&fprog);
706 out:
707 	return filter;
708 }
709 
710 #ifdef SECCOMP_ARCH_NATIVE
711 /**
712  * seccomp_is_const_allow - check if filter is constant allow with given data
713  * @fprog: The BPF programs
714  * @sd: The seccomp data to check against, only syscall number and arch
715  *      number are considered constant.
716  */
717 static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
718 				   struct seccomp_data *sd)
719 {
720 	unsigned int reg_value = 0;
721 	unsigned int pc;
722 	bool op_res;
723 
724 	if (WARN_ON_ONCE(!fprog))
725 		return false;
726 
727 	for (pc = 0; pc < fprog->len; pc++) {
728 		struct sock_filter *insn = &fprog->filter[pc];
729 		u16 code = insn->code;
730 		u32 k = insn->k;
731 
732 		switch (code) {
733 		case BPF_LD | BPF_W | BPF_ABS:
734 			switch (k) {
735 			case offsetof(struct seccomp_data, nr):
736 				reg_value = sd->nr;
737 				break;
738 			case offsetof(struct seccomp_data, arch):
739 				reg_value = sd->arch;
740 				break;
741 			default:
742 				/* can't optimize (non-constant value load) */
743 				return false;
744 			}
745 			break;
746 		case BPF_RET | BPF_K:
747 			/* reached return with constant values only, check allow */
748 			return k == SECCOMP_RET_ALLOW;
749 		case BPF_JMP | BPF_JA:
750 			pc += insn->k;
751 			break;
752 		case BPF_JMP | BPF_JEQ | BPF_K:
753 		case BPF_JMP | BPF_JGE | BPF_K:
754 		case BPF_JMP | BPF_JGT | BPF_K:
755 		case BPF_JMP | BPF_JSET | BPF_K:
756 			switch (BPF_OP(code)) {
757 			case BPF_JEQ:
758 				op_res = reg_value == k;
759 				break;
760 			case BPF_JGE:
761 				op_res = reg_value >= k;
762 				break;
763 			case BPF_JGT:
764 				op_res = reg_value > k;
765 				break;
766 			case BPF_JSET:
767 				op_res = !!(reg_value & k);
768 				break;
769 			default:
770 				/* can't optimize (unknown jump) */
771 				return false;
772 			}
773 
774 			pc += op_res ? insn->jt : insn->jf;
775 			break;
776 		case BPF_ALU | BPF_AND | BPF_K:
777 			reg_value &= k;
778 			break;
779 		default:
780 			/* can't optimize (unknown insn) */
781 			return false;
782 		}
783 	}
784 
785 	/* ran off the end of the filter?! */
786 	WARN_ON(1);
787 	return false;
788 }
789 
790 static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
791 					 void *bitmap, const void *bitmap_prev,
792 					 size_t bitmap_size, int arch)
793 {
794 	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
795 	struct seccomp_data sd;
796 	int nr;
797 
798 	if (bitmap_prev) {
799 		/* The new filter must be as restrictive as the last. */
800 		bitmap_copy(bitmap, bitmap_prev, bitmap_size);
801 	} else {
802 		/* Before any filters, all syscalls are always allowed. */
803 		bitmap_fill(bitmap, bitmap_size);
804 	}
805 
806 	for (nr = 0; nr < bitmap_size; nr++) {
807 		/* No bitmap change: not a cacheable action. */
808 		if (!test_bit(nr, bitmap))
809 			continue;
810 
811 		sd.nr = nr;
812 		sd.arch = arch;
813 
814 		/* No bitmap change: continue to always allow. */
815 		if (seccomp_is_const_allow(fprog, &sd))
816 			continue;
817 
818 		/*
819 		 * Not a cacheable action: always run filters.
820 		 * atomic clear_bit() not needed, filter not visible yet.
821 		 */
822 		__clear_bit(nr, bitmap);
823 	}
824 }
825 
826 /**
827  * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
828  * @sfilter: The seccomp filter
829  *
830  * Returns 0 if successful or -errno if error occurred.
831  */
832 static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
833 {
834 	struct action_cache *cache = &sfilter->cache;
835 	const struct action_cache *cache_prev =
836 		sfilter->prev ? &sfilter->prev->cache : NULL;
837 
838 	seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
839 				     cache_prev ? cache_prev->allow_native : NULL,
840 				     SECCOMP_ARCH_NATIVE_NR,
841 				     SECCOMP_ARCH_NATIVE);
842 
843 #ifdef SECCOMP_ARCH_COMPAT
844 	seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
845 				     cache_prev ? cache_prev->allow_compat : NULL,
846 				     SECCOMP_ARCH_COMPAT_NR,
847 				     SECCOMP_ARCH_COMPAT);
848 #endif /* SECCOMP_ARCH_COMPAT */
849 }
850 #endif /* SECCOMP_ARCH_NATIVE */
851 
852 /**
853  * seccomp_attach_filter: validate and attach filter
854  * @flags:  flags to change filter behavior
855  * @filter: seccomp filter to add to the current process
856  *
857  * Caller must be holding current->sighand->siglock lock.
858  *
859  * Returns 0 on success, -ve on error, or
860  *   - in TSYNC mode: the pid of a thread which was either not in the correct
861  *     seccomp mode or did not have an ancestral seccomp filter
862  *   - in NEW_LISTENER mode: the fd of the new listener
863  */
864 static long seccomp_attach_filter(unsigned int flags,
865 				  struct seccomp_filter *filter)
866 {
867 	unsigned long total_insns;
868 	struct seccomp_filter *walker;
869 
870 	assert_spin_locked(&current->sighand->siglock);
871 
872 	/* Validate resulting filter length. */
873 	total_insns = filter->prog->len;
874 	for (walker = current->seccomp.filter; walker; walker = walker->prev)
875 		total_insns += walker->prog->len + 4;  /* 4 instr penalty */
876 	if (total_insns > MAX_INSNS_PER_PATH)
877 		return -ENOMEM;
878 
879 	/* If thread sync has been requested, check that it is possible. */
880 	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
881 		int ret;
882 
883 		ret = seccomp_can_sync_threads();
884 		if (ret) {
885 			if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
886 				return -ESRCH;
887 			else
888 				return ret;
889 		}
890 	}
891 
892 	/* Set log flag, if present. */
893 	if (flags & SECCOMP_FILTER_FLAG_LOG)
894 		filter->log = true;
895 
896 	/*
897 	 * If there is an existing filter, make it the prev and don't drop its
898 	 * task reference.
899 	 */
900 	filter->prev = current->seccomp.filter;
901 	seccomp_cache_prepare(filter);
902 	current->seccomp.filter = filter;
903 	atomic_inc(&current->seccomp.filter_count);
904 
905 	/* Now that the new filter is in place, synchronize to all threads. */
906 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
907 		seccomp_sync_threads(flags);
908 
909 	return 0;
910 }
911 
912 static void __get_seccomp_filter(struct seccomp_filter *filter)
913 {
914 	refcount_inc(&filter->refs);
915 }
916 
917 /* get_seccomp_filter - increments the reference count of the filter on @tsk */
918 void get_seccomp_filter(struct task_struct *tsk)
919 {
920 	struct seccomp_filter *orig = tsk->seccomp.filter;
921 	if (!orig)
922 		return;
923 	__get_seccomp_filter(orig);
924 	refcount_inc(&orig->users);
925 }
926 
927 #endif	/* CONFIG_SECCOMP_FILTER */
928 
929 /* Bit flags for seccomp_actions_logged, tested by seccomp_log(). */
930 #define SECCOMP_LOG_KILL_PROCESS	(1 << 0)
931 #define SECCOMP_LOG_KILL_THREAD		(1 << 1)
932 #define SECCOMP_LOG_TRAP		(1 << 2)
933 #define SECCOMP_LOG_ERRNO		(1 << 3)
934 #define SECCOMP_LOG_TRACE		(1 << 4)
935 #define SECCOMP_LOG_LOG			(1 << 5)
936 #define SECCOMP_LOG_ALLOW		(1 << 6)
937 #define SECCOMP_LOG_USER_NOTIF		(1 << 7)
938 
939 static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
940 				    SECCOMP_LOG_KILL_THREAD  |
941 				    SECCOMP_LOG_TRAP  |
942 				    SECCOMP_LOG_ERRNO |
943 				    SECCOMP_LOG_USER_NOTIF |
944 				    SECCOMP_LOG_TRACE |
945 				    SECCOMP_LOG_LOG;	/* SECCOMP_LOG_ALLOW is not part of the initial mask */
946 
947 static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
948 			       bool requested)
949 {
950 	bool log = false;
951 
952 	switch (action) {
953 	case SECCOMP_RET_ALLOW:
954 		break;
955 	case SECCOMP_RET_TRAP:
956 		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
957 		break;
958 	case SECCOMP_RET_ERRNO:
959 		log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
960 		break;
961 	case SECCOMP_RET_TRACE:
962 		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
963 		break;
964 	case SECCOMP_RET_USER_NOTIF:
965 		log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
966 		break;
967 	case SECCOMP_RET_LOG:
968 		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
969 		break;
970 	case SECCOMP_RET_KILL_THREAD:
971 		log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
972 		break;
973 	case SECCOMP_RET_KILL_PROCESS:
974 	default:
975 		log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
976 	}
977 
978 	/*
979 	 * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
980 	 * FILTER_FLAG_LOG bit was set. The admin has the ability to silence
981 	 * any action from being logged by removing the action name from the
982 	 * seccomp_actions_logged sysctl.
983 	 */
984 	if (!log)
985 		return;
986 
987 	audit_seccomp(syscall, signr, action);
988 }
989 
990 /*
991  * Secure computing mode 1 allows only read/write/exit/sigreturn.
992  * To be fully secure this must be combined with rlimit
993  * to limit the stack allocations too.
994  */
995 static const int mode1_syscalls[] = {
996 	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
997 	-1, /* negative terminated */
998 };
999 
1000 static void __secure_computing_strict(int this_syscall)
1001 {
1002 	const int *allowed_syscalls = mode1_syscalls;
1003 #ifdef CONFIG_COMPAT
1004 	if (in_compat_syscall())
1005 		allowed_syscalls = get_compat_mode1_syscalls();
1006 #endif
1007 	do {
1008 		if (*allowed_syscalls == this_syscall)
1009 			return;
1010 	} while (*++allowed_syscalls != -1);
1011 
1012 #ifdef SECCOMP_DEBUG
1013 	dump_stack();
1014 #endif
1015 	current->seccomp.mode = SECCOMP_MODE_DEAD;
1016 	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
1017 	do_exit(SIGKILL);
1018 }
1019 
1020 #ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
1021 void secure_computing_strict(int this_syscall)
1022 {
1023 	int mode = current->seccomp.mode;
1024 
1025 	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
1026 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
1027 		return;
1028 
1029 	if (mode == SECCOMP_MODE_DISABLED)
1030 		return;
1031 	else if (mode == SECCOMP_MODE_STRICT)
1032 		__secure_computing_strict(this_syscall);
1033 	else
1034 		BUG();
1035 }
1036 #else
1037 
1038 #ifdef CONFIG_SECCOMP_FILTER
1039 static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
1040 {
1041 	/*
1042 	 * Note: overflow is ok here, the id just needs to be unique per
1043 	 * filter.
1044 	 */
1045 	lockdep_assert_held(&filter->notify_lock);
1046 	return filter->notif->next_id++;
1047 }
1048 
1049 static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n)
1050 {
1051 	int fd;
1052 
1053 	/*
1054 	 * Remove the notification, and reset the list pointers, indicating
1055 	 * that it has been handled.
1056 	 */
1057 	list_del_init(&addfd->list);
1058 	if (!addfd->setfd)
1059 		fd = receive_fd(addfd->file, addfd->flags);
1060 	else
1061 		fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
1062 	addfd->ret = fd;
1063 
1064 	if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) {
1065 		/* If we fail reset and return an error to the notifier */
1066 		if (fd < 0) {
1067 			n->state = SECCOMP_NOTIFY_SENT;
1068 		} else {
1069 			/* Return the FD we just added */
1070 			n->flags = 0;
1071 			n->error = 0;
1072 			n->val = fd;
1073 		}
1074 	}
1075 
1076 	/*
1077 	 * Mark the notification as completed. From this point, addfd mem
1078 	 * might be invalidated and we can't safely read it anymore.
1079 	 */
1080 	complete(&addfd->completion);
1081 }
1082 
1083 static int seccomp_do_user_notification(int this_syscall,
1084 					struct seccomp_filter *match,
1085 					const struct seccomp_data *sd)
1086 {
1087 	int err;
1088 	u32 flags = 0;
1089 	long ret = 0;
1090 	struct seccomp_knotif n = {};
1091 	struct seccomp_kaddfd *addfd, *tmp;
1092 
1093 	mutex_lock(&match->notify_lock);
1094 	err = -ENOSYS;
1095 	if (!match->notif)
1096 		goto out;
1097 
1098 	n.task = current;
1099 	n.state = SECCOMP_NOTIFY_INIT;
1100 	n.data = sd;
1101 	n.id = seccomp_next_notify_id(match);
1102 	init_completion(&n.ready);
1103 	list_add(&n.list, &match->notif->notifications);
1104 	INIT_LIST_HEAD(&n.addfd);
1105 
1106 	up(&match->notif->request);
1107 	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
1108 
1109 	/*
1110 	 * This is where we wait for a reply from userspace.
1111 	 */
1112 	do {
1113 		mutex_unlock(&match->notify_lock);
1114 		err = wait_for_completion_interruptible(&n.ready);
1115 		mutex_lock(&match->notify_lock);
1116 		if (err != 0)
1117 			goto interrupted;
1118 
1119 		addfd = list_first_entry_or_null(&n.addfd,
1120 						 struct seccomp_kaddfd, list);
1121 		/* Check if we were woken up by a addfd message */
1122 		if (addfd)
1123 			seccomp_handle_addfd(addfd, &n);
1124 
1125 	}  while (n.state != SECCOMP_NOTIFY_REPLIED);
1126 
1127 	ret = n.val;
1128 	err = n.error;
1129 	flags = n.flags;
1130 
1131 interrupted:
1132 	/* If there were any pending addfd calls, clear them out */
1133 	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
1134 		/* The process went away before we got a chance to handle it */
1135 		addfd->ret = -ESRCH;
1136 		list_del_init(&addfd->list);
1137 		complete(&addfd->completion);
1138 	}
1139 
1140 	/*
1141 	 * Note that it's possible the listener died in between the time when
1142 	 * we were notified of a response (or a signal) and when we were able to
1143 	 * re-acquire the lock, so only delete from the list if the
1144 	 * notification actually exists.
1145 	 *
1146 	 * Also note that this test is only valid because there's no way to
1147 	 * *reattach* to a notifier right now. If one is added, we'll need to
1148 	 * keep track of the notif itself and make sure they match here.
1149 	 */
1150 	if (match->notif)
1151 		list_del(&n.list);
1152 out:
1153 	mutex_unlock(&match->notify_lock);
1154 
1155 	/* Userspace requests to continue the syscall. */
1156 	if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
1157 		return 0;
1158 
1159 	syscall_set_return_value(current, current_pt_regs(),
1160 				 err, ret);
1161 	return -1;
1162 }
1163 
/*
 * __seccomp_filter - run the attached filters for one syscall and act on
 * the resulting action
 * @this_syscall:        syscall number being checked
 * @sd:                  pre-populated syscall data, or NULL to have it
 *                       populated here
 * @recheck_after_trace: true when called again after a tracer was
 *                       notified; a SECCOMP_RET_TRACE result is then
 *                       accepted without notifying the tracer again
 *
 * Returns 0 if the syscall should proceed and -1 if it must be skipped.
 * The RET_KILL_* actions either force a fatal SIGSYS and return -1, or
 * exit the thread directly.
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
			    const bool recheck_after_trace)
{
	u32 filter_ret, action;
	struct seccomp_filter *match = NULL;
	int data;
	struct seccomp_data sd_local;

	/*
	 * Make sure that any changes to mode from another thread have
	 * been seen after SYSCALL_WORK_SECCOMP was seen.
	 */
	smp_rmb();

	if (!sd) {
		populate_seccomp_data(&sd_local);
		sd = &sd_local;
	}

	/* Split the filter result into its action and its 16-bit data part. */
	filter_ret = seccomp_run_filters(sd, &match);
	data = filter_ret & SECCOMP_RET_DATA;
	action = filter_ret & SECCOMP_RET_ACTION_FULL;

	switch (action) {
	case SECCOMP_RET_ERRNO:
		/* Set low-order bits as an errno, capped at MAX_ERRNO. */
		if (data > MAX_ERRNO)
			data = MAX_ERRNO;
		syscall_set_return_value(current, current_pt_regs(),
					 -data, 0);
		goto skip;

	case SECCOMP_RET_TRAP:
		/* Show the handler the original registers. */
		syscall_rollback(current, current_pt_regs());
		/* Let the filter pass back 16 bits of data. */
		force_sig_seccomp(this_syscall, data, false);
		goto skip;

	case SECCOMP_RET_TRACE:
		/* We've been put in this state by the ptracer already. */
		if (recheck_after_trace)
			return 0;

		/* ENOSYS these calls if there is no tracer attached. */
		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
			syscall_set_return_value(current,
						 current_pt_regs(),
						 -ENOSYS, 0);
			goto skip;
		}

		/* Allow the BPF to provide the event message */
		ptrace_event(PTRACE_EVENT_SECCOMP, data);
		/*
		 * The delivery of a fatal signal during event
		 * notification may silently skip tracer notification,
		 * which could leave us with a potentially unmodified
		 * syscall that the tracer would have liked to have
		 * changed. Since the process is about to die, we just
		 * force the syscall to be skipped and let the signal
		 * kill the process and correctly handle any tracer exit
		 * notifications.
		 */
		if (fatal_signal_pending(current))
			goto skip;
		/* Check if the tracer forced the syscall to be skipped. */
		this_syscall = syscall_get_nr(current, current_pt_regs());
		if (this_syscall < 0)
			goto skip;

		/*
		 * Recheck the syscall, since it may have changed. This
		 * intentionally uses a NULL struct seccomp_data to force
		 * a reload of all registers. This does not goto skip since
		 * a skip would have already been reported.
		 */
		if (__seccomp_filter(this_syscall, NULL, true))
			return -1;

		return 0;

	case SECCOMP_RET_USER_NOTIF:
		/* Non-zero means the listener did not ask to continue. */
		if (seccomp_do_user_notification(this_syscall, match, sd))
			goto skip;

		return 0;

	case SECCOMP_RET_LOG:
		seccomp_log(this_syscall, 0, action, true);
		return 0;

	case SECCOMP_RET_ALLOW:
		/*
		 * Note that the "match" filter will always be NULL for
		 * this action since SECCOMP_RET_ALLOW is the starting
		 * state in seccomp_run_filters().
		 */
		return 0;

	case SECCOMP_RET_KILL_THREAD:
	case SECCOMP_RET_KILL_PROCESS:
	default:
		/* Mark the task dead first so a surviving task is caught later. */
		current->seccomp.mode = SECCOMP_MODE_DEAD;
		seccomp_log(this_syscall, SIGSYS, action, true);
		/* Dump core only if this is the last remaining thread. */
		if (action != SECCOMP_RET_KILL_THREAD ||
		    (atomic_read(&current->signal->live) == 1)) {
			/* Show the original registers in the dump. */
			syscall_rollback(current, current_pt_regs());
			/* Trigger a coredump with SIGSYS */
			force_sig_seccomp(this_syscall, data, true);
		} else {
			do_exit(SIGSYS);
		}
		return -1; /* skip the syscall go directly to signal handling */
	}

	unreachable();

skip:
	/* Skipped syscalls are logged only if the matching filter asked for it. */
	seccomp_log(this_syscall, 0, action, match ? match->log : false);
	return -1;
}
1288 #else
/*
 * Variant built when CONFIG_SECCOMP_FILTER is not set: being called at
 * all is treated as a kernel bug.
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
			    const bool recheck_after_trace)
{
	BUG();

	return -1;
}
1296 #endif
1297 
1298 int __secure_computing(const struct seccomp_data *sd)
1299 {
1300 	int mode = current->seccomp.mode;
1301 	int this_syscall;
1302 
1303 	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
1304 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
1305 		return 0;
1306 
1307 	this_syscall = sd ? sd->nr :
1308 		syscall_get_nr(current, current_pt_regs());
1309 
1310 	switch (mode) {
1311 	case SECCOMP_MODE_STRICT:
1312 		__secure_computing_strict(this_syscall);  /* may call do_exit */
1313 		return 0;
1314 	case SECCOMP_MODE_FILTER:
1315 		return __seccomp_filter(this_syscall, sd, false);
1316 	/* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
1317 	case SECCOMP_MODE_DEAD:
1318 		WARN_ON_ONCE(1);
1319 		do_exit(SIGKILL);
1320 		return -1;
1321 	default:
1322 		BUG();
1323 	}
1324 }
1325 #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
1326 
1327 long prctl_get_seccomp(void)
1328 {
1329 	return current->seccomp.mode;
1330 }
1331 
1332 /**
1333  * seccomp_set_mode_strict: internal function for setting strict seccomp
1334  *
1335  * Once current->seccomp.mode is non-zero, it may not be changed.
1336  *
1337  * Returns 0 on success or -EINVAL on failure.
1338  */
1339 static long seccomp_set_mode_strict(void)
1340 {
1341 	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
1342 	long ret = -EINVAL;
1343 
1344 	spin_lock_irq(&current->sighand->siglock);
1345 
1346 	if (!seccomp_may_assign_mode(seccomp_mode))
1347 		goto out;
1348 
1349 #ifdef TIF_NOTSC
1350 	disable_TSC();
1351 #endif
1352 	seccomp_assign_mode(current, seccomp_mode, 0);
1353 	ret = 0;
1354 
1355 out:
1356 	spin_unlock_irq(&current->sighand->siglock);
1357 
1358 	return ret;
1359 }
1360 
1361 #ifdef CONFIG_SECCOMP_FILTER
/*
 * seccomp_notify_free - release the notification state of @filter
 *
 * Frees filter->notif and clears the pointer; other code in this file
 * tests the pointer for NULL to tell whether a listener is attached.
 */
static void seccomp_notify_free(struct seccomp_filter *filter)
{
	kfree(filter->notif);
	filter->notif = NULL;
}
1367 
1368 static void seccomp_notify_detach(struct seccomp_filter *filter)
1369 {
1370 	struct seccomp_knotif *knotif;
1371 
1372 	if (!filter)
1373 		return;
1374 
1375 	mutex_lock(&filter->notify_lock);
1376 
1377 	/*
1378 	 * If this file is being closed because e.g. the task who owned it
1379 	 * died, let's wake everyone up who was waiting on us.
1380 	 */
1381 	list_for_each_entry(knotif, &filter->notif->notifications, list) {
1382 		if (knotif->state == SECCOMP_NOTIFY_REPLIED)
1383 			continue;
1384 
1385 		knotif->state = SECCOMP_NOTIFY_REPLIED;
1386 		knotif->error = -ENOSYS;
1387 		knotif->val = 0;
1388 
1389 		/*
1390 		 * We do not need to wake up any pending addfd messages, as
1391 		 * the notifier will do that for us, as this just looks
1392 		 * like a standard reply.
1393 		 */
1394 		complete(&knotif->ready);
1395 	}
1396 
1397 	seccomp_notify_free(filter);
1398 	mutex_unlock(&filter->notify_lock);
1399 }
1400 
1401 static int seccomp_notify_release(struct inode *inode, struct file *file)
1402 {
1403 	struct seccomp_filter *filter = file->private_data;
1404 
1405 	seccomp_notify_detach(filter);
1406 	__put_seccomp_filter(filter);
1407 	return 0;
1408 }
1409 
1410 /* must be called with notif_lock held */
1411 static inline struct seccomp_knotif *
1412 find_notification(struct seccomp_filter *filter, u64 id)
1413 {
1414 	struct seccomp_knotif *cur;
1415 
1416 	lockdep_assert_held(&filter->notify_lock);
1417 
1418 	list_for_each_entry(cur, &filter->notif->notifications, list) {
1419 		if (cur->id == id)
1420 			return cur;
1421 	}
1422 
1423 	return NULL;
1424 }
1425 
1426 
/*
 * seccomp_notify_recv - handle SECCOMP_IOCTL_NOTIF_RECV
 * @filter: filter the listener file belongs to
 * @buf:    userspace struct seccomp_notif; must be all zeroes on entry
 *
 * Waits for a notification in SECCOMP_NOTIFY_INIT state, marks it
 * SECCOMP_NOTIFY_SENT and copies its id, pid and syscall data to @buf.
 *
 * Returns 0 on success, -EINVAL if @buf is not zeroed, -ENOENT if no
 * INIT notification is found after waking, -EFAULT if the copy to
 * userspace fails, or the error from check_zeroed_user() or
 * down_interruptible().
 */
static long seccomp_notify_recv(struct seccomp_filter *filter,
				void __user *buf)
{
	struct seccomp_knotif *knotif = NULL, *cur;
	struct seccomp_notif unotif;
	ssize_t ret;

	/* Verify that we're not given garbage to keep struct extensible. */
	ret = check_zeroed_user(buf, sizeof(unotif));
	if (ret < 0)
		return ret;
	if (!ret)
		return -EINVAL;

	memset(&unotif, 0, sizeof(unotif));

	/* Sleep until a notifying task has signalled a new request. */
	ret = down_interruptible(&filter->notif->request);
	if (ret < 0)
		return ret;

	mutex_lock(&filter->notify_lock);
	list_for_each_entry(cur, &filter->notif->notifications, list) {
		if (cur->state == SECCOMP_NOTIFY_INIT) {
			knotif = cur;
			break;
		}
	}

	/*
	 * If we didn't find a notification, it could be that the task was
	 * interrupted by a fatal signal between the time we were woken and
	 * when we were able to acquire notify_lock.
	 */
	if (!knotif) {
		ret = -ENOENT;
		goto out;
	}

	unotif.id = knotif->id;
	unotif.pid = task_pid_vnr(knotif->task);
	unotif.data = *(knotif->data);

	knotif->state = SECCOMP_NOTIFY_SENT;
	wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
	ret = 0;
out:
	mutex_unlock(&filter->notify_lock);

	/* The copy to userspace is done after notify_lock has been dropped. */
	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
		ret = -EFAULT;

		/*
		 * Userspace screwed up. To make sure that we keep this
		 * notification alive, let's reset it back to INIT. It
		 * may have died when we released the lock, so we need to make
		 * sure it's still around.
		 */
		mutex_lock(&filter->notify_lock);
		knotif = find_notification(filter, unotif.id);
		if (knotif) {
			knotif->state = SECCOMP_NOTIFY_INIT;
			up(&filter->notif->request);
		}
		mutex_unlock(&filter->notify_lock);
	}

	return ret;
}
1495 
1496 static long seccomp_notify_send(struct seccomp_filter *filter,
1497 				void __user *buf)
1498 {
1499 	struct seccomp_notif_resp resp = {};
1500 	struct seccomp_knotif *knotif;
1501 	long ret;
1502 
1503 	if (copy_from_user(&resp, buf, sizeof(resp)))
1504 		return -EFAULT;
1505 
1506 	if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
1507 		return -EINVAL;
1508 
1509 	if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
1510 	    (resp.error || resp.val))
1511 		return -EINVAL;
1512 
1513 	ret = mutex_lock_interruptible(&filter->notify_lock);
1514 	if (ret < 0)
1515 		return ret;
1516 
1517 	knotif = find_notification(filter, resp.id);
1518 	if (!knotif) {
1519 		ret = -ENOENT;
1520 		goto out;
1521 	}
1522 
1523 	/* Allow exactly one reply. */
1524 	if (knotif->state != SECCOMP_NOTIFY_SENT) {
1525 		ret = -EINPROGRESS;
1526 		goto out;
1527 	}
1528 
1529 	ret = 0;
1530 	knotif->state = SECCOMP_NOTIFY_REPLIED;
1531 	knotif->error = resp.error;
1532 	knotif->val = resp.val;
1533 	knotif->flags = resp.flags;
1534 	complete(&knotif->ready);
1535 out:
1536 	mutex_unlock(&filter->notify_lock);
1537 	return ret;
1538 }
1539 
1540 static long seccomp_notify_id_valid(struct seccomp_filter *filter,
1541 				    void __user *buf)
1542 {
1543 	struct seccomp_knotif *knotif;
1544 	u64 id;
1545 	long ret;
1546 
1547 	if (copy_from_user(&id, buf, sizeof(id)))
1548 		return -EFAULT;
1549 
1550 	ret = mutex_lock_interruptible(&filter->notify_lock);
1551 	if (ret < 0)
1552 		return ret;
1553 
1554 	knotif = find_notification(filter, id);
1555 	if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
1556 		ret = 0;
1557 	else
1558 		ret = -ENOENT;
1559 
1560 	mutex_unlock(&filter->notify_lock);
1561 	return ret;
1562 }
1563 
/*
 * seccomp_notify_addfd - handle SECCOMP_IOCTL_NOTIF_ADDFD
 * @filter: filter the listener file belongs to
 * @uaddfd: userspace struct seccomp_notif_addfd
 * @size:   size of @uaddfd as passed by the caller
 *
 * Queues an addfd request on the notification named by addfd.id, wakes
 * the notifying task and waits until that task has processed the request
 * or this wait is interrupted.
 *
 * Returns the value the processing side stored in kaddfd.ret once the
 * request has been handled, or a negative errno: -EINVAL for a bad size
 * or bad flags, -EBADF if srcfd cannot be resolved, -ENOENT for an
 * unknown id, -EINPROGRESS if the notification is not in SENT state,
 * -EBUSY for a SEND request while other addfd requests are queued, or
 * the error from an interrupted lock or wait.
 */
static long seccomp_notify_addfd(struct seccomp_filter *filter,
				 struct seccomp_notif_addfd __user *uaddfd,
				 unsigned int size)
{
	struct seccomp_notif_addfd addfd;
	struct seccomp_knotif *knotif;
	struct seccomp_kaddfd kaddfd;
	int ret;

	BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
	BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);

	if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
		return -EINVAL;

	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
	if (ret)
		return ret;

	if (addfd.newfd_flags & ~O_CLOEXEC)
		return -EINVAL;

	if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND))
		return -EINVAL;

	/* A specific target fd is only meaningful together with SETFD. */
	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
		return -EINVAL;

	/* Reference on the source file; dropped at "out". */
	kaddfd.file = fget(addfd.srcfd);
	if (!kaddfd.file)
		return -EBADF;

	kaddfd.ioctl_flags = addfd.flags;
	kaddfd.flags = addfd.newfd_flags;
	kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD;
	kaddfd.fd = addfd.newfd;
	init_completion(&kaddfd.completion);

	ret = mutex_lock_interruptible(&filter->notify_lock);
	if (ret < 0)
		goto out;

	knotif = find_notification(filter, addfd.id);
	if (!knotif) {
		ret = -ENOENT;
		goto out_unlock;
	}

	/*
	 * We do not want to allow for FD injection to occur before the
	 * notification has been picked up by a userspace handler, or after
	 * the notification has been replied to.
	 */
	if (knotif->state != SECCOMP_NOTIFY_SENT) {
		ret = -EINPROGRESS;
		goto out_unlock;
	}

	if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) {
		/*
		 * Disallow queuing an atomic addfd + send reply while there are
		 * some addfd requests still to process.
		 *
		 * There is no clear reason to support it and allows us to keep
		 * the loop on the other side straight-forward.
		 */
		if (!list_empty(&knotif->addfd)) {
			ret = -EBUSY;
			goto out_unlock;
		}

		/* Allow exactly only one reply */
		knotif->state = SECCOMP_NOTIFY_REPLIED;
	}

	/* Queue the on-stack request and wake the notifying task to process it. */
	list_add(&kaddfd.list, &knotif->addfd);
	complete(&knotif->ready);
	mutex_unlock(&filter->notify_lock);

	/* Now we wait for it to be processed or be interrupted */
	ret = wait_for_completion_interruptible(&kaddfd.completion);
	if (ret == 0) {
		/*
		 * We had a successful completion. The other side has already
		 * removed us from the addfd queue, and
		 * wait_for_completion_interruptible has a memory barrier upon
		 * success that lets us read this value directly without
		 * locking.
		 */
		ret = kaddfd.ret;
		goto out;
	}

	mutex_lock(&filter->notify_lock);
	/*
	 * Even though we were woken up by a signal and not a successful
	 * completion, a completion may have happened in the mean time.
	 *
	 * We need to check again if the addfd request has been handled,
	 * and if not, we will remove it from the queue.
	 */
	if (list_empty(&kaddfd.list))
		ret = kaddfd.ret;
	else
		list_del(&kaddfd.list);

out_unlock:
	mutex_unlock(&filter->notify_lock);
out:
	fput(kaddfd.file);

	return ret;
}
1677 
1678 static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
1679 				 unsigned long arg)
1680 {
1681 	struct seccomp_filter *filter = file->private_data;
1682 	void __user *buf = (void __user *)arg;
1683 
1684 	/* Fixed-size ioctls */
1685 	switch (cmd) {
1686 	case SECCOMP_IOCTL_NOTIF_RECV:
1687 		return seccomp_notify_recv(filter, buf);
1688 	case SECCOMP_IOCTL_NOTIF_SEND:
1689 		return seccomp_notify_send(filter, buf);
1690 	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
1691 	case SECCOMP_IOCTL_NOTIF_ID_VALID:
1692 		return seccomp_notify_id_valid(filter, buf);
1693 	}
1694 
1695 	/* Extensible Argument ioctls */
1696 #define EA_IOCTL(cmd)	((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
1697 	switch (EA_IOCTL(cmd)) {
1698 	case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
1699 		return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
1700 	default:
1701 		return -EINVAL;
1702 	}
1703 }
1704 
1705 static __poll_t seccomp_notify_poll(struct file *file,
1706 				    struct poll_table_struct *poll_tab)
1707 {
1708 	struct seccomp_filter *filter = file->private_data;
1709 	__poll_t ret = 0;
1710 	struct seccomp_knotif *cur;
1711 
1712 	poll_wait(file, &filter->wqh, poll_tab);
1713 
1714 	if (mutex_lock_interruptible(&filter->notify_lock) < 0)
1715 		return EPOLLERR;
1716 
1717 	list_for_each_entry(cur, &filter->notif->notifications, list) {
1718 		if (cur->state == SECCOMP_NOTIFY_INIT)
1719 			ret |= EPOLLIN | EPOLLRDNORM;
1720 		if (cur->state == SECCOMP_NOTIFY_SENT)
1721 			ret |= EPOLLOUT | EPOLLWRNORM;
1722 		if ((ret & EPOLLIN) && (ret & EPOLLOUT))
1723 			break;
1724 	}
1725 
1726 	mutex_unlock(&filter->notify_lock);
1727 
1728 	if (refcount_read(&filter->users) == 0)
1729 		ret |= EPOLLHUP;
1730 
1731 	return ret;
1732 }
1733 
1734 static const struct file_operations seccomp_notify_ops = {
1735 	.poll = seccomp_notify_poll,
1736 	.release = seccomp_notify_release,
1737 	.unlocked_ioctl = seccomp_notify_ioctl,
1738 	.compat_ioctl = seccomp_notify_ioctl,
1739 };
1740 
1741 static struct file *init_listener(struct seccomp_filter *filter)
1742 {
1743 	struct file *ret;
1744 
1745 	ret = ERR_PTR(-ENOMEM);
1746 	filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
1747 	if (!filter->notif)
1748 		goto out;
1749 
1750 	sema_init(&filter->notif->request, 0);
1751 	filter->notif->next_id = get_random_u64();
1752 	INIT_LIST_HEAD(&filter->notif->notifications);
1753 
1754 	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
1755 				 filter, O_RDWR);
1756 	if (IS_ERR(ret))
1757 		goto out_notif;
1758 
1759 	/* The file has a reference to it now */
1760 	__get_seccomp_filter(filter);
1761 
1762 out_notif:
1763 	if (IS_ERR(ret))
1764 		seccomp_notify_free(filter);
1765 out:
1766 	return ret;
1767 }
1768 
1769 /*
1770  * Does @new_child have a listener while an ancestor also has a listener?
1771  * If so, we'll want to reject this filter.
1772  * This only has to be tested for the current process, even in the TSYNC case,
1773  * because TSYNC installs @child with the same parent on all threads.
1774  * Note that @new_child is not hooked up to its parent at this point yet, so
1775  * we use current->seccomp.filter.
1776  */
1777 static bool has_duplicate_listener(struct seccomp_filter *new_child)
1778 {
1779 	struct seccomp_filter *cur;
1780 
1781 	/* must be protected against concurrent TSYNC */
1782 	lockdep_assert_held(&current->sighand->siglock);
1783 
1784 	if (!new_child->notif)
1785 		return false;
1786 	for (cur = current->seccomp.filter; cur; cur = cur->prev) {
1787 		if (cur->notif)
1788 			return true;
1789 	}
1790 
1791 	return false;
1792 }
1793 
/**
 * seccomp_set_mode_filter: internal function for setting seccomp filter
 * @flags:  flags to change filter behavior
 * @filter: struct sock_fprog containing filter
 *
 * This function may be called repeatedly to install additional filters.
 * Every filter successfully installed will be evaluated (in reverse order)
 * for each system call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success, or the new listener fd when
 * SECCOMP_FILTER_FLAG_NEW_LISTENER was requested. On failure returns
 * -EINVAL, -EBUSY if a listener already exists on an ancestor filter, or
 * whatever error filter preparation, fd allocation, listener creation or
 * seccomp_attach_filter() reported.
 */
static long seccomp_set_mode_filter(unsigned int flags,
				    const char __user *filter)
{
	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
	struct seccomp_filter *prepared = NULL;
	long ret = -EINVAL;
	int listener = -1;
	struct file *listener_f = NULL;

	/* Validate flags. */
	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
		return -EINVAL;

	/*
	 * In the successful case, NEW_LISTENER returns the new listener fd.
	 * But in the failure case, TSYNC returns the thread that died. If you
	 * combine these two flags, there's no way to tell whether something
	 * succeeded or failed. So, let's disallow this combination if the user
	 * has not explicitly requested no errors from TSYNC.
	 */
	if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
	    (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) &&
	    ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
		return -EINVAL;

	/* Prepare the new filter before holding any locks. */
	prepared = seccomp_prepare_user_filter(filter);
	if (IS_ERR(prepared))
		return PTR_ERR(prepared);

	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
		/* Reserve the fd number now; it is installed only on success. */
		listener = get_unused_fd_flags(O_CLOEXEC);
		if (listener < 0) {
			ret = listener;
			goto out_free;
		}

		listener_f = init_listener(prepared);
		if (IS_ERR(listener_f)) {
			put_unused_fd(listener);
			ret = PTR_ERR(listener_f);
			goto out_free;
		}
	}

	/*
	 * Make sure we cannot change seccomp or nnp state via TSYNC
	 * while another thread is in the middle of calling exec.
	 */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
	    mutex_lock_killable(&current->signal->cred_guard_mutex))
		goto out_put_fd;

	spin_lock_irq(&current->sighand->siglock);

	if (!seccomp_may_assign_mode(seccomp_mode))
		goto out;

	if (has_duplicate_listener(prepared)) {
		ret = -EBUSY;
		goto out;
	}

	ret = seccomp_attach_filter(flags, prepared);
	if (ret)
		goto out;
	/* Do not free the successfully attached filter. */
	prepared = NULL;

	seccomp_assign_mode(current, seccomp_mode, flags);
out:
	spin_unlock_irq(&current->sighand->siglock);
	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
		mutex_unlock(&current->signal->cred_guard_mutex);
out_put_fd:
	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
		if (ret) {
			/*
			 * Failure: clear private_data before dropping the
			 * file so its release handler sees no filter, then
			 * detach the listener from @prepared here.
			 */
			listener_f->private_data = NULL;
			fput(listener_f);
			put_unused_fd(listener);
			seccomp_notify_detach(prepared);
		} else {
			fd_install(listener, listener_f);
			ret = listener;
		}
	}
out_free:
	/* @prepared is NULL here when the filter was attached successfully. */
	seccomp_filter_free(prepared);
	return ret;
}
1897 #else
/*
 * Variant built when CONFIG_SECCOMP_FILTER is not set: filter mode is
 * unavailable, so every request fails with -EINVAL.
 */
static inline long seccomp_set_mode_filter(unsigned int flags,
					   const char __user *filter)
{
	return -EINVAL;
}
1903 #endif
1904 
1905 static long seccomp_get_action_avail(const char __user *uaction)
1906 {
1907 	u32 action;
1908 
1909 	if (copy_from_user(&action, uaction, sizeof(action)))
1910 		return -EFAULT;
1911 
1912 	switch (action) {
1913 	case SECCOMP_RET_KILL_PROCESS:
1914 	case SECCOMP_RET_KILL_THREAD:
1915 	case SECCOMP_RET_TRAP:
1916 	case SECCOMP_RET_ERRNO:
1917 	case SECCOMP_RET_USER_NOTIF:
1918 	case SECCOMP_RET_TRACE:
1919 	case SECCOMP_RET_LOG:
1920 	case SECCOMP_RET_ALLOW:
1921 		break;
1922 	default:
1923 		return -EOPNOTSUPP;
1924 	}
1925 
1926 	return 0;
1927 }
1928 
1929 static long seccomp_get_notif_sizes(void __user *usizes)
1930 {
1931 	struct seccomp_notif_sizes sizes = {
1932 		.seccomp_notif = sizeof(struct seccomp_notif),
1933 		.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
1934 		.seccomp_data = sizeof(struct seccomp_data),
1935 	};
1936 
1937 	if (copy_to_user(usizes, &sizes, sizeof(sizes)))
1938 		return -EFAULT;
1939 
1940 	return 0;
1941 }
1942 
1943 /* Common entry point for both prctl and syscall. */
1944 static long do_seccomp(unsigned int op, unsigned int flags,
1945 		       void __user *uargs)
1946 {
1947 	switch (op) {
1948 	case SECCOMP_SET_MODE_STRICT:
1949 		if (flags != 0 || uargs != NULL)
1950 			return -EINVAL;
1951 		return seccomp_set_mode_strict();
1952 	case SECCOMP_SET_MODE_FILTER:
1953 		return seccomp_set_mode_filter(flags, uargs);
1954 	case SECCOMP_GET_ACTION_AVAIL:
1955 		if (flags != 0)
1956 			return -EINVAL;
1957 
1958 		return seccomp_get_action_avail(uargs);
1959 	case SECCOMP_GET_NOTIF_SIZES:
1960 		if (flags != 0)
1961 			return -EINVAL;
1962 
1963 		return seccomp_get_notif_sizes(uargs);
1964 	default:
1965 		return -EINVAL;
1966 	}
1967 }
1968 
/* seccomp(2) system call: forwards its arguments unchanged to do_seccomp(). */
SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
			 void __user *, uargs)
{
	return do_seccomp(op, flags, uargs);
}
1974 
1975 /**
1976  * prctl_set_seccomp: configures current->seccomp.mode
1977  * @seccomp_mode: requested mode to use
1978  * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
1979  *
1980  * Returns 0 on success or -EINVAL on failure.
1981  */
1982 long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
1983 {
1984 	unsigned int op;
1985 	void __user *uargs;
1986 
1987 	switch (seccomp_mode) {
1988 	case SECCOMP_MODE_STRICT:
1989 		op = SECCOMP_SET_MODE_STRICT;
1990 		/*
1991 		 * Setting strict mode through prctl always ignored filter,
1992 		 * so make sure it is always NULL here to pass the internal
1993 		 * check in do_seccomp().
1994 		 */
1995 		uargs = NULL;
1996 		break;
1997 	case SECCOMP_MODE_FILTER:
1998 		op = SECCOMP_SET_MODE_FILTER;
1999 		uargs = filter;
2000 		break;
2001 	default:
2002 		return -EINVAL;
2003 	}
2004 
2005 	/* prctl interface doesn't have flags, so they are always zero. */
2006 	return do_seccomp(op, 0, uargs);
2007 }
2008 
2009 #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
2010 static struct seccomp_filter *get_nth_filter(struct task_struct *task,
2011 					     unsigned long filter_off)
2012 {
2013 	struct seccomp_filter *orig, *filter;
2014 	unsigned long count;
2015 
2016 	/*
2017 	 * Note: this is only correct because the caller should be the (ptrace)
2018 	 * tracer of the task, otherwise lock_task_sighand is needed.
2019 	 */
2020 	spin_lock_irq(&task->sighand->siglock);
2021 
2022 	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
2023 		spin_unlock_irq(&task->sighand->siglock);
2024 		return ERR_PTR(-EINVAL);
2025 	}
2026 
2027 	orig = task->seccomp.filter;
2028 	__get_seccomp_filter(orig);
2029 	spin_unlock_irq(&task->sighand->siglock);
2030 
2031 	count = 0;
2032 	for (filter = orig; filter; filter = filter->prev)
2033 		count++;
2034 
2035 	if (filter_off >= count) {
2036 		filter = ERR_PTR(-ENOENT);
2037 		goto out;
2038 	}
2039 
2040 	count -= filter_off;
2041 	for (filter = orig; filter && count > 1; filter = filter->prev)
2042 		count--;
2043 
2044 	if (WARN_ON(count != 1 || !filter)) {
2045 		filter = ERR_PTR(-ENOENT);
2046 		goto out;
2047 	}
2048 
2049 	__get_seccomp_filter(filter);
2050 
2051 out:
2052 	__put_seccomp_filter(orig);
2053 	return filter;
2054 }
2055 
2056 long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
2057 			void __user *data)
2058 {
2059 	struct seccomp_filter *filter;
2060 	struct sock_fprog_kern *fprog;
2061 	long ret;
2062 
2063 	if (!capable(CAP_SYS_ADMIN) ||
2064 	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
2065 		return -EACCES;
2066 	}
2067 
2068 	filter = get_nth_filter(task, filter_off);
2069 	if (IS_ERR(filter))
2070 		return PTR_ERR(filter);
2071 
2072 	fprog = filter->prog->orig_prog;
2073 	if (!fprog) {
2074 		/* This must be a new non-cBPF filter, since we save
2075 		 * every cBPF filter's orig_prog above when
2076 		 * CONFIG_CHECKPOINT_RESTORE is enabled.
2077 		 */
2078 		ret = -EMEDIUMTYPE;
2079 		goto out;
2080 	}
2081 
2082 	ret = fprog->len;
2083 	if (!data)
2084 		goto out;
2085 
2086 	if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
2087 		ret = -EFAULT;
2088 
2089 out:
2090 	__put_seccomp_filter(filter);
2091 	return ret;
2092 }
2093 
2094 long seccomp_get_metadata(struct task_struct *task,
2095 			  unsigned long size, void __user *data)
2096 {
2097 	long ret;
2098 	struct seccomp_filter *filter;
2099 	struct seccomp_metadata kmd = {};
2100 
2101 	if (!capable(CAP_SYS_ADMIN) ||
2102 	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
2103 		return -EACCES;
2104 	}
2105 
2106 	size = min_t(unsigned long, size, sizeof(kmd));
2107 
2108 	if (size < sizeof(kmd.filter_off))
2109 		return -EINVAL;
2110 
2111 	if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off)))
2112 		return -EFAULT;
2113 
2114 	filter = get_nth_filter(task, kmd.filter_off);
2115 	if (IS_ERR(filter))
2116 		return PTR_ERR(filter);
2117 
2118 	if (filter->log)
2119 		kmd.flags |= SECCOMP_FILTER_FLAG_LOG;
2120 
2121 	ret = size;
2122 	if (copy_to_user(data, &kmd, size))
2123 		ret = -EFAULT;
2124 
2125 	__put_seccomp_filter(filter);
2126 	return ret;
2127 }
2128 #endif
2129 
2130 #ifdef CONFIG_SYSCTL
2131 
/* Human-readable action names for friendly sysctl interaction. */
#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process"
#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread"
#define SECCOMP_RET_TRAP_NAME "trap"
#define SECCOMP_RET_ERRNO_NAME "errno"
#define SECCOMP_RET_USER_NOTIF_NAME "user_notif"
#define SECCOMP_RET_TRACE_NAME "trace"
#define SECCOMP_RET_LOG_NAME "log"
#define SECCOMP_RET_ALLOW_NAME "allow"

/*
 * Every action name joined by single spaces. Its size also bounds the
 * scratch buffers used when formatting or parsing lists of action names.
 */
static const char seccomp_actions_avail[] =
	SECCOMP_RET_KILL_PROCESS_NAME " "
	SECCOMP_RET_KILL_THREAD_NAME " "
	SECCOMP_RET_TRAP_NAME " "
	SECCOMP_RET_ERRNO_NAME " "
	SECCOMP_RET_USER_NOTIF_NAME " "
	SECCOMP_RET_TRACE_NAME " "
	SECCOMP_RET_LOG_NAME " "
	SECCOMP_RET_ALLOW_NAME;
2151 
2152 struct seccomp_log_name {
2153 	u32		log;
2154 	const char	*name;
2155 };
2156 
2157 static const struct seccomp_log_name seccomp_log_names[] = {
2158 	{ SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
2159 	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
2160 	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
2161 	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
2162 	{ SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
2163 	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
2164 	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
2165 	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
2166 	{ }
2167 };
2168 
2169 static bool seccomp_names_from_actions_logged(char *names, size_t size,
2170 					      u32 actions_logged,
2171 					      const char *sep)
2172 {
2173 	const struct seccomp_log_name *cur;
2174 	bool append_sep = false;
2175 
2176 	for (cur = seccomp_log_names; cur->name && size; cur++) {
2177 		ssize_t ret;
2178 
2179 		if (!(actions_logged & cur->log))
2180 			continue;
2181 
2182 		if (append_sep) {
2183 			ret = strscpy(names, sep, size);
2184 			if (ret < 0)
2185 				return false;
2186 
2187 			names += ret;
2188 			size -= ret;
2189 		} else
2190 			append_sep = true;
2191 
2192 		ret = strscpy(names, cur->name, size);
2193 		if (ret < 0)
2194 			return false;
2195 
2196 		names += ret;
2197 		size -= ret;
2198 	}
2199 
2200 	return true;
2201 }
2202 
2203 static bool seccomp_action_logged_from_name(u32 *action_logged,
2204 					    const char *name)
2205 {
2206 	const struct seccomp_log_name *cur;
2207 
2208 	for (cur = seccomp_log_names; cur->name; cur++) {
2209 		if (!strcmp(cur->name, name)) {
2210 			*action_logged = cur->log;
2211 			return true;
2212 		}
2213 	}
2214 
2215 	return false;
2216 }
2217 
2218 static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
2219 {
2220 	char *name;
2221 
2222 	*actions_logged = 0;
2223 	while ((name = strsep(&names, " ")) && *name) {
2224 		u32 action_logged = 0;
2225 
2226 		if (!seccomp_action_logged_from_name(&action_logged, name))
2227 			return false;
2228 
2229 		*actions_logged |= action_logged;
2230 	}
2231 
2232 	return true;
2233 }
2234 
2235 static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
2236 			       size_t *lenp, loff_t *ppos)
2237 {
2238 	char names[sizeof(seccomp_actions_avail)];
2239 	struct ctl_table table;
2240 
2241 	memset(names, 0, sizeof(names));
2242 
2243 	if (!seccomp_names_from_actions_logged(names, sizeof(names),
2244 					       seccomp_actions_logged, " "))
2245 		return -EINVAL;
2246 
2247 	table = *ro_table;
2248 	table.data = names;
2249 	table.maxlen = sizeof(names);
2250 	return proc_dostring(&table, 0, buffer, lenp, ppos);
2251 }
2252 
2253 static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
2254 				size_t *lenp, loff_t *ppos, u32 *actions_logged)
2255 {
2256 	char names[sizeof(seccomp_actions_avail)];
2257 	struct ctl_table table;
2258 	int ret;
2259 
2260 	if (!capable(CAP_SYS_ADMIN))
2261 		return -EPERM;
2262 
2263 	memset(names, 0, sizeof(names));
2264 
2265 	table = *ro_table;
2266 	table.data = names;
2267 	table.maxlen = sizeof(names);
2268 	ret = proc_dostring(&table, 1, buffer, lenp, ppos);
2269 	if (ret)
2270 		return ret;
2271 
2272 	if (!seccomp_actions_logged_from_names(actions_logged, table.data))
2273 		return -EINVAL;
2274 
2275 	if (*actions_logged & SECCOMP_LOG_ALLOW)
2276 		return -EINVAL;
2277 
2278 	seccomp_actions_logged = *actions_logged;
2279 	return 0;
2280 }
2281 
2282 static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
2283 				 int ret)
2284 {
2285 	char names[sizeof(seccomp_actions_avail)];
2286 	char old_names[sizeof(seccomp_actions_avail)];
2287 	const char *new = names;
2288 	const char *old = old_names;
2289 
2290 	if (!audit_enabled)
2291 		return;
2292 
2293 	memset(names, 0, sizeof(names));
2294 	memset(old_names, 0, sizeof(old_names));
2295 
2296 	if (ret)
2297 		new = "?";
2298 	else if (!actions_logged)
2299 		new = "(none)";
2300 	else if (!seccomp_names_from_actions_logged(names, sizeof(names),
2301 						    actions_logged, ","))
2302 		new = "?";
2303 
2304 	if (!old_actions_logged)
2305 		old = "(none)";
2306 	else if (!seccomp_names_from_actions_logged(old_names,
2307 						    sizeof(old_names),
2308 						    old_actions_logged, ","))
2309 		old = "?";
2310 
2311 	return audit_seccomp_actions_logged(new, old, !ret);
2312 }
2313 
2314 static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
2315 					  void *buffer, size_t *lenp,
2316 					  loff_t *ppos)
2317 {
2318 	int ret;
2319 
2320 	if (write) {
2321 		u32 actions_logged = 0;
2322 		u32 old_actions_logged = seccomp_actions_logged;
2323 
2324 		ret = write_actions_logged(ro_table, buffer, lenp, ppos,
2325 					   &actions_logged);
2326 		audit_actions_logged(actions_logged, old_actions_logged, ret);
2327 	} else
2328 		ret = read_actions_logged(ro_table, buffer, lenp, ppos);
2329 
2330 	return ret;
2331 }
2332 
2333 static struct ctl_path seccomp_sysctl_path[] = {
2334 	{ .procname = "kernel", },
2335 	{ .procname = "seccomp", },
2336 	{ }
2337 };
2338 
2339 static struct ctl_table seccomp_sysctl_table[] = {
2340 	{
2341 		.procname	= "actions_avail",
2342 		.data		= (void *) &seccomp_actions_avail,
2343 		.maxlen		= sizeof(seccomp_actions_avail),
2344 		.mode		= 0444,
2345 		.proc_handler	= proc_dostring,
2346 	},
2347 	{
2348 		.procname	= "actions_logged",
2349 		.mode		= 0644,
2350 		.proc_handler	= seccomp_actions_logged_handler,
2351 	},
2352 	{ }
2353 };
2354 
2355 static int __init seccomp_sysctl_init(void)
2356 {
2357 	struct ctl_table_header *hdr;
2358 
2359 	hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
2360 	if (!hdr)
2361 		pr_warn("sysctl registration failed\n");
2362 	else
2363 		kmemleak_not_leak(hdr);
2364 
2365 	return 0;
2366 }
2367 
2368 device_initcall(seccomp_sysctl_init)
2369 
2370 #endif /* CONFIG_SYSCTL */
2371 
2372 #ifdef CONFIG_SECCOMP_CACHE_DEBUG
2373 /* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
2374 static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
2375 					const void *bitmap, size_t bitmap_size)
2376 {
2377 	int nr;
2378 
2379 	for (nr = 0; nr < bitmap_size; nr++) {
2380 		bool cached = test_bit(nr, bitmap);
2381 		char *status = cached ? "ALLOW" : "FILTER";
2382 
2383 		seq_printf(m, "%s %d %s\n", name, nr, status);
2384 	}
2385 }
2386 
2387 int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
2388 			   struct pid *pid, struct task_struct *task)
2389 {
2390 	struct seccomp_filter *f;
2391 	unsigned long flags;
2392 
2393 	/*
2394 	 * We don't want some sandboxed process to know what their seccomp
2395 	 * filters consist of.
2396 	 */
2397 	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
2398 		return -EACCES;
2399 
2400 	if (!lock_task_sighand(task, &flags))
2401 		return -ESRCH;
2402 
2403 	f = READ_ONCE(task->seccomp.filter);
2404 	if (!f) {
2405 		unlock_task_sighand(task, &flags);
2406 		return 0;
2407 	}
2408 
2409 	/* prevent filter from being freed while we are printing it */
2410 	__get_seccomp_filter(f);
2411 	unlock_task_sighand(task, &flags);
2412 
2413 	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
2414 				    f->cache.allow_native,
2415 				    SECCOMP_ARCH_NATIVE_NR);
2416 
2417 #ifdef SECCOMP_ARCH_COMPAT
2418 	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
2419 				    f->cache.allow_compat,
2420 				    SECCOMP_ARCH_COMPAT_NR);
2421 #endif /* SECCOMP_ARCH_COMPAT */
2422 
2423 	__put_seccomp_filter(f);
2424 	return 0;
2425 }
2426 #endif /* CONFIG_SECCOMP_CACHE_DEBUG */
2427