/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 * of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>

/* #define SECCOMP_DEBUG 1 */

#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @prev: points to a previously installed, or inherited, filter
 * @len: the number of instructions in the program
 * @insns: the BPF program instructions to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited
 * filter.  However, multiple filters may share a @prev node, by way of
 * fork(), which results in a unidirectional tree existing in memory.
 * This is similar to how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
	atomic_t usage;
	struct seccomp_filter *prev;
	unsigned short len;	/* Instruction count */
	struct sock_filter insns[];
};

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))

/**
 * get_u32 - returns a u32 offset into data
 * @data: an unsigned 64-bit value
 * @index: 0 or 1 to return the first or second 32 bits
 *
 * This inline exists to hide the length of unsigned long.  If a 32-bit
 * unsigned long is passed in, it will be extended and the top 32 bits
 * will be 0.  If it is a 64-bit unsigned long, then whatever data is
 * resident will be properly returned.
 *
 * Endianness is explicitly ignored and left for BPF program authors to
 * manage as per the specific architecture.
 */
static inline u32 get_u32(u64 data, int index)
{
	return ((u32 *)&data)[index];
}

/* Helper for bpf_load below. */
#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
/**
 * bpf_load: checks and returns a pointer to the requested offset
 * @off: offset into struct seccomp_data to load from
 *
 * Returns the requested 32 bits of data.
 * seccomp_check_filter() should assure that @off is 32-bit aligned
 * and not out of bounds.  Failure to do so is a BUG.
 */
u32 seccomp_bpf_load(int off)
{
	struct pt_regs *regs = task_pt_regs(current);

	if (off == BPF_DATA(nr))
		return syscall_get_nr(current, regs);
	if (off == BPF_DATA(arch))
		return syscall_get_arch(current, regs);
	if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
		unsigned long value;
		int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
		int index = !!(off % sizeof(u64));

		syscall_get_arguments(current, regs, arg, 1, &value);
		return get_u32(value, index);
	}
	if (off == BPF_DATA(instruction_pointer))
		return get_u32(KSTK_EIP(current), 0);
	if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
		return get_u32(KSTK_EIP(current), 1);
	/* seccomp_check_filter should make this impossible. */
	BUG();
}
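
/*
 * Illustrative only, not part of the code above: a filter wanting the
 * second syscall argument issues a 32-bit absolute load that
 * seccomp_check_filter() redirects into seccomp_bpf_load(), e.g.:
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *		 offsetof(struct seccomp_data, args[1]))
 *
 * which reaches this function with @off == BPF_DATA(args[1]), so that
 * arg == 1 and index == 0 (the low half of the u64 on a little-endian
 * machine).
 */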

/**
 * seccomp_check_filter - verify seccomp filter code
 * @filter: filter to verify
 * @flen: length of filter
 *
 * Takes a previously checked filter (by sk_chk_filter) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load.  It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
	int pc;

	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_S_LD_W_ABS:
			ftest->code = BPF_S_ANC_SECCOMP_LD_W;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct seccomp_data) || k & 3)
				return -EINVAL;
			continue;
		case BPF_S_LD_W_LEN:
			ftest->code = BPF_S_LD_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		case BPF_S_LDX_W_LEN:
			ftest->code = BPF_S_LDX_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_S_RET_K:
		case BPF_S_RET_A:
		case BPF_S_ALU_ADD_K:
		case BPF_S_ALU_ADD_X:
		case BPF_S_ALU_SUB_K:
		case BPF_S_ALU_SUB_X:
		case BPF_S_ALU_MUL_K:
		case BPF_S_ALU_MUL_X:
		case BPF_S_ALU_DIV_X:
		case BPF_S_ALU_AND_K:
		case BPF_S_ALU_AND_X:
		case BPF_S_ALU_OR_K:
		case BPF_S_ALU_OR_X:
		case BPF_S_ALU_XOR_K:
		case BPF_S_ALU_XOR_X:
		case BPF_S_ALU_LSH_K:
		case BPF_S_ALU_LSH_X:
		case BPF_S_ALU_RSH_K:
		case BPF_S_ALU_RSH_X:
		case BPF_S_ALU_NEG:
		case BPF_S_LD_IMM:
		case BPF_S_LDX_IMM:
		case BPF_S_MISC_TAX:
		case BPF_S_MISC_TXA:
		case BPF_S_ALU_DIV_K:
		case BPF_S_LD_MEM:
		case BPF_S_LDX_MEM:
		case BPF_S_ST:
		case BPF_S_STX:
		case BPF_S_JMP_JA:
		case BPF_S_JMP_JEQ_K:
		case BPF_S_JMP_JEQ_X:
		case BPF_S_JMP_JGE_K:
		case BPF_S_JMP_JGE_X:
		case BPF_S_JMP_JGT_K:
		case BPF_S_JMP_JGT_X:
		case BPF_S_JMP_JSET_K:
		case BPF_S_JMP_JSET_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
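
/*
 * For illustration (a sketch, not ABI documentation): the smallest
 * program that passes both sk_chk_filter() and the checks above is a
 * lone unconditional return, e.g.:
 *
 *	struct sock_filter allow_all[] = {
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *	};
 *
 * sk_chk_filter() classifies the return as BPF_S_RET_K, which the
 * switch above explicitly permits.
 */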

/**
 * seccomp_run_filters - evaluates all seccomp filters against @syscall
 * @syscall: number of the current system call
 *
 * Returns valid seccomp BPF response codes.
 */
static u32 seccomp_run_filters(int syscall)
{
	struct seccomp_filter *f;
	u32 ret = SECCOMP_RET_ALLOW;

	/* Ensure unexpected behavior doesn't result in failing open. */
	if (WARN_ON(current->seccomp.filter == NULL))
		return SECCOMP_RET_KILL;

	/*
	 * All filters in the list are evaluated and the lowest BPF return
	 * value always takes priority (ignoring the DATA).
	 */
	for (f = current->seccomp.filter; f; f = f->prev) {
		u32 cur_ret = sk_run_filter(NULL, f->insns);
		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
			ret = cur_ret;
	}
	return ret;
}
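
/*
 * A worked example of the precedence rule above, using the action
 * values from <linux/seccomp.h>: if the newest filter returns
 * SECCOMP_RET_ALLOW (0x7fff0000) but an inherited one returns
 * SECCOMP_RET_ERRNO | EPERM (0x00050001), the ERRNO action wins because
 * 0x00050000 < 0x7fff0000 once masked with SECCOMP_RET_ACTION.  A task
 * can therefore only ever tighten, never loosen, its sandbox by
 * attaching more filters.
 */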

/**
 * seccomp_attach_filter: Attaches a seccomp filter to current.
 * @fprog: BPF program to install
 *
 * Returns 0 on success or an errno on failure.
 */
static long seccomp_attach_filter(struct sock_fprog *fprog)
{
	struct seccomp_filter *filter;
	unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
	unsigned long total_insns = fprog->len;
	long ret;

	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
		return -EINVAL;

	for (filter = current->seccomp.filter; filter; filter = filter->prev)
		total_insns += filter->len + 4;	/* include a 4 instr penalty */
	if (total_insns > MAX_INSNS_PER_PATH)
		return -ENOMEM;

	/*
	 * Installing a seccomp filter requires that the task have
	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
	 * This avoids scenarios where unprivileged tasks can affect the
	 * behavior of privileged children.
	 */
	if (!current->no_new_privs &&
	    security_capable_noaudit(current_cred(), current_user_ns(),
				     CAP_SYS_ADMIN) != 0)
		return -EACCES;

	/* Allocate a new seccomp_filter */
	filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
			 GFP_KERNEL|__GFP_NOWARN);
	if (!filter)
		return -ENOMEM;
	atomic_set(&filter->usage, 1);
	filter->len = fprog->len;

	/* Copy the instructions from fprog. */
	ret = -EFAULT;
	if (copy_from_user(filter->insns, fprog->filter, fp_size))
		goto fail;

	/* Check and rewrite the fprog via the skb checker */
	ret = sk_chk_filter(filter->insns, filter->len);
	if (ret)
		goto fail;

	/* Check and rewrite the fprog for seccomp use */
	ret = seccomp_check_filter(filter->insns, filter->len);
	if (ret)
		goto fail;

	/*
	 * If there is an existing filter, make it the prev and don't drop its
	 * task reference.
	 */
	filter->prev = current->seccomp.filter;
	current->seccomp.filter = filter;
	return 0;

fail:
	kfree(filter);
	return ret;
}

/**
 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns 0 on success and non-zero otherwise.
 */
long seccomp_attach_user_filter(char __user *user_filter)
{
	struct sock_fprog fprog;
	long ret = -EFAULT;

#ifdef CONFIG_COMPAT
	if (is_compat_task()) {
		struct compat_sock_fprog fprog32;
		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
			goto out;
		fprog.len = fprog32.len;
		fprog.filter = compat_ptr(fprog32.filter);
	} else /* falls through to the if below. */
#endif
	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
		goto out;
	ret = seccomp_attach_filter(&fprog);
out:
	return ret;
}
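
/*
 * For reference, a sketch (not part of this file) of what a caller
 * passes in from userland: a sock_fprog built from the BPF macros in
 * <linux/filter.h>, installed via prctl().  This example kills getpid()
 * and allows everything else; a production filter would also check
 * seccomp_data.arch:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct seccomp_data, nr)),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *	};
 *	struct sock_fprog prog = { .len = 4, .filter = insns };
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 *
 * The PR_SET_NO_NEW_PRIVS call satisfies the no_new_privs check in
 * seccomp_attach_filter() for callers without CAP_SYS_ADMIN.
 */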

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	if (!orig)
		return;
	/* Reference count is bounded by the number of total processes. */
	atomic_inc(&orig->usage);
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	/* Clean up single-reference branches iteratively. */
	while (orig && atomic_dec_and_test(&orig->usage)) {
		struct seccomp_filter *freeme = orig;
		orig = orig->prev;
		kfree(freeme);
	}
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
	struct siginfo info;
	memset(&info, 0, sizeof(info));
	info.si_signo = SIGSYS;
	info.si_code = SYS_SECCOMP;
	info.si_call_addr = (void __user *)KSTK_EIP(current);
	info.si_errno = reason;
	info.si_arch = syscall_get_arch(current, task_pt_regs(current));
	info.si_syscall = syscall;
	force_sig_info(SIGSYS, &info, current);
}
#endif	/* CONFIG_SECCOMP_FILTER */

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static int mode1_syscalls[] = {
	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
	0, /* null terminated */
};

#ifdef CONFIG_COMPAT
static int mode1_syscalls_32[] = {
	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
	0, /* null terminated */
};
#endif

int __secure_computing(int this_syscall)
{
	int mode = current->seccomp.mode;
	int exit_sig = 0;
	int *syscall;
	u32 ret;

	switch (mode) {
	case SECCOMP_MODE_STRICT:
		syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
		if (is_compat_task())
			syscall = mode1_syscalls_32;
#endif
		do {
			if (*syscall == this_syscall)
				return 0;
		} while (*++syscall);
		exit_sig = SIGKILL;
		ret = SECCOMP_RET_KILL;
		break;
#ifdef CONFIG_SECCOMP_FILTER
	case SECCOMP_MODE_FILTER: {
		int data;
		struct pt_regs *regs = task_pt_regs(current);
		ret = seccomp_run_filters(this_syscall);
		data = ret & SECCOMP_RET_DATA;
		ret &= SECCOMP_RET_ACTION;
		switch (ret) {
		case SECCOMP_RET_ERRNO:
			/* Set the low-order 16 bits as an errno. */
			syscall_set_return_value(current, regs,
						 -data, 0);
			goto skip;
		case SECCOMP_RET_TRAP:
			/* Show the handler the original registers. */
			syscall_rollback(current, regs);
			/* Let the filter pass back 16 bits of data. */
			seccomp_send_sigsys(this_syscall, data);
			goto skip;
		case SECCOMP_RET_TRACE:
			/* Skip these calls if there is no tracer. */
			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
				syscall_set_return_value(current, regs,
							 -ENOSYS, 0);
				goto skip;
			}
			/* Allow the BPF to provide the event message */
			ptrace_event(PTRACE_EVENT_SECCOMP, data);
			/*
			 * The delivery of a fatal signal during event
			 * notification may silently skip tracer notification.
			 * Terminating the task now avoids executing a system
			 * call that may not be intended.
			 */
			if (fatal_signal_pending(current))
				break;
			if (syscall_get_nr(current, regs) < 0)
				goto skip;	/* Explicit request to skip. */

			return 0;
		case SECCOMP_RET_ALLOW:
			return 0;
		case SECCOMP_RET_KILL:
		default:
			break;
		}
		exit_sig = SIGSYS;
		break;
	}
#endif
	default:
		BUG();
	}

#ifdef SECCOMP_DEBUG
	dump_stack();
#endif
	audit_seccomp(this_syscall, exit_sig, ret);
	do_exit(exit_sig);
#ifdef CONFIG_SECCOMP_FILTER
skip:
	audit_seccomp(this_syscall, exit_sig, ret);
#endif
	return -1;
}

long prctl_get_seccomp(void)
{
	return current->seccomp.mode;
}

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * This function may be called repeatedly with a @seccomp_mode of
 * SECCOMP_MODE_FILTER to install additional filters.  Every filter
 * successfully installed will be evaluated (in reverse order) for each
 * system call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
	long ret = -EINVAL;

	if (current->seccomp.mode &&
	    current->seccomp.mode != seccomp_mode)
		goto out;

	switch (seccomp_mode) {
	case SECCOMP_MODE_STRICT:
		ret = 0;
#ifdef TIF_NOTSC
		disable_TSC();
#endif
		break;
#ifdef CONFIG_SECCOMP_FILTER
	case SECCOMP_MODE_FILTER:
		ret = seccomp_attach_user_filter(filter);
		if (ret)
			goto out;
		break;
#endif
	default:
		goto out;
	}

	current->seccomp.mode = seccomp_mode;
	set_thread_flag(TIF_SECCOMP);
out:
	return ret;
}
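
/*
 * Likewise a sketch, for illustration only, of the userland half of
 * SECCOMP_RET_TRAP (see seccomp_send_sigsys() above): with sufficiently
 * new userspace headers, an SA_SIGINFO handler for SIGSYS can recover
 * the trapped syscall and the filter's 16 bits of data:
 *
 *	static void sigsys_handler(int sig, siginfo_t *info, void *ctx)
 *	{
 *		if (info->si_code != SYS_SECCOMP)
 *			return;
 *		handle_trapped_call(info->si_syscall, info->si_errno);
 *	}
 *
 * handle_trapped_call() is a hypothetical callback standing in for
 * whatever emulation the process performs; si_errno carries the
 * SECCOMP_RET_DATA bits supplied by the filter.
 */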