/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 * of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>

/* #define SECCOMP_DEBUG 1 */

#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory.  This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
	atomic_t usage;
	struct seccomp_filter *prev;
	struct sk_filter *prog;
};

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))

/*
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
	struct task_struct *task = current;
	struct pt_regs *regs = task_pt_regs(task);
	unsigned long args[6];

	sd->nr = syscall_get_nr(task, regs);
	sd->arch = syscall_get_arch();
	syscall_get_arguments(task, regs, 0, 6, args);
	sd->args[0] = args[0];
	sd->args[1] = args[1];
	sd->args[2] = args[2];
	sd->args[3] = args[3];
	sd->args[4] = args[4];
	sd->args[5] = args[5];
	sd->instruction_pointer = KSTK_EIP(task);
}
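
/*
 * For reference, a sketch of the UAPI layout that filters program against
 * and that populate_seccomp_data() above fills in (see
 * include/uapi/linux/seccomp.h for the authoritative definition):
 *
 *	struct seccomp_data {
 *		int nr;
 *		__u32 arch;
 *		__u64 instruction_pointer;
 *		__u64 args[6];
 *	};
 *
 * Every field starts on a 32-bit boundary, which is what the alignment and
 * bounds checks in seccomp_check_filter() below rely on.
 */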

/**
 * seccomp_check_filter - verify seccomp filter code
 * @filter: filter to verify
 * @flen: length of filter
 *
 * Takes a previously checked filter (by sk_chk_filter) and
 * rewrites all instructions that load struct sk_buff data,
 * or related data, into loads from struct seccomp_data.  It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
	int pc;
	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_LD | BPF_W | BPF_ABS:
			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct seccomp_data) || k & 3)
				return -EINVAL;
			continue;
		case BPF_LD | BPF_W | BPF_LEN:
			ftest->code = BPF_LD | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		case BPF_LDX | BPF_W | BPF_LEN:
			ftest->code = BPF_LDX | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_RET | BPF_K:
		case BPF_RET | BPF_A:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
		case BPF_MISC | BPF_TAX:
		case BPF_MISC | BPF_TXA:
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
		case BPF_JMP | BPF_JA:
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}

/**
 * seccomp_run_filters - evaluates all seccomp filters against @syscall
 * @syscall: number of the current system call
 *
 * Returns valid seccomp BPF response codes.
 */
static u32 seccomp_run_filters(int syscall)
{
	struct seccomp_filter *f;
	struct seccomp_data sd;
	u32 ret = SECCOMP_RET_ALLOW;

	/* Ensure unexpected behavior doesn't result in failing open. */
	if (WARN_ON(current->seccomp.filter == NULL))
		return SECCOMP_RET_KILL;

	populate_seccomp_data(&sd);

	/*
	 * All filters in the list are evaluated and the lowest BPF return
	 * value always takes priority (ignoring the DATA), i.e. KILL wins
	 * over TRAP, TRAP over ERRNO, ERRNO over TRACE, and TRACE over ALLOW.
	 */
	for (f = current->seccomp.filter; f; f = f->prev) {
		u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);

		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
			ret = cur_ret;
	}
	return ret;
}
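
/*
 * Illustrative only (userland, not part of this file): a minimal classic BPF
 * program of the kind seccomp_check_filter() accepts and
 * seccomp_run_filters() executes.  The BPF_LD|BPF_W|BPF_ABS load of the
 * syscall number is exactly the instruction rewritten above, and the return
 * values use the SECCOMP_RET_* encoding consumed by __secure_computing().
 * __NR_getpid and EPERM are just example choices:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct seccomp_data, nr)),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K,
 *			 SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *	};
 *
 * This fails getpid() with EPERM and allows every other syscall.
 */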

/**
 * seccomp_attach_filter: Attaches a seccomp filter to current.
 * @fprog: BPF program to install
 *
 * Returns 0 on success or an errno on failure.
 */
static long seccomp_attach_filter(struct sock_fprog *fprog)
{
	struct seccomp_filter *filter;
	unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
	unsigned long total_insns = fprog->len;
	struct sock_filter *fp;
	int new_len;
	long ret;

	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
		return -EINVAL;

	for (filter = current->seccomp.filter; filter; filter = filter->prev)
		total_insns += filter->prog->len + 4;  /* include a 4 instr penalty */
	if (total_insns > MAX_INSNS_PER_PATH)
		return -ENOMEM;

	/*
	 * Installing a seccomp filter requires that the task has
	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
	 * This avoids scenarios where unprivileged tasks can affect the
	 * behavior of privileged children.
	 */
	if (!current->no_new_privs &&
	    security_capable_noaudit(current_cred(), current_user_ns(),
				     CAP_SYS_ADMIN) != 0)
		return -EACCES;

	fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
	if (!fp)
		return -ENOMEM;

	/* Copy the instructions from fprog. */
	ret = -EFAULT;
	if (copy_from_user(fp, fprog->filter, fp_size))
		goto free_prog;

	/* Check and rewrite the fprog via the skb checker */
	ret = sk_chk_filter(fp, fprog->len);
	if (ret)
		goto free_prog;

	/* Check and rewrite the fprog for seccomp use */
	ret = seccomp_check_filter(fp, fprog->len);
	if (ret)
		goto free_prog;

	/* Convert 'sock_filter' insns to 'sock_filter_int' insns */
	ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
	if (ret)
		goto free_prog;

	/* Allocate a new seccomp_filter */
	ret = -ENOMEM;
	filter = kzalloc(sizeof(struct seccomp_filter),
			 GFP_KERNEL|__GFP_NOWARN);
	if (!filter)
		goto free_prog;

	filter->prog = kzalloc(sk_filter_size(new_len),
			       GFP_KERNEL|__GFP_NOWARN);
	if (!filter->prog)
		goto free_filter;

	ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
	if (ret)
		goto free_filter_prog;
	kfree(fp);

	atomic_set(&filter->usage, 1);
	filter->prog->len = new_len;

	sk_filter_select_runtime(filter->prog);

	/*
	 * If there is an existing filter, make it the prev and don't drop its
	 * task reference.
	 */
	filter->prev = current->seccomp.filter;
	current->seccomp.filter = filter;
	return 0;

free_filter_prog:
	kfree(filter->prog);
free_filter:
	kfree(filter);
free_prog:
	kfree(fp);
	return ret;
}
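
/*
 * Illustrative only: the handle copied in by seccomp_attach_user_filter()
 * below is a plain struct sock_fprog (see include/uapi/linux/filter.h),
 * roughly:
 *
 *	struct sock_fprog {
 *		unsigned short len;		   (number of BPF instructions)
 *		struct sock_filter __user *filter;
 *	};
 *
 * A 32-bit task on a 64-bit kernel passes the compat layout, whose filter
 * pointer is 32 bits wide, which is why the copy below is split on
 * CONFIG_COMPAT.
 */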

/**
 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns 0 on success and non-zero otherwise.
 */
static long seccomp_attach_user_filter(char __user *user_filter)
{
	struct sock_fprog fprog;
	long ret = -EFAULT;

#ifdef CONFIG_COMPAT
	if (is_compat_task()) {
		struct compat_sock_fprog fprog32;
		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
			goto out;
		fprog.len = fprog32.len;
		fprog.filter = compat_ptr(fprog32.filter);
	} else /* falls through to the if below. */
#endif
	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
		goto out;
	ret = seccomp_attach_filter(&fprog);
out:
	return ret;
}

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	if (!orig)
		return;
	/* Reference count is bounded by the number of total processes. */
	atomic_inc(&orig->usage);
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	/* Clean up single-reference branches iteratively. */
	while (orig && atomic_dec_and_test(&orig->usage)) {
		struct seccomp_filter *freeme = orig;
		orig = orig->prev;
		sk_filter_free(freeme->prog);
		kfree(freeme);
	}
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
	struct siginfo info;
	memset(&info, 0, sizeof(info));
	info.si_signo = SIGSYS;
	info.si_code = SYS_SECCOMP;
	info.si_call_addr = (void __user *)KSTK_EIP(current);
	info.si_errno = reason;
	info.si_arch = syscall_get_arch();
	info.si_syscall = syscall;
	force_sig_info(SIGSYS, &info, current);
}
#endif	/* CONFIG_SECCOMP_FILTER */

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static int mode1_syscalls[] = {
	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
	0, /* null terminated */
};

#ifdef CONFIG_COMPAT
static int mode1_syscalls_32[] = {
	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
	0, /* null terminated */
};
#endif

int __secure_computing(int this_syscall)
{
	int mode = current->seccomp.mode;
	int exit_sig = 0;
	int *syscall;
	u32 ret;

	switch (mode) {
	case SECCOMP_MODE_STRICT:
		syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
		if (is_compat_task())
			syscall = mode1_syscalls_32;
#endif
		do {
			if (*syscall == this_syscall)
				return 0;
		} while (*++syscall);
		exit_sig = SIGKILL;
		ret = SECCOMP_RET_KILL;
		break;
#ifdef CONFIG_SECCOMP_FILTER
	case SECCOMP_MODE_FILTER: {
		int data;
		struct pt_regs *regs = task_pt_regs(current);
		ret = seccomp_run_filters(this_syscall);
		data = ret & SECCOMP_RET_DATA;
		ret &= SECCOMP_RET_ACTION;
		switch (ret) {
		case SECCOMP_RET_ERRNO:
			/* Set the low-order 16 bits as an errno. */
			syscall_set_return_value(current, regs,
						 -data, 0);
			goto skip;
		case SECCOMP_RET_TRAP:
			/* Show the handler the original registers. */
			syscall_rollback(current, regs);
			/* Let the filter pass back 16 bits of data. */
			seccomp_send_sigsys(this_syscall, data);
			goto skip;
		case SECCOMP_RET_TRACE:
			/* Skip these calls if there is no tracer. */
			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
				syscall_set_return_value(current, regs,
							 -ENOSYS, 0);
				goto skip;
			}
			/* Allow the BPF to provide the event message */
			ptrace_event(PTRACE_EVENT_SECCOMP, data);
			/*
			 * The delivery of a fatal signal during event
			 * notification may silently skip tracer notification.
			 * Terminating the task now avoids executing a system
			 * call that may not be intended.
			 */
			if (fatal_signal_pending(current))
				break;
			if (syscall_get_nr(current, regs) < 0)
				goto skip;  /* Explicit request to skip. */

			return 0;
		case SECCOMP_RET_ALLOW:
			return 0;
		case SECCOMP_RET_KILL:
		default:
			break;
		}
		exit_sig = SIGSYS;
		break;
	}
#endif
	default:
		BUG();
	}

#ifdef SECCOMP_DEBUG
	dump_stack();
#endif
	audit_seccomp(this_syscall, exit_sig, ret);
	do_exit(exit_sig);
#ifdef CONFIG_SECCOMP_FILTER
skip:
	audit_seccomp(this_syscall, exit_sig, ret);
#endif
	return -1;
}
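
/*
 * Illustrative only (userland, not part of this file): the tracer side of
 * SECCOMP_RET_TRACE.  A tracer that set PTRACE_O_TRACESECCOMP gets a
 * PTRACE_EVENT_SECCOMP stop for each matching syscall and can read the
 * filter's 16 data bits, e.g.
 *
 *	unsigned long msg;
 *	ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACESECCOMP);
 *	...
 *	ptrace(PTRACE_GETEVENTMSG, pid, 0, &msg);   (msg holds the data bits)
 *
 * Rewriting the syscall number to -1 at this stop makes the kernel skip the
 * call, matching the syscall_get_nr() check in __secure_computing() above.
 */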

long prctl_get_seccomp(void)
{
	return current->seccomp.mode;
}

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * This function may be called repeatedly with a @seccomp_mode of
 * SECCOMP_MODE_FILTER to install additional filters.  Every filter
 * successfully installed will be evaluated (in reverse order) for each system
 * call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or an errno on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
	long ret = -EINVAL;

	if (current->seccomp.mode &&
	    current->seccomp.mode != seccomp_mode)
		goto out;

	switch (seccomp_mode) {
	case SECCOMP_MODE_STRICT:
		ret = 0;
#ifdef TIF_NOTSC
		disable_TSC();
#endif
		break;
#ifdef CONFIG_SECCOMP_FILTER
	case SECCOMP_MODE_FILTER:
		ret = seccomp_attach_user_filter(filter);
		if (ret)
			goto out;
		break;
#endif
	default:
		goto out;
	}

	current->seccomp.mode = seccomp_mode;
	set_thread_flag(TIF_SECCOMP);
out:
	return ret;
}
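
/*
 * Illustrative only (userland, not part of this file): with a struct
 * sock_fprog 'prog' pointing at a program such as the 'insns' example
 * sketched after seccomp_check_filter(), an unprivileged task installs it as
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 *
 * no_new_privs (or CAP_SYS_ADMIN) is required by seccomp_attach_filter();
 * repeating the PR_SET_SECCOMP call stacks additional filters, and the most
 * restrictive result across all of them wins.
 */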