/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 * of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>

/* #define SECCOMP_DEBUG 1 */

#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @prev: points to a previously installed, or inherited, filter
 * @len: the number of instructions in the program
 * @insnsi: the BPF program instructions to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited
 * filter.  However, multiple filters may share a @prev node, by way of
 * fork(), which results in a unidirectional tree existing in memory.
 * This is similar to how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
	atomic_t usage;
	struct seccomp_filter *prev;
	unsigned short len;	/* Instruction count */
	struct sock_filter_int insnsi[];
};

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))

/*
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
	struct task_struct *task = current;
	struct pt_regs *regs = task_pt_regs(task);

	sd->nr = syscall_get_nr(task, regs);
	sd->arch = syscall_get_arch();

	/* Unroll syscall_get_arguments() to help gcc on arm. */
	syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]);
	syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]);
	syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]);
	syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]);
	syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]);
	syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]);

	sd->instruction_pointer = KSTK_EIP(task);
}
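
/*
 * Illustrative example (not part of the kernel source): a userspace
 * filter reaches the fields populated above with 32-bit aligned
 * absolute loads, e.g.
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *		 offsetof(struct seccomp_data, nr))
 *
 * fetches the syscall number into the accumulator.  Loads at offsets
 * into struct seccomp_data that are unaligned or out of bounds are
 * rejected by seccomp_check_filter() below.
 */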
/**
 * seccomp_check_filter - verify seccomp filter code
 * @filter: filter to verify
 * @flen: length of filter
 *
 * Takes a previously checked filter (by sk_chk_filter) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load.  It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
	int pc;
	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_S_LD_W_ABS:
			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct seccomp_data) || k & 3)
				return -EINVAL;
			continue;
		case BPF_S_LD_W_LEN:
			ftest->code = BPF_LD | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		case BPF_S_LDX_W_LEN:
			ftest->code = BPF_LDX | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_S_RET_K:
		case BPF_S_RET_A:
		case BPF_S_ALU_ADD_K:
		case BPF_S_ALU_ADD_X:
		case BPF_S_ALU_SUB_K:
		case BPF_S_ALU_SUB_X:
		case BPF_S_ALU_MUL_K:
		case BPF_S_ALU_MUL_X:
		case BPF_S_ALU_DIV_X:
		case BPF_S_ALU_AND_K:
		case BPF_S_ALU_AND_X:
		case BPF_S_ALU_OR_K:
		case BPF_S_ALU_OR_X:
		case BPF_S_ALU_XOR_K:
		case BPF_S_ALU_XOR_X:
		case BPF_S_ALU_LSH_K:
		case BPF_S_ALU_LSH_X:
		case BPF_S_ALU_RSH_K:
		case BPF_S_ALU_RSH_X:
		case BPF_S_ALU_NEG:
		case BPF_S_LD_IMM:
		case BPF_S_LDX_IMM:
		case BPF_S_MISC_TAX:
		case BPF_S_MISC_TXA:
		case BPF_S_ALU_DIV_K:
		case BPF_S_LD_MEM:
		case BPF_S_LDX_MEM:
		case BPF_S_ST:
		case BPF_S_STX:
		case BPF_S_JMP_JA:
		case BPF_S_JMP_JEQ_K:
		case BPF_S_JMP_JEQ_X:
		case BPF_S_JMP_JGE_K:
		case BPF_S_JMP_JGE_X:
		case BPF_S_JMP_JGT_K:
		case BPF_S_JMP_JGT_X:
		case BPF_S_JMP_JSET_K:
		case BPF_S_JMP_JSET_X:
			sk_decode_filter(ftest, ftest);
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}

/**
 * seccomp_run_filters - evaluates all seccomp filters against @syscall
 * @syscall: number of the current system call
 *
 * Returns valid seccomp BPF response codes.
 */
static u32 seccomp_run_filters(int syscall)
{
	struct seccomp_filter *f;
	struct seccomp_data sd;
	u32 ret = SECCOMP_RET_ALLOW;

	/* Ensure unexpected behavior doesn't result in failing open. */
	if (WARN_ON(current->seccomp.filter == NULL))
		return SECCOMP_RET_KILL;

	populate_seccomp_data(&sd);

	/*
	 * All filters in the list are evaluated and the lowest BPF return
	 * value always takes priority (ignoring the DATA).
	 */
	for (f = current->seccomp.filter; f; f = f->prev) {
		u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi);
		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
			ret = cur_ret;
	}
	return ret;
}
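
/*
 * Example of the precedence rule above (illustrative only): if an
 * inherited filter returns SECCOMP_RET_ALLOW (0x7fff0000) but a more
 * recently attached one returns SECCOMP_RET_ERRNO (0x00050000), the
 * ERRNO action wins because its action bits compare numerically
 * lower.  SECCOMP_RET_KILL (0x00000000) therefore takes priority over
 * every other action.
 */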
/**
 * seccomp_attach_filter: Attaches a seccomp filter to current.
 * @fprog: BPF program to install
 *
 * Returns 0 on success or an errno on failure.
 */
static long seccomp_attach_filter(struct sock_fprog *fprog)
{
	struct seccomp_filter *filter;
	unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
	unsigned long total_insns = fprog->len;
	struct sock_filter *fp;
	int new_len;
	long ret;

	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
		return -EINVAL;

	for (filter = current->seccomp.filter; filter; filter = filter->prev)
		total_insns += filter->len + 4;  /* include a 4 instr penalty */
	if (total_insns > MAX_INSNS_PER_PATH)
		return -ENOMEM;

	/*
	 * Installing a seccomp filter requires that the task have
	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
	 * This avoids scenarios where unprivileged tasks can affect the
	 * behavior of privileged children.
	 */
	if (!current->no_new_privs &&
	    security_capable_noaudit(current_cred(), current_user_ns(),
				     CAP_SYS_ADMIN) != 0)
		return -EACCES;

	fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
	if (!fp)
		return -ENOMEM;

	/* Copy the instructions from fprog. */
	ret = -EFAULT;
	if (copy_from_user(fp, fprog->filter, fp_size))
		goto free_prog;

	/* Check and rewrite the fprog via the skb checker */
	ret = sk_chk_filter(fp, fprog->len);
	if (ret)
		goto free_prog;

	/* Check and rewrite the fprog for seccomp use */
	ret = seccomp_check_filter(fp, fprog->len);
	if (ret)
		goto free_prog;

	/* Convert 'sock_filter' insns to 'sock_filter_int' insns */
	ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
	if (ret)
		goto free_prog;

	/* Allocate a new seccomp_filter */
	ret = -ENOMEM;
	filter = kzalloc(sizeof(struct seccomp_filter) +
			 sizeof(struct sock_filter_int) * new_len,
			 GFP_KERNEL|__GFP_NOWARN);
	if (!filter)
		goto free_prog;

	ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len);
	if (ret)
		goto free_filter;
	kfree(fp);

	atomic_set(&filter->usage, 1);
	filter->len = new_len;

	/*
	 * If there is an existing filter, make it the prev and don't drop its
	 * task reference.
	 */
	filter->prev = current->seccomp.filter;
	current->seccomp.filter = filter;
	return 0;

free_filter:
	kfree(filter);
free_prog:
	kfree(fp);
	return ret;
}

/**
 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns 0 on success and non-zero otherwise.
 */
static long seccomp_attach_user_filter(char __user *user_filter)
{
	struct sock_fprog fprog;
	long ret = -EFAULT;

#ifdef CONFIG_COMPAT
	if (is_compat_task()) {
		struct compat_sock_fprog fprog32;
		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
			goto out;
		fprog.len = fprog32.len;
		fprog.filter = compat_ptr(fprog32.filter);
	} else /* falls through to the if below. */
#endif
	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
		goto out;
	ret = seccomp_attach_filter(&fprog);
out:
	return ret;
}
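
/*
 * Illustrative userspace sketch (not kernel code), assuming an
 * already-built struct sock_fprog named "prog": a filter is typically
 * installed with
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 *
 * The first call satisfies the no_new_privs requirement checked in
 * seccomp_attach_filter() for tasks without CAP_SYS_ADMIN.
 */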
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	if (!orig)
		return;
	/* Reference count is bounded by the number of total processes. */
	atomic_inc(&orig->usage);
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	/* Clean up single-reference branches iteratively. */
	while (orig && atomic_dec_and_test(&orig->usage)) {
		struct seccomp_filter *freeme = orig;
		orig = orig->prev;
		kfree(freeme);
	}
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
	struct siginfo info;
	memset(&info, 0, sizeof(info));
	info.si_signo = SIGSYS;
	info.si_code = SYS_SECCOMP;
	info.si_call_addr = (void __user *)KSTK_EIP(current);
	info.si_errno = reason;
	info.si_arch = syscall_get_arch();
	info.si_syscall = syscall;
	force_sig_info(SIGSYS, &info, current);
}
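
/*
 * Illustrative only: a userspace SIGSYS handler installed with
 * sigaction() and SA_SIGINFO can distinguish seccomp-generated signals
 * by checking siginfo->si_code == SYS_SECCOMP, then recover the denied
 * syscall number from si_syscall and the filter's 16 bits of
 * SECCOMP_RET_DATA from si_errno, as filled in above.
 */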
#endif	/* CONFIG_SECCOMP_FILTER */

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static int mode1_syscalls[] = {
	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
	0, /* null terminated */
};

#ifdef CONFIG_COMPAT
static int mode1_syscalls_32[] = {
	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
	0, /* null terminated */
};
#endif

int __secure_computing(int this_syscall)
{
	int mode = current->seccomp.mode;
	int exit_sig = 0;
	int *syscall;
	u32 ret;

	switch (mode) {
	case SECCOMP_MODE_STRICT:
		syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
		if (is_compat_task())
			syscall = mode1_syscalls_32;
#endif
		do {
			if (*syscall == this_syscall)
				return 0;
		} while (*++syscall);
		exit_sig = SIGKILL;
		ret = SECCOMP_RET_KILL;
		break;
#ifdef CONFIG_SECCOMP_FILTER
	case SECCOMP_MODE_FILTER: {
		int data;
		struct pt_regs *regs = task_pt_regs(current);
		ret = seccomp_run_filters(this_syscall);
		data = ret & SECCOMP_RET_DATA;
		ret &= SECCOMP_RET_ACTION;
		switch (ret) {
		case SECCOMP_RET_ERRNO:
			/* Set the low-order 16 bits as an errno. */
			syscall_set_return_value(current, regs,
						 -data, 0);
			goto skip;
		case SECCOMP_RET_TRAP:
			/* Show the handler the original registers. */
			syscall_rollback(current, regs);
			/* Let the filter pass back 16 bits of data. */
			seccomp_send_sigsys(this_syscall, data);
			goto skip;
		case SECCOMP_RET_TRACE:
			/* Skip these calls if there is no tracer. */
			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
				syscall_set_return_value(current, regs,
							 -ENOSYS, 0);
				goto skip;
			}
			/* Allow the BPF to provide the event message */
			ptrace_event(PTRACE_EVENT_SECCOMP, data);
			/*
			 * The delivery of a fatal signal during event
			 * notification may silently skip tracer notification.
			 * Terminating the task now avoids executing a system
			 * call that may not be intended.
			 */
			if (fatal_signal_pending(current))
				break;
			if (syscall_get_nr(current, regs) < 0)
				goto skip;  /* Explicit request to skip. */

			return 0;
		case SECCOMP_RET_ALLOW:
			return 0;
		case SECCOMP_RET_KILL:
		default:
			break;
		}
		exit_sig = SIGSYS;
		break;
	}
#endif
	default:
		BUG();
	}

#ifdef SECCOMP_DEBUG
	dump_stack();
#endif
	audit_seccomp(this_syscall, exit_sig, ret);
	do_exit(exit_sig);
#ifdef CONFIG_SECCOMP_FILTER
skip:
	audit_seccomp(this_syscall, exit_sig, ret);
#endif
	return -1;
}

long prctl_get_seccomp(void)
{
	return current->seccomp.mode;
}

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * This function may be called repeatedly with a @seccomp_mode of
 * SECCOMP_MODE_FILTER to install additional filters.  Every filter
 * successfully installed will be evaluated (in reverse order) for each system
 * call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
	long ret = -EINVAL;

	if (current->seccomp.mode &&
	    current->seccomp.mode != seccomp_mode)
		goto out;

	switch (seccomp_mode) {
	case SECCOMP_MODE_STRICT:
		ret = 0;
#ifdef TIF_NOTSC
		disable_TSC();
#endif
		break;
#ifdef CONFIG_SECCOMP_FILTER
	case SECCOMP_MODE_FILTER:
		ret = seccomp_attach_user_filter(filter);
		if (ret)
			goto out;
		break;
#endif
	default:
		goto out;
	}

	current->seccomp.mode = seccomp_mode;
	set_thread_flag(TIF_SECCOMP);
out:
	return ret;
}
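
/*
 * Illustrative only: a filter that wants a denied syscall to fail with
 * EPERM rather than kill the task returns
 *
 *	BPF_STMT(BPF_RET | BPF_K,
 *		 SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA))
 *
 * __secure_computing() above then masks off SECCOMP_RET_DATA and
 * stores -EPERM as the syscall return value via
 * syscall_set_return_value().
 */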