1 /* 2 * QEMU seccomp mode 2 support with libseccomp 3 * 4 * Copyright IBM, Corp. 2012 5 * 6 * Authors: 7 * Eduardo Otubo <eotubo@br.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Contributions after 2012-01-13 are licensed under the terms of the 13 * GNU GPL, version 2 or (at your option) any later version. 14 */ 15 16 #include "qemu/osdep.h" 17 #include "qapi/error.h" 18 #include "qemu/config-file.h" 19 #include "qemu/option.h" 20 #include "qemu/module.h" 21 #include <sys/prctl.h> 22 #include <seccomp.h> 23 #include "sysemu/seccomp.h" 24 #include <linux/seccomp.h> 25 26 /* For some architectures (notably ARM) cacheflush is not supported until 27 * libseccomp 2.2.3, but configure enforces that we are using a more recent 28 * version on those hosts, so it is OK for this check to be less strict. 29 */ 30 #if SCMP_VER_MAJOR >= 3 31 #define HAVE_CACHEFLUSH 32 #elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 2 33 #define HAVE_CACHEFLUSH 34 #endif 35 36 struct QemuSeccompSyscall { 37 int32_t num; 38 uint8_t set; 39 uint8_t narg; 40 const struct scmp_arg_cmp *arg_cmp; 41 uint32_t action; 42 }; 43 44 const struct scmp_arg_cmp sched_setscheduler_arg[] = { 45 /* was SCMP_A1(SCMP_CMP_NE, SCHED_IDLE), but expanded due to GCC 4.x bug */ 46 { .arg = 1, .op = SCMP_CMP_NE, .datum_a = SCHED_IDLE } 47 }; 48 49 /* 50 * See 'NOTES' in 'man 2 clone' - s390 & cross have 'flags' in 51 * different position to other architectures 52 */ 53 #if defined(HOST_S390X) || defined(HOST_S390) || defined(HOST_CRIS) 54 #define CLONE_FLAGS_ARG 1 55 #else 56 #define CLONE_FLAGS_ARG 0 57 #endif 58 59 #ifndef CLONE_PIDFD 60 # define CLONE_PIDFD 0x00001000 61 #endif 62 63 #define REQUIRE_CLONE_FLAG(flag) \ 64 const struct scmp_arg_cmp clone_arg ## flag[] = { \ 65 { .arg = CLONE_FLAGS_ARG, \ 66 .op = SCMP_CMP_MASKED_EQ, \ 67 .datum_a = flag, .datum_b = 0 } } 68 69 #define FORBID_CLONE_FLAG(flag) \ 70 const struct scmp_arg_cmp clone_arg ## flag[] = { \ 71 { .arg = CLONE_FLAGS_ARG, \ 72 .op = SCMP_CMP_MASKED_EQ, \ 73 .datum_a = flag, .datum_b = flag } } 74 75 #define RULE_CLONE_FLAG(flag) \ 76 { SCMP_SYS(clone), QEMU_SECCOMP_SET_SPAWN, \ 77 ARRAY_SIZE(clone_arg ## flag), clone_arg ## flag, SCMP_ACT_ERRNO(EPERM) } 78 79 /* If no CLONE_* flags are set, except CSIGNAL, deny */ 80 const struct scmp_arg_cmp clone_arg_none[] = { 81 { .arg = CLONE_FLAGS_ARG, 82 .op = SCMP_CMP_MASKED_EQ, 83 .datum_a = ~(CSIGNAL), .datum_b = 0 } 84 }; 85 86 /* 87 * pthread_create should always set all of these. 88 */ 89 REQUIRE_CLONE_FLAG(CLONE_VM); 90 REQUIRE_CLONE_FLAG(CLONE_FS); 91 REQUIRE_CLONE_FLAG(CLONE_FILES); 92 REQUIRE_CLONE_FLAG(CLONE_SIGHAND); 93 REQUIRE_CLONE_FLAG(CLONE_THREAD); 94 REQUIRE_CLONE_FLAG(CLONE_SYSVSEM); 95 REQUIRE_CLONE_FLAG(CLONE_SETTLS); 96 REQUIRE_CLONE_FLAG(CLONE_PARENT_SETTID); 97 REQUIRE_CLONE_FLAG(CLONE_CHILD_CLEARTID); 98 /* 99 * Musl sets this in pthread_create too, but it is 100 * obsolete and harmless since its behaviour is 101 * subsumed under CLONE_THREAD 102 */ 103 /*REQUIRE_CLONE_FLAG(CLONE_DETACHED);*/ 104 105 106 /* 107 * These all indicate an attempt to spawn a process 108 * instead of a thread, or other undesirable scenarios 109 */ 110 FORBID_CLONE_FLAG(CLONE_PIDFD); 111 FORBID_CLONE_FLAG(CLONE_PTRACE); 112 FORBID_CLONE_FLAG(CLONE_VFORK); 113 FORBID_CLONE_FLAG(CLONE_PARENT); 114 FORBID_CLONE_FLAG(CLONE_NEWNS); 115 FORBID_CLONE_FLAG(CLONE_UNTRACED); 116 FORBID_CLONE_FLAG(CLONE_NEWCGROUP); 117 FORBID_CLONE_FLAG(CLONE_NEWUTS); 118 FORBID_CLONE_FLAG(CLONE_NEWIPC); 119 FORBID_CLONE_FLAG(CLONE_NEWUSER); 120 FORBID_CLONE_FLAG(CLONE_NEWPID); 121 FORBID_CLONE_FLAG(CLONE_NEWNET); 122 FORBID_CLONE_FLAG(CLONE_IO); 123 124 125 static const struct QemuSeccompSyscall denylist[] = { 126 /* default set of syscalls that should get blocked */ 127 { SCMP_SYS(reboot), QEMU_SECCOMP_SET_DEFAULT, 128 0, NULL, SCMP_ACT_TRAP }, 129 { SCMP_SYS(swapon), QEMU_SECCOMP_SET_DEFAULT, 130 0, NULL, SCMP_ACT_TRAP }, 131 { SCMP_SYS(swapoff), QEMU_SECCOMP_SET_DEFAULT, 132 0, NULL, SCMP_ACT_TRAP }, 133 { SCMP_SYS(syslog), QEMU_SECCOMP_SET_DEFAULT, 134 0, NULL, SCMP_ACT_TRAP }, 135 { SCMP_SYS(mount), QEMU_SECCOMP_SET_DEFAULT, 136 0, NULL, SCMP_ACT_TRAP }, 137 { SCMP_SYS(umount), QEMU_SECCOMP_SET_DEFAULT, 138 0, NULL, SCMP_ACT_TRAP }, 139 { SCMP_SYS(kexec_load), QEMU_SECCOMP_SET_DEFAULT, 140 0, NULL, SCMP_ACT_TRAP }, 141 { SCMP_SYS(afs_syscall), QEMU_SECCOMP_SET_DEFAULT, 142 0, NULL, SCMP_ACT_TRAP }, 143 { SCMP_SYS(break), QEMU_SECCOMP_SET_DEFAULT, 144 0, NULL, SCMP_ACT_TRAP }, 145 { SCMP_SYS(ftime), QEMU_SECCOMP_SET_DEFAULT, 146 0, NULL, SCMP_ACT_TRAP }, 147 { SCMP_SYS(getpmsg), QEMU_SECCOMP_SET_DEFAULT, 148 0, NULL, SCMP_ACT_TRAP }, 149 { SCMP_SYS(gtty), QEMU_SECCOMP_SET_DEFAULT, 150 0, NULL, SCMP_ACT_TRAP }, 151 { SCMP_SYS(lock), QEMU_SECCOMP_SET_DEFAULT, 152 0, NULL, SCMP_ACT_TRAP }, 153 { SCMP_SYS(mpx), QEMU_SECCOMP_SET_DEFAULT, 154 0, NULL, SCMP_ACT_TRAP }, 155 { SCMP_SYS(prof), QEMU_SECCOMP_SET_DEFAULT, 156 0, NULL, SCMP_ACT_TRAP }, 157 { SCMP_SYS(profil), QEMU_SECCOMP_SET_DEFAULT, 158 0, NULL, SCMP_ACT_TRAP }, 159 { SCMP_SYS(putpmsg), QEMU_SECCOMP_SET_DEFAULT, 160 0, NULL, SCMP_ACT_TRAP }, 161 { SCMP_SYS(security), QEMU_SECCOMP_SET_DEFAULT, 162 0, NULL, SCMP_ACT_TRAP }, 163 { SCMP_SYS(stty), QEMU_SECCOMP_SET_DEFAULT, 164 0, NULL, SCMP_ACT_TRAP }, 165 { SCMP_SYS(tuxcall), QEMU_SECCOMP_SET_DEFAULT, 166 0, NULL, SCMP_ACT_TRAP }, 167 { SCMP_SYS(ulimit), QEMU_SECCOMP_SET_DEFAULT, 168 0, NULL, SCMP_ACT_TRAP }, 169 { SCMP_SYS(vserver), QEMU_SECCOMP_SET_DEFAULT, 170 0, NULL, SCMP_ACT_TRAP }, 171 /* obsolete */ 172 { SCMP_SYS(readdir), QEMU_SECCOMP_SET_OBSOLETE, 173 0, NULL, SCMP_ACT_TRAP }, 174 { SCMP_SYS(_sysctl), QEMU_SECCOMP_SET_OBSOLETE, 175 0, NULL, SCMP_ACT_TRAP }, 176 { SCMP_SYS(bdflush), QEMU_SECCOMP_SET_OBSOLETE, 177 0, NULL, SCMP_ACT_TRAP }, 178 { SCMP_SYS(create_module), QEMU_SECCOMP_SET_OBSOLETE, 179 0, NULL, SCMP_ACT_TRAP }, 180 { SCMP_SYS(get_kernel_syms), QEMU_SECCOMP_SET_OBSOLETE, 181 0, NULL, SCMP_ACT_TRAP }, 182 { SCMP_SYS(query_module), QEMU_SECCOMP_SET_OBSOLETE, 183 0, NULL, SCMP_ACT_TRAP }, 184 { SCMP_SYS(sgetmask), QEMU_SECCOMP_SET_OBSOLETE, 185 0, NULL, SCMP_ACT_TRAP }, 186 { SCMP_SYS(ssetmask), QEMU_SECCOMP_SET_OBSOLETE, 187 0, NULL, SCMP_ACT_TRAP }, 188 { SCMP_SYS(sysfs), QEMU_SECCOMP_SET_OBSOLETE, 189 0, NULL, SCMP_ACT_TRAP }, 190 { SCMP_SYS(uselib), QEMU_SECCOMP_SET_OBSOLETE, 191 0, NULL, SCMP_ACT_TRAP }, 192 { SCMP_SYS(ustat), QEMU_SECCOMP_SET_OBSOLETE, 193 0, NULL, SCMP_ACT_TRAP }, 194 /* privileged */ 195 { SCMP_SYS(setuid), QEMU_SECCOMP_SET_PRIVILEGED, 196 0, NULL, SCMP_ACT_TRAP }, 197 { SCMP_SYS(setgid), QEMU_SECCOMP_SET_PRIVILEGED, 198 0, NULL, SCMP_ACT_TRAP }, 199 { SCMP_SYS(setpgid), QEMU_SECCOMP_SET_PRIVILEGED, 200 0, NULL, SCMP_ACT_TRAP }, 201 { SCMP_SYS(setsid), QEMU_SECCOMP_SET_PRIVILEGED, 202 0, NULL, SCMP_ACT_TRAP }, 203 { SCMP_SYS(setreuid), QEMU_SECCOMP_SET_PRIVILEGED, 204 0, NULL, SCMP_ACT_TRAP }, 205 { SCMP_SYS(setregid), QEMU_SECCOMP_SET_PRIVILEGED, 206 0, NULL, SCMP_ACT_TRAP }, 207 { SCMP_SYS(setresuid), QEMU_SECCOMP_SET_PRIVILEGED, 208 0, NULL, SCMP_ACT_TRAP }, 209 { SCMP_SYS(setresgid), QEMU_SECCOMP_SET_PRIVILEGED, 210 0, NULL, SCMP_ACT_TRAP }, 211 { SCMP_SYS(setfsuid), QEMU_SECCOMP_SET_PRIVILEGED, 212 0, NULL, SCMP_ACT_TRAP }, 213 { SCMP_SYS(setfsgid), QEMU_SECCOMP_SET_PRIVILEGED, 214 0, NULL, SCMP_ACT_TRAP }, 215 /* spawn */ 216 { SCMP_SYS(fork), QEMU_SECCOMP_SET_SPAWN, 217 0, NULL, SCMP_ACT_ERRNO(EPERM) }, 218 { SCMP_SYS(vfork), QEMU_SECCOMP_SET_SPAWN, 219 0, NULL, SCMP_ACT_ERRNO(EPERM) }, 220 { SCMP_SYS(execve), QEMU_SECCOMP_SET_SPAWN, 221 0, NULL, SCMP_ACT_ERRNO(EPERM) }, 222 { SCMP_SYS(clone), QEMU_SECCOMP_SET_SPAWN, 223 ARRAY_SIZE(clone_arg_none), clone_arg_none, SCMP_ACT_ERRNO(EPERM) }, 224 RULE_CLONE_FLAG(CLONE_VM), 225 RULE_CLONE_FLAG(CLONE_FS), 226 RULE_CLONE_FLAG(CLONE_FILES), 227 RULE_CLONE_FLAG(CLONE_SIGHAND), 228 RULE_CLONE_FLAG(CLONE_THREAD), 229 RULE_CLONE_FLAG(CLONE_SYSVSEM), 230 RULE_CLONE_FLAG(CLONE_SETTLS), 231 RULE_CLONE_FLAG(CLONE_PARENT_SETTID), 232 RULE_CLONE_FLAG(CLONE_CHILD_CLEARTID), 233 /*RULE_CLONE_FLAG(CLONE_DETACHED),*/ 234 RULE_CLONE_FLAG(CLONE_PIDFD), 235 RULE_CLONE_FLAG(CLONE_PTRACE), 236 RULE_CLONE_FLAG(CLONE_VFORK), 237 RULE_CLONE_FLAG(CLONE_PARENT), 238 RULE_CLONE_FLAG(CLONE_NEWNS), 239 RULE_CLONE_FLAG(CLONE_UNTRACED), 240 RULE_CLONE_FLAG(CLONE_NEWCGROUP), 241 RULE_CLONE_FLAG(CLONE_NEWUTS), 242 RULE_CLONE_FLAG(CLONE_NEWIPC), 243 RULE_CLONE_FLAG(CLONE_NEWUSER), 244 RULE_CLONE_FLAG(CLONE_NEWPID), 245 RULE_CLONE_FLAG(CLONE_NEWNET), 246 RULE_CLONE_FLAG(CLONE_IO), 247 #ifdef __SNR_clone3 248 { SCMP_SYS(clone3), QEMU_SECCOMP_SET_SPAWN, 249 0, NULL, SCMP_ACT_ERRNO(ENOSYS) }, 250 #endif 251 #ifdef __SNR_execveat 252 { SCMP_SYS(execveat), QEMU_SECCOMP_SET_SPAWN }, 253 #endif 254 { SCMP_SYS(setns), QEMU_SECCOMP_SET_SPAWN }, 255 { SCMP_SYS(unshare), QEMU_SECCOMP_SET_SPAWN }, 256 /* resource control */ 257 { SCMP_SYS(setpriority), QEMU_SECCOMP_SET_RESOURCECTL, 258 0, NULL, SCMP_ACT_ERRNO(EPERM) }, 259 { SCMP_SYS(sched_setparam), QEMU_SECCOMP_SET_RESOURCECTL, 260 0, NULL, SCMP_ACT_ERRNO(EPERM) }, 261 { SCMP_SYS(sched_setscheduler), QEMU_SECCOMP_SET_RESOURCECTL, 262 ARRAY_SIZE(sched_setscheduler_arg), sched_setscheduler_arg, 263 SCMP_ACT_ERRNO(EPERM) }, 264 { SCMP_SYS(sched_setaffinity), QEMU_SECCOMP_SET_RESOURCECTL, 265 0, NULL, SCMP_ACT_ERRNO(EPERM) }, 266 }; 267 268 static inline __attribute__((unused)) int 269 qemu_seccomp(unsigned int operation, unsigned int flags, void *args) 270 { 271 #ifdef __NR_seccomp 272 return syscall(__NR_seccomp, operation, flags, args); 273 #else 274 errno = ENOSYS; 275 return -1; 276 #endif 277 } 278 279 static uint32_t qemu_seccomp_update_action(uint32_t action) 280 { 281 #if defined(SECCOMP_GET_ACTION_AVAIL) && defined(SCMP_ACT_KILL_PROCESS) && \ 282 defined(SECCOMP_RET_KILL_PROCESS) 283 if (action == SCMP_ACT_TRAP) { 284 static int kill_process = -1; 285 if (kill_process == -1) { 286 uint32_t testaction = SECCOMP_RET_KILL_PROCESS; 287 288 if (qemu_seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &testaction) == 0) { 289 kill_process = 1; 290 } else { 291 kill_process = 0; 292 } 293 } 294 if (kill_process == 1) { 295 return SCMP_ACT_KILL_PROCESS; 296 } 297 } 298 #endif 299 return action; 300 } 301 302 303 static int seccomp_start(uint32_t seccomp_opts, Error **errp) 304 { 305 int rc = -1; 306 unsigned int i = 0; 307 scmp_filter_ctx ctx; 308 309 ctx = seccomp_init(SCMP_ACT_ALLOW); 310 if (ctx == NULL) { 311 error_setg(errp, "failed to initialize seccomp context"); 312 goto seccomp_return; 313 } 314 315 #if defined(CONFIG_SECCOMP_SYSRAWRC) 316 /* 317 * This must be the first seccomp_attr_set() call to have full 318 * error propagation from subsequent seccomp APIs. 319 */ 320 rc = seccomp_attr_set(ctx, SCMP_FLTATR_API_SYSRAWRC, 1); 321 if (rc != 0) { 322 error_setg_errno(errp, -rc, 323 "failed to set seccomp rawrc attribute"); 324 goto seccomp_return; 325 } 326 #endif 327 328 rc = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_TSYNC, 1); 329 if (rc != 0) { 330 error_setg_errno(errp, -rc, 331 "failed to set seccomp thread synchronization"); 332 goto seccomp_return; 333 } 334 335 for (i = 0; i < ARRAY_SIZE(denylist); i++) { 336 uint32_t action; 337 if (!(seccomp_opts & denylist[i].set)) { 338 continue; 339 } 340 341 action = qemu_seccomp_update_action(denylist[i].action); 342 rc = seccomp_rule_add_array(ctx, action, denylist[i].num, 343 denylist[i].narg, denylist[i].arg_cmp); 344 if (rc < 0) { 345 error_setg_errno(errp, -rc, 346 "failed to add seccomp denylist rules"); 347 goto seccomp_return; 348 } 349 } 350 351 rc = seccomp_load(ctx); 352 if (rc < 0) { 353 error_setg_errno(errp, -rc, 354 "failed to load seccomp syscall filter in kernel"); 355 } 356 357 seccomp_return: 358 seccomp_release(ctx); 359 return rc < 0 ? -1 : 0; 360 } 361 362 int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp) 363 { 364 if (qemu_opt_get_bool(opts, "enable", false)) { 365 uint32_t seccomp_opts = QEMU_SECCOMP_SET_DEFAULT 366 | QEMU_SECCOMP_SET_OBSOLETE; 367 const char *value = NULL; 368 369 value = qemu_opt_get(opts, "obsolete"); 370 if (value) { 371 if (g_str_equal(value, "allow")) { 372 seccomp_opts &= ~QEMU_SECCOMP_SET_OBSOLETE; 373 } else if (g_str_equal(value, "deny")) { 374 /* this is the default option, this if is here 375 * to provide a little bit of consistency for 376 * the command line */ 377 } else { 378 error_setg(errp, "invalid argument for obsolete"); 379 return -1; 380 } 381 } 382 383 value = qemu_opt_get(opts, "elevateprivileges"); 384 if (value) { 385 if (g_str_equal(value, "deny")) { 386 seccomp_opts |= QEMU_SECCOMP_SET_PRIVILEGED; 387 } else if (g_str_equal(value, "children")) { 388 seccomp_opts |= QEMU_SECCOMP_SET_PRIVILEGED; 389 390 /* calling prctl directly because we're 391 * not sure if host has CAP_SYS_ADMIN set*/ 392 if (prctl(PR_SET_NO_NEW_PRIVS, 1)) { 393 error_setg(errp, "failed to set no_new_privs aborting"); 394 return -1; 395 } 396 } else if (g_str_equal(value, "allow")) { 397 /* default value */ 398 } else { 399 error_setg(errp, "invalid argument for elevateprivileges"); 400 return -1; 401 } 402 } 403 404 value = qemu_opt_get(opts, "spawn"); 405 if (value) { 406 if (g_str_equal(value, "deny")) { 407 seccomp_opts |= QEMU_SECCOMP_SET_SPAWN; 408 } else if (g_str_equal(value, "allow")) { 409 /* default value */ 410 } else { 411 error_setg(errp, "invalid argument for spawn"); 412 return -1; 413 } 414 } 415 416 value = qemu_opt_get(opts, "resourcecontrol"); 417 if (value) { 418 if (g_str_equal(value, "deny")) { 419 seccomp_opts |= QEMU_SECCOMP_SET_RESOURCECTL; 420 } else if (g_str_equal(value, "allow")) { 421 /* default value */ 422 } else { 423 error_setg(errp, "invalid argument for resourcecontrol"); 424 return -1; 425 } 426 } 427 428 if (seccomp_start(seccomp_opts, errp) < 0) { 429 return -1; 430 } 431 } 432 433 return 0; 434 } 435 436 static QemuOptsList qemu_sandbox_opts = { 437 .name = "sandbox", 438 .implied_opt_name = "enable", 439 .head = QTAILQ_HEAD_INITIALIZER(qemu_sandbox_opts.head), 440 .desc = { 441 { 442 .name = "enable", 443 .type = QEMU_OPT_BOOL, 444 }, 445 { 446 .name = "obsolete", 447 .type = QEMU_OPT_STRING, 448 }, 449 { 450 .name = "elevateprivileges", 451 .type = QEMU_OPT_STRING, 452 }, 453 { 454 .name = "spawn", 455 .type = QEMU_OPT_STRING, 456 }, 457 { 458 .name = "resourcecontrol", 459 .type = QEMU_OPT_STRING, 460 }, 461 { /* end of list */ } 462 }, 463 }; 464 465 static void seccomp_register(void) 466 { 467 bool add = false; 468 469 /* FIXME: use seccomp_api_get() >= 2 check when released */ 470 471 #if defined(SECCOMP_FILTER_FLAG_TSYNC) 472 int check; 473 474 /* check host TSYNC capability, it returns errno == ENOSYS if unavailable */ 475 check = qemu_seccomp(SECCOMP_SET_MODE_FILTER, 476 SECCOMP_FILTER_FLAG_TSYNC, NULL); 477 if (check < 0 && errno == EFAULT) { 478 add = true; 479 } 480 #endif 481 482 if (add) { 483 qemu_add_opts(&qemu_sandbox_opts); 484 } 485 } 486 opts_init(seccomp_register); 487