1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/kernel/sys.c 4 * 5 * Copyright (C) 1991, 1992 Linus Torvalds 6 */ 7 8 #include <linux/export.h> 9 #include <linux/mm.h> 10 #include <linux/utsname.h> 11 #include <linux/mman.h> 12 #include <linux/reboot.h> 13 #include <linux/prctl.h> 14 #include <linux/highuid.h> 15 #include <linux/fs.h> 16 #include <linux/kmod.h> 17 #include <linux/perf_event.h> 18 #include <linux/resource.h> 19 #include <linux/kernel.h> 20 #include <linux/workqueue.h> 21 #include <linux/capability.h> 22 #include <linux/device.h> 23 #include <linux/key.h> 24 #include <linux/times.h> 25 #include <linux/posix-timers.h> 26 #include <linux/security.h> 27 #include <linux/suspend.h> 28 #include <linux/tty.h> 29 #include <linux/signal.h> 30 #include <linux/cn_proc.h> 31 #include <linux/getcpu.h> 32 #include <linux/task_io_accounting_ops.h> 33 #include <linux/seccomp.h> 34 #include <linux/cpu.h> 35 #include <linux/personality.h> 36 #include <linux/ptrace.h> 37 #include <linux/fs_struct.h> 38 #include <linux/file.h> 39 #include <linux/mount.h> 40 #include <linux/gfp.h> 41 #include <linux/syscore_ops.h> 42 #include <linux/version.h> 43 #include <linux/ctype.h> 44 #include <linux/syscall_user_dispatch.h> 45 46 #include <linux/compat.h> 47 #include <linux/syscalls.h> 48 #include <linux/kprobes.h> 49 #include <linux/user_namespace.h> 50 #include <linux/time_namespace.h> 51 #include <linux/binfmts.h> 52 53 #include <linux/sched.h> 54 #include <linux/sched/autogroup.h> 55 #include <linux/sched/loadavg.h> 56 #include <linux/sched/stat.h> 57 #include <linux/sched/mm.h> 58 #include <linux/sched/coredump.h> 59 #include <linux/sched/task.h> 60 #include <linux/sched/cputime.h> 61 #include <linux/rcupdate.h> 62 #include <linux/uidgid.h> 63 #include <linux/cred.h> 64 65 #include <linux/nospec.h> 66 67 #include <linux/kmsg_dump.h> 68 /* Move somewhere else to avoid recompiling? */ 69 #include <generated/utsrelease.h> 70 71 #include <linux/uaccess.h> 72 #include <asm/io.h> 73 #include <asm/unistd.h> 74 75 #include "uid16.h" 76 77 #ifndef SET_UNALIGN_CTL 78 # define SET_UNALIGN_CTL(a, b) (-EINVAL) 79 #endif 80 #ifndef GET_UNALIGN_CTL 81 # define GET_UNALIGN_CTL(a, b) (-EINVAL) 82 #endif 83 #ifndef SET_FPEMU_CTL 84 # define SET_FPEMU_CTL(a, b) (-EINVAL) 85 #endif 86 #ifndef GET_FPEMU_CTL 87 # define GET_FPEMU_CTL(a, b) (-EINVAL) 88 #endif 89 #ifndef SET_FPEXC_CTL 90 # define SET_FPEXC_CTL(a, b) (-EINVAL) 91 #endif 92 #ifndef GET_FPEXC_CTL 93 # define GET_FPEXC_CTL(a, b) (-EINVAL) 94 #endif 95 #ifndef GET_ENDIAN 96 # define GET_ENDIAN(a, b) (-EINVAL) 97 #endif 98 #ifndef SET_ENDIAN 99 # define SET_ENDIAN(a, b) (-EINVAL) 100 #endif 101 #ifndef GET_TSC_CTL 102 # define GET_TSC_CTL(a) (-EINVAL) 103 #endif 104 #ifndef SET_TSC_CTL 105 # define SET_TSC_CTL(a) (-EINVAL) 106 #endif 107 #ifndef GET_FP_MODE 108 # define GET_FP_MODE(a) (-EINVAL) 109 #endif 110 #ifndef SET_FP_MODE 111 # define SET_FP_MODE(a,b) (-EINVAL) 112 #endif 113 #ifndef SVE_SET_VL 114 # define SVE_SET_VL(a) (-EINVAL) 115 #endif 116 #ifndef SVE_GET_VL 117 # define SVE_GET_VL() (-EINVAL) 118 #endif 119 #ifndef PAC_RESET_KEYS 120 # define PAC_RESET_KEYS(a, b) (-EINVAL) 121 #endif 122 #ifndef PAC_SET_ENABLED_KEYS 123 # define PAC_SET_ENABLED_KEYS(a, b, c) (-EINVAL) 124 #endif 125 #ifndef PAC_GET_ENABLED_KEYS 126 # define PAC_GET_ENABLED_KEYS(a) (-EINVAL) 127 #endif 128 #ifndef SET_TAGGED_ADDR_CTRL 129 # define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) 130 #endif 131 #ifndef GET_TAGGED_ADDR_CTRL 132 # define GET_TAGGED_ADDR_CTRL() (-EINVAL) 133 #endif 134 135 /* 136 * this is where the system-wide overflow UID and GID are defined, for 137 * architectures that now have 32-bit UID/GID but didn't in the past 138 */ 139 140 int overflowuid = DEFAULT_OVERFLOWUID; 141 int overflowgid = DEFAULT_OVERFLOWGID; 142 143 EXPORT_SYMBOL(overflowuid); 144 EXPORT_SYMBOL(overflowgid); 145 146 /* 147 * the same as above, but for filesystems which can only store a 16-bit 148 * UID and GID. as such, this is needed on all architectures 149 */ 150 151 int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; 152 int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; 153 154 EXPORT_SYMBOL(fs_overflowuid); 155 EXPORT_SYMBOL(fs_overflowgid); 156 157 /* 158 * Returns true if current's euid is same as p's uid or euid, 159 * or has CAP_SYS_NICE to p's user_ns. 160 * 161 * Called with rcu_read_lock, creds are safe 162 */ 163 static bool set_one_prio_perm(struct task_struct *p) 164 { 165 const struct cred *cred = current_cred(), *pcred = __task_cred(p); 166 167 if (uid_eq(pcred->uid, cred->euid) || 168 uid_eq(pcred->euid, cred->euid)) 169 return true; 170 if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) 171 return true; 172 return false; 173 } 174 175 /* 176 * set the priority of a task 177 * - the caller must hold the RCU read lock 178 */ 179 static int set_one_prio(struct task_struct *p, int niceval, int error) 180 { 181 int no_nice; 182 183 if (!set_one_prio_perm(p)) { 184 error = -EPERM; 185 goto out; 186 } 187 if (niceval < task_nice(p) && !can_nice(p, niceval)) { 188 error = -EACCES; 189 goto out; 190 } 191 no_nice = security_task_setnice(p, niceval); 192 if (no_nice) { 193 error = no_nice; 194 goto out; 195 } 196 if (error == -ESRCH) 197 error = 0; 198 set_user_nice(p, niceval); 199 out: 200 return error; 201 } 202 203 SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) 204 { 205 struct task_struct *g, *p; 206 struct user_struct *user; 207 const struct cred *cred = current_cred(); 208 int error = -EINVAL; 209 struct pid *pgrp; 210 kuid_t uid; 211 212 if (which > PRIO_USER || which < PRIO_PROCESS) 213 goto out; 214 215 /* normalize: avoid signed division (rounding problems) */ 216 error = -ESRCH; 217 if (niceval < MIN_NICE) 218 niceval = MIN_NICE; 219 if (niceval > MAX_NICE) 220 niceval = MAX_NICE; 221 222 rcu_read_lock(); 223 switch (which) { 224 case PRIO_PROCESS: 225 if (who) 226 p = find_task_by_vpid(who); 227 else 228 p = current; 229 if (p) 230 error = set_one_prio(p, niceval, error); 231 break; 232 case PRIO_PGRP: 233 if (who) 234 pgrp = find_vpid(who); 235 else 236 pgrp = task_pgrp(current); 237 read_lock(&tasklist_lock); 238 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 239 error = set_one_prio(p, niceval, error); 240 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 241 read_unlock(&tasklist_lock); 242 break; 243 case PRIO_USER: 244 uid = make_kuid(cred->user_ns, who); 245 user = cred->user; 246 if (!who) 247 uid = cred->uid; 248 else if (!uid_eq(uid, cred->uid)) { 249 user = find_user(uid); 250 if (!user) 251 goto out_unlock; /* No processes for this user */ 252 } 253 for_each_process_thread(g, p) { 254 if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) 255 error = set_one_prio(p, niceval, error); 256 } 257 if (!uid_eq(uid, cred->uid)) 258 free_uid(user); /* For find_user() */ 259 break; 260 } 261 out_unlock: 262 rcu_read_unlock(); 263 out: 264 return error; 265 } 266 267 /* 268 * Ugh. To avoid negative return values, "getpriority()" will 269 * not return the normal nice-value, but a negated value that 270 * has been offset by 20 (ie it returns 40..1 instead of -20..19) 271 * to stay compatible. 272 */ 273 SYSCALL_DEFINE2(getpriority, int, which, int, who) 274 { 275 struct task_struct *g, *p; 276 struct user_struct *user; 277 const struct cred *cred = current_cred(); 278 long niceval, retval = -ESRCH; 279 struct pid *pgrp; 280 kuid_t uid; 281 282 if (which > PRIO_USER || which < PRIO_PROCESS) 283 return -EINVAL; 284 285 rcu_read_lock(); 286 switch (which) { 287 case PRIO_PROCESS: 288 if (who) 289 p = find_task_by_vpid(who); 290 else 291 p = current; 292 if (p) { 293 niceval = nice_to_rlimit(task_nice(p)); 294 if (niceval > retval) 295 retval = niceval; 296 } 297 break; 298 case PRIO_PGRP: 299 if (who) 300 pgrp = find_vpid(who); 301 else 302 pgrp = task_pgrp(current); 303 read_lock(&tasklist_lock); 304 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 305 niceval = nice_to_rlimit(task_nice(p)); 306 if (niceval > retval) 307 retval = niceval; 308 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 309 read_unlock(&tasklist_lock); 310 break; 311 case PRIO_USER: 312 uid = make_kuid(cred->user_ns, who); 313 user = cred->user; 314 if (!who) 315 uid = cred->uid; 316 else if (!uid_eq(uid, cred->uid)) { 317 user = find_user(uid); 318 if (!user) 319 goto out_unlock; /* No processes for this user */ 320 } 321 for_each_process_thread(g, p) { 322 if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { 323 niceval = nice_to_rlimit(task_nice(p)); 324 if (niceval > retval) 325 retval = niceval; 326 } 327 } 328 if (!uid_eq(uid, cred->uid)) 329 free_uid(user); /* for find_user() */ 330 break; 331 } 332 out_unlock: 333 rcu_read_unlock(); 334 335 return retval; 336 } 337 338 /* 339 * Unprivileged users may change the real gid to the effective gid 340 * or vice versa. (BSD-style) 341 * 342 * If you set the real gid at all, or set the effective gid to a value not 343 * equal to the real gid, then the saved gid is set to the new effective gid. 344 * 345 * This makes it possible for a setgid program to completely drop its 346 * privileges, which is often a useful assertion to make when you are doing 347 * a security audit over a program. 348 * 349 * The general idea is that a program which uses just setregid() will be 350 * 100% compatible with BSD. A program which uses just setgid() will be 351 * 100% compatible with POSIX with saved IDs. 352 * 353 * SMP: There are not races, the GIDs are checked only by filesystem 354 * operations (as far as semantic preservation is concerned). 355 */ 356 #ifdef CONFIG_MULTIUSER 357 long __sys_setregid(gid_t rgid, gid_t egid) 358 { 359 struct user_namespace *ns = current_user_ns(); 360 const struct cred *old; 361 struct cred *new; 362 int retval; 363 kgid_t krgid, kegid; 364 365 krgid = make_kgid(ns, rgid); 366 kegid = make_kgid(ns, egid); 367 368 if ((rgid != (gid_t) -1) && !gid_valid(krgid)) 369 return -EINVAL; 370 if ((egid != (gid_t) -1) && !gid_valid(kegid)) 371 return -EINVAL; 372 373 new = prepare_creds(); 374 if (!new) 375 return -ENOMEM; 376 old = current_cred(); 377 378 retval = -EPERM; 379 if (rgid != (gid_t) -1) { 380 if (gid_eq(old->gid, krgid) || 381 gid_eq(old->egid, krgid) || 382 ns_capable_setid(old->user_ns, CAP_SETGID)) 383 new->gid = krgid; 384 else 385 goto error; 386 } 387 if (egid != (gid_t) -1) { 388 if (gid_eq(old->gid, kegid) || 389 gid_eq(old->egid, kegid) || 390 gid_eq(old->sgid, kegid) || 391 ns_capable_setid(old->user_ns, CAP_SETGID)) 392 new->egid = kegid; 393 else 394 goto error; 395 } 396 397 if (rgid != (gid_t) -1 || 398 (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) 399 new->sgid = new->egid; 400 new->fsgid = new->egid; 401 402 retval = security_task_fix_setgid(new, old, LSM_SETID_RE); 403 if (retval < 0) 404 goto error; 405 406 return commit_creds(new); 407 408 error: 409 abort_creds(new); 410 return retval; 411 } 412 413 SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 414 { 415 return __sys_setregid(rgid, egid); 416 } 417 418 /* 419 * setgid() is implemented like SysV w/ SAVED_IDS 420 * 421 * SMP: Same implicit races as above. 422 */ 423 long __sys_setgid(gid_t gid) 424 { 425 struct user_namespace *ns = current_user_ns(); 426 const struct cred *old; 427 struct cred *new; 428 int retval; 429 kgid_t kgid; 430 431 kgid = make_kgid(ns, gid); 432 if (!gid_valid(kgid)) 433 return -EINVAL; 434 435 new = prepare_creds(); 436 if (!new) 437 return -ENOMEM; 438 old = current_cred(); 439 440 retval = -EPERM; 441 if (ns_capable_setid(old->user_ns, CAP_SETGID)) 442 new->gid = new->egid = new->sgid = new->fsgid = kgid; 443 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 444 new->egid = new->fsgid = kgid; 445 else 446 goto error; 447 448 retval = security_task_fix_setgid(new, old, LSM_SETID_ID); 449 if (retval < 0) 450 goto error; 451 452 return commit_creds(new); 453 454 error: 455 abort_creds(new); 456 return retval; 457 } 458 459 SYSCALL_DEFINE1(setgid, gid_t, gid) 460 { 461 return __sys_setgid(gid); 462 } 463 464 /* 465 * change the user struct in a credentials set to match the new UID 466 */ 467 static int set_user(struct cred *new) 468 { 469 struct user_struct *new_user; 470 471 new_user = alloc_uid(new->uid); 472 if (!new_user) 473 return -EAGAIN; 474 475 /* 476 * We don't fail in case of NPROC limit excess here because too many 477 * poorly written programs don't check set*uid() return code, assuming 478 * it never fails if called by root. We may still enforce NPROC limit 479 * for programs doing set*uid()+execve() by harmlessly deferring the 480 * failure to the execve() stage. 481 */ 482 if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && 483 new_user != INIT_USER && 484 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) 485 current->flags |= PF_NPROC_EXCEEDED; 486 else 487 current->flags &= ~PF_NPROC_EXCEEDED; 488 489 free_uid(new->user); 490 new->user = new_user; 491 return 0; 492 } 493 494 /* 495 * Unprivileged users may change the real uid to the effective uid 496 * or vice versa. (BSD-style) 497 * 498 * If you set the real uid at all, or set the effective uid to a value not 499 * equal to the real uid, then the saved uid is set to the new effective uid. 500 * 501 * This makes it possible for a setuid program to completely drop its 502 * privileges, which is often a useful assertion to make when you are doing 503 * a security audit over a program. 504 * 505 * The general idea is that a program which uses just setreuid() will be 506 * 100% compatible with BSD. A program which uses just setuid() will be 507 * 100% compatible with POSIX with saved IDs. 508 */ 509 long __sys_setreuid(uid_t ruid, uid_t euid) 510 { 511 struct user_namespace *ns = current_user_ns(); 512 const struct cred *old; 513 struct cred *new; 514 int retval; 515 kuid_t kruid, keuid; 516 517 kruid = make_kuid(ns, ruid); 518 keuid = make_kuid(ns, euid); 519 520 if ((ruid != (uid_t) -1) && !uid_valid(kruid)) 521 return -EINVAL; 522 if ((euid != (uid_t) -1) && !uid_valid(keuid)) 523 return -EINVAL; 524 525 new = prepare_creds(); 526 if (!new) 527 return -ENOMEM; 528 old = current_cred(); 529 530 retval = -EPERM; 531 if (ruid != (uid_t) -1) { 532 new->uid = kruid; 533 if (!uid_eq(old->uid, kruid) && 534 !uid_eq(old->euid, kruid) && 535 !ns_capable_setid(old->user_ns, CAP_SETUID)) 536 goto error; 537 } 538 539 if (euid != (uid_t) -1) { 540 new->euid = keuid; 541 if (!uid_eq(old->uid, keuid) && 542 !uid_eq(old->euid, keuid) && 543 !uid_eq(old->suid, keuid) && 544 !ns_capable_setid(old->user_ns, CAP_SETUID)) 545 goto error; 546 } 547 548 if (!uid_eq(new->uid, old->uid)) { 549 retval = set_user(new); 550 if (retval < 0) 551 goto error; 552 } 553 if (ruid != (uid_t) -1 || 554 (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) 555 new->suid = new->euid; 556 new->fsuid = new->euid; 557 558 retval = security_task_fix_setuid(new, old, LSM_SETID_RE); 559 if (retval < 0) 560 goto error; 561 562 retval = set_cred_ucounts(new); 563 if (retval < 0) 564 goto error; 565 566 return commit_creds(new); 567 568 error: 569 abort_creds(new); 570 return retval; 571 } 572 573 SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 574 { 575 return __sys_setreuid(ruid, euid); 576 } 577 578 /* 579 * setuid() is implemented like SysV with SAVED_IDS 580 * 581 * Note that SAVED_ID's is deficient in that a setuid root program 582 * like sendmail, for example, cannot set its uid to be a normal 583 * user and then switch back, because if you're root, setuid() sets 584 * the saved uid too. If you don't like this, blame the bright people 585 * in the POSIX committee and/or USG. Note that the BSD-style setreuid() 586 * will allow a root program to temporarily drop privileges and be able to 587 * regain them by swapping the real and effective uid. 588 */ 589 long __sys_setuid(uid_t uid) 590 { 591 struct user_namespace *ns = current_user_ns(); 592 const struct cred *old; 593 struct cred *new; 594 int retval; 595 kuid_t kuid; 596 597 kuid = make_kuid(ns, uid); 598 if (!uid_valid(kuid)) 599 return -EINVAL; 600 601 new = prepare_creds(); 602 if (!new) 603 return -ENOMEM; 604 old = current_cred(); 605 606 retval = -EPERM; 607 if (ns_capable_setid(old->user_ns, CAP_SETUID)) { 608 new->suid = new->uid = kuid; 609 if (!uid_eq(kuid, old->uid)) { 610 retval = set_user(new); 611 if (retval < 0) 612 goto error; 613 } 614 } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { 615 goto error; 616 } 617 618 new->fsuid = new->euid = kuid; 619 620 retval = security_task_fix_setuid(new, old, LSM_SETID_ID); 621 if (retval < 0) 622 goto error; 623 624 retval = set_cred_ucounts(new); 625 if (retval < 0) 626 goto error; 627 628 return commit_creds(new); 629 630 error: 631 abort_creds(new); 632 return retval; 633 } 634 635 SYSCALL_DEFINE1(setuid, uid_t, uid) 636 { 637 return __sys_setuid(uid); 638 } 639 640 641 /* 642 * This function implements a generic ability to update ruid, euid, 643 * and suid. This allows you to implement the 4.4 compatible seteuid(). 644 */ 645 long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) 646 { 647 struct user_namespace *ns = current_user_ns(); 648 const struct cred *old; 649 struct cred *new; 650 int retval; 651 kuid_t kruid, keuid, ksuid; 652 653 kruid = make_kuid(ns, ruid); 654 keuid = make_kuid(ns, euid); 655 ksuid = make_kuid(ns, suid); 656 657 if ((ruid != (uid_t) -1) && !uid_valid(kruid)) 658 return -EINVAL; 659 660 if ((euid != (uid_t) -1) && !uid_valid(keuid)) 661 return -EINVAL; 662 663 if ((suid != (uid_t) -1) && !uid_valid(ksuid)) 664 return -EINVAL; 665 666 new = prepare_creds(); 667 if (!new) 668 return -ENOMEM; 669 670 old = current_cred(); 671 672 retval = -EPERM; 673 if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { 674 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 675 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 676 goto error; 677 if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && 678 !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) 679 goto error; 680 if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && 681 !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) 682 goto error; 683 } 684 685 if (ruid != (uid_t) -1) { 686 new->uid = kruid; 687 if (!uid_eq(kruid, old->uid)) { 688 retval = set_user(new); 689 if (retval < 0) 690 goto error; 691 } 692 } 693 if (euid != (uid_t) -1) 694 new->euid = keuid; 695 if (suid != (uid_t) -1) 696 new->suid = ksuid; 697 new->fsuid = new->euid; 698 699 retval = security_task_fix_setuid(new, old, LSM_SETID_RES); 700 if (retval < 0) 701 goto error; 702 703 retval = set_cred_ucounts(new); 704 if (retval < 0) 705 goto error; 706 707 return commit_creds(new); 708 709 error: 710 abort_creds(new); 711 return retval; 712 } 713 714 SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) 715 { 716 return __sys_setresuid(ruid, euid, suid); 717 } 718 719 SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) 720 { 721 const struct cred *cred = current_cred(); 722 int retval; 723 uid_t ruid, euid, suid; 724 725 ruid = from_kuid_munged(cred->user_ns, cred->uid); 726 euid = from_kuid_munged(cred->user_ns, cred->euid); 727 suid = from_kuid_munged(cred->user_ns, cred->suid); 728 729 retval = put_user(ruid, ruidp); 730 if (!retval) { 731 retval = put_user(euid, euidp); 732 if (!retval) 733 return put_user(suid, suidp); 734 } 735 return retval; 736 } 737 738 /* 739 * Same as above, but for rgid, egid, sgid. 740 */ 741 long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) 742 { 743 struct user_namespace *ns = current_user_ns(); 744 const struct cred *old; 745 struct cred *new; 746 int retval; 747 kgid_t krgid, kegid, ksgid; 748 749 krgid = make_kgid(ns, rgid); 750 kegid = make_kgid(ns, egid); 751 ksgid = make_kgid(ns, sgid); 752 753 if ((rgid != (gid_t) -1) && !gid_valid(krgid)) 754 return -EINVAL; 755 if ((egid != (gid_t) -1) && !gid_valid(kegid)) 756 return -EINVAL; 757 if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) 758 return -EINVAL; 759 760 new = prepare_creds(); 761 if (!new) 762 return -ENOMEM; 763 old = current_cred(); 764 765 retval = -EPERM; 766 if (!ns_capable_setid(old->user_ns, CAP_SETGID)) { 767 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 768 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 769 goto error; 770 if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && 771 !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) 772 goto error; 773 if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && 774 !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) 775 goto error; 776 } 777 778 if (rgid != (gid_t) -1) 779 new->gid = krgid; 780 if (egid != (gid_t) -1) 781 new->egid = kegid; 782 if (sgid != (gid_t) -1) 783 new->sgid = ksgid; 784 new->fsgid = new->egid; 785 786 retval = security_task_fix_setgid(new, old, LSM_SETID_RES); 787 if (retval < 0) 788 goto error; 789 790 return commit_creds(new); 791 792 error: 793 abort_creds(new); 794 return retval; 795 } 796 797 SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) 798 { 799 return __sys_setresgid(rgid, egid, sgid); 800 } 801 802 SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) 803 { 804 const struct cred *cred = current_cred(); 805 int retval; 806 gid_t rgid, egid, sgid; 807 808 rgid = from_kgid_munged(cred->user_ns, cred->gid); 809 egid = from_kgid_munged(cred->user_ns, cred->egid); 810 sgid = from_kgid_munged(cred->user_ns, cred->sgid); 811 812 retval = put_user(rgid, rgidp); 813 if (!retval) { 814 retval = put_user(egid, egidp); 815 if (!retval) 816 retval = put_user(sgid, sgidp); 817 } 818 819 return retval; 820 } 821 822 823 /* 824 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This 825 * is used for "access()" and for the NFS daemon (letting nfsd stay at 826 * whatever uid it wants to). It normally shadows "euid", except when 827 * explicitly set by setfsuid() or for access.. 828 */ 829 long __sys_setfsuid(uid_t uid) 830 { 831 const struct cred *old; 832 struct cred *new; 833 uid_t old_fsuid; 834 kuid_t kuid; 835 836 old = current_cred(); 837 old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); 838 839 kuid = make_kuid(old->user_ns, uid); 840 if (!uid_valid(kuid)) 841 return old_fsuid; 842 843 new = prepare_creds(); 844 if (!new) 845 return old_fsuid; 846 847 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 848 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 849 ns_capable_setid(old->user_ns, CAP_SETUID)) { 850 if (!uid_eq(kuid, old->fsuid)) { 851 new->fsuid = kuid; 852 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 853 goto change_okay; 854 } 855 } 856 857 abort_creds(new); 858 return old_fsuid; 859 860 change_okay: 861 commit_creds(new); 862 return old_fsuid; 863 } 864 865 SYSCALL_DEFINE1(setfsuid, uid_t, uid) 866 { 867 return __sys_setfsuid(uid); 868 } 869 870 /* 871 * Samma på svenska.. 872 */ 873 long __sys_setfsgid(gid_t gid) 874 { 875 const struct cred *old; 876 struct cred *new; 877 gid_t old_fsgid; 878 kgid_t kgid; 879 880 old = current_cred(); 881 old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); 882 883 kgid = make_kgid(old->user_ns, gid); 884 if (!gid_valid(kgid)) 885 return old_fsgid; 886 887 new = prepare_creds(); 888 if (!new) 889 return old_fsgid; 890 891 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 892 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 893 ns_capable_setid(old->user_ns, CAP_SETGID)) { 894 if (!gid_eq(kgid, old->fsgid)) { 895 new->fsgid = kgid; 896 if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) 897 goto change_okay; 898 } 899 } 900 901 abort_creds(new); 902 return old_fsgid; 903 904 change_okay: 905 commit_creds(new); 906 return old_fsgid; 907 } 908 909 SYSCALL_DEFINE1(setfsgid, gid_t, gid) 910 { 911 return __sys_setfsgid(gid); 912 } 913 #endif /* CONFIG_MULTIUSER */ 914 915 /** 916 * sys_getpid - return the thread group id of the current process 917 * 918 * Note, despite the name, this returns the tgid not the pid. The tgid and 919 * the pid are identical unless CLONE_THREAD was specified on clone() in 920 * which case the tgid is the same in all threads of the same group. 921 * 922 * This is SMP safe as current->tgid does not change. 923 */ 924 SYSCALL_DEFINE0(getpid) 925 { 926 return task_tgid_vnr(current); 927 } 928 929 /* Thread ID - the internal kernel "pid" */ 930 SYSCALL_DEFINE0(gettid) 931 { 932 return task_pid_vnr(current); 933 } 934 935 /* 936 * Accessing ->real_parent is not SMP-safe, it could 937 * change from under us. However, we can use a stale 938 * value of ->real_parent under rcu_read_lock(), see 939 * release_task()->call_rcu(delayed_put_task_struct). 940 */ 941 SYSCALL_DEFINE0(getppid) 942 { 943 int pid; 944 945 rcu_read_lock(); 946 pid = task_tgid_vnr(rcu_dereference(current->real_parent)); 947 rcu_read_unlock(); 948 949 return pid; 950 } 951 952 SYSCALL_DEFINE0(getuid) 953 { 954 /* Only we change this so SMP safe */ 955 return from_kuid_munged(current_user_ns(), current_uid()); 956 } 957 958 SYSCALL_DEFINE0(geteuid) 959 { 960 /* Only we change this so SMP safe */ 961 return from_kuid_munged(current_user_ns(), current_euid()); 962 } 963 964 SYSCALL_DEFINE0(getgid) 965 { 966 /* Only we change this so SMP safe */ 967 return from_kgid_munged(current_user_ns(), current_gid()); 968 } 969 970 SYSCALL_DEFINE0(getegid) 971 { 972 /* Only we change this so SMP safe */ 973 return from_kgid_munged(current_user_ns(), current_egid()); 974 } 975 976 static void do_sys_times(struct tms *tms) 977 { 978 u64 tgutime, tgstime, cutime, cstime; 979 980 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 981 cutime = current->signal->cutime; 982 cstime = current->signal->cstime; 983 tms->tms_utime = nsec_to_clock_t(tgutime); 984 tms->tms_stime = nsec_to_clock_t(tgstime); 985 tms->tms_cutime = nsec_to_clock_t(cutime); 986 tms->tms_cstime = nsec_to_clock_t(cstime); 987 } 988 989 SYSCALL_DEFINE1(times, struct tms __user *, tbuf) 990 { 991 if (tbuf) { 992 struct tms tmp; 993 994 do_sys_times(&tmp); 995 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 996 return -EFAULT; 997 } 998 force_successful_syscall_return(); 999 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 1000 } 1001 1002 #ifdef CONFIG_COMPAT 1003 static compat_clock_t clock_t_to_compat_clock_t(clock_t x) 1004 { 1005 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); 1006 } 1007 1008 COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) 1009 { 1010 if (tbuf) { 1011 struct tms tms; 1012 struct compat_tms tmp; 1013 1014 do_sys_times(&tms); 1015 /* Convert our struct tms to the compat version. */ 1016 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); 1017 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); 1018 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); 1019 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); 1020 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 1021 return -EFAULT; 1022 } 1023 force_successful_syscall_return(); 1024 return compat_jiffies_to_clock_t(jiffies); 1025 } 1026 #endif 1027 1028 /* 1029 * This needs some heavy checking ... 1030 * I just haven't the stomach for it. I also don't fully 1031 * understand sessions/pgrp etc. Let somebody who does explain it. 1032 * 1033 * OK, I think I have the protection semantics right.... this is really 1034 * only important on a multi-user system anyway, to make sure one user 1035 * can't send a signal to a process owned by another. -TYT, 12/12/91 1036 * 1037 * !PF_FORKNOEXEC check to conform completely to POSIX. 1038 */ 1039 SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) 1040 { 1041 struct task_struct *p; 1042 struct task_struct *group_leader = current->group_leader; 1043 struct pid *pgrp; 1044 int err; 1045 1046 if (!pid) 1047 pid = task_pid_vnr(group_leader); 1048 if (!pgid) 1049 pgid = pid; 1050 if (pgid < 0) 1051 return -EINVAL; 1052 rcu_read_lock(); 1053 1054 /* From this point forward we keep holding onto the tasklist lock 1055 * so that our parent does not change from under us. -DaveM 1056 */ 1057 write_lock_irq(&tasklist_lock); 1058 1059 err = -ESRCH; 1060 p = find_task_by_vpid(pid); 1061 if (!p) 1062 goto out; 1063 1064 err = -EINVAL; 1065 if (!thread_group_leader(p)) 1066 goto out; 1067 1068 if (same_thread_group(p->real_parent, group_leader)) { 1069 err = -EPERM; 1070 if (task_session(p) != task_session(group_leader)) 1071 goto out; 1072 err = -EACCES; 1073 if (!(p->flags & PF_FORKNOEXEC)) 1074 goto out; 1075 } else { 1076 err = -ESRCH; 1077 if (p != group_leader) 1078 goto out; 1079 } 1080 1081 err = -EPERM; 1082 if (p->signal->leader) 1083 goto out; 1084 1085 pgrp = task_pid(p); 1086 if (pgid != pid) { 1087 struct task_struct *g; 1088 1089 pgrp = find_vpid(pgid); 1090 g = pid_task(pgrp, PIDTYPE_PGID); 1091 if (!g || task_session(g) != task_session(group_leader)) 1092 goto out; 1093 } 1094 1095 err = security_task_setpgid(p, pgid); 1096 if (err) 1097 goto out; 1098 1099 if (task_pgrp(p) != pgrp) 1100 change_pid(p, PIDTYPE_PGID, pgrp); 1101 1102 err = 0; 1103 out: 1104 /* All paths lead to here, thus we are safe. -DaveM */ 1105 write_unlock_irq(&tasklist_lock); 1106 rcu_read_unlock(); 1107 return err; 1108 } 1109 1110 static int do_getpgid(pid_t pid) 1111 { 1112 struct task_struct *p; 1113 struct pid *grp; 1114 int retval; 1115 1116 rcu_read_lock(); 1117 if (!pid) 1118 grp = task_pgrp(current); 1119 else { 1120 retval = -ESRCH; 1121 p = find_task_by_vpid(pid); 1122 if (!p) 1123 goto out; 1124 grp = task_pgrp(p); 1125 if (!grp) 1126 goto out; 1127 1128 retval = security_task_getpgid(p); 1129 if (retval) 1130 goto out; 1131 } 1132 retval = pid_vnr(grp); 1133 out: 1134 rcu_read_unlock(); 1135 return retval; 1136 } 1137 1138 SYSCALL_DEFINE1(getpgid, pid_t, pid) 1139 { 1140 return do_getpgid(pid); 1141 } 1142 1143 #ifdef __ARCH_WANT_SYS_GETPGRP 1144 1145 SYSCALL_DEFINE0(getpgrp) 1146 { 1147 return do_getpgid(0); 1148 } 1149 1150 #endif 1151 1152 SYSCALL_DEFINE1(getsid, pid_t, pid) 1153 { 1154 struct task_struct *p; 1155 struct pid *sid; 1156 int retval; 1157 1158 rcu_read_lock(); 1159 if (!pid) 1160 sid = task_session(current); 1161 else { 1162 retval = -ESRCH; 1163 p = find_task_by_vpid(pid); 1164 if (!p) 1165 goto out; 1166 sid = task_session(p); 1167 if (!sid) 1168 goto out; 1169 1170 retval = security_task_getsid(p); 1171 if (retval) 1172 goto out; 1173 } 1174 retval = pid_vnr(sid); 1175 out: 1176 rcu_read_unlock(); 1177 return retval; 1178 } 1179 1180 static void set_special_pids(struct pid *pid) 1181 { 1182 struct task_struct *curr = current->group_leader; 1183 1184 if (task_session(curr) != pid) 1185 change_pid(curr, PIDTYPE_SID, pid); 1186 1187 if (task_pgrp(curr) != pid) 1188 change_pid(curr, PIDTYPE_PGID, pid); 1189 } 1190 1191 int ksys_setsid(void) 1192 { 1193 struct task_struct *group_leader = current->group_leader; 1194 struct pid *sid = task_pid(group_leader); 1195 pid_t session = pid_vnr(sid); 1196 int err = -EPERM; 1197 1198 write_lock_irq(&tasklist_lock); 1199 /* Fail if I am already a session leader */ 1200 if (group_leader->signal->leader) 1201 goto out; 1202 1203 /* Fail if a process group id already exists that equals the 1204 * proposed session id. 1205 */ 1206 if (pid_task(sid, PIDTYPE_PGID)) 1207 goto out; 1208 1209 group_leader->signal->leader = 1; 1210 set_special_pids(sid); 1211 1212 proc_clear_tty(group_leader); 1213 1214 err = session; 1215 out: 1216 write_unlock_irq(&tasklist_lock); 1217 if (err > 0) { 1218 proc_sid_connector(group_leader); 1219 sched_autogroup_create_attach(group_leader); 1220 } 1221 return err; 1222 } 1223 1224 SYSCALL_DEFINE0(setsid) 1225 { 1226 return ksys_setsid(); 1227 } 1228 1229 DECLARE_RWSEM(uts_sem); 1230 1231 #ifdef COMPAT_UTS_MACHINE 1232 #define override_architecture(name) \ 1233 (personality(current->personality) == PER_LINUX32 && \ 1234 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ 1235 sizeof(COMPAT_UTS_MACHINE))) 1236 #else 1237 #define override_architecture(name) 0 1238 #endif 1239 1240 /* 1241 * Work around broken programs that cannot handle "Linux 3.0". 1242 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 1243 * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be 1244 * 2.6.60. 1245 */ 1246 static int override_release(char __user *release, size_t len) 1247 { 1248 int ret = 0; 1249 1250 if (current->personality & UNAME26) { 1251 const char *rest = UTS_RELEASE; 1252 char buf[65] = { 0 }; 1253 int ndots = 0; 1254 unsigned v; 1255 size_t copy; 1256 1257 while (*rest) { 1258 if (*rest == '.' && ++ndots >= 3) 1259 break; 1260 if (!isdigit(*rest) && *rest != '.') 1261 break; 1262 rest++; 1263 } 1264 v = LINUX_VERSION_PATCHLEVEL + 60; 1265 copy = clamp_t(size_t, len, 1, sizeof(buf)); 1266 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); 1267 ret = copy_to_user(release, buf, copy + 1); 1268 } 1269 return ret; 1270 } 1271 1272 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1273 { 1274 struct new_utsname tmp; 1275 1276 down_read(&uts_sem); 1277 memcpy(&tmp, utsname(), sizeof(tmp)); 1278 up_read(&uts_sem); 1279 if (copy_to_user(name, &tmp, sizeof(tmp))) 1280 return -EFAULT; 1281 1282 if (override_release(name->release, sizeof(name->release))) 1283 return -EFAULT; 1284 if (override_architecture(name)) 1285 return -EFAULT; 1286 return 0; 1287 } 1288 1289 #ifdef __ARCH_WANT_SYS_OLD_UNAME 1290 /* 1291 * Old cruft 1292 */ 1293 SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) 1294 { 1295 struct old_utsname tmp; 1296 1297 if (!name) 1298 return -EFAULT; 1299 1300 down_read(&uts_sem); 1301 memcpy(&tmp, utsname(), sizeof(tmp)); 1302 up_read(&uts_sem); 1303 if (copy_to_user(name, &tmp, sizeof(tmp))) 1304 return -EFAULT; 1305 1306 if (override_release(name->release, sizeof(name->release))) 1307 return -EFAULT; 1308 if (override_architecture(name)) 1309 return -EFAULT; 1310 return 0; 1311 } 1312 1313 SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) 1314 { 1315 struct oldold_utsname tmp; 1316 1317 if (!name) 1318 return -EFAULT; 1319 1320 memset(&tmp, 0, sizeof(tmp)); 1321 1322 down_read(&uts_sem); 1323 memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); 1324 memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); 1325 memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); 1326 memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); 1327 memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); 1328 up_read(&uts_sem); 1329 if (copy_to_user(name, &tmp, sizeof(tmp))) 1330 return -EFAULT; 1331 1332 if (override_architecture(name)) 1333 return -EFAULT; 1334 if (override_release(name->release, sizeof(name->release))) 1335 return -EFAULT; 1336 return 0; 1337 } 1338 #endif 1339 1340 SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1341 { 1342 int errno; 1343 char tmp[__NEW_UTS_LEN]; 1344 1345 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) 1346 return -EPERM; 1347 1348 if (len < 0 || len > __NEW_UTS_LEN) 1349 return -EINVAL; 1350 errno = -EFAULT; 1351 if (!copy_from_user(tmp, name, len)) { 1352 struct new_utsname *u; 1353 1354 down_write(&uts_sem); 1355 u = utsname(); 1356 memcpy(u->nodename, tmp, len); 1357 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1358 errno = 0; 1359 uts_proc_notify(UTS_PROC_HOSTNAME); 1360 up_write(&uts_sem); 1361 } 1362 return errno; 1363 } 1364 1365 #ifdef __ARCH_WANT_SYS_GETHOSTNAME 1366 1367 SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) 1368 { 1369 int i; 1370 struct new_utsname *u; 1371 char tmp[__NEW_UTS_LEN + 1]; 1372 1373 if (len < 0) 1374 return -EINVAL; 1375 down_read(&uts_sem); 1376 u = utsname(); 1377 i = 1 + strlen(u->nodename); 1378 if (i > len) 1379 i = len; 1380 memcpy(tmp, u->nodename, i); 1381 up_read(&uts_sem); 1382 if (copy_to_user(name, tmp, i)) 1383 return -EFAULT; 1384 return 0; 1385 } 1386 1387 #endif 1388 1389 /* 1390 * Only setdomainname; getdomainname can be implemented by calling 1391 * uname() 1392 */ 1393 SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) 1394 { 1395 int errno; 1396 char tmp[__NEW_UTS_LEN]; 1397 1398 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) 1399 return -EPERM; 1400 if (len < 0 || len > __NEW_UTS_LEN) 1401 return -EINVAL; 1402 1403 errno = -EFAULT; 1404 if (!copy_from_user(tmp, name, len)) { 1405 struct new_utsname *u; 1406 1407 down_write(&uts_sem); 1408 u = utsname(); 1409 memcpy(u->domainname, tmp, len); 1410 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1411 errno = 0; 1412 uts_proc_notify(UTS_PROC_DOMAINNAME); 1413 up_write(&uts_sem); 1414 } 1415 return errno; 1416 } 1417 1418 SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1419 { 1420 struct rlimit value; 1421 int ret; 1422 1423 ret = do_prlimit(current, resource, NULL, &value); 1424 if (!ret) 1425 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1426 1427 return ret; 1428 } 1429 1430 #ifdef CONFIG_COMPAT 1431 1432 COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, 1433 struct compat_rlimit __user *, rlim) 1434 { 1435 struct rlimit r; 1436 struct compat_rlimit r32; 1437 1438 if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) 1439 return -EFAULT; 1440 1441 if (r32.rlim_cur == COMPAT_RLIM_INFINITY) 1442 r.rlim_cur = RLIM_INFINITY; 1443 else 1444 r.rlim_cur = r32.rlim_cur; 1445 if (r32.rlim_max == COMPAT_RLIM_INFINITY) 1446 r.rlim_max = RLIM_INFINITY; 1447 else 1448 r.rlim_max = r32.rlim_max; 1449 return do_prlimit(current, resource, &r, NULL); 1450 } 1451 1452 COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, 1453 struct compat_rlimit __user *, rlim) 1454 { 1455 struct rlimit r; 1456 int ret; 1457 1458 ret = do_prlimit(current, resource, NULL, &r); 1459 if (!ret) { 1460 struct compat_rlimit r32; 1461 if (r.rlim_cur > COMPAT_RLIM_INFINITY) 1462 r32.rlim_cur = COMPAT_RLIM_INFINITY; 1463 else 1464 r32.rlim_cur = r.rlim_cur; 1465 if (r.rlim_max > COMPAT_RLIM_INFINITY) 1466 r32.rlim_max = COMPAT_RLIM_INFINITY; 1467 else 1468 r32.rlim_max = r.rlim_max; 1469 1470 if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) 1471 return -EFAULT; 1472 } 1473 return ret; 1474 } 1475 1476 #endif 1477 1478 #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1479 1480 /* 1481 * Back compatibility for getrlimit. Needed for some apps. 1482 */ 1483 SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1484 struct rlimit __user *, rlim) 1485 { 1486 struct rlimit x; 1487 if (resource >= RLIM_NLIMITS) 1488 return -EINVAL; 1489 1490 resource = array_index_nospec(resource, RLIM_NLIMITS); 1491 task_lock(current->group_leader); 1492 x = current->signal->rlim[resource]; 1493 task_unlock(current->group_leader); 1494 if (x.rlim_cur > 0x7FFFFFFF) 1495 x.rlim_cur = 0x7FFFFFFF; 1496 if (x.rlim_max > 0x7FFFFFFF) 1497 x.rlim_max = 0x7FFFFFFF; 1498 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; 1499 } 1500 1501 #ifdef CONFIG_COMPAT 1502 COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1503 struct compat_rlimit __user *, rlim) 1504 { 1505 struct rlimit r; 1506 1507 if (resource >= RLIM_NLIMITS) 1508 return -EINVAL; 1509 1510 resource = array_index_nospec(resource, RLIM_NLIMITS); 1511 task_lock(current->group_leader); 1512 r = current->signal->rlim[resource]; 1513 task_unlock(current->group_leader); 1514 if (r.rlim_cur > 0x7FFFFFFF) 1515 r.rlim_cur = 0x7FFFFFFF; 1516 if (r.rlim_max > 0x7FFFFFFF) 1517 r.rlim_max = 0x7FFFFFFF; 1518 1519 if (put_user(r.rlim_cur, &rlim->rlim_cur) || 1520 put_user(r.rlim_max, &rlim->rlim_max)) 1521 return -EFAULT; 1522 return 0; 1523 } 1524 #endif 1525 1526 #endif 1527 1528 static inline bool rlim64_is_infinity(__u64 rlim64) 1529 { 1530 #if BITS_PER_LONG < 64 1531 return rlim64 >= ULONG_MAX; 1532 #else 1533 return rlim64 == RLIM64_INFINITY; 1534 #endif 1535 } 1536 1537 static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) 1538 { 1539 if (rlim->rlim_cur == RLIM_INFINITY) 1540 rlim64->rlim_cur = RLIM64_INFINITY; 1541 else 1542 rlim64->rlim_cur = rlim->rlim_cur; 1543 if (rlim->rlim_max == RLIM_INFINITY) 1544 rlim64->rlim_max = RLIM64_INFINITY; 1545 else 1546 rlim64->rlim_max = rlim->rlim_max; 1547 } 1548 1549 static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) 1550 { 1551 if (rlim64_is_infinity(rlim64->rlim_cur)) 1552 rlim->rlim_cur = RLIM_INFINITY; 1553 else 1554 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; 1555 if (rlim64_is_infinity(rlim64->rlim_max)) 1556 rlim->rlim_max = RLIM_INFINITY; 1557 else 1558 rlim->rlim_max = (unsigned long)rlim64->rlim_max; 1559 } 1560 1561 /* make sure you are allowed to change @tsk limits before calling this */ 1562 int do_prlimit(struct task_struct *tsk, unsigned int resource, 1563 struct rlimit *new_rlim, struct rlimit *old_rlim) 1564 { 1565 struct rlimit *rlim; 1566 int retval = 0; 1567 1568 if (resource >= RLIM_NLIMITS) 1569 return -EINVAL; 1570 if (new_rlim) { 1571 if (new_rlim->rlim_cur > new_rlim->rlim_max) 1572 return -EINVAL; 1573 if (resource == RLIMIT_NOFILE && 1574 new_rlim->rlim_max > sysctl_nr_open) 1575 return -EPERM; 1576 } 1577 1578 /* protect tsk->signal and tsk->sighand from disappearing */ 1579 read_lock(&tasklist_lock); 1580 if (!tsk->sighand) { 1581 retval = -ESRCH; 1582 goto out; 1583 } 1584 1585 rlim = tsk->signal->rlim + resource; 1586 task_lock(tsk->group_leader); 1587 if (new_rlim) { 1588 /* Keep the capable check against init_user_ns until 1589 cgroups can contain all limits */ 1590 if (new_rlim->rlim_max > rlim->rlim_max && 1591 !capable(CAP_SYS_RESOURCE)) 1592 retval = -EPERM; 1593 if (!retval) 1594 retval = security_task_setrlimit(tsk, resource, new_rlim); 1595 } 1596 if (!retval) { 1597 if (old_rlim) 1598 *old_rlim = *rlim; 1599 if (new_rlim) 1600 *rlim = *new_rlim; 1601 } 1602 task_unlock(tsk->group_leader); 1603 1604 /* 1605 * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not 1606 * infinite. In case of RLIM_INFINITY the posix CPU timer code 1607 * ignores the rlimit. 1608 */ 1609 if (!retval && new_rlim && resource == RLIMIT_CPU && 1610 new_rlim->rlim_cur != RLIM_INFINITY && 1611 IS_ENABLED(CONFIG_POSIX_TIMERS)) 1612 update_rlimit_cpu(tsk, new_rlim->rlim_cur); 1613 out: 1614 read_unlock(&tasklist_lock); 1615 return retval; 1616 } 1617 1618 /* rcu lock must be held */ 1619 static int check_prlimit_permission(struct task_struct *task, 1620 unsigned int flags) 1621 { 1622 const struct cred *cred = current_cred(), *tcred; 1623 bool id_match; 1624 1625 if (current == task) 1626 return 0; 1627 1628 tcred = __task_cred(task); 1629 id_match = (uid_eq(cred->uid, tcred->euid) && 1630 uid_eq(cred->uid, tcred->suid) && 1631 uid_eq(cred->uid, tcred->uid) && 1632 gid_eq(cred->gid, tcred->egid) && 1633 gid_eq(cred->gid, tcred->sgid) && 1634 gid_eq(cred->gid, tcred->gid)); 1635 if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) 1636 return -EPERM; 1637 1638 return security_task_prlimit(cred, tcred, flags); 1639 } 1640 1641 SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1642 const struct rlimit64 __user *, new_rlim, 1643 struct rlimit64 __user *, old_rlim) 1644 { 1645 struct rlimit64 old64, new64; 1646 struct rlimit old, new; 1647 struct task_struct *tsk; 1648 unsigned int checkflags = 0; 1649 int ret; 1650 1651 if (old_rlim) 1652 checkflags |= LSM_PRLIMIT_READ; 1653 1654 if (new_rlim) { 1655 if (copy_from_user(&new64, new_rlim, sizeof(new64))) 1656 return -EFAULT; 1657 rlim64_to_rlim(&new64, &new); 1658 checkflags |= LSM_PRLIMIT_WRITE; 1659 } 1660 1661 rcu_read_lock(); 1662 tsk = pid ? find_task_by_vpid(pid) : current; 1663 if (!tsk) { 1664 rcu_read_unlock(); 1665 return -ESRCH; 1666 } 1667 ret = check_prlimit_permission(tsk, checkflags); 1668 if (ret) { 1669 rcu_read_unlock(); 1670 return ret; 1671 } 1672 get_task_struct(tsk); 1673 rcu_read_unlock(); 1674 1675 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, 1676 old_rlim ? &old : NULL); 1677 1678 if (!ret && old_rlim) { 1679 rlim_to_rlim64(&old, &old64); 1680 if (copy_to_user(old_rlim, &old64, sizeof(old64))) 1681 ret = -EFAULT; 1682 } 1683 1684 put_task_struct(tsk); 1685 return ret; 1686 } 1687 1688 SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1689 { 1690 struct rlimit new_rlim; 1691 1692 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1693 return -EFAULT; 1694 return do_prlimit(current, resource, &new_rlim, NULL); 1695 } 1696 1697 /* 1698 * It would make sense to put struct rusage in the task_struct, 1699 * except that would make the task_struct be *really big*. After 1700 * task_struct gets moved into malloc'ed memory, it would 1701 * make sense to do this. It will make moving the rest of the information 1702 * a lot simpler! (Which we're not doing right now because we're not 1703 * measuring them yet). 1704 * 1705 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1706 * races with threads incrementing their own counters. But since word 1707 * reads are atomic, we either get new values or old values and we don't 1708 * care which for the sums. We always take the siglock to protect reading 1709 * the c* fields from p->signal from races with exit.c updating those 1710 * fields when reaping, so a sample either gets all the additions of a 1711 * given child after it's reaped, or none so this sample is before reaping. 1712 * 1713 * Locking: 1714 * We need to take the siglock for CHILDEREN, SELF and BOTH 1715 * for the cases current multithreaded, non-current single threaded 1716 * non-current multithreaded. Thread traversal is now safe with 1717 * the siglock held. 1718 * Strictly speaking, we donot need to take the siglock if we are current and 1719 * single threaded, as no one else can take our signal_struct away, no one 1720 * else can reap the children to update signal->c* counters, and no one else 1721 * can race with the signal-> fields. If we do not take any lock, the 1722 * signal-> fields could be read out of order while another thread was just 1723 * exiting. So we should place a read memory barrier when we avoid the lock. 1724 * On the writer side, write memory barrier is implied in __exit_signal 1725 * as __exit_signal releases the siglock spinlock after updating the signal-> 1726 * fields. But we don't do this yet to keep things simple. 1727 * 1728 */ 1729 1730 static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) 1731 { 1732 r->ru_nvcsw += t->nvcsw; 1733 r->ru_nivcsw += t->nivcsw; 1734 r->ru_minflt += t->min_flt; 1735 r->ru_majflt += t->maj_flt; 1736 r->ru_inblock += task_io_get_inblock(t); 1737 r->ru_oublock += task_io_get_oublock(t); 1738 } 1739 1740 void getrusage(struct task_struct *p, int who, struct rusage *r) 1741 { 1742 struct task_struct *t; 1743 unsigned long flags; 1744 u64 tgutime, tgstime, utime, stime; 1745 unsigned long maxrss = 0; 1746 1747 memset((char *)r, 0, sizeof (*r)); 1748 utime = stime = 0; 1749 1750 if (who == RUSAGE_THREAD) { 1751 task_cputime_adjusted(current, &utime, &stime); 1752 accumulate_thread_rusage(p, r); 1753 maxrss = p->signal->maxrss; 1754 goto out; 1755 } 1756 1757 if (!lock_task_sighand(p, &flags)) 1758 return; 1759 1760 switch (who) { 1761 case RUSAGE_BOTH: 1762 case RUSAGE_CHILDREN: 1763 utime = p->signal->cutime; 1764 stime = p->signal->cstime; 1765 r->ru_nvcsw = p->signal->cnvcsw; 1766 r->ru_nivcsw = p->signal->cnivcsw; 1767 r->ru_minflt = p->signal->cmin_flt; 1768 r->ru_majflt = p->signal->cmaj_flt; 1769 r->ru_inblock = p->signal->cinblock; 1770 r->ru_oublock = p->signal->coublock; 1771 maxrss = p->signal->cmaxrss; 1772 1773 if (who == RUSAGE_CHILDREN) 1774 break; 1775 fallthrough; 1776 1777 case RUSAGE_SELF: 1778 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1779 utime += tgutime; 1780 stime += tgstime; 1781 r->ru_nvcsw += p->signal->nvcsw; 1782 r->ru_nivcsw += p->signal->nivcsw; 1783 r->ru_minflt += p->signal->min_flt; 1784 r->ru_majflt += p->signal->maj_flt; 1785 r->ru_inblock += p->signal->inblock; 1786 r->ru_oublock += p->signal->oublock; 1787 if (maxrss < p->signal->maxrss) 1788 maxrss = p->signal->maxrss; 1789 t = p; 1790 do { 1791 accumulate_thread_rusage(t, r); 1792 } while_each_thread(p, t); 1793 break; 1794 1795 default: 1796 BUG(); 1797 } 1798 unlock_task_sighand(p, &flags); 1799 1800 out: 1801 r->ru_utime = ns_to_kernel_old_timeval(utime); 1802 r->ru_stime = ns_to_kernel_old_timeval(stime); 1803 1804 if (who != RUSAGE_CHILDREN) { 1805 struct mm_struct *mm = get_task_mm(p); 1806 1807 if (mm) { 1808 setmax_mm_hiwater_rss(&maxrss, mm); 1809 mmput(mm); 1810 } 1811 } 1812 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ 1813 } 1814 1815 SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) 1816 { 1817 struct rusage r; 1818 1819 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1820 who != RUSAGE_THREAD) 1821 return -EINVAL; 1822 1823 getrusage(current, who, &r); 1824 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1825 } 1826 1827 #ifdef CONFIG_COMPAT 1828 COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) 1829 { 1830 struct rusage r; 1831 1832 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1833 who != RUSAGE_THREAD) 1834 return -EINVAL; 1835 1836 getrusage(current, who, &r); 1837 return put_compat_rusage(&r, ru); 1838 } 1839 #endif 1840 1841 SYSCALL_DEFINE1(umask, int, mask) 1842 { 1843 mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); 1844 return mask; 1845 } 1846 1847 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1848 { 1849 struct fd exe; 1850 struct inode *inode; 1851 int err; 1852 1853 exe = fdget(fd); 1854 if (!exe.file) 1855 return -EBADF; 1856 1857 inode = file_inode(exe.file); 1858 1859 /* 1860 * Because the original mm->exe_file points to executable file, make 1861 * sure that this one is executable as well, to avoid breaking an 1862 * overall picture. 1863 */ 1864 err = -EACCES; 1865 if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) 1866 goto exit; 1867 1868 err = file_permission(exe.file, MAY_EXEC); 1869 if (err) 1870 goto exit; 1871 1872 err = replace_mm_exe_file(mm, exe.file); 1873 exit: 1874 fdput(exe); 1875 return err; 1876 } 1877 1878 /* 1879 * Check arithmetic relations of passed addresses. 1880 * 1881 * WARNING: we don't require any capability here so be very careful 1882 * in what is allowed for modification from userspace. 1883 */ 1884 static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) 1885 { 1886 unsigned long mmap_max_addr = TASK_SIZE; 1887 int error = -EINVAL, i; 1888 1889 static const unsigned char offsets[] = { 1890 offsetof(struct prctl_mm_map, start_code), 1891 offsetof(struct prctl_mm_map, end_code), 1892 offsetof(struct prctl_mm_map, start_data), 1893 offsetof(struct prctl_mm_map, end_data), 1894 offsetof(struct prctl_mm_map, start_brk), 1895 offsetof(struct prctl_mm_map, brk), 1896 offsetof(struct prctl_mm_map, start_stack), 1897 offsetof(struct prctl_mm_map, arg_start), 1898 offsetof(struct prctl_mm_map, arg_end), 1899 offsetof(struct prctl_mm_map, env_start), 1900 offsetof(struct prctl_mm_map, env_end), 1901 }; 1902 1903 /* 1904 * Make sure the members are not somewhere outside 1905 * of allowed address space. 1906 */ 1907 for (i = 0; i < ARRAY_SIZE(offsets); i++) { 1908 u64 val = *(u64 *)((char *)prctl_map + offsets[i]); 1909 1910 if ((unsigned long)val >= mmap_max_addr || 1911 (unsigned long)val < mmap_min_addr) 1912 goto out; 1913 } 1914 1915 /* 1916 * Make sure the pairs are ordered. 1917 */ 1918 #define __prctl_check_order(__m1, __op, __m2) \ 1919 ((unsigned long)prctl_map->__m1 __op \ 1920 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL 1921 error = __prctl_check_order(start_code, <, end_code); 1922 error |= __prctl_check_order(start_data,<=, end_data); 1923 error |= __prctl_check_order(start_brk, <=, brk); 1924 error |= __prctl_check_order(arg_start, <=, arg_end); 1925 error |= __prctl_check_order(env_start, <=, env_end); 1926 if (error) 1927 goto out; 1928 #undef __prctl_check_order 1929 1930 error = -EINVAL; 1931 1932 /* 1933 * Neither we should allow to override limits if they set. 1934 */ 1935 if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, 1936 prctl_map->start_brk, prctl_map->end_data, 1937 prctl_map->start_data)) 1938 goto out; 1939 1940 error = 0; 1941 out: 1942 return error; 1943 } 1944 1945 #ifdef CONFIG_CHECKPOINT_RESTORE 1946 static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) 1947 { 1948 struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; 1949 unsigned long user_auxv[AT_VECTOR_SIZE]; 1950 struct mm_struct *mm = current->mm; 1951 int error; 1952 1953 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); 1954 BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); 1955 1956 if (opt == PR_SET_MM_MAP_SIZE) 1957 return put_user((unsigned int)sizeof(prctl_map), 1958 (unsigned int __user *)addr); 1959 1960 if (data_size != sizeof(prctl_map)) 1961 return -EINVAL; 1962 1963 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) 1964 return -EFAULT; 1965 1966 error = validate_prctl_map_addr(&prctl_map); 1967 if (error) 1968 return error; 1969 1970 if (prctl_map.auxv_size) { 1971 /* 1972 * Someone is trying to cheat the auxv vector. 1973 */ 1974 if (!prctl_map.auxv || 1975 prctl_map.auxv_size > sizeof(mm->saved_auxv)) 1976 return -EINVAL; 1977 1978 memset(user_auxv, 0, sizeof(user_auxv)); 1979 if (copy_from_user(user_auxv, 1980 (const void __user *)prctl_map.auxv, 1981 prctl_map.auxv_size)) 1982 return -EFAULT; 1983 1984 /* Last entry must be AT_NULL as specification requires */ 1985 user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; 1986 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; 1987 } 1988 1989 if (prctl_map.exe_fd != (u32)-1) { 1990 /* 1991 * Check if the current user is checkpoint/restore capable. 1992 * At the time of this writing, it checks for CAP_SYS_ADMIN 1993 * or CAP_CHECKPOINT_RESTORE. 1994 * Note that a user with access to ptrace can masquerade an 1995 * arbitrary program as any executable, even setuid ones. 1996 * This may have implications in the tomoyo subsystem. 1997 */ 1998 if (!checkpoint_restore_ns_capable(current_user_ns())) 1999 return -EPERM; 2000 2001 error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); 2002 if (error) 2003 return error; 2004 } 2005 2006 /* 2007 * arg_lock protects concurrent updates but we still need mmap_lock for 2008 * read to exclude races with sys_brk. 2009 */ 2010 mmap_read_lock(mm); 2011 2012 /* 2013 * We don't validate if these members are pointing to 2014 * real present VMAs because application may have correspond 2015 * VMAs already unmapped and kernel uses these members for statistics 2016 * output in procfs mostly, except 2017 * 2018 * - @start_brk/@brk which are used in do_brk_flags but kernel lookups 2019 * for VMAs when updating these members so anything wrong written 2020 * here cause kernel to swear at userspace program but won't lead 2021 * to any problem in kernel itself 2022 */ 2023 2024 spin_lock(&mm->arg_lock); 2025 mm->start_code = prctl_map.start_code; 2026 mm->end_code = prctl_map.end_code; 2027 mm->start_data = prctl_map.start_data; 2028 mm->end_data = prctl_map.end_data; 2029 mm->start_brk = prctl_map.start_brk; 2030 mm->brk = prctl_map.brk; 2031 mm->start_stack = prctl_map.start_stack; 2032 mm->arg_start = prctl_map.arg_start; 2033 mm->arg_end = prctl_map.arg_end; 2034 mm->env_start = prctl_map.env_start; 2035 mm->env_end = prctl_map.env_end; 2036 spin_unlock(&mm->arg_lock); 2037 2038 /* 2039 * Note this update of @saved_auxv is lockless thus 2040 * if someone reads this member in procfs while we're 2041 * updating -- it may get partly updated results. It's 2042 * known and acceptable trade off: we leave it as is to 2043 * not introduce additional locks here making the kernel 2044 * more complex. 2045 */ 2046 if (prctl_map.auxv_size) 2047 memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); 2048 2049 mmap_read_unlock(mm); 2050 return 0; 2051 } 2052 #endif /* CONFIG_CHECKPOINT_RESTORE */ 2053 2054 static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, 2055 unsigned long len) 2056 { 2057 /* 2058 * This doesn't move the auxiliary vector itself since it's pinned to 2059 * mm_struct, but it permits filling the vector with new values. It's 2060 * up to the caller to provide sane values here, otherwise userspace 2061 * tools which use this vector might be unhappy. 2062 */ 2063 unsigned long user_auxv[AT_VECTOR_SIZE] = {}; 2064 2065 if (len > sizeof(user_auxv)) 2066 return -EINVAL; 2067 2068 if (copy_from_user(user_auxv, (const void __user *)addr, len)) 2069 return -EFAULT; 2070 2071 /* Make sure the last entry is always AT_NULL */ 2072 user_auxv[AT_VECTOR_SIZE - 2] = 0; 2073 user_auxv[AT_VECTOR_SIZE - 1] = 0; 2074 2075 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); 2076 2077 task_lock(current); 2078 memcpy(mm->saved_auxv, user_auxv, len); 2079 task_unlock(current); 2080 2081 return 0; 2082 } 2083 2084 static int prctl_set_mm(int opt, unsigned long addr, 2085 unsigned long arg4, unsigned long arg5) 2086 { 2087 struct mm_struct *mm = current->mm; 2088 struct prctl_mm_map prctl_map = { 2089 .auxv = NULL, 2090 .auxv_size = 0, 2091 .exe_fd = -1, 2092 }; 2093 struct vm_area_struct *vma; 2094 int error; 2095 2096 if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && 2097 opt != PR_SET_MM_MAP && 2098 opt != PR_SET_MM_MAP_SIZE))) 2099 return -EINVAL; 2100 2101 #ifdef CONFIG_CHECKPOINT_RESTORE 2102 if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) 2103 return prctl_set_mm_map(opt, (const void __user *)addr, arg4); 2104 #endif 2105 2106 if (!capable(CAP_SYS_RESOURCE)) 2107 return -EPERM; 2108 2109 if (opt == PR_SET_MM_EXE_FILE) 2110 return prctl_set_mm_exe_file(mm, (unsigned int)addr); 2111 2112 if (opt == PR_SET_MM_AUXV) 2113 return prctl_set_auxv(mm, addr, arg4); 2114 2115 if (addr >= TASK_SIZE || addr < mmap_min_addr) 2116 return -EINVAL; 2117 2118 error = -EINVAL; 2119 2120 /* 2121 * arg_lock protects concurrent updates of arg boundaries, we need 2122 * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr 2123 * validation. 2124 */ 2125 mmap_read_lock(mm); 2126 vma = find_vma(mm, addr); 2127 2128 spin_lock(&mm->arg_lock); 2129 prctl_map.start_code = mm->start_code; 2130 prctl_map.end_code = mm->end_code; 2131 prctl_map.start_data = mm->start_data; 2132 prctl_map.end_data = mm->end_data; 2133 prctl_map.start_brk = mm->start_brk; 2134 prctl_map.brk = mm->brk; 2135 prctl_map.start_stack = mm->start_stack; 2136 prctl_map.arg_start = mm->arg_start; 2137 prctl_map.arg_end = mm->arg_end; 2138 prctl_map.env_start = mm->env_start; 2139 prctl_map.env_end = mm->env_end; 2140 2141 switch (opt) { 2142 case PR_SET_MM_START_CODE: 2143 prctl_map.start_code = addr; 2144 break; 2145 case PR_SET_MM_END_CODE: 2146 prctl_map.end_code = addr; 2147 break; 2148 case PR_SET_MM_START_DATA: 2149 prctl_map.start_data = addr; 2150 break; 2151 case PR_SET_MM_END_DATA: 2152 prctl_map.end_data = addr; 2153 break; 2154 case PR_SET_MM_START_STACK: 2155 prctl_map.start_stack = addr; 2156 break; 2157 case PR_SET_MM_START_BRK: 2158 prctl_map.start_brk = addr; 2159 break; 2160 case PR_SET_MM_BRK: 2161 prctl_map.brk = addr; 2162 break; 2163 case PR_SET_MM_ARG_START: 2164 prctl_map.arg_start = addr; 2165 break; 2166 case PR_SET_MM_ARG_END: 2167 prctl_map.arg_end = addr; 2168 break; 2169 case PR_SET_MM_ENV_START: 2170 prctl_map.env_start = addr; 2171 break; 2172 case PR_SET_MM_ENV_END: 2173 prctl_map.env_end = addr; 2174 break; 2175 default: 2176 goto out; 2177 } 2178 2179 error = validate_prctl_map_addr(&prctl_map); 2180 if (error) 2181 goto out; 2182 2183 switch (opt) { 2184 /* 2185 * If command line arguments and environment 2186 * are placed somewhere else on stack, we can 2187 * set them up here, ARG_START/END to setup 2188 * command line arguments and ENV_START/END 2189 * for environment. 2190 */ 2191 case PR_SET_MM_START_STACK: 2192 case PR_SET_MM_ARG_START: 2193 case PR_SET_MM_ARG_END: 2194 case PR_SET_MM_ENV_START: 2195 case PR_SET_MM_ENV_END: 2196 if (!vma) { 2197 error = -EFAULT; 2198 goto out; 2199 } 2200 } 2201 2202 mm->start_code = prctl_map.start_code; 2203 mm->end_code = prctl_map.end_code; 2204 mm->start_data = prctl_map.start_data; 2205 mm->end_data = prctl_map.end_data; 2206 mm->start_brk = prctl_map.start_brk; 2207 mm->brk = prctl_map.brk; 2208 mm->start_stack = prctl_map.start_stack; 2209 mm->arg_start = prctl_map.arg_start; 2210 mm->arg_end = prctl_map.arg_end; 2211 mm->env_start = prctl_map.env_start; 2212 mm->env_end = prctl_map.env_end; 2213 2214 error = 0; 2215 out: 2216 spin_unlock(&mm->arg_lock); 2217 mmap_read_unlock(mm); 2218 return error; 2219 } 2220 2221 #ifdef CONFIG_CHECKPOINT_RESTORE 2222 static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) 2223 { 2224 return put_user(me->clear_child_tid, tid_addr); 2225 } 2226 #else 2227 static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) 2228 { 2229 return -EINVAL; 2230 } 2231 #endif 2232 2233 static int propagate_has_child_subreaper(struct task_struct *p, void *data) 2234 { 2235 /* 2236 * If task has has_child_subreaper - all its descendants 2237 * already have these flag too and new descendants will 2238 * inherit it on fork, skip them. 2239 * 2240 * If we've found child_reaper - skip descendants in 2241 * it's subtree as they will never get out pidns. 2242 */ 2243 if (p->signal->has_child_subreaper || 2244 is_child_reaper(task_pid(p))) 2245 return 0; 2246 2247 p->signal->has_child_subreaper = 1; 2248 return 1; 2249 } 2250 2251 int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which) 2252 { 2253 return -EINVAL; 2254 } 2255 2256 int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, 2257 unsigned long ctrl) 2258 { 2259 return -EINVAL; 2260 } 2261 2262 #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) 2263 2264 #ifdef CONFIG_ANON_VMA_NAME 2265 2266 #define ANON_VMA_NAME_MAX_LEN 80 2267 #define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" 2268 2269 static inline bool is_valid_name_char(char ch) 2270 { 2271 /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ 2272 return ch > 0x1f && ch < 0x7f && 2273 !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); 2274 } 2275 2276 static int prctl_set_vma(unsigned long opt, unsigned long addr, 2277 unsigned long size, unsigned long arg) 2278 { 2279 struct mm_struct *mm = current->mm; 2280 const char __user *uname; 2281 char *name, *pch; 2282 int error; 2283 2284 switch (opt) { 2285 case PR_SET_VMA_ANON_NAME: 2286 uname = (const char __user *)arg; 2287 if (uname) { 2288 name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); 2289 2290 if (IS_ERR(name)) 2291 return PTR_ERR(name); 2292 2293 for (pch = name; *pch != '\0'; pch++) { 2294 if (!is_valid_name_char(*pch)) { 2295 kfree(name); 2296 return -EINVAL; 2297 } 2298 } 2299 } else { 2300 /* Reset the name */ 2301 name = NULL; 2302 } 2303 2304 mmap_write_lock(mm); 2305 error = madvise_set_anon_name(mm, addr, size, name); 2306 mmap_write_unlock(mm); 2307 kfree(name); 2308 break; 2309 default: 2310 error = -EINVAL; 2311 } 2312 2313 return error; 2314 } 2315 2316 #else /* CONFIG_ANON_VMA_NAME */ 2317 static int prctl_set_vma(unsigned long opt, unsigned long start, 2318 unsigned long size, unsigned long arg) 2319 { 2320 return -EINVAL; 2321 } 2322 #endif /* CONFIG_ANON_VMA_NAME */ 2323 2324 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 2325 unsigned long, arg4, unsigned long, arg5) 2326 { 2327 struct task_struct *me = current; 2328 unsigned char comm[sizeof(me->comm)]; 2329 long error; 2330 2331 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 2332 if (error != -ENOSYS) 2333 return error; 2334 2335 error = 0; 2336 switch (option) { 2337 case PR_SET_PDEATHSIG: 2338 if (!valid_signal(arg2)) { 2339 error = -EINVAL; 2340 break; 2341 } 2342 me->pdeath_signal = arg2; 2343 break; 2344 case PR_GET_PDEATHSIG: 2345 error = put_user(me->pdeath_signal, (int __user *)arg2); 2346 break; 2347 case PR_GET_DUMPABLE: 2348 error = get_dumpable(me->mm); 2349 break; 2350 case PR_SET_DUMPABLE: 2351 if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { 2352 error = -EINVAL; 2353 break; 2354 } 2355 set_dumpable(me->mm, arg2); 2356 break; 2357 2358 case PR_SET_UNALIGN: 2359 error = SET_UNALIGN_CTL(me, arg2); 2360 break; 2361 case PR_GET_UNALIGN: 2362 error = GET_UNALIGN_CTL(me, arg2); 2363 break; 2364 case PR_SET_FPEMU: 2365 error = SET_FPEMU_CTL(me, arg2); 2366 break; 2367 case PR_GET_FPEMU: 2368 error = GET_FPEMU_CTL(me, arg2); 2369 break; 2370 case PR_SET_FPEXC: 2371 error = SET_FPEXC_CTL(me, arg2); 2372 break; 2373 case PR_GET_FPEXC: 2374 error = GET_FPEXC_CTL(me, arg2); 2375 break; 2376 case PR_GET_TIMING: 2377 error = PR_TIMING_STATISTICAL; 2378 break; 2379 case PR_SET_TIMING: 2380 if (arg2 != PR_TIMING_STATISTICAL) 2381 error = -EINVAL; 2382 break; 2383 case PR_SET_NAME: 2384 comm[sizeof(me->comm) - 1] = 0; 2385 if (strncpy_from_user(comm, (char __user *)arg2, 2386 sizeof(me->comm) - 1) < 0) 2387 return -EFAULT; 2388 set_task_comm(me, comm); 2389 proc_comm_connector(me); 2390 break; 2391 case PR_GET_NAME: 2392 get_task_comm(comm, me); 2393 if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) 2394 return -EFAULT; 2395 break; 2396 case PR_GET_ENDIAN: 2397 error = GET_ENDIAN(me, arg2); 2398 break; 2399 case PR_SET_ENDIAN: 2400 error = SET_ENDIAN(me, arg2); 2401 break; 2402 case PR_GET_SECCOMP: 2403 error = prctl_get_seccomp(); 2404 break; 2405 case PR_SET_SECCOMP: 2406 error = prctl_set_seccomp(arg2, (char __user *)arg3); 2407 break; 2408 case PR_GET_TSC: 2409 error = GET_TSC_CTL(arg2); 2410 break; 2411 case PR_SET_TSC: 2412 error = SET_TSC_CTL(arg2); 2413 break; 2414 case PR_TASK_PERF_EVENTS_DISABLE: 2415 error = perf_event_task_disable(); 2416 break; 2417 case PR_TASK_PERF_EVENTS_ENABLE: 2418 error = perf_event_task_enable(); 2419 break; 2420 case PR_GET_TIMERSLACK: 2421 if (current->timer_slack_ns > ULONG_MAX) 2422 error = ULONG_MAX; 2423 else 2424 error = current->timer_slack_ns; 2425 break; 2426 case PR_SET_TIMERSLACK: 2427 if (arg2 <= 0) 2428 current->timer_slack_ns = 2429 current->default_timer_slack_ns; 2430 else 2431 current->timer_slack_ns = arg2; 2432 break; 2433 case PR_MCE_KILL: 2434 if (arg4 | arg5) 2435 return -EINVAL; 2436 switch (arg2) { 2437 case PR_MCE_KILL_CLEAR: 2438 if (arg3 != 0) 2439 return -EINVAL; 2440 current->flags &= ~PF_MCE_PROCESS; 2441 break; 2442 case PR_MCE_KILL_SET: 2443 current->flags |= PF_MCE_PROCESS; 2444 if (arg3 == PR_MCE_KILL_EARLY) 2445 current->flags |= PF_MCE_EARLY; 2446 else if (arg3 == PR_MCE_KILL_LATE) 2447 current->flags &= ~PF_MCE_EARLY; 2448 else if (arg3 == PR_MCE_KILL_DEFAULT) 2449 current->flags &= 2450 ~(PF_MCE_EARLY|PF_MCE_PROCESS); 2451 else 2452 return -EINVAL; 2453 break; 2454 default: 2455 return -EINVAL; 2456 } 2457 break; 2458 case PR_MCE_KILL_GET: 2459 if (arg2 | arg3 | arg4 | arg5) 2460 return -EINVAL; 2461 if (current->flags & PF_MCE_PROCESS) 2462 error = (current->flags & PF_MCE_EARLY) ? 2463 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; 2464 else 2465 error = PR_MCE_KILL_DEFAULT; 2466 break; 2467 case PR_SET_MM: 2468 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2469 break; 2470 case PR_GET_TID_ADDRESS: 2471 error = prctl_get_tid_address(me, (int __user * __user *)arg2); 2472 break; 2473 case PR_SET_CHILD_SUBREAPER: 2474 me->signal->is_child_subreaper = !!arg2; 2475 if (!arg2) 2476 break; 2477 2478 walk_process_tree(me, propagate_has_child_subreaper, NULL); 2479 break; 2480 case PR_GET_CHILD_SUBREAPER: 2481 error = put_user(me->signal->is_child_subreaper, 2482 (int __user *)arg2); 2483 break; 2484 case PR_SET_NO_NEW_PRIVS: 2485 if (arg2 != 1 || arg3 || arg4 || arg5) 2486 return -EINVAL; 2487 2488 task_set_no_new_privs(current); 2489 break; 2490 case PR_GET_NO_NEW_PRIVS: 2491 if (arg2 || arg3 || arg4 || arg5) 2492 return -EINVAL; 2493 return task_no_new_privs(current) ? 1 : 0; 2494 case PR_GET_THP_DISABLE: 2495 if (arg2 || arg3 || arg4 || arg5) 2496 return -EINVAL; 2497 error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); 2498 break; 2499 case PR_SET_THP_DISABLE: 2500 if (arg3 || arg4 || arg5) 2501 return -EINVAL; 2502 if (mmap_write_lock_killable(me->mm)) 2503 return -EINTR; 2504 if (arg2) 2505 set_bit(MMF_DISABLE_THP, &me->mm->flags); 2506 else 2507 clear_bit(MMF_DISABLE_THP, &me->mm->flags); 2508 mmap_write_unlock(me->mm); 2509 break; 2510 case PR_MPX_ENABLE_MANAGEMENT: 2511 case PR_MPX_DISABLE_MANAGEMENT: 2512 /* No longer implemented: */ 2513 return -EINVAL; 2514 case PR_SET_FP_MODE: 2515 error = SET_FP_MODE(me, arg2); 2516 break; 2517 case PR_GET_FP_MODE: 2518 error = GET_FP_MODE(me); 2519 break; 2520 case PR_SVE_SET_VL: 2521 error = SVE_SET_VL(arg2); 2522 break; 2523 case PR_SVE_GET_VL: 2524 error = SVE_GET_VL(); 2525 break; 2526 case PR_GET_SPECULATION_CTRL: 2527 if (arg3 || arg4 || arg5) 2528 return -EINVAL; 2529 error = arch_prctl_spec_ctrl_get(me, arg2); 2530 break; 2531 case PR_SET_SPECULATION_CTRL: 2532 if (arg4 || arg5) 2533 return -EINVAL; 2534 error = arch_prctl_spec_ctrl_set(me, arg2, arg3); 2535 break; 2536 case PR_PAC_RESET_KEYS: 2537 if (arg3 || arg4 || arg5) 2538 return -EINVAL; 2539 error = PAC_RESET_KEYS(me, arg2); 2540 break; 2541 case PR_PAC_SET_ENABLED_KEYS: 2542 if (arg4 || arg5) 2543 return -EINVAL; 2544 error = PAC_SET_ENABLED_KEYS(me, arg2, arg3); 2545 break; 2546 case PR_PAC_GET_ENABLED_KEYS: 2547 if (arg2 || arg3 || arg4 || arg5) 2548 return -EINVAL; 2549 error = PAC_GET_ENABLED_KEYS(me); 2550 break; 2551 case PR_SET_TAGGED_ADDR_CTRL: 2552 if (arg3 || arg4 || arg5) 2553 return -EINVAL; 2554 error = SET_TAGGED_ADDR_CTRL(arg2); 2555 break; 2556 case PR_GET_TAGGED_ADDR_CTRL: 2557 if (arg2 || arg3 || arg4 || arg5) 2558 return -EINVAL; 2559 error = GET_TAGGED_ADDR_CTRL(); 2560 break; 2561 case PR_SET_IO_FLUSHER: 2562 if (!capable(CAP_SYS_RESOURCE)) 2563 return -EPERM; 2564 2565 if (arg3 || arg4 || arg5) 2566 return -EINVAL; 2567 2568 if (arg2 == 1) 2569 current->flags |= PR_IO_FLUSHER; 2570 else if (!arg2) 2571 current->flags &= ~PR_IO_FLUSHER; 2572 else 2573 return -EINVAL; 2574 break; 2575 case PR_GET_IO_FLUSHER: 2576 if (!capable(CAP_SYS_RESOURCE)) 2577 return -EPERM; 2578 2579 if (arg2 || arg3 || arg4 || arg5) 2580 return -EINVAL; 2581 2582 error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; 2583 break; 2584 case PR_SET_SYSCALL_USER_DISPATCH: 2585 error = set_syscall_user_dispatch(arg2, arg3, arg4, 2586 (char __user *) arg5); 2587 break; 2588 #ifdef CONFIG_SCHED_CORE 2589 case PR_SCHED_CORE: 2590 error = sched_core_share_pid(arg2, arg3, arg4, arg5); 2591 break; 2592 #endif 2593 case PR_SET_VMA: 2594 error = prctl_set_vma(arg2, arg3, arg4, arg5); 2595 break; 2596 default: 2597 error = -EINVAL; 2598 break; 2599 } 2600 return error; 2601 } 2602 2603 SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, 2604 struct getcpu_cache __user *, unused) 2605 { 2606 int err = 0; 2607 int cpu = raw_smp_processor_id(); 2608 2609 if (cpup) 2610 err |= put_user(cpu, cpup); 2611 if (nodep) 2612 err |= put_user(cpu_to_node(cpu), nodep); 2613 return err ? -EFAULT : 0; 2614 } 2615 2616 /** 2617 * do_sysinfo - fill in sysinfo struct 2618 * @info: pointer to buffer to fill 2619 */ 2620 static int do_sysinfo(struct sysinfo *info) 2621 { 2622 unsigned long mem_total, sav_total; 2623 unsigned int mem_unit, bitcount; 2624 struct timespec64 tp; 2625 2626 memset(info, 0, sizeof(struct sysinfo)); 2627 2628 ktime_get_boottime_ts64(&tp); 2629 timens_add_boottime(&tp); 2630 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); 2631 2632 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); 2633 2634 info->procs = nr_threads; 2635 2636 si_meminfo(info); 2637 si_swapinfo(info); 2638 2639 /* 2640 * If the sum of all the available memory (i.e. ram + swap) 2641 * is less than can be stored in a 32 bit unsigned long then 2642 * we can be binary compatible with 2.2.x kernels. If not, 2643 * well, in that case 2.2.x was broken anyways... 2644 * 2645 * -Erik Andersen <andersee@debian.org> 2646 */ 2647 2648 mem_total = info->totalram + info->totalswap; 2649 if (mem_total < info->totalram || mem_total < info->totalswap) 2650 goto out; 2651 bitcount = 0; 2652 mem_unit = info->mem_unit; 2653 while (mem_unit > 1) { 2654 bitcount++; 2655 mem_unit >>= 1; 2656 sav_total = mem_total; 2657 mem_total <<= 1; 2658 if (mem_total < sav_total) 2659 goto out; 2660 } 2661 2662 /* 2663 * If mem_total did not overflow, multiply all memory values by 2664 * info->mem_unit and set it to 1. This leaves things compatible 2665 * with 2.2.x, and also retains compatibility with earlier 2.4.x 2666 * kernels... 2667 */ 2668 2669 info->mem_unit = 1; 2670 info->totalram <<= bitcount; 2671 info->freeram <<= bitcount; 2672 info->sharedram <<= bitcount; 2673 info->bufferram <<= bitcount; 2674 info->totalswap <<= bitcount; 2675 info->freeswap <<= bitcount; 2676 info->totalhigh <<= bitcount; 2677 info->freehigh <<= bitcount; 2678 2679 out: 2680 return 0; 2681 } 2682 2683 SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) 2684 { 2685 struct sysinfo val; 2686 2687 do_sysinfo(&val); 2688 2689 if (copy_to_user(info, &val, sizeof(struct sysinfo))) 2690 return -EFAULT; 2691 2692 return 0; 2693 } 2694 2695 #ifdef CONFIG_COMPAT 2696 struct compat_sysinfo { 2697 s32 uptime; 2698 u32 loads[3]; 2699 u32 totalram; 2700 u32 freeram; 2701 u32 sharedram; 2702 u32 bufferram; 2703 u32 totalswap; 2704 u32 freeswap; 2705 u16 procs; 2706 u16 pad; 2707 u32 totalhigh; 2708 u32 freehigh; 2709 u32 mem_unit; 2710 char _f[20-2*sizeof(u32)-sizeof(int)]; 2711 }; 2712 2713 COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) 2714 { 2715 struct sysinfo s; 2716 struct compat_sysinfo s_32; 2717 2718 do_sysinfo(&s); 2719 2720 /* Check to see if any memory value is too large for 32-bit and scale 2721 * down if needed 2722 */ 2723 if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { 2724 int bitcount = 0; 2725 2726 while (s.mem_unit < PAGE_SIZE) { 2727 s.mem_unit <<= 1; 2728 bitcount++; 2729 } 2730 2731 s.totalram >>= bitcount; 2732 s.freeram >>= bitcount; 2733 s.sharedram >>= bitcount; 2734 s.bufferram >>= bitcount; 2735 s.totalswap >>= bitcount; 2736 s.freeswap >>= bitcount; 2737 s.totalhigh >>= bitcount; 2738 s.freehigh >>= bitcount; 2739 } 2740 2741 memset(&s_32, 0, sizeof(s_32)); 2742 s_32.uptime = s.uptime; 2743 s_32.loads[0] = s.loads[0]; 2744 s_32.loads[1] = s.loads[1]; 2745 s_32.loads[2] = s.loads[2]; 2746 s_32.totalram = s.totalram; 2747 s_32.freeram = s.freeram; 2748 s_32.sharedram = s.sharedram; 2749 s_32.bufferram = s.bufferram; 2750 s_32.totalswap = s.totalswap; 2751 s_32.freeswap = s.freeswap; 2752 s_32.procs = s.procs; 2753 s_32.totalhigh = s.totalhigh; 2754 s_32.freehigh = s.freehigh; 2755 s_32.mem_unit = s.mem_unit; 2756 if (copy_to_user(info, &s_32, sizeof(s_32))) 2757 return -EFAULT; 2758 return 0; 2759 } 2760 #endif /* CONFIG_COMPAT */ 2761