/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/security.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>

static void exit_mm(struct task_struct * tsk);

static void __unhash_process(struct task_struct *p, bool group_dead)
{
	nr_threads--;
	detach_pid(p, PIDTYPE_PID);
	if (group_dead) {
		detach_pid(p, PIDTYPE_PGID);
		detach_pid(p, PIDTYPE_SID);

		list_del_rcu(&p->tasks);
		list_del_init(&p->sibling);
		__this_cpu_dec(process_counts);
	}
	list_del_rcu(&p->thread_group);
}

/*
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	bool group_dead = thread_group_leader(tsk);
	struct sighand_struct *sighand;
	struct tty_struct *uninitialized_var(tty);
	cputime_t utime, stime;

	sighand = rcu_dereference_check(tsk->sighand,
					lockdep_tasklist_lock_is_held());
	spin_lock(&sighand->siglock);

	posix_cpu_timers_exit(tsk);
	if (group_dead) {
		posix_cpu_timers_exit_group(tsk);
		tty = sig->tty;
		sig->tty = NULL;
	} else {
		/*
		 * This can only happen if the caller is de_thread().
		 * FIXME: this is a temporary hack, we should teach
		 * posix-cpu-timers to handle this case correctly.
		 */
		if (unlikely(has_group_leader_pid(tsk)))
			posix_cpu_timers_exit_group(tsk);

		/*
		 * If there is any task waiting for the group exit
		 * then notify it:
		 */
		if (sig->notify_count > 0 && !--sig->notify_count)
			wake_up_process(sig->group_exit_task);

		if (tsk == sig->curr_target)
			sig->curr_target = next_thread(tsk);
		/*
		 * Accumulate here the counters for all threads but the
		 * group leader as they die, so they can be added into
		 * the process-wide totals when those are taken.  The
		 * group leader stays around as a zombie as long as there
		 * are other threads.
		 * When it gets reaped, the exit.c code will add its
		 * counts into these totals.  We won't ever get here for
		 * the group leader, since it will have been the last
		 * reference on the signal_struct.
		 */
		task_cputime(tsk, &utime, &stime);
		sig->utime += utime;
		sig->stime += stime;
		sig->gtime += task_gtime(tsk);
		sig->min_flt += tsk->min_flt;
		sig->maj_flt += tsk->maj_flt;
		sig->nvcsw += tsk->nvcsw;
		sig->nivcsw += tsk->nivcsw;
		sig->inblock += task_io_get_inblock(tsk);
		sig->oublock += task_io_get_oublock(tsk);
		task_io_accounting_add(&sig->ioac, &tsk->ioac);
		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
	}

	sig->nr_threads--;
	__unhash_process(tsk, group_dead);

	/*
	 * Do this under ->siglock, we can race with another thread
	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
	 */
	flush_sigqueue(&tsk->pending);
	tsk->sighand = NULL;
	spin_unlock(&sighand->siglock);

	__cleanup_sighand(sighand);
	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
	if (group_dead) {
		flush_sigqueue(&sig->shared_pending);
		tty_kref_put(tty);
	}
}

static void delayed_put_task_struct(struct rcu_head *rhp)
{
	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

	perf_event_delayed_put(tsk);
	trace_sched_process_free(tsk);
	put_task_struct(tsk);
}


void release_task(struct task_struct * p)
{
	struct task_struct *leader;
	int zap_leader;
repeat:
	/* don't need to get the RCU readlock here - the process is dead and
	 * can't be modifying its own credentials. But shut RCU-lockdep up */
	rcu_read_lock();
	atomic_dec(&__task_cred(p)->user->processes);
	rcu_read_unlock();

	proc_flush_task(p);

	write_lock_irq(&tasklist_lock);
	ptrace_release_task(p);
	__exit_signal(p);

	/*
	 * If we are the last non-leader member of the thread
	 * group, and the leader is zombie, then notify the
	 * group leader's parent process. (if it wants notification.)
	 */
	zap_leader = 0;
	leader = p->group_leader;
	if (leader != p && thread_group_empty(leader) &&
	    leader->exit_state == EXIT_ZOMBIE) {
		/*
		 * If we were the last child thread and the leader has
		 * exited already, and the leader's parent ignores SIGCHLD,
		 * then we are the one who should release the leader.
		 */
		zap_leader = do_notify_parent(leader, leader->exit_signal);
		if (zap_leader)
			leader->exit_state = EXIT_DEAD;
	}

	write_unlock_irq(&tasklist_lock);
	release_thread(p);
	call_rcu(&p->rcu, delayed_put_task_struct);

	p = leader;
	if (unlikely(zap_leader))
		goto repeat;
}

/*
 * This checks not only the pgrp, but falls back on the pid if no
 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
 * without this...
 *
 * The caller must hold rcu lock or the tasklist lock.
 */
struct pid *session_of_pgrp(struct pid *pgrp)
{
	struct task_struct *p;
	struct pid *sid = NULL;

	p = pid_task(pgrp, PIDTYPE_PGID);
	if (p == NULL)
		p = pid_task(pgrp, PIDTYPE_PID);
	if (p != NULL)
		sid = task_session(p);

	return sid;
}

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 */
static int will_become_orphaned_pgrp(struct pid *pgrp,
					struct task_struct *ignored_task)
{
	struct task_struct *p;

	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		if ((p == ignored_task) ||
		    (p->exit_state && thread_group_empty(p)) ||
		    is_global_init(p->real_parent))
			continue;

		if (task_pgrp(p->real_parent) != pgrp &&
		    task_session(p->real_parent) == task_session(p))
			return 0;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

	return 1;
}

int is_current_pgrp_orphaned(void)
{
	int retval;

	read_lock(&tasklist_lock);
	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
	read_unlock(&tasklist_lock);

	return retval;
}

static bool has_stopped_jobs(struct pid *pgrp)
{
	struct task_struct *p;

	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		if (p->signal->flags & SIGNAL_STOP_STOPPED)
			return true;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

	return false;
}

/*
 * Check to see if any process groups have become orphaned as
 * a result of our exiting, and if they have any stopped jobs,
 * send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
	struct pid *pgrp = task_pgrp(tsk);
	struct task_struct *ignored_task = tsk;

	if (!parent)
		/* exit: our father is in a different pgrp than
		 * we are and we were the only connection outside.
		 */
		parent = tsk->real_parent;
	else
		/* reparent: our child is in a different pgrp than
		 * we are, and it was the only connection outside.
		 */
		ignored_task = NULL;

	if (task_pgrp(parent) != pgrp &&
	    task_session(parent) == task_session(tsk) &&
	    will_become_orphaned_pgrp(pgrp, ignored_task) &&
	    has_stopped_jobs(pgrp)) {
		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
	}
}

/*
 * Let kernel threads use this to say that they allow a certain signal.
 * Must not be used if kthread was cloned with CLONE_SIGHAND.
 */
int allow_signal(int sig)
{
	if (!valid_signal(sig) || sig < 1)
		return -EINVAL;

	spin_lock_irq(&current->sighand->siglock);
	/* This is only needed for daemonize()'ed kthreads */
	sigdelset(&current->blocked, sig);
	/*
	 * Kernel threads handle their own signals. Let the signal code
	 * know it'll be handled, so that they don't get converted to
	 * SIGKILL or just silently dropped.
	 */
	current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
	return 0;
}

EXPORT_SYMBOL(allow_signal);

int disallow_signal(int sig)
{
	if (!valid_signal(sig) || sig < 1)
		return -EINVAL;

	spin_lock_irq(&current->sighand->siglock);
	current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
	return 0;
}

EXPORT_SYMBOL(disallow_signal);
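/*
 * Illustrative sketch (not part of the original file): a kernel thread
 * that opts in to SIGTERM with allow_signal() above and then polls for
 * it.  The thread function and its surrounding setup are hypothetical;
 * only the signal-related calls are the point here.
 *
 *	static int my_kthread_fn(void *data)
 *	{
 *		allow_signal(SIGTERM);
 *		while (!kthread_should_stop()) {
 *			if (signal_pending(current)) {
 *				// SIGTERM was queued for us rather than
 *				// being dropped or upgraded to SIGKILL
 *				flush_signals(current);
 *			}
 *			schedule_timeout_interruptible(HZ);
 *		}
 *		return 0;
 *	}
 */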
#ifdef CONFIG_MM_OWNER
/*
 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
	struct task_struct *c, *g, *p = current;

retry:
	/*
	 * If the exiting or execing task is not the owner, it's
	 * someone else's problem.
	 */
	if (mm->owner != p)
		return;
	/*
	 * The current owner is exiting/execing and there are no other
	 * candidates.  Do not leave the mm pointing to a possibly
	 * freed task structure.
	 */
	if (atomic_read(&mm->mm_users) <= 1) {
		mm->owner = NULL;
		return;
	}

	read_lock(&tasklist_lock);
	/*
	 * Search in the children
	 */
	list_for_each_entry(c, &p->children, sibling) {
		if (c->mm == mm)
			goto assign_new_owner;
	}

	/*
	 * Search in the siblings
	 */
	list_for_each_entry(c, &p->real_parent->children, sibling) {
		if (c->mm == mm)
			goto assign_new_owner;
	}

	/*
	 * Search through everything else. We should not get
	 * here often
	 */
	do_each_thread(g, c) {
		if (c->mm == mm)
			goto assign_new_owner;
	} while_each_thread(g, c);

	read_unlock(&tasklist_lock);
	/*
	 * We found no owner yet mm_users > 1: this implies that we are
	 * most likely racing with swapoff (try_to_unuse()) or /proc or
	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
	 */
	mm->owner = NULL;
	return;

assign_new_owner:
	BUG_ON(c == p);
	get_task_struct(c);
	/*
	 * The task_lock protects c->mm from changing.
	 * We always want mm->owner->mm == mm
	 */
	task_lock(c);
	/*
	 * Delay read_unlock() till we have the task_lock()
	 * to ensure that c does not slip away underneath us
	 */
	read_unlock(&tasklist_lock);
	if (c->mm != mm) {
		task_unlock(c);
		put_task_struct(c);
		goto retry;
	}
	mm->owner = c;
	task_unlock(c);
	put_task_struct(c);
}
#endif /* CONFIG_MM_OWNER */

/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 */
static void exit_mm(struct task_struct * tsk)
{
	struct mm_struct *mm = tsk->mm;
	struct core_state *core_state;

	mm_release(tsk, mm);
	if (!mm)
		return;
	sync_mm_rss(mm);
	/*
	 * Serialize with any possible pending coredump.
	 * We must hold mmap_sem around checking core_state
	 * and clearing tsk->mm.  The core-inducing thread
	 * will increment ->nr_threads for each thread in the
	 * group with ->mm != NULL.
	 */
	down_read(&mm->mmap_sem);
	core_state = mm->core_state;
	if (core_state) {
		struct core_thread self;
		up_read(&mm->mmap_sem);

		self.task = tsk;
		self.next = xchg(&core_state->dumper.next, &self);
		/*
		 * Implies mb(), the result of xchg() must be visible
		 * to core_state->dumper.
		 */
		if (atomic_dec_and_test(&core_state->nr_threads))
			complete(&core_state->startup);

		for (;;) {
			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
			if (!self.task) /* see coredump_finish() */
				break;
			freezable_schedule();
		}
		__set_task_state(tsk, TASK_RUNNING);
		down_read(&mm->mmap_sem);
	}
	atomic_inc(&mm->mm_count);
	BUG_ON(mm != tsk->active_mm);
	/* more a memory barrier than a real lock */
	task_lock(tsk);
	tsk->mm = NULL;
	up_read(&mm->mmap_sem);
	enter_lazy_tlb(mm, current);
	task_unlock(tsk);
	mm_update_next_owner(mm);
	mmput(mm);
}

/*
 * When we die, we re-parent all our children, and try to:
 * 1. give them to another thread in our thread group, if such a member exists
 * 2. give it to the first ancestor process which prctl'd itself as a
 *    child_subreaper for its children (like a service manager)
 * 3. give it to the init process (PID 1) in our pid namespace
 */
static struct task_struct *find_new_reaper(struct task_struct *father)
	__releases(&tasklist_lock)
	__acquires(&tasklist_lock)
{
	struct pid_namespace *pid_ns = task_active_pid_ns(father);
	struct task_struct *thread;

	thread = father;
	while_each_thread(father, thread) {
		if (thread->flags & PF_EXITING)
			continue;
		if (unlikely(pid_ns->child_reaper == father))
			pid_ns->child_reaper = thread;
		return thread;
	}

	if (unlikely(pid_ns->child_reaper == father)) {
		write_unlock_irq(&tasklist_lock);
		if (unlikely(pid_ns == &init_pid_ns)) {
			panic("Attempted to kill init! exitcode=0x%08x\n",
				father->signal->group_exit_code ?:
					father->exit_code);
		}

		zap_pid_ns_processes(pid_ns);
		write_lock_irq(&tasklist_lock);
	} else if (father->signal->has_child_subreaper) {
		struct task_struct *reaper;

		/*
		 * Find the first ancestor marked as child_subreaper.
		 * Note that the code below checks same_thread_group(reaper,
		 * pid_ns->child_reaper).  This is what we need to DTRT in a
		 * PID namespace. However we still need the check above, see
		 * http://marc.info/?l=linux-kernel&m=131385460420380
		 */
		for (reaper = father->real_parent;
		     reaper != &init_task;
		     reaper = reaper->real_parent) {
			if (same_thread_group(reaper, pid_ns->child_reaper))
				break;
			if (!reaper->signal->is_child_subreaper)
				continue;
			thread = reaper;
			do {
				if (!(thread->flags & PF_EXITING))
					return reaper;
			} while_each_thread(reaper, thread);
		}
	}

	return pid_ns->child_reaper;
}

/*
 * Any that need to be release_task'd are put on the @dead list.
 */
static void reparent_leader(struct task_struct *father, struct task_struct *p,
				struct list_head *dead)
{
	list_move_tail(&p->sibling, &p->real_parent->children);

	if (p->exit_state == EXIT_DEAD)
		return;
	/*
	 * If this is a threaded reparent there is no need to
	 * notify anyone anything has happened.
	 */
	if (same_thread_group(p->real_parent, father))
		return;

	/* We don't want people slaying init. */
	p->exit_signal = SIGCHLD;

	/* If it has exited notify the new parent about this child's death. */
	if (!p->ptrace &&
	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
		if (do_notify_parent(p, p->exit_signal)) {
			p->exit_state = EXIT_DEAD;
			list_move_tail(&p->sibling, dead);
		}
	}

	kill_orphaned_pgrp(p, father);
}
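/*
 * Illustrative userspace sketch (not part of the original file): how a
 * service manager opts in to the child_subreaper walk performed by
 * find_new_reaper() above.  After this prctl(), orphaned descendants are
 * reparented to this process instead of to init, so it receives their
 * SIGCHLD and can wait() on them.
 *
 *	#include <sys/prctl.h>
 *
 *	if (prctl(PR_SET_CHILD_SUBREAPER, 1) == 0) {
 *		// descendants orphaned below us are now reparented here
 *	}
 */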
static void forget_original_parent(struct task_struct *father)
{
	struct task_struct *p, *n, *reaper;
	LIST_HEAD(dead_children);

	write_lock_irq(&tasklist_lock);
	/*
	 * Note that exit_ptrace() and find_new_reaper() might
	 * drop tasklist_lock and reacquire it.
	 */
	exit_ptrace(father);
	reaper = find_new_reaper(father);

	list_for_each_entry_safe(p, n, &father->children, sibling) {
		struct task_struct *t = p;
		do {
			t->real_parent = reaper;
			if (t->parent == father) {
				BUG_ON(t->ptrace);
				t->parent = t->real_parent;
			}
			if (t->pdeath_signal)
				group_send_sig_info(t->pdeath_signal,
						    SEND_SIG_NOINFO, t);
		} while_each_thread(p, t);
		reparent_leader(father, p, &dead_children);
	}
	write_unlock_irq(&tasklist_lock);

	BUG_ON(!list_empty(&father->children));

	list_for_each_entry_safe(p, n, &dead_children, sibling) {
		list_del_init(&p->sibling);
		release_task(p);
	}
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
	bool autoreap;

	/*
	 * This does two things:
	 *
	 *  A.  Make init inherit all the child processes
	 *  B.  Check to see if any process groups have become orphaned
	 *	as a result of our exiting, and if they have any stopped
	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
	 */
	forget_original_parent(tsk);

	write_lock_irq(&tasklist_lock);
	if (group_dead)
		kill_orphaned_pgrp(tsk->group_leader, NULL);

	if (unlikely(tsk->ptrace)) {
		int sig = thread_group_leader(tsk) &&
				thread_group_empty(tsk) &&
				!ptrace_reparented(tsk) ?
			tsk->exit_signal : SIGCHLD;
		autoreap = do_notify_parent(tsk, sig);
	} else if (thread_group_leader(tsk)) {
		autoreap = thread_group_empty(tsk) &&
			do_notify_parent(tsk, tsk->exit_signal);
	} else {
		autoreap = true;
	}

	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;

	/* mt-exec, de_thread() is waiting for group leader */
	if (unlikely(tsk->signal->notify_count < 0))
		wake_up_process(tsk->signal->group_exit_task);
	write_unlock_irq(&tasklist_lock);

	/* If the process is dead, release it - nobody will wait for it */
	if (autoreap)
		release_task(tsk);
}

#ifdef CONFIG_DEBUG_STACK_USAGE
static void check_stack_usage(void)
{
	static DEFINE_SPINLOCK(low_water_lock);
	static int lowest_to_date = THREAD_SIZE;
	unsigned long free;

	free = stack_not_used(current);

	if (free >= lowest_to_date)
		return;

	spin_lock(&low_water_lock);
	if (free < lowest_to_date) {
		printk(KERN_WARNING "%s (%d) used greatest stack depth: "
				"%lu bytes left\n",
				current->comm, task_pid_nr(current), free);
		lowest_to_date = free;
	}
	spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif

void do_exit(long code)
{
	struct task_struct *tsk = current;
	int group_dead;

	profile_task_exit(tsk);

	WARN_ON(blk_needs_flush_plug(tsk));

	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");

	/*
	 * If do_exit is called because this process oopsed, it's possible
	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
	 * continuing. Amongst other possible reasons, this is to prevent
	 * mm_release()->clear_child_tid() from writing to a user-controlled
	 * kernel address.
	 */
	set_fs(USER_DS);

	ptrace_event(PTRACE_EVENT_EXIT, code);

	validate_creds_for_do_exit(tsk);

	/*
	 * We're taking recursive faults here in do_exit. Safest is to just
	 * leave this task alone and wait for reboot.
	 */
	if (unlikely(tsk->flags & PF_EXITING)) {
		printk(KERN_ALERT
			"Fixing recursive fault but reboot is needed!\n");
		/*
		 * We can do this unlocked here. The futex code uses
		 * this flag just to verify whether the pi state
		 * cleanup has been done or not. In the worst case it
		 * loops once more. We pretend that the cleanup was
		 * done as there is no way to return. Either the
		 * OWNER_DIED bit is set by now or we push the blocked
		 * task into the wait-forever nirvana as well.
		 */
		tsk->flags |= PF_EXITPIDONE;
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule();
	}

	exit_signals(tsk);  /* sets PF_EXITING */
	/*
	 * tsk->flags are checked in the futex code to protect against
	 * an exiting task cleaning up the robust pi futexes.
	 */
	smp_mb();
	raw_spin_unlock_wait(&tsk->pi_lock);

	if (unlikely(in_atomic()))
		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
				current->comm, task_pid_nr(current),
				preempt_count());

	acct_update_integrals(tsk);
	/* sync mm's RSS info before statistics gathering */
	if (tsk->mm)
		sync_mm_rss(tsk->mm);
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	if (group_dead) {
		hrtimer_cancel(&tsk->signal->real_timer);
		exit_itimers(tsk->signal);
		if (tsk->mm)
			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
	}
	acct_collect(code, group_dead);
	if (group_dead)
		tty_audit_exit();
	audit_free(tsk);

	tsk->exit_code = code;
	taskstats_exit(tsk, group_dead);

	exit_mm(tsk);

	if (group_dead)
		acct_process();
	trace_sched_process_exit(tsk);

	exit_sem(tsk);
	exit_shm(tsk);
	exit_files(tsk);
	exit_fs(tsk);
	exit_task_namespaces(tsk);
	exit_task_work(tsk);
	check_stack_usage();
	exit_thread();

	/*
	 * Flush inherited counters to the parent - before the parent
	 * gets woken up by child-exit notifications.
	 *
	 * because of cgroup mode, must be called before cgroup_exit()
	 */
	perf_event_exit_task(tsk);

	cgroup_exit(tsk, 1);

	if (group_dead)
		disassociate_ctty(1);

	module_put(task_thread_info(tsk)->exec_domain->module);

	proc_exit_connector(tsk);

	/*
	 * FIXME: do that only when needed, using sched_exit tracepoint
	 */
	flush_ptrace_hw_breakpoint(tsk);

	exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
	task_lock(tsk);
	mpol_put(tsk->mempolicy);
	tsk->mempolicy = NULL;
	task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
	if (unlikely(current->pi_state_cache))
		kfree(current->pi_state_cache);
#endif
	/*
	 * Make sure we are holding no locks:
	 */
	debug_check_no_locks_held();
	/*
	 * We can do this unlocked here. The futex code uses this flag
	 * just to verify whether the pi state cleanup has been done
	 * or not. In the worst case it loops once more.
	 */
	tsk->flags |= PF_EXITPIDONE;

	if (tsk->io_context)
		exit_io_context(tsk);

	if (tsk->splice_pipe)
		free_pipe_info(tsk->splice_pipe);

	if (tsk->task_frag.page)
		put_page(tsk->task_frag.page);

	validate_creds_for_do_exit(tsk);

	preempt_disable();
	if (tsk->nr_dirtied)
		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
	exit_rcu();

	/*
	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
	 * when the following two conditions become true:
	 *   - there is a race condition on mmap_sem (it is acquired by
	 *     exit_mm()), and
	 *   - an SMI occurs before setting TASK_RUNNING
	 *     (or the hypervisor of a virtual machine switches to another
	 *     guest).
	 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
	 *
	 * To avoid it, we have to wait for releasing tsk->pi_lock which
	 * is held by try_to_wake_up()
	 */
	smp_mb();
	raw_spin_unlock_wait(&tsk->pi_lock);

	/* causes final put_task_struct in finish_task_switch(). */
	tsk->state = TASK_DEAD;
	tsk->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
	schedule();
	BUG();
	/* Avoid "noreturn function does return".  */
	for (;;)
		cpu_relax();	/* For when BUG is null */
}

EXPORT_SYMBOL_GPL(do_exit);

void complete_and_exit(struct completion *comp, long code)
{
	if (comp)
		complete(comp);

	do_exit(code);
}

EXPORT_SYMBOL(complete_and_exit);

SYSCALL_DEFINE1(exit, int, error_code)
{
	do_exit((error_code&0xff)<<8);
}

/*
 * Take down every thread in the group.  This is called by fatal signals
 * as well as by sys_exit_group (below).
 */
void
do_group_exit(int exit_code)
{
	struct signal_struct *sig = current->signal;

	BUG_ON(exit_code & 0x80); /* core dumps don't get here */

	if (signal_group_exit(sig))
		exit_code = sig->group_exit_code;
	else if (!thread_group_empty(current)) {
		struct sighand_struct *const sighand = current->sighand;
		spin_lock_irq(&sighand->siglock);
		if (signal_group_exit(sig))
			/* Another thread got here before we took the lock.  */
			exit_code = sig->group_exit_code;
		else {
			sig->group_exit_code = exit_code;
			sig->flags = SIGNAL_GROUP_EXIT;
			zap_other_threads(current);
		}
		spin_unlock_irq(&sighand->siglock);
	}

	do_exit(exit_code);
	/* NOTREACHED */
}
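/*
 * Worked example (illustrative, not part of the original file): the
 * (error_code & 0xff) << 8 encoding used by sys_exit() above and
 * sys_exit_group() below is what the parent later gets back from wait4().
 * For a child that calls _exit(3), error_code is 3, so the recorded exit
 * code is 0x0300; the low seven bits (termination signal) are zero and
 * WEXITSTATUS() recovers 3 by shifting right by eight:
 *
 *	exit code = (3 & 0xff) << 8 = 0x0300
 *	WIFEXITED(0x0300)   -> (0x0300 & 0x7f) == 0 -> true
 *	WEXITSTATUS(0x0300) -> (0x0300 >> 8) & 0xff -> 3
 *
 * A child killed by a signal instead carries the signal number in the
 * low seven bits, which is why wait_task_zombie() tests (status & 0x7f).
 */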
/*
 * this kills every thread in the thread group. Note that any externally
 * wait4()-ing process will get the correct exit code - even if this
 * thread is not the thread group leader.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
	do_group_exit((error_code & 0xff) << 8);
	/* NOTREACHED */
	return 0;
}

struct wait_opts {
	enum pid_type		wo_type;
	int			wo_flags;
	struct pid		*wo_pid;

	struct siginfo __user	*wo_info;
	int __user		*wo_stat;
	struct rusage __user	*wo_rusage;

	wait_queue_t		child_wait;
	int			notask_error;
};

static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
	if (type != PIDTYPE_PID)
		task = task->group_leader;
	return task->pids[type].pid;
}

static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
	return	wo->wo_type == PIDTYPE_MAX ||
		task_pid_type(p, wo->wo_type) == wo->wo_pid;
}

static int eligible_child(struct wait_opts *wo, struct task_struct *p)
{
	if (!eligible_pid(wo, p))
		return 0;
	/* Wait for all children (clone and not) if __WALL is set;
	 * otherwise, wait for clone children *only* if __WCLONE is
	 * set; otherwise, wait for non-clone children *only*.  (Note:
	 * A "clone" child here is one that reports to its parent
	 * using a signal other than SIGCHLD.) */
	if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
	    && !(wo->wo_flags & __WALL))
		return 0;

	return 1;
}

static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
				pid_t pid, uid_t uid, int why, int status)
{
	struct siginfo __user *infop;
	int retval = wo->wo_rusage
		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;

	put_task_struct(p);
	infop = wo->wo_info;
	if (infop) {
		if (!retval)
			retval = put_user(SIGCHLD, &infop->si_signo);
		if (!retval)
			retval = put_user(0, &infop->si_errno);
		if (!retval)
			retval = put_user((short)why, &infop->si_code);
		if (!retval)
			retval = put_user(pid, &infop->si_pid);
		if (!retval)
			retval = put_user(uid, &infop->si_uid);
		if (!retval)
			retval = put_user(status, &infop->si_status);
	}
	if (!retval)
		retval = pid;
	return retval;
}

/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
	unsigned long state;
	int retval, status, traced;
	pid_t pid = task_pid_vnr(p);
	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
	struct siginfo __user *infop;

	if (!likely(wo->wo_flags & WEXITED))
		return 0;

	if (unlikely(wo->wo_flags & WNOWAIT)) {
		int exit_code = p->exit_code;
		int why;

		get_task_struct(p);
		read_unlock(&tasklist_lock);
		if ((exit_code & 0x7f) == 0) {
			why = CLD_EXITED;
			status = exit_code >> 8;
		} else {
			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
			status = exit_code & 0x7f;
		}
		return wait_noreap_copyout(wo, p, pid, uid, why, status);
	}

	/*
	 * Try to move the task's state to DEAD
	 * only one thread is allowed to do this:
	 */
	state = xchg(&p->exit_state, EXIT_DEAD);
	if (state != EXIT_ZOMBIE) {
		BUG_ON(state != EXIT_DEAD);
		return 0;
	}

	traced = ptrace_reparented(p);
	/*
	 * It can be ptraced but not reparented, check
	 * thread_group_leader() to filter out sub-threads.
	 */
	if (likely(!traced) && thread_group_leader(p)) {
		struct signal_struct *psig;
		struct signal_struct *sig;
		unsigned long maxrss;
		cputime_t tgutime, tgstime;

		/*
		 * The resource counters for the group leader are in its
		 * own task_struct.  Those for dead threads in the group
		 * are in its signal_struct, as are those for the child
		 * processes it has previously reaped.  All these
		 * accumulate in the parent's signal_struct c* fields.
		 *
		 * We don't bother to take a lock here to protect these
		 * p->signal fields, because they are only touched by
		 * __exit_signal, which runs with tasklist_lock
		 * write-locked anyway, and so is excluded here.  We do
		 * need to protect the access to parent->signal fields,
		 * as other threads in the parent group can be right
		 * here reaping other children at the same time.
		 *
		 * We use thread_group_cputime_adjusted() to get times for
		 * the thread group, which consolidates times for all threads
		 * in the group including the group leader.
		 */
		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
		spin_lock_irq(&p->real_parent->sighand->siglock);
		psig = p->real_parent->signal;
		sig = p->signal;
		psig->cutime += tgutime + sig->cutime;
		psig->cstime += tgstime + sig->cstime;
		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
		psig->cmin_flt +=
			p->min_flt + sig->min_flt + sig->cmin_flt;
		psig->cmaj_flt +=
			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
		psig->cnvcsw +=
			p->nvcsw + sig->nvcsw + sig->cnvcsw;
		psig->cnivcsw +=
			p->nivcsw + sig->nivcsw + sig->cnivcsw;
		psig->cinblock +=
			task_io_get_inblock(p) +
			sig->inblock + sig->cinblock;
		psig->coublock +=
			task_io_get_oublock(p) +
			sig->oublock + sig->coublock;
		maxrss = max(sig->maxrss, sig->cmaxrss);
		if (psig->cmaxrss < maxrss)
			psig->cmaxrss = maxrss;
		task_io_accounting_add(&psig->ioac, &p->ioac);
		task_io_accounting_add(&psig->ioac, &sig->ioac);
		spin_unlock_irq(&p->real_parent->sighand->siglock);
	}

	/*
	 * Now we are sure this task is interesting, and no other
	 * thread can reap it because we set its state to EXIT_DEAD.
	 */
	read_unlock(&tasklist_lock);

	retval = wo->wo_rusage
		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
		? p->signal->group_exit_code : p->exit_code;
	if (!retval && wo->wo_stat)
		retval = put_user(status, wo->wo_stat);

	infop = wo->wo_info;
	if (!retval && infop)
		retval = put_user(SIGCHLD, &infop->si_signo);
	if (!retval && infop)
		retval = put_user(0, &infop->si_errno);
	if (!retval && infop) {
		int why;

		if ((status & 0x7f) == 0) {
			why = CLD_EXITED;
			status >>= 8;
		} else {
			why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
			status &= 0x7f;
		}
		retval = put_user((short)why, &infop->si_code);
		if (!retval)
			retval = put_user(status, &infop->si_status);
	}
	if (!retval && infop)
		retval = put_user(pid, &infop->si_pid);
	if (!retval && infop)
		retval = put_user(uid, &infop->si_uid);
	if (!retval)
		retval = pid;

	if (traced) {
		write_lock_irq(&tasklist_lock);
		/* We dropped tasklist, ptracer could die and untrace */
		ptrace_unlink(p);
		/*
		 * If this is not a sub-thread, notify the parent.
		 * If parent wants a zombie, don't release it now.
		 */
		if (thread_group_leader(p) &&
		    !do_notify_parent(p, p->exit_signal)) {
			p->exit_state = EXIT_ZOMBIE;
			p = NULL;
		}
		write_unlock_irq(&tasklist_lock);
	}
	if (p != NULL)
		release_task(p);

	return retval;
}

static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
	if (ptrace) {
		if (task_is_stopped_or_traced(p) &&
		    !(p->jobctl & JOBCTL_LISTENING))
			return &p->exit_code;
	} else {
		if (p->signal->flags & SIGNAL_STOP_STOPPED)
			return &p->signal->group_exit_code;
	}
	return NULL;
}

/**
 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
 * @wo: wait options
 * @ptrace: is the wait for ptrace
 * @p: task to wait for
 *
 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
 *
 * CONTEXT:
 * read_lock(&tasklist_lock), which is released if return value is
 * non-zero.  Also, grabs and releases @p->sighand->siglock.
 *
 * RETURNS:
 * 0 if wait condition didn't exist and search for other wait conditions
 * should continue.  Non-zero return, -errno on failure and @p's pid on
 * success, implies that tasklist_lock is released and wait condition
 * search should terminate.
 */
static int wait_task_stopped(struct wait_opts *wo,
				int ptrace, struct task_struct *p)
{
	struct siginfo __user *infop;
	int retval, exit_code, *p_code, why;
	uid_t uid = 0; /* unneeded, required by compiler */
	pid_t pid;

	/*
	 * Traditionally we see ptrace'd stopped tasks regardless of options.
	 */
	if (!ptrace && !(wo->wo_flags & WUNTRACED))
		return 0;

	if (!task_stopped_code(p, ptrace))
		return 0;

	exit_code = 0;
	spin_lock_irq(&p->sighand->siglock);

	p_code = task_stopped_code(p, ptrace);
	if (unlikely(!p_code))
		goto unlock_sig;

	exit_code = *p_code;
	if (!exit_code)
		goto unlock_sig;

	if (!unlikely(wo->wo_flags & WNOWAIT))
		*p_code = 0;

	uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
	spin_unlock_irq(&p->sighand->siglock);
	if (!exit_code)
		return 0;

	/*
	 * Now we are pretty sure this task is interesting.
	 * Make sure it doesn't get reaped out from under us while we
	 * give up the lock and then examine it below.  We don't want to
	 * keep holding onto the tasklist_lock while we call getrusage and
	 * possibly take page faults for user memory.
	 */
	get_task_struct(p);
	pid = task_pid_vnr(p);
	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
	read_unlock(&tasklist_lock);

	if (unlikely(wo->wo_flags & WNOWAIT))
		return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);

	retval = wo->wo_rusage
		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
	if (!retval && wo->wo_stat)
		retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);

	infop = wo->wo_info;
	if (!retval && infop)
		retval = put_user(SIGCHLD, &infop->si_signo);
	if (!retval && infop)
		retval = put_user(0, &infop->si_errno);
	if (!retval && infop)
		retval = put_user((short)why, &infop->si_code);
	if (!retval && infop)
		retval = put_user(exit_code, &infop->si_status);
	if (!retval && infop)
		retval = put_user(pid, &infop->si_pid);
	if (!retval && infop)
		retval = put_user(uid, &infop->si_uid);
	if (!retval)
		retval = pid;
	put_task_struct(p);

	BUG_ON(!retval);
	return retval;
}

/*
 * Handle do_wait work for one task in a live, non-stopped state.
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
	int retval;
	pid_t pid;
	uid_t uid;

	if (!unlikely(wo->wo_flags & WCONTINUED))
		return 0;

	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
		return 0;

	spin_lock_irq(&p->sighand->siglock);
	/* Re-check with the lock held.  */
	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
		spin_unlock_irq(&p->sighand->siglock);
		return 0;
	}
	if (!unlikely(wo->wo_flags & WNOWAIT))
		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
	uid = from_kuid_munged(current_user_ns(), task_uid(p));
	spin_unlock_irq(&p->sighand->siglock);

	pid = task_pid_vnr(p);
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	if (!wo->wo_info) {
		retval = wo->wo_rusage
			? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
		put_task_struct(p);
		if (!retval && wo->wo_stat)
			retval = put_user(0xffff, wo->wo_stat);
		if (!retval)
			retval = pid;
	} else {
		retval = wait_noreap_copyout(wo, p, pid, uid,
					     CLD_CONTINUED, SIGCONT);
		BUG_ON(retval == 0);
	}

	return retval;
}
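/*
 * Worked example (illustrative, not part of the original file): the two
 * encodings handed back through wo_stat by the helpers above.
 *
 * wait_task_stopped() stores (exit_code << 8) | 0x7f, so a child stopped
 * by SIGTSTP (20) yields 0x147f:
 *
 *	WIFSTOPPED(0x147f) -> (0x147f & 0xff) == 0x7f -> true
 *	WSTOPSIG(0x147f)   -> (0x147f >> 8) & 0xff    -> 20 (SIGTSTP)
 *
 * wait_task_continued() stores 0xffff, which is exactly the value
 * WIFCONTINUED() tests for.
 */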
/*
 * Consider @p for a wait by @parent.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or another error from security_task_wait(), or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
				struct task_struct *p)
{
	int ret = eligible_child(wo, p);
	if (!ret)
		return ret;

	ret = security_task_wait(p);
	if (unlikely(ret < 0)) {
		/*
		 * If we have not yet seen any eligible child,
		 * then let this error code replace -ECHILD.
		 * A permission error will give the user a clue
		 * to look for security policy problems, rather
		 * than for mysterious wait bugs.
		 */
		if (wo->notask_error)
			wo->notask_error = ret;
		return 0;
	}

	/* dead body doesn't have much to contribute */
	if (unlikely(p->exit_state == EXIT_DEAD)) {
		/*
		 * But do not ignore this task until the tracer does
		 * wait_task_zombie()->do_notify_parent().
		 */
		if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
			wo->notask_error = 0;
		return 0;
	}

	/* slay zombie? */
	if (p->exit_state == EXIT_ZOMBIE) {
		/*
		 * A zombie ptracee is only visible to its ptracer.
		 * Notification and reaping will be cascaded to the real
		 * parent when the ptracer detaches.
		 */
		if (likely(!ptrace) && unlikely(p->ptrace)) {
			/* it will become visible, clear notask_error */
			wo->notask_error = 0;
			return 0;
		}

		/* we don't reap group leaders with subthreads */
		if (!delay_group_leader(p))
			return wait_task_zombie(wo, p);

		/*
		 * Allow access to stopped/continued state via zombie by
		 * falling through.  Clearing of notask_error is complex.
		 *
		 * When !@ptrace:
		 *
		 * If WEXITED is set, notask_error should naturally be
		 * cleared.  If not, a subset of WSTOPPED|WCONTINUED is set,
		 * so if there are live subthreads, there are events to
		 * wait for.  If all subthreads are dead, it's still safe
		 * to clear - this function will be called again in a
		 * finite amount of time once all the subthreads are
		 * released and will then return without clearing.
		 *
		 * When @ptrace:
		 *
		 * Stopped state is per-task and thus can't change once the
		 * target task dies.  Only continued and exited can happen.
		 * Clear notask_error if WCONTINUED | WEXITED.
		 */
		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
			wo->notask_error = 0;
	} else {
		/*
		 * If @p is ptraced by a task in its real parent's group,
		 * hide group stop/continued state when looking at @p as
		 * the real parent; otherwise, a single stop can be
		 * reported twice as group and ptrace stops.
		 *
		 * If a ptracer wants to distinguish the two events for its
		 * own children, it should create a separate process which
		 * takes the role of real parent.
		 */
		if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
			return 0;

		/*
		 * @p is alive and it's gonna stop, continue or exit, so
		 * there always is something to wait for.
		 */
		wo->notask_error = 0;
	}

	/*
	 * Wait for stopped.  Depending on @ptrace, different stopped state
	 * is used and the two don't interact with each other.
	 */
	ret = wait_task_stopped(wo, ptrace, p);
	if (ret)
		return ret;

	/*
	 * Wait for continued.  There's only one continued state and the
	 * ptracer can consume it which can confuse the real parent.  Don't
	 * use WCONTINUED from ptracer.  You don't need or want it.
	 */
	return wait_task_continued(wo, p);
}

/*
 * Do the work of do_wait() for one thread in the group, @tsk.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue; then
 * ->notask_error is 0 if there were any eligible children,
 * or another error from security_task_wait(), or still -ECHILD.
 */
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
	struct task_struct *p;

	list_for_each_entry(p, &tsk->children, sibling) {
		int ret = wait_consider_task(wo, 0, p);
		if (ret)
			return ret;
	}

	return 0;
}

static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
	struct task_struct *p;

	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
		int ret = wait_consider_task(wo, 1, p);
		if (ret)
			return ret;
	}

	return 0;
}

static int child_wait_callback(wait_queue_t *wait, unsigned mode,
				int sync, void *key)
{
	struct wait_opts *wo = container_of(wait, struct wait_opts,
						child_wait);
	struct task_struct *p = key;

	if (!eligible_pid(wo, p))
		return 0;

	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
		return 0;

	return default_wake_function(wait, mode, sync, key);
}

void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
	__wake_up_sync_key(&parent->signal->wait_chldexit,
				TASK_INTERRUPTIBLE, 1, p);
}

static long do_wait(struct wait_opts *wo)
{
	struct task_struct *tsk;
	int retval;

	trace_sched_process_wait(wo->wo_pid);

	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
	wo->child_wait.private = current;
	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
	/*
	 * If there is nothing that can match our criteria, just get out.
	 * We will clear ->notask_error to zero if we see any child that
	 * might later match our criteria, even if we are not able to reap
	 * it yet.
	 */
	wo->notask_error = -ECHILD;
	if ((wo->wo_type < PIDTYPE_MAX) &&
	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
		goto notask;

	set_current_state(TASK_INTERRUPTIBLE);
	read_lock(&tasklist_lock);
	tsk = current;
	do {
		retval = do_wait_thread(wo, tsk);
		if (retval)
			goto end;

		retval = ptrace_do_wait(wo, tsk);
		if (retval)
			goto end;

		if (wo->wo_flags & __WNOTHREAD)
			break;
	} while_each_thread(current, tsk);
	read_unlock(&tasklist_lock);

notask:
	retval = wo->notask_error;
	if (!retval && !(wo->wo_flags & WNOHANG)) {
		retval = -ERESTARTSYS;
		if (!signal_pending(current)) {
			schedule();
			goto repeat;
		}
	}
end:
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
	return retval;
}

SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
		infop, int, options, struct rusage __user *, ru)
{
	struct wait_opts wo;
	struct pid *pid = NULL;
	enum pid_type type;
	long ret;

	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
		return -EINVAL;
	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
		return -EINVAL;

	switch (which) {
	case P_ALL:
		type = PIDTYPE_MAX;
		break;
	case P_PID:
		type = PIDTYPE_PID;
		if (upid <= 0)
			return -EINVAL;
		break;
	case P_PGID:
		type = PIDTYPE_PGID;
		if (upid <= 0)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (type < PIDTYPE_MAX)
		pid = find_get_pid(upid);

	wo.wo_type	= type;
	wo.wo_pid	= pid;
	wo.wo_flags	= options;
	wo.wo_info	= infop;
	wo.wo_stat	= NULL;
	wo.wo_rusage	= ru;
	ret = do_wait(&wo);

	if (ret > 0) {
		ret = 0;
	} else if (infop) {
		/*
		 * For a WNOHANG return, clear out all the fields
		 * we would set so the user can easily tell the
		 * difference.
		 */
		if (!ret)
			ret = put_user(0, &infop->si_signo);
		if (!ret)
			ret = put_user(0, &infop->si_errno);
		if (!ret)
			ret = put_user(0, &infop->si_code);
		if (!ret)
			ret = put_user(0, &infop->si_pid);
		if (!ret)
			ret = put_user(0, &infop->si_uid);
		if (!ret)
			ret = put_user(0, &infop->si_status);
	}

	put_pid(pid);
	return ret;
}
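/*
 * Illustrative userspace sketch (not part of the original file): the
 * WNOHANG convention implemented by sys_waitid() above.  Because the
 * siginfo fields are zeroed when nothing was reaped, a caller can test
 * si_pid == 0 instead of inspecting the return value:
 *
 *	siginfo_t info;
 *
 *	if (waitid(P_ALL, 0, &info, WEXITED | WNOHANG) == 0) {
 *		if (info.si_pid == 0)
 *			;	// no child has changed state yet
 *		else
 *			printf("child %d: si_code %d, si_status %d\n",
 *			       info.si_pid, info.si_code, info.si_status);
 *	}
 */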
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
		int, options, struct rusage __user *, ru)
{
	struct wait_opts wo;
	struct pid *pid = NULL;
	enum pid_type type;
	long ret;

	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
			__WNOTHREAD|__WCLONE|__WALL))
		return -EINVAL;

	if (upid == -1)
		type = PIDTYPE_MAX;
	else if (upid < 0) {
		type = PIDTYPE_PGID;
		pid = find_get_pid(-upid);
	} else if (upid == 0) {
		type = PIDTYPE_PGID;
		pid = get_task_pid(current, PIDTYPE_PGID);
	} else /* upid > 0 */ {
		type = PIDTYPE_PID;
		pid = find_get_pid(upid);
	}

	wo.wo_type	= type;
	wo.wo_pid	= pid;
	wo.wo_flags	= options | WEXITED;
	wo.wo_info	= NULL;
	wo.wo_stat	= stat_addr;
	wo.wo_rusage	= ru;
	ret = do_wait(&wo);
	put_pid(pid);

	return ret;
}

#ifdef __ARCH_WANT_SYS_WAITPID

/*
 * sys_waitpid() remains for compatibility. waitpid() should be
 * implemented by calling sys_wait4() from libc.a.
 */
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
	return sys_wait4(pid, stat_addr, options, NULL);
}

#endif
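/*
 * Illustrative userspace sketch (not part of the original file): how the
 * pid argument of waitpid()/wait4() above selects what to wait for, and
 * how the returned status is decoded.  libc's waitpid() is expected to
 * forward to wait4() with a NULL rusage pointer.
 *
 *	pid_t pid;
 *	int status;
 *
 *	pid = waitpid(-1, &status, 0);		// any child      (PIDTYPE_MAX)
 *	pid = waitpid(0, &status, 0);		// our own pgrp   (PIDTYPE_PGID)
 *	pid = waitpid(-1234, &status, 0);	// pgrp 1234      (PIDTYPE_PGID)
 *	pid = waitpid(1234, &status, 0);	// exactly pid 1234 (PIDTYPE_PID)
 *
 *	if (pid > 0 && WIFEXITED(status))
 *		printf("%d exited with %d\n", pid, WEXITSTATUS(status));
 */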