fork.c (cbb245239282870bc6f54d5137dfe0f84b48ea72) fork.c (a8ea6fc9b089156d9230bfeef964dd9be101a4a9)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * linux/kernel/fork.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 */
7
8/*

--- 82 unchanged lines hidden (view full) ---

91#include <linux/sysctl.h>
92#include <linux/kcov.h>
93#include <linux/livepatch.h>
94#include <linux/thread_info.h>
95#include <linux/stackleak.h>
96#include <linux/kasan.h>
97#include <linux/scs.h>
98#include <linux/io_uring.h>
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * linux/kernel/fork.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 */
7
8/*

--- 82 unchanged lines hidden (view full) ---

91#include <linux/sysctl.h>
92#include <linux/kcov.h>
93#include <linux/livepatch.h>
94#include <linux/thread_info.h>
95#include <linux/stackleak.h>
96#include <linux/kasan.h>
97#include <linux/scs.h>
98#include <linux/io_uring.h>
99#include <linux/bpf.h>
99
100#include <asm/pgalloc.h>
101#include <linux/uaccess.h>
102#include <asm/mmu_context.h>
103#include <asm/cacheflush.h>
104#include <asm/tlbflush.h>
105
106#include <trace/events/sched.h>

--- 267 unchanged lines hidden (view full) ---

374 kmem_cache_free(vm_area_cachep, vma);
375}
376
377static void account_kernel_stack(struct task_struct *tsk, int account)
378{
379 void *stack = task_stack_page(tsk);
380 struct vm_struct *vm = task_stack_vm_area(tsk);
381
100
101#include <asm/pgalloc.h>
102#include <linux/uaccess.h>
103#include <asm/mmu_context.h>
104#include <asm/cacheflush.h>
105#include <asm/tlbflush.h>
106
107#include <trace/events/sched.h>

--- 267 unchanged lines hidden (view full) ---

375 kmem_cache_free(vm_area_cachep, vma);
376}
377
378static void account_kernel_stack(struct task_struct *tsk, int account)
379{
380 void *stack = task_stack_page(tsk);
381 struct vm_struct *vm = task_stack_vm_area(tsk);
382
383 if (vm) {
384 int i;
382
385
383 /* All stack pages are in the same node. */
384 if (vm)
385 mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
386 account * (THREAD_SIZE / 1024));
387 else
386 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
387 mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
388 account * (PAGE_SIZE / 1024));
389 } else {
390 /* All stack pages are in the same node. */
388 mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
389 account * (THREAD_SIZE / 1024));
391 mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
392 account * (THREAD_SIZE / 1024));
393 }
390}
391
392static int memcg_charge_kernel_stack(struct task_struct *tsk)
393{
394#ifdef CONFIG_VMAP_STACK
395 struct vm_struct *vm = task_stack_vm_area(tsk);
396 int ret;
397

--- 331 unchanged lines hidden (view full) ---

729 WARN_ON(!tsk->exit_state);
730 WARN_ON(refcount_read(&tsk->usage));
731 WARN_ON(tsk == current);
732
733 io_uring_free(tsk);
734 cgroup_free(tsk);
735 task_numa_free(tsk, true);
736 security_task_free(tsk);
394}
395
396static int memcg_charge_kernel_stack(struct task_struct *tsk)
397{
398#ifdef CONFIG_VMAP_STACK
399 struct vm_struct *vm = task_stack_vm_area(tsk);
400 int ret;
401

--- 331 unchanged lines hidden (view full) ---

733 WARN_ON(!tsk->exit_state);
734 WARN_ON(refcount_read(&tsk->usage));
735 WARN_ON(tsk == current);
736
737 io_uring_free(tsk);
738 cgroup_free(tsk);
739 task_numa_free(tsk, true);
740 security_task_free(tsk);
741 bpf_task_storage_free(tsk);
737 exit_creds(tsk);
738 delayacct_tsk_free(tsk);
739 put_signal_struct(tsk->signal);
742 exit_creds(tsk);
743 delayacct_tsk_free(tsk);
744 put_signal_struct(tsk->signal);
745 sched_core_free(tsk);
740
741 if (!profile_handoff_task(tsk))
742 free_task(tsk);
743}
744EXPORT_SYMBOL_GPL(__put_task_struct);
745
746void __init __weak arch_task_cache_init(void) { }
747

--- 174 unchanged lines hidden (view full) ---

922 /* One for the rcu users */
923 refcount_set(&tsk->usage, 1);
924#ifdef CONFIG_BLK_DEV_IO_TRACE
925 tsk->btrace_seq = 0;
926#endif
927 tsk->splice_pipe = NULL;
928 tsk->task_frag.page = NULL;
929 tsk->wake_q.next = NULL;
746
747 if (!profile_handoff_task(tsk))
748 free_task(tsk);
749}
750EXPORT_SYMBOL_GPL(__put_task_struct);
751
752void __init __weak arch_task_cache_init(void) { }
753

--- 174 unchanged lines hidden (view full) ---

928 /* One for the rcu users */
929 refcount_set(&tsk->usage, 1);
930#ifdef CONFIG_BLK_DEV_IO_TRACE
931 tsk->btrace_seq = 0;
932#endif
933 tsk->splice_pipe = NULL;
934 tsk->task_frag.page = NULL;
935 tsk->wake_q.next = NULL;
936 tsk->pf_io_worker = NULL;
930
931 account_kernel_stack(tsk, 1);
932
933 kcov_task_init(tsk);
934 kmap_local_fork(tsk);
935
936#ifdef CONFIG_FAULT_INJECTION
937 tsk->fail_nth = 0;

--- 196 unchanged lines hidden (view full) ---

1134 * set_mm_exe_file - change a reference to the mm's executable file
1135 *
1136 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1137 *
1138 * Main users are mmput() and sys_execve(). Callers prevent concurrent
1139 * invocations: in mmput() nobody alive left, in execve task is single
1140 * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
1141 * mm->exe_file, but does so without using set_mm_exe_file() in order
937
938 account_kernel_stack(tsk, 1);
939
940 kcov_task_init(tsk);
941 kmap_local_fork(tsk);
942
943#ifdef CONFIG_FAULT_INJECTION
944 tsk->fail_nth = 0;

--- 196 unchanged lines hidden (view full) ---

1141 * set_mm_exe_file - change a reference to the mm's executable file
1142 *
1143 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1144 *
1145 * Main users are mmput() and sys_execve(). Callers prevent concurrent
1146 * invocations: in mmput() nobody alive left, in execve task is single
1147 * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
1148 * mm->exe_file, but does so without using set_mm_exe_file() in order
1142 * to do avoid the need for any locks.
1149 * to avoid the need for any locks.
1143 */
1144void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1145{
1146 struct file *old_exe_file;
1147
1148 /*
1149 * It is safe to dereference the exe_file without RCU as
1150 * this function is only called if nobody else can access

--- 234 unchanged lines hidden (view full) ---

1385
1386fail_nomem:
1387 return NULL;
1388}
1389
1390static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1391{
1392 struct mm_struct *mm, *oldmm;
1150 */
1151void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1152{
1153 struct file *old_exe_file;
1154
1155 /*
1156 * It is safe to dereference the exe_file without RCU as
1157 * this function is only called if nobody else can access

--- 234 unchanged lines hidden (view full) ---

1392
1393fail_nomem:
1394 return NULL;
1395}
1396
1397static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1398{
1399 struct mm_struct *mm, *oldmm;
1393 int retval;
1394
1395 tsk->min_flt = tsk->maj_flt = 0;
1396 tsk->nvcsw = tsk->nivcsw = 0;
1397#ifdef CONFIG_DETECT_HUNG_TASK
1398 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1399 tsk->last_switch_time = 0;
1400#endif
1401

--- 10 unchanged lines hidden (view full) ---

1412 return 0;
1413
1414 /* initialize the new vmacache entries */
1415 vmacache_flush(tsk);
1416
1417 if (clone_flags & CLONE_VM) {
1418 mmget(oldmm);
1419 mm = oldmm;
1400
1401 tsk->min_flt = tsk->maj_flt = 0;
1402 tsk->nvcsw = tsk->nivcsw = 0;
1403#ifdef CONFIG_DETECT_HUNG_TASK
1404 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1405 tsk->last_switch_time = 0;
1406#endif
1407

--- 10 unchanged lines hidden (view full) ---

1418 return 0;
1419
1420 /* initialize the new vmacache entries */
1421 vmacache_flush(tsk);
1422
1423 if (clone_flags & CLONE_VM) {
1424 mmget(oldmm);
1425 mm = oldmm;
1420 goto good_mm;
1426 } else {
1427 mm = dup_mm(tsk, current->mm);
1428 if (!mm)
1429 return -ENOMEM;
1421 }
1422
1430 }
1431
1423 retval = -ENOMEM;
1424 mm = dup_mm(tsk, current->mm);
1425 if (!mm)
1426 goto fail_nomem;
1427
1428good_mm:
1429 tsk->mm = mm;
1430 tsk->active_mm = mm;
1431 return 0;
1432 tsk->mm = mm;
1433 tsk->active_mm = mm;
1434 return 0;
1432
1433fail_nomem:
1434 return retval;
1435}
1436
1437static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
1438{
1439 struct fs_struct *fs = current->fs;
1440 if (clone_flags & CLONE_FS) {
1441 /* tsk->fs is already what we want */
1442 spin_lock(&fs->lock);

--- 289 unchanged lines hidden (view full) ---

1732 * Pid field and the first entry in the NSpid field will be identical.
1733 * If the pid namespace of the process is not a descendant of the pid
1734 * namespace of the procfs instance 0 will be shown as its first NSpid
1735 * entry and no others will be shown.
1736 * Note that this differs from the Pid and NSpid fields in
1737 * /proc/<pid>/status where Pid and NSpid are always shown relative to
1738 * the pid namespace of the procfs instance. The difference becomes
1739 * obvious when sending around a pidfd between pid namespaces from a
1435}
1436
1437static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
1438{
1439 struct fs_struct *fs = current->fs;
1440 if (clone_flags & CLONE_FS) {
1441 /* tsk->fs is already what we want */
1442 spin_lock(&fs->lock);

--- 289 unchanged lines hidden (view full) ---

1732 * Pid field and the first entry in the NSpid field will be identical.
1733 * If the pid namespace of the process is not a descendant of the pid
1734 * namespace of the procfs instance 0 will be shown as its first NSpid
1735 * entry and no others will be shown.
1736 * Note that this differs from the Pid and NSpid fields in
1737 * /proc/<pid>/status where Pid and NSpid are always shown relative to
1738 * the pid namespace of the procfs instance. The difference becomes
1739 * obvious when sending around a pidfd between pid namespaces from a
1740 * different branch of the tree, i.e. where no ancestoral relation is
1740 * different branch of the tree, i.e. where no ancestral relation is
1741 * present between the pid namespaces:
1742 * - create two new pid namespaces ns1 and ns2 in the initial pid
1743 * namespace (also take care to create new mount namespaces in the
1744 * new pid namespace and mount procfs)
1745 * - create a process with a pidfd in ns1
1746 * - send pidfd from ns1 to ns2
1747 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
1748 * have exactly one entry, which is 0

--- 187 unchanged lines hidden (view full) ---

1936 INIT_HLIST_NODE(&delayed.node);
1937
1938 spin_lock_irq(&current->sighand->siglock);
1939 if (!(clone_flags & CLONE_THREAD))
1940 hlist_add_head(&delayed.node, &current->signal->multiprocess);
1941 recalc_sigpending();
1942 spin_unlock_irq(&current->sighand->siglock);
1943 retval = -ERESTARTNOINTR;
1741 * present between the pid namespaces:
1742 * - create two new pid namespaces ns1 and ns2 in the initial pid
1743 * namespace (also take care to create new mount namespaces in the
1744 * new pid namespace and mount procfs)
1745 * - create a process with a pidfd in ns1
1746 * - send pidfd from ns1 to ns2
1747 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
1748 * have exactly one entry, which is 0

--- 187 unchanged lines hidden (view full) ---

1936 INIT_HLIST_NODE(&delayed.node);
1937
1938 spin_lock_irq(&current->sighand->siglock);
1939 if (!(clone_flags & CLONE_THREAD))
1940 hlist_add_head(&delayed.node, &current->signal->multiprocess);
1941 recalc_sigpending();
1942 spin_unlock_irq(&current->sighand->siglock);
1943 retval = -ERESTARTNOINTR;
1944 if (signal_pending(current))
1944 if (task_sigpending(current))
1945 goto fork_out;
1946
1947 retval = -ENOMEM;
1948 p = dup_task_struct(current, node);
1949 if (!p)
1950 goto fork_out;
1951 if (args->io_thread) {
1952 /*

--- 42 unchanged lines hidden (view full) ---

1995 * triggers too late. This doesn't hurt, the check is only there
1996 * to stop root fork bombs.
1997 */
1998 retval = -EAGAIN;
1999 if (data_race(nr_threads >= max_threads))
2000 goto bad_fork_cleanup_count;
2001
2002 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1945 goto fork_out;
1946
1947 retval = -ENOMEM;
1948 p = dup_task_struct(current, node);
1949 if (!p)
1950 goto fork_out;
1951 if (args->io_thread) {
1952 /*

--- 42 unchanged lines hidden (view full) ---

1995 * triggers too late. This doesn't hurt, the check is only there
1996 * to stop root fork bombs.
1997 */
1998 retval = -EAGAIN;
1999 if (data_race(nr_threads >= max_threads))
2000 goto bad_fork_cleanup_count;
2001
2002 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
2003 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
2003 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
2004 p->flags |= PF_FORKNOEXEC;
2005 INIT_LIST_HEAD(&p->children);
2006 INIT_LIST_HEAD(&p->sibling);
2007 rcu_copy_process(p);
2008 p->vfork_done = NULL;
2009 spin_lock_init(&p->alloc_lock);
2010
2011 init_sigpending(&p->pending);
2004 p->flags |= PF_FORKNOEXEC;
2005 INIT_LIST_HEAD(&p->children);
2006 INIT_LIST_HEAD(&p->sibling);
2007 rcu_copy_process(p);
2008 p->vfork_done = NULL;
2009 spin_lock_init(&p->alloc_lock);
2010
2011 init_sigpending(&p->pending);
2012 p->sigqueue_cache = NULL;
2012
2013 p->utime = p->stime = p->gtime = 0;
2014#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
2015 p->utimescaled = p->stimescaled = 0;
2016#endif
2017 prev_cputime_init(&p->prev_cputime);
2018
2019#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN

--- 53 unchanged lines hidden (view full) ---

2073
2074#ifdef CONFIG_DEBUG_MUTEXES
2075 p->blocked_on = NULL; /* not blocked yet */
2076#endif
2077#ifdef CONFIG_BCACHE
2078 p->sequential_io = 0;
2079 p->sequential_io_avg = 0;
2080#endif
2013
2014 p->utime = p->stime = p->gtime = 0;
2015#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
2016 p->utimescaled = p->stimescaled = 0;
2017#endif
2018 prev_cputime_init(&p->prev_cputime);
2019
2020#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN

--- 53 unchanged lines hidden (view full) ---

2074
2075#ifdef CONFIG_DEBUG_MUTEXES
2076 p->blocked_on = NULL; /* not blocked yet */
2077#endif
2078#ifdef CONFIG_BCACHE
2079 p->sequential_io = 0;
2080 p->sequential_io_avg = 0;
2081#endif
2082#ifdef CONFIG_BPF_SYSCALL
2083 RCU_INIT_POINTER(p->bpf_storage, NULL);
2084#endif
2081
2082 /* Perform scheduler related setup. Assign this task to a CPU. */
2083 retval = sched_fork(clone_flags, p);
2084 if (retval)
2085 goto bad_fork_cleanup_policy;
2086
2085
2086 /* Perform scheduler related setup. Assign this task to a CPU. */
2087 retval = sched_fork(clone_flags, p);
2088 if (retval)
2089 goto bad_fork_cleanup_policy;
2090
2087 retval = perf_event_init_task(p);
2091 retval = perf_event_init_task(p, clone_flags);
2088 if (retval)
2089 goto bad_fork_cleanup_policy;
2090 retval = audit_alloc(p);
2091 if (retval)
2092 goto bad_fork_cleanup_perf;
2093 /* copy all the process information */
2094 shm_init_task(p);
2095 retval = security_task_alloc(p, clone_flags);

--- 146 unchanged lines hidden (view full) ---

2242 } else {
2243 p->real_parent = current;
2244 p->parent_exec_id = current->self_exec_id;
2245 p->exit_signal = args->exit_signal;
2246 }
2247
2248 klp_copy_process(p);
2249
2092 if (retval)
2093 goto bad_fork_cleanup_policy;
2094 retval = audit_alloc(p);
2095 if (retval)
2096 goto bad_fork_cleanup_perf;
2097 /* copy all the process information */
2098 shm_init_task(p);
2099 retval = security_task_alloc(p, clone_flags);

--- 146 unchanged lines hidden (view full) ---

2246 } else {
2247 p->real_parent = current;
2248 p->parent_exec_id = current->self_exec_id;
2249 p->exit_signal = args->exit_signal;
2250 }
2251
2252 klp_copy_process(p);
2253
2254 sched_core_fork(p);
2255
2250 spin_lock(&current->sighand->siglock);
2251
2252 /*
2253 * Copy seccomp details explicitly here, in case they were changed
2254 * before holding sighand lock.
2255 */
2256 copy_seccomp(p);
2257

--- 71 unchanged lines hidden (view full) ---

2329 trace_task_newtask(p, clone_flags);
2330 uprobe_copy_process(p, clone_flags);
2331
2332 copy_oom_score_adj(clone_flags, p);
2333
2334 return p;
2335
2336bad_fork_cancel_cgroup:
2256 spin_lock(&current->sighand->siglock);
2257
2258 /*
2259 * Copy seccomp details explicitly here, in case they were changed
2260 * before holding sighand lock.
2261 */
2262 copy_seccomp(p);
2263

--- 71 unchanged lines hidden (view full) ---

2335 trace_task_newtask(p, clone_flags);
2336 uprobe_copy_process(p, clone_flags);
2337
2338 copy_oom_score_adj(clone_flags, p);
2339
2340 return p;
2341
2342bad_fork_cancel_cgroup:
2343 sched_core_free(p);
2337 spin_unlock(&current->sighand->siglock);
2338 write_unlock_irq(&tasklist_lock);
2339 cgroup_cancel_fork(p, args);
2340bad_fork_put_pidfd:
2341 if (clone_flags & CLONE_PIDFD) {
2342 fput(pidfile);
2343 put_unused_fd(pidfd);
2344 }

--- 55 unchanged lines hidden (view full) ---

2400 enum pid_type type;
2401
2402 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2403 INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
2404 init_task_pid(idle, type, &init_struct_pid);
2405 }
2406}
2407
2344 spin_unlock(&current->sighand->siglock);
2345 write_unlock_irq(&tasklist_lock);
2346 cgroup_cancel_fork(p, args);
2347bad_fork_put_pidfd:
2348 if (clone_flags & CLONE_PIDFD) {
2349 fput(pidfile);
2350 put_unused_fd(pidfd);
2351 }

--- 55 unchanged lines hidden (view full) ---

2407 enum pid_type type;
2408
2409 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2410 INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
2411 init_task_pid(idle, type, &init_struct_pid);
2412 }
2413}
2414
2408struct task_struct *fork_idle(int cpu)
2415struct task_struct * __init fork_idle(int cpu)
2409{
2410 struct task_struct *task;
2411 struct kernel_clone_args args = {
2412 .flags = CLONE_VM,
2413 };
2414
2415 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2416 if (!IS_ERR(task)) {

--- 303 unchanged lines hidden (view full) ---

2720static bool clone3_args_valid(struct kernel_clone_args *kargs)
2721{
2722 /* Verify that no unknown flags are passed along. */
2723 if (kargs->flags &
2724 ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
2725 return false;
2726
2727 /*
2416{
2417 struct task_struct *task;
2418 struct kernel_clone_args args = {
2419 .flags = CLONE_VM,
2420 };
2421
2422 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2423 if (!IS_ERR(task)) {

--- 303 unchanged lines hidden (view full) ---

2727static bool clone3_args_valid(struct kernel_clone_args *kargs)
2728{
2729 /* Verify that no unknown flags are passed along. */
2730 if (kargs->flags &
2731 ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
2732 return false;
2733
2734 /*
2728 * - make the CLONE_DETACHED bit reuseable for clone3
2729 * - make the CSIGNAL bits reuseable for clone3
2735 * - make the CLONE_DETACHED bit reusable for clone3
2736 * - make the CSIGNAL bits reusable for clone3
2730 */
2731 if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2732 return false;
2733
2734 if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
2735 (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
2736 return false;
2737

--- 372 unchanged lines hidden ---
2737 */
2738 if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2739 return false;
2740
2741 if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
2742 (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
2743 return false;
2744

--- 372 unchanged lines hidden ---