174bd59bbSPavel Emelyanov /* 274bd59bbSPavel Emelyanov * Pid namespaces 374bd59bbSPavel Emelyanov * 474bd59bbSPavel Emelyanov * Authors: 574bd59bbSPavel Emelyanov * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. 674bd59bbSPavel Emelyanov * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM 774bd59bbSPavel Emelyanov * Many thanks to Oleg Nesterov for comments and help 874bd59bbSPavel Emelyanov * 974bd59bbSPavel Emelyanov */ 1074bd59bbSPavel Emelyanov 1174bd59bbSPavel Emelyanov #include <linux/pid.h> 1274bd59bbSPavel Emelyanov #include <linux/pid_namespace.h> 1374bd59bbSPavel Emelyanov #include <linux/syscalls.h> 1474bd59bbSPavel Emelyanov #include <linux/err.h> 150b6b030fSPavel Emelyanov #include <linux/acct.h> 165a0e3ad6STejun Heo #include <linux/slab.h> 174308eebbSEric W. Biederman #include <linux/proc_fs.h> 18cf3f8921SDaniel Lezcano #include <linux/reboot.h> 19523a6a94SEric W. Biederman #include <linux/export.h> 2074bd59bbSPavel Emelyanov 2174bd59bbSPavel Emelyanov #define BITS_PER_PAGE (PAGE_SIZE*8) 2274bd59bbSPavel Emelyanov 2374bd59bbSPavel Emelyanov struct pid_cache { 2474bd59bbSPavel Emelyanov int nr_ids; 2574bd59bbSPavel Emelyanov char name[16]; 2674bd59bbSPavel Emelyanov struct kmem_cache *cachep; 2774bd59bbSPavel Emelyanov struct list_head list; 2874bd59bbSPavel Emelyanov }; 2974bd59bbSPavel Emelyanov 3074bd59bbSPavel Emelyanov static LIST_HEAD(pid_caches_lh); 3174bd59bbSPavel Emelyanov static DEFINE_MUTEX(pid_caches_mutex); 3274bd59bbSPavel Emelyanov static struct kmem_cache *pid_ns_cachep; 3374bd59bbSPavel Emelyanov 3474bd59bbSPavel Emelyanov /* 3574bd59bbSPavel Emelyanov * creates the kmem cache to allocate pids from. 3674bd59bbSPavel Emelyanov * @nr_ids: the number of numerical ids this pid will have to carry 3774bd59bbSPavel Emelyanov */ 3874bd59bbSPavel Emelyanov 3974bd59bbSPavel Emelyanov static struct kmem_cache *create_pid_cachep(int nr_ids) 4074bd59bbSPavel Emelyanov { 4174bd59bbSPavel Emelyanov struct pid_cache *pcache; 4274bd59bbSPavel Emelyanov struct kmem_cache *cachep; 4374bd59bbSPavel Emelyanov 4474bd59bbSPavel Emelyanov mutex_lock(&pid_caches_mutex); 4574bd59bbSPavel Emelyanov list_for_each_entry(pcache, &pid_caches_lh, list) 4674bd59bbSPavel Emelyanov if (pcache->nr_ids == nr_ids) 4774bd59bbSPavel Emelyanov goto out; 4874bd59bbSPavel Emelyanov 4974bd59bbSPavel Emelyanov pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); 5074bd59bbSPavel Emelyanov if (pcache == NULL) 5174bd59bbSPavel Emelyanov goto err_alloc; 5274bd59bbSPavel Emelyanov 5374bd59bbSPavel Emelyanov snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); 5474bd59bbSPavel Emelyanov cachep = kmem_cache_create(pcache->name, 5574bd59bbSPavel Emelyanov sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), 5674bd59bbSPavel Emelyanov 0, SLAB_HWCACHE_ALIGN, NULL); 5774bd59bbSPavel Emelyanov if (cachep == NULL) 5874bd59bbSPavel Emelyanov goto err_cachep; 5974bd59bbSPavel Emelyanov 6074bd59bbSPavel Emelyanov pcache->nr_ids = nr_ids; 6174bd59bbSPavel Emelyanov pcache->cachep = cachep; 6274bd59bbSPavel Emelyanov list_add(&pcache->list, &pid_caches_lh); 6374bd59bbSPavel Emelyanov out: 6474bd59bbSPavel Emelyanov mutex_unlock(&pid_caches_mutex); 6574bd59bbSPavel Emelyanov return pcache->cachep; 6674bd59bbSPavel Emelyanov 6774bd59bbSPavel Emelyanov err_cachep: 6874bd59bbSPavel Emelyanov kfree(pcache); 6974bd59bbSPavel Emelyanov err_alloc: 7074bd59bbSPavel Emelyanov mutex_unlock(&pid_caches_mutex); 7174bd59bbSPavel Emelyanov return NULL; 7274bd59bbSPavel Emelyanov } 7374bd59bbSPavel Emelyanov 74*f2302505SAndrew Vagin /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 75*f2302505SAndrew Vagin #define MAX_PID_NS_LEVEL 32 76*f2302505SAndrew Vagin 77ed469a63SAlexey Dobriyan static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) 7874bd59bbSPavel Emelyanov { 7974bd59bbSPavel Emelyanov struct pid_namespace *ns; 80ed469a63SAlexey Dobriyan unsigned int level = parent_pid_ns->level + 1; 81*f2302505SAndrew Vagin int i; 82*f2302505SAndrew Vagin int err; 8374bd59bbSPavel Emelyanov 84*f2302505SAndrew Vagin if (level > MAX_PID_NS_LEVEL) { 85*f2302505SAndrew Vagin err = -EINVAL; 86*f2302505SAndrew Vagin goto out; 87*f2302505SAndrew Vagin } 88*f2302505SAndrew Vagin 89*f2302505SAndrew Vagin err = -ENOMEM; 9084406c15SPavel Emelyanov ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 9174bd59bbSPavel Emelyanov if (ns == NULL) 9274bd59bbSPavel Emelyanov goto out; 9374bd59bbSPavel Emelyanov 9474bd59bbSPavel Emelyanov ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 9574bd59bbSPavel Emelyanov if (!ns->pidmap[0].page) 9674bd59bbSPavel Emelyanov goto out_free; 9774bd59bbSPavel Emelyanov 9874bd59bbSPavel Emelyanov ns->pid_cachep = create_pid_cachep(level + 1); 9974bd59bbSPavel Emelyanov if (ns->pid_cachep == NULL) 10074bd59bbSPavel Emelyanov goto out_free_map; 10174bd59bbSPavel Emelyanov 10274bd59bbSPavel Emelyanov kref_init(&ns->kref); 10374bd59bbSPavel Emelyanov ns->level = level; 104ed469a63SAlexey Dobriyan ns->parent = get_pid_ns(parent_pid_ns); 10574bd59bbSPavel Emelyanov 10674bd59bbSPavel Emelyanov set_bit(0, ns->pidmap[0].page); 10774bd59bbSPavel Emelyanov atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 10874bd59bbSPavel Emelyanov 10984406c15SPavel Emelyanov for (i = 1; i < PIDMAP_ENTRIES; i++) 11074bd59bbSPavel Emelyanov atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 11174bd59bbSPavel Emelyanov 1124308eebbSEric W. Biederman err = pid_ns_prepare_proc(ns); 1134308eebbSEric W. Biederman if (err) 1144308eebbSEric W. Biederman goto out_put_parent_pid_ns; 1154308eebbSEric W. Biederman 11674bd59bbSPavel Emelyanov return ns; 11774bd59bbSPavel Emelyanov 1184308eebbSEric W. Biederman out_put_parent_pid_ns: 1194308eebbSEric W. Biederman put_pid_ns(parent_pid_ns); 12074bd59bbSPavel Emelyanov out_free_map: 12174bd59bbSPavel Emelyanov kfree(ns->pidmap[0].page); 12274bd59bbSPavel Emelyanov out_free: 12374bd59bbSPavel Emelyanov kmem_cache_free(pid_ns_cachep, ns); 12474bd59bbSPavel Emelyanov out: 1254308eebbSEric W. Biederman return ERR_PTR(err); 12674bd59bbSPavel Emelyanov } 12774bd59bbSPavel Emelyanov 12874bd59bbSPavel Emelyanov static void destroy_pid_namespace(struct pid_namespace *ns) 12974bd59bbSPavel Emelyanov { 13074bd59bbSPavel Emelyanov int i; 13174bd59bbSPavel Emelyanov 13274bd59bbSPavel Emelyanov for (i = 0; i < PIDMAP_ENTRIES; i++) 13374bd59bbSPavel Emelyanov kfree(ns->pidmap[i].page); 13474bd59bbSPavel Emelyanov kmem_cache_free(pid_ns_cachep, ns); 13574bd59bbSPavel Emelyanov } 13674bd59bbSPavel Emelyanov 13774bd59bbSPavel Emelyanov struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 13874bd59bbSPavel Emelyanov { 13974bd59bbSPavel Emelyanov if (!(flags & CLONE_NEWPID)) 140dca4a979SAlexey Dobriyan return get_pid_ns(old_ns); 141e5a47386SSukadev Bhattiprolu if (flags & (CLONE_THREAD|CLONE_PARENT)) 142dca4a979SAlexey Dobriyan return ERR_PTR(-EINVAL); 143dca4a979SAlexey Dobriyan return create_pid_namespace(old_ns); 14474bd59bbSPavel Emelyanov } 14574bd59bbSPavel Emelyanov 146bbc2e3efSCyrill Gorcunov static void free_pid_ns(struct kref *kref) 14774bd59bbSPavel Emelyanov { 148bbc2e3efSCyrill Gorcunov struct pid_namespace *ns; 14974bd59bbSPavel Emelyanov 15074bd59bbSPavel Emelyanov ns = container_of(kref, struct pid_namespace, kref); 15174bd59bbSPavel Emelyanov destroy_pid_namespace(ns); 15274bd59bbSPavel Emelyanov } 153bbc2e3efSCyrill Gorcunov 154bbc2e3efSCyrill Gorcunov void put_pid_ns(struct pid_namespace *ns) 155bbc2e3efSCyrill Gorcunov { 156bbc2e3efSCyrill Gorcunov struct pid_namespace *parent; 157bbc2e3efSCyrill Gorcunov 158bbc2e3efSCyrill Gorcunov while (ns != &init_pid_ns) { 159bbc2e3efSCyrill Gorcunov parent = ns->parent; 160bbc2e3efSCyrill Gorcunov if (!kref_put(&ns->kref, free_pid_ns)) 161bbc2e3efSCyrill Gorcunov break; 162bbc2e3efSCyrill Gorcunov ns = parent; 163bbc2e3efSCyrill Gorcunov } 164bbc2e3efSCyrill Gorcunov } 165bbc2e3efSCyrill Gorcunov EXPORT_SYMBOL_GPL(put_pid_ns); 16674bd59bbSPavel Emelyanov 16774bd59bbSPavel Emelyanov void zap_pid_ns_processes(struct pid_namespace *pid_ns) 16874bd59bbSPavel Emelyanov { 16974bd59bbSPavel Emelyanov int nr; 17074bd59bbSPavel Emelyanov int rc; 17100c10bc1SEric W. Biederman struct task_struct *task, *me = current; 17200c10bc1SEric W. Biederman 17300c10bc1SEric W. Biederman /* Ignore SIGCHLD causing any terminated children to autoreap */ 17400c10bc1SEric W. Biederman spin_lock_irq(&me->sighand->siglock); 17500c10bc1SEric W. Biederman me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; 17600c10bc1SEric W. Biederman spin_unlock_irq(&me->sighand->siglock); 17774bd59bbSPavel Emelyanov 17874bd59bbSPavel Emelyanov /* 17974bd59bbSPavel Emelyanov * The last thread in the cgroup-init thread group is terminating. 18074bd59bbSPavel Emelyanov * Find remaining pid_ts in the namespace, signal and wait for them 18174bd59bbSPavel Emelyanov * to exit. 18274bd59bbSPavel Emelyanov * 18374bd59bbSPavel Emelyanov * Note: This signals each threads in the namespace - even those that 18474bd59bbSPavel Emelyanov * belong to the same thread group, To avoid this, we would have 18574bd59bbSPavel Emelyanov * to walk the entire tasklist looking a processes in this 18674bd59bbSPavel Emelyanov * namespace, but that could be unnecessarily expensive if the 18774bd59bbSPavel Emelyanov * pid namespace has just a few processes. Or we need to 18874bd59bbSPavel Emelyanov * maintain a tasklist for each pid namespace. 18974bd59bbSPavel Emelyanov * 19074bd59bbSPavel Emelyanov */ 19174bd59bbSPavel Emelyanov read_lock(&tasklist_lock); 19274bd59bbSPavel Emelyanov nr = next_pidmap(pid_ns, 1); 19374bd59bbSPavel Emelyanov while (nr > 0) { 194e4da026fSSukadev Bhattiprolu rcu_read_lock(); 195e4da026fSSukadev Bhattiprolu 196e4da026fSSukadev Bhattiprolu task = pid_task(find_vpid(nr), PIDTYPE_PID); 197a02d6fd6SOleg Nesterov if (task && !__fatal_signal_pending(task)) 198a02d6fd6SOleg Nesterov send_sig_info(SIGKILL, SEND_SIG_FORCED, task); 199e4da026fSSukadev Bhattiprolu 200e4da026fSSukadev Bhattiprolu rcu_read_unlock(); 201e4da026fSSukadev Bhattiprolu 20274bd59bbSPavel Emelyanov nr = next_pidmap(pid_ns, nr); 20374bd59bbSPavel Emelyanov } 20474bd59bbSPavel Emelyanov read_unlock(&tasklist_lock); 20574bd59bbSPavel Emelyanov 2066347e900SEric W. Biederman /* Firstly reap the EXIT_ZOMBIE children we may have. */ 20774bd59bbSPavel Emelyanov do { 20874bd59bbSPavel Emelyanov clear_thread_flag(TIF_SIGPENDING); 20974bd59bbSPavel Emelyanov rc = sys_wait4(-1, NULL, __WALL, NULL); 21074bd59bbSPavel Emelyanov } while (rc != -ECHILD); 21174bd59bbSPavel Emelyanov 2126347e900SEric W. Biederman /* 2136347e900SEric W. Biederman * sys_wait4() above can't reap the TASK_DEAD children. 2146347e900SEric W. Biederman * Make sure they all go away, see __unhash_process(). 2156347e900SEric W. Biederman */ 2166347e900SEric W. Biederman for (;;) { 2176347e900SEric W. Biederman bool need_wait = false; 2186347e900SEric W. Biederman 2196347e900SEric W. Biederman read_lock(&tasklist_lock); 2206347e900SEric W. Biederman if (!list_empty(¤t->children)) { 2216347e900SEric W. Biederman __set_current_state(TASK_UNINTERRUPTIBLE); 2226347e900SEric W. Biederman need_wait = true; 2236347e900SEric W. Biederman } 2246347e900SEric W. Biederman read_unlock(&tasklist_lock); 2256347e900SEric W. Biederman 2266347e900SEric W. Biederman if (!need_wait) 2276347e900SEric W. Biederman break; 2286347e900SEric W. Biederman schedule(); 2296347e900SEric W. Biederman } 2306347e900SEric W. Biederman 231cf3f8921SDaniel Lezcano if (pid_ns->reboot) 232cf3f8921SDaniel Lezcano current->signal->group_exit_code = pid_ns->reboot; 233cf3f8921SDaniel Lezcano 2340b6b030fSPavel Emelyanov acct_exit_ns(pid_ns); 23574bd59bbSPavel Emelyanov return; 23674bd59bbSPavel Emelyanov } 23774bd59bbSPavel Emelyanov 23898ed57eeSCyrill Gorcunov #ifdef CONFIG_CHECKPOINT_RESTORE 239b8f566b0SPavel Emelyanov static int pid_ns_ctl_handler(struct ctl_table *table, int write, 240b8f566b0SPavel Emelyanov void __user *buffer, size_t *lenp, loff_t *ppos) 241b8f566b0SPavel Emelyanov { 242b8f566b0SPavel Emelyanov struct ctl_table tmp = *table; 243b8f566b0SPavel Emelyanov 244b8f566b0SPavel Emelyanov if (write && !capable(CAP_SYS_ADMIN)) 245b8f566b0SPavel Emelyanov return -EPERM; 246b8f566b0SPavel Emelyanov 247b8f566b0SPavel Emelyanov /* 248b8f566b0SPavel Emelyanov * Writing directly to ns' last_pid field is OK, since this field 249b8f566b0SPavel Emelyanov * is volatile in a living namespace anyway and a code writing to 250b8f566b0SPavel Emelyanov * it should synchronize its usage with external means. 251b8f566b0SPavel Emelyanov */ 252b8f566b0SPavel Emelyanov 253b8f566b0SPavel Emelyanov tmp.data = ¤t->nsproxy->pid_ns->last_pid; 254579035dcSAndrew Vagin return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 255b8f566b0SPavel Emelyanov } 256b8f566b0SPavel Emelyanov 257579035dcSAndrew Vagin extern int pid_max; 258579035dcSAndrew Vagin static int zero = 0; 259b8f566b0SPavel Emelyanov static struct ctl_table pid_ns_ctl_table[] = { 260b8f566b0SPavel Emelyanov { 261b8f566b0SPavel Emelyanov .procname = "ns_last_pid", 262b8f566b0SPavel Emelyanov .maxlen = sizeof(int), 263b8f566b0SPavel Emelyanov .mode = 0666, /* permissions are checked in the handler */ 264b8f566b0SPavel Emelyanov .proc_handler = pid_ns_ctl_handler, 265579035dcSAndrew Vagin .extra1 = &zero, 266579035dcSAndrew Vagin .extra2 = &pid_max, 267b8f566b0SPavel Emelyanov }, 268b8f566b0SPavel Emelyanov { } 269b8f566b0SPavel Emelyanov }; 270b8f566b0SPavel Emelyanov static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; 27198ed57eeSCyrill Gorcunov #endif /* CONFIG_CHECKPOINT_RESTORE */ 272b8f566b0SPavel Emelyanov 273cf3f8921SDaniel Lezcano int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) 274cf3f8921SDaniel Lezcano { 275cf3f8921SDaniel Lezcano if (pid_ns == &init_pid_ns) 276cf3f8921SDaniel Lezcano return 0; 277cf3f8921SDaniel Lezcano 278cf3f8921SDaniel Lezcano switch (cmd) { 279cf3f8921SDaniel Lezcano case LINUX_REBOOT_CMD_RESTART2: 280cf3f8921SDaniel Lezcano case LINUX_REBOOT_CMD_RESTART: 281cf3f8921SDaniel Lezcano pid_ns->reboot = SIGHUP; 282cf3f8921SDaniel Lezcano break; 283cf3f8921SDaniel Lezcano 284cf3f8921SDaniel Lezcano case LINUX_REBOOT_CMD_POWER_OFF: 285cf3f8921SDaniel Lezcano case LINUX_REBOOT_CMD_HALT: 286cf3f8921SDaniel Lezcano pid_ns->reboot = SIGINT; 287cf3f8921SDaniel Lezcano break; 288cf3f8921SDaniel Lezcano default: 289cf3f8921SDaniel Lezcano return -EINVAL; 290cf3f8921SDaniel Lezcano } 291cf3f8921SDaniel Lezcano 292cf3f8921SDaniel Lezcano read_lock(&tasklist_lock); 293cf3f8921SDaniel Lezcano force_sig(SIGKILL, pid_ns->child_reaper); 294cf3f8921SDaniel Lezcano read_unlock(&tasklist_lock); 295cf3f8921SDaniel Lezcano 296cf3f8921SDaniel Lezcano do_exit(0); 297cf3f8921SDaniel Lezcano 298cf3f8921SDaniel Lezcano /* Not reached */ 299cf3f8921SDaniel Lezcano return 0; 300cf3f8921SDaniel Lezcano } 301cf3f8921SDaniel Lezcano 30274bd59bbSPavel Emelyanov static __init int pid_namespaces_init(void) 30374bd59bbSPavel Emelyanov { 30474bd59bbSPavel Emelyanov pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 30598ed57eeSCyrill Gorcunov 30698ed57eeSCyrill Gorcunov #ifdef CONFIG_CHECKPOINT_RESTORE 307b8f566b0SPavel Emelyanov register_sysctl_paths(kern_path, pid_ns_ctl_table); 30898ed57eeSCyrill Gorcunov #endif 30974bd59bbSPavel Emelyanov return 0; 31074bd59bbSPavel Emelyanov } 31174bd59bbSPavel Emelyanov 31274bd59bbSPavel Emelyanov __initcall(pid_namespaces_init); 313