/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License. See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */
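/*
 * Illustrative usage sketch of the interface described above, from a
 * cgroup v2 hierarchy (the mount point /sys/fs/cgroup and the cgroup
 * name "mygroup" are assumptions for the example, not part of this
 * file):
 *
 *      # echo "+pids" > /sys/fs/cgroup/cgroup.subtree_control
 *      # mkdir /sys/fs/cgroup/mygroup
 *      # echo 10 > /sys/fs/cgroup/mygroup/pids.max
 *      # cat /sys/fs/cgroup/mygroup/pids.current
 *      0
 *
 * Once pids.current reaches 10, further fork()s by tasks in "mygroup"
 * fail with -EAGAIN and the "max" counter shown in pids.events is
 * incremented.
 */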
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>

#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

struct pids_cgroup {
        struct cgroup_subsys_state      css;

        /*
         * Use 64-bit types so that we can safely represent "max" as
         * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
         */
        atomic64_t                      counter;
        int64_t                         limit;

        /* Handle for "pids.events" */
        struct cgroup_file              events_file;

        /* Number of times fork failed because limit was hit. */
        atomic64_t                      events_limit;
};

static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
        return container_of(css, struct pids_cgroup, css);
}

static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
        return css_pids(pids->css.parent);
}

static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
        struct pids_cgroup *pids;

        pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
        if (!pids)
                return ERR_PTR(-ENOMEM);

        pids->limit = PIDS_MAX;
        atomic64_set(&pids->counter, 0);
        atomic64_set(&pids->events_limit, 0);
        return &pids->css;
}

static void pids_css_free(struct cgroup_subsys_state *css)
{
        kfree(css_pids(css));
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
        /*
         * A negative count (or overflow for that matter) is invalid,
         * and indicates a bug in the `pids` controller proper.
         */
        WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; parent_pids(p); p = parent_pids(p))
                pids_cancel(p, num);
}

/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function does *not* follow the pid limit set. It cannot fail and the new
 * pid count may exceed the limit. This is only used for reverting failed
 * attaches, where there is no other way out than violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; parent_pids(p); p = parent_pids(p))
                atomic64_add(num, &p->counter);
}

/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p, *q;

        for (p = pids; parent_pids(p); p = parent_pids(p)) {
                int64_t new = atomic64_add_return(num, &p->counter);

                /*
                 * Since new is capped to the maximum number of pid_t, if
                 * p->limit is %PIDS_MAX then we know that this test will never
                 * fail.
                 */
                if (new > p->limit)
                        goto revert;
        }

        return 0;

revert:
        for (q = pids; q != p; q = parent_pids(q))
                pids_cancel(q, num);
        pids_cancel(p, num);

        return -EAGAIN;
}
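/*
 * A worked example of the charge/revert logic above (the hierarchy and
 * numbers are hypothetical): suppose /parent has pids.max = 2 and
 * /parent/child has pids.max = "max" (%PIDS_MAX), with both counters
 * currently at 2. pids_try_charge(child, 1) first bumps child's counter
 * to 3, which passes (3 <= %PIDS_MAX), then bumps parent's counter to 3,
 * which exceeds parent's limit of 2. The revert loop then walks from
 * child back up to and including parent, calling pids_cancel() to undo
 * both speculative charges, and -EAGAIN is returned to the caller. Note
 * that the loop condition (parent_pids(p) != NULL) means the root cgroup
 * itself is never charged, which is consistent with pids.max not being
 * exposed on the root.
 */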
static int pids_can_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *pids = css_pids(dst_css);
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                /*
                 * No need to pin @old_css between here and cancel_attach()
                 * because cgroup core protects it from being freed before
                 * the migration completes or fails.
                 */
                old_css = task_css(task, pids_cgrp_id);
                old_pids = css_pids(old_css);

                pids_charge(pids, 1);
                pids_uncharge(old_pids, 1);
        }

        return 0;
}

static void pids_cancel_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *pids = css_pids(dst_css);
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                old_css = task_css(task, pids_cgrp_id);
                old_pids = css_pids(old_css);

                pids_charge(old_pids, 1);
                pids_uncharge(pids, 1);
        }
}

/*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by copy_process().
 */
static int pids_can_fork(struct task_struct *task)
{
        struct cgroup_subsys_state *css;
        struct pids_cgroup *pids;
        int err;

        css = task_css_check(current, pids_cgrp_id, true);
        pids = css_pids(css);
        err = pids_try_charge(pids, 1);
        if (err) {
                /* Only log the first time events_limit is incremented. */
                if (atomic64_inc_return(&pids->events_limit) == 1) {
                        pr_info("cgroup: fork rejected by pids controller in ");
                        pr_cont_cgroup_path(css->cgroup);
                        pr_cont("\n");
                }
                cgroup_file_notify(&pids->events_file);
        }
        return err;
}

static void pids_cancel_fork(struct task_struct *task)
{
        struct cgroup_subsys_state *css;
        struct pids_cgroup *pids;

        css = task_css_check(current, pids_cgrp_id, true);
        pids = css_pids(css);
        pids_uncharge(pids, 1);
}

static void pids_free(struct task_struct *task)
{
        struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));

        pids_uncharge(pids, 1);
}

static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, loff_t off)
{
        struct cgroup_subsys_state *css = of_css(of);
        struct pids_cgroup *pids = css_pids(css);
        int64_t limit;
        int err;

        buf = strstrip(buf);
        if (!strcmp(buf, PIDS_MAX_STR)) {
                limit = PIDS_MAX;
                goto set_limit;
        }

        err = kstrtoll(buf, 0, &limit);
        if (err)
                return err;

        if (limit < 0 || limit >= PIDS_MAX)
                return -EINVAL;

set_limit:
        /*
         * Limit updates don't need to be mutex'd, since it isn't
         * critical that any racing fork()s follow the new limit.
         */
        pids->limit = limit;
        return nbytes;
}

static int pids_max_show(struct seq_file *sf, void *v)
{
        struct cgroup_subsys_state *css = seq_css(sf);
        struct pids_cgroup *pids = css_pids(css);
        int64_t limit = pids->limit;

        if (limit >= PIDS_MAX)
                seq_printf(sf, "%s\n", PIDS_MAX_STR);
        else
                seq_printf(sf, "%lld\n", limit);

        return 0;
}

static s64 pids_current_read(struct cgroup_subsys_state *css,
                             struct cftype *cft)
{
        struct pids_cgroup *pids = css_pids(css);

        return atomic64_read(&pids->counter);
}

static int pids_events_show(struct seq_file *sf, void *v)
{
        struct pids_cgroup *pids = css_pids(seq_css(sf));

        seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
        return 0;
}

static struct cftype pids_files[] = {
        {
                .name = "max",
                .write = pids_max_write,
                .seq_show = pids_max_show,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "current",
                .read_s64 = pids_current_read,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "events",
                .seq_show = pids_events_show,
                .file_offset = offsetof(struct pids_cgroup, events_file),
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        { }     /* terminate */
};

struct cgroup_subsys pids_cgrp_subsys = {
        .css_alloc      = pids_css_alloc,
        .css_free       = pids_css_free,
        .can_attach     = pids_can_attach,
        .cancel_attach  = pids_cancel_attach,
        .can_fork       = pids_can_fork,
        .cancel_fork    = pids_cancel_fork,
        .free           = pids_free,
        .legacy_cftypes = pids_files,
        .dfl_cftypes    = pids_files,
        .threaded       = true,
};
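/*
 * Userspace sketch of the fork() behaviour this controller produces
 * (illustrative only, not part of the kernel build; it assumes the
 * calling process has already been placed in a cgroup with a low
 * pids.max, e.g. via the usage example near the top of this file):
 *
 *      #include <errno.h>
 *      #include <stdio.h>
 *      #include <sys/types.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              for (;;) {
 *                      pid_t pid = fork();
 *
 *                      if (pid < 0) {
 *                              // With the pids limit hit, fork() fails
 *                              // with EAGAIN, not ENOMEM.
 *                              perror("fork");
 *                              return errno == EAGAIN ? 0 : 1;
 *                      }
 *                      if (pid == 0)
 *                              pause(); // child: keep the pid charged
 *              }
 *      }
 *
 * Each successful fork() leaves a child alive in pause(), so pids.current
 * climbs until the most stringent limit on the path to the root is hit,
 * at which point pids_can_fork() returns -EAGAIN and fork() reports it
 * via errno.
 */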