// SPDX-License-Identifier: GPL-2.0-only
/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
 */
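/*
 * A minimal usage sketch, assuming a cgroup2 hierarchy mounted at
 * /sys/fs/cgroup and a child cgroup named "sandbox" with the pids controller
 * enabled (both names are illustrative; the pids.* file semantics are those
 * described above):
 *
 *	# mkdir /sys/fs/cgroup/sandbox
 *	# echo 2 > /sys/fs/cgroup/sandbox/pids.max
 *	# cat /sys/fs/cgroup/sandbox/pids.current
 *	0
 *
 * Once pids.current would exceed pids.max (or any ancestor's limit), fork()
 * by a task inside "sandbox" fails with -EAGAIN and the "max" count in
 * pids.events is incremented.
 */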
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>

#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

struct pids_cgroup {
	struct cgroup_subsys_state	css;

	/*
	 * Use 64-bit types so that we can safely represent "max" as
	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
	 */
	atomic64_t			counter;
	int64_t				limit;

	/* Handle for "pids.events" */
	struct cgroup_file		events_file;

	/* Number of times fork failed because limit was hit. */
	atomic64_t			events_limit;
};

static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
	return container_of(css, struct pids_cgroup, css);
}

static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
	return css_pids(pids->css.parent);
}

static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
	struct pids_cgroup *pids;

	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
	if (!pids)
		return ERR_PTR(-ENOMEM);

	pids->limit = PIDS_MAX;
	atomic64_set(&pids->counter, 0);
	atomic64_set(&pids->events_limit, 0);
	return &pids->css;
}

static void pids_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_pids(css));
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
	/*
	 * A negative count (or overflow for that matter) is invalid,
	 * and indicates a bug in the `pids` controller proper.
	 */
	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p;

	for (p = pids; parent_pids(p); p = parent_pids(p))
		pids_cancel(p, num);
}

/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function does *not* follow the pid limit set. It cannot fail and the new
 * pid count may exceed the limit. This is only used for reverting failed
 * attaches, where there is no other way out than violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p;

	for (p = pids; parent_pids(p); p = parent_pids(p))
		atomic64_add(num, &p->counter);
}

/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p, *q;

	for (p = pids; parent_pids(p); p = parent_pids(p)) {
		int64_t new = atomic64_add_return(num, &p->counter);

		/*
		 * Since new is capped to the maximum number of pid_t, if
		 * p->limit is %PIDS_MAX then we know that this test will never
		 * fail.
		 */
		if (new > p->limit)
			goto revert;
	}

	return 0;

revert:
	for (q = pids; q != p; q = parent_pids(q))
		pids_cancel(q, num);
	pids_cancel(p, num);

	return -EAGAIN;
}
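/*
 * Worked example of the walks above, with illustrative cgroups a/b/c under
 * the root: charging c via pids_try_charge() increments the counters of c,
 * b and a in turn; the root cgroup has no parent_pids(), so its counter is
 * never touched. If the limit check fails at a, the revert loop uncharges
 * c and b, and the final pids_cancel() uncharges a itself, restoring every
 * counter to its prior value.
 */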
static int pids_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct pids_cgroup *pids = css_pids(dst_css);
		struct cgroup_subsys_state *old_css;
		struct pids_cgroup *old_pids;

		/*
		 * No need to pin @old_css between here and cancel_attach()
		 * because cgroup core protects it from being freed before
		 * the migration completes or fails.
		 */
		old_css = task_css(task, pids_cgrp_id);
		old_pids = css_pids(old_css);

		pids_charge(pids, 1);
		pids_uncharge(old_pids, 1);
	}

	return 0;
}

static void pids_cancel_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct pids_cgroup *pids = css_pids(dst_css);
		struct cgroup_subsys_state *old_css;
		struct pids_cgroup *old_pids;

		old_css = task_css(task, pids_cgrp_id);
		old_pids = css_pids(old_css);

		pids_charge(old_pids, 1);
		pids_uncharge(pids, 1);
	}
}

/*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by copy_process().
 */
static int pids_can_fork(struct task_struct *task)
{
	struct cgroup_subsys_state *css;
	struct pids_cgroup *pids;
	int err;

	css = task_css_check(current, pids_cgrp_id, true);
	pids = css_pids(css);
	err = pids_try_charge(pids, 1);
	if (err) {
		/* Only log the first time events_limit is incremented. */
		if (atomic64_inc_return(&pids->events_limit) == 1) {
			pr_info("cgroup: fork rejected by pids controller in ");
			pr_cont_cgroup_path(css->cgroup);
			pr_cont("\n");
		}
		cgroup_file_notify(&pids->events_file);
	}
	return err;
}

static void pids_cancel_fork(struct task_struct *task)
{
	struct cgroup_subsys_state *css;
	struct pids_cgroup *pids;

	css = task_css_check(current, pids_cgrp_id, true);
	pids = css_pids(css);
	pids_uncharge(pids, 1);
}

static void pids_release(struct task_struct *task)
{
	struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));

	pids_uncharge(pids, 1);
}

static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct pids_cgroup *pids = css_pids(css);
	int64_t limit;
	int err;

	buf = strstrip(buf);
	if (!strcmp(buf, PIDS_MAX_STR)) {
		limit = PIDS_MAX;
		goto set_limit;
	}

	err = kstrtoll(buf, 0, &limit);
	if (err)
		return err;

	if (limit < 0 || limit >= PIDS_MAX)
		return -EINVAL;

set_limit:
	/*
	 * Limit updates don't need to be mutex'd, since it isn't
	 * critical that any racing fork()s follow the new limit.
	 */
	pids->limit = limit;
	return nbytes;
}

static int pids_max_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);
	struct pids_cgroup *pids = css_pids(css);
	int64_t limit = pids->limit;

	if (limit >= PIDS_MAX)
		seq_printf(sf, "%s\n", PIDS_MAX_STR);
	else
		seq_printf(sf, "%lld\n", limit);

	return 0;
}

static s64 pids_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct pids_cgroup *pids = css_pids(css);

	return atomic64_read(&pids->counter);
}

static int pids_events_show(struct seq_file *sf, void *v)
{
	struct pids_cgroup *pids = css_pids(seq_css(sf));

	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
	return 0;
}

static struct cftype pids_files[] = {
	{
		.name = "max",
		.write = pids_max_write,
		.seq_show = pids_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.read_s64 = pids_current_read,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "events",
		.seq_show = pids_events_show,
		.file_offset = offsetof(struct pids_cgroup, events_file),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};

struct cgroup_subsys pids_cgrp_subsys = {
	.css_alloc	= pids_css_alloc,
	.css_free	= pids_css_free,
	.can_attach	= pids_can_attach,
	.cancel_attach	= pids_cancel_attach,
	.can_fork	= pids_can_fork,
	.cancel_fork	= pids_cancel_fork,
	.release	= pids_release,
	.legacy_cftypes	= pids_files,
	.dfl_cftypes	= pids_files,
	.threaded	= true,
};
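/*
 * Sketch of what a reader of the files registered above sees, reusing the
 * illustrative "sandbox" cgroup from the example near the top of this file:
 * after a fork() has been rejected by the limit, pids.events carries the
 * counter printed by pids_events_show():
 *
 *	# cat /sys/fs/cgroup/sandbox/pids.events
 *	max 1
 */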