/*
 * Generic process-grouping system.
 *
 * Based originally on the cpuset system, extracted by Paul Menage
 * Copyright (C) 2006 Google, Inc
 *
 * Notifications support
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Copyright notices from the original cpuset code:
 * --------------------------------------------------
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * ---------------------------------------------------
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file COPYING in the main directory of the Linux
 * distribution for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

#define CGROUP_FILE_NAME_MAX	(MAX_CGROUP_TYPE_NAMELEN +	\
				 MAX_CFTYPE_NAME + 2)

/*
 * cgroup_mutex is the master lock. Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses. It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

#define cgroup_assert_mutex_or_rcu_locked()			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&		\
			 !lockdep_is_held(&cgroup_mutex),	\
			 "cgroup_mutex or RCU read lock required");

/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions. Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)						\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);	\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);	\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);	\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
struct cgroup_root cgrp_dfl_root;
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time. This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses. It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list. Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features to avoid
 * having to do iterative checks repeatedly.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
	.count		= { .counter = 2, },
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() can only be used with literal subsys names which
 * is fine for individual subsystems but unsuitable for cgroup core. This
 * is a slower static_key_enabled() based test indexed by @ssid.
 */
bool cgroup_ssid_enabled(int ssid)
{
	if (CGROUP_SUBSYS_COUNT == 0)
		return false;

	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}

/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup and this function
 * can be used to test whether a cgroup is on the default hierarchy for
 * cases where a subsystem should behave differently depending on the
 * interface version.
 *
 * The set of behaviors which change on the default hierarchy are still
 * being determined and the mount option is prefixed with __DEVEL__.
 *
 * List of changed behaviors:
 *
 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 *   and "name" are disallowed.
 *
 * - When mounting an existing superblock, mount options should match.
 *
 * - Remount is disallowed.
 *
 * - rename(2) is disallowed.
 *
 * - "tasks" is removed. Everything should be at process granularity. Use
 *   "cgroup.procs" instead.
 *
 * - "cgroup.procs" is not sorted. pids will be unique unless they got
 *   recycled in between reads.
 *
 * - "release_agent" and "notify_on_release" are removed. Replacement
 *   notification mechanism will be implemented.
 *
 * - "cgroup.clone_children" is removed.
 *
 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
 *   and its descendants contain no task; otherwise, 1. The file also
 *   generates kernfs notification which can be monitored through poll and
 *   [di]notify when the value of the file changes.
 *
 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 *   take masks of ancestors with non-empty cpus/mems, instead of being
 *   moved to an ancestor.
 *
 * - cpuset: a task can be moved into an empty cpuset, and again it takes
 *   masks of ancestors.
 *
 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 *   is not created.
 *
 * - blkcg: blk-throttle becomes properly hierarchical.
 *
 * - debug: disallowed on the default hierarchy.
 */
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}

/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}

static struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *parent_css = cgrp->self.parent;

	if (parent_css)
		return container_of(parent_css, struct cgroup, self);
	return NULL;
}

/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent)
		return parent->subtree_control;

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				  cgrp_dfl_implicit_ss_mask);
	return root_ss_mask;
}

/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent)
		return parent->subtree_ss_mask;

	return cgrp->root->subsys_mask;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks. This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					     lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
						struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	/*
	 * This function is used while updating css associations and thus
	 * can't test the csses directly. Test ss_mask.
	 */
	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			return NULL;
	}

	return cgroup_css(cgrp, ss);
}

/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss. The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}

static void cgroup_get(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

static bool cgroup_tryget(struct cgroup *cgrp)
{
	return css_tryget(&cgrp->self);
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	/*
	 * This is open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which has
	 * an active reference on the file. Because all the subsystem
	 * files are drained before a css is disassociated with a cgroup,
	 * the matching css from the cgroup's subsys table is guaranteed to
	 * be and stay valid until the enclosing operation is complete.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
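/*
 * Example usage of for_each_css(), as done later in this file (e.g. by
 * cgroup_destroy_locked() to kill every css of a cgroup):
 *
 *	struct cgroup_subsys_state *css;
 *	int ssid;
 *
 *	for_each_css(css, ssid, cgrp)
 *		kill_css(css);
 */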
/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
			;						\
		else

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
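/*
 * Example usage of the subsystem mask iterators above, mirroring how
 * rebind_subsystems() walks the requested controllers further below:
 *
 *	struct cgroup_subsys *ss;
 *	int ssid;
 *
 *	do_each_subsys_mask(ss, ssid, ss_mask) {
 *		pr_debug("selected subsys %s (id %d)\n", ss->name, ssid);
 *	} while_each_subsys_mask();
 */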
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/* walk live descendants in preorder */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)	\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
struct css_set init_css_set = {
	.refcount		= ATOMIC_INIT(1),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
};

static int css_set_count	= 1;	/* 1 for init_css_set */

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 */
static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}

/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last. Update @cgrp->populated_cnt accordingly. The
 * count is propagated towards root so that a given cgroup's populated_cnt
 * is zero iff the cgroup and all its descendants don't contain any tasks.
 *
 * @cgrp's interface file "cgroup.populated" is zero if
 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
 * changes from or to zero, userland is notified that the content of the
 * interface file has changed. This can be used to detect when @cgrp and
 * its descendants become populated or empty.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	lockdep_assert_held(&css_set_lock);

	do {
		bool trigger;

		if (populated)
			trigger = !cgrp->populated_cnt++;
		else
			trigger = !--cgrp->populated_cnt;

		if (!trigger)
			break;

		cgroup1_check_for_release(cgrp);
		cgroup_file_notify(&cgrp->events_file);

		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last. Update the
 * ->populated_cnt of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset. If @task didn't belong to any
 * css_set, @from_cset can be NULL. If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
 * This function automatically handles populated_cnt updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		struct css_task_iter *it, *pos;

		WARN_ON_ONCE(list_empty(&task->cg_list));

		/*
		 * @task is leaving, advance task iterators which are
		 * pointing to it so that they can resume at the next
		 * position. Advancing an iterator might remove it from
		 * the list, use safe walk. See css_task_iter_advance*()
		 * for details.
		 */
		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
					 iters_node)
			if (it->task_pos == &task->cg_list)
				css_task_iter_advance(it);

		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit() changing the css_set to
		 * init_css_set and dropping the old one.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		rcu_assign_pointer(task->cgroups, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}

/*
 * hash table for cgroup groups. This improves the performance of finding
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}

void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!atomic_dec_and_test(&cset->refcount))
		return;

	/* This css_set is dead. unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	kfree_rcu(cset, rcu_head);
}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies. As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					     struct cgroup *cgrp,
					     struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. While subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			/*
			 * @ss is in this hierarchy, so we want the
			 * effective css from @cgrp.
			 */
			template[i] = cgroup_e_css(cgrp, ss);
		} else {
			/*
			 * @ss is not in this hierarchy, so we don't want
			 * to change the css.
			 */
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cset;
	}

	/* No existing cgroup group matched */
	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link. Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	/*
	 * Always add links to the tail of the lists so that the lists are
	 * in chronological order.
	 */
	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get(cgrp);
}

/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/* First see if we already have a cgroup group that matches
	 * the desired set */
	spin_lock_irq(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	/* Allocate all the cgrp_cset_link objects that we'll need */
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	atomic_set(&cset->refcount, 1);
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_HLIST_NODE(&cset->hlist);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(cset->subsys, template, sizeof(cset->subsys));

	spin_lock_irq(&css_set_lock);
	/* Add reference counts and links from the new css_set. */
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	/* Add @cset to the hash table */
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_irq(&css_set_lock);

	return cset;
}

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
	struct cgroup *root_cgrp = kf_root->kn->priv;

	return root_cgrp->root;
}

static int cgroup_init_root_id(struct cgroup_root *root)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);

	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

static void cgroup_exit_root_id(struct cgroup_root *root)
{
	lockdep_assert_held(&cgroup_mutex);

	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}

void cgroup_free_root(struct cgroup_root *root)
{
	if (root) {
		idr_destroy(&root->cgroup_idr);
		kfree(root);
	}
}

static void cgroup_destroy_root(struct cgroup_root *root)
{
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;

	trace_cgroup_destroy_root(root);

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	BUG_ON(atomic_read(&root->nr_cgrps));
	BUG_ON(!list_empty(&cgrp->self.children));

	/* Rebind all subsystems back to the default hierarchy */
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

	/*
	 * Release all the links from cset_links to this hierarchy's
	 * root cgroup
	 */
	spin_lock_irq(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}

	spin_unlock_irq(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
}

/*
 * look up cgroup associated with current task's cgroup namespace on the
 * specified hierarchy
 */
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
	struct cgroup *res = NULL;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	rcu_read_lock();

	cset = current->nsproxy->cgroup_ns->root_cset;
	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	rcu_read_unlock();

	BUG_ON(!res);
	return res;
}

/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
					    struct cgroup_root *root)
{
	struct cgroup *res = NULL;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}

	BUG_ON(!res);
	return res;
}

/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with cgroup_mutex and css_set_lock held.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	/*
	 * No need to lock the task - since we hold cgroup_mutex the
	 * task can't change groups, so the only thing that can happen
	 * is that it exits and its css is set back to init_css_set.
	 */
	return cset_cgroup_from_root(task_css_set(task), root);
}

/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing. However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again. Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count). So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty. Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, root cgroup
 * always has either children cgroups and/or using tasks. So we don't
 * need a special hack to ensure that root cgroup cannot be deleted.
 *
 * P.S. One more locking exception. RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task()
 */

static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
{
	struct cgroup_subsys *ss = cft->ss;

	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
			 cft->name);
	else
		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	return buf;
}

/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * S_IRUGO for read, S_IWUSR for write.
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
	umode_t mode = 0;

	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;

	if (cft->write_u64 || cft->write_s64 || cft->write) {
		if (cft->flags & CFTYPE_WORLD_WRITABLE)
			mode |= S_IWUGO;
		else
			mode |= S_IWUSR;
	}

	return mode;
}

/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask. In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 */
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
	u16 cur_ss_mask = subtree_control;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	cur_ss_mask |= cgrp_dfl_implicit_ss_mask;

	while (true) {
		u16 new_ss_mask = cur_ss_mask;

		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
			new_ss_mask |= ss->depends_on;
		} while_each_subsys_mask();

		/*
		 * Mask out subsystems which aren't available. This can
		 * happen only if some depended-upon subsystems were bound
		 * to non-default hierarchies.
		 */
		new_ss_mask &= this_ss_mask;

		if (new_ss_mask == cur_ss_mask)
			break;
		cur_ss_mask = new_ss_mask;
	}

	return cur_ss_mask;
}
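/*
 * For example, if a controller A listed in @subtree_control has B in its
 * ->depends_on mask and B in turn depends on C, the loop above keeps
 * growing the mask until it reaches the fixed point {A, B, C}, with any
 * controller missing from @this_ss_mask dropped along the way.
 */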
/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded. Note that once this function
 * returns the cgroup returned by cgroup_kn_lock_live() may become
 * inaccessible any time. If the caller intends to continue to access the
 * cgroup, it should pin it before invoking this function.
 */
void cgroup_kn_unlock(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
}

/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn. It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive. Returns the cgroup if
 * alive; otherwise, %NULL. A successful return should be undone by a
 * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
 * cgroup is drained of offlining csses before return.
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper. It avoids nesting cgroup
 * locking under kernfs active protection and allows all kernfs operations
 * including self-removal.
 */
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref. cgroup liveness check alone provides enough
	 * protection against removal. Ensure @cgrp stays accessible and
	 * break the active_ref protection.
	 */
	if (!cgroup_tryget(cgrp))
		return NULL;
	kernfs_break_active_protection(kn);

	if (drain_offline)
		cgroup_lock_and_drain_offline(cgrp);
	else
		mutex_lock(&cgroup_mutex);

	if (!cgroup_is_dead(cgrp))
		return cgrp;

	cgroup_kn_unlock(kn);
	return NULL;
}
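/*
 * Typical calling pattern for the pair above, as used by the kernfs
 * write handlers later in this file:
 *
 *	cgrp = cgroup_kn_lock_live(of->kn, true);
 *	if (!cgrp)
 *		return -ENODEV;
 *	... operate on @cgrp under cgroup_mutex ...
 *	cgroup_kn_unlock(of->kn);
 */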
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];

	lockdep_assert_held(&cgroup_mutex);

	if (cft->file_offset) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}

/**
 * css_clear_dir - remove subsys files in a cgroup directory
 * @css: target css
 */
static void css_clear_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts;

	if (!(css->flags & CSS_VISIBLE))
		return;

	css->flags &= ~CSS_VISIBLE;

	list_for_each_entry(cfts, &css->ss->cfts, node)
		cgroup_addrm_files(css, cgrp, cfts, false);
}

/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 *
 * On failure, no file is added.
 */
static int css_populate_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts, *failed_cfts;
	int ret;

	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
		return 0;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_base_files;
		else
			cfts = cgroup1_base_files;

		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
	}

	list_for_each_entry(cfts, &css->ss->cfts, node) {
		ret = cgroup_addrm_files(css, cgrp, cfts, true);
		if (ret < 0) {
			failed_cfts = cfts;
			goto err;
		}
	}

	css->flags |= CSS_VISIBLE;

	return 0;
err:
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		if (cfts == failed_cfts)
			break;
		cgroup_addrm_files(css, cgrp, cfts, false);
	}
	return ret;
}

int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	do_each_subsys_mask(ss, ssid, ss_mask) {
		/*
		 * If @ss has non-root csses attached to it, can't move.
		 * If @ss is an implicit controller, it is exempt from this
		 * rule and can be stolen.
		 */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		/* can't move between two non-dummy roots either */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		/* disable from the source */
		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);

		/* rebind */
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		/* default hierarchy doesn't enable controllers by default */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}

int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
		     struct kernfs_root *kf_root)
{
	int len = 0;
	char *buf = NULL;
	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
	struct cgroup *ns_cgroup;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
	spin_unlock_irq(&css_set_lock);

	if (len >= PATH_MAX)
		len = -ERANGE;
	else if (len > 0) {
		seq_escape(sf, buf, " \t\n\\");
		len = 0;
	}
	kfree(buf);
	return len;
}

static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
	pr_err("remount is not allowed\n");
	return -EINVAL;
}

/*
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in other
 * words after the first mount.
 */
static bool use_task_css_set_links __read_mostly;

static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;

	spin_lock_irq(&css_set_lock);

	if (use_task_css_set_links)
		goto out_unlock;

	use_task_css_set_links = true;

	/*
	 * We need tasklist_lock because RCU is not safe against
	 * while_each_thread(). Besides, a forking task that has passed
	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
	 * is not guaranteed to have its child immediately visible in the
	 * tasklist if we walk through it with RCU.
	 */
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
			     task_css_set(p) != &init_css_set);

		/*
		 * We should check if the process is exiting, otherwise
		 * it will race with cgroup_exit() in that the list
		 * entry won't be deleted though the process has exited.
		 * Do it while holding siglock so that we don't end up
		 * racing against cgroup_exit().
		 *
		 * Interrupts were already disabled while acquiring
		 * the css_set_lock, so we do not need to disable it
		 * again when acquiring the sighand->siglock here.
		 */
		spin_lock(&p->sighand->siglock);
		if (!(p->flags & PF_EXITING)) {
			struct css_set *cset = task_css_set(p);

			if (!css_set_populated(cset))
				css_set_update_populated(cset, true);
			list_add_tail(&p->cg_list, &cset->tasks);
			get_css_set(cset);
		}
		spin_unlock(&p->sighand->siglock);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
out_unlock:
	spin_unlock_irq(&css_set_lock);
}

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	int ssid;

	INIT_LIST_HEAD(&cgrp->self.sibling);
	INIT_LIST_HEAD(&cgrp->self.children);
	INIT_LIST_HEAD(&cgrp->cset_links);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
	cgrp->self.cgroup = cgrp;
	cgrp->self.flags |= CSS_ONLINE;

	for_each_subsys(ss, ssid)
		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

	init_waitqueue_head(&cgrp->offline_waitq);
	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
{
	struct cgroup *cgrp = &root->cgrp;

	INIT_LIST_HEAD(&root->root_list);
	atomic_set(&root->nr_cgrps, 1);
	cgrp->root = root;
	init_cgroup_housekeeping(cgrp);
	idr_init(&root->cgroup_idr);

	root->flags = opts->flags;
	if (opts->release_agent)
		strcpy(root->release_agent_path, opts->release_agent);
	if (opts->name)
		strcpy(root->name, opts->name);
	if (opts->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}

int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct kernfs_syscall_ops *kf_sops;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
	if (ret < 0)
		goto out;
	root_cgrp->id = ret;
	root_cgrp->ancestor_ids[0] = ret;

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
			      GFP_KERNEL);
	if (ret)
		goto out;

	/*
	 * We're accessing css_set_count without locking css_set_lock here,
	 * but that's OK - it can only be increased by someone holding
	 * cgroup_lock, and that's us. Later rebinding may disable
	 * controllers on the default hierarchy and thus create new csets,
	 * which can't be more than the existing ones. Allocate 2x.
	 */
	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	kf_sops = root == &cgrp_dfl_root ?
		  &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

	root->kf_root = kernfs_create_root(kf_sops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;

	ret = css_populate_dir(&root_cgrp->self);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	trace_cgroup_setup_root(root);

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	kernfs_activate(root_cgrp->kn);
	ret = 0;
	goto out;

destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}

struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
			       struct cgroup_root *root, unsigned long magic,
			       struct cgroup_namespace *ns)
{
	struct dentry *dentry;
	bool new_sb;

	dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);

	/*
	 * In non-init cgroup namespace, instead of root cgroup's dentry,
	 * we return the dentry corresponding to the cgroupns->root_cgrp.
	 */
	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
		struct dentry *nsdentry;
		struct cgroup *cgrp;

		mutex_lock(&cgroup_mutex);
		spin_lock_irq(&css_set_lock);

		cgrp = cset_cgroup_from_root(ns->root_cset, root);

		spin_unlock_irq(&css_set_lock);
		mutex_unlock(&cgroup_mutex);

		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
		dput(dentry);
		dentry = nsdentry;
	}

	if (IS_ERR(dentry) || !new_sb)
		cgroup_put(&root->cgrp);

	return dentry;
}

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
				   int flags, const char *unused_dev_name,
				   void *data)
{
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct dentry *dentry;

	get_cgroup_ns(ns);

	/* Check if the caller has permission to mount. */
	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
		put_cgroup_ns(ns);
		return ERR_PTR(-EPERM);
	}

	/*
	 * The first time anyone tries to mount a cgroup, enable the list
	 * linking each css_set to its tasks and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	if (fs_type == &cgroup2_fs_type) {
		if (data) {
			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
			put_cgroup_ns(ns);
			return ERR_PTR(-EINVAL);
		}
		cgrp_dfl_visible = true;
		cgroup_get(&cgrp_dfl_root.cgrp);

		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
					 CGROUP2_SUPER_MAGIC, ns);
	} else {
		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
				       CGROUP_SUPER_MAGIC, ns);
	}

	put_cgroup_ns(ns);
	return dentry;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);

	/*
	 * If @root doesn't have any mounts or children, start killing it.
	 * This prevents new mounts by disabling percpu_ref_tryget_live().
	 * cgroup_mount() may wait for @root's release.
	 *
	 * And don't kill the default root.
	 */
	if (!list_empty(&root->cgrp.self.children) ||
	    root == &cgrp_dfl_root)
		cgroup_put(&root->cgrp);
	else
		percpu_ref_kill(&root->cgrp.self.refcnt);

	kernfs_kill_sb(sb);
}

struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

static struct file_system_type cgroup2_fs_type = {
	.name = "cgroup2",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			  struct cgroup_namespace *ns)
{
	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);

	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		   struct cgroup_namespace *ns)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
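/*
 * Illustrative use of cgroup_path_ns(): format @cgrp's path relative to
 * the init cgroup namespace into a caller-provided buffer (the return
 * value follows kernfs_path_from_node() semantics):
 *
 *	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
 *
 *	if (buf && cgroup_path_ns(cgrp, buf, PATH_MAX, &init_cgroup_ns) > 0)
 *		pr_debug("cgroup path: %s\n", buf);
 *	kfree(buf);
 */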
1889 */ 1890 int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1891 { 1892 struct cgroup_root *root; 1893 struct cgroup *cgrp; 1894 int hierarchy_id = 1; 1895 int ret; 1896 1897 mutex_lock(&cgroup_mutex); 1898 spin_lock_irq(&css_set_lock); 1899 1900 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1901 1902 if (root) { 1903 cgrp = task_cgroup_from_root(task, root); 1904 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); 1905 } else { 1906 /* if no hierarchy exists, everyone is in "/" */ 1907 ret = strlcpy(buf, "/", buflen); 1908 } 1909 1910 spin_unlock_irq(&css_set_lock); 1911 mutex_unlock(&cgroup_mutex); 1912 return ret; 1913 } 1914 EXPORT_SYMBOL_GPL(task_cgroup_path); 1915 1916 /** 1917 * cgroup_migrate_add_task - add a migration target task to a migration context 1918 * @task: target task 1919 * @mgctx: target migration context 1920 * 1921 * Add @task, which is a migration target, to @mgctx->tset. This function 1922 * becomes noop if @task doesn't need to be migrated. @task's css_set 1923 * should have been added as a migration source and @task->cg_list will be 1924 * moved from the css_set's tasks list to mg_tasks one. 1925 */ 1926 static void cgroup_migrate_add_task(struct task_struct *task, 1927 struct cgroup_mgctx *mgctx) 1928 { 1929 struct css_set *cset; 1930 1931 lockdep_assert_held(&css_set_lock); 1932 1933 /* @task either already exited or can't exit until the end */ 1934 if (task->flags & PF_EXITING) 1935 return; 1936 1937 /* leave @task alone if post_fork() hasn't linked it yet */ 1938 if (list_empty(&task->cg_list)) 1939 return; 1940 1941 cset = task_css_set(task); 1942 if (!cset->mg_src_cgrp) 1943 return; 1944 1945 list_move_tail(&task->cg_list, &cset->mg_tasks); 1946 if (list_empty(&cset->mg_node)) 1947 list_add_tail(&cset->mg_node, 1948 &mgctx->tset.src_csets); 1949 if (list_empty(&cset->mg_dst_cset->mg_node)) 1950 list_add_tail(&cset->mg_dst_cset->mg_node, 1951 &mgctx->tset.dst_csets); 1952 } 1953 1954 /** 1955 * cgroup_taskset_first - reset taskset and return the first task 1956 * @tset: taskset of interest 1957 * @dst_cssp: output variable for the destination css 1958 * 1959 * @tset iteration is initialized and the first task is returned. 1960 */ 1961 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, 1962 struct cgroup_subsys_state **dst_cssp) 1963 { 1964 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); 1965 tset->cur_task = NULL; 1966 1967 return cgroup_taskset_next(tset, dst_cssp); 1968 } 1969 1970 /** 1971 * cgroup_taskset_next - iterate to the next task in taskset 1972 * @tset: taskset of interest 1973 * @dst_cssp: output variable for the destination css 1974 * 1975 * Return the next task in @tset. Iteration must have been initialized 1976 * with cgroup_taskset_first(). 1977 */ 1978 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, 1979 struct cgroup_subsys_state **dst_cssp) 1980 { 1981 struct css_set *cset = tset->cur_cset; 1982 struct task_struct *task = tset->cur_task; 1983 1984 while (&cset->mg_node != tset->csets) { 1985 if (!task) 1986 task = list_first_entry(&cset->mg_tasks, 1987 struct task_struct, cg_list); 1988 else 1989 task = list_next_entry(task, cg_list); 1990 1991 if (&task->cg_list != &cset->mg_tasks) { 1992 tset->cur_cset = cset; 1993 tset->cur_task = task; 1994 1995 /* 1996 * This function may be called both before and 1997 * after cgroup_taskset_migrate(). 
The two cases 1998 * can be distinguished by looking at whether @cset 1999 * has its ->mg_dst_cset set. 2000 */ 2001 if (cset->mg_dst_cset) 2002 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; 2003 else 2004 *dst_cssp = cset->subsys[tset->ssid]; 2005 2006 return task; 2007 } 2008 2009 cset = list_next_entry(cset, mg_node); 2010 task = NULL; 2011 } 2012 2013 return NULL; 2014 } 2015 2016 /** 2017 * cgroup_taskset_migrate - migrate a taskset 2018 * @mgctx: migration context 2019 * 2020 * Migrate tasks in @mgctx as setup by migration preparation functions. 2021 * This function fails iff one of the ->can_attach callbacks fails and 2022 * guarantees that either all or none of the tasks in @mgctx are migrated. 2023 * @mgctx is consumed regardless of success. 2024 */ 2025 static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) 2026 { 2027 struct cgroup_taskset *tset = &mgctx->tset; 2028 struct cgroup_subsys *ss; 2029 struct task_struct *task, *tmp_task; 2030 struct css_set *cset, *tmp_cset; 2031 int ssid, failed_ssid, ret; 2032 2033 /* methods shouldn't be called if no task is actually migrating */ 2034 if (list_empty(&tset->src_csets)) 2035 return 0; 2036 2037 /* check that we can legitimately attach to the cgroup */ 2038 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { 2039 if (ss->can_attach) { 2040 tset->ssid = ssid; 2041 ret = ss->can_attach(tset); 2042 if (ret) { 2043 failed_ssid = ssid; 2044 goto out_cancel_attach; 2045 } 2046 } 2047 } while_each_subsys_mask(); 2048 2049 /* 2050 * Now that we're guaranteed success, proceed to move all tasks to 2051 * the new cgroup. There are no failure cases after here, so this 2052 * is the commit point. 2053 */ 2054 spin_lock_irq(&css_set_lock); 2055 list_for_each_entry(cset, &tset->src_csets, mg_node) { 2056 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { 2057 struct css_set *from_cset = task_css_set(task); 2058 struct css_set *to_cset = cset->mg_dst_cset; 2059 2060 get_css_set(to_cset); 2061 css_set_move_task(task, from_cset, to_cset, true); 2062 put_css_set_locked(from_cset); 2063 } 2064 } 2065 spin_unlock_irq(&css_set_lock); 2066 2067 /* 2068 * Migration is committed, all target tasks are now on dst_csets. 2069 * Nothing is sensitive to fork() after this point. Notify 2070 * controllers that migration is complete. 2071 */ 2072 tset->csets = &tset->dst_csets; 2073 2074 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { 2075 if (ss->attach) { 2076 tset->ssid = ssid; 2077 ss->attach(tset); 2078 } 2079 } while_each_subsys_mask(); 2080 2081 ret = 0; 2082 goto out_release_tset; 2083 2084 out_cancel_attach: 2085 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { 2086 if (ssid == failed_ssid) 2087 break; 2088 if (ss->cancel_attach) { 2089 tset->ssid = ssid; 2090 ss->cancel_attach(tset); 2091 } 2092 } while_each_subsys_mask(); 2093 out_release_tset: 2094 spin_lock_irq(&css_set_lock); 2095 list_splice_init(&tset->dst_csets, &tset->src_csets); 2096 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { 2097 list_splice_tail_init(&cset->mg_tasks, &cset->tasks); 2098 list_del_init(&cset->mg_node); 2099 } 2100 spin_unlock_irq(&css_set_lock); 2101 return ret; 2102 } 2103 2104 /** 2105 * cgroup_may_migrate_to - verify whether a cgroup can be migration destination 2106 * @dst_cgrp: destination cgroup to test 2107 * 2108 * On the default hierarchy, except for the root, subtree_control must be 2109 * zero for migration destination cgroups with tasks so that child cgroups 2110 * don't compete against tasks. 
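 *
 * For example, if a non-root cgroup on the default hierarchy has any
 * controller enabled in its cgroup.subtree_control, cgroup_attach_task()
 * below refuses to migrate tasks into it and returns -EBUSY.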
2111 */ 2112 bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) 2113 { 2114 return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || 2115 !dst_cgrp->subtree_control; 2116 } 2117 2118 /** 2119 * cgroup_migrate_finish - cleanup after attach 2120 * @mgctx: migration context 2121 * 2122 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See 2123 * those functions for details. 2124 */ 2125 void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) 2126 { 2127 LIST_HEAD(preloaded); 2128 struct css_set *cset, *tmp_cset; 2129 2130 lockdep_assert_held(&cgroup_mutex); 2131 2132 spin_lock_irq(&css_set_lock); 2133 2134 list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); 2135 list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); 2136 2137 list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { 2138 cset->mg_src_cgrp = NULL; 2139 cset->mg_dst_cgrp = NULL; 2140 cset->mg_dst_cset = NULL; 2141 list_del_init(&cset->mg_preload_node); 2142 put_css_set_locked(cset); 2143 } 2144 2145 spin_unlock_irq(&css_set_lock); 2146 } 2147 2148 /** 2149 * cgroup_migrate_add_src - add a migration source css_set 2150 * @src_cset: the source css_set to add 2151 * @dst_cgrp: the destination cgroup 2152 * @mgctx: migration context 2153 * 2154 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin 2155 * @src_cset and add it to @mgctx->src_csets, which should later be cleaned 2156 * up by cgroup_migrate_finish(). 2157 * 2158 * This function may be called without holding cgroup_threadgroup_rwsem 2159 * even if the target is a process. Threads may be created and destroyed 2160 * but as long as cgroup_mutex is not dropped, no new css_set can be put 2161 * into play and the preloaded css_sets are guaranteed to cover all 2162 * migrations. 2163 */ 2164 void cgroup_migrate_add_src(struct css_set *src_cset, 2165 struct cgroup *dst_cgrp, 2166 struct cgroup_mgctx *mgctx) 2167 { 2168 struct cgroup *src_cgrp; 2169 2170 lockdep_assert_held(&cgroup_mutex); 2171 lockdep_assert_held(&css_set_lock); 2172 2173 /* 2174 * If ->dead, @src_set is associated with one or more dead cgroups 2175 * and doesn't contain any migratable tasks. Ignore it early so 2176 * that the rest of migration path doesn't get confused by it. 2177 */ 2178 if (src_cset->dead) 2179 return; 2180 2181 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2182 2183 if (!list_empty(&src_cset->mg_preload_node)) 2184 return; 2185 2186 WARN_ON(src_cset->mg_src_cgrp); 2187 WARN_ON(src_cset->mg_dst_cgrp); 2188 WARN_ON(!list_empty(&src_cset->mg_tasks)); 2189 WARN_ON(!list_empty(&src_cset->mg_node)); 2190 2191 src_cset->mg_src_cgrp = src_cgrp; 2192 src_cset->mg_dst_cgrp = dst_cgrp; 2193 get_css_set(src_cset); 2194 list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); 2195 } 2196 2197 /** 2198 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 2199 * @mgctx: migration context 2200 * 2201 * Tasks are about to be moved and all the source css_sets have been 2202 * preloaded to @mgctx->preloaded_src_csets. This function looks up and 2203 * pins all destination css_sets, links each to its source, and append them 2204 * to @mgctx->preloaded_dst_csets. 2205 * 2206 * This function must be called after cgroup_migrate_add_src() has been 2207 * called on each migration source css_set. After migration is performed 2208 * using cgroup_migrate(), cgroup_migrate_finish() must be called on 2209 * @mgctx. 
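 *
 * A typical migration sequence, sketched after cgroup_attach_task()
 * below, looks like:
 *
 *	DEFINE_CGROUP_MGCTX(mgctx);
 *
 *	spin_lock_irq(&css_set_lock);
 *	cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
 *	spin_unlock_irq(&css_set_lock);
 *
 *	ret = cgroup_migrate_prepare_dst(&mgctx);
 *	if (!ret)
 *		ret = cgroup_migrate(task, false, &mgctx);
 *	cgroup_migrate_finish(&mgctx);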
2210 */ 2211 int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) 2212 { 2213 struct css_set *src_cset, *tmp_cset; 2214 2215 lockdep_assert_held(&cgroup_mutex); 2216 2217 /* look up the dst cset for each src cset and link it to src */ 2218 list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, 2219 mg_preload_node) { 2220 struct css_set *dst_cset; 2221 struct cgroup_subsys *ss; 2222 int ssid; 2223 2224 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); 2225 if (!dst_cset) 2226 goto err; 2227 2228 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 2229 2230 /* 2231 * If src cset equals dst, it's noop. Drop the src. 2232 * cgroup_migrate() will skip the cset too. Note that we 2233 * can't handle src == dst as some nodes are used by both. 2234 */ 2235 if (src_cset == dst_cset) { 2236 src_cset->mg_src_cgrp = NULL; 2237 src_cset->mg_dst_cgrp = NULL; 2238 list_del_init(&src_cset->mg_preload_node); 2239 put_css_set(src_cset); 2240 put_css_set(dst_cset); 2241 continue; 2242 } 2243 2244 src_cset->mg_dst_cset = dst_cset; 2245 2246 if (list_empty(&dst_cset->mg_preload_node)) 2247 list_add_tail(&dst_cset->mg_preload_node, 2248 &mgctx->preloaded_dst_csets); 2249 else 2250 put_css_set(dst_cset); 2251 2252 for_each_subsys(ss, ssid) 2253 if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) 2254 mgctx->ss_mask |= 1 << ssid; 2255 } 2256 2257 return 0; 2258 err: 2259 cgroup_migrate_finish(mgctx); 2260 return -ENOMEM; 2261 } 2262 2263 /** 2264 * cgroup_migrate - migrate a process or task to a cgroup 2265 * @leader: the leader of the process or the task to migrate 2266 * @threadgroup: whether @leader points to the whole process or a single task 2267 * @mgctx: migration context 2268 * 2269 * Migrate a process or task denoted by @leader. If migrating a process, 2270 * the caller must be holding cgroup_threadgroup_rwsem. The caller is also 2271 * responsible for invoking cgroup_migrate_add_src() and 2272 * cgroup_migrate_prepare_dst() on the targets before invoking this 2273 * function and following up with cgroup_migrate_finish(). 2274 * 2275 * As long as a controller's ->can_attach() doesn't fail, this function is 2276 * guaranteed to succeed. This means that, excluding ->can_attach() 2277 * failure, when migrating multiple targets, the success or failure can be 2278 * decided for all targets by invoking group_migrate_prepare_dst() before 2279 * actually starting migrating. 2280 */ 2281 int cgroup_migrate(struct task_struct *leader, bool threadgroup, 2282 struct cgroup_mgctx *mgctx) 2283 { 2284 struct task_struct *task; 2285 2286 /* 2287 * Prevent freeing of tasks while we take a snapshot. Tasks that are 2288 * already PF_EXITING could be freed from underneath us unless we 2289 * take an rcu_read_lock. 2290 */ 2291 spin_lock_irq(&css_set_lock); 2292 rcu_read_lock(); 2293 task = leader; 2294 do { 2295 cgroup_migrate_add_task(task, mgctx); 2296 if (!threadgroup) 2297 break; 2298 } while_each_thread(leader, task); 2299 rcu_read_unlock(); 2300 spin_unlock_irq(&css_set_lock); 2301 2302 return cgroup_migrate_execute(mgctx); 2303 } 2304 2305 /** 2306 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 2307 * @dst_cgrp: the cgroup to attach to 2308 * @leader: the task or the leader of the threadgroup to be attached 2309 * @threadgroup: attach the whole threadgroup? 2310 * 2311 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. 
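 *
 * Callers typically follow the pattern of __cgroup_procs_write() below,
 * sketched here; cgroup_kn_lock_live() is what ends up holding
 * cgroup_mutex:
 *
 *	cgrp = cgroup_kn_lock_live(of->kn, false);
 *	percpu_down_write(&cgroup_threadgroup_rwsem);
 *	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 *	percpu_up_write(&cgroup_threadgroup_rwsem);
 *	cgroup_kn_unlock(of->kn);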
2312 */ 2313 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, 2314 bool threadgroup) 2315 { 2316 DEFINE_CGROUP_MGCTX(mgctx); 2317 struct task_struct *task; 2318 int ret; 2319 2320 if (!cgroup_may_migrate_to(dst_cgrp)) 2321 return -EBUSY; 2322 2323 /* look up all src csets */ 2324 spin_lock_irq(&css_set_lock); 2325 rcu_read_lock(); 2326 task = leader; 2327 do { 2328 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); 2329 if (!threadgroup) 2330 break; 2331 } while_each_thread(leader, task); 2332 rcu_read_unlock(); 2333 spin_unlock_irq(&css_set_lock); 2334 2335 /* prepare dst csets and commit */ 2336 ret = cgroup_migrate_prepare_dst(&mgctx); 2337 if (!ret) 2338 ret = cgroup_migrate(leader, threadgroup, &mgctx); 2339 2340 cgroup_migrate_finish(&mgctx); 2341 2342 if (!ret) 2343 trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); 2344 2345 return ret; 2346 } 2347 2348 static int cgroup_procs_write_permission(struct task_struct *task, 2349 struct cgroup *dst_cgrp, 2350 struct kernfs_open_file *of) 2351 { 2352 int ret = 0; 2353 2354 if (cgroup_on_dfl(dst_cgrp)) { 2355 struct super_block *sb = of->file->f_path.dentry->d_sb; 2356 struct cgroup *cgrp; 2357 struct inode *inode; 2358 2359 spin_lock_irq(&css_set_lock); 2360 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); 2361 spin_unlock_irq(&css_set_lock); 2362 2363 while (!cgroup_is_descendant(dst_cgrp, cgrp)) 2364 cgrp = cgroup_parent(cgrp); 2365 2366 ret = -ENOMEM; 2367 inode = kernfs_get_inode(sb, cgrp->procs_file.kn); 2368 if (inode) { 2369 ret = inode_permission(inode, MAY_WRITE); 2370 iput(inode); 2371 } 2372 } else { 2373 const struct cred *cred = current_cred(); 2374 const struct cred *tcred = get_task_cred(task); 2375 2376 /* 2377 * even if we're attaching all tasks in the thread group, 2378 * we only need to check permissions on one of them. 2379 */ 2380 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2381 !uid_eq(cred->euid, tcred->uid) && 2382 !uid_eq(cred->euid, tcred->suid)) 2383 ret = -EACCES; 2384 put_cred(tcred); 2385 } 2386 2387 return ret; 2388 } 2389 2390 /* 2391 * Find the task_struct of the task to attach by vpid and pass it along to the 2392 * function to attach either it or all tasks in its threadgroup. Will lock 2393 * cgroup_mutex and threadgroup. 2394 */ 2395 ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, 2396 size_t nbytes, loff_t off, bool threadgroup) 2397 { 2398 struct task_struct *tsk; 2399 struct cgroup_subsys *ss; 2400 struct cgroup *cgrp; 2401 pid_t pid; 2402 int ssid, ret; 2403 2404 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) 2405 return -EINVAL; 2406 2407 cgrp = cgroup_kn_lock_live(of->kn, false); 2408 if (!cgrp) 2409 return -ENODEV; 2410 2411 percpu_down_write(&cgroup_threadgroup_rwsem); 2412 rcu_read_lock(); 2413 if (pid) { 2414 tsk = find_task_by_vpid(pid); 2415 if (!tsk) { 2416 ret = -ESRCH; 2417 goto out_unlock_rcu; 2418 } 2419 } else { 2420 tsk = current; 2421 } 2422 2423 if (threadgroup) 2424 tsk = tsk->group_leader; 2425 2426 /* 2427 * Workqueue threads may acquire PF_NO_SETAFFINITY and become 2428 * trapped in a cpuset, or RT worker may be born in a cgroup 2429 * with no rt_runtime allocated. Just say no. 
2430 */ 2431 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { 2432 ret = -EINVAL; 2433 goto out_unlock_rcu; 2434 } 2435 2436 get_task_struct(tsk); 2437 rcu_read_unlock(); 2438 2439 ret = cgroup_procs_write_permission(tsk, cgrp, of); 2440 if (!ret) 2441 ret = cgroup_attach_task(cgrp, tsk, threadgroup); 2442 2443 put_task_struct(tsk); 2444 goto out_unlock_threadgroup; 2445 2446 out_unlock_rcu: 2447 rcu_read_unlock(); 2448 out_unlock_threadgroup: 2449 percpu_up_write(&cgroup_threadgroup_rwsem); 2450 for_each_subsys(ss, ssid) 2451 if (ss->post_attach) 2452 ss->post_attach(); 2453 cgroup_kn_unlock(of->kn); 2454 return ret ?: nbytes; 2455 } 2456 2457 ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, 2458 loff_t off) 2459 { 2460 return __cgroup_procs_write(of, buf, nbytes, off, true); 2461 } 2462 2463 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) 2464 { 2465 struct cgroup_subsys *ss; 2466 bool printed = false; 2467 int ssid; 2468 2469 do_each_subsys_mask(ss, ssid, ss_mask) { 2470 if (printed) 2471 seq_putc(seq, ' '); 2472 seq_printf(seq, "%s", ss->name); 2473 printed = true; 2474 } while_each_subsys_mask(); 2475 if (printed) 2476 seq_putc(seq, '\n'); 2477 } 2478 2479 /* show controllers which are enabled from the parent */ 2480 static int cgroup_controllers_show(struct seq_file *seq, void *v) 2481 { 2482 struct cgroup *cgrp = seq_css(seq)->cgroup; 2483 2484 cgroup_print_ss_mask(seq, cgroup_control(cgrp)); 2485 return 0; 2486 } 2487 2488 /* show controllers which are enabled for a given cgroup's children */ 2489 static int cgroup_subtree_control_show(struct seq_file *seq, void *v) 2490 { 2491 struct cgroup *cgrp = seq_css(seq)->cgroup; 2492 2493 cgroup_print_ss_mask(seq, cgrp->subtree_control); 2494 return 0; 2495 } 2496 2497 /** 2498 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy 2499 * @cgrp: root of the subtree to update csses for 2500 * 2501 * @cgrp's control masks have changed and its subtree's css associations 2502 * need to be updated accordingly. This function looks up all css_sets 2503 * which are attached to the subtree, creates the matching updated css_sets 2504 * and migrates the tasks to the new ones. 
2505 */ 2506 static int cgroup_update_dfl_csses(struct cgroup *cgrp) 2507 { 2508 DEFINE_CGROUP_MGCTX(mgctx); 2509 struct cgroup_subsys_state *d_css; 2510 struct cgroup *dsct; 2511 struct css_set *src_cset; 2512 int ret; 2513 2514 lockdep_assert_held(&cgroup_mutex); 2515 2516 percpu_down_write(&cgroup_threadgroup_rwsem); 2517 2518 /* look up all csses currently attached to @cgrp's subtree */ 2519 spin_lock_irq(&css_set_lock); 2520 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { 2521 struct cgrp_cset_link *link; 2522 2523 list_for_each_entry(link, &dsct->cset_links, cset_link) 2524 cgroup_migrate_add_src(link->cset, dsct, &mgctx); 2525 } 2526 spin_unlock_irq(&css_set_lock); 2527 2528 /* NULL dst indicates self on default hierarchy */ 2529 ret = cgroup_migrate_prepare_dst(&mgctx); 2530 if (ret) 2531 goto out_finish; 2532 2533 spin_lock_irq(&css_set_lock); 2534 list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { 2535 struct task_struct *task, *ntask; 2536 2537 /* all tasks in src_csets need to be migrated */ 2538 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) 2539 cgroup_migrate_add_task(task, &mgctx); 2540 } 2541 spin_unlock_irq(&css_set_lock); 2542 2543 ret = cgroup_migrate_execute(&mgctx); 2544 out_finish: 2545 cgroup_migrate_finish(&mgctx); 2546 percpu_up_write(&cgroup_threadgroup_rwsem); 2547 return ret; 2548 } 2549 2550 /** 2551 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses 2552 * @cgrp: root of the target subtree 2553 * 2554 * Because css offlining is asynchronous, userland may try to re-enable a 2555 * controller while the previous css is still around. This function grabs 2556 * cgroup_mutex and drains the previous css instances of @cgrp's subtree. 2557 */ 2558 void cgroup_lock_and_drain_offline(struct cgroup *cgrp) 2559 __acquires(&cgroup_mutex) 2560 { 2561 struct cgroup *dsct; 2562 struct cgroup_subsys_state *d_css; 2563 struct cgroup_subsys *ss; 2564 int ssid; 2565 2566 restart: 2567 mutex_lock(&cgroup_mutex); 2568 2569 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { 2570 for_each_subsys(ss, ssid) { 2571 struct cgroup_subsys_state *css = cgroup_css(dsct, ss); 2572 DEFINE_WAIT(wait); 2573 2574 if (!css || !percpu_ref_is_dying(&css->refcnt)) 2575 continue; 2576 2577 cgroup_get(dsct); 2578 prepare_to_wait(&dsct->offline_waitq, &wait, 2579 TASK_UNINTERRUPTIBLE); 2580 2581 mutex_unlock(&cgroup_mutex); 2582 schedule(); 2583 finish_wait(&dsct->offline_waitq, &wait); 2584 2585 cgroup_put(dsct); 2586 goto restart; 2587 } 2588 } 2589 } 2590 2591 /** 2592 * cgroup_save_control - save control masks of a subtree 2593 * @cgrp: root of the target subtree 2594 * 2595 * Save ->subtree_control and ->subtree_ss_mask to the respective old_ 2596 * prefixed fields for @cgrp's subtree including @cgrp itself. 2597 */ 2598 static void cgroup_save_control(struct cgroup *cgrp) 2599 { 2600 struct cgroup *dsct; 2601 struct cgroup_subsys_state *d_css; 2602 2603 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { 2604 dsct->old_subtree_control = dsct->subtree_control; 2605 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; 2606 } 2607 } 2608 2609 /** 2610 * cgroup_propagate_control - refresh control masks of a subtree 2611 * @cgrp: root of the target subtree 2612 * 2613 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches 2614 * ->subtree_control and propagate controller availability through the 2615 * subtree so that descendants don't have unavailable controllers enabled. 
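 *
 * For example, if an ancestor stops offering the memory controller so
 * that it disappears from cgroup_control(dsct), the corresponding bit is
 * cleared from dsct->subtree_control here and ->subtree_ss_mask is
 * recomputed via cgroup_calc_subtree_ss_mask().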
2616 */ 2617 static void cgroup_propagate_control(struct cgroup *cgrp) 2618 { 2619 struct cgroup *dsct; 2620 struct cgroup_subsys_state *d_css; 2621 2622 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { 2623 dsct->subtree_control &= cgroup_control(dsct); 2624 dsct->subtree_ss_mask = 2625 cgroup_calc_subtree_ss_mask(dsct->subtree_control, 2626 cgroup_ss_mask(dsct)); 2627 } 2628 } 2629 2630 /** 2631 * cgroup_restore_control - restore control masks of a subtree 2632 * @cgrp: root of the target subtree 2633 * 2634 * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ 2635 * prefixed fields for @cgrp's subtree including @cgrp itself. 2636 */ 2637 static void cgroup_restore_control(struct cgroup *cgrp) 2638 { 2639 struct cgroup *dsct; 2640 struct cgroup_subsys_state *d_css; 2641 2642 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { 2643 dsct->subtree_control = dsct->old_subtree_control; 2644 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; 2645 } 2646 } 2647 2648 static bool css_visible(struct cgroup_subsys_state *css) 2649 { 2650 struct cgroup_subsys *ss = css->ss; 2651 struct cgroup *cgrp = css->cgroup; 2652 2653 if (cgroup_control(cgrp) & (1 << ss->id)) 2654 return true; 2655 if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) 2656 return false; 2657 return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; 2658 } 2659 2660 /** 2661 * cgroup_apply_control_enable - enable or show csses according to control 2662 * @cgrp: root of the target subtree 2663 * 2664 * Walk @cgrp's subtree and create new csses or make the existing ones 2665 * visible. A css is created invisible if it's being implicitly enabled 2666 * through dependency. An invisible css is made visible when the userland 2667 * explicitly enables it. 2668 * 2669 * Returns 0 on success, -errno on failure. On failure, csses which have 2670 * been processed already aren't cleaned up. The caller is responsible for 2671 * cleaning up with cgroup_apply_control_disble(). 2672 */ 2673 static int cgroup_apply_control_enable(struct cgroup *cgrp) 2674 { 2675 struct cgroup *dsct; 2676 struct cgroup_subsys_state *d_css; 2677 struct cgroup_subsys *ss; 2678 int ssid, ret; 2679 2680 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { 2681 for_each_subsys(ss, ssid) { 2682 struct cgroup_subsys_state *css = cgroup_css(dsct, ss); 2683 2684 WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); 2685 2686 if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) 2687 continue; 2688 2689 if (!css) { 2690 css = css_create(dsct, ss); 2691 if (IS_ERR(css)) 2692 return PTR_ERR(css); 2693 } 2694 2695 if (css_visible(css)) { 2696 ret = css_populate_dir(css); 2697 if (ret) 2698 return ret; 2699 } 2700 } 2701 } 2702 2703 return 0; 2704 } 2705 2706 /** 2707 * cgroup_apply_control_disable - kill or hide csses according to control 2708 * @cgrp: root of the target subtree 2709 * 2710 * Walk @cgrp's subtree and kill and hide csses so that they match 2711 * cgroup_ss_mask() and cgroup_visible_mask(). 2712 * 2713 * A css is hidden when the userland requests it to be disabled while other 2714 * subsystems are still depending on it. The css must not actively control 2715 * resources and be in the vanilla state if it's made visible again later. 2716 * Controllers which may be depended upon should provide ->css_reset() for 2717 * this purpose. 
2718 */ 2719 static void cgroup_apply_control_disable(struct cgroup *cgrp) 2720 { 2721 struct cgroup *dsct; 2722 struct cgroup_subsys_state *d_css; 2723 struct cgroup_subsys *ss; 2724 int ssid; 2725 2726 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { 2727 for_each_subsys(ss, ssid) { 2728 struct cgroup_subsys_state *css = cgroup_css(dsct, ss); 2729 2730 WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); 2731 2732 if (!css) 2733 continue; 2734 2735 if (css->parent && 2736 !(cgroup_ss_mask(dsct) & (1 << ss->id))) { 2737 kill_css(css); 2738 } else if (!css_visible(css)) { 2739 css_clear_dir(css); 2740 if (ss->css_reset) 2741 ss->css_reset(css); 2742 } 2743 } 2744 } 2745 } 2746 2747 /** 2748 * cgroup_apply_control - apply control mask updates to the subtree 2749 * @cgrp: root of the target subtree 2750 * 2751 * subsystems can be enabled and disabled in a subtree using the following 2752 * steps. 2753 * 2754 * 1. Call cgroup_save_control() to stash the current state. 2755 * 2. Update ->subtree_control masks in the subtree as desired. 2756 * 3. Call cgroup_apply_control() to apply the changes. 2757 * 4. Optionally perform other related operations. 2758 * 5. Call cgroup_finalize_control() to finish up. 2759 * 2760 * This function implements step 3 and propagates the mask changes 2761 * throughout @cgrp's subtree, updates csses accordingly and perform 2762 * process migrations. 2763 */ 2764 static int cgroup_apply_control(struct cgroup *cgrp) 2765 { 2766 int ret; 2767 2768 cgroup_propagate_control(cgrp); 2769 2770 ret = cgroup_apply_control_enable(cgrp); 2771 if (ret) 2772 return ret; 2773 2774 /* 2775 * At this point, cgroup_e_css() results reflect the new csses 2776 * making the following cgroup_update_dfl_csses() properly update 2777 * css associations of all tasks in the subtree. 2778 */ 2779 ret = cgroup_update_dfl_csses(cgrp); 2780 if (ret) 2781 return ret; 2782 2783 return 0; 2784 } 2785 2786 /** 2787 * cgroup_finalize_control - finalize control mask update 2788 * @cgrp: root of the target subtree 2789 * @ret: the result of the update 2790 * 2791 * Finalize control mask update. See cgroup_apply_control() for more info. 2792 */ 2793 static void cgroup_finalize_control(struct cgroup *cgrp, int ret) 2794 { 2795 if (ret) { 2796 cgroup_restore_control(cgrp); 2797 cgroup_propagate_control(cgrp); 2798 } 2799 2800 cgroup_apply_control_disable(cgrp); 2801 } 2802 2803 /* change the enabled child controllers for a cgroup in the default hierarchy */ 2804 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, 2805 char *buf, size_t nbytes, 2806 loff_t off) 2807 { 2808 u16 enable = 0, disable = 0; 2809 struct cgroup *cgrp, *child; 2810 struct cgroup_subsys *ss; 2811 char *tok; 2812 int ssid, ret; 2813 2814 /* 2815 * Parse input - space separated list of subsystem names prefixed 2816 * with either + or -. 
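 *
 * For example, writing "+memory -pids" to cgroup.subtree_control enables
 * the memory controller and disables the pids controller for this
 * cgroup's children.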
2817 */ 2818 buf = strstrip(buf); 2819 while ((tok = strsep(&buf, " "))) { 2820 if (tok[0] == '\0') 2821 continue; 2822 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { 2823 if (!cgroup_ssid_enabled(ssid) || 2824 strcmp(tok + 1, ss->name)) 2825 continue; 2826 2827 if (*tok == '+') { 2828 enable |= 1 << ssid; 2829 disable &= ~(1 << ssid); 2830 } else if (*tok == '-') { 2831 disable |= 1 << ssid; 2832 enable &= ~(1 << ssid); 2833 } else { 2834 return -EINVAL; 2835 } 2836 break; 2837 } while_each_subsys_mask(); 2838 if (ssid == CGROUP_SUBSYS_COUNT) 2839 return -EINVAL; 2840 } 2841 2842 cgrp = cgroup_kn_lock_live(of->kn, true); 2843 if (!cgrp) 2844 return -ENODEV; 2845 2846 for_each_subsys(ss, ssid) { 2847 if (enable & (1 << ssid)) { 2848 if (cgrp->subtree_control & (1 << ssid)) { 2849 enable &= ~(1 << ssid); 2850 continue; 2851 } 2852 2853 if (!(cgroup_control(cgrp) & (1 << ssid))) { 2854 ret = -ENOENT; 2855 goto out_unlock; 2856 } 2857 } else if (disable & (1 << ssid)) { 2858 if (!(cgrp->subtree_control & (1 << ssid))) { 2859 disable &= ~(1 << ssid); 2860 continue; 2861 } 2862 2863 /* a child has it enabled? */ 2864 cgroup_for_each_live_child(child, cgrp) { 2865 if (child->subtree_control & (1 << ssid)) { 2866 ret = -EBUSY; 2867 goto out_unlock; 2868 } 2869 } 2870 } 2871 } 2872 2873 if (!enable && !disable) { 2874 ret = 0; 2875 goto out_unlock; 2876 } 2877 2878 /* 2879 * Except for the root, subtree_control must be zero for a cgroup 2880 * with tasks so that child cgroups don't compete against tasks. 2881 */ 2882 if (enable && cgroup_parent(cgrp)) { 2883 struct cgrp_cset_link *link; 2884 2885 /* 2886 * Because namespaces pin csets too, @cgrp->cset_links 2887 * might not be empty even when @cgrp is empty. Walk and 2888 * verify each cset. 2889 */ 2890 spin_lock_irq(&css_set_lock); 2891 2892 ret = 0; 2893 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 2894 if (css_set_populated(link->cset)) { 2895 ret = -EBUSY; 2896 break; 2897 } 2898 } 2899 2900 spin_unlock_irq(&css_set_lock); 2901 2902 if (ret) 2903 goto out_unlock; 2904 } 2905 2906 /* save and update control masks and prepare csses */ 2907 cgroup_save_control(cgrp); 2908 2909 cgrp->subtree_control |= enable; 2910 cgrp->subtree_control &= ~disable; 2911 2912 ret = cgroup_apply_control(cgrp); 2913 2914 cgroup_finalize_control(cgrp, ret); 2915 2916 kernfs_activate(cgrp->kn); 2917 ret = 0; 2918 out_unlock: 2919 cgroup_kn_unlock(of->kn); 2920 return ret ?: nbytes; 2921 } 2922 2923 static int cgroup_events_show(struct seq_file *seq, void *v) 2924 { 2925 seq_printf(seq, "populated %d\n", 2926 cgroup_is_populated(seq_css(seq)->cgroup)); 2927 return 0; 2928 } 2929 2930 static int cgroup_file_open(struct kernfs_open_file *of) 2931 { 2932 struct cftype *cft = of->kn->priv; 2933 2934 if (cft->open) 2935 return cft->open(of); 2936 return 0; 2937 } 2938 2939 static void cgroup_file_release(struct kernfs_open_file *of) 2940 { 2941 struct cftype *cft = of->kn->priv; 2942 2943 if (cft->release) 2944 cft->release(of); 2945 } 2946 2947 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2948 size_t nbytes, loff_t off) 2949 { 2950 struct cgroup *cgrp = of->kn->parent->priv; 2951 struct cftype *cft = of->kn->priv; 2952 struct cgroup_subsys_state *css; 2953 int ret; 2954 2955 if (cft->write) 2956 return cft->write(of, buf, nbytes, off); 2957 2958 /* 2959 * kernfs guarantees that a file isn't deleted with operations in 2960 * flight, which means that the matching css is and stays alive and 2961 * doesn't need to be 
pinned. The RCU locking is not necessary 2962 * either. It's just for the convenience of using cgroup_css(). 2963 */ 2964 rcu_read_lock(); 2965 css = cgroup_css(cgrp, cft->ss); 2966 rcu_read_unlock(); 2967 2968 if (cft->write_u64) { 2969 unsigned long long v; 2970 ret = kstrtoull(buf, 0, &v); 2971 if (!ret) 2972 ret = cft->write_u64(css, cft, v); 2973 } else if (cft->write_s64) { 2974 long long v; 2975 ret = kstrtoll(buf, 0, &v); 2976 if (!ret) 2977 ret = cft->write_s64(css, cft, v); 2978 } else { 2979 ret = -EINVAL; 2980 } 2981 2982 return ret ?: nbytes; 2983 } 2984 2985 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2986 { 2987 return seq_cft(seq)->seq_start(seq, ppos); 2988 } 2989 2990 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2991 { 2992 return seq_cft(seq)->seq_next(seq, v, ppos); 2993 } 2994 2995 static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2996 { 2997 if (seq_cft(seq)->seq_stop) 2998 seq_cft(seq)->seq_stop(seq, v); 2999 } 3000 3001 static int cgroup_seqfile_show(struct seq_file *m, void *arg) 3002 { 3003 struct cftype *cft = seq_cft(m); 3004 struct cgroup_subsys_state *css = seq_css(m); 3005 3006 if (cft->seq_show) 3007 return cft->seq_show(m, arg); 3008 3009 if (cft->read_u64) 3010 seq_printf(m, "%llu\n", cft->read_u64(css, cft)); 3011 else if (cft->read_s64) 3012 seq_printf(m, "%lld\n", cft->read_s64(css, cft)); 3013 else 3014 return -EINVAL; 3015 return 0; 3016 } 3017 3018 static struct kernfs_ops cgroup_kf_single_ops = { 3019 .atomic_write_len = PAGE_SIZE, 3020 .open = cgroup_file_open, 3021 .release = cgroup_file_release, 3022 .write = cgroup_file_write, 3023 .seq_show = cgroup_seqfile_show, 3024 }; 3025 3026 static struct kernfs_ops cgroup_kf_ops = { 3027 .atomic_write_len = PAGE_SIZE, 3028 .open = cgroup_file_open, 3029 .release = cgroup_file_release, 3030 .write = cgroup_file_write, 3031 .seq_start = cgroup_seqfile_start, 3032 .seq_next = cgroup_seqfile_next, 3033 .seq_stop = cgroup_seqfile_stop, 3034 .seq_show = cgroup_seqfile_show, 3035 }; 3036 3037 /* set uid and gid of cgroup dirs and files to that of the creator */ 3038 static int cgroup_kn_set_ugid(struct kernfs_node *kn) 3039 { 3040 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, 3041 .ia_uid = current_fsuid(), 3042 .ia_gid = current_fsgid(), }; 3043 3044 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && 3045 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) 3046 return 0; 3047 3048 return kernfs_setattr(kn, &iattr); 3049 } 3050 3051 static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, 3052 struct cftype *cft) 3053 { 3054 char name[CGROUP_FILE_NAME_MAX]; 3055 struct kernfs_node *kn; 3056 struct lock_class_key *key = NULL; 3057 int ret; 3058 3059 #ifdef CONFIG_DEBUG_LOCK_ALLOC 3060 key = &cft->lockdep_key; 3061 #endif 3062 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), 3063 cgroup_file_mode(cft), 0, cft->kf_ops, cft, 3064 NULL, key); 3065 if (IS_ERR(kn)) 3066 return PTR_ERR(kn); 3067 3068 ret = cgroup_kn_set_ugid(kn); 3069 if (ret) { 3070 kernfs_remove(kn); 3071 return ret; 3072 } 3073 3074 if (cft->file_offset) { 3075 struct cgroup_file *cfile = (void *)css + cft->file_offset; 3076 3077 spin_lock_irq(&cgroup_file_kn_lock); 3078 cfile->kn = kn; 3079 spin_unlock_irq(&cgroup_file_kn_lock); 3080 } 3081 3082 return 0; 3083 } 3084 3085 /** 3086 * cgroup_addrm_files - add or remove files to a cgroup directory 3087 * @css: the target css 3088 * @cgrp: the target cgroup (usually css->cgroup) 3089 * @cfts: 
array of cftypes to be added 3090 * @is_add: whether to add or remove 3091 * 3092 * Depending on @is_add, add or remove files defined by @cfts on @cgrp. 3093 * For removals, this function never fails. 3094 */ 3095 static int cgroup_addrm_files(struct cgroup_subsys_state *css, 3096 struct cgroup *cgrp, struct cftype cfts[], 3097 bool is_add) 3098 { 3099 struct cftype *cft, *cft_end = NULL; 3100 int ret = 0; 3101 3102 lockdep_assert_held(&cgroup_mutex); 3103 3104 restart: 3105 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { 3106 /* does cft->flags tell us to skip this file on @cgrp? */ 3107 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 3108 continue; 3109 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) 3110 continue; 3111 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) 3112 continue; 3113 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) 3114 continue; 3115 3116 if (is_add) { 3117 ret = cgroup_add_file(css, cgrp, cft); 3118 if (ret) { 3119 pr_warn("%s: failed to add %s, err=%d\n", 3120 __func__, cft->name, ret); 3121 cft_end = cft; 3122 is_add = false; 3123 goto restart; 3124 } 3125 } else { 3126 cgroup_rm_file(cgrp, cft); 3127 } 3128 } 3129 return ret; 3130 } 3131 3132 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) 3133 { 3134 LIST_HEAD(pending); 3135 struct cgroup_subsys *ss = cfts[0].ss; 3136 struct cgroup *root = &ss->root->cgrp; 3137 struct cgroup_subsys_state *css; 3138 int ret = 0; 3139 3140 lockdep_assert_held(&cgroup_mutex); 3141 3142 /* add/rm files for all cgroups created before */ 3143 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 3144 struct cgroup *cgrp = css->cgroup; 3145 3146 if (!(css->flags & CSS_VISIBLE)) 3147 continue; 3148 3149 ret = cgroup_addrm_files(css, cgrp, cfts, is_add); 3150 if (ret) 3151 break; 3152 } 3153 3154 if (is_add && !ret) 3155 kernfs_activate(root->kn); 3156 return ret; 3157 } 3158 3159 static void cgroup_exit_cftypes(struct cftype *cfts) 3160 { 3161 struct cftype *cft; 3162 3163 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3164 /* free copy for custom atomic_write_len, see init_cftypes() */ 3165 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) 3166 kfree(cft->kf_ops); 3167 cft->kf_ops = NULL; 3168 cft->ss = NULL; 3169 3170 /* revert flags set by cgroup core while adding @cfts */ 3171 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); 3172 } 3173 } 3174 3175 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3176 { 3177 struct cftype *cft; 3178 3179 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3180 struct kernfs_ops *kf_ops; 3181 3182 WARN_ON(cft->ss || cft->kf_ops); 3183 3184 if (cft->seq_start) 3185 kf_ops = &cgroup_kf_ops; 3186 else 3187 kf_ops = &cgroup_kf_single_ops; 3188 3189 /* 3190 * Ugh... if @cft wants a custom max_write_len, we need to 3191 * make a copy of kf_ops to set its atomic_write_len. 
3192 */ 3193 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { 3194 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); 3195 if (!kf_ops) { 3196 cgroup_exit_cftypes(cfts); 3197 return -ENOMEM; 3198 } 3199 kf_ops->atomic_write_len = cft->max_write_len; 3200 } 3201 3202 cft->kf_ops = kf_ops; 3203 cft->ss = ss; 3204 } 3205 3206 return 0; 3207 } 3208 3209 static int cgroup_rm_cftypes_locked(struct cftype *cfts) 3210 { 3211 lockdep_assert_held(&cgroup_mutex); 3212 3213 if (!cfts || !cfts[0].ss) 3214 return -ENOENT; 3215 3216 list_del(&cfts->node); 3217 cgroup_apply_cftypes(cfts, false); 3218 cgroup_exit_cftypes(cfts); 3219 return 0; 3220 } 3221 3222 /** 3223 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 3224 * @cfts: zero-length name terminated array of cftypes 3225 * 3226 * Unregister @cfts. Files described by @cfts are removed from all 3227 * existing cgroups and all future cgroups won't have them either. This 3228 * function can be called anytime whether @cfts' subsys is attached or not. 3229 * 3230 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 3231 * registered. 3232 */ 3233 int cgroup_rm_cftypes(struct cftype *cfts) 3234 { 3235 int ret; 3236 3237 mutex_lock(&cgroup_mutex); 3238 ret = cgroup_rm_cftypes_locked(cfts); 3239 mutex_unlock(&cgroup_mutex); 3240 return ret; 3241 } 3242 3243 /** 3244 * cgroup_add_cftypes - add an array of cftypes to a subsystem 3245 * @ss: target cgroup subsystem 3246 * @cfts: zero-length name terminated array of cftypes 3247 * 3248 * Register @cfts to @ss. Files described by @cfts are created for all 3249 * existing cgroups to which @ss is attached and all future cgroups will 3250 * have them too. This function can be called anytime whether @ss is 3251 * attached or not. 3252 * 3253 * Returns 0 on successful registration, -errno on failure. Note that this 3254 * function currently returns 0 as long as @cfts registration is successful 3255 * even if some file creation attempts on existing cgroups fail. 3256 */ 3257 static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3258 { 3259 int ret; 3260 3261 if (!cgroup_ssid_enabled(ss->id)) 3262 return 0; 3263 3264 if (!cfts || cfts[0].name[0] == '\0') 3265 return 0; 3266 3267 ret = cgroup_init_cftypes(ss, cfts); 3268 if (ret) 3269 return ret; 3270 3271 mutex_lock(&cgroup_mutex); 3272 3273 list_add_tail(&cfts->node, &ss->cfts); 3274 ret = cgroup_apply_cftypes(cfts, true); 3275 if (ret) 3276 cgroup_rm_cftypes_locked(cfts); 3277 3278 mutex_unlock(&cgroup_mutex); 3279 return ret; 3280 } 3281 3282 /** 3283 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy 3284 * @ss: target cgroup subsystem 3285 * @cfts: zero-length name terminated array of cftypes 3286 * 3287 * Similar to cgroup_add_cftypes() but the added files are only used for 3288 * the default hierarchy. 3289 */ 3290 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3291 { 3292 struct cftype *cft; 3293 3294 for (cft = cfts; cft && cft->name[0] != '\0'; cft++) 3295 cft->flags |= __CFTYPE_ONLY_ON_DFL; 3296 return cgroup_add_cftypes(ss, cfts); 3297 } 3298 3299 /** 3300 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies 3301 * @ss: target cgroup subsystem 3302 * @cfts: zero-length name terminated array of cftypes 3303 * 3304 * Similar to cgroup_add_cftypes() but the added files are only used for 3305 * the legacy hierarchies. 
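 *
 * Typical usage looks like the sketch below; the subsystem and file names
 * are made up for illustration. Note the empty entry terminating the
 * array, as in cgroup_base_files[] further down:
 *
 *	static struct cftype my_legacy_files[] = {
 *		{
 *			.name = "my.stat",
 *			.seq_show = my_stat_show,
 *		},
 *		{ }
 *	};
 *
 *	cgroup_add_legacy_cftypes(&my_cgrp_subsys, my_legacy_files);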
3306 */ 3307 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3308 { 3309 struct cftype *cft; 3310 3311 for (cft = cfts; cft && cft->name[0] != '\0'; cft++) 3312 cft->flags |= __CFTYPE_NOT_ON_DFL; 3313 return cgroup_add_cftypes(ss, cfts); 3314 } 3315 3316 /** 3317 * cgroup_file_notify - generate a file modified event for a cgroup_file 3318 * @cfile: target cgroup_file 3319 * 3320 * @cfile must have been obtained by setting cftype->file_offset. 3321 */ 3322 void cgroup_file_notify(struct cgroup_file *cfile) 3323 { 3324 unsigned long flags; 3325 3326 spin_lock_irqsave(&cgroup_file_kn_lock, flags); 3327 if (cfile->kn) 3328 kernfs_notify(cfile->kn); 3329 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); 3330 } 3331 3332 /** 3333 * css_next_child - find the next child of a given css 3334 * @pos: the current position (%NULL to initiate traversal) 3335 * @parent: css whose children to walk 3336 * 3337 * This function returns the next child of @parent and should be called 3338 * under either cgroup_mutex or RCU read lock. The only requirement is 3339 * that @parent and @pos are accessible. The next sibling is guaranteed to 3340 * be returned regardless of their states. 3341 * 3342 * If a subsystem synchronizes ->css_online() and the start of iteration, a 3343 * css which finished ->css_online() is guaranteed to be visible in the 3344 * future iterations and will stay visible until the last reference is put. 3345 * A css which hasn't finished ->css_online() or already finished 3346 * ->css_offline() may show up during traversal. It's each subsystem's 3347 * responsibility to synchronize against on/offlining. 3348 */ 3349 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, 3350 struct cgroup_subsys_state *parent) 3351 { 3352 struct cgroup_subsys_state *next; 3353 3354 cgroup_assert_mutex_or_rcu_locked(); 3355 3356 /* 3357 * @pos could already have been unlinked from the sibling list. 3358 * Once a cgroup is removed, its ->sibling.next is no longer 3359 * updated when its next sibling changes. CSS_RELEASED is set when 3360 * @pos is taken off list, at which time its next pointer is valid, 3361 * and, as releases are serialized, the one pointed to by the next 3362 * pointer is guaranteed to not have started release yet. This 3363 * implies that if we observe !CSS_RELEASED on @pos in this RCU 3364 * critical section, the one pointed to by its next pointer is 3365 * guaranteed to not have finished its RCU grace period even if we 3366 * have dropped rcu_read_lock() inbetween iterations. 3367 * 3368 * If @pos has CSS_RELEASED set, its next pointer can't be 3369 * dereferenced; however, as each css is given a monotonically 3370 * increasing unique serial number and always appended to the 3371 * sibling list, the next one can be found by walking the parent's 3372 * children until the first css with higher serial number than 3373 * @pos's. While this path can be slower, it happens iff iteration 3374 * races against release and the race window is very small. 3375 */ 3376 if (!pos) { 3377 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); 3378 } else if (likely(!(pos->flags & CSS_RELEASED))) { 3379 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); 3380 } else { 3381 list_for_each_entry_rcu(next, &parent->children, sibling) 3382 if (next->serial_nr > pos->serial_nr) 3383 break; 3384 } 3385 3386 /* 3387 * @next, if not pointing to the head, can be dereferenced and is 3388 * the next sibling. 
3389 */ 3390 if (&next->sibling != &parent->children) 3391 return next; 3392 return NULL; 3393 } 3394 3395 /** 3396 * css_next_descendant_pre - find the next descendant for pre-order walk 3397 * @pos: the current position (%NULL to initiate traversal) 3398 * @root: css whose descendants to walk 3399 * 3400 * To be used by css_for_each_descendant_pre(). Find the next descendant 3401 * to visit for pre-order traversal of @root's descendants. @root is 3402 * included in the iteration and the first node to be visited. 3403 * 3404 * While this function requires cgroup_mutex or RCU read locking, it 3405 * doesn't require the whole traversal to be contained in a single critical 3406 * section. This function will return the correct next descendant as long 3407 * as both @pos and @root are accessible and @pos is a descendant of @root. 3408 * 3409 * If a subsystem synchronizes ->css_online() and the start of iteration, a 3410 * css which finished ->css_online() is guaranteed to be visible in the 3411 * future iterations and will stay visible until the last reference is put. 3412 * A css which hasn't finished ->css_online() or already finished 3413 * ->css_offline() may show up during traversal. It's each subsystem's 3414 * responsibility to synchronize against on/offlining. 3415 */ 3416 struct cgroup_subsys_state * 3417 css_next_descendant_pre(struct cgroup_subsys_state *pos, 3418 struct cgroup_subsys_state *root) 3419 { 3420 struct cgroup_subsys_state *next; 3421 3422 cgroup_assert_mutex_or_rcu_locked(); 3423 3424 /* if first iteration, visit @root */ 3425 if (!pos) 3426 return root; 3427 3428 /* visit the first child if exists */ 3429 next = css_next_child(NULL, pos); 3430 if (next) 3431 return next; 3432 3433 /* no child, visit my or the closest ancestor's next sibling */ 3434 while (pos != root) { 3435 next = css_next_child(pos, pos->parent); 3436 if (next) 3437 return next; 3438 pos = pos->parent; 3439 } 3440 3441 return NULL; 3442 } 3443 3444 /** 3445 * css_rightmost_descendant - return the rightmost descendant of a css 3446 * @pos: css of interest 3447 * 3448 * Return the rightmost descendant of @pos. If there's no descendant, @pos 3449 * is returned. This can be used during pre-order traversal to skip 3450 * subtree of @pos. 3451 * 3452 * While this function requires cgroup_mutex or RCU read locking, it 3453 * doesn't require the whole traversal to be contained in a single critical 3454 * section. This function will return the correct rightmost descendant as 3455 * long as @pos is accessible. 3456 */ 3457 struct cgroup_subsys_state * 3458 css_rightmost_descendant(struct cgroup_subsys_state *pos) 3459 { 3460 struct cgroup_subsys_state *last, *tmp; 3461 3462 cgroup_assert_mutex_or_rcu_locked(); 3463 3464 do { 3465 last = pos; 3466 /* ->prev isn't RCU safe, walk ->next till the end */ 3467 pos = NULL; 3468 css_for_each_child(tmp, last) 3469 pos = tmp; 3470 } while (pos); 3471 3472 return last; 3473 } 3474 3475 static struct cgroup_subsys_state * 3476 css_leftmost_descendant(struct cgroup_subsys_state *pos) 3477 { 3478 struct cgroup_subsys_state *last; 3479 3480 do { 3481 last = pos; 3482 pos = css_next_child(NULL, pos); 3483 } while (pos); 3484 3485 return last; 3486 } 3487 3488 /** 3489 * css_next_descendant_post - find the next descendant for post-order walk 3490 * @pos: the current position (%NULL to initiate traversal) 3491 * @root: css whose descendants to walk 3492 * 3493 * To be used by css_for_each_descendant_post(). 
Find the next descendant 3494 * to visit for post-order traversal of @root's descendants. @root is 3495 * included in the iteration and the last node to be visited. 3496 * 3497 * While this function requires cgroup_mutex or RCU read locking, it 3498 * doesn't require the whole traversal to be contained in a single critical 3499 * section. This function will return the correct next descendant as long 3500 * as both @pos and @root are accessible and @pos is a descendant of 3501 * @root. 3502 * 3503 * If a subsystem synchronizes ->css_online() and the start of iteration, a 3504 * css which finished ->css_online() is guaranteed to be visible in the 3505 * future iterations and will stay visible until the last reference is put. 3506 * A css which hasn't finished ->css_online() or already finished 3507 * ->css_offline() may show up during traversal. It's each subsystem's 3508 * responsibility to synchronize against on/offlining. 3509 */ 3510 struct cgroup_subsys_state * 3511 css_next_descendant_post(struct cgroup_subsys_state *pos, 3512 struct cgroup_subsys_state *root) 3513 { 3514 struct cgroup_subsys_state *next; 3515 3516 cgroup_assert_mutex_or_rcu_locked(); 3517 3518 /* if first iteration, visit leftmost descendant which may be @root */ 3519 if (!pos) 3520 return css_leftmost_descendant(root); 3521 3522 /* if we visited @root, we're done */ 3523 if (pos == root) 3524 return NULL; 3525 3526 /* if there's an unvisited sibling, visit its leftmost descendant */ 3527 next = css_next_child(pos, pos->parent); 3528 if (next) 3529 return css_leftmost_descendant(next); 3530 3531 /* no sibling left, visit parent */ 3532 return pos->parent; 3533 } 3534 3535 /** 3536 * css_has_online_children - does a css have online children 3537 * @css: the target css 3538 * 3539 * Returns %true if @css has any online children; otherwise, %false. This 3540 * function can be called from any context but the caller is responsible 3541 * for synchronizing against on/offlining as necessary. 3542 */ 3543 bool css_has_online_children(struct cgroup_subsys_state *css) 3544 { 3545 struct cgroup_subsys_state *child; 3546 bool ret = false; 3547 3548 rcu_read_lock(); 3549 css_for_each_child(child, css) { 3550 if (child->flags & CSS_ONLINE) { 3551 ret = true; 3552 break; 3553 } 3554 } 3555 rcu_read_unlock(); 3556 return ret; 3557 } 3558 3559 /** 3560 * css_task_iter_advance_css_set - advance a task iterator to the next css_set 3561 * @it: the iterator to advance 3562 * 3563 * Advance @it to the next css_set to walk.
3564 */ 3565 static void css_task_iter_advance_css_set(struct css_task_iter *it) 3566 { 3567 struct list_head *l = it->cset_pos; 3568 struct cgrp_cset_link *link; 3569 struct css_set *cset; 3570 3571 lockdep_assert_held(&css_set_lock); 3572 3573 /* Advance to the next non-empty css_set */ 3574 do { 3575 l = l->next; 3576 if (l == it->cset_head) { 3577 it->cset_pos = NULL; 3578 it->task_pos = NULL; 3579 return; 3580 } 3581 3582 if (it->ss) { 3583 cset = container_of(l, struct css_set, 3584 e_cset_node[it->ss->id]); 3585 } else { 3586 link = list_entry(l, struct cgrp_cset_link, cset_link); 3587 cset = link->cset; 3588 } 3589 } while (!css_set_populated(cset)); 3590 3591 it->cset_pos = l; 3592 3593 if (!list_empty(&cset->tasks)) 3594 it->task_pos = cset->tasks.next; 3595 else 3596 it->task_pos = cset->mg_tasks.next; 3597 3598 it->tasks_head = &cset->tasks; 3599 it->mg_tasks_head = &cset->mg_tasks; 3600 3601 /* 3602 * We don't keep css_sets locked across iteration steps and thus 3603 * need to take steps to ensure that iteration can be resumed after 3604 * the lock is re-acquired. Iteration is performed at two levels - 3605 * css_sets and tasks in them. 3606 * 3607 * Once created, a css_set never leaves its cgroup lists, so a 3608 * pinned css_set is guaranteed to stay put and we can resume 3609 * iteration afterwards. 3610 * 3611 * Tasks may leave @cset across iteration steps. This is resolved 3612 * by registering each iterator with the css_set currently being 3613 * walked and making css_set_move_task() advance iterators whose 3614 * next task is leaving. 3615 */ 3616 if (it->cur_cset) { 3617 list_del(&it->iters_node); 3618 put_css_set_locked(it->cur_cset); 3619 } 3620 get_css_set(cset); 3621 it->cur_cset = cset; 3622 list_add(&it->iters_node, &cset->task_iters); 3623 } 3624 3625 static void css_task_iter_advance(struct css_task_iter *it) 3626 { 3627 struct list_head *l = it->task_pos; 3628 3629 lockdep_assert_held(&css_set_lock); 3630 WARN_ON_ONCE(!l); 3631 3632 /* 3633 * Advance iterator to find next entry. cset->tasks is consumed 3634 * first and then ->mg_tasks. After ->mg_tasks, we move onto the 3635 * next cset. 3636 */ 3637 l = l->next; 3638 3639 if (l == it->tasks_head) 3640 l = it->mg_tasks_head->next; 3641 3642 if (l == it->mg_tasks_head) 3643 css_task_iter_advance_css_set(it); 3644 else 3645 it->task_pos = l; 3646 } 3647 3648 /** 3649 * css_task_iter_start - initiate task iteration 3650 * @css: the css to walk tasks of 3651 * @it: the task iterator to use 3652 * 3653 * Initiate iteration through the tasks of @css. The caller can call 3654 * css_task_iter_next() to walk through the tasks until the function 3655 * returns NULL. On completion of iteration, css_task_iter_end() must be 3656 * called. 3657 */ 3658 void css_task_iter_start(struct cgroup_subsys_state *css, 3659 struct css_task_iter *it) 3660 { 3661 /* no one should try to iterate before mounting cgroups */ 3662 WARN_ON_ONCE(!use_task_css_set_links); 3663 3664 memset(it, 0, sizeof(*it)); 3665 3666 spin_lock_irq(&css_set_lock); 3667 3668 it->ss = css->ss; 3669 3670 if (it->ss) 3671 it->cset_pos = &css->cgroup->e_csets[css->ss->id]; 3672 else 3673 it->cset_pos = &css->cgroup->cset_links; 3674 3675 it->cset_head = it->cset_pos; 3676 3677 css_task_iter_advance_css_set(it); 3678 3679 spin_unlock_irq(&css_set_lock); 3680 } 3681 3682 /** 3683 * css_task_iter_next - return the next task for the iterator 3684 * @it: the task iterator being iterated 3685 * 3686 * The "next" function for task iteration. 
@it should have been 3687 * initialized via css_task_iter_start(). Returns NULL when the iteration 3688 * reaches the end. 3689 */ 3690 struct task_struct *css_task_iter_next(struct css_task_iter *it) 3691 { 3692 if (it->cur_task) { 3693 put_task_struct(it->cur_task); 3694 it->cur_task = NULL; 3695 } 3696 3697 spin_lock_irq(&css_set_lock); 3698 3699 if (it->task_pos) { 3700 it->cur_task = list_entry(it->task_pos, struct task_struct, 3701 cg_list); 3702 get_task_struct(it->cur_task); 3703 css_task_iter_advance(it); 3704 } 3705 3706 spin_unlock_irq(&css_set_lock); 3707 3708 return it->cur_task; 3709 } 3710 3711 /** 3712 * css_task_iter_end - finish task iteration 3713 * @it: the task iterator to finish 3714 * 3715 * Finish task iteration started by css_task_iter_start(). 3716 */ 3717 void css_task_iter_end(struct css_task_iter *it) 3718 { 3719 if (it->cur_cset) { 3720 spin_lock_irq(&css_set_lock); 3721 list_del(&it->iters_node); 3722 put_css_set_locked(it->cur_cset); 3723 spin_unlock_irq(&css_set_lock); 3724 } 3725 3726 if (it->cur_task) 3727 put_task_struct(it->cur_task); 3728 } 3729 3730 static void cgroup_procs_release(struct kernfs_open_file *of) 3731 { 3732 if (of->priv) { 3733 css_task_iter_end(of->priv); 3734 kfree(of->priv); 3735 } 3736 } 3737 3738 static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) 3739 { 3740 struct kernfs_open_file *of = s->private; 3741 struct css_task_iter *it = of->priv; 3742 struct task_struct *task; 3743 3744 do { 3745 task = css_task_iter_next(it); 3746 } while (task && !thread_group_leader(task)); 3747 3748 return task; 3749 } 3750 3751 static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) 3752 { 3753 struct kernfs_open_file *of = s->private; 3754 struct cgroup *cgrp = seq_css(s)->cgroup; 3755 struct css_task_iter *it = of->priv; 3756 3757 /* 3758 * When a seq_file is seeked, it's always traversed sequentially 3759 * from position 0, so we can simply keep iterating on !0 *pos. 3760 */ 3761 if (!it) { 3762 if (WARN_ON_ONCE((*pos)++)) 3763 return ERR_PTR(-EINVAL); 3764 3765 it = kzalloc(sizeof(*it), GFP_KERNEL); 3766 if (!it) 3767 return ERR_PTR(-ENOMEM); 3768 of->priv = it; 3769 css_task_iter_start(&cgrp->self, it); 3770 } else if (!(*pos)++) { 3771 css_task_iter_end(it); 3772 css_task_iter_start(&cgrp->self, it); 3773 } 3774 3775 return cgroup_procs_next(s, NULL, NULL); 3776 } 3777 3778 static int cgroup_procs_show(struct seq_file *s, void *v) 3779 { 3780 seq_printf(s, "%d\n", task_tgid_vnr(v)); 3781 return 0; 3782 } 3783 3784 /* cgroup core interface files for the default hierarchy */ 3785 static struct cftype cgroup_base_files[] = { 3786 { 3787 .name = "cgroup.procs", 3788 .file_offset = offsetof(struct cgroup, procs_file), 3789 .release = cgroup_procs_release, 3790 .seq_start = cgroup_procs_start, 3791 .seq_next = cgroup_procs_next, 3792 .seq_show = cgroup_procs_show, 3793 .write = cgroup_procs_write, 3794 }, 3795 { 3796 .name = "cgroup.controllers", 3797 .seq_show = cgroup_controllers_show, 3798 }, 3799 { 3800 .name = "cgroup.subtree_control", 3801 .seq_show = cgroup_subtree_control_show, 3802 .write = cgroup_subtree_control_write, 3803 }, 3804 { 3805 .name = "cgroup.events", 3806 .flags = CFTYPE_NOT_ON_ROOT, 3807 .file_offset = offsetof(struct cgroup, events_file), 3808 .seq_show = cgroup_events_show, 3809 }, 3810 { } /* terminate */ 3811 }; 3812 3813 /* 3814 * css destruction is four-stage process. 3815 * 3816 * 1. Destruction starts. Killing of the percpu_ref is initiated. 3817 * Implemented in kill_css(). 
3818 * 3819 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 3820 * and thus css_tryget_online() is guaranteed to fail, the css can be 3821 * offlined by invoking offline_css(). After offlining, the base ref is 3822 * put. Implemented in css_killed_work_fn(). 3823 * 3824 * 3. When the percpu_ref reaches zero, the only possible remaining 3825 * accessors are inside RCU read sections. css_release() schedules the 3826 * RCU callback. 3827 * 3828 * 4. After the grace period, the css can be freed. Implemented in 3829 * css_free_work_fn(). 3830 * 3831 * It is actually hairier because both step 2 and 4 require process context 3832 * and thus involve punting to css->destroy_work adding two additional 3833 * steps to the already complex sequence. 3834 */ 3835 static void css_free_work_fn(struct work_struct *work) 3836 { 3837 struct cgroup_subsys_state *css = 3838 container_of(work, struct cgroup_subsys_state, destroy_work); 3839 struct cgroup_subsys *ss = css->ss; 3840 struct cgroup *cgrp = css->cgroup; 3841 3842 percpu_ref_exit(&css->refcnt); 3843 3844 if (ss) { 3845 /* css free path */ 3846 struct cgroup_subsys_state *parent = css->parent; 3847 int id = css->id; 3848 3849 ss->css_free(css); 3850 cgroup_idr_remove(&ss->css_idr, id); 3851 cgroup_put(cgrp); 3852 3853 if (parent) 3854 css_put(parent); 3855 } else { 3856 /* cgroup free path */ 3857 atomic_dec(&cgrp->root->nr_cgrps); 3858 cgroup1_pidlist_destroy_all(cgrp); 3859 cancel_work_sync(&cgrp->release_agent_work); 3860 3861 if (cgroup_parent(cgrp)) { 3862 /* 3863 * We get a ref to the parent, and put the ref when 3864 * this cgroup is being freed, so it's guaranteed 3865 * that the parent won't be destroyed before its 3866 * children. 3867 */ 3868 cgroup_put(cgroup_parent(cgrp)); 3869 kernfs_put(cgrp->kn); 3870 kfree(cgrp); 3871 } else { 3872 /* 3873 * This is root cgroup's refcnt reaching zero, 3874 * which indicates that the root should be 3875 * released. 3876 */ 3877 cgroup_destroy_root(cgrp->root); 3878 } 3879 } 3880 } 3881 3882 static void css_free_rcu_fn(struct rcu_head *rcu_head) 3883 { 3884 struct cgroup_subsys_state *css = 3885 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3886 3887 INIT_WORK(&css->destroy_work, css_free_work_fn); 3888 queue_work(cgroup_destroy_wq, &css->destroy_work); 3889 } 3890 3891 static void css_release_work_fn(struct work_struct *work) 3892 { 3893 struct cgroup_subsys_state *css = 3894 container_of(work, struct cgroup_subsys_state, destroy_work); 3895 struct cgroup_subsys *ss = css->ss; 3896 struct cgroup *cgrp = css->cgroup; 3897 3898 mutex_lock(&cgroup_mutex); 3899 3900 css->flags |= CSS_RELEASED; 3901 list_del_rcu(&css->sibling); 3902 3903 if (ss) { 3904 /* css release path */ 3905 cgroup_idr_replace(&ss->css_idr, NULL, css->id); 3906 if (ss->css_released) 3907 ss->css_released(css); 3908 } else { 3909 /* cgroup release path */ 3910 trace_cgroup_release(cgrp); 3911 3912 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 3913 cgrp->id = -1; 3914 3915 /* 3916 * There are two control paths which try to determine 3917 * cgroup from dentry without going through kernfs - 3918 * cgroupstats_build() and css_tryget_online_from_dir(). 3919 * Those are supported by RCU protecting clearing of 3920 * cgrp->kn->priv backpointer. 
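 *
 * (For reference, the reader side in css_tryget_online_from_dir()
 *  below does, under rcu_read_lock(),
 *
 *	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
 *
 *  and then tries to pin the result with css_tryget_online().)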
3921 */ 3922 if (cgrp->kn) 3923 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, 3924 NULL); 3925 3926 cgroup_bpf_put(cgrp); 3927 } 3928 3929 mutex_unlock(&cgroup_mutex); 3930 3931 call_rcu(&css->rcu_head, css_free_rcu_fn); 3932 } 3933 3934 static void css_release(struct percpu_ref *ref) 3935 { 3936 struct cgroup_subsys_state *css = 3937 container_of(ref, struct cgroup_subsys_state, refcnt); 3938 3939 INIT_WORK(&css->destroy_work, css_release_work_fn); 3940 queue_work(cgroup_destroy_wq, &css->destroy_work); 3941 } 3942 3943 static void init_and_link_css(struct cgroup_subsys_state *css, 3944 struct cgroup_subsys *ss, struct cgroup *cgrp) 3945 { 3946 lockdep_assert_held(&cgroup_mutex); 3947 3948 cgroup_get(cgrp); 3949 3950 memset(css, 0, sizeof(*css)); 3951 css->cgroup = cgrp; 3952 css->ss = ss; 3953 css->id = -1; 3954 INIT_LIST_HEAD(&css->sibling); 3955 INIT_LIST_HEAD(&css->children); 3956 css->serial_nr = css_serial_nr_next++; 3957 atomic_set(&css->online_cnt, 0); 3958 3959 if (cgroup_parent(cgrp)) { 3960 css->parent = cgroup_css(cgroup_parent(cgrp), ss); 3961 css_get(css->parent); 3962 } 3963 3964 BUG_ON(cgroup_css(cgrp, ss)); 3965 } 3966 3967 /* invoke ->css_online() on a new CSS and mark it online if successful */ 3968 static int online_css(struct cgroup_subsys_state *css) 3969 { 3970 struct cgroup_subsys *ss = css->ss; 3971 int ret = 0; 3972 3973 lockdep_assert_held(&cgroup_mutex); 3974 3975 if (ss->css_online) 3976 ret = ss->css_online(css); 3977 if (!ret) { 3978 css->flags |= CSS_ONLINE; 3979 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 3980 3981 atomic_inc(&css->online_cnt); 3982 if (css->parent) 3983 atomic_inc(&css->parent->online_cnt); 3984 } 3985 return ret; 3986 } 3987 3988 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ 3989 static void offline_css(struct cgroup_subsys_state *css) 3990 { 3991 struct cgroup_subsys *ss = css->ss; 3992 3993 lockdep_assert_held(&cgroup_mutex); 3994 3995 if (!(css->flags & CSS_ONLINE)) 3996 return; 3997 3998 if (ss->css_reset) 3999 ss->css_reset(css); 4000 4001 if (ss->css_offline) 4002 ss->css_offline(css); 4003 4004 css->flags &= ~CSS_ONLINE; 4005 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); 4006 4007 wake_up_all(&css->cgroup->offline_waitq); 4008 } 4009 4010 /** 4011 * css_create - create a cgroup_subsys_state 4012 * @cgrp: the cgroup new css will be associated with 4013 * @ss: the subsys of new css 4014 * 4015 * Create a new css associated with @cgrp - @ss pair. On success, the new 4016 * css is online and installed in @cgrp. This function doesn't create the 4017 * interface files. Returns 0 on success, -errno on failure. 
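 *
 * In practice the caller receives the error through the returned
 * pointer; a rough sketch of the pattern used by
 * cgroup_apply_control_enable():
 *
 *	css = css_create(cgrp, ss);
 *	if (IS_ERR(css))
 *		return PTR_ERR(css);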
4018 */ 4019 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, 4020 struct cgroup_subsys *ss) 4021 { 4022 struct cgroup *parent = cgroup_parent(cgrp); 4023 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); 4024 struct cgroup_subsys_state *css; 4025 int err; 4026 4027 lockdep_assert_held(&cgroup_mutex); 4028 4029 css = ss->css_alloc(parent_css); 4030 if (!css) 4031 css = ERR_PTR(-ENOMEM); 4032 if (IS_ERR(css)) 4033 return css; 4034 4035 init_and_link_css(css, ss, cgrp); 4036 4037 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); 4038 if (err) 4039 goto err_free_css; 4040 4041 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); 4042 if (err < 0) 4043 goto err_free_css; 4044 css->id = err; 4045 4046 /* @css is ready to be brought online now, make it visible */ 4047 list_add_tail_rcu(&css->sibling, &parent_css->children); 4048 cgroup_idr_replace(&ss->css_idr, css, css->id); 4049 4050 err = online_css(css); 4051 if (err) 4052 goto err_list_del; 4053 4054 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4055 cgroup_parent(parent)) { 4056 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4057 current->comm, current->pid, ss->name); 4058 if (!strcmp(ss->name, "memory")) 4059 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); 4060 ss->warned_broken_hierarchy = true; 4061 } 4062 4063 return css; 4064 4065 err_list_del: 4066 list_del_rcu(&css->sibling); 4067 err_free_css: 4068 call_rcu(&css->rcu_head, css_free_rcu_fn); 4069 return ERR_PTR(err); 4070 } 4071 4072 /* 4073 * The returned cgroup is fully initialized including its control mask, but 4074 * it isn't associated with its kernfs_node and doesn't have the control 4075 * mask applied. 4076 */ 4077 static struct cgroup *cgroup_create(struct cgroup *parent) 4078 { 4079 struct cgroup_root *root = parent->root; 4080 struct cgroup *cgrp, *tcgrp; 4081 int level = parent->level + 1; 4082 int ret; 4083 4084 /* allocate the cgroup and its ID, 0 is reserved for the root */ 4085 cgrp = kzalloc(sizeof(*cgrp) + 4086 sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); 4087 if (!cgrp) 4088 return ERR_PTR(-ENOMEM); 4089 4090 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); 4091 if (ret) 4092 goto out_free_cgrp; 4093 4094 /* 4095 * Temporarily set the pointer to NULL, so idr_find() won't return 4096 * a half-baked cgroup. 4097 */ 4098 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); 4099 if (cgrp->id < 0) { 4100 ret = -ENOMEM; 4101 goto out_cancel_ref; 4102 } 4103 4104 init_cgroup_housekeeping(cgrp); 4105 4106 cgrp->self.parent = &parent->self; 4107 cgrp->root = root; 4108 cgrp->level = level; 4109 4110 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) 4111 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; 4112 4113 if (notify_on_release(parent)) 4114 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4115 4116 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4117 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4118 4119 cgrp->self.serial_nr = css_serial_nr_next++; 4120 4121 /* allocation complete, commit to creation */ 4122 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); 4123 atomic_inc(&root->nr_cgrps); 4124 cgroup_get(parent); 4125 4126 /* 4127 * @cgrp is now fully operational. If something fails after this 4128 * point, it'll be released via the normal destruction path. 
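 *
 * (cgroup_mkdir() relies on this: once cgroup_create() has returned,
 *  its error unwinding goes through cgroup_destroy_locked() instead
 *  of freeing @cgrp directly.)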
4129 */ 4130 cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4131 4132 /* 4133 * On the default hierarchy, a child doesn't automatically inherit 4134 * subtree_control from the parent. Each is configured manually. 4135 */ 4136 if (!cgroup_on_dfl(cgrp)) 4137 cgrp->subtree_control = cgroup_control(cgrp); 4138 4139 if (parent) 4140 cgroup_bpf_inherit(cgrp, parent); 4141 4142 cgroup_propagate_control(cgrp); 4143 4144 return cgrp; 4145 4146 out_cancel_ref: 4147 percpu_ref_exit(&cgrp->self.refcnt); 4148 out_free_cgrp: 4149 kfree(cgrp); 4150 return ERR_PTR(ret); 4151 } 4152 4153 int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) 4154 { 4155 struct cgroup *parent, *cgrp; 4156 struct kernfs_node *kn; 4157 int ret; 4158 4159 /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ 4160 if (strchr(name, '\n')) 4161 return -EINVAL; 4162 4163 parent = cgroup_kn_lock_live(parent_kn, false); 4164 if (!parent) 4165 return -ENODEV; 4166 4167 cgrp = cgroup_create(parent); 4168 if (IS_ERR(cgrp)) { 4169 ret = PTR_ERR(cgrp); 4170 goto out_unlock; 4171 } 4172 4173 /* create the directory */ 4174 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 4175 if (IS_ERR(kn)) { 4176 ret = PTR_ERR(kn); 4177 goto out_destroy; 4178 } 4179 cgrp->kn = kn; 4180 4181 /* 4182 * This extra ref will be put in cgroup_free_fn() and guarantees 4183 * that @cgrp->kn is always accessible. 4184 */ 4185 kernfs_get(kn); 4186 4187 ret = cgroup_kn_set_ugid(kn); 4188 if (ret) 4189 goto out_destroy; 4190 4191 ret = css_populate_dir(&cgrp->self); 4192 if (ret) 4193 goto out_destroy; 4194 4195 ret = cgroup_apply_control_enable(cgrp); 4196 if (ret) 4197 goto out_destroy; 4198 4199 trace_cgroup_mkdir(cgrp); 4200 4201 /* let's create and online css's */ 4202 kernfs_activate(kn); 4203 4204 ret = 0; 4205 goto out_unlock; 4206 4207 out_destroy: 4208 cgroup_destroy_locked(cgrp); 4209 out_unlock: 4210 cgroup_kn_unlock(parent_kn); 4211 return ret; 4212 } 4213 4214 /* 4215 * This is called when the refcnt of a css is confirmed to be killed. 4216 * css_tryget_online() is now guaranteed to fail. Tell the subsystem to 4217 * initate destruction and put the css ref from kill_css(). 4218 */ 4219 static void css_killed_work_fn(struct work_struct *work) 4220 { 4221 struct cgroup_subsys_state *css = 4222 container_of(work, struct cgroup_subsys_state, destroy_work); 4223 4224 mutex_lock(&cgroup_mutex); 4225 4226 do { 4227 offline_css(css); 4228 css_put(css); 4229 /* @css can't go away while we're holding cgroup_mutex */ 4230 css = css->parent; 4231 } while (css && atomic_dec_and_test(&css->online_cnt)); 4232 4233 mutex_unlock(&cgroup_mutex); 4234 } 4235 4236 /* css kill confirmation processing requires process context, bounce */ 4237 static void css_killed_ref_fn(struct percpu_ref *ref) 4238 { 4239 struct cgroup_subsys_state *css = 4240 container_of(ref, struct cgroup_subsys_state, refcnt); 4241 4242 if (atomic_dec_and_test(&css->online_cnt)) { 4243 INIT_WORK(&css->destroy_work, css_killed_work_fn); 4244 queue_work(cgroup_destroy_wq, &css->destroy_work); 4245 } 4246 } 4247 4248 /** 4249 * kill_css - destroy a css 4250 * @css: css to destroy 4251 * 4252 * This function initiates destruction of @css by removing cgroup interface 4253 * files and putting its base reference. ->css_offline() will be invoked 4254 * asynchronously once css_tryget_online() is guaranteed to fail and when 4255 * the reference count reaches zero, @css will be released. 
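 *
 * In this destruction path ->css_offline() therefore ends up running
 * from css_killed_work_fn() above, i.e. from a cgroup_destroy_wq work
 * item with cgroup_mutex held, once css_tryget_online() is already
 * guaranteed to fail.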
4256 */ 4257 static void kill_css(struct cgroup_subsys_state *css) 4258 { 4259 lockdep_assert_held(&cgroup_mutex); 4260 4261 /* 4262 * This must happen before css is disassociated with its cgroup. 4263 * See seq_css() for details. 4264 */ 4265 css_clear_dir(css); 4266 4267 /* 4268 * Killing would put the base ref, but we need to keep it alive 4269 * until after ->css_offline(). 4270 */ 4271 css_get(css); 4272 4273 /* 4274 * cgroup core guarantees that, by the time ->css_offline() is 4275 * invoked, no new css reference will be given out via 4276 * css_tryget_online(). We can't simply call percpu_ref_kill() and 4277 * proceed to offlining css's because percpu_ref_kill() doesn't 4278 * guarantee that the ref is seen as killed on all CPUs on return. 4279 * 4280 * Use percpu_ref_kill_and_confirm() to get notifications as each 4281 * css is confirmed to be seen as killed on all CPUs. 4282 */ 4283 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); 4284 } 4285 4286 /** 4287 * cgroup_destroy_locked - the first stage of cgroup destruction 4288 * @cgrp: cgroup to be destroyed 4289 * 4290 * css's make use of percpu refcnts whose killing latency shouldn't be 4291 * exposed to userland and are RCU protected. Also, cgroup core needs to 4292 * guarantee that css_tryget_online() won't succeed by the time 4293 * ->css_offline() is invoked. To satisfy all the requirements, 4294 * destruction is implemented in the following two steps. 4295 * 4296 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 4297 * userland visible parts and start killing the percpu refcnts of 4298 * css's. Set up so that the next stage will be kicked off once all 4299 * the percpu refcnts are confirmed to be killed. 4300 * 4301 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the 4302 * rest of destruction. Once all cgroup references are gone, the 4303 * cgroup is RCU-freed. 4304 * 4305 * This function implements s1. After this step, @cgrp is gone as far as 4306 * the userland is concerned and a new cgroup with the same name may be 4307 * created. As cgroup doesn't care about the names internally, this 4308 * doesn't cause any problem. 4309 */ 4310 static int cgroup_destroy_locked(struct cgroup *cgrp) 4311 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4312 { 4313 struct cgroup_subsys_state *css; 4314 struct cgrp_cset_link *link; 4315 int ssid; 4316 4317 lockdep_assert_held(&cgroup_mutex); 4318 4319 /* 4320 * Only migration can raise populated from zero and we're already 4321 * holding cgroup_mutex. 4322 */ 4323 if (cgroup_is_populated(cgrp)) 4324 return -EBUSY; 4325 4326 /* 4327 * Make sure there's no live children. We can't test emptiness of 4328 * ->self.children as dead children linger on it while being 4329 * drained; otherwise, "rmdir parent/child parent" may fail. 4330 */ 4331 if (css_has_online_children(&cgrp->self)) 4332 return -EBUSY; 4333 4334 /* 4335 * Mark @cgrp and the associated csets dead. The former prevents 4336 * further task migration and child creation by disabling 4337 * cgroup_lock_live_group(). The latter makes the csets ignored by 4338 * the migration path. 4339 */ 4340 cgrp->self.flags &= ~CSS_ONLINE; 4341 4342 spin_lock_irq(&css_set_lock); 4343 list_for_each_entry(link, &cgrp->cset_links, cset_link) 4344 link->cset->dead = true; 4345 spin_unlock_irq(&css_set_lock); 4346 4347 /* initiate massacre of all css's */ 4348 for_each_css(css, ssid, cgrp) 4349 kill_css(css); 4350 4351 /* 4352 * Remove @cgrp directory along with the base files. 
@cgrp has an 4353 * extra ref on its kn. 4354 */ 4355 kernfs_remove(cgrp->kn); 4356 4357 cgroup1_check_for_release(cgroup_parent(cgrp)); 4358 4359 /* put the base reference */ 4360 percpu_ref_kill(&cgrp->self.refcnt); 4361 4362 return 0; 4363 }; 4364 4365 int cgroup_rmdir(struct kernfs_node *kn) 4366 { 4367 struct cgroup *cgrp; 4368 int ret = 0; 4369 4370 cgrp = cgroup_kn_lock_live(kn, false); 4371 if (!cgrp) 4372 return 0; 4373 4374 ret = cgroup_destroy_locked(cgrp); 4375 4376 if (!ret) 4377 trace_cgroup_rmdir(cgrp); 4378 4379 cgroup_kn_unlock(kn); 4380 return ret; 4381 } 4382 4383 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { 4384 .remount_fs = cgroup_remount, 4385 .mkdir = cgroup_mkdir, 4386 .rmdir = cgroup_rmdir, 4387 .show_path = cgroup_show_path, 4388 }; 4389 4390 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) 4391 { 4392 struct cgroup_subsys_state *css; 4393 4394 pr_debug("Initializing cgroup subsys %s\n", ss->name); 4395 4396 mutex_lock(&cgroup_mutex); 4397 4398 idr_init(&ss->css_idr); 4399 INIT_LIST_HEAD(&ss->cfts); 4400 4401 /* Create the root cgroup state for this subsystem */ 4402 ss->root = &cgrp_dfl_root; 4403 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4404 /* We don't handle early failures gracefully */ 4405 BUG_ON(IS_ERR(css)); 4406 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); 4407 4408 /* 4409 * Root csses are never destroyed and we can't initialize 4410 * percpu_ref during early init. Disable refcnting. 4411 */ 4412 css->flags |= CSS_NO_REF; 4413 4414 if (early) { 4415 /* allocation can't be done safely during early init */ 4416 css->id = 1; 4417 } else { 4418 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); 4419 BUG_ON(css->id < 0); 4420 } 4421 4422 /* Update the init_css_set to contain a subsys 4423 * pointer to this state - since the subsystem is 4424 * newly registered, all tasks and hence the 4425 * init_css_set is in the subsystem's root cgroup. */ 4426 init_css_set.subsys[ss->id] = css; 4427 4428 have_fork_callback |= (bool)ss->fork << ss->id; 4429 have_exit_callback |= (bool)ss->exit << ss->id; 4430 have_free_callback |= (bool)ss->free << ss->id; 4431 have_canfork_callback |= (bool)ss->can_fork << ss->id; 4432 4433 /* At system boot, before all subsystems have been 4434 * registered, no tasks have been forked, so we don't 4435 * need to invoke fork callbacks here. */ 4436 BUG_ON(!list_empty(&init_task.tasks)); 4437 4438 BUG_ON(online_css(css)); 4439 4440 mutex_unlock(&cgroup_mutex); 4441 } 4442 4443 /** 4444 * cgroup_init_early - cgroup initialization at system boot 4445 * 4446 * Initialize cgroups at system boot, and initialize any 4447 * subsystems that request early init. 
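 *
 * A subsystem opts in by setting ->early_init in its struct
 * cgroup_subsys. Such subsystems have their root css set up here via
 * cgroup_init_subsys(ss, true); allocation of the real css id is
 * deferred to cgroup_init() because the idr can't be used this early.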
4448 */ 4449 int __init cgroup_init_early(void) 4450 { 4451 static struct cgroup_sb_opts __initdata opts; 4452 struct cgroup_subsys *ss; 4453 int i; 4454 4455 init_cgroup_root(&cgrp_dfl_root, &opts); 4456 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; 4457 4458 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4459 4460 for_each_subsys(ss, i) { 4461 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, 4462 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", 4463 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, 4464 ss->id, ss->name); 4465 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, 4466 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); 4467 4468 ss->id = i; 4469 ss->name = cgroup_subsys_name[i]; 4470 if (!ss->legacy_name) 4471 ss->legacy_name = cgroup_subsys_name[i]; 4472 4473 if (ss->early_init) 4474 cgroup_init_subsys(ss, true); 4475 } 4476 return 0; 4477 } 4478 4479 static u16 cgroup_disable_mask __initdata; 4480 4481 /** 4482 * cgroup_init - cgroup initialization 4483 * 4484 * Register cgroup filesystem and /proc file, and initialize 4485 * any subsystems that didn't request early init. 4486 */ 4487 int __init cgroup_init(void) 4488 { 4489 struct cgroup_subsys *ss; 4490 int ssid; 4491 4492 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); 4493 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); 4494 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4495 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); 4496 4497 /* 4498 * The latency of the synchronize_sched() is too high for cgroups, 4499 * avoid it at the cost of forcing all readers into the slow path. 4500 */ 4501 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); 4502 4503 get_user_ns(init_cgroup_ns.user_ns); 4504 4505 mutex_lock(&cgroup_mutex); 4506 4507 /* 4508 * Add init_css_set to the hash table so that dfl_root can link to 4509 * it during init. 4510 */ 4511 hash_add(css_set_table, &init_css_set.hlist, 4512 css_set_hash(init_css_set.subsys)); 4513 4514 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4515 4516 mutex_unlock(&cgroup_mutex); 4517 4518 for_each_subsys(ss, ssid) { 4519 if (ss->early_init) { 4520 struct cgroup_subsys_state *css = 4521 init_css_set.subsys[ss->id]; 4522 4523 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, 4524 GFP_KERNEL); 4525 BUG_ON(css->id < 0); 4526 } else { 4527 cgroup_init_subsys(ss, false); 4528 } 4529 4530 list_add_tail(&init_css_set.e_cset_node[ssid], 4531 &cgrp_dfl_root.cgrp.e_csets[ssid]); 4532 4533 /* 4534 * Setting dfl_root subsys_mask needs to consider the 4535 * disabled flag and cftype registration needs kmalloc, 4536 * both of which aren't available during early_init. 
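 *
 * (cgroup_disable_mask is filled in from the "cgroup_disable=" boot
 *  parameter by cgroup_disable() below; booting with, say,
 *  cgroup_disable=memory keeps that controller off for the whole
 *  boot.)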
4537 */ 4538 if (cgroup_disable_mask & (1 << ssid)) { 4539 static_branch_disable(cgroup_subsys_enabled_key[ssid]); 4540 printk(KERN_INFO "Disabling %s control group subsystem\n", 4541 ss->name); 4542 continue; 4543 } 4544 4545 if (cgroup1_ssid_disabled(ssid)) 4546 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", 4547 ss->name); 4548 4549 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4550 4551 if (ss->implicit_on_dfl) 4552 cgrp_dfl_implicit_ss_mask |= 1 << ss->id; 4553 else if (!ss->dfl_cftypes) 4554 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; 4555 4556 if (ss->dfl_cftypes == ss->legacy_cftypes) { 4557 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); 4558 } else { 4559 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); 4560 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); 4561 } 4562 4563 if (ss->bind) 4564 ss->bind(init_css_set.subsys[ssid]); 4565 } 4566 4567 /* init_css_set.subsys[] has been updated, re-hash */ 4568 hash_del(&init_css_set.hlist); 4569 hash_add(css_set_table, &init_css_set.hlist, 4570 css_set_hash(init_css_set.subsys)); 4571 4572 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); 4573 WARN_ON(register_filesystem(&cgroup_fs_type)); 4574 WARN_ON(register_filesystem(&cgroup2_fs_type)); 4575 WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); 4576 4577 return 0; 4578 } 4579 4580 static int __init cgroup_wq_init(void) 4581 { 4582 /* 4583 * There isn't much point in executing destruction path in 4584 * parallel. Good chunk is serialized with cgroup_mutex anyway. 4585 * Use 1 for @max_active. 4586 * 4587 * We would prefer to do this in cgroup_init() above, but that 4588 * is called before init_workqueues(): so leave this until after. 4589 */ 4590 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); 4591 BUG_ON(!cgroup_destroy_wq); 4592 return 0; 4593 } 4594 core_initcall(cgroup_wq_init); 4595 4596 /* 4597 * proc_cgroup_show() 4598 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4599 * - Used for /proc/<pid>/cgroup. 4600 */ 4601 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, 4602 struct pid *pid, struct task_struct *tsk) 4603 { 4604 char *buf; 4605 int retval; 4606 struct cgroup_root *root; 4607 4608 retval = -ENOMEM; 4609 buf = kmalloc(PATH_MAX, GFP_KERNEL); 4610 if (!buf) 4611 goto out; 4612 4613 mutex_lock(&cgroup_mutex); 4614 spin_lock_irq(&css_set_lock); 4615 4616 for_each_root(root) { 4617 struct cgroup_subsys *ss; 4618 struct cgroup *cgrp; 4619 int ssid, count = 0; 4620 4621 if (root == &cgrp_dfl_root && !cgrp_dfl_visible) 4622 continue; 4623 4624 seq_printf(m, "%d:", root->hierarchy_id); 4625 if (root != &cgrp_dfl_root) 4626 for_each_subsys(ss, ssid) 4627 if (root->subsys_mask & (1 << ssid)) 4628 seq_printf(m, "%s%s", count++ ? "," : "", 4629 ss->legacy_name); 4630 if (strlen(root->name)) 4631 seq_printf(m, "%sname=%s", count ? "," : "", 4632 root->name); 4633 seq_putc(m, ':'); 4634 4635 cgrp = task_cgroup_from_root(tsk, root); 4636 4637 /* 4638 * On traditional hierarchies, all zombie tasks show up as 4639 * belonging to the root cgroup. On the default hierarchy, 4640 * while a zombie doesn't show up in "cgroup.procs" and 4641 * thus can't be migrated, its /proc/PID/cgroup keeps 4642 * reporting the cgroup it belonged to before exiting. If 4643 * the cgroup is removed before the zombie is reaped, 4644 * " (deleted)" is appended to the cgroup path. 
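 *
 * For illustration, the emitted lines look roughly like this (the
 * hierarchy IDs, controllers and paths are made up):
 *
 *	5:cpu,cpuacct:/user.slice
 *	1:name=systemd:/user.slice/session-2.scope
 *	0::/user.slice/session-2.scope
 *
 * with " (deleted)" appended to the last form once the default
 * hierarchy cgroup has been removed.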
4645 */ 4646 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { 4647 retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, 4648 current->nsproxy->cgroup_ns); 4649 if (retval >= PATH_MAX) 4650 retval = -ENAMETOOLONG; 4651 if (retval < 0) 4652 goto out_unlock; 4653 4654 seq_puts(m, buf); 4655 } else { 4656 seq_puts(m, "/"); 4657 } 4658 4659 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) 4660 seq_puts(m, " (deleted)\n"); 4661 else 4662 seq_putc(m, '\n'); 4663 } 4664 4665 retval = 0; 4666 out_unlock: 4667 spin_unlock_irq(&css_set_lock); 4668 mutex_unlock(&cgroup_mutex); 4669 kfree(buf); 4670 out: 4671 return retval; 4672 } 4673 4674 /** 4675 * cgroup_fork - initialize cgroup related fields during copy_process() 4676 * @child: pointer to task_struct of forking parent process. 4677 * 4678 * A task is associated with the init_css_set until cgroup_post_fork() 4679 * attaches it to the parent's css_set. Empty cg_list indicates that 4680 * @child isn't holding reference to its css_set. 4681 */ 4682 void cgroup_fork(struct task_struct *child) 4683 { 4684 RCU_INIT_POINTER(child->cgroups, &init_css_set); 4685 INIT_LIST_HEAD(&child->cg_list); 4686 } 4687 4688 /** 4689 * cgroup_can_fork - called on a new task before the process is exposed 4690 * @child: the task in question. 4691 * 4692 * This calls the subsystem can_fork() callbacks. If the can_fork() callback 4693 * returns an error, the fork aborts with that error code. This allows for 4694 * a cgroup subsystem to conditionally allow or deny new forks. 4695 */ 4696 int cgroup_can_fork(struct task_struct *child) 4697 { 4698 struct cgroup_subsys *ss; 4699 int i, j, ret; 4700 4701 do_each_subsys_mask(ss, i, have_canfork_callback) { 4702 ret = ss->can_fork(child); 4703 if (ret) 4704 goto out_revert; 4705 } while_each_subsys_mask(); 4706 4707 return 0; 4708 4709 out_revert: 4710 for_each_subsys(ss, j) { 4711 if (j >= i) 4712 break; 4713 if (ss->cancel_fork) 4714 ss->cancel_fork(child); 4715 } 4716 4717 return ret; 4718 } 4719 4720 /** 4721 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() 4722 * @child: the task in question 4723 * 4724 * This calls the cancel_fork() callbacks if a fork failed *after* 4725 * cgroup_can_fork() succeded. 4726 */ 4727 void cgroup_cancel_fork(struct task_struct *child) 4728 { 4729 struct cgroup_subsys *ss; 4730 int i; 4731 4732 for_each_subsys(ss, i) 4733 if (ss->cancel_fork) 4734 ss->cancel_fork(child); 4735 } 4736 4737 /** 4738 * cgroup_post_fork - called on a new task after adding it to the task list 4739 * @child: the task in question 4740 * 4741 * Adds the task to the list running through its css_set if necessary and 4742 * call the subsystem fork() callbacks. Has to be after the task is 4743 * visible on the task list in case we race with the first call to 4744 * cgroup_task_iter_start() - to guarantee that the new task ends up on its 4745 * list. 4746 */ 4747 void cgroup_post_fork(struct task_struct *child) 4748 { 4749 struct cgroup_subsys *ss; 4750 int i; 4751 4752 /* 4753 * This may race against cgroup_enable_task_cg_lists(). As that 4754 * function sets use_task_css_set_links before grabbing 4755 * tasklist_lock and we just went through tasklist_lock to add 4756 * @child, it's guaranteed that either we see the set 4757 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees 4758 * @child during its iteration. 4759 * 4760 * If we won the race, @child is associated with %current's 4761 * css_set. 
Grabbing css_set_lock guarantees both that the 4762 * association is stable, and, on completion of the parent's 4763 * migration, @child is visible in the source of migration or 4764 * already in the destination cgroup. This guarantee is necessary 4765 * when implementing operations which need to migrate all tasks of 4766 * a cgroup to another. 4767 * 4768 * Note that if we lose to cgroup_enable_task_cg_lists(), @child 4769 * will remain in init_css_set. This is safe because all tasks are 4770 * in the init_css_set before cg_links is enabled and there's no 4771 * operation which transfers all tasks out of init_css_set. 4772 */ 4773 if (use_task_css_set_links) { 4774 struct css_set *cset; 4775 4776 spin_lock_irq(&css_set_lock); 4777 cset = task_css_set(current); 4778 if (list_empty(&child->cg_list)) { 4779 get_css_set(cset); 4780 css_set_move_task(child, NULL, cset, false); 4781 } 4782 spin_unlock_irq(&css_set_lock); 4783 } 4784 4785 /* 4786 * Call ss->fork(). This must happen after @child is linked on 4787 * css_set; otherwise, @child might change state between ->fork() 4788 * and addition to css_set. 4789 */ 4790 do_each_subsys_mask(ss, i, have_fork_callback) { 4791 ss->fork(child); 4792 } while_each_subsys_mask(); 4793 } 4794 4795 /** 4796 * cgroup_exit - detach cgroup from exiting task 4797 * @tsk: pointer to task_struct of exiting process 4798 * 4799 * Description: Detach cgroup from @tsk and release it. 4800 * 4801 * Note that cgroups marked notify_on_release force every task in 4802 * them to take the global cgroup_mutex when exiting. 4803 * This could impact scaling on very large systems. Be reluctant to 4804 * use notify_on_release cgroups where very high task exit scaling 4805 * is required on large systems. 4806 * 4807 * We set the exiting task's cgroup to the root cgroup (top_cgroup). We 4808 * call cgroup_exit() while the task is still competent to handle 4809 * notify_on_release(), then leave the task attached to the root cgroup in 4810 * each hierarchy for the remainder of its exit. No need to bother with 4811 * init_css_set refcnting. init_css_set never goes away and we can't race 4812 * with the migration path - PF_EXITING is visible to the migration path. 4813 */ 4814 void cgroup_exit(struct task_struct *tsk) 4815 { 4816 struct cgroup_subsys *ss; 4817 struct css_set *cset; 4818 int i; 4819 4820 /* 4821 * Unlink @tsk from its css_set. As the migration path can't race 4822 * with us, we can check css_set and cg_list without synchronization.
4823 */ 4824 cset = task_css_set(tsk); 4825 4826 if (!list_empty(&tsk->cg_list)) { 4827 spin_lock_irq(&css_set_lock); 4828 css_set_move_task(tsk, cset, NULL, false); 4829 spin_unlock_irq(&css_set_lock); 4830 } else { 4831 get_css_set(cset); 4832 } 4833 4834 /* see cgroup_post_fork() for details */ 4835 do_each_subsys_mask(ss, i, have_exit_callback) { 4836 ss->exit(tsk); 4837 } while_each_subsys_mask(); 4838 } 4839 4840 void cgroup_free(struct task_struct *task) 4841 { 4842 struct css_set *cset = task_css_set(task); 4843 struct cgroup_subsys *ss; 4844 int ssid; 4845 4846 do_each_subsys_mask(ss, ssid, have_free_callback) { 4847 ss->free(task); 4848 } while_each_subsys_mask(); 4849 4850 put_css_set(cset); 4851 } 4852 4853 static int __init cgroup_disable(char *str) 4854 { 4855 struct cgroup_subsys *ss; 4856 char *token; 4857 int i; 4858 4859 while ((token = strsep(&str, ",")) != NULL) { 4860 if (!*token) 4861 continue; 4862 4863 for_each_subsys(ss, i) { 4864 if (strcmp(token, ss->name) && 4865 strcmp(token, ss->legacy_name)) 4866 continue; 4867 cgroup_disable_mask |= 1 << i; 4868 } 4869 } 4870 return 1; 4871 } 4872 __setup("cgroup_disable=", cgroup_disable); 4873 4874 /** 4875 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 4876 * @dentry: directory dentry of interest 4877 * @ss: subsystem of interest 4878 * 4879 * If @dentry is a directory for a cgroup which has @ss enabled on it, try 4880 * to get the corresponding css and return it. If such css doesn't exist 4881 * or can't be pinned, an ERR_PTR value is returned. 4882 */ 4883 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, 4884 struct cgroup_subsys *ss) 4885 { 4886 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 4887 struct file_system_type *s_type = dentry->d_sb->s_type; 4888 struct cgroup_subsys_state *css = NULL; 4889 struct cgroup *cgrp; 4890 4891 /* is @dentry a cgroup dir? */ 4892 if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || 4893 !kn || kernfs_type(kn) != KERNFS_DIR) 4894 return ERR_PTR(-EBADF); 4895 4896 rcu_read_lock(); 4897 4898 /* 4899 * This path doesn't originate from kernfs and @kn could already 4900 * have been or be removed at any point. @kn->priv is RCU 4901 * protected for this access. See css_release_work_fn() for details. 4902 */ 4903 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); 4904 if (cgrp) 4905 css = cgroup_css(cgrp, ss); 4906 4907 if (!css || !css_tryget_online(css)) 4908 css = ERR_PTR(-ENOENT); 4909 4910 rcu_read_unlock(); 4911 return css; 4912 } 4913 4914 /** 4915 * css_from_id - lookup css by id 4916 * @id: the cgroup id 4917 * @ss: cgroup subsys to be looked into 4918 * 4919 * Returns the css if there's valid one with @id, otherwise returns NULL. 4920 * Should be called under rcu_read_lock(). 4921 */ 4922 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 4923 { 4924 WARN_ON_ONCE(!rcu_read_lock_held()); 4925 return idr_find(&ss->css_idr, id); 4926 } 4927 4928 /** 4929 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path 4930 * @path: path on the default hierarchy 4931 * 4932 * Find the cgroup at @path on the default hierarchy, increment its 4933 * reference count and return it. Returns pointer to the found cgroup on 4934 * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) 4935 * if @path points to a non-directory. 
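 *
 * The caller owns a reference on the returned cgroup and is expected
 * to drop it with cgroup_put(); a minimal sketch (the path is made
 * up):
 *
 *	cgrp = cgroup_get_from_path("/my.slice");
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);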
4936 */ 4937 struct cgroup *cgroup_get_from_path(const char *path) 4938 { 4939 struct kernfs_node *kn; 4940 struct cgroup *cgrp; 4941 4942 mutex_lock(&cgroup_mutex); 4943 4944 kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); 4945 if (kn) { 4946 if (kernfs_type(kn) == KERNFS_DIR) { 4947 cgrp = kn->priv; 4948 cgroup_get(cgrp); 4949 } else { 4950 cgrp = ERR_PTR(-ENOTDIR); 4951 } 4952 kernfs_put(kn); 4953 } else { 4954 cgrp = ERR_PTR(-ENOENT); 4955 } 4956 4957 mutex_unlock(&cgroup_mutex); 4958 return cgrp; 4959 } 4960 EXPORT_SYMBOL_GPL(cgroup_get_from_path); 4961 4962 /** 4963 * cgroup_get_from_fd - get a cgroup pointer from a fd 4964 * @fd: fd obtained by open(cgroup2_dir) 4965 * 4966 * Find the cgroup from a fd which should be obtained 4967 * by opening a cgroup directory. Returns a pointer to the 4968 * cgroup on success. ERR_PTR is returned if the cgroup 4969 * cannot be found. 4970 */ 4971 struct cgroup *cgroup_get_from_fd(int fd) 4972 { 4973 struct cgroup_subsys_state *css; 4974 struct cgroup *cgrp; 4975 struct file *f; 4976 4977 f = fget_raw(fd); 4978 if (!f) 4979 return ERR_PTR(-EBADF); 4980 4981 css = css_tryget_online_from_dir(f->f_path.dentry, NULL); 4982 fput(f); 4983 if (IS_ERR(css)) 4984 return ERR_CAST(css); 4985 4986 cgrp = css->cgroup; 4987 if (!cgroup_on_dfl(cgrp)) { 4988 cgroup_put(cgrp); 4989 return ERR_PTR(-EBADF); 4990 } 4991 4992 return cgrp; 4993 } 4994 EXPORT_SYMBOL_GPL(cgroup_get_from_fd); 4995 4996 /* 4997 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data 4998 * definition in cgroup-defs.h. 4999 */ 5000 #ifdef CONFIG_SOCK_CGROUP_DATA 5001 5002 #if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) 5003 5004 DEFINE_SPINLOCK(cgroup_sk_update_lock); 5005 static bool cgroup_sk_alloc_disabled __read_mostly; 5006 5007 void cgroup_sk_alloc_disable(void) 5008 { 5009 if (cgroup_sk_alloc_disabled) 5010 return; 5011 pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); 5012 cgroup_sk_alloc_disabled = true; 5013 } 5014 5015 #else 5016 5017 #define cgroup_sk_alloc_disabled false 5018 5019 #endif 5020 5021 void cgroup_sk_alloc(struct sock_cgroup_data *skcd) 5022 { 5023 if (cgroup_sk_alloc_disabled) 5024 return; 5025 5026 /* Socket clone path */ 5027 if (skcd->val) { 5028 cgroup_get(sock_cgroup_ptr(skcd)); 5029 return; 5030 } 5031 5032 rcu_read_lock(); 5033 5034 while (true) { 5035 struct css_set *cset; 5036 5037 cset = task_css_set(current); 5038 if (likely(cgroup_tryget(cset->dfl_cgrp))) { 5039 skcd->val = (unsigned long)cset->dfl_cgrp; 5040 break; 5041 } 5042 cpu_relax(); 5043 } 5044 5045 rcu_read_unlock(); 5046 } 5047 5048 void cgroup_sk_free(struct sock_cgroup_data *skcd) 5049 { 5050 cgroup_put(sock_cgroup_ptr(skcd)); 5051 } 5052 5053 #endif /* CONFIG_SOCK_CGROUP_DATA */ 5054 5055 #ifdef CONFIG_CGROUP_BPF 5056 int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, 5057 enum bpf_attach_type type, bool overridable) 5058 { 5059 struct cgroup *parent = cgroup_parent(cgrp); 5060 int ret; 5061 5062 mutex_lock(&cgroup_mutex); 5063 ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); 5064 mutex_unlock(&cgroup_mutex); 5065 return ret; 5066 } 5067 #endif /* CONFIG_CGROUP_BPF */ 5068