/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset.  They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective
	 * masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * These are the old Memory Nodes that tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
115 * - old_mems_allowed is used in cpuset_migrate_mm() when we change 116 * cpuset.mems_allowed and have tasks' nodemask updated, and 117 * then old_mems_allowed is updated to mems_allowed. 118 */ 119 nodemask_t old_mems_allowed; 120 121 struct fmeter fmeter; /* memory_pressure filter */ 122 123 /* 124 * Tasks are being attached to this cpuset. Used to prevent 125 * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). 126 */ 127 int attach_in_progress; 128 129 /* partition number for rebuild_sched_domains() */ 130 int pn; 131 132 /* for custom sched domain */ 133 int relax_domain_level; 134 }; 135 136 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) 137 { 138 return css ? container_of(css, struct cpuset, css) : NULL; 139 } 140 141 /* Retrieve the cpuset for a task */ 142 static inline struct cpuset *task_cs(struct task_struct *task) 143 { 144 return css_cs(task_css(task, cpuset_cgrp_id)); 145 } 146 147 static inline struct cpuset *parent_cs(struct cpuset *cs) 148 { 149 return css_cs(cs->css.parent); 150 } 151 152 #ifdef CONFIG_NUMA 153 static inline bool task_has_mempolicy(struct task_struct *task) 154 { 155 return task->mempolicy; 156 } 157 #else 158 static inline bool task_has_mempolicy(struct task_struct *task) 159 { 160 return false; 161 } 162 #endif 163 164 165 /* bits in struct cpuset flags field */ 166 typedef enum { 167 CS_ONLINE, 168 CS_CPU_EXCLUSIVE, 169 CS_MEM_EXCLUSIVE, 170 CS_MEM_HARDWALL, 171 CS_MEMORY_MIGRATE, 172 CS_SCHED_LOAD_BALANCE, 173 CS_SPREAD_PAGE, 174 CS_SPREAD_SLAB, 175 } cpuset_flagbits_t; 176 177 /* convenient tests for these bits */ 178 static inline bool is_cpuset_online(const struct cpuset *cs) 179 { 180 return test_bit(CS_ONLINE, &cs->flags); 181 } 182 183 static inline int is_cpu_exclusive(const struct cpuset *cs) 184 { 185 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 186 } 187 188 static inline int is_mem_exclusive(const struct cpuset *cs) 189 { 190 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 191 } 192 193 static inline int is_mem_hardwall(const struct cpuset *cs) 194 { 195 return test_bit(CS_MEM_HARDWALL, &cs->flags); 196 } 197 198 static inline int is_sched_load_balance(const struct cpuset *cs) 199 { 200 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 201 } 202 203 static inline int is_memory_migrate(const struct cpuset *cs) 204 { 205 return test_bit(CS_MEMORY_MIGRATE, &cs->flags); 206 } 207 208 static inline int is_spread_page(const struct cpuset *cs) 209 { 210 return test_bit(CS_SPREAD_PAGE, &cs->flags); 211 } 212 213 static inline int is_spread_slab(const struct cpuset *cs) 214 { 215 return test_bit(CS_SPREAD_SLAB, &cs->flags); 216 } 217 218 static struct cpuset top_cpuset = { 219 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | 220 (1 << CS_MEM_EXCLUSIVE)), 221 }; 222 223 /** 224 * cpuset_for_each_child - traverse online children of a cpuset 225 * @child_cs: loop cursor pointing to the current child 226 * @pos_css: used for iteration 227 * @parent_cs: target cpuset to walk children of 228 * 229 * Walk @child_cs through the online children of @parent_cs. Must be used 230 * with RCU read locked. 
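 *
 * A minimal usage sketch (illustrative only; @parent is an assumed
 * struct cpuset * variable, and the loop body is just an example):
 *
 *	struct cpuset *child;
 *	struct cgroup_subsys_state *pos;
 *
 *	rcu_read_lock();
 *	cpuset_for_each_child(child, pos, parent)
 *		pr_info("child online: %d\n", is_cpuset_online(child));
 *	rcu_read_unlock();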
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset whose descendants to walk
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.  @root_cs is included in
 * the iteration and is the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock.  We also require taking task_lock() when dereferencing a
 * task's cpuset pointer.  See "The task_lock() exception", at the end of
 * this comment.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mempolicy may be changed by
 * another task, so we use alloc_lock in the task_struct to protect them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users.
 * If someone tries to mount the "cpuset" filesystem, we silently
 * switch it to mount "cgroup" instead.
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			cpumask_copy(pmask, cpu_online_mask);
			return;
		}
	}
	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
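 *
 * For example (values invented for illustration): a cpuset with cpus 0-1
 * and mem node 0 is a subset of one with cpus 0-3 and mem nodes 0-1,
 * provided it does not set cpu_exclusive or mem_exclusive while the other
 * has them clear; the comparison below treats each flag as a 0/1 value
 * and requires p's flag <= q's flag.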
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
		goto free_cs;
	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
		goto free_cpus;

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;

free_cpus:
	free_cpumask_var(trial->cpus_allowed);
free_cs:
	kfree(trial);
	return NULL;
}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->effective_cpus);
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
	    !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
513 */ 514 ret = -ENOSPC; 515 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { 516 if (!cpumask_empty(cur->cpus_allowed) && 517 cpumask_empty(trial->cpus_allowed)) 518 goto out; 519 if (!nodes_empty(cur->mems_allowed) && 520 nodes_empty(trial->mems_allowed)) 521 goto out; 522 } 523 524 /* 525 * We can't shrink if we won't have enough room for SCHED_DEADLINE 526 * tasks. 527 */ 528 ret = -EBUSY; 529 if (is_cpu_exclusive(cur) && 530 !cpuset_cpumask_can_shrink(cur->cpus_allowed, 531 trial->cpus_allowed)) 532 goto out; 533 534 ret = 0; 535 out: 536 rcu_read_unlock(); 537 return ret; 538 } 539 540 #ifdef CONFIG_SMP 541 /* 542 * Helper routine for generate_sched_domains(). 543 * Do cpusets a, b have overlapping effective cpus_allowed masks? 544 */ 545 static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 546 { 547 return cpumask_intersects(a->effective_cpus, b->effective_cpus); 548 } 549 550 static void 551 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) 552 { 553 if (dattr->relax_domain_level < c->relax_domain_level) 554 dattr->relax_domain_level = c->relax_domain_level; 555 return; 556 } 557 558 static void update_domain_attr_tree(struct sched_domain_attr *dattr, 559 struct cpuset *root_cs) 560 { 561 struct cpuset *cp; 562 struct cgroup_subsys_state *pos_css; 563 564 rcu_read_lock(); 565 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 566 /* skip the whole subtree if @cp doesn't have any CPU */ 567 if (cpumask_empty(cp->cpus_allowed)) { 568 pos_css = css_rightmost_descendant(pos_css); 569 continue; 570 } 571 572 if (is_sched_load_balance(cp)) 573 update_domain_attr(dattr, cp); 574 } 575 rcu_read_unlock(); 576 } 577 578 /* 579 * generate_sched_domains() 580 * 581 * This function builds a partial partition of the systems CPUs 582 * A 'partial partition' is a set of non-overlapping subsets whose 583 * union is a subset of that set. 584 * The output of this function needs to be passed to kernel/sched/core.c 585 * partition_sched_domains() routine, which will rebuild the scheduler's 586 * load balancing domains (sched domains) as specified by that partial 587 * partition. 588 * 589 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt 590 * for a background explanation of this. 591 * 592 * Does not return errors, on the theory that the callers of this 593 * routine would rather not worry about failures to rebuild sched 594 * domains when operating in the severe memory shortage situations 595 * that could cause allocation failures below. 596 * 597 * Must be called with cpuset_mutex held. 598 * 599 * The three key local variables below are: 600 * q - a linked-list queue of cpuset pointers, used to implement a 601 * top-down scan of all cpusets. This scan loads a pointer 602 * to each cpuset marked is_sched_load_balance into the 603 * array 'csa'. For our purposes, rebuilding the schedulers 604 * sched domains, we can ignore !is_sched_load_balance cpusets. 605 * csa - (for CpuSet Array) Array of pointers to all the cpusets 606 * that need to be load balanced, for convenient iterative 607 * access by the subsequent code that finds the best partition, 608 * i.e the set of domains (subsets) of CPUs such that the 609 * cpus_allowed of every cpuset marked is_sched_load_balance 610 * is a subset of one of these domains, while there are as 611 * many such domains as possible, each as small as possible. 
612 * doms - Conversion of 'csa' to an array of cpumasks, for passing to 613 * the kernel/sched/core.c routine partition_sched_domains() in a 614 * convenient format, that can be easily compared to the prior 615 * value to determine what partition elements (sched domains) 616 * were changed (added or removed.) 617 * 618 * Finding the best partition (set of domains): 619 * The triple nested loops below over i, j, k scan over the 620 * load balanced cpusets (using the array of cpuset pointers in 621 * csa[]) looking for pairs of cpusets that have overlapping 622 * cpus_allowed, but which don't have the same 'pn' partition 623 * number and gives them in the same partition number. It keeps 624 * looping on the 'restart' label until it can no longer find 625 * any such pairs. 626 * 627 * The union of the cpus_allowed masks from the set of 628 * all cpusets having the same 'pn' value then form the one 629 * element of the partition (one sched domain) to be passed to 630 * partition_sched_domains(). 631 */ 632 static int generate_sched_domains(cpumask_var_t **domains, 633 struct sched_domain_attr **attributes) 634 { 635 struct cpuset *cp; /* scans q */ 636 struct cpuset **csa; /* array of all cpuset ptrs */ 637 int csn; /* how many cpuset ptrs in csa so far */ 638 int i, j, k; /* indices for partition finding loops */ 639 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ 640 cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ 641 struct sched_domain_attr *dattr; /* attributes for custom domains */ 642 int ndoms = 0; /* number of sched domains in result */ 643 int nslot; /* next empty doms[] struct cpumask slot */ 644 struct cgroup_subsys_state *pos_css; 645 646 doms = NULL; 647 dattr = NULL; 648 csa = NULL; 649 650 if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) 651 goto done; 652 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 653 654 /* Special case for the 99% of systems with one, full, sched domain */ 655 if (is_sched_load_balance(&top_cpuset)) { 656 ndoms = 1; 657 doms = alloc_sched_domains(ndoms); 658 if (!doms) 659 goto done; 660 661 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 662 if (dattr) { 663 *dattr = SD_ATTR_INIT; 664 update_domain_attr_tree(dattr, &top_cpuset); 665 } 666 cpumask_and(doms[0], top_cpuset.effective_cpus, 667 non_isolated_cpus); 668 669 goto done; 670 } 671 672 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); 673 if (!csa) 674 goto done; 675 csn = 0; 676 677 rcu_read_lock(); 678 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { 679 if (cp == &top_cpuset) 680 continue; 681 /* 682 * Continue traversing beyond @cp iff @cp has some CPUs and 683 * isn't load balancing. The former is obvious. The 684 * latter: All child cpusets contain a subset of the 685 * parent's cpus, so just skip them, and then we call 686 * update_domain_attr_tree() to calc relax_domain_level of 687 * the corresponding sched domain. 
688 */ 689 if (!cpumask_empty(cp->cpus_allowed) && 690 !(is_sched_load_balance(cp) && 691 cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) 692 continue; 693 694 if (is_sched_load_balance(cp)) 695 csa[csn++] = cp; 696 697 /* skip @cp's subtree */ 698 pos_css = css_rightmost_descendant(pos_css); 699 } 700 rcu_read_unlock(); 701 702 for (i = 0; i < csn; i++) 703 csa[i]->pn = i; 704 ndoms = csn; 705 706 restart: 707 /* Find the best partition (set of sched domains) */ 708 for (i = 0; i < csn; i++) { 709 struct cpuset *a = csa[i]; 710 int apn = a->pn; 711 712 for (j = 0; j < csn; j++) { 713 struct cpuset *b = csa[j]; 714 int bpn = b->pn; 715 716 if (apn != bpn && cpusets_overlap(a, b)) { 717 for (k = 0; k < csn; k++) { 718 struct cpuset *c = csa[k]; 719 720 if (c->pn == bpn) 721 c->pn = apn; 722 } 723 ndoms--; /* one less element */ 724 goto restart; 725 } 726 } 727 } 728 729 /* 730 * Now we know how many domains to create. 731 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 732 */ 733 doms = alloc_sched_domains(ndoms); 734 if (!doms) 735 goto done; 736 737 /* 738 * The rest of the code, including the scheduler, can deal with 739 * dattr==NULL case. No need to abort if alloc fails. 740 */ 741 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 742 743 for (nslot = 0, i = 0; i < csn; i++) { 744 struct cpuset *a = csa[i]; 745 struct cpumask *dp; 746 int apn = a->pn; 747 748 if (apn < 0) { 749 /* Skip completed partitions */ 750 continue; 751 } 752 753 dp = doms[nslot]; 754 755 if (nslot == ndoms) { 756 static int warnings = 10; 757 if (warnings) { 758 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", 759 nslot, ndoms, csn, i, apn); 760 warnings--; 761 } 762 continue; 763 } 764 765 cpumask_clear(dp); 766 if (dattr) 767 *(dattr + nslot) = SD_ATTR_INIT; 768 for (j = i; j < csn; j++) { 769 struct cpuset *b = csa[j]; 770 771 if (apn == b->pn) { 772 cpumask_or(dp, dp, b->effective_cpus); 773 cpumask_and(dp, dp, non_isolated_cpus); 774 if (dattr) 775 update_domain_attr_tree(dattr + nslot, b); 776 777 /* Done with this partition */ 778 b->pn = -1; 779 } 780 } 781 nslot++; 782 } 783 BUG_ON(nslot != ndoms); 784 785 done: 786 free_cpumask_var(non_isolated_cpus); 787 kfree(csa); 788 789 /* 790 * Fallback to the default domain if kmalloc() failed. 791 * See comments in partition_sched_domains(). 792 */ 793 if (doms == NULL) 794 ndoms = 1; 795 796 *domains = doms; 797 *attributes = dattr; 798 return ndoms; 799 } 800 801 /* 802 * Rebuild scheduler domains. 803 * 804 * If the flag 'sched_load_balance' of any cpuset with non-empty 805 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset 806 * which has that flag enabled, or if any cpuset with a non-empty 807 * 'cpus' is removed, then call this routine to rebuild the 808 * scheduler's dynamic sched domains. 809 * 810 * Call with cpuset_mutex held. Takes get_online_cpus(). 811 */ 812 static void rebuild_sched_domains_locked(void) 813 { 814 struct sched_domain_attr *attr; 815 cpumask_var_t *doms; 816 int ndoms; 817 818 lockdep_assert_held(&cpuset_mutex); 819 get_online_cpus(); 820 821 /* 822 * We have raced with CPU hotplug. Don't do anything to avoid 823 * passing doms with offlined cpu to partition_sched_domains(). 824 * Anyways, hotplug work item will rebuild sched domains. 
	 */
	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		goto out;

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
out:
	put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_cpus: temp variable for calculating new effective_cpus
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
		    cpumask_empty(new_cpus))
			cpumask_copy(new_cpus, parent->effective_cpus);

		/* Skip the whole subtree if the cpumask remains the same. */
		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, new_cpus);
		spin_unlock_irq(&callback_lock);

		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * If the effective cpumask of any non-empty cpuset is changed,
		 * we need to rebuild sched domains.
923 */ 924 if (!cpumask_empty(cp->cpus_allowed) && 925 is_sched_load_balance(cp)) 926 need_rebuild_sched_domains = true; 927 928 rcu_read_lock(); 929 css_put(&cp->css); 930 } 931 rcu_read_unlock(); 932 933 if (need_rebuild_sched_domains) 934 rebuild_sched_domains_locked(); 935 } 936 937 /** 938 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 939 * @cs: the cpuset to consider 940 * @trialcs: trial cpuset 941 * @buf: buffer of cpu numbers written to this cpuset 942 */ 943 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 944 const char *buf) 945 { 946 int retval; 947 948 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 949 if (cs == &top_cpuset) 950 return -EACCES; 951 952 /* 953 * An empty cpus_allowed is ok only if the cpuset has no tasks. 954 * Since cpulist_parse() fails on an empty mask, we special case 955 * that parsing. The validate_change() call ensures that cpusets 956 * with tasks have cpus. 957 */ 958 if (!*buf) { 959 cpumask_clear(trialcs->cpus_allowed); 960 } else { 961 retval = cpulist_parse(buf, trialcs->cpus_allowed); 962 if (retval < 0) 963 return retval; 964 965 if (!cpumask_subset(trialcs->cpus_allowed, 966 top_cpuset.cpus_allowed)) 967 return -EINVAL; 968 } 969 970 /* Nothing to do if the cpus didn't change */ 971 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 972 return 0; 973 974 retval = validate_change(cs, trialcs); 975 if (retval < 0) 976 return retval; 977 978 spin_lock_irq(&callback_lock); 979 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 980 spin_unlock_irq(&callback_lock); 981 982 /* use trialcs->cpus_allowed as a temp variable */ 983 update_cpumasks_hier(cs, trialcs->cpus_allowed); 984 return 0; 985 } 986 987 /* 988 * Migrate memory region from one set of nodes to another. This is 989 * performed asynchronously as it can be called from process migration path 990 * holding locks involved in process management. All mm migrations are 991 * performed in the queued order and can be waited for by flushing 992 * cpuset_migrate_mm_wq. 
993 */ 994 995 struct cpuset_migrate_mm_work { 996 struct work_struct work; 997 struct mm_struct *mm; 998 nodemask_t from; 999 nodemask_t to; 1000 }; 1001 1002 static void cpuset_migrate_mm_workfn(struct work_struct *work) 1003 { 1004 struct cpuset_migrate_mm_work *mwork = 1005 container_of(work, struct cpuset_migrate_mm_work, work); 1006 1007 /* on a wq worker, no need to worry about %current's mems_allowed */ 1008 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); 1009 mmput(mwork->mm); 1010 kfree(mwork); 1011 } 1012 1013 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 1014 const nodemask_t *to) 1015 { 1016 struct cpuset_migrate_mm_work *mwork; 1017 1018 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); 1019 if (mwork) { 1020 mwork->mm = mm; 1021 mwork->from = *from; 1022 mwork->to = *to; 1023 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); 1024 queue_work(cpuset_migrate_mm_wq, &mwork->work); 1025 } else { 1026 mmput(mm); 1027 } 1028 } 1029 1030 static void cpuset_post_attach(void) 1031 { 1032 flush_workqueue(cpuset_migrate_mm_wq); 1033 } 1034 1035 /* 1036 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy 1037 * @tsk: the task to change 1038 * @newmems: new nodes that the task will be set 1039 * 1040 * In order to avoid seeing no nodes if the old and new nodes are disjoint, 1041 * we structure updates as setting all new allowed nodes, then clearing newly 1042 * disallowed ones. 1043 */ 1044 static void cpuset_change_task_nodemask(struct task_struct *tsk, 1045 nodemask_t *newmems) 1046 { 1047 bool need_loop; 1048 1049 task_lock(tsk); 1050 /* 1051 * Determine if a loop is necessary if another thread is doing 1052 * read_mems_allowed_begin(). If at least one node remains unchanged and 1053 * tsk does not have a mempolicy, then an empty nodemask will not be 1054 * possible when mems_allowed is larger than a word. 1055 */ 1056 need_loop = task_has_mempolicy(tsk) || 1057 !nodes_intersects(*newmems, tsk->mems_allowed); 1058 1059 if (need_loop) { 1060 local_irq_disable(); 1061 write_seqcount_begin(&tsk->mems_allowed_seq); 1062 } 1063 1064 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 1065 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 1066 1067 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 1068 tsk->mems_allowed = *newmems; 1069 1070 if (need_loop) { 1071 write_seqcount_end(&tsk->mems_allowed_seq); 1072 local_irq_enable(); 1073 } 1074 1075 task_unlock(tsk); 1076 } 1077 1078 static void *cpuset_being_rebound; 1079 1080 /** 1081 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1082 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1083 * 1084 * Iterate through each task of @cs updating its mems_allowed to the 1085 * effective cpuset's. As this function is called with cpuset_mutex held, 1086 * cpuset membership stays stable. 1087 */ 1088 static void update_tasks_nodemask(struct cpuset *cs) 1089 { 1090 static nodemask_t newmems; /* protected by cpuset_mutex */ 1091 struct css_task_iter it; 1092 struct task_struct *task; 1093 1094 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1095 1096 guarantee_online_mems(cs, &newmems); 1097 1098 /* 1099 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1100 * take while holding tasklist_lock. Forks can happen - the 1101 * mpol_dup() cpuset_being_rebound check will catch such forks, 1102 * and rebind their vma mempolicies too. 
	 * Because we still hold the global cpuset_mutex, we know that no
	 * other rebind effort will be contending for the global variable
	 * cpuset_being_rebound.  It's ok if we rebind the same mm twice;
	 * mpol_rebind_mm() is idempotent.  Also migrate pages in each mm
	 * to new nodes.
	 */
	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);

		mm = get_task_mm(task);
		if (!mm)
			continue;

		migrate = is_memory_migrate(cs);

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		else
			mmput(mm);
	}
	css_task_iter_end(&it);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
		    nodes_empty(*new_mems))
			*new_mems = parent->effective_mems;

		/* Skip the whole subtree if the nodemask remains the same. */
		if (nodes_equal(*new_mems, cp->effective_mems)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cp->effective_mems = *new_mems;
		spin_unlock_irq(&callback_lock);

		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
			!nodes_equal(cp->mems_allowed, cp->effective_mems));

		update_tasks_nodemask(cp);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind the task's mempolicy and any vma
 * mempolicies and, if the cpuset is marked 'memory_migrate',
 * migrate the task's pages to the new memory.
 *
 * Call with cpuset_mutex held.  May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
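 *
 * For orientation, a sketch of the call flow (this only summarizes code
 * elsewhere in this file, it adds no behaviour): a write such as "0-1"
 * to cpuset.mems is handled by cpuset_write_resmask(), which calls
 * update_nodemask(); the new mask is then propagated by
 * update_nodemasks_hier() and applied to each task via
 * update_tasks_nodemask().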
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	int retval;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				  top_cpuset.mems_allowed)) {
			retval = -EINVAL;
			goto done;
		}
	}

	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = trialcs->mems_allowed;
	spin_unlock_irq(&callback_lock);

	/* use trialcs->mems_allowed as a temp variable */
	update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
	return retval;
}

int current_cpuset_is_being_rebound(void)
{
	int ret;

	rcu_read_lock();
	ret = task_cs(current) == cpuset_being_rebound;
	rcu_read_unlock();

	return ret;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset_update_task_spread_flag(cs, task);
	css_task_iter_end(&it);
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_mutex held.
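 *
 * For example (restating the handlers elsewhere in this file, nothing
 * new): writing "1" to cpuset.mem_hardwall is handled by
 * cpuset_write_u64(), which calls update_flag(CS_MEM_HARDWALL, cs, 1).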
1314 */ 1315 1316 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1317 int turning_on) 1318 { 1319 struct cpuset *trialcs; 1320 int balance_flag_changed; 1321 int spread_flag_changed; 1322 int err; 1323 1324 trialcs = alloc_trial_cpuset(cs); 1325 if (!trialcs) 1326 return -ENOMEM; 1327 1328 if (turning_on) 1329 set_bit(bit, &trialcs->flags); 1330 else 1331 clear_bit(bit, &trialcs->flags); 1332 1333 err = validate_change(cs, trialcs); 1334 if (err < 0) 1335 goto out; 1336 1337 balance_flag_changed = (is_sched_load_balance(cs) != 1338 is_sched_load_balance(trialcs)); 1339 1340 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) 1341 || (is_spread_page(cs) != is_spread_page(trialcs))); 1342 1343 spin_lock_irq(&callback_lock); 1344 cs->flags = trialcs->flags; 1345 spin_unlock_irq(&callback_lock); 1346 1347 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1348 rebuild_sched_domains_locked(); 1349 1350 if (spread_flag_changed) 1351 update_tasks_flags(cs); 1352 out: 1353 free_trial_cpuset(trialcs); 1354 return err; 1355 } 1356 1357 /* 1358 * Frequency meter - How fast is some event occurring? 1359 * 1360 * These routines manage a digitally filtered, constant time based, 1361 * event frequency meter. There are four routines: 1362 * fmeter_init() - initialize a frequency meter. 1363 * fmeter_markevent() - called each time the event happens. 1364 * fmeter_getrate() - returns the recent rate of such events. 1365 * fmeter_update() - internal routine used to update fmeter. 1366 * 1367 * A common data structure is passed to each of these routines, 1368 * which is used to keep track of the state required to manage the 1369 * frequency meter and its digital filter. 1370 * 1371 * The filter works on the number of events marked per unit time. 1372 * The filter is single-pole low-pass recursive (IIR). The time unit 1373 * is 1 second. Arithmetic is done using 32-bit integers scaled to 1374 * simulate 3 decimal digits of precision (multiplied by 1000). 1375 * 1376 * With an FM_COEF of 933, and a time base of 1 second, the filter 1377 * has a half-life of 10 seconds, meaning that if the events quit 1378 * happening, then the rate returned from the fmeter_getrate() 1379 * will be cut in half each 10 seconds, until it converges to zero. 1380 * 1381 * It is not worth doing a real infinitely recursive filter. If more 1382 * than FM_MAXTICKS ticks have elapsed since the last filter event, 1383 * just compute FM_MAXTICKS ticks worth, by which point the level 1384 * will be stable. 1385 * 1386 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid 1387 * arithmetic overflow in the fmeter_update() routine. 1388 * 1389 * Given the simple 32 bit integer arithmetic used, this meter works 1390 * best for reporting rates between one per millisecond (msec) and 1391 * one per 32 (approx) seconds. At constant rates faster than one 1392 * per msec it maxes out at values just under 1,000,000. At constant 1393 * rates between one per msec, and one per second it will stabilize 1394 * to a value N*1000, where N is the rate of events per second. 1395 * At constant rates between one per second and one per 32 seconds, 1396 * it will be choppy, moving up on the seconds that have an event, 1397 * and then decaying until the next event. At rates slower than 1398 * about one in 32 seconds, it decays all the way back to zero between 1399 * each event. 
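 *
 * A small worked example of the arithmetic above (illustration only):
 * with no new events, each one-second tick multiplies val by
 * FM_COEF/FM_SCALE = 0.933, so after 10 ticks val is scaled by
 * 0.933^10 ~= 0.5, matching the stated 10 second half-life.  At a
 * steady rate of N events per second, val settles where
 * val = 0.933*val + 0.067*(N*1000), i.e. val ~= N*1000.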
1400 */ 1401 1402 #define FM_COEF 933 /* coefficient for half-life of 10 secs */ 1403 #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ 1404 #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ 1405 #define FM_SCALE 1000 /* faux fixed point scale */ 1406 1407 /* Initialize a frequency meter */ 1408 static void fmeter_init(struct fmeter *fmp) 1409 { 1410 fmp->cnt = 0; 1411 fmp->val = 0; 1412 fmp->time = 0; 1413 spin_lock_init(&fmp->lock); 1414 } 1415 1416 /* Internal meter update - process cnt events and update value */ 1417 static void fmeter_update(struct fmeter *fmp) 1418 { 1419 time64_t now; 1420 u32 ticks; 1421 1422 now = ktime_get_seconds(); 1423 ticks = now - fmp->time; 1424 1425 if (ticks == 0) 1426 return; 1427 1428 ticks = min(FM_MAXTICKS, ticks); 1429 while (ticks-- > 0) 1430 fmp->val = (FM_COEF * fmp->val) / FM_SCALE; 1431 fmp->time = now; 1432 1433 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; 1434 fmp->cnt = 0; 1435 } 1436 1437 /* Process any previous ticks, then bump cnt by one (times scale). */ 1438 static void fmeter_markevent(struct fmeter *fmp) 1439 { 1440 spin_lock(&fmp->lock); 1441 fmeter_update(fmp); 1442 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); 1443 spin_unlock(&fmp->lock); 1444 } 1445 1446 /* Process any previous ticks, then return current value. */ 1447 static int fmeter_getrate(struct fmeter *fmp) 1448 { 1449 int val; 1450 1451 spin_lock(&fmp->lock); 1452 fmeter_update(fmp); 1453 val = fmp->val; 1454 spin_unlock(&fmp->lock); 1455 return val; 1456 } 1457 1458 static struct cpuset *cpuset_attach_old_cs; 1459 1460 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1461 static int cpuset_can_attach(struct cgroup_taskset *tset) 1462 { 1463 struct cgroup_subsys_state *css; 1464 struct cpuset *cs; 1465 struct task_struct *task; 1466 int ret; 1467 1468 /* used later by cpuset_attach() */ 1469 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); 1470 cs = css_cs(css); 1471 1472 mutex_lock(&cpuset_mutex); 1473 1474 /* allow moving tasks into an empty cpuset if on default hierarchy */ 1475 ret = -ENOSPC; 1476 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 1477 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1478 goto out_unlock; 1479 1480 cgroup_taskset_for_each(task, css, tset) { 1481 ret = task_can_attach(task, cs->cpus_allowed); 1482 if (ret) 1483 goto out_unlock; 1484 ret = security_task_setscheduler(task); 1485 if (ret) 1486 goto out_unlock; 1487 } 1488 1489 /* 1490 * Mark attach is in progress. This makes validate_change() fail 1491 * changes which zero cpus/mems_allowed. 1492 */ 1493 cs->attach_in_progress++; 1494 ret = 0; 1495 out_unlock: 1496 mutex_unlock(&cpuset_mutex); 1497 return ret; 1498 } 1499 1500 static void cpuset_cancel_attach(struct cgroup_taskset *tset) 1501 { 1502 struct cgroup_subsys_state *css; 1503 struct cpuset *cs; 1504 1505 cgroup_taskset_first(tset, &css); 1506 cs = css_cs(css); 1507 1508 mutex_lock(&cpuset_mutex); 1509 css_cs(css)->attach_in_progress--; 1510 mutex_unlock(&cpuset_mutex); 1511 } 1512 1513 /* 1514 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() 1515 * but we can't allocate it dynamically there. Define it global and 1516 * allocate from cpuset_init(). 
 */
static cpumask_var_t cpus_attach;

static void cpuset_attach(struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_mutex */
	static nodemask_t cpuset_attach_nodemask_to;
	struct task_struct *task;
	struct task_struct *leader;
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct cpuset *oldcs = cpuset_attach_old_cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	mutex_lock(&cpuset_mutex);

	/* prepare for attach */
	if (cs == &top_cpuset)
		cpumask_copy(cpus_attach, cpu_possible_mask);
	else
		guarantee_online_cpus(cs, cpus_attach);

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, css, tset) {
		/*
		 * can_attach beforehand should guarantee that this doesn't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm for all threadgroup leaders.  This is expensive and may
	 * sleep and should be moved outside migration path proper.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (mm) {
			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

			/*
			 * old_mems_allowed is the same as mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug.  In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodemask to
			 * migrate the mm from.
1572 */ 1573 if (is_memory_migrate(cs)) 1574 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 1575 &cpuset_attach_nodemask_to); 1576 else 1577 mmput(mm); 1578 } 1579 } 1580 1581 cs->old_mems_allowed = cpuset_attach_nodemask_to; 1582 1583 cs->attach_in_progress--; 1584 if (!cs->attach_in_progress) 1585 wake_up(&cpuset_attach_wq); 1586 1587 mutex_unlock(&cpuset_mutex); 1588 } 1589 1590 /* The various types of files and directories in a cpuset file system */ 1591 1592 typedef enum { 1593 FILE_MEMORY_MIGRATE, 1594 FILE_CPULIST, 1595 FILE_MEMLIST, 1596 FILE_EFFECTIVE_CPULIST, 1597 FILE_EFFECTIVE_MEMLIST, 1598 FILE_CPU_EXCLUSIVE, 1599 FILE_MEM_EXCLUSIVE, 1600 FILE_MEM_HARDWALL, 1601 FILE_SCHED_LOAD_BALANCE, 1602 FILE_SCHED_RELAX_DOMAIN_LEVEL, 1603 FILE_MEMORY_PRESSURE_ENABLED, 1604 FILE_MEMORY_PRESSURE, 1605 FILE_SPREAD_PAGE, 1606 FILE_SPREAD_SLAB, 1607 } cpuset_filetype_t; 1608 1609 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, 1610 u64 val) 1611 { 1612 struct cpuset *cs = css_cs(css); 1613 cpuset_filetype_t type = cft->private; 1614 int retval = 0; 1615 1616 mutex_lock(&cpuset_mutex); 1617 if (!is_cpuset_online(cs)) { 1618 retval = -ENODEV; 1619 goto out_unlock; 1620 } 1621 1622 switch (type) { 1623 case FILE_CPU_EXCLUSIVE: 1624 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); 1625 break; 1626 case FILE_MEM_EXCLUSIVE: 1627 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); 1628 break; 1629 case FILE_MEM_HARDWALL: 1630 retval = update_flag(CS_MEM_HARDWALL, cs, val); 1631 break; 1632 case FILE_SCHED_LOAD_BALANCE: 1633 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); 1634 break; 1635 case FILE_MEMORY_MIGRATE: 1636 retval = update_flag(CS_MEMORY_MIGRATE, cs, val); 1637 break; 1638 case FILE_MEMORY_PRESSURE_ENABLED: 1639 cpuset_memory_pressure_enabled = !!val; 1640 break; 1641 case FILE_SPREAD_PAGE: 1642 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1643 break; 1644 case FILE_SPREAD_SLAB: 1645 retval = update_flag(CS_SPREAD_SLAB, cs, val); 1646 break; 1647 default: 1648 retval = -EINVAL; 1649 break; 1650 } 1651 out_unlock: 1652 mutex_unlock(&cpuset_mutex); 1653 return retval; 1654 } 1655 1656 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, 1657 s64 val) 1658 { 1659 struct cpuset *cs = css_cs(css); 1660 cpuset_filetype_t type = cft->private; 1661 int retval = -ENODEV; 1662 1663 mutex_lock(&cpuset_mutex); 1664 if (!is_cpuset_online(cs)) 1665 goto out_unlock; 1666 1667 switch (type) { 1668 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1669 retval = update_relax_domain_level(cs, val); 1670 break; 1671 default: 1672 retval = -EINVAL; 1673 break; 1674 } 1675 out_unlock: 1676 mutex_unlock(&cpuset_mutex); 1677 return retval; 1678 } 1679 1680 /* 1681 * Common handling for a write to a "cpus" or "mems" file. 1682 */ 1683 static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, 1684 char *buf, size_t nbytes, loff_t off) 1685 { 1686 struct cpuset *cs = css_cs(of_css(of)); 1687 struct cpuset *trialcs; 1688 int retval = -ENODEV; 1689 1690 buf = strstrip(buf); 1691 1692 /* 1693 * CPU or memory hotunplug may leave @cs w/o any execution 1694 * resources, in which case the hotplug code asynchronously updates 1695 * configuration and transfers all tasks to the nearest ancestor 1696 * which can execute. 
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up repeatedly removing tasks
	 * that were added after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_mutex anyway.  This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	flush_workqueue(cpuset_migrate_mm_wq);
	return retval ?: nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, a list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
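 *
 * A minimal userspace sketch of such a single-call read (illustrative
 * only; the mount point, path and buffer size are assumptions, and the
 * snippet needs <fcntl.h> and <unistd.h>):
 *
 *	char buf[4096];
 *	int fd = open("/sys/fs/cgroup/cpuset/cpuset.cpus", O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf) - 1);	// one read(), whole list
 *	if (n >= 0)
 *		buf[n] = '\0';
 *	close(fd);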
1753 */ 1754 static int cpuset_common_seq_show(struct seq_file *sf, void *v) 1755 { 1756 struct cpuset *cs = css_cs(seq_css(sf)); 1757 cpuset_filetype_t type = seq_cft(sf)->private; 1758 int ret = 0; 1759 1760 spin_lock_irq(&callback_lock); 1761 1762 switch (type) { 1763 case FILE_CPULIST: 1764 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); 1765 break; 1766 case FILE_MEMLIST: 1767 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); 1768 break; 1769 case FILE_EFFECTIVE_CPULIST: 1770 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); 1771 break; 1772 case FILE_EFFECTIVE_MEMLIST: 1773 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); 1774 break; 1775 default: 1776 ret = -EINVAL; 1777 } 1778 1779 spin_unlock_irq(&callback_lock); 1780 return ret; 1781 } 1782 1783 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 1784 { 1785 struct cpuset *cs = css_cs(css); 1786 cpuset_filetype_t type = cft->private; 1787 switch (type) { 1788 case FILE_CPU_EXCLUSIVE: 1789 return is_cpu_exclusive(cs); 1790 case FILE_MEM_EXCLUSIVE: 1791 return is_mem_exclusive(cs); 1792 case FILE_MEM_HARDWALL: 1793 return is_mem_hardwall(cs); 1794 case FILE_SCHED_LOAD_BALANCE: 1795 return is_sched_load_balance(cs); 1796 case FILE_MEMORY_MIGRATE: 1797 return is_memory_migrate(cs); 1798 case FILE_MEMORY_PRESSURE_ENABLED: 1799 return cpuset_memory_pressure_enabled; 1800 case FILE_MEMORY_PRESSURE: 1801 return fmeter_getrate(&cs->fmeter); 1802 case FILE_SPREAD_PAGE: 1803 return is_spread_page(cs); 1804 case FILE_SPREAD_SLAB: 1805 return is_spread_slab(cs); 1806 default: 1807 BUG(); 1808 } 1809 1810 /* Unreachable but makes gcc happy */ 1811 return 0; 1812 } 1813 1814 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) 1815 { 1816 struct cpuset *cs = css_cs(css); 1817 cpuset_filetype_t type = cft->private; 1818 switch (type) { 1819 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1820 return cs->relax_domain_level; 1821 default: 1822 BUG(); 1823 } 1824 1825 /* Unrechable but makes gcc happy */ 1826 return 0; 1827 } 1828 1829 1830 /* 1831 * for the common functions, 'private' gives the type of file 1832 */ 1833 1834 static struct cftype files[] = { 1835 { 1836 .name = "cpus", 1837 .seq_show = cpuset_common_seq_show, 1838 .write = cpuset_write_resmask, 1839 .max_write_len = (100U + 6 * NR_CPUS), 1840 .private = FILE_CPULIST, 1841 }, 1842 1843 { 1844 .name = "mems", 1845 .seq_show = cpuset_common_seq_show, 1846 .write = cpuset_write_resmask, 1847 .max_write_len = (100U + 6 * MAX_NUMNODES), 1848 .private = FILE_MEMLIST, 1849 }, 1850 1851 { 1852 .name = "effective_cpus", 1853 .seq_show = cpuset_common_seq_show, 1854 .private = FILE_EFFECTIVE_CPULIST, 1855 }, 1856 1857 { 1858 .name = "effective_mems", 1859 .seq_show = cpuset_common_seq_show, 1860 .private = FILE_EFFECTIVE_MEMLIST, 1861 }, 1862 1863 { 1864 .name = "cpu_exclusive", 1865 .read_u64 = cpuset_read_u64, 1866 .write_u64 = cpuset_write_u64, 1867 .private = FILE_CPU_EXCLUSIVE, 1868 }, 1869 1870 { 1871 .name = "mem_exclusive", 1872 .read_u64 = cpuset_read_u64, 1873 .write_u64 = cpuset_write_u64, 1874 .private = FILE_MEM_EXCLUSIVE, 1875 }, 1876 1877 { 1878 .name = "mem_hardwall", 1879 .read_u64 = cpuset_read_u64, 1880 .write_u64 = cpuset_write_u64, 1881 .private = FILE_MEM_HARDWALL, 1882 }, 1883 1884 { 1885 .name = "sched_load_balance", 1886 .read_u64 = cpuset_read_u64, 1887 .write_u64 = cpuset_write_u64, 1888 .private = FILE_SCHED_LOAD_BALANCE, 1889 }, 1890 1891 { 1892 .name = 
"sched_relax_domain_level", 1893 .read_s64 = cpuset_read_s64, 1894 .write_s64 = cpuset_write_s64, 1895 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 1896 }, 1897 1898 { 1899 .name = "memory_migrate", 1900 .read_u64 = cpuset_read_u64, 1901 .write_u64 = cpuset_write_u64, 1902 .private = FILE_MEMORY_MIGRATE, 1903 }, 1904 1905 { 1906 .name = "memory_pressure", 1907 .read_u64 = cpuset_read_u64, 1908 }, 1909 1910 { 1911 .name = "memory_spread_page", 1912 .read_u64 = cpuset_read_u64, 1913 .write_u64 = cpuset_write_u64, 1914 .private = FILE_SPREAD_PAGE, 1915 }, 1916 1917 { 1918 .name = "memory_spread_slab", 1919 .read_u64 = cpuset_read_u64, 1920 .write_u64 = cpuset_write_u64, 1921 .private = FILE_SPREAD_SLAB, 1922 }, 1923 1924 { 1925 .name = "memory_pressure_enabled", 1926 .flags = CFTYPE_ONLY_ON_ROOT, 1927 .read_u64 = cpuset_read_u64, 1928 .write_u64 = cpuset_write_u64, 1929 .private = FILE_MEMORY_PRESSURE_ENABLED, 1930 }, 1931 1932 { } /* terminate */ 1933 }; 1934 1935 /* 1936 * cpuset_css_alloc - allocate a cpuset css 1937 * cgrp: control group that the new cpuset will be part of 1938 */ 1939 1940 static struct cgroup_subsys_state * 1941 cpuset_css_alloc(struct cgroup_subsys_state *parent_css) 1942 { 1943 struct cpuset *cs; 1944 1945 if (!parent_css) 1946 return &top_cpuset.css; 1947 1948 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1949 if (!cs) 1950 return ERR_PTR(-ENOMEM); 1951 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) 1952 goto free_cs; 1953 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) 1954 goto free_cpus; 1955 1956 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1957 cpumask_clear(cs->cpus_allowed); 1958 nodes_clear(cs->mems_allowed); 1959 cpumask_clear(cs->effective_cpus); 1960 nodes_clear(cs->effective_mems); 1961 fmeter_init(&cs->fmeter); 1962 cs->relax_domain_level = -1; 1963 1964 return &cs->css; 1965 1966 free_cpus: 1967 free_cpumask_var(cs->cpus_allowed); 1968 free_cs: 1969 kfree(cs); 1970 return ERR_PTR(-ENOMEM); 1971 } 1972 1973 static int cpuset_css_online(struct cgroup_subsys_state *css) 1974 { 1975 struct cpuset *cs = css_cs(css); 1976 struct cpuset *parent = parent_cs(cs); 1977 struct cpuset *tmp_cs; 1978 struct cgroup_subsys_state *pos_css; 1979 1980 if (!parent) 1981 return 0; 1982 1983 mutex_lock(&cpuset_mutex); 1984 1985 set_bit(CS_ONLINE, &cs->flags); 1986 if (is_spread_page(parent)) 1987 set_bit(CS_SPREAD_PAGE, &cs->flags); 1988 if (is_spread_slab(parent)) 1989 set_bit(CS_SPREAD_SLAB, &cs->flags); 1990 1991 cpuset_inc(); 1992 1993 spin_lock_irq(&callback_lock); 1994 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { 1995 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1996 cs->effective_mems = parent->effective_mems; 1997 } 1998 spin_unlock_irq(&callback_lock); 1999 2000 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 2001 goto out_unlock; 2002 2003 /* 2004 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 2005 * set. This flag handling is implemented in cgroup core for 2006 * histrical reasons - the flag may be specified during mount. 2007 * 2008 * Currently, if any sibling cpusets have exclusive cpus or mem, we 2009 * refuse to clone the configuration - thereby refusing the task to 2010 * be entered, and as a result refusing the sys_unshare() or 2011 * clone() which initiated it. If this becomes a problem for some 2012 * users who wish to allow that scenario, then this could be 2013 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 2014 * (and likewise for mems) to the new cgroup. 
1973 static int cpuset_css_online(struct cgroup_subsys_state *css)
1974 {
1975         struct cpuset *cs = css_cs(css);
1976         struct cpuset *parent = parent_cs(cs);
1977         struct cpuset *tmp_cs;
1978         struct cgroup_subsys_state *pos_css;
1979
1980         if (!parent)
1981                 return 0;
1982
1983         mutex_lock(&cpuset_mutex);
1984
1985         set_bit(CS_ONLINE, &cs->flags);
1986         if (is_spread_page(parent))
1987                 set_bit(CS_SPREAD_PAGE, &cs->flags);
1988         if (is_spread_slab(parent))
1989                 set_bit(CS_SPREAD_SLAB, &cs->flags);
1990
1991         cpuset_inc();
1992
1993         spin_lock_irq(&callback_lock);
1994         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
1995                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1996                 cs->effective_mems = parent->effective_mems;
1997         }
1998         spin_unlock_irq(&callback_lock);
1999
2000         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
2001                 goto out_unlock;
2002
2003         /*
2004          * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
2005          * set. This flag handling is implemented in cgroup core for
2006          * historical reasons - the flag may be specified during mount.
2007          *
2008          * Currently, if any sibling cpusets have exclusive cpus or mem, we
2009          * refuse to clone the configuration - thereby refusing to let the
2010          * task enter, and as a result refusing the sys_unshare() or
2011          * clone() which initiated it. If this becomes a problem for some
2012          * users who wish to allow that scenario, then this could be
2013          * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
2014          * (and likewise for mems) to the new cgroup.
2015          */
2016         rcu_read_lock();
2017         cpuset_for_each_child(tmp_cs, pos_css, parent) {
2018                 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2019                         rcu_read_unlock();
2020                         goto out_unlock;
2021                 }
2022         }
2023         rcu_read_unlock();
2024
2025         spin_lock_irq(&callback_lock);
2026         cs->mems_allowed = parent->mems_allowed;
2027         cs->effective_mems = parent->mems_allowed;
2028         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2029         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2030         spin_unlock_irq(&callback_lock);
2031 out_unlock:
2032         mutex_unlock(&cpuset_mutex);
2033         return 0;
2034 }
2035
2036 /*
2037  * If the cpuset being removed has its flag 'sched_load_balance'
2038  * enabled, then simulate turning sched_load_balance off, which
2039  * will call rebuild_sched_domains_locked().
2040  */
2041
2042 static void cpuset_css_offline(struct cgroup_subsys_state *css)
2043 {
2044         struct cpuset *cs = css_cs(css);
2045
2046         mutex_lock(&cpuset_mutex);
2047
2048         if (is_sched_load_balance(cs))
2049                 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2050
2051         cpuset_dec();
2052         clear_bit(CS_ONLINE, &cs->flags);
2053
2054         mutex_unlock(&cpuset_mutex);
2055 }
2056
2057 static void cpuset_css_free(struct cgroup_subsys_state *css)
2058 {
2059         struct cpuset *cs = css_cs(css);
2060
2061         free_cpumask_var(cs->effective_cpus);
2062         free_cpumask_var(cs->cpus_allowed);
2063         kfree(cs);
2064 }
2065
2066 static void cpuset_bind(struct cgroup_subsys_state *root_css)
2067 {
2068         mutex_lock(&cpuset_mutex);
2069         spin_lock_irq(&callback_lock);
2070
2071         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
2072                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2073                 top_cpuset.mems_allowed = node_possible_map;
2074         } else {
2075                 cpumask_copy(top_cpuset.cpus_allowed,
2076                              top_cpuset.effective_cpus);
2077                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2078         }
2079
2080         spin_unlock_irq(&callback_lock);
2081         mutex_unlock(&cpuset_mutex);
2082 }
2083
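/*
 * Illustrative sketch (not part of the original source): the clone_children
 * behaviour described above cpuset_css_online() can be exercised on the
 * legacy hierarchy by setting the parent's "cgroup.clone_children" flag
 * before creating a child (the paths below are assumptions):
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	static void make_cloned_child(void)
 *	{
 *		int fd = open("/sys/fs/cgroup/cpuset/parent/cgroup.clone_children",
 *			      O_WRONLY);
 *
 *		if (fd >= 0) {
 *			write(fd, "1", 1);
 *			close(fd);
 *		}
 *		// the new child starts with the parent's cpus and mems
 *		mkdir("/sys/fs/cgroup/cpuset/parent/child", 0755);
 *	}
 *
 * If any sibling is cpu_exclusive or mem_exclusive, the cloning is refused,
 * as the code above explains.
 */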
2084 /*
2085  * Make sure the new task conforms to the current state of its parent,
2086  * which could have been changed by cpuset just after it inherits the
2087  * state from the parent and before it sits on the cgroup's task list.
2088  */
2089 static void cpuset_fork(struct task_struct *task)
2090 {
2091         if (task_css_is_root(task, cpuset_cgrp_id))
2092                 return;
2093
2094         set_cpus_allowed_ptr(task, &current->cpus_allowed);
2095         task->mems_allowed = current->mems_allowed;
2096 }
2097
2098 struct cgroup_subsys cpuset_cgrp_subsys = {
2099         .css_alloc      = cpuset_css_alloc,
2100         .css_online     = cpuset_css_online,
2101         .css_offline    = cpuset_css_offline,
2102         .css_free       = cpuset_css_free,
2103         .can_attach     = cpuset_can_attach,
2104         .cancel_attach  = cpuset_cancel_attach,
2105         .attach         = cpuset_attach,
2106         .post_attach    = cpuset_post_attach,
2107         .bind           = cpuset_bind,
2108         .fork           = cpuset_fork,
2109         .legacy_cftypes = files,
2110         .early_init     = true,
2111 };
2112
2113 /**
2114  * cpuset_init - initialize cpusets at system boot
2115  *
2116  * Description: Initialize top_cpuset and the cpuset internal file system.
2117  **/
2118
2119 int __init cpuset_init(void)
2120 {
2121         int err = 0;
2122
2123         if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
2124                 BUG();
2125         if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2126                 BUG();
2127
2128         cpumask_setall(top_cpuset.cpus_allowed);
2129         nodes_setall(top_cpuset.mems_allowed);
2130         cpumask_setall(top_cpuset.effective_cpus);
2131         nodes_setall(top_cpuset.effective_mems);
2132
2133         fmeter_init(&top_cpuset.fmeter);
2134         set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2135         top_cpuset.relax_domain_level = -1;
2136
2137         err = register_filesystem(&cpuset_fs_type);
2138         if (err < 0)
2139                 return err;
2140
2141         if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
2142                 BUG();
2143
2144         return 0;
2145 }
2146
2147 /*
2148  * If CPU and/or memory hotplug handlers, below, unplug any CPUs
2149  * or memory nodes, we need to walk over the cpuset hierarchy,
2150  * removing that CPU or node from all cpusets. If this removes the
2151  * last CPU or node from a cpuset, then move the tasks in the empty
2152  * cpuset to its next-highest non-empty parent.
2153  */
2154 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2155 {
2156         struct cpuset *parent;
2157
2158         /*
2159          * Find its next-highest non-empty parent (the top cpuset
2160          * has online cpus, so it can't be empty).
2161          */
2162         parent = parent_cs(cs);
2163         while (cpumask_empty(parent->cpus_allowed) ||
2164                nodes_empty(parent->mems_allowed))
2165                 parent = parent_cs(parent);
2166
2167         if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2168                 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2169                 pr_cont_cgroup_name(cs->css.cgroup);
2170                 pr_cont("\n");
2171         }
2172 }
2173
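/*
 * Worked example (editorial, not from the original source): suppose a
 * legacy-hierarchy cpuset A has cpus_allowed = 2-3 and CPUs 2 and 3 are both
 * taken offline. A's masks become empty, so the legacy hotplug path below
 * ends up calling remove_tasks_in_empty_cpuset(A): the walk above skips any
 * ancestor that is also empty, and cgroup_transfer_tasks() moves A's tasks
 * to the nearest ancestor that still has CPUs and memory nodes, typically
 * the root. Neither the tasks nor A's masks are restored automatically when
 * CPUs 2-3 come back online.
 */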
2174 static void
2175 hotplug_update_tasks_legacy(struct cpuset *cs,
2176                             struct cpumask *new_cpus, nodemask_t *new_mems,
2177                             bool cpus_updated, bool mems_updated)
2178 {
2179         bool is_empty;
2180
2181         spin_lock_irq(&callback_lock);
2182         cpumask_copy(cs->cpus_allowed, new_cpus);
2183         cpumask_copy(cs->effective_cpus, new_cpus);
2184         cs->mems_allowed = *new_mems;
2185         cs->effective_mems = *new_mems;
2186         spin_unlock_irq(&callback_lock);
2187
2188         /*
2189          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
2190          * as the tasks will be migrated to an ancestor.
2191          */
2192         if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2193                 update_tasks_cpumask(cs);
2194         if (mems_updated && !nodes_empty(cs->mems_allowed))
2195                 update_tasks_nodemask(cs);
2196
2197         is_empty = cpumask_empty(cs->cpus_allowed) ||
2198                    nodes_empty(cs->mems_allowed);
2199
2200         mutex_unlock(&cpuset_mutex);
2201
2202         /*
2203          * Move tasks to the nearest ancestor with execution resources.
2204          * This is a full cgroup operation which will also call back into
2205          * cpuset. Should be done outside any lock.
2206          */
2207         if (is_empty)
2208                 remove_tasks_in_empty_cpuset(cs);
2209
2210         mutex_lock(&cpuset_mutex);
2211 }
2212
2213 static void
2214 hotplug_update_tasks(struct cpuset *cs,
2215                      struct cpumask *new_cpus, nodemask_t *new_mems,
2216                      bool cpus_updated, bool mems_updated)
2217 {
2218         if (cpumask_empty(new_cpus))
2219                 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2220         if (nodes_empty(*new_mems))
2221                 *new_mems = parent_cs(cs)->effective_mems;
2222
2223         spin_lock_irq(&callback_lock);
2224         cpumask_copy(cs->effective_cpus, new_cpus);
2225         cs->effective_mems = *new_mems;
2226         spin_unlock_irq(&callback_lock);
2227
2228         if (cpus_updated)
2229                 update_tasks_cpumask(cs);
2230         if (mems_updated)
2231                 update_tasks_nodemask(cs);
2232 }
2233
2234 /**
2235  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2236  * @cs: cpuset of interest
2237  *
2238  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2239  * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2240  * all its tasks are moved to the nearest ancestor with both resources.
2241  */
2242 static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2243 {
2244         static cpumask_t new_cpus;
2245         static nodemask_t new_mems;
2246         bool cpus_updated;
2247         bool mems_updated;
2248 retry:
2249         wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2250
2251         mutex_lock(&cpuset_mutex);
2252
2253         /*
2254          * We have raced with task attaching. We wait until attaching
2255          * is finished, so we won't attach a task to an empty cpuset.
2256          */
2257         if (cs->attach_in_progress) {
2258                 mutex_unlock(&cpuset_mutex);
2259                 goto retry;
2260         }
2261
2262         cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2263         nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2264
2265         cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2266         mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2267
2268         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2269                 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2270                                      cpus_updated, mems_updated);
2271         else
2272                 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2273                                             cpus_updated, mems_updated);
2274
2275         mutex_unlock(&cpuset_mutex);
2276 }
2277
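/*
 * Worked example (editorial, not from the original source): with
 * cpus_allowed = 2-3 and CPU 3 going offline, the two helpers above behave
 * differently:
 *
 *  - default hierarchy (hotplug_update_tasks): only effective_cpus is
 *    trimmed to 2; cpus_allowed stays 2-3, so effective_cpus grows back to
 *    2-3 automatically once CPU 3 returns. If every configured CPU goes
 *    away, effective_cpus falls back to the parent's effective_cpus rather
 *    than going empty.
 *
 *  - legacy hierarchy (hotplug_update_tasks_legacy): cpus_allowed itself is
 *    overwritten with 2, and if the cpuset ends up empty its tasks are moved
 *    to an ancestor as described above.
 */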
2278 /**
2279  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2280  *
2281  * This function is called after either CPU or memory configuration has
2282  * changed and updates cpusets accordingly. The top_cpuset is always
2283  * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
2284  * order to make cpusets transparent (of no effect) on systems that are
2285  * actively using CPU hotplug but making no active use of cpusets.
2286  *
2287  * Non-root cpusets are only affected by offlining. If any CPUs or memory
2288  * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
2289  * all descendants.
2290  *
2291  * Note that CPU offlining during suspend is ignored. We don't modify
2292  * cpusets across suspend/resume cycles at all.
2293  */
2294 static void cpuset_hotplug_workfn(struct work_struct *work)
2295 {
2296         static cpumask_t new_cpus;
2297         static nodemask_t new_mems;
2298         bool cpus_updated, mems_updated;
2299         bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
2300
2301         mutex_lock(&cpuset_mutex);
2302
2303         /* fetch the available cpus/mems and find out which changed how */
2304         cpumask_copy(&new_cpus, cpu_active_mask);
2305         new_mems = node_states[N_MEMORY];
2306
2307         cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2308         mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2309
2310         /* synchronize cpus_allowed to cpu_active_mask */
2311         if (cpus_updated) {
2312                 spin_lock_irq(&callback_lock);
2313                 if (!on_dfl)
2314                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2315                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2316                 spin_unlock_irq(&callback_lock);
2317                 /* we don't mess with cpumasks of tasks in top_cpuset */
2318         }
2319
2320         /* synchronize mems_allowed to N_MEMORY */
2321         if (mems_updated) {
2322                 spin_lock_irq(&callback_lock);
2323                 if (!on_dfl)
2324                         top_cpuset.mems_allowed = new_mems;
2325                 top_cpuset.effective_mems = new_mems;
2326                 spin_unlock_irq(&callback_lock);
2327                 update_tasks_nodemask(&top_cpuset);
2328         }
2329
2330         mutex_unlock(&cpuset_mutex);
2331
2332         /* if cpus or mems changed, we need to propagate to descendants */
2333         if (cpus_updated || mems_updated) {
2334                 struct cpuset *cs;
2335                 struct cgroup_subsys_state *pos_css;
2336
2337                 rcu_read_lock();
2338                 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2339                         if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2340                                 continue;
2341                         rcu_read_unlock();
2342
2343                         cpuset_hotplug_update_tasks(cs);
2344
2345                         rcu_read_lock();
2346                         css_put(&cs->css);
2347                 }
2348                 rcu_read_unlock();
2349         }
2350
2351         /* rebuild sched domains if cpus_allowed has changed */
2352         if (cpus_updated)
2353                 rebuild_sched_domains();
2354 }
2355
2356 void cpuset_update_active_cpus(bool cpu_online)
2357 {
2358         /*
2359          * We're inside cpu hotplug critical region which usually nests
2360          * inside cgroup synchronization. Bounce actual hotplug processing
2361          * to a work item to avoid reverse locking order.
2362          *
2363          * We still need to do partition_sched_domains() synchronously;
2364          * otherwise, the scheduler will get confused and put tasks on the
2365          * dead CPU. Fall back to the default single domain.
2366          * cpuset_hotplug_workfn() will rebuild it as necessary.
2367          */
2368         partition_sched_domains(1, NULL, NULL);
2369         schedule_work(&cpuset_hotplug_work);
2370 }
2371
2372 /*
2373  * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2374  * Call this routine anytime after node_states[N_MEMORY] changes.
2375  * See cpuset_update_active_cpus() for CPU hotplug handling.
2376  */
2377 static int cpuset_track_online_nodes(struct notifier_block *self,
2378                                      unsigned long action, void *arg)
2379 {
2380         schedule_work(&cpuset_hotplug_work);
2381         return NOTIFY_OK;
2382 }
2383
2384 static struct notifier_block cpuset_track_online_nodes_nb = {
2385         .notifier_call = cpuset_track_online_nodes,
2386         .priority = 10,         /* ??! */
2387 };
2388
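/*
 * Editorial summary (no new code; every name below is defined in this file
 * or in the scheduler): a CPU hotplug event calls cpuset_update_active_cpus(),
 * which runs partition_sched_domains(1, NULL, NULL) synchronously and then
 * schedules cpuset_hotplug_work; a memory hotplug event reaches
 * cpuset_track_online_nodes(), which schedules the same work item. Both
 * paths therefore funnel into cpuset_hotplug_workfn(), which resynchronizes
 * top_cpuset, updates every descendant via cpuset_hotplug_update_tasks(),
 * and finally calls rebuild_sched_domains() if the set of CPUs changed.
 */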
2389 /**
2390  * cpuset_init_smp - initialize cpus_allowed
2391  *
2392  * Description: Finish top cpuset setup after the cpu and node maps are initialized
2393  */
2394 void __init cpuset_init_smp(void)
2395 {
2396         cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2397         top_cpuset.mems_allowed = node_states[N_MEMORY];
2398         top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2399
2400         cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2401         top_cpuset.effective_mems = node_states[N_MEMORY];
2402
2403         register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2404
2405         cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
2406         BUG_ON(!cpuset_migrate_mm_wq);
2407 }
2408
2409 /**
2410  * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
2411  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2412  * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
2413  *
2414  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2415  * attached to the specified @tsk. Guaranteed to return some non-empty
2416  * subset of cpu_online_mask, even if this means going outside the
2417  * task's cpuset.
2418  **/
2419
2420 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2421 {
2422         unsigned long flags;
2423
2424         spin_lock_irqsave(&callback_lock, flags);
2425         rcu_read_lock();
2426         guarantee_online_cpus(task_cs(tsk), pmask);
2427         rcu_read_unlock();
2428         spin_unlock_irqrestore(&callback_lock, flags);
2429 }
2430
2431 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2432 {
2433         rcu_read_lock();
2434         do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2435         rcu_read_unlock();
2436
2437         /*
2438          * We own tsk->cpus_allowed, nobody can change it under us.
2439          *
2440          * But we used cs && cs->cpus_allowed lockless and thus can
2441          * race with cgroup_attach_task() or update_cpumask() and get
2442          * the wrong tsk->cpus_allowed. However, both cases imply the
2443          * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2444          * which takes task_rq_lock().
2445          *
2446          * If we are called after it dropped the lock we must see all
2447          * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily
2448          * set any mask even if it is not right from the task_cs() pov;
2449          * the pending set_cpus_allowed_ptr() will fix things.
2450          *
2451          * select_fallback_rq() will fix things up and set cpu_possible_mask
2452          * if required.
2453          */
2454 }
2455
2456 void __init cpuset_init_current_mems_allowed(void)
2457 {
2458         nodes_setall(current->mems_allowed);
2459 }
2460
2461 /**
2462  * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
2463  * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
2464  *
2465  * Description: Returns the nodemask_t mems_allowed of the cpuset
2466  * attached to the specified @tsk. Guaranteed to return some non-empty
2467  * subset of node_states[N_MEMORY], even if this means going outside the
2468  * task's cpuset.
2469  **/
2470
2471 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2472 {
2473         nodemask_t mask;
2474         unsigned long flags;
2475
2476         spin_lock_irqsave(&callback_lock, flags);
2477         rcu_read_lock();
2478         guarantee_online_mems(task_cs(tsk), &mask);
2479         rcu_read_unlock();
2480         spin_unlock_irqrestore(&callback_lock, flags);
2481
2482         return mask;
2483 }
2484
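/*
 * Illustrative sketch (editorial, not a caller that exists in this file):
 * the helpers above are meant for callers such as the scheduler or the OOM
 * code that need a usable mask even for a task in a misconfigured cpuset.
 * A hypothetical in-kernel user, assuming <linux/cpuset.h> and
 * <linux/sched.h> are already included, could do:
 *
 *	static void constrain_to_cpuset(struct task_struct *tsk)
 *	{
 *		cpumask_var_t mask;
 *
 *		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 *			return;
 *		cpuset_cpus_allowed(tsk, mask);	// never returns an empty mask
 *		set_cpus_allowed_ptr(tsk, mask);
 *		free_cpumask_var(mask);
 *	}
 *
 * cpuset_mems_allowed() plays the same role for memory nodes.
 */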
2485 /**
2486  * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
2487  * @nodemask: the nodemask to be checked
2488  *
2489  * Are any of the nodes in the nodemask allowed in current->mems_allowed?
2490  */
2491 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2492 {
2493         return nodes_intersects(*nodemask, current->mems_allowed);
2494 }
2495
2496 /*
2497  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
2498  * mem_hardwall ancestor to the specified cpuset. Call holding
2499  * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
2500  * (an unusual configuration), then returns the root cpuset.
2501  */
2502 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2503 {
2504         while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2505                 cs = parent_cs(cs);
2506         return cs;
2507 }
2508
2509 /**
2510  * cpuset_node_allowed - Can we allocate on a memory node?
2511  * @node: is this an allowed node?
2512  * @gfp_mask: memory allocation flags
2513  *
2514  * If we're in interrupt, yes, we can always allocate. If @node is set in
2515  * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
2516  * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
2517  * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
2518  * Otherwise, no.
2519  *
2520  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
2521  * and do not allow allocations outside the current task's cpuset
2522  * unless the task has been OOM killed and is marked TIF_MEMDIE.
2523  * GFP_KERNEL allocations are not so marked, so can escape to the
2524  * nearest enclosing hardwalled ancestor cpuset.
2525  *
2526  * Scanning up parent cpusets requires callback_lock. The
2527  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2528  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2529  * current task's mems_allowed came up empty on the first pass over
2530  * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
2531  * cpuset are short of memory, might require taking the callback_lock.
2532  *
2533  * The first call here from mm/page_alloc:get_page_from_freelist()
2534  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
2535  * so no allocation on a node outside the cpuset is allowed (unless
2536  * in interrupt, of course).
2537  *
2538  * The second pass through get_page_from_freelist() doesn't even call
2539  * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
2540  * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
2541  * in alloc_flags. That logic and the checks below have the combined
2542  * effect that:
2543  *      in_interrupt - any node ok (current task context irrelevant)
2544  *      GFP_ATOMIC   - any node ok
2545  *      TIF_MEMDIE   - any node ok
2546  *      GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
2547  *      GFP_USER     - only nodes in current task's mems_allowed ok.
2548  */
2549 bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
2550 {
2551         struct cpuset *cs;      /* current cpuset ancestors */
2552         int allowed;            /* is allocation in zone z allowed? */
2553         unsigned long flags;
2554
2555         if (in_interrupt())
2556                 return true;
2557         if (node_isset(node, current->mems_allowed))
2558                 return true;
2559         /*
2560          * Allow tasks that have access to memory reserves because they have
2561          * been OOM killed to get memory anywhere.
2562          */
2563         if (unlikely(test_thread_flag(TIF_MEMDIE)))
2564                 return true;
2565         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
2566                 return false;
2567
2568         if (current->flags & PF_EXITING) /* Let dying task have memory */
2569                 return true;
2570
2571         /* Not hardwall and node outside mems_allowed: scan up cpusets */
2572         spin_lock_irqsave(&callback_lock, flags);
2573
2574         rcu_read_lock();
2575         cs = nearest_hardwall_ancestor(task_cs(current));
2576         allowed = node_isset(node, cs->mems_allowed);
2577         rcu_read_unlock();
2578
2579         spin_unlock_irqrestore(&callback_lock, flags);
2580         return allowed;
2581 }
2582
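/*
 * Worked example (editorial, not from the original source): let cpuset A be
 * mem_hardwall with mems_allowed = 0-1, and let its child B have
 * mems_allowed = 1. For a task in B asking for a page on node 0:
 *
 *  - GFP_USER (__GFP_HARDWALL set): denied; node 0 is not in B's
 *    mems_allowed and the scan up the hierarchy is skipped.
 *  - GFP_KERNEL (no __GFP_HARDWALL): allowed, because the nearest
 *    hardwalled ancestor is A and node 0 is in A's mems_allowed.
 *
 * A task in interrupt context or marked TIF_MEMDIE is allowed any node; a
 * dying (PF_EXITING) task is allowed any node for non-hardwall requests.
 */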
2583 /**
2584  * cpuset_mem_spread_node() - On which node to begin search for a file page
2585  * cpuset_slab_spread_node() - On which node to begin search for a slab page
2586  *
2587  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2588  * tasks in a cpuset with is_spread_page or is_spread_slab set),
2589  * and if the memory allocation used cpuset_mem_spread_node()
2590  * to determine on which node to start looking, as it will for
2591  * certain page cache or slab cache pages such as used for file
2592  * system buffers and inode caches, then instead of starting on the
2593  * local node to look for a free page, rather spread the starting
2594  * node around the task's mems_allowed nodes.
2595  *
2596  * We don't have to worry about the returned node being offline
2597  * because "it can't happen", and even if it did, it would be ok.
2598  *
2599  * The routines calling guarantee_online_mems() are careful to
2600  * only set nodes in task->mems_allowed that are online. So it
2601  * should not be possible for the following code to return an
2602  * offline node. But if it did, that would be ok, as this routine
2603  * is not returning the node where the allocation must be, only
2604  * the node where the search should start. The zonelist passed to
2605  * __alloc_pages() will include all nodes. If the slab allocator
2606  * is passed an offline node, it will fall back to the local node.
2607  * See kmem_cache_alloc_node().
2608  */
2609
2610 static int cpuset_spread_node(int *rotor)
2611 {
2612         return *rotor = next_node_in(*rotor, current->mems_allowed);
2613 }
2614
2615 int cpuset_mem_spread_node(void)
2616 {
2617         if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2618                 current->cpuset_mem_spread_rotor =
2619                         node_random(&current->mems_allowed);
2620
2621         return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2622 }
2623
2624 int cpuset_slab_spread_node(void)
2625 {
2626         if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2627                 current->cpuset_slab_spread_rotor =
2628                         node_random(&current->mems_allowed);
2629
2630         return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2631 }
2632
2633 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2634
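/*
 * Worked example (editorial, not from the original source): with
 * current->mems_allowed = { 0, 2, 3 } and the rotor currently at node 0,
 * successive cpuset_mem_spread_node() calls return 2, 3, 0, 2, ... because
 * next_node_in() wraps around the allowed set. A task's first call seeds
 * the rotor with node_random(), so different tasks typically start their
 * round-robin at different nodes.
 */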
2635 /**
2636  * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
2637  * @tsk1: pointer to task_struct of some task.
2638  * @tsk2: pointer to task_struct of some other task.
2639  *
2640  * Description: Return true if @tsk1's mems_allowed intersects the
2641  * mems_allowed of @tsk2. Used by the OOM killer to determine if
2642  * the memory usage of one task might impact the memory available
2643  * to the other.
2644  **/
2645
2646 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2647                                    const struct task_struct *tsk2)
2648 {
2649         return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2650 }
2651
2652 /**
2653  * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
2654  *
2655  * Description: Prints current's name, cpuset name, and cached copy of its
2656  * mems_allowed to the kernel log.
2657  */
2658 void cpuset_print_current_mems_allowed(void)
2659 {
2660         struct cgroup *cgrp;
2661
2662         rcu_read_lock();
2663
2664         cgrp = task_cs(current)->css.cgroup;
2665         pr_info("%s cpuset=", current->comm);
2666         pr_cont_cgroup_name(cgrp);
2667         pr_cont(" mems_allowed=%*pbl\n",
2668                 nodemask_pr_args(&current->mems_allowed));
2669
2670         rcu_read_unlock();
2671 }
2672
2673 /*
2674  * Collection of memory_pressure is suppressed unless
2675  * this flag is enabled by writing "1" to the special
2676  * cpuset file 'memory_pressure_enabled' in the root cpuset.
2677  */
2678
2679 int cpuset_memory_pressure_enabled __read_mostly;
2680
2681 /**
2682  * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
2683  *
2684  * Keep a running average of the rate of synchronous (direct)
2685  * page reclaim efforts initiated by tasks in each cpuset.
2686  *
2687  * This represents the rate at which some task in the cpuset
2688  * ran low on memory on all nodes it was allowed to use, and
2689  * had to enter the kernel's page reclaim code in an effort to
2690  * create more free memory by tossing clean pages or swapping
2691  * or writing dirty pages.
2692  *
2693  * Display to user space in the per-cpuset read-only file
2694  * "memory_pressure". Value displayed is an integer
2695  * representing the recent rate of entry into the synchronous
2696  * (direct) page reclaim by any task attached to the cpuset.
2697  **/
2698
2699 void __cpuset_memory_pressure_bump(void)
2700 {
2701         rcu_read_lock();
2702         fmeter_markevent(&task_cs(current)->fmeter);
2703         rcu_read_unlock();
2704 }
2705
2706 #ifdef CONFIG_PROC_PID_CPUSET
2707 /*
2708  * proc_cpuset_show()
2709  *  - Print task's cpuset path into seq_file.
2710  *  - Used for /proc/<pid>/cpuset.
2711  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2712  *    doesn't really matter if tsk->cpuset changes after we read it,
2713  *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
2714  *    anyway.
2715  */
2716 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2717                      struct pid *pid, struct task_struct *tsk)
2718 {
2719         char *buf;
2720         struct cgroup_subsys_state *css;
2721         int retval;
2722
2723         retval = -ENOMEM;
2724         buf = kmalloc(PATH_MAX, GFP_KERNEL);
2725         if (!buf)
2726                 goto out;
2727
2728         css = task_get_css(tsk, cpuset_cgrp_id);
2729         retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
2730                                 current->nsproxy->cgroup_ns);
2731         css_put(css);
2732         if (retval >= PATH_MAX)
2733                 retval = -ENAMETOOLONG;
2734         if (retval < 0)
2735                 goto out_free;
2736         seq_puts(m, buf);
2737         seq_putc(m, '\n');
2738         retval = 0;
2739 out_free:
2740         kfree(buf);
2741 out:
2742         return retval;
2743 }
2744 #endif /* CONFIG_PROC_PID_CPUSET */
2745
2746 /* Display task mems_allowed in /proc/<pid>/status file. */
2747 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2748 {
2749         seq_printf(m, "Mems_allowed:\t%*pb\n",
2750                    nodemask_pr_args(&task->mems_allowed));
2751         seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
2752                    nodemask_pr_args(&task->mems_allowed));
2753 }
2754
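/*
 * Illustrative sketch (not part of the original source): once
 * "memory_pressure_enabled" has been set to 1 in the root cpuset, the
 * per-cpuset reclaim rate fed by __cpuset_memory_pressure_bump() can be
 * sampled from user space (the path below is an assumption):
 *
 *	#include <stdio.h>
 *
 *	static long read_pressure(void)
 *	{
 *		long val = -1;
 *		FILE *f = fopen("/sys/fs/cgroup/cpuset/mygrp/cpuset.memory_pressure", "r");
 *
 *		if (f) {
 *			if (fscanf(f, "%ld", &val) != 1)
 *				val = -1;
 *			fclose(f);
 *		}
 *		return val;	// recent direct-reclaim entry rate, or -1 on error
 *	}
 */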