1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Scheduler topology setup/handling methods 4 */ 5 6 DEFINE_MUTEX(sched_domains_mutex); 7 8 /* Protected by sched_domains_mutex: */ 9 static cpumask_var_t sched_domains_tmpmask; 10 static cpumask_var_t sched_domains_tmpmask2; 11 12 #ifdef CONFIG_SCHED_DEBUG 13 14 static int __init sched_debug_setup(char *str) 15 { 16 sched_debug_verbose = true; 17 18 return 0; 19 } 20 early_param("sched_verbose", sched_debug_setup); 21 22 static inline bool sched_debug(void) 23 { 24 return sched_debug_verbose; 25 } 26 27 #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, 28 const struct sd_flag_debug sd_flag_debug[] = { 29 #include <linux/sched/sd_flags.h> 30 }; 31 #undef SD_FLAG 32 33 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 34 struct cpumask *groupmask) 35 { 36 struct sched_group *group = sd->groups; 37 unsigned long flags = sd->flags; 38 unsigned int idx; 39 40 cpumask_clear(groupmask); 41 42 printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); 43 printk(KERN_CONT "span=%*pbl level=%s\n", 44 cpumask_pr_args(sched_domain_span(sd)), sd->name); 45 46 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 47 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 48 } 49 if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) { 50 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 51 } 52 53 for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { 54 unsigned int flag = BIT(idx); 55 unsigned int meta_flags = sd_flag_debug[idx].meta_flags; 56 57 if ((meta_flags & SDF_SHARED_CHILD) && sd->child && 58 !(sd->child->flags & flag)) 59 printk(KERN_ERR "ERROR: flag %s set here but not in child\n", 60 sd_flag_debug[idx].name); 61 62 if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && 63 !(sd->parent->flags & flag)) 64 printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", 65 sd_flag_debug[idx].name); 66 } 67 68 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 69 do { 70 if (!group) { 71 printk("\n"); 72 printk(KERN_ERR "ERROR: group is NULL\n"); 73 break; 74 } 75 76 if (cpumask_empty(sched_group_span(group))) { 77 printk(KERN_CONT "\n"); 78 printk(KERN_ERR "ERROR: empty group\n"); 79 break; 80 } 81 82 if (!(sd->flags & SD_OVERLAP) && 83 cpumask_intersects(groupmask, sched_group_span(group))) { 84 printk(KERN_CONT "\n"); 85 printk(KERN_ERR "ERROR: repeated CPUs\n"); 86 break; 87 } 88 89 cpumask_or(groupmask, groupmask, sched_group_span(group)); 90 91 printk(KERN_CONT " %d:{ span=%*pbl", 92 group->sgc->id, 93 cpumask_pr_args(sched_group_span(group))); 94 95 if ((sd->flags & SD_OVERLAP) && 96 !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { 97 printk(KERN_CONT " mask=%*pbl", 98 cpumask_pr_args(group_balance_mask(group))); 99 } 100 101 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) 102 printk(KERN_CONT " cap=%lu", group->sgc->capacity); 103 104 if (group == sd->groups && sd->child && 105 !cpumask_equal(sched_domain_span(sd->child), 106 sched_group_span(group))) { 107 printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); 108 } 109 110 printk(KERN_CONT " }"); 111 112 group = group->next; 113 114 if (group != sd->groups) 115 printk(KERN_CONT ","); 116 117 } while (group != sd->groups); 118 printk(KERN_CONT "\n"); 119 120 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 121 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 122 123 if (sd->parent && 124 !cpumask_subset(groupmask, 
sched_domain_span(sd->parent))) 125 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 126 return 0; 127 } 128 129 static void sched_domain_debug(struct sched_domain *sd, int cpu) 130 { 131 int level = 0; 132 133 if (!sched_debug_verbose) 134 return; 135 136 if (!sd) { 137 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 138 return; 139 } 140 141 printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu); 142 143 for (;;) { 144 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 145 break; 146 level++; 147 sd = sd->parent; 148 if (!sd) 149 break; 150 } 151 } 152 #else /* !CONFIG_SCHED_DEBUG */ 153 154 # define sched_debug_verbose 0 155 # define sched_domain_debug(sd, cpu) do { } while (0) 156 static inline bool sched_debug(void) 157 { 158 return false; 159 } 160 #endif /* CONFIG_SCHED_DEBUG */ 161 162 /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ 163 #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | 164 static const unsigned int SD_DEGENERATE_GROUPS_MASK = 165 #include <linux/sched/sd_flags.h> 166 0; 167 #undef SD_FLAG 168 169 static int sd_degenerate(struct sched_domain *sd) 170 { 171 if (cpumask_weight(sched_domain_span(sd)) == 1) 172 return 1; 173 174 /* Following flags need at least 2 groups */ 175 if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && 176 (sd->groups != sd->groups->next)) 177 return 0; 178 179 /* Following flags don't use groups */ 180 if (sd->flags & (SD_WAKE_AFFINE)) 181 return 0; 182 183 return 1; 184 } 185 186 static int 187 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 188 { 189 unsigned long cflags = sd->flags, pflags = parent->flags; 190 191 if (sd_degenerate(parent)) 192 return 1; 193 194 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 195 return 0; 196 197 /* Flags needing groups don't count if only 1 group in parent */ 198 if (parent->groups == parent->groups->next) 199 pflags &= ~SD_DEGENERATE_GROUPS_MASK; 200 201 if (~cflags & pflags) 202 return 0; 203 204 return 1; 205 } 206 207 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) 208 DEFINE_STATIC_KEY_FALSE(sched_energy_present); 209 unsigned int sysctl_sched_energy_aware = 1; 210 DEFINE_MUTEX(sched_energy_mutex); 211 bool sched_energy_update; 212 213 void rebuild_sched_domains_energy(void) 214 { 215 mutex_lock(&sched_energy_mutex); 216 sched_energy_update = true; 217 rebuild_sched_domains(); 218 sched_energy_update = false; 219 mutex_unlock(&sched_energy_mutex); 220 } 221 222 #ifdef CONFIG_PROC_SYSCTL 223 int sched_energy_aware_handler(struct ctl_table *table, int write, 224 void *buffer, size_t *lenp, loff_t *ppos) 225 { 226 int ret, state; 227 228 if (write && !capable(CAP_SYS_ADMIN)) 229 return -EPERM; 230 231 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 232 if (!ret && write) { 233 state = static_branch_unlikely(&sched_energy_present); 234 if (state != sysctl_sched_energy_aware) 235 rebuild_sched_domains_energy(); 236 } 237 238 return ret; 239 } 240 #endif 241 242 static void free_pd(struct perf_domain *pd) 243 { 244 struct perf_domain *tmp; 245 246 while (pd) { 247 tmp = pd->next; 248 kfree(pd); 249 pd = tmp; 250 } 251 } 252 253 static struct perf_domain *find_pd(struct perf_domain *pd, int cpu) 254 { 255 while (pd) { 256 if (cpumask_test_cpu(cpu, perf_domain_span(pd))) 257 return pd; 258 pd = pd->next; 259 } 260 261 return NULL; 262 } 263 264 static struct perf_domain *pd_init(int cpu) 265 { 266 struct em_perf_domain 
*obj = em_cpu_get(cpu); 267 struct perf_domain *pd; 268 269 if (!obj) { 270 if (sched_debug()) 271 pr_info("%s: no EM found for CPU%d\n", __func__, cpu); 272 return NULL; 273 } 274 275 pd = kzalloc(sizeof(*pd), GFP_KERNEL); 276 if (!pd) 277 return NULL; 278 pd->em_pd = obj; 279 280 return pd; 281 } 282 283 static void perf_domain_debug(const struct cpumask *cpu_map, 284 struct perf_domain *pd) 285 { 286 if (!sched_debug() || !pd) 287 return; 288 289 printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); 290 291 while (pd) { 292 printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", 293 cpumask_first(perf_domain_span(pd)), 294 cpumask_pr_args(perf_domain_span(pd)), 295 em_pd_nr_perf_states(pd->em_pd)); 296 pd = pd->next; 297 } 298 299 printk(KERN_CONT "\n"); 300 } 301 302 static void destroy_perf_domain_rcu(struct rcu_head *rp) 303 { 304 struct perf_domain *pd; 305 306 pd = container_of(rp, struct perf_domain, rcu); 307 free_pd(pd); 308 } 309 310 static void sched_energy_set(bool has_eas) 311 { 312 if (!has_eas && static_branch_unlikely(&sched_energy_present)) { 313 if (sched_debug()) 314 pr_info("%s: stopping EAS\n", __func__); 315 static_branch_disable_cpuslocked(&sched_energy_present); 316 } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) { 317 if (sched_debug()) 318 pr_info("%s: starting EAS\n", __func__); 319 static_branch_enable_cpuslocked(&sched_energy_present); 320 } 321 } 322 323 /* 324 * EAS can be used on a root domain if it meets all the following conditions: 325 * 1. an Energy Model (EM) is available; 326 * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. 327 * 3. no SMT is detected. 328 * 4. the EM complexity is low enough to keep scheduling overheads low; 329 * 5. schedutil is driving the frequency of all CPUs of the rd; 330 * 6. frequency invariance support is present; 331 * 332 * The complexity of the Energy Model is defined as: 333 * 334 * C = nr_pd * (nr_cpus + nr_ps) 335 * 336 * with parameters defined as: 337 * - nr_pd: the number of performance domains 338 * - nr_cpus: the number of CPUs 339 * - nr_ps: the sum of the number of performance states of all performance 340 * domains (for example, on a system with 2 performance domains, 341 * with 10 performance states each, nr_ps = 2 * 10 = 20). 342 * 343 * It is generally not a good idea to use such a model in the wake-up path on 344 * very complex platforms because of the associated scheduling overheads. The 345 * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs 346 * with per-CPU DVFS and less than 8 performance states each, for example. 347 */ 348 #define EM_MAX_COMPLEXITY 2048 349 350 extern struct cpufreq_governor schedutil_gov; 351 static bool build_perf_domains(const struct cpumask *cpu_map) 352 { 353 int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); 354 struct perf_domain *pd = NULL, *tmp; 355 int cpu = cpumask_first(cpu_map); 356 struct root_domain *rd = cpu_rq(cpu)->rd; 357 struct cpufreq_policy *policy; 358 struct cpufreq_governor *gov; 359 360 if (!sysctl_sched_energy_aware) 361 goto free; 362 363 /* EAS is enabled for asymmetric CPU capacity topologies. 
*/ 364 if (!per_cpu(sd_asym_cpucapacity, cpu)) { 365 if (sched_debug()) { 366 pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", 367 cpumask_pr_args(cpu_map)); 368 } 369 goto free; 370 } 371 372 /* EAS definitely does *not* handle SMT */ 373 if (sched_smt_active()) { 374 pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", 375 cpumask_pr_args(cpu_map)); 376 goto free; 377 } 378 379 if (!arch_scale_freq_invariant()) { 380 if (sched_debug()) { 381 pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", 382 cpumask_pr_args(cpu_map)); 383 } 384 goto free; 385 } 386 387 for_each_cpu(i, cpu_map) { 388 /* Skip already covered CPUs. */ 389 if (find_pd(pd, i)) 390 continue; 391 392 /* Do not attempt EAS if schedutil is not being used. */ 393 policy = cpufreq_cpu_get(i); 394 if (!policy) 395 goto free; 396 gov = policy->governor; 397 cpufreq_cpu_put(policy); 398 if (gov != &schedutil_gov) { 399 if (rd->pd) 400 pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", 401 cpumask_pr_args(cpu_map)); 402 goto free; 403 } 404 405 /* Create the new pd and add it to the local list. */ 406 tmp = pd_init(i); 407 if (!tmp) 408 goto free; 409 tmp->next = pd; 410 pd = tmp; 411 412 /* 413 * Count performance domains and performance states for the 414 * complexity check. 415 */ 416 nr_pd++; 417 nr_ps += em_pd_nr_perf_states(pd->em_pd); 418 } 419 420 /* Bail out if the Energy Model complexity is too high. */ 421 if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { 422 WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", 423 cpumask_pr_args(cpu_map)); 424 goto free; 425 } 426 427 perf_domain_debug(cpu_map, pd); 428 429 /* Attach the new list of performance domains to the root domain. */ 430 tmp = rd->pd; 431 rcu_assign_pointer(rd->pd, pd); 432 if (tmp) 433 call_rcu(&tmp->rcu, destroy_perf_domain_rcu); 434 435 return !!pd; 436 437 free: 438 free_pd(pd); 439 tmp = rd->pd; 440 rcu_assign_pointer(rd->pd, NULL); 441 if (tmp) 442 call_rcu(&tmp->rcu, destroy_perf_domain_rcu); 443 444 return false; 445 } 446 #else 447 static void free_pd(struct perf_domain *pd) { } 448 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ 449 450 static void free_rootdomain(struct rcu_head *rcu) 451 { 452 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 453 454 cpupri_cleanup(&rd->cpupri); 455 cpudl_cleanup(&rd->cpudl); 456 free_cpumask_var(rd->dlo_mask); 457 free_cpumask_var(rd->rto_mask); 458 free_cpumask_var(rd->online); 459 free_cpumask_var(rd->span); 460 free_pd(rd->pd); 461 kfree(rd); 462 } 463 464 void rq_attach_root(struct rq *rq, struct root_domain *rd) 465 { 466 struct root_domain *old_rd = NULL; 467 unsigned long flags; 468 469 raw_spin_rq_lock_irqsave(rq, flags); 470 471 if (rq->rd) { 472 old_rd = rq->rd; 473 474 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 475 set_rq_offline(rq); 476 477 cpumask_clear_cpu(rq->cpu, old_rd->span); 478 479 /* 480 * If we dont want to free the old_rd yet then 481 * set old_rd to NULL to skip the freeing later 482 * in this function: 483 */ 484 if (!atomic_dec_and_test(&old_rd->refcount)) 485 old_rd = NULL; 486 } 487 488 atomic_inc(&rd->refcount); 489 rq->rd = rd; 490 491 cpumask_set_cpu(rq->cpu, rd->span); 492 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 493 set_rq_online(rq); 494 495 raw_spin_rq_unlock_irqrestore(rq, flags); 496 497 if (old_rd) 498 call_rcu(&old_rd->rcu, free_rootdomain); 499 } 500 501 void sched_get_rd(struct root_domain *rd) 502 { 503 atomic_inc(&rd->refcount); 504 } 505 
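/*
 * Illustrative sketch (not part of this file, hence #if 0): a stand-alone
 * user-space version of the complexity cut-off applied in
 * build_perf_domains() above, evaluated for the "16 CPUs with per-CPU DVFS"
 * example from the EM_MAX_COMPLEXITY comment. The helper name
 * em_complexity() is made up for this illustration.
 */
#if 0
#include <stdio.h>

static int em_complexity(int nr_pd, int nr_cpus, int nr_ps)
{
	/* C = nr_pd * (nr_cpus + nr_ps), as documented above */
	return nr_pd * (nr_cpus + nr_ps);
}

int main(void)
{
	/* 16 per-CPU perf domains, 7 perf states each: 16 * (16 + 112) == 2048, not above the limit, EAS allowed */
	printf("7 states: %d\n", em_complexity(16, 16, 16 * 7));
	/* 8 perf states each: 16 * (16 + 128) == 2304 > 2048, EAS rejected */
	printf("8 states: %d\n", em_complexity(16, 16, 16 * 8));
	return 0;
}
#endif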
506 void sched_put_rd(struct root_domain *rd) 507 { 508 if (!atomic_dec_and_test(&rd->refcount)) 509 return; 510 511 call_rcu(&rd->rcu, free_rootdomain); 512 } 513 514 static int init_rootdomain(struct root_domain *rd) 515 { 516 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) 517 goto out; 518 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) 519 goto free_span; 520 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 521 goto free_online; 522 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 523 goto free_dlo_mask; 524 525 #ifdef HAVE_RT_PUSH_IPI 526 rd->rto_cpu = -1; 527 raw_spin_lock_init(&rd->rto_lock); 528 rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); 529 #endif 530 531 rd->visit_gen = 0; 532 init_dl_bw(&rd->dl_bw); 533 if (cpudl_init(&rd->cpudl) != 0) 534 goto free_rto_mask; 535 536 if (cpupri_init(&rd->cpupri) != 0) 537 goto free_cpudl; 538 return 0; 539 540 free_cpudl: 541 cpudl_cleanup(&rd->cpudl); 542 free_rto_mask: 543 free_cpumask_var(rd->rto_mask); 544 free_dlo_mask: 545 free_cpumask_var(rd->dlo_mask); 546 free_online: 547 free_cpumask_var(rd->online); 548 free_span: 549 free_cpumask_var(rd->span); 550 out: 551 return -ENOMEM; 552 } 553 554 /* 555 * By default the system creates a single root-domain with all CPUs as 556 * members (mimicking the global state we have today). 557 */ 558 struct root_domain def_root_domain; 559 560 void init_defrootdomain(void) 561 { 562 init_rootdomain(&def_root_domain); 563 564 atomic_set(&def_root_domain.refcount, 1); 565 } 566 567 static struct root_domain *alloc_rootdomain(void) 568 { 569 struct root_domain *rd; 570 571 rd = kzalloc(sizeof(*rd), GFP_KERNEL); 572 if (!rd) 573 return NULL; 574 575 if (init_rootdomain(rd) != 0) { 576 kfree(rd); 577 return NULL; 578 } 579 580 return rd; 581 } 582 583 static void free_sched_groups(struct sched_group *sg, int free_sgc) 584 { 585 struct sched_group *tmp, *first; 586 587 if (!sg) 588 return; 589 590 first = sg; 591 do { 592 tmp = sg->next; 593 594 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) 595 kfree(sg->sgc); 596 597 if (atomic_dec_and_test(&sg->ref)) 598 kfree(sg); 599 sg = tmp; 600 } while (sg != first); 601 } 602 603 static void destroy_sched_domain(struct sched_domain *sd) 604 { 605 /* 606 * A normal sched domain may have multiple group references, an 607 * overlapping domain, having private groups, only one. Iterate, 608 * dropping group/capacity references, freeing where none remain. 609 */ 610 free_sched_groups(sd->groups, 1); 611 612 if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) 613 kfree(sd->shared); 614 kfree(sd); 615 } 616 617 static void destroy_sched_domains_rcu(struct rcu_head *rcu) 618 { 619 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 620 621 while (sd) { 622 struct sched_domain *parent = sd->parent; 623 destroy_sched_domain(sd); 624 sd = parent; 625 } 626 } 627 628 static void destroy_sched_domains(struct sched_domain *sd) 629 { 630 if (sd) 631 call_rcu(&sd->rcu, destroy_sched_domains_rcu); 632 } 633 634 /* 635 * Keep a special pointer to the highest sched_domain that has 636 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 637 * allows us to avoid some pointer chasing select_idle_sibling(). 638 * 639 * Also keep a unique ID per domain (we use the first CPU number in 640 * the cpumask of the domain), this allows us to quickly tell if 641 * two CPUs are in the same cache domain, see cpus_share_cache(). 
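 *
 * For example, on the 8-CPU machine used in the package topology comment
 * further down (SMT pairs, two 4-CPU last-level caches), sd_llc for CPUs 0-3
 * points at their MC domain, sd_llc_size is 4 and sd_llc_id is 0, so
 * cpus_share_cache(1, 3) is true while cpus_share_cache(1, 4) is false.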
642 */ 643 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); 644 DEFINE_PER_CPU(int, sd_llc_size); 645 DEFINE_PER_CPU(int, sd_llc_id); 646 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); 647 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); 648 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); 649 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); 650 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); 651 652 static void update_top_cache_domain(int cpu) 653 { 654 struct sched_domain_shared *sds = NULL; 655 struct sched_domain *sd; 656 int id = cpu; 657 int size = 1; 658 659 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 660 if (sd) { 661 id = cpumask_first(sched_domain_span(sd)); 662 size = cpumask_weight(sched_domain_span(sd)); 663 sds = sd->shared; 664 } 665 666 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 667 per_cpu(sd_llc_size, cpu) = size; 668 per_cpu(sd_llc_id, cpu) = id; 669 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); 670 671 sd = lowest_flag_domain(cpu, SD_NUMA); 672 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 673 674 sd = highest_flag_domain(cpu, SD_ASYM_PACKING); 675 rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); 676 677 sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); 678 rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); 679 } 680 681 /* 682 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 683 * hold the hotplug lock. 684 */ 685 static void 686 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 687 { 688 struct rq *rq = cpu_rq(cpu); 689 struct sched_domain *tmp; 690 691 /* Remove the sched domains which do not contribute to scheduling. */ 692 for (tmp = sd; tmp; ) { 693 struct sched_domain *parent = tmp->parent; 694 if (!parent) 695 break; 696 697 if (sd_parent_degenerate(tmp, parent)) { 698 tmp->parent = parent->parent; 699 if (parent->parent) 700 parent->parent->child = tmp; 701 /* 702 * Transfer SD_PREFER_SIBLING down in case of a 703 * degenerate parent; the spans match for this 704 * so the property transfers. 705 */ 706 if (parent->flags & SD_PREFER_SIBLING) 707 tmp->flags |= SD_PREFER_SIBLING; 708 destroy_sched_domain(parent); 709 } else 710 tmp = tmp->parent; 711 } 712 713 if (sd && sd_degenerate(sd)) { 714 tmp = sd; 715 sd = sd->parent; 716 destroy_sched_domain(tmp); 717 if (sd) { 718 struct sched_group *sg = sd->groups; 719 720 /* 721 * sched groups hold the flags of the child sched 722 * domain for convenience. Clear such flags since 723 * the child is being destroyed. 724 */ 725 do { 726 sg->flags = 0; 727 } while (sg != sd->groups); 728 729 sd->child = NULL; 730 } 731 } 732 733 sched_domain_debug(sd, cpu); 734 735 rq_attach_root(rq, rd); 736 tmp = rq->sd; 737 rcu_assign_pointer(rq->sd, sd); 738 dirty_sched_domain_sysctl(cpu); 739 destroy_sched_domains(tmp); 740 741 update_top_cache_domain(cpu); 742 } 743 744 struct s_data { 745 struct sched_domain * __percpu *sd; 746 struct root_domain *rd; 747 }; 748 749 enum s_alloc { 750 sa_rootdomain, 751 sa_sd, 752 sa_sd_storage, 753 sa_none, 754 }; 755 756 /* 757 * Return the canonical balance CPU for this group, this is the first CPU 758 * of this group that's also in the balance mask. 759 * 760 * The balance mask are all those CPUs that could actually end up at this 761 * group. See build_balance_mask(). 762 * 763 * Also see should_we_balance(). 
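 *
 * For example, in the 4-node ring topology described below, the NUMA-2 group
 * spanning nodes 0-2 (built for node 1) can only be arrived at from node 1
 * itself, since only node 1's child domain spans exactly 0-2; its balance
 * mask therefore holds just node 1's CPUs and group_balance_cpu() returns
 * the first of them.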
 */
int group_balance_cpu(struct sched_group *sg)
{
	return cpumask_first(group_balance_mask(sg));
}


/*
 * NUMA topology (first read the regular topology blurb below)
 *
 * Given a node-distance table, for example:
 *
 *   node   0   1   2   3
 *     0:  10  20  30  20
 *     1:  20  10  20  30
 *     2:  30  20  10  20
 *     3:  20  30  20  10
 *
 * which represents a 4 node ring topology like:
 *
 *   0 ----- 1
 *   |       |
 *   |       |
 *   |       |
 *   3 ----- 2
 *
 * We want to construct domains and groups to represent this. The way we go
 * about doing this is to build the domains on 'hops'. For each NUMA level we
 * construct the mask of all nodes reachable in @level hops.
 *
 * For the above NUMA topology that gives 3 levels:
 *
 * NUMA-2	0-3		0-3		0-3		0-3
 *  groups:	{0-1,3},{1-3}	{0-2},{0,2-3}	{1-3},{0-1,3}	{0,2-3},{0-2}
 *
 * NUMA-1	0-1,3		0-2		1-3		0,2-3
 *  groups:	{0},{1},{3}	{0},{1},{2}	{1},{2},{3}	{0},{2},{3}
 *
 * NUMA-0	0		1		2		3
 *
 *
 * As can be seen, things don't nicely line up as with the regular topology.
 * When we iterate a domain in child domain chunks some nodes can be
 * represented multiple times -- hence the "overlap" naming for this part of
 * the topology.
 *
 * In order to minimize this overlap, we only build enough groups to cover the
 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
 *
 * Because:
 *
 *  - the first group of each domain is its child domain; this
 *    gets us the first 0-1,3
 *  - the only uncovered node is 2, whose child domain is 1-3.
 *
 * However, because of the overlap, computing a unique CPU for each group is
 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
 * end up at those groups (they would end up in group: 0-1,3).
 *
 * To correct this we have to introduce the group balance mask. This mask
 * will contain those CPUs in the group that can reach this group given the
 * (child) domain tree.
 *
 * With this we can once again compute balance_cpu and sched_group_capacity
 * relations.
 *
 * XXX include words on how balance_cpu is unique and therefore can be
 * used for sched_group_capacity links.
 *
 *
 * Another 'interesting' topology is:
 *
 *   node   0   1   2   3
 *     0:  10  20  20  30
 *     1:  20  10  20  20
 *     2:  20  20  10  20
 *     3:  30  20  20  10
 *
 * Which looks a little like:
 *
 *   0 ----- 1
 *   |     / |
 *   |   /   |
 *   | /     |
 *   2 ----- 3
 *
 * This topology is asymmetric: nodes 1,2 are fully connected, but nodes 0,3
 * are not.
 *
 * This leads to a few particularly weird cases where the number of
 * sched_domains is not the same for each CPU. Consider:
 *
 * NUMA-2	0-3						0-3
 *  groups:	{0-2},{1-3}					{1-3},{0-2}
 *
 * NUMA-1	0-2		0-3		0-3		1-3
 *
 * NUMA-0	0		1		2		3
 *
 */


/*
 * Build the balance mask; it contains only those CPUs that can arrive at this
 * group and should be considered to continue balancing.
 *
 * We do this during the group creation pass, therefore the group information
 * isn't complete yet, however since each group represents a (child) domain we
 * can fully construct this using the sched_domain bits (which are already
 * complete).
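 *
 * Concretely: CPU @i ends up in the mask only if @i's own child domain at
 * this level spans exactly this group, i.e. only if CPU @i itself would have
 * built this very group.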
875 */ 876 static void 877 build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) 878 { 879 const struct cpumask *sg_span = sched_group_span(sg); 880 struct sd_data *sdd = sd->private; 881 struct sched_domain *sibling; 882 int i; 883 884 cpumask_clear(mask); 885 886 for_each_cpu(i, sg_span) { 887 sibling = *per_cpu_ptr(sdd->sd, i); 888 889 /* 890 * Can happen in the asymmetric case, where these siblings are 891 * unused. The mask will not be empty because those CPUs that 892 * do have the top domain _should_ span the domain. 893 */ 894 if (!sibling->child) 895 continue; 896 897 /* If we would not end up here, we can't continue from here */ 898 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) 899 continue; 900 901 cpumask_set_cpu(i, mask); 902 } 903 904 /* We must not have empty masks here */ 905 WARN_ON_ONCE(cpumask_empty(mask)); 906 } 907 908 /* 909 * XXX: This creates per-node group entries; since the load-balancer will 910 * immediately access remote memory to construct this group's load-balance 911 * statistics having the groups node local is of dubious benefit. 912 */ 913 static struct sched_group * 914 build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) 915 { 916 struct sched_group *sg; 917 struct cpumask *sg_span; 918 919 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 920 GFP_KERNEL, cpu_to_node(cpu)); 921 922 if (!sg) 923 return NULL; 924 925 sg_span = sched_group_span(sg); 926 if (sd->child) { 927 cpumask_copy(sg_span, sched_domain_span(sd->child)); 928 sg->flags = sd->child->flags; 929 } else { 930 cpumask_copy(sg_span, sched_domain_span(sd)); 931 } 932 933 atomic_inc(&sg->ref); 934 return sg; 935 } 936 937 static void init_overlap_sched_group(struct sched_domain *sd, 938 struct sched_group *sg) 939 { 940 struct cpumask *mask = sched_domains_tmpmask2; 941 struct sd_data *sdd = sd->private; 942 struct cpumask *sg_span; 943 int cpu; 944 945 build_balance_mask(sd, sg, mask); 946 cpu = cpumask_first(mask); 947 948 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); 949 if (atomic_inc_return(&sg->sgc->ref) == 1) 950 cpumask_copy(group_balance_mask(sg), mask); 951 else 952 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); 953 954 /* 955 * Initialize sgc->capacity such that even if we mess up the 956 * domains and no possible iteration will get us here, we won't 957 * die on a /0 trap. 
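	 * The value set here is only a safe default; the real group capacity
	 * is computed later by init_sched_groups_capacity() ->
	 * update_group_capacity() during build_sched_domains().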
958 */ 959 sg_span = sched_group_span(sg); 960 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); 961 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; 962 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; 963 } 964 965 static struct sched_domain * 966 find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) 967 { 968 /* 969 * The proper descendant would be the one whose child won't span out 970 * of sd 971 */ 972 while (sibling->child && 973 !cpumask_subset(sched_domain_span(sibling->child), 974 sched_domain_span(sd))) 975 sibling = sibling->child; 976 977 /* 978 * As we are referencing sgc across different topology level, we need 979 * to go down to skip those sched_domains which don't contribute to 980 * scheduling because they will be degenerated in cpu_attach_domain 981 */ 982 while (sibling->child && 983 cpumask_equal(sched_domain_span(sibling->child), 984 sched_domain_span(sibling))) 985 sibling = sibling->child; 986 987 return sibling; 988 } 989 990 static int 991 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 992 { 993 struct sched_group *first = NULL, *last = NULL, *sg; 994 const struct cpumask *span = sched_domain_span(sd); 995 struct cpumask *covered = sched_domains_tmpmask; 996 struct sd_data *sdd = sd->private; 997 struct sched_domain *sibling; 998 int i; 999 1000 cpumask_clear(covered); 1001 1002 for_each_cpu_wrap(i, span, cpu) { 1003 struct cpumask *sg_span; 1004 1005 if (cpumask_test_cpu(i, covered)) 1006 continue; 1007 1008 sibling = *per_cpu_ptr(sdd->sd, i); 1009 1010 /* 1011 * Asymmetric node setups can result in situations where the 1012 * domain tree is of unequal depth, make sure to skip domains 1013 * that already cover the entire range. 1014 * 1015 * In that case build_sched_domains() will have terminated the 1016 * iteration early and our sibling sd spans will be empty. 1017 * Domains should always include the CPU they're built on, so 1018 * check that. 1019 */ 1020 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 1021 continue; 1022 1023 /* 1024 * Usually we build sched_group by sibling's child sched_domain 1025 * But for machines whose NUMA diameter are 3 or above, we move 1026 * to build sched_group by sibling's proper descendant's child 1027 * domain because sibling's child sched_domain will span out of 1028 * the sched_domain being built as below. 1029 * 1030 * Smallest diameter=3 topology is: 1031 * 1032 * node 0 1 2 3 1033 * 0: 10 20 30 40 1034 * 1: 20 10 20 30 1035 * 2: 30 20 10 20 1036 * 3: 40 30 20 10 1037 * 1038 * 0 --- 1 --- 2 --- 3 1039 * 1040 * NUMA-3 0-3 N/A N/A 0-3 1041 * groups: {0-2},{1-3} {1-3},{0-2} 1042 * 1043 * NUMA-2 0-2 0-3 0-3 1-3 1044 * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} 1045 * 1046 * NUMA-1 0-1 0-2 1-3 2-3 1047 * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} 1048 * 1049 * NUMA-0 0 1 2 3 1050 * 1051 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the 1052 * group span isn't a subset of the domain span. 
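		 * In that case find_descended_sibling() below walks down the
		 * sibling's hierarchy until it reaches a domain whose child
		 * stays inside the span being built, and the group is
		 * constructed from that descendant instead.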
 */
		if (sibling->child &&
		    !cpumask_subset(sched_domain_span(sibling->child), span))
			sibling = find_descended_sibling(sd, sibling);

		sg = build_group_from_child_sched_domain(sibling, cpu);
		if (!sg)
			goto fail;

		sg_span = sched_group_span(sg);
		cpumask_or(covered, covered, sg_span);

		init_overlap_sched_group(sibling, sg);

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
		last->next = first;
	}
	sd->groups = first;

	return 0;

fail:
	free_sched_groups(first, 0);

	return -ENOMEM;
}


/*
 * Package topology (also see the load-balance blurb in fair.c)
 *
 * The scheduler builds a tree structure to represent a number of important
 * topology features. By default (default_topology[]) these include:
 *
 *  - Simultaneous multithreading (SMT)
 *  - Multi-Core Cache (MC)
 *  - Package (DIE)
 *
 * Where the last one more or less denotes everything up to a NUMA node.
 *
 * The tree consists of 3 primary data structures:
 *
 *	sched_domain -> sched_group -> sched_group_capacity
 *	    ^ ^             ^ ^
 *          `-'             `-'
 *
 * The sched_domains are per-CPU and have a two way link (parent & child) and
 * denote the ever growing mask of CPUs belonging to that level of topology.
 *
 * Each sched_domain has a circular (double) linked list of sched_group's, each
 * denoting the domains of the level below (or individual CPUs in case of the
 * first domain level). The sched_group linked by a sched_domain includes the
 * CPU of that sched_domain [*].
 *
 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * DIE  [                             ]
 * MC   [             ] [             ]
 * SMT  [     ] [     ] [     ] [     ]
 *
 *  - or -
 *
 * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
 * MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * One way to think about it is: sched_domain moves you up and down among these
 * topology levels, while sched_group moves you sideways through it, at child
 * domain granularity.
 *
 * sched_group_capacity ensures each unique sched_group has shared storage.
 *
 * There are two related construction problems, both of which require a CPU
 * that uniquely identifies each group (for a given domain):
 *
 *  - The first is the balance_cpu (see should_we_balance() and the
 *    load-balance blurb in fair.c); for each group we only want 1 CPU to
 *    continue balancing at a higher domain.
 *
 *  - The second is the sched_group_capacity; we want all identical groups
 *    to share a single sched_group_capacity.
 *
 * These topologies are exclusive by construction: it is impossible for an
 * SMT thread to belong to multiple cores, and for cores to be part of
 * multiple caches. There is a very clear and unique location for each CPU
 * in the hierarchy.
 *
 * Therefore computing a unique CPU for each group is trivial (the iteration
 * mask is redundant and set to all 1s; all CPUs in a group will end up at
 * _that_ group); we can simply pick the first CPU in each group.
 *
 *
 * [*] in other words, the first group of each domain is its child domain.
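 *
 * For example, in the 8-CPU machine above, CPU 0's MC domain spans 0-3 and
 * its group list is {0-1} -> {2-3} -> back to {0-1}; the first group is
 * exactly CPU 0's SMT (child) domain, and because the groups don't overlap,
 * each group's balance mask is simply its own span.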
1154 */ 1155 1156 static struct sched_group *get_group(int cpu, struct sd_data *sdd) 1157 { 1158 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 1159 struct sched_domain *child = sd->child; 1160 struct sched_group *sg; 1161 bool already_visited; 1162 1163 if (child) 1164 cpu = cpumask_first(sched_domain_span(child)); 1165 1166 sg = *per_cpu_ptr(sdd->sg, cpu); 1167 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); 1168 1169 /* Increase refcounts for claim_allocations: */ 1170 already_visited = atomic_inc_return(&sg->ref) > 1; 1171 /* sgc visits should follow a similar trend as sg */ 1172 WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); 1173 1174 /* If we have already visited that group, it's already initialized. */ 1175 if (already_visited) 1176 return sg; 1177 1178 if (child) { 1179 cpumask_copy(sched_group_span(sg), sched_domain_span(child)); 1180 cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); 1181 sg->flags = child->flags; 1182 } else { 1183 cpumask_set_cpu(cpu, sched_group_span(sg)); 1184 cpumask_set_cpu(cpu, group_balance_mask(sg)); 1185 } 1186 1187 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); 1188 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; 1189 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; 1190 1191 return sg; 1192 } 1193 1194 /* 1195 * build_sched_groups will build a circular linked list of the groups 1196 * covered by the given span, will set each group's ->cpumask correctly, 1197 * and will initialize their ->sgc. 1198 * 1199 * Assumes the sched_domain tree is fully constructed 1200 */ 1201 static int 1202 build_sched_groups(struct sched_domain *sd, int cpu) 1203 { 1204 struct sched_group *first = NULL, *last = NULL; 1205 struct sd_data *sdd = sd->private; 1206 const struct cpumask *span = sched_domain_span(sd); 1207 struct cpumask *covered; 1208 int i; 1209 1210 lockdep_assert_held(&sched_domains_mutex); 1211 covered = sched_domains_tmpmask; 1212 1213 cpumask_clear(covered); 1214 1215 for_each_cpu_wrap(i, span, cpu) { 1216 struct sched_group *sg; 1217 1218 if (cpumask_test_cpu(i, covered)) 1219 continue; 1220 1221 sg = get_group(i, sdd); 1222 1223 cpumask_or(covered, covered, sched_group_span(sg)); 1224 1225 if (!first) 1226 first = sg; 1227 if (last) 1228 last->next = sg; 1229 last = sg; 1230 } 1231 last->next = first; 1232 sd->groups = first; 1233 1234 return 0; 1235 } 1236 1237 /* 1238 * Initialize sched groups cpu_capacity. 1239 * 1240 * cpu_capacity indicates the capacity of sched group, which is used while 1241 * distributing the load between different sched groups in a sched domain. 1242 * Typically cpu_capacity for all the groups in a sched domain will be same 1243 * unless there are asymmetries in the topology. If there are asymmetries, 1244 * group having more cpu_capacity will pickup more load compared to the 1245 * group having less cpu_capacity. 
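 *
 * For example (made-up numbers): a DIE domain with one group of four big
 * CPUs (capacity 1024 each) and one group of four little CPUs (capacity 512
 * each) ends up with group capacities of roughly 4096 and 2048, so the big
 * group is expected to carry about twice the load.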
1246 */ 1247 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) 1248 { 1249 struct sched_group *sg = sd->groups; 1250 1251 WARN_ON(!sg); 1252 1253 do { 1254 int cpu, max_cpu = -1; 1255 1256 sg->group_weight = cpumask_weight(sched_group_span(sg)); 1257 1258 if (!(sd->flags & SD_ASYM_PACKING)) 1259 goto next; 1260 1261 for_each_cpu(cpu, sched_group_span(sg)) { 1262 if (max_cpu < 0) 1263 max_cpu = cpu; 1264 else if (sched_asym_prefer(cpu, max_cpu)) 1265 max_cpu = cpu; 1266 } 1267 sg->asym_prefer_cpu = max_cpu; 1268 1269 next: 1270 sg = sg->next; 1271 } while (sg != sd->groups); 1272 1273 if (cpu != group_balance_cpu(sg)) 1274 return; 1275 1276 update_group_capacity(sd, cpu); 1277 } 1278 1279 /* 1280 * Asymmetric CPU capacity bits 1281 */ 1282 struct asym_cap_data { 1283 struct list_head link; 1284 unsigned long capacity; 1285 unsigned long cpus[]; 1286 }; 1287 1288 /* 1289 * Set of available CPUs grouped by their corresponding capacities 1290 * Each list entry contains a CPU mask reflecting CPUs that share the same 1291 * capacity. 1292 * The lifespan of data is unlimited. 1293 */ 1294 static LIST_HEAD(asym_cap_list); 1295 1296 #define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) 1297 1298 /* 1299 * Verify whether there is any CPU capacity asymmetry in a given sched domain. 1300 * Provides sd_flags reflecting the asymmetry scope. 1301 */ 1302 static inline int 1303 asym_cpu_capacity_classify(const struct cpumask *sd_span, 1304 const struct cpumask *cpu_map) 1305 { 1306 struct asym_cap_data *entry; 1307 int count = 0, miss = 0; 1308 1309 /* 1310 * Count how many unique CPU capacities this domain spans across 1311 * (compare sched_domain CPUs mask with ones representing available 1312 * CPUs capacities). Take into account CPUs that might be offline: 1313 * skip those. 1314 */ 1315 list_for_each_entry(entry, &asym_cap_list, link) { 1316 if (cpumask_intersects(sd_span, cpu_capacity_span(entry))) 1317 ++count; 1318 else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry))) 1319 ++miss; 1320 } 1321 1322 WARN_ON_ONCE(!count && !list_empty(&asym_cap_list)); 1323 1324 /* No asymmetry detected */ 1325 if (count < 2) 1326 return 0; 1327 /* Some of the available CPU capacity values have not been detected */ 1328 if (miss) 1329 return SD_ASYM_CPUCAPACITY; 1330 1331 /* Full asymmetry */ 1332 return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL; 1333 1334 } 1335 1336 static inline void asym_cpu_capacity_update_data(int cpu) 1337 { 1338 unsigned long capacity = arch_scale_cpu_capacity(cpu); 1339 struct asym_cap_data *entry = NULL; 1340 1341 list_for_each_entry(entry, &asym_cap_list, link) { 1342 if (capacity == entry->capacity) 1343 goto done; 1344 } 1345 1346 entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); 1347 if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) 1348 return; 1349 entry->capacity = capacity; 1350 list_add(&entry->link, &asym_cap_list); 1351 done: 1352 __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); 1353 } 1354 1355 /* 1356 * Build-up/update list of CPUs grouped by their capacities 1357 * An update requires explicit request to rebuild sched domains 1358 * with state indicating CPU topology changes. 
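 *
 * For example (hypothetical capacities): with CPUs 0-1 at capacity 1024,
 * 2-3 at 768 and 4-5 at 512, the list holds three entries. A domain spanning
 * only CPUs 0-3 then intersects two of them and misses one, so
 * asym_cpu_capacity_classify() returns SD_ASYM_CPUCAPACITY alone, while a
 * domain spanning all six CPUs also gets SD_ASYM_CPUCAPACITY_FULL.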
1359 */ 1360 static void asym_cpu_capacity_scan(void) 1361 { 1362 struct asym_cap_data *entry, *next; 1363 int cpu; 1364 1365 list_for_each_entry(entry, &asym_cap_list, link) 1366 cpumask_clear(cpu_capacity_span(entry)); 1367 1368 for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) 1369 asym_cpu_capacity_update_data(cpu); 1370 1371 list_for_each_entry_safe(entry, next, &asym_cap_list, link) { 1372 if (cpumask_empty(cpu_capacity_span(entry))) { 1373 list_del(&entry->link); 1374 kfree(entry); 1375 } 1376 } 1377 1378 /* 1379 * Only one capacity value has been detected i.e. this system is symmetric. 1380 * No need to keep this data around. 1381 */ 1382 if (list_is_singular(&asym_cap_list)) { 1383 entry = list_first_entry(&asym_cap_list, typeof(*entry), link); 1384 list_del(&entry->link); 1385 kfree(entry); 1386 } 1387 } 1388 1389 /* 1390 * Initializers for schedule domains 1391 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 1392 */ 1393 1394 static int default_relax_domain_level = -1; 1395 int sched_domain_level_max; 1396 1397 static int __init setup_relax_domain_level(char *str) 1398 { 1399 if (kstrtoint(str, 0, &default_relax_domain_level)) 1400 pr_warn("Unable to set relax_domain_level\n"); 1401 1402 return 1; 1403 } 1404 __setup("relax_domain_level=", setup_relax_domain_level); 1405 1406 static void set_domain_attribute(struct sched_domain *sd, 1407 struct sched_domain_attr *attr) 1408 { 1409 int request; 1410 1411 if (!attr || attr->relax_domain_level < 0) { 1412 if (default_relax_domain_level < 0) 1413 return; 1414 request = default_relax_domain_level; 1415 } else 1416 request = attr->relax_domain_level; 1417 1418 if (sd->level > request) { 1419 /* Turn off idle balance on this domain: */ 1420 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 1421 } 1422 } 1423 1424 static void __sdt_free(const struct cpumask *cpu_map); 1425 static int __sdt_alloc(const struct cpumask *cpu_map); 1426 1427 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 1428 const struct cpumask *cpu_map) 1429 { 1430 switch (what) { 1431 case sa_rootdomain: 1432 if (!atomic_read(&d->rd->refcount)) 1433 free_rootdomain(&d->rd->rcu); 1434 fallthrough; 1435 case sa_sd: 1436 free_percpu(d->sd); 1437 fallthrough; 1438 case sa_sd_storage: 1439 __sdt_free(cpu_map); 1440 fallthrough; 1441 case sa_none: 1442 break; 1443 } 1444 } 1445 1446 static enum s_alloc 1447 __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) 1448 { 1449 memset(d, 0, sizeof(*d)); 1450 1451 if (__sdt_alloc(cpu_map)) 1452 return sa_sd_storage; 1453 d->sd = alloc_percpu(struct sched_domain *); 1454 if (!d->sd) 1455 return sa_sd_storage; 1456 d->rd = alloc_rootdomain(); 1457 if (!d->rd) 1458 return sa_sd; 1459 1460 return sa_rootdomain; 1461 } 1462 1463 /* 1464 * NULL the sd_data elements we've used to build the sched_domain and 1465 * sched_group structure so that the subsequent __free_domain_allocs() 1466 * will not free the data we're using. 
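 *
 * Put differently: whatever is still non-NULL in sd_data after all domains
 * have been built was never claimed and __sdt_free() may kfree() it; the
 * pointers NULLed here are owned by the domain tree and are freed via
 * destroy_sched_domains() instead.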
1467 */ 1468 static void claim_allocations(int cpu, struct sched_domain *sd) 1469 { 1470 struct sd_data *sdd = sd->private; 1471 1472 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 1473 *per_cpu_ptr(sdd->sd, cpu) = NULL; 1474 1475 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) 1476 *per_cpu_ptr(sdd->sds, cpu) = NULL; 1477 1478 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 1479 *per_cpu_ptr(sdd->sg, cpu) = NULL; 1480 1481 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) 1482 *per_cpu_ptr(sdd->sgc, cpu) = NULL; 1483 } 1484 1485 #ifdef CONFIG_NUMA 1486 enum numa_topology_type sched_numa_topology_type; 1487 1488 static int sched_domains_numa_levels; 1489 static int sched_domains_curr_level; 1490 1491 int sched_max_numa_distance; 1492 static int *sched_domains_numa_distance; 1493 static struct cpumask ***sched_domains_numa_masks; 1494 #endif 1495 1496 /* 1497 * SD_flags allowed in topology descriptions. 1498 * 1499 * These flags are purely descriptive of the topology and do not prescribe 1500 * behaviour. Behaviour is artificial and mapped in the below sd_init() 1501 * function: 1502 * 1503 * SD_SHARE_CPUCAPACITY - describes SMT topologies 1504 * SD_SHARE_PKG_RESOURCES - describes shared caches 1505 * SD_NUMA - describes NUMA topologies 1506 * 1507 * Odd one out, which beside describing the topology has a quirk also 1508 * prescribes the desired behaviour that goes along with it: 1509 * 1510 * SD_ASYM_PACKING - describes SMT quirks 1511 */ 1512 #define TOPOLOGY_SD_FLAGS \ 1513 (SD_SHARE_CPUCAPACITY | \ 1514 SD_SHARE_PKG_RESOURCES | \ 1515 SD_NUMA | \ 1516 SD_ASYM_PACKING) 1517 1518 static struct sched_domain * 1519 sd_init(struct sched_domain_topology_level *tl, 1520 const struct cpumask *cpu_map, 1521 struct sched_domain *child, int cpu) 1522 { 1523 struct sd_data *sdd = &tl->data; 1524 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 1525 int sd_id, sd_weight, sd_flags = 0; 1526 struct cpumask *sd_span; 1527 1528 #ifdef CONFIG_NUMA 1529 /* 1530 * Ugly hack to pass state to sd_numa_mask()... 1531 */ 1532 sched_domains_curr_level = tl->numa_level; 1533 #endif 1534 1535 sd_weight = cpumask_weight(tl->mask(cpu)); 1536 1537 if (tl->sd_flags) 1538 sd_flags = (*tl->sd_flags)(); 1539 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, 1540 "wrong sd_flags in topology description\n")) 1541 sd_flags &= TOPOLOGY_SD_FLAGS; 1542 1543 *sd = (struct sched_domain){ 1544 .min_interval = sd_weight, 1545 .max_interval = 2*sd_weight, 1546 .busy_factor = 16, 1547 .imbalance_pct = 117, 1548 1549 .cache_nice_tries = 0, 1550 1551 .flags = 1*SD_BALANCE_NEWIDLE 1552 | 1*SD_BALANCE_EXEC 1553 | 1*SD_BALANCE_FORK 1554 | 0*SD_BALANCE_WAKE 1555 | 1*SD_WAKE_AFFINE 1556 | 0*SD_SHARE_CPUCAPACITY 1557 | 0*SD_SHARE_PKG_RESOURCES 1558 | 0*SD_SERIALIZE 1559 | 1*SD_PREFER_SIBLING 1560 | 0*SD_NUMA 1561 | sd_flags 1562 , 1563 1564 .last_balance = jiffies, 1565 .balance_interval = sd_weight, 1566 .max_newidle_lb_cost = 0, 1567 .last_decay_max_lb_cost = jiffies, 1568 .child = child, 1569 #ifdef CONFIG_SCHED_DEBUG 1570 .name = tl->name, 1571 #endif 1572 }; 1573 1574 sd_span = sched_domain_span(sd); 1575 cpumask_and(sd_span, cpu_map, tl->mask(cpu)); 1576 sd_id = cpumask_first(sd_span); 1577 1578 sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); 1579 1580 WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == 1581 (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), 1582 "CPU capacity asymmetry not supported on SMT\n"); 1583 1584 /* 1585 * Convert topological properties into behaviour. 
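	 * e.g. SMT levels (SD_SHARE_CPUCAPACITY) get a lower imbalance_pct,
	 * cache-sharing levels get cache_nice_tries = 1, and NUMA levels are
	 * serialized and stripped of SD_PREFER_SIBLING, as done below.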
1586 */ 1587 /* Don't attempt to spread across CPUs of different capacities. */ 1588 if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) 1589 sd->child->flags &= ~SD_PREFER_SIBLING; 1590 1591 if (sd->flags & SD_SHARE_CPUCAPACITY) { 1592 sd->imbalance_pct = 110; 1593 1594 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 1595 sd->imbalance_pct = 117; 1596 sd->cache_nice_tries = 1; 1597 1598 #ifdef CONFIG_NUMA 1599 } else if (sd->flags & SD_NUMA) { 1600 sd->cache_nice_tries = 2; 1601 1602 sd->flags &= ~SD_PREFER_SIBLING; 1603 sd->flags |= SD_SERIALIZE; 1604 if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) { 1605 sd->flags &= ~(SD_BALANCE_EXEC | 1606 SD_BALANCE_FORK | 1607 SD_WAKE_AFFINE); 1608 } 1609 1610 #endif 1611 } else { 1612 sd->cache_nice_tries = 1; 1613 } 1614 1615 /* 1616 * For all levels sharing cache; connect a sched_domain_shared 1617 * instance. 1618 */ 1619 if (sd->flags & SD_SHARE_PKG_RESOURCES) { 1620 sd->shared = *per_cpu_ptr(sdd->sds, sd_id); 1621 atomic_inc(&sd->shared->ref); 1622 atomic_set(&sd->shared->nr_busy_cpus, sd_weight); 1623 } 1624 1625 sd->private = sdd; 1626 1627 return sd; 1628 } 1629 1630 /* 1631 * Topology list, bottom-up. 1632 */ 1633 static struct sched_domain_topology_level default_topology[] = { 1634 #ifdef CONFIG_SCHED_SMT 1635 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 1636 #endif 1637 1638 #ifdef CONFIG_SCHED_CLUSTER 1639 { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, 1640 #endif 1641 1642 #ifdef CONFIG_SCHED_MC 1643 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 1644 #endif 1645 { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 1646 { NULL, }, 1647 }; 1648 1649 static struct sched_domain_topology_level *sched_domain_topology = 1650 default_topology; 1651 static struct sched_domain_topology_level *sched_domain_topology_saved; 1652 1653 #define for_each_sd_topology(tl) \ 1654 for (tl = sched_domain_topology; tl->mask; tl++) 1655 1656 void set_sched_topology(struct sched_domain_topology_level *tl) 1657 { 1658 if (WARN_ON_ONCE(sched_smp_initialized)) 1659 return; 1660 1661 sched_domain_topology = tl; 1662 sched_domain_topology_saved = NULL; 1663 } 1664 1665 #ifdef CONFIG_NUMA 1666 1667 static const struct cpumask *sd_numa_mask(int cpu) 1668 { 1669 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 1670 } 1671 1672 static void sched_numa_warn(const char *str) 1673 { 1674 static int done = false; 1675 int i,j; 1676 1677 if (done) 1678 return; 1679 1680 done = true; 1681 1682 printk(KERN_WARNING "ERROR: %s\n\n", str); 1683 1684 for (i = 0; i < nr_node_ids; i++) { 1685 printk(KERN_WARNING " "); 1686 for (j = 0; j < nr_node_ids; j++) { 1687 if (!node_state(i, N_CPU) || !node_state(j, N_CPU)) 1688 printk(KERN_CONT "(%02d) ", node_distance(i,j)); 1689 else 1690 printk(KERN_CONT " %02d ", node_distance(i,j)); 1691 } 1692 printk(KERN_CONT "\n"); 1693 } 1694 printk(KERN_WARNING "\n"); 1695 } 1696 1697 bool find_numa_distance(int distance) 1698 { 1699 bool found = false; 1700 int i, *distances; 1701 1702 if (distance == node_distance(0, 0)) 1703 return true; 1704 1705 rcu_read_lock(); 1706 distances = rcu_dereference(sched_domains_numa_distance); 1707 if (!distances) 1708 goto unlock; 1709 for (i = 0; i < sched_domains_numa_levels; i++) { 1710 if (distances[i] == distance) { 1711 found = true; 1712 break; 1713 } 1714 } 1715 unlock: 1716 rcu_read_unlock(); 1717 1718 return found; 1719 } 1720 1721 #define for_each_cpu_node_but(n, nbut) \ 1722 for_each_node_state(n, N_CPU) \ 1723 if (n == nbut) 
\ 1724 continue; \ 1725 else 1726 1727 /* 1728 * A system can have three types of NUMA topology: 1729 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system 1730 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes 1731 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane 1732 * 1733 * The difference between a glueless mesh topology and a backplane 1734 * topology lies in whether communication between not directly 1735 * connected nodes goes through intermediary nodes (where programs 1736 * could run), or through backplane controllers. This affects 1737 * placement of programs. 1738 * 1739 * The type of topology can be discerned with the following tests: 1740 * - If the maximum distance between any nodes is 1 hop, the system 1741 * is directly connected. 1742 * - If for two nodes A and B, located N > 1 hops away from each other, 1743 * there is an intermediary node C, which is < N hops away from both 1744 * nodes A and B, the system is a glueless mesh. 1745 */ 1746 static void init_numa_topology_type(int offline_node) 1747 { 1748 int a, b, c, n; 1749 1750 n = sched_max_numa_distance; 1751 1752 if (sched_domains_numa_levels <= 2) { 1753 sched_numa_topology_type = NUMA_DIRECT; 1754 return; 1755 } 1756 1757 for_each_cpu_node_but(a, offline_node) { 1758 for_each_cpu_node_but(b, offline_node) { 1759 /* Find two nodes furthest removed from each other. */ 1760 if (node_distance(a, b) < n) 1761 continue; 1762 1763 /* Is there an intermediary node between a and b? */ 1764 for_each_cpu_node_but(c, offline_node) { 1765 if (node_distance(a, c) < n && 1766 node_distance(b, c) < n) { 1767 sched_numa_topology_type = 1768 NUMA_GLUELESS_MESH; 1769 return; 1770 } 1771 } 1772 1773 sched_numa_topology_type = NUMA_BACKPLANE; 1774 return; 1775 } 1776 } 1777 1778 pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n"); 1779 sched_numa_topology_type = NUMA_DIRECT; 1780 } 1781 1782 1783 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) 1784 1785 void sched_init_numa(int offline_node) 1786 { 1787 struct sched_domain_topology_level *tl; 1788 unsigned long *distance_map; 1789 int nr_levels = 0; 1790 int i, j; 1791 int *distances; 1792 struct cpumask ***masks; 1793 1794 /* 1795 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 1796 * unique distances in the node_distance() table. 1797 */ 1798 distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); 1799 if (!distance_map) 1800 return; 1801 1802 bitmap_zero(distance_map, NR_DISTANCE_VALUES); 1803 for_each_cpu_node_but(i, offline_node) { 1804 for_each_cpu_node_but(j, offline_node) { 1805 int distance = node_distance(i, j); 1806 1807 if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { 1808 sched_numa_warn("Invalid distance value range"); 1809 bitmap_free(distance_map); 1810 return; 1811 } 1812 1813 bitmap_set(distance_map, distance, 1); 1814 } 1815 } 1816 /* 1817 * We can now figure out how many unique distance values there are and 1818 * allocate memory accordingly. 
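	 * For the 4-node ring example earlier in this file the distinct
	 * distances are {10, 20, 30}, so nr_levels comes out as 3.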
1819 */ 1820 nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); 1821 1822 distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); 1823 if (!distances) { 1824 bitmap_free(distance_map); 1825 return; 1826 } 1827 1828 for (i = 0, j = 0; i < nr_levels; i++, j++) { 1829 j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); 1830 distances[i] = j; 1831 } 1832 rcu_assign_pointer(sched_domains_numa_distance, distances); 1833 1834 bitmap_free(distance_map); 1835 1836 /* 1837 * 'nr_levels' contains the number of unique distances 1838 * 1839 * The sched_domains_numa_distance[] array includes the actual distance 1840 * numbers. 1841 */ 1842 1843 /* 1844 * Here, we should temporarily reset sched_domains_numa_levels to 0. 1845 * If it fails to allocate memory for array sched_domains_numa_masks[][], 1846 * the array will contain less then 'nr_levels' members. This could be 1847 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 1848 * in other functions. 1849 * 1850 * We reset it to 'nr_levels' at the end of this function. 1851 */ 1852 sched_domains_numa_levels = 0; 1853 1854 masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); 1855 if (!masks) 1856 return; 1857 1858 /* 1859 * Now for each level, construct a mask per node which contains all 1860 * CPUs of nodes that are that many hops away from us. 1861 */ 1862 for (i = 0; i < nr_levels; i++) { 1863 masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 1864 if (!masks[i]) 1865 return; 1866 1867 for_each_cpu_node_but(j, offline_node) { 1868 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 1869 int k; 1870 1871 if (!mask) 1872 return; 1873 1874 masks[i][j] = mask; 1875 1876 for_each_cpu_node_but(k, offline_node) { 1877 if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) 1878 sched_numa_warn("Node-distance not symmetric"); 1879 1880 if (node_distance(j, k) > sched_domains_numa_distance[i]) 1881 continue; 1882 1883 cpumask_or(mask, mask, cpumask_of_node(k)); 1884 } 1885 } 1886 } 1887 rcu_assign_pointer(sched_domains_numa_masks, masks); 1888 1889 /* Compute default topology size */ 1890 for (i = 0; sched_domain_topology[i].mask; i++); 1891 1892 tl = kzalloc((i + nr_levels + 1) * 1893 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 1894 if (!tl) 1895 return; 1896 1897 /* 1898 * Copy the default topology bits.. 1899 */ 1900 for (i = 0; sched_domain_topology[i].mask; i++) 1901 tl[i] = sched_domain_topology[i]; 1902 1903 /* 1904 * Add the NUMA identity distance, aka single NODE. 1905 */ 1906 tl[i++] = (struct sched_domain_topology_level){ 1907 .mask = sd_numa_mask, 1908 .numa_level = 0, 1909 SD_INIT_NAME(NODE) 1910 }; 1911 1912 /* 1913 * .. and append 'j' levels of NUMA goodness. 
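	 * Continuing the 3-level ring example: the NODE entry above covers
	 * distance 10 (numa_level 0), and this loop appends overlapping NUMA
	 * levels for distances 20 and 30 (numa_level 1 and 2).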
1914 */ 1915 for (j = 1; j < nr_levels; i++, j++) { 1916 tl[i] = (struct sched_domain_topology_level){ 1917 .mask = sd_numa_mask, 1918 .sd_flags = cpu_numa_flags, 1919 .flags = SDTL_OVERLAP, 1920 .numa_level = j, 1921 SD_INIT_NAME(NUMA) 1922 }; 1923 } 1924 1925 sched_domain_topology_saved = sched_domain_topology; 1926 sched_domain_topology = tl; 1927 1928 sched_domains_numa_levels = nr_levels; 1929 WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); 1930 1931 init_numa_topology_type(offline_node); 1932 } 1933 1934 1935 static void sched_reset_numa(void) 1936 { 1937 int nr_levels, *distances; 1938 struct cpumask ***masks; 1939 1940 nr_levels = sched_domains_numa_levels; 1941 sched_domains_numa_levels = 0; 1942 sched_max_numa_distance = 0; 1943 sched_numa_topology_type = NUMA_DIRECT; 1944 distances = sched_domains_numa_distance; 1945 rcu_assign_pointer(sched_domains_numa_distance, NULL); 1946 masks = sched_domains_numa_masks; 1947 rcu_assign_pointer(sched_domains_numa_masks, NULL); 1948 if (distances || masks) { 1949 int i, j; 1950 1951 synchronize_rcu(); 1952 kfree(distances); 1953 for (i = 0; i < nr_levels && masks; i++) { 1954 if (!masks[i]) 1955 continue; 1956 for_each_node(j) 1957 kfree(masks[i][j]); 1958 kfree(masks[i]); 1959 } 1960 kfree(masks); 1961 } 1962 if (sched_domain_topology_saved) { 1963 kfree(sched_domain_topology); 1964 sched_domain_topology = sched_domain_topology_saved; 1965 sched_domain_topology_saved = NULL; 1966 } 1967 } 1968 1969 /* 1970 * Call with hotplug lock held 1971 */ 1972 void sched_update_numa(int cpu, bool online) 1973 { 1974 int node; 1975 1976 node = cpu_to_node(cpu); 1977 /* 1978 * Scheduler NUMA topology is updated when the first CPU of a 1979 * node is onlined or the last CPU of a node is offlined. 1980 */ 1981 if (cpumask_weight(cpumask_of_node(node)) != 1) 1982 return; 1983 1984 sched_reset_numa(); 1985 sched_init_numa(online ? NUMA_NO_NODE : node); 1986 } 1987 1988 void sched_domains_numa_masks_set(unsigned int cpu) 1989 { 1990 int node = cpu_to_node(cpu); 1991 int i, j; 1992 1993 for (i = 0; i < sched_domains_numa_levels; i++) { 1994 for (j = 0; j < nr_node_ids; j++) { 1995 if (!node_state(j, N_CPU)) 1996 continue; 1997 1998 /* Set ourselves in the remote node's masks */ 1999 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 2000 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 2001 } 2002 } 2003 } 2004 2005 void sched_domains_numa_masks_clear(unsigned int cpu) 2006 { 2007 int i, j; 2008 2009 for (i = 0; i < sched_domains_numa_levels; i++) { 2010 for (j = 0; j < nr_node_ids; j++) { 2011 if (sched_domains_numa_masks[i][j]) 2012 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 2013 } 2014 } 2015 } 2016 2017 /* 2018 * sched_numa_find_closest() - given the NUMA topology, find the cpu 2019 * closest to @cpu from @cpumask. 2020 * cpumask: cpumask to find a cpu from 2021 * cpu: cpu to be close to 2022 * 2023 * returns: cpu, or nr_cpu_ids when nothing found. 
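 *
 * For example, on the 4-node ring described earlier, asking for a CPU of
 * node 2 from the viewpoint of a CPU on node 0 first probes node 0 itself,
 * then the nodes one hop away (1 and 3), and only then returns a CPU of
 * node 2, two hops away.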
/*
 * sched_numa_find_closest() - given the NUMA topology, find the cpu
 * closest to @cpu from @cpus.
 * @cpus: cpumask to find a cpu from
 * @cpu: cpu to be close to
 *
 * Returns: cpu, or nr_cpu_ids when nothing found.
 */
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
	int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
	struct cpumask ***masks;

	rcu_read_lock();
	masks = rcu_dereference(sched_domains_numa_masks);
	if (!masks)
		goto unlock;
	for (i = 0; i < sched_domains_numa_levels; i++) {
		if (!masks[i][j])
			break;
		cpu = cpumask_any_and(cpus, masks[i][j]);
		if (cpu < nr_cpu_ids) {
			found = cpu;
			break;
		}
	}
unlock:
	rcu_read_unlock();

	return found;
}

#endif /* CONFIG_NUMA */

static int __sdt_alloc(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		sdd->sd = alloc_percpu(struct sched_domain *);
		if (!sdd->sd)
			return -ENOMEM;

		sdd->sds = alloc_percpu(struct sched_domain_shared *);
		if (!sdd->sds)
			return -ENOMEM;

		sdd->sg = alloc_percpu(struct sched_group *);
		if (!sdd->sg)
			return -ENOMEM;

		sdd->sgc = alloc_percpu(struct sched_group_capacity *);
		if (!sdd->sgc)
			return -ENOMEM;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;
			struct sched_domain_shared *sds;
			struct sched_group *sg;
			struct sched_group_capacity *sgc;

			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sd)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sd, j) = sd;

			sds = kzalloc_node(sizeof(struct sched_domain_shared),
					GFP_KERNEL, cpu_to_node(j));
			if (!sds)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sds, j) = sds;

			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sg)
				return -ENOMEM;

			sg->next = sg;

			*per_cpu_ptr(sdd->sg, j) = sg;

			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sgc)
				return -ENOMEM;

#ifdef CONFIG_SCHED_DEBUG
			sgc->id = j;
#endif

			*per_cpu_ptr(sdd->sgc, j) = sgc;
		}
	}

	return 0;
}

static void __sdt_free(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;

			if (sdd->sd) {
				sd = *per_cpu_ptr(sdd->sd, j);
				if (sd && (sd->flags & SD_OVERLAP))
					free_sched_groups(sd->groups, 0);
				kfree(*per_cpu_ptr(sdd->sd, j));
			}

			if (sdd->sds)
				kfree(*per_cpu_ptr(sdd->sds, j));
			if (sdd->sg)
				kfree(*per_cpu_ptr(sdd->sg, j));
			if (sdd->sgc)
				kfree(*per_cpu_ptr(sdd->sgc, j));
		}
		free_percpu(sdd->sd);
		sdd->sd = NULL;
		free_percpu(sdd->sds);
		sdd->sds = NULL;
		free_percpu(sdd->sg);
		sdd->sg = NULL;
		free_percpu(sdd->sgc);
		sdd->sgc = NULL;
	}
}

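/*
 * Note, descriptive only and based on how these helpers are used elsewhere
 * in this file: __sdt_alloc() is called from the domain-allocation path,
 * and on failure the partially populated per-CPU pointers are torn down by
 * __sdt_free(). Objects that end up referenced by an attached domain have
 * their per-CPU slot NULLed (see claim_allocations(), used further down in
 * build_sched_domains()) so that __sdt_free() does not free them twice.
 */
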
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
		struct sched_domain *child, int cpu)
{
	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);

	if (child) {
		sd->level = child->level + 1;
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
		child->parent = sd;

		if (!cpumask_subset(sched_domain_span(child),
				    sched_domain_span(sd))) {
			pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
			pr_err("     the %s domain not a subset of the %s domain\n",
					child->name, sd->name);
#endif
			/* Fixup, ensure @sd has at least @child CPUs. */
			cpumask_or(sched_domain_span(sd),
				   sched_domain_span(sd),
				   sched_domain_span(child));
		}

	}
	set_domain_attribute(sd, attr);

	return sd;
}

/*
 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
 * any two given CPUs at this (non-NUMA) topology level.
 */
static bool topology_span_sane(struct sched_domain_topology_level *tl,
			       const struct cpumask *cpu_map, int cpu)
{
	int i;

	/* NUMA levels are allowed to overlap */
	if (tl->flags & SDTL_OVERLAP)
		return true;

	/*
	 * Non-NUMA levels cannot partially overlap - they must be either
	 * completely equal or completely disjoint. Otherwise we can end up
	 * breaking the sched_group lists - i.e. a later get_group() pass
	 * breaks the linking done for an earlier span.
	 */
	for_each_cpu(i, cpu_map) {
		if (i == cpu)
			continue;
		/*
		 * We should 'and' all those masks with 'cpu_map' to exactly
		 * match the topology we're about to build, but that can only
		 * remove CPUs, which only lessens our ability to detect
		 * overlaps.
		 */
		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
			return false;
	}

	return true;
}

/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs
 */
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
	enum s_alloc alloc_state = sa_none;
	struct sched_domain *sd;
	struct s_data d;
	struct rq *rq = NULL;
	int i, ret = -ENOMEM;
	bool has_asym = false;

	if (WARN_ON(cpumask_empty(cpu_map)))
		goto error;

	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
	if (alloc_state != sa_rootdomain)
		goto error;

	/* Set up domains for CPUs specified by the cpu_map: */
	for_each_cpu(i, cpu_map) {
		struct sched_domain_topology_level *tl;

		sd = NULL;
		for_each_sd_topology(tl) {

			if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
				goto error;

			sd = build_sched_domain(tl, cpu_map, attr, sd, i);

			has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;

			if (tl == sched_domain_topology)
				*per_cpu_ptr(d.sd, i) = sd;
			if (tl->flags & SDTL_OVERLAP)
				sd->flags |= SD_OVERLAP;
			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				break;
		}
	}

	/* Build the groups for the domains */
	for_each_cpu(i, cpu_map) {
		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			sd->span_weight = cpumask_weight(sched_domain_span(sd));
			if (sd->flags & SD_OVERLAP) {
				if (build_overlap_sched_groups(sd, i))
					goto error;
			} else {
				if (build_sched_groups(sd, i))
					goto error;
			}
		}
	}

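	/*
	 * Worked example for the imbalance calculation below (hypothetical
	 * topology, for illustration only): with one LLC spanning a 64-CPU
	 * node, nr_llcs == 1 at the first level above the LLC, so
	 * imb = 64 >> 2 == 16. With four 16-CPU LLCs per node instead,
	 * nr_llcs == 64 / 16 == 4 and imb == 4.
	 */
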
	/*
	 * Calculate an allowed NUMA imbalance such that LLCs do not get
	 * imbalanced.
	 */
	for_each_cpu(i, cpu_map) {
		unsigned int imb = 0;
		unsigned int imb_span = 1;

		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			struct sched_domain *child = sd->child;

			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
				struct sched_domain __rcu *top_p;
				unsigned int nr_llcs;

				/*
				 * For a single LLC per node, allow an
				 * imbalance up to 25% of the node. This is an
				 * arbitrary cutoff based on SMT-2 to balance
				 * between memory bandwidth and avoiding
				 * premature sharing of HT resources; SMT-4
				 * or SMT-8 *may* benefit from a different
				 * cutoff.
				 *
				 * For multiple LLCs, allow an imbalance
				 * until multiple tasks would share an LLC
				 * on one node while LLCs on another node
				 * remain idle.
				 */
				nr_llcs = sd->span_weight / child->span_weight;
				if (nr_llcs == 1)
					imb = sd->span_weight >> 2;
				else
					imb = nr_llcs;
				sd->imb_numa_nr = imb;

				/* Set span based on the first NUMA domain. */
				top_p = sd->parent;
				while (top_p && !(top_p->flags & SD_NUMA)) {
					top_p = top_p->parent;
				}
				imb_span = top_p ? top_p->span_weight : sd->span_weight;
			} else {
				int factor = max(1U, (sd->span_weight / imb_span));

				sd->imb_numa_nr = imb * factor;
			}
		}
	}

	/* Calculate CPU capacity for physical packages and nodes */
	for (i = nr_cpumask_bits-1; i >= 0; i--) {
		if (!cpumask_test_cpu(i, cpu_map))
			continue;

		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			claim_allocations(i, sd);
			init_sched_groups_capacity(i, sd);
		}
	}

	/* Attach the domains */
	rcu_read_lock();
	for_each_cpu(i, cpu_map) {
		rq = cpu_rq(i);
		sd = *per_cpu_ptr(d.sd, i);

		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);

		cpu_attach_domain(sd, d.rd, i);
	}
	rcu_read_unlock();

	if (has_asym)
		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);

	if (rq && sched_debug_verbose) {
		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
	}

	ret = 0;
error:
	__free_domain_allocs(&d, alloc_state, cpu_map);

	return ret;
}

/* Current sched domains: */
static cpumask_var_t *doms_cur;

/* Number of sched domains in 'doms_cur': */
static int ndoms_cur;

/* Attributes of custom domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;

/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fall back to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t fallback_doms;

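/*
 * Illustration (an assumption about specific architectures, not something
 * this file defines): virtualized architectures such as s390 or powerpc,
 * where the hypervisor may rearrange the CPU topology at runtime, provide
 * their own arch_update_cpu_topology(); the __weak stub below is the
 * default for everyone else.
 */
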
/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * CPU core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
int __weak arch_update_cpu_topology(void)
{
	return 0;
}

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
{
	int i;
	cpumask_var_t *doms;

	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
	if (!doms)
		return NULL;
	for (i = 0; i < ndoms; i++) {
		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
			free_sched_domains(doms, i);
			return NULL;
		}
	}
	return doms;
}

void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
	unsigned int i;

	for (i = 0; i < ndoms; i++)
		free_cpumask_var(doms[i]);
	kfree(doms);
}

/*
 * Set up scheduler domains and groups. For now this just excludes isolated
 * CPUs, but could be used to exclude other special cases in the future.
 */
int sched_init_domains(const struct cpumask *cpu_map)
{
	int err;

	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);

	arch_update_cpu_topology();
	asym_cpu_capacity_scan();
	ndoms_cur = 1;
	doms_cur = alloc_sched_domains(ndoms_cur);
	if (!doms_cur)
		doms_cur = &fallback_doms;
	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
	err = build_sched_domains(doms_cur[0], NULL);

	return err;
}

/*
 * Detach sched domains from a group of CPUs specified in cpu_map.
 * These CPUs will now be attached to the NULL domain.
 */
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
	unsigned int cpu = cpumask_any(cpu_map);
	int i;

	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);

	rcu_read_lock();
	for_each_cpu(i, cpu_map)
		cpu_attach_domain(NULL, &def_root_domain, i);
	rcu_read_unlock();
}

/* Handle NULL as "default" */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
			struct sched_domain_attr *new, int idx_new)
{
	struct sched_domain_attr tmp;

	/* Fast path: */
	if (!new && !cur)
		return 1;

	tmp = SD_ATTR_INIT;

	return !memcmp(cur ? (cur + idx_cur) : &tmp,
		       new ? (new + idx_new) : &tmp,
		       sizeof(struct sched_domain_attr));
}

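/*
 * Illustration, derived from the helper above: a NULL attribute array is
 * treated as "every entry is SD_ATTR_INIT", so dattrs_equal(NULL, 0, NULL, 0)
 * is always true, and dattrs_equal(NULL, 0, new, j) is true only if new[j]
 * matches the default attributes.
 */
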
/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap). We should set up one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains. This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock and sched_domains_mutex held
 */
void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{
	bool __maybe_unused has_eas = false;
	int i, j, n;
	int new_topology;

	lockdep_assert_held(&sched_domains_mutex);

	/* Let the architecture update CPU core mappings: */
	new_topology = arch_update_cpu_topology();
	/* Trigger rebuilding CPU capacity asymmetry data */
	if (new_topology)
		asym_cpu_capacity_scan();

	if (!doms_new) {
		WARN_ON_ONCE(dattr_new);
		n = 0;
		doms_new = alloc_sched_domains(1);
		if (doms_new) {
			n = 1;
			cpumask_and(doms_new[0], cpu_active_mask,
				    housekeeping_cpumask(HK_TYPE_DOMAIN));
		}
	} else {
		n = ndoms_new;
	}

	/* Destroy deleted domains: */
	for (i = 0; i < ndoms_cur; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
				struct root_domain *rd;

				/*
				 * This domain won't be destroyed and as such
				 * its dl_bw->total_bw needs to be cleared. It
				 * will be recomputed in function
				 * update_tasks_root_domain().
				 */
				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
				dl_clear_root_domain(rd);
				goto match1;
			}
		}
		/* No match - a current sched domain not in new doms_new[] */
		detach_destroy_domains(doms_cur[i]);
match1:
		;
	}

	n = ndoms_cur;
	if (!doms_new) {
		n = 0;
		doms_new = &fallback_doms;
		cpumask_and(doms_new[0], cpu_active_mask,
			    housekeeping_cpumask(HK_TYPE_DOMAIN));
	}

	/* Build new domains: */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
			    dattrs_equal(dattr_new, i, dattr_cur, j))
				goto match2;
		}
		/* No match - add a new doms_new */
		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
		;
	}

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
	/* Build perf. domains: */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < n && !sched_energy_update; j++) {
			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
				has_eas = true;
				goto match3;
			}
		}
		/* No match - add perf. domains for a new rd */
		has_eas |= build_perf_domains(doms_new[i]);
match3:
		;
	}
	sched_energy_set(has_eas);
#endif

	/* Remember the new sched domains: */
	if (doms_cur != &fallback_doms)
		free_sched_domains(doms_cur, ndoms_cur);

	kfree(dattr_cur);
	doms_cur = doms_new;
	dattr_cur = dattr_new;
	ndoms_cur = ndoms_new;

	update_sched_domain_debugfs();
}

/*
 * Call with hotplug lock held
 */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
			     struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	mutex_unlock(&sched_domains_mutex);
}
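
/*
 * Usage sketch (illustrative only; the cpuset code is the typical caller):
 *
 *	cpumask_var_t *doms = alloc_sched_domains(ndoms);
 *
 *	... fill doms[0 .. ndoms-1] with non-overlapping cpumasks, and
 *	    optionally build an 'attrs' array of the same length ...
 *
 *	partition_sched_domains(ndoms, doms, attrs);
 *
 * As documented above, partition_sched_domains() takes ownership of 'doms'
 * (and 'attrs') and frees them when a later repartitioning replaces them.
 */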