xref: /openbmc/linux/kernel/sched/topology.c (revision b0151c25548cacc50771a7930475727c6c8ee869)
1 /*
2  * Scheduler topology setup/handling methods
3  */
4 #include <linux/sched.h>
5 #include <linux/mutex.h>
6 
7 #include "sched.h"
8 
9 DEFINE_MUTEX(sched_domains_mutex);
10 
11 /* Protected by sched_domains_mutex: */
12 cpumask_var_t sched_domains_tmpmask;
13 
14 #ifdef CONFIG_SCHED_DEBUG
15 
16 static __read_mostly int sched_debug_enabled;
17 
18 static int __init sched_debug_setup(char *str)
19 {
20 	sched_debug_enabled = 1;
21 
22 	return 0;
23 }
24 early_param("sched_debug", sched_debug_setup);
25 
26 static inline bool sched_debug(void)
27 {
28 	return sched_debug_enabled;
29 }
30 
31 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
32 				  struct cpumask *groupmask)
33 {
34 	struct sched_group *group = sd->groups;
35 
36 	cpumask_clear(groupmask);
37 
38 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
39 
40 	if (!(sd->flags & SD_LOAD_BALANCE)) {
41 		printk("does not load-balance\n");
42 		if (sd->parent)
43 			printk(KERN_ERR
44 			       "ERROR: !SD_LOAD_BALANCE domain has parent\n");
45 		return -1;
46 	}
47 
48 	printk(KERN_CONT "span %*pbl level %s\n",
49 	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
50 
51 	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
52 		printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n",
53 				cpu);
54 	}
55 	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
56 		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n",
57 				cpu);
58 	}
59 
60 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
61 	do {
62 		if (!group) {
63 			printk("\n");
64 			printk(KERN_ERR "ERROR: group is NULL\n");
65 			break;
66 		}
67 
68 		if (!cpumask_weight(sched_group_cpus(group))) {
69 			printk(KERN_CONT "\n");
70 			printk(KERN_ERR "ERROR: empty group\n");
71 			break;
72 		}
73 
74 		if (!(sd->flags & SD_OVERLAP) &&
75 		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
76 			printk(KERN_CONT "\n");
77 			printk(KERN_ERR "ERROR: repeated CPUs\n");
78 			break;
79 		}
80 
81 		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
82 
83 		printk(KERN_CONT " %*pbl",
84 		       cpumask_pr_args(sched_group_cpus(group)));
85 
86 		if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) {
87 			printk(KERN_CONT " (mask: %*pbl)",
88 				cpumask_pr_args(sched_group_mask(group)));
89 		}
90 
91 		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
92 			printk(KERN_CONT " (cpu_capacity: %lu)",
93 				group->sgc->capacity);
94 		}
95 
96 		group = group->next;
97 
98 		if (group != sd->groups)
99 			printk(KERN_CONT ",");
100 
101 	} while (group != sd->groups);
102 	printk(KERN_CONT "\n");
103 
104 	if (!cpumask_equal(sched_domain_span(sd), groupmask))
105 		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
106 
107 	if (sd->parent &&
108 	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
109 		printk(KERN_ERR
110 		       "ERROR: parent span is not a superset of domain->span\n");
111 	return 0;
112 }
113 
114 static void sched_domain_debug(struct sched_domain *sd, int cpu)
115 {
116 	int level = 0;
117 
118 	if (!sched_debug_enabled)
119 		return;
120 
121 	if (!sd) {
122 		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
123 		return;
124 	}
125 
126 	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
127 
128 	for (;;) {
129 		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
130 			break;
131 		level++;
132 		sd = sd->parent;
133 		if (!sd)
134 			break;
135 	}
136 }
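
/*
 * Illustrative sketch (stand-alone user-space C, not part of this file): the
 * dump above indents each level with printk's "%*s" trick, where an empty
 * string padded to 'level' characters yields 'level' leading spaces. The
 * demo_* names are made up for illustration only.
 *
 *	#include <stdio.h>
 *
 *	static void demo_print_level(int level, const char *span)
 *	{
 *		// "%*s" with "" prints 'level' spaces, as in the printk above
 *		printf("%*s domain %d: span %s\n", level, "", level, span);
 *	}
 *
 *	int main(void)
 *	{
 *		demo_print_level(0, "0-3");	// e.g. an MC domain
 *		demo_print_level(1, "0-7");	// its DIE parent
 *		return 0;
 *	}
 */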
137 #else /* !CONFIG_SCHED_DEBUG */
138 
139 # define sched_debug_enabled 0
140 # define sched_domain_debug(sd, cpu) do { } while (0)
141 static inline bool sched_debug(void)
142 {
143 	return false;
144 }
145 #endif /* CONFIG_SCHED_DEBUG */
146 
147 static int sd_degenerate(struct sched_domain *sd)
148 {
149 	if (cpumask_weight(sched_domain_span(sd)) == 1)
150 		return 1;
151 
152 	/* Following flags need at least 2 groups */
153 	if (sd->flags & (SD_LOAD_BALANCE |
154 			 SD_BALANCE_NEWIDLE |
155 			 SD_BALANCE_FORK |
156 			 SD_BALANCE_EXEC |
157 			 SD_SHARE_CPUCAPACITY |
158 			 SD_ASYM_CPUCAPACITY |
159 			 SD_SHARE_PKG_RESOURCES |
160 			 SD_SHARE_POWERDOMAIN)) {
161 		if (sd->groups != sd->groups->next)
162 			return 0;
163 	}
164 
165 	/* Following flags don't use groups */
166 	if (sd->flags & (SD_WAKE_AFFINE))
167 		return 0;
168 
169 	return 1;
170 }
171 
172 static int
173 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
174 {
175 	unsigned long cflags = sd->flags, pflags = parent->flags;
176 
177 	if (sd_degenerate(parent))
178 		return 1;
179 
180 	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
181 		return 0;
182 
183 	/* Flags needing groups don't count if only 1 group in parent */
184 	if (parent->groups == parent->groups->next) {
185 		pflags &= ~(SD_LOAD_BALANCE |
186 				SD_BALANCE_NEWIDLE |
187 				SD_BALANCE_FORK |
188 				SD_BALANCE_EXEC |
189 				SD_ASYM_CPUCAPACITY |
190 				SD_SHARE_CPUCAPACITY |
191 				SD_SHARE_PKG_RESOURCES |
192 				SD_PREFER_SIBLING |
193 				SD_SHARE_POWERDOMAIN);
194 		if (nr_node_ids == 1)
195 			pflags &= ~SD_SERIALIZE;
196 	}
197 	if (~cflags & pflags)
198 		return 0;
199 
200 	return 1;
201 }
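
/*
 * A sketch of the "~cflags & pflags" test above (stand-alone user-space C,
 * not part of this file): the expression is non-zero exactly when the parent
 * carries a flag the child lacks, i.e. when the parent still adds behaviour
 * and must be kept. The DEMO_* values are invented, not the real SD_* flags.
 *
 *	#include <stdio.h>
 *
 *	#define DEMO_BALANCE_EXEC	0x1UL
 *	#define DEMO_BALANCE_FORK	0x2UL
 *	#define DEMO_SERIALIZE		0x4UL
 *
 *	int main(void)
 *	{
 *		unsigned long cflags = DEMO_BALANCE_EXEC | DEMO_BALANCE_FORK;
 *		unsigned long pflags = DEMO_BALANCE_EXEC | DEMO_SERIALIZE;
 *
 *		// Non-zero: the parent carries DEMO_SERIALIZE, the child doesn't.
 *		if (~cflags & pflags)
 *			printf("parent adds behaviour, keep it\n");
 *		else
 *			printf("parent is redundant, degenerate it\n");
 *		return 0;
 *	}
 */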
202 
203 static void free_rootdomain(struct rcu_head *rcu)
204 {
205 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
206 
207 	cpupri_cleanup(&rd->cpupri);
208 	cpudl_cleanup(&rd->cpudl);
209 	free_cpumask_var(rd->dlo_mask);
210 	free_cpumask_var(rd->rto_mask);
211 	free_cpumask_var(rd->online);
212 	free_cpumask_var(rd->span);
213 	kfree(rd);
214 }
215 
216 void rq_attach_root(struct rq *rq, struct root_domain *rd)
217 {
218 	struct root_domain *old_rd = NULL;
219 	unsigned long flags;
220 
221 	raw_spin_lock_irqsave(&rq->lock, flags);
222 
223 	if (rq->rd) {
224 		old_rd = rq->rd;
225 
226 		if (cpumask_test_cpu(rq->cpu, old_rd->online))
227 			set_rq_offline(rq);
228 
229 		cpumask_clear_cpu(rq->cpu, old_rd->span);
230 
231 		/*
232 		 * If we don't want to free the old_rd yet then
233 		 * set old_rd to NULL to skip the freeing later
234 		 * in this function:
235 		 */
236 		if (!atomic_dec_and_test(&old_rd->refcount))
237 			old_rd = NULL;
238 	}
239 
240 	atomic_inc(&rd->refcount);
241 	rq->rd = rd;
242 
243 	cpumask_set_cpu(rq->cpu, rd->span);
244 	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
245 		set_rq_online(rq);
246 
247 	raw_spin_unlock_irqrestore(&rq->lock, flags);
248 
249 	if (old_rd)
250 		call_rcu_sched(&old_rd->rcu, free_rootdomain);
251 }
252 
253 static int init_rootdomain(struct root_domain *rd)
254 {
255 	memset(rd, 0, sizeof(*rd));
256 
257 	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
258 		goto out;
259 	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
260 		goto free_span;
261 	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
262 		goto free_online;
263 	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
264 		goto free_dlo_mask;
265 
266 	init_dl_bw(&rd->dl_bw);
267 	if (cpudl_init(&rd->cpudl) != 0)
268 		goto free_rto_mask;
269 
270 	if (cpupri_init(&rd->cpupri) != 0)
271 		goto free_cpudl;
272 	return 0;
273 
274 free_cpudl:
275 	cpudl_cleanup(&rd->cpudl);
276 free_rto_mask:
277 	free_cpumask_var(rd->rto_mask);
278 free_dlo_mask:
279 	free_cpumask_var(rd->dlo_mask);
280 free_online:
281 	free_cpumask_var(rd->online);
282 free_span:
283 	free_cpumask_var(rd->span);
284 out:
285 	return -ENOMEM;
286 }
287 
288 /*
289  * By default the system creates a single root-domain with all CPUs as
290  * members (mimicking the global state we have today).
291  */
292 struct root_domain def_root_domain;
293 
294 void init_defrootdomain(void)
295 {
296 	init_rootdomain(&def_root_domain);
297 
298 	atomic_set(&def_root_domain.refcount, 1);
299 }
300 
301 static struct root_domain *alloc_rootdomain(void)
302 {
303 	struct root_domain *rd;
304 
305 	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
306 	if (!rd)
307 		return NULL;
308 
309 	if (init_rootdomain(rd) != 0) {
310 		kfree(rd);
311 		return NULL;
312 	}
313 
314 	return rd;
315 }
316 
317 static void free_sched_groups(struct sched_group *sg, int free_sgc)
318 {
319 	struct sched_group *tmp, *first;
320 
321 	if (!sg)
322 		return;
323 
324 	first = sg;
325 	do {
326 		tmp = sg->next;
327 
328 		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
329 			kfree(sg->sgc);
330 
331 		kfree(sg);
332 		sg = tmp;
333 	} while (sg != first);
334 }
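
/*
 * The group list freed above is circular, so the usual NULL-terminated walk
 * does not apply: remember the first node and stop once the walk wraps back
 * around to it, saving ->next before each kfree(). A stand-alone sketch of
 * the same pattern (user-space C, demo_* names are made up):
 *
 *	#include <stdlib.h>
 *
 *	struct demo_node {
 *		struct demo_node *next;
 *	};
 *
 *	static void demo_free_ring(struct demo_node *node)
 *	{
 *		struct demo_node *first = node, *tmp;
 *
 *		if (!node)
 *			return;
 *		do {
 *			tmp = node->next;	// save before freeing
 *			free(node);
 *			node = tmp;
 *		} while (node != first);	// stop once we wrap around
 *	}
 */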
335 
336 static void destroy_sched_domain(struct sched_domain *sd)
337 {
338 	/*
339 	 * If it's an overlapping domain it has private groups; iterate and
340 	 * nuke them all.
341 	 */
342 	if (sd->flags & SD_OVERLAP) {
343 		free_sched_groups(sd->groups, 1);
344 	} else if (atomic_dec_and_test(&sd->groups->ref)) {
345 		kfree(sd->groups->sgc);
346 		kfree(sd->groups);
347 	}
348 	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
349 		kfree(sd->shared);
350 	kfree(sd);
351 }
352 
353 static void destroy_sched_domains_rcu(struct rcu_head *rcu)
354 {
355 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
356 
357 	while (sd) {
358 		struct sched_domain *parent = sd->parent;
359 		destroy_sched_domain(sd);
360 		sd = parent;
361 	}
362 }
363 
364 static void destroy_sched_domains(struct sched_domain *sd)
365 {
366 	if (sd)
367 		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
368 }
369 
370 /*
371  * Keep a special pointer to the highest sched_domain that has
372  * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
373  * allows us to avoid some pointer chasing in select_idle_sibling().
374  *
375  * Also keep a unique ID per domain (we use the first CPU number in
376  * the cpumask of the domain), this allows us to quickly tell if
377  * two CPUs are in the same cache domain, see cpus_share_cache().
378  */
379 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
380 DEFINE_PER_CPU(int, sd_llc_size);
381 DEFINE_PER_CPU(int, sd_llc_id);
382 DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
383 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
384 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
385 
386 static void update_top_cache_domain(int cpu)
387 {
388 	struct sched_domain_shared *sds = NULL;
389 	struct sched_domain *sd;
390 	int id = cpu;
391 	int size = 1;
392 
393 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
394 	if (sd) {
395 		id = cpumask_first(sched_domain_span(sd));
396 		size = cpumask_weight(sched_domain_span(sd));
397 		sds = sd->shared;
398 	}
399 
400 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
401 	per_cpu(sd_llc_size, cpu) = size;
402 	per_cpu(sd_llc_id, cpu) = id;
403 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
404 
405 	sd = lowest_flag_domain(cpu, SD_NUMA);
406 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
407 
408 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
409 	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
410 }
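
/*
 * With sd_llc_id cached per CPU as above, the "same cache domain" question
 * mentioned in the comment becomes a single compare. A minimal sketch of the
 * check that cpus_share_cache() (in kernel/sched/core.c) can then perform;
 * demo_cpus_share_cache() is a hypothetical name used only for illustration:
 *
 *	static inline bool demo_cpus_share_cache(int this_cpu, int that_cpu)
 *	{
 *		return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 *	}
 */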
411 
412 /*
413  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
414  * hold the hotplug lock.
415  */
416 static void
417 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
418 {
419 	struct rq *rq = cpu_rq(cpu);
420 	struct sched_domain *tmp;
421 
422 	/* Remove the sched domains which do not contribute to scheduling. */
423 	for (tmp = sd; tmp; ) {
424 		struct sched_domain *parent = tmp->parent;
425 		if (!parent)
426 			break;
427 
428 		if (sd_parent_degenerate(tmp, parent)) {
429 			tmp->parent = parent->parent;
430 			if (parent->parent)
431 				parent->parent->child = tmp;
432 			/*
433 			 * Transfer SD_PREFER_SIBLING down in case of a
434 			 * degenerate parent; the spans match in that case,
435 			 * so the property transfers.
436 			 */
437 			if (parent->flags & SD_PREFER_SIBLING)
438 				tmp->flags |= SD_PREFER_SIBLING;
439 			destroy_sched_domain(parent);
440 		} else
441 			tmp = tmp->parent;
442 	}
443 
444 	if (sd && sd_degenerate(sd)) {
445 		tmp = sd;
446 		sd = sd->parent;
447 		destroy_sched_domain(tmp);
448 		if (sd)
449 			sd->child = NULL;
450 	}
451 
452 	sched_domain_debug(sd, cpu);
453 
454 	rq_attach_root(rq, rd);
455 	tmp = rq->sd;
456 	rcu_assign_pointer(rq->sd, sd);
457 	destroy_sched_domains(tmp);
458 
459 	update_top_cache_domain(cpu);
460 }
461 
462 /* Set up the mask of CPUs configured for isolated domains */
463 static int __init isolated_cpu_setup(char *str)
464 {
465 	int ret;
466 
467 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
468 	ret = cpulist_parse(str, cpu_isolated_map);
469 	if (ret) {
470 		pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids - 1);
471 		return 0;
472 	}
473 	return 1;
474 }
475 __setup("isolcpus=", isolated_cpu_setup);
476 
477 struct s_data {
478 	struct sched_domain ** __percpu sd;
479 	struct root_domain	*rd;
480 };
481 
482 enum s_alloc {
483 	sa_rootdomain,
484 	sa_sd,
485 	sa_sd_storage,
486 	sa_none,
487 };
488 
489 /*
490  * Build an iteration mask that can exclude certain CPUs from the upwards
491  * domain traversal.
492  *
493  * Asymmetric node setups can result in situations where the domain tree is of
494  * unequal depth; make sure to skip domains that already cover the entire
495  * range.
496  *
497  * In that case build_sched_domains() will have terminated the iteration early
498  * and our sibling sd spans will be empty. Domains should always include the
499  * CPU they're built on, so check that.
500  */
501 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
502 {
503 	const struct cpumask *span = sched_domain_span(sd);
504 	struct sd_data *sdd = sd->private;
505 	struct sched_domain *sibling;
506 	int i;
507 
508 	for_each_cpu(i, span) {
509 		sibling = *per_cpu_ptr(sdd->sd, i);
510 		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
511 			continue;
512 
513 		cpumask_set_cpu(i, sched_group_mask(sg));
514 	}
515 }
516 
517 /*
518  * Return the canonical balance CPU for this group; this is the first CPU
519  * of this group that's also in the iteration mask.
520  */
521 int group_balance_cpu(struct sched_group *sg)
522 {
523 	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
524 }
525 
526 static struct sched_group *
527 build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
528 {
529 	struct sched_group *sg;
530 	struct cpumask *sg_span;
531 
532 	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
533 			GFP_KERNEL, cpu_to_node(cpu));
534 
535 	if (!sg)
536 		return NULL;
537 
538 	sg_span = sched_group_cpus(sg);
539 	if (sd->child)
540 		cpumask_copy(sg_span, sched_domain_span(sd->child));
541 	else
542 		cpumask_copy(sg_span, sched_domain_span(sd));
543 
544 	return sg;
545 }
546 
547 static void init_overlap_sched_group(struct sched_domain *sd,
548 				     struct sched_group *sg, int cpu)
549 {
550 	struct sd_data *sdd = sd->private;
551 	struct cpumask *sg_span;
552 
553 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
554 	if (atomic_inc_return(&sg->sgc->ref) == 1)
555 		build_group_mask(sd, sg);
556 
557 	/*
558 	 * Initialize sgc->capacity such that even if we mess up the
559 	 * domains and no possible iteration will get us here, we won't
560 	 * die on a /0 trap.
561 	 */
562 	sg_span = sched_group_cpus(sg);
563 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
564 	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
565 }
566 
567 static int
568 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
569 {
570 	struct sched_group *first = NULL, *last = NULL, *sg;
571 	const struct cpumask *span = sched_domain_span(sd);
572 	struct cpumask *covered = sched_domains_tmpmask;
573 	struct sd_data *sdd = sd->private;
574 	struct sched_domain *sibling;
575 	int i;
576 
577 	cpumask_clear(covered);
578 
579 	for_each_cpu_wrap(i, span, cpu) {
580 		struct cpumask *sg_span;
581 
582 		if (cpumask_test_cpu(i, covered))
583 			continue;
584 
585 		sibling = *per_cpu_ptr(sdd->sd, i);
586 
587 		/* See the comment near build_group_mask(). */
588 		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
589 			continue;
590 
591 		sg = build_group_from_child_sched_domain(sibling, cpu);
592 		if (!sg)
593 			goto fail;
594 
595 		sg_span = sched_group_cpus(sg);
596 		cpumask_or(covered, covered, sg_span);
597 
598 		init_overlap_sched_group(sd, sg, i);
599 
600 		if (!first)
601 			first = sg;
602 		if (last)
603 			last->next = sg;
604 		last = sg;
605 		last->next = first;
606 	}
607 	sd->groups = first;
608 
609 	return 0;
610 
611 fail:
612 	free_sched_groups(first, 0);
613 
614 	return -ENOMEM;
615 }
616 
617 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
618 {
619 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
620 	struct sched_domain *child = sd->child;
621 
622 	if (child)
623 		cpu = cpumask_first(sched_domain_span(child));
624 
625 	if (sg) {
626 		*sg = *per_cpu_ptr(sdd->sg, cpu);
627 		(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
628 
629 		/* For claim_allocations: */
630 		atomic_set(&(*sg)->sgc->ref, 1);
631 	}
632 
633 	return cpu;
634 }
635 
636 /*
637  * build_sched_groups will build a circular linked list of the groups
638  * covered by the given span, set each group's ->cpumask correctly, and
639  * set each group's ->cpu_capacity to 0.
640  *
641  * Assumes the sched_domain tree is fully constructed
642  */
643 static int
644 build_sched_groups(struct sched_domain *sd, int cpu)
645 {
646 	struct sched_group *first = NULL, *last = NULL;
647 	struct sd_data *sdd = sd->private;
648 	const struct cpumask *span = sched_domain_span(sd);
649 	struct cpumask *covered;
650 	int i;
651 
652 	get_group(cpu, sdd, &sd->groups);
653 	atomic_inc(&sd->groups->ref);
654 
655 	if (cpu != cpumask_first(span))
656 		return 0;
657 
658 	lockdep_assert_held(&sched_domains_mutex);
659 	covered = sched_domains_tmpmask;
660 
661 	cpumask_clear(covered);
662 
663 	for_each_cpu(i, span) {
664 		struct sched_group *sg;
665 		int group, j;
666 
667 		if (cpumask_test_cpu(i, covered))
668 			continue;
669 
670 		group = get_group(i, sdd, &sg);
671 		cpumask_setall(sched_group_mask(sg));
672 
673 		for_each_cpu(j, span) {
674 			if (get_group(j, sdd, NULL) != group)
675 				continue;
676 
677 			cpumask_set_cpu(j, covered);
678 			cpumask_set_cpu(j, sched_group_cpus(sg));
679 		}
680 
681 		if (!first)
682 			first = sg;
683 		if (last)
684 			last->next = sg;
685 		last = sg;
686 	}
687 	last->next = first;
688 
689 	return 0;
690 }
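
/*
 * Both group builders above grow the circular list with the same first/last
 * idiom: append each new node to 'last' and point the tail back at 'first'
 * to close the ring (build_sched_groups() does this once after the loop,
 * build_overlap_sched_groups() keeps the ring closed as it goes). A
 * stand-alone sketch (user-space C, demo_* names are made up):
 *
 *	#include <stdlib.h>
 *
 *	struct demo_group {
 *		int cpu;
 *		struct demo_group *next;
 *	};
 *
 *	static struct demo_group *demo_build_ring(const int *cpus, int n)
 *	{
 *		struct demo_group *first = NULL, *last = NULL, *g;
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			g = calloc(1, sizeof(*g));
 *			if (!g)
 *				return NULL;	// a real caller would unwind here
 *			g->cpu = cpus[i];
 *
 *			if (!first)
 *				first = g;
 *			if (last)
 *				last->next = g;
 *			last = g;
 *		}
 *		if (last)
 *			last->next = first;	// close the ring
 *		return first;
 *	}
 */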
691 
692 /*
693  * Initialize sched groups cpu_capacity.
694  *
695  * cpu_capacity indicates the capacity of a sched group, which is used while
696  * distributing load between the different sched groups in a sched domain.
697  * Typically cpu_capacity will be the same for all groups in a sched domain
698  * unless there are asymmetries in the topology. If there are asymmetries,
699  * the group with more cpu_capacity will pick up more load than the group
700  * with less cpu_capacity.
701  */
702 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
703 {
704 	struct sched_group *sg = sd->groups;
705 
706 	WARN_ON(!sg);
707 
708 	do {
709 		int cpu, max_cpu = -1;
710 
711 		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
712 
713 		if (!(sd->flags & SD_ASYM_PACKING))
714 			goto next;
715 
716 		for_each_cpu(cpu, sched_group_cpus(sg)) {
717 			if (max_cpu < 0)
718 				max_cpu = cpu;
719 			else if (sched_asym_prefer(cpu, max_cpu))
720 				max_cpu = cpu;
721 		}
722 		sg->asym_prefer_cpu = max_cpu;
723 
724 next:
725 		sg = sg->next;
726 	} while (sg != sd->groups);
727 
728 	if (cpu != group_balance_cpu(sg))
729 		return;
730 
731 	update_group_capacity(sd, cpu);
732 }
733 
734 /*
735  * Initializers for sched domains.
736  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
737  */
738 
739 static int default_relax_domain_level = -1;
740 int sched_domain_level_max;
741 
742 static int __init setup_relax_domain_level(char *str)
743 {
744 	if (kstrtoint(str, 0, &default_relax_domain_level))
745 		pr_warn("Unable to set relax_domain_level\n");
746 
747 	return 1;
748 }
749 __setup("relax_domain_level=", setup_relax_domain_level);
750 
751 static void set_domain_attribute(struct sched_domain *sd,
752 				 struct sched_domain_attr *attr)
753 {
754 	int request;
755 
756 	if (!attr || attr->relax_domain_level < 0) {
757 		if (default_relax_domain_level < 0)
758 			return;
759 		else
760 			request = default_relax_domain_level;
761 	} else
762 		request = attr->relax_domain_level;
763 	if (request < sd->level) {
764 		/* Turn off idle balance on this domain: */
765 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
766 	} else {
767 		/* Turn on idle balance on this domain: */
768 		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
769 	}
770 }
771 
772 static void __sdt_free(const struct cpumask *cpu_map);
773 static int __sdt_alloc(const struct cpumask *cpu_map);
774 
775 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
776 				 const struct cpumask *cpu_map)
777 {
778 	switch (what) {
779 	case sa_rootdomain:
780 		if (!atomic_read(&d->rd->refcount))
781 			free_rootdomain(&d->rd->rcu);
782 		/* Fall through */
783 	case sa_sd:
784 		free_percpu(d->sd);
785 		/* Fall through */
786 	case sa_sd_storage:
787 		__sdt_free(cpu_map);
788 		/* Fall through */
789 	case sa_none:
790 		break;
791 	}
792 }
793 
794 static enum s_alloc
795 __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
796 {
797 	memset(d, 0, sizeof(*d));
798 
799 	if (__sdt_alloc(cpu_map))
800 		return sa_sd_storage;
801 	d->sd = alloc_percpu(struct sched_domain *);
802 	if (!d->sd)
803 		return sa_sd_storage;
804 	d->rd = alloc_rootdomain();
805 	if (!d->rd)
806 		return sa_sd;
807 	return sa_rootdomain;
808 }
809 
810 /*
811  * NULL the sd_data elements we've used to build the sched_domain and
812  * sched_group structures so that the subsequent __free_domain_allocs()
813  * will not free the data we're using.
814  */
815 static void claim_allocations(int cpu, struct sched_domain *sd)
816 {
817 	struct sd_data *sdd = sd->private;
818 
819 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
820 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
821 
822 	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
823 		*per_cpu_ptr(sdd->sds, cpu) = NULL;
824 
825 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
826 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
827 
828 	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
829 		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
830 }
831 
832 #ifdef CONFIG_NUMA
833 static int sched_domains_numa_levels;
834 enum numa_topology_type sched_numa_topology_type;
835 static int *sched_domains_numa_distance;
836 int sched_max_numa_distance;
837 static struct cpumask ***sched_domains_numa_masks;
838 static int sched_domains_curr_level;
839 #endif
840 
841 /*
842  * SD_flags allowed in topology descriptions.
843  *
844  * These flags are purely descriptive of the topology and do not prescribe
845  * behaviour. The behaviour that goes with them is set up in the sd_init()
846  * function below:
847  *
848  *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
849  *   SD_SHARE_PKG_RESOURCES - describes shared caches
850  *   SD_NUMA                - describes NUMA topologies
851  *   SD_SHARE_POWERDOMAIN   - describes shared power domain
852  *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
853  *
854  * The odd one out, which besides describing the topology also prescribes
855  * the desired behaviour that goes along with it:
856  *
857  *   SD_ASYM_PACKING        - describes SMT quirks
858  */
859 #define TOPOLOGY_SD_FLAGS		\
860 	(SD_SHARE_CPUCAPACITY |		\
861 	 SD_SHARE_PKG_RESOURCES |	\
862 	 SD_NUMA |			\
863 	 SD_ASYM_PACKING |		\
864 	 SD_ASYM_CPUCAPACITY |		\
865 	 SD_SHARE_POWERDOMAIN)
866 
867 static struct sched_domain *
868 sd_init(struct sched_domain_topology_level *tl,
869 	const struct cpumask *cpu_map,
870 	struct sched_domain *child, int cpu)
871 {
872 	struct sd_data *sdd = &tl->data;
873 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
874 	int sd_id, sd_weight, sd_flags = 0;
875 
876 #ifdef CONFIG_NUMA
877 	/*
878 	 * Ugly hack to pass state to sd_numa_mask()...
879 	 */
880 	sched_domains_curr_level = tl->numa_level;
881 #endif
882 
883 	sd_weight = cpumask_weight(tl->mask(cpu));
884 
885 	if (tl->sd_flags)
886 		sd_flags = (*tl->sd_flags)();
887 	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
888 			"wrong sd_flags in topology description\n"))
889 		sd_flags &= ~TOPOLOGY_SD_FLAGS;
890 
891 	*sd = (struct sched_domain){
892 		.min_interval		= sd_weight,
893 		.max_interval		= 2*sd_weight,
894 		.busy_factor		= 32,
895 		.imbalance_pct		= 125,
896 
897 		.cache_nice_tries	= 0,
898 		.busy_idx		= 0,
899 		.idle_idx		= 0,
900 		.newidle_idx		= 0,
901 		.wake_idx		= 0,
902 		.forkexec_idx		= 0,
903 
904 		.flags			= 1*SD_LOAD_BALANCE
905 					| 1*SD_BALANCE_NEWIDLE
906 					| 1*SD_BALANCE_EXEC
907 					| 1*SD_BALANCE_FORK
908 					| 0*SD_BALANCE_WAKE
909 					| 1*SD_WAKE_AFFINE
910 					| 0*SD_SHARE_CPUCAPACITY
911 					| 0*SD_SHARE_PKG_RESOURCES
912 					| 0*SD_SERIALIZE
913 					| 0*SD_PREFER_SIBLING
914 					| 0*SD_NUMA
915 					| sd_flags
916 					,
917 
918 		.last_balance		= jiffies,
919 		.balance_interval	= sd_weight,
920 		.smt_gain		= 0,
921 		.max_newidle_lb_cost	= 0,
922 		.next_decay_max_lb_cost	= jiffies,
923 		.child			= child,
924 #ifdef CONFIG_SCHED_DEBUG
925 		.name			= tl->name,
926 #endif
927 	};
928 
929 	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
930 	sd_id = cpumask_first(sched_domain_span(sd));
931 
932 	/*
933 	 * Convert topological properties into behaviour.
934 	 */
935 
936 	if (sd->flags & SD_ASYM_CPUCAPACITY) {
937 		struct sched_domain *t = sd;
938 
939 		for_each_lower_domain(t)
940 			t->flags |= SD_BALANCE_WAKE;
941 	}
942 
943 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
944 		sd->flags |= SD_PREFER_SIBLING;
945 		sd->imbalance_pct = 110;
946 		sd->smt_gain = 1178; /* ~15% */
947 
948 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
949 		sd->imbalance_pct = 117;
950 		sd->cache_nice_tries = 1;
951 		sd->busy_idx = 2;
952 
953 #ifdef CONFIG_NUMA
954 	} else if (sd->flags & SD_NUMA) {
955 		sd->cache_nice_tries = 2;
956 		sd->busy_idx = 3;
957 		sd->idle_idx = 2;
958 
959 		sd->flags |= SD_SERIALIZE;
960 		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
961 			sd->flags &= ~(SD_BALANCE_EXEC |
962 				       SD_BALANCE_FORK |
963 				       SD_WAKE_AFFINE);
964 		}
965 
966 #endif
967 	} else {
968 		sd->flags |= SD_PREFER_SIBLING;
969 		sd->cache_nice_tries = 1;
970 		sd->busy_idx = 2;
971 		sd->idle_idx = 1;
972 	}
973 
974 	/*
975 	 * For all levels sharing cache, connect a sched_domain_shared
976 	 * instance.
977 	 */
978 	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
979 		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
980 		atomic_inc(&sd->shared->ref);
981 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
982 	}
983 
984 	sd->private = sdd;
985 
986 	return sd;
987 }
988 
989 /*
990  * Topology list, bottom-up.
991  */
992 static struct sched_domain_topology_level default_topology[] = {
993 #ifdef CONFIG_SCHED_SMT
994 	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
995 #endif
996 #ifdef CONFIG_SCHED_MC
997 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
998 #endif
999 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
1000 	{ NULL, },
1001 };
1002 
1003 static struct sched_domain_topology_level *sched_domain_topology =
1004 	default_topology;
1005 
1006 #define for_each_sd_topology(tl)			\
1007 	for (tl = sched_domain_topology; tl->mask; tl++)
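
/*
 * The topology table above ends with an entry whose ->mask is NULL, which is
 * the condition for_each_sd_topology() tests. The same sentinel-terminated
 * table idiom in stand-alone form (user-space C, demo_* names are made up):
 *
 *	#include <stdio.h>
 *
 *	struct demo_level {
 *		const char *name;
 *	};
 *
 *	static const struct demo_level demo_topology[] = {
 *		{ "SMT" },
 *		{ "MC"  },
 *		{ "DIE" },
 *		{ NULL  },	// sentinel: terminates the walk
 *	};
 *
 *	int main(void)
 *	{
 *		const struct demo_level *tl;
 *
 *		for (tl = demo_topology; tl->name; tl++)
 *			printf("level: %s\n", tl->name);
 *		return 0;
 *	}
 */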
1008 
1009 void set_sched_topology(struct sched_domain_topology_level *tl)
1010 {
1011 	if (WARN_ON_ONCE(sched_smp_initialized))
1012 		return;
1013 
1014 	sched_domain_topology = tl;
1015 }
1016 
1017 #ifdef CONFIG_NUMA
1018 
1019 static const struct cpumask *sd_numa_mask(int cpu)
1020 {
1021 	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1022 }
1023 
1024 static void sched_numa_warn(const char *str)
1025 {
1026 	static bool done = false;
1027 	int i, j;
1028 
1029 	if (done)
1030 		return;
1031 
1032 	done = true;
1033 
1034 	printk(KERN_WARNING "ERROR: %s\n\n", str);
1035 
1036 	for (i = 0; i < nr_node_ids; i++) {
1037 		printk(KERN_WARNING "  ");
1038 		for (j = 0; j < nr_node_ids; j++)
1039 			printk(KERN_CONT "%02d ", node_distance(i, j));
1040 		printk(KERN_CONT "\n");
1041 	}
1042 	printk(KERN_WARNING "\n");
1043 }
1044 
1045 bool find_numa_distance(int distance)
1046 {
1047 	int i;
1048 
1049 	if (distance == node_distance(0, 0))
1050 		return true;
1051 
1052 	for (i = 0; i < sched_domains_numa_levels; i++) {
1053 		if (sched_domains_numa_distance[i] == distance)
1054 			return true;
1055 	}
1056 
1057 	return false;
1058 }
1059 
1060 /*
1061  * A system can have three types of NUMA topology:
1062  * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1063  * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1064  * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1065  *
1066  * The difference between a glueless mesh topology and a backplane
1067  * topology lies in whether communication between not directly
1068  * connected nodes goes through intermediary nodes (where programs
1069  * could run), or through backplane controllers. This affects
1070  * placement of programs.
1071  *
1072  * The type of topology can be discerned with the following tests:
1073  * - If the maximum distance between any nodes is 1 hop, the system
1074  *   is directly connected.
1075  * - If for two nodes A and B, located N > 1 hops away from each other,
1076  *   there is an intermediary node C, which is < N hops away from both
1077  *   nodes A and B, the system is a glueless mesh.
1078  */
1079 static void init_numa_topology_type(void)
1080 {
1081 	int a, b, c, n;
1082 
1083 	n = sched_max_numa_distance;
1084 
1085 	if (sched_domains_numa_levels <= 1) {
1086 		sched_numa_topology_type = NUMA_DIRECT;
1087 		return;
1088 	}
1089 
1090 	for_each_online_node(a) {
1091 		for_each_online_node(b) {
1092 			/* Find two nodes furthest removed from each other. */
1093 			if (node_distance(a, b) < n)
1094 				continue;
1095 
1096 			/* Is there an intermediary node between a and b? */
1097 			for_each_online_node(c) {
1098 				if (node_distance(a, c) < n &&
1099 				    node_distance(b, c) < n) {
1100 					sched_numa_topology_type =
1101 							NUMA_GLUELESS_MESH;
1102 					return;
1103 				}
1104 			}
1105 
1106 			sched_numa_topology_type = NUMA_BACKPLANE;
1107 			return;
1108 		}
1109 	}
1110 }
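
/*
 * A stand-alone sketch of the classification above, run over an invented
 * 4-node ring of distances (user-space C, not kernel code): find the largest
 * distance, then check whether some furthest-apart pair has an intermediary
 * node closer to both ends (glueless mesh) or not (backplane).
 *
 *	#include <stdio.h>
 *
 *	#define N 4
 *	static const int dist[N][N] = {
 *		{ 10, 20, 30, 20 },
 *		{ 20, 10, 20, 30 },
 *		{ 30, 20, 10, 20 },
 *		{ 20, 30, 20, 10 },
 *	};
 *
 *	int main(void)
 *	{
 *		int a, b, c, max = 0;
 *
 *		for (a = 0; a < N; a++)
 *			for (b = 0; b < N; b++)
 *				if (dist[a][b] > max)
 *					max = dist[a][b];
 *
 *		if (max == dist[0][0]) {
 *			printf("direct\n");	// only the local distance exists
 *			return 0;
 *		}
 *
 *		for (a = 0; a < N; a++) {
 *			for (b = 0; b < N; b++) {
 *				if (dist[a][b] < max)
 *					continue;
 *				// a and b are a furthest-apart pair
 *				for (c = 0; c < N; c++) {
 *					if (dist[a][c] < max && dist[b][c] < max) {
 *						printf("glueless mesh\n");
 *						return 0;
 *					}
 *				}
 *				printf("backplane\n");
 *				return 0;
 *			}
 *		}
 *		return 0;
 *	}
 *
 * For this table the answer is "glueless mesh": nodes 0 and 2 are 30 apart,
 * but node 1 is only 20 away from each of them.
 */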
1111 
1112 void sched_init_numa(void)
1113 {
1114 	int next_distance, curr_distance = node_distance(0, 0);
1115 	struct sched_domain_topology_level *tl;
1116 	int level = 0;
1117 	int i, j, k;
1118 
1119 	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
1120 	if (!sched_domains_numa_distance)
1121 		return;
1122 
1123 	/*
1124 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
1125 	 * unique distances in the node_distance() table.
1126 	 *
1127 	 * Assumes node_distance(0,j) includes all distances in
1128 	 * node_distance(i,j) in order to avoid cubic time.
1129 	 */
1130 	next_distance = curr_distance;
1131 	for (i = 0; i < nr_node_ids; i++) {
1132 		for (j = 0; j < nr_node_ids; j++) {
1133 			for (k = 0; k < nr_node_ids; k++) {
1134 				int distance = node_distance(i, k);
1135 
1136 				if (distance > curr_distance &&
1137 				    (distance < next_distance ||
1138 				     next_distance == curr_distance))
1139 					next_distance = distance;
1140 
1141 				/*
1142 				 * While not a strong assumption, it would be nice to know
1143 				 * about cases where node A is connected to B but B is not
1144 				 * equally connected to A.
1145 				 */
1146 				if (sched_debug() && node_distance(k, i) != distance)
1147 					sched_numa_warn("Node-distance not symmetric");
1148 
1149 				if (sched_debug() && i && !find_numa_distance(distance))
1150 					sched_numa_warn("Node-0 not representative");
1151 			}
1152 			if (next_distance != curr_distance) {
1153 				sched_domains_numa_distance[level++] = next_distance;
1154 				sched_domains_numa_levels = level;
1155 				curr_distance = next_distance;
1156 			} else break;
1157 		}
1158 
1159 		/*
1160 		 * In case of sched_debug() we verify the above assumption.
1161 		 */
1162 		if (!sched_debug())
1163 			break;
1164 	}
1165 
1166 	if (!level)
1167 		return;
1168 
1169 	/*
1170 	 * 'level' contains the number of unique distances, excluding the
1171 	 * identity distance node_distance(i,i).
1172 	 *
1173 	 * The sched_domains_numa_distance[] array includes the actual distance
1174 	 * numbers.
1175 	 */
1176 
1177 	/*
1178 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
1179 	 * If we fail to allocate memory for the sched_domains_numa_masks[][]
1180 	 * array, it will contain fewer than 'level' members. This could be
1181 	 * dangerous when we use it to iterate over sched_domains_numa_masks[][]
1182 	 * in other functions.
1183 	 *
1184 	 * We reset it to 'level' at the end of this function.
1185 	 */
1186 	sched_domains_numa_levels = 0;
1187 
1188 	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
1189 	if (!sched_domains_numa_masks)
1190 		return;
1191 
1192 	/*
1193 	 * Now for each level, construct a mask per node which contains all
1194 	 * CPUs of nodes that are that many hops away from us.
1195 	 */
1196 	for (i = 0; i < level; i++) {
1197 		sched_domains_numa_masks[i] =
1198 			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1199 		if (!sched_domains_numa_masks[i])
1200 			return;
1201 
1202 		for (j = 0; j < nr_node_ids; j++) {
1203 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1204 			if (!mask)
1205 				return;
1206 
1207 			sched_domains_numa_masks[i][j] = mask;
1208 
1209 			for_each_node(k) {
1210 				if (node_distance(j, k) > sched_domains_numa_distance[i])
1211 					continue;
1212 
1213 				cpumask_or(mask, mask, cpumask_of_node(k));
1214 			}
1215 		}
1216 	}
1217 
1218 	/* Compute default topology size */
1219 	for (i = 0; sched_domain_topology[i].mask; i++);
1220 
1221 	tl = kzalloc((i + level + 1) *
1222 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1223 	if (!tl)
1224 		return;
1225 
1226 	/*
1227 	 * Copy the default topology bits..
1228 	 */
1229 	for (i = 0; sched_domain_topology[i].mask; i++)
1230 		tl[i] = sched_domain_topology[i];
1231 
1232 	/*
1233 	 * .. and append 'j' levels of NUMA goodness.
1234 	 */
1235 	for (j = 0; j < level; i++, j++) {
1236 		tl[i] = (struct sched_domain_topology_level){
1237 			.mask = sd_numa_mask,
1238 			.sd_flags = cpu_numa_flags,
1239 			.flags = SDTL_OVERLAP,
1240 			.numa_level = j,
1241 			SD_INIT_NAME(NUMA)
1242 		};
1243 	}
1244 
1245 	sched_domain_topology = tl;
1246 
1247 	sched_domains_numa_levels = level;
1248 	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
1249 
1250 	init_numa_topology_type();
1251 }
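
/*
 * A stand-alone sketch of the deduplicating selection sort that
 * sched_init_numa() uses to find the unique distances: repeatedly pick the
 * smallest distance strictly greater than the last one found (user-space C,
 * the 4-node table is invented):
 *
 *	#include <stdio.h>
 *
 *	#define N 4
 *	static const int dist[N][N] = {
 *		{ 10, 20, 30, 20 },
 *		{ 20, 10, 20, 30 },
 *		{ 30, 20, 10, 20 },
 *		{ 20, 30, 20, 10 },
 *	};
 *
 *	int main(void)
 *	{
 *		int curr = dist[0][0], next, i, j, level = 0;
 *
 *		for (;;) {
 *			next = curr;
 *			for (i = 0; i < N; i++)
 *				for (j = 0; j < N; j++)
 *					if (dist[i][j] > curr &&
 *					    (dist[i][j] < next || next == curr))
 *						next = dist[i][j];
 *			if (next == curr)
 *				break;		// no larger distance left
 *			printf("level %d: distance %d\n", level++, next);
 *			curr = next;
 *		}
 *		return 0;
 *	}
 *
 * This prints "level 0: distance 20" and "level 1: distance 30", i.e. two
 * NUMA levels beyond the local distance.
 */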
1252 
1253 void sched_domains_numa_masks_set(unsigned int cpu)
1254 {
1255 	int node = cpu_to_node(cpu);
1256 	int i, j;
1257 
1258 	for (i = 0; i < sched_domains_numa_levels; i++) {
1259 		for (j = 0; j < nr_node_ids; j++) {
1260 			if (node_distance(j, node) <= sched_domains_numa_distance[i])
1261 				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1262 		}
1263 	}
1264 }
1265 
1266 void sched_domains_numa_masks_clear(unsigned int cpu)
1267 {
1268 	int i, j;
1269 
1270 	for (i = 0; i < sched_domains_numa_levels; i++) {
1271 		for (j = 0; j < nr_node_ids; j++)
1272 			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1273 	}
1274 }
1275 
1276 #endif /* CONFIG_NUMA */
1277 
1278 static int __sdt_alloc(const struct cpumask *cpu_map)
1279 {
1280 	struct sched_domain_topology_level *tl;
1281 	int j;
1282 
1283 	for_each_sd_topology(tl) {
1284 		struct sd_data *sdd = &tl->data;
1285 
1286 		sdd->sd = alloc_percpu(struct sched_domain *);
1287 		if (!sdd->sd)
1288 			return -ENOMEM;
1289 
1290 		sdd->sds = alloc_percpu(struct sched_domain_shared *);
1291 		if (!sdd->sds)
1292 			return -ENOMEM;
1293 
1294 		sdd->sg = alloc_percpu(struct sched_group *);
1295 		if (!sdd->sg)
1296 			return -ENOMEM;
1297 
1298 		sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1299 		if (!sdd->sgc)
1300 			return -ENOMEM;
1301 
1302 		for_each_cpu(j, cpu_map) {
1303 			struct sched_domain *sd;
1304 			struct sched_domain_shared *sds;
1305 			struct sched_group *sg;
1306 			struct sched_group_capacity *sgc;
1307 
1308 			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
1309 					GFP_KERNEL, cpu_to_node(j));
1310 			if (!sd)
1311 				return -ENOMEM;
1312 
1313 			*per_cpu_ptr(sdd->sd, j) = sd;
1314 
1315 			sds = kzalloc_node(sizeof(struct sched_domain_shared),
1316 					GFP_KERNEL, cpu_to_node(j));
1317 			if (!sds)
1318 				return -ENOMEM;
1319 
1320 			*per_cpu_ptr(sdd->sds, j) = sds;
1321 
1322 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
1323 					GFP_KERNEL, cpu_to_node(j));
1324 			if (!sg)
1325 				return -ENOMEM;
1326 
1327 			sg->next = sg;
1328 
1329 			*per_cpu_ptr(sdd->sg, j) = sg;
1330 
1331 			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
1332 					GFP_KERNEL, cpu_to_node(j));
1333 			if (!sgc)
1334 				return -ENOMEM;
1335 
1336 			*per_cpu_ptr(sdd->sgc, j) = sgc;
1337 		}
1338 	}
1339 
1340 	return 0;
1341 }
1342 
1343 static void __sdt_free(const struct cpumask *cpu_map)
1344 {
1345 	struct sched_domain_topology_level *tl;
1346 	int j;
1347 
1348 	for_each_sd_topology(tl) {
1349 		struct sd_data *sdd = &tl->data;
1350 
1351 		for_each_cpu(j, cpu_map) {
1352 			struct sched_domain *sd;
1353 
1354 			if (sdd->sd) {
1355 				sd = *per_cpu_ptr(sdd->sd, j);
1356 				if (sd && (sd->flags & SD_OVERLAP))
1357 					free_sched_groups(sd->groups, 0);
1358 				kfree(*per_cpu_ptr(sdd->sd, j));
1359 			}
1360 
1361 			if (sdd->sds)
1362 				kfree(*per_cpu_ptr(sdd->sds, j));
1363 			if (sdd->sg)
1364 				kfree(*per_cpu_ptr(sdd->sg, j));
1365 			if (sdd->sgc)
1366 				kfree(*per_cpu_ptr(sdd->sgc, j));
1367 		}
1368 		free_percpu(sdd->sd);
1369 		sdd->sd = NULL;
1370 		free_percpu(sdd->sds);
1371 		sdd->sds = NULL;
1372 		free_percpu(sdd->sg);
1373 		sdd->sg = NULL;
1374 		free_percpu(sdd->sgc);
1375 		sdd->sgc = NULL;
1376 	}
1377 }
1378 
1379 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
1380 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
1381 		struct sched_domain *child, int cpu)
1382 {
1383 	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
1384 
1385 	if (child) {
1386 		sd->level = child->level + 1;
1387 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
1388 		child->parent = sd;
1389 
1390 		if (!cpumask_subset(sched_domain_span(child),
1391 				    sched_domain_span(sd))) {
1392 			pr_err("BUG: arch topology borken\n");
1393 #ifdef CONFIG_SCHED_DEBUG
1394 			pr_err("     the %s domain not a subset of the %s domain\n",
1395 					child->name, sd->name);
1396 #endif
1397 			/* Fix up: ensure @sd has at least the CPUs of @child. */
1398 			cpumask_or(sched_domain_span(sd),
1399 				   sched_domain_span(sd),
1400 				   sched_domain_span(child));
1401 		}
1402 
1403 	}
1404 	set_domain_attribute(sd, attr);
1405 
1406 	return sd;
1407 }
1408 
1409 /*
1410  * Build sched domains for a given set of CPUs and attach the sched domains
1411  * to the individual CPUs
1412  */
1413 static int
1414 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
1415 {
1416 	enum s_alloc alloc_state;
1417 	struct sched_domain *sd;
1418 	struct s_data d;
1419 	struct rq *rq = NULL;
1420 	int i, ret = -ENOMEM;
1421 
1422 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
1423 	if (alloc_state != sa_rootdomain)
1424 		goto error;
1425 
1426 	/* Set up domains for CPUs specified by the cpu_map: */
1427 	for_each_cpu(i, cpu_map) {
1428 		struct sched_domain_topology_level *tl;
1429 
1430 		sd = NULL;
1431 		for_each_sd_topology(tl) {
1432 			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
1433 			if (tl == sched_domain_topology)
1434 				*per_cpu_ptr(d.sd, i) = sd;
1435 			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
1436 				sd->flags |= SD_OVERLAP;
1437 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
1438 				break;
1439 		}
1440 	}
1441 
1442 	/* Build the groups for the domains */
1443 	for_each_cpu(i, cpu_map) {
1444 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
1445 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
1446 			if (sd->flags & SD_OVERLAP) {
1447 				if (build_overlap_sched_groups(sd, i))
1448 					goto error;
1449 			} else {
1450 				if (build_sched_groups(sd, i))
1451 					goto error;
1452 			}
1453 		}
1454 	}
1455 
1456 	/* Calculate CPU capacity for physical packages and nodes */
1457 	for (i = nr_cpumask_bits-1; i >= 0; i--) {
1458 		if (!cpumask_test_cpu(i, cpu_map))
1459 			continue;
1460 
1461 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
1462 			claim_allocations(i, sd);
1463 			init_sched_groups_capacity(i, sd);
1464 		}
1465 	}
1466 
1467 	/* Attach the domains */
1468 	rcu_read_lock();
1469 	for_each_cpu(i, cpu_map) {
1470 		rq = cpu_rq(i);
1471 		sd = *per_cpu_ptr(d.sd, i);
1472 
1473 		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
1474 		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
1475 			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
1476 
1477 		cpu_attach_domain(sd, d.rd, i);
1478 	}
1479 	rcu_read_unlock();
1480 
1481 	if (rq && sched_debug_enabled) {
1482 		pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
1483 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
1484 	}
1485 
1486 	ret = 0;
1487 error:
1488 	__free_domain_allocs(&d, alloc_state, cpu_map);
1489 	return ret;
1490 }
1491 
1492 /* Current sched domains: */
1493 static cpumask_var_t			*doms_cur;
1494 
1495 /* Number of sched domains in 'doms_cur': */
1496 static int				ndoms_cur;
1497 
1498 /* Attributes of custom domains in 'doms_cur': */
1499 static struct sched_domain_attr		*dattr_cur;
1500 
1501 /*
1502  * Special case: If a kmalloc() of a doms_cur partition (array of
1503  * cpumask) fails, then fall back to a single sched domain,
1504  * as determined by the single cpumask fallback_doms.
1505  */
1506 cpumask_var_t				fallback_doms;
1507 
1508 /*
1509  * arch_update_cpu_topology lets virtualized architectures update the
1510  * CPU core maps. It is supposed to return 1 if the topology changed
1511  * or 0 if it stayed the same.
1512  */
1513 int __weak arch_update_cpu_topology(void)
1514 {
1515 	return 0;
1516 }
1517 
1518 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
1519 {
1520 	int i;
1521 	cpumask_var_t *doms;
1522 
1523 	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
1524 	if (!doms)
1525 		return NULL;
1526 	for (i = 0; i < ndoms; i++) {
1527 		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
1528 			free_sched_domains(doms, i);
1529 			return NULL;
1530 		}
1531 	}
1532 	return doms;
1533 }
1534 
1535 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
1536 {
1537 	unsigned int i;
1538 	for (i = 0; i < ndoms; i++)
1539 		free_cpumask_var(doms[i]);
1540 	kfree(doms);
1541 }
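
/*
 * Note how alloc_sched_domains() unwinds on failure: it frees exactly the i
 * masks that were already allocated, then the array itself. The same
 * partial-unwind pattern in stand-alone form (user-space C, demo_* names are
 * made up):
 *
 *	#include <stdlib.h>
 *
 *	static void demo_free_bufs(char **bufs, unsigned int n)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < n; i++)
 *			free(bufs[i]);
 *		free(bufs);
 *	}
 *
 *	static char **demo_alloc_bufs(unsigned int n, size_t size)
 *	{
 *		char **bufs = malloc(sizeof(*bufs) * n);
 *		unsigned int i;
 *
 *		if (!bufs)
 *			return NULL;
 *		for (i = 0; i < n; i++) {
 *			bufs[i] = malloc(size);
 *			if (!bufs[i]) {
 *				demo_free_bufs(bufs, i);	// free only what exists
 *				return NULL;
 *			}
 *		}
 *		return bufs;
 *	}
 */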
1542 
1543 /*
1544  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
1545  * For now this just excludes isolated CPUs, but could be used to
1546  * exclude other special cases in the future.
1547  */
1548 int init_sched_domains(const struct cpumask *cpu_map)
1549 {
1550 	int err;
1551 
1552 	arch_update_cpu_topology();
1553 	ndoms_cur = 1;
1554 	doms_cur = alloc_sched_domains(ndoms_cur);
1555 	if (!doms_cur)
1556 		doms_cur = &fallback_doms;
1557 	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
1558 	err = build_sched_domains(doms_cur[0], NULL);
1559 	register_sched_domain_sysctl();
1560 
1561 	return err;
1562 }
1563 
1564 /*
1565  * Detach sched domains from a group of CPUs specified in cpu_map.
1566  * These CPUs will now be attached to the NULL domain.
1567  */
1568 static void detach_destroy_domains(const struct cpumask *cpu_map)
1569 {
1570 	int i;
1571 
1572 	rcu_read_lock();
1573 	for_each_cpu(i, cpu_map)
1574 		cpu_attach_domain(NULL, &def_root_domain, i);
1575 	rcu_read_unlock();
1576 }
1577 
1578 /* handle null as "default" */
1579 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1580 			struct sched_domain_attr *new, int idx_new)
1581 {
1582 	struct sched_domain_attr tmp;
1583 
1584 	/* Fast path: */
1585 	if (!new && !cur)
1586 		return 1;
1587 
1588 	tmp = SD_ATTR_INIT;
1589 	return !memcmp(cur ? (cur + idx_cur) : &tmp,
1590 			new ? (new + idx_new) : &tmp,
1591 			sizeof(struct sched_domain_attr));
1592 }
1593 
1594 /*
1595  * Partition sched domains as specified by the 'ndoms_new'
1596  * cpumasks in the array doms_new[] of cpumasks. This compares
1597  * doms_new[] to the current sched domain partitioning, doms_cur[].
1598  * It destroys each deleted domain and builds each new domain.
1599  *
1600  * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
1601  * The masks don't intersect (don't overlap). We should set up one
1602  * sched domain for each mask. CPUs not in any of the cpumasks will
1603  * not be load balanced. If the same cpumask appears both in the
1604  * current 'doms_cur' domains and in the new 'doms_new', we can leave
1605  * it as it is.
1606  *
1607  * The passed in 'doms_new' should be allocated using
1608  * alloc_sched_domains.  This routine takes ownership of it and will
1609  * free_sched_domains it when done with it. If the caller failed the
1610  * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
1611  * and partition_sched_domains() will fall back to the single partition
1612  * 'fallback_doms'; this also forces the domains to be rebuilt.
1613  *
1614  * If doms_new == NULL it will be replaced with cpu_active_mask (minus isolated CPUs).
1615  * ndoms_new == 0 is a special case for destroying existing domains,
1616  * and it will not create the default domain.
1617  *
1618  * Call with hotplug lock held
1619  */
1620 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1621 			     struct sched_domain_attr *dattr_new)
1622 {
1623 	int i, j, n;
1624 	int new_topology;
1625 
1626 	mutex_lock(&sched_domains_mutex);
1627 
1628 	/* Always unregister in case we don't destroy any domains: */
1629 	unregister_sched_domain_sysctl();
1630 
1631 	/* Let the architecture update CPU core mappings: */
1632 	new_topology = arch_update_cpu_topology();
1633 
1634 	n = doms_new ? ndoms_new : 0;
1635 
1636 	/* Destroy deleted domains: */
1637 	for (i = 0; i < ndoms_cur; i++) {
1638 		for (j = 0; j < n && !new_topology; j++) {
1639 			if (cpumask_equal(doms_cur[i], doms_new[j])
1640 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
1641 				goto match1;
1642 		}
1643 		/* No match - a current sched domain not in new doms_new[] */
1644 		detach_destroy_domains(doms_cur[i]);
1645 match1:
1646 		;
1647 	}
1648 
1649 	n = ndoms_cur;
1650 	if (doms_new == NULL) {
1651 		n = 0;
1652 		doms_new = &fallback_doms;
1653 		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
1654 		WARN_ON_ONCE(dattr_new);
1655 	}
1656 
1657 	/* Build new domains: */
1658 	for (i = 0; i < ndoms_new; i++) {
1659 		for (j = 0; j < n && !new_topology; j++) {
1660 			if (cpumask_equal(doms_new[i], doms_cur[j])
1661 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
1662 				goto match2;
1663 		}
1664 		/* No match - add a new doms_new */
1665 		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
1666 match2:
1667 		;
1668 	}
1669 
1670 	/* Remember the new sched domains: */
1671 	if (doms_cur != &fallback_doms)
1672 		free_sched_domains(doms_cur, ndoms_cur);
1673 
1674 	kfree(dattr_cur);
1675 	doms_cur = doms_new;
1676 	dattr_cur = dattr_new;
1677 	ndoms_cur = ndoms_new;
1678 
1679 	register_sched_domain_sysctl();
1680 
1681 	mutex_unlock(&sched_domains_mutex);
1682 }
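
/*
 * A stand-alone sketch of the two-pass reconciliation partition_sched_domains()
 * performs: tear down every current entry with no identical counterpart in the
 * new set, then build every new entry with no identical counterpart in the old
 * set, leaving unchanged entries alone (user-space C over plain ints, demo_*
 * names are made up):
 *
 *	#include <stdio.h>
 *
 *	static int demo_contains(const int *set, int n, int val)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++)
 *			if (set[i] == val)
 *				return 1;
 *		return 0;
 *	}
 *
 *	static void demo_reconcile(const int *cur, int ncur,
 *				   const int *new_set, int nnew)
 *	{
 *		int i;
 *
 *		for (i = 0; i < ncur; i++)	// pass 1: destroy deleted
 *			if (!demo_contains(new_set, nnew, cur[i]))
 *				printf("destroy %d\n", cur[i]);
 *
 *		for (i = 0; i < nnew; i++)	// pass 2: build added
 *			if (!demo_contains(cur, ncur, new_set[i]))
 *				printf("build %d\n", new_set[i]);
 *	}
 *
 *	int main(void)
 *	{
 *		int cur[] = { 1, 2, 3 };
 *		int new_set[] = { 2, 3, 4 };
 *
 *		demo_reconcile(cur, 3, new_set, 3);	// destroys 1, builds 4
 *		return 0;
 *	}
 */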
1683 
1684