kernel/sched/topology.c

1 // SPDX-License-Identifier: GPL-2.0
38 	struct sched_group *group = sd->groups;  in sched_domain_debug_one()
39 	unsigned long flags = sd->flags;  in sched_domain_debug_one()
44 	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);  in sched_domain_debug_one()
46 	       cpumask_pr_args(sched_domain_span(sd)), sd->name);  in sched_domain_debug_one()
49 		printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);  in sched_domain_debug_one()
52 		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);  in sched_domain_debug_one()
59 		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&  in sched_domain_debug_one()
60 		    !(sd->child->flags & flag))  in sched_domain_debug_one()
64 		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&  in sched_domain_debug_one()
65 		    !(sd->parent->flags & flag))  in sched_domain_debug_one()
84 		if (!(sd->flags & SD_OVERLAP) &&  in sched_domain_debug_one()
94 				group->sgc->id,  in sched_domain_debug_one()
97 		if ((sd->flags & SD_OVERLAP) &&  in sched_domain_debug_one()
103 		if (group->sgc->capacity != SCHED_CAPACITY_SCALE)  in sched_domain_debug_one()
104 			printk(KERN_CONT " cap=%lu", group->sgc->capacity);  in sched_domain_debug_one()
106 		if (group == sd->groups && sd->child &&  in sched_domain_debug_one()
107 		    !cpumask_equal(sched_domain_span(sd->child),  in sched_domain_debug_one()
109 			printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");  in sched_domain_debug_one()
114 		group = group->next;  in sched_domain_debug_one()
116 		if (group != sd->groups)  in sched_domain_debug_one()
119 	} while (group != sd->groups);  in sched_domain_debug_one()
123 		printk(KERN_ERR "ERROR: groups don't span domain->span\n");  in sched_domain_debug_one()
125 	if (sd->parent &&  in sched_domain_debug_one()
126 	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))  in sched_domain_debug_one()
127 		printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");  in sched_domain_debug_one()
139 		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);  in sched_domain_debug()
143 	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);  in sched_domain_debug()
149 		sd = sd->parent;  in sched_domain_debug()
177 	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&  in sd_degenerate()
178 	    (sd->groups != sd->groups->next))  in sd_degenerate()
182 	if (sd->flags & (SD_WAKE_AFFINE))  in sd_degenerate()
191 	unsigned long cflags = sd->flags, pflags = parent->flags;  in sd_parent_degenerate()
200 	if (parent->groups == parent->groups->next)  in sd_parent_degenerate()
231 		return -EPERM;  in sched_energy_aware_handler()
270 		tmp = pd->next;  in free_pd()
281 		pd = pd->next;  in find_pd()
301 	pd->em_pd = obj;  in pd_init()
318 				em_pd_nr_perf_states(pd->em_pd));  in perf_domain_debug()
319 		pd = pd->next;  in perf_domain_debug()
360  *  - nr_pd:    the number of performance domains
361  *  - nr_cpus:  the number of CPUs
362  *  - nr_ps:    the sum of the number of performance states of all performance
366  * It is generally not a good idea to use such a model in the wake-up path on
369  * with per-CPU DVFS and less than 8 performance states each, for example.
379 	struct root_domain *rd = cpu_rq(cpu)->rd;  in build_perf_domains()
404 			pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",  in build_perf_domains()
419 		gov = policy->governor;  in build_perf_domains()
422 			if (rd->pd)  in build_perf_domains()
432 		tmp->next = pd;  in build_perf_domains()
440 		nr_ps += em_pd_nr_perf_states(pd->em_pd);  in build_perf_domains()
443 	/* Bail out if the Energy Model complexity is too high. */  in build_perf_domains()
453 	tmp = rd->pd;  in build_perf_domains()
454 	rcu_assign_pointer(rd->pd, pd);  in build_perf_domains()
456 		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);  in build_perf_domains()
462 	tmp = rd->pd;  in build_perf_domains()
463 	rcu_assign_pointer(rd->pd, NULL);  in build_perf_domains()
465 		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);  in build_perf_domains()
477 	cpupri_cleanup(&rd->cpupri);  in free_rootdomain()
478 	cpudl_cleanup(&rd->cpudl);  in free_rootdomain()
479 	free_cpumask_var(rd->dlo_mask);  in free_rootdomain()
480 	free_cpumask_var(rd->rto_mask);  in free_rootdomain()
481 	free_cpumask_var(rd->online);  in free_rootdomain()
482 	free_cpumask_var(rd->span);  in free_rootdomain()
483 	free_pd(rd->pd);  in free_rootdomain()
494 	if (rq->rd) {  in rq_attach_root()
495 		old_rd = rq->rd;  in rq_attach_root()
497 		if (cpumask_test_cpu(rq->cpu, old_rd->online))  in rq_attach_root()
500 		cpumask_clear_cpu(rq->cpu, old_rd->span);  in rq_attach_root()
507 		if (!atomic_dec_and_test(&old_rd->refcount))  in rq_attach_root()
511 	atomic_inc(&rd->refcount);  in rq_attach_root()
512 	rq->rd = rd;  in rq_attach_root()
514 	cpumask_set_cpu(rq->cpu, rd->span);  in rq_attach_root()
515 	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))  in rq_attach_root()
521 		call_rcu(&old_rd->rcu, free_rootdomain);  in rq_attach_root()
526 	atomic_inc(&rd->refcount);  in sched_get_rd()
531 	if (!atomic_dec_and_test(&rd->refcount))  in sched_put_rd()
534 	call_rcu(&rd->rcu, free_rootdomain);  in sched_put_rd()
539 	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))  in init_rootdomain()
540 		goto out;  in init_rootdomain()
541 	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))  in init_rootdomain()
543 	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))  in init_rootdomain()
545 	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))  in init_rootdomain()
549 	rd->rto_cpu = -1;  in init_rootdomain()
550 	raw_spin_lock_init(&rd->rto_lock);  in init_rootdomain()
551 	rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);  in init_rootdomain()
554 	rd->visit_gen = 0;  in init_rootdomain()
555 	init_dl_bw(&rd->dl_bw);  in init_rootdomain()
556 	if (cpudl_init(&rd->cpudl) != 0)  in init_rootdomain()
559 	if (cpupri_init(&rd->cpupri) != 0)  in init_rootdomain()
564 	cpudl_cleanup(&rd->cpudl);  in init_rootdomain()
566 	free_cpumask_var(rd->rto_mask);  in init_rootdomain()
568 	free_cpumask_var(rd->dlo_mask);  in init_rootdomain()
570 	free_cpumask_var(rd->online);  in init_rootdomain()
572 	free_cpumask_var(rd->span);  in init_rootdomain()
573 out:  in init_rootdomain()
574 	return -ENOMEM;  in init_rootdomain()
578  * By default the system creates a single root-domain with all CPUs as
615 		tmp = sg->next;  in free_sched_groups()
617 		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))  in free_sched_groups()
618 			kfree(sg->sgc);  in free_sched_groups()
620 		if (atomic_dec_and_test(&sg->ref))  in free_sched_groups()
633 	free_sched_groups(sd->groups, 1);  in destroy_sched_domain()
635 	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))  in destroy_sched_domain()
636 		kfree(sd->shared);  in destroy_sched_domain()
645 		struct sched_domain *parent = sd->parent;  in destroy_sched_domains_rcu()
654 		call_rcu(&sd->rcu, destroy_sched_domains_rcu);  in destroy_sched_domains()
686 		sds = sd->shared;  in update_top_cache_domain()
716 		struct sched_domain *parent = tmp->parent;  in cpu_attach_domain()
721 			tmp->parent = parent->parent;  in cpu_attach_domain()
723 			if (parent->parent) {  in cpu_attach_domain()
724 				parent->parent->child = tmp;  in cpu_attach_domain()
725 				parent->parent->groups->flags = tmp->flags;  in cpu_attach_domain()
733 			if (parent->flags & SD_PREFER_SIBLING)  in cpu_attach_domain()
734 				tmp->flags |= SD_PREFER_SIBLING;  in cpu_attach_domain()
737 			tmp = tmp->parent;  in cpu_attach_domain()
742 		sd = sd->parent;  in cpu_attach_domain()
745 			struct sched_group *sg = sd->groups;  in cpu_attach_domain()
753 				sg->flags = 0;  in cpu_attach_domain()
754 			} while (sg != sd->groups);  in cpu_attach_domain()
756 			sd->child = NULL;  in cpu_attach_domain()
763 	tmp = rq->sd;  in cpu_attach_domain()
764 	rcu_assign_pointer(rq->sd, sd);  in cpu_attach_domain()
801  * Given a node-distance table, for example:
811  *   0 ----- 1
815  *   3 ----- 2
823  * NUMA-2	0-3		0-3		0-3		0-3
824  *  groups:	{0-1,3},{1-3}	{0-2},{0,2-3}	{1-3},{0-1,3}	{0,2-3},{0-2}
826  * NUMA-1	0-1,3		0-2		1-3		0,2-3
829  * NUMA-0	0		1		2		3
834  * represented multiple times -- hence the "overlap" naming for this part of
838  * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
842  *  - the first group of each domain is its child domain; this
843  *    gets us the first 0-1,3
844  *  - the only uncovered node is 2, whose child domain is 1-3.
847  * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
848  * groups include the CPUs of Node-0, while those CPUs would not in fact ever
849  * end up at those groups (they would end up in group: 0-1,3).
872  *   0 ----- 1
876  *   2 ----- 3
884  * NUMA-2	0-3						0-3
885  *  groups:	{0-2},{1-3}					{1-3},{0-2}
887  * NUMA-1	0-2		0-3		0-3		1-3
889  * NUMA-0	0		1		2		3
907 	struct sd_data *sdd = sd->private;  in build_balance_mask()
914 		sibling = *per_cpu_ptr(sdd->sd, i);  in build_balance_mask()
921 		if (!sibling->child)  in build_balance_mask()
925 		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))  in build_balance_mask()
931 	/* We must not have empty masks here */  in build_balance_mask()
936  * XXX: This creates per-node group entries; since the load-balancer will
937  * immediately access remote memory to construct this group's load-balance
953 	if (sd->child) {  in build_group_from_child_sched_domain()
954 		cpumask_copy(sg_span, sched_domain_span(sd->child));  in build_group_from_child_sched_domain()
955 		sg->flags = sd->child->flags;  in build_group_from_child_sched_domain()
960 	atomic_inc(&sg->ref);  in build_group_from_child_sched_domain()
968 	struct sd_data *sdd = sd->private;  in init_overlap_sched_group()
975 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);  in init_overlap_sched_group()
976 	if (atomic_inc_return(&sg->sgc->ref) == 1)  in init_overlap_sched_group()
982 	 * Initialize sgc->capacity such that even if we mess up the  in init_overlap_sched_group()
987 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);  in init_overlap_sched_group()
988 	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;  in init_overlap_sched_group()
989 	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;  in init_overlap_sched_group()
996 	 * The proper descendant would be the one whose child won't span out  in find_descended_sibling()
999 	while (sibling->child &&  in find_descended_sibling()
1000 	       !cpumask_subset(sched_domain_span(sibling->child),  in find_descended_sibling()
1002 		sibling = sibling->child;  in find_descended_sibling()
1009 	while (sibling->child &&  in find_descended_sibling()
1010 	       cpumask_equal(sched_domain_span(sibling->child),  in find_descended_sibling()
1012 		sibling = sibling->child;  in find_descended_sibling()
1023 	struct sd_data *sdd = sd->private;  in build_overlap_sched_groups()
1035 		sibling = *per_cpu_ptr(sdd->sd, i);  in build_overlap_sched_groups()
1054 		 * domain because sibling's child sched_domain will span out of  in build_overlap_sched_groups()
1065 		 *   0 --- 1 --- 2 --- 3  in build_overlap_sched_groups()
1067 		 * NUMA-3       0-3             N/A             N/A             0-3  in build_overlap_sched_groups()
1068 		 *  groups:     {0-2},{1-3}                                     {1-3},{0-2}  in build_overlap_sched_groups()
1070 		 * NUMA-2       0-2             0-3             0-3             1-3  in build_overlap_sched_groups()
1071 		 *  groups:     {0-1},{1-3}     {0-2},{2-3}     {1-3},{0-1}     {2-3},{0-2}  in build_overlap_sched_groups()
1073 		 * NUMA-1       0-1             0-2             1-3             2-3  in build_overlap_sched_groups()
1076 		 * NUMA-0       0               1               2               3  in build_overlap_sched_groups()
1078 		 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the  in build_overlap_sched_groups()
1081 		if (sibling->child &&  in build_overlap_sched_groups()
1082 		    !cpumask_subset(sched_domain_span(sibling->child), span))  in build_overlap_sched_groups()
1097 			last->next = sg;  in build_overlap_sched_groups()
1099 		last->next = first;  in build_overlap_sched_groups()
1101 	sd->groups = first;  in build_overlap_sched_groups()
1108 	return -ENOMEM;  in build_overlap_sched_groups()
1113  * Package topology (also see the load-balance blurb in fair.c)
1118  *  - Simultaneous multithreading (SMT)
1119  *  - Multi-Core Cache (MC)
1120  *  - Package (PKG)
1126  *	sched_domain -> sched_group -> sched_group_capacity
1128  *          `-'             `-'
1130  * The sched_domains are per-CPU and have a two way link (parent & child) and
1146  *  - or -
1148  * PKG  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
1149  * MC	0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
1150  * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
1163  *  - The first is the balance_cpu (see should_we_balance() and the
1164  *    load-balance blurb in fair.c); for each group we only want 1 CPU to
1167  *  - The second is the sched_group_capacity; we want all identical groups
1185 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);  in get_group()
1186 	struct sched_domain *child = sd->child;  in get_group()
1193 	sg = *per_cpu_ptr(sdd->sg, cpu);  in get_group()
1194 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);  in get_group()
1197 	already_visited = atomic_inc_return(&sg->ref) > 1;  in get_group()
1199 	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));  in get_group()
1208 		sg->flags = child->flags;  in get_group()
1214 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));  in get_group()
1215 	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;  in get_group()
1216 	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;  in get_group()
1223  * covered by the given span, will set each group's ->cpumask correctly,
1224  * and will initialize their ->sgc.
1232 	struct sd_data *sdd = sd->private;  in build_sched_groups()
1255 			last->next = sg;  in build_sched_groups()
1258 	last->next = first;  in build_sched_groups()
1259 	sd->groups = first;  in build_sched_groups()
1276 	struct sched_group *sg = sd->groups;  in init_sched_groups_capacity()
1282 		int cpu, cores = 0, max_cpu = -1;  in init_sched_groups_capacity()
1284 		sg->group_weight = cpumask_weight(sched_group_span(sg));  in init_sched_groups_capacity()
1293 		sg->cores = cores;  in init_sched_groups_capacity()
1295 		if (!(sd->flags & SD_ASYM_PACKING))  in init_sched_groups_capacity()
1304 		sg->asym_prefer_cpu = max_cpu;  in init_sched_groups_capacity()
1307 		sg = sg->next;  in init_sched_groups_capacity()
1308 	} while (sg != sd->groups);  in init_sched_groups_capacity()
1333 #define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
1379 		if (capacity == entry->capacity)  in asym_cpu_capacity_update_data()
1386 	entry->capacity = capacity;  in asym_cpu_capacity_update_data()
1387 	list_add(&entry->link, &asym_cap_list);  in asym_cpu_capacity_update_data()
1393  * Build-up/update list of CPUs grouped by their capacities
1410 			list_del(&entry->link);  in asym_cpu_capacity_scan()
1421 		list_del(&entry->link);  in asym_cpu_capacity_scan()
1428  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
1431 static int default_relax_domain_level = -1;
1448 	if (!attr || attr->relax_domain_level < 0) {  in set_domain_attribute()
1453 		request = attr->relax_domain_level;  in set_domain_attribute()
1455 	if (sd->level >= request) {  in set_domain_attribute()
1457 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);  in set_domain_attribute()
1469 		if (!atomic_read(&d->rd->refcount))  in __free_domain_allocs()
1470 			free_rootdomain(&d->rd->rcu);  in __free_domain_allocs()
1473 		free_percpu(d->sd);  in __free_domain_allocs()
1490 	d->sd = alloc_percpu(struct sched_domain *);  in __visit_domain_allocation_hell()
1491 	if (!d->sd)  in __visit_domain_allocation_hell()
1493 	d->rd = alloc_rootdomain();  in __visit_domain_allocation_hell()
1494 	if (!d->rd)  in __visit_domain_allocation_hell()
1507 	struct sd_data *sdd = sd->private;  in claim_allocations()
1509 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);  in claim_allocations()
1510 	*per_cpu_ptr(sdd->sd, cpu) = NULL;  in claim_allocations()
1512 	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))  in claim_allocations()
1513 		*per_cpu_ptr(sdd->sds, cpu) = NULL;  in claim_allocations()
1515 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))  in claim_allocations()
1516 		*per_cpu_ptr(sdd->sg, cpu) = NULL;  in claim_allocations()
1518 	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))  in claim_allocations()
1519 		*per_cpu_ptr(sdd->sgc, cpu) = NULL;  in claim_allocations()
1540  *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
1541  *   SD_SHARE_PKG_RESOURCES - describes shared caches
1542  *   SD_NUMA                - describes NUMA topologies
1544  * Odd one out, which beside describing the topology has a quirk also
1547  *   SD_ASYM_PACKING        - describes SMT quirks
1560 	struct sd_data *sdd = &tl->data;  in sd_init()
1561 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);  in sd_init()
1569 	sched_domains_curr_level = tl->numa_level;  in sd_init()
1572 	sd_weight = cpumask_weight(tl->mask(cpu));  in sd_init()
1574 	if (tl->sd_flags)  in sd_init()
1575 		sd_flags = (*tl->sd_flags)();  in sd_init()
1607 		.name			= tl->name,  in sd_init()
1612 	cpumask_and(sd_span, cpu_map, tl->mask(cpu));  in sd_init()
1615 	sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);  in sd_init()
1617 	WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==  in sd_init()
1625 	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)  in sd_init()
1626 		sd->child->flags &= ~SD_PREFER_SIBLING;  in sd_init()
1628 	if (sd->flags & SD_SHARE_CPUCAPACITY) {  in sd_init()
1629 		sd->imbalance_pct = 110;  in sd_init()
1631 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {  in sd_init()
1632 		sd->imbalance_pct = 117;  in sd_init()
1633 		sd->cache_nice_tries = 1;  in sd_init()
1636 	} else if (sd->flags & SD_NUMA) {  in sd_init()
1637 		sd->cache_nice_tries = 2;  in sd_init()
1639 		sd->flags &= ~SD_PREFER_SIBLING;  in sd_init()
1640 		sd->flags |= SD_SERIALIZE;  in sd_init()
1641 		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {  in sd_init()
1642 			sd->flags &= ~(SD_BALANCE_EXEC |  in sd_init()
1649 		sd->cache_nice_tries = 1;  in sd_init()
1656 	if (sd->flags & SD_SHARE_PKG_RESOURCES) {  in sd_init()
1657 		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);  in sd_init()
1658 		atomic_inc(&sd->shared->ref);  in sd_init()
1659 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);  in sd_init()
1662 	sd->private = sdd;  in sd_init()
1668  * Topology list, bottom-up.
1691 	for (tl = sched_domain_topology; tl->mask; tl++)
1777  * - If the maximum distance between any nodes is 1 hop, the system
1779  * - If for two nodes A and B, located N > 1 hops away from each other,
1829 	struct cpumask ***masks;  in sched_init_numa()  local
1832 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the  in sched_init_numa()
1854 	 * We can now figure out how many unique distance values there are and  in sched_init_numa()
1891 	masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);  in sched_init_numa()
1892 	if (!masks)  in sched_init_numa()
1900 		masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);  in sched_init_numa()
1901 		if (!masks[i])  in sched_init_numa()
1911 			masks[i][j] = mask;  in sched_init_numa()
1915 					sched_numa_warn("Node-distance not symmetric");  in sched_init_numa()
1924 	rcu_assign_pointer(sched_domains_numa_masks, masks);  in sched_init_numa()
1966 	WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);  in sched_init_numa()
1975 	struct cpumask ***masks;  in sched_reset_numa()  local
1983 	masks = sched_domains_numa_masks;  in sched_reset_numa()
1985 	if (distances || masks) {  in sched_reset_numa()
1990 		for (i = 0; i < nr_levels && masks; i++) {  in sched_reset_numa()
1991 			if (!masks[i])  in sched_reset_numa()
1994 				kfree(masks[i][j]);  in sched_reset_numa()
1995 			kfree(masks[i]);  in sched_reset_numa()
1997 		kfree(masks);  in sched_reset_numa()
2035 			/* Set ourselves in the remote node's masks */  in sched_domains_numa_masks_set()
2055  * sched_numa_find_closest() - given the NUMA topology, find the cpu
2065 	struct cpumask ***masks;  in sched_numa_find_closest()  local
2068 	masks = rcu_dereference(sched_domains_numa_masks);  in sched_numa_find_closest()
2069 	if (!masks)  in sched_numa_find_closest()
2072 		if (!masks[i][j])  in sched_numa_find_closest()
2074 		cpu = cpumask_any_and(cpus, masks[i][j]);  in sched_numa_find_closest()
2088 	struct cpumask ***masks;  member
2099 	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)  in hop_cmp()
2102 	if (b == k->masks) {  in hop_cmp()
2103 		k->w = 0;  in hop_cmp()
2107 	prev_hop = *((struct cpumask ***)b - 1);  in hop_cmp()
2108 	k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);  in hop_cmp()
2109 	if (k->w <= k->cpu)  in hop_cmp()
2112 	return -1;  in hop_cmp()
2116  * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
2134 	/* CPU-less node entries are uninitialized in sched_domains_numa_masks */  in sched_numa_find_nth_cpu()
2138 	k.masks = rcu_dereference(sched_domains_numa_masks);  in sched_numa_find_nth_cpu()
2139 	if (!k.masks)  in sched_numa_find_nth_cpu()
2142 	hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);  in sched_numa_find_nth_cpu()
2143 	hop = hop_masks	- k.masks;  in sched_numa_find_nth_cpu()
2146 		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :  in sched_numa_find_nth_cpu()
2147 		cpumask_nth_and(cpu, cpus, k.masks[0][node]);  in sched_numa_find_nth_cpu()
2155  * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
2164  * read-side section, copy it if required beyond that.
2167  * distances and masks are handled.
2169  * during the lifetime of the system (offline nodes are taken out of the masks).
2173 	struct cpumask ***masks;  in sched_numa_hop_mask()  local
2176 		return ERR_PTR(-EINVAL);  in sched_numa_hop_mask()
2178 	masks = rcu_dereference(sched_domains_numa_masks);  in sched_numa_hop_mask()
2179 	if (!masks)  in sched_numa_hop_mask()
2180 		return ERR_PTR(-EBUSY);  in sched_numa_hop_mask()
2182 	return masks[hops][node];  in sched_numa_hop_mask()
2194 		struct sd_data *sdd = &tl->data;  in __sdt_alloc()
2196 		sdd->sd = alloc_percpu(struct sched_domain *);  in __sdt_alloc()
2197 		if (!sdd->sd)  in __sdt_alloc()
2198 			return -ENOMEM;  in __sdt_alloc()
2200 		sdd->sds = alloc_percpu(struct sched_domain_shared *);  in __sdt_alloc()
2201 		if (!sdd->sds)  in __sdt_alloc()
2202 			return -ENOMEM;  in __sdt_alloc()
2204 		sdd->sg = alloc_percpu(struct sched_group *);  in __sdt_alloc()
2205 		if (!sdd->sg)  in __sdt_alloc()
2206 			return -ENOMEM;  in __sdt_alloc()
2208 		sdd->sgc = alloc_percpu(struct sched_group_capacity *);  in __sdt_alloc()
2209 		if (!sdd->sgc)  in __sdt_alloc()
2210 			return -ENOMEM;  in __sdt_alloc()
2221 				return -ENOMEM;  in __sdt_alloc()
2223 			*per_cpu_ptr(sdd->sd, j) = sd;  in __sdt_alloc()
2228 				return -ENOMEM;  in __sdt_alloc()
2230 			*per_cpu_ptr(sdd->sds, j) = sds;  in __sdt_alloc()
2235 				return -ENOMEM;  in __sdt_alloc()
2237 			sg->next = sg;  in __sdt_alloc()
2239 			*per_cpu_ptr(sdd->sg, j) = sg;  in __sdt_alloc()
2244 				return -ENOMEM;  in __sdt_alloc()
2247 			sgc->id = j;  in __sdt_alloc()
2250 			*per_cpu_ptr(sdd->sgc, j) = sgc;  in __sdt_alloc()
2263 		struct sd_data *sdd = &tl->data;  in __sdt_free()
2268 			if (sdd->sd) {  in __sdt_free()
2269 				sd = *per_cpu_ptr(sdd->sd, j);  in __sdt_free()
2270 				if (sd && (sd->flags & SD_OVERLAP))  in __sdt_free()
2271 					free_sched_groups(sd->groups, 0);  in __sdt_free()
2272 				kfree(*per_cpu_ptr(sdd->sd, j));  in __sdt_free()
2275 			if (sdd->sds)  in __sdt_free()
2276 				kfree(*per_cpu_ptr(sdd->sds, j));  in __sdt_free()
2277 			if (sdd->sg)  in __sdt_free()
2278 				kfree(*per_cpu_ptr(sdd->sg, j));  in __sdt_free()
2279 			if (sdd->sgc)  in __sdt_free()
2280 				kfree(*per_cpu_ptr(sdd->sgc, j));  in __sdt_free()
2282 		free_percpu(sdd->sd);  in __sdt_free()
2283 		sdd->sd = NULL;  in __sdt_free()
2284 		free_percpu(sdd->sds);  in __sdt_free()
2285 		sdd->sds = NULL;  in __sdt_free()
2286 		free_percpu(sdd->sg);  in __sdt_free()
2287 		sdd->sg = NULL;  in __sdt_free()
2288 		free_percpu(sdd->sgc);  in __sdt_free()
2289 		sdd->sgc = NULL;  in __sdt_free()
2300 		sd->level = child->level + 1;  in build_sched_domain()
2301 		sched_domain_level_max = max(sched_domain_level_max, sd->level);  in build_sched_domain()
2302 		child->parent = sd;  in build_sched_domain()
2309 					child->name, sd->name);  in build_sched_domain()
2324  * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
2325  * any two given CPUs at this (non-NUMA) topology level.
2333 	if (tl->flags & SDTL_OVERLAP)  in topology_span_sane()
2337 	 * Non-NUMA levels cannot partially overlap - they must be either  in topology_span_sane()
2339 	 * breaking the sched_group lists - i.e. a later get_group() pass  in topology_span_sane()
2346 		 * We should 'and' all those masks with 'cpu_map' to exactly  in topology_span_sane()
2351 		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&  in topology_span_sane()
2352 		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))  in topology_span_sane()
2370 	int i, ret = -ENOMEM;  in build_sched_domains()
2392 			has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;  in build_sched_domains()
2396 			if (tl->flags & SDTL_OVERLAP)  in build_sched_domains()
2397 				sd->flags |= SD_OVERLAP;  in build_sched_domains()
2405 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {  in build_sched_domains()
2406 			sd->span_weight = cpumask_weight(sched_domain_span(sd));  in build_sched_domains()
2407 			if (sd->flags & SD_OVERLAP) {  in build_sched_domains()
2425 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {  in build_sched_domains()
2426 			struct sched_domain *child = sd->child;  in build_sched_domains()
2428 			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&  in build_sched_domains()
2429 			    (child->flags & SD_SHARE_PKG_RESOURCES)) {  in build_sched_domains()
2436 				 * arbitrary cutoff based on two factors -- SMT and  in build_sched_domains()
2437 				 * memory channels. For SMT-2, the intent is to  in build_sched_domains()
2439 				 * SMT-4 or SMT-8 *may* benefit from a different  in build_sched_domains()
2453 				nr_llcs = sd->span_weight / child->span_weight;  in build_sched_domains()
2455 					imb = sd->span_weight >> 3;  in build_sched_domains()
2459 				sd->imb_numa_nr = imb;  in build_sched_domains()
2462 				top_p = sd->parent;  in build_sched_domains()
2463 				while (top_p && !(top_p->flags & SD_NUMA)) {  in build_sched_domains()
2464 					top_p = top_p->parent;  in build_sched_domains()
2466 				imb_span = top_p ? top_p->span_weight : sd->span_weight;  in build_sched_domains()
2468 				int factor = max(1U, (sd->span_weight / imb_span));  in build_sched_domains()
2470 				sd->imb_numa_nr = imb * factor;  in build_sched_domains()
2476 	for (i = nr_cpumask_bits-1; i >= 0; i--) {  in build_sched_domains()
2480 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {  in build_sched_domains()
2496 		if (capacity > READ_ONCE(d.rd->max_cpu_capacity))  in build_sched_domains()
2497 			WRITE_ONCE(d.rd->max_cpu_capacity, capacity);  in build_sched_domains()
2508 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);  in build_sched_domains()
2635  * The masks don't intersect (don't overlap.) We should setup one
2691 				 * its dl_bw->total_bw needs to be cleared.  It  in partition_sched_domains_locked()
2695 				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;  in partition_sched_domains_locked()
2700 		/* No match - a current sched domain not in new doms_new[] */  in partition_sched_domains_locked()
2721 		/* No match - add a new doms_new */  in partition_sched_domains_locked()
2732 			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {  in partition_sched_domains_locked()
2737 		/* No match - add perf. domains for a new rd */  in partition_sched_domains_locked()