xref: /openbmc/linux/lib/group_cpus.c (revision 39f603a2)
1f7b3ea8cSMing Lei // SPDX-License-Identifier: GPL-2.0
2f7b3ea8cSMing Lei /*
3f7b3ea8cSMing Lei  * Copyright (C) 2016 Thomas Gleixner.
4f7b3ea8cSMing Lei  * Copyright (C) 2016-2017 Christoph Hellwig.
5f7b3ea8cSMing Lei  */
6f7b3ea8cSMing Lei #include <linux/kernel.h>
7f7b3ea8cSMing Lei #include <linux/slab.h>
8f7b3ea8cSMing Lei #include <linux/cpu.h>
9f7b3ea8cSMing Lei #include <linux/sort.h>
10f7b3ea8cSMing Lei #include <linux/group_cpus.h>
11f7b3ea8cSMing Lei 
12188a5696SIngo Molnar #ifdef CONFIG_SMP
13188a5696SIngo Molnar 
grp_spread_init_one(struct cpumask * irqmsk,struct cpumask * nmsk,unsigned int cpus_per_grp)14f7b3ea8cSMing Lei static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
15f7b3ea8cSMing Lei 				unsigned int cpus_per_grp)
16f7b3ea8cSMing Lei {
17f7b3ea8cSMing Lei 	const struct cpumask *siblmsk;
18f7b3ea8cSMing Lei 	int cpu, sibl;
19f7b3ea8cSMing Lei 
20f7b3ea8cSMing Lei 	for ( ; cpus_per_grp > 0; ) {
21f7b3ea8cSMing Lei 		cpu = cpumask_first(nmsk);
22f7b3ea8cSMing Lei 
23f7b3ea8cSMing Lei 		/* Should not happen, but I'm too lazy to think about it */
24f7b3ea8cSMing Lei 		if (cpu >= nr_cpu_ids)
25f7b3ea8cSMing Lei 			return;
26f7b3ea8cSMing Lei 
27f7b3ea8cSMing Lei 		cpumask_clear_cpu(cpu, nmsk);
28f7b3ea8cSMing Lei 		cpumask_set_cpu(cpu, irqmsk);
29f7b3ea8cSMing Lei 		cpus_per_grp--;
30f7b3ea8cSMing Lei 
31f7b3ea8cSMing Lei 		/* If the cpu has siblings, use them first */
32f7b3ea8cSMing Lei 		siblmsk = topology_sibling_cpumask(cpu);
33f7b3ea8cSMing Lei 		for (sibl = -1; cpus_per_grp > 0; ) {
34f7b3ea8cSMing Lei 			sibl = cpumask_next(sibl, siblmsk);
35f7b3ea8cSMing Lei 			if (sibl >= nr_cpu_ids)
36f7b3ea8cSMing Lei 				break;
37f7b3ea8cSMing Lei 			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
38f7b3ea8cSMing Lei 				continue;
39f7b3ea8cSMing Lei 			cpumask_set_cpu(sibl, irqmsk);
40f7b3ea8cSMing Lei 			cpus_per_grp--;
41f7b3ea8cSMing Lei 		}
42f7b3ea8cSMing Lei 	}
43f7b3ea8cSMing Lei }
44f7b3ea8cSMing Lei 
alloc_node_to_cpumask(void)45f7b3ea8cSMing Lei static cpumask_var_t *alloc_node_to_cpumask(void)
46f7b3ea8cSMing Lei {
47f7b3ea8cSMing Lei 	cpumask_var_t *masks;
48f7b3ea8cSMing Lei 	int node;
49f7b3ea8cSMing Lei 
50f7b3ea8cSMing Lei 	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
51f7b3ea8cSMing Lei 	if (!masks)
52f7b3ea8cSMing Lei 		return NULL;
53f7b3ea8cSMing Lei 
54f7b3ea8cSMing Lei 	for (node = 0; node < nr_node_ids; node++) {
55f7b3ea8cSMing Lei 		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
56f7b3ea8cSMing Lei 			goto out_unwind;
57f7b3ea8cSMing Lei 	}
58f7b3ea8cSMing Lei 
59f7b3ea8cSMing Lei 	return masks;
60f7b3ea8cSMing Lei 
61f7b3ea8cSMing Lei out_unwind:
62f7b3ea8cSMing Lei 	while (--node >= 0)
63f7b3ea8cSMing Lei 		free_cpumask_var(masks[node]);
64f7b3ea8cSMing Lei 	kfree(masks);
65f7b3ea8cSMing Lei 	return NULL;
66f7b3ea8cSMing Lei }
67f7b3ea8cSMing Lei 
/* Release an array previously returned by alloc_node_to_cpumask(). */
static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node = 0;

	while (node < nr_node_ids)
		free_cpumask_var(masks[node++]);
	kfree(masks);
}
76f7b3ea8cSMing Lei 
/* Populate @masks so masks[node] holds every possible CPU on that node. */
static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}
84f7b3ea8cSMing Lei 
/*
 * Record in @nodemsk every node that has at least one CPU in @mask,
 * and return how many such nodes there are.
 */
static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int node, cnt = 0;

	for_each_node(node) {
		if (!cpumask_intersects(mask, node_to_cpumask[node]))
			continue;
		node_set(node, *nodemsk);
		cnt++;
	}
	return cnt;
}
99f7b3ea8cSMing Lei 
/*
 * Per-node scratch record used while distributing groups over NUMA nodes.
 *
 * @id:      node number; kept alongside the counts because the array is
 *           sorted by ->ncpus and loses positional meaning.
 * @ncpus:   number of CPUs of this node covered by the input mask
 *           (UINT_MAX marks a node with no CPUs in the mask).
 * @ngroups: number of groups assigned to this node; overlays @ncpus in a
 *           union since each field is live in a different phase.
 */
struct node_groups {
	unsigned id;

	union {
		unsigned ngroups;
		unsigned ncpus;
	};
};
108f7b3ea8cSMing Lei 
ncpus_cmp_func(const void * l,const void * r)109f7b3ea8cSMing Lei static int ncpus_cmp_func(const void *l, const void *r)
110f7b3ea8cSMing Lei {
111f7b3ea8cSMing Lei 	const struct node_groups *ln = l;
112f7b3ea8cSMing Lei 	const struct node_groups *rn = r;
113f7b3ea8cSMing Lei 
114f7b3ea8cSMing Lei 	return ln->ncpus - rn->ncpus;
115f7b3ea8cSMing Lei }
116f7b3ea8cSMing Lei 
/*
 * Allocate group number for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= active CPU number of this node
 *
 * The actual allocated total groups may be less than @numgrps when
 * active total CPU number is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 *
 * @nmsk is caller-provided scratch space; its contents on entry are
 * ignored and clobbered.  Results are written to @node_groups, which
 * must have nr_node_ids elements.
 */
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	/* UINT_MAX marks nodes that end up with no active CPUs */
	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	/* Count active CPUs per node and in total */
	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	/* Cannot hand out more groups than there are active CPUs */
	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
	 * bigger than number of active numa nodes. Always start the
	 * allocation from the node with minimized nr_cpus.
	 *
	 * This way guarantees that each active node gets allocated at
	 * least one group, and the theory is simple: over-allocation
	 * is only done when this node is assigned by one group, so
	 * other nodes will be allocated >= 1 groups, since 'numgrps' is
	 * bigger than number of numa nodes.
	 *
	 * One perfect invariant is that number of allocated groups for
	 * each node is <= CPU count of this node:
	 *
	 * 1) suppose there are two nodes: A and B
	 * 	ncpu(X) is CPU count of node X
	 * 	grps(X) is the group count allocated to node X via this
	 * 	algorithm
	 *
	 * 	ncpu(A) <= ncpu(B)
	 * 	ncpu(A) + ncpu(B) = N
	 * 	grps(A) + grps(B) = G
	 *
	 * 	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 * 	grps(B) = G - grps(A)
	 *
	 * 	both N and G are integer, and 2 <= G <= N, suppose
	 * 	G = N - delta, and 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 * 	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 * 	ncpu(A) >= 1
	 *
	 * 	otherwise,
	 * 		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove how grps(B) <= ncpu(B):
	 *
	 * 	if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
	 * 	over-allocated, so grps(B) <= ncpu(B),
	 *
	 * 	otherwise:
	 *
	 * 	grps(A) =
	 * 		round_down(G * ncpu(A) / N) =
	 * 		round_down((N - delta) * ncpu(A) / N) =
	 * 		round_down((N * ncpu(A) - delta * ncpu(A)) / N)	 >=
	 * 		round_down((N * ncpu(A) - delta * N) / N)	 =
	 * 		ncpu(A) - delta
	 *
	 * 	then:
	 *
	 * 	grps(A) - G >= ncpu(A) - delta - G
	 * 	=>
	 * 	G - grps(A) <= G + delta - ncpu(A)
	 * 	=>
	 * 	grps(B) <= N - ncpu(A)
	 * 	=>
	 * 	grps(B) <= ncpu(B)
	 *
	 * For nodes >= 3, it can be thought as one node and another big
	 * node given that is exactly what this algorithm is implemented,
	 * and we always re-calculate 'remaining_ncpus' & 'numgrps', and
	 * finally for each node X: grps(X) <= ncpu(X).
	 *
	 */
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		/* Inactive nodes sorted away; no groups for them */
		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				 numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		/* Re-base the ratio on what is still unassigned */
		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}
248f7b3ea8cSMing Lei 
/*
 * Spread the CPUs in @cpu_mask over groups @startgrp..@numgrps-1 of
 * @masks (wrapping around), honouring NUMA locality.  @nmsk is scratch
 * space.  Returns the number of groups that received CPUs (0 when
 * @cpu_mask is empty), or -ENOMEM on allocation failure.
 */
static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal the
	 * number of groups we just spread the groups across the nodes.
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			       sizeof(struct node_groups),
			       GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate group number for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);
	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		/* UINT_MAX in the union means this node got no active CPUs */
		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* Account for rounding errors */
		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

		/* Spread allocated groups on CPUs of the current node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* Account for extra groups to compensate rounding errors */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * wrapping has to be considered given 'startgrp'
			 * may start anywhere
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
						cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}
331f7b3ea8cSMing Lei 
/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: cpumask array if successful, NULL otherwise. And each element
 * includes CPUs assigned to this group. The caller owns the returned
 * array and frees it with kfree().
 *
 * Try to put close CPUs from viewpoint of CPU and NUMA locality into
 * same group, and run two-stage grouping:
 *	1) allocate present CPUs on these groups evenly first
 *	2) allocate other possible CPUs on these groups evenly
 *
 * We guarantee in the resulted grouping that all CPUs are covered, and
 * no same CPU is assigned to multiple groups
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	build_node_to_cpumask(node_to_cpumask);

	/*
	 * Make a local cache of 'cpu_present_mask', so the two stages
	 * spread can observe consistent 'cpu_present_mask' without holding
	 * cpu hotplug lock, then we can reduce deadlock risk with cpu
	 * hotplug code.
	 *
	 * Here CPU hotplug may happen when reading `cpu_present_mask`, and
	 * we can live with the case because it only affects that hotplug
	 * CPU is handled in the 1st or 2nd stage, and either way is correct
	 * from API user viewpoint since 2-stage spread is sort of
	 * optimization.
	 */
	cpumask_copy(npresmsk, data_race(cpu_present_mask));

	/* grouping present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	/*
	 * Allocate non present CPUs starting from the next group to be
	 * handled. If the grouping of present CPUs already exhausted the
	 * group space, assign the non present CPUs to the already
	 * allocated out groups.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	/* npresmsk now becomes the set of possible-but-not-present CPUs */
	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	/* Sanity check: the two stages together must cover every group */
	if (ret >= 0)
		WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	/* 'ret' carries the last failure; 'masks' must not leak on error */
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	return masks;
}
426188a5696SIngo Molnar #else /* CONFIG_SMP */
group_cpus_evenly(unsigned int numgrps)427f7b3ea8cSMing Lei struct cpumask *group_cpus_evenly(unsigned int numgrps)
428f7b3ea8cSMing Lei {
429f7b3ea8cSMing Lei 	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
430f7b3ea8cSMing Lei 
431f7b3ea8cSMing Lei 	if (!masks)
432f7b3ea8cSMing Lei 		return NULL;
433f7b3ea8cSMing Lei 
434f7b3ea8cSMing Lei 	/* assign all CPUs(cpu 0) to the 1st group only */
435f7b3ea8cSMing Lei 	cpumask_copy(&masks[0], cpu_possible_mask);
436f7b3ea8cSMing Lei 	return masks;
437f7b3ea8cSMing Lei }
438188a5696SIngo Molnar #endif /* CONFIG_SMP */
439aaf05948SXie Yongji EXPORT_SYMBOL_GPL(group_cpus_evenly);
440