// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

#ifdef CONFIG_SMP

static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_grp)
{
	const struct cpumask *siblmsk;
	int cpu, sibl;

	for ( ; cpus_per_grp > 0; ) {
		cpu = cpumask_first(nmsk);

		/* Should not happen, but I'm too lazy to think about it */
		if (cpu >= nr_cpu_ids)
			return;

		cpumask_clear_cpu(cpu, nmsk);
		cpumask_set_cpu(cpu, irqmsk);
		cpus_per_grp--;

		/* If the cpu has siblings, use them first */
		siblmsk = topology_sibling_cpumask(cpu);
		for (sibl = -1; cpus_per_grp > 0; ) {
			sibl = cpumask_next(sibl, siblmsk);
			if (sibl >= nr_cpu_ids)
				break;
			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
				continue;
			cpumask_set_cpu(sibl, irqmsk);
			cpus_per_grp--;
		}
	}
}

static cpumask_var_t *alloc_node_to_cpumask(void)
{
	cpumask_var_t *masks;
	int node;

	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for (node = 0; node < nr_node_ids; node++) {
		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
			goto out_unwind;
	}

	return masks;

out_unwind:
	while (--node >= 0)
		free_cpumask_var(masks[node]);
	kfree(masks);
	return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		free_cpumask_var(masks[node]);
	kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int n, nodes = 0;

	/* Calculate the number of nodes in the supplied affinity mask */
	for_each_node(n) {
		if (cpumask_intersects(mask, node_to_cpumask[n])) {
			node_set(n, *nodemsk);
			nodes++;
		}
	}
	return nodes;
}

struct node_groups {
	unsigned id;

	union {
		unsigned ngroups;
		unsigned ncpus;
	};
};

static int ncpus_cmp_func(const void *l, const void *r)
{
	const struct node_groups *ln = l;
	const struct node_groups *rn = r;

	return ln->ncpus - rn->ncpus;
}

/*
 * Allocate a group count for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= active CPU number of this node
 *
 * The total number of allocated groups may be less than @numgrps when
 * the total number of active CPUs is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
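 *
 * For example (illustrative numbers only, not derived from a real
 * topology): with @numgrps = 4 and two active nodes whose active CPU
 * counts are 2 and 6, the smaller node is assigned 1 group and the
 * larger node gets the remaining 3 groups, satisfying both constraints.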
 */
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's nr_cpus to the remaining un-assigned ncpus. 'numgrps'
	 * is bigger than the number of active numa nodes. Always start
	 * the allocation from the node with the smallest nr_cpus.
	 *
	 * This guarantees that each active node gets allocated at least
	 * one group: over-allocation (the max(1, ...) case) only happens
	 * when a node is assigned a single group, so the other nodes can
	 * still be allocated >= 1 group each, given that 'numgrps' is
	 * bigger than the number of active numa nodes.
	 *
	 * One key invariant is that the number of allocated groups for
	 * each node is <= the CPU count of this node:
	 *
	 * 1) suppose there are two nodes: A and B
	 *	ncpu(X) is the CPU count of node X
	 *	grps(X) is the group count allocated to node X via this
	 *	algorithm
	 *
	 *	ncpu(A) <= ncpu(B)
	 *	ncpu(A) + ncpu(B) = N
	 *	grps(A) + grps(B) = G
	 *
	 *	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 *	grps(B) = G - grps(A)
	 *
	 *	both N and G are integers, and 2 <= G <= N, suppose
	 *	G = N - delta, and 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 *	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 *	ncpu(A) >= 1
	 *
	 *	otherwise,
	 *		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove that grps(B) <= ncpu(B):
	 *
	 *	if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
	 *	over-allocated, so grps(B) <= ncpu(B),
	 *
	 *	otherwise:
	 *
	 *	grps(A) =
	 *		round_down(G * ncpu(A) / N) =
	 *		round_down((N - delta) * ncpu(A) / N) =
	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
	 *		round_down((N * ncpu(A) - delta * N) / N) =
	 *		ncpu(A) - delta
	 *
	 *	then:
	 *
	 *	grps(A) - G >= ncpu(A) - delta - G
	 *	=>
	 *	G - grps(A) <= G + delta - ncpu(A)
	 *	=>
	 *	grps(B) <= N - ncpu(A)
	 *	=>
	 *	grps(B) <= ncpu(B)
	 *
	 *	(the last two steps use G + delta = N and
	 *	 ncpu(B) = N - ncpu(A))
	 *
	 * For three or more nodes, the same argument applies by viewing
	 * the current node as A and all remaining nodes combined as B,
	 * which is exactly how the loop below proceeds: 'remaining_ncpus'
	 * and 'numgrps' are re-calculated after each node, so finally for
	 * each node X: grps(X) <= ncpu(X).
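	 *
	 * A quick sanity check with made-up numbers: N = 8, G = 5
	 * (so delta = 3), ncpu(A) = 3, ncpu(B) = 5. Then
	 * grps(A) = max(1, round_down(5 * 3 / 8)) = 1 <= ncpu(A), and
	 * grps(B) = 5 - 1 = 4 <= ncpu(B), matching the invariant above.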
	 *
	 */
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}

static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal to
	 * the number of groups, just spread the groups across the nodes.
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			      sizeof(struct node_groups),
			      GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate a group count for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);
	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* Account for rounding errors */
		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

		/* Spread allocated groups on CPUs of the current node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* Account for extra groups to compensate rounding errors */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * wrapping has to be considered given 'startgrp'
			 * may start anywhere
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
					    cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}

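/*
 * The rounding compensation above, illustrated with made-up numbers: a
 * node with 7 CPUs in the mask and 3 allocated groups gives
 * cpus_per_grp = 7 / 3 = 2 and extra_grps = 7 - 3 * 2 = 1, so the three
 * groups receive 3, 2 and 2 CPUs respectively and no CPU is left over.
 */
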
/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: cpumask array if successful, NULL otherwise. Each element
 * includes the CPUs assigned to that group.
 *
 * Try to put CPUs that are close from the viewpoint of CPU and NUMA
 * locality into the same group, and run two-stage grouping:
 *	1) allocate present CPUs on these groups evenly first
 *	2) allocate other possible CPUs on these groups evenly
 *
 * We guarantee in the resulting grouping that all CPUs are covered, and
 * no CPU is assigned to multiple groups.
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	build_node_to_cpumask(node_to_cpumask);

	/*
	 * Make a local cache of 'cpu_present_mask', so the two stages of
	 * the spread can observe a consistent 'cpu_present_mask' without
	 * holding the CPU hotplug lock, which reduces the deadlock risk
	 * with the CPU hotplug code.
	 *
	 * A CPU hotplug event may happen while 'cpu_present_mask' is
	 * being read. We can live with that, because it only affects
	 * whether the hotplugged CPU is handled in the 1st or the 2nd
	 * stage, and either way is correct from the API user's viewpoint
	 * since the 2-stage spread is merely an optimization.
	 */
	cpumask_copy(npresmsk, data_race(cpu_present_mask));

	/* grouping present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	/*
	 * Allocate non-present CPUs starting from the next group to be
	 * handled. If the grouping of present CPUs already exhausted the
	 * group space, assign the non-present CPUs to the groups that
	 * were already allocated.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	if (ret >= 0)
		WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	return masks;
}
#else /* CONFIG_SMP */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);

	if (!masks)
		return NULL;

	/* assign all CPUs (cpu 0) to the 1st group only */
	cpumask_copy(&masks[0], cpu_possible_mask);
	return masks;
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(group_cpus_evenly);
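
/*
 * Example usage (illustrative only; the caller below is hypothetical and
 * not part of this file). A driver that wants to spread queue interrupts
 * or per-queue work across CPUs could do something like:
 *
 *	#include <linux/group_cpus.h>
 *
 *	struct cpumask *masks;
 *	unsigned int i, nr_queues = 4;
 *
 *	masks = group_cpus_evenly(nr_queues);
 *	if (!masks)
 *		return -ENOMEM;
 *	for (i = 0; i < nr_queues; i++)
 *		pr_info("queue %u -> %*pbl\n", i,
 *			cpumask_pr_args(&masks[i]));
 *	kfree(masks);
 *
 * Each element of the returned array is a struct cpumask; the array is
 * allocated with kcalloc() and must be freed with kfree() by the caller.
 */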