// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2015 - 2020 Intel Corporation.
 */

#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/numa.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};
/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

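/*
 * A struct cpu_mask_set tracks a pool of CPUs: 'mask' is the full pool,
 * 'used' marks CPUs currently handed out, and 'gen' counts how many times
 * the pool has been exhausted and recycled (see _cpu_mask_set_gen_inc()
 * and _cpu_mask_set_gen_dec() below).  The generation scheme is what lets
 * callers keep allocating, i.e. start overloading CPUs, once every CPU in
 * 'mask' has already been handed out.
 */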
static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Increment generation of CPU set if needed */
static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
{
	if (cpumask_equal(&set->mask, &set->used)) {
		/*
		 * We've used up all the CPUs, bump up the generation
		 * and reset the 'used' map
		 */
		set->gen++;
		cpumask_clear(&set->used);
	}
}

static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
{
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
}

/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
{
	int cpu;

	if (!diff || !set)
		return -EINVAL;

	_cpu_mask_set_gen_inc(set);

	/* Find out CPUs left in CPU mask */
	cpumask_andnot(diff, &set->mask, &set->used);

	cpu = cpumask_first(diff);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -EINVAL;
	else
		cpumask_set_cpu(cpu, &set->used);

	return cpu;
}

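/*
 * Return a CPU to the set: clear it from the 'used' map and, once the set
 * drains completely, roll the generation counter back (see
 * _cpu_mask_set_gen_dec()).
 */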
static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
{
	if (!set)
		return;

	cpumask_clear_cpu(cpu, &set->used);
	_cpu_mask_set_gen_dec(set);
}

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask.  Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1.  Skip over the first N HT siblings and use them as the
	 * "real" cores.  Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}

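/*
 * Gather the system topology used by the affinity code (core siblings,
 * online nodes and CPUs), initialize the process CPU pool, and count the
 * HFI1 devices on each NUMA node by walking the PCI device table.
 */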
int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
				cpumask_weight(topology_sibling_cpumask(
					cpumask_first(&node_affinity.proc.mask)
					));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				goto out;

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;

out:
	/*
	 * Invalid PCI NUMA node information found, note it, and populate
	 * our database 1:1.
	 */
	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
	pr_err("HFI: System BIOS may need to be upgraded\n");
	for (node = 0; node < node_affinity.num_possible_nodes; node++)
		hfi1_per_node_cntr[node] = 1;

	pci_dev_put(dev);

	return 0;
}

static void node_affinity_destroy(struct hfi1_affinity_node *entry)
{
	free_percpu(entry->comp_vect_affinity);
	kfree(entry);
}

void node_affinity_destroy_all(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		node_affinity_destroy(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	entry->comp_vect_affinity = alloc_percpu(u16);
	INIT_LIST_HEAD(&entry->list);

	return entry;
}

/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct hfi1_affinity_node *entry;

	list_for_each_entry(entry, &node_affinity.list, list) {
		if (entry->node == node)
			return entry;
	}

	return NULL;
}

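/*
 * Pick the CPU in @possible_cpumask with the lowest completion-vector
 * reference count and increment that count.  Returns the chosen CPU, or
 * -EINVAL if either argument is missing or the mask is empty.
 */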
static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	u16 cntr;
	u16 prev_cntr;
	int ret_cpu;

	if (!possible_cpumask) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	if (!comp_vect_affinity) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	ret_cpu = cpumask_first(possible_cpumask);
	if (ret_cpu >= nr_cpu_ids) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr < prev_cntr) {
			ret_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;

fail:
	return ret_cpu;
}

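/*
 * Counterpart of per_cpu_affinity_get(): find the CPU in @possible_cpumask
 * with the highest reference count, decrement that count and return the
 * CPU, or -EINVAL on bad arguments or an empty mask.
 */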
static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	int max_cpu;
	u16 cntr;
	u16 prev_cntr;

	if (!possible_cpumask)
		return -EINVAL;

	if (!comp_vect_affinity)
		return -EINVAL;

	max_cpu = cpumask_first(possible_cpumask);
	if (max_cpu >= nr_cpu_ids)
		return -EINVAL;

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr > prev_cntr) {
			max_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;

	return max_cpu;
}

/*
 * Non-interrupt CPUs are used first, then interrupt CPUs.
 * Two already allocated cpu masks must be passed.
 */
static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				  struct hfi1_affinity_node *entry,
				  cpumask_var_t non_intr_cpus,
				  cpumask_var_t available_cpus)
	__must_hold(&node_affinity.lock)
{
	int cpu;
	struct cpu_mask_set *set = dd->comp_vect;

	lockdep_assert_held(&node_affinity.lock);
	if (!non_intr_cpus) {
		cpu = -1;
		goto fail;
	}

	if (!available_cpus) {
		cpu = -1;
		goto fail;
	}

	/* Available CPUs for pinning completion vectors */
	_cpu_mask_set_gen_inc(set);
	cpumask_andnot(available_cpus, &set->mask, &set->used);

	/* Available CPUs without SDMA engine interrupts */
	cpumask_andnot(non_intr_cpus, available_cpus,
		       &entry->def_intr.used);

	/* If there are non-interrupt CPUs available, use them first */
	if (!cpumask_empty(non_intr_cpus))
		cpu = cpumask_first(non_intr_cpus);
	else /* Otherwise, use interrupt CPUs */
		cpu = cpumask_first(available_cpus);

	if (cpu >= nr_cpu_ids) { /* empty */
		cpu = -1;
		goto fail;
	}
	cpumask_set_cpu(cpu, &set->used);

fail:
	return cpu;
}

static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
{
	struct cpu_mask_set *set = dd->comp_vect;

	if (cpu < 0)
		return;

	cpu_mask_set_put(set, cpu);
}

/* _dev_comp_vect_mappings_destroy() is reentrant */
static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
{
	int i, cpu;

	if (!dd->comp_vect_mappings)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = dd->comp_vect_mappings[i];
		_dev_comp_vect_cpu_put(dd, cpu);
		dd->comp_vect_mappings[i] = -1;
		hfi1_cdbg(AFFINITY,
			  "[%s] Release CPU %d from completion vector %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
	}

	kfree(dd->comp_vect_mappings);
	dd->comp_vect_mappings = NULL;
}

/*
 * This function creates the table for looking up CPUs for completion vectors.
 * num_comp_vectors needs to have been initialized before calling this function.
 */
static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
					  struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu, ret;
	cpumask_var_t non_intr_cpus;
	cpumask_var_t available_cpus;

	lockdep_assert_held(&node_affinity.lock);

	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
		free_cpumask_var(non_intr_cpus);
		return -ENOMEM;
	}

	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
					 sizeof(*dd->comp_vect_mappings),
					 GFP_KERNEL);
	if (!dd->comp_vect_mappings) {
		ret = -ENOMEM;
		goto fail;
	}
	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
		dd->comp_vect_mappings[i] = -1;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
					     available_cpus);
		if (cpu < 0) {
			ret = -EINVAL;
			goto fail;
		}

		dd->comp_vect_mappings[i] = cpu;
		hfi1_cdbg(AFFINITY,
			  "[%s] Completion Vector %d -> CPU %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
	}

	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	return 0;

fail:
	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	_dev_comp_vect_mappings_destroy(dd);

	return ret;
}

int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
{
	int ret;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry) {
		ret = -EINVAL;
		goto unlock;
	}
	ret = _dev_comp_vect_mappings_create(dd, entry);
unlock:
	mutex_unlock(&node_affinity.lock);

	return ret;
}

void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}

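/*
 * Translate a completion vector index into the CPU it was pinned to by
 * hfi1_comp_vectors_set_up().  Returns -EINVAL if no mapping table exists
 * or @comp_vect is out of range.
 */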
int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
{
	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);

	if (!dd->comp_vect_mappings)
		return -EINVAL;
	if (comp_vect >= dd->comp_vect_possible_cpus)
		return -EINVAL;

	return dd->comp_vect_mappings[comp_vect];
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
					struct hfi1_affinity_node *entry,
					bool first_dev_init)
	__must_hold(&node_affinity.lock)
{
	int i, j, curr_cpu;
	int possible_cpus_comp_vect = 0;
	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;

	lockdep_assert_held(&node_affinity.lock);
	/*
	 * If there's only one CPU available for completion vectors, then
	 * there will only be one completion vector available. Otherwise,
	 * the number of completion vectors available will be the number of
	 * available CPUs divided by the number of devices in the
	 * local NUMA node.
	 */
	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
		possible_cpus_comp_vect = 1;
		dd_dev_warn(dd,
			    "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
	} else {
		possible_cpus_comp_vect +=
			cpumask_weight(&entry->comp_vect_mask) /
				       hfi1_per_node_cntr[dd->node];

		/*
		 * If the available completion vector CPUs don't divide
		 * evenly among devices, then the first device to be
		 * initialized gets an extra CPU.
		 */
		if (first_dev_init &&
		    cpumask_weight(&entry->comp_vect_mask) %
		    hfi1_per_node_cntr[dd->node] != 0)
			possible_cpus_comp_vect++;
	}

	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;

	/* Reserving CPUs for device completion vector */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
						entry->comp_vect_affinity);
		if (curr_cpu < 0)
			goto fail;

		cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
	}

	hfi1_cdbg(AFFINITY,
		  "[%s] Completion vector affinity CPU set(s) %*pbl",
		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
		  cpumask_pr_args(dev_comp_vect_mask));

	return 0;

fail:
	for (j = 0; j < i; j++)
		per_cpu_affinity_put_max(&entry->comp_vect_mask,
					 entry->comp_vect_affinity);

	return curr_cpu;
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
					     struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu;

	lockdep_assert_held(&node_affinity.lock);
	if (!dd->comp_vect_possible_cpus)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
					       entry->comp_vect_affinity);
		/* Clearing CPU in device completion vector cpu mask */
		if (cpu >= 0)
			cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
	}

	dd->comp_vect_possible_cpus = 0;
}

/*
 * Interrupt affinity.
 *
 * non-rcv avail gets a default mask that starts as the node's possible
 * CPUs with HT siblings removed and with each CPU handed to rcv avail
 * removed.
 *
 * rcv avail gets node-relative CPU 1 onward, wrapping back to
 * node-relative CPU 1 as necessary.
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(dd->node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_empty(&entry->def_intr.mask))
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/* Determine completion vector CPUs for the entire node */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_empty(&entry->comp_vect_mask))
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	if (new_entry)
		node_affinity_add_tail(entry);

	dd->affinity_entry = entry;
	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

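/*
 * Counterpart of hfi1_dev_affinity_init(): give the device's completion
 * vector CPUs back to the per-node accounting and drop the device's
 * reference to its affinity entry.
 */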
void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	if (!dd->affinity_entry)
		goto unlock;
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	/*
	 * Free device completion vector CPUs to be used by future
	 * completion vectors
	 */
	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
	dd->affinity_entry = NULL;
	mutex_unlock(&node_affinity.lock);
}

/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->msix_info.max_requested; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_info.msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}

static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}

static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}

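/*
 * Register an affinity notifier for an SDMA MSI-X vector so that changes
 * made through the /proc/irq interface are propagated back into the
 * driver's accounting via hfi1_update_sdma_affinity().
 */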
static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->irq;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
			return -ENOMEM;

		cpu = cpu_mask_set_get_first(set, diff);
		if (cpu < 0) {
			free_cpumask_var(diff);
			dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
			return cpu;
		}

		free_cpumask_var(diff);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
		    msix->irq, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	return 0;
}

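/*
 * Locked wrapper around get_irq_affinity(); the CPU chosen here is
 * released again by hfi1_put_irq_affinity() below.
 */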
int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

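/*
 * Undo hfi1_get_irq_affinity(): return the CPU(s) in msix->mask to the
 * appropriate per-node set, clear the affinity hint and, for SDMA vectors,
 * unregister the affinity notifier.
 */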
void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT: {
		struct hfi1_ctxtdata *rcd = msix->arg;

		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	}
	case IRQ_NETDEVCTXT:
		set = &entry->def_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		_cpu_mask_set_gen_dec(set);
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}

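/*
 * Build the cpumask for hardware thread @hw_thread_no: keep only the first
 * num_cores_per_socket * num_online_nodes CPUs of the process mask, then
 * shift the result by that amount times @hw_thread_no so each call selects
 * a different sibling set (assuming, as init_real_cpu_mask() does, that
 * sibling hardware threads are enumerated in separate blocks rather than
 * consecutively).
 */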
/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;

	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}

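/*
 * Recommend a CPU for a user process opening a context on the device
 * attached to NUMA node @node, and mark that CPU as used.  Returns -1 when
 * no CPU is assigned (for example, the process has already restricted its
 * own affinity).  The caller is expected to hand the CPU back with
 * hfi1_put_proc_affinity() when the context is closed; a minimal,
 * illustrative sequence would be:
 *
 *	cpu = hfi1_get_proc_affinity(dd->node);
 *	...pin or schedule the process on cpu if it is >= 0...
 *	hfi1_put_proc_affinity(cpu);
 */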
int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = current->cpus_ptr;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (current->nr_cpus_allowed == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	_cpu_mask_set_gen_inc(set);

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will always be not empty at least once in this
			 * loop as the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

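/*
 * Return a CPU previously handed out by hfi1_get_proc_affinity() to the
 * process allocation pool.  Negative values (no CPU was assigned) are
 * ignored.
 */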
void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpu_mask_set_put(set, cpu);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	mutex_unlock(&affinity->lock);
}
1193