// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */

#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/vport.h>
#include "mlx5_core.h"
#include "mlx5_irq.h"
#include "pci_irq.h"
#include "lib/sf.h"
#include "lib/eq.h"
#ifdef CONFIG_RFS_ACCEL
#include <linux/cpu_rmap.h>
#endif

#define MLX5_SFS_PER_CTRL_IRQ 64
#define MLX5_IRQ_CTRL_SF_MAX 8
/* min num of vectors for SFs to be enabled */
#define MLX5_IRQ_VEC_COMP_BASE_SF 2

#define MLX5_EQ_SHARE_IRQ_MAX_COMP (8)
#define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX)
#define MLX5_EQ_SHARE_IRQ_MIN_COMP (1)
#define MLX5_EQ_SHARE_IRQ_MIN_CTRL (4)

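/* A single allocated interrupt vector.
 * @nh: notifier chain of the EQs sharing this vector
 * @mask: CPU affinity mask of the vector
 * @name: IRQ name as shown in /proc/interrupts
 * @pool: pool this IRQ belongs to
 * @refcount: number of users; protected by the pool lock
 * @map: MSI-X index and Linux interrupt number (virq) of the vector
 */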
struct mlx5_irq {
	struct atomic_notifier_head nh;
	cpumask_var_t mask;
	char name[MLX5_MAX_IRQ_NAME];
	struct mlx5_irq_pool *pool;
	int refcount;
	struct msi_map map;
};

struct mlx5_irq_table {
	struct mlx5_irq_pool *pcif_pool;
	struct mlx5_irq_pool *sf_ctrl_pool;
	struct mlx5_irq_pool *sf_comp_pool;
};

/**
 * mlx5_get_default_msix_vec_count - Get the default number of MSI-X vectors
 *                                   to be assigned to each VF.
 * @dev: PF to work on
 * @num_vfs: Number of enabled VFs
 */
int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs)
{
	int num_vf_msix, min_msix, max_msix;

	num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
	if (!num_vf_msix)
		return 0;

	min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
	max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);

	/* Limit maximum number of MSI-X vectors so the default configuration
	 * has some available in the pool. This will allow the user to increase
	 * the number of vectors in a VF without having to first size-down other
	 * VFs.
	 */
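	/* Illustrative example (made-up capability values): with 1024 total
	 * dynamic VF vectors, 4 enabled VFs, max_msix = 64 and min_msix = 2,
	 * the default is max(min(1024 / 4, 64 / 2), 2) = 32 vectors per VF.
	 */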
	return max(min(num_vf_msix / num_vfs, max_msix / 2), min_msix);
}

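/* Note: resizing a VF's MSI-X table is typically driven from the PF, e.g. via
 * the PCI sysfs sriov_vf_msix_count attribute (an assumption about the usual
 * caller; the VF is expected to be unbound from its driver at that point).
 */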
/**
 * mlx5_set_msix_vec_count - Set dynamically allocated MSI-X on the VF
 * @dev: PF to work on
 * @function_id: Internal PCI VF function ID
 * @msix_vec_count: Number of MSI-X vectors to set
 */
int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id,
			    int msix_vec_count)
{
	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
	void *hca_cap = NULL, *query_cap = NULL, *cap;
	int num_vf_msix, min_msix, max_msix;
	int ret;

	num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
	if (!num_vf_msix)
		return 0;

	if (!MLX5_CAP_GEN(dev, vport_group_manager) || !mlx5_core_is_pf(dev))
		return -EOPNOTSUPP;

	min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
	max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);

	if (msix_vec_count < min_msix)
		return -EINVAL;

	if (msix_vec_count > max_msix)
		return -EOVERFLOW;

	query_cap = kvzalloc(query_sz, GFP_KERNEL);
	hca_cap = kvzalloc(set_sz, GFP_KERNEL);
	if (!hca_cap || !query_cap) {
		ret = -ENOMEM;
		goto out;
	}

	ret = mlx5_vport_get_other_func_general_cap(dev, function_id, query_cap);
	if (ret)
		goto out;

	cap = MLX5_ADDR_OF(set_hca_cap_in, hca_cap, capability);
	memcpy(cap, MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability),
	       MLX5_UN_SZ_BYTES(hca_cap_union));
	MLX5_SET(cmd_hca_cap, cap, dynamic_msix_table_size, msix_vec_count);

	MLX5_SET(set_hca_cap_in, hca_cap, opcode, MLX5_CMD_OP_SET_HCA_CAP);
	MLX5_SET(set_hca_cap_in, hca_cap, other_function, 1);
	MLX5_SET(set_hca_cap_in, hca_cap, function_id, function_id);

	MLX5_SET(set_hca_cap_in, hca_cap, op_mod,
		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1);
	ret = mlx5_cmd_exec_in(dev, set_hca_cap, hca_cap);
out:
	kvfree(hca_cap);
	kvfree(query_cap);
	return ret;
}

static void irq_release(struct mlx5_irq *irq)
{
	struct mlx5_irq_pool *pool = irq->pool;
#ifdef CONFIG_RFS_ACCEL
	struct cpu_rmap *rmap;
#endif

	xa_erase(&pool->irqs, irq->map.index);
	/* free_irq requires that the affinity hint and rmap are cleared before
	 * calling it. To satisfy this requirement, we call
	 * irq_cpu_rmap_remove() to remove the notifier.
	 */
	irq_update_affinity_hint(irq->map.virq, NULL);
#ifdef CONFIG_RFS_ACCEL
	rmap = mlx5_eq_table_get_rmap(pool->dev);
	if (rmap && irq->map.index)
		irq_cpu_rmap_remove(rmap, irq->map.virq);
#endif

	free_cpumask_var(irq->mask);
	free_irq(irq->map.virq, &irq->nh);
	if (irq->map.index && pci_msix_can_alloc_dyn(pool->dev->pdev))
		pci_msix_free_irq(pool->dev->pdev, irq->map);
	kfree(irq);
}

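/* Drop one reference on @irq. Returns 1 if this was the last reference and the
 * IRQ was released back to the system, 0 otherwise.
 */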
int mlx5_irq_put(struct mlx5_irq *irq)
{
	struct mlx5_irq_pool *pool = irq->pool;
	int ret = 0;

	mutex_lock(&pool->lock);
	irq->refcount--;
	if (!irq->refcount) {
		irq_release(irq);
		ret = 1;
	}
	mutex_unlock(&pool->lock);
	return ret;
}

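/* Return the current reference count of @irq. The pool lock must be held. */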
int mlx5_irq_read_locked(struct mlx5_irq *irq)
{
	lockdep_assert_held(&irq->pool->lock);
	return irq->refcount;
}

int mlx5_irq_get_locked(struct mlx5_irq *irq)
{
	lockdep_assert_held(&irq->pool->lock);
	if (WARN_ON_ONCE(!irq->refcount))
		return 0;
	irq->refcount++;
	return 1;
}

static int irq_get(struct mlx5_irq *irq)
{
	int err;

	mutex_lock(&irq->pool->lock);
	err = mlx5_irq_get_locked(irq);
	mutex_unlock(&irq->pool->lock);
	return err;
}

static irqreturn_t irq_int_handler(int irq, void *nh)
{
	atomic_notifier_call_chain(nh, 0, NULL);
	return IRQ_HANDLED;
}

static void irq_sf_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
{
	snprintf(name, MLX5_MAX_IRQ_NAME, "%s%d", pool->name, vecidx);
}

static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
{
	if (!pool->xa_num_irqs.max) {
		/* in case we only have a single irq for the device */
		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_combined%d", vecidx);
		return;
	}

	if (!vecidx) {
		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async%d", vecidx);
		return;
	}

	snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", vecidx);
}

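/**
 * mlx5_irq_alloc - allocate an IRQ at the given vector index of a pool.
 * @pool: IRQ pool to allocate from.
 * @i: vector index of the IRQ within the pool.
 * @af_desc: affinity descriptor for the IRQ, may be NULL.
 * @rmap: pointer to the reverse-map pointer used for completion interrupts,
 *        may be NULL.
 *
 * This function returns a pointer to the new IRQ, or ERR_PTR in case of error.
 */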
struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
				struct irq_affinity_desc *af_desc,
				struct cpu_rmap **rmap)
{
	struct mlx5_core_dev *dev = pool->dev;
	char name[MLX5_MAX_IRQ_NAME];
	struct mlx5_irq *irq;
	int err;

	irq = kzalloc(sizeof(*irq), GFP_KERNEL);
	if (!irq)
		return ERR_PTR(-ENOMEM);
	if (!i || !pci_msix_can_alloc_dyn(dev->pdev)) {
		/* The vector at index 0 was already allocated.
		 * Just get the irq number. If dynamic IRQ allocation is not
		 * supported, all vectors have already been allocated as well.
		 */
		irq->map.virq = pci_irq_vector(dev->pdev, i);
		irq->map.index = 0;
	} else {
		irq->map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, af_desc);
		if (!irq->map.virq) {
			err = irq->map.index;
			goto err_alloc_irq;
		}
	}

	if (i && rmap && *rmap) {
#ifdef CONFIG_RFS_ACCEL
		err = irq_cpu_rmap_add(*rmap, irq->map.virq);
		if (err)
			goto err_irq_rmap;
#endif
	}
	if (!mlx5_irq_pool_is_sf_pool(pool))
		irq_set_name(pool, name, i);
	else
		irq_sf_set_name(pool, name, i);
	ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
	snprintf(irq->name, MLX5_MAX_IRQ_NAME,
		 "%s@pci:%s", name, pci_name(dev->pdev));
	err = request_irq(irq->map.virq, irq_int_handler, 0, irq->name,
			  &irq->nh);
	if (err) {
		mlx5_core_err(dev, "Failed to request irq. err = %d\n", err);
		goto err_req_irq;
	}
	if (!zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
		mlx5_core_warn(dev, "zalloc_cpumask_var failed\n");
		err = -ENOMEM;
		goto err_cpumask;
	}
	if (af_desc) {
		cpumask_copy(irq->mask, &af_desc->mask);
		irq_set_affinity_and_hint(irq->map.virq, irq->mask);
	}
	irq->pool = pool;
	irq->refcount = 1;
	irq->map.index = i;
	err = xa_err(xa_store(&pool->irqs, irq->map.index, irq, GFP_KERNEL));
	if (err) {
		mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n",
			      irq->map.index, err);
		goto err_xa;
	}
	return irq;
err_xa:
	if (af_desc)
		irq_update_affinity_hint(irq->map.virq, NULL);
	free_cpumask_var(irq->mask);
err_cpumask:
	free_irq(irq->map.virq, &irq->nh);
err_req_irq:
#ifdef CONFIG_RFS_ACCEL
	if (i && rmap && *rmap) {
		free_irq_cpu_rmap(*rmap);
		*rmap = NULL;
	}
err_irq_rmap:
#endif
	if (i && pci_msix_can_alloc_dyn(dev->pdev))
		pci_msix_free_irq(dev->pdev, irq->map);
err_alloc_irq:
	kfree(irq);
	return ERR_PTR(err);
}

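/* Register an EQ notifier on @irq. Takes a reference on the IRQ; the reference
 * is dropped again if the notifier registration fails.
 */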
int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
{
	int ret;

	ret = irq_get(irq);
	if (!ret)
		/* Something is very wrong here; we are enabling an EQ
		 * on a non-existent IRQ.
		 */
		return -ENOENT;
	ret = atomic_notifier_chain_register(&irq->nh, nb);
	if (ret)
		mlx5_irq_put(irq);
	return ret;
}

int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
{
	int err = 0;

	err = atomic_notifier_chain_unregister(&irq->nh, nb);
	mlx5_irq_put(irq);
	return err;
}

struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq)
{
	return irq->mask;
}

int mlx5_irq_get_index(struct mlx5_irq *irq)
{
	return irq->map.index;
}

/* irq_pool API */

/* requesting an irq from a given pool according to given index */
static struct mlx5_irq *
irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
			struct irq_affinity_desc *af_desc,
			struct cpu_rmap **rmap)
{
	struct mlx5_irq *irq;

	mutex_lock(&pool->lock);
	irq = xa_load(&pool->irqs, vecidx);
	if (irq) {
		mlx5_irq_get_locked(irq);
		goto unlock;
	}
	irq = mlx5_irq_alloc(pool, vecidx, af_desc, rmap);
unlock:
	mutex_unlock(&pool->lock);
	return irq;
}

static struct mlx5_irq_pool *sf_ctrl_irq_pool_get(struct mlx5_irq_table *irq_table)
{
	return irq_table->sf_ctrl_pool;
}

static struct mlx5_irq_pool *sf_irq_pool_get(struct mlx5_irq_table *irq_table)
{
	return irq_table->sf_comp_pool;
}

struct mlx5_irq_pool *mlx5_irq_pool_get(struct mlx5_core_dev *dev)
{
	struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
	struct mlx5_irq_pool *pool = NULL;

	if (mlx5_core_is_sf(dev))
		pool = sf_irq_pool_get(irq_table);

	/* In some configs, there won't be a pool of SF IRQs. Hence, return
	 * the PF IRQ pool in case the SF pool doesn't exist.
	 */
	return pool ? pool : irq_table->pcif_pool;
}

static struct mlx5_irq_pool *ctrl_irq_pool_get(struct mlx5_core_dev *dev)
{
	struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
	struct mlx5_irq_pool *pool = NULL;

	if (mlx5_core_is_sf(dev))
		pool = sf_ctrl_irq_pool_get(irq_table);

	/* In some configs, there won't be a pool of SF IRQs. Hence, return
	 * the PF IRQ pool in case the SF pool doesn't exist.
	 */
	return pool ? pool : irq_table->pcif_pool;
}

/**
 * mlx5_irqs_release - release one or more IRQs back to the system.
 * @irqs: IRQs to be released.
 * @nirqs: number of IRQs to be released.
 */
static void mlx5_irqs_release(struct mlx5_irq **irqs, int nirqs)
{
	int i;

	for (i = 0; i < nirqs; i++) {
		synchronize_irq(irqs[i]->map.virq);
		mlx5_irq_put(irqs[i]);
	}
}

/**
 * mlx5_ctrl_irq_release - release a ctrl IRQ back to the system.
 * @ctrl_irq: ctrl IRQ to be released.
 */
void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq)
{
	mlx5_irqs_release(&ctrl_irq, 1);
}

/**
 * mlx5_ctrl_irq_request - request a ctrl IRQ for mlx5 device.
 * @dev: mlx5 device that is requesting the IRQ.
 *
 * This function returns a pointer to IRQ, or ERR_PTR in case of error.
 */
struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev)
{
	struct mlx5_irq_pool *pool = ctrl_irq_pool_get(dev);
	struct irq_affinity_desc af_desc;
	struct mlx5_irq *irq;

	cpumask_copy(&af_desc.mask, cpu_online_mask);
	af_desc.is_managed = false;
	if (!mlx5_irq_pool_is_sf_pool(pool)) {
		/* We are allocating a control IRQ from a PCI device's pool.
		 * This can also happen for an SF if the SF pool is empty.
		 */
		if (!pool->xa_num_irqs.max) {
			cpumask_clear(&af_desc.mask);
			/* In case we only have a single IRQ for PF/VF */
			cpumask_set_cpu(cpumask_first(cpu_online_mask), &af_desc.mask);
		}
		/* Allocate the IRQ in index 0. The vector was already allocated */
		irq = irq_pool_request_vector(pool, 0, &af_desc, NULL);
	} else {
		irq = mlx5_irq_affinity_request(pool, &af_desc);
	}

	return irq;
}

/**
 * mlx5_irq_request - request an IRQ for mlx5 PF/VF device.
 * @dev: mlx5 device that is requesting the IRQ.
 * @vecidx: vector index of the IRQ. This argument is ignored if affinity is
 * provided.
 * @af_desc: affinity descriptor for this IRQ.
 * @rmap: pointer to reverse map pointer for completion interrupts
 *
 * This function returns a pointer to IRQ, or ERR_PTR in case of error.
 */
struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
				  struct irq_affinity_desc *af_desc,
				  struct cpu_rmap **rmap)
{
	struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
	struct mlx5_irq_pool *pool;
	struct mlx5_irq *irq;

	pool = irq_table->pcif_pool;
	irq = irq_pool_request_vector(pool, vecidx, af_desc, rmap);
	if (IS_ERR(irq))
		return irq;
	mlx5_core_dbg(dev, "irq %u mapped to cpu %*pbl, %u EQs on this irq\n",
		      irq->map.virq, cpumask_pr_args(&af_desc->mask),
		      irq->refcount / MLX5_EQ_REFS_PER_IRQ);
	return irq;
}

/**
 * mlx5_msix_alloc - allocate msix interrupt
 * @dev: mlx5 device from which to request
 * @handler: interrupt handler
 * @affdesc: affinity descriptor
 * @name: interrupt name
 *
 * Returns: struct msi_map with result encoded.
 * Note: the caller must make sure to release the irq by calling
 *       mlx5_msix_free() if shutdown was initiated.
 */
struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
			       irqreturn_t (*handler)(int, void *),
			       const struct irq_affinity_desc *affdesc,
			       const char *name)
{
	struct msi_map map;
	int err;

	if (!dev->pdev) {
		map.virq = 0;
		map.index = -EINVAL;
		return map;
	}

	map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, affdesc);
	if (!map.virq)
		return map;

	err = request_irq(map.virq, handler, 0, name, NULL);
	if (err) {
		mlx5_core_warn(dev, "err %d\n", err);
		pci_msix_free_irq(dev->pdev, map);
		map.virq = 0;
		map.index = -ENOMEM;
	}
	return map;
}
EXPORT_SYMBOL(mlx5_msix_alloc);

/**
 * mlx5_msix_free - free a previously allocated msix interrupt
 * @dev: mlx5 device associated with interrupt
 * @map: map previously returned by mlx5_msix_alloc()
 */
void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map)
{
	free_irq(map.virq, NULL);
	pci_msix_free_irq(dev->pdev, map);
}
EXPORT_SYMBOL(mlx5_msix_free);
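
/* Illustrative usage sketch for the two exported helpers above (not taken
 * from an in-tree caller; the handler and device pointer are placeholders):
 *
 *	static irqreturn_t my_handler(int irq, void *data)
 *	{
 *		return IRQ_HANDLED;
 *	}
 *
 *	struct irq_affinity_desc affd = {};
 *	struct msi_map map;
 *
 *	cpumask_set_cpu(0, &affd.mask);
 *	map = mlx5_msix_alloc(mdev, my_handler, &affd, "my_dev_irq");
 *	if (!map.virq)
 *		return map.index;
 *	...
 *	mlx5_msix_free(mdev, map);
 */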

/**
 * mlx5_irqs_release_vectors - release one or more IRQs back to the system.
 * @irqs: IRQs to be released.
 * @nirqs: number of IRQs to be released.
 */
void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs)
{
	mlx5_irqs_release(irqs, nirqs);
}

/**
 * mlx5_irqs_request_vectors - request one or more IRQs for mlx5 device.
 * @dev: mlx5 device that is requesting the IRQs.
 * @cpus: CPUs array for binding the IRQs
 * @nirqs: number of IRQs to request.
 * @irqs: an output array of IRQs pointers.
 * @rmap: pointer to reverse map pointer for completion interrupts
 *
 * Each IRQ is bound to at most 1 CPU.
 * This function requests @nirqs IRQs, starting from vector index 1.
 *
 * This function returns the number of IRQs requested (which might be smaller
 * than @nirqs) if successful, or a negative error code in case of an error.
 */
int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
			      struct mlx5_irq **irqs, struct cpu_rmap **rmap)
{
	struct irq_affinity_desc af_desc;
	struct mlx5_irq *irq;
	int i;

	af_desc.is_managed = 1;
	for (i = 0; i < nirqs; i++) {
		cpumask_set_cpu(cpus[i], &af_desc.mask);
		irq = mlx5_irq_request(dev, i + 1, &af_desc, rmap);
		if (IS_ERR(irq))
			break;
		cpumask_clear(&af_desc.mask);
		irqs[i] = irq;
	}

	return i ? i : PTR_ERR(irq);
}
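
/* Illustrative call pattern (not taken from an in-tree caller): request one
 * IRQ per entry in a CPU array and release them when done.
 *
 *	u16 cpus[2] = { 0, 1 };
 *	struct mlx5_irq *irqs[2];
 *	int n;
 *
 *	n = mlx5_irqs_request_vectors(dev, cpus, 2, irqs, NULL);
 *	if (n < 0)
 *		return n;
 *	...
 *	mlx5_irqs_release_vectors(irqs, n);
 */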

static struct mlx5_irq_pool *
irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name,
	       u32 min_threshold, u32 max_threshold)
{
	struct mlx5_irq_pool *pool = kvzalloc(sizeof(*pool), GFP_KERNEL);

	if (!pool)
		return ERR_PTR(-ENOMEM);
	pool->dev = dev;
	mutex_init(&pool->lock);
	xa_init_flags(&pool->irqs, XA_FLAGS_ALLOC);
	pool->xa_num_irqs.min = start;
	pool->xa_num_irqs.max = start + size - 1;
	if (name)
		snprintf(pool->name, MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS,
			 "%s", name);
	pool->min_threshold = min_threshold * MLX5_EQ_REFS_PER_IRQ;
	pool->max_threshold = max_threshold * MLX5_EQ_REFS_PER_IRQ;
	mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d",
		      name, size, start);
	return pool;
}

static void irq_pool_free(struct mlx5_irq_pool *pool)
{
	struct mlx5_irq *irq;
	unsigned long index;

	/* There are cases in which we are destroying the irq_table before
	 * freeing all the IRQs, fast teardown for example. Hence, free the
	 * IRQs that might not have been freed yet.
	 */
	xa_for_each(&pool->irqs, index, irq)
		irq_release(irq);
	xa_destroy(&pool->irqs);
	mutex_destroy(&pool->lock);
	kfree(pool->irqs_per_cpu);
	kvfree(pool);
}

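/* Pool layout of the MSI-X vector space when SFs are supported:
 * vectors [0, pcif_vec) back the PF/VF pool (pcif_pool), the next num_sf_ctrl
 * vectors back the SF control-EQ pool (sf_ctrl_pool), and the remaining
 * sf_vec - num_sf_ctrl vectors back the SF completion-EQ pool (sf_comp_pool).
 */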
static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec)
{
	struct mlx5_irq_table *table = dev->priv.irq_table;
	int num_sf_ctrl_by_msix;
	int num_sf_ctrl_by_sfs;
	int num_sf_ctrl;
	int err;

	/* init pcif_pool */
	table->pcif_pool = irq_pool_alloc(dev, 0, pcif_vec, NULL,
					  MLX5_EQ_SHARE_IRQ_MIN_COMP,
					  MLX5_EQ_SHARE_IRQ_MAX_COMP);
	if (IS_ERR(table->pcif_pool))
		return PTR_ERR(table->pcif_pool);
	if (!mlx5_sf_max_functions(dev))
		return 0;
	if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) {
		mlx5_core_dbg(dev, "Not enough IRQs for SFs. SF may run at lower performance\n");
		return 0;
	}

	/* init sf_ctrl_pool */
	num_sf_ctrl_by_msix = DIV_ROUND_UP(sf_vec, MLX5_COMP_EQS_PER_SF);
	num_sf_ctrl_by_sfs = DIV_ROUND_UP(mlx5_sf_max_functions(dev),
					  MLX5_SFS_PER_CTRL_IRQ);
	num_sf_ctrl = min_t(int, num_sf_ctrl_by_msix, num_sf_ctrl_by_sfs);
	num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl);
	table->sf_ctrl_pool = irq_pool_alloc(dev, pcif_vec, num_sf_ctrl,
					     "mlx5_sf_ctrl",
					     MLX5_EQ_SHARE_IRQ_MIN_CTRL,
					     MLX5_EQ_SHARE_IRQ_MAX_CTRL);
	if (IS_ERR(table->sf_ctrl_pool)) {
		err = PTR_ERR(table->sf_ctrl_pool);
		goto err_pf;
	}
	/* init sf_comp_pool */
	table->sf_comp_pool = irq_pool_alloc(dev, pcif_vec + num_sf_ctrl,
					     sf_vec - num_sf_ctrl, "mlx5_sf_comp",
					     MLX5_EQ_SHARE_IRQ_MIN_COMP,
					     MLX5_EQ_SHARE_IRQ_MAX_COMP);
	if (IS_ERR(table->sf_comp_pool)) {
		err = PTR_ERR(table->sf_comp_pool);
		goto err_sf_ctrl;
	}

	table->sf_comp_pool->irqs_per_cpu = kcalloc(nr_cpu_ids, sizeof(u16), GFP_KERNEL);
	if (!table->sf_comp_pool->irqs_per_cpu) {
		err = -ENOMEM;
		goto err_irqs_per_cpu;
	}

	return 0;

err_irqs_per_cpu:
	irq_pool_free(table->sf_comp_pool);
err_sf_ctrl:
	irq_pool_free(table->sf_ctrl_pool);
err_pf:
	irq_pool_free(table->pcif_pool);
	return err;
}

static void irq_pools_destroy(struct mlx5_irq_table *table)
{
	if (table->sf_ctrl_pool) {
		irq_pool_free(table->sf_comp_pool);
		irq_pool_free(table->sf_ctrl_pool);
	}
	irq_pool_free(table->pcif_pool);
}

/* irq_table API */

int mlx5_irq_table_init(struct mlx5_core_dev *dev)
{
	struct mlx5_irq_table *irq_table;

	if (mlx5_core_is_sf(dev))
		return 0;

	irq_table = kvzalloc_node(sizeof(*irq_table), GFP_KERNEL,
				  dev->priv.numa_node);
	if (!irq_table)
		return -ENOMEM;

	dev->priv.irq_table = irq_table;
	return 0;
}

void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
{
	if (mlx5_core_is_sf(dev))
		return;

	kvfree(dev->priv.irq_table);
}

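/* Number of completion vectors available to the PCI function: all pcif_pool
 * vectors except index 0, which is reserved for the async/control EQs. When
 * the function has only a single (shared) vector, 1 is returned.
 */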
int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
{
	if (!table->pcif_pool->xa_num_irqs.max)
		return 1;
	return table->pcif_pool->xa_num_irqs.max - table->pcif_pool->xa_num_irqs.min;
}

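/* Size the MSI-X vector budget for the PCI function: one completion vector per
 * online CPU per port plus one control vector, capped by the device's EQ and
 * MSI-X limits. When SFs are supported, additional vectors are budgeted for SF
 * control and completion EQs. If dynamic MSI-X allocation is supported, only
 * one vector is allocated up front and the rest are allocated on demand.
 */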
int mlx5_irq_table_create(struct mlx5_core_dev *dev)
{
	int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
		      MLX5_CAP_GEN(dev, max_num_eqs) :
		      1 << MLX5_CAP_GEN(dev, log_max_eq);
	int total_vec;
	int pcif_vec;
	int req_vec;
	int err;
	int n;

	if (mlx5_core_is_sf(dev))
		return 0;

	pcif_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + 1;
	pcif_vec = min_t(int, pcif_vec, num_eqs);

	total_vec = pcif_vec;
	if (mlx5_sf_max_functions(dev))
		total_vec += MLX5_IRQ_CTRL_SF_MAX +
			MLX5_COMP_EQS_PER_SF * mlx5_sf_max_functions(dev);
	total_vec = min_t(int, total_vec, pci_msix_vec_count(dev->pdev));
	pcif_vec = min_t(int, pcif_vec, pci_msix_vec_count(dev->pdev));

	req_vec = pci_msix_can_alloc_dyn(dev->pdev) ? 1 : total_vec;
	n = pci_alloc_irq_vectors(dev->pdev, 1, req_vec, PCI_IRQ_MSIX);
	if (n < 0)
		return n;

	err = irq_pools_init(dev, total_vec - pcif_vec, pcif_vec);
	if (err)
		pci_free_irq_vectors(dev->pdev);

	return err;
}

void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
{
	struct mlx5_irq_table *table = dev->priv.irq_table;

	if (mlx5_core_is_sf(dev))
		return;

	/* There are cases where IRQs are still in use when we reach this
	 * point. Hence, make sure all the IRQs are released.
	 */
	irq_pools_destroy(table);
	pci_free_irq_vectors(dev->pdev);
}

int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table)
{
	if (table->sf_comp_pool)
		return min_t(int, num_online_cpus(),
			     table->sf_comp_pool->xa_num_irqs.max -
			     table->sf_comp_pool->xa_num_irqs.min + 1);
	else
		return mlx5_irq_table_get_num_comp(table);
}

struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev)
{
#ifdef CONFIG_MLX5_SF
	if (mlx5_core_is_sf(dev))
		return dev->priv.parent_mdev->priv.irq_table;
#endif
	return dev->priv.irq_table;
}