xref: /openbmc/linux/drivers/infiniband/hw/mlx5/counters.c (revision 0545810f7edaf0c2869eccdd97a3694b5a292e1d)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
4  */
5 
6 #include "mlx5_ib.h"
7 #include <linux/mlx5/eswitch.h>
8 #include <linux/mlx5/vport.h>
9 #include "counters.h"
10 #include "ib_rep.h"
11 #include "qp.h"
12 
13 struct mlx5_ib_counter {
14 	const char *name;
15 	size_t offset;
16 	u32 type;
17 };
18 
/*
 * Table entry for a Q counter: the reported name is the field name and the
 * offset is that field's byte offset inside query_q_counter_out.
 */
#define INIT_Q_COUNTER(_name)						\
	{								\
		.name = #_name,						\
		.offset = MLX5_BYTE_OFF(query_q_counter_out, _name),	\
	}

/*
 * Same as INIT_Q_COUNTER() except that the reported name carries a "vport_"
 * prefix; the offset still refers to the same query_q_counter_out field.
 */
#define INIT_VPORT_Q_COUNTER(_name)					\
	{								\
		.name = "vport_" #_name,				\
		.offset = MLX5_BYTE_OFF(query_q_counter_out, _name),	\
	}
25 
26 static const struct mlx5_ib_counter basic_q_cnts[] = {
27 	INIT_Q_COUNTER(rx_write_requests),
28 	INIT_Q_COUNTER(rx_read_requests),
29 	INIT_Q_COUNTER(rx_atomic_requests),
30 	INIT_Q_COUNTER(out_of_buffer),
31 };
32 
33 static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
34 	INIT_Q_COUNTER(out_of_sequence),
35 };
36 
37 static const struct mlx5_ib_counter retrans_q_cnts[] = {
38 	INIT_Q_COUNTER(duplicate_request),
39 	INIT_Q_COUNTER(rnr_nak_retry_err),
40 	INIT_Q_COUNTER(packet_seq_err),
41 	INIT_Q_COUNTER(implied_nak_seq_err),
42 	INIT_Q_COUNTER(local_ack_timeout_err),
43 };
44 
45 static const struct mlx5_ib_counter vport_basic_q_cnts[] = {
46 	INIT_VPORT_Q_COUNTER(rx_write_requests),
47 	INIT_VPORT_Q_COUNTER(rx_read_requests),
48 	INIT_VPORT_Q_COUNTER(rx_atomic_requests),
49 	INIT_VPORT_Q_COUNTER(out_of_buffer),
50 };
51 
52 static const struct mlx5_ib_counter vport_out_of_seq_q_cnts[] = {
53 	INIT_VPORT_Q_COUNTER(out_of_sequence),
54 };
55 
56 static const struct mlx5_ib_counter vport_retrans_q_cnts[] = {
57 	INIT_VPORT_Q_COUNTER(duplicate_request),
58 	INIT_VPORT_Q_COUNTER(rnr_nak_retry_err),
59 	INIT_VPORT_Q_COUNTER(packet_seq_err),
60 	INIT_VPORT_Q_COUNTER(implied_nak_seq_err),
61 	INIT_VPORT_Q_COUNTER(local_ack_timeout_err),
62 };
63 
64 #define INIT_CONG_COUNTER(_name)		\
65 	{ .name = #_name, .offset =	\
66 		MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
67 
68 static const struct mlx5_ib_counter cong_cnts[] = {
69 	INIT_CONG_COUNTER(rp_cnp_ignored),
70 	INIT_CONG_COUNTER(rp_cnp_handled),
71 	INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
72 	INIT_CONG_COUNTER(np_cnp_sent),
73 };
74 
75 static const struct mlx5_ib_counter extended_err_cnts[] = {
76 	INIT_Q_COUNTER(resp_local_length_error),
77 	INIT_Q_COUNTER(resp_cqe_error),
78 	INIT_Q_COUNTER(req_cqe_error),
79 	INIT_Q_COUNTER(req_remote_invalid_request),
80 	INIT_Q_COUNTER(req_remote_access_errors),
81 	INIT_Q_COUNTER(resp_remote_access_errors),
82 	INIT_Q_COUNTER(resp_cqe_flush_error),
83 	INIT_Q_COUNTER(req_cqe_flush_error),
84 };
85 
86 static const struct mlx5_ib_counter roce_accl_cnts[] = {
87 	INIT_Q_COUNTER(roce_adp_retrans),
88 	INIT_Q_COUNTER(roce_adp_retrans_to),
89 	INIT_Q_COUNTER(roce_slow_restart),
90 	INIT_Q_COUNTER(roce_slow_restart_cnps),
91 	INIT_Q_COUNTER(roce_slow_restart_trans),
92 };
93 
94 static const struct mlx5_ib_counter vport_extended_err_cnts[] = {
95 	INIT_VPORT_Q_COUNTER(resp_local_length_error),
96 	INIT_VPORT_Q_COUNTER(resp_cqe_error),
97 	INIT_VPORT_Q_COUNTER(req_cqe_error),
98 	INIT_VPORT_Q_COUNTER(req_remote_invalid_request),
99 	INIT_VPORT_Q_COUNTER(req_remote_access_errors),
100 	INIT_VPORT_Q_COUNTER(resp_remote_access_errors),
101 	INIT_VPORT_Q_COUNTER(resp_cqe_flush_error),
102 	INIT_VPORT_Q_COUNTER(req_cqe_flush_error),
103 };
104 
105 static const struct mlx5_ib_counter vport_roce_accl_cnts[] = {
106 	INIT_VPORT_Q_COUNTER(roce_adp_retrans),
107 	INIT_VPORT_Q_COUNTER(roce_adp_retrans_to),
108 	INIT_VPORT_Q_COUNTER(roce_slow_restart),
109 	INIT_VPORT_Q_COUNTER(roce_slow_restart_cnps),
110 	INIT_VPORT_Q_COUNTER(roce_slow_restart_trans),
111 };
112 
113 #define INIT_EXT_PPCNT_COUNTER(_name)		\
114 	{ .name = #_name, .offset =	\
115 	MLX5_BYTE_OFF(ppcnt_reg, \
116 		      counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
117 
118 static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
119 	INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
120 };
121 
122 #define INIT_OP_COUNTER(_name, _type)		\
123 	{ .name = #_name, .type = MLX5_IB_OPCOUNTER_##_type}
124 
125 static const struct mlx5_ib_counter basic_op_cnts[] = {
126 	INIT_OP_COUNTER(cc_rx_ce_pkts, CC_RX_CE_PKTS),
127 };
128 
129 static const struct mlx5_ib_counter rdmarx_cnp_op_cnts[] = {
130 	INIT_OP_COUNTER(cc_rx_cnp_pkts, CC_RX_CNP_PKTS),
131 };
132 
133 static const struct mlx5_ib_counter rdmatx_cnp_op_cnts[] = {
134 	INIT_OP_COUNTER(cc_tx_cnp_pkts, CC_TX_CNP_PKTS),
135 };
136 
137 static int mlx5_ib_read_counters(struct ib_counters *counters,
138 				 struct ib_counters_read_attr *read_attr,
139 				 struct uverbs_attr_bundle *attrs)
140 {
141 	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
142 	struct mlx5_read_counters_attr mread_attr = {};
143 	struct mlx5_ib_flow_counters_desc *desc;
144 	int ret, i;
145 
146 	mutex_lock(&mcounters->mcntrs_mutex);
147 	if (mcounters->cntrs_max_index > read_attr->ncounters) {
148 		ret = -EINVAL;
149 		goto err_bound;
150 	}
151 
152 	mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
153 				 GFP_KERNEL);
154 	if (!mread_attr.out) {
155 		ret = -ENOMEM;
156 		goto err_bound;
157 	}
158 
159 	mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
160 	mread_attr.flags = read_attr->flags;
161 	ret = mcounters->read_counters(counters->device, &mread_attr);
162 	if (ret)
163 		goto err_read;
164 
165 	/* do the pass over the counters data array to assign according to the
166 	 * descriptions and indexing pairs
167 	 */
168 	desc = mcounters->counters_data;
169 	for (i = 0; i < mcounters->ncounters; i++)
170 		read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
171 
172 err_read:
173 	kfree(mread_attr.out);
174 err_bound:
175 	mutex_unlock(&mcounters->mcntrs_mutex);
176 	return ret;
177 }
178 
179 static int mlx5_ib_destroy_counters(struct ib_counters *counters)
180 {
181 	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
182 
183 	mlx5_ib_counters_clear_description(counters);
184 	if (mcounters->hw_cntrs_hndl)
185 		mlx5_fc_destroy(to_mdev(counters->device)->mdev,
186 				mcounters->hw_cntrs_hndl);
187 	return 0;
188 }
189 
190 static int mlx5_ib_create_counters(struct ib_counters *counters,
191 				   struct uverbs_attr_bundle *attrs)
192 {
193 	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
194 
195 	mutex_init(&mcounters->mcntrs_mutex);
196 	return 0;
197 }
198 
199 static bool vport_qcounters_supported(struct mlx5_ib_dev *dev)
200 {
201 	return MLX5_CAP_GEN(dev->mdev, q_counter_other_vport) &&
202 	       MLX5_CAP_GEN(dev->mdev, q_counter_aggregation);
203 }
204 
205 static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
206 						   u32 port_num)
207 {
208 	if ((is_mdev_switchdev_mode(dev->mdev) &&
209 	     !vport_qcounters_supported(dev)) || !port_num)
210 		return &dev->port[0].cnts;
211 
212 	return &dev->port[port_num - 1].cnts;
213 }
214 
215 /**
216  * mlx5_ib_get_counters_id - Returns counters id to use for device+port
217  * @dev:	Pointer to mlx5 IB device
218  * @port_num:	Zero based port number
219  *
220  * mlx5_ib_get_counters_id() Returns counters set id to use for given
221  * device port combination in switchdev and non switchdev mode of the
222  * parent device.
223  */
224 u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num)
225 {
226 	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num + 1);
227 
228 	return cnts->set_id;
229 }
230 
231 static struct rdma_hw_stats *do_alloc_stats(const struct mlx5_ib_counters *cnts)
232 {
233 	struct rdma_hw_stats *stats;
234 	u32 num_hw_counters;
235 	int i;
236 
237 	num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters +
238 			  cnts->num_ext_ppcnt_counters;
239 	stats = rdma_alloc_hw_stats_struct(cnts->descs,
240 					   num_hw_counters +
241 					   cnts->num_op_counters,
242 					   RDMA_HW_STATS_DEFAULT_LIFESPAN);
243 	if (!stats)
244 		return NULL;
245 
246 	for (i = 0; i < cnts->num_op_counters; i++)
247 		set_bit(num_hw_counters + i, stats->is_disabled);
248 
249 	return stats;
250 }
251 
252 static struct rdma_hw_stats *
253 mlx5_ib_alloc_hw_device_stats(struct ib_device *ibdev)
254 {
255 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
256 	const struct mlx5_ib_counters *cnts = &dev->port[0].cnts;
257 
258 	return do_alloc_stats(cnts);
259 }
260 
261 static struct rdma_hw_stats *
262 mlx5_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num)
263 {
264 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
265 	const struct mlx5_ib_counters *cnts = &dev->port[port_num - 1].cnts;
266 
267 	return do_alloc_stats(cnts);
268 }
269 
270 static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
271 				    const struct mlx5_ib_counters *cnts,
272 				    struct rdma_hw_stats *stats,
273 				    u16 set_id)
274 {
275 	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {};
276 	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {};
277 	__be32 val;
278 	int ret, i;
279 
280 	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
281 	MLX5_SET(query_q_counter_in, in, counter_set_id, set_id);
282 	ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out);
283 	if (ret)
284 		return ret;
285 
286 	for (i = 0; i < cnts->num_q_counters; i++) {
287 		val = *(__be32 *)((void *)out + cnts->offsets[i]);
288 		stats->value[i] = (u64)be32_to_cpu(val);
289 	}
290 
291 	return 0;
292 }
293 
294 static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
295 					    const struct mlx5_ib_counters *cnts,
296 					    struct rdma_hw_stats *stats)
297 {
298 	int offset = cnts->num_q_counters + cnts->num_cong_counters;
299 	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
300 	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
301 	int ret, i;
302 	void *out;
303 
304 	out = kvzalloc(sz, GFP_KERNEL);
305 	if (!out)
306 		return -ENOMEM;
307 
308 	MLX5_SET(ppcnt_reg, in, local_port, 1);
309 	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
310 	ret = mlx5_core_access_reg(dev->mdev, in, sz, out, sz, MLX5_REG_PPCNT,
311 				   0, 0);
312 	if (ret)
313 		goto free;
314 
315 	for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
316 		stats->value[i + offset] =
317 			be64_to_cpup((__be64 *)(out +
318 				    cnts->offsets[i + offset]));
319 free:
320 	kvfree(out);
321 	return ret;
322 }
323 
324 static int mlx5_ib_query_q_counters_vport(struct mlx5_ib_dev *dev,
325 					  u32 port_num,
326 					  const struct mlx5_ib_counters *cnts,
327 					  struct rdma_hw_stats *stats)
328 
329 {
330 	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {};
331 	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {};
332 	__be32 val;
333 	int ret, i;
334 
335 	if (!dev->port[port_num].rep ||
336 	    dev->port[port_num].rep->vport == MLX5_VPORT_UPLINK)
337 		return 0;
338 
339 	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
340 	MLX5_SET(query_q_counter_in, in, other_vport, 1);
341 	MLX5_SET(query_q_counter_in, in, vport_number,
342 		 dev->port[port_num].rep->vport);
343 	MLX5_SET(query_q_counter_in, in, aggregate, 1);
344 	ret = mlx5_cmd_exec_inout(dev->mdev, query_q_counter, in, out);
345 	if (ret)
346 		return ret;
347 
348 	for (i = 0; i < cnts->num_q_counters; i++) {
349 		val = *(__be32 *)((void *)out + cnts->offsets[i]);
350 		stats->value[i] = (u64)be32_to_cpu(val);
351 	}
352 
353 	return 0;
354 }
355 
356 static int do_get_hw_stats(struct ib_device *ibdev,
357 			   struct rdma_hw_stats *stats,
358 			   u32 port_num, int index)
359 {
360 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
361 	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
362 	struct mlx5_core_dev *mdev;
363 	int ret, num_counters;
364 
365 	if (!stats)
366 		return -EINVAL;
367 
368 	num_counters = cnts->num_q_counters +
369 		       cnts->num_cong_counters +
370 		       cnts->num_ext_ppcnt_counters;
371 
372 	if (is_mdev_switchdev_mode(dev->mdev) && dev->is_rep && port_num != 0)
373 		ret = mlx5_ib_query_q_counters_vport(dev, port_num - 1, cnts,
374 						     stats);
375 	else
376 		ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats,
377 					       cnts->set_id);
378 	if (ret)
379 		return ret;
380 
381 	/* We don't expose device counters over Vports */
382 	if (is_mdev_switchdev_mode(dev->mdev) && port_num != 0)
383 		goto done;
384 
385 	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
386 		ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
387 		if (ret)
388 			return ret;
389 	}
390 
391 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
392 		if (!port_num)
393 			port_num = 1;
394 		mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL);
395 		if (!mdev) {
396 			/* If port is not affiliated yet, its in down state
397 			 * which doesn't have any counters yet, so it would be
398 			 * zero. So no need to read from the HCA.
399 			 */
400 			goto done;
401 		}
402 		ret = mlx5_lag_query_cong_counters(dev->mdev,
403 						   stats->value +
404 						   cnts->num_q_counters,
405 						   cnts->num_cong_counters,
406 						   cnts->offsets +
407 						   cnts->num_q_counters);
408 
409 		mlx5_ib_put_native_port_mdev(dev, port_num);
410 		if (ret)
411 			return ret;
412 	}
413 
414 done:
415 	return num_counters;
416 }
417 
418 static int do_get_op_stat(struct ib_device *ibdev,
419 			  struct rdma_hw_stats *stats,
420 			  u32 port_num, int index)
421 {
422 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
423 	const struct mlx5_ib_counters *cnts;
424 	const struct mlx5_ib_op_fc *opfcs;
425 	u64 packets = 0, bytes;
426 	u32 type;
427 	int ret;
428 
429 	cnts = get_counters(dev, port_num);
430 
431 	opfcs = cnts->opfcs;
432 	type = *(u32 *)cnts->descs[index].priv;
433 	if (type >= MLX5_IB_OPCOUNTER_MAX)
434 		return -EINVAL;
435 
436 	if (!opfcs[type].fc)
437 		goto out;
438 
439 	ret = mlx5_fc_query(dev->mdev, opfcs[type].fc,
440 			    &packets, &bytes);
441 	if (ret)
442 		return ret;
443 
444 out:
445 	stats->value[index] = packets;
446 	return index;
447 }
448 
449 static int do_get_op_stats(struct ib_device *ibdev,
450 			   struct rdma_hw_stats *stats,
451 			   u32 port_num)
452 {
453 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
454 	const struct mlx5_ib_counters *cnts;
455 	int index, ret, num_hw_counters;
456 
457 	cnts = get_counters(dev, port_num);
458 	num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters +
459 			  cnts->num_ext_ppcnt_counters;
460 	for (index = num_hw_counters;
461 	     index < (num_hw_counters + cnts->num_op_counters); index++) {
462 		ret = do_get_op_stat(ibdev, stats, port_num, index);
463 		if (ret != index)
464 			return ret;
465 	}
466 
467 	return cnts->num_op_counters;
468 }
469 
470 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
471 				struct rdma_hw_stats *stats,
472 				u32 port_num, int index)
473 {
474 	int num_counters, num_hw_counters, num_op_counters;
475 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
476 	const struct mlx5_ib_counters *cnts;
477 
478 	cnts = get_counters(dev, port_num);
479 	num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters +
480 		cnts->num_ext_ppcnt_counters;
481 	num_counters = num_hw_counters + cnts->num_op_counters;
482 
483 	if (index < 0 || index > num_counters)
484 		return -EINVAL;
485 	else if (index > 0 && index < num_hw_counters)
486 		return do_get_hw_stats(ibdev, stats, port_num, index);
487 	else if (index >= num_hw_counters && index < num_counters)
488 		return do_get_op_stat(ibdev, stats, port_num, index);
489 
490 	num_hw_counters = do_get_hw_stats(ibdev, stats, port_num, index);
491 	if (num_hw_counters < 0)
492 		return num_hw_counters;
493 
494 	num_op_counters = do_get_op_stats(ibdev, stats, port_num);
495 	if (num_op_counters < 0)
496 		return num_op_counters;
497 
498 	return num_hw_counters + num_op_counters;
499 }
500 
501 static struct rdma_hw_stats *
502 mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
503 {
504 	struct mlx5_ib_dev *dev = to_mdev(counter->device);
505 	const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port);
506 
507 	return do_alloc_stats(cnts);
508 }
509 
510 static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
511 {
512 	struct mlx5_ib_dev *dev = to_mdev(counter->device);
513 	const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port);
514 
515 	return mlx5_ib_query_q_counters(dev->mdev, cnts,
516 					counter->stats, counter->id);
517 }
518 
519 static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
520 {
521 	struct mlx5_ib_dev *dev = to_mdev(counter->device);
522 	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
523 
524 	if (!counter->id)
525 		return 0;
526 
527 	MLX5_SET(dealloc_q_counter_in, in, opcode,
528 		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
529 	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id);
530 	return mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
531 }
532 
533 static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
534 				   struct ib_qp *qp)
535 {
536 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
537 	int err;
538 
539 	if (!counter->id) {
540 		u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
541 		u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
542 
543 		MLX5_SET(alloc_q_counter_in, in, opcode,
544 			 MLX5_CMD_OP_ALLOC_Q_COUNTER);
545 		MLX5_SET(alloc_q_counter_in, in, uid, MLX5_SHARED_RESOURCE_UID);
546 		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
547 		if (err)
548 			return err;
549 		counter->id =
550 			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
551 	}
552 
553 	err = mlx5_ib_qp_set_counter(qp, counter);
554 	if (err)
555 		goto fail_set_counter;
556 
557 	return 0;
558 
559 fail_set_counter:
560 	mlx5_ib_counter_dealloc(counter);
561 	counter->id = 0;
562 
563 	return err;
564 }
565 
566 static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
567 {
568 	return mlx5_ib_qp_set_counter(qp, NULL);
569 }
570 
571 static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
572 				  struct rdma_stat_desc *descs, size_t *offsets,
573 				  u32 port_num)
574 {
575 	bool is_vport = is_mdev_switchdev_mode(dev->mdev) &&
576 			port_num != MLX5_VPORT_PF;
577 	const struct mlx5_ib_counter *names;
578 	int j = 0, i;
579 
580 	names = is_vport ? vport_basic_q_cnts : basic_q_cnts;
581 	for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
582 		descs[j].name = names[i].name;
583 		offsets[j] = basic_q_cnts[i].offset;
584 	}
585 
586 	names = is_vport ? vport_out_of_seq_q_cnts : out_of_seq_q_cnts;
587 	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
588 		for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
589 			descs[j].name = names[i].name;
590 			offsets[j] = out_of_seq_q_cnts[i].offset;
591 		}
592 	}
593 
594 	names = is_vport ? vport_retrans_q_cnts : retrans_q_cnts;
595 	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
596 		for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
597 			descs[j].name = names[i].name;
598 			offsets[j] = retrans_q_cnts[i].offset;
599 		}
600 	}
601 
602 	names = is_vport ? vport_extended_err_cnts : extended_err_cnts;
603 	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
604 		for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
605 			descs[j].name = names[i].name;
606 			offsets[j] = extended_err_cnts[i].offset;
607 		}
608 	}
609 
610 	names = is_vport ? vport_roce_accl_cnts : roce_accl_cnts;
611 	if (MLX5_CAP_GEN(dev->mdev, roce_accl)) {
612 		for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) {
613 			descs[j].name = names[i].name;
614 			offsets[j] = roce_accl_cnts[i].offset;
615 		}
616 	}
617 
618 	if (is_vport)
619 		return;
620 
621 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
622 		for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
623 			descs[j].name = cong_cnts[i].name;
624 			offsets[j] = cong_cnts[i].offset;
625 		}
626 	}
627 
628 	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
629 		for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
630 			descs[j].name = ext_ppcnt_cnts[i].name;
631 			offsets[j] = ext_ppcnt_cnts[i].offset;
632 		}
633 	}
634 
635 	for (i = 0; i < ARRAY_SIZE(basic_op_cnts); i++, j++) {
636 		descs[j].name = basic_op_cnts[i].name;
637 		descs[j].flags |= IB_STAT_FLAG_OPTIONAL;
638 		descs[j].priv = &basic_op_cnts[i].type;
639 	}
640 
641 	if (MLX5_CAP_FLOWTABLE(dev->mdev,
642 			       ft_field_support_2_nic_receive_rdma.bth_opcode)) {
643 		for (i = 0; i < ARRAY_SIZE(rdmarx_cnp_op_cnts); i++, j++) {
644 			descs[j].name = rdmarx_cnp_op_cnts[i].name;
645 			descs[j].flags |= IB_STAT_FLAG_OPTIONAL;
646 			descs[j].priv = &rdmarx_cnp_op_cnts[i].type;
647 		}
648 	}
649 
650 	if (MLX5_CAP_FLOWTABLE(dev->mdev,
651 			       ft_field_support_2_nic_transmit_rdma.bth_opcode)) {
652 		for (i = 0; i < ARRAY_SIZE(rdmatx_cnp_op_cnts); i++, j++) {
653 			descs[j].name = rdmatx_cnp_op_cnts[i].name;
654 			descs[j].flags |= IB_STAT_FLAG_OPTIONAL;
655 			descs[j].priv = &rdmatx_cnp_op_cnts[i].type;
656 		}
657 	}
658 }
659 
660 
661 static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
662 				    struct mlx5_ib_counters *cnts, u32 port_num)
663 {
664 	u32 num_counters, num_op_counters = 0;
665 
666 	num_counters = ARRAY_SIZE(basic_q_cnts);
667 
668 	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
669 		num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
670 
671 	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
672 		num_counters += ARRAY_SIZE(retrans_q_cnts);
673 
674 	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
675 		num_counters += ARRAY_SIZE(extended_err_cnts);
676 
677 	if (MLX5_CAP_GEN(dev->mdev, roce_accl))
678 		num_counters += ARRAY_SIZE(roce_accl_cnts);
679 
680 	cnts->num_q_counters = num_counters;
681 
682 	if (is_mdev_switchdev_mode(dev->mdev) && port_num != MLX5_VPORT_PF)
683 		goto skip_non_qcounters;
684 
685 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
686 		cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
687 		num_counters += ARRAY_SIZE(cong_cnts);
688 	}
689 	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
690 		cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
691 		num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
692 	}
693 
694 	num_op_counters = ARRAY_SIZE(basic_op_cnts);
695 
696 	if (MLX5_CAP_FLOWTABLE(dev->mdev,
697 			       ft_field_support_2_nic_receive_rdma.bth_opcode))
698 		num_op_counters += ARRAY_SIZE(rdmarx_cnp_op_cnts);
699 
700 	if (MLX5_CAP_FLOWTABLE(dev->mdev,
701 			       ft_field_support_2_nic_transmit_rdma.bth_opcode))
702 		num_op_counters += ARRAY_SIZE(rdmatx_cnp_op_cnts);
703 
704 skip_non_qcounters:
705 	cnts->num_op_counters = num_op_counters;
706 	num_counters += num_op_counters;
707 	cnts->descs = kcalloc(num_counters,
708 			      sizeof(struct rdma_stat_desc), GFP_KERNEL);
709 	if (!cnts->descs)
710 		return -ENOMEM;
711 
712 	cnts->offsets = kcalloc(num_counters,
713 				sizeof(*cnts->offsets), GFP_KERNEL);
714 	if (!cnts->offsets)
715 		goto err;
716 
717 	return 0;
718 
719 err:
720 	kfree(cnts->descs);
721 	cnts->descs = NULL;
722 	return -ENOMEM;
723 }
724 
725 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
726 {
727 	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
728 	int num_cnt_ports;
729 	int i, j;
730 
731 	num_cnt_ports = (!is_mdev_switchdev_mode(dev->mdev) ||
732 			 vport_qcounters_supported(dev)) ? dev->num_ports : 1;
733 
734 	MLX5_SET(dealloc_q_counter_in, in, opcode,
735 		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
736 
737 	for (i = 0; i < num_cnt_ports; i++) {
738 		if (dev->port[i].cnts.set_id) {
739 			MLX5_SET(dealloc_q_counter_in, in, counter_set_id,
740 				 dev->port[i].cnts.set_id);
741 			mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
742 		}
743 		kfree(dev->port[i].cnts.descs);
744 		kfree(dev->port[i].cnts.offsets);
745 
746 		for (j = 0; j < MLX5_IB_OPCOUNTER_MAX; j++) {
747 			if (!dev->port[i].cnts.opfcs[j].fc)
748 				continue;
749 
750 			if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
751 				mlx5_ib_fs_remove_op_fc(dev,
752 					&dev->port[i].cnts.opfcs[j], j);
753 			mlx5_fc_destroy(dev->mdev,
754 					dev->port[i].cnts.opfcs[j].fc);
755 			dev->port[i].cnts.opfcs[j].fc = NULL;
756 		}
757 	}
758 }
759 
760 static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
761 {
762 	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
763 	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
764 	int num_cnt_ports;
765 	int err = 0;
766 	int i;
767 	bool is_shared;
768 
769 	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
770 	is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
771 	num_cnt_ports = (!is_mdev_switchdev_mode(dev->mdev) ||
772 			 vport_qcounters_supported(dev)) ? dev->num_ports : 1;
773 
774 	for (i = 0; i < num_cnt_ports; i++) {
775 		err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts, i);
776 		if (err)
777 			goto err_alloc;
778 
779 		mlx5_ib_fill_counters(dev, dev->port[i].cnts.descs,
780 				      dev->port[i].cnts.offsets, i);
781 
782 		MLX5_SET(alloc_q_counter_in, in, uid,
783 			 is_shared ? MLX5_SHARED_RESOURCE_UID : 0);
784 
785 		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
786 		if (err) {
787 			mlx5_ib_warn(dev,
788 				     "couldn't allocate queue counter for port %d, err %d\n",
789 				     i + 1, err);
790 			goto err_alloc;
791 		}
792 
793 		dev->port[i].cnts.set_id =
794 			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
795 	}
796 	return 0;
797 
798 err_alloc:
799 	mlx5_ib_dealloc_counters(dev);
800 	return err;
801 }
802 
803 static int read_flow_counters(struct ib_device *ibdev,
804 			      struct mlx5_read_counters_attr *read_attr)
805 {
806 	struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
807 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
808 
809 	return mlx5_fc_query(dev->mdev, fc,
810 			     &read_attr->out[IB_COUNTER_PACKETS],
811 			     &read_attr->out[IB_COUNTER_BYTES]);
812 }
813 
814 /* flow counters currently expose two counters packets and bytes */
815 #define FLOW_COUNTERS_NUM 2
816 static int counters_set_description(
817 	struct ib_counters *counters, enum mlx5_ib_counters_type counters_type,
818 	struct mlx5_ib_flow_counters_desc *desc_data, u32 ncounters)
819 {
820 	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
821 	u32 cntrs_max_index = 0;
822 	int i;
823 
824 	if (counters_type != MLX5_IB_COUNTERS_FLOW)
825 		return -EINVAL;
826 
827 	/* init the fields for the object */
828 	mcounters->type = counters_type;
829 	mcounters->read_counters = read_flow_counters;
830 	mcounters->counters_num = FLOW_COUNTERS_NUM;
831 	mcounters->ncounters = ncounters;
832 	/* each counter entry have both description and index pair */
833 	for (i = 0; i < ncounters; i++) {
834 		if (desc_data[i].description > IB_COUNTER_BYTES)
835 			return -EINVAL;
836 
837 		if (cntrs_max_index <= desc_data[i].index)
838 			cntrs_max_index = desc_data[i].index + 1;
839 	}
840 
841 	mutex_lock(&mcounters->mcntrs_mutex);
842 	mcounters->counters_data = desc_data;
843 	mcounters->cntrs_max_index = cntrs_max_index;
844 	mutex_unlock(&mcounters->mcntrs_mutex);
845 
846 	return 0;
847 }
848 
849 #define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
850 int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters,
851 				   struct mlx5_ib_create_flow *ucmd)
852 {
853 	struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
854 	struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
855 	struct mlx5_ib_flow_counters_desc *desc_data = NULL;
856 	bool hw_hndl = false;
857 	int ret = 0;
858 
859 	if (ucmd && ucmd->ncounters_data != 0) {
860 		cntrs_data = ucmd->data;
861 		if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
862 			return -EINVAL;
863 
864 		desc_data = kcalloc(cntrs_data->ncounters,
865 				    sizeof(*desc_data),
866 				    GFP_KERNEL);
867 		if (!desc_data)
868 			return  -ENOMEM;
869 
870 		if (copy_from_user(desc_data,
871 				   u64_to_user_ptr(cntrs_data->counters_data),
872 				   sizeof(*desc_data) * cntrs_data->ncounters)) {
873 			ret = -EFAULT;
874 			goto free;
875 		}
876 	}
877 
878 	if (!mcounters->hw_cntrs_hndl) {
879 		mcounters->hw_cntrs_hndl = mlx5_fc_create(
880 			to_mdev(ibcounters->device)->mdev, false);
881 		if (IS_ERR(mcounters->hw_cntrs_hndl)) {
882 			ret = PTR_ERR(mcounters->hw_cntrs_hndl);
883 			goto free;
884 		}
885 		hw_hndl = true;
886 	}
887 
888 	if (desc_data) {
889 		/* counters already bound to at least one flow */
890 		if (mcounters->cntrs_max_index) {
891 			ret = -EINVAL;
892 			goto free_hndl;
893 		}
894 
895 		ret = counters_set_description(ibcounters,
896 					       MLX5_IB_COUNTERS_FLOW,
897 					       desc_data,
898 					       cntrs_data->ncounters);
899 		if (ret)
900 			goto free_hndl;
901 
902 	} else if (!mcounters->cntrs_max_index) {
903 		/* counters not bound yet, must have udata passed */
904 		ret = -EINVAL;
905 		goto free_hndl;
906 	}
907 
908 	return 0;
909 
910 free_hndl:
911 	if (hw_hndl) {
912 		mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
913 				mcounters->hw_cntrs_hndl);
914 		mcounters->hw_cntrs_hndl = NULL;
915 	}
916 free:
917 	kfree(desc_data);
918 	return ret;
919 }
920 
921 void mlx5_ib_counters_clear_description(struct ib_counters *counters)
922 {
923 	struct mlx5_ib_mcounters *mcounters;
924 
925 	if (!counters || atomic_read(&counters->usecnt) != 1)
926 		return;
927 
928 	mcounters = to_mcounters(counters);
929 
930 	mutex_lock(&mcounters->mcntrs_mutex);
931 	kfree(mcounters->counters_data);
932 	mcounters->counters_data = NULL;
933 	mcounters->cntrs_max_index = 0;
934 	mutex_unlock(&mcounters->mcntrs_mutex);
935 }
936 
937 static int mlx5_ib_modify_stat(struct ib_device *device, u32 port,
938 			       unsigned int index, bool enable)
939 {
940 	struct mlx5_ib_dev *dev = to_mdev(device);
941 	struct mlx5_ib_counters *cnts;
942 	struct mlx5_ib_op_fc *opfc;
943 	u32 num_hw_counters, type;
944 	int ret;
945 
946 	cnts = &dev->port[port - 1].cnts;
947 	num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters +
948 		cnts->num_ext_ppcnt_counters;
949 	if (index < num_hw_counters ||
950 	    index >= (num_hw_counters + cnts->num_op_counters))
951 		return -EINVAL;
952 
953 	if (!(cnts->descs[index].flags & IB_STAT_FLAG_OPTIONAL))
954 		return -EINVAL;
955 
956 	type = *(u32 *)cnts->descs[index].priv;
957 	if (type >= MLX5_IB_OPCOUNTER_MAX)
958 		return -EINVAL;
959 
960 	opfc = &cnts->opfcs[type];
961 
962 	if (enable) {
963 		if (opfc->fc)
964 			return -EEXIST;
965 
966 		opfc->fc = mlx5_fc_create(dev->mdev, false);
967 		if (IS_ERR(opfc->fc))
968 			return PTR_ERR(opfc->fc);
969 
970 		ret = mlx5_ib_fs_add_op_fc(dev, port, opfc, type);
971 		if (ret) {
972 			mlx5_fc_destroy(dev->mdev, opfc->fc);
973 			opfc->fc = NULL;
974 		}
975 		return ret;
976 	}
977 
978 	if (!opfc->fc)
979 		return -EINVAL;
980 
981 	mlx5_ib_fs_remove_op_fc(dev, opfc, type);
982 	mlx5_fc_destroy(dev->mdev, opfc->fc);
983 	opfc->fc = NULL;
984 	return 0;
985 }
986 
987 static const struct ib_device_ops hw_stats_ops = {
988 	.alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats,
989 	.get_hw_stats = mlx5_ib_get_hw_stats,
990 	.counter_bind_qp = mlx5_ib_counter_bind_qp,
991 	.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
992 	.counter_dealloc = mlx5_ib_counter_dealloc,
993 	.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
994 	.counter_update_stats = mlx5_ib_counter_update_stats,
995 	.modify_hw_stat = IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) ?
996 			  mlx5_ib_modify_stat : NULL,
997 };
998 
999 static const struct ib_device_ops hw_switchdev_vport_op = {
1000 	.alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats,
1001 };
1002 
1003 static const struct ib_device_ops hw_switchdev_stats_ops = {
1004 	.alloc_hw_device_stats = mlx5_ib_alloc_hw_device_stats,
1005 	.get_hw_stats = mlx5_ib_get_hw_stats,
1006 	.counter_bind_qp = mlx5_ib_counter_bind_qp,
1007 	.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
1008 	.counter_dealloc = mlx5_ib_counter_dealloc,
1009 	.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
1010 	.counter_update_stats = mlx5_ib_counter_update_stats,
1011 };
1012 
1013 static const struct ib_device_ops counters_ops = {
1014 	.create_counters = mlx5_ib_create_counters,
1015 	.destroy_counters = mlx5_ib_destroy_counters,
1016 	.read_counters = mlx5_ib_read_counters,
1017 
1018 	INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
1019 };
1020 
1021 int mlx5_ib_counters_init(struct mlx5_ib_dev *dev)
1022 {
1023 	ib_set_device_ops(&dev->ib_dev, &counters_ops);
1024 
1025 	if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
1026 		return 0;
1027 
1028 	if (is_mdev_switchdev_mode(dev->mdev)) {
1029 		ib_set_device_ops(&dev->ib_dev, &hw_switchdev_stats_ops);
1030 		if (vport_qcounters_supported(dev))
1031 			ib_set_device_ops(&dev->ib_dev, &hw_switchdev_vport_op);
1032 	} else
1033 		ib_set_device_ops(&dev->ib_dev, &hw_stats_ops);
1034 	return mlx5_ib_alloc_counters(dev);
1035 }
1036 
1037 void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev)
1038 {
1039 	if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
1040 		return;
1041 
1042 	mlx5_ib_dealloc_counters(dev);
1043 }
1044