1 /* SPDX-License-Identifier: GPL-2.0 */
2 /* Copyright (c) 2019 Mellanox Technologies. */
3 
4 #include "health.h"
5 #include "en/ptp.h"
6 
7 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
8 {
9 	unsigned long exp_time = jiffies +
10 				 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
11 
12 	while (time_before(jiffies, exp_time)) {
13 		if (sq->cc == sq->pc)
14 			return 0;
15 
16 		msleep(20);
17 	}
18 
19 	netdev_err(sq->netdev,
20 		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
21 		   sq->sqn, sq->cc, sq->pc);
22 
23 	return -ETIMEDOUT;
24 }
25 
26 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
27 {
28 	WARN_ONCE(sq->cc != sq->pc,
29 		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
30 		  sq->sqn, sq->cc, sq->pc);
31 	sq->cc = 0;
32 	sq->dma_fifo_cc = 0;
33 	sq->pc = 0;
34 }
35 
36 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
37 {
38 	struct mlx5_core_dev *mdev;
39 	struct net_device *dev;
40 	struct mlx5e_txqsq *sq;
41 	u8 state;
42 	int err;
43 
44 	sq = ctx;
45 	mdev = sq->mdev;
46 	dev = sq->netdev;
47 
48 	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
49 		return 0;
50 
51 	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
52 	if (err) {
53 		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
54 			   sq->sqn, err);
55 		goto out;
56 	}
57 
58 	if (state != MLX5_SQC_STATE_ERR)
59 		goto out;
60 
61 	mlx5e_tx_disable_queue(sq->txq);
62 
63 	err = mlx5e_wait_for_sq_flush(sq);
64 	if (err)
65 		goto out;
66 
67 	/* At this point, no new packets will arrive from the stack as TXQ is
68 	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
69 	 * pending WQEs. SQ can safely reset the SQ.
70 	 */
71 
72 	err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn);
73 	if (err)
74 		goto out;
75 
76 	mlx5e_reset_txqsq_cc_pc(sq);
77 	sq->stats->recover++;
78 	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
79 	mlx5e_activate_txqsq(sq);
80 
81 	return 0;
82 out:
83 	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
84 	return err;
85 }
86 
/* Context handed to the TX timeout recover callback. */
struct mlx5e_tx_timeout_ctx {
	/* SQ on which the timeout was detected */
	struct mlx5e_txqsq *sq;
	/* Recovery outcome: 0 - this SQ recovered, 1 - all channels were
	 * recovered, otherwise the error returned by the channels reopen.
	 */
	int status;
};
91 
92 static int mlx5e_tx_reporter_timeout_recover(void *ctx)
93 {
94 	struct mlx5e_tx_timeout_ctx *to_ctx;
95 	struct mlx5e_priv *priv;
96 	struct mlx5_eq_comp *eq;
97 	struct mlx5e_txqsq *sq;
98 	int err;
99 
100 	to_ctx = ctx;
101 	sq = to_ctx->sq;
102 	eq = sq->cq.mcq.eq;
103 	priv = sq->priv;
104 	err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
105 	if (!err) {
106 		to_ctx->status = 0; /* this sq recovered */
107 		return err;
108 	}
109 
110 	err = mlx5e_safe_reopen_channels(priv);
111 	if (!err) {
112 		to_ctx->status = 1; /* all channels recovered */
113 		return err;
114 	}
115 
116 	to_ctx->status = err;
117 	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
118 	netdev_err(priv->netdev,
119 		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
120 		   err);
121 
122 	return err;
123 }
124 
125 /* state lock cannot be grabbed within this function.
126  * It can cause a dead lock or a read-after-free.
127  */
128 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
129 {
130 	return err_ctx->recover(err_ctx->ctx);
131 }
132 
/*
 * devlink health .recover handler of the TX reporter.
 *
 * With an error context (@context != NULL) the context's own recover
 * callback is run; without one, all channels are recovered.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (err_ctx)
		return mlx5e_tx_reporter_recover_from_ctx(err_ctx);

	return mlx5e_health_recover_channels(priv);
}
143 
144 static int
145 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
146 						  struct mlx5e_txqsq *sq, int tc)
147 {
148 	bool stopped = netif_xmit_stopped(sq->txq);
149 	struct mlx5e_priv *priv = sq->priv;
150 	u8 state;
151 	int err;
152 
153 	err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
154 	if (err)
155 		return err;
156 
157 	err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
158 	if (err)
159 		return err;
160 
161 	err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
162 	if (err)
163 		return err;
164 
165 	err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
166 	if (err)
167 		return err;
168 
169 	err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
170 	if (err)
171 		return err;
172 
173 	err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
174 	if (err)
175 		return err;
176 
177 	err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
178 	if (err)
179 		return err;
180 
181 	err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
182 	if (err)
183 		return err;
184 
185 	err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
186 	if (err)
187 		return err;
188 
189 	return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
190 }
191 
192 static int
193 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
194 					struct mlx5e_txqsq *sq, int tc)
195 {
196 	int err;
197 
198 	err = devlink_fmsg_obj_nest_start(fmsg);
199 	if (err)
200 		return err;
201 
202 	err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
203 	if (err)
204 		return err;
205 
206 	err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc);
207 	if (err)
208 		return err;
209 
210 	err = devlink_fmsg_obj_nest_end(fmsg);
211 	if (err)
212 		return err;
213 
214 	return 0;
215 }
216 
217 static int
218 mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg,
219 					      struct mlx5e_ptpsq *ptpsq, int tc)
220 {
221 	int err;
222 
223 	err = devlink_fmsg_obj_nest_start(fmsg);
224 	if (err)
225 		return err;
226 
227 	err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp");
228 	if (err)
229 		return err;
230 
231 	err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc);
232 	if (err)
233 		return err;
234 
235 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
236 	if (err)
237 		return err;
238 
239 	err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg);
240 	if (err)
241 		return err;
242 
243 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
244 	if (err)
245 		return err;
246 
247 	err = devlink_fmsg_obj_nest_end(fmsg);
248 	if (err)
249 		return err;
250 
251 	return 0;
252 }
253 
254 static int
255 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg,
256 					 struct mlx5e_txqsq *txqsq)
257 {
258 	u32 sq_stride, sq_sz;
259 	int err;
260 
261 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
262 	if (err)
263 		return err;
264 
265 	sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq);
266 	sq_stride = MLX5_SEND_WQE_BB;
267 
268 	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
269 	if (err)
270 		return err;
271 
272 	err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
273 	if (err)
274 		return err;
275 
276 	err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg);
277 	if (err)
278 		return err;
279 
280 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
281 }
282 
283 static int
284 mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg,
285 					      struct mlx5e_ptpsq *ptpsq)
286 {
287 	int err;
288 
289 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
290 	if (err)
291 		return err;
292 
293 	err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg);
294 	if (err)
295 		return err;
296 
297 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
298 }
299 
300 static int
301 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter,
302 					 struct devlink_fmsg *fmsg)
303 {
304 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
305 	struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
306 	struct mlx5e_ptpsq *generic_ptpsq;
307 	int err;
308 
309 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
310 	if (err)
311 		return err;
312 
313 	err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq);
314 	if (err)
315 		return err;
316 
317 	generic_ptpsq = priv->channels.port_ptp ?
318 			&priv->channels.port_ptp->ptpsq[0] :
319 			NULL;
320 	if (!generic_ptpsq)
321 		goto out;
322 
323 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP");
324 	if (err)
325 		return err;
326 
327 	err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq);
328 	if (err)
329 		return err;
330 
331 	err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq);
332 	if (err)
333 		return err;
334 
335 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
336 	if (err)
337 		return err;
338 
339 out:
340 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
341 }
342 
343 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
344 				      struct devlink_fmsg *fmsg,
345 				      struct netlink_ext_ack *extack)
346 {
347 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
348 	struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp;
349 
350 	int i, tc, err = 0;
351 
352 	mutex_lock(&priv->state_lock);
353 
354 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
355 		goto unlock;
356 
357 	err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg);
358 	if (err)
359 		goto unlock;
360 
361 	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
362 	if (err)
363 		goto unlock;
364 
365 	for (i = 0; i < priv->channels.num; i++) {
366 		struct mlx5e_channel *c = priv->channels.c[i];
367 
368 		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
369 			struct mlx5e_txqsq *sq = &c->sq[tc];
370 
371 			err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
372 			if (err)
373 				goto unlock;
374 		}
375 	}
376 
377 	if (!ptp_ch)
378 		goto close_sqs_nest;
379 
380 	for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
381 		err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg,
382 								    &ptp_ch->ptpsq[tc],
383 								    tc);
384 		if (err)
385 			goto unlock;
386 	}
387 
388 close_sqs_nest:
389 	err = devlink_fmsg_arr_pair_nest_end(fmsg);
390 	if (err)
391 		goto unlock;
392 
393 unlock:
394 	mutex_unlock(&priv->state_lock);
395 	return err;
396 }
397 
398 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
399 				     void *ctx)
400 {
401 	struct mlx5_rsc_key key = {};
402 	struct mlx5e_txqsq *sq = ctx;
403 	int err;
404 
405 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
406 		return 0;
407 
408 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
409 	if (err)
410 		return err;
411 
412 	key.size = PAGE_SIZE;
413 	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
414 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
415 	if (err)
416 		return err;
417 
418 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
419 	if (err)
420 		return err;
421 
422 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
423 	if (err)
424 		return err;
425 
426 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
427 	if (err)
428 		return err;
429 
430 	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
431 	key.index1 = sq->sqn;
432 	key.num_of_obj1 = 1;
433 
434 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
435 	if (err)
436 		return err;
437 
438 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
439 	if (err)
440 		return err;
441 
442 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
443 	if (err)
444 		return err;
445 
446 	key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
447 	key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
448 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
449 	if (err)
450 		return err;
451 
452 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
453 	if (err)
454 		return err;
455 
456 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
457 }
458 
459 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
460 					  struct devlink_fmsg *fmsg)
461 {
462 	struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp;
463 	struct mlx5_rsc_key key = {};
464 	int i, tc, err;
465 
466 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
467 		return 0;
468 
469 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
470 	if (err)
471 		return err;
472 
473 	key.size = PAGE_SIZE;
474 	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
475 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
476 	if (err)
477 		return err;
478 
479 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
480 	if (err)
481 		return err;
482 
483 	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
484 	if (err)
485 		return err;
486 
487 	for (i = 0; i < priv->channels.num; i++) {
488 		struct mlx5e_channel *c = priv->channels.c[i];
489 
490 		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
491 			struct mlx5e_txqsq *sq = &c->sq[tc];
492 
493 			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
494 			if (err)
495 				return err;
496 		}
497 	}
498 
499 	if (ptp_ch) {
500 		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
501 			struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq;
502 
503 			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ");
504 			if (err)
505 				return err;
506 		}
507 	}
508 
509 	return devlink_fmsg_arr_pair_nest_end(fmsg);
510 }
511 
512 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
513 					   struct mlx5e_err_ctx *err_ctx,
514 					   struct devlink_fmsg *fmsg)
515 {
516 	return err_ctx->dump(priv, fmsg, err_ctx->ctx);
517 }
518 
/*
 * devlink health .dump handler of the TX reporter.
 *
 * With an error context (@context != NULL) the context's own dump callback
 * is run; without one, all SQs are dumped.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (err_ctx)
		return mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg);

	return mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);
}
529 
530 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
531 {
532 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
533 	struct mlx5e_priv *priv = sq->priv;
534 	struct mlx5e_err_ctx err_ctx = {};
535 
536 	err_ctx.ctx = sq;
537 	err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
538 	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
539 	snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
540 
541 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
542 }
543 
544 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
545 {
546 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
547 	struct mlx5e_tx_timeout_ctx to_ctx = {};
548 	struct mlx5e_priv *priv = sq->priv;
549 	struct mlx5e_err_ctx err_ctx = {};
550 
551 	to_ctx.sq = sq;
552 	err_ctx.ctx = &to_ctx;
553 	err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
554 	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
555 	snprintf(err_str, sizeof(err_str),
556 		 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
557 		 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
558 		 jiffies_to_usecs(jiffies - sq->txq->trans_start));
559 
560 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
561 	return to_ctx.status;
562 }
563 
564 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
565 		.name = "tx",
566 		.recover = mlx5e_tx_reporter_recover,
567 		.diagnose = mlx5e_tx_reporter_diagnose,
568 		.dump = mlx5e_tx_reporter_dump,
569 };
570 
571 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
572 
573 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
574 {
575 	struct devlink_health_reporter *reporter;
576 
577 	reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops,
578 						       MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
579 	if (IS_ERR(reporter)) {
580 		netdev_warn(priv->netdev,
581 			    "Failed to create tx reporter, err = %ld\n",
582 			    PTR_ERR(reporter));
583 		return;
584 	}
585 	priv->tx_reporter = reporter;
586 }
587 
588 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
589 {
590 	if (!priv->tx_reporter)
591 		return;
592 
593 	devlink_port_health_reporter_destroy(priv->tx_reporter);
594 }
595