1 /* SPDX-License-Identifier: GPL-2.0 */
2 /* Copyright (c) 2019 Mellanox Technologies. */
3 
4 #include "health.h"
5 #include "en/ptp.h"
6 
7 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
8 {
9 	unsigned long exp_time = jiffies +
10 				 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
11 
12 	while (time_before(jiffies, exp_time)) {
13 		if (sq->cc == sq->pc)
14 			return 0;
15 
16 		msleep(20);
17 	}
18 
19 	netdev_err(sq->netdev,
20 		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
21 		   sq->sqn, sq->cc, sq->pc);
22 
23 	return -ETIMEDOUT;
24 }
25 
26 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
27 {
28 	WARN_ONCE(sq->cc != sq->pc,
29 		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
30 		  sq->sqn, sq->cc, sq->pc);
31 	sq->cc = 0;
32 	sq->dma_fifo_cc = 0;
33 	sq->pc = 0;
34 }
35 
36 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
37 {
38 	struct mlx5_core_dev *mdev;
39 	struct net_device *dev;
40 	struct mlx5e_txqsq *sq;
41 	u8 state;
42 	int err;
43 
44 	sq = ctx;
45 	mdev = sq->mdev;
46 	dev = sq->netdev;
47 
48 	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
49 		return 0;
50 
51 	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
52 	if (err) {
53 		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
54 			   sq->sqn, err);
55 		goto out;
56 	}
57 
58 	if (state != MLX5_SQC_STATE_ERR)
59 		goto out;
60 
61 	mlx5e_tx_disable_queue(sq->txq);
62 
63 	err = mlx5e_wait_for_sq_flush(sq);
64 	if (err)
65 		goto out;
66 
67 	/* At this point, no new packets will arrive from the stack as TXQ is
68 	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
69 	 * pending WQEs. SQ can safely reset the SQ.
70 	 */
71 
72 	err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn);
73 	if (err)
74 		goto out;
75 
76 	mlx5e_reset_txqsq_cc_pc(sq);
77 	sq->stats->recover++;
78 	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
79 	mlx5e_activate_txqsq(sq);
80 
81 	return 0;
82 out:
83 	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
84 	return err;
85 }
86 
/* Context handed to the TX timeout recovery callback. */
struct mlx5e_tx_timeout_ctx {
	/* SQ on which the timeout was detected */
	struct mlx5e_txqsq *sq;
	/* Recovery outcome: 0 - this SQ recovered, 1 - all channels were
	 * reopened, negative errno - recovery failed.
	 */
	int status;
};
91 
92 static int mlx5e_tx_reporter_timeout_recover(void *ctx)
93 {
94 	struct mlx5e_tx_timeout_ctx *to_ctx;
95 	struct mlx5e_priv *priv;
96 	struct mlx5_eq_comp *eq;
97 	struct mlx5e_txqsq *sq;
98 	int err;
99 
100 	to_ctx = ctx;
101 	sq = to_ctx->sq;
102 	eq = sq->cq.mcq.eq;
103 	priv = sq->priv;
104 	err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
105 	if (!err) {
106 		to_ctx->status = 0; /* this sq recovered */
107 		return err;
108 	}
109 
110 	err = mlx5e_safe_reopen_channels(priv);
111 	if (!err) {
112 		to_ctx->status = 1; /* all channels recovered */
113 		return err;
114 	}
115 
116 	to_ctx->status = err;
117 	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
118 	netdev_err(priv->netdev,
119 		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
120 		   err);
121 
122 	return err;
123 }
124 
125 /* state lock cannot be grabbed within this function.
126  * It can cause a dead lock or a read-after-free.
127  */
128 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
129 {
130 	return err_ctx->recover(err_ctx->ctx);
131 }
132 
/* devlink health .recover callback of the TX reporter.
 *
 * With an error context, runs that context's own recovery routine.
 * Without one (@context is NULL), recovers all channels.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_health_recover_channels(priv);

	return mlx5e_tx_reporter_recover_from_ctx(err_ctx);
}
143 
144 static int
145 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
146 						  struct mlx5e_txqsq *sq, int tc)
147 {
148 	bool stopped = netif_xmit_stopped(sq->txq);
149 	struct mlx5e_priv *priv = sq->priv;
150 	u8 state;
151 	int err;
152 
153 	err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
154 	if (err)
155 		return err;
156 
157 	err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
158 	if (err)
159 		return err;
160 
161 	err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
162 	if (err)
163 		return err;
164 
165 	err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
166 	if (err)
167 		return err;
168 
169 	err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
170 	if (err)
171 		return err;
172 
173 	err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
174 	if (err)
175 		return err;
176 
177 	err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
178 	if (err)
179 		return err;
180 
181 	err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
182 	if (err)
183 		return err;
184 
185 	err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
186 	if (err)
187 		return err;
188 
189 	return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
190 }
191 
192 static int
193 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
194 					struct mlx5e_txqsq *sq, int tc)
195 {
196 	int err;
197 
198 	err = devlink_fmsg_obj_nest_start(fmsg);
199 	if (err)
200 		return err;
201 
202 	err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
203 	if (err)
204 		return err;
205 
206 	err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc);
207 	if (err)
208 		return err;
209 
210 	err = devlink_fmsg_obj_nest_end(fmsg);
211 	if (err)
212 		return err;
213 
214 	return 0;
215 }
216 
217 static int
218 mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg,
219 					      struct mlx5e_ptpsq *ptpsq, int tc)
220 {
221 	int err;
222 
223 	err = devlink_fmsg_obj_nest_start(fmsg);
224 	if (err)
225 		return err;
226 
227 	err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp");
228 	if (err)
229 		return err;
230 
231 	err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg,
232 								&ptpsq->txqsq,
233 								tc);
234 	if (err)
235 		return err;
236 
237 	err = devlink_fmsg_obj_nest_end(fmsg);
238 	if (err)
239 		return err;
240 
241 	return 0;
242 }
243 
244 static int
245 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg,
246 					 struct mlx5e_txqsq *txqsq)
247 {
248 	u32 sq_stride, sq_sz;
249 	int err;
250 
251 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
252 	if (err)
253 		return err;
254 
255 	sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq);
256 	sq_stride = MLX5_SEND_WQE_BB;
257 
258 	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
259 	if (err)
260 		return err;
261 
262 	err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
263 	if (err)
264 		return err;
265 
266 	err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg);
267 	if (err)
268 		return err;
269 
270 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
271 }
272 
273 static int
274 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter,
275 					 struct devlink_fmsg *fmsg)
276 {
277 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
278 	struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
279 	struct mlx5e_ptpsq *generic_ptpsq;
280 	int err;
281 
282 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
283 	if (err)
284 		return err;
285 
286 	err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq);
287 	if (err)
288 		return err;
289 
290 	generic_ptpsq = priv->channels.port_ptp ?
291 			&priv->channels.port_ptp->ptpsq[0] :
292 			NULL;
293 	if (!generic_ptpsq)
294 		goto out;
295 
296 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP");
297 	if (err)
298 		return err;
299 
300 	err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq);
301 	if (err)
302 		return err;
303 
304 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
305 	if (err)
306 		return err;
307 
308 out:
309 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
310 }
311 
312 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
313 				      struct devlink_fmsg *fmsg,
314 				      struct netlink_ext_ack *extack)
315 {
316 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
317 	struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp;
318 
319 	int i, tc, err = 0;
320 
321 	mutex_lock(&priv->state_lock);
322 
323 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
324 		goto unlock;
325 
326 	err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg);
327 	if (err)
328 		goto unlock;
329 
330 	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
331 	if (err)
332 		goto unlock;
333 
334 	for (i = 0; i < priv->channels.num; i++) {
335 		struct mlx5e_channel *c = priv->channels.c[i];
336 
337 		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
338 			struct mlx5e_txqsq *sq = &c->sq[tc];
339 
340 			err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
341 			if (err)
342 				goto unlock;
343 		}
344 	}
345 
346 	if (!ptp_ch)
347 		goto close_sqs_nest;
348 
349 	for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
350 		err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg,
351 								    &ptp_ch->ptpsq[tc],
352 								    tc);
353 		if (err)
354 			goto unlock;
355 	}
356 
357 close_sqs_nest:
358 	err = devlink_fmsg_arr_pair_nest_end(fmsg);
359 	if (err)
360 		goto unlock;
361 
362 unlock:
363 	mutex_unlock(&priv->state_lock);
364 	return err;
365 }
366 
367 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
368 				     void *ctx)
369 {
370 	struct mlx5_rsc_key key = {};
371 	struct mlx5e_txqsq *sq = ctx;
372 	int err;
373 
374 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
375 		return 0;
376 
377 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
378 	if (err)
379 		return err;
380 
381 	key.size = PAGE_SIZE;
382 	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
383 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
384 	if (err)
385 		return err;
386 
387 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
388 	if (err)
389 		return err;
390 
391 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
392 	if (err)
393 		return err;
394 
395 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
396 	if (err)
397 		return err;
398 
399 	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
400 	key.index1 = sq->sqn;
401 	key.num_of_obj1 = 1;
402 
403 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
404 	if (err)
405 		return err;
406 
407 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
408 	if (err)
409 		return err;
410 
411 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
412 	if (err)
413 		return err;
414 
415 	key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
416 	key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
417 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
418 	if (err)
419 		return err;
420 
421 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
422 	if (err)
423 		return err;
424 
425 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
426 }
427 
428 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
429 					  struct devlink_fmsg *fmsg)
430 {
431 	struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp;
432 	struct mlx5_rsc_key key = {};
433 	int i, tc, err;
434 
435 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
436 		return 0;
437 
438 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
439 	if (err)
440 		return err;
441 
442 	key.size = PAGE_SIZE;
443 	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
444 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
445 	if (err)
446 		return err;
447 
448 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
449 	if (err)
450 		return err;
451 
452 	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
453 	if (err)
454 		return err;
455 
456 	for (i = 0; i < priv->channels.num; i++) {
457 		struct mlx5e_channel *c = priv->channels.c[i];
458 
459 		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
460 			struct mlx5e_txqsq *sq = &c->sq[tc];
461 
462 			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
463 			if (err)
464 				return err;
465 		}
466 	}
467 
468 	if (ptp_ch) {
469 		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
470 			struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq;
471 
472 			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ");
473 			if (err)
474 				return err;
475 		}
476 	}
477 
478 	return devlink_fmsg_arr_pair_nest_end(fmsg);
479 }
480 
481 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
482 					   struct mlx5e_err_ctx *err_ctx,
483 					   struct devlink_fmsg *fmsg)
484 {
485 	return err_ctx->dump(priv, fmsg, err_ctx->ctx);
486 }
487 
/* devlink health .dump callback of the TX reporter.
 *
 * With an error context, runs that context's own dump routine. Without
 * one (@context is NULL), dumps all SQs.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);

	return mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg);
}
498 
499 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
500 {
501 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
502 	struct mlx5e_priv *priv = sq->priv;
503 	struct mlx5e_err_ctx err_ctx = {};
504 
505 	err_ctx.ctx = sq;
506 	err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
507 	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
508 	snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
509 
510 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
511 }
512 
513 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
514 {
515 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
516 	struct mlx5e_tx_timeout_ctx to_ctx = {};
517 	struct mlx5e_priv *priv = sq->priv;
518 	struct mlx5e_err_ctx err_ctx = {};
519 
520 	to_ctx.sq = sq;
521 	err_ctx.ctx = &to_ctx;
522 	err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
523 	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
524 	snprintf(err_str, sizeof(err_str),
525 		 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
526 		 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
527 		 jiffies_to_usecs(jiffies - sq->txq->trans_start));
528 
529 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
530 	return to_ctx.status;
531 }
532 
533 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
534 		.name = "tx",
535 		.recover = mlx5e_tx_reporter_recover,
536 		.diagnose = mlx5e_tx_reporter_diagnose,
537 		.dump = mlx5e_tx_reporter_dump,
538 };
539 
/* Graceful period handed to devlink_port_health_reporter_create() when the
 * TX reporter is created.
 * NOTE(review): the unit is presumably milliseconds, as defined by the
 * devlink health API - confirm against the devlink documentation.
 */
#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
541 
542 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
543 {
544 	struct devlink_health_reporter *reporter;
545 
546 	reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops,
547 						       MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
548 	if (IS_ERR(reporter)) {
549 		netdev_warn(priv->netdev,
550 			    "Failed to create tx reporter, err = %ld\n",
551 			    PTR_ERR(reporter));
552 		return;
553 	}
554 	priv->tx_reporter = reporter;
555 }
556 
557 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
558 {
559 	if (!priv->tx_reporter)
560 		return;
561 
562 	devlink_port_health_reporter_destroy(priv->tx_reporter);
563 }
564