1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 #include "en/ptp.h" 6 #include "en/devlink.h" 7 8 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 9 { 10 unsigned long exp_time = jiffies + 11 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC); 12 13 while (time_before(jiffies, exp_time)) { 14 if (sq->cc == sq->pc) 15 return 0; 16 17 msleep(20); 18 } 19 20 netdev_err(sq->netdev, 21 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 22 sq->sqn, sq->cc, sq->pc); 23 24 return -ETIMEDOUT; 25 } 26 27 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 28 { 29 WARN_ONCE(sq->cc != sq->pc, 30 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 31 sq->sqn, sq->cc, sq->pc); 32 sq->cc = 0; 33 sq->dma_fifo_cc = 0; 34 sq->pc = 0; 35 } 36 37 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 38 { 39 struct mlx5_core_dev *mdev; 40 struct net_device *dev; 41 struct mlx5e_txqsq *sq; 42 u8 state; 43 int err; 44 45 sq = ctx; 46 mdev = sq->mdev; 47 dev = sq->netdev; 48 49 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 50 return 0; 51 52 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 53 if (err) { 54 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 55 sq->sqn, err); 56 goto out; 57 } 58 59 if (state != MLX5_SQC_STATE_ERR) 60 goto out; 61 62 mlx5e_tx_disable_queue(sq->txq); 63 64 err = mlx5e_wait_for_sq_flush(sq); 65 if (err) 66 goto out; 67 68 /* At this point, no new packets will arrive from the stack as TXQ is 69 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 70 * pending WQEs. SQ can safely reset the SQ. 71 */ 72 73 err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn); 74 if (err) 75 goto out; 76 77 mlx5e_reset_txqsq_cc_pc(sq); 78 sq->stats->recover++; 79 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 80 mlx5e_activate_txqsq(sq); 81 82 return 0; 83 out: 84 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 85 return err; 86 } 87 88 struct mlx5e_tx_timeout_ctx { 89 struct mlx5e_txqsq *sq; 90 signed int status; 91 }; 92 93 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 94 { 95 struct mlx5e_tx_timeout_ctx *to_ctx; 96 struct mlx5e_priv *priv; 97 struct mlx5_eq_comp *eq; 98 struct mlx5e_txqsq *sq; 99 int err; 100 101 to_ctx = ctx; 102 sq = to_ctx->sq; 103 eq = sq->cq.mcq.eq; 104 priv = sq->priv; 105 err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats); 106 if (!err) { 107 to_ctx->status = 0; /* this sq recovered */ 108 return err; 109 } 110 111 err = mlx5e_safe_reopen_channels(priv); 112 if (!err) { 113 to_ctx->status = 1; /* all channels recovered */ 114 return err; 115 } 116 117 to_ctx->status = err; 118 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 119 netdev_err(priv->netdev, 120 "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n", 121 err); 122 123 return err; 124 } 125 126 /* state lock cannot be grabbed within this function. 127 * It can cause a dead lock or a read-after-free. 128 */ 129 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 130 { 131 return err_ctx->recover(err_ctx->ctx); 132 } 133 134 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 135 void *context, 136 struct netlink_ext_ack *extack) 137 { 138 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 139 struct mlx5e_err_ctx *err_ctx = context; 140 141 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 142 mlx5e_health_recover_channels(priv); 143 } 144 145 static int 146 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg, 147 struct mlx5e_txqsq *sq, int tc) 148 { 149 bool stopped = netif_xmit_stopped(sq->txq); 150 struct mlx5e_priv *priv = sq->priv; 151 u8 state; 152 int err; 153 154 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 155 if (err) 156 return err; 157 158 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 159 if (err) 160 return err; 161 162 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 163 if (err) 164 return err; 165 166 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 167 if (err) 168 return err; 169 170 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 171 if (err) 172 return err; 173 174 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 175 if (err) 176 return err; 177 178 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 179 if (err) 180 return err; 181 182 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 183 if (err) 184 return err; 185 186 err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg); 187 if (err) 188 return err; 189 190 return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg); 191 } 192 193 static int 194 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 195 struct mlx5e_txqsq *sq, int tc) 196 { 197 int err; 198 199 err = devlink_fmsg_obj_nest_start(fmsg); 200 if (err) 201 return err; 202 203 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 204 if (err) 205 return err; 206 207 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc); 208 if (err) 209 return err; 210 211 err = devlink_fmsg_obj_nest_end(fmsg); 212 if (err) 213 return err; 214 215 return 0; 216 } 217 218 static int 219 mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg, 220 struct mlx5e_ptpsq *ptpsq, int tc) 221 { 222 int err; 223 224 err = devlink_fmsg_obj_nest_start(fmsg); 225 if (err) 226 return err; 227 228 err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp"); 229 if (err) 230 return err; 231 232 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc); 233 if (err) 234 return err; 235 236 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); 237 if (err) 238 return err; 239 240 err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg); 241 if (err) 242 return err; 243 244 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 245 if (err) 246 return err; 247 248 err = devlink_fmsg_obj_nest_end(fmsg); 249 if (err) 250 return err; 251 252 return 0; 253 } 254 255 static int 256 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg, 257 struct mlx5e_txqsq *txqsq) 258 { 259 u32 sq_stride, sq_sz; 260 int err; 261 262 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 263 if (err) 264 return err; 265 266 sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq); 267 sq_stride = MLX5_SEND_WQE_BB; 268 269 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 270 if (err) 271 return err; 272 273 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 274 if (err) 275 return err; 276 277 err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg); 278 if (err) 279 return err; 280 281 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 282 } 283 284 static int 285 mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg, 286 struct mlx5e_ptpsq *ptpsq) 287 { 288 int err; 289 290 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); 291 if (err) 292 return err; 293 294 err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg); 295 if (err) 296 return err; 297 298 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 299 } 300 301 static int 302 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter, 303 struct devlink_fmsg *fmsg) 304 { 305 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 306 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 307 struct mlx5e_ptp *ptp_ch = priv->channels.ptp; 308 struct mlx5e_ptpsq *generic_ptpsq; 309 int err; 310 311 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config"); 312 if (err) 313 return err; 314 315 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq); 316 if (err) 317 return err; 318 319 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) 320 goto out; 321 322 generic_ptpsq = &ptp_ch->ptpsq[0]; 323 324 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP"); 325 if (err) 326 return err; 327 328 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq); 329 if (err) 330 return err; 331 332 err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq); 333 if (err) 334 return err; 335 336 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 337 if (err) 338 return err; 339 340 out: 341 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 342 } 343 344 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 345 struct devlink_fmsg *fmsg, 346 struct netlink_ext_ack *extack) 347 { 348 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 349 struct mlx5e_ptp *ptp_ch = priv->channels.ptp; 350 351 int i, tc, err = 0; 352 353 mutex_lock(&priv->state_lock); 354 355 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 356 goto unlock; 357 358 err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg); 359 if (err) 360 goto unlock; 361 362 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 363 if (err) 364 goto unlock; 365 366 for (i = 0; i < priv->channels.num; i++) { 367 struct mlx5e_channel *c = priv->channels.c[i]; 368 369 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 370 struct mlx5e_txqsq *sq = &c->sq[tc]; 371 372 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 373 if (err) 374 goto unlock; 375 } 376 } 377 378 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) 379 goto close_sqs_nest; 380 381 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 382 err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg, 383 &ptp_ch->ptpsq[tc], 384 tc); 385 if (err) 386 goto unlock; 387 } 388 389 close_sqs_nest: 390 err = devlink_fmsg_arr_pair_nest_end(fmsg); 391 if (err) 392 goto unlock; 393 394 unlock: 395 mutex_unlock(&priv->state_lock); 396 return err; 397 } 398 399 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 400 void *ctx) 401 { 402 struct mlx5_rsc_key key = {}; 403 struct mlx5e_txqsq *sq = ctx; 404 int err; 405 406 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 407 return 0; 408 409 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 410 if (err) 411 return err; 412 413 key.size = PAGE_SIZE; 414 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 415 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 416 if (err) 417 return err; 418 419 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 420 if (err) 421 return err; 422 423 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 424 if (err) 425 return err; 426 427 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 428 if (err) 429 return err; 430 431 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 432 key.index1 = sq->sqn; 433 key.num_of_obj1 = 1; 434 435 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 436 if (err) 437 return err; 438 439 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 440 if (err) 441 return err; 442 443 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); 444 if (err) 445 return err; 446 447 key.rsc = MLX5_SGMT_TYPE_SND_BUFF; 448 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 449 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 450 if (err) 451 return err; 452 453 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 454 if (err) 455 return err; 456 457 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 458 } 459 460 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv, 461 struct devlink_fmsg *fmsg) 462 { 463 struct mlx5e_ptp *ptp_ch = priv->channels.ptp; 464 struct mlx5_rsc_key key = {}; 465 int i, tc, err; 466 467 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 468 return 0; 469 470 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 471 if (err) 472 return err; 473 474 key.size = PAGE_SIZE; 475 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 476 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 477 if (err) 478 return err; 479 480 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 481 if (err) 482 return err; 483 484 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 485 if (err) 486 return err; 487 488 for (i = 0; i < priv->channels.num; i++) { 489 struct mlx5e_channel *c = priv->channels.c[i]; 490 491 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 492 struct mlx5e_txqsq *sq = &c->sq[tc]; 493 494 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ"); 495 if (err) 496 return err; 497 } 498 } 499 500 if (ptp_ch && test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) { 501 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 502 struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq; 503 504 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ"); 505 if (err) 506 return err; 507 } 508 } 509 510 return devlink_fmsg_arr_pair_nest_end(fmsg); 511 } 512 513 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv, 514 struct mlx5e_err_ctx *err_ctx, 515 struct devlink_fmsg *fmsg) 516 { 517 return err_ctx->dump(priv, fmsg, err_ctx->ctx); 518 } 519 520 static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter, 521 struct devlink_fmsg *fmsg, void *context, 522 struct netlink_ext_ack *extack) 523 { 524 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 525 struct mlx5e_err_ctx *err_ctx = context; 526 527 return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : 528 mlx5e_tx_reporter_dump_all_sqs(priv, fmsg); 529 } 530 531 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 532 { 533 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 534 struct mlx5e_priv *priv = sq->priv; 535 struct mlx5e_err_ctx err_ctx = {}; 536 537 err_ctx.ctx = sq; 538 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 539 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 540 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn); 541 542 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 543 } 544 545 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 546 { 547 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 548 struct mlx5e_tx_timeout_ctx to_ctx = {}; 549 struct mlx5e_priv *priv = sq->priv; 550 struct mlx5e_err_ctx err_ctx = {}; 551 552 to_ctx.sq = sq; 553 err_ctx.ctx = &to_ctx; 554 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 555 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 556 snprintf(err_str, sizeof(err_str), 557 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", 558 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 559 jiffies_to_usecs(jiffies - sq->txq->trans_start)); 560 561 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 562 return to_ctx.status; 563 } 564 565 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 566 .name = "tx", 567 .recover = mlx5e_tx_reporter_recover, 568 .diagnose = mlx5e_tx_reporter_diagnose, 569 .dump = mlx5e_tx_reporter_dump, 570 }; 571 572 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 573 574 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 575 { 576 struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); 577 struct devlink_health_reporter *reporter; 578 579 reporter = devlink_port_health_reporter_create(dl_port, &mlx5_tx_reporter_ops, 580 MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); 581 if (IS_ERR(reporter)) { 582 netdev_warn(priv->netdev, 583 "Failed to create tx reporter, err = %ld\n", 584 PTR_ERR(reporter)); 585 return; 586 } 587 priv->tx_reporter = reporter; 588 } 589 590 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 591 { 592 if (!priv->tx_reporter) 593 return; 594 595 devlink_port_health_reporter_destroy(priv->tx_reporter); 596 priv->tx_reporter = NULL; 597 } 598