1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 #include "en/ptp.h" 6 #include "en/devlink.h" 7 8 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 9 { 10 unsigned long exp_time = jiffies + 11 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC); 12 13 while (time_before(jiffies, exp_time)) { 14 if (sq->cc == sq->pc) 15 return 0; 16 17 msleep(20); 18 } 19 20 netdev_err(sq->netdev, 21 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 22 sq->sqn, sq->cc, sq->pc); 23 24 return -ETIMEDOUT; 25 } 26 27 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 28 { 29 WARN_ONCE(sq->cc != sq->pc, 30 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 31 sq->sqn, sq->cc, sq->pc); 32 sq->cc = 0; 33 sq->dma_fifo_cc = 0; 34 sq->pc = 0; 35 } 36 37 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 38 { 39 struct mlx5_core_dev *mdev; 40 struct net_device *dev; 41 struct mlx5e_txqsq *sq; 42 u8 state; 43 int err; 44 45 sq = ctx; 46 mdev = sq->mdev; 47 dev = sq->netdev; 48 49 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 50 return 0; 51 52 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 53 if (err) { 54 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 55 sq->sqn, err); 56 goto out; 57 } 58 59 if (state != MLX5_SQC_STATE_ERR) 60 goto out; 61 62 mlx5e_tx_disable_queue(sq->txq); 63 64 err = mlx5e_wait_for_sq_flush(sq); 65 if (err) 66 goto out; 67 68 /* At this point, no new packets will arrive from the stack as TXQ is 69 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 70 * pending WQEs. SQ can safely reset the SQ. 71 */ 72 73 err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn); 74 if (err) 75 goto out; 76 77 mlx5e_reset_txqsq_cc_pc(sq); 78 sq->stats->recover++; 79 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 80 mlx5e_activate_txqsq(sq); 81 82 return 0; 83 out: 84 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 85 return err; 86 } 87 88 struct mlx5e_tx_timeout_ctx { 89 struct mlx5e_txqsq *sq; 90 signed int status; 91 }; 92 93 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 94 { 95 struct mlx5e_tx_timeout_ctx *to_ctx; 96 struct mlx5e_priv *priv; 97 struct mlx5_eq_comp *eq; 98 struct mlx5e_txqsq *sq; 99 int err; 100 101 to_ctx = ctx; 102 sq = to_ctx->sq; 103 eq = sq->cq.mcq.eq; 104 priv = sq->priv; 105 err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats); 106 if (!err) { 107 to_ctx->status = 0; /* this sq recovered */ 108 return err; 109 } 110 111 err = mlx5e_safe_reopen_channels(priv); 112 if (!err) { 113 to_ctx->status = 1; /* all channels recovered */ 114 return err; 115 } 116 117 to_ctx->status = err; 118 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 119 netdev_err(priv->netdev, 120 "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n", 121 err); 122 123 return err; 124 } 125 126 /* state lock cannot be grabbed within this function. 127 * It can cause a dead lock or a read-after-free. 128 */ 129 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 130 { 131 return err_ctx->recover(err_ctx->ctx); 132 } 133 134 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 135 void *context, 136 struct netlink_ext_ack *extack) 137 { 138 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 139 struct mlx5e_err_ctx *err_ctx = context; 140 141 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 142 mlx5e_health_recover_channels(priv); 143 } 144 145 static int 146 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg, 147 struct mlx5e_txqsq *sq, int tc) 148 { 149 bool stopped = netif_xmit_stopped(sq->txq); 150 struct mlx5e_priv *priv = sq->priv; 151 u8 state; 152 int err; 153 154 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 155 if (err) 156 return err; 157 158 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 159 if (err) 160 return err; 161 162 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 163 if (err) 164 return err; 165 166 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 167 if (err) 168 return err; 169 170 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 171 if (err) 172 return err; 173 174 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 175 if (err) 176 return err; 177 178 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 179 if (err) 180 return err; 181 182 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 183 if (err) 184 return err; 185 186 err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg); 187 if (err) 188 return err; 189 190 return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg); 191 } 192 193 static int 194 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 195 struct mlx5e_txqsq *sq, int tc) 196 { 197 int err; 198 199 err = devlink_fmsg_obj_nest_start(fmsg); 200 if (err) 201 return err; 202 203 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 204 if (err) 205 return err; 206 207 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc); 208 if (err) 209 return err; 210 211 err = devlink_fmsg_obj_nest_end(fmsg); 212 if (err) 213 return err; 214 215 return 0; 216 } 217 218 static int 219 mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg, 220 struct mlx5e_ptpsq *ptpsq, int tc) 221 { 222 int err; 223 224 err = devlink_fmsg_obj_nest_start(fmsg); 225 if (err) 226 return err; 227 228 err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp"); 229 if (err) 230 return err; 231 232 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc); 233 if (err) 234 return err; 235 236 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); 237 if (err) 238 return err; 239 240 err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg); 241 if (err) 242 return err; 243 244 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 245 if (err) 246 return err; 247 248 err = devlink_fmsg_obj_nest_end(fmsg); 249 if (err) 250 return err; 251 252 return 0; 253 } 254 255 static int 256 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg, 257 struct mlx5e_txqsq *txqsq) 258 { 259 u32 sq_stride, sq_sz; 260 bool real_time; 261 int err; 262 263 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 264 if (err) 265 return err; 266 267 real_time = mlx5_is_real_time_sq(txqsq->mdev); 268 sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq); 269 sq_stride = MLX5_SEND_WQE_BB; 270 271 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 272 if (err) 273 return err; 274 275 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 276 if (err) 277 return err; 278 279 err = devlink_fmsg_string_pair_put(fmsg, "ts_format", real_time ? "RT" : "FRC"); 280 if (err) 281 return err; 282 283 err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg); 284 if (err) 285 return err; 286 287 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 288 } 289 290 static int 291 mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg, 292 struct mlx5e_ptpsq *ptpsq) 293 { 294 int err; 295 296 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); 297 if (err) 298 return err; 299 300 err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg); 301 if (err) 302 return err; 303 304 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 305 } 306 307 static int 308 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter, 309 struct devlink_fmsg *fmsg) 310 { 311 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 312 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 313 struct mlx5e_ptp *ptp_ch = priv->channels.ptp; 314 struct mlx5e_ptpsq *generic_ptpsq; 315 int err; 316 317 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config"); 318 if (err) 319 return err; 320 321 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq); 322 if (err) 323 return err; 324 325 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) 326 goto out; 327 328 generic_ptpsq = &ptp_ch->ptpsq[0]; 329 330 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP"); 331 if (err) 332 return err; 333 334 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq); 335 if (err) 336 return err; 337 338 err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq); 339 if (err) 340 return err; 341 342 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 343 if (err) 344 return err; 345 346 out: 347 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 348 } 349 350 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 351 struct devlink_fmsg *fmsg, 352 struct netlink_ext_ack *extack) 353 { 354 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 355 struct mlx5e_ptp *ptp_ch = priv->channels.ptp; 356 357 int i, tc, err = 0; 358 359 mutex_lock(&priv->state_lock); 360 361 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 362 goto unlock; 363 364 err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg); 365 if (err) 366 goto unlock; 367 368 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 369 if (err) 370 goto unlock; 371 372 for (i = 0; i < priv->channels.num; i++) { 373 struct mlx5e_channel *c = priv->channels.c[i]; 374 375 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { 376 struct mlx5e_txqsq *sq = &c->sq[tc]; 377 378 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 379 if (err) 380 goto unlock; 381 } 382 } 383 384 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) 385 goto close_sqs_nest; 386 387 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { 388 err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg, 389 &ptp_ch->ptpsq[tc], 390 tc); 391 if (err) 392 goto unlock; 393 } 394 395 close_sqs_nest: 396 err = devlink_fmsg_arr_pair_nest_end(fmsg); 397 if (err) 398 goto unlock; 399 400 unlock: 401 mutex_unlock(&priv->state_lock); 402 return err; 403 } 404 405 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 406 void *ctx) 407 { 408 struct mlx5_rsc_key key = {}; 409 struct mlx5e_txqsq *sq = ctx; 410 int err; 411 412 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 413 return 0; 414 415 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 416 if (err) 417 return err; 418 419 key.size = PAGE_SIZE; 420 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 421 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 422 if (err) 423 return err; 424 425 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 426 if (err) 427 return err; 428 429 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 430 if (err) 431 return err; 432 433 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 434 if (err) 435 return err; 436 437 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 438 key.index1 = sq->sqn; 439 key.num_of_obj1 = 1; 440 441 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 442 if (err) 443 return err; 444 445 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 446 if (err) 447 return err; 448 449 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); 450 if (err) 451 return err; 452 453 key.rsc = MLX5_SGMT_TYPE_SND_BUFF; 454 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 455 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 456 if (err) 457 return err; 458 459 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 460 if (err) 461 return err; 462 463 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 464 } 465 466 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv, 467 struct devlink_fmsg *fmsg) 468 { 469 struct mlx5e_ptp *ptp_ch = priv->channels.ptp; 470 struct mlx5_rsc_key key = {}; 471 int i, tc, err; 472 473 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 474 return 0; 475 476 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 477 if (err) 478 return err; 479 480 key.size = PAGE_SIZE; 481 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 482 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 483 if (err) 484 return err; 485 486 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 487 if (err) 488 return err; 489 490 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 491 if (err) 492 return err; 493 494 for (i = 0; i < priv->channels.num; i++) { 495 struct mlx5e_channel *c = priv->channels.c[i]; 496 497 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { 498 struct mlx5e_txqsq *sq = &c->sq[tc]; 499 500 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ"); 501 if (err) 502 return err; 503 } 504 } 505 506 if (ptp_ch && test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) { 507 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { 508 struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq; 509 510 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ"); 511 if (err) 512 return err; 513 } 514 } 515 516 return devlink_fmsg_arr_pair_nest_end(fmsg); 517 } 518 519 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv, 520 struct mlx5e_err_ctx *err_ctx, 521 struct devlink_fmsg *fmsg) 522 { 523 return err_ctx->dump(priv, fmsg, err_ctx->ctx); 524 } 525 526 static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter, 527 struct devlink_fmsg *fmsg, void *context, 528 struct netlink_ext_ack *extack) 529 { 530 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 531 struct mlx5e_err_ctx *err_ctx = context; 532 533 return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : 534 mlx5e_tx_reporter_dump_all_sqs(priv, fmsg); 535 } 536 537 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 538 { 539 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 540 struct mlx5e_priv *priv = sq->priv; 541 struct mlx5e_err_ctx err_ctx = {}; 542 543 err_ctx.ctx = sq; 544 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 545 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 546 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn); 547 548 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 549 } 550 551 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 552 { 553 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 554 struct mlx5e_tx_timeout_ctx to_ctx = {}; 555 struct mlx5e_priv *priv = sq->priv; 556 struct mlx5e_err_ctx err_ctx = {}; 557 558 to_ctx.sq = sq; 559 err_ctx.ctx = &to_ctx; 560 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 561 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 562 snprintf(err_str, sizeof(err_str), 563 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", 564 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 565 jiffies_to_usecs(jiffies - sq->txq->trans_start)); 566 567 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 568 return to_ctx.status; 569 } 570 571 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 572 .name = "tx", 573 .recover = mlx5e_tx_reporter_recover, 574 .diagnose = mlx5e_tx_reporter_diagnose, 575 .dump = mlx5e_tx_reporter_dump, 576 }; 577 578 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 579 580 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 581 { 582 struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); 583 struct devlink_health_reporter *reporter; 584 585 reporter = devlink_port_health_reporter_create(dl_port, &mlx5_tx_reporter_ops, 586 MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); 587 if (IS_ERR(reporter)) { 588 netdev_warn(priv->netdev, 589 "Failed to create tx reporter, err = %ld\n", 590 PTR_ERR(reporter)); 591 return; 592 } 593 priv->tx_reporter = reporter; 594 } 595 596 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 597 { 598 if (!priv->tx_reporter) 599 return; 600 601 devlink_port_health_reporter_destroy(priv->tx_reporter); 602 priv->tx_reporter = NULL; 603 } 604