/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2019 Mellanox Technologies. */

#include "health.h"
#include "en/ptp.h"
#include "en/devlink.h"
#include "lib/tout.h"

/* Poll until the SQ has drained (consumer counter catches up with the
 * producer counter), sleeping 20ms between polls.  The overall deadline
 * comes from the device's FLUSH_ON_ERROR timeout.
 * Returns 0 on drain, -ETIMEDOUT (after logging) otherwise.
 */
static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
{
	struct mlx5_core_dev *dev = sq->mdev;
	unsigned long exp_time;

	exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FLUSH_ON_ERROR));

	while (time_before(jiffies, exp_time)) {
		if (sq->cc == sq->pc)
			return 0;

		msleep(20);
	}

	netdev_err(sq->netdev,
		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
		   sq->sqn, sq->cc, sq->pc);

	return -ETIMEDOUT;
}

/* Zero the SW counters (cc, pc, dma_fifo_cc) of a flushed SQ so they match
 * the freshly reset HW queue.  Warns once if called while cc != pc, i.e. if
 * the SQ was not actually drained first.
 */
static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
{
	WARN_ONCE(sq->cc != sq->pc,
		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
		  sq->sqn, sq->cc, sq->pc);
	sq->cc = 0;
	sq->dma_fifo_cc = 0;
	sq->pc = 0;
}

/* Recover a single SQ from an error CQE.  @ctx is the struct mlx5e_txqsq.
 *
 * Sequence: verify the SQ is marked RECOVERING and is in HW error state,
 * stop the stack from queueing new packets, wait for in-flight WQEs to
 * drain, move the SQ back to ready state in FW, reset the SW counters and
 * reactivate the queue.  On any failure (or if the SQ turned out not to be
 * in error state) the RECOVERING bit is cleared via the out label.
 * Returns 0 on success or when no recovery was needed, errno otherwise.
 */
static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
{
	struct mlx5_core_dev *mdev;
	struct net_device *dev;
	struct mlx5e_txqsq *sq;
	u8 state;
	int err;

	sq = ctx;
	mdev = sq->mdev;
	dev = sq->netdev;

	/* Recovery already completed (or never requested) for this SQ. */
	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
		return 0;

	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
	if (err) {
		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
			   sq->sqn, err);
		goto out;
	}

	if (state != MLX5_SQC_STATE_ERR)
		goto out;

	mlx5e_tx_disable_queue(sq->txq);

	err = mlx5e_wait_for_sq_flush(sq);
	if (err)
		goto out;

	/* At this point, no new packets will arrive from the stack as TXQ is
	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
	 * pending WQEs, so the driver can safely reset the SQ.
	 */

	err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn);
	if (err)
		goto out;

	mlx5e_reset_txqsq_cc_pc(sq);
	sq->stats->recover++;
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	mlx5e_activate_txqsq(sq);

	return 0;
out:
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	return err;
}

/* Context passed through the devlink recover callback on TX timeout.
 * status reports the recovery outcome back to the caller of
 * mlx5e_reporter_tx_timeout(): 0 = this SQ recovered via EQ recovery,
 * 1 = all channels were reopened, negative errno = recovery failed.
 */
struct mlx5e_tx_timeout_ctx {
	struct mlx5e_txqsq *sq;
	signed int status;
};

/* Recover from a TX timeout.  @ctx is a struct mlx5e_tx_timeout_ctx.
 *
 * First tries the cheap path: recover the completion EQ of the stuck SQ.
 * If that fails, falls back to reopening all channels.  If both fail, the
 * SQ is disabled and the error is logged and propagated.
 */
static int mlx5e_tx_reporter_timeout_recover(void *ctx)
{
	struct mlx5e_tx_timeout_ctx *to_ctx;
	struct mlx5e_priv *priv;
	struct mlx5_eq_comp *eq;
	struct mlx5e_txqsq *sq;
	int err;

	to_ctx = ctx;
	sq = to_ctx->sq;
	eq = sq->cq.mcq.eq;
	priv = sq->priv;
	err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
	if (!err) {
		to_ctx->status = 0; /* this sq recovered */
		return err;
	}

	err = mlx5e_safe_reopen_channels(priv);
	if (!err) {
		to_ctx->status = 1; /* all channels recovered */
		return err;
	}

	to_ctx->status = err;
	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
	netdev_err(priv->netdev,
		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
		   err);

	return err;
}

/* state lock cannot be grabbed within this function.
 * It can cause a dead lock or a read-after-free.
 */
static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
{
	return err_ctx->recover(err_ctx->ctx);
}

/* devlink health .recover callback.  With a context, dispatch to the
 * error-specific recover routine (err CQE or TX timeout); without one
 * (user-initiated recover), fall back to full channel recovery.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) :
			 mlx5e_health_recover_channels(priv);
}

/* Emit the per-SQ diagnose key/value pairs shared by regular and PTP SQs:
 * tc, txq index, sqn, HW state, stopped flag, cc/pc, plus CQ and EQ info.
 * Caller is responsible for the enclosing fmsg object nesting.
 */
static int
mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
						  struct mlx5e_txqsq *sq, int tc)
{
	bool stopped = netif_xmit_stopped(sq->txq);
	struct mlx5e_priv *priv = sq->priv;
	u8 state;
	int err;

	err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
	if (err)
		return err;

	err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
	if (err)
		return err;

	err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
	if (err)
		return err;

	err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
	if (err)
		return err;

	return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
}

/* Emit one diagnose object for a regular (per-channel) SQ: channel index
 * followed by the common SQ fields, wrapped in an fmsg object nest.
 */
static int
mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
					struct mlx5e_txqsq *sq, int tc)
{
	int err;

	err = devlink_fmsg_obj_nest_start(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
	if (err)
		return err;

	err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc);
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_end(fmsg);
	if (err)
		return err;

	return 0;
}

/* Emit one diagnose object for a PTP SQ: labeled channel "ptp", the common
 * SQ fields, and an extra "Port TS" section with the timestamp CQ info.
 */
static int
mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg,
					      struct mlx5e_ptpsq *ptpsq, int tc)
{
	int err;

	err = devlink_fmsg_obj_nest_start(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp");
	if (err)
		return err;

	err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
	if (err)
		return err;

	err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_end(fmsg);
	if (err)
		return err;

	return 0;
}

/* Emit the "SQ" common-config section: WQE stride size, WQ size,
 * timestamp format (real-time vs free-running counter) and common CQ info.
 */
static int
mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg,
					 struct mlx5e_txqsq *txqsq)
{
	u32 sq_stride, sq_sz;
	bool real_time;
	int err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	if (err)
		return err;

	real_time = mlx5_is_real_time_sq(txqsq->mdev);
	sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq);
	sq_stride = MLX5_SEND_WQE_BB;

	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
	if (err)
		return err;

	err = devlink_fmsg_string_pair_put(fmsg, "ts_format", real_time ? "RT" : "FRC");
	if (err)
		return err;

	err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg);
	if (err)
		return err;

	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
}

/* Emit the "Port TS" common-config section for a PTP SQ (its timestamp
 * CQ common info).
 */
static int
mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg,
					      struct mlx5e_ptpsq *ptpsq)
{
	int err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
	if (err)
		return err;

	err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg);
	if (err)
		return err;

	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
}

/* Emit the "Common Config" diagnose section, using the first SQ (and, if
 * PTP TX is active, the first PTP SQ) as representative of the shared
 * configuration.  The PTP subsection is skipped when no PTP channel exists
 * or its TX state is not set.
 */
static int
mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter,
					 struct devlink_fmsg *fmsg)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
	struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
	struct mlx5e_ptpsq *generic_ptpsq;
	int err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
	if (err)
		return err;

	err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq);
	if (err)
		return err;

	if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state))
		goto out;

	generic_ptpsq = &ptp_ch->ptpsq[0];

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP");
	if (err)
		return err;

	err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq);
	if (err)
		return err;

	err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

out:
	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
}

/* devlink health .diagnose callback.  Under the state lock (and only while
 * the netdev is opened), emits the common config followed by an "SQs" array
 * with one entry per SQ of every channel and TC, plus PTP SQs when active.
 */
static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
				      struct devlink_fmsg *fmsg,
				      struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_ptp *ptp_ch = priv->channels.ptp;

	int i, tc, err = 0;

	mutex_lock(&priv->state_lock);

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		goto unlock;

	err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg);
	if (err)
		goto unlock;

	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
	if (err)
		goto unlock;

	for (i = 0; i < priv->channels.num; i++) {
		struct mlx5e_channel *c = priv->channels.c[i];

		for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
			struct mlx5e_txqsq *sq = &c->sq[tc];

			err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
			if (err)
				goto unlock;
		}
	}

	if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state))
		goto close_sqs_nest;

	for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
		err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg,
								    &ptp_ch->ptpsq[tc],
								    tc);
		if (err)
			goto unlock;
	}

close_sqs_nest:
	err = devlink_fmsg_arr_pair_nest_end(fmsg);
	if (err)
		goto unlock;

unlock:
	mutex_unlock(&priv->state_lock);
	return err;
}

/* Dump one SQ's HW resources into fmsg: the full SX slice, the SQ's QPC
 * and its send buffer.  @ctx is the struct mlx5e_txqsq.  No-op (returns 0)
 * when the netdev is not opened.
 */
static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
				     void *ctx)
{
	struct mlx5_rsc_key key = {};
	struct mlx5e_txqsq *sq = ctx;
	int err;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	if (err)
		return err;

	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
	if (err)
		return err;

	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
	key.index1 = sq->sqn;
	key.num_of_obj1 = 1;

	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
	if (err)
		return err;

	key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
	key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
}

/* Dump callback for the TX-timeout error context: unwrap the timeout ctx
 * and dump its SQ.
 */
static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
					  void *ctx)
{
	struct mlx5e_tx_timeout_ctx *to_ctx = ctx;

	return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq);
}

/* Dump the SX slice once, then every SQ of every channel/TC (and PTP SQs
 * when active) as an "SQs" array.  No-op when the netdev is not opened.
 */
static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
					  struct devlink_fmsg *fmsg)
{
	struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
	struct mlx5_rsc_key key = {};
	int i, tc, err;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	if (err)
		return err;

	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
	if (err)
		return err;

	for (i = 0; i < priv->channels.num; i++) {
		struct mlx5e_channel *c = priv->channels.c[i];

		for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
			struct mlx5e_txqsq *sq = &c->sq[tc];

			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
			if (err)
				return err;
		}
	}

	if (ptp_ch && test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) {
		for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
			struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq;

			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ");
			if (err)
				return err;
		}
	}

	return devlink_fmsg_arr_pair_nest_end(fmsg);
}

/* Dispatch to the error-specific dump routine stored in the context. */
static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
					   struct mlx5e_err_ctx *err_ctx,
					   struct devlink_fmsg *fmsg)
{
	return err_ctx->dump(priv, fmsg, err_ctx->ctx);
}

/* devlink health .dump callback.  With a context, dump only the affected
 * SQ; without one (user-initiated dump), dump all SQs.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) :
			 mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);
}

/* Report an error CQE on @sq to the devlink health infrastructure, which
 * will invoke the err-CQE recover and dump callbacks set up here.
 */
void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
{
	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5e_err_ctx err_ctx = {};

	err_ctx.ctx = sq;
	err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
	snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);

	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
}

/* Report a TX timeout on @sq to the devlink health infrastructure and run
 * recovery synchronously.  Returns the recovery status from the timeout
 * context: 0 = SQ recovered, 1 = all channels reopened, negative errno on
 * failure (see struct mlx5e_tx_timeout_ctx).
 */
int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
{
	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
	struct mlx5e_tx_timeout_ctx to_ctx = {};
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5e_err_ctx err_ctx = {};

	to_ctx.sq = sq;
	err_ctx.ctx = &to_ctx;
	err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
	err_ctx.dump = mlx5e_tx_reporter_timeout_dump;
	snprintf(err_str, sizeof(err_str),
		 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
		 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
		 jiffies_to_usecs(jiffies - READ_ONCE(sq->txq->trans_start)));

	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
	return to_ctx.status;
}

static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
	.name = "tx",
	.recover = mlx5e_tx_reporter_recover,
	.diagnose = mlx5e_tx_reporter_diagnose,
	.dump = mlx5e_tx_reporter_dump,
};

/* Grace period (ms) handed to devlink: errors reported within this window
 * of a previous one mark the reporter unhealthy instead of auto-recovering.
 */
#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500

/* Create the per-port "tx" devlink health reporter.  On failure only a
 * warning is logged and priv->tx_reporter stays NULL; the driver keeps
 * working without TX health reporting.
 */
void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
{
	struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv);
	struct devlink_health_reporter *reporter;

	reporter = devlink_port_health_reporter_create(dl_port, &mlx5_tx_reporter_ops,
						       MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
	if (IS_ERR(reporter)) {
		netdev_warn(priv->netdev,
			    "Failed to create tx reporter, err = %ld\n",
			    PTR_ERR(reporter));
		return;
	}
	priv->tx_reporter = reporter;
}

/* Destroy the TX health reporter if it was created. */
void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
{
	if (!priv->tx_reporter)
		return;

	devlink_port_health_reporter_destroy(priv->tx_reporter);
	priv->tx_reporter = NULL;
}