1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 #include "en/ptp.h" 6 #include "en/devlink.h" 7 #include "lib/tout.h" 8 9 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 10 { 11 struct mlx5_core_dev *dev = sq->mdev; 12 unsigned long exp_time; 13 14 exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FLUSH_ON_ERROR)); 15 16 while (time_before(jiffies, exp_time)) { 17 if (sq->cc == sq->pc) 18 return 0; 19 20 msleep(20); 21 } 22 23 netdev_err(sq->netdev, 24 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 25 sq->sqn, sq->cc, sq->pc); 26 27 return -ETIMEDOUT; 28 } 29 30 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 31 { 32 WARN_ONCE(sq->cc != sq->pc, 33 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 34 sq->sqn, sq->cc, sq->pc); 35 sq->cc = 0; 36 sq->dma_fifo_cc = 0; 37 sq->pc = 0; 38 } 39 40 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 41 { 42 struct mlx5_core_dev *mdev; 43 struct net_device *dev; 44 struct mlx5e_txqsq *sq; 45 u8 state; 46 int err; 47 48 sq = ctx; 49 mdev = sq->mdev; 50 dev = sq->netdev; 51 52 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 53 return 0; 54 55 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 56 if (err) { 57 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 58 sq->sqn, err); 59 goto out; 60 } 61 62 if (state != MLX5_SQC_STATE_ERR) 63 goto out; 64 65 mlx5e_tx_disable_queue(sq->txq); 66 67 err = mlx5e_wait_for_sq_flush(sq); 68 if (err) 69 goto out; 70 71 /* At this point, no new packets will arrive from the stack as TXQ is 72 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 73 * pending WQEs. SQ can safely reset the SQ. 74 */ 75 76 err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn); 77 if (err) 78 goto out; 79 80 mlx5e_reset_txqsq_cc_pc(sq); 81 sq->stats->recover++; 82 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 83 mlx5e_activate_txqsq(sq); 84 if (sq->channel) 85 mlx5e_trigger_napi_icosq(sq->channel); 86 else 87 mlx5e_trigger_napi_sched(sq->cq.napi); 88 89 return 0; 90 out: 91 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 92 return err; 93 } 94 95 struct mlx5e_tx_timeout_ctx { 96 struct mlx5e_txqsq *sq; 97 signed int status; 98 }; 99 100 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 101 { 102 struct mlx5e_tx_timeout_ctx *to_ctx; 103 struct mlx5e_priv *priv; 104 struct mlx5_eq_comp *eq; 105 struct mlx5e_txqsq *sq; 106 int err; 107 108 to_ctx = ctx; 109 sq = to_ctx->sq; 110 eq = sq->cq.mcq.eq; 111 priv = sq->priv; 112 err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats); 113 if (!err) { 114 to_ctx->status = 0; /* this sq recovered */ 115 return err; 116 } 117 118 err = mlx5e_safe_reopen_channels(priv); 119 if (!err) { 120 to_ctx->status = 1; /* all channels recovered */ 121 return err; 122 } 123 124 to_ctx->status = err; 125 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 126 netdev_err(priv->netdev, 127 "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n", 128 err); 129 130 return err; 131 } 132 133 /* state lock cannot be grabbed within this function. 134 * It can cause a dead lock or a read-after-free. 135 */ 136 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 137 { 138 return err_ctx->recover(err_ctx->ctx); 139 } 140 141 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 142 void *context, 143 struct netlink_ext_ack *extack) 144 { 145 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 146 struct mlx5e_err_ctx *err_ctx = context; 147 148 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 149 mlx5e_health_recover_channels(priv); 150 } 151 152 static int 153 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg, 154 struct mlx5e_txqsq *sq, int tc) 155 { 156 bool stopped = netif_xmit_stopped(sq->txq); 157 struct mlx5e_priv *priv = sq->priv; 158 u8 state; 159 int err; 160 161 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 162 if (err) 163 return err; 164 165 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 166 if (err) 167 return err; 168 169 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 170 if (err) 171 return err; 172 173 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 174 if (err) 175 return err; 176 177 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 178 if (err) 179 return err; 180 181 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 182 if (err) 183 return err; 184 185 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 186 if (err) 187 return err; 188 189 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 190 if (err) 191 return err; 192 193 err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg); 194 if (err) 195 return err; 196 197 return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg); 198 } 199 200 static int 201 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 202 struct mlx5e_txqsq *sq, int tc) 203 { 204 int err; 205 206 err = devlink_fmsg_obj_nest_start(fmsg); 207 if (err) 208 return err; 209 210 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 211 if (err) 212 return err; 213 214 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc); 215 if (err) 216 return err; 217 218 err = devlink_fmsg_obj_nest_end(fmsg); 219 if (err) 220 return err; 221 222 return 0; 223 } 224 225 static int 226 mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg, 227 struct mlx5e_ptpsq *ptpsq, int tc) 228 { 229 int err; 230 231 err = devlink_fmsg_obj_nest_start(fmsg); 232 if (err) 233 return err; 234 235 err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp"); 236 if (err) 237 return err; 238 239 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc); 240 if (err) 241 return err; 242 243 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); 244 if (err) 245 return err; 246 247 err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg); 248 if (err) 249 return err; 250 251 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 252 if (err) 253 return err; 254 255 err = devlink_fmsg_obj_nest_end(fmsg); 256 if (err) 257 return err; 258 259 return 0; 260 } 261 262 static int 263 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg, 264 struct mlx5e_txqsq *txqsq) 265 { 266 u32 sq_stride, sq_sz; 267 bool real_time; 268 int err; 269 270 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 271 if (err) 272 return err; 273 274 real_time = mlx5_is_real_time_sq(txqsq->mdev); 275 sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq); 276 sq_stride = MLX5_SEND_WQE_BB; 277 278 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 279 if (err) 280 return err; 281 282 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 283 if (err) 284 return err; 285 286 err = devlink_fmsg_string_pair_put(fmsg, "ts_format", real_time ? "RT" : "FRC"); 287 if (err) 288 return err; 289 290 err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg); 291 if (err) 292 return err; 293 294 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 295 } 296 297 static int 298 mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg, 299 struct mlx5e_ptpsq *ptpsq) 300 { 301 int err; 302 303 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); 304 if (err) 305 return err; 306 307 err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg); 308 if (err) 309 return err; 310 311 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 312 } 313 314 static int 315 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter, 316 struct devlink_fmsg *fmsg) 317 { 318 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 319 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 320 struct mlx5e_ptp *ptp_ch = priv->channels.ptp; 321 struct mlx5e_ptpsq *generic_ptpsq; 322 int err; 323 324 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config"); 325 if (err) 326 return err; 327 328 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq); 329 if (err) 330 return err; 331 332 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) 333 goto out; 334 335 generic_ptpsq = &ptp_ch->ptpsq[0]; 336 337 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP"); 338 if (err) 339 return err; 340 341 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq); 342 if (err) 343 return err; 344 345 err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq); 346 if (err) 347 return err; 348 349 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 350 if (err) 351 return err; 352 353 out: 354 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 355 } 356 357 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 358 struct devlink_fmsg *fmsg, 359 struct netlink_ext_ack *extack) 360 { 361 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 362 struct mlx5e_ptp *ptp_ch = priv->channels.ptp; 363 364 int i, tc, err = 0; 365 366 mutex_lock(&priv->state_lock); 367 368 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 369 goto unlock; 370 371 err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg); 372 if (err) 373 goto unlock; 374 375 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 376 if (err) 377 goto unlock; 378 379 for (i = 0; i < priv->channels.num; i++) { 380 struct mlx5e_channel *c = priv->channels.c[i]; 381 382 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { 383 struct mlx5e_txqsq *sq = &c->sq[tc]; 384 385 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 386 if (err) 387 goto unlock; 388 } 389 } 390 391 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) 392 goto close_sqs_nest; 393 394 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { 395 err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg, 396 &ptp_ch->ptpsq[tc], 397 tc); 398 if (err) 399 goto unlock; 400 } 401 402 close_sqs_nest: 403 err = devlink_fmsg_arr_pair_nest_end(fmsg); 404 if (err) 405 goto unlock; 406 407 unlock: 408 mutex_unlock(&priv->state_lock); 409 return err; 410 } 411 412 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 413 void *ctx) 414 { 415 struct mlx5_rsc_key key = {}; 416 struct mlx5e_txqsq *sq = ctx; 417 int err; 418 419 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 420 return 0; 421 422 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 423 if (err) 424 return err; 425 426 key.size = PAGE_SIZE; 427 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 428 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 429 if (err) 430 return err; 431 432 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 433 if (err) 434 return err; 435 436 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 437 if (err) 438 return err; 439 440 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 441 if (err) 442 return err; 443 444 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 445 key.index1 = sq->sqn; 446 key.num_of_obj1 = 1; 447 448 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 449 if (err) 450 return err; 451 452 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 453 if (err) 454 return err; 455 456 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); 457 if (err) 458 return err; 459 460 key.rsc = MLX5_SGMT_TYPE_SND_BUFF; 461 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 462 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 463 if (err) 464 return err; 465 466 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 467 if (err) 468 return err; 469 470 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 471 } 472 473 static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 474 void *ctx) 475 { 476 struct mlx5e_tx_timeout_ctx *to_ctx = ctx; 477 478 return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq); 479 } 480 481 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv, 482 struct devlink_fmsg *fmsg) 483 { 484 struct mlx5e_ptp *ptp_ch = priv->channels.ptp; 485 struct mlx5_rsc_key key = {}; 486 int i, tc, err; 487 488 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 489 return 0; 490 491 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 492 if (err) 493 return err; 494 495 key.size = PAGE_SIZE; 496 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 497 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 498 if (err) 499 return err; 500 501 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 502 if (err) 503 return err; 504 505 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 506 if (err) 507 return err; 508 509 for (i = 0; i < priv->channels.num; i++) { 510 struct mlx5e_channel *c = priv->channels.c[i]; 511 512 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { 513 struct mlx5e_txqsq *sq = &c->sq[tc]; 514 515 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ"); 516 if (err) 517 return err; 518 } 519 } 520 521 if (ptp_ch && test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) { 522 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { 523 struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq; 524 525 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ"); 526 if (err) 527 return err; 528 } 529 } 530 531 return devlink_fmsg_arr_pair_nest_end(fmsg); 532 } 533 534 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv, 535 struct mlx5e_err_ctx *err_ctx, 536 struct devlink_fmsg *fmsg) 537 { 538 return err_ctx->dump(priv, fmsg, err_ctx->ctx); 539 } 540 541 static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter, 542 struct devlink_fmsg *fmsg, void *context, 543 struct netlink_ext_ack *extack) 544 { 545 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 546 struct mlx5e_err_ctx *err_ctx = context; 547 548 return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : 549 mlx5e_tx_reporter_dump_all_sqs(priv, fmsg); 550 } 551 552 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 553 { 554 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 555 struct mlx5e_priv *priv = sq->priv; 556 struct mlx5e_err_ctx err_ctx = {}; 557 558 err_ctx.ctx = sq; 559 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 560 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 561 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn); 562 563 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 564 } 565 566 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 567 { 568 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 569 struct mlx5e_tx_timeout_ctx to_ctx = {}; 570 struct mlx5e_priv *priv = sq->priv; 571 struct mlx5e_err_ctx err_ctx = {}; 572 573 to_ctx.sq = sq; 574 err_ctx.ctx = &to_ctx; 575 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 576 err_ctx.dump = mlx5e_tx_reporter_timeout_dump; 577 snprintf(err_str, sizeof(err_str), 578 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", 579 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 580 jiffies_to_usecs(jiffies - READ_ONCE(sq->txq->trans_start))); 581 582 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 583 return to_ctx.status; 584 } 585 586 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 587 .name = "tx", 588 .recover = mlx5e_tx_reporter_recover, 589 .diagnose = mlx5e_tx_reporter_diagnose, 590 .dump = mlx5e_tx_reporter_dump, 591 }; 592 593 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 594 595 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 596 { 597 struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); 598 struct devlink_health_reporter *reporter; 599 600 reporter = devlink_port_health_reporter_create(dl_port, &mlx5_tx_reporter_ops, 601 MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); 602 if (IS_ERR(reporter)) { 603 netdev_warn(priv->netdev, 604 "Failed to create tx reporter, err = %ld\n", 605 PTR_ERR(reporter)); 606 return; 607 } 608 priv->tx_reporter = reporter; 609 } 610 611 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 612 { 613 if (!priv->tx_reporter) 614 return; 615 616 devlink_health_reporter_destroy(priv->tx_reporter); 617 priv->tx_reporter = NULL; 618 } 619