1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 #include "en/ptp.h" 6 7 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 8 { 9 unsigned long exp_time = jiffies + 10 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC); 11 12 while (time_before(jiffies, exp_time)) { 13 if (sq->cc == sq->pc) 14 return 0; 15 16 msleep(20); 17 } 18 19 netdev_err(sq->netdev, 20 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 21 sq->sqn, sq->cc, sq->pc); 22 23 return -ETIMEDOUT; 24 } 25 26 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 27 { 28 WARN_ONCE(sq->cc != sq->pc, 29 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 30 sq->sqn, sq->cc, sq->pc); 31 sq->cc = 0; 32 sq->dma_fifo_cc = 0; 33 sq->pc = 0; 34 } 35 36 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 37 { 38 struct mlx5_core_dev *mdev; 39 struct net_device *dev; 40 struct mlx5e_txqsq *sq; 41 u8 state; 42 int err; 43 44 sq = ctx; 45 mdev = sq->mdev; 46 dev = sq->netdev; 47 48 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 49 return 0; 50 51 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 52 if (err) { 53 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 54 sq->sqn, err); 55 goto out; 56 } 57 58 if (state != MLX5_SQC_STATE_ERR) 59 goto out; 60 61 mlx5e_tx_disable_queue(sq->txq); 62 63 err = mlx5e_wait_for_sq_flush(sq); 64 if (err) 65 goto out; 66 67 /* At this point, no new packets will arrive from the stack as TXQ is 68 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 69 * pending WQEs. SQ can safely reset the SQ. 70 */ 71 72 err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn); 73 if (err) 74 goto out; 75 76 mlx5e_reset_txqsq_cc_pc(sq); 77 sq->stats->recover++; 78 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 79 mlx5e_activate_txqsq(sq); 80 81 return 0; 82 out: 83 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 84 return err; 85 } 86 87 struct mlx5e_tx_timeout_ctx { 88 struct mlx5e_txqsq *sq; 89 signed int status; 90 }; 91 92 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 93 { 94 struct mlx5e_tx_timeout_ctx *to_ctx; 95 struct mlx5e_priv *priv; 96 struct mlx5_eq_comp *eq; 97 struct mlx5e_txqsq *sq; 98 int err; 99 100 to_ctx = ctx; 101 sq = to_ctx->sq; 102 eq = sq->cq.mcq.eq; 103 priv = sq->priv; 104 err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats); 105 if (!err) { 106 to_ctx->status = 0; /* this sq recovered */ 107 return err; 108 } 109 110 err = mlx5e_safe_reopen_channels(priv); 111 if (!err) { 112 to_ctx->status = 1; /* all channels recovered */ 113 return err; 114 } 115 116 to_ctx->status = err; 117 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 118 netdev_err(priv->netdev, 119 "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n", 120 err); 121 122 return err; 123 } 124 125 /* state lock cannot be grabbed within this function. 126 * It can cause a dead lock or a read-after-free. 127 */ 128 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 129 { 130 return err_ctx->recover(err_ctx->ctx); 131 } 132 133 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 134 void *context, 135 struct netlink_ext_ack *extack) 136 { 137 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 138 struct mlx5e_err_ctx *err_ctx = context; 139 140 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 141 mlx5e_health_recover_channels(priv); 142 } 143 144 static int 145 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg, 146 struct mlx5e_txqsq *sq, int tc) 147 { 148 bool stopped = netif_xmit_stopped(sq->txq); 149 struct mlx5e_priv *priv = sq->priv; 150 u8 state; 151 int err; 152 153 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 154 if (err) 155 return err; 156 157 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 158 if (err) 159 return err; 160 161 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 162 if (err) 163 return err; 164 165 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 166 if (err) 167 return err; 168 169 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 170 if (err) 171 return err; 172 173 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 174 if (err) 175 return err; 176 177 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 178 if (err) 179 return err; 180 181 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 182 if (err) 183 return err; 184 185 err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg); 186 if (err) 187 return err; 188 189 return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg); 190 } 191 192 static int 193 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 194 struct mlx5e_txqsq *sq, int tc) 195 { 196 int err; 197 198 err = devlink_fmsg_obj_nest_start(fmsg); 199 if (err) 200 return err; 201 202 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 203 if (err) 204 return err; 205 206 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc); 207 if (err) 208 return err; 209 210 err = devlink_fmsg_obj_nest_end(fmsg); 211 if (err) 212 return err; 213 214 return 0; 215 } 216 217 static int 218 mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg, 219 struct mlx5e_ptpsq *ptpsq, int tc) 220 { 221 int err; 222 223 err = devlink_fmsg_obj_nest_start(fmsg); 224 if (err) 225 return err; 226 227 err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp"); 228 if (err) 229 return err; 230 231 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc); 232 if (err) 233 return err; 234 235 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); 236 if (err) 237 return err; 238 239 err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg); 240 if (err) 241 return err; 242 243 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 244 if (err) 245 return err; 246 247 err = devlink_fmsg_obj_nest_end(fmsg); 248 if (err) 249 return err; 250 251 return 0; 252 } 253 254 static int 255 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg, 256 struct mlx5e_txqsq *txqsq) 257 { 258 u32 sq_stride, sq_sz; 259 int err; 260 261 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 262 if (err) 263 return err; 264 265 sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq); 266 sq_stride = MLX5_SEND_WQE_BB; 267 268 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 269 if (err) 270 return err; 271 272 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 273 if (err) 274 return err; 275 276 err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg); 277 if (err) 278 return err; 279 280 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 281 } 282 283 static int 284 mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg, 285 struct mlx5e_ptpsq *ptpsq) 286 { 287 int err; 288 289 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); 290 if (err) 291 return err; 292 293 err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg); 294 if (err) 295 return err; 296 297 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 298 } 299 300 static int 301 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter, 302 struct devlink_fmsg *fmsg) 303 { 304 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 305 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 306 struct mlx5e_ptpsq *generic_ptpsq; 307 int err; 308 309 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config"); 310 if (err) 311 return err; 312 313 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq); 314 if (err) 315 return err; 316 317 generic_ptpsq = priv->channels.port_ptp ? 318 &priv->channels.port_ptp->ptpsq[0] : 319 NULL; 320 if (!generic_ptpsq) 321 goto out; 322 323 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP"); 324 if (err) 325 return err; 326 327 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq); 328 if (err) 329 return err; 330 331 err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq); 332 if (err) 333 return err; 334 335 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 336 if (err) 337 return err; 338 339 out: 340 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 341 } 342 343 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 344 struct devlink_fmsg *fmsg, 345 struct netlink_ext_ack *extack) 346 { 347 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 348 struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp; 349 350 int i, tc, err = 0; 351 352 mutex_lock(&priv->state_lock); 353 354 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 355 goto unlock; 356 357 err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg); 358 if (err) 359 goto unlock; 360 361 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 362 if (err) 363 goto unlock; 364 365 for (i = 0; i < priv->channels.num; i++) { 366 struct mlx5e_channel *c = priv->channels.c[i]; 367 368 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 369 struct mlx5e_txqsq *sq = &c->sq[tc]; 370 371 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 372 if (err) 373 goto unlock; 374 } 375 } 376 377 if (!ptp_ch) 378 goto close_sqs_nest; 379 380 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 381 err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg, 382 &ptp_ch->ptpsq[tc], 383 tc); 384 if (err) 385 goto unlock; 386 } 387 388 close_sqs_nest: 389 err = devlink_fmsg_arr_pair_nest_end(fmsg); 390 if (err) 391 goto unlock; 392 393 unlock: 394 mutex_unlock(&priv->state_lock); 395 return err; 396 } 397 398 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 399 void *ctx) 400 { 401 struct mlx5_rsc_key key = {}; 402 struct mlx5e_txqsq *sq = ctx; 403 int err; 404 405 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 406 return 0; 407 408 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 409 if (err) 410 return err; 411 412 key.size = PAGE_SIZE; 413 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 414 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 415 if (err) 416 return err; 417 418 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 419 if (err) 420 return err; 421 422 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 423 if (err) 424 return err; 425 426 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 427 if (err) 428 return err; 429 430 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 431 key.index1 = sq->sqn; 432 key.num_of_obj1 = 1; 433 434 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 435 if (err) 436 return err; 437 438 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 439 if (err) 440 return err; 441 442 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); 443 if (err) 444 return err; 445 446 key.rsc = MLX5_SGMT_TYPE_SND_BUFF; 447 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 448 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 449 if (err) 450 return err; 451 452 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 453 if (err) 454 return err; 455 456 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 457 } 458 459 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv, 460 struct devlink_fmsg *fmsg) 461 { 462 struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp; 463 struct mlx5_rsc_key key = {}; 464 int i, tc, err; 465 466 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 467 return 0; 468 469 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 470 if (err) 471 return err; 472 473 key.size = PAGE_SIZE; 474 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 475 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 476 if (err) 477 return err; 478 479 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 480 if (err) 481 return err; 482 483 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 484 if (err) 485 return err; 486 487 for (i = 0; i < priv->channels.num; i++) { 488 struct mlx5e_channel *c = priv->channels.c[i]; 489 490 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 491 struct mlx5e_txqsq *sq = &c->sq[tc]; 492 493 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ"); 494 if (err) 495 return err; 496 } 497 } 498 499 if (ptp_ch) { 500 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 501 struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq; 502 503 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ"); 504 if (err) 505 return err; 506 } 507 } 508 509 return devlink_fmsg_arr_pair_nest_end(fmsg); 510 } 511 512 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv, 513 struct mlx5e_err_ctx *err_ctx, 514 struct devlink_fmsg *fmsg) 515 { 516 return err_ctx->dump(priv, fmsg, err_ctx->ctx); 517 } 518 519 static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter, 520 struct devlink_fmsg *fmsg, void *context, 521 struct netlink_ext_ack *extack) 522 { 523 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 524 struct mlx5e_err_ctx *err_ctx = context; 525 526 return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : 527 mlx5e_tx_reporter_dump_all_sqs(priv, fmsg); 528 } 529 530 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 531 { 532 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 533 struct mlx5e_priv *priv = sq->priv; 534 struct mlx5e_err_ctx err_ctx = {}; 535 536 err_ctx.ctx = sq; 537 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 538 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 539 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn); 540 541 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 542 } 543 544 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 545 { 546 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 547 struct mlx5e_tx_timeout_ctx to_ctx = {}; 548 struct mlx5e_priv *priv = sq->priv; 549 struct mlx5e_err_ctx err_ctx = {}; 550 551 to_ctx.sq = sq; 552 err_ctx.ctx = &to_ctx; 553 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 554 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 555 snprintf(err_str, sizeof(err_str), 556 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", 557 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 558 jiffies_to_usecs(jiffies - sq->txq->trans_start)); 559 560 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 561 return to_ctx.status; 562 } 563 564 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 565 .name = "tx", 566 .recover = mlx5e_tx_reporter_recover, 567 .diagnose = mlx5e_tx_reporter_diagnose, 568 .dump = mlx5e_tx_reporter_dump, 569 }; 570 571 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 572 573 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 574 { 575 struct devlink_health_reporter *reporter; 576 577 reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops, 578 MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); 579 if (IS_ERR(reporter)) { 580 netdev_warn(priv->netdev, 581 "Failed to create tx reporter, err = %ld\n", 582 PTR_ERR(reporter)); 583 return; 584 } 585 priv->tx_reporter = reporter; 586 } 587 588 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 589 { 590 if (!priv->tx_reporter) 591 return; 592 593 devlink_port_health_reporter_destroy(priv->tx_reporter); 594 } 595