1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 #include "en/ptp.h" 6 7 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 8 { 9 unsigned long exp_time = jiffies + 10 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC); 11 12 while (time_before(jiffies, exp_time)) { 13 if (sq->cc == sq->pc) 14 return 0; 15 16 msleep(20); 17 } 18 19 netdev_err(sq->netdev, 20 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 21 sq->sqn, sq->cc, sq->pc); 22 23 return -ETIMEDOUT; 24 } 25 26 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 27 { 28 WARN_ONCE(sq->cc != sq->pc, 29 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 30 sq->sqn, sq->cc, sq->pc); 31 sq->cc = 0; 32 sq->dma_fifo_cc = 0; 33 sq->pc = 0; 34 } 35 36 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 37 { 38 struct mlx5_core_dev *mdev; 39 struct net_device *dev; 40 struct mlx5e_txqsq *sq; 41 u8 state; 42 int err; 43 44 sq = ctx; 45 mdev = sq->mdev; 46 dev = sq->netdev; 47 48 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 49 return 0; 50 51 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 52 if (err) { 53 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 54 sq->sqn, err); 55 goto out; 56 } 57 58 if (state != MLX5_SQC_STATE_ERR) 59 goto out; 60 61 mlx5e_tx_disable_queue(sq->txq); 62 63 err = mlx5e_wait_for_sq_flush(sq); 64 if (err) 65 goto out; 66 67 /* At this point, no new packets will arrive from the stack as TXQ is 68 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 69 * pending WQEs. SQ can safely reset the SQ. 70 */ 71 72 err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn); 73 if (err) 74 goto out; 75 76 mlx5e_reset_txqsq_cc_pc(sq); 77 sq->stats->recover++; 78 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 79 mlx5e_activate_txqsq(sq); 80 81 return 0; 82 out: 83 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 84 return err; 85 } 86 87 struct mlx5e_tx_timeout_ctx { 88 struct mlx5e_txqsq *sq; 89 signed int status; 90 }; 91 92 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 93 { 94 struct mlx5e_tx_timeout_ctx *to_ctx; 95 struct mlx5e_priv *priv; 96 struct mlx5_eq_comp *eq; 97 struct mlx5e_txqsq *sq; 98 int err; 99 100 to_ctx = ctx; 101 sq = to_ctx->sq; 102 eq = sq->cq.mcq.eq; 103 priv = sq->priv; 104 err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats); 105 if (!err) { 106 to_ctx->status = 0; /* this sq recovered */ 107 return err; 108 } 109 110 err = mlx5e_safe_reopen_channels(priv); 111 if (!err) { 112 to_ctx->status = 1; /* all channels recovered */ 113 return err; 114 } 115 116 to_ctx->status = err; 117 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 118 netdev_err(priv->netdev, 119 "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n", 120 err); 121 122 return err; 123 } 124 125 /* state lock cannot be grabbed within this function. 126 * It can cause a dead lock or a read-after-free. 127 */ 128 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 129 { 130 return err_ctx->recover(err_ctx->ctx); 131 } 132 133 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 134 void *context, 135 struct netlink_ext_ack *extack) 136 { 137 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 138 struct mlx5e_err_ctx *err_ctx = context; 139 140 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 141 mlx5e_health_recover_channels(priv); 142 } 143 144 static int 145 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg, 146 struct mlx5e_txqsq *sq, int tc) 147 { 148 bool stopped = netif_xmit_stopped(sq->txq); 149 struct mlx5e_priv *priv = sq->priv; 150 u8 state; 151 int err; 152 153 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 154 if (err) 155 return err; 156 157 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 158 if (err) 159 return err; 160 161 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 162 if (err) 163 return err; 164 165 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 166 if (err) 167 return err; 168 169 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 170 if (err) 171 return err; 172 173 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 174 if (err) 175 return err; 176 177 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 178 if (err) 179 return err; 180 181 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 182 if (err) 183 return err; 184 185 err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg); 186 if (err) 187 return err; 188 189 return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg); 190 } 191 192 static int 193 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 194 struct mlx5e_txqsq *sq, int tc) 195 { 196 int err; 197 198 err = devlink_fmsg_obj_nest_start(fmsg); 199 if (err) 200 return err; 201 202 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 203 if (err) 204 return err; 205 206 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc); 207 if (err) 208 return err; 209 210 err = devlink_fmsg_obj_nest_end(fmsg); 211 if (err) 212 return err; 213 214 return 0; 215 } 216 217 static int 218 mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg, 219 struct mlx5e_ptpsq *ptpsq, int tc) 220 { 221 int err; 222 223 err = devlink_fmsg_obj_nest_start(fmsg); 224 if (err) 225 return err; 226 227 err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp"); 228 if (err) 229 return err; 230 231 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, 232 &ptpsq->txqsq, 233 tc); 234 if (err) 235 return err; 236 237 err = devlink_fmsg_obj_nest_end(fmsg); 238 if (err) 239 return err; 240 241 return 0; 242 } 243 244 static int 245 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg, 246 struct mlx5e_txqsq *txqsq) 247 { 248 u32 sq_stride, sq_sz; 249 int err; 250 251 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 252 if (err) 253 return err; 254 255 sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq); 256 sq_stride = MLX5_SEND_WQE_BB; 257 258 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 259 if (err) 260 return err; 261 262 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 263 if (err) 264 return err; 265 266 err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg); 267 if (err) 268 return err; 269 270 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 271 } 272 273 static int 274 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter, 275 struct devlink_fmsg *fmsg) 276 { 277 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 278 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 279 struct mlx5e_ptpsq *generic_ptpsq; 280 int err; 281 282 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config"); 283 if (err) 284 return err; 285 286 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq); 287 if (err) 288 return err; 289 290 generic_ptpsq = priv->channels.port_ptp ? 291 &priv->channels.port_ptp->ptpsq[0] : 292 NULL; 293 if (!generic_ptpsq) 294 goto out; 295 296 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP"); 297 if (err) 298 return err; 299 300 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq); 301 if (err) 302 return err; 303 304 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 305 if (err) 306 return err; 307 308 out: 309 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 310 } 311 312 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 313 struct devlink_fmsg *fmsg, 314 struct netlink_ext_ack *extack) 315 { 316 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 317 struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp; 318 319 int i, tc, err = 0; 320 321 mutex_lock(&priv->state_lock); 322 323 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 324 goto unlock; 325 326 err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg); 327 if (err) 328 goto unlock; 329 330 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 331 if (err) 332 goto unlock; 333 334 for (i = 0; i < priv->channels.num; i++) { 335 struct mlx5e_channel *c = priv->channels.c[i]; 336 337 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 338 struct mlx5e_txqsq *sq = &c->sq[tc]; 339 340 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 341 if (err) 342 goto unlock; 343 } 344 } 345 346 if (!ptp_ch) 347 goto close_sqs_nest; 348 349 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 350 err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg, 351 &ptp_ch->ptpsq[tc], 352 tc); 353 if (err) 354 goto unlock; 355 } 356 357 close_sqs_nest: 358 err = devlink_fmsg_arr_pair_nest_end(fmsg); 359 if (err) 360 goto unlock; 361 362 unlock: 363 mutex_unlock(&priv->state_lock); 364 return err; 365 } 366 367 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 368 void *ctx) 369 { 370 struct mlx5_rsc_key key = {}; 371 struct mlx5e_txqsq *sq = ctx; 372 int err; 373 374 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 375 return 0; 376 377 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 378 if (err) 379 return err; 380 381 key.size = PAGE_SIZE; 382 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 383 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 384 if (err) 385 return err; 386 387 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 388 if (err) 389 return err; 390 391 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 392 if (err) 393 return err; 394 395 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 396 if (err) 397 return err; 398 399 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 400 key.index1 = sq->sqn; 401 key.num_of_obj1 = 1; 402 403 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 404 if (err) 405 return err; 406 407 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 408 if (err) 409 return err; 410 411 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); 412 if (err) 413 return err; 414 415 key.rsc = MLX5_SGMT_TYPE_SND_BUFF; 416 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 417 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 418 if (err) 419 return err; 420 421 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 422 if (err) 423 return err; 424 425 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 426 } 427 428 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv, 429 struct devlink_fmsg *fmsg) 430 { 431 struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp; 432 struct mlx5_rsc_key key = {}; 433 int i, tc, err; 434 435 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 436 return 0; 437 438 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 439 if (err) 440 return err; 441 442 key.size = PAGE_SIZE; 443 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 444 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 445 if (err) 446 return err; 447 448 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 449 if (err) 450 return err; 451 452 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 453 if (err) 454 return err; 455 456 for (i = 0; i < priv->channels.num; i++) { 457 struct mlx5e_channel *c = priv->channels.c[i]; 458 459 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 460 struct mlx5e_txqsq *sq = &c->sq[tc]; 461 462 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ"); 463 if (err) 464 return err; 465 } 466 } 467 468 if (ptp_ch) { 469 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 470 struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq; 471 472 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ"); 473 if (err) 474 return err; 475 } 476 } 477 478 return devlink_fmsg_arr_pair_nest_end(fmsg); 479 } 480 481 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv, 482 struct mlx5e_err_ctx *err_ctx, 483 struct devlink_fmsg *fmsg) 484 { 485 return err_ctx->dump(priv, fmsg, err_ctx->ctx); 486 } 487 488 static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter, 489 struct devlink_fmsg *fmsg, void *context, 490 struct netlink_ext_ack *extack) 491 { 492 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 493 struct mlx5e_err_ctx *err_ctx = context; 494 495 return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : 496 mlx5e_tx_reporter_dump_all_sqs(priv, fmsg); 497 } 498 499 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 500 { 501 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 502 struct mlx5e_priv *priv = sq->priv; 503 struct mlx5e_err_ctx err_ctx = {}; 504 505 err_ctx.ctx = sq; 506 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 507 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 508 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn); 509 510 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 511 } 512 513 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 514 { 515 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 516 struct mlx5e_tx_timeout_ctx to_ctx = {}; 517 struct mlx5e_priv *priv = sq->priv; 518 struct mlx5e_err_ctx err_ctx = {}; 519 520 to_ctx.sq = sq; 521 err_ctx.ctx = &to_ctx; 522 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 523 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 524 snprintf(err_str, sizeof(err_str), 525 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", 526 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 527 jiffies_to_usecs(jiffies - sq->txq->trans_start)); 528 529 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 530 return to_ctx.status; 531 } 532 533 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 534 .name = "tx", 535 .recover = mlx5e_tx_reporter_recover, 536 .diagnose = mlx5e_tx_reporter_diagnose, 537 .dump = mlx5e_tx_reporter_dump, 538 }; 539 540 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 541 542 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 543 { 544 struct devlink_health_reporter *reporter; 545 546 reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops, 547 MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); 548 if (IS_ERR(reporter)) { 549 netdev_warn(priv->netdev, 550 "Failed to create tx reporter, err = %ld\n", 551 PTR_ERR(reporter)); 552 return; 553 } 554 priv->tx_reporter = reporter; 555 } 556 557 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 558 { 559 if (!priv->tx_reporter) 560 return; 561 562 devlink_port_health_reporter_destroy(priv->tx_reporter); 563 } 564