1 // SPDX-License-Identifier: GPL-2.0 2 // Copyright (c) 2019 Mellanox Technologies. 3 4 #include "health.h" 5 #include "params.h" 6 #include "txrx.h" 7 #include "devlink.h" 8 9 static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state) 10 { 11 int outlen = MLX5_ST_SZ_BYTES(query_rq_out); 12 void *out; 13 void *rqc; 14 int err; 15 16 out = kvzalloc(outlen, GFP_KERNEL); 17 if (!out) 18 return -ENOMEM; 19 20 err = mlx5_core_query_rq(dev, rqn, out); 21 if (err) 22 goto out; 23 24 rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); 25 *state = MLX5_GET(rqc, rqc, state); 26 27 out: 28 kvfree(out); 29 return err; 30 } 31 32 static int mlx5e_wait_for_icosq_flush(struct mlx5e_icosq *icosq) 33 { 34 unsigned long exp_time = jiffies + 35 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC); 36 37 while (time_before(jiffies, exp_time)) { 38 if (icosq->cc == icosq->pc) 39 return 0; 40 41 msleep(20); 42 } 43 44 netdev_err(icosq->channel->netdev, 45 "Wait for ICOSQ 0x%x flush timeout (cc = 0x%x, pc = 0x%x)\n", 46 icosq->sqn, icosq->cc, icosq->pc); 47 48 return -ETIMEDOUT; 49 } 50 51 static void mlx5e_reset_icosq_cc_pc(struct mlx5e_icosq *icosq) 52 { 53 WARN_ONCE(icosq->cc != icosq->pc, "ICOSQ 0x%x: cc (0x%x) != pc (0x%x)\n", 54 icosq->sqn, icosq->cc, icosq->pc); 55 icosq->cc = 0; 56 icosq->pc = 0; 57 } 58 59 static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx) 60 { 61 struct mlx5_core_dev *mdev; 62 struct mlx5e_icosq *icosq; 63 struct net_device *dev; 64 struct mlx5e_rq *rq; 65 u8 state; 66 int err; 67 68 icosq = ctx; 69 rq = &icosq->channel->rq; 70 mdev = icosq->channel->mdev; 71 dev = icosq->channel->netdev; 72 err = mlx5_core_query_sq_state(mdev, icosq->sqn, &state); 73 if (err) { 74 netdev_err(dev, "Failed to query ICOSQ 0x%x state. err = %d\n", 75 icosq->sqn, err); 76 goto out; 77 } 78 79 if (state != MLX5_SQC_STATE_ERR) 80 goto out; 81 82 mlx5e_deactivate_rq(rq); 83 err = mlx5e_wait_for_icosq_flush(icosq); 84 if (err) 85 goto out; 86 87 mlx5e_deactivate_icosq(icosq); 88 89 /* At this point, both the rq and the icosq are disabled */ 90 91 err = mlx5e_health_sq_to_ready(mdev, dev, icosq->sqn); 92 if (err) 93 goto out; 94 95 mlx5e_reset_icosq_cc_pc(icosq); 96 mlx5e_free_rx_in_progress_descs(rq); 97 clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state); 98 mlx5e_activate_icosq(icosq); 99 mlx5e_activate_rq(rq); 100 101 rq->stats->recover++; 102 return 0; 103 out: 104 clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state); 105 return err; 106 } 107 108 static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state) 109 { 110 struct net_device *dev = rq->netdev; 111 int err; 112 113 err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST); 114 if (err) { 115 netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn); 116 return err; 117 } 118 err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); 119 if (err) { 120 netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn); 121 return err; 122 } 123 124 return 0; 125 } 126 127 static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx) 128 { 129 struct mlx5e_rq *rq = ctx; 130 int err; 131 132 mlx5e_deactivate_rq(rq); 133 mlx5e_free_rx_descs(rq); 134 135 err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR); 136 if (err) 137 goto out; 138 139 clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); 140 mlx5e_activate_rq(rq); 141 rq->stats->recover++; 142 return 0; 143 out: 144 clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); 145 return err; 146 } 147 148 static int mlx5e_rx_reporter_timeout_recover(void *ctx) 149 { 150 struct mlx5_eq_comp *eq; 151 struct mlx5e_rq *rq; 152 int err; 153 154 rq = ctx; 155 eq = rq->cq.mcq.eq; 156 157 err = mlx5e_health_channel_eq_recover(rq->netdev, eq, rq->cq.ch_stats); 158 if (err && rq->icosq) 159 clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state); 160 161 return err; 162 } 163 164 static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 165 { 166 return err_ctx->recover(err_ctx->ctx); 167 } 168 169 static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter, 170 void *context, 171 struct netlink_ext_ack *extack) 172 { 173 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 174 struct mlx5e_err_ctx *err_ctx = context; 175 176 return err_ctx ? mlx5e_rx_reporter_recover_from_ctx(err_ctx) : 177 mlx5e_health_recover_channels(priv); 178 } 179 180 static int mlx5e_reporter_icosq_diagnose(struct mlx5e_icosq *icosq, u8 hw_state, 181 struct devlink_fmsg *fmsg) 182 { 183 int err; 184 185 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "ICOSQ"); 186 if (err) 187 return err; 188 189 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", icosq->sqn); 190 if (err) 191 return err; 192 193 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", hw_state); 194 if (err) 195 return err; 196 197 err = devlink_fmsg_u32_pair_put(fmsg, "cc", icosq->cc); 198 if (err) 199 return err; 200 201 err = devlink_fmsg_u32_pair_put(fmsg, "pc", icosq->pc); 202 if (err) 203 return err; 204 205 err = devlink_fmsg_u32_pair_put(fmsg, "WQE size", 206 mlx5_wq_cyc_get_size(&icosq->wq)); 207 if (err) 208 return err; 209 210 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ"); 211 if (err) 212 return err; 213 214 err = devlink_fmsg_u32_pair_put(fmsg, "cqn", icosq->cq.mcq.cqn); 215 if (err) 216 return err; 217 218 err = devlink_fmsg_u32_pair_put(fmsg, "cc", icosq->cq.wq.cc); 219 if (err) 220 return err; 221 222 err = devlink_fmsg_u32_pair_put(fmsg, "size", mlx5_cqwq_get_size(&icosq->cq.wq)); 223 if (err) 224 return err; 225 226 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 227 if (err) 228 return err; 229 230 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 231 } 232 233 static int mlx5e_rx_reporter_build_diagnose_output(struct mlx5e_rq *rq, 234 struct devlink_fmsg *fmsg) 235 { 236 u16 wqe_counter; 237 int wqes_sz; 238 u8 hw_state; 239 u16 wq_head; 240 int err; 241 242 err = mlx5e_query_rq_state(rq->mdev, rq->rqn, &hw_state); 243 if (err) 244 return err; 245 246 wqes_sz = mlx5e_rqwq_get_cur_sz(rq); 247 wq_head = mlx5e_rqwq_get_head(rq); 248 wqe_counter = mlx5e_rqwq_get_wqe_counter(rq); 249 250 err = devlink_fmsg_obj_nest_start(fmsg); 251 if (err) 252 return err; 253 254 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", rq->ix); 255 if (err) 256 return err; 257 258 err = devlink_fmsg_u32_pair_put(fmsg, "rqn", rq->rqn); 259 if (err) 260 return err; 261 262 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", hw_state); 263 if (err) 264 return err; 265 266 err = devlink_fmsg_u8_pair_put(fmsg, "SW state", rq->state); 267 if (err) 268 return err; 269 270 err = devlink_fmsg_u32_pair_put(fmsg, "WQE counter", wqe_counter); 271 if (err) 272 return err; 273 274 err = devlink_fmsg_u32_pair_put(fmsg, "posted WQEs", wqes_sz); 275 if (err) 276 return err; 277 278 err = devlink_fmsg_u32_pair_put(fmsg, "cc", wq_head); 279 if (err) 280 return err; 281 282 err = mlx5e_health_cq_diag_fmsg(&rq->cq, fmsg); 283 if (err) 284 return err; 285 286 err = mlx5e_health_eq_diag_fmsg(rq->cq.mcq.eq, fmsg); 287 if (err) 288 return err; 289 290 if (rq->icosq) { 291 struct mlx5e_icosq *icosq = rq->icosq; 292 u8 icosq_hw_state; 293 294 err = mlx5_core_query_sq_state(rq->mdev, icosq->sqn, &icosq_hw_state); 295 if (err) 296 return err; 297 298 err = mlx5e_reporter_icosq_diagnose(icosq, icosq_hw_state, fmsg); 299 if (err) 300 return err; 301 } 302 303 err = devlink_fmsg_obj_nest_end(fmsg); 304 if (err) 305 return err; 306 307 return 0; 308 } 309 310 static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter, 311 struct devlink_fmsg *fmsg, 312 struct netlink_ext_ack *extack) 313 { 314 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 315 struct mlx5e_params *params = &priv->channels.params; 316 struct mlx5e_rq *generic_rq; 317 u32 rq_stride, rq_sz; 318 int i, err = 0; 319 320 mutex_lock(&priv->state_lock); 321 322 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 323 goto unlock; 324 325 generic_rq = &priv->channels.c[0]->rq; 326 rq_sz = mlx5e_rqwq_get_size(generic_rq); 327 rq_stride = BIT(mlx5e_mpwqe_get_log_stride_size(priv->mdev, params, NULL)); 328 329 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common config"); 330 if (err) 331 goto unlock; 332 333 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RQ"); 334 if (err) 335 goto unlock; 336 337 err = devlink_fmsg_u8_pair_put(fmsg, "type", params->rq_wq_type); 338 if (err) 339 goto unlock; 340 341 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", rq_stride); 342 if (err) 343 goto unlock; 344 345 err = devlink_fmsg_u32_pair_put(fmsg, "size", rq_sz); 346 if (err) 347 goto unlock; 348 349 err = mlx5e_health_cq_common_diag_fmsg(&generic_rq->cq, fmsg); 350 if (err) 351 goto unlock; 352 353 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 354 if (err) 355 goto unlock; 356 357 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 358 if (err) 359 goto unlock; 360 361 err = devlink_fmsg_arr_pair_nest_start(fmsg, "RQs"); 362 if (err) 363 goto unlock; 364 365 for (i = 0; i < priv->channels.num; i++) { 366 struct mlx5e_rq *rq = &priv->channels.c[i]->rq; 367 368 err = mlx5e_rx_reporter_build_diagnose_output(rq, fmsg); 369 if (err) 370 goto unlock; 371 } 372 err = devlink_fmsg_arr_pair_nest_end(fmsg); 373 if (err) 374 goto unlock; 375 unlock: 376 mutex_unlock(&priv->state_lock); 377 return err; 378 } 379 380 static int mlx5e_rx_reporter_dump_icosq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 381 void *ctx) 382 { 383 struct mlx5e_txqsq *icosq = ctx; 384 struct mlx5_rsc_key key = {}; 385 int err; 386 387 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 388 return 0; 389 390 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 391 if (err) 392 return err; 393 394 key.size = PAGE_SIZE; 395 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 396 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 397 if (err) 398 return err; 399 400 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 401 if (err) 402 return err; 403 404 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "ICOSQ"); 405 if (err) 406 return err; 407 408 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 409 if (err) 410 return err; 411 412 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 413 key.index1 = icosq->sqn; 414 key.num_of_obj1 = 1; 415 416 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 417 if (err) 418 return err; 419 420 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 421 if (err) 422 return err; 423 424 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); 425 if (err) 426 return err; 427 428 key.rsc = MLX5_SGMT_TYPE_SND_BUFF; 429 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 430 431 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 432 if (err) 433 return err; 434 435 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 436 if (err) 437 return err; 438 439 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 440 } 441 442 static int mlx5e_rx_reporter_dump_rq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 443 void *ctx) 444 { 445 struct mlx5_rsc_key key = {}; 446 struct mlx5e_rq *rq = ctx; 447 int err; 448 449 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 450 return 0; 451 452 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RX Slice"); 453 if (err) 454 return err; 455 456 key.size = PAGE_SIZE; 457 key.rsc = MLX5_SGMT_TYPE_RX_SLICE_ALL; 458 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 459 if (err) 460 return err; 461 462 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 463 if (err) 464 return err; 465 466 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RQ"); 467 if (err) 468 return err; 469 470 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 471 if (err) 472 return err; 473 474 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 475 key.index1 = rq->rqn; 476 key.num_of_obj1 = 1; 477 478 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 479 if (err) 480 return err; 481 482 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 483 if (err) 484 return err; 485 486 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "receive_buff"); 487 if (err) 488 return err; 489 490 key.rsc = MLX5_SGMT_TYPE_RCV_BUFF; 491 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 492 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 493 if (err) 494 return err; 495 496 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 497 if (err) 498 return err; 499 500 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 501 } 502 503 static int mlx5e_rx_reporter_dump_all_rqs(struct mlx5e_priv *priv, 504 struct devlink_fmsg *fmsg) 505 { 506 struct mlx5_rsc_key key = {}; 507 int i, err; 508 509 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 510 return 0; 511 512 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RX Slice"); 513 if (err) 514 return err; 515 516 key.size = PAGE_SIZE; 517 key.rsc = MLX5_SGMT_TYPE_RX_SLICE_ALL; 518 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 519 if (err) 520 return err; 521 522 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 523 if (err) 524 return err; 525 526 err = devlink_fmsg_arr_pair_nest_start(fmsg, "RQs"); 527 if (err) 528 return err; 529 530 for (i = 0; i < priv->channels.num; i++) { 531 struct mlx5e_rq *rq = &priv->channels.c[i]->rq; 532 533 err = mlx5e_health_queue_dump(priv, fmsg, rq->rqn, "RQ"); 534 if (err) 535 return err; 536 } 537 538 return devlink_fmsg_arr_pair_nest_end(fmsg); 539 } 540 541 static int mlx5e_rx_reporter_dump_from_ctx(struct mlx5e_priv *priv, 542 struct mlx5e_err_ctx *err_ctx, 543 struct devlink_fmsg *fmsg) 544 { 545 return err_ctx->dump(priv, fmsg, err_ctx->ctx); 546 } 547 548 static int mlx5e_rx_reporter_dump(struct devlink_health_reporter *reporter, 549 struct devlink_fmsg *fmsg, void *context, 550 struct netlink_ext_ack *extack) 551 { 552 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 553 struct mlx5e_err_ctx *err_ctx = context; 554 555 return err_ctx ? mlx5e_rx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : 556 mlx5e_rx_reporter_dump_all_rqs(priv, fmsg); 557 } 558 559 void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) 560 { 561 char icosq_str[MLX5E_REPORTER_PER_Q_MAX_LEN] = {}; 562 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 563 struct mlx5e_icosq *icosq = rq->icosq; 564 struct mlx5e_priv *priv = rq->priv; 565 struct mlx5e_err_ctx err_ctx = {}; 566 567 err_ctx.ctx = rq; 568 err_ctx.recover = mlx5e_rx_reporter_timeout_recover; 569 err_ctx.dump = mlx5e_rx_reporter_dump_rq; 570 571 if (icosq) 572 snprintf(icosq_str, sizeof(icosq_str), "ICOSQ: 0x%x, ", icosq->sqn); 573 snprintf(err_str, sizeof(err_str), 574 "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", 575 rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn); 576 577 mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); 578 } 579 580 void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq) 581 { 582 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 583 struct mlx5e_priv *priv = rq->priv; 584 struct mlx5e_err_ctx err_ctx = {}; 585 586 err_ctx.ctx = rq; 587 err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover; 588 err_ctx.dump = mlx5e_rx_reporter_dump_rq; 589 snprintf(err_str, sizeof(err_str), "ERR CQE on RQ: 0x%x", rq->rqn); 590 591 mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); 592 } 593 594 void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq) 595 { 596 struct mlx5e_priv *priv = icosq->channel->priv; 597 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 598 struct mlx5e_err_ctx err_ctx = {}; 599 600 err_ctx.ctx = icosq; 601 err_ctx.recover = mlx5e_rx_reporter_err_icosq_cqe_recover; 602 err_ctx.dump = mlx5e_rx_reporter_dump_icosq; 603 snprintf(err_str, sizeof(err_str), "ERR CQE on ICOSQ: 0x%x", icosq->sqn); 604 605 mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); 606 } 607 608 static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { 609 .name = "rx", 610 .recover = mlx5e_rx_reporter_recover, 611 .diagnose = mlx5e_rx_reporter_diagnose, 612 .dump = mlx5e_rx_reporter_dump, 613 }; 614 615 #define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 616 617 void mlx5e_reporter_rx_create(struct mlx5e_priv *priv) 618 { 619 struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); 620 struct devlink_health_reporter *reporter; 621 622 reporter = devlink_port_health_reporter_create(dl_port, &mlx5_rx_reporter_ops, 623 MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv); 624 if (IS_ERR(reporter)) { 625 netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", 626 PTR_ERR(reporter)); 627 return; 628 } 629 priv->rx_reporter = reporter; 630 } 631 632 void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv) 633 { 634 if (!priv->rx_reporter) 635 return; 636 637 devlink_port_health_reporter_destroy(priv->rx_reporter); 638 priv->rx_reporter = NULL; 639 } 640