1 // SPDX-License-Identifier: GPL-2.0 2 // Copyright (c) 2019 Mellanox Technologies. 3 4 #include "health.h" 5 #include "params.h" 6 #include "txrx.h" 7 #include "devlink.h" 8 9 static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state) 10 { 11 int outlen = MLX5_ST_SZ_BYTES(query_rq_out); 12 void *out; 13 void *rqc; 14 int err; 15 16 out = kvzalloc(outlen, GFP_KERNEL); 17 if (!out) 18 return -ENOMEM; 19 20 err = mlx5_core_query_rq(dev, rqn, out); 21 if (err) 22 goto out; 23 24 rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); 25 *state = MLX5_GET(rqc, rqc, state); 26 27 out: 28 kvfree(out); 29 return err; 30 } 31 32 static int mlx5e_wait_for_icosq_flush(struct mlx5e_icosq *icosq) 33 { 34 unsigned long exp_time = jiffies + 35 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC); 36 37 while (time_before(jiffies, exp_time)) { 38 if (icosq->cc == icosq->pc) 39 return 0; 40 41 msleep(20); 42 } 43 44 netdev_err(icosq->channel->netdev, 45 "Wait for ICOSQ 0x%x flush timeout (cc = 0x%x, pc = 0x%x)\n", 46 icosq->sqn, icosq->cc, icosq->pc); 47 48 return -ETIMEDOUT; 49 } 50 51 static void mlx5e_reset_icosq_cc_pc(struct mlx5e_icosq *icosq) 52 { 53 WARN_ONCE(icosq->cc != icosq->pc, "ICOSQ 0x%x: cc (0x%x) != pc (0x%x)\n", 54 icosq->sqn, icosq->cc, icosq->pc); 55 icosq->cc = 0; 56 icosq->pc = 0; 57 } 58 59 static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx) 60 { 61 struct mlx5_core_dev *mdev; 62 struct mlx5e_icosq *icosq; 63 struct net_device *dev; 64 struct mlx5e_rq *rq; 65 u8 state; 66 int err; 67 68 icosq = ctx; 69 rq = &icosq->channel->rq; 70 mdev = icosq->channel->mdev; 71 dev = icosq->channel->netdev; 72 err = mlx5_core_query_sq_state(mdev, icosq->sqn, &state); 73 if (err) { 74 netdev_err(dev, "Failed to query ICOSQ 0x%x state. err = %d\n", 75 icosq->sqn, err); 76 goto out; 77 } 78 79 if (state != MLX5_SQC_STATE_ERR) 80 goto out; 81 82 mlx5e_deactivate_rq(rq); 83 err = mlx5e_wait_for_icosq_flush(icosq); 84 if (err) 85 goto out; 86 87 mlx5e_deactivate_icosq(icosq); 88 89 /* At this point, both the rq and the icosq are disabled */ 90 91 err = mlx5e_health_sq_to_ready(mdev, dev, icosq->sqn); 92 if (err) 93 goto out; 94 95 mlx5e_reset_icosq_cc_pc(icosq); 96 mlx5e_free_rx_in_progress_descs(rq); 97 clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state); 98 mlx5e_activate_icosq(icosq); 99 mlx5e_activate_rq(rq); 100 101 rq->stats->recover++; 102 return 0; 103 out: 104 clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state); 105 return err; 106 } 107 108 static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state) 109 { 110 struct net_device *dev = rq->netdev; 111 int err; 112 113 err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST); 114 if (err) { 115 netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn); 116 return err; 117 } 118 err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); 119 if (err) { 120 netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn); 121 return err; 122 } 123 124 return 0; 125 } 126 127 static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx) 128 { 129 struct mlx5e_rq *rq = ctx; 130 int err; 131 132 mlx5e_deactivate_rq(rq); 133 mlx5e_free_rx_descs(rq); 134 135 err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR); 136 if (err) 137 goto out; 138 139 clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); 140 mlx5e_activate_rq(rq); 141 rq->stats->recover++; 142 return 0; 143 out: 144 clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); 145 return err; 146 } 147 148 static int mlx5e_rx_reporter_timeout_recover(void *ctx) 149 { 150 struct mlx5_eq_comp *eq; 151 struct mlx5e_rq *rq; 152 int err; 153 154 rq = ctx; 155 eq = rq->cq.mcq.eq; 156 157 err = mlx5e_health_channel_eq_recover(rq->netdev, eq, rq->cq.ch_stats); 158 if (err && rq->icosq) 159 clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state); 160 161 return err; 162 } 163 164 static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 165 { 166 return err_ctx->recover(err_ctx->ctx); 167 } 168 169 static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter, 170 void *context, 171 struct netlink_ext_ack *extack) 172 { 173 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 174 struct mlx5e_err_ctx *err_ctx = context; 175 176 return err_ctx ? mlx5e_rx_reporter_recover_from_ctx(err_ctx) : 177 mlx5e_health_recover_channels(priv); 178 } 179 180 static int mlx5e_reporter_icosq_diagnose(struct mlx5e_icosq *icosq, u8 hw_state, 181 struct devlink_fmsg *fmsg) 182 { 183 int err; 184 185 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "ICOSQ"); 186 if (err) 187 return err; 188 189 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", icosq->sqn); 190 if (err) 191 return err; 192 193 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", hw_state); 194 if (err) 195 return err; 196 197 err = devlink_fmsg_u32_pair_put(fmsg, "cc", icosq->cc); 198 if (err) 199 return err; 200 201 err = devlink_fmsg_u32_pair_put(fmsg, "pc", icosq->pc); 202 if (err) 203 return err; 204 205 err = devlink_fmsg_u32_pair_put(fmsg, "WQE size", 206 mlx5_wq_cyc_get_size(&icosq->wq)); 207 if (err) 208 return err; 209 210 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ"); 211 if (err) 212 return err; 213 214 err = devlink_fmsg_u32_pair_put(fmsg, "cqn", icosq->cq.mcq.cqn); 215 if (err) 216 return err; 217 218 err = devlink_fmsg_u32_pair_put(fmsg, "cc", icosq->cq.wq.cc); 219 if (err) 220 return err; 221 222 err = devlink_fmsg_u32_pair_put(fmsg, "size", mlx5_cqwq_get_size(&icosq->cq.wq)); 223 if (err) 224 return err; 225 226 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 227 if (err) 228 return err; 229 230 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 231 } 232 233 static int 234 mlx5e_rx_reporter_build_diagnose_output_rq_common(struct mlx5e_rq *rq, 235 struct devlink_fmsg *fmsg) 236 { 237 u16 wqe_counter; 238 int wqes_sz; 239 u8 hw_state; 240 u16 wq_head; 241 int err; 242 243 err = mlx5e_query_rq_state(rq->mdev, rq->rqn, &hw_state); 244 if (err) 245 return err; 246 247 wqes_sz = mlx5e_rqwq_get_cur_sz(rq); 248 wq_head = mlx5e_rqwq_get_head(rq); 249 wqe_counter = mlx5e_rqwq_get_wqe_counter(rq); 250 251 err = devlink_fmsg_u32_pair_put(fmsg, "rqn", rq->rqn); 252 if (err) 253 return err; 254 255 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", hw_state); 256 if (err) 257 return err; 258 259 err = devlink_fmsg_u8_pair_put(fmsg, "SW state", rq->state); 260 if (err) 261 return err; 262 263 err = devlink_fmsg_u32_pair_put(fmsg, "WQE counter", wqe_counter); 264 if (err) 265 return err; 266 267 err = devlink_fmsg_u32_pair_put(fmsg, "posted WQEs", wqes_sz); 268 if (err) 269 return err; 270 271 err = devlink_fmsg_u32_pair_put(fmsg, "cc", wq_head); 272 if (err) 273 return err; 274 275 err = mlx5e_health_cq_diag_fmsg(&rq->cq, fmsg); 276 if (err) 277 return err; 278 279 err = mlx5e_health_eq_diag_fmsg(rq->cq.mcq.eq, fmsg); 280 if (err) 281 return err; 282 283 if (rq->icosq) { 284 struct mlx5e_icosq *icosq = rq->icosq; 285 u8 icosq_hw_state; 286 287 err = mlx5_core_query_sq_state(rq->mdev, icosq->sqn, &icosq_hw_state); 288 if (err) 289 return err; 290 291 err = mlx5e_reporter_icosq_diagnose(icosq, icosq_hw_state, fmsg); 292 if (err) 293 return err; 294 } 295 296 return 0; 297 } 298 299 static int mlx5e_rx_reporter_build_diagnose_output(struct mlx5e_rq *rq, 300 struct devlink_fmsg *fmsg) 301 { 302 int err; 303 304 err = devlink_fmsg_obj_nest_start(fmsg); 305 if (err) 306 return err; 307 308 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", rq->ix); 309 if (err) 310 return err; 311 312 err = mlx5e_rx_reporter_build_diagnose_output_rq_common(rq, fmsg); 313 if (err) 314 return err; 315 316 return devlink_fmsg_obj_nest_end(fmsg); 317 } 318 319 static int mlx5e_rx_reporter_diagnose_generic_rq(struct mlx5e_rq *rq, 320 struct devlink_fmsg *fmsg) 321 { 322 struct mlx5e_priv *priv = rq->priv; 323 struct mlx5e_params *params; 324 u32 rq_stride, rq_sz; 325 int err; 326 327 params = &priv->channels.params; 328 rq_sz = mlx5e_rqwq_get_size(rq); 329 rq_stride = BIT(mlx5e_mpwqe_get_log_stride_size(priv->mdev, params, NULL)); 330 331 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RQ"); 332 if (err) 333 return err; 334 335 err = devlink_fmsg_u8_pair_put(fmsg, "type", params->rq_wq_type); 336 if (err) 337 return err; 338 339 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", rq_stride); 340 if (err) 341 return err; 342 343 err = devlink_fmsg_u32_pair_put(fmsg, "size", rq_sz); 344 if (err) 345 return err; 346 347 err = mlx5e_health_cq_common_diag_fmsg(&rq->cq, fmsg); 348 if (err) 349 return err; 350 351 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 352 } 353 354 static int 355 mlx5e_rx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter, 356 struct devlink_fmsg *fmsg) 357 { 358 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 359 struct mlx5e_rq *generic_rq = &priv->channels.c[0]->rq; 360 int err; 361 362 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common config"); 363 if (err) 364 return err; 365 366 err = mlx5e_rx_reporter_diagnose_generic_rq(generic_rq, fmsg); 367 if (err) 368 return err; 369 370 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 371 } 372 373 static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter, 374 struct devlink_fmsg *fmsg, 375 struct netlink_ext_ack *extack) 376 { 377 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 378 int i, err = 0; 379 380 mutex_lock(&priv->state_lock); 381 382 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 383 goto unlock; 384 385 err = mlx5e_rx_reporter_diagnose_common_config(reporter, fmsg); 386 if (err) 387 goto unlock; 388 389 err = devlink_fmsg_arr_pair_nest_start(fmsg, "RQs"); 390 if (err) 391 goto unlock; 392 393 for (i = 0; i < priv->channels.num; i++) { 394 struct mlx5e_rq *rq = &priv->channels.c[i]->rq; 395 396 err = mlx5e_rx_reporter_build_diagnose_output(rq, fmsg); 397 if (err) 398 goto unlock; 399 } 400 err = devlink_fmsg_arr_pair_nest_end(fmsg); 401 if (err) 402 goto unlock; 403 unlock: 404 mutex_unlock(&priv->state_lock); 405 return err; 406 } 407 408 static int mlx5e_rx_reporter_dump_icosq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 409 void *ctx) 410 { 411 struct mlx5e_txqsq *icosq = ctx; 412 struct mlx5_rsc_key key = {}; 413 int err; 414 415 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 416 return 0; 417 418 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 419 if (err) 420 return err; 421 422 key.size = PAGE_SIZE; 423 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 424 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 425 if (err) 426 return err; 427 428 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 429 if (err) 430 return err; 431 432 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "ICOSQ"); 433 if (err) 434 return err; 435 436 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 437 if (err) 438 return err; 439 440 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 441 key.index1 = icosq->sqn; 442 key.num_of_obj1 = 1; 443 444 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 445 if (err) 446 return err; 447 448 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 449 if (err) 450 return err; 451 452 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); 453 if (err) 454 return err; 455 456 key.rsc = MLX5_SGMT_TYPE_SND_BUFF; 457 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 458 459 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 460 if (err) 461 return err; 462 463 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 464 if (err) 465 return err; 466 467 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 468 } 469 470 static int mlx5e_rx_reporter_dump_rq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 471 void *ctx) 472 { 473 struct mlx5_rsc_key key = {}; 474 struct mlx5e_rq *rq = ctx; 475 int err; 476 477 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 478 return 0; 479 480 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RX Slice"); 481 if (err) 482 return err; 483 484 key.size = PAGE_SIZE; 485 key.rsc = MLX5_SGMT_TYPE_RX_SLICE_ALL; 486 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 487 if (err) 488 return err; 489 490 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 491 if (err) 492 return err; 493 494 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RQ"); 495 if (err) 496 return err; 497 498 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 499 if (err) 500 return err; 501 502 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 503 key.index1 = rq->rqn; 504 key.num_of_obj1 = 1; 505 506 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 507 if (err) 508 return err; 509 510 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 511 if (err) 512 return err; 513 514 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "receive_buff"); 515 if (err) 516 return err; 517 518 key.rsc = MLX5_SGMT_TYPE_RCV_BUFF; 519 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 520 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 521 if (err) 522 return err; 523 524 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 525 if (err) 526 return err; 527 528 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 529 } 530 531 static int mlx5e_rx_reporter_dump_all_rqs(struct mlx5e_priv *priv, 532 struct devlink_fmsg *fmsg) 533 { 534 struct mlx5_rsc_key key = {}; 535 int i, err; 536 537 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 538 return 0; 539 540 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RX Slice"); 541 if (err) 542 return err; 543 544 key.size = PAGE_SIZE; 545 key.rsc = MLX5_SGMT_TYPE_RX_SLICE_ALL; 546 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 547 if (err) 548 return err; 549 550 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 551 if (err) 552 return err; 553 554 err = devlink_fmsg_arr_pair_nest_start(fmsg, "RQs"); 555 if (err) 556 return err; 557 558 for (i = 0; i < priv->channels.num; i++) { 559 struct mlx5e_rq *rq = &priv->channels.c[i]->rq; 560 561 err = mlx5e_health_queue_dump(priv, fmsg, rq->rqn, "RQ"); 562 if (err) 563 return err; 564 } 565 566 return devlink_fmsg_arr_pair_nest_end(fmsg); 567 } 568 569 static int mlx5e_rx_reporter_dump_from_ctx(struct mlx5e_priv *priv, 570 struct mlx5e_err_ctx *err_ctx, 571 struct devlink_fmsg *fmsg) 572 { 573 return err_ctx->dump(priv, fmsg, err_ctx->ctx); 574 } 575 576 static int mlx5e_rx_reporter_dump(struct devlink_health_reporter *reporter, 577 struct devlink_fmsg *fmsg, void *context, 578 struct netlink_ext_ack *extack) 579 { 580 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 581 struct mlx5e_err_ctx *err_ctx = context; 582 583 return err_ctx ? mlx5e_rx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : 584 mlx5e_rx_reporter_dump_all_rqs(priv, fmsg); 585 } 586 587 void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) 588 { 589 char icosq_str[MLX5E_REPORTER_PER_Q_MAX_LEN] = {}; 590 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 591 struct mlx5e_icosq *icosq = rq->icosq; 592 struct mlx5e_priv *priv = rq->priv; 593 struct mlx5e_err_ctx err_ctx = {}; 594 595 err_ctx.ctx = rq; 596 err_ctx.recover = mlx5e_rx_reporter_timeout_recover; 597 err_ctx.dump = mlx5e_rx_reporter_dump_rq; 598 599 if (icosq) 600 snprintf(icosq_str, sizeof(icosq_str), "ICOSQ: 0x%x, ", icosq->sqn); 601 snprintf(err_str, sizeof(err_str), 602 "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", 603 rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn); 604 605 mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); 606 } 607 608 void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq) 609 { 610 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 611 struct mlx5e_priv *priv = rq->priv; 612 struct mlx5e_err_ctx err_ctx = {}; 613 614 err_ctx.ctx = rq; 615 err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover; 616 err_ctx.dump = mlx5e_rx_reporter_dump_rq; 617 snprintf(err_str, sizeof(err_str), "ERR CQE on RQ: 0x%x", rq->rqn); 618 619 mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); 620 } 621 622 void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq) 623 { 624 struct mlx5e_priv *priv = icosq->channel->priv; 625 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 626 struct mlx5e_err_ctx err_ctx = {}; 627 628 err_ctx.ctx = icosq; 629 err_ctx.recover = mlx5e_rx_reporter_err_icosq_cqe_recover; 630 err_ctx.dump = mlx5e_rx_reporter_dump_icosq; 631 snprintf(err_str, sizeof(err_str), "ERR CQE on ICOSQ: 0x%x", icosq->sqn); 632 633 mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); 634 } 635 636 static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { 637 .name = "rx", 638 .recover = mlx5e_rx_reporter_recover, 639 .diagnose = mlx5e_rx_reporter_diagnose, 640 .dump = mlx5e_rx_reporter_dump, 641 }; 642 643 #define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 644 645 void mlx5e_reporter_rx_create(struct mlx5e_priv *priv) 646 { 647 struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); 648 struct devlink_health_reporter *reporter; 649 650 reporter = devlink_port_health_reporter_create(dl_port, &mlx5_rx_reporter_ops, 651 MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv); 652 if (IS_ERR(reporter)) { 653 netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", 654 PTR_ERR(reporter)); 655 return; 656 } 657 priv->rx_reporter = reporter; 658 } 659 660 void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv) 661 { 662 if (!priv->rx_reporter) 663 return; 664 665 devlink_port_health_reporter_destroy(priv->rx_reporter); 666 priv->rx_reporter = NULL; 667 } 668