/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2019 Mellanox Technologies. */

#include "health.h"

/* Poll until the SQ is fully drained, i.e. the consumer counter (cc) has
 * caught up with the producer counter (pc), sleeping 20ms between polls.
 *
 * Returns 0 when the SQ drained, or -ETIMEDOUT after
 * MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC without cc reaching pc.
 */
static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
{
	unsigned long exp_time = jiffies +
				 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);

	while (time_before(jiffies, exp_time)) {
		if (sq->cc == sq->pc)
			return 0;

		msleep(20);
	}

	netdev_err(sq->channel->netdev,
		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
		   sq->sqn, sq->cc, sq->pc);

	return -ETIMEDOUT;
}

/* Reset the software producer/consumer counters of a drained SQ back to
 * zero so they match the freshly reset hardware queue. Warns (once) if the
 * SQ was not actually drained (cc != pc) when called.
 */
static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
{
	WARN_ONCE(sq->cc != sq->pc,
		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
		  sq->sqn, sq->cc, sq->pc);
	sq->cc = 0;
	sq->dma_fifo_cc = 0;
	sq->pc = 0;
}

/* Recovery callback for an SQ that reported an error CQE (ctx is the
 * struct mlx5e_txqsq *).
 *
 * Sequence: verify the SQ is still marked RECOVERING and is in the
 * hardware ERR state, stop the stack from queueing new packets, wait for
 * NAPI to drain all pending WQEs, move the SQ back to ready state
 * (mlx5e_health_sq_to_ready — presumably an ERR->RST->RDY transition;
 * defined elsewhere), zero the sw counters, and reactivate the txq.
 *
 * The RECOVERING bit is cleared on every exit path so a later error CQE
 * can trigger a new recovery attempt. Returns 0 on success (or nothing to
 * do) and a negative errno on failure.
 */
static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
{
	struct mlx5_core_dev *mdev;
	struct net_device *dev;
	struct mlx5e_txqsq *sq;
	u8 state;
	int err;

	sq = ctx;
	mdev = sq->channel->mdev;
	dev = sq->channel->netdev;

	/* Recovery may race with deactivation; only proceed if the SQ is
	 * still flagged as recovering.
	 */
	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
		return 0;

	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
	if (err) {
		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
			   sq->sqn, err);
		goto out;
	}

	/* Hardware no longer reports the error state - nothing to recover. */
	if (state != MLX5_SQC_STATE_ERR)
		goto out;

	mlx5e_tx_disable_queue(sq->txq);

	err = mlx5e_wait_for_sq_flush(sq);
	if (err)
		goto out;

	/* At this point, no new packets will arrive from the stack as TXQ is
	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
	 * pending WQEs. It is now safe to reset the SQ.
	 */

	err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn);
	if (err)
		goto out;

	mlx5e_reset_txqsq_cc_pc(sq);
	sq->stats->recover++;
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	mlx5e_activate_txqsq(sq);

	return 0;
out:
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	return err;
}

/* Context passed to mlx5e_tx_reporter_timeout_recover().
 * @sq:     the timed-out SQ.
 * @status: recovery outcome, filled in by the recover callback:
 *          0  - only this SQ was recovered (missed-interrupt case),
 *          1  - all channels were reopened,
 *          <0 - errno, recovery failed.
 */
struct mlx5e_tx_timeout_ctx {
	struct mlx5e_txqsq *sq;
	signed int status;
};

/* Recovery callback for a TX timeout (ctx is a struct mlx5e_tx_timeout_ctx *).
 *
 * First tries the cheap path: recover via the SQ's completion EQ
 * (mlx5e_health_channel_eq_recover — presumably handles a lost/missed
 * interrupt; defined elsewhere). If that fails, falls back to reopening
 * all channels. The outcome is reported through to_ctx->status (see
 * struct mlx5e_tx_timeout_ctx). On total failure the SQ's ENABLED bit is
 * cleared and an error is logged.
 */
static int mlx5e_tx_reporter_timeout_recover(void *ctx)
{
	struct mlx5e_tx_timeout_ctx *to_ctx;
	struct mlx5e_priv *priv;
	struct mlx5_eq_comp *eq;
	struct mlx5e_txqsq *sq;
	int err;

	to_ctx = ctx;
	sq = to_ctx->sq;
	eq = sq->cq.mcq.eq;
	priv = sq->channel->priv;
	err = mlx5e_health_channel_eq_recover(eq, sq->channel);
	if (!err) {
		to_ctx->status = 0; /* this sq recovered */
		return err;
	}

	err = mlx5e_safe_reopen_channels(priv);
	if (!err) {
		to_ctx->status = 1; /* all channels recovered */
		return err;
	}

	to_ctx->status = err;
	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
	netdev_err(priv->netdev,
		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
		   err);

	return err;
}

/* Dispatch to the recover handler stored in the error context.
 *
 * state lock cannot be grabbed within this function.
 * It can cause a dead lock or a read-after-free.
 */
static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
{
	return err_ctx->recover(err_ctx->ctx);
}

/* devlink health .recover op. With a context (driver-initiated report),
 * run that context's specific recover handler; without one (user-initiated
 * `devlink health recover`), recover all channels.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) :
			 mlx5e_health_recover_channels(priv);
}

/* Emit one SQ's diagnostic object into the devlink fmsg: channel/tc/txq
 * indices, SQN, hardware SQC state, software stop flag, cc/pc counters,
 * plus the associated CQ and EQ diagnostics. Returns 0 or the first
 * fmsg/query error.
 */
static int
mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
					struct mlx5e_txqsq *sq, int tc)
{
	struct mlx5e_priv *priv = sq->channel->priv;
	bool stopped = netif_xmit_stopped(sq->txq);
	u8 state;
	int err;

	err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_start(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
	if (err)
		return err;

	err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
	if (err)
		return err;

	err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
	if (err)
		return err;

	err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
	if (err)
		return err;

	err = mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_end(fmsg);
	if (err)
		return err;

	return 0;
}

/* devlink health .diagnose op. Emits a "Common Config" section (SQ stride
 * and size taken from txq2sq[0] as representative, plus common CQ config)
 * followed by an "SQs" array with one entry per (channel, tc) SQ.
 *
 * Holds priv->state_lock to keep the channels table stable; does nothing
 * if the netdev is not in the OPENED state. Returns 0 or the first error,
 * always releasing the lock.
 */
static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
				      struct devlink_fmsg *fmsg,
				      struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
	u32 sq_stride, sq_sz;

	int i, tc, err = 0;

	mutex_lock(&priv->state_lock);

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		goto unlock;

	sq_sz = mlx5_wq_cyc_get_size(&generic_sq->wq);
	sq_stride = MLX5_SEND_WQE_BB;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
	if (err)
		goto unlock;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	if (err)
		goto unlock;

	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
	if (err)
		goto unlock;

	err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
	if (err)
		goto unlock;

	err = mlx5e_health_cq_common_diag_fmsg(&generic_sq->cq, fmsg);
	if (err)
		goto unlock;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		goto unlock;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		goto unlock;

	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
	if (err)
		goto unlock;

	for (i = 0; i < priv->channels.num; i++) {
		struct mlx5e_channel *c = priv->channels.c[i];

		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
			struct mlx5e_txqsq *sq = &c->sq[tc];

			err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
			if (err)
				goto unlock;
		}
	}
	err = devlink_fmsg_arr_pair_nest_end(fmsg);
	if (err)
		goto unlock;

unlock:
	mutex_unlock(&priv->state_lock);
	return err;
}

/* Dump handler for a single SQ (ctx is the struct mlx5e_txqsq *): dumps
 * the full SX slice, the SQ's QPC and its send buffer via the mlx5
 * resource-dump interface into nested fmsg objects. No-op (returns 0)
 * when the netdev is not OPENED.
 */
static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
				     void *ctx)
{
	struct mlx5_rsc_key key = {};
	struct mlx5e_txqsq *sq = ctx;
	int err;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	if (err)
		return err;

	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
	if (err)
		return err;

	/* key retains size/PAGE_SIZE from above; narrow it to this SQ. */
	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
	key.index1 = sq->sqn;
	key.num_of_obj1 = 1;

	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
	if (err)
		return err;

	key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
	key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	/* Close the outer "SQ" object. */
	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
}

/* Dump the SX slice once, then every (channel, tc) SQ as an "SQs" fmsg
 * array. Used when a dump is requested without a specific error context.
 * No-op (returns 0) when the netdev is not OPENED.
 */
static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
					  struct devlink_fmsg *fmsg)
{
	struct mlx5_rsc_key key = {};
	int i, tc, err;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	if (err)
		return err;

	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
	if (err)
		return err;

	for (i = 0; i < priv->channels.num; i++) {
		struct mlx5e_channel *c = priv->channels.c[i];

		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
			struct mlx5e_txqsq *sq = &c->sq[tc];

			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
			if (err)
				return err;
		}
	}
	return devlink_fmsg_arr_pair_nest_end(fmsg);
}

/* Dispatch to the dump handler stored in the error context. */
static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
					   struct mlx5e_err_ctx *err_ctx,
					   struct devlink_fmsg *fmsg)
{
	return err_ctx->dump(priv, fmsg, err_ctx->ctx);
}

/* devlink health .dump op. With a context (driver-initiated report), dump
 * what that context's handler selects; without one (user-initiated
 * `devlink health dump`), dump all SQs.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) :
			 mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);
}

/* Entry point called when an error CQE is detected on an SQ: file a
 * health report that will run mlx5e_tx_reporter_err_cqe_recover() and
 * dump this SQ's state.
 */
void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
{
	struct mlx5e_priv *priv = sq->channel->priv;
	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
	struct mlx5e_err_ctx err_ctx = {};

	err_ctx.ctx = sq;
	err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
	snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);

	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
}

/* Entry point for a TX timeout on an SQ (from ndo_tx_timeout): file a
 * health report that will run mlx5e_tx_reporter_timeout_recover() and
 * dump this SQ's state.
 *
 * Returns the recovery status written into to_ctx by the recover callback
 * (0 = this SQ recovered, 1 = all channels reopened, <0 = errno); stays 0
 * if the callback never ran (e.g. report suppressed by grace period).
 */
int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
{
	struct mlx5e_priv *priv = sq->channel->priv;
	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
	struct mlx5e_tx_timeout_ctx to_ctx = {};
	struct mlx5e_err_ctx err_ctx = {};

	to_ctx.sq = sq;
	err_ctx.ctx = &to_ctx;
	err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
	snprintf(err_str, sizeof(err_str),
		 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
		 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
		 jiffies_to_usecs(jiffies - sq->txq->trans_start));

	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
	return to_ctx.status;
}

static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
		.name = "tx",
		.recover = mlx5e_tx_reporter_recover,
		.diagnose = mlx5e_tx_reporter_diagnose,
		.dump = mlx5e_tx_reporter_dump,
};

/* Auto-recovery grace period (ms) passed to devlink for this reporter. */
#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500

/* Register the "tx" devlink port health reporter. Creation failure is
 * non-fatal: a warning is logged and priv->tx_reporter stays unset (the
 * report entry points then pass a NULL reporter to mlx5e_health_report).
 */
void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
{
	struct devlink_health_reporter *reporter;

	reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops,
						       MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
	if (IS_ERR(reporter)) {
		netdev_warn(priv->netdev,
			    "Failed to create tx reporter, err = %ld\n",
			    PTR_ERR(reporter));
		return;
	}
	priv->tx_reporter = reporter;
}

/* Unregister the "tx" health reporter; safe to call if creation failed. */
void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
{
	if (!priv->tx_reporter)
		return;

	devlink_port_health_reporter_destroy(priv->tx_reporter);
}