1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 6 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 7 { 8 unsigned long exp_time = jiffies + 9 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC); 10 11 while (time_before(jiffies, exp_time)) { 12 if (sq->cc == sq->pc) 13 return 0; 14 15 msleep(20); 16 } 17 18 netdev_err(sq->channel->netdev, 19 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 20 sq->sqn, sq->cc, sq->pc); 21 22 return -ETIMEDOUT; 23 } 24 25 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 26 { 27 WARN_ONCE(sq->cc != sq->pc, 28 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 29 sq->sqn, sq->cc, sq->pc); 30 sq->cc = 0; 31 sq->dma_fifo_cc = 0; 32 sq->pc = 0; 33 } 34 35 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 36 { 37 struct mlx5_core_dev *mdev; 38 struct net_device *dev; 39 struct mlx5e_txqsq *sq; 40 u8 state; 41 int err; 42 43 sq = ctx; 44 mdev = sq->channel->mdev; 45 dev = sq->channel->netdev; 46 47 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 48 return 0; 49 50 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 51 if (err) { 52 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 53 sq->sqn, err); 54 goto out; 55 } 56 57 if (state != MLX5_SQC_STATE_ERR) 58 goto out; 59 60 mlx5e_tx_disable_queue(sq->txq); 61 62 err = mlx5e_wait_for_sq_flush(sq); 63 if (err) 64 goto out; 65 66 /* At this point, no new packets will arrive from the stack as TXQ is 67 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 68 * pending WQEs. SQ can safely reset the SQ. 69 */ 70 71 err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn); 72 if (err) 73 goto out; 74 75 mlx5e_reset_txqsq_cc_pc(sq); 76 sq->stats->recover++; 77 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 78 mlx5e_activate_txqsq(sq); 79 80 return 0; 81 out: 82 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 83 return err; 84 } 85 86 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 87 { 88 struct mlx5_eq_comp *eq; 89 struct mlx5e_txqsq *sq; 90 int err; 91 92 sq = ctx; 93 eq = sq->cq.mcq.eq; 94 err = mlx5e_health_channel_eq_recover(eq, sq->channel); 95 if (err) 96 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 97 98 return err; 99 } 100 101 /* state lock cannot be grabbed within this function. 102 * It can cause a dead lock or a read-after-free. 103 */ 104 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 105 { 106 return err_ctx->recover(err_ctx->ctx); 107 } 108 109 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 110 void *context, 111 struct netlink_ext_ack *extack) 112 { 113 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 114 struct mlx5e_err_ctx *err_ctx = context; 115 116 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 117 mlx5e_health_recover_channels(priv); 118 } 119 120 static int 121 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 122 struct mlx5e_txqsq *sq, int tc) 123 { 124 struct mlx5e_priv *priv = sq->channel->priv; 125 bool stopped = netif_xmit_stopped(sq->txq); 126 u8 state; 127 int err; 128 129 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 130 if (err) 131 return err; 132 133 err = devlink_fmsg_obj_nest_start(fmsg); 134 if (err) 135 return err; 136 137 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 138 if (err) 139 return err; 140 141 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 142 if (err) 143 return err; 144 145 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 146 if (err) 147 return err; 148 149 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 150 if (err) 151 return err; 152 153 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 154 if (err) 155 return err; 156 157 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 158 if (err) 159 return err; 160 161 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 162 if (err) 163 return err; 164 165 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 166 if (err) 167 return err; 168 169 err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg); 170 if (err) 171 return err; 172 173 err = mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg); 174 if (err) 175 return err; 176 177 err = devlink_fmsg_obj_nest_end(fmsg); 178 if (err) 179 return err; 180 181 return 0; 182 } 183 184 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 185 struct devlink_fmsg *fmsg, 186 struct netlink_ext_ack *extack) 187 { 188 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 189 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 190 u32 sq_stride, sq_sz; 191 192 int i, tc, err = 0; 193 194 mutex_lock(&priv->state_lock); 195 196 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 197 goto unlock; 198 199 sq_sz = mlx5_wq_cyc_get_size(&generic_sq->wq); 200 sq_stride = MLX5_SEND_WQE_BB; 201 202 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config"); 203 if (err) 204 goto unlock; 205 206 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 207 if (err) 208 goto unlock; 209 210 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 211 if (err) 212 goto unlock; 213 214 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 215 if (err) 216 goto unlock; 217 218 err = mlx5e_health_cq_common_diag_fmsg(&generic_sq->cq, fmsg); 219 if (err) 220 goto unlock; 221 222 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 223 if (err) 224 goto unlock; 225 226 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 227 if (err) 228 goto unlock; 229 230 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 231 if (err) 232 goto unlock; 233 234 for (i = 0; i < priv->channels.num; i++) { 235 struct mlx5e_channel *c = priv->channels.c[i]; 236 237 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 238 struct mlx5e_txqsq *sq = &c->sq[tc]; 239 240 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 241 if (err) 242 goto unlock; 243 } 244 } 245 err = devlink_fmsg_arr_pair_nest_end(fmsg); 246 if (err) 247 goto unlock; 248 249 unlock: 250 mutex_unlock(&priv->state_lock); 251 return err; 252 } 253 254 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 255 void *ctx) 256 { 257 struct mlx5_rsc_key key = {}; 258 struct mlx5e_txqsq *sq = ctx; 259 int err; 260 261 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 262 return 0; 263 264 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 265 if (err) 266 return err; 267 268 key.size = PAGE_SIZE; 269 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 270 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 271 if (err) 272 return err; 273 274 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 275 if (err) 276 return err; 277 278 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); 279 if (err) 280 return err; 281 282 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); 283 if (err) 284 return err; 285 286 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 287 key.index1 = sq->sqn; 288 key.num_of_obj1 = 1; 289 290 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 291 if (err) 292 return err; 293 294 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 295 if (err) 296 return err; 297 298 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); 299 if (err) 300 return err; 301 302 key.rsc = MLX5_SGMT_TYPE_SND_BUFF; 303 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 304 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 305 if (err) 306 return err; 307 308 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 309 if (err) 310 return err; 311 312 return mlx5e_health_fmsg_named_obj_nest_end(fmsg); 313 } 314 315 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv, 316 struct devlink_fmsg *fmsg) 317 { 318 struct mlx5_rsc_key key = {}; 319 int i, tc, err; 320 321 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 322 return 0; 323 324 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); 325 if (err) 326 return err; 327 328 key.size = PAGE_SIZE; 329 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 330 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 331 if (err) 332 return err; 333 334 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); 335 if (err) 336 return err; 337 338 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 339 if (err) 340 return err; 341 342 for (i = 0; i < priv->channels.num; i++) { 343 struct mlx5e_channel *c = priv->channels.c[i]; 344 345 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 346 struct mlx5e_txqsq *sq = &c->sq[tc]; 347 348 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ"); 349 if (err) 350 return err; 351 } 352 } 353 return devlink_fmsg_arr_pair_nest_end(fmsg); 354 } 355 356 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv, 357 struct mlx5e_err_ctx *err_ctx, 358 struct devlink_fmsg *fmsg) 359 { 360 return err_ctx->dump(priv, fmsg, err_ctx->ctx); 361 } 362 363 static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter, 364 struct devlink_fmsg *fmsg, void *context, 365 struct netlink_ext_ack *extack) 366 { 367 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 368 struct mlx5e_err_ctx *err_ctx = context; 369 370 return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : 371 mlx5e_tx_reporter_dump_all_sqs(priv, fmsg); 372 } 373 374 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 375 { 376 struct mlx5e_priv *priv = sq->channel->priv; 377 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 378 struct mlx5e_err_ctx err_ctx = {}; 379 380 err_ctx.ctx = sq; 381 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 382 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 383 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn); 384 385 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 386 } 387 388 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 389 { 390 struct mlx5e_priv *priv = sq->channel->priv; 391 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 392 struct mlx5e_err_ctx err_ctx = {}; 393 394 err_ctx.ctx = sq; 395 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 396 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 397 snprintf(err_str, sizeof(err_str), 398 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", 399 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 400 jiffies_to_usecs(jiffies - sq->txq->trans_start)); 401 402 return mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 403 } 404 405 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 406 .name = "tx", 407 .recover = mlx5e_tx_reporter_recover, 408 .diagnose = mlx5e_tx_reporter_diagnose, 409 .dump = mlx5e_tx_reporter_dump, 410 }; 411 412 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 413 414 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 415 { 416 struct devlink_health_reporter *reporter; 417 struct mlx5_core_dev *mdev = priv->mdev; 418 struct devlink *devlink; 419 420 devlink = priv_to_devlink(mdev); 421 reporter = 422 devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, 423 MLX5_REPORTER_TX_GRACEFUL_PERIOD, 424 priv); 425 if (IS_ERR(reporter)) { 426 netdev_warn(priv->netdev, 427 "Failed to create tx reporter, err = %ld\n", 428 PTR_ERR(reporter)); 429 return; 430 } 431 priv->tx_reporter = reporter; 432 } 433 434 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 435 { 436 if (!priv->tx_reporter) 437 return; 438 439 devlink_health_reporter_destroy(priv->tx_reporter); 440 } 441