1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 6 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 7 { 8 unsigned long exp_time = jiffies + 9 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC); 10 11 while (time_before(jiffies, exp_time)) { 12 if (sq->cc == sq->pc) 13 return 0; 14 15 msleep(20); 16 } 17 18 netdev_err(sq->channel->netdev, 19 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 20 sq->sqn, sq->cc, sq->pc); 21 22 return -ETIMEDOUT; 23 } 24 25 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 26 { 27 WARN_ONCE(sq->cc != sq->pc, 28 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 29 sq->sqn, sq->cc, sq->pc); 30 sq->cc = 0; 31 sq->dma_fifo_cc = 0; 32 sq->pc = 0; 33 } 34 35 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 36 { 37 struct mlx5_core_dev *mdev; 38 struct net_device *dev; 39 struct mlx5e_txqsq *sq; 40 u8 state; 41 int err; 42 43 sq = ctx; 44 mdev = sq->channel->mdev; 45 dev = sq->channel->netdev; 46 47 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 48 return 0; 49 50 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 51 if (err) { 52 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 53 sq->sqn, err); 54 goto out; 55 } 56 57 if (state != MLX5_SQC_STATE_ERR) 58 goto out; 59 60 mlx5e_tx_disable_queue(sq->txq); 61 62 err = mlx5e_wait_for_sq_flush(sq); 63 if (err) 64 goto out; 65 66 /* At this point, no new packets will arrive from the stack as TXQ is 67 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 68 * pending WQEs. SQ can safely reset the SQ. 69 */ 70 71 err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn); 72 if (err) 73 goto out; 74 75 mlx5e_reset_txqsq_cc_pc(sq); 76 sq->stats->recover++; 77 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 78 mlx5e_activate_txqsq(sq); 79 80 return 0; 81 out: 82 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 83 return err; 84 } 85 86 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 87 { 88 struct mlx5_eq_comp *eq; 89 struct mlx5e_txqsq *sq; 90 int err; 91 92 sq = ctx; 93 eq = sq->cq.mcq.eq; 94 err = mlx5e_health_channel_eq_recover(eq, sq->channel); 95 if (err) 96 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 97 98 return err; 99 } 100 101 /* state lock cannot be grabbed within this function. 102 * It can cause a dead lock or a read-after-free. 103 */ 104 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 105 { 106 return err_ctx->recover(err_ctx->ctx); 107 } 108 109 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 110 void *context, 111 struct netlink_ext_ack *extack) 112 { 113 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 114 struct mlx5e_err_ctx *err_ctx = context; 115 116 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 117 mlx5e_health_recover_channels(priv); 118 } 119 120 static int 121 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 122 struct mlx5e_txqsq *sq, int tc) 123 { 124 struct mlx5e_priv *priv = sq->channel->priv; 125 bool stopped = netif_xmit_stopped(sq->txq); 126 u8 state; 127 int err; 128 129 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 130 if (err) 131 return err; 132 133 err = devlink_fmsg_obj_nest_start(fmsg); 134 if (err) 135 return err; 136 137 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 138 if (err) 139 return err; 140 141 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 142 if (err) 143 return err; 144 145 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 146 if (err) 147 return err; 148 149 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 150 if (err) 151 return err; 152 153 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 154 if (err) 155 return err; 156 157 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 158 if (err) 159 return err; 160 161 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 162 if (err) 163 return err; 164 165 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 166 if (err) 167 return err; 168 169 err = mlx5e_reporter_cq_diagnose(&sq->cq, fmsg); 170 if (err) 171 return err; 172 173 err = devlink_fmsg_obj_nest_end(fmsg); 174 if (err) 175 return err; 176 177 return 0; 178 } 179 180 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 181 struct devlink_fmsg *fmsg, 182 struct netlink_ext_ack *extack) 183 { 184 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 185 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 186 u32 sq_stride, sq_sz; 187 188 int i, tc, err = 0; 189 190 mutex_lock(&priv->state_lock); 191 192 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 193 goto unlock; 194 195 sq_sz = mlx5_wq_cyc_get_size(&generic_sq->wq); 196 sq_stride = MLX5_SEND_WQE_BB; 197 198 err = mlx5e_reporter_named_obj_nest_start(fmsg, "Common Config"); 199 if (err) 200 goto unlock; 201 202 err = mlx5e_reporter_named_obj_nest_start(fmsg, "SQ"); 203 if (err) 204 goto unlock; 205 206 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 207 if (err) 208 goto unlock; 209 210 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 211 if (err) 212 goto unlock; 213 214 err = mlx5e_reporter_cq_common_diagnose(&generic_sq->cq, fmsg); 215 if (err) 216 goto unlock; 217 218 err = mlx5e_reporter_named_obj_nest_end(fmsg); 219 if (err) 220 goto unlock; 221 222 err = mlx5e_reporter_named_obj_nest_end(fmsg); 223 if (err) 224 goto unlock; 225 226 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 227 if (err) 228 goto unlock; 229 230 for (i = 0; i < priv->channels.num; i++) { 231 struct mlx5e_channel *c = priv->channels.c[i]; 232 233 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 234 struct mlx5e_txqsq *sq = &c->sq[tc]; 235 236 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 237 if (err) 238 goto unlock; 239 } 240 } 241 err = devlink_fmsg_arr_pair_nest_end(fmsg); 242 if (err) 243 goto unlock; 244 245 unlock: 246 mutex_unlock(&priv->state_lock); 247 return err; 248 } 249 250 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, 251 void *ctx) 252 { 253 struct mlx5_rsc_key key = {}; 254 struct mlx5e_txqsq *sq = ctx; 255 int err; 256 257 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 258 return 0; 259 260 err = mlx5e_reporter_named_obj_nest_start(fmsg, "SX Slice"); 261 if (err) 262 return err; 263 264 key.size = PAGE_SIZE; 265 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 266 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 267 if (err) 268 return err; 269 270 err = mlx5e_reporter_named_obj_nest_end(fmsg); 271 if (err) 272 return err; 273 274 err = mlx5e_reporter_named_obj_nest_start(fmsg, "SQ"); 275 if (err) 276 return err; 277 278 err = mlx5e_reporter_named_obj_nest_start(fmsg, "QPC"); 279 if (err) 280 return err; 281 282 key.rsc = MLX5_SGMT_TYPE_FULL_QPC; 283 key.index1 = sq->sqn; 284 key.num_of_obj1 = 1; 285 286 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 287 if (err) 288 return err; 289 290 err = mlx5e_reporter_named_obj_nest_end(fmsg); 291 if (err) 292 return err; 293 294 err = mlx5e_reporter_named_obj_nest_start(fmsg, "send_buff"); 295 if (err) 296 return err; 297 298 key.rsc = MLX5_SGMT_TYPE_SND_BUFF; 299 key.num_of_obj2 = MLX5_RSC_DUMP_ALL; 300 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 301 if (err) 302 return err; 303 304 err = mlx5e_reporter_named_obj_nest_end(fmsg); 305 if (err) 306 return err; 307 308 return mlx5e_reporter_named_obj_nest_end(fmsg); 309 } 310 311 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv, 312 struct devlink_fmsg *fmsg) 313 { 314 struct mlx5_rsc_key key = {}; 315 int i, tc, err; 316 317 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 318 return 0; 319 320 err = mlx5e_reporter_named_obj_nest_start(fmsg, "SX Slice"); 321 if (err) 322 return err; 323 324 key.size = PAGE_SIZE; 325 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; 326 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); 327 if (err) 328 return err; 329 330 err = mlx5e_reporter_named_obj_nest_end(fmsg); 331 if (err) 332 return err; 333 334 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 335 if (err) 336 return err; 337 338 for (i = 0; i < priv->channels.num; i++) { 339 struct mlx5e_channel *c = priv->channels.c[i]; 340 341 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 342 struct mlx5e_txqsq *sq = &c->sq[tc]; 343 344 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ"); 345 if (err) 346 return err; 347 } 348 } 349 return devlink_fmsg_arr_pair_nest_end(fmsg); 350 } 351 352 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv, 353 struct mlx5e_err_ctx *err_ctx, 354 struct devlink_fmsg *fmsg) 355 { 356 return err_ctx->dump(priv, fmsg, err_ctx->ctx); 357 } 358 359 static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter, 360 struct devlink_fmsg *fmsg, void *context, 361 struct netlink_ext_ack *extack) 362 { 363 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 364 struct mlx5e_err_ctx *err_ctx = context; 365 366 return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : 367 mlx5e_tx_reporter_dump_all_sqs(priv, fmsg); 368 } 369 370 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 371 { 372 struct mlx5e_priv *priv = sq->channel->priv; 373 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 374 struct mlx5e_err_ctx err_ctx = {}; 375 376 err_ctx.ctx = sq; 377 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 378 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 379 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn); 380 381 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 382 } 383 384 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 385 { 386 struct mlx5e_priv *priv = sq->channel->priv; 387 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 388 struct mlx5e_err_ctx err_ctx = {}; 389 390 err_ctx.ctx = sq; 391 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 392 err_ctx.dump = mlx5e_tx_reporter_dump_sq; 393 snprintf(err_str, sizeof(err_str), 394 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", 395 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 396 jiffies_to_usecs(jiffies - sq->txq->trans_start)); 397 398 return mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 399 } 400 401 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 402 .name = "tx", 403 .recover = mlx5e_tx_reporter_recover, 404 .diagnose = mlx5e_tx_reporter_diagnose, 405 .dump = mlx5e_tx_reporter_dump, 406 }; 407 408 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 409 410 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 411 { 412 struct devlink_health_reporter *reporter; 413 struct mlx5_core_dev *mdev = priv->mdev; 414 struct devlink *devlink; 415 416 devlink = priv_to_devlink(mdev); 417 reporter = 418 devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, 419 MLX5_REPORTER_TX_GRACEFUL_PERIOD, 420 priv); 421 if (IS_ERR(reporter)) { 422 netdev_warn(priv->netdev, 423 "Failed to create tx reporter, err = %ld\n", 424 PTR_ERR(reporter)); 425 return; 426 } 427 priv->tx_reporter = reporter; 428 } 429 430 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 431 { 432 if (!priv->tx_reporter) 433 return; 434 435 devlink_health_reporter_destroy(priv->tx_reporter); 436 } 437