1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 6 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 7 { 8 unsigned long exp_time = jiffies + msecs_to_jiffies(2000); 9 10 while (time_before(jiffies, exp_time)) { 11 if (sq->cc == sq->pc) 12 return 0; 13 14 msleep(20); 15 } 16 17 netdev_err(sq->channel->netdev, 18 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 19 sq->sqn, sq->cc, sq->pc); 20 21 return -ETIMEDOUT; 22 } 23 24 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 25 { 26 WARN_ONCE(sq->cc != sq->pc, 27 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 28 sq->sqn, sq->cc, sq->pc); 29 sq->cc = 0; 30 sq->dma_fifo_cc = 0; 31 sq->pc = 0; 32 } 33 34 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 35 { 36 struct mlx5_core_dev *mdev; 37 struct net_device *dev; 38 struct mlx5e_txqsq *sq; 39 u8 state; 40 int err; 41 42 sq = ctx; 43 mdev = sq->channel->mdev; 44 dev = sq->channel->netdev; 45 46 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 47 return 0; 48 49 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 50 if (err) { 51 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 52 sq->sqn, err); 53 goto out; 54 } 55 56 if (state != MLX5_SQC_STATE_ERR) 57 goto out; 58 59 mlx5e_tx_disable_queue(sq->txq); 60 61 err = mlx5e_wait_for_sq_flush(sq); 62 if (err) 63 goto out; 64 65 /* At this point, no new packets will arrive from the stack as TXQ is 66 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 67 * pending WQEs. SQ can safely reset the SQ. 68 */ 69 70 err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn); 71 if (err) 72 goto out; 73 74 mlx5e_reset_txqsq_cc_pc(sq); 75 sq->stats->recover++; 76 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 77 mlx5e_activate_txqsq(sq); 78 79 return 0; 80 out: 81 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 82 return err; 83 } 84 85 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 86 { 87 struct mlx5e_priv *priv = sq->channel->priv; 88 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 89 struct mlx5e_err_ctx err_ctx = {0}; 90 91 err_ctx.ctx = sq; 92 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 93 sprintf(err_str, "ERR CQE on SQ: 0x%x", sq->sqn); 94 95 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 96 } 97 98 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 99 { 100 struct mlx5_eq_comp *eq; 101 struct mlx5e_txqsq *sq; 102 int err; 103 104 sq = ctx; 105 eq = sq->cq.mcq.eq; 106 err = mlx5e_health_channel_eq_recover(eq, sq->channel); 107 if (err) 108 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 109 110 return err; 111 } 112 113 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 114 { 115 struct mlx5e_priv *priv = sq->channel->priv; 116 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 117 struct mlx5e_err_ctx err_ctx; 118 119 err_ctx.ctx = sq; 120 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 121 sprintf(err_str, 122 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", 123 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 124 jiffies_to_usecs(jiffies - sq->txq->trans_start)); 125 126 return mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 127 } 128 129 /* state lock cannot be grabbed within this function. 130 * It can cause a dead lock or a read-after-free. 131 */ 132 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 133 { 134 return err_ctx->recover(err_ctx->ctx); 135 } 136 137 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 138 void *context) 139 { 140 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 141 struct mlx5e_err_ctx *err_ctx = context; 142 143 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 144 mlx5e_health_recover_channels(priv); 145 } 146 147 static int 148 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 149 struct mlx5e_txqsq *sq, int tc) 150 { 151 struct mlx5e_priv *priv = sq->channel->priv; 152 bool stopped = netif_xmit_stopped(sq->txq); 153 u8 state; 154 int err; 155 156 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 157 if (err) 158 return err; 159 160 err = devlink_fmsg_obj_nest_start(fmsg); 161 if (err) 162 return err; 163 164 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 165 if (err) 166 return err; 167 168 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 169 if (err) 170 return err; 171 172 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 173 if (err) 174 return err; 175 176 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 177 if (err) 178 return err; 179 180 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 181 if (err) 182 return err; 183 184 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 185 if (err) 186 return err; 187 188 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 189 if (err) 190 return err; 191 192 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 193 if (err) 194 return err; 195 196 err = mlx5e_reporter_cq_diagnose(&sq->cq, fmsg); 197 if (err) 198 return err; 199 200 err = devlink_fmsg_obj_nest_end(fmsg); 201 if (err) 202 return err; 203 204 return 0; 205 } 206 207 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 208 struct devlink_fmsg *fmsg) 209 { 210 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 211 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 212 u32 sq_stride, sq_sz; 213 214 int i, tc, err = 0; 215 216 mutex_lock(&priv->state_lock); 217 218 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 219 goto unlock; 220 221 sq_sz = mlx5_wq_cyc_get_size(&generic_sq->wq); 222 sq_stride = MLX5_SEND_WQE_BB; 223 224 err = mlx5e_reporter_named_obj_nest_start(fmsg, "Common Config"); 225 if (err) 226 goto unlock; 227 228 err = mlx5e_reporter_named_obj_nest_start(fmsg, "SQ"); 229 if (err) 230 goto unlock; 231 232 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 233 if (err) 234 goto unlock; 235 236 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 237 if (err) 238 goto unlock; 239 240 err = mlx5e_reporter_cq_common_diagnose(&generic_sq->cq, fmsg); 241 if (err) 242 goto unlock; 243 244 err = mlx5e_reporter_named_obj_nest_end(fmsg); 245 if (err) 246 goto unlock; 247 248 err = mlx5e_reporter_named_obj_nest_end(fmsg); 249 if (err) 250 goto unlock; 251 252 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 253 if (err) 254 goto unlock; 255 256 for (i = 0; i < priv->channels.num; i++) { 257 struct mlx5e_channel *c = priv->channels.c[i]; 258 259 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 260 struct mlx5e_txqsq *sq = &c->sq[tc]; 261 262 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 263 if (err) 264 goto unlock; 265 } 266 } 267 err = devlink_fmsg_arr_pair_nest_end(fmsg); 268 if (err) 269 goto unlock; 270 271 unlock: 272 mutex_unlock(&priv->state_lock); 273 return err; 274 } 275 276 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 277 .name = "tx", 278 .recover = mlx5e_tx_reporter_recover, 279 .diagnose = mlx5e_tx_reporter_diagnose, 280 }; 281 282 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 283 284 int mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 285 { 286 struct devlink_health_reporter *reporter; 287 struct mlx5_core_dev *mdev = priv->mdev; 288 struct devlink *devlink; 289 290 devlink = priv_to_devlink(mdev); 291 reporter = 292 devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, 293 MLX5_REPORTER_TX_GRACEFUL_PERIOD, 294 true, priv); 295 if (IS_ERR(reporter)) { 296 netdev_warn(priv->netdev, 297 "Failed to create tx reporter, err = %ld\n", 298 PTR_ERR(reporter)); 299 return PTR_ERR(reporter); 300 } 301 priv->tx_reporter = reporter; 302 return 0; 303 } 304 305 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 306 { 307 if (!priv->tx_reporter) 308 return; 309 310 devlink_health_reporter_destroy(priv->tx_reporter); 311 } 312