1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 6 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 7 { 8 unsigned long exp_time = jiffies + msecs_to_jiffies(2000); 9 10 while (time_before(jiffies, exp_time)) { 11 if (sq->cc == sq->pc) 12 return 0; 13 14 msleep(20); 15 } 16 17 netdev_err(sq->channel->netdev, 18 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 19 sq->sqn, sq->cc, sq->pc); 20 21 return -ETIMEDOUT; 22 } 23 24 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 25 { 26 WARN_ONCE(sq->cc != sq->pc, 27 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 28 sq->sqn, sq->cc, sq->pc); 29 sq->cc = 0; 30 sq->dma_fifo_cc = 0; 31 sq->pc = 0; 32 } 33 34 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 35 { 36 struct mlx5_core_dev *mdev; 37 struct net_device *dev; 38 struct mlx5e_txqsq *sq; 39 u8 state; 40 int err; 41 42 sq = ctx; 43 mdev = sq->channel->mdev; 44 dev = sq->channel->netdev; 45 46 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 47 return 0; 48 49 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 50 if (err) { 51 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 52 sq->sqn, err); 53 goto out; 54 } 55 56 if (state != MLX5_SQC_STATE_ERR) 57 goto out; 58 59 mlx5e_tx_disable_queue(sq->txq); 60 61 err = mlx5e_wait_for_sq_flush(sq); 62 if (err) 63 goto out; 64 65 /* At this point, no new packets will arrive from the stack as TXQ is 66 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 67 * pending WQEs. SQ can safely reset the SQ. 68 */ 69 70 err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn); 71 if (err) 72 goto out; 73 74 mlx5e_reset_txqsq_cc_pc(sq); 75 sq->stats->recover++; 76 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 77 mlx5e_activate_txqsq(sq); 78 79 return 0; 80 out: 81 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 82 return err; 83 } 84 85 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 86 { 87 struct mlx5e_priv *priv = sq->channel->priv; 88 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 89 struct mlx5e_err_ctx err_ctx = {0}; 90 91 err_ctx.ctx = sq; 92 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 93 sprintf(err_str, "ERR CQE on SQ: 0x%x", sq->sqn); 94 95 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 96 } 97 98 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 99 { 100 struct mlx5_eq_comp *eq; 101 struct mlx5e_txqsq *sq; 102 int err; 103 104 sq = ctx; 105 eq = sq->cq.mcq.eq; 106 err = mlx5e_health_channel_eq_recover(eq, sq->channel); 107 if (err) 108 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 109 110 return err; 111 } 112 113 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 114 { 115 struct mlx5e_priv *priv = sq->channel->priv; 116 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 117 struct mlx5e_err_ctx err_ctx; 118 119 err_ctx.ctx = sq; 120 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 121 sprintf(err_str, 122 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", 123 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 124 jiffies_to_usecs(jiffies - sq->txq->trans_start)); 125 126 return mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 127 } 128 129 /* state lock cannot be grabbed within this function. 130 * It can cause a dead lock or a read-after-free. 131 */ 132 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 133 { 134 return err_ctx->recover(err_ctx->ctx); 135 } 136 137 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 138 void *context) 139 { 140 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 141 struct mlx5e_err_ctx *err_ctx = context; 142 143 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 144 mlx5e_health_recover_channels(priv); 145 } 146 147 static int 148 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 149 u32 sqn, u8 state, bool stopped) 150 { 151 int err; 152 153 err = devlink_fmsg_obj_nest_start(fmsg); 154 if (err) 155 return err; 156 157 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sqn); 158 if (err) 159 return err; 160 161 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 162 if (err) 163 return err; 164 165 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 166 if (err) 167 return err; 168 169 err = devlink_fmsg_obj_nest_end(fmsg); 170 if (err) 171 return err; 172 173 return 0; 174 } 175 176 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 177 struct devlink_fmsg *fmsg) 178 { 179 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 180 int i, err = 0; 181 182 mutex_lock(&priv->state_lock); 183 184 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 185 goto unlock; 186 187 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 188 if (err) 189 goto unlock; 190 191 for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; 192 i++) { 193 struct mlx5e_txqsq *sq = priv->txq2sq[i]; 194 u8 state; 195 196 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 197 if (err) 198 goto unlock; 199 200 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq->sqn, 201 state, 202 netif_xmit_stopped(sq->txq)); 203 if (err) 204 goto unlock; 205 } 206 err = devlink_fmsg_arr_pair_nest_end(fmsg); 207 if (err) 208 goto unlock; 209 210 unlock: 211 mutex_unlock(&priv->state_lock); 212 return err; 213 } 214 215 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 216 .name = "tx", 217 .recover = mlx5e_tx_reporter_recover, 218 .diagnose = mlx5e_tx_reporter_diagnose, 219 }; 220 221 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 222 223 int mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 224 { 225 struct devlink_health_reporter *reporter; 226 struct mlx5_core_dev *mdev = priv->mdev; 227 struct devlink *devlink; 228 229 devlink = priv_to_devlink(mdev); 230 reporter = 231 devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, 232 MLX5_REPORTER_TX_GRACEFUL_PERIOD, 233 true, priv); 234 if (IS_ERR(reporter)) { 235 netdev_warn(priv->netdev, 236 "Failed to create tx reporter, err = %ld\n", 237 PTR_ERR(reporter)); 238 return PTR_ERR(reporter); 239 } 240 priv->tx_reporter = reporter; 241 return 0; 242 } 243 244 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 245 { 246 if (!priv->tx_reporter) 247 return; 248 249 devlink_health_reporter_destroy(priv->tx_reporter); 250 } 251