1de8650a8SEran Ben Elisha /* SPDX-License-Identifier: GPL-2.0 */ 2de8650a8SEran Ben Elisha /* Copyright (c) 2019 Mellanox Technologies. */ 3de8650a8SEran Ben Elisha 44edc17fdSAya Levin #include "health.h" 5de8650a8SEran Ben Elisha 6de8650a8SEran Ben Elisha static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 7de8650a8SEran Ben Elisha { 8de8650a8SEran Ben Elisha unsigned long exp_time = jiffies + msecs_to_jiffies(2000); 9de8650a8SEran Ben Elisha 10de8650a8SEran Ben Elisha while (time_before(jiffies, exp_time)) { 11de8650a8SEran Ben Elisha if (sq->cc == sq->pc) 12de8650a8SEran Ben Elisha return 0; 13de8650a8SEran Ben Elisha 14de8650a8SEran Ben Elisha msleep(20); 15de8650a8SEran Ben Elisha } 16de8650a8SEran Ben Elisha 17de8650a8SEran Ben Elisha netdev_err(sq->channel->netdev, 18de8650a8SEran Ben Elisha "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 19de8650a8SEran Ben Elisha sq->sqn, sq->cc, sq->pc); 20de8650a8SEran Ben Elisha 21de8650a8SEran Ben Elisha return -ETIMEDOUT; 22de8650a8SEran Ben Elisha } 23de8650a8SEran Ben Elisha 24de8650a8SEran Ben Elisha static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 25de8650a8SEran Ben Elisha { 26de8650a8SEran Ben Elisha WARN_ONCE(sq->cc != sq->pc, 27de8650a8SEran Ben Elisha "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 28de8650a8SEran Ben Elisha sq->sqn, sq->cc, sq->pc); 29de8650a8SEran Ben Elisha sq->cc = 0; 30de8650a8SEran Ben Elisha sq->dma_fifo_cc = 0; 31de8650a8SEran Ben Elisha sq->pc = 0; 32de8650a8SEran Ben Elisha } 33de8650a8SEran Ben Elisha 34c50de4afSAya Levin static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 35de8650a8SEran Ben Elisha { 36c50de4afSAya Levin struct mlx5_core_dev *mdev; 37c50de4afSAya Levin struct net_device *dev; 38c50de4afSAya Levin struct mlx5e_txqsq *sq; 39de8650a8SEran Ben Elisha u8 state; 40de8650a8SEran Ben Elisha int err; 41de8650a8SEran Ben Elisha 42c50de4afSAya Levin sq = ctx; 43c50de4afSAya Levin mdev = sq->channel->mdev; 44c50de4afSAya Levin dev = sq->channel->netdev; 45c50de4afSAya Levin 46c50de4afSAya Levin if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 47c50de4afSAya Levin return 0; 48c50de4afSAya Levin 49de8650a8SEran Ben Elisha err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 50de8650a8SEran Ben Elisha if (err) { 51de8650a8SEran Ben Elisha netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 52de8650a8SEran Ben Elisha sq->sqn, err); 53276d197eSAya Levin goto out; 54de8650a8SEran Ben Elisha } 55de8650a8SEran Ben Elisha 56d9a2fcf5SAya Levin if (state != MLX5_SQC_STATE_ERR) 57276d197eSAya Levin goto out; 58de8650a8SEran Ben Elisha 59de8650a8SEran Ben Elisha mlx5e_tx_disable_queue(sq->txq); 60de8650a8SEran Ben Elisha 61de8650a8SEran Ben Elisha err = mlx5e_wait_for_sq_flush(sq); 62de8650a8SEran Ben Elisha if (err) 63276d197eSAya Levin goto out; 64de8650a8SEran Ben Elisha 65de8650a8SEran Ben Elisha /* At this point, no new packets will arrive from the stack as TXQ is 66de8650a8SEran Ben Elisha * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 67de8650a8SEran Ben Elisha * pending WQEs. SQ can safely reset the SQ. 68de8650a8SEran Ben Elisha */ 69de8650a8SEran Ben Elisha 70c50de4afSAya Levin err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn); 71de8650a8SEran Ben Elisha if (err) 72276d197eSAya Levin goto out; 73de8650a8SEran Ben Elisha 74de8650a8SEran Ben Elisha mlx5e_reset_txqsq_cc_pc(sq); 75de8650a8SEran Ben Elisha sq->stats->recover++; 76276d197eSAya Levin clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 77de8650a8SEran Ben Elisha mlx5e_activate_txqsq(sq); 78de8650a8SEran Ben Elisha 79de8650a8SEran Ben Elisha return 0; 80276d197eSAya Levin out: 81276d197eSAya Levin clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 82276d197eSAya Levin return err; 83de8650a8SEran Ben Elisha } 84de8650a8SEran Ben Elisha 8506293ae4SAya Levin void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 86de8650a8SEran Ben Elisha { 87c50de4afSAya Levin struct mlx5e_priv *priv = sq->channel->priv; 88c50de4afSAya Levin char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 89c50de4afSAya Levin struct mlx5e_err_ctx err_ctx = {0}; 90de8650a8SEran Ben Elisha 91c50de4afSAya Levin err_ctx.ctx = sq; 92de8650a8SEran Ben Elisha err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 93de8650a8SEran Ben Elisha sprintf(err_str, "ERR CQE on SQ: 0x%x", sq->sqn); 94de8650a8SEran Ben Elisha 95c50de4afSAya Levin mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 96de8650a8SEran Ben Elisha } 97de8650a8SEran Ben Elisha 98c50de4afSAya Levin static int mlx5e_tx_reporter_timeout_recover(void *ctx) 997d91126bSEran Ben Elisha { 100c50de4afSAya Levin struct mlx5_eq_comp *eq; 101c50de4afSAya Levin struct mlx5e_txqsq *sq; 102c50de4afSAya Levin int err; 1037d91126bSEran Ben Elisha 104c50de4afSAya Levin sq = ctx; 105c50de4afSAya Levin eq = sq->cq.mcq.eq; 106c50de4afSAya Levin err = mlx5e_health_channel_eq_recover(eq, sq->channel); 107c50de4afSAya Levin if (err) 1087d91126bSEran Ben Elisha clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 1097d91126bSEran Ben Elisha 110c50de4afSAya Levin return err; 1117d91126bSEran Ben Elisha } 1127d91126bSEran Ben Elisha 11306293ae4SAya Levin int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 1147d91126bSEran Ben Elisha { 115c50de4afSAya Levin struct mlx5e_priv *priv = sq->channel->priv; 116c50de4afSAya Levin char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 117c50de4afSAya Levin struct mlx5e_err_ctx err_ctx; 1187d91126bSEran Ben Elisha 119c50de4afSAya Levin err_ctx.ctx = sq; 1207d91126bSEran Ben Elisha err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 1217d91126bSEran Ben Elisha sprintf(err_str, 1227d91126bSEran Ben Elisha "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", 1237d91126bSEran Ben Elisha sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 1247d91126bSEran Ben Elisha jiffies_to_usecs(jiffies - sq->txq->trans_start)); 1257d91126bSEran Ben Elisha 126c50de4afSAya Levin return mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 1277d91126bSEran Ben Elisha } 1287d91126bSEran Ben Elisha 129de8650a8SEran Ben Elisha /* state lock cannot be grabbed within this function. 130de8650a8SEran Ben Elisha * It can cause a dead lock or a read-after-free. 131de8650a8SEran Ben Elisha */ 132c50de4afSAya Levin static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 133de8650a8SEran Ben Elisha { 134c50de4afSAya Levin return err_ctx->recover(err_ctx->ctx); 135de8650a8SEran Ben Elisha } 136de8650a8SEran Ben Elisha 137de8650a8SEran Ben Elisha static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 138e7a98105SJiri Pirko void *context, 139e7a98105SJiri Pirko struct netlink_ext_ack *extack) 140de8650a8SEran Ben Elisha { 141de8650a8SEran Ben Elisha struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 142c50de4afSAya Levin struct mlx5e_err_ctx *err_ctx = context; 143de8650a8SEran Ben Elisha 144de8650a8SEran Ben Elisha return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 145c50de4afSAya Levin mlx5e_health_recover_channels(priv); 146de8650a8SEran Ben Elisha } 147de8650a8SEran Ben Elisha 148de8650a8SEran Ben Elisha static int 149de8650a8SEran Ben Elisha mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 1502d708887SAya Levin struct mlx5e_txqsq *sq, int tc) 151de8650a8SEran Ben Elisha { 152dd921fd2SAya Levin struct mlx5e_priv *priv = sq->channel->priv; 153dd921fd2SAya Levin bool stopped = netif_xmit_stopped(sq->txq); 154dd921fd2SAya Levin u8 state; 155de8650a8SEran Ben Elisha int err; 156de8650a8SEran Ben Elisha 157dd921fd2SAya Levin err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 158dd921fd2SAya Levin if (err) 159dd921fd2SAya Levin return err; 160dd921fd2SAya Levin 161de8650a8SEran Ben Elisha err = devlink_fmsg_obj_nest_start(fmsg); 162de8650a8SEran Ben Elisha if (err) 163de8650a8SEran Ben Elisha return err; 164de8650a8SEran Ben Elisha 1652d708887SAya Levin err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 1662d708887SAya Levin if (err) 1672d708887SAya Levin return err; 1682d708887SAya Levin 1692d708887SAya Levin err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 1702d708887SAya Levin if (err) 1712d708887SAya Levin return err; 1722d708887SAya Levin 1732d708887SAya Levin err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 1742d708887SAya Levin if (err) 1752d708887SAya Levin return err; 1762d708887SAya Levin 177dd921fd2SAya Levin err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 178de8650a8SEran Ben Elisha if (err) 179de8650a8SEran Ben Elisha return err; 180de8650a8SEran Ben Elisha 181de8650a8SEran Ben Elisha err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 182de8650a8SEran Ben Elisha if (err) 183de8650a8SEran Ben Elisha return err; 184de8650a8SEran Ben Elisha 185de8650a8SEran Ben Elisha err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 186de8650a8SEran Ben Elisha if (err) 187de8650a8SEran Ben Elisha return err; 188de8650a8SEran Ben Elisha 1892d708887SAya Levin err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 1902d708887SAya Levin if (err) 1912d708887SAya Levin return err; 1922d708887SAya Levin 1932d708887SAya Levin err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 1942d708887SAya Levin if (err) 1952d708887SAya Levin return err; 1962d708887SAya Levin 1972bf09e60SAya Levin err = mlx5e_reporter_cq_diagnose(&sq->cq, fmsg); 1982bf09e60SAya Levin if (err) 1992bf09e60SAya Levin return err; 2002bf09e60SAya Levin 201de8650a8SEran Ben Elisha err = devlink_fmsg_obj_nest_end(fmsg); 202de8650a8SEran Ben Elisha if (err) 203de8650a8SEran Ben Elisha return err; 204de8650a8SEran Ben Elisha 205de8650a8SEran Ben Elisha return 0; 206de8650a8SEran Ben Elisha } 207de8650a8SEran Ben Elisha 208de8650a8SEran Ben Elisha static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 209e7a98105SJiri Pirko struct devlink_fmsg *fmsg, 210e7a98105SJiri Pirko struct netlink_ext_ack *extack) 211de8650a8SEran Ben Elisha { 212de8650a8SEran Ben Elisha struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 2132d708887SAya Levin struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 2142d708887SAya Levin u32 sq_stride, sq_sz; 2152d708887SAya Levin 2162d708887SAya Levin int i, tc, err = 0; 217de8650a8SEran Ben Elisha 218de8650a8SEran Ben Elisha mutex_lock(&priv->state_lock); 219de8650a8SEran Ben Elisha 220de8650a8SEran Ben Elisha if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 221de8650a8SEran Ben Elisha goto unlock; 222de8650a8SEran Ben Elisha 2232d708887SAya Levin sq_sz = mlx5_wq_cyc_get_size(&generic_sq->wq); 2242d708887SAya Levin sq_stride = MLX5_SEND_WQE_BB; 2252d708887SAya Levin 2262d708887SAya Levin err = mlx5e_reporter_named_obj_nest_start(fmsg, "Common Config"); 2272d708887SAya Levin if (err) 2282d708887SAya Levin goto unlock; 2292d708887SAya Levin 2302d708887SAya Levin err = mlx5e_reporter_named_obj_nest_start(fmsg, "SQ"); 2312d708887SAya Levin if (err) 2322d708887SAya Levin goto unlock; 2332d708887SAya Levin 2342d708887SAya Levin err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 2352d708887SAya Levin if (err) 2362d708887SAya Levin goto unlock; 2372d708887SAya Levin 2382d708887SAya Levin err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 2392d708887SAya Levin if (err) 2402d708887SAya Levin goto unlock; 2412d708887SAya Levin 2422bf09e60SAya Levin err = mlx5e_reporter_cq_common_diagnose(&generic_sq->cq, fmsg); 2432bf09e60SAya Levin if (err) 2442bf09e60SAya Levin goto unlock; 2452bf09e60SAya Levin 2462d708887SAya Levin err = mlx5e_reporter_named_obj_nest_end(fmsg); 2472d708887SAya Levin if (err) 2482d708887SAya Levin goto unlock; 2492d708887SAya Levin 2502d708887SAya Levin err = mlx5e_reporter_named_obj_nest_end(fmsg); 2512d708887SAya Levin if (err) 2522d708887SAya Levin goto unlock; 2532d708887SAya Levin 254de8650a8SEran Ben Elisha err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 255de8650a8SEran Ben Elisha if (err) 256de8650a8SEran Ben Elisha goto unlock; 257de8650a8SEran Ben Elisha 2582d708887SAya Levin for (i = 0; i < priv->channels.num; i++) { 2592d708887SAya Levin struct mlx5e_channel *c = priv->channels.c[i]; 260de8650a8SEran Ben Elisha 2612d708887SAya Levin for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 2622d708887SAya Levin struct mlx5e_txqsq *sq = &c->sq[tc]; 2632d708887SAya Levin 2642d708887SAya Levin err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 265de8650a8SEran Ben Elisha if (err) 26699d31cbdSAya Levin goto unlock; 267de8650a8SEran Ben Elisha } 2682d708887SAya Levin } 269de8650a8SEran Ben Elisha err = devlink_fmsg_arr_pair_nest_end(fmsg); 270de8650a8SEran Ben Elisha if (err) 271de8650a8SEran Ben Elisha goto unlock; 272de8650a8SEran Ben Elisha 273de8650a8SEran Ben Elisha unlock: 274de8650a8SEran Ben Elisha mutex_unlock(&priv->state_lock); 275de8650a8SEran Ben Elisha return err; 276de8650a8SEran Ben Elisha } 277de8650a8SEran Ben Elisha 278de8650a8SEran Ben Elisha static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 279de8650a8SEran Ben Elisha .name = "tx", 280de8650a8SEran Ben Elisha .recover = mlx5e_tx_reporter_recover, 281de8650a8SEran Ben Elisha .diagnose = mlx5e_tx_reporter_diagnose, 282de8650a8SEran Ben Elisha }; 283de8650a8SEran Ben Elisha 284de8650a8SEran Ben Elisha #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 285de8650a8SEran Ben Elisha 28606293ae4SAya Levin int mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 287de8650a8SEran Ben Elisha { 288baf6dfdbSAya Levin struct devlink_health_reporter *reporter; 289de8650a8SEran Ben Elisha struct mlx5_core_dev *mdev = priv->mdev; 290c50de4afSAya Levin struct devlink *devlink; 291de8650a8SEran Ben Elisha 292c50de4afSAya Levin devlink = priv_to_devlink(mdev); 293baf6dfdbSAya Levin reporter = 294de8650a8SEran Ben Elisha devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, 295de8650a8SEran Ben Elisha MLX5_REPORTER_TX_GRACEFUL_PERIOD, 296de8650a8SEran Ben Elisha true, priv); 297baf6dfdbSAya Levin if (IS_ERR(reporter)) { 298de8650a8SEran Ben Elisha netdev_warn(priv->netdev, 299de8650a8SEran Ben Elisha "Failed to create tx reporter, err = %ld\n", 300baf6dfdbSAya Levin PTR_ERR(reporter)); 301baf6dfdbSAya Levin return PTR_ERR(reporter); 3027f7cc235SAya Levin } 303baf6dfdbSAya Levin priv->tx_reporter = reporter; 3047f7cc235SAya Levin return 0; 305de8650a8SEran Ben Elisha } 306de8650a8SEran Ben Elisha 30706293ae4SAya Levin void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 308de8650a8SEran Ben Elisha { 309baf6dfdbSAya Levin if (!priv->tx_reporter) 310de8650a8SEran Ben Elisha return; 311de8650a8SEran Ben Elisha 312de8650a8SEran Ben Elisha devlink_health_reporter_destroy(priv->tx_reporter); 313de8650a8SEran Ben Elisha } 314