xref: /openbmc/linux/drivers/net/ethernet/mellanox/mlx5/core/en/health.c (revision 8be98d2f2a0a262f8bf8a0bc1fdf522b3c7aab17)
1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2019 Mellanox Technologies.
3 
4 #include "health.h"
5 #include "lib/eq.h"
6 #include "lib/mlx5.h"
7 
/*
 * Start a pair nest called @name on @fmsg and then start an object nest
 * right after it.
 *
 * Returns 0 when both devlink calls succeed, otherwise the non-zero value
 * returned by the first call that failed.
 */
int mlx5e_health_fmsg_named_obj_nest_start(struct devlink_fmsg *fmsg, char *name)
{
	int ret = devlink_fmsg_pair_nest_start(fmsg, name);

	if (ret)
		return ret;

	return devlink_fmsg_obj_nest_start(fmsg);
}
22 
/*
 * End the object nest on @fmsg and then end the pair nest, i.e. the two
 * calls made by mlx5e_health_fmsg_named_obj_nest_start() in reverse order.
 *
 * Returns 0 when both devlink calls succeed, otherwise the non-zero value
 * returned by the first call that failed.
 */
int mlx5e_health_fmsg_named_obj_nest_end(struct devlink_fmsg *fmsg)
{
	int ret = devlink_fmsg_obj_nest_end(fmsg);

	if (ret)
		return ret;

	return devlink_fmsg_pair_nest_end(fmsg);
}
37 
mlx5e_health_cq_diag_fmsg(struct mlx5e_cq * cq,struct devlink_fmsg * fmsg)38 int mlx5e_health_cq_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg)
39 {
40 	u32 out[MLX5_ST_SZ_DW(query_cq_out)] = {};
41 	u8 hw_status;
42 	void *cqc;
43 	int err;
44 
45 	err = mlx5_core_query_cq(cq->mdev, &cq->mcq, out);
46 	if (err)
47 		return err;
48 
49 	cqc = MLX5_ADDR_OF(query_cq_out, out, cq_context);
50 	hw_status = MLX5_GET(cqc, cqc, status);
51 
52 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ");
53 	if (err)
54 		return err;
55 
56 	err = devlink_fmsg_u32_pair_put(fmsg, "cqn", cq->mcq.cqn);
57 	if (err)
58 		return err;
59 
60 	err = devlink_fmsg_u8_pair_put(fmsg, "HW status", hw_status);
61 	if (err)
62 		return err;
63 
64 	err = devlink_fmsg_u32_pair_put(fmsg, "ci", mlx5_cqwq_get_ci(&cq->wq));
65 	if (err)
66 		return err;
67 
68 	err = devlink_fmsg_u32_pair_put(fmsg, "size", mlx5_cqwq_get_size(&cq->wq));
69 	if (err)
70 		return err;
71 
72 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
73 	if (err)
74 		return err;
75 
76 	return 0;
77 }
78 
mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq * cq,struct devlink_fmsg * fmsg)79 int mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg)
80 {
81 	u8 cq_log_stride;
82 	u32 cq_sz;
83 	int err;
84 
85 	cq_sz = mlx5_cqwq_get_size(&cq->wq);
86 	cq_log_stride = mlx5_cqwq_get_log_stride_size(&cq->wq);
87 
88 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ");
89 	if (err)
90 		return err;
91 
92 	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", BIT(cq_log_stride));
93 	if (err)
94 		return err;
95 
96 	err = devlink_fmsg_u32_pair_put(fmsg, "size", cq_sz);
97 	if (err)
98 		return err;
99 
100 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
101 	if (err)
102 		return err;
103 
104 	return 0;
105 }
106 
mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp * eq,struct devlink_fmsg * fmsg)107 int mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp *eq, struct devlink_fmsg *fmsg)
108 {
109 	int err;
110 
111 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "EQ");
112 	if (err)
113 		return err;
114 
115 	err = devlink_fmsg_u8_pair_put(fmsg, "eqn", eq->core.eqn);
116 	if (err)
117 		return err;
118 
119 	err = devlink_fmsg_u32_pair_put(fmsg, "irqn", eq->core.irqn);
120 	if (err)
121 		return err;
122 
123 	err = devlink_fmsg_u32_pair_put(fmsg, "vecidx", eq->core.vecidx);
124 	if (err)
125 		return err;
126 
127 	err = devlink_fmsg_u32_pair_put(fmsg, "ci", eq->core.cons_index);
128 	if (err)
129 		return err;
130 
131 	err = devlink_fmsg_u32_pair_put(fmsg, "size", eq_get_size(&eq->core));
132 	if (err)
133 		return err;
134 
135 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
136 }
137 
/* Create the TX reporter and then the RX reporter for @priv. */
void mlx5e_health_create_reporters(struct mlx5e_priv *priv)
{
	mlx5e_reporter_tx_create(priv);
	mlx5e_reporter_rx_create(priv);
}
143 
/*
 * Destroy the RX reporter and then the TX reporter for @priv, i.e. in the
 * reverse of the order used by mlx5e_health_create_reporters().
 */
void mlx5e_health_destroy_reporters(struct mlx5e_priv *priv)
{
	mlx5e_reporter_rx_destroy(priv);
	mlx5e_reporter_tx_destroy(priv);
}
149 
mlx5e_health_channels_update(struct mlx5e_priv * priv)150 void mlx5e_health_channels_update(struct mlx5e_priv *priv)
151 {
152 	if (priv->tx_reporter)
153 		devlink_health_reporter_state_update(priv->tx_reporter,
154 						     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
155 	if (priv->rx_reporter)
156 		devlink_health_reporter_state_update(priv->rx_reporter,
157 						     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
158 }
159 
/*
 * Move send queue @sqn from the error state to the ready state in two
 * mlx5e_modify_sq() steps: ERR -> RST, then RST -> RDY.
 *
 * A failing step is logged on @dev and its return value is returned
 * immediately; the second step is not attempted if the first one fails.
 * Returns 0 when both transitions succeed.
 */
int mlx5e_health_sq_to_ready(struct mlx5_core_dev *mdev, struct net_device *dev, u32 sqn)
{
	struct mlx5e_modify_sq_param to_rst = {
		.curr_state = MLX5_SQC_STATE_ERR,
		.next_state = MLX5_SQC_STATE_RST,
	};
	struct mlx5e_modify_sq_param to_rdy = {
		.curr_state = MLX5_SQC_STATE_RST,
		.next_state = MLX5_SQC_STATE_RDY,
	};
	int ret;

	ret = mlx5e_modify_sq(mdev, sqn, &to_rst);
	if (ret) {
		netdev_err(dev, "Failed to move sq 0x%x to reset\n", sqn);
		return ret;
	}

	ret = mlx5e_modify_sq(mdev, sqn, &to_rdy);
	if (ret) {
		netdev_err(dev, "Failed to move sq 0x%x to ready\n", sqn);
		return ret;
	}

	return 0;
}
186 
mlx5e_health_recover_channels(struct mlx5e_priv * priv)187 int mlx5e_health_recover_channels(struct mlx5e_priv *priv)
188 {
189 	int err = 0;
190 
191 	rtnl_lock();
192 	mutex_lock(&priv->state_lock);
193 
194 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
195 		goto out;
196 
197 	err = mlx5e_safe_reopen_channels(priv);
198 
199 out:
200 	mutex_unlock(&priv->state_lock);
201 	rtnl_unlock();
202 
203 	return err;
204 }
205 
mlx5e_health_channel_eq_recover(struct net_device * dev,struct mlx5_eq_comp * eq,struct mlx5e_ch_stats * stats)206 int mlx5e_health_channel_eq_recover(struct net_device *dev, struct mlx5_eq_comp *eq,
207 				    struct mlx5e_ch_stats *stats)
208 {
209 	u32 eqe_count;
210 
211 	netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
212 		   eq->core.eqn, eq->core.cons_index, eq->core.irqn);
213 
214 	eqe_count = mlx5_eq_poll_irq_disabled(eq);
215 	if (!eqe_count)
216 		return -EIO;
217 
218 	netdev_err(dev, "Recovered %d eqes on EQ 0x%x\n",
219 		   eqe_count, eq->core.eqn);
220 
221 	stats->eq_rearm++;
222 	return 0;
223 }
224 
mlx5e_health_report(struct mlx5e_priv * priv,struct devlink_health_reporter * reporter,char * err_str,struct mlx5e_err_ctx * err_ctx)225 int mlx5e_health_report(struct mlx5e_priv *priv,
226 			struct devlink_health_reporter *reporter, char *err_str,
227 			struct mlx5e_err_ctx *err_ctx)
228 {
229 	netdev_err(priv->netdev, "%s\n", err_str);
230 
231 	if (!reporter)
232 		return err_ctx->recover(err_ctx->ctx);
233 
234 	return devlink_health_report(reporter, err_str, err_ctx);
235 }
236 
237 #define MLX5_HEALTH_DEVLINK_MAX_SIZE 1024
mlx5e_health_rsc_fmsg_binary(struct devlink_fmsg * fmsg,const void * value,u32 value_len)238 static int mlx5e_health_rsc_fmsg_binary(struct devlink_fmsg *fmsg,
239 					const void *value, u32 value_len)
240 
241 {
242 	u32 data_size;
243 	int err = 0;
244 	u32 offset;
245 
246 	for (offset = 0; offset < value_len; offset += data_size) {
247 		data_size = value_len - offset;
248 		if (data_size > MLX5_HEALTH_DEVLINK_MAX_SIZE)
249 			data_size = MLX5_HEALTH_DEVLINK_MAX_SIZE;
250 		err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
251 		if (err)
252 			break;
253 	}
254 	return err;
255 }
256 
mlx5e_health_rsc_fmsg_dump(struct mlx5e_priv * priv,struct mlx5_rsc_key * key,struct devlink_fmsg * fmsg)257 int mlx5e_health_rsc_fmsg_dump(struct mlx5e_priv *priv, struct mlx5_rsc_key *key,
258 			       struct devlink_fmsg *fmsg)
259 {
260 	struct mlx5_core_dev *mdev = priv->mdev;
261 	struct mlx5_rsc_dump_cmd *cmd;
262 	struct page *page;
263 	int cmd_err, err;
264 	int end_err;
265 	int size;
266 
267 	if (IS_ERR_OR_NULL(mdev->rsc_dump))
268 		return -EOPNOTSUPP;
269 
270 	page = alloc_page(GFP_KERNEL);
271 	if (!page)
272 		return -ENOMEM;
273 
274 	err = devlink_fmsg_binary_pair_nest_start(fmsg, "data");
275 	if (err)
276 		goto free_page;
277 
278 	cmd = mlx5_rsc_dump_cmd_create(mdev, key);
279 	if (IS_ERR(cmd)) {
280 		err = PTR_ERR(cmd);
281 		goto free_page;
282 	}
283 
284 	do {
285 		cmd_err = mlx5_rsc_dump_next(mdev, cmd, page, &size);
286 		if (cmd_err < 0) {
287 			err = cmd_err;
288 			goto destroy_cmd;
289 		}
290 
291 		err = mlx5e_health_rsc_fmsg_binary(fmsg, page_address(page), size);
292 		if (err)
293 			goto destroy_cmd;
294 
295 	} while (cmd_err > 0);
296 
297 destroy_cmd:
298 	mlx5_rsc_dump_cmd_destroy(cmd);
299 	end_err = devlink_fmsg_binary_pair_nest_end(fmsg);
300 	if (end_err)
301 		err = end_err;
302 free_page:
303 	__free_page(page);
304 	return err;
305 }
306 
mlx5e_health_queue_dump(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg,int queue_idx,char * lbl)307 int mlx5e_health_queue_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
308 			    int queue_idx, char *lbl)
309 {
310 	struct mlx5_rsc_key key = {};
311 	int err;
312 
313 	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
314 	key.index1 = queue_idx;
315 	key.size = PAGE_SIZE;
316 	key.num_of_obj1 = 1;
317 
318 	err = devlink_fmsg_obj_nest_start(fmsg);
319 	if (err)
320 		return err;
321 
322 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, lbl);
323 	if (err)
324 		return err;
325 
326 	err = devlink_fmsg_u32_pair_put(fmsg, "index", queue_idx);
327 	if (err)
328 		return err;
329 
330 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
331 	if (err)
332 		return err;
333 
334 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
335 	if (err)
336 		return err;
337 
338 	return devlink_fmsg_obj_nest_end(fmsg);
339 }
340