1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <linux/virtio_config.h>
11 #include <linux/auxiliary_bus.h>
12 #include <linux/mlx5/cq.h>
13 #include <linux/mlx5/qp.h>
14 #include <linux/mlx5/device.h>
15 #include <linux/mlx5/driver.h>
16 #include <linux/mlx5/vport.h>
17 #include <linux/mlx5/fs.h>
18 #include <linux/mlx5/mlx5_ifc_vdpa.h>
19 #include <linux/mlx5/mpfs.h>
20 #include "mlx5_vdpa.h"
21 #include "mlx5_vnet.h"
22 
23 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
24 MODULE_DESCRIPTION("Mellanox VDPA driver");
25 MODULE_LICENSE("Dual BSD/GPL");
26 
27 #define VALID_FEATURES_MASK                                                                        \
28 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
29 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
30 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
31 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
32 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
33 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
34 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
35 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
36 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
37 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
38 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
39 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
40 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
41 
42 #define VALID_STATUS_MASK                                                                          \
43 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
44 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
45 
46 #define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
47 
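/* VLAN IDs are only 12 bits wide, so 0x1000 falls outside the valid range and
 * is used to mark untagged entries in the mac/vlan steering keys.
 */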
48 #define MLX5V_UNTAGGED 0x1000
49 
50 struct mlx5_vdpa_cq_buf {
51 	struct mlx5_frag_buf_ctrl fbc;
52 	struct mlx5_frag_buf frag_buf;
53 	int cqe_size;
54 	int nent;
55 };
56 
57 struct mlx5_vdpa_cq {
58 	struct mlx5_core_cq mcq;
59 	struct mlx5_vdpa_cq_buf buf;
60 	struct mlx5_db db;
61 	int cqe;
62 };
63 
64 struct mlx5_vdpa_umem {
65 	struct mlx5_frag_buf_ctrl fbc;
66 	struct mlx5_frag_buf frag_buf;
67 	int size;
68 	u32 id;
69 };
70 
71 struct mlx5_vdpa_qp {
72 	struct mlx5_core_qp mqp;
73 	struct mlx5_frag_buf frag_buf;
74 	struct mlx5_db db;
75 	u16 head;
76 	bool fw;
77 };
78 
79 struct mlx5_vq_restore_info {
80 	u32 num_ent;
81 	u64 desc_addr;
82 	u64 device_addr;
83 	u64 driver_addr;
84 	u16 avail_index;
85 	u16 used_index;
86 	struct msi_map map;
87 	bool ready;
88 	bool restore;
89 };
90 
91 struct mlx5_vdpa_virtqueue {
92 	bool ready;
93 	u64 desc_addr;
94 	u64 device_addr;
95 	u64 driver_addr;
96 	u32 num_ent;
97 
98 	/* Resources for implementing the notification channel from the device
99 	 * to the driver. fwqp is the firmware end of an RC connection; the
100 	 * other end is vqqp used by the driver. cq is where completions are
101 	 * reported.
102 	 */
103 	struct mlx5_vdpa_cq cq;
104 	struct mlx5_vdpa_qp fwqp;
105 	struct mlx5_vdpa_qp vqqp;
106 
107 	/* umem resources are required for the virtqueue operation. Their use
108 	 * is internal and they must be provided by the driver.
109 	 */
110 	struct mlx5_vdpa_umem umem1;
111 	struct mlx5_vdpa_umem umem2;
112 	struct mlx5_vdpa_umem umem3;
113 
114 	u32 counter_set_id;
115 	bool initialized;
116 	int index;
117 	u32 virtq_id;
118 	struct mlx5_vdpa_net *ndev;
119 	u16 avail_idx;
120 	u16 used_idx;
121 	int fw_state;
122 	struct msi_map map;
123 
124 	/* keep last in the struct */
125 	struct mlx5_vq_restore_info ri;
126 };
127 
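/* Without VIRTIO_NET_F_MQ only one RX/TX pair exists, so indices 0 and 1 are
 * valid, plus index 2 for the control VQ when VIRTIO_NET_F_CTRL_VQ is
 * negotiated. With MQ, any index up to max_idx is valid.
 */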
128 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
129 {
130 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
131 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
132 			return idx < 2;
133 		else
134 			return idx < 3;
135 	}
136 
137 	return idx <= mvdev->max_idx;
138 }
139 
140 static void free_resources(struct mlx5_vdpa_net *ndev);
141 static void init_mvqs(struct mlx5_vdpa_net *ndev);
142 static int setup_driver(struct mlx5_vdpa_dev *mvdev);
143 static void teardown_driver(struct mlx5_vdpa_net *ndev);
144 
145 static bool mlx5_vdpa_debug;
146 
147 #define MLX5_CVQ_MAX_ENT 16
148 
149 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
150 	do {                                                                                       \
151 		if (features & BIT_ULL(_feature))                                                  \
152 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
153 	} while (0)
154 
155 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
156 	do {                                                                                       \
157 		if (status & (_status))                                                            \
158 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
159 	} while (0)
160 
161 /* TODO: cross-endian support */
162 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
163 {
164 	return virtio_legacy_is_little_endian() ||
165 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
166 }
167 
168 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
169 {
170 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
171 }
172 
173 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
174 {
175 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
176 }
177 
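/* The control virtqueue comes right after the data virtqueues: index 2 when
 * VIRTIO_NET_F_MQ is not negotiated (a single RX/TX pair), otherwise
 * mvdev->max_vqs.
 */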
178 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
179 {
180 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
181 		return 2;
182 
183 	return mvdev->max_vqs;
184 }
185 
186 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
187 {
188 	return idx == ctrl_vq_idx(mvdev);
189 }
190 
191 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
192 {
193 	if (status & ~VALID_STATUS_MASK)
194 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
195 			       status & ~VALID_STATUS_MASK);
196 
197 	if (!mlx5_vdpa_debug)
198 		return;
199 
200 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
201 	if (set && !status) {
202 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
203 		return;
204 	}
205 
206 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
207 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
208 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
209 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
210 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
211 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
212 }
213 
214 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
215 {
216 	if (features & ~VALID_FEATURES_MASK)
217 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
218 			       features & ~VALID_FEATURES_MASK);
219 
220 	if (!mlx5_vdpa_debug)
221 		return;
222 
223 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
224 	if (!features)
225 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
226 
227 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
228 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
229 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
230 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
231 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
232 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
233 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
234 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
235 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
236 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
237 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
238 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
239 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
240 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
241 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
242 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
243 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
244 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
245 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
246 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
247 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
248 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
249 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
250 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
251 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
252 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
253 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
254 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
255 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
256 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
257 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
258 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
259 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
260 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
261 }
262 
263 static int create_tis(struct mlx5_vdpa_net *ndev)
264 {
265 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
266 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
267 	void *tisc;
268 	int err;
269 
270 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
271 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
272 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
273 	if (err)
274 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
275 
276 	return err;
277 }
278 
279 static void destroy_tis(struct mlx5_vdpa_net *ndev)
280 {
281 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
282 }
283 
284 #define MLX5_VDPA_CQE_SIZE 64
285 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
286 
287 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
288 {
289 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
290 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
291 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
292 	int err;
293 
294 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
295 				       ndev->mvdev.mdev->priv.numa_node);
296 	if (err)
297 		return err;
298 
299 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
300 
301 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
302 	buf->nent = nent;
303 
304 	return 0;
305 }
306 
307 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
308 {
309 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
310 
311 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
312 					ndev->mvdev.mdev->priv.numa_node);
313 }
314 
315 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
316 {
317 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
318 }
319 
320 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
321 {
322 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
323 }
324 
325 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
326 {
327 	struct mlx5_cqe64 *cqe64;
328 	void *cqe;
329 	int i;
330 
331 	for (i = 0; i < buf->nent; i++) {
332 		cqe = get_cqe(vcq, i);
333 		cqe64 = cqe;
334 		cqe64->op_own = MLX5_CQE_INVALID << 4;
335 	}
336 }
337 
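/* Return the CQE at position n only if it is owned by software: its opcode
 * must be valid and its ownership bit must match the current pass over the
 * ring (the bit flips each time the consumer index wraps around cq->cqe
 * entries).
 */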
338 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
339 {
340 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
341 
342 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
343 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
344 		return cqe64;
345 
346 	return NULL;
347 }
348 
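/* Advance the receive queue head by n entries and publish it through the
 * doorbell record so the hardware sees the newly posted receive entries.
 */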
349 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
350 {
351 	vqp->head += n;
352 	vqp->db.db[0] = cpu_to_be32(vqp->head);
353 }
354 
355 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
356 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
357 {
358 	struct mlx5_vdpa_qp *vqp;
359 	__be64 *pas;
360 	void *qpc;
361 
362 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
363 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
364 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
365 	if (vqp->fw) {
366 		/* The firmware QP is allocated by the driver for the firmware's use,
367 		 * so we can skip some of the parameters as they will be chosen by the firmware.
368 		 */
369 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
370 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
371 		MLX5_SET(qpc, qpc, no_sq, 1);
372 		return;
373 	}
374 
375 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
376 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
377 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
378 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
379 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
380 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
381 	MLX5_SET(qpc, qpc, no_sq, 1);
382 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
383 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
384 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
385 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
386 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
387 }
388 
389 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
390 {
391 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
392 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
393 					ndev->mvdev.mdev->priv.numa_node);
394 }
395 
396 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
397 {
398 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
399 }
400 
401 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
402 		     struct mlx5_vdpa_qp *vqp)
403 {
404 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
405 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
406 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
407 	void *qpc;
408 	void *in;
409 	int err;
410 
411 	if (!vqp->fw) {
412 		vqp = &mvq->vqqp;
413 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
414 		if (err)
415 			return err;
416 
417 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
418 		if (err)
419 			goto err_db;
420 		inlen += vqp->frag_buf.npages * sizeof(__be64);
421 	}
422 
423 	in = kzalloc(inlen, GFP_KERNEL);
424 	if (!in) {
425 		err = -ENOMEM;
426 		goto err_kzalloc;
427 	}
428 
429 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
430 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
431 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
432 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
433 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
434 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
435 	if (!vqp->fw)
436 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
437 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
438 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
439 	kfree(in);
440 	if (err)
441 		goto err_kzalloc;
442 
443 	vqp->mqp.uid = ndev->mvdev.res.uid;
444 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
445 
446 	if (!vqp->fw)
447 		rx_post(vqp, mvq->num_ent);
448 
449 	return 0;
450 
451 err_kzalloc:
452 	if (!vqp->fw)
453 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
454 err_db:
455 	if (!vqp->fw)
456 		rq_buf_free(ndev, vqp);
457 
458 	return err;
459 }
460 
461 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
462 {
463 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
464 
465 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
466 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
467 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
468 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
469 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
470 	if (!vqp->fw) {
471 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
472 		rq_buf_free(ndev, vqp);
473 	}
474 }
475 
476 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
477 {
478 	return get_sw_cqe(cq, cq->mcq.cons_index);
479 }
480 
481 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
482 {
483 	struct mlx5_cqe64 *cqe64;
484 
485 	cqe64 = next_cqe_sw(vcq);
486 	if (!cqe64)
487 		return -EAGAIN;
488 
489 	vcq->mcq.cons_index++;
490 	return 0;
491 }
492 
493 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
494 {
495 	struct mlx5_vdpa_net *ndev = mvq->ndev;
496 	struct vdpa_callback *event_cb;
497 
498 	event_cb = &ndev->event_cbs[mvq->index];
499 	mlx5_cq_set_ci(&mvq->cq.mcq);
500 
501 	/* make sure the CQ consumer update is visible to the hardware before
502 	 * updating the RX doorbell record.
503 	 */
504 	dma_wmb();
505 	rx_post(&mvq->vqqp, num);
506 	if (event_cb->callback)
507 		event_cb->callback(event_cb->private);
508 }
509 
510 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
511 {
512 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
513 	struct mlx5_vdpa_net *ndev = mvq->ndev;
514 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
515 	int num = 0;
516 
517 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
518 		num++;
519 		if (num > mvq->num_ent / 2) {
520 			/* If completions keep coming while we poll, we want to
521 			 * let the hardware know that we consumed them by
522 			 * updating the doorbell record. We also let the vdpa
523 			 * core know about this so it passes it on to the virtio
524 			 * driver in the guest.
525 			 */
526 			mlx5_vdpa_handle_completions(mvq, num);
527 			num = 0;
528 		}
529 	}
530 
531 	if (num)
532 		mlx5_vdpa_handle_completions(mvq, num);
533 
534 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
535 }
536 
537 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
538 {
539 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
540 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
541 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
542 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
543 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
544 	__be64 *pas;
545 	int inlen;
546 	void *cqc;
547 	void *in;
548 	int err;
549 	int eqn;
550 
551 	err = mlx5_db_alloc(mdev, &vcq->db);
552 	if (err)
553 		return err;
554 
555 	vcq->mcq.set_ci_db = vcq->db.db;
556 	vcq->mcq.arm_db = vcq->db.db + 1;
557 	vcq->mcq.cqe_sz = 64;
558 
559 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
560 	if (err)
561 		goto err_db;
562 
563 	cq_frag_buf_init(vcq, &vcq->buf);
564 
565 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
566 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
567 	in = kzalloc(inlen, GFP_KERNEL);
568 	if (!in) {
569 		err = -ENOMEM;
570 		goto err_vzalloc;
571 	}
572 
573 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
574 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
575 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
576 
577 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
578 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
579 
580 	/* Use vector 0 by default. Consider adding code to choose the least
581 	 * used vector.
582 	 */
583 	err = mlx5_comp_eqn_get(mdev, 0, &eqn);
584 	if (err)
585 		goto err_vec;
586 
587 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
588 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
589 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
590 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
591 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
592 
593 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
594 	if (err)
595 		goto err_vec;
596 
597 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
598 	vcq->cqe = num_ent;
599 	vcq->mcq.set_ci_db = vcq->db.db;
600 	vcq->mcq.arm_db = vcq->db.db + 1;
601 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
602 	kfree(in);
603 	return 0;
604 
605 err_vec:
606 	kfree(in);
607 err_vzalloc:
608 	cq_frag_buf_free(ndev, &vcq->buf);
609 err_db:
610 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
611 	return err;
612 }
613 
614 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
615 {
616 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
617 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
618 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
619 
620 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
621 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
622 		return;
623 	}
624 	cq_frag_buf_free(ndev, &vcq->buf);
625 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
626 }
627 
628 static int read_umem_params(struct mlx5_vdpa_net *ndev)
629 {
630 	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
631 	u16 opmod = (MLX5_CAP_VDPA_EMULATION << 1) | (HCA_CAP_OPMOD_GET_CUR & 0x01);
632 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
633 	int out_size;
634 	void *caps;
635 	void *out;
636 	int err;
637 
638 	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
639 	out = kzalloc(out_size, GFP_KERNEL);
640 	if (!out)
641 		return -ENOMEM;
642 
643 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
644 	MLX5_SET(query_hca_cap_in, in, op_mod, opmod);
645 	err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
646 	if (err) {
647 		mlx5_vdpa_warn(&ndev->mvdev,
648 			"Failed reading vdpa umem capabilities with err %d\n", err);
649 		goto out;
650 	}
651 
652 	caps = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
653 
654 	ndev->umem_1_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_a);
655 	ndev->umem_1_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_b);
656 
657 	ndev->umem_2_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_a);
658 	ndev->umem_2_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_b);
659 
660 	ndev->umem_3_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_a);
661 	ndev->umem_3_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_b);
662 
663 out:
664 	kfree(out);
665 	return 0;
666 }
667 
668 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
669 			  struct mlx5_vdpa_umem **umemp)
670 {
671 	u32 p_a;
672 	u32 p_b;
673 
674 	switch (num) {
675 	case 1:
676 		p_a = ndev->umem_1_buffer_param_a;
677 		p_b = ndev->umem_1_buffer_param_b;
678 		*umemp = &mvq->umem1;
679 		break;
680 	case 2:
681 		p_a = ndev->umem_2_buffer_param_a;
682 		p_b = ndev->umem_2_buffer_param_b;
683 		*umemp = &mvq->umem2;
684 		break;
685 	case 3:
686 		p_a = ndev->umem_3_buffer_param_a;
687 		p_b = ndev->umem_3_buffer_param_b;
688 		*umemp = &mvq->umem3;
689 		break;
690 	}
691 
692 	(*umemp)->size = p_a * mvq->num_ent + p_b;
693 }
694 
695 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
696 {
697 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
698 }
699 
700 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
701 {
702 	int inlen;
703 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
704 	void *um;
705 	void *in;
706 	int err;
707 	__be64 *pas;
708 	struct mlx5_vdpa_umem *umem;
709 
710 	set_umem_size(ndev, mvq, num, &umem);
711 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
712 	if (err)
713 		return err;
714 
715 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
716 
717 	in = kzalloc(inlen, GFP_KERNEL);
718 	if (!in) {
719 		err = -ENOMEM;
720 		goto err_in;
721 	}
722 
723 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
724 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
725 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
726 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
727 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
728 
729 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
730 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
731 
732 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
733 	if (err) {
734 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
735 		goto err_cmd;
736 	}
737 
738 	kfree(in);
739 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
740 
741 	return 0;
742 
743 err_cmd:
744 	kfree(in);
745 err_in:
746 	umem_frag_buf_free(ndev, umem);
747 	return err;
748 }
749 
750 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
751 {
752 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
753 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
754 	struct mlx5_vdpa_umem *umem;
755 
756 	switch (num) {
757 	case 1:
758 		umem = &mvq->umem1;
759 		break;
760 	case 2:
761 		umem = &mvq->umem2;
762 		break;
763 	case 3:
764 		umem = &mvq->umem3;
765 		break;
766 	}
767 
768 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
769 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
770 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
771 		return;
772 
773 	umem_frag_buf_free(ndev, umem);
774 }
775 
776 static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
777 {
778 	int num;
779 	int err;
780 
781 	for (num = 1; num <= 3; num++) {
782 		err = create_umem(ndev, mvq, num);
783 		if (err)
784 			goto err_umem;
785 	}
786 	return 0;
787 
788 err_umem:
789 	for (num--; num > 0; num--)
790 		umem_destroy(ndev, mvq, num);
791 
792 	return err;
793 }
794 
795 static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
796 {
797 	int num;
798 
799 	for (num = 3; num > 0; num--)
800 		umem_destroy(ndev, mvq, num);
801 }
802 
803 static int get_queue_type(struct mlx5_vdpa_net *ndev)
804 {
805 	u32 type_mask;
806 
807 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
808 
809 	/* prefer split queue */
810 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
811 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
812 
813 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
814 
815 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
816 }
817 
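/* virtio-net interleaves queues as receiveq0, transmitq0, receiveq1,
 * transmitq1, ...; odd indices are TX.
 */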
818 static bool vq_is_tx(u16 idx)
819 {
820 	return idx % 2;
821 }
822 
823 enum {
824 	MLX5_VIRTIO_NET_F_MRG_RXBUF = 2,
825 	MLX5_VIRTIO_NET_F_HOST_ECN = 4,
826 	MLX5_VIRTIO_NET_F_GUEST_ECN = 6,
827 	MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7,
828 	MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8,
829 	MLX5_VIRTIO_NET_F_GUEST_CSUM = 9,
830 	MLX5_VIRTIO_NET_F_CSUM = 10,
831 	MLX5_VIRTIO_NET_F_HOST_TSO6 = 11,
832 	MLX5_VIRTIO_NET_F_HOST_TSO4 = 12,
833 };
834 
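/* Translate the negotiated virtio-net offload feature bits into the bit
 * layout consumed by the virtio_net_q object's queue_feature_bit_mask_12_3
 * and queue_feature_bit_mask_2_0 fields.
 */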
835 static u16 get_features(u64 features)
836 {
837 	return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) |
838 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) |
839 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) |
840 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) |
841 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) |
842 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) |
843 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) |
844 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4);
845 }
846 
847 static bool counters_supported(const struct mlx5_vdpa_dev *mvdev)
848 {
849 	return MLX5_CAP_GEN_64(mvdev->mdev, general_obj_types) &
850 	       BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
851 }
852 
853 static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev)
854 {
855 	return MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) &
856 		(1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE) &&
857 		pci_msix_can_alloc_dyn(mvdev->mdev->pdev);
858 }
859 
860 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
861 {
862 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
863 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
864 	void *obj_context;
865 	u16 mlx_features;
866 	void *cmd_hdr;
867 	void *vq_ctx;
868 	void *in;
869 	int err;
870 
871 	err = umems_create(ndev, mvq);
872 	if (err)
873 		return err;
874 
875 	in = kzalloc(inlen, GFP_KERNEL);
876 	if (!in) {
877 		err = -ENOMEM;
878 		goto err_alloc;
879 	}
880 
881 	mlx_features = get_features(ndev->mvdev.actual_features);
882 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
883 
884 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
885 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
886 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
887 
888 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
889 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
890 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
891 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
892 		 mlx_features >> 3);
893 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
894 		 mlx_features & 7);
895 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
896 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
897 
898 	if (vq_is_tx(mvq->index))
899 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
900 
901 	if (mvq->map.virq) {
902 		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE);
903 		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index);
904 	} else {
905 		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
906 		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
907 	}
908 
909 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
910 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
911 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
912 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
913 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
914 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
915 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
916 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey);
917 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
918 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
919 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
920 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
921 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
922 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
923 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
924 	if (counters_supported(&ndev->mvdev))
925 		MLX5_SET(virtio_q, vq_ctx, counter_set_id, mvq->counter_set_id);
926 
927 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
928 	if (err)
929 		goto err_cmd;
930 
931 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
932 	kfree(in);
933 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
934 
935 	return 0;
936 
937 err_cmd:
938 	kfree(in);
939 err_alloc:
940 	umems_destroy(ndev, mvq);
941 	return err;
942 }
943 
944 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
945 {
946 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
947 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
948 
949 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
950 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
951 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
952 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
953 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
954 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
955 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
956 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
957 		return;
958 	}
959 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
960 	umems_destroy(ndev, mvq);
961 }
962 
963 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
964 {
965 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
966 }
967 
968 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
969 {
970 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
971 }
972 
973 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
974 			int *outlen, u32 qpn, u32 rqpn)
975 {
976 	void *qpc;
977 	void *pp;
978 
979 	switch (cmd) {
980 	case MLX5_CMD_OP_2RST_QP:
981 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
982 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
983 		*in = kzalloc(*inlen, GFP_KERNEL);
984 		*out = kzalloc(*outlen, GFP_KERNEL);
985 		if (!*in || !*out)
986 			goto outerr;
987 
988 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
989 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
990 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
991 		break;
992 	case MLX5_CMD_OP_RST2INIT_QP:
993 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
994 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
995 		*in = kzalloc(*inlen, GFP_KERNEL);
996 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
997 		if (!*in || !*out)
998 			goto outerr;
999 
1000 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
1001 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
1002 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
1003 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1004 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
1005 		MLX5_SET(qpc, qpc, rwe, 1);
1006 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1007 		MLX5_SET(ads, pp, vhca_port_num, 1);
1008 		break;
1009 	case MLX5_CMD_OP_INIT2RTR_QP:
1010 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
1011 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
1012 		*in = kzalloc(*inlen, GFP_KERNEL);
1013 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
1014 		if (!*in || !*out)
1015 			goto outerr;
1016 
1017 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
1018 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
1019 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
1020 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1021 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
1022 		MLX5_SET(qpc, qpc, log_msg_max, 30);
1023 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
1024 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1025 		MLX5_SET(ads, pp, fl, 1);
1026 		break;
1027 	case MLX5_CMD_OP_RTR2RTS_QP:
1028 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
1029 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
1030 		*in = kzalloc(*inlen, GFP_KERNEL);
1031 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
1032 		if (!*in || !*out)
1033 			goto outerr;
1034 
1035 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
1036 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
1037 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
1038 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1039 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1040 		MLX5_SET(ads, pp, ack_timeout, 14);
1041 		MLX5_SET(qpc, qpc, retry_count, 7);
1042 		MLX5_SET(qpc, qpc, rnr_retry, 7);
1043 		break;
1044 	default:
1045 		goto outerr_nullify;
1046 	}
1047 
1048 	return;
1049 
1050 outerr:
1051 	kfree(*in);
1052 	kfree(*out);
1053 outerr_nullify:
1054 	*in = NULL;
1055 	*out = NULL;
1056 }
1057 
1058 static void free_inout(void *in, void *out)
1059 {
1060 	kfree(in);
1061 	kfree(out);
1062 }
1063 
1064 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1065  * firmware. The fw argument indicates whether the QP being modified is the one
1066  * used by firmware.
1067  */
1068 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1069 {
1070 	int outlen;
1071 	int inlen;
1072 	void *out;
1073 	void *in;
1074 	int err;
1075 
1076 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1077 	if (!in || !out)
1078 		return -ENOMEM;
1079 
1080 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1081 	free_inout(in, out);
1082 	return err;
1083 }
1084 
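/* Connect the firmware and driver QPs: both walk the RC state machine
 * RESET -> INIT -> RTR, and only the firmware QP, which posts the
 * notifications, is moved on to RTS.
 */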
1085 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1086 {
1087 	int err;
1088 
1089 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1090 	if (err)
1091 		return err;
1092 
1093 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1094 	if (err)
1095 		return err;
1096 
1097 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1098 	if (err)
1099 		return err;
1100 
1101 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1102 	if (err)
1103 		return err;
1104 
1105 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1106 	if (err)
1107 		return err;
1108 
1109 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1110 	if (err)
1111 		return err;
1112 
1113 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1114 }
1115 
1116 struct mlx5_virtq_attr {
1117 	u8 state;
1118 	u16 available_index;
1119 	u16 used_index;
1120 };
1121 
1122 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1123 			   struct mlx5_virtq_attr *attr)
1124 {
1125 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1126 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1127 	void *out;
1128 	void *obj_context;
1129 	void *cmd_hdr;
1130 	int err;
1131 
1132 	out = kzalloc(outlen, GFP_KERNEL);
1133 	if (!out)
1134 		return -ENOMEM;
1135 
1136 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1137 
1138 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1139 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1140 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1141 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1142 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1143 	if (err)
1144 		goto err_cmd;
1145 
1146 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1147 	memset(attr, 0, sizeof(*attr));
1148 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1149 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1150 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1151 	kfree(out);
1152 	return 0;
1153 
1154 err_cmd:
1155 	kfree(out);
1156 	return err;
1157 }
1158 
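/* Virtqueue objects may only be moved from INIT to RDY and from RDY to
 * SUSPEND; all other transitions, including leaving SUSPEND or ERR, are
 * rejected.
 */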
1159 static bool is_valid_state_change(int oldstate, int newstate)
1160 {
1161 	switch (oldstate) {
1162 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
1163 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
1164 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
1165 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
1166 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
1167 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR:
1168 	default:
1169 		return false;
1170 	}
1171 }
1172 
1173 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1174 {
1175 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1176 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1177 	void *obj_context;
1178 	void *cmd_hdr;
1179 	void *in;
1180 	int err;
1181 
1182 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
1183 		return 0;
1184 
1185 	if (!is_valid_state_change(mvq->fw_state, state))
1186 		return -EINVAL;
1187 
1188 	in = kzalloc(inlen, GFP_KERNEL);
1189 	if (!in)
1190 		return -ENOMEM;
1191 
1192 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1193 
1194 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1195 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1196 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1197 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1198 
1199 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1200 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1201 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1202 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1203 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1204 	kfree(in);
1205 	if (!err)
1206 		mvq->fw_state = state;
1207 
1208 	return err;
1209 }
1210 
1211 static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1212 {
1213 	u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {};
1214 	u32 out[MLX5_ST_SZ_DW(create_virtio_q_counters_out)] = {};
1215 	void *cmd_hdr;
1216 	int err;
1217 
1218 	if (!counters_supported(&ndev->mvdev))
1219 		return 0;
1220 
1221 	cmd_hdr = MLX5_ADDR_OF(create_virtio_q_counters_in, in, hdr);
1222 
1223 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
1224 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1225 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1226 
1227 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
1228 	if (err)
1229 		return err;
1230 
1231 	mvq->counter_set_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
1232 
1233 	return 0;
1234 }
1235 
1236 static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1237 {
1238 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_q_counters_in)] = {};
1239 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_q_counters_out)] = {};
1240 
1241 	if (!counters_supported(&ndev->mvdev))
1242 		return;
1243 
1244 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
1245 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_id, mvq->counter_set_id);
1246 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.uid, ndev->mvdev.res.uid);
1247 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1248 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
1249 		mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id);
1250 }
1251 
1252 static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv)
1253 {
1254 	struct vdpa_callback *cb = priv;
1255 
1256 	if (cb->callback)
1257 		return cb->callback(cb->private);
1258 
1259 	return IRQ_HANDLED;
1260 }
1261 
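/* Grab a free entry from the pre-allocated MSI-X pool and attach the VQ's
 * interrupt handler to it. If no entry is free or request_irq() fails,
 * mvq->map is left unset and create_virtqueue() falls back to QP (firmware
 * event) mode.
 */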
1262 static void alloc_vector(struct mlx5_vdpa_net *ndev,
1263 			 struct mlx5_vdpa_virtqueue *mvq)
1264 {
1265 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1266 	struct mlx5_vdpa_irq_pool_entry *ent;
1267 	int err;
1268 	int i;
1269 
1270 	for (i = 0; i < irqp->num_ent; i++) {
1271 		ent = &irqp->entries[i];
1272 		if (!ent->used) {
1273 			snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
1274 				 dev_name(&ndev->mvdev.vdev.dev), mvq->index);
1275 			ent->dev_id = &ndev->event_cbs[mvq->index];
1276 			err = request_irq(ent->map.virq, mlx5_vdpa_int_handler, 0,
1277 					  ent->name, ent->dev_id);
1278 			if (err)
1279 				return;
1280 
1281 			ent->used = true;
1282 			mvq->map = ent->map;
1283 			return;
1284 		}
1285 	}
1286 }
1287 
1288 static void dealloc_vector(struct mlx5_vdpa_net *ndev,
1289 			   struct mlx5_vdpa_virtqueue *mvq)
1290 {
1291 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1292 	int i;
1293 
1294 	for (i = 0; i < irqp->num_ent; i++)
1295 		if (mvq->map.virq == irqp->entries[i].map.virq) {
1296 			free_irq(mvq->map.virq, irqp->entries[i].dev_id);
1297 			irqp->entries[i].used = false;
1298 			return;
1299 		}
1300 }
1301 
1302 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
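/* Instantiate everything a single virtqueue needs: the completion queue, the
 * firmware/driver QP pair used for the notification channel, a counter set,
 * an MSI-X vector when available, and finally the virtio_net_q object itself,
 * which is moved to RDY if the VQ is already marked ready.
 */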
1303 {
1304 	u16 idx = mvq->index;
1305 	int err;
1306 
1307 	if (!mvq->num_ent)
1308 		return 0;
1309 
1310 	if (mvq->initialized)
1311 		return 0;
1312 
1313 	err = cq_create(ndev, idx, mvq->num_ent);
1314 	if (err)
1315 		return err;
1316 
1317 	err = qp_create(ndev, mvq, &mvq->fwqp);
1318 	if (err)
1319 		goto err_fwqp;
1320 
1321 	err = qp_create(ndev, mvq, &mvq->vqqp);
1322 	if (err)
1323 		goto err_vqqp;
1324 
1325 	err = connect_qps(ndev, mvq);
1326 	if (err)
1327 		goto err_connect;
1328 
1329 	err = counter_set_alloc(ndev, mvq);
1330 	if (err)
1331 		goto err_connect;
1332 
1333 	alloc_vector(ndev, mvq);
1334 	err = create_virtqueue(ndev, mvq);
1335 	if (err)
1336 		goto err_vq;
1337 
1338 	if (mvq->ready) {
1339 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1340 		if (err) {
1341 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1342 				       idx, err);
1343 			goto err_modify;
1344 		}
1345 	}
1346 
1347 	mvq->initialized = true;
1348 	return 0;
1349 
1350 err_modify:
1351 	destroy_virtqueue(ndev, mvq);
1352 err_vq:
1353 	dealloc_vector(ndev, mvq);
1354 	counter_set_dealloc(ndev, mvq);
1355 err_connect:
1356 	qp_destroy(ndev, &mvq->vqqp);
1357 err_vqqp:
1358 	qp_destroy(ndev, &mvq->fwqp);
1359 err_fwqp:
1360 	cq_destroy(ndev, idx);
1361 	return err;
1362 }
1363 
1364 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1365 {
1366 	struct mlx5_virtq_attr attr;
1367 
1368 	if (!mvq->initialized)
1369 		return;
1370 
1371 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1372 		return;
1373 
1374 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1375 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1376 
1377 	if (query_virtqueue(ndev, mvq, &attr)) {
1378 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1379 		return;
1380 	}
1381 	mvq->avail_idx = attr.available_index;
1382 	mvq->used_idx = attr.used_index;
1383 }
1384 
1385 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1386 {
1387 	int i;
1388 
1389 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1390 		suspend_vq(ndev, &ndev->vqs[i]);
1391 }
1392 
1393 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1394 {
1395 	if (!mvq->initialized)
1396 		return;
1397 
1398 	suspend_vq(ndev, mvq);
1399 	destroy_virtqueue(ndev, mvq);
1400 	dealloc_vector(ndev, mvq);
1401 	counter_set_dealloc(ndev, mvq);
1402 	qp_destroy(ndev, &mvq->vqqp);
1403 	qp_destroy(ndev, &mvq->fwqp);
1404 	cq_destroy(ndev, mvq->index);
1405 	mvq->initialized = false;
1406 }
1407 
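/* Create the RQ table that the TIR spreads received packets over. The maximum
 * size covers every RX virtqueue the device may use; the actual size reflects
 * the currently active RX queues (the even-indexed virtqueues).
 */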
1408 static int create_rqt(struct mlx5_vdpa_net *ndev)
1409 {
1410 	int rqt_table_size = roundup_pow_of_two(ndev->rqt_size);
1411 	int act_sz = roundup_pow_of_two(ndev->cur_num_vqs / 2);
1412 	__be32 *list;
1413 	void *rqtc;
1414 	int inlen;
1415 	void *in;
1416 	int i, j;
1417 	int err;
1418 
1419 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + rqt_table_size * MLX5_ST_SZ_BYTES(rq_num);
1420 	in = kzalloc(inlen, GFP_KERNEL);
1421 	if (!in)
1422 		return -ENOMEM;
1423 
1424 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1425 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1426 
1427 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1428 	MLX5_SET(rqtc, rqtc, rqt_max_size, rqt_table_size);
1429 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1430 	for (i = 0, j = 0; i < act_sz; i++, j += 2)
1431 		list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id);
1432 
1433 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1434 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1435 	kfree(in);
1436 	if (err)
1437 		return err;
1438 
1439 	return 0;
1440 }
1441 
1442 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1443 
1444 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1445 {
1446 	int act_sz = roundup_pow_of_two(num / 2);
1447 	__be32 *list;
1448 	void *rqtc;
1449 	int inlen;
1450 	void *in;
1451 	int i, j;
1452 	int err;
1453 
1454 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + act_sz * MLX5_ST_SZ_BYTES(rq_num);
1455 	in = kzalloc(inlen, GFP_KERNEL);
1456 	if (!in)
1457 		return -ENOMEM;
1458 
1459 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1460 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1461 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1462 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1463 
1464 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1465 	for (i = 0, j = 0; i < act_sz; i++, j = j + 2)
1466 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1467 
1468 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1469 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1470 	kfree(in);
1471 	if (err)
1472 		return err;
1473 
1474 	return 0;
1475 }
1476 
1477 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1478 {
1479 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1480 }
1481 
1482 static int create_tir(struct mlx5_vdpa_net *ndev)
1483 {
1484 #define HASH_IP_L4PORTS                                                                            \
1485 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1486 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1487 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1488 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1489 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1490 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1491 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1492 	void *rss_key;
1493 	void *outer;
1494 	void *tirc;
1495 	void *in;
1496 	int err;
1497 
1498 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1499 	if (!in)
1500 		return -ENOMEM;
1501 
1502 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1503 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1504 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1505 
1506 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1507 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1508 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1509 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1510 
1511 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1512 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1513 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1514 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1515 
1516 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1517 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1518 
1519 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1520 	kfree(in);
1521 	if (err)
1522 		return err;
1523 
1524 	mlx5_vdpa_add_tirn(ndev);
1525 	return err;
1526 }
1527 
1528 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1529 {
1530 	mlx5_vdpa_remove_tirn(ndev);
1531 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1532 }
1533 
1534 #define MAX_STEERING_ENT 0x8000
1535 #define MAX_STEERING_GROUPS 2
1536 
1537 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1538        #define NUM_DESTS 2
1539 #else
1540        #define NUM_DESTS 1
1541 #endif
1542 
1543 static int add_steering_counters(struct mlx5_vdpa_net *ndev,
1544 				 struct macvlan_node *node,
1545 				 struct mlx5_flow_act *flow_act,
1546 				 struct mlx5_flow_destination *dests)
1547 {
1548 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1549 	int err;
1550 
1551 	node->ucast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1552 	if (IS_ERR(node->ucast_counter.counter))
1553 		return PTR_ERR(node->ucast_counter.counter);
1554 
1555 	node->mcast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1556 	if (IS_ERR(node->mcast_counter.counter)) {
1557 		err = PTR_ERR(node->mcast_counter.counter);
1558 		goto err_mcast_counter;
1559 	}
1560 
1561 	dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1562 	flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
1563 	return 0;
1564 
1565 err_mcast_counter:
1566 	mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
1567 	return err;
1568 #else
1569 	return 0;
1570 #endif
1571 }
1572 
1573 static void remove_steering_counters(struct mlx5_vdpa_net *ndev,
1574 				     struct macvlan_node *node)
1575 {
1576 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1577 	mlx5_fc_destroy(ndev->mvdev.mdev, node->mcast_counter.counter);
1578 	mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
1579 #endif
1580 }
1581 
1582 static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac,
1583 					struct macvlan_node *node)
1584 {
1585 	struct mlx5_flow_destination dests[NUM_DESTS] = {};
1586 	struct mlx5_flow_act flow_act = {};
1587 	struct mlx5_flow_spec *spec;
1588 	void *headers_c;
1589 	void *headers_v;
1590 	u8 *dmac_c;
1591 	u8 *dmac_v;
1592 	int err;
1593 	u16 vid;
1594 
1595 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1596 	if (!spec)
1597 		return -ENOMEM;
1598 
1599 	vid = key2vid(node->macvlan);
1600 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1601 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1602 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1603 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1604 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1605 	eth_broadcast_addr(dmac_c);
1606 	ether_addr_copy(dmac_v, mac);
1607 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)) {
1608 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
1609 		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid);
1610 	}
1611 	if (node->tagged) {
1612 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1);
1613 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid);
1614 	}
1615 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1616 	dests[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1617 	dests[0].tir_num = ndev->res.tirn;
1618 	err = add_steering_counters(ndev, node, &flow_act, dests);
1619 	if (err)
1620 		goto out_free;
1621 
1622 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1623 	dests[1].counter_id = mlx5_fc_id(node->ucast_counter.counter);
1624 #endif
1625 	node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1626 	if (IS_ERR(node->ucast_rule)) {
1627 		err = PTR_ERR(node->ucast_rule);
1628 		goto err_ucast;
1629 	}
1630 
1631 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1632 	dests[1].counter_id = mlx5_fc_id(node->mcast_counter.counter);
1633 #endif
1634 
1635 	memset(dmac_c, 0, ETH_ALEN);
1636 	memset(dmac_v, 0, ETH_ALEN);
1637 	dmac_c[0] = 1;
1638 	dmac_v[0] = 1;
1639 	node->mcast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1640 	if (IS_ERR(node->mcast_rule)) {
1641 		err = PTR_ERR(node->mcast_rule);
1642 		goto err_mcast;
1643 	}
1644 	kvfree(spec);
1645 	mlx5_vdpa_add_rx_counters(ndev, node);
1646 	return 0;
1647 
1648 err_mcast:
1649 	mlx5_del_flow_rules(node->ucast_rule);
1650 err_ucast:
1651 	remove_steering_counters(ndev, node);
1652 out_free:
1653 	kvfree(spec);
1654 	return err;
1655 }
1656 
1657 static void mlx5_vdpa_del_mac_vlan_rules(struct mlx5_vdpa_net *ndev,
1658 					 struct macvlan_node *node)
1659 {
1660 	mlx5_vdpa_remove_rx_counters(ndev, node);
1661 	mlx5_del_flow_rules(node->ucast_rule);
1662 	mlx5_del_flow_rules(node->mcast_rule);
1663 }
1664 
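/* Build the 64-bit hash key for a mac/vlan entry: the VLAN ID (or
 * MLX5V_UNTAGGED) in bits 63..48 and the MAC address in the low 48 bits.
 */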
1665 static u64 search_val(u8 *mac, u16 vlan, bool tagged)
1666 {
1667 	u64 val;
1668 
1669 	if (!tagged)
1670 		vlan = MLX5V_UNTAGGED;
1671 
1672 	val = (u64)vlan << 48 |
1673 	      (u64)mac[0] << 40 |
1674 	      (u64)mac[1] << 32 |
1675 	      (u64)mac[2] << 24 |
1676 	      (u64)mac[3] << 16 |
1677 	      (u64)mac[4] << 8 |
1678 	      (u64)mac[5];
1679 
1680 	return val;
1681 }
1682 
1683 static struct macvlan_node *mac_vlan_lookup(struct mlx5_vdpa_net *ndev, u64 value)
1684 {
1685 	struct macvlan_node *pos;
1686 	u32 idx;
1687 
1688 	idx = hash_64(value, 8); /* 8 hash bits -> MLX5V_MACVLAN_SIZE buckets */
1689 	hlist_for_each_entry(pos, &ndev->macvlan_hash[idx], hlist) {
1690 		if (pos->macvlan == value)
1691 			return pos;
1692 	}
1693 	return NULL;
1694 }
1695 
1696 static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vid, bool tagged)
1697 {
1698 	struct macvlan_node *ptr;
1699 	u64 val;
1700 	u32 idx;
1701 	int err;
1702 
1703 	val = search_val(mac, vid, tagged);
1704 	if (mac_vlan_lookup(ndev, val))
1705 		return -EEXIST;
1706 
1707 	ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
1708 	if (!ptr)
1709 		return -ENOMEM;
1710 
1711 	ptr->tagged = tagged;
1712 	ptr->macvlan = val;
1713 	ptr->ndev = ndev;
1714 	err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, ptr);
1715 	if (err)
1716 		goto err_add;
1717 
1718 	idx = hash_64(val, 8);
1719 	hlist_add_head(&ptr->hlist, &ndev->macvlan_hash[idx]);
1720 	return 0;
1721 
1722 err_add:
1723 	kfree(ptr);
1724 	return err;
1725 }
1726 
1727 static void mac_vlan_del(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagged)
1728 {
1729 	struct macvlan_node *ptr;
1730 
1731 	ptr = mac_vlan_lookup(ndev, search_val(mac, vlan, tagged));
1732 	if (!ptr)
1733 		return;
1734 
1735 	hlist_del(&ptr->hlist);
1736 	mlx5_vdpa_del_mac_vlan_rules(ndev, ptr);
1737 	remove_steering_counters(ndev, ptr);
1738 	kfree(ptr);
1739 }
1740 
1741 static void clear_mac_vlan_table(struct mlx5_vdpa_net *ndev)
1742 {
1743 	struct macvlan_node *pos;
1744 	struct hlist_node *n;
1745 	int i;
1746 
1747 	for (i = 0; i < MLX5V_MACVLAN_SIZE; i++) {
1748 		hlist_for_each_entry_safe(pos, n, &ndev->macvlan_hash[i], hlist) {
1749 			hlist_del(&pos->hlist);
1750 			mlx5_vdpa_del_mac_vlan_rules(ndev, pos);
1751 			remove_steering_counters(ndev, pos);
1752 			kfree(pos);
1753 		}
1754 	}
1755 }
1756 
1757 static int setup_steering(struct mlx5_vdpa_net *ndev)
1758 {
1759 	struct mlx5_flow_table_attr ft_attr = {};
1760 	struct mlx5_flow_namespace *ns;
1761 	int err;
1762 
1763 	ft_attr.max_fte = MAX_STEERING_ENT;
1764 	ft_attr.autogroup.max_num_groups = MAX_STEERING_GROUPS;
1765 
1766 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1767 	if (!ns) {
1768 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
1769 		return -EOPNOTSUPP;
1770 	}
1771 
1772 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1773 	if (IS_ERR(ndev->rxft)) {
1774 		mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n");
1775 		return PTR_ERR(ndev->rxft);
1776 	}
1777 	mlx5_vdpa_add_rx_flow_table(ndev);
1778 
1779 	err = mac_vlan_add(ndev, ndev->config.mac, 0, false);
1780 	if (err)
1781 		goto err_add;
1782 
1783 	return 0;
1784 
1785 err_add:
1786 	mlx5_vdpa_remove_rx_flow_table(ndev);
1787 	mlx5_destroy_flow_table(ndev->rxft);
1788 	return err;
1789 }
1790 
1791 static void teardown_steering(struct mlx5_vdpa_net *ndev)
1792 {
1793 	clear_mac_vlan_table(ndev);
1794 	mlx5_vdpa_remove_rx_flow_table(ndev);
1795 	mlx5_destroy_flow_table(ndev->rxft);
1796 }
1797 
1798 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1799 {
1800 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1801 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1802 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1803 	struct mlx5_core_dev *pfmdev;
1804 	size_t read;
1805 	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
1806 
1807 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1808 	switch (cmd) {
1809 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1810 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1811 		if (read != ETH_ALEN)
1812 			break;
1813 
1814 		if (!memcmp(ndev->config.mac, mac, ETH_ALEN)) {
1815 			status = VIRTIO_NET_OK;
1816 			break;
1817 		}
1818 
1819 		if (is_zero_ether_addr(mac))
1820 			break;
1821 
1822 		if (!is_zero_ether_addr(ndev->config.mac)) {
1823 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1824 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1825 					       ndev->config.mac);
1826 				break;
1827 			}
1828 		}
1829 
1830 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1831 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1832 				       mac);
1833 			break;
1834 		}
1835 
1836 		/* Back up the original mac address so that we can restore it if
1837 		 * adding the forward rules fails.
1838 		 */
1839 		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
1840 
1841 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1842 
1843 		/* Recreate the flow table entry so that packets are forwarded to the new MAC.
1844 		 */
1845 		mac_vlan_del(ndev, mac_back, 0, false);
1846 
1847 		if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) {
1848 			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
1849 
1850 			/* This path is unlikely to be taken, but double check anyway */
1851 			if (is_zero_ether_addr(mac_back)) {
1852 				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
1853 				break;
1854 			}
1855 
1856 			/* Try to restore the original mac address to the MPFS table, and
1857 			 * try to restore the forward rule entry.
1858 			 */
1859 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1860 				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
1861 					       ndev->config.mac);
1862 			}
1863 
1864 			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
1865 				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
1866 					       mac_back);
1867 			}
1868 
1869 			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
1870 
1871 			if (mac_vlan_add(ndev, ndev->config.mac, 0, false))
1872 				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
1873 
1874 			break;
1875 		}
1876 
1877 		status = VIRTIO_NET_OK;
1878 		break;
1879 
1880 	default:
1881 		break;
1882 	}
1883 
1884 	return status;
1885 }
1886 
1887 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
1888 {
1889 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1890 	int cur_qps = ndev->cur_num_vqs / 2;
1891 	int err;
1892 	int i;
1893 
1894 	if (cur_qps > newqps) {
1895 		err = modify_rqt(ndev, 2 * newqps);
1896 		if (err)
1897 			return err;
1898 
1899 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
1900 			teardown_vq(ndev, &ndev->vqs[i]);
1901 
1902 		ndev->cur_num_vqs = 2 * newqps;
1903 	} else {
1904 		ndev->cur_num_vqs = 2 * newqps;
1905 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
1906 			err = setup_vq(ndev, &ndev->vqs[i]);
1907 			if (err)
1908 				goto clean_added;
1909 		}
1910 		err = modify_rqt(ndev, 2 * newqps);
1911 		if (err)
1912 			goto clean_added;
1913 	}
1914 	return 0;
1915 
1916 clean_added:
1917 	for (--i; i >= 2 * cur_qps; --i)
1918 		teardown_vq(ndev, &ndev->vqs[i]);
1919 
1920 	ndev->cur_num_vqs = 2 * cur_qps;
1921 
1922 	return err;
1923 }
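
/*
 * Sketch of the ordering change_num_qps() relies on, with example numbers:
 * shrinking from 4 to 2 queue pairs first shrinks the RQT to 4 virtqueues
 * and only then tears down vqs 7..4; growing from 2 to 4 sets up vqs 4..7
 * first and only then expands the RQT to 8 entries. Either way the RQT is
 * never left pointing at a virtqueue that is not set up.
 */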
1924 
1925 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1926 {
1927 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1928 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1929 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1930 	struct virtio_net_ctrl_mq mq;
1931 	size_t read;
1932 	u16 newqps;
1933 
1934 	switch (cmd) {
1935 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
1936 		/* This mq feature check aligns with pre-existing userspace
1937 		 * implementation.
1938 		 *
1939 		 * Without it, an untrusted driver could fake a multiqueue config
1940 		 * request down to a non-mq device, which may cause the kernel to
1941 		 * panic due to uninitialized resources for the extra vqs. Even with
1942 		 * a well-behaved guest driver, changing the number of vqs on a
1943 		 * non-mq device is not expected to be allowed.
1944 		 */
1945 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
1946 			break;
1947 
1948 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
1949 		if (read != sizeof(mq))
1950 			break;
1951 
1952 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
1953 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1954 		    newqps > ndev->rqt_size)
1955 			break;
1956 
1957 		if (ndev->cur_num_vqs == 2 * newqps) {
1958 			status = VIRTIO_NET_OK;
1959 			break;
1960 		}
1961 
1962 		if (!change_num_qps(mvdev, newqps))
1963 			status = VIRTIO_NET_OK;
1964 
1965 		break;
1966 	default:
1967 		break;
1968 	}
1969 
1970 	return status;
1971 }
1972 
1973 static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1974 {
1975 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1976 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1977 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1978 	__virtio16 vlan;
1979 	size_t read;
1980 	u16 id;
1981 
1982 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)))
1983 		return status;
1984 
1985 	switch (cmd) {
1986 	case VIRTIO_NET_CTRL_VLAN_ADD:
1987 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
1988 		if (read != sizeof(vlan))
1989 			break;
1990 
1991 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
1992 		if (mac_vlan_add(ndev, ndev->config.mac, id, true))
1993 			break;
1994 
1995 		status = VIRTIO_NET_OK;
1996 		break;
1997 	case VIRTIO_NET_CTRL_VLAN_DEL:
1998 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
1999 		if (read != sizeof(vlan))
2000 			break;
2001 
2002 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
2003 		mac_vlan_del(ndev, ndev->config.mac, id, true);
2004 		status = VIRTIO_NET_OK;
2005 		break;
2006 	default:
2007 		break;
2008 	}
2009 
2010 	return status;
2011 }
2012 
2013 static void mlx5_cvq_kick_handler(struct work_struct *work)
2014 {
2015 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2016 	struct virtio_net_ctrl_hdr ctrl;
2017 	struct mlx5_vdpa_wq_ent *wqent;
2018 	struct mlx5_vdpa_dev *mvdev;
2019 	struct mlx5_control_vq *cvq;
2020 	struct mlx5_vdpa_net *ndev;
2021 	size_t read, write;
2022 	int err;
2023 
2024 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2025 	mvdev = wqent->mvdev;
2026 	ndev = to_mlx5_vdpa_ndev(mvdev);
2027 	cvq = &mvdev->cvq;
2028 
2029 	down_write(&ndev->reslock);
2030 
2031 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2032 		goto out;
2033 
2034 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
2035 		goto out;
2036 
2037 	if (!cvq->ready)
2038 		goto out;
2039 
2040 	while (true) {
2041 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
2042 					   GFP_ATOMIC);
2043 		if (err <= 0)
2044 			break;
2045 
2046 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
2047 		if (read != sizeof(ctrl))
2048 			break;
2049 
2050 		cvq->received_desc++;
2051 		switch (ctrl.class) {
2052 		case VIRTIO_NET_CTRL_MAC:
2053 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
2054 			break;
2055 		case VIRTIO_NET_CTRL_MQ:
2056 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
2057 			break;
2058 		case VIRTIO_NET_CTRL_VLAN:
2059 			status = handle_ctrl_vlan(mvdev, ctrl.cmd);
2060 			break;
2061 		default:
2062 			break;
2063 		}
2064 
2065 		/* Make sure data is written before advancing index */
2066 		smp_wmb();
2067 
2068 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
2069 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
2070 		vringh_kiov_cleanup(&cvq->riov);
2071 		vringh_kiov_cleanup(&cvq->wiov);
2072 
2073 		if (vringh_need_notify_iotlb(&cvq->vring))
2074 			vringh_notify(&cvq->vring);
2075 
2076 		cvq->completed_desc++;
2077 		queue_work(mvdev->wq, &wqent->work);
2078 		break;
2079 	}
2080 
2081 out:
2082 	up_write(&ndev->reslock);
2083 }
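
/*
 * Note on the loop above: after completing one control command the handler
 * re-queues its own work item and breaks out, so each work invocation
 * processes at most one descriptor and reslock is released between commands,
 * giving other queued work (e.g. carrier updates) a chance to run in between.
 */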
2084 
2085 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
2086 {
2087 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2088 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2089 	struct mlx5_vdpa_virtqueue *mvq;
2090 
2091 	if (!is_index_valid(mvdev, idx))
2092 		return;
2093 
2094 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
2095 		if (!mvdev->wq || !mvdev->cvq.ready)
2096 			return;
2097 
2098 		queue_work(mvdev->wq, &ndev->cvq_ent.work);
2099 		return;
2100 	}
2101 
2102 	mvq = &ndev->vqs[idx];
2103 	if (unlikely(!mvq->ready))
2104 		return;
2105 
2106 	iowrite16(idx, ndev->mvdev.res.kick_addr);
2107 }
2108 
2109 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
2110 				    u64 driver_area, u64 device_area)
2111 {
2112 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2113 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2114 	struct mlx5_vdpa_virtqueue *mvq;
2115 
2116 	if (!is_index_valid(mvdev, idx))
2117 		return -EINVAL;
2118 
2119 	if (is_ctrl_vq_idx(mvdev, idx)) {
2120 		mvdev->cvq.desc_addr = desc_area;
2121 		mvdev->cvq.device_addr = device_area;
2122 		mvdev->cvq.driver_addr = driver_area;
2123 		return 0;
2124 	}
2125 
2126 	mvq = &ndev->vqs[idx];
2127 	mvq->desc_addr = desc_area;
2128 	mvq->device_addr = device_area;
2129 	mvq->driver_addr = driver_area;
2130 	return 0;
2131 }
2132 
2133 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
2134 {
2135 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2136 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2137 	struct mlx5_vdpa_virtqueue *mvq;
2138 
2139 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2140 		return;
2141 
2142 	mvq = &ndev->vqs[idx];
2143 	mvq->num_ent = num;
2144 }
2145 
2146 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
2147 {
2148 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2149 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2150 
2151 	ndev->event_cbs[idx] = *cb;
2152 	if (is_ctrl_vq_idx(mvdev, idx))
2153 		mvdev->cvq.event_cb = *cb;
2154 }
2155 
2156 static void mlx5_cvq_notify(struct vringh *vring)
2157 {
2158 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
2159 
2160 	if (!cvq->event_cb.callback)
2161 		return;
2162 
2163 	cvq->event_cb.callback(cvq->event_cb.private);
2164 }
2165 
2166 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
2167 {
2168 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2169 
2170 	cvq->ready = ready;
2171 	if (!ready)
2172 		return;
2173 
2174 	cvq->vring.notify = mlx5_cvq_notify;
2175 }
2176 
2177 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
2178 {
2179 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2180 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2181 	struct mlx5_vdpa_virtqueue *mvq;
2182 	int err;
2183 
2184 	if (!mvdev->actual_features)
2185 		return;
2186 
2187 	if (!is_index_valid(mvdev, idx))
2188 		return;
2189 
2190 	if (is_ctrl_vq_idx(mvdev, idx)) {
2191 		set_cvq_ready(mvdev, ready);
2192 		return;
2193 	}
2194 
2195 	mvq = &ndev->vqs[idx];
2196 	if (!ready) {
2197 		suspend_vq(ndev, mvq);
2198 	} else {
2199 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
2200 		if (err) {
2201 			mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err);
2202 			ready = false;
2203 		}
2204 	}
2205 
2206 
2207 	mvq->ready = ready;
2208 }
2209 
2210 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
2211 {
2212 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2213 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2214 
2215 	if (!is_index_valid(mvdev, idx))
2216 		return false;
2217 
2218 	if (is_ctrl_vq_idx(mvdev, idx))
2219 		return mvdev->cvq.ready;
2220 
2221 	return ndev->vqs[idx].ready;
2222 }
2223 
2224 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
2225 				  const struct vdpa_vq_state *state)
2226 {
2227 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2228 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2229 	struct mlx5_vdpa_virtqueue *mvq;
2230 
2231 	if (!is_index_valid(mvdev, idx))
2232 		return -EINVAL;
2233 
2234 	if (is_ctrl_vq_idx(mvdev, idx)) {
2235 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
2236 		return 0;
2237 	}
2238 
2239 	mvq = &ndev->vqs[idx];
2240 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
2241 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
2242 		return -EINVAL;
2243 	}
2244 
2245 	mvq->used_idx = state->split.avail_index;
2246 	mvq->avail_idx = state->split.avail_index;
2247 	return 0;
2248 }
2249 
2250 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
2251 {
2252 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2253 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2254 	struct mlx5_vdpa_virtqueue *mvq;
2255 	struct mlx5_virtq_attr attr;
2256 	int err;
2257 
2258 	if (!is_index_valid(mvdev, idx))
2259 		return -EINVAL;
2260 
2261 	if (is_ctrl_vq_idx(mvdev, idx)) {
2262 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
2263 		return 0;
2264 	}
2265 
2266 	mvq = &ndev->vqs[idx];
2267 	/* If the virtq object was destroyed, use the value saved at
2268 	 * the last minute of suspend_vq. This caters for userspace
2269 	 * that cares about emulating the index after vq is stopped.
2270 	 */
2271 	if (!mvq->initialized) {
2272 		/* Firmware returns a wrong value for the available index.
2273 		 * Since both values should be identical, we take the value of
2274 		 * used_idx which is reported correctly.
2275 		 */
2276 		state->split.avail_index = mvq->used_idx;
2277 		return 0;
2278 	}
2279 
2280 	err = query_virtqueue(ndev, mvq, &attr);
2281 	if (err) {
2282 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
2283 		return err;
2284 	}
2285 	state->split.avail_index = attr.used_index;
2286 	return 0;
2287 }
2288 
2289 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
2290 {
2291 	return PAGE_SIZE;
2292 }
2293 
2294 static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx)
2295 {
2296 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2297 
2298 	if (is_ctrl_vq_idx(mvdev, idx))
2299 		return MLX5_VDPA_CVQ_GROUP;
2300 
2301 	return MLX5_VDPA_DATAVQ_GROUP;
2302 }
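
/*
 * The control virtqueue is reported in its own group so that userspace can,
 * via .set_group_asid (mlx5_set_group_asid() below), bind it to an address
 * space separate from the one used by the data virtqueues.
 */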
2303 
2304 static u64 mlx_to_virtio_features(u16 dev_features)
2305 {
2306 	u64 result = 0;
2307 
2308 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_MRG_RXBUF))
2309 		result |= BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
2310 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_ECN))
2311 		result |= BIT_ULL(VIRTIO_NET_F_HOST_ECN);
2312 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_ECN))
2313 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_ECN);
2314 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO6))
2315 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO6);
2316 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO4))
2317 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO4);
2318 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_CSUM))
2319 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
2320 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_CSUM))
2321 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
2322 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO6))
2323 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
2324 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO4))
2325 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
2326 
2327 	return result;
2328 }
2329 
2330 static u64 get_supported_features(struct mlx5_core_dev *mdev)
2331 {
2332 	u64 mlx_vdpa_features = 0;
2333 	u16 dev_features;
2334 
2335 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
2336 	mlx_vdpa_features |= mlx_to_virtio_features(dev_features);
2337 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
2338 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
2339 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
2340 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
2341 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
2342 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
2343 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
2344 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
2345 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VLAN);
2346 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2347 
2348 	return mlx_vdpa_features;
2349 }
2350 
2351 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
2352 {
2353 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2354 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2355 
2356 	print_features(mvdev, ndev->mvdev.mlx_features, false);
2357 	return ndev->mvdev.mlx_features;
2358 }
2359 
2360 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
2361 {
2362 	/* Minimum features to expect */
2363 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
2364 		return -EOPNOTSUPP;
2365 
2366 	/* Double check features combination sent down by the driver.
2367 	 * Fail invalid features due to absence of the depended feature.
2368 	 *
2369 	 * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
2370 	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
2371 	 * By failing the invalid features sent down by untrusted drivers,
2372 	 * we're assured the assumption made upon is_index_valid() and
2373 	 * is_ctrl_vq_idx() will not be compromised.
2374 	 */
2375 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
2376             BIT_ULL(VIRTIO_NET_F_MQ))
2377 		return -EINVAL;
2378 
2379 	return 0;
2380 }
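
/*
 * Example of the dependency check above, with illustrative feature sets: a
 * set that has VIRTIO_NET_F_MQ but not VIRTIO_NET_F_CTRL_VQ satisfies
 *
 *	(features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
 *		== BIT_ULL(VIRTIO_NET_F_MQ)
 *
 * and is rejected with -EINVAL; any set that also carries CTRL_VQ passes.
 */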
2381 
2382 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
2383 {
2384 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2385 	int err;
2386 	int i;
2387 
2388 	for (i = 0; i < mvdev->max_vqs; i++) {
2389 		err = setup_vq(ndev, &ndev->vqs[i]);
2390 		if (err)
2391 			goto err_vq;
2392 	}
2393 
2394 	return 0;
2395 
2396 err_vq:
2397 	for (--i; i >= 0; i--)
2398 		teardown_vq(ndev, &ndev->vqs[i]);
2399 
2400 	return err;
2401 }
2402 
2403 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
2404 {
2405 	struct mlx5_vdpa_virtqueue *mvq;
2406 	int i;
2407 
2408 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
2409 		mvq = &ndev->vqs[i];
2410 		if (!mvq->initialized)
2411 			continue;
2412 
2413 		teardown_vq(ndev, mvq);
2414 	}
2415 }
2416 
2417 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
2418 {
2419 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
2420 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
2421 			/* MQ supported. CVQ index is right above the last data virtqueue's */
2422 			mvdev->max_idx = mvdev->max_vqs;
2423 		} else {
2424 			/* Only CVQ is supported. Data virtqueues occupy indices 0 and 1;
2425 			 * the CVQ gets index 2.
2426 			 */
2427 			mvdev->max_idx = 2;
2428 		}
2429 	} else {
2430 		/* Two data virtqueues only: one for rx and one for tx */
2431 		mvdev->max_idx = 1;
2432 	}
2433 }
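
/*
 * Worked example of the index space set up above, assuming max_vqs data
 * virtqueues: with CTRL_VQ and MQ negotiated, indices 0..max_vqs - 1 are
 * data vqs and index max_vqs is the CVQ; with CTRL_VQ but no MQ, only 0/1
 * (rx/tx) and 2 (CVQ) are valid; without CTRL_VQ, only indices 0 and 1 are.
 */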
2434 
2435 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2436 {
2437 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2438 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2439 	int err;
2440 
2441 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2442 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2443 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2444 	if (vport)
2445 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2446 
2447 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2448 	if (err)
2449 		return 0;
2450 
2451 	return MLX5_GET(query_vport_state_out, out, state);
2452 }
2453 
2454 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2455 {
2456 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2457 	    VPORT_STATE_UP)
2458 		return true;
2459 
2460 	return false;
2461 }
2462 
2463 static void update_carrier(struct work_struct *work)
2464 {
2465 	struct mlx5_vdpa_wq_ent *wqent;
2466 	struct mlx5_vdpa_dev *mvdev;
2467 	struct mlx5_vdpa_net *ndev;
2468 
2469 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2470 	mvdev = wqent->mvdev;
2471 	ndev = to_mlx5_vdpa_ndev(mvdev);
2472 	if (get_link_state(mvdev))
2473 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2474 	else
2475 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2476 
2477 	if (ndev->config_cb.callback)
2478 		ndev->config_cb.callback(ndev->config_cb.private);
2479 
2480 	kfree(wqent);
2481 }
2482 
2483 static int queue_link_work(struct mlx5_vdpa_net *ndev)
2484 {
2485 	struct mlx5_vdpa_wq_ent *wqent;
2486 
2487 	wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2488 	if (!wqent)
2489 		return -ENOMEM;
2490 
2491 	wqent->mvdev = &ndev->mvdev;
2492 	INIT_WORK(&wqent->work, update_carrier);
2493 	queue_work(ndev->mvdev.wq, &wqent->work);
2494 	return 0;
2495 }
2496 
2497 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
2498 {
2499 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2500 	struct mlx5_eqe *eqe = param;
2501 	int ret = NOTIFY_DONE;
2502 
2503 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2504 		switch (eqe->sub_type) {
2505 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2506 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2507 			if (queue_link_work(ndev))
2508 				return NOTIFY_DONE;
2509 
2510 			ret = NOTIFY_OK;
2511 			break;
2512 		default:
2513 			return NOTIFY_DONE;
2514 		}
2515 		return ret;
2516 	}
2517 	return ret;
2518 }
2519 
2520 static void register_link_notifier(struct mlx5_vdpa_net *ndev)
2521 {
2522 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS)))
2523 		return;
2524 
2525 	ndev->nb.notifier_call = event_handler;
2526 	mlx5_notifier_register(ndev->mvdev.mdev, &ndev->nb);
2527 	ndev->nb_registered = true;
2528 	queue_link_work(ndev);
2529 }
2530 
2531 static void unregister_link_notifier(struct mlx5_vdpa_net *ndev)
2532 {
2533 	if (!ndev->nb_registered)
2534 		return;
2535 
2536 	ndev->nb_registered = false;
2537 	mlx5_notifier_unregister(ndev->mvdev.mdev, &ndev->nb);
2538 	if (ndev->mvdev.wq)
2539 		flush_workqueue(ndev->mvdev.wq);
2540 }
2541 
2542 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2543 {
2544 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2545 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2546 	int err;
2547 
2548 	print_features(mvdev, features, true);
2549 
2550 	err = verify_driver_features(mvdev, features);
2551 	if (err)
2552 		return err;
2553 
2554 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2555 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
2556 		ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs);
2557 	else
2558 		ndev->rqt_size = 1;
2559 
2560 	/* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section
2561 	 * 5.1.6.5.5 "Device operation in multiqueue mode":
2562 	 *
2563 	 * Multiqueue is disabled by default.
2564 	 * The driver enables multiqueue by sending a command using class
2565 	 * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue
2566 	 * operation, as follows: ...
2567 	 */
2568 	ndev->cur_num_vqs = 2;
2569 
2570 	update_cvq_info(mvdev);
2571 	return err;
2572 }
2573 
2574 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2575 {
2576 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2577 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2578 
2579 	ndev->config_cb = *cb;
2580 }
2581 
2582 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
2583 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
2584 {
2585 	return MLX5_VDPA_MAX_VQ_ENTRIES;
2586 }
2587 
2588 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
2589 {
2590 	return VIRTIO_ID_NET;
2591 }
2592 
2593 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
2594 {
2595 	return PCI_VENDOR_ID_MELLANOX;
2596 }
2597 
2598 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2599 {
2600 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2601 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2602 
2603 	print_status(mvdev, ndev->mvdev.status, false);
2604 	return ndev->mvdev.status;
2605 }
2606 
2607 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2608 {
2609 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2610 	struct mlx5_virtq_attr attr = {};
2611 	int err;
2612 
2613 	if (mvq->initialized) {
2614 		err = query_virtqueue(ndev, mvq, &attr);
2615 		if (err)
2616 			return err;
2617 	}
2618 
2619 	ri->avail_index = attr.available_index;
2620 	ri->used_index = attr.used_index;
2621 	ri->ready = mvq->ready;
2622 	ri->num_ent = mvq->num_ent;
2623 	ri->desc_addr = mvq->desc_addr;
2624 	ri->device_addr = mvq->device_addr;
2625 	ri->driver_addr = mvq->driver_addr;
2626 	ri->map = mvq->map;
2627 	ri->restore = true;
2628 	return 0;
2629 }
2630 
2631 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2632 {
2633 	int i;
2634 
2635 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2636 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2637 		save_channel_info(ndev, &ndev->vqs[i]);
2638 	}
2639 	return 0;
2640 }
2641 
2642 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2643 {
2644 	int i;
2645 
2646 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2647 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2648 }
2649 
2650 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2651 {
2652 	struct mlx5_vdpa_virtqueue *mvq;
2653 	struct mlx5_vq_restore_info *ri;
2654 	int i;
2655 
2656 	mlx5_clear_vqs(ndev);
2657 	init_mvqs(ndev);
2658 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2659 		mvq = &ndev->vqs[i];
2660 		ri = &mvq->ri;
2661 		if (!ri->restore)
2662 			continue;
2663 
2664 		mvq->avail_idx = ri->avail_index;
2665 		mvq->used_idx = ri->used_index;
2666 		mvq->ready = ri->ready;
2667 		mvq->num_ent = ri->num_ent;
2668 		mvq->desc_addr = ri->desc_addr;
2669 		mvq->device_addr = ri->device_addr;
2670 		mvq->driver_addr = ri->driver_addr;
2671 		mvq->map = ri->map;
2672 	}
2673 }
2674 
2675 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
2676 				struct vhost_iotlb *iotlb, unsigned int asid)
2677 {
2678 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2679 	int err;
2680 
2681 	suspend_vqs(ndev);
2682 	err = save_channels_info(ndev);
2683 	if (err)
2684 		goto err_mr;
2685 
2686 	teardown_driver(ndev);
2687 	mlx5_vdpa_destroy_mr_asid(mvdev, asid);
2688 	err = mlx5_vdpa_create_mr(mvdev, iotlb, asid);
2689 	if (err)
2690 		goto err_mr;
2691 
2692 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
2693 		goto err_mr;
2694 
2695 	restore_channels_info(ndev);
2696 	err = setup_driver(mvdev);
2697 	if (err)
2698 		goto err_setup;
2699 
2700 	return 0;
2701 
2702 err_setup:
2703 	mlx5_vdpa_destroy_mr_asid(mvdev, asid);
2704 err_mr:
2705 	return err;
2706 }
2707 
2708 /* reslock must be held for this function */
2709 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2710 {
2711 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2712 	int err;
2713 
2714 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2715 
2716 	if (ndev->setup) {
2717 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2718 		err = 0;
2719 		goto out;
2720 	}
2721 	mlx5_vdpa_add_debugfs(ndev);
2722 
2723 	err = read_umem_params(ndev);
2724 	if (err)
2725 		goto err_setup;
2726 
2727 	err = setup_virtqueues(mvdev);
2728 	if (err) {
2729 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
2730 		goto err_setup;
2731 	}
2732 
2733 	err = create_rqt(ndev);
2734 	if (err) {
2735 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
2736 		goto err_rqt;
2737 	}
2738 
2739 	err = create_tir(ndev);
2740 	if (err) {
2741 		mlx5_vdpa_warn(mvdev, "create_tir\n");
2742 		goto err_tir;
2743 	}
2744 
2745 	err = setup_steering(ndev);
2746 	if (err) {
2747 		mlx5_vdpa_warn(mvdev, "setup_steering\n");
2748 		goto err_fwd;
2749 	}
2750 	ndev->setup = true;
2751 
2752 	return 0;
2753 
2754 err_fwd:
2755 	destroy_tir(ndev);
2756 err_tir:
2757 	destroy_rqt(ndev);
2758 err_rqt:
2759 	teardown_virtqueues(ndev);
2760 err_setup:
2761 	mlx5_vdpa_remove_debugfs(ndev);
2762 out:
2763 	return err;
2764 }
2765 
2766 /* reslock must be held for this function */
2767 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2768 {
2769 
2770 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2771 
2772 	if (!ndev->setup)
2773 		return;
2774 
2775 	mlx5_vdpa_remove_debugfs(ndev);
2776 	teardown_steering(ndev);
2777 	destroy_tir(ndev);
2778 	destroy_rqt(ndev);
2779 	teardown_virtqueues(ndev);
2780 	ndev->setup = false;
2781 }
2782 
2783 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2784 {
2785 	int i;
2786 
2787 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2788 		ndev->vqs[i].ready = false;
2789 
2790 	ndev->mvdev.cvq.ready = false;
2791 }
2792 
2793 static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev)
2794 {
2795 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2796 	int err = 0;
2797 
2798 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
2799 		u16 idx = cvq->vring.last_avail_idx;
2800 
2801 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
2802 					MLX5_CVQ_MAX_ENT, false,
2803 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
2804 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
2805 					(struct vring_used *)(uintptr_t)cvq->device_addr);
2806 
2807 		if (!err)
2808 			cvq->vring.last_avail_idx = cvq->vring.last_used_idx = idx;
2809 	}
2810 	return err;
2811 }
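
/*
 * Note that setup_cvq_vring() saves and restores last_avail_idx around
 * vringh_init_iotlb(), so re-initializing the control vring (e.g. after the
 * index was programmed through .set_vq_state) does not lose the position the
 * driver expects.
 */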
2812 
2813 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2814 {
2815 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2816 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2817 	int err;
2818 
2819 	print_status(mvdev, status, true);
2820 
2821 	down_write(&ndev->reslock);
2822 
2823 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2824 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2825 			err = setup_cvq_vring(mvdev);
2826 			if (err) {
2827 				mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n");
2828 				goto err_setup;
2829 			}
2830 			register_link_notifier(ndev);
2831 			err = setup_driver(mvdev);
2832 			if (err) {
2833 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
2834 				goto err_driver;
2835 			}
2836 		} else {
2837 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
2838 			goto err_clear;
2839 		}
2840 	}
2841 
2842 	ndev->mvdev.status = status;
2843 	up_write(&ndev->reslock);
2844 	return;
2845 
2846 err_driver:
2847 	unregister_link_notifier(ndev);
2848 err_setup:
2849 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2850 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
2851 err_clear:
2852 	up_write(&ndev->reslock);
2853 }
2854 
2855 static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev)
2856 {
2857 	int i;
2858 
2859 	/* By default, map all groups to ASID 0 */
2860 	for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++)
2861 		mvdev->group2asid[i] = 0;
2862 }
2863 
2864 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
2865 {
2866 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2867 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2868 
2869 	print_status(mvdev, 0, true);
2870 	mlx5_vdpa_info(mvdev, "performing device reset\n");
2871 
2872 	down_write(&ndev->reslock);
2873 	unregister_link_notifier(ndev);
2874 	teardown_driver(ndev);
2875 	clear_vqs_ready(ndev);
2876 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2877 	ndev->mvdev.status = 0;
2878 	ndev->mvdev.suspended = false;
2879 	ndev->cur_num_vqs = 0;
2880 	ndev->mvdev.cvq.received_desc = 0;
2881 	ndev->mvdev.cvq.completed_desc = 0;
2882 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
2883 	ndev->mvdev.actual_features = 0;
2884 	init_group_to_asid_map(mvdev);
2885 	++mvdev->generation;
2886 
2887 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2888 		if (mlx5_vdpa_create_mr(mvdev, NULL, 0))
2889 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
2890 	}
2891 	up_write(&ndev->reslock);
2892 
2893 	return 0;
2894 }
2895 
2896 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
2897 {
2898 	return sizeof(struct virtio_net_config);
2899 }
2900 
2901 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
2902 				 unsigned int len)
2903 {
2904 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2905 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2906 
2907 	if (offset + len <= sizeof(struct virtio_net_config))
2908 		memcpy(buf, (u8 *)&ndev->config + offset, len);
2909 }
2910 
2911 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
2912 				 unsigned int len)
2913 {
2914 	/* not supported */
2915 }
2916 
2917 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
2918 {
2919 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2920 
2921 	return mvdev->generation;
2922 }
2923 
2924 static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
2925 			unsigned int asid)
2926 {
2927 	bool change_map;
2928 	int err;
2929 
2930 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map, asid);
2931 	if (err) {
2932 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
2933 		return err;
2934 	}
2935 
2936 	if (change_map)
2937 		err = mlx5_vdpa_change_map(mvdev, iotlb, asid);
2938 
2939 	return err;
2940 }
2941 
2942 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
2943 			     struct vhost_iotlb *iotlb)
2944 {
2945 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2946 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2947 	int err = -EINVAL;
2948 
2949 	down_write(&ndev->reslock);
2950 	err = set_map_data(mvdev, iotlb, asid);
2951 	up_write(&ndev->reslock);
2952 	return err;
2953 }
2954 
2955 static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx)
2956 {
2957 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2958 
2959 	if (is_ctrl_vq_idx(mvdev, idx))
2960 		return &vdev->dev;
2961 
2962 	return mvdev->vdev.dma_dev;
2963 }
2964 
2965 static void free_irqs(struct mlx5_vdpa_net *ndev)
2966 {
2967 	struct mlx5_vdpa_irq_pool_entry *ent;
2968 	int i;
2969 
2970 	if (!msix_mode_supported(&ndev->mvdev))
2971 		return;
2972 
2973 	if (!ndev->irqp.entries)
2974 		return;
2975 
2976 	for (i = ndev->irqp.num_ent - 1; i >= 0; i--) {
2977 		ent = ndev->irqp.entries + i;
2978 		if (ent->map.virq)
2979 			pci_msix_free_irq(ndev->mvdev.mdev->pdev, ent->map);
2980 	}
2981 	kfree(ndev->irqp.entries);
2982 }
2983 
2984 static void mlx5_vdpa_free(struct vdpa_device *vdev)
2985 {
2986 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2987 	struct mlx5_core_dev *pfmdev;
2988 	struct mlx5_vdpa_net *ndev;
2989 
2990 	ndev = to_mlx5_vdpa_ndev(mvdev);
2991 
2992 	free_resources(ndev);
2993 	mlx5_vdpa_destroy_mr(mvdev);
2994 	if (!is_zero_ether_addr(ndev->config.mac)) {
2995 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2996 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
2997 	}
2998 	mlx5_vdpa_free_resources(&ndev->mvdev);
2999 	free_irqs(ndev);
3000 	kfree(ndev->event_cbs);
3001 	kfree(ndev->vqs);
3002 }
3003 
3004 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
3005 {
3006 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3007 	struct vdpa_notification_area ret = {};
3008 	struct mlx5_vdpa_net *ndev;
3009 	phys_addr_t addr;
3010 
3011 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
3012 		return ret;
3013 
3014 	/* If the SF BAR size is smaller than PAGE_SIZE, do not use direct
3015 	 * notification, to avoid the risk of mapping pages that contain the
3016 	 * BAR of more than one SF
3017 	 */
3018 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
3019 		return ret;
3020 
3021 	ndev = to_mlx5_vdpa_ndev(mvdev);
3022 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
3023 	ret.addr = addr;
3024 	ret.size = PAGE_SIZE;
3025 	return ret;
3026 }
3027 
3028 static int mlx5_get_vq_irq(struct vdpa_device *vdev, u16 idx)
3029 {
3030 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3031 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3032 	struct mlx5_vdpa_virtqueue *mvq;
3033 
3034 	if (!is_index_valid(mvdev, idx))
3035 		return -EINVAL;
3036 
3037 	if (is_ctrl_vq_idx(mvdev, idx))
3038 		return -EOPNOTSUPP;
3039 
3040 	mvq = &ndev->vqs[idx];
3041 	if (!mvq->map.virq)
3042 		return -EOPNOTSUPP;
3043 
3044 	return mvq->map.virq;
3045 }
3046 
3047 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
3048 {
3049 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3050 
3051 	return mvdev->actual_features;
3052 }
3053 
3054 static int counter_set_query(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
3055 			     u64 *received_desc, u64 *completed_desc)
3056 {
3057 	u32 in[MLX5_ST_SZ_DW(query_virtio_q_counters_in)] = {};
3058 	u32 out[MLX5_ST_SZ_DW(query_virtio_q_counters_out)] = {};
3059 	void *cmd_hdr;
3060 	void *ctx;
3061 	int err;
3062 
3063 	if (!counters_supported(&ndev->mvdev))
3064 		return -EOPNOTSUPP;
3065 
3066 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
3067 		return -EAGAIN;
3068 
3069 	cmd_hdr = MLX5_ADDR_OF(query_virtio_q_counters_in, in, hdr);
3070 
3071 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
3072 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
3073 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
3074 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->counter_set_id);
3075 
3076 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
3077 	if (err)
3078 		return err;
3079 
3080 	ctx = MLX5_ADDR_OF(query_virtio_q_counters_out, out, counters);
3081 	*received_desc = MLX5_GET64(virtio_q_counters, ctx, received_desc);
3082 	*completed_desc = MLX5_GET64(virtio_q_counters, ctx, completed_desc);
3083 	return 0;
3084 }
3085 
3086 static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx,
3087 					 struct sk_buff *msg,
3088 					 struct netlink_ext_ack *extack)
3089 {
3090 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3091 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3092 	struct mlx5_vdpa_virtqueue *mvq;
3093 	struct mlx5_control_vq *cvq;
3094 	u64 received_desc;
3095 	u64 completed_desc;
3096 	int err = 0;
3097 
3098 	down_read(&ndev->reslock);
3099 	if (!is_index_valid(mvdev, idx)) {
3100 		NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid");
3101 		err = -EINVAL;
3102 		goto out_err;
3103 	}
3104 
3105 	if (idx == ctrl_vq_idx(mvdev)) {
3106 		cvq = &mvdev->cvq;
3107 		received_desc = cvq->received_desc;
3108 		completed_desc = cvq->completed_desc;
3109 		goto out;
3110 	}
3111 
3112 	mvq = &ndev->vqs[idx];
3113 	err = counter_set_query(ndev, mvq, &received_desc, &completed_desc);
3114 	if (err) {
3115 		NL_SET_ERR_MSG_MOD(extack, "failed to query hardware");
3116 		goto out_err;
3117 	}
3118 
3119 out:
3120 	err = -EMSGSIZE;
3121 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "received_desc"))
3122 		goto out_err;
3123 
3124 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, received_desc,
3125 			      VDPA_ATTR_PAD))
3126 		goto out_err;
3127 
3128 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "completed_desc"))
3129 		goto out_err;
3130 
3131 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, completed_desc,
3132 			      VDPA_ATTR_PAD))
3133 		goto out_err;
3134 
3135 	err = 0;
3136 out_err:
3137 	up_read(&ndev->reslock);
3138 	return err;
3139 }
3140 
3141 static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
3142 {
3143 	struct mlx5_control_vq *cvq;
3144 
3145 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
3146 		return;
3147 
3148 	cvq = &mvdev->cvq;
3149 	cvq->ready = false;
3150 }
3151 
3152 static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
3153 {
3154 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3155 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3156 	struct mlx5_vdpa_virtqueue *mvq;
3157 	int i;
3158 
3159 	mlx5_vdpa_info(mvdev, "suspending device\n");
3160 
3161 	down_write(&ndev->reslock);
3162 	unregister_link_notifier(ndev);
3163 	for (i = 0; i < ndev->cur_num_vqs; i++) {
3164 		mvq = &ndev->vqs[i];
3165 		suspend_vq(ndev, mvq);
3166 	}
3167 	mlx5_vdpa_cvq_suspend(mvdev);
3168 	mvdev->suspended = true;
3169 	up_write(&ndev->reslock);
3170 	return 0;
3171 }
3172 
3173 static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,
3174 			       unsigned int asid)
3175 {
3176 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3177 
3178 	if (group >= MLX5_VDPA_NUMVQ_GROUPS)
3179 		return -EINVAL;
3180 
3181 	mvdev->group2asid[group] = asid;
3182 	return 0;
3183 }
3184 
3185 static const struct vdpa_config_ops mlx5_vdpa_ops = {
3186 	.set_vq_address = mlx5_vdpa_set_vq_address,
3187 	.set_vq_num = mlx5_vdpa_set_vq_num,
3188 	.kick_vq = mlx5_vdpa_kick_vq,
3189 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
3190 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
3191 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
3192 	.set_vq_state = mlx5_vdpa_set_vq_state,
3193 	.get_vq_state = mlx5_vdpa_get_vq_state,
3194 	.get_vendor_vq_stats = mlx5_vdpa_get_vendor_vq_stats,
3195 	.get_vq_notification = mlx5_get_vq_notification,
3196 	.get_vq_irq = mlx5_get_vq_irq,
3197 	.get_vq_align = mlx5_vdpa_get_vq_align,
3198 	.get_vq_group = mlx5_vdpa_get_vq_group,
3199 	.get_device_features = mlx5_vdpa_get_device_features,
3200 	.set_driver_features = mlx5_vdpa_set_driver_features,
3201 	.get_driver_features = mlx5_vdpa_get_driver_features,
3202 	.set_config_cb = mlx5_vdpa_set_config_cb,
3203 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
3204 	.get_device_id = mlx5_vdpa_get_device_id,
3205 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
3206 	.get_status = mlx5_vdpa_get_status,
3207 	.set_status = mlx5_vdpa_set_status,
3208 	.reset = mlx5_vdpa_reset,
3209 	.get_config_size = mlx5_vdpa_get_config_size,
3210 	.get_config = mlx5_vdpa_get_config,
3211 	.set_config = mlx5_vdpa_set_config,
3212 	.get_generation = mlx5_vdpa_get_generation,
3213 	.set_map = mlx5_vdpa_set_map,
3214 	.set_group_asid = mlx5_set_group_asid,
3215 	.get_vq_dma_dev = mlx5_get_vq_dma_dev,
3216 	.free = mlx5_vdpa_free,
3217 	.suspend = mlx5_vdpa_suspend,
3218 };
3219 
3220 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
3221 {
3222 	u16 hw_mtu;
3223 	int err;
3224 
3225 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
3226 	if (err)
3227 		return err;
3228 
3229 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
3230 	return 0;
3231 }
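
/*
 * Note on the MTU conversions: query_mtu() subtracts MLX5V_ETH_HARD_MTU from
 * the vport MTU reported by firmware, and config_func_mtu() below adds it
 * back when programming the vport. Assuming the constant covers the Ethernet
 * header, VLAN tag and FCS (14 + 4 + 4 = 22 bytes), a hardware MTU of 1522
 * is exposed to the guest as a virtio MTU of 1500.
 */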
3232 
3233 static int alloc_resources(struct mlx5_vdpa_net *ndev)
3234 {
3235 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3236 	int err;
3237 
3238 	if (res->valid) {
3239 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
3240 		return -EEXIST;
3241 	}
3242 
3243 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
3244 	if (err)
3245 		return err;
3246 
3247 	err = create_tis(ndev);
3248 	if (err)
3249 		goto err_tis;
3250 
3251 	res->valid = true;
3252 
3253 	return 0;
3254 
3255 err_tis:
3256 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3257 	return err;
3258 }
3259 
3260 static void free_resources(struct mlx5_vdpa_net *ndev)
3261 {
3262 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3263 
3264 	if (!res->valid)
3265 		return;
3266 
3267 	destroy_tis(ndev);
3268 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3269 	res->valid = false;
3270 }
3271 
3272 static void init_mvqs(struct mlx5_vdpa_net *ndev)
3273 {
3274 	struct mlx5_vdpa_virtqueue *mvq;
3275 	int i;
3276 
3277 	for (i = 0; i < ndev->mvdev.max_vqs; ++i) {
3278 		mvq = &ndev->vqs[i];
3279 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3280 		mvq->index = i;
3281 		mvq->ndev = ndev;
3282 		mvq->fwqp.fw = true;
3283 		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
3284 	}
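	/* i == ndev->mvdev.max_vqs at this point, so the loop below does not execute */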
3285 	for (; i < ndev->mvdev.max_vqs; i++) {
3286 		mvq = &ndev->vqs[i];
3287 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3288 		mvq->index = i;
3289 		mvq->ndev = ndev;
3290 	}
3291 }
3292 
3293 struct mlx5_vdpa_mgmtdev {
3294 	struct vdpa_mgmt_dev mgtdev;
3295 	struct mlx5_adev *madev;
3296 	struct mlx5_vdpa_net *ndev;
3297 };
3298 
3299 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
3300 {
3301 	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
3302 	void *in;
3303 	int err;
3304 
3305 	in = kvzalloc(inlen, GFP_KERNEL);
3306 	if (!in)
3307 		return -ENOMEM;
3308 
3309 	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
3310 	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
3311 		 mtu + MLX5V_ETH_HARD_MTU);
3312 	MLX5_SET(modify_nic_vport_context_in, in, opcode,
3313 		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
3314 
3315 	err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
3316 
3317 	kvfree(in);
3318 	return err;
3319 }
3320 
3321 static void allocate_irqs(struct mlx5_vdpa_net *ndev)
3322 {
3323 	struct mlx5_vdpa_irq_pool_entry *ent;
3324 	int i;
3325 
3326 	if (!msix_mode_supported(&ndev->mvdev))
3327 		return;
3328 
3329 	if (!ndev->mvdev.mdev->pdev)
3330 		return;
3331 
3332 	ndev->irqp.entries = kcalloc(ndev->mvdev.max_vqs, sizeof(*ndev->irqp.entries), GFP_KERNEL);
3333 	if (!ndev->irqp.entries)
3334 		return;
3335 
3336 
3337 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
3338 		ent = ndev->irqp.entries + i;
3339 		snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
3340 			 dev_name(&ndev->mvdev.vdev.dev), i);
3341 		ent->map = pci_msix_alloc_irq_at(ndev->mvdev.mdev->pdev, MSI_ANY_INDEX, NULL);
3342 		if (!ent->map.virq)
3343 			return;
3344 
3345 		ndev->irqp.num_ent++;
3346 	}
3347 }
3348 
3349 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
3350 			     const struct vdpa_dev_set_config *add_config)
3351 {
3352 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3353 	struct virtio_net_config *config;
3354 	struct mlx5_core_dev *pfmdev;
3355 	struct mlx5_vdpa_dev *mvdev;
3356 	struct mlx5_vdpa_net *ndev;
3357 	struct mlx5_core_dev *mdev;
3358 	u64 device_features;
3359 	u32 max_vqs;
3360 	u16 mtu;
3361 	int err;
3362 
3363 	if (mgtdev->ndev)
3364 		return -ENOSPC;
3365 
3366 	mdev = mgtdev->madev->mdev;
3367 	device_features = mgtdev->mgtdev.supported_features;
3368 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
3369 		if (add_config->device_features & ~device_features) {
3370 			dev_warn(mdev->device,
3371 				 "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
3372 				 add_config->device_features, device_features);
3373 			return -EINVAL;
3374 		}
3375 		device_features &= add_config->device_features;
3376 	} else {
3377 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
3378 	}
3379 	if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) &&
3380 	      device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) {
3381 		dev_warn(mdev->device,
3382 			 "Must provision minimum features 0x%llx for this device",
3383 			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM));
3384 		return -EOPNOTSUPP;
3385 	}
3386 
3387 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
3388 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
3389 		dev_warn(mdev->device, "missing support for split virtqueues\n");
3390 		return -EOPNOTSUPP;
3391 	}
3392 
3393 	max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues),
3394 			1 << MLX5_CAP_GEN(mdev, log_max_rqt_size));
3395 	if (max_vqs < 2) {
3396 		dev_warn(mdev->device,
3397 			 "%d virtqueues are supported. At least 2 are required\n",
3398 			 max_vqs);
3399 		return -EAGAIN;
3400 	}
3401 
3402 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
3403 		if (add_config->net.max_vq_pairs > max_vqs / 2)
3404 			return -EINVAL;
3405 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
3406 	} else {
3407 		max_vqs = 2;
3408 	}
3409 
3410 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
3411 				 MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false);
3412 	if (IS_ERR(ndev))
3413 		return PTR_ERR(ndev);
3414 
3415 	ndev->mvdev.max_vqs = max_vqs;
3416 	mvdev = &ndev->mvdev;
3417 	mvdev->mdev = mdev;
3418 
3419 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
3420 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
3421 	if (!ndev->vqs || !ndev->event_cbs) {
3422 		err = -ENOMEM;
3423 		goto err_alloc;
3424 	}
3425 
3426 	init_mvqs(ndev);
3427 	allocate_irqs(ndev);
3428 	init_rwsem(&ndev->reslock);
3429 	config = &ndev->config;
3430 
3431 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
3432 		err = config_func_mtu(mdev, add_config->net.mtu);
3433 		if (err)
3434 			goto err_alloc;
3435 	}
3436 
3437 	if (device_features & BIT_ULL(VIRTIO_NET_F_MTU)) {
3438 		err = query_mtu(mdev, &mtu);
3439 		if (err)
3440 			goto err_alloc;
3441 
3442 		ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
3443 	}
3444 
3445 	if (device_features & BIT_ULL(VIRTIO_NET_F_STATUS)) {
3446 		if (get_link_state(mvdev))
3447 			ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
3448 		else
3449 			ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
3450 	}
3451 
3452 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
3453 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
3454 	/* Don't bother setting the mac address in config if _F_MAC is not going to be provisioned */
3455 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0 ||
3456 		   device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3457 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
3458 		if (err)
3459 			goto err_alloc;
3460 	}
3461 
3462 	if (!is_zero_ether_addr(config->mac)) {
3463 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
3464 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
3465 		if (err)
3466 			goto err_alloc;
3467 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) {
3468 		/*
3469 		 * We used to clear the _F_MAC feature bit if a zero mac
3470 		 * address was seen while device features were not specifically
3471 		 * provisioned. Keep that behaviour so old scripts do not
3472 		 * break.
3473 		 */
3474 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
3475 	} else if (device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3476 		/* Don't provision zero mac address for _F_MAC */
3477 		mlx5_vdpa_warn(&ndev->mvdev,
3478 			       "No mac address provisioned?\n");
3479 		err = -EINVAL;
3480 		goto err_alloc;
3481 	}
3482 
3483 	if (device_features & BIT_ULL(VIRTIO_NET_F_MQ))
3484 		config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2);
3485 
3486 	ndev->mvdev.mlx_features = device_features;
3487 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
3488 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
3489 	if (err)
3490 		goto err_mpfs;
3491 
3492 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
3493 		err = mlx5_vdpa_create_mr(mvdev, NULL, 0);
3494 		if (err)
3495 			goto err_res;
3496 	}
3497 
3498 	err = alloc_resources(ndev);
3499 	if (err)
3500 		goto err_mr;
3501 
3502 	ndev->cvq_ent.mvdev = mvdev;
3503 	INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
3504 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
3505 	if (!mvdev->wq) {
3506 		err = -ENOMEM;
3507 		goto err_res2;
3508 	}
3509 
3510 	mvdev->vdev.mdev = &mgtdev->mgtdev;
3511 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
3512 	if (err)
3513 		goto err_reg;
3514 
3515 	mgtdev->ndev = ndev;
3516 	return 0;
3517 
3518 err_reg:
3519 	destroy_workqueue(mvdev->wq);
3520 err_res2:
3521 	free_resources(ndev);
3522 err_mr:
3523 	mlx5_vdpa_destroy_mr(mvdev);
3524 err_res:
3525 	mlx5_vdpa_free_resources(&ndev->mvdev);
3526 err_mpfs:
3527 	if (!is_zero_ether_addr(config->mac))
3528 		mlx5_mpfs_del_mac(pfmdev, config->mac);
3529 err_alloc:
3530 	put_device(&mvdev->vdev.dev);
3531 	return err;
3532 }
3533 
3534 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
3535 {
3536 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3537 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
3538 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3539 	struct workqueue_struct *wq;
3540 
3541 	unregister_link_notifier(ndev);
3542 	_vdpa_unregister_device(dev);
3543 	wq = mvdev->wq;
3544 	mvdev->wq = NULL;
3545 	destroy_workqueue(wq);
3546 	mgtdev->ndev = NULL;
3547 }
3548 
3549 static const struct vdpa_mgmtdev_ops mdev_ops = {
3550 	.dev_add = mlx5_vdpa_dev_add,
3551 	.dev_del = mlx5_vdpa_dev_del,
3552 };
3553 
3554 static struct virtio_device_id id_table[] = {
3555 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
3556 	{ 0 },
3557 };
3558 
3559 static int mlx5v_probe(struct auxiliary_device *adev,
3560 		       const struct auxiliary_device_id *id)
3561 
3562 {
3563 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
3564 	struct mlx5_core_dev *mdev = madev->mdev;
3565 	struct mlx5_vdpa_mgmtdev *mgtdev;
3566 	int err;
3567 
3568 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
3569 	if (!mgtdev)
3570 		return -ENOMEM;
3571 
3572 	mgtdev->mgtdev.ops = &mdev_ops;
3573 	mgtdev->mgtdev.device = mdev->device;
3574 	mgtdev->mgtdev.id_table = id_table;
3575 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
3576 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
3577 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) |
3578 					  BIT_ULL(VDPA_ATTR_DEV_FEATURES);
3579 	mgtdev->mgtdev.max_supported_vqs =
3580 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
3581 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
3582 	mgtdev->madev = madev;
3583 
3584 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
3585 	if (err)
3586 		goto reg_err;
3587 
3588 	auxiliary_set_drvdata(adev, mgtdev);
3589 
3590 	return 0;
3591 
3592 reg_err:
3593 	kfree(mgtdev);
3594 	return err;
3595 }
3596 
3597 static void mlx5v_remove(struct auxiliary_device *adev)
3598 {
3599 	struct mlx5_vdpa_mgmtdev *mgtdev;
3600 
3601 	mgtdev = auxiliary_get_drvdata(adev);
3602 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
3603 	kfree(mgtdev);
3604 }
3605 
3606 static const struct auxiliary_device_id mlx5v_id_table[] = {
3607 	{ .name = MLX5_ADEV_NAME ".vnet", },
3608 	{},
3609 };
3610 
3611 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
3612 
3613 static struct auxiliary_driver mlx5v_driver = {
3614 	.name = "vnet",
3615 	.probe = mlx5v_probe,
3616 	.remove = mlx5v_remove,
3617 	.id_table = mlx5v_id_table,
3618 };
3619 
3620 module_auxiliary_driver(mlx5v_driver);
3621