xref: /openbmc/linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision 703e7713)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <linux/virtio_config.h>
11 #include <linux/auxiliary_bus.h>
12 #include <linux/mlx5/cq.h>
13 #include <linux/mlx5/qp.h>
14 #include <linux/mlx5/device.h>
15 #include <linux/mlx5/driver.h>
16 #include <linux/mlx5/vport.h>
17 #include <linux/mlx5/fs.h>
18 #include <linux/mlx5/mlx5_ifc_vdpa.h>
19 #include <linux/mlx5/mpfs.h>
20 #include "mlx5_vdpa.h"
21 #include "mlx5_vnet.h"
22 
23 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
24 MODULE_DESCRIPTION("Mellanox VDPA driver");
25 MODULE_LICENSE("Dual BSD/GPL");
26 
/* Set of virtio feature bits this file knows by name. print_features() warns
 * about any bit that falls outside this mask.
 */
#define VALID_FEATURES_MASK \
	(BIT_ULL(VIRTIO_NET_F_CSUM) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | \
	 BIT_ULL(VIRTIO_NET_F_MTU) | \
	 BIT_ULL(VIRTIO_NET_F_MAC) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | \
	 BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | \
	 BIT_ULL(VIRTIO_NET_F_HOST_ECN) | \
	 BIT_ULL(VIRTIO_NET_F_HOST_UFO) | \
	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | \
	 BIT_ULL(VIRTIO_NET_F_STATUS) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_VQ) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) | \
	 BIT_ULL(VIRTIO_NET_F_MQ) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | \
	 BIT_ULL(VIRTIO_NET_F_HASH_REPORT) | \
	 BIT_ULL(VIRTIO_NET_F_RSS) | \
	 BIT_ULL(VIRTIO_NET_F_RSC_EXT) | \
	 BIT_ULL(VIRTIO_NET_F_STANDBY) | \
	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | \
	 BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) | \
	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | \
	 BIT_ULL(VIRTIO_F_VERSION_1) | \
	 BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) | \
	 BIT_ULL(VIRTIO_F_RING_PACKED) | \
	 BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | \
	 BIT_ULL(VIRTIO_F_SR_IOV))
41 
/* Set of virtio status bits this file knows by name. print_status() warns
 * about any bit that falls outside this mask.
 */
#define VALID_STATUS_MASK \
	(VIRTIO_CONFIG_S_ACKNOWLEDGE | \
	 VIRTIO_CONFIG_S_DRIVER | \
	 VIRTIO_CONFIG_S_DRIVER_OK | \
	 VIRTIO_CONFIG_S_FEATURES_OK | \
	 VIRTIO_CONFIG_S_NEEDS_RESET | \
	 VIRTIO_CONFIG_S_FAILED)
45 
/* Evaluates to 1 when virtio feature bit number _feature is set in
 * _mvdev->actual_features, 0 otherwise.
 */
#define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))

/* NOTE(review): not used in the visible part of the file; presumably a marker
 * for untagged traffic - confirm against its users.
 */
#define MLX5V_UNTAGGED 0x1000
49 
50 struct mlx5_vdpa_cq_buf {
51 	struct mlx5_frag_buf_ctrl fbc;
52 	struct mlx5_frag_buf frag_buf;
53 	int cqe_size;
54 	int nent;
55 };
56 
57 struct mlx5_vdpa_cq {
58 	struct mlx5_core_cq mcq;
59 	struct mlx5_vdpa_cq_buf buf;
60 	struct mlx5_db db;
61 	int cqe;
62 };
63 
64 struct mlx5_vdpa_umem {
65 	struct mlx5_frag_buf_ctrl fbc;
66 	struct mlx5_frag_buf frag_buf;
67 	int size;
68 	u32 id;
69 };
70 
71 struct mlx5_vdpa_qp {
72 	struct mlx5_core_qp mqp;
73 	struct mlx5_frag_buf frag_buf;
74 	struct mlx5_db db;
75 	u16 head;
76 	bool fw;
77 };
78 
79 struct mlx5_vq_restore_info {
80 	u32 num_ent;
81 	u64 desc_addr;
82 	u64 device_addr;
83 	u64 driver_addr;
84 	u16 avail_index;
85 	u16 used_index;
86 	struct msi_map map;
87 	bool ready;
88 	bool restore;
89 };
90 
91 struct mlx5_vdpa_virtqueue {
92 	bool ready;
93 	u64 desc_addr;
94 	u64 device_addr;
95 	u64 driver_addr;
96 	u32 num_ent;
97 
98 	/* Resources for implementing the notification channel from the device
99 	 * to the driver. fwqp is the firmware end of an RC connection; the
100 	 * other end is vqqp used by the driver. cq is where completions are
101 	 * reported.
102 	 */
103 	struct mlx5_vdpa_cq cq;
104 	struct mlx5_vdpa_qp fwqp;
105 	struct mlx5_vdpa_qp vqqp;
106 
107 	/* umem resources are required for the virtqueue operation. They're use
108 	 * is internal and they must be provided by the driver.
109 	 */
110 	struct mlx5_vdpa_umem umem1;
111 	struct mlx5_vdpa_umem umem2;
112 	struct mlx5_vdpa_umem umem3;
113 
114 	u32 counter_set_id;
115 	bool initialized;
116 	int index;
117 	u32 virtq_id;
118 	struct mlx5_vdpa_net *ndev;
119 	u16 avail_idx;
120 	u16 used_idx;
121 	int fw_state;
122 	struct msi_map map;
123 
124 	/* keep last in the struct */
125 	struct mlx5_vq_restore_info ri;
126 };
127 
128 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
129 {
130 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
131 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
132 			return idx < 2;
133 		else
134 			return idx < 3;
135 	}
136 
137 	return idx <= mvdev->max_idx;
138 }
139 
140 static void free_resources(struct mlx5_vdpa_net *ndev);
141 static void init_mvqs(struct mlx5_vdpa_net *ndev);
142 static int setup_driver(struct mlx5_vdpa_dev *mvdev);
143 static void teardown_driver(struct mlx5_vdpa_net *ndev);
144 
145 static bool mlx5_vdpa_debug;
146 
147 #define MLX5_CVQ_MAX_ENT 16
148 
/* Log the name of feature bit _feature if it is set. Relies on the caller
 * having variables named "features" and "mvdev" in scope.
 */
#define MLX5_LOG_VIO_FLAG(_feature) \
	do { \
		if (features & BIT_ULL(_feature)) \
			mlx5_vdpa_info(mvdev, "%s\n", #_feature); \
	} while (0)

/* Log the name of status flag _status if it is set. Relies on the caller
 * having variables named "status" and "mvdev" in scope.
 */
#define MLX5_LOG_VIO_STAT(_status) \
	do { \
		if (status & (_status)) \
			mlx5_vdpa_info(mvdev, "%s\n", #_status); \
	} while (0)
160 
161 /* TODO: cross-endian support */
162 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
163 {
164 	return virtio_legacy_is_little_endian() ||
165 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
166 }
167 
168 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
169 {
170 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
171 }
172 
173 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
174 {
175 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
176 }
177 
178 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
179 {
180 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
181 		return 2;
182 
183 	return mvdev->max_vqs;
184 }
185 
186 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
187 {
188 	return idx == ctrl_vq_idx(mvdev);
189 }
190 
191 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
192 {
193 	if (status & ~VALID_STATUS_MASK)
194 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
195 			       status & ~VALID_STATUS_MASK);
196 
197 	if (!mlx5_vdpa_debug)
198 		return;
199 
200 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
201 	if (set && !status) {
202 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
203 		return;
204 	}
205 
206 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
207 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
208 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
209 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
210 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
211 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
212 }
213 
214 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
215 {
216 	if (features & ~VALID_FEATURES_MASK)
217 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
218 			       features & ~VALID_FEATURES_MASK);
219 
220 	if (!mlx5_vdpa_debug)
221 		return;
222 
223 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
224 	if (!features)
225 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
226 
227 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
228 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
229 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
230 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
231 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
232 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
233 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
234 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
235 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
236 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
237 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
238 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
239 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
240 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
241 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
242 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
243 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
244 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
245 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
246 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
247 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
248 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
249 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
250 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
251 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
252 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
253 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
254 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
255 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
256 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
257 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
258 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
259 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
260 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
261 }
262 
263 static int create_tis(struct mlx5_vdpa_net *ndev)
264 {
265 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
266 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
267 	void *tisc;
268 	int err;
269 
270 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
271 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
272 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
273 	if (err)
274 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
275 
276 	return err;
277 }
278 
279 static void destroy_tis(struct mlx5_vdpa_net *ndev)
280 {
281 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
282 }
283 
/* Size in bytes of one completion queue entry, and its base-2 logarithm. */
#define MLX5_VDPA_CQE_SIZE 64
#define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
286 
287 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
288 {
289 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
290 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
291 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
292 	int err;
293 
294 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
295 				       ndev->mvdev.mdev->priv.numa_node);
296 	if (err)
297 		return err;
298 
299 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
300 
301 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
302 	buf->nent = nent;
303 
304 	return 0;
305 }
306 
307 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
308 {
309 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
310 
311 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
312 					ndev->mvdev.mdev->priv.numa_node);
313 }
314 
315 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
316 {
317 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
318 }
319 
320 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
321 {
322 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
323 }
324 
325 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
326 {
327 	struct mlx5_cqe64 *cqe64;
328 	void *cqe;
329 	int i;
330 
331 	for (i = 0; i < buf->nent; i++) {
332 		cqe = get_cqe(vcq, i);
333 		cqe64 = cqe;
334 		cqe64->op_own = MLX5_CQE_INVALID << 4;
335 	}
336 }
337 
338 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
339 {
340 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
341 
342 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
343 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
344 		return cqe64;
345 
346 	return NULL;
347 }
348 
349 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
350 {
351 	vqp->head += n;
352 	vqp->db.db[0] = cpu_to_be32(vqp->head);
353 }
354 
355 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
356 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
357 {
358 	struct mlx5_vdpa_qp *vqp;
359 	__be64 *pas;
360 	void *qpc;
361 
362 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
363 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
364 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
365 	if (vqp->fw) {
366 		/* Firmware QP is allocated by the driver for the firmware's
367 		 * use so we can skip part of the params as they will be chosen by firmware
368 		 */
369 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
370 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
371 		MLX5_SET(qpc, qpc, no_sq, 1);
372 		return;
373 	}
374 
375 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
376 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
377 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
378 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
379 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
380 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
381 	MLX5_SET(qpc, qpc, no_sq, 1);
382 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
383 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
384 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
385 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
386 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
387 }
388 
389 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
390 {
391 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
392 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
393 					ndev->mvdev.mdev->priv.numa_node);
394 }
395 
396 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
397 {
398 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
399 }
400 
401 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
402 		     struct mlx5_vdpa_qp *vqp)
403 {
404 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
405 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
406 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
407 	void *qpc;
408 	void *in;
409 	int err;
410 
411 	if (!vqp->fw) {
412 		vqp = &mvq->vqqp;
413 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
414 		if (err)
415 			return err;
416 
417 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
418 		if (err)
419 			goto err_db;
420 		inlen += vqp->frag_buf.npages * sizeof(__be64);
421 	}
422 
423 	in = kzalloc(inlen, GFP_KERNEL);
424 	if (!in) {
425 		err = -ENOMEM;
426 		goto err_kzalloc;
427 	}
428 
429 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
430 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
431 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
432 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
433 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
434 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
435 	if (!vqp->fw)
436 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
437 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
438 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
439 	kfree(in);
440 	if (err)
441 		goto err_kzalloc;
442 
443 	vqp->mqp.uid = ndev->mvdev.res.uid;
444 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
445 
446 	if (!vqp->fw)
447 		rx_post(vqp, mvq->num_ent);
448 
449 	return 0;
450 
451 err_kzalloc:
452 	if (!vqp->fw)
453 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
454 err_db:
455 	if (!vqp->fw)
456 		rq_buf_free(ndev, vqp);
457 
458 	return err;
459 }
460 
461 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
462 {
463 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
464 
465 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
466 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
467 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
468 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
469 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
470 	if (!vqp->fw) {
471 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
472 		rq_buf_free(ndev, vqp);
473 	}
474 }
475 
476 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
477 {
478 	return get_sw_cqe(cq, cq->mcq.cons_index);
479 }
480 
481 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
482 {
483 	struct mlx5_cqe64 *cqe64;
484 
485 	cqe64 = next_cqe_sw(vcq);
486 	if (!cqe64)
487 		return -EAGAIN;
488 
489 	vcq->mcq.cons_index++;
490 	return 0;
491 }
492 
493 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
494 {
495 	struct mlx5_vdpa_net *ndev = mvq->ndev;
496 	struct vdpa_callback *event_cb;
497 
498 	event_cb = &ndev->event_cbs[mvq->index];
499 	mlx5_cq_set_ci(&mvq->cq.mcq);
500 
501 	/* make sure CQ cosumer update is visible to the hardware before updating
502 	 * RX doorbell record.
503 	 */
504 	dma_wmb();
505 	rx_post(&mvq->vqqp, num);
506 	if (event_cb->callback)
507 		event_cb->callback(event_cb->private);
508 }
509 
510 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
511 {
512 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
513 	struct mlx5_vdpa_net *ndev = mvq->ndev;
514 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
515 	int num = 0;
516 
517 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
518 		num++;
519 		if (num > mvq->num_ent / 2) {
520 			/* If completions keep coming while we poll, we want to
521 			 * let the hardware know that we consumed them by
522 			 * updating the doorbell record.  We also let vdpa core
523 			 * know about this so it passes it on the virtio driver
524 			 * on the guest.
525 			 */
526 			mlx5_vdpa_handle_completions(mvq, num);
527 			num = 0;
528 		}
529 	}
530 
531 	if (num)
532 		mlx5_vdpa_handle_completions(mvq, num);
533 
534 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
535 }
536 
537 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
538 {
539 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
540 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
541 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
542 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
543 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
544 	__be64 *pas;
545 	int inlen;
546 	void *cqc;
547 	void *in;
548 	int err;
549 	int eqn;
550 
551 	err = mlx5_db_alloc(mdev, &vcq->db);
552 	if (err)
553 		return err;
554 
555 	vcq->mcq.set_ci_db = vcq->db.db;
556 	vcq->mcq.arm_db = vcq->db.db + 1;
557 	vcq->mcq.cqe_sz = 64;
558 
559 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
560 	if (err)
561 		goto err_db;
562 
563 	cq_frag_buf_init(vcq, &vcq->buf);
564 
565 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
566 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
567 	in = kzalloc(inlen, GFP_KERNEL);
568 	if (!in) {
569 		err = -ENOMEM;
570 		goto err_vzalloc;
571 	}
572 
573 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
574 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
575 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
576 
577 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
578 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
579 
580 	/* Use vector 0 by default. Consider adding code to choose least used
581 	 * vector.
582 	 */
583 	err = mlx5_vector2eqn(mdev, 0, &eqn);
584 	if (err)
585 		goto err_vec;
586 
587 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
588 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
589 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
590 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
591 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
592 
593 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
594 	if (err)
595 		goto err_vec;
596 
597 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
598 	vcq->cqe = num_ent;
599 	vcq->mcq.set_ci_db = vcq->db.db;
600 	vcq->mcq.arm_db = vcq->db.db + 1;
601 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
602 	kfree(in);
603 	return 0;
604 
605 err_vec:
606 	kfree(in);
607 err_vzalloc:
608 	cq_frag_buf_free(ndev, &vcq->buf);
609 err_db:
610 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
611 	return err;
612 }
613 
614 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
615 {
616 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
617 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
618 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
619 
620 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
621 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
622 		return;
623 	}
624 	cq_frag_buf_free(ndev, &vcq->buf);
625 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
626 }
627 
628 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
629 			  struct mlx5_vdpa_umem **umemp)
630 {
631 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
632 	int p_a;
633 	int p_b;
634 
635 	switch (num) {
636 	case 1:
637 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
638 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
639 		*umemp = &mvq->umem1;
640 		break;
641 	case 2:
642 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
643 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
644 		*umemp = &mvq->umem2;
645 		break;
646 	case 3:
647 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
648 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
649 		*umemp = &mvq->umem3;
650 		break;
651 	}
652 	(*umemp)->size = p_a * mvq->num_ent + p_b;
653 }
654 
655 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
656 {
657 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
658 }
659 
660 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
661 {
662 	int inlen;
663 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
664 	void *um;
665 	void *in;
666 	int err;
667 	__be64 *pas;
668 	struct mlx5_vdpa_umem *umem;
669 
670 	set_umem_size(ndev, mvq, num, &umem);
671 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
672 	if (err)
673 		return err;
674 
675 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
676 
677 	in = kzalloc(inlen, GFP_KERNEL);
678 	if (!in) {
679 		err = -ENOMEM;
680 		goto err_in;
681 	}
682 
683 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
684 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
685 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
686 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
687 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
688 
689 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
690 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
691 
692 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
693 	if (err) {
694 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
695 		goto err_cmd;
696 	}
697 
698 	kfree(in);
699 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
700 
701 	return 0;
702 
703 err_cmd:
704 	kfree(in);
705 err_in:
706 	umem_frag_buf_free(ndev, umem);
707 	return err;
708 }
709 
710 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
711 {
712 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
713 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
714 	struct mlx5_vdpa_umem *umem;
715 
716 	switch (num) {
717 	case 1:
718 		umem = &mvq->umem1;
719 		break;
720 	case 2:
721 		umem = &mvq->umem2;
722 		break;
723 	case 3:
724 		umem = &mvq->umem3;
725 		break;
726 	}
727 
728 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
729 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
730 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
731 		return;
732 
733 	umem_frag_buf_free(ndev, umem);
734 }
735 
/* Create umems 1..3 of @mvq.
 *
 * Returns 0 on success. On failure the umems created so far are destroyed in
 * reverse order and the error is returned.
 */
static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int num;
	int err;

	for (num = 1; num <= 3; num++) {
		err = create_umem(ndev, mvq, num);
		if (!err)
			continue;

		/* unwind the ones that were created before the failure */
		while (--num > 0)
			umem_destroy(ndev, mvq, num);

		return err;
	}

	return 0;
}
754 
/* Destroy umems 3, 2 and 1 of @mvq, in that order. */
static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int num = 3;

	while (num > 0) {
		umem_destroy(ndev, mvq, num);
		num--;
	}
}
762 
763 static int get_queue_type(struct mlx5_vdpa_net *ndev)
764 {
765 	u32 type_mask;
766 
767 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
768 
769 	/* prefer split queue */
770 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
771 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
772 
773 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
774 
775 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
776 }
777 
778 static bool vq_is_tx(u16 idx)
779 {
780 	return idx % 2;
781 }
782 
/* Bit positions used by get_features() when it packs virtio-net feature bits
 * into the 13-bit mask that create_virtqueue() writes to the queue object
 * (queue_feature_bit_mask_12_3 and queue_feature_bit_mask_2_0).
 */
enum {
	MLX5_VIRTIO_NET_F_MRG_RXBUF = 2,
	MLX5_VIRTIO_NET_F_HOST_ECN = 4,
	MLX5_VIRTIO_NET_F_GUEST_ECN = 6,
	MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7,
	MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8,
	MLX5_VIRTIO_NET_F_GUEST_CSUM = 9,
	MLX5_VIRTIO_NET_F_CSUM = 10,
	MLX5_VIRTIO_NET_F_HOST_TSO6 = 11,
	MLX5_VIRTIO_NET_F_HOST_TSO4 = 12,
};
794 
795 static u16 get_features(u64 features)
796 {
797 	return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) |
798 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) |
799 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) |
800 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) |
801 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) |
802 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) |
803 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) |
804 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4);
805 }
806 
807 static bool counters_supported(const struct mlx5_vdpa_dev *mvdev)
808 {
809 	return MLX5_CAP_GEN_64(mvdev->mdev, general_obj_types) &
810 	       BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
811 }
812 
813 static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev)
814 {
815 	return MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) &
816 		(1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE) &&
817 		pci_msix_can_alloc_dyn(mvdev->mdev->pdev);
818 }
819 
820 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
821 {
822 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
823 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
824 	void *obj_context;
825 	u16 mlx_features;
826 	void *cmd_hdr;
827 	void *vq_ctx;
828 	void *in;
829 	int err;
830 
831 	err = umems_create(ndev, mvq);
832 	if (err)
833 		return err;
834 
835 	in = kzalloc(inlen, GFP_KERNEL);
836 	if (!in) {
837 		err = -ENOMEM;
838 		goto err_alloc;
839 	}
840 
841 	mlx_features = get_features(ndev->mvdev.actual_features);
842 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
843 
844 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
845 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
846 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
847 
848 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
849 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
850 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
851 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
852 		 mlx_features >> 3);
853 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
854 		 mlx_features & 7);
855 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
856 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
857 
858 	if (vq_is_tx(mvq->index))
859 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
860 
861 	if (mvq->map.virq) {
862 		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE);
863 		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index);
864 	} else {
865 		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
866 		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
867 	}
868 
869 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
870 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
871 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
872 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
873 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
874 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
875 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
876 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey);
877 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
878 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
879 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
880 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
881 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
882 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
883 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
884 	if (counters_supported(&ndev->mvdev))
885 		MLX5_SET(virtio_q, vq_ctx, counter_set_id, mvq->counter_set_id);
886 
887 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
888 	if (err)
889 		goto err_cmd;
890 
891 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
892 	kfree(in);
893 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
894 
895 	return 0;
896 
897 err_cmd:
898 	kfree(in);
899 err_alloc:
900 	umems_destroy(ndev, mvq);
901 	return err;
902 }
903 
904 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
905 {
906 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
907 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
908 
909 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
910 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
911 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
912 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
913 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
914 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
915 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
916 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
917 		return;
918 	}
919 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
920 	umems_destroy(ndev, mvq);
921 }
922 
923 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
924 {
925 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
926 }
927 
928 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
929 {
930 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
931 }
932 
933 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
934 			int *outlen, u32 qpn, u32 rqpn)
935 {
936 	void *qpc;
937 	void *pp;
938 
939 	switch (cmd) {
940 	case MLX5_CMD_OP_2RST_QP:
941 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
942 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
943 		*in = kzalloc(*inlen, GFP_KERNEL);
944 		*out = kzalloc(*outlen, GFP_KERNEL);
945 		if (!*in || !*out)
946 			goto outerr;
947 
948 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
949 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
950 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
951 		break;
952 	case MLX5_CMD_OP_RST2INIT_QP:
953 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
954 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
955 		*in = kzalloc(*inlen, GFP_KERNEL);
956 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
957 		if (!*in || !*out)
958 			goto outerr;
959 
960 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
961 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
962 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
963 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
964 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
965 		MLX5_SET(qpc, qpc, rwe, 1);
966 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
967 		MLX5_SET(ads, pp, vhca_port_num, 1);
968 		break;
969 	case MLX5_CMD_OP_INIT2RTR_QP:
970 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
971 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
972 		*in = kzalloc(*inlen, GFP_KERNEL);
973 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
974 		if (!*in || !*out)
975 			goto outerr;
976 
977 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
978 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
979 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
980 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
981 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
982 		MLX5_SET(qpc, qpc, log_msg_max, 30);
983 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
984 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
985 		MLX5_SET(ads, pp, fl, 1);
986 		break;
987 	case MLX5_CMD_OP_RTR2RTS_QP:
988 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
989 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
990 		*in = kzalloc(*inlen, GFP_KERNEL);
991 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
992 		if (!*in || !*out)
993 			goto outerr;
994 
995 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
996 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
997 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
998 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
999 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1000 		MLX5_SET(ads, pp, ack_timeout, 14);
1001 		MLX5_SET(qpc, qpc, retry_count, 7);
1002 		MLX5_SET(qpc, qpc, rnr_retry, 7);
1003 		break;
1004 	default:
1005 		goto outerr_nullify;
1006 	}
1007 
1008 	return;
1009 
1010 outerr:
1011 	kfree(*in);
1012 	kfree(*out);
1013 outerr_nullify:
1014 	*in = NULL;
1015 	*out = NULL;
1016 }
1017 
/* Release the command buffers handed out by alloc_inout(). */
static void free_inout(void *in, void *out)
{
	kfree(out);
	kfree(in);
}
1023 
1024 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1025  * firmware. The fw argument indicates whether the subjected QP is the one used
1026  * by firmware.
1027  */
1028 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1029 {
1030 	int outlen;
1031 	int inlen;
1032 	void *out;
1033 	void *in;
1034 	int err;
1035 
1036 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1037 	if (!in || !out)
1038 		return -ENOMEM;
1039 
1040 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1041 	free_inout(in, out);
1042 	return err;
1043 }
1044 
1045 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1046 {
1047 	int err;
1048 
1049 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1050 	if (err)
1051 		return err;
1052 
1053 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1054 	if (err)
1055 		return err;
1056 
1057 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1058 	if (err)
1059 		return err;
1060 
1061 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1062 	if (err)
1063 		return err;
1064 
1065 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1066 	if (err)
1067 		return err;
1068 
1069 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1070 	if (err)
1071 		return err;
1072 
1073 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1074 }
1075 
1076 struct mlx5_virtq_attr {
1077 	u8 state;
1078 	u16 available_index;
1079 	u16 used_index;
1080 };
1081 
1082 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1083 			   struct mlx5_virtq_attr *attr)
1084 {
1085 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1086 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1087 	void *out;
1088 	void *obj_context;
1089 	void *cmd_hdr;
1090 	int err;
1091 
1092 	out = kzalloc(outlen, GFP_KERNEL);
1093 	if (!out)
1094 		return -ENOMEM;
1095 
1096 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1097 
1098 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1099 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1100 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1101 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1102 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1103 	if (err)
1104 		goto err_cmd;
1105 
1106 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1107 	memset(attr, 0, sizeof(*attr));
1108 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1109 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1110 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1111 	kfree(out);
1112 	return 0;
1113 
1114 err_cmd:
1115 	kfree(out);
1116 	return err;
1117 }
1118 
1119 static bool is_valid_state_change(int oldstate, int newstate)
1120 {
1121 	switch (oldstate) {
1122 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
1123 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
1124 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
1125 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
1126 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
1127 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR:
1128 	default:
1129 		return false;
1130 	}
1131 }
1132 
1133 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1134 {
1135 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1136 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1137 	void *obj_context;
1138 	void *cmd_hdr;
1139 	void *in;
1140 	int err;
1141 
1142 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
1143 		return 0;
1144 
1145 	if (!is_valid_state_change(mvq->fw_state, state))
1146 		return -EINVAL;
1147 
1148 	in = kzalloc(inlen, GFP_KERNEL);
1149 	if (!in)
1150 		return -ENOMEM;
1151 
1152 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1153 
1154 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1155 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1156 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1157 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1158 
1159 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1160 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1161 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1162 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1163 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1164 	kfree(in);
1165 	if (!err)
1166 		mvq->fw_state = state;
1167 
1168 	return err;
1169 }
1170 
1171 static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1172 {
1173 	u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {};
1174 	u32 out[MLX5_ST_SZ_DW(create_virtio_q_counters_out)] = {};
1175 	void *cmd_hdr;
1176 	int err;
1177 
1178 	if (!counters_supported(&ndev->mvdev))
1179 		return 0;
1180 
1181 	cmd_hdr = MLX5_ADDR_OF(create_virtio_q_counters_in, in, hdr);
1182 
1183 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
1184 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1185 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1186 
1187 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
1188 	if (err)
1189 		return err;
1190 
1191 	mvq->counter_set_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
1192 
1193 	return 0;
1194 }
1195 
1196 static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1197 {
1198 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_q_counters_in)] = {};
1199 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_q_counters_out)] = {};
1200 
1201 	if (!counters_supported(&ndev->mvdev))
1202 		return;
1203 
1204 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
1205 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_id, mvq->counter_set_id);
1206 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.uid, ndev->mvdev.res.uid);
1207 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1208 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
1209 		mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id);
1210 }
1211 
1212 static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv)
1213 {
1214 	struct vdpa_callback *cb = priv;
1215 
1216 	if (cb->callback)
1217 		return cb->callback(cb->private);
1218 
1219 	return IRQ_HANDLED;
1220 }
1221 
1222 static void alloc_vector(struct mlx5_vdpa_net *ndev,
1223 			 struct mlx5_vdpa_virtqueue *mvq)
1224 {
1225 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1226 	struct mlx5_vdpa_irq_pool_entry *ent;
1227 	int err;
1228 	int i;
1229 
1230 	for (i = 0; i < irqp->num_ent; i++) {
1231 		ent = &irqp->entries[i];
1232 		if (!ent->used) {
1233 			snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
1234 				 dev_name(&ndev->mvdev.vdev.dev), mvq->index);
1235 			ent->dev_id = &ndev->event_cbs[mvq->index];
1236 			err = request_irq(ent->map.virq, mlx5_vdpa_int_handler, 0,
1237 					  ent->name, ent->dev_id);
1238 			if (err)
1239 				return;
1240 
1241 			ent->used = true;
1242 			mvq->map = ent->map;
1243 			return;
1244 		}
1245 	}
1246 }
1247 
1248 static void dealloc_vector(struct mlx5_vdpa_net *ndev,
1249 			   struct mlx5_vdpa_virtqueue *mvq)
1250 {
1251 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1252 	int i;
1253 
1254 	for (i = 0; i < irqp->num_ent; i++)
1255 		if (mvq->map.virq == irqp->entries[i].map.virq) {
1256 			free_irq(mvq->map.virq, irqp->entries[i].dev_id);
1257 			irqp->entries[i].used = false;
1258 			return;
1259 		}
1260 }
1261 
1262 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1263 {
1264 	u16 idx = mvq->index;
1265 	int err;
1266 
1267 	if (!mvq->num_ent)
1268 		return 0;
1269 
1270 	if (mvq->initialized)
1271 		return 0;
1272 
1273 	err = cq_create(ndev, idx, mvq->num_ent);
1274 	if (err)
1275 		return err;
1276 
1277 	err = qp_create(ndev, mvq, &mvq->fwqp);
1278 	if (err)
1279 		goto err_fwqp;
1280 
1281 	err = qp_create(ndev, mvq, &mvq->vqqp);
1282 	if (err)
1283 		goto err_vqqp;
1284 
1285 	err = connect_qps(ndev, mvq);
1286 	if (err)
1287 		goto err_connect;
1288 
1289 	err = counter_set_alloc(ndev, mvq);
1290 	if (err)
1291 		goto err_connect;
1292 
1293 	alloc_vector(ndev, mvq);
1294 	err = create_virtqueue(ndev, mvq);
1295 	if (err)
1296 		goto err_vq;
1297 
1298 	if (mvq->ready) {
1299 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1300 		if (err) {
1301 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1302 				       idx, err);
1303 			goto err_modify;
1304 		}
1305 	}
1306 
1307 	mvq->initialized = true;
1308 	return 0;
1309 
1310 err_modify:
1311 	destroy_virtqueue(ndev, mvq);
1312 err_vq:
1313 	dealloc_vector(ndev, mvq);
1314 	counter_set_dealloc(ndev, mvq);
1315 err_connect:
1316 	qp_destroy(ndev, &mvq->vqqp);
1317 err_vqqp:
1318 	qp_destroy(ndev, &mvq->fwqp);
1319 err_fwqp:
1320 	cq_destroy(ndev, idx);
1321 	return err;
1322 }
1323 
1324 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1325 {
1326 	struct mlx5_virtq_attr attr;
1327 
1328 	if (!mvq->initialized)
1329 		return;
1330 
1331 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1332 		return;
1333 
1334 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1335 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1336 
1337 	if (query_virtqueue(ndev, mvq, &attr)) {
1338 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1339 		return;
1340 	}
1341 	mvq->avail_idx = attr.available_index;
1342 	mvq->used_idx = attr.used_index;
1343 }
1344 
1345 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1346 {
1347 	int i;
1348 
1349 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1350 		suspend_vq(ndev, &ndev->vqs[i]);
1351 }
1352 
1353 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1354 {
1355 	if (!mvq->initialized)
1356 		return;
1357 
1358 	suspend_vq(ndev, mvq);
1359 	destroy_virtqueue(ndev, mvq);
1360 	dealloc_vector(ndev, mvq);
1361 	counter_set_dealloc(ndev, mvq);
1362 	qp_destroy(ndev, &mvq->vqqp);
1363 	qp_destroy(ndev, &mvq->fwqp);
1364 	cq_destroy(ndev, mvq->index);
1365 	mvq->initialized = false;
1366 }
1367 
1368 static int create_rqt(struct mlx5_vdpa_net *ndev)
1369 {
1370 	int rqt_table_size = roundup_pow_of_two(ndev->rqt_size);
1371 	int act_sz = roundup_pow_of_two(ndev->cur_num_vqs / 2);
1372 	__be32 *list;
1373 	void *rqtc;
1374 	int inlen;
1375 	void *in;
1376 	int i, j;
1377 	int err;
1378 
1379 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + rqt_table_size * MLX5_ST_SZ_BYTES(rq_num);
1380 	in = kzalloc(inlen, GFP_KERNEL);
1381 	if (!in)
1382 		return -ENOMEM;
1383 
1384 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1385 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1386 
1387 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1388 	MLX5_SET(rqtc, rqtc, rqt_max_size, rqt_table_size);
1389 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1390 	for (i = 0, j = 0; i < act_sz; i++, j += 2)
1391 		list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id);
1392 
1393 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1394 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1395 	kfree(in);
1396 	if (err)
1397 		return err;
1398 
1399 	return 0;
1400 }
1401 
1402 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1403 
1404 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1405 {
1406 	int act_sz = roundup_pow_of_two(num / 2);
1407 	__be32 *list;
1408 	void *rqtc;
1409 	int inlen;
1410 	void *in;
1411 	int i, j;
1412 	int err;
1413 
1414 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + act_sz * MLX5_ST_SZ_BYTES(rq_num);
1415 	in = kzalloc(inlen, GFP_KERNEL);
1416 	if (!in)
1417 		return -ENOMEM;
1418 
1419 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1420 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1421 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1422 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1423 
1424 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1425 	for (i = 0, j = 0; i < act_sz; i++, j = j + 2)
1426 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1427 
1428 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1429 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1430 	kfree(in);
1431 	if (err)
1432 		return err;
1433 
1434 	return 0;
1435 }
1436 
1437 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1438 {
1439 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1440 }
1441 
1442 static int create_tir(struct mlx5_vdpa_net *ndev)
1443 {
1444 #define HASH_IP_L4PORTS                                                                            \
1445 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1446 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1447 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1448 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1449 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1450 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1451 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1452 	void *rss_key;
1453 	void *outer;
1454 	void *tirc;
1455 	void *in;
1456 	int err;
1457 
1458 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1459 	if (!in)
1460 		return -ENOMEM;
1461 
1462 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1463 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1464 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1465 
1466 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1467 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1468 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1469 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1470 
1471 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1472 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1473 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1474 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1475 
1476 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1477 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1478 
1479 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1480 	kfree(in);
1481 	if (err)
1482 		return err;
1483 
1484 	mlx5_vdpa_add_tirn(ndev);
1485 	return err;
1486 }
1487 
1488 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1489 {
1490 	mlx5_vdpa_remove_tirn(ndev);
1491 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1492 }
1493 
/* Sizing of the RX flow table created by setup_steering(). */
#define MAX_STEERING_ENT 0x8000
#define MAX_STEERING_GROUPS 2

/*
 * Number of destinations attached to each MAC/VLAN flow rule: the TIR, plus
 * a flow counter when steering debug is enabled.
 */
#ifdef CONFIG_MLX5_VDPA_STEERING_DEBUG
#define NUM_DESTS 2
#else
#define NUM_DESTS 1
#endif
1502 
/*
 * With CONFIG_MLX5_VDPA_STEERING_DEBUG: create the unicast and multicast flow
 * counters of @node, mark dests[1] as a counter destination and add the COUNT
 * action to @flow_act. If the multicast counter cannot be created the unicast
 * one is destroyed again and the error is returned.
 *
 * Without that option this is a no-op returning 0.
 */
static int add_steering_counters(struct mlx5_vdpa_net *ndev,
				 struct macvlan_node *node,
				 struct mlx5_flow_act *flow_act,
				 struct mlx5_flow_destination *dests)
{
#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
	int err;

	node->ucast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
	if (IS_ERR(node->ucast_counter.counter))
		return PTR_ERR(node->ucast_counter.counter);

	node->mcast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
	if (IS_ERR(node->mcast_counter.counter)) {
		err = PTR_ERR(node->mcast_counter.counter);
		mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
		return err;
	}

	flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
	dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
#endif
	return 0;
}
1532 
/*
 * Destroy the multicast and unicast flow counters of @node. Compiles to an
 * empty function unless CONFIG_MLX5_VDPA_STEERING_DEBUG is set.
 */
static void remove_steering_counters(struct mlx5_vdpa_net *ndev,
				     struct macvlan_node *node)
{
#ifdef CONFIG_MLX5_VDPA_STEERING_DEBUG
	mlx5_fc_destroy(ndev->mvdev.mdev, node->mcast_counter.counter);
	mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
#endif
}
1541 
1542 static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac,
1543 					struct macvlan_node *node)
1544 {
1545 	struct mlx5_flow_destination dests[NUM_DESTS] = {};
1546 	struct mlx5_flow_act flow_act = {};
1547 	struct mlx5_flow_spec *spec;
1548 	void *headers_c;
1549 	void *headers_v;
1550 	u8 *dmac_c;
1551 	u8 *dmac_v;
1552 	int err;
1553 	u16 vid;
1554 
1555 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1556 	if (!spec)
1557 		return -ENOMEM;
1558 
1559 	vid = key2vid(node->macvlan);
1560 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1561 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1562 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1563 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1564 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1565 	eth_broadcast_addr(dmac_c);
1566 	ether_addr_copy(dmac_v, mac);
1567 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)) {
1568 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
1569 		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid);
1570 	}
1571 	if (node->tagged) {
1572 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1);
1573 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid);
1574 	}
1575 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1576 	dests[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1577 	dests[0].tir_num = ndev->res.tirn;
1578 	err = add_steering_counters(ndev, node, &flow_act, dests);
1579 	if (err)
1580 		goto out_free;
1581 
1582 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1583 	dests[1].counter_id = mlx5_fc_id(node->ucast_counter.counter);
1584 #endif
1585 	node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1586 	if (IS_ERR(node->ucast_rule)) {
1587 		err = PTR_ERR(node->ucast_rule);
1588 		goto err_ucast;
1589 	}
1590 
1591 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1592 	dests[1].counter_id = mlx5_fc_id(node->mcast_counter.counter);
1593 #endif
1594 
1595 	memset(dmac_c, 0, ETH_ALEN);
1596 	memset(dmac_v, 0, ETH_ALEN);
1597 	dmac_c[0] = 1;
1598 	dmac_v[0] = 1;
1599 	node->mcast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1600 	if (IS_ERR(node->mcast_rule)) {
1601 		err = PTR_ERR(node->mcast_rule);
1602 		goto err_mcast;
1603 	}
1604 	kvfree(spec);
1605 	mlx5_vdpa_add_rx_counters(ndev, node);
1606 	return 0;
1607 
1608 err_mcast:
1609 	mlx5_del_flow_rules(node->ucast_rule);
1610 err_ucast:
1611 	remove_steering_counters(ndev, node);
1612 out_free:
1613 	kvfree(spec);
1614 	return err;
1615 }
1616 
1617 static void mlx5_vdpa_del_mac_vlan_rules(struct mlx5_vdpa_net *ndev,
1618 					 struct macvlan_node *node)
1619 {
1620 	mlx5_vdpa_remove_rx_counters(ndev, node);
1621 	mlx5_del_flow_rules(node->ucast_rule);
1622 	mlx5_del_flow_rules(node->mcast_rule);
1623 }
1624 
1625 static u64 search_val(u8 *mac, u16 vlan, bool tagged)
1626 {
1627 	u64 val;
1628 
1629 	if (!tagged)
1630 		vlan = MLX5V_UNTAGGED;
1631 
1632 	val = (u64)vlan << 48 |
1633 	      (u64)mac[0] << 40 |
1634 	      (u64)mac[1] << 32 |
1635 	      (u64)mac[2] << 24 |
1636 	      (u64)mac[3] << 16 |
1637 	      (u64)mac[4] << 8 |
1638 	      (u64)mac[5];
1639 
1640 	return val;
1641 }
1642 
1643 static struct macvlan_node *mac_vlan_lookup(struct mlx5_vdpa_net *ndev, u64 value)
1644 {
1645 	struct macvlan_node *pos;
1646 	u32 idx;
1647 
1648 	idx = hash_64(value, 8); // tbd 8
1649 	hlist_for_each_entry(pos, &ndev->macvlan_hash[idx], hlist) {
1650 		if (pos->macvlan == value)
1651 			return pos;
1652 	}
1653 	return NULL;
1654 }
1655 
1656 static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vid, bool tagged)
1657 {
1658 	struct macvlan_node *ptr;
1659 	u64 val;
1660 	u32 idx;
1661 	int err;
1662 
1663 	val = search_val(mac, vid, tagged);
1664 	if (mac_vlan_lookup(ndev, val))
1665 		return -EEXIST;
1666 
1667 	ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
1668 	if (!ptr)
1669 		return -ENOMEM;
1670 
1671 	ptr->tagged = tagged;
1672 	ptr->macvlan = val;
1673 	ptr->ndev = ndev;
1674 	err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, ptr);
1675 	if (err)
1676 		goto err_add;
1677 
1678 	idx = hash_64(val, 8);
1679 	hlist_add_head(&ptr->hlist, &ndev->macvlan_hash[idx]);
1680 	return 0;
1681 
1682 err_add:
1683 	kfree(ptr);
1684 	return err;
1685 }
1686 
1687 static void mac_vlan_del(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagged)
1688 {
1689 	struct macvlan_node *ptr;
1690 
1691 	ptr = mac_vlan_lookup(ndev, search_val(mac, vlan, tagged));
1692 	if (!ptr)
1693 		return;
1694 
1695 	hlist_del(&ptr->hlist);
1696 	mlx5_vdpa_del_mac_vlan_rules(ndev, ptr);
1697 	remove_steering_counters(ndev, ptr);
1698 	kfree(ptr);
1699 }
1700 
1701 static void clear_mac_vlan_table(struct mlx5_vdpa_net *ndev)
1702 {
1703 	struct macvlan_node *pos;
1704 	struct hlist_node *n;
1705 	int i;
1706 
1707 	for (i = 0; i < MLX5V_MACVLAN_SIZE; i++) {
1708 		hlist_for_each_entry_safe(pos, n, &ndev->macvlan_hash[i], hlist) {
1709 			hlist_del(&pos->hlist);
1710 			mlx5_vdpa_del_mac_vlan_rules(ndev, pos);
1711 			remove_steering_counters(ndev, pos);
1712 			kfree(pos);
1713 		}
1714 	}
1715 }
1716 
1717 static int setup_steering(struct mlx5_vdpa_net *ndev)
1718 {
1719 	struct mlx5_flow_table_attr ft_attr = {};
1720 	struct mlx5_flow_namespace *ns;
1721 	int err;
1722 
1723 	ft_attr.max_fte = MAX_STEERING_ENT;
1724 	ft_attr.autogroup.max_num_groups = MAX_STEERING_GROUPS;
1725 
1726 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1727 	if (!ns) {
1728 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
1729 		return -EOPNOTSUPP;
1730 	}
1731 
1732 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1733 	if (IS_ERR(ndev->rxft)) {
1734 		mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n");
1735 		return PTR_ERR(ndev->rxft);
1736 	}
1737 	mlx5_vdpa_add_rx_flow_table(ndev);
1738 
1739 	err = mac_vlan_add(ndev, ndev->config.mac, 0, false);
1740 	if (err)
1741 		goto err_add;
1742 
1743 	return 0;
1744 
1745 err_add:
1746 	mlx5_vdpa_remove_rx_flow_table(ndev);
1747 	mlx5_destroy_flow_table(ndev->rxft);
1748 	return err;
1749 }
1750 
1751 static void teardown_steering(struct mlx5_vdpa_net *ndev)
1752 {
1753 	clear_mac_vlan_table(ndev);
1754 	mlx5_vdpa_remove_rx_flow_table(ndev);
1755 	mlx5_destroy_flow_table(ndev->rxft);
1756 }
1757 
1758 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1759 {
1760 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1761 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1762 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1763 	struct mlx5_core_dev *pfmdev;
1764 	size_t read;
1765 	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
1766 
1767 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1768 	switch (cmd) {
1769 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1770 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1771 		if (read != ETH_ALEN)
1772 			break;
1773 
1774 		if (!memcmp(ndev->config.mac, mac, 6)) {
1775 			status = VIRTIO_NET_OK;
1776 			break;
1777 		}
1778 
1779 		if (is_zero_ether_addr(mac))
1780 			break;
1781 
1782 		if (!is_zero_ether_addr(ndev->config.mac)) {
1783 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1784 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1785 					       ndev->config.mac);
1786 				break;
1787 			}
1788 		}
1789 
1790 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1791 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1792 				       mac);
1793 			break;
1794 		}
1795 
1796 		/* backup the original mac address so that if failed to add the forward rules
1797 		 * we could restore it
1798 		 */
1799 		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
1800 
1801 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1802 
1803 		/* Need recreate the flow table entry, so that the packet could forward back
1804 		 */
1805 		mac_vlan_del(ndev, mac_back, 0, false);
1806 
1807 		if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) {
1808 			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
1809 
1810 			/* Although it hardly run here, we still need double check */
1811 			if (is_zero_ether_addr(mac_back)) {
1812 				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
1813 				break;
1814 			}
1815 
1816 			/* Try to restore original mac address to MFPS table, and try to restore
1817 			 * the forward rule entry.
1818 			 */
1819 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1820 				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
1821 					       ndev->config.mac);
1822 			}
1823 
1824 			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
1825 				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
1826 					       mac_back);
1827 			}
1828 
1829 			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
1830 
1831 			if (mac_vlan_add(ndev, ndev->config.mac, 0, false))
1832 				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
1833 
1834 			break;
1835 		}
1836 
1837 		status = VIRTIO_NET_OK;
1838 		break;
1839 
1840 	default:
1841 		break;
1842 	}
1843 
1844 	return status;
1845 }
1846 
1847 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
1848 {
1849 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1850 	int cur_qps = ndev->cur_num_vqs / 2;
1851 	int err;
1852 	int i;
1853 
1854 	if (cur_qps > newqps) {
1855 		err = modify_rqt(ndev, 2 * newqps);
1856 		if (err)
1857 			return err;
1858 
1859 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
1860 			teardown_vq(ndev, &ndev->vqs[i]);
1861 
1862 		ndev->cur_num_vqs = 2 * newqps;
1863 	} else {
1864 		ndev->cur_num_vqs = 2 * newqps;
1865 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
1866 			err = setup_vq(ndev, &ndev->vqs[i]);
1867 			if (err)
1868 				goto clean_added;
1869 		}
1870 		err = modify_rqt(ndev, 2 * newqps);
1871 		if (err)
1872 			goto clean_added;
1873 	}
1874 	return 0;
1875 
1876 clean_added:
1877 	for (--i; i >= 2 * cur_qps; --i)
1878 		teardown_vq(ndev, &ndev->vqs[i]);
1879 
1880 	ndev->cur_num_vqs = 2 * cur_qps;
1881 
1882 	return err;
1883 }
1884 
1885 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1886 {
1887 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1888 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1889 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1890 	struct virtio_net_ctrl_mq mq;
1891 	size_t read;
1892 	u16 newqps;
1893 
1894 	switch (cmd) {
1895 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
1896 		/* This mq feature check aligns with pre-existing userspace
1897 		 * implementation.
1898 		 *
1899 		 * Without it, an untrusted driver could fake a multiqueue config
1900 		 * request down to a non-mq device that may cause kernel to
1901 		 * panic due to uninitialized resources for extra vqs. Even with
1902 		 * a well behaving guest driver, it is not expected to allow
1903 		 * changing the number of vqs on a non-mq device.
1904 		 */
1905 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
1906 			break;
1907 
1908 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
1909 		if (read != sizeof(mq))
1910 			break;
1911 
1912 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
1913 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1914 		    newqps > ndev->rqt_size)
1915 			break;
1916 
1917 		if (ndev->cur_num_vqs == 2 * newqps) {
1918 			status = VIRTIO_NET_OK;
1919 			break;
1920 		}
1921 
1922 		if (!change_num_qps(mvdev, newqps))
1923 			status = VIRTIO_NET_OK;
1924 
1925 		break;
1926 	default:
1927 		break;
1928 	}
1929 
1930 	return status;
1931 }
1932 
1933 static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1934 {
1935 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1936 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1937 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1938 	__virtio16 vlan;
1939 	size_t read;
1940 	u16 id;
1941 
1942 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)))
1943 		return status;
1944 
1945 	switch (cmd) {
1946 	case VIRTIO_NET_CTRL_VLAN_ADD:
1947 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
1948 		if (read != sizeof(vlan))
1949 			break;
1950 
1951 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
1952 		if (mac_vlan_add(ndev, ndev->config.mac, id, true))
1953 			break;
1954 
1955 		status = VIRTIO_NET_OK;
1956 		break;
1957 	case VIRTIO_NET_CTRL_VLAN_DEL:
1958 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
1959 		if (read != sizeof(vlan))
1960 			break;
1961 
1962 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
1963 		mac_vlan_del(ndev, ndev->config.mac, id, true);
1964 		status = VIRTIO_NET_OK;
1965 		break;
1966 	default:
1967 		break;
1968 	}
1969 
1970 	return status;
1971 }
1972 
1973 static void mlx5_cvq_kick_handler(struct work_struct *work)
1974 {
1975 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1976 	struct virtio_net_ctrl_hdr ctrl;
1977 	struct mlx5_vdpa_wq_ent *wqent;
1978 	struct mlx5_vdpa_dev *mvdev;
1979 	struct mlx5_control_vq *cvq;
1980 	struct mlx5_vdpa_net *ndev;
1981 	size_t read, write;
1982 	int err;
1983 
1984 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
1985 	mvdev = wqent->mvdev;
1986 	ndev = to_mlx5_vdpa_ndev(mvdev);
1987 	cvq = &mvdev->cvq;
1988 
1989 	down_write(&ndev->reslock);
1990 
1991 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
1992 		goto out;
1993 
1994 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1995 		goto out;
1996 
1997 	if (!cvq->ready)
1998 		goto out;
1999 
2000 	while (true) {
2001 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
2002 					   GFP_ATOMIC);
2003 		if (err <= 0)
2004 			break;
2005 
2006 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
2007 		if (read != sizeof(ctrl))
2008 			break;
2009 
2010 		cvq->received_desc++;
2011 		switch (ctrl.class) {
2012 		case VIRTIO_NET_CTRL_MAC:
2013 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
2014 			break;
2015 		case VIRTIO_NET_CTRL_MQ:
2016 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
2017 			break;
2018 		case VIRTIO_NET_CTRL_VLAN:
2019 			status = handle_ctrl_vlan(mvdev, ctrl.cmd);
2020 			break;
2021 		default:
2022 			break;
2023 		}
2024 
2025 		/* Make sure data is written before advancing index */
2026 		smp_wmb();
2027 
2028 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
2029 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
2030 		vringh_kiov_cleanup(&cvq->riov);
2031 		vringh_kiov_cleanup(&cvq->wiov);
2032 
2033 		if (vringh_need_notify_iotlb(&cvq->vring))
2034 			vringh_notify(&cvq->vring);
2035 
2036 		cvq->completed_desc++;
2037 		queue_work(mvdev->wq, &wqent->work);
2038 		break;
2039 	}
2040 
2041 out:
2042 	up_write(&ndev->reslock);
2043 }
2044 
2045 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
2046 {
2047 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2048 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2049 	struct mlx5_vdpa_virtqueue *mvq;
2050 
2051 	if (!is_index_valid(mvdev, idx))
2052 		return;
2053 
2054 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
2055 		if (!mvdev->wq || !mvdev->cvq.ready)
2056 			return;
2057 
2058 		queue_work(mvdev->wq, &ndev->cvq_ent.work);
2059 		return;
2060 	}
2061 
2062 	mvq = &ndev->vqs[idx];
2063 	if (unlikely(!mvq->ready))
2064 		return;
2065 
2066 	iowrite16(idx, ndev->mvdev.res.kick_addr);
2067 }
2068 
2069 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
2070 				    u64 driver_area, u64 device_area)
2071 {
2072 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2073 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2074 	struct mlx5_vdpa_virtqueue *mvq;
2075 
2076 	if (!is_index_valid(mvdev, idx))
2077 		return -EINVAL;
2078 
2079 	if (is_ctrl_vq_idx(mvdev, idx)) {
2080 		mvdev->cvq.desc_addr = desc_area;
2081 		mvdev->cvq.device_addr = device_area;
2082 		mvdev->cvq.driver_addr = driver_area;
2083 		return 0;
2084 	}
2085 
2086 	mvq = &ndev->vqs[idx];
2087 	mvq->desc_addr = desc_area;
2088 	mvq->device_addr = device_area;
2089 	mvq->driver_addr = driver_area;
2090 	return 0;
2091 }
2092 
2093 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
2094 {
2095 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2096 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2097 	struct mlx5_vdpa_virtqueue *mvq;
2098 
2099 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2100 		return;
2101 
2102 	mvq = &ndev->vqs[idx];
2103 	mvq->num_ent = num;
2104 }
2105 
2106 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
2107 {
2108 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2109 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2110 
2111 	ndev->event_cbs[idx] = *cb;
2112 	if (is_ctrl_vq_idx(mvdev, idx))
2113 		mvdev->cvq.event_cb = *cb;
2114 }
2115 
2116 static void mlx5_cvq_notify(struct vringh *vring)
2117 {
2118 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
2119 
2120 	if (!cvq->event_cb.callback)
2121 		return;
2122 
2123 	cvq->event_cb.callback(cvq->event_cb.private);
2124 }
2125 
2126 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
2127 {
2128 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2129 
2130 	cvq->ready = ready;
2131 	if (!ready)
2132 		return;
2133 
2134 	cvq->vring.notify = mlx5_cvq_notify;
2135 }
2136 
2137 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
2138 {
2139 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2140 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2141 	struct mlx5_vdpa_virtqueue *mvq;
2142 	int err;
2143 
2144 	if (!mvdev->actual_features)
2145 		return;
2146 
2147 	if (!is_index_valid(mvdev, idx))
2148 		return;
2149 
2150 	if (is_ctrl_vq_idx(mvdev, idx)) {
2151 		set_cvq_ready(mvdev, ready);
2152 		return;
2153 	}
2154 
2155 	mvq = &ndev->vqs[idx];
2156 	if (!ready) {
2157 		suspend_vq(ndev, mvq);
2158 	} else {
2159 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
2160 		if (err) {
2161 			mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err);
2162 			ready = false;
2163 		}
2164 	}
2165 
2166 
2167 	mvq->ready = ready;
2168 }
2169 
2170 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
2171 {
2172 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2173 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2174 
2175 	if (!is_index_valid(mvdev, idx))
2176 		return false;
2177 
2178 	if (is_ctrl_vq_idx(mvdev, idx))
2179 		return mvdev->cvq.ready;
2180 
2181 	return ndev->vqs[idx].ready;
2182 }
2183 
2184 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
2185 				  const struct vdpa_vq_state *state)
2186 {
2187 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2188 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2189 	struct mlx5_vdpa_virtqueue *mvq;
2190 
2191 	if (!is_index_valid(mvdev, idx))
2192 		return -EINVAL;
2193 
2194 	if (is_ctrl_vq_idx(mvdev, idx)) {
2195 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
2196 		return 0;
2197 	}
2198 
2199 	mvq = &ndev->vqs[idx];
2200 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
2201 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
2202 		return -EINVAL;
2203 	}
2204 
2205 	mvq->used_idx = state->split.avail_index;
2206 	mvq->avail_idx = state->split.avail_index;
2207 	return 0;
2208 }
2209 
2210 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
2211 {
2212 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2213 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2214 	struct mlx5_vdpa_virtqueue *mvq;
2215 	struct mlx5_virtq_attr attr;
2216 	int err;
2217 
2218 	if (!is_index_valid(mvdev, idx))
2219 		return -EINVAL;
2220 
2221 	if (is_ctrl_vq_idx(mvdev, idx)) {
2222 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
2223 		return 0;
2224 	}
2225 
2226 	mvq = &ndev->vqs[idx];
2227 	/* If the virtq object was destroyed, use the value saved at
2228 	 * the last minute of suspend_vq. This caters for userspace
2229 	 * that cares about emulating the index after vq is stopped.
2230 	 */
2231 	if (!mvq->initialized) {
2232 		/* Firmware returns a wrong value for the available index.
2233 		 * Since both values should be identical, we take the value of
2234 		 * used_idx which is reported correctly.
2235 		 */
2236 		state->split.avail_index = mvq->used_idx;
2237 		return 0;
2238 	}
2239 
2240 	err = query_virtqueue(ndev, mvq, &attr);
2241 	if (err) {
2242 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
2243 		return err;
2244 	}
2245 	state->split.avail_index = attr.used_index;
2246 	return 0;
2247 }
2248 
/* Report the virtqueue alignment for this device: always PAGE_SIZE. */
static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
{
	return PAGE_SIZE;
}
2253 
2254 static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx)
2255 {
2256 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2257 
2258 	if (is_ctrl_vq_idx(mvdev, idx))
2259 		return MLX5_VDPA_CVQ_GROUP;
2260 
2261 	return MLX5_VDPA_DATAVQ_GROUP;
2262 }
2263 
2264 static u64 mlx_to_vritio_features(u16 dev_features)
2265 {
2266 	u64 result = 0;
2267 
2268 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_MRG_RXBUF))
2269 		result |= BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
2270 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_ECN))
2271 		result |= BIT_ULL(VIRTIO_NET_F_HOST_ECN);
2272 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_ECN))
2273 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_ECN);
2274 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO6))
2275 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO6);
2276 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO4))
2277 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO4);
2278 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_CSUM))
2279 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
2280 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_CSUM))
2281 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
2282 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO6))
2283 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
2284 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO4))
2285 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
2286 
2287 	return result;
2288 }
2289 
2290 static u64 get_supported_features(struct mlx5_core_dev *mdev)
2291 {
2292 	u64 mlx_vdpa_features = 0;
2293 	u16 dev_features;
2294 
2295 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
2296 	mlx_vdpa_features |= mlx_to_vritio_features(dev_features);
2297 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
2298 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
2299 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
2300 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
2301 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
2302 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
2303 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
2304 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
2305 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VLAN);
2306 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2307 
2308 	return mlx_vdpa_features;
2309 }
2310 
2311 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
2312 {
2313 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2314 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2315 
2316 	print_features(mvdev, ndev->mvdev.mlx_features, false);
2317 	return ndev->mvdev.mlx_features;
2318 }
2319 
2320 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
2321 {
2322 	/* Minimum features to expect */
2323 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
2324 		return -EOPNOTSUPP;
2325 
2326 	/* Double check features combination sent down by the driver.
2327 	 * Fail invalid features due to absence of the depended feature.
2328 	 *
2329 	 * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
2330 	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
2331 	 * By failing the invalid features sent down by untrusted drivers,
2332 	 * we're assured the assumption made upon is_index_valid() and
2333 	 * is_ctrl_vq_idx() will not be compromised.
2334 	 */
2335 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
2336             BIT_ULL(VIRTIO_NET_F_MQ))
2337 		return -EINVAL;
2338 
2339 	return 0;
2340 }
2341 
2342 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
2343 {
2344 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2345 	int err;
2346 	int i;
2347 
2348 	for (i = 0; i < mvdev->max_vqs; i++) {
2349 		err = setup_vq(ndev, &ndev->vqs[i]);
2350 		if (err)
2351 			goto err_vq;
2352 	}
2353 
2354 	return 0;
2355 
2356 err_vq:
2357 	for (--i; i >= 0; i--)
2358 		teardown_vq(ndev, &ndev->vqs[i]);
2359 
2360 	return err;
2361 }
2362 
2363 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
2364 {
2365 	struct mlx5_vdpa_virtqueue *mvq;
2366 	int i;
2367 
2368 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
2369 		mvq = &ndev->vqs[i];
2370 		if (!mvq->initialized)
2371 			continue;
2372 
2373 		teardown_vq(ndev, mvq);
2374 	}
2375 }
2376 
2377 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
2378 {
2379 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
2380 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
2381 			/* MQ supported. CVQ index is right above the last data virtqueue's */
2382 			mvdev->max_idx = mvdev->max_vqs;
2383 		} else {
2384 			/* Only CVQ supportted. data virtqueues occupy indices 0 and 1.
2385 			 * CVQ gets index 2
2386 			 */
2387 			mvdev->max_idx = 2;
2388 		}
2389 	} else {
2390 		/* Two data virtqueues only: one for rx and one for tx */
2391 		mvdev->max_idx = 1;
2392 	}
2393 }
2394 
2395 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2396 {
2397 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2398 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2399 	int err;
2400 
2401 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2402 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2403 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2404 	if (vport)
2405 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2406 
2407 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2408 	if (err)
2409 		return 0;
2410 
2411 	return MLX5_GET(query_vport_state_out, out, state);
2412 }
2413 
2414 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2415 {
2416 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2417 	    VPORT_STATE_UP)
2418 		return true;
2419 
2420 	return false;
2421 }
2422 
2423 static void update_carrier(struct work_struct *work)
2424 {
2425 	struct mlx5_vdpa_wq_ent *wqent;
2426 	struct mlx5_vdpa_dev *mvdev;
2427 	struct mlx5_vdpa_net *ndev;
2428 
2429 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2430 	mvdev = wqent->mvdev;
2431 	ndev = to_mlx5_vdpa_ndev(mvdev);
2432 	if (get_link_state(mvdev))
2433 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2434 	else
2435 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2436 
2437 	if (ndev->config_cb.callback)
2438 		ndev->config_cb.callback(ndev->config_cb.private);
2439 
2440 	kfree(wqent);
2441 }
2442 
2443 static int queue_link_work(struct mlx5_vdpa_net *ndev)
2444 {
2445 	struct mlx5_vdpa_wq_ent *wqent;
2446 
2447 	wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2448 	if (!wqent)
2449 		return -ENOMEM;
2450 
2451 	wqent->mvdev = &ndev->mvdev;
2452 	INIT_WORK(&wqent->work, update_carrier);
2453 	queue_work(ndev->mvdev.wq, &wqent->work);
2454 	return 0;
2455 }
2456 
2457 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
2458 {
2459 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2460 	struct mlx5_eqe *eqe = param;
2461 	int ret = NOTIFY_DONE;
2462 
2463 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2464 		switch (eqe->sub_type) {
2465 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2466 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2467 			if (queue_link_work(ndev))
2468 				return NOTIFY_DONE;
2469 
2470 			ret = NOTIFY_OK;
2471 			break;
2472 		default:
2473 			return NOTIFY_DONE;
2474 		}
2475 		return ret;
2476 	}
2477 	return ret;
2478 }
2479 
2480 static void register_link_notifier(struct mlx5_vdpa_net *ndev)
2481 {
2482 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS)))
2483 		return;
2484 
2485 	ndev->nb.notifier_call = event_handler;
2486 	mlx5_notifier_register(ndev->mvdev.mdev, &ndev->nb);
2487 	ndev->nb_registered = true;
2488 	queue_link_work(ndev);
2489 }
2490 
2491 static void unregister_link_notifier(struct mlx5_vdpa_net *ndev)
2492 {
2493 	if (!ndev->nb_registered)
2494 		return;
2495 
2496 	ndev->nb_registered = false;
2497 	mlx5_notifier_unregister(ndev->mvdev.mdev, &ndev->nb);
2498 	if (ndev->mvdev.wq)
2499 		flush_workqueue(ndev->mvdev.wq);
2500 }
2501 
2502 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2503 {
2504 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2505 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2506 	int err;
2507 
2508 	print_features(mvdev, features, true);
2509 
2510 	err = verify_driver_features(mvdev, features);
2511 	if (err)
2512 		return err;
2513 
2514 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2515 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
2516 		ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs);
2517 	else
2518 		ndev->rqt_size = 1;
2519 
2520 	ndev->cur_num_vqs = 2 * ndev->rqt_size;
2521 
2522 	update_cvq_info(mvdev);
2523 	return err;
2524 }
2525 
2526 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2527 {
2528 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2529 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2530 
2531 	ndev->config_cb = *cb;
2532 }
2533 
/* Largest virtqueue size reported by mlx5_vdpa_get_vq_num_max(). */
#define MLX5_VDPA_MAX_VQ_ENTRIES 256
/* Report the maximum virtqueue size: always MLX5_VDPA_MAX_VQ_ENTRIES. */
static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
{
	return MLX5_VDPA_MAX_VQ_ENTRIES;
}
2539 
/* Report the virtio device id: always VIRTIO_ID_NET. */
static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
{
	return VIRTIO_ID_NET;
}
2544 
/* Report the vendor id: always PCI_VENDOR_ID_MELLANOX. */
static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
{
	return PCI_VENDOR_ID_MELLANOX;
}
2549 
2550 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2551 {
2552 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2553 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2554 
2555 	print_status(mvdev, ndev->mvdev.status, false);
2556 	return ndev->mvdev.status;
2557 }
2558 
2559 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2560 {
2561 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2562 	struct mlx5_virtq_attr attr = {};
2563 	int err;
2564 
2565 	if (mvq->initialized) {
2566 		err = query_virtqueue(ndev, mvq, &attr);
2567 		if (err)
2568 			return err;
2569 	}
2570 
2571 	ri->avail_index = attr.available_index;
2572 	ri->used_index = attr.used_index;
2573 	ri->ready = mvq->ready;
2574 	ri->num_ent = mvq->num_ent;
2575 	ri->desc_addr = mvq->desc_addr;
2576 	ri->device_addr = mvq->device_addr;
2577 	ri->driver_addr = mvq->driver_addr;
2578 	ri->map = mvq->map;
2579 	ri->restore = true;
2580 	return 0;
2581 }
2582 
2583 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2584 {
2585 	int i;
2586 
2587 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2588 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2589 		save_channel_info(ndev, &ndev->vqs[i]);
2590 	}
2591 	return 0;
2592 }
2593 
2594 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2595 {
2596 	int i;
2597 
2598 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2599 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2600 }
2601 
2602 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2603 {
2604 	struct mlx5_vdpa_virtqueue *mvq;
2605 	struct mlx5_vq_restore_info *ri;
2606 	int i;
2607 
2608 	mlx5_clear_vqs(ndev);
2609 	init_mvqs(ndev);
2610 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2611 		mvq = &ndev->vqs[i];
2612 		ri = &mvq->ri;
2613 		if (!ri->restore)
2614 			continue;
2615 
2616 		mvq->avail_idx = ri->avail_index;
2617 		mvq->used_idx = ri->used_index;
2618 		mvq->ready = ri->ready;
2619 		mvq->num_ent = ri->num_ent;
2620 		mvq->desc_addr = ri->desc_addr;
2621 		mvq->device_addr = ri->device_addr;
2622 		mvq->driver_addr = ri->driver_addr;
2623 		mvq->map = ri->map;
2624 	}
2625 }
2626 
2627 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
2628 				struct vhost_iotlb *iotlb, unsigned int asid)
2629 {
2630 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2631 	int err;
2632 
2633 	suspend_vqs(ndev);
2634 	err = save_channels_info(ndev);
2635 	if (err)
2636 		goto err_mr;
2637 
2638 	teardown_driver(ndev);
2639 	mlx5_vdpa_destroy_mr(mvdev);
2640 	err = mlx5_vdpa_create_mr(mvdev, iotlb, asid);
2641 	if (err)
2642 		goto err_mr;
2643 
2644 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
2645 		goto err_mr;
2646 
2647 	restore_channels_info(ndev);
2648 	err = setup_driver(mvdev);
2649 	if (err)
2650 		goto err_setup;
2651 
2652 	return 0;
2653 
2654 err_setup:
2655 	mlx5_vdpa_destroy_mr(mvdev);
2656 err_mr:
2657 	return err;
2658 }
2659 
2660 /* reslock must be held for this function */
2661 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2662 {
2663 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2664 	int err;
2665 
2666 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2667 
2668 	if (ndev->setup) {
2669 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2670 		err = 0;
2671 		goto out;
2672 	}
2673 	mlx5_vdpa_add_debugfs(ndev);
2674 	err = setup_virtqueues(mvdev);
2675 	if (err) {
2676 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
2677 		goto err_setup;
2678 	}
2679 
2680 	err = create_rqt(ndev);
2681 	if (err) {
2682 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
2683 		goto err_rqt;
2684 	}
2685 
2686 	err = create_tir(ndev);
2687 	if (err) {
2688 		mlx5_vdpa_warn(mvdev, "create_tir\n");
2689 		goto err_tir;
2690 	}
2691 
2692 	err = setup_steering(ndev);
2693 	if (err) {
2694 		mlx5_vdpa_warn(mvdev, "setup_steering\n");
2695 		goto err_fwd;
2696 	}
2697 	ndev->setup = true;
2698 
2699 	return 0;
2700 
2701 err_fwd:
2702 	destroy_tir(ndev);
2703 err_tir:
2704 	destroy_rqt(ndev);
2705 err_rqt:
2706 	teardown_virtqueues(ndev);
2707 err_setup:
2708 	mlx5_vdpa_remove_debugfs(ndev->debugfs);
2709 out:
2710 	return err;
2711 }
2712 
2713 /* reslock must be held for this function */
2714 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2715 {
2716 
2717 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2718 
2719 	if (!ndev->setup)
2720 		return;
2721 
2722 	mlx5_vdpa_remove_debugfs(ndev->debugfs);
2723 	ndev->debugfs = NULL;
2724 	teardown_steering(ndev);
2725 	destroy_tir(ndev);
2726 	destroy_rqt(ndev);
2727 	teardown_virtqueues(ndev);
2728 	ndev->setup = false;
2729 }
2730 
2731 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2732 {
2733 	int i;
2734 
2735 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2736 		ndev->vqs[i].ready = false;
2737 
2738 	ndev->mvdev.cvq.ready = false;
2739 }
2740 
2741 static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev)
2742 {
2743 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2744 	int err = 0;
2745 
2746 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))
2747 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
2748 					MLX5_CVQ_MAX_ENT, false,
2749 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
2750 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
2751 					(struct vring_used *)(uintptr_t)cvq->device_addr);
2752 
2753 	return err;
2754 }
2755 
2756 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2757 {
2758 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2759 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2760 	int err;
2761 
2762 	print_status(mvdev, status, true);
2763 
2764 	down_write(&ndev->reslock);
2765 
2766 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2767 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2768 			err = setup_cvq_vring(mvdev);
2769 			if (err) {
2770 				mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n");
2771 				goto err_setup;
2772 			}
2773 			register_link_notifier(ndev);
2774 			err = setup_driver(mvdev);
2775 			if (err) {
2776 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
2777 				goto err_driver;
2778 			}
2779 		} else {
2780 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
2781 			goto err_clear;
2782 		}
2783 	}
2784 
2785 	ndev->mvdev.status = status;
2786 	up_write(&ndev->reslock);
2787 	return;
2788 
2789 err_driver:
2790 	unregister_link_notifier(ndev);
2791 err_setup:
2792 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2793 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
2794 err_clear:
2795 	up_write(&ndev->reslock);
2796 }
2797 
2798 static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev)
2799 {
2800 	int i;
2801 
2802 	/* default mapping all groups are mapped to asid 0 */
2803 	for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++)
2804 		mvdev->group2asid[i] = 0;
2805 }
2806 
2807 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
2808 {
2809 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2810 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2811 
2812 	print_status(mvdev, 0, true);
2813 	mlx5_vdpa_info(mvdev, "performing device reset\n");
2814 
2815 	down_write(&ndev->reslock);
2816 	unregister_link_notifier(ndev);
2817 	teardown_driver(ndev);
2818 	clear_vqs_ready(ndev);
2819 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2820 	ndev->mvdev.status = 0;
2821 	ndev->mvdev.suspended = false;
2822 	ndev->cur_num_vqs = 0;
2823 	ndev->mvdev.cvq.received_desc = 0;
2824 	ndev->mvdev.cvq.completed_desc = 0;
2825 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
2826 	ndev->mvdev.actual_features = 0;
2827 	init_group_to_asid_map(mvdev);
2828 	++mvdev->generation;
2829 
2830 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2831 		if (mlx5_vdpa_create_mr(mvdev, NULL, 0))
2832 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
2833 	}
2834 	up_write(&ndev->reslock);
2835 
2836 	return 0;
2837 }
2838 
/* Report the size of the device config space: sizeof(struct virtio_net_config). */
static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
{
	return sizeof(struct virtio_net_config);
}
2843 
2844 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
2845 				 unsigned int len)
2846 {
2847 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2848 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2849 
2850 	if (offset + len <= sizeof(struct virtio_net_config))
2851 		memcpy(buf, (u8 *)&ndev->config + offset, len);
2852 }
2853 
/* Config space writes are not supported; the request is silently ignored. */
static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
				 unsigned int len)
{
	/* not supported */
}
2859 
2860 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
2861 {
2862 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2863 
2864 	return mvdev->generation;
2865 }
2866 
2867 static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
2868 			unsigned int asid)
2869 {
2870 	bool change_map;
2871 	int err;
2872 
2873 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map, asid);
2874 	if (err) {
2875 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
2876 		return err;
2877 	}
2878 
2879 	if (change_map)
2880 		err = mlx5_vdpa_change_map(mvdev, iotlb, asid);
2881 
2882 	return err;
2883 }
2884 
2885 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
2886 			     struct vhost_iotlb *iotlb)
2887 {
2888 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2889 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2890 	int err = -EINVAL;
2891 
2892 	down_write(&ndev->reslock);
2893 	err = set_map_data(mvdev, iotlb, asid);
2894 	up_write(&ndev->reslock);
2895 	return err;
2896 }
2897 
2898 static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx)
2899 {
2900 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2901 
2902 	if (is_ctrl_vq_idx(mvdev, idx))
2903 		return &vdev->dev;
2904 
2905 	return mvdev->vdev.dma_dev;
2906 }
2907 
2908 static void free_irqs(struct mlx5_vdpa_net *ndev)
2909 {
2910 	struct mlx5_vdpa_irq_pool_entry *ent;
2911 	int i;
2912 
2913 	if (!msix_mode_supported(&ndev->mvdev))
2914 		return;
2915 
2916 	if (!ndev->irqp.entries)
2917 		return;
2918 
2919 	for (i = ndev->irqp.num_ent - 1; i >= 0; i--) {
2920 		ent = ndev->irqp.entries + i;
2921 		if (ent->map.virq)
2922 			pci_msix_free_irq(ndev->mvdev.mdev->pdev, ent->map);
2923 	}
2924 	kfree(ndev->irqp.entries);
2925 }
2926 
2927 static void mlx5_vdpa_free(struct vdpa_device *vdev)
2928 {
2929 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2930 	struct mlx5_core_dev *pfmdev;
2931 	struct mlx5_vdpa_net *ndev;
2932 
2933 	ndev = to_mlx5_vdpa_ndev(mvdev);
2934 
2935 	free_resources(ndev);
2936 	mlx5_vdpa_destroy_mr(mvdev);
2937 	if (!is_zero_ether_addr(ndev->config.mac)) {
2938 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2939 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
2940 	}
2941 	mlx5_vdpa_free_resources(&ndev->mvdev);
2942 	free_irqs(ndev);
2943 	kfree(ndev->event_cbs);
2944 	kfree(ndev->vqs);
2945 }
2946 
2947 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
2948 {
2949 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2950 	struct vdpa_notification_area ret = {};
2951 	struct mlx5_vdpa_net *ndev;
2952 	phys_addr_t addr;
2953 
2954 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2955 		return ret;
2956 
2957 	/* If SF BAR size is smaller than PAGE_SIZE, do not use direct
2958 	 * notification to avoid the risk of mapping pages that contain BAR of more
2959 	 * than one SF
2960 	 */
2961 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
2962 		return ret;
2963 
2964 	ndev = to_mlx5_vdpa_ndev(mvdev);
2965 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
2966 	ret.addr = addr;
2967 	ret.size = PAGE_SIZE;
2968 	return ret;
2969 }
2970 
2971 static int mlx5_get_vq_irq(struct vdpa_device *vdev, u16 idx)
2972 {
2973 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2974 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2975 	struct mlx5_vdpa_virtqueue *mvq;
2976 
2977 	if (!is_index_valid(mvdev, idx))
2978 		return -EINVAL;
2979 
2980 	if (is_ctrl_vq_idx(mvdev, idx))
2981 		return -EOPNOTSUPP;
2982 
2983 	mvq = &ndev->vqs[idx];
2984 	if (!mvq->map.virq)
2985 		return -EOPNOTSUPP;
2986 
2987 	return mvq->map.virq;
2988 }
2989 
2990 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
2991 {
2992 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2993 
2994 	return mvdev->actual_features;
2995 }
2996 
2997 static int counter_set_query(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
2998 			     u64 *received_desc, u64 *completed_desc)
2999 {
3000 	u32 in[MLX5_ST_SZ_DW(query_virtio_q_counters_in)] = {};
3001 	u32 out[MLX5_ST_SZ_DW(query_virtio_q_counters_out)] = {};
3002 	void *cmd_hdr;
3003 	void *ctx;
3004 	int err;
3005 
3006 	if (!counters_supported(&ndev->mvdev))
3007 		return -EOPNOTSUPP;
3008 
3009 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
3010 		return -EAGAIN;
3011 
3012 	cmd_hdr = MLX5_ADDR_OF(query_virtio_q_counters_in, in, hdr);
3013 
3014 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
3015 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
3016 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
3017 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->counter_set_id);
3018 
3019 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
3020 	if (err)
3021 		return err;
3022 
3023 	ctx = MLX5_ADDR_OF(query_virtio_q_counters_out, out, counters);
3024 	*received_desc = MLX5_GET64(virtio_q_counters, ctx, received_desc);
3025 	*completed_desc = MLX5_GET64(virtio_q_counters, ctx, completed_desc);
3026 	return 0;
3027 }
3028 
3029 static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx,
3030 					 struct sk_buff *msg,
3031 					 struct netlink_ext_ack *extack)
3032 {
3033 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3034 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3035 	struct mlx5_vdpa_virtqueue *mvq;
3036 	struct mlx5_control_vq *cvq;
3037 	u64 received_desc;
3038 	u64 completed_desc;
3039 	int err = 0;
3040 
3041 	down_read(&ndev->reslock);
3042 	if (!is_index_valid(mvdev, idx)) {
3043 		NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid");
3044 		err = -EINVAL;
3045 		goto out_err;
3046 	}
3047 
3048 	if (idx == ctrl_vq_idx(mvdev)) {
3049 		cvq = &mvdev->cvq;
3050 		received_desc = cvq->received_desc;
3051 		completed_desc = cvq->completed_desc;
3052 		goto out;
3053 	}
3054 
3055 	mvq = &ndev->vqs[idx];
3056 	err = counter_set_query(ndev, mvq, &received_desc, &completed_desc);
3057 	if (err) {
3058 		NL_SET_ERR_MSG_MOD(extack, "failed to query hardware");
3059 		goto out_err;
3060 	}
3061 
3062 out:
3063 	err = -EMSGSIZE;
3064 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "received_desc"))
3065 		goto out_err;
3066 
3067 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, received_desc,
3068 			      VDPA_ATTR_PAD))
3069 		goto out_err;
3070 
3071 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "completed_desc"))
3072 		goto out_err;
3073 
3074 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, completed_desc,
3075 			      VDPA_ATTR_PAD))
3076 		goto out_err;
3077 
3078 	err = 0;
3079 out_err:
3080 	up_read(&ndev->reslock);
3081 	return err;
3082 }
3083 
3084 static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
3085 {
3086 	struct mlx5_control_vq *cvq;
3087 
3088 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
3089 		return;
3090 
3091 	cvq = &mvdev->cvq;
3092 	cvq->ready = false;
3093 }
3094 
3095 static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
3096 {
3097 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3098 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3099 	struct mlx5_vdpa_virtqueue *mvq;
3100 	int i;
3101 
3102 	mlx5_vdpa_info(mvdev, "suspending device\n");
3103 
3104 	down_write(&ndev->reslock);
3105 	unregister_link_notifier(ndev);
3106 	for (i = 0; i < ndev->cur_num_vqs; i++) {
3107 		mvq = &ndev->vqs[i];
3108 		suspend_vq(ndev, mvq);
3109 	}
3110 	mlx5_vdpa_cvq_suspend(mvdev);
3111 	mvdev->suspended = true;
3112 	up_write(&ndev->reslock);
3113 	return 0;
3114 }
3115 
3116 static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,
3117 			       unsigned int asid)
3118 {
3119 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3120 
3121 	if (group >= MLX5_VDPA_NUMVQ_GROUPS)
3122 		return -EINVAL;
3123 
3124 	mvdev->group2asid[group] = asid;
3125 	return 0;
3126 }
3127 
3128 static const struct vdpa_config_ops mlx5_vdpa_ops = {
3129 	.set_vq_address = mlx5_vdpa_set_vq_address,
3130 	.set_vq_num = mlx5_vdpa_set_vq_num,
3131 	.kick_vq = mlx5_vdpa_kick_vq,
3132 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
3133 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
3134 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
3135 	.set_vq_state = mlx5_vdpa_set_vq_state,
3136 	.get_vq_state = mlx5_vdpa_get_vq_state,
3137 	.get_vendor_vq_stats = mlx5_vdpa_get_vendor_vq_stats,
3138 	.get_vq_notification = mlx5_get_vq_notification,
3139 	.get_vq_irq = mlx5_get_vq_irq,
3140 	.get_vq_align = mlx5_vdpa_get_vq_align,
3141 	.get_vq_group = mlx5_vdpa_get_vq_group,
3142 	.get_device_features = mlx5_vdpa_get_device_features,
3143 	.set_driver_features = mlx5_vdpa_set_driver_features,
3144 	.get_driver_features = mlx5_vdpa_get_driver_features,
3145 	.set_config_cb = mlx5_vdpa_set_config_cb,
3146 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
3147 	.get_device_id = mlx5_vdpa_get_device_id,
3148 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
3149 	.get_status = mlx5_vdpa_get_status,
3150 	.set_status = mlx5_vdpa_set_status,
3151 	.reset = mlx5_vdpa_reset,
3152 	.get_config_size = mlx5_vdpa_get_config_size,
3153 	.get_config = mlx5_vdpa_get_config,
3154 	.set_config = mlx5_vdpa_set_config,
3155 	.get_generation = mlx5_vdpa_get_generation,
3156 	.set_map = mlx5_vdpa_set_map,
3157 	.set_group_asid = mlx5_set_group_asid,
3158 	.get_vq_dma_dev = mlx5_get_vq_dma_dev,
3159 	.free = mlx5_vdpa_free,
3160 	.suspend = mlx5_vdpa_suspend,
3161 };
3162 
3163 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
3164 {
3165 	u16 hw_mtu;
3166 	int err;
3167 
3168 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
3169 	if (err)
3170 		return err;
3171 
3172 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
3173 	return 0;
3174 }
3175 
3176 static int alloc_resources(struct mlx5_vdpa_net *ndev)
3177 {
3178 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3179 	int err;
3180 
3181 	if (res->valid) {
3182 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
3183 		return -EEXIST;
3184 	}
3185 
3186 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
3187 	if (err)
3188 		return err;
3189 
3190 	err = create_tis(ndev);
3191 	if (err)
3192 		goto err_tis;
3193 
3194 	res->valid = true;
3195 
3196 	return 0;
3197 
3198 err_tis:
3199 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3200 	return err;
3201 }
3202 
3203 static void free_resources(struct mlx5_vdpa_net *ndev)
3204 {
3205 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3206 
3207 	if (!res->valid)
3208 		return;
3209 
3210 	destroy_tis(ndev);
3211 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3212 	res->valid = false;
3213 }
3214 
3215 static void init_mvqs(struct mlx5_vdpa_net *ndev)
3216 {
3217 	struct mlx5_vdpa_virtqueue *mvq;
3218 	int i;
3219 
3220 	for (i = 0; i < ndev->mvdev.max_vqs; ++i) {
3221 		mvq = &ndev->vqs[i];
3222 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3223 		mvq->index = i;
3224 		mvq->ndev = ndev;
3225 		mvq->fwqp.fw = true;
3226 		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
3227 	}
3228 	for (; i < ndev->mvdev.max_vqs; i++) {
3229 		mvq = &ndev->vqs[i];
3230 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3231 		mvq->index = i;
3232 		mvq->ndev = ndev;
3233 	}
3234 }
3235 
/*
 * Management-device state, one instance allocated per auxiliary device in
 * mlx5v_probe() and stored as that device's driver data.
 */
struct mlx5_vdpa_mgmtdev {
	/* vdpa management device registered with vdpa_mgmtdev_register() */
	struct vdpa_mgmt_dev mgtdev;
	/* mlx5 auxiliary device wrapper this instance was probed for */
	struct mlx5_adev *madev;
	/*
	 * vdpa net device created by mlx5_vdpa_dev_add(); NULL while none
	 * exists. A non-NULL value makes a further dev_add fail with -ENOSPC.
	 */
	struct mlx5_vdpa_net *ndev;
};
3241 
3242 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
3243 {
3244 	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
3245 	void *in;
3246 	int err;
3247 
3248 	in = kvzalloc(inlen, GFP_KERNEL);
3249 	if (!in)
3250 		return -ENOMEM;
3251 
3252 	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
3253 	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
3254 		 mtu + MLX5V_ETH_HARD_MTU);
3255 	MLX5_SET(modify_nic_vport_context_in, in, opcode,
3256 		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
3257 
3258 	err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
3259 
3260 	kvfree(in);
3261 	return err;
3262 }
3263 
3264 static void allocate_irqs(struct mlx5_vdpa_net *ndev)
3265 {
3266 	struct mlx5_vdpa_irq_pool_entry *ent;
3267 	int i;
3268 
3269 	if (!msix_mode_supported(&ndev->mvdev))
3270 		return;
3271 
3272 	if (!ndev->mvdev.mdev->pdev)
3273 		return;
3274 
3275 	ndev->irqp.entries = kcalloc(ndev->mvdev.max_vqs, sizeof(*ndev->irqp.entries), GFP_KERNEL);
3276 	if (!ndev->irqp.entries)
3277 		return;
3278 
3279 
3280 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
3281 		ent = ndev->irqp.entries + i;
3282 		snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
3283 			 dev_name(&ndev->mvdev.vdev.dev), i);
3284 		ent->map = pci_msix_alloc_irq_at(ndev->mvdev.mdev->pdev, MSI_ANY_INDEX, NULL);
3285 		if (!ent->map.virq)
3286 			return;
3287 
3288 		ndev->irqp.num_ent++;
3289 	}
3290 }
3291 
3292 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
3293 			     const struct vdpa_dev_set_config *add_config)
3294 {
3295 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3296 	struct virtio_net_config *config;
3297 	struct mlx5_core_dev *pfmdev;
3298 	struct mlx5_vdpa_dev *mvdev;
3299 	struct mlx5_vdpa_net *ndev;
3300 	struct mlx5_core_dev *mdev;
3301 	u64 device_features;
3302 	u32 max_vqs;
3303 	u16 mtu;
3304 	int err;
3305 
3306 	if (mgtdev->ndev)
3307 		return -ENOSPC;
3308 
3309 	mdev = mgtdev->madev->mdev;
3310 	device_features = mgtdev->mgtdev.supported_features;
3311 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
3312 		if (add_config->device_features & ~device_features) {
3313 			dev_warn(mdev->device,
3314 				 "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
3315 				 add_config->device_features, device_features);
3316 			return -EINVAL;
3317 		}
3318 		device_features &= add_config->device_features;
3319 	} else {
3320 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
3321 	}
3322 	if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) &&
3323 	      device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) {
3324 		dev_warn(mdev->device,
3325 			 "Must provision minimum features 0x%llx for this device",
3326 			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM));
3327 		return -EOPNOTSUPP;
3328 	}
3329 
3330 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
3331 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
3332 		dev_warn(mdev->device, "missing support for split virtqueues\n");
3333 		return -EOPNOTSUPP;
3334 	}
3335 
3336 	max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues),
3337 			1 << MLX5_CAP_GEN(mdev, log_max_rqt_size));
3338 	if (max_vqs < 2) {
3339 		dev_warn(mdev->device,
3340 			 "%d virtqueues are supported. At least 2 are required\n",
3341 			 max_vqs);
3342 		return -EAGAIN;
3343 	}
3344 
3345 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
3346 		if (add_config->net.max_vq_pairs > max_vqs / 2)
3347 			return -EINVAL;
3348 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
3349 	} else {
3350 		max_vqs = 2;
3351 	}
3352 
3353 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
3354 				 MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false);
3355 	if (IS_ERR(ndev))
3356 		return PTR_ERR(ndev);
3357 
3358 	ndev->mvdev.max_vqs = max_vqs;
3359 	mvdev = &ndev->mvdev;
3360 	mvdev->mdev = mdev;
3361 
3362 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
3363 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
3364 	if (!ndev->vqs || !ndev->event_cbs) {
3365 		err = -ENOMEM;
3366 		goto err_alloc;
3367 	}
3368 
3369 	init_mvqs(ndev);
3370 	allocate_irqs(ndev);
3371 	init_rwsem(&ndev->reslock);
3372 	config = &ndev->config;
3373 
3374 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
3375 		err = config_func_mtu(mdev, add_config->net.mtu);
3376 		if (err)
3377 			goto err_alloc;
3378 	}
3379 
3380 	if (device_features & BIT_ULL(VIRTIO_NET_F_MTU)) {
3381 		err = query_mtu(mdev, &mtu);
3382 		if (err)
3383 			goto err_alloc;
3384 
3385 		ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
3386 	}
3387 
3388 	if (device_features & BIT_ULL(VIRTIO_NET_F_STATUS)) {
3389 		if (get_link_state(mvdev))
3390 			ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
3391 		else
3392 			ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
3393 	}
3394 
3395 	if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
3396 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
3397 	/* No bother setting mac address in config if not going to provision _F_MAC */
3398 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0 ||
3399 		   device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3400 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
3401 		if (err)
3402 			goto err_alloc;
3403 	}
3404 
3405 	if (!is_zero_ether_addr(config->mac)) {
3406 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
3407 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
3408 		if (err)
3409 			goto err_alloc;
3410 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) {
3411 		/*
3412 		 * We used to clear _F_MAC feature bit if seeing
3413 		 * zero mac address when device features are not
3414 		 * specifically provisioned. Keep the behaviour
3415 		 * so old scripts do not break.
3416 		 */
3417 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
3418 	} else if (device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3419 		/* Don't provision zero mac address for _F_MAC */
3420 		mlx5_vdpa_warn(&ndev->mvdev,
3421 			       "No mac address provisioned?\n");
3422 		err = -EINVAL;
3423 		goto err_alloc;
3424 	}
3425 
3426 	if (device_features & BIT_ULL(VIRTIO_NET_F_MQ))
3427 		config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2);
3428 
3429 	ndev->mvdev.mlx_features = device_features;
3430 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
3431 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
3432 	if (err)
3433 		goto err_mpfs;
3434 
3435 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
3436 		err = mlx5_vdpa_create_mr(mvdev, NULL, 0);
3437 		if (err)
3438 			goto err_res;
3439 	}
3440 
3441 	err = alloc_resources(ndev);
3442 	if (err)
3443 		goto err_mr;
3444 
3445 	ndev->cvq_ent.mvdev = mvdev;
3446 	INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
3447 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
3448 	if (!mvdev->wq) {
3449 		err = -ENOMEM;
3450 		goto err_res2;
3451 	}
3452 
3453 	mvdev->vdev.mdev = &mgtdev->mgtdev;
3454 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
3455 	if (err)
3456 		goto err_reg;
3457 
3458 	mgtdev->ndev = ndev;
3459 	return 0;
3460 
3461 err_reg:
3462 	destroy_workqueue(mvdev->wq);
3463 err_res2:
3464 	free_resources(ndev);
3465 err_mr:
3466 	mlx5_vdpa_destroy_mr(mvdev);
3467 err_res:
3468 	mlx5_vdpa_free_resources(&ndev->mvdev);
3469 err_mpfs:
3470 	if (!is_zero_ether_addr(config->mac))
3471 		mlx5_mpfs_del_mac(pfmdev, config->mac);
3472 err_alloc:
3473 	put_device(&mvdev->vdev.dev);
3474 	return err;
3475 }
3476 
/*
 * Management-device .dev_del callback: tear down the vdpa net device @dev
 * and mark the management device as having no device.
 */
static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
{
	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
	struct workqueue_struct *wq;

	mlx5_vdpa_remove_debugfs(ndev->debugfs);
	ndev->debugfs = NULL;
	unregister_link_notifier(ndev);
	_vdpa_unregister_device(dev);
	/*
	 * The workqueue pointer is cleared before the queue is destroyed.
	 * NOTE(review): presumably so users that test mvdev->wq stop queueing
	 * work; those users are outside this block -- confirm.
	 */
	wq = mvdev->wq;
	mvdev->wq = NULL;
	destroy_workqueue(wq);
	/* mlx5_vdpa_dev_add() refuses to run while ndev is set; allow it again. */
	mgtdev->ndev = NULL;
}
3493 
/* Management-device callbacks; installed on each mgmt device in mlx5v_probe(). */
static const struct vdpa_mgmtdev_ops mdev_ops = {
	.dev_add = mlx5_vdpa_dev_add,
	.dev_del = mlx5_vdpa_dev_del,
};
3498 
/* virtio device IDs advertised by the management device: virtio-net only. */
static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },	/* terminator */
};
3503 
3504 static int mlx5v_probe(struct auxiliary_device *adev,
3505 		       const struct auxiliary_device_id *id)
3506 
3507 {
3508 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
3509 	struct mlx5_core_dev *mdev = madev->mdev;
3510 	struct mlx5_vdpa_mgmtdev *mgtdev;
3511 	int err;
3512 
3513 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
3514 	if (!mgtdev)
3515 		return -ENOMEM;
3516 
3517 	mgtdev->mgtdev.ops = &mdev_ops;
3518 	mgtdev->mgtdev.device = mdev->device;
3519 	mgtdev->mgtdev.id_table = id_table;
3520 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
3521 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
3522 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) |
3523 					  BIT_ULL(VDPA_ATTR_DEV_FEATURES);
3524 	mgtdev->mgtdev.max_supported_vqs =
3525 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
3526 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
3527 	mgtdev->madev = madev;
3528 
3529 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
3530 	if (err)
3531 		goto reg_err;
3532 
3533 	auxiliary_set_drvdata(adev, mgtdev);
3534 
3535 	return 0;
3536 
3537 reg_err:
3538 	kfree(mgtdev);
3539 	return err;
3540 }
3541 
3542 static void mlx5v_remove(struct auxiliary_device *adev)
3543 {
3544 	struct mlx5_vdpa_mgmtdev *mgtdev;
3545 
3546 	mgtdev = auxiliary_get_drvdata(adev);
3547 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
3548 	kfree(mgtdev);
3549 }
3550 
3551 static void mlx5v_shutdown(struct auxiliary_device *auxdev)
3552 {
3553 	struct mlx5_vdpa_mgmtdev *mgtdev;
3554 	struct mlx5_vdpa_net *ndev;
3555 
3556 	mgtdev = auxiliary_get_drvdata(auxdev);
3557 	ndev = mgtdev->ndev;
3558 
3559 	free_irqs(ndev);
3560 }
3561 
/* Auxiliary devices this driver binds to: the mlx5 ".vnet" device. */
static const struct auxiliary_device_id mlx5v_id_table[] = {
	{ .name = MLX5_ADEV_NAME ".vnet", },
	{},	/* terminator */
};

MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
3568 
/* Auxiliary driver definition tying the probe/remove/shutdown callbacks to the ID table. */
static struct auxiliary_driver mlx5v_driver = {
	.name = "vnet",
	.probe = mlx5v_probe,
	.remove = mlx5v_remove,
	.shutdown = mlx5v_shutdown,
	.id_table = mlx5v_id_table,
};

/* Generates the module init/exit functions for mlx5v_driver. */
module_auxiliary_driver(mlx5v_driver);
3578