xref: /openbmc/linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision ffcdf473)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <linux/virtio_config.h>
11 #include <linux/auxiliary_bus.h>
12 #include <linux/mlx5/cq.h>
13 #include <linux/mlx5/qp.h>
14 #include <linux/mlx5/device.h>
15 #include <linux/mlx5/driver.h>
16 #include <linux/mlx5/vport.h>
17 #include <linux/mlx5/fs.h>
18 #include <linux/mlx5/mlx5_ifc_vdpa.h>
19 #include <linux/mlx5/mpfs.h>
20 #include "mlx5_vdpa.h"
21 #include "mlx5_vnet.h"
22 
23 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
24 MODULE_DESCRIPTION("Mellanox VDPA driver");
25 MODULE_LICENSE("Dual BSD/GPL");
26 
27 #define VALID_FEATURES_MASK                                                                        \
28 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
29 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
30 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
31 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
32 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
33 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
34 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
35 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
36 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
37 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
38 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
39 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
40 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
41 
42 #define VALID_STATUS_MASK                                                                          \
43 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
44 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
45 
46 #define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
47 
48 #define MLX5V_UNTAGGED 0x1000
49 
50 struct mlx5_vdpa_cq_buf {
51 	struct mlx5_frag_buf_ctrl fbc;
52 	struct mlx5_frag_buf frag_buf;
53 	int cqe_size;
54 	int nent;
55 };
56 
57 struct mlx5_vdpa_cq {
58 	struct mlx5_core_cq mcq;
59 	struct mlx5_vdpa_cq_buf buf;
60 	struct mlx5_db db;
61 	int cqe;
62 };
63 
64 struct mlx5_vdpa_umem {
65 	struct mlx5_frag_buf_ctrl fbc;
66 	struct mlx5_frag_buf frag_buf;
67 	int size;
68 	u32 id;
69 };
70 
71 struct mlx5_vdpa_qp {
72 	struct mlx5_core_qp mqp;
73 	struct mlx5_frag_buf frag_buf;
74 	struct mlx5_db db;
75 	u16 head;
76 	bool fw;
77 };
78 
79 struct mlx5_vq_restore_info {
80 	u32 num_ent;
81 	u64 desc_addr;
82 	u64 device_addr;
83 	u64 driver_addr;
84 	u16 avail_index;
85 	u16 used_index;
86 	bool ready;
87 	bool restore;
88 };
89 
90 struct mlx5_vdpa_virtqueue {
91 	bool ready;
92 	u64 desc_addr;
93 	u64 device_addr;
94 	u64 driver_addr;
95 	u32 num_ent;
96 
97 	/* Resources for implementing the notification channel from the device
98 	 * to the driver. fwqp is the firmware end of an RC connection; the
99 	 * other end is vqqp used by the driver. cq is where completions are
100 	 * reported.
101 	 */
102 	struct mlx5_vdpa_cq cq;
103 	struct mlx5_vdpa_qp fwqp;
104 	struct mlx5_vdpa_qp vqqp;
105 
106 	/* umem resources are required for the virtqueue operation. Their use
107 	 * is internal and they must be provided by the driver.
108 	 */
109 	struct mlx5_vdpa_umem umem1;
110 	struct mlx5_vdpa_umem umem2;
111 	struct mlx5_vdpa_umem umem3;
112 
113 	u32 counter_set_id;
114 	bool initialized;
115 	int index;
116 	u32 virtq_id;
117 	struct mlx5_vdpa_net *ndev;
118 	u16 avail_idx;
119 	u16 used_idx;
120 	int fw_state;
121 
122 	/* keep last in the struct */
123 	struct mlx5_vq_restore_info ri;
124 };
125 
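/* A virtqueue index is valid only within the range implied by the negotiated
 * features: without VIRTIO_NET_F_MQ there is a single VQ pair (indices 0 and
 * 1), plus index 2 for the control VQ when VIRTIO_NET_F_CTRL_VQ is
 * negotiated; with MQ any index up to max_idx is accepted.
 */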
126 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
127 {
128 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
129 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
130 			return idx < 2;
131 		else
132 			return idx < 3;
133 	}
134 
135 	return idx <= mvdev->max_idx;
136 }
137 
138 static void free_resources(struct mlx5_vdpa_net *ndev);
139 static void init_mvqs(struct mlx5_vdpa_net *ndev);
140 static int setup_driver(struct mlx5_vdpa_dev *mvdev);
141 static void teardown_driver(struct mlx5_vdpa_net *ndev);
142 
143 static bool mlx5_vdpa_debug;
144 
145 #define MLX5_CVQ_MAX_ENT 16
146 
147 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
148 	do {                                                                                       \
149 		if (features & BIT_ULL(_feature))                                                  \
150 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
151 	} while (0)
152 
153 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
154 	do {                                                                                       \
155 		if (status & (_status))                                                            \
156 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
157 	} while (0)
158 
159 /* TODO: cross-endian support */
160 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
161 {
162 	return virtio_legacy_is_little_endian() ||
163 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
164 }
165 
166 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
167 {
168 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
169 }
170 
171 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
172 {
173 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
174 }
175 
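/* The control VQ index follows the data VQs: 2 when VIRTIO_NET_F_MQ is not
 * negotiated, otherwise max_vqs.
 */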
176 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
177 {
178 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
179 		return 2;
180 
181 	return mvdev->max_vqs;
182 }
183 
184 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
185 {
186 	return idx == ctrl_vq_idx(mvdev);
187 }
188 
189 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
190 {
191 	if (status & ~VALID_STATUS_MASK)
192 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
193 			       status & ~VALID_STATUS_MASK);
194 
195 	if (!mlx5_vdpa_debug)
196 		return;
197 
198 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
199 	if (set && !status) {
200 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
201 		return;
202 	}
203 
204 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
205 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
206 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
207 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
208 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
209 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
210 }
211 
212 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
213 {
214 	if (features & ~VALID_FEATURES_MASK)
215 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
216 			       features & ~VALID_FEATURES_MASK);
217 
218 	if (!mlx5_vdpa_debug)
219 		return;
220 
221 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
222 	if (!features)
223 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
224 
225 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
226 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
227 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
228 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
229 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
230 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
231 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
232 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
233 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
234 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
235 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
236 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
237 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
238 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
239 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
240 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
241 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
242 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
243 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
244 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
245 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
246 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
247 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
248 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
249 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
250 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
251 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
252 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
253 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
254 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
255 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
256 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
257 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
258 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
259 }
260 
261 static int create_tis(struct mlx5_vdpa_net *ndev)
262 {
263 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
264 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
265 	void *tisc;
266 	int err;
267 
268 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
269 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
270 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
271 	if (err)
272 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
273 
274 	return err;
275 }
276 
277 static void destroy_tis(struct mlx5_vdpa_net *ndev)
278 {
279 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
280 }
281 
282 #define MLX5_VDPA_CQE_SIZE 64
283 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
284 
285 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
286 {
287 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
288 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
289 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
290 	int err;
291 
292 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
293 				       ndev->mvdev.mdev->priv.numa_node);
294 	if (err)
295 		return err;
296 
297 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
298 
299 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
300 	buf->nent = nent;
301 
302 	return 0;
303 }
304 
305 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
306 {
307 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
308 
309 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
310 					ndev->mvdev.mdev->priv.numa_node);
311 }
312 
313 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
314 {
315 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
316 }
317 
318 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
319 {
320 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
321 }
322 
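/* Mark every CQE as invalid so that ownership is detected correctly on the
 * first pass over the CQ.
 */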
323 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
324 {
325 	struct mlx5_cqe64 *cqe64;
326 	void *cqe;
327 	int i;
328 
329 	for (i = 0; i < buf->nent; i++) {
330 		cqe = get_cqe(vcq, i);
331 		cqe64 = cqe;
332 		cqe64->op_own = MLX5_CQE_INVALID << 4;
333 	}
334 }
335 
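/* Return the CQE at index n if it is owned by software (valid opcode and
 * matching ownership bit), or NULL if it is still owned by the hardware.
 */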
336 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
337 {
338 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
339 
340 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
341 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
342 		return cqe64;
343 
344 	return NULL;
345 }
346 
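/* Post n receive buffers by advancing the RQ head and publishing the new head
 * through the doorbell record.
 */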
347 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
348 {
349 	vqp->head += n;
350 	vqp->db.db[0] = cpu_to_be32(vqp->head);
351 }
352 
353 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
354 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
355 {
356 	struct mlx5_vdpa_qp *vqp;
357 	__be64 *pas;
358 	void *qpc;
359 
360 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
361 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
362 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
363 	if (vqp->fw) {
364 		/* Firmware QP is allocated by the driver for the firmware's
365 		 * use, so we can skip part of the params as they will be chosen by the firmware.
366 		 */
367 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
368 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
369 		MLX5_SET(qpc, qpc, no_sq, 1);
370 		return;
371 	}
372 
373 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
374 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
375 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
376 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
377 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
378 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
379 	MLX5_SET(qpc, qpc, no_sq, 1);
380 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
381 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
382 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
383 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
384 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
385 }
386 
387 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
388 {
389 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
390 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
391 					ndev->mvdev.mdev->priv.numa_node);
392 }
393 
394 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
395 {
396 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
397 }
398 
399 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
400 		     struct mlx5_vdpa_qp *vqp)
401 {
402 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
403 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
404 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
405 	void *qpc;
406 	void *in;
407 	int err;
408 
409 	if (!vqp->fw) {
410 		vqp = &mvq->vqqp;
411 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
412 		if (err)
413 			return err;
414 
415 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
416 		if (err)
417 			goto err_db;
418 		inlen += vqp->frag_buf.npages * sizeof(__be64);
419 	}
420 
421 	in = kzalloc(inlen, GFP_KERNEL);
422 	if (!in) {
423 		err = -ENOMEM;
424 		goto err_kzalloc;
425 	}
426 
427 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
428 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
429 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
430 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
431 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
432 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
433 	if (!vqp->fw)
434 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
435 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
436 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
437 	kfree(in);
438 	if (err)
439 		goto err_kzalloc;
440 
441 	vqp->mqp.uid = ndev->mvdev.res.uid;
442 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
443 
444 	if (!vqp->fw)
445 		rx_post(vqp, mvq->num_ent);
446 
447 	return 0;
448 
449 err_kzalloc:
450 	if (!vqp->fw)
451 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
452 err_db:
453 	if (!vqp->fw)
454 		rq_buf_free(ndev, vqp);
455 
456 	return err;
457 }
458 
459 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
460 {
461 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
462 
463 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
464 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
465 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
466 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
467 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
468 	if (!vqp->fw) {
469 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
470 		rq_buf_free(ndev, vqp);
471 	}
472 }
473 
474 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
475 {
476 	return get_sw_cqe(cq, cq->mcq.cons_index);
477 }
478 
479 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
480 {
481 	struct mlx5_cqe64 *cqe64;
482 
483 	cqe64 = next_cqe_sw(vcq);
484 	if (!cqe64)
485 		return -EAGAIN;
486 
487 	vcq->mcq.cons_index++;
488 	return 0;
489 }
490 
491 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
492 {
493 	struct mlx5_vdpa_net *ndev = mvq->ndev;
494 	struct vdpa_callback *event_cb;
495 
496 	event_cb = &ndev->event_cbs[mvq->index];
497 	mlx5_cq_set_ci(&mvq->cq.mcq);
498 
499 	/* Make sure the CQ consumer index update is visible to the hardware before
500 	 * updating the RX doorbell record.
501 	 */
502 	dma_wmb();
503 	rx_post(&mvq->vqqp, num);
504 	if (event_cb->callback)
505 		event_cb->callback(event_cb->private);
506 }
507 
508 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
509 {
510 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
511 	struct mlx5_vdpa_net *ndev = mvq->ndev;
512 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
513 	int num = 0;
514 
515 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
516 		num++;
517 		if (num > mvq->num_ent / 2) {
518 			/* If completions keep coming while we poll, we want to
519 			 * let the hardware know that we consumed them by
520 			 * updating the doorbell record.  We also let the vdpa core
521 			 * know about this so it passes it on to the virtio driver
522 			 * in the guest.
523 			 */
524 			mlx5_vdpa_handle_completions(mvq, num);
525 			num = 0;
526 		}
527 	}
528 
529 	if (num)
530 		mlx5_vdpa_handle_completions(mvq, num);
531 
532 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
533 }
534 
535 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
536 {
537 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
538 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
539 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
540 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
541 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
542 	__be64 *pas;
543 	int inlen;
544 	void *cqc;
545 	void *in;
546 	int err;
547 	int eqn;
548 
549 	err = mlx5_db_alloc(mdev, &vcq->db);
550 	if (err)
551 		return err;
552 
553 	vcq->mcq.set_ci_db = vcq->db.db;
554 	vcq->mcq.arm_db = vcq->db.db + 1;
555 	vcq->mcq.cqe_sz = 64;
556 
557 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
558 	if (err)
559 		goto err_db;
560 
561 	cq_frag_buf_init(vcq, &vcq->buf);
562 
563 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
564 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
565 	in = kzalloc(inlen, GFP_KERNEL);
566 	if (!in) {
567 		err = -ENOMEM;
568 		goto err_vzalloc;
569 	}
570 
571 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
572 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
573 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
574 
575 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
576 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
577 
578 	/* Use vector 0 by default. Consider adding code to choose the least used
579 	 * vector.
580 	 */
581 	err = mlx5_vector2eqn(mdev, 0, &eqn);
582 	if (err)
583 		goto err_vec;
584 
585 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
586 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
587 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
588 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
589 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
590 
591 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
592 	if (err)
593 		goto err_vec;
594 
595 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
596 	vcq->cqe = num_ent;
597 	vcq->mcq.set_ci_db = vcq->db.db;
598 	vcq->mcq.arm_db = vcq->db.db + 1;
599 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
600 	kfree(in);
601 	return 0;
602 
603 err_vec:
604 	kfree(in);
605 err_vzalloc:
606 	cq_frag_buf_free(ndev, &vcq->buf);
607 err_db:
608 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
609 	return err;
610 }
611 
612 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
613 {
614 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
615 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
616 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
617 
618 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
619 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
620 		return;
621 	}
622 	cq_frag_buf_free(ndev, &vcq->buf);
623 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
624 }
625 
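/* The required size of each umem is a linear function of the queue size,
 * with the coefficients advertised by the device:
 * size = param_a * num_ent + param_b.
 */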
626 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
627 			  struct mlx5_vdpa_umem **umemp)
628 {
629 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
630 	int p_a;
631 	int p_b;
632 
633 	switch (num) {
634 	case 1:
635 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
636 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
637 		*umemp = &mvq->umem1;
638 		break;
639 	case 2:
640 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
641 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
642 		*umemp = &mvq->umem2;
643 		break;
644 	case 3:
645 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
646 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
647 		*umemp = &mvq->umem3;
648 		break;
649 	}
650 	(*umemp)->size = p_a * mvq->num_ent + p_b;
651 }
652 
653 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
654 {
655 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
656 }
657 
658 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
659 {
660 	int inlen;
661 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
662 	void *um;
663 	void *in;
664 	int err;
665 	__be64 *pas;
666 	struct mlx5_vdpa_umem *umem;
667 
668 	set_umem_size(ndev, mvq, num, &umem);
669 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
670 	if (err)
671 		return err;
672 
673 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
674 
675 	in = kzalloc(inlen, GFP_KERNEL);
676 	if (!in) {
677 		err = -ENOMEM;
678 		goto err_in;
679 	}
680 
681 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
682 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
683 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
684 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
685 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
686 
687 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
688 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
689 
690 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
691 	if (err) {
692 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
693 		goto err_cmd;
694 	}
695 
696 	kfree(in);
697 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
698 
699 	return 0;
700 
701 err_cmd:
702 	kfree(in);
703 err_in:
704 	umem_frag_buf_free(ndev, umem);
705 	return err;
706 }
707 
708 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
709 {
710 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
711 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
712 	struct mlx5_vdpa_umem *umem;
713 
714 	switch (num) {
715 	case 1:
716 		umem = &mvq->umem1;
717 		break;
718 	case 2:
719 		umem = &mvq->umem2;
720 		break;
721 	case 3:
722 		umem = &mvq->umem3;
723 		break;
724 	}
725 
726 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
727 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
728 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
729 		return;
730 
731 	umem_frag_buf_free(ndev, umem);
732 }
733 
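/* The device requires three umems per virtqueue; create all of them, rolling
 * back the ones already created on failure.
 */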
734 static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
735 {
736 	int num;
737 	int err;
738 
739 	for (num = 1; num <= 3; num++) {
740 		err = create_umem(ndev, mvq, num);
741 		if (err)
742 			goto err_umem;
743 	}
744 	return 0;
745 
746 err_umem:
747 	for (num--; num > 0; num--)
748 		umem_destroy(ndev, mvq, num);
749 
750 	return err;
751 }
752 
753 static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
754 {
755 	int num;
756 
757 	for (num = 3; num > 0; num--)
758 		umem_destroy(ndev, mvq, num);
759 }
760 
761 static int get_queue_type(struct mlx5_vdpa_net *ndev)
762 {
763 	u32 type_mask;
764 
765 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
766 
767 	/* prefer split queue */
768 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
769 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
770 
771 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
772 
773 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
774 }
775 
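/* virtio-net interleaves receive and transmit queues; odd indices are TX. */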
776 static bool vq_is_tx(u16 idx)
777 {
778 	return idx % 2;
779 }
780 
781 enum {
782 	MLX5_VIRTIO_NET_F_MRG_RXBUF = 2,
783 	MLX5_VIRTIO_NET_F_HOST_ECN = 4,
784 	MLX5_VIRTIO_NET_F_GUEST_ECN = 6,
785 	MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7,
786 	MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8,
787 	MLX5_VIRTIO_NET_F_GUEST_CSUM = 9,
788 	MLX5_VIRTIO_NET_F_CSUM = 10,
789 	MLX5_VIRTIO_NET_F_HOST_TSO6 = 11,
790 	MLX5_VIRTIO_NET_F_HOST_TSO4 = 12,
791 };
792 
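/* Translate negotiated virtio-net feature bits into the bit positions the
 * device expects in the virtio_net_q object feature mask.
 */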
793 static u16 get_features(u64 features)
794 {
795 	return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) |
796 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) |
797 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) |
798 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) |
799 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) |
800 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) |
801 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) |
802 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4);
803 }
804 
805 static bool counters_supported(const struct mlx5_vdpa_dev *mvdev)
806 {
807 	return MLX5_CAP_GEN_64(mvdev->mdev, general_obj_types) &
808 	       BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
809 }
810 
811 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
812 {
813 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
814 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
815 	void *obj_context;
816 	u16 mlx_features;
817 	void *cmd_hdr;
818 	void *vq_ctx;
819 	void *in;
820 	int err;
821 
822 	err = umems_create(ndev, mvq);
823 	if (err)
824 		return err;
825 
826 	in = kzalloc(inlen, GFP_KERNEL);
827 	if (!in) {
828 		err = -ENOMEM;
829 		goto err_alloc;
830 	}
831 
832 	mlx_features = get_features(ndev->mvdev.actual_features);
833 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
834 
835 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
836 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
837 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
838 
839 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
840 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
841 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
842 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
843 		 mlx_features >> 3);
844 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
845 		 mlx_features & 7);
846 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
847 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
848 
849 	if (vq_is_tx(mvq->index))
850 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
851 
852 	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
853 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
854 	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
855 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
856 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
857 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
858 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
859 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
860 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
861 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey);
862 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
863 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
864 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
865 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
866 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
867 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
868 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
869 	if (counters_supported(&ndev->mvdev))
870 		MLX5_SET(virtio_q, vq_ctx, counter_set_id, mvq->counter_set_id);
871 
872 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
873 	if (err)
874 		goto err_cmd;
875 
876 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
877 	kfree(in);
878 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
879 
880 	return 0;
881 
882 err_cmd:
883 	kfree(in);
884 err_alloc:
885 	umems_destroy(ndev, mvq);
886 	return err;
887 }
888 
889 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
890 {
891 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
892 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
893 
894 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
895 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
896 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
897 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
898 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
899 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
900 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
901 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
902 		return;
903 	}
904 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
905 	umems_destroy(ndev, mvq);
906 }
907 
908 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
909 {
910 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
911 }
912 
913 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
914 {
915 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
916 }
917 
918 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
919 			int *outlen, u32 qpn, u32 rqpn)
920 {
921 	void *qpc;
922 	void *pp;
923 
924 	switch (cmd) {
925 	case MLX5_CMD_OP_2RST_QP:
926 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
927 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
928 		*in = kzalloc(*inlen, GFP_KERNEL);
929 		*out = kzalloc(*outlen, GFP_KERNEL);
930 		if (!*in || !*out)
931 			goto outerr;
932 
933 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
934 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
935 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
936 		break;
937 	case MLX5_CMD_OP_RST2INIT_QP:
938 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
939 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
940 		*in = kzalloc(*inlen, GFP_KERNEL);
941 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
942 		if (!*in || !*out)
943 			goto outerr;
944 
945 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
946 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
947 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
948 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
949 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
950 		MLX5_SET(qpc, qpc, rwe, 1);
951 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
952 		MLX5_SET(ads, pp, vhca_port_num, 1);
953 		break;
954 	case MLX5_CMD_OP_INIT2RTR_QP:
955 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
956 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
957 		*in = kzalloc(*inlen, GFP_KERNEL);
958 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
959 		if (!*in || !*out)
960 			goto outerr;
961 
962 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
963 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
964 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
965 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
966 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
967 		MLX5_SET(qpc, qpc, log_msg_max, 30);
968 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
969 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
970 		MLX5_SET(ads, pp, fl, 1);
971 		break;
972 	case MLX5_CMD_OP_RTR2RTS_QP:
973 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
974 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
975 		*in = kzalloc(*inlen, GFP_KERNEL);
976 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
977 		if (!*in || !*out)
978 			goto outerr;
979 
980 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
981 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
982 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
983 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
984 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
985 		MLX5_SET(ads, pp, ack_timeout, 14);
986 		MLX5_SET(qpc, qpc, retry_count, 7);
987 		MLX5_SET(qpc, qpc, rnr_retry, 7);
988 		break;
989 	default:
990 		goto outerr_nullify;
991 	}
992 
993 	return;
994 
995 outerr:
996 	kfree(*in);
997 	kfree(*out);
998 outerr_nullify:
999 	*in = NULL;
1000 	*out = NULL;
1001 }
1002 
1003 static void free_inout(void *in, void *out)
1004 {
1005 	kfree(in);
1006 	kfree(out);
1007 }
1008 
1009 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1010  * firmware. The fw argument indicates whether the QP being modified is the one
1011  * used by firmware.
1012  */
1013 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1014 {
1015 	int outlen;
1016 	int inlen;
1017 	void *out;
1018 	void *in;
1019 	int err;
1020 
1021 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1022 	if (!in || !out)
1023 		return -ENOMEM;
1024 
1025 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1026 	free_inout(in, out);
1027 	return err;
1028 }
1029 
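/* Connect the firmware and driver QPs of a virtqueue by moving both through
 * the RST -> INIT -> RTR transitions; the firmware QP is additionally moved
 * to RTS.
 */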
1030 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1031 {
1032 	int err;
1033 
1034 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1035 	if (err)
1036 		return err;
1037 
1038 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1039 	if (err)
1040 		return err;
1041 
1042 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1043 	if (err)
1044 		return err;
1045 
1046 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1047 	if (err)
1048 		return err;
1049 
1050 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1051 	if (err)
1052 		return err;
1053 
1054 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1055 	if (err)
1056 		return err;
1057 
1058 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1059 }
1060 
1061 struct mlx5_virtq_attr {
1062 	u8 state;
1063 	u16 available_index;
1064 	u16 used_index;
1065 };
1066 
1067 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1068 			   struct mlx5_virtq_attr *attr)
1069 {
1070 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1071 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1072 	void *out;
1073 	void *obj_context;
1074 	void *cmd_hdr;
1075 	int err;
1076 
1077 	out = kzalloc(outlen, GFP_KERNEL);
1078 	if (!out)
1079 		return -ENOMEM;
1080 
1081 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1082 
1083 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1084 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1085 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1086 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1087 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1088 	if (err)
1089 		goto err_cmd;
1090 
1091 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1092 	memset(attr, 0, sizeof(*attr));
1093 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1094 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1095 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1096 	kfree(out);
1097 	return 0;
1098 
1099 err_cmd:
1100 	kfree(out);
1101 	return err;
1102 }
1103 
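/* Only the INIT -> RDY and RDY -> SUSPEND transitions are allowed; a
 * virtqueue in the SUSPEND or ERR state cannot be transitioned further.
 */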
1104 static bool is_valid_state_change(int oldstate, int newstate)
1105 {
1106 	switch (oldstate) {
1107 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
1108 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
1109 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
1110 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
1111 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
1112 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR:
1113 	default:
1114 		return false;
1115 	}
1116 }
1117 
1118 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1119 {
1120 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1121 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1122 	void *obj_context;
1123 	void *cmd_hdr;
1124 	void *in;
1125 	int err;
1126 
1127 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
1128 		return 0;
1129 
1130 	if (!is_valid_state_change(mvq->fw_state, state))
1131 		return -EINVAL;
1132 
1133 	in = kzalloc(inlen, GFP_KERNEL);
1134 	if (!in)
1135 		return -ENOMEM;
1136 
1137 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1138 
1139 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1140 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1141 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1142 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1143 
1144 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1145 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1146 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1147 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1148 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1149 	kfree(in);
1150 	if (!err)
1151 		mvq->fw_state = state;
1152 
1153 	return err;
1154 }
1155 
1156 static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1157 {
1158 	u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {};
1159 	u32 out[MLX5_ST_SZ_DW(create_virtio_q_counters_out)] = {};
1160 	void *cmd_hdr;
1161 	int err;
1162 
1163 	if (!counters_supported(&ndev->mvdev))
1164 		return 0;
1165 
1166 	cmd_hdr = MLX5_ADDR_OF(create_virtio_q_counters_in, in, hdr);
1167 
1168 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
1169 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1170 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1171 
1172 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
1173 	if (err)
1174 		return err;
1175 
1176 	mvq->counter_set_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
1177 
1178 	return 0;
1179 }
1180 
1181 static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1182 {
1183 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_q_counters_in)] = {};
1184 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_q_counters_out)] = {};
1185 
1186 	if (!counters_supported(&ndev->mvdev))
1187 		return;
1188 
1189 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
1190 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_id, mvq->counter_set_id);
1191 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.uid, ndev->mvdev.res.uid);
1192 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1193 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
1194 		mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id);
1195 }
1196 
1197 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1198 {
1199 	u16 idx = mvq->index;
1200 	int err;
1201 
1202 	if (!mvq->num_ent)
1203 		return 0;
1204 
1205 	if (mvq->initialized)
1206 		return 0;
1207 
1208 	err = cq_create(ndev, idx, mvq->num_ent);
1209 	if (err)
1210 		return err;
1211 
1212 	err = qp_create(ndev, mvq, &mvq->fwqp);
1213 	if (err)
1214 		goto err_fwqp;
1215 
1216 	err = qp_create(ndev, mvq, &mvq->vqqp);
1217 	if (err)
1218 		goto err_vqqp;
1219 
1220 	err = connect_qps(ndev, mvq);
1221 	if (err)
1222 		goto err_connect;
1223 
1224 	err = counter_set_alloc(ndev, mvq);
1225 	if (err)
1226 		goto err_counter;
1227 
1228 	err = create_virtqueue(ndev, mvq);
1229 	if (err)
1230 		goto err_connect;
1231 
1232 	if (mvq->ready) {
1233 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1234 		if (err) {
1235 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1236 				       idx, err);
1237 			goto err_connect;
1238 		}
1239 	}
1240 
1241 	mvq->initialized = true;
1242 	return 0;
1243 
1244 err_connect:
1245 	counter_set_dealloc(ndev, mvq);
1246 err_counter:
1247 	qp_destroy(ndev, &mvq->vqqp);
1248 err_vqqp:
1249 	qp_destroy(ndev, &mvq->fwqp);
1250 err_fwqp:
1251 	cq_destroy(ndev, idx);
1252 	return err;
1253 }
1254 
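/* Suspend a ready virtqueue and snapshot its available/used indices so they
 * can be restored when the queue is recreated.
 */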
1255 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1256 {
1257 	struct mlx5_virtq_attr attr;
1258 
1259 	if (!mvq->initialized)
1260 		return;
1261 
1262 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1263 		return;
1264 
1265 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1266 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1267 
1268 	if (query_virtqueue(ndev, mvq, &attr)) {
1269 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1270 		return;
1271 	}
1272 	mvq->avail_idx = attr.available_index;
1273 	mvq->used_idx = attr.used_index;
1274 }
1275 
1276 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1277 {
1278 	int i;
1279 
1280 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1281 		suspend_vq(ndev, &ndev->vqs[i]);
1282 }
1283 
1284 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1285 {
1286 	if (!mvq->initialized)
1287 		return;
1288 
1289 	suspend_vq(ndev, mvq);
1290 	destroy_virtqueue(ndev, mvq);
1291 	counter_set_dealloc(ndev, mvq);
1292 	qp_destroy(ndev, &mvq->vqqp);
1293 	qp_destroy(ndev, &mvq->fwqp);
1294 	cq_destroy(ndev, mvq->index);
1295 	mvq->initialized = false;
1296 }
1297 
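/* Create the RQT that spreads received traffic across the RX virtqueues. The
 * maximum table size is derived from rqt_size, while only the currently
 * active RX queues (the even-indexed virtqueues) are programmed.
 */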
1298 static int create_rqt(struct mlx5_vdpa_net *ndev)
1299 {
1300 	int rqt_table_size = roundup_pow_of_two(ndev->rqt_size);
1301 	int act_sz = roundup_pow_of_two(ndev->cur_num_vqs / 2);
1302 	__be32 *list;
1303 	void *rqtc;
1304 	int inlen;
1305 	void *in;
1306 	int i, j;
1307 	int err;
1308 
1309 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + rqt_table_size * MLX5_ST_SZ_BYTES(rq_num);
1310 	in = kzalloc(inlen, GFP_KERNEL);
1311 	if (!in)
1312 		return -ENOMEM;
1313 
1314 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1315 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1316 
1317 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1318 	MLX5_SET(rqtc, rqtc, rqt_max_size, rqt_table_size);
1319 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1320 	for (i = 0, j = 0; i < act_sz; i++, j += 2)
1321 		list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id);
1322 
1323 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1324 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1325 	kfree(in);
1326 	if (err)
1327 		return err;
1328 
1329 	return 0;
1330 }
1331 
1332 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1333 
1334 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1335 {
1336 	int act_sz = roundup_pow_of_two(num / 2);
1337 	__be32 *list;
1338 	void *rqtc;
1339 	int inlen;
1340 	void *in;
1341 	int i, j;
1342 	int err;
1343 
1344 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + act_sz * MLX5_ST_SZ_BYTES(rq_num);
1345 	in = kzalloc(inlen, GFP_KERNEL);
1346 	if (!in)
1347 		return -ENOMEM;
1348 
1349 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1350 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1351 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1352 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1353 
1354 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1355 	for (i = 0, j = 0; i < act_sz; i++, j = j + 2)
1356 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1357 
1358 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1359 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1360 	kfree(in);
1361 	if (err)
1362 		return err;
1363 
1364 	return 0;
1365 }
1366 
1367 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1368 {
1369 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1370 }
1371 
1372 static int create_tir(struct mlx5_vdpa_net *ndev)
1373 {
1374 #define HASH_IP_L4PORTS                                                                            \
1375 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1376 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1377 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1378 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1379 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1380 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1381 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1382 	void *rss_key;
1383 	void *outer;
1384 	void *tirc;
1385 	void *in;
1386 	int err;
1387 
1388 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1389 	if (!in)
1390 		return -ENOMEM;
1391 
1392 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1393 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1394 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1395 
1396 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1397 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1398 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1399 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1400 
1401 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1402 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1403 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1404 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1405 
1406 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1407 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1408 
1409 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1410 	kfree(in);
1411 	if (err)
1412 		return err;
1413 
1414 	mlx5_vdpa_add_tirn(ndev);
1415 	return err;
1416 }
1417 
1418 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1419 {
1420 	mlx5_vdpa_remove_tirn(ndev);
1421 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1422 }
1423 
1424 #define MAX_STEERING_ENT 0x8000
1425 #define MAX_STEERING_GROUPS 2
1426 
1427 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1428        #define NUM_DESTS 2
1429 #else
1430        #define NUM_DESTS 1
1431 #endif
1432 
1433 static int add_steering_counters(struct mlx5_vdpa_net *ndev,
1434 				 struct macvlan_node *node,
1435 				 struct mlx5_flow_act *flow_act,
1436 				 struct mlx5_flow_destination *dests)
1437 {
1438 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1439 	int err;
1440 
1441 	node->ucast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1442 	if (IS_ERR(node->ucast_counter.counter))
1443 		return PTR_ERR(node->ucast_counter.counter);
1444 
1445 	node->mcast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1446 	if (IS_ERR(node->mcast_counter.counter)) {
1447 		err = PTR_ERR(node->mcast_counter.counter);
1448 		goto err_mcast_counter;
1449 	}
1450 
1451 	dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1452 	flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
1453 	return 0;
1454 
1455 err_mcast_counter:
1456 	mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
1457 	return err;
1458 #else
1459 	return 0;
1460 #endif
1461 }
1462 
1463 static void remove_steering_counters(struct mlx5_vdpa_net *ndev,
1464 				     struct macvlan_node *node)
1465 {
1466 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1467 	mlx5_fc_destroy(ndev->mvdev.mdev, node->mcast_counter.counter);
1468 	mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
1469 #endif
1470 }
1471 
1472 static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac,
1473 					struct macvlan_node *node)
1474 {
1475 	struct mlx5_flow_destination dests[NUM_DESTS] = {};
1476 	struct mlx5_flow_act flow_act = {};
1477 	struct mlx5_flow_spec *spec;
1478 	void *headers_c;
1479 	void *headers_v;
1480 	u8 *dmac_c;
1481 	u8 *dmac_v;
1482 	int err;
1483 	u16 vid;
1484 
1485 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1486 	if (!spec)
1487 		return -ENOMEM;
1488 
1489 	vid = key2vid(node->macvlan);
1490 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1491 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1492 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1493 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1494 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1495 	eth_broadcast_addr(dmac_c);
1496 	ether_addr_copy(dmac_v, mac);
1497 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)) {
1498 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
1499 		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid);
1500 	}
1501 	if (node->tagged) {
1502 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1);
1503 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid);
1504 	}
1505 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1506 	dests[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1507 	dests[0].tir_num = ndev->res.tirn;
1508 	err = add_steering_counters(ndev, node, &flow_act, dests);
1509 	if (err)
1510 		goto out_free;
1511 
1512 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1513 	dests[1].counter_id = mlx5_fc_id(node->ucast_counter.counter);
1514 #endif
1515 	node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1516 	if (IS_ERR(node->ucast_rule)) {
1517 		err = PTR_ERR(node->ucast_rule);
1518 		goto err_ucast;
1519 	}
1520 
1521 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1522 	dests[1].counter_id = mlx5_fc_id(node->mcast_counter.counter);
1523 #endif
1524 
1525 	memset(dmac_c, 0, ETH_ALEN);
1526 	memset(dmac_v, 0, ETH_ALEN);
1527 	dmac_c[0] = 1;
1528 	dmac_v[0] = 1;
1529 	node->mcast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1530 	if (IS_ERR(node->mcast_rule)) {
1531 		err = PTR_ERR(node->mcast_rule);
1532 		goto err_mcast;
1533 	}
1534 	kvfree(spec);
1535 	mlx5_vdpa_add_rx_counters(ndev, node);
1536 	return 0;
1537 
1538 err_mcast:
1539 	mlx5_del_flow_rules(node->ucast_rule);
1540 err_ucast:
1541 	remove_steering_counters(ndev, node);
1542 out_free:
1543 	kvfree(spec);
1544 	return err;
1545 }
1546 
1547 static void mlx5_vdpa_del_mac_vlan_rules(struct mlx5_vdpa_net *ndev,
1548 					 struct macvlan_node *node)
1549 {
1550 	mlx5_vdpa_remove_rx_counters(ndev, node);
1551 	mlx5_del_flow_rules(node->ucast_rule);
1552 	mlx5_del_flow_rules(node->mcast_rule);
1553 }
1554 
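/* Pack the VLAN id (or the MLX5V_UNTAGGED marker) and the MAC address into a
 * single 64-bit key used to look up entries in the mac/vlan hash table.
 */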
1555 static u64 search_val(u8 *mac, u16 vlan, bool tagged)
1556 {
1557 	u64 val;
1558 
1559 	if (!tagged)
1560 		vlan = MLX5V_UNTAGGED;
1561 
1562 	val = (u64)vlan << 48 |
1563 	      (u64)mac[0] << 40 |
1564 	      (u64)mac[1] << 32 |
1565 	      (u64)mac[2] << 24 |
1566 	      (u64)mac[3] << 16 |
1567 	      (u64)mac[4] << 8 |
1568 	      (u64)mac[5];
1569 
1570 	return val;
1571 }
1572 
1573 static struct macvlan_node *mac_vlan_lookup(struct mlx5_vdpa_net *ndev, u64 value)
1574 {
1575 	struct macvlan_node *pos;
1576 	u32 idx;
1577 
1578 	idx = hash_64(value, 8); // TODO: revisit the number of hash bits (8)
1579 	hlist_for_each_entry(pos, &ndev->macvlan_hash[idx], hlist) {
1580 		if (pos->macvlan == value)
1581 			return pos;
1582 	}
1583 	return NULL;
1584 }
1585 
1586 static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vid, bool tagged)
1587 {
1588 	struct macvlan_node *ptr;
1589 	u64 val;
1590 	u32 idx;
1591 	int err;
1592 
1593 	val = search_val(mac, vid, tagged);
1594 	if (mac_vlan_lookup(ndev, val))
1595 		return -EEXIST;
1596 
1597 	ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
1598 	if (!ptr)
1599 		return -ENOMEM;
1600 
1601 	ptr->tagged = tagged;
1602 	ptr->macvlan = val;
1603 	ptr->ndev = ndev;
1604 	err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, ptr);
1605 	if (err)
1606 		goto err_add;
1607 
1608 	idx = hash_64(val, 8);
1609 	hlist_add_head(&ptr->hlist, &ndev->macvlan_hash[idx]);
1610 	return 0;
1611 
1612 err_add:
1613 	kfree(ptr);
1614 	return err;
1615 }
1616 
1617 static void mac_vlan_del(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagged)
1618 {
1619 	struct macvlan_node *ptr;
1620 
1621 	ptr = mac_vlan_lookup(ndev, search_val(mac, vlan, tagged));
1622 	if (!ptr)
1623 		return;
1624 
1625 	hlist_del(&ptr->hlist);
1626 	mlx5_vdpa_del_mac_vlan_rules(ndev, ptr);
1627 	remove_steering_counters(ndev, ptr);
1628 	kfree(ptr);
1629 }
1630 
1631 static void clear_mac_vlan_table(struct mlx5_vdpa_net *ndev)
1632 {
1633 	struct macvlan_node *pos;
1634 	struct hlist_node *n;
1635 	int i;
1636 
1637 	for (i = 0; i < MLX5V_MACVLAN_SIZE; i++) {
1638 		hlist_for_each_entry_safe(pos, n, &ndev->macvlan_hash[i], hlist) {
1639 			hlist_del(&pos->hlist);
1640 			mlx5_vdpa_del_mac_vlan_rules(ndev, pos);
1641 			remove_steering_counters(ndev, pos);
1642 			kfree(pos);
1643 		}
1644 	}
1645 }
1646 
1647 static int setup_steering(struct mlx5_vdpa_net *ndev)
1648 {
1649 	struct mlx5_flow_table_attr ft_attr = {};
1650 	struct mlx5_flow_namespace *ns;
1651 	int err;
1652 
1653 	ft_attr.max_fte = MAX_STEERING_ENT;
1654 	ft_attr.autogroup.max_num_groups = MAX_STEERING_GROUPS;
1655 
1656 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1657 	if (!ns) {
1658 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
1659 		return -EOPNOTSUPP;
1660 	}
1661 
1662 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1663 	if (IS_ERR(ndev->rxft)) {
1664 		mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n");
1665 		return PTR_ERR(ndev->rxft);
1666 	}
1667 	mlx5_vdpa_add_rx_flow_table(ndev);
1668 
1669 	err = mac_vlan_add(ndev, ndev->config.mac, 0, false);
1670 	if (err)
1671 		goto err_add;
1672 
1673 	return 0;
1674 
1675 err_add:
1676 	mlx5_vdpa_remove_rx_flow_table(ndev);
1677 	mlx5_destroy_flow_table(ndev->rxft);
1678 	return err;
1679 }
1680 
1681 static void teardown_steering(struct mlx5_vdpa_net *ndev)
1682 {
1683 	clear_mac_vlan_table(ndev);
1684 	mlx5_vdpa_remove_rx_flow_table(ndev);
1685 	mlx5_destroy_flow_table(ndev->rxft);
1686 }
1687 
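/* Handle VIRTIO_NET_CTRL_MAC commands from the control VQ. Changing the MAC
 * requires updating the MPFS table of the physical function and recreating
 * the steering rules; if that fails, the original MAC is restored.
 */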
1688 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1689 {
1690 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1691 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1692 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1693 	struct mlx5_core_dev *pfmdev;
1694 	size_t read;
1695 	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
1696 
1697 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1698 	switch (cmd) {
1699 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1700 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1701 		if (read != ETH_ALEN)
1702 			break;
1703 
1704 		if (!memcmp(ndev->config.mac, mac, 6)) {
1705 			status = VIRTIO_NET_OK;
1706 			break;
1707 		}
1708 
1709 		if (is_zero_ether_addr(mac))
1710 			break;
1711 
1712 		if (!is_zero_ether_addr(ndev->config.mac)) {
1713 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1714 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1715 					       ndev->config.mac);
1716 				break;
1717 			}
1718 		}
1719 
1720 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1721 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1722 				       mac);
1723 			break;
1724 		}
1725 
1726 		/* Back up the original MAC address so that it can be restored if
1727 		 * adding the forward rules fails.
1728 		 */
1729 		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
1730 
1731 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1732 
1733 		/* Recreate the flow table entry so that packets sent to the new MAC
1734 		 * are still forwarded to the device. */
1735 		mac_vlan_del(ndev, mac_back, 0, false);
1736 
1737 		if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) {
1738 			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
1739 
1740 			/* This path is unlikely to be taken, but double check anyway. */
1741 			if (is_zero_ether_addr(mac_back)) {
1742 				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
1743 				break;
1744 			}
1745 
1746 			/* Try to restore the original MAC address to the MPFS table, and
1747 			 * try to restore the forward rule entry.
1748 			 */
1749 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1750 				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
1751 					       ndev->config.mac);
1752 			}
1753 
1754 			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
1755 				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
1756 					       mac_back);
1757 			}
1758 
1759 			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
1760 
1761 			if (mac_vlan_add(ndev, ndev->config.mac, 0, false))
1762 				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
1763 
1764 			break;
1765 		}
1766 
1767 		status = VIRTIO_NET_OK;
1768 		break;
1769 
1770 	default:
1771 		break;
1772 	}
1773 
1774 	return status;
1775 }
1776 
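/* Grow or shrink the number of active data virtqueues to 2 * newqps.
 * When shrinking, the RQT is modified first and the excess virtqueues are
 * torn down; when growing, the new virtqueues are set up before the RQT is
 * expanded. On failure, any virtqueues added by this call are removed again.
 */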
1777 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
1778 {
1779 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1780 	int cur_qps = ndev->cur_num_vqs / 2;
1781 	int err;
1782 	int i;
1783 
1784 	if (cur_qps > newqps) {
1785 		err = modify_rqt(ndev, 2 * newqps);
1786 		if (err)
1787 			return err;
1788 
1789 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
1790 			teardown_vq(ndev, &ndev->vqs[i]);
1791 
1792 		ndev->cur_num_vqs = 2 * newqps;
1793 	} else {
1794 		ndev->cur_num_vqs = 2 * newqps;
1795 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
1796 			err = setup_vq(ndev, &ndev->vqs[i]);
1797 			if (err)
1798 				goto clean_added;
1799 		}
1800 		err = modify_rqt(ndev, 2 * newqps);
1801 		if (err)
1802 			goto clean_added;
1803 	}
1804 	return 0;
1805 
1806 clean_added:
1807 	for (--i; i >= 2 * cur_qps; --i)
1808 		teardown_vq(ndev, &ndev->vqs[i]);
1809 
1810 	ndev->cur_num_vqs = 2 * cur_qps;
1811 
1812 	return err;
1813 }
1814 
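/* Handle VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET: validate the requested number of
 * queue pairs against the negotiated features and rqt_size, then adjust the
 * number of active data virtqueues accordingly.
 */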
1815 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1816 {
1817 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1818 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1819 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1820 	struct virtio_net_ctrl_mq mq;
1821 	size_t read;
1822 	u16 newqps;
1823 
1824 	switch (cmd) {
1825 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
1826 		/* This mq feature check aligns with pre-existing userspace
1827 		 * implementation.
1828 		 *
1829 		 * Without it, an untrusted driver could fake a multiqueue config
1830 		 * request down to a non-mq device, which may cause the kernel to
1831 		 * panic due to uninitialized resources for the extra vqs. Even with
1832 		 * a well-behaved guest driver, changing the number of vqs on a
1833 		 * non-mq device is not expected to be allowed.
1834 		 */
1835 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
1836 			break;
1837 
1838 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
1839 		if (read != sizeof(mq))
1840 			break;
1841 
1842 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
1843 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1844 		    newqps > ndev->rqt_size)
1845 			break;
1846 
1847 		if (ndev->cur_num_vqs == 2 * newqps) {
1848 			status = VIRTIO_NET_OK;
1849 			break;
1850 		}
1851 
1852 		if (!change_num_qps(mvdev, newqps))
1853 			status = VIRTIO_NET_OK;
1854 
1855 		break;
1856 	default:
1857 		break;
1858 	}
1859 
1860 	return status;
1861 }
1862 
1863 static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1864 {
1865 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1866 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1867 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1868 	__virtio16 vlan;
1869 	size_t read;
1870 	u16 id;
1871 
1872 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)))
1873 		return status;
1874 
1875 	switch (cmd) {
1876 	case VIRTIO_NET_CTRL_VLAN_ADD:
1877 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
1878 		if (read != sizeof(vlan))
1879 			break;
1880 
1881 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
1882 		if (mac_vlan_add(ndev, ndev->config.mac, id, true))
1883 			break;
1884 
1885 		status = VIRTIO_NET_OK;
1886 		break;
1887 	case VIRTIO_NET_CTRL_VLAN_DEL:
1888 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
1889 		if (read != sizeof(vlan))
1890 			break;
1891 
1892 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
1893 		mac_vlan_del(ndev, ndev->config.mac, id, true);
1894 		status = VIRTIO_NET_OK;
1895 		break;
1896 	default:
1897 		break;
1898 	}
1899 
1900 	return status;
1901 }
1902 
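/* Work handler for control virtqueue kicks. It pulls one control command,
 * dispatches it by class, pushes the ack status back to the used ring and
 * requeues itself so that any remaining descriptors are processed later.
 */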
1903 static void mlx5_cvq_kick_handler(struct work_struct *work)
1904 {
1905 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1906 	struct virtio_net_ctrl_hdr ctrl;
1907 	struct mlx5_vdpa_wq_ent *wqent;
1908 	struct mlx5_vdpa_dev *mvdev;
1909 	struct mlx5_control_vq *cvq;
1910 	struct mlx5_vdpa_net *ndev;
1911 	size_t read, write;
1912 	int err;
1913 
1914 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
1915 	mvdev = wqent->mvdev;
1916 	ndev = to_mlx5_vdpa_ndev(mvdev);
1917 	cvq = &mvdev->cvq;
1918 
1919 	down_write(&ndev->reslock);
1920 
1921 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
1922 		goto out;
1923 
1924 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1925 		goto out;
1926 
1927 	if (!cvq->ready)
1928 		goto out;
1929 
1930 	while (true) {
1931 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
1932 					   GFP_ATOMIC);
1933 		if (err <= 0)
1934 			break;
1935 
1936 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
1937 		if (read != sizeof(ctrl))
1938 			break;
1939 
1940 		cvq->received_desc++;
1941 		switch (ctrl.class) {
1942 		case VIRTIO_NET_CTRL_MAC:
1943 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
1944 			break;
1945 		case VIRTIO_NET_CTRL_MQ:
1946 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
1947 			break;
1948 		case VIRTIO_NET_CTRL_VLAN:
1949 			status = handle_ctrl_vlan(mvdev, ctrl.cmd);
1950 			break;
1951 		default:
1952 			break;
1953 		}
1954 
1955 		/* Make sure data is written before advancing index */
1956 		smp_wmb();
1957 
1958 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
1959 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
1960 		vringh_kiov_cleanup(&cvq->riov);
1961 		vringh_kiov_cleanup(&cvq->wiov);
1962 
1963 		if (vringh_need_notify_iotlb(&cvq->vring))
1964 			vringh_notify(&cvq->vring);
1965 
1966 		cvq->completed_desc++;
1967 		queue_work(mvdev->wq, &wqent->work);
1968 		break;
1969 	}
1970 
1971 out:
1972 	up_write(&ndev->reslock);
1973 }
1974 
1975 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1976 {
1977 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1978 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1979 	struct mlx5_vdpa_virtqueue *mvq;
1980 
1981 	if (!is_index_valid(mvdev, idx))
1982 		return;
1983 
1984 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
1985 		if (!mvdev->wq || !mvdev->cvq.ready)
1986 			return;
1987 
1988 		queue_work(mvdev->wq, &ndev->cvq_ent.work);
1989 		return;
1990 	}
1991 
1992 	mvq = &ndev->vqs[idx];
1993 	if (unlikely(!mvq->ready))
1994 		return;
1995 
1996 	iowrite16(idx, ndev->mvdev.res.kick_addr);
1997 }
1998 
1999 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
2000 				    u64 driver_area, u64 device_area)
2001 {
2002 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2003 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2004 	struct mlx5_vdpa_virtqueue *mvq;
2005 
2006 	if (!is_index_valid(mvdev, idx))
2007 		return -EINVAL;
2008 
2009 	if (is_ctrl_vq_idx(mvdev, idx)) {
2010 		mvdev->cvq.desc_addr = desc_area;
2011 		mvdev->cvq.device_addr = device_area;
2012 		mvdev->cvq.driver_addr = driver_area;
2013 		return 0;
2014 	}
2015 
2016 	mvq = &ndev->vqs[idx];
2017 	mvq->desc_addr = desc_area;
2018 	mvq->device_addr = device_area;
2019 	mvq->driver_addr = driver_area;
2020 	return 0;
2021 }
2022 
2023 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
2024 {
2025 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2026 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2027 	struct mlx5_vdpa_virtqueue *mvq;
2028 
2029 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2030 		return;
2031 
2032 	mvq = &ndev->vqs[idx];
2033 	mvq->num_ent = num;
2034 }
2035 
2036 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
2037 {
2038 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2039 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2040 
2041 	ndev->event_cbs[idx] = *cb;
2042 	if (is_ctrl_vq_idx(mvdev, idx))
2043 		mvdev->cvq.event_cb = *cb;
2044 }
2045 
2046 static void mlx5_cvq_notify(struct vringh *vring)
2047 {
2048 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
2049 
2050 	if (!cvq->event_cb.callback)
2051 		return;
2052 
2053 	cvq->event_cb.callback(cvq->event_cb.private);
2054 }
2055 
2056 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
2057 {
2058 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2059 
2060 	cvq->ready = ready;
2061 	if (!ready)
2062 		return;
2063 
2064 	cvq->vring.notify = mlx5_cvq_notify;
2065 }
2066 
2067 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
2068 {
2069 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2070 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2071 	struct mlx5_vdpa_virtqueue *mvq;
2072 	int err;
2073 
2074 	if (!mvdev->actual_features)
2075 		return;
2076 
2077 	if (!is_index_valid(mvdev, idx))
2078 		return;
2079 
2080 	if (is_ctrl_vq_idx(mvdev, idx)) {
2081 		set_cvq_ready(mvdev, ready);
2082 		return;
2083 	}
2084 
2085 	mvq = &ndev->vqs[idx];
2086 	if (!ready) {
2087 		suspend_vq(ndev, mvq);
2088 	} else {
2089 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
2090 		if (err) {
2091 			mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err);
2092 			ready = false;
2093 		}
2094 	}
2095 
2097 	mvq->ready = ready;
2098 }
2099 
2100 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
2101 {
2102 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2103 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2104 
2105 	if (!is_index_valid(mvdev, idx))
2106 		return false;
2107 
2108 	if (is_ctrl_vq_idx(mvdev, idx))
2109 		return mvdev->cvq.ready;
2110 
2111 	return ndev->vqs[idx].ready;
2112 }
2113 
2114 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
2115 				  const struct vdpa_vq_state *state)
2116 {
2117 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2118 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2119 	struct mlx5_vdpa_virtqueue *mvq;
2120 
2121 	if (!is_index_valid(mvdev, idx))
2122 		return -EINVAL;
2123 
2124 	if (is_ctrl_vq_idx(mvdev, idx)) {
2125 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
2126 		return 0;
2127 	}
2128 
2129 	mvq = &ndev->vqs[idx];
2130 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
2131 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
2132 		return -EINVAL;
2133 	}
2134 
2135 	mvq->used_idx = state->split.avail_index;
2136 	mvq->avail_idx = state->split.avail_index;
2137 	return 0;
2138 }
2139 
2140 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
2141 {
2142 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2143 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2144 	struct mlx5_vdpa_virtqueue *mvq;
2145 	struct mlx5_virtq_attr attr;
2146 	int err;
2147 
2148 	if (!is_index_valid(mvdev, idx))
2149 		return -EINVAL;
2150 
2151 	if (is_ctrl_vq_idx(mvdev, idx)) {
2152 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
2153 		return 0;
2154 	}
2155 
2156 	mvq = &ndev->vqs[idx];
2157 	/* If the virtq object was destroyed, use the value saved at
2158 	 * the last minute of suspend_vq. This caters for userspace
2159 	 * that cares about emulating the index after vq is stopped.
2160 	 */
2161 	if (!mvq->initialized) {
2162 		/* Firmware returns a wrong value for the available index.
2163 		 * Since both values should be identical, we take the value of
2164 		 * used_idx which is reported correctly.
2165 		 */
2166 		state->split.avail_index = mvq->used_idx;
2167 		return 0;
2168 	}
2169 
2170 	err = query_virtqueue(ndev, mvq, &attr);
2171 	if (err) {
2172 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
2173 		return err;
2174 	}
2175 	state->split.avail_index = attr.used_index;
2176 	return 0;
2177 }
2178 
2179 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
2180 {
2181 	return PAGE_SIZE;
2182 }
2183 
2184 static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx)
2185 {
2186 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2187 
2188 	if (is_ctrl_vq_idx(mvdev, idx))
2189 		return MLX5_VDPA_CVQ_GROUP;
2190 
2191 	return MLX5_VDPA_DATAVQ_GROUP;
2192 }
2193 
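/* Translate the device feature bits reported by the VDPA emulation capability
 * (device_features_bits_mask) into the corresponding VIRTIO_NET feature bits.
 */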
2194 static u64 mlx_to_virtio_features(u16 dev_features)
2195 {
2196 	u64 result = 0;
2197 
2198 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_MRG_RXBUF))
2199 		result |= BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
2200 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_ECN))
2201 		result |= BIT_ULL(VIRTIO_NET_F_HOST_ECN);
2202 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_ECN))
2203 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_ECN);
2204 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO6))
2205 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO6);
2206 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO4))
2207 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO4);
2208 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_CSUM))
2209 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
2210 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_CSUM))
2211 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
2212 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO6))
2213 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
2214 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO4))
2215 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
2216 
2217 	return result;
2218 }
2219 
2220 static u64 get_supported_features(struct mlx5_core_dev *mdev)
2221 {
2222 	u64 mlx_vdpa_features = 0;
2223 	u16 dev_features;
2224 
2225 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
2226 	mlx_vdpa_features |= mlx_to_virtio_features(dev_features);
2227 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
2228 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
2229 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
2230 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
2231 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
2232 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
2233 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
2234 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
2235 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VLAN);
2236 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2237 
2238 	return mlx_vdpa_features;
2239 }
2240 
2241 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
2242 {
2243 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2244 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2245 
2246 	print_features(mvdev, ndev->mvdev.mlx_features, false);
2247 	return ndev->mvdev.mlx_features;
2248 }
2249 
2250 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
2251 {
2252 	/* Minimum features to expect */
2253 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
2254 		return -EOPNOTSUPP;
2255 
2256 	/* Double check the feature combination sent down by the driver.
2257 	 * Fail combinations where a required feature is absent.
2258 	 *
2259 	 * Per the VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
2260 	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
2261 	 * By failing invalid features sent down by untrusted drivers, we
2262 	 * ensure the assumptions made in is_index_valid() and
2263 	 * is_ctrl_vq_idx() are not compromised.
2264 	 */
2265 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
2266             BIT_ULL(VIRTIO_NET_F_MQ))
2267 		return -EINVAL;
2268 
2269 	return 0;
2270 }
2271 
2272 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
2273 {
2274 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2275 	int err;
2276 	int i;
2277 
2278 	for (i = 0; i < mvdev->max_vqs; i++) {
2279 		err = setup_vq(ndev, &ndev->vqs[i]);
2280 		if (err)
2281 			goto err_vq;
2282 	}
2283 
2284 	return 0;
2285 
2286 err_vq:
2287 	for (--i; i >= 0; i--)
2288 		teardown_vq(ndev, &ndev->vqs[i]);
2289 
2290 	return err;
2291 }
2292 
2293 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
2294 {
2295 	struct mlx5_vdpa_virtqueue *mvq;
2296 	int i;
2297 
2298 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
2299 		mvq = &ndev->vqs[i];
2300 		if (!mvq->initialized)
2301 			continue;
2302 
2303 		teardown_vq(ndev, mvq);
2304 	}
2305 }
2306 
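/* Compute the highest valid virtqueue index for the negotiated features.
 * Illustrative example (max_vqs = 16 is an arbitrary value): with CTRL_VQ and
 * MQ negotiated, data virtqueues use indices 0..15 and the CVQ uses index 16,
 * so max_idx is 16. Without MQ, indices 0 and 1 are rx/tx and the CVQ sits at
 * index 2. Without CTRL_VQ, only the two data virtqueues exist and max_idx is 1.
 */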
2307 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
2308 {
2309 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
2310 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
2311 			/* MQ supported. CVQ index is right above the last data virtqueue's */
2312 			mvdev->max_idx = mvdev->max_vqs;
2313 		} else {
2314 			/* Only CVQ is supported. Data virtqueues occupy indices 0 and 1;
2315 			 * the CVQ gets index 2.
2316 			 */
2317 			mvdev->max_idx = 2;
2318 		}
2319 	} else {
2320 		/* Two data virtqueues only: one for rx and one for tx */
2321 		mvdev->max_idx = 1;
2322 	}
2323 }
2324 
2325 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2326 {
2327 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2328 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2329 	int err;
2330 
2331 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2332 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2333 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2334 	if (vport)
2335 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2336 
2337 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2338 	if (err)
2339 		return 0;
2340 
2341 	return MLX5_GET(query_vport_state_out, out, state);
2342 }
2343 
2344 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2345 {
2346 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2347 	    VPORT_STATE_UP)
2348 		return true;
2349 
2350 	return false;
2351 }
2352 
2353 static void update_carrier(struct work_struct *work)
2354 {
2355 	struct mlx5_vdpa_wq_ent *wqent;
2356 	struct mlx5_vdpa_dev *mvdev;
2357 	struct mlx5_vdpa_net *ndev;
2358 
2359 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2360 	mvdev = wqent->mvdev;
2361 	ndev = to_mlx5_vdpa_ndev(mvdev);
2362 	if (get_link_state(mvdev))
2363 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2364 	else
2365 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2366 
2367 	if (ndev->config_cb.callback)
2368 		ndev->config_cb.callback(ndev->config_cb.private);
2369 
2370 	kfree(wqent);
2371 }
2372 
2373 static int queue_link_work(struct mlx5_vdpa_net *ndev)
2374 {
2375 	struct mlx5_vdpa_wq_ent *wqent;
2376 
2377 	wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2378 	if (!wqent)
2379 		return -ENOMEM;
2380 
2381 	wqent->mvdev = &ndev->mvdev;
2382 	INIT_WORK(&wqent->work, update_carrier);
2383 	queue_work(ndev->mvdev.wq, &wqent->work);
2384 	return 0;
2385 }
2386 
2387 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
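/* mlx5 event notifier callback: on port up/down events, queue a work item
 * that re-reads the vport state and updates the virtio link status, invoking
 * the config callback if one is registered.
 */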
2388 {
2389 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2390 	struct mlx5_eqe *eqe = param;
2391 	int ret = NOTIFY_DONE;
2392 
2393 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2394 		switch (eqe->sub_type) {
2395 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2396 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2397 			if (queue_link_work(ndev))
2398 				return NOTIFY_DONE;
2399 
2400 			ret = NOTIFY_OK;
2401 			break;
2402 		default:
2403 			return NOTIFY_DONE;
2404 		}
2405 		return ret;
2406 	}
2407 	return ret;
2408 }
2409 
2410 static void register_link_notifier(struct mlx5_vdpa_net *ndev)
2411 {
2412 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS)))
2413 		return;
2414 
2415 	ndev->nb.notifier_call = event_handler;
2416 	mlx5_notifier_register(ndev->mvdev.mdev, &ndev->nb);
2417 	ndev->nb_registered = true;
2418 	queue_link_work(ndev);
2419 }
2420 
2421 static void unregister_link_notifier(struct mlx5_vdpa_net *ndev)
2422 {
2423 	if (!ndev->nb_registered)
2424 		return;
2425 
2426 	ndev->nb_registered = false;
2427 	mlx5_notifier_unregister(ndev->mvdev.mdev, &ndev->nb);
2428 	if (ndev->mvdev.wq)
2429 		flush_workqueue(ndev->mvdev.wq);
2430 }
2431 
2432 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2433 {
2434 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2435 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2436 	int err;
2437 
2438 	print_features(mvdev, features, true);
2439 
2440 	err = verify_driver_features(mvdev, features);
2441 	if (err)
2442 		return err;
2443 
2444 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2445 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
2446 		ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs);
2447 	else
2448 		ndev->rqt_size = 1;
2449 
2450 	ndev->cur_num_vqs = 2 * ndev->rqt_size;
2451 
2452 	update_cvq_info(mvdev);
2453 	return err;
2454 }
2455 
2456 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2457 {
2458 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2459 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2460 
2461 	ndev->config_cb = *cb;
2462 }
2463 
2464 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
2465 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
2466 {
2467 	return MLX5_VDPA_MAX_VQ_ENTRIES;
2468 }
2469 
2470 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
2471 {
2472 	return VIRTIO_ID_NET;
2473 }
2474 
2475 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
2476 {
2477 	return PCI_VENDOR_ID_MELLANOX;
2478 }
2479 
2480 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2481 {
2482 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2483 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2484 
2485 	print_status(mvdev, ndev->mvdev.status, false);
2486 	return ndev->mvdev.status;
2487 }
2488 
2489 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2490 {
2491 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2492 	struct mlx5_virtq_attr attr = {};
2493 	int err;
2494 
2495 	if (mvq->initialized) {
2496 		err = query_virtqueue(ndev, mvq, &attr);
2497 		if (err)
2498 			return err;
2499 	}
2500 
2501 	ri->avail_index = attr.available_index;
2502 	ri->used_index = attr.used_index;
2503 	ri->ready = mvq->ready;
2504 	ri->num_ent = mvq->num_ent;
2505 	ri->desc_addr = mvq->desc_addr;
2506 	ri->device_addr = mvq->device_addr;
2507 	ri->driver_addr = mvq->driver_addr;
2508 	ri->restore = true;
2509 	return 0;
2510 }
2511 
2512 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2513 {
2514 	int i;
2515 
2516 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2517 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2518 		save_channel_info(ndev, &ndev->vqs[i]);
2519 	}
2520 	return 0;
2521 }
2522 
2523 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2524 {
2525 	int i;
2526 
2527 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2528 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2529 }
2530 
2531 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2532 {
2533 	struct mlx5_vdpa_virtqueue *mvq;
2534 	struct mlx5_vq_restore_info *ri;
2535 	int i;
2536 
2537 	mlx5_clear_vqs(ndev);
2538 	init_mvqs(ndev);
2539 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2540 		mvq = &ndev->vqs[i];
2541 		ri = &mvq->ri;
2542 		if (!ri->restore)
2543 			continue;
2544 
2545 		mvq->avail_idx = ri->avail_index;
2546 		mvq->used_idx = ri->used_index;
2547 		mvq->ready = ri->ready;
2548 		mvq->num_ent = ri->num_ent;
2549 		mvq->desc_addr = ri->desc_addr;
2550 		mvq->device_addr = ri->device_addr;
2551 		mvq->driver_addr = ri->driver_addr;
2552 	}
2553 }
2554 
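/* Handle a change of the memory map: suspend the virtqueues, save their
 * state, tear down the driver resources and recreate the memory key for the
 * new IOTLB. If the device is DRIVER_OK and not suspended, the saved state is
 * restored and the driver is set up again.
 */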
2555 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
2556 				struct vhost_iotlb *iotlb, unsigned int asid)
2557 {
2558 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2559 	int err;
2560 
2561 	suspend_vqs(ndev);
2562 	err = save_channels_info(ndev);
2563 	if (err)
2564 		goto err_mr;
2565 
2566 	teardown_driver(ndev);
2567 	mlx5_vdpa_destroy_mr(mvdev);
2568 	err = mlx5_vdpa_create_mr(mvdev, iotlb, asid);
2569 	if (err)
2570 		goto err_mr;
2571 
2572 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
2573 		goto err_mr;
2574 
2575 	restore_channels_info(ndev);
2576 	err = setup_driver(mvdev);
2577 	if (err)
2578 		goto err_setup;
2579 
2580 	return 0;
2581 
2582 err_setup:
2583 	mlx5_vdpa_destroy_mr(mvdev);
2584 err_mr:
2585 	return err;
2586 }
2587 
2588 /* reslock must be held for this function */
2589 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2590 {
2591 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2592 	int err;
2593 
2594 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2595 
2596 	if (ndev->setup) {
2597 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2598 		err = 0;
2599 		goto out;
2600 	}
2601 	mlx5_vdpa_add_debugfs(ndev);
2602 	err = setup_virtqueues(mvdev);
2603 	if (err) {
2604 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
2605 		goto err_setup;
2606 	}
2607 
2608 	err = create_rqt(ndev);
2609 	if (err) {
2610 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
2611 		goto err_rqt;
2612 	}
2613 
2614 	err = create_tir(ndev);
2615 	if (err) {
2616 		mlx5_vdpa_warn(mvdev, "create_tir\n");
2617 		goto err_tir;
2618 	}
2619 
2620 	err = setup_steering(ndev);
2621 	if (err) {
2622 		mlx5_vdpa_warn(mvdev, "setup_steering\n");
2623 		goto err_fwd;
2624 	}
2625 	ndev->setup = true;
2626 
2627 	return 0;
2628 
2629 err_fwd:
2630 	destroy_tir(ndev);
2631 err_tir:
2632 	destroy_rqt(ndev);
2633 err_rqt:
2634 	teardown_virtqueues(ndev);
2635 err_setup:
2636 	mlx5_vdpa_remove_debugfs(ndev->debugfs);
2637 out:
2638 	return err;
2639 }
2640 
2641 /* reslock must be held for this function */
2642 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2643 {
2645 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2646 
2647 	if (!ndev->setup)
2648 		return;
2649 
2650 	mlx5_vdpa_remove_debugfs(ndev->debugfs);
2651 	ndev->debugfs = NULL;
2652 	teardown_steering(ndev);
2653 	destroy_tir(ndev);
2654 	destroy_rqt(ndev);
2655 	teardown_virtqueues(ndev);
2656 	ndev->setup = false;
2657 }
2658 
2659 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2660 {
2661 	int i;
2662 
2663 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2664 		ndev->vqs[i].ready = false;
2665 
2666 	ndev->mvdev.cvq.ready = false;
2667 }
2668 
2669 static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev)
2670 {
2671 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2672 	int err = 0;
2673 
2674 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))
2675 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
2676 					MLX5_CVQ_MAX_ENT, false,
2677 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
2678 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
2679 					(struct vring_used *)(uintptr_t)cvq->device_addr);
2680 
2681 	return err;
2682 }
2683 
2684 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2685 {
2686 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2687 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2688 	int err;
2689 
2690 	print_status(mvdev, status, true);
2691 
2692 	down_write(&ndev->reslock);
2693 
2694 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2695 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2696 			err = setup_cvq_vring(mvdev);
2697 			if (err) {
2698 				mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n");
2699 				goto err_setup;
2700 			}
2701 			register_link_notifier(ndev);
2702 			err = setup_driver(mvdev);
2703 			if (err) {
2704 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
2705 				goto err_driver;
2706 			}
2707 		} else {
2708 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
2709 			goto err_clear;
2710 		}
2711 	}
2712 
2713 	ndev->mvdev.status = status;
2714 	up_write(&ndev->reslock);
2715 	return;
2716 
2717 err_driver:
2718 	unregister_link_notifier(ndev);
2719 err_setup:
2720 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2721 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
2722 err_clear:
2723 	up_write(&ndev->reslock);
2724 }
2725 
2726 static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev)
2727 {
2728 	int i;
2729 
2730 	/* By default, all groups are mapped to asid 0 */
2731 	for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++)
2732 		mvdev->group2asid[i] = 0;
2733 }
2734 
2735 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
2736 {
2737 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2738 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2739 
2740 	print_status(mvdev, 0, true);
2741 	mlx5_vdpa_info(mvdev, "performing device reset\n");
2742 
2743 	down_write(&ndev->reslock);
2744 	unregister_link_notifier(ndev);
2745 	teardown_driver(ndev);
2746 	clear_vqs_ready(ndev);
2747 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2748 	ndev->mvdev.status = 0;
2749 	ndev->mvdev.suspended = false;
2750 	ndev->cur_num_vqs = 0;
2751 	ndev->mvdev.cvq.received_desc = 0;
2752 	ndev->mvdev.cvq.completed_desc = 0;
2753 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
2754 	ndev->mvdev.actual_features = 0;
2755 	init_group_to_asid_map(mvdev);
2756 	++mvdev->generation;
2757 
2758 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2759 		if (mlx5_vdpa_create_mr(mvdev, NULL, 0))
2760 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
2761 	}
2762 	up_write(&ndev->reslock);
2763 
2764 	return 0;
2765 }
2766 
2767 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
2768 {
2769 	return sizeof(struct virtio_net_config);
2770 }
2771 
2772 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
2773 				 unsigned int len)
2774 {
2775 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2776 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2777 
2778 	if (offset + len <= sizeof(struct virtio_net_config))
2779 		memcpy(buf, (u8 *)&ndev->config + offset, len);
2780 }
2781 
2782 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
2783 				 unsigned int len)
2784 {
2785 	/* not supported */
2786 }
2787 
2788 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
2789 {
2790 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2791 
2792 	return mvdev->generation;
2793 }
2794 
2795 static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
2796 			unsigned int asid)
2797 {
2798 	bool change_map;
2799 	int err;
2800 
2801 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map, asid);
2802 	if (err) {
2803 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
2804 		return err;
2805 	}
2806 
2807 	if (change_map)
2808 		err = mlx5_vdpa_change_map(mvdev, iotlb, asid);
2809 
2810 	return err;
2811 }
2812 
2813 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
2814 			     struct vhost_iotlb *iotlb)
2815 {
2816 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2817 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2818 	int err = -EINVAL;
2819 
2820 	down_write(&ndev->reslock);
2821 	err = set_map_data(mvdev, iotlb, asid);
2822 	up_write(&ndev->reslock);
2823 	return err;
2824 }
2825 
2826 static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx)
2827 {
2828 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2829 
2830 	if (is_ctrl_vq_idx(mvdev, idx))
2831 		return &vdev->dev;
2832 
2833 	return mvdev->vdev.dma_dev;
2834 }
2835 
2836 static void mlx5_vdpa_free(struct vdpa_device *vdev)
2837 {
2838 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2839 	struct mlx5_core_dev *pfmdev;
2840 	struct mlx5_vdpa_net *ndev;
2841 
2842 	ndev = to_mlx5_vdpa_ndev(mvdev);
2843 
2844 	free_resources(ndev);
2845 	mlx5_vdpa_destroy_mr(mvdev);
2846 	if (!is_zero_ether_addr(ndev->config.mac)) {
2847 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2848 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
2849 	}
2850 	mlx5_vdpa_free_resources(&ndev->mvdev);
2851 	kfree(ndev->event_cbs);
2852 	kfree(ndev->vqs);
2853 }
2854 
2855 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
2856 {
2857 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2858 	struct vdpa_notification_area ret = {};
2859 	struct mlx5_vdpa_net *ndev;
2860 	phys_addr_t addr;
2861 
2862 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2863 		return ret;
2864 
2865 	/* If the SF BAR size is smaller than PAGE_SIZE, do not use direct
2866 	 * notification, to avoid the risk of mapping pages that contain the BAR
2867 	 * of more than one SF.
2868 	 */
2869 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
2870 		return ret;
2871 
2872 	ndev = to_mlx5_vdpa_ndev(mvdev);
2873 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
2874 	ret.addr = addr;
2875 	ret.size = PAGE_SIZE;
2876 	return ret;
2877 }
2878 
2879 static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
2880 {
2881 	return -EOPNOTSUPP;
2882 }
2883 
2884 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
2885 {
2886 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2887 
2888 	return mvdev->actual_features;
2889 }
2890 
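/* Query the virtio queue counters object attached to this virtqueue to read
 * the number of received and completed descriptors. Only valid while the
 * queue object is in the RDY state.
 */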
2891 static int counter_set_query(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
2892 			     u64 *received_desc, u64 *completed_desc)
2893 {
2894 	u32 in[MLX5_ST_SZ_DW(query_virtio_q_counters_in)] = {};
2895 	u32 out[MLX5_ST_SZ_DW(query_virtio_q_counters_out)] = {};
2896 	void *cmd_hdr;
2897 	void *ctx;
2898 	int err;
2899 
2900 	if (!counters_supported(&ndev->mvdev))
2901 		return -EOPNOTSUPP;
2902 
2903 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
2904 		return -EAGAIN;
2905 
2906 	cmd_hdr = MLX5_ADDR_OF(query_virtio_q_counters_in, in, hdr);
2907 
2908 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
2909 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
2910 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
2911 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->counter_set_id);
2912 
2913 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
2914 	if (err)
2915 		return err;
2916 
2917 	ctx = MLX5_ADDR_OF(query_virtio_q_counters_out, out, counters);
2918 	*received_desc = MLX5_GET64(virtio_q_counters, ctx, received_desc);
2919 	*completed_desc = MLX5_GET64(virtio_q_counters, ctx, completed_desc);
2920 	return 0;
2921 }
2922 
2923 static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx,
2924 					 struct sk_buff *msg,
2925 					 struct netlink_ext_ack *extack)
2926 {
2927 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2928 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2929 	struct mlx5_vdpa_virtqueue *mvq;
2930 	struct mlx5_control_vq *cvq;
2931 	u64 received_desc;
2932 	u64 completed_desc;
2933 	int err = 0;
2934 
2935 	down_read(&ndev->reslock);
2936 	if (!is_index_valid(mvdev, idx)) {
2937 		NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid");
2938 		err = -EINVAL;
2939 		goto out_err;
2940 	}
2941 
2942 	if (idx == ctrl_vq_idx(mvdev)) {
2943 		cvq = &mvdev->cvq;
2944 		received_desc = cvq->received_desc;
2945 		completed_desc = cvq->completed_desc;
2946 		goto out;
2947 	}
2948 
2949 	mvq = &ndev->vqs[idx];
2950 	err = counter_set_query(ndev, mvq, &received_desc, &completed_desc);
2951 	if (err) {
2952 		NL_SET_ERR_MSG_MOD(extack, "failed to query hardware");
2953 		goto out_err;
2954 	}
2955 
2956 out:
2957 	err = -EMSGSIZE;
2958 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "received_desc"))
2959 		goto out_err;
2960 
2961 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, received_desc,
2962 			      VDPA_ATTR_PAD))
2963 		goto out_err;
2964 
2965 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "completed_desc"))
2966 		goto out_err;
2967 
2968 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, completed_desc,
2969 			      VDPA_ATTR_PAD))
2970 		goto out_err;
2971 
2972 	err = 0;
2973 out_err:
2974 	up_read(&ndev->reslock);
2975 	return err;
2976 }
2977 
2978 static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
2979 {
2980 	struct mlx5_control_vq *cvq;
2981 
2982 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
2983 		return;
2984 
2985 	cvq = &mvdev->cvq;
2986 	cvq->ready = false;
2987 }
2988 
2989 static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
2990 {
2991 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2992 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2993 	struct mlx5_vdpa_virtqueue *mvq;
2994 	int i;
2995 
2996 	mlx5_vdpa_info(mvdev, "suspending device\n");
2997 
2998 	down_write(&ndev->reslock);
2999 	unregister_link_notifier(ndev);
3000 	for (i = 0; i < ndev->cur_num_vqs; i++) {
3001 		mvq = &ndev->vqs[i];
3002 		suspend_vq(ndev, mvq);
3003 	}
3004 	mlx5_vdpa_cvq_suspend(mvdev);
3005 	mvdev->suspended = true;
3006 	up_write(&ndev->reslock);
3007 	return 0;
3008 }
3009 
3010 static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,
3011 			       unsigned int asid)
3012 {
3013 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3014 
3015 	if (group >= MLX5_VDPA_NUMVQ_GROUPS)
3016 		return -EINVAL;
3017 
3018 	mvdev->group2asid[group] = asid;
3019 	return 0;
3020 }
3021 
3022 static const struct vdpa_config_ops mlx5_vdpa_ops = {
3023 	.set_vq_address = mlx5_vdpa_set_vq_address,
3024 	.set_vq_num = mlx5_vdpa_set_vq_num,
3025 	.kick_vq = mlx5_vdpa_kick_vq,
3026 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
3027 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
3028 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
3029 	.set_vq_state = mlx5_vdpa_set_vq_state,
3030 	.get_vq_state = mlx5_vdpa_get_vq_state,
3031 	.get_vendor_vq_stats = mlx5_vdpa_get_vendor_vq_stats,
3032 	.get_vq_notification = mlx5_get_vq_notification,
3033 	.get_vq_irq = mlx5_get_vq_irq,
3034 	.get_vq_align = mlx5_vdpa_get_vq_align,
3035 	.get_vq_group = mlx5_vdpa_get_vq_group,
3036 	.get_device_features = mlx5_vdpa_get_device_features,
3037 	.set_driver_features = mlx5_vdpa_set_driver_features,
3038 	.get_driver_features = mlx5_vdpa_get_driver_features,
3039 	.set_config_cb = mlx5_vdpa_set_config_cb,
3040 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
3041 	.get_device_id = mlx5_vdpa_get_device_id,
3042 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
3043 	.get_status = mlx5_vdpa_get_status,
3044 	.set_status = mlx5_vdpa_set_status,
3045 	.reset = mlx5_vdpa_reset,
3046 	.get_config_size = mlx5_vdpa_get_config_size,
3047 	.get_config = mlx5_vdpa_get_config,
3048 	.set_config = mlx5_vdpa_set_config,
3049 	.get_generation = mlx5_vdpa_get_generation,
3050 	.set_map = mlx5_vdpa_set_map,
3051 	.set_group_asid = mlx5_set_group_asid,
3052 	.get_vq_dma_dev = mlx5_get_vq_dma_dev,
3053 	.free = mlx5_vdpa_free,
3054 	.suspend = mlx5_vdpa_suspend,
3055 };
3056 
3057 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
3058 {
3059 	u16 hw_mtu;
3060 	int err;
3061 
3062 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
3063 	if (err)
3064 		return err;
3065 
3066 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
3067 	return 0;
3068 }
3069 
3070 static int alloc_resources(struct mlx5_vdpa_net *ndev)
3071 {
3072 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3073 	int err;
3074 
3075 	if (res->valid) {
3076 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
3077 		return -EEXIST;
3078 	}
3079 
3080 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
3081 	if (err)
3082 		return err;
3083 
3084 	err = create_tis(ndev);
3085 	if (err)
3086 		goto err_tis;
3087 
3088 	res->valid = true;
3089 
3090 	return 0;
3091 
3092 err_tis:
3093 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3094 	return err;
3095 }
3096 
3097 static void free_resources(struct mlx5_vdpa_net *ndev)
3098 {
3099 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3100 
3101 	if (!res->valid)
3102 		return;
3103 
3104 	destroy_tis(ndev);
3105 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3106 	res->valid = false;
3107 }
3108 
3109 static void init_mvqs(struct mlx5_vdpa_net *ndev)
3110 {
3111 	struct mlx5_vdpa_virtqueue *mvq;
3112 	int i;
3113 
3114 	for (i = 0; i < ndev->mvdev.max_vqs; ++i) {
3115 		mvq = &ndev->vqs[i];
3116 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3117 		mvq->index = i;
3118 		mvq->ndev = ndev;
3119 		mvq->fwqp.fw = true;
3120 		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
3121 	}
3122 	for (; i < ndev->mvdev.max_vqs; i++) {
3123 		mvq = &ndev->vqs[i];
3124 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3125 		mvq->index = i;
3126 		mvq->ndev = ndev;
3127 	}
3128 }
3129 
3130 struct mlx5_vdpa_mgmtdev {
3131 	struct vdpa_mgmt_dev mgtdev;
3132 	struct mlx5_adev *madev;
3133 	struct mlx5_vdpa_net *ndev;
3134 };
3135 
3136 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
3137 {
3138 	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
3139 	void *in;
3140 	int err;
3141 
3142 	in = kvzalloc(inlen, GFP_KERNEL);
3143 	if (!in)
3144 		return -ENOMEM;
3145 
3146 	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
3147 	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
3148 		 mtu + MLX5V_ETH_HARD_MTU);
3149 	MLX5_SET(modify_nic_vport_context_in, in, opcode,
3150 		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
3151 
3152 	err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
3153 
3154 	kvfree(in);
3155 	return err;
3156 }
3157 
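/* Management device .dev_add callback: validate the provisioned features,
 * size the virtqueue array, fill in MTU, MAC and link status in the virtio
 * config space, allocate device resources and register the vDPA device.
 */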
3158 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
3159 			     const struct vdpa_dev_set_config *add_config)
3160 {
3161 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3162 	struct virtio_net_config *config;
3163 	struct mlx5_core_dev *pfmdev;
3164 	struct mlx5_vdpa_dev *mvdev;
3165 	struct mlx5_vdpa_net *ndev;
3166 	struct mlx5_core_dev *mdev;
3167 	u64 device_features;
3168 	u32 max_vqs;
3169 	u16 mtu;
3170 	int err;
3171 
3172 	if (mgtdev->ndev)
3173 		return -ENOSPC;
3174 
3175 	mdev = mgtdev->madev->mdev;
3176 	device_features = mgtdev->mgtdev.supported_features;
3177 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
3178 		if (add_config->device_features & ~device_features) {
3179 			dev_warn(mdev->device,
3180 				 "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
3181 				 add_config->device_features, device_features);
3182 			return -EINVAL;
3183 		}
3184 		device_features &= add_config->device_features;
3185 	} else {
3186 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
3187 	}
3188 	if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) &&
3189 	      device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) {
3190 		dev_warn(mdev->device,
3191 			 "Must provision minimum features 0x%llx for this device",
3192 			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM));
3193 		return -EOPNOTSUPP;
3194 	}
3195 
3196 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
3197 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
3198 		dev_warn(mdev->device, "missing support for split virtqueues\n");
3199 		return -EOPNOTSUPP;
3200 	}
3201 
3202 	max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues),
3203 			1 << MLX5_CAP_GEN(mdev, log_max_rqt_size));
3204 	if (max_vqs < 2) {
3205 		dev_warn(mdev->device,
3206 			 "%d virtqueues are supported. At least 2 are required\n",
3207 			 max_vqs);
3208 		return -EAGAIN;
3209 	}
3210 
3211 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
3212 		if (add_config->net.max_vq_pairs > max_vqs / 2)
3213 			return -EINVAL;
3214 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
3215 	} else {
3216 		max_vqs = 2;
3217 	}
3218 
3219 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
3220 				 MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false);
3221 	if (IS_ERR(ndev))
3222 		return PTR_ERR(ndev);
3223 
3224 	ndev->mvdev.max_vqs = max_vqs;
3225 	mvdev = &ndev->mvdev;
3226 	mvdev->mdev = mdev;
3227 
3228 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
3229 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
3230 	if (!ndev->vqs || !ndev->event_cbs) {
3231 		err = -ENOMEM;
3232 		goto err_alloc;
3233 	}
3234 
3235 	init_mvqs(ndev);
3236 	init_rwsem(&ndev->reslock);
3237 	config = &ndev->config;
3238 
3239 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
3240 		err = config_func_mtu(mdev, add_config->net.mtu);
3241 		if (err)
3242 			goto err_alloc;
3243 	}
3244 
3245 	if (device_features & BIT_ULL(VIRTIO_NET_F_MTU)) {
3246 		err = query_mtu(mdev, &mtu);
3247 		if (err)
3248 			goto err_alloc;
3249 
3250 		ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
3251 	}
3252 
3253 	if (device_features & BIT_ULL(VIRTIO_NET_F_STATUS)) {
3254 		if (get_link_state(mvdev))
3255 			ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
3256 		else
3257 			ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
3258 	}
3259 
3260 	if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
3261 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
3262 	/* Don't bother setting the mac address in config if _F_MAC won't be provisioned */
3263 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0 ||
3264 		   device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3265 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
3266 		if (err)
3267 			goto err_alloc;
3268 	}
3269 
3270 	if (!is_zero_ether_addr(config->mac)) {
3271 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
3272 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
3273 		if (err)
3274 			goto err_alloc;
3275 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) {
3276 		/*
3277 		 * We used to clear the _F_MAC feature bit when seeing a
3278 		 * zero mac address and device features were not
3279 		 * specifically provisioned. Keep that behaviour so
3280 		 * old scripts do not break.
3281 		 */
3282 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
3283 	} else if (device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3284 		/* Don't provision zero mac address for _F_MAC */
3285 		mlx5_vdpa_warn(&ndev->mvdev,
3286 			       "No mac address provisioned?\n");
3287 		err = -EINVAL;
3288 		goto err_alloc;
3289 	}
3290 
3291 	if (device_features & BIT_ULL(VIRTIO_NET_F_MQ))
3292 		config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2);
3293 
3294 	ndev->mvdev.mlx_features = device_features;
3295 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
3296 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
3297 	if (err)
3298 		goto err_mpfs;
3299 
3300 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
3301 		err = mlx5_vdpa_create_mr(mvdev, NULL, 0);
3302 		if (err)
3303 			goto err_res;
3304 	}
3305 
3306 	err = alloc_resources(ndev);
3307 	if (err)
3308 		goto err_mr;
3309 
3310 	ndev->cvq_ent.mvdev = mvdev;
3311 	INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
3312 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
3313 	if (!mvdev->wq) {
3314 		err = -ENOMEM;
3315 		goto err_res2;
3316 	}
3317 
3318 	mvdev->vdev.mdev = &mgtdev->mgtdev;
3319 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
3320 	if (err)
3321 		goto err_reg;
3322 
3323 	mgtdev->ndev = ndev;
3324 	return 0;
3325 
3326 err_reg:
3327 	destroy_workqueue(mvdev->wq);
3328 err_res2:
3329 	free_resources(ndev);
3330 err_mr:
3331 	mlx5_vdpa_destroy_mr(mvdev);
3332 err_res:
3333 	mlx5_vdpa_free_resources(&ndev->mvdev);
3334 err_mpfs:
3335 	if (!is_zero_ether_addr(config->mac))
3336 		mlx5_mpfs_del_mac(pfmdev, config->mac);
3337 err_alloc:
3338 	put_device(&mvdev->vdev.dev);
3339 	return err;
3340 }
3341 
3342 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
3343 {
3344 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3345 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
3346 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3347 	struct workqueue_struct *wq;
3348 
3349 	mlx5_vdpa_remove_debugfs(ndev->debugfs);
3350 	ndev->debugfs = NULL;
3351 	unregister_link_notifier(ndev);
3352 	wq = mvdev->wq;
3353 	mvdev->wq = NULL;
3354 	destroy_workqueue(wq);
3355 	_vdpa_unregister_device(dev);
3356 	mgtdev->ndev = NULL;
3357 }
3358 
3359 static const struct vdpa_mgmtdev_ops mdev_ops = {
3360 	.dev_add = mlx5_vdpa_dev_add,
3361 	.dev_del = mlx5_vdpa_dev_del,
3362 };
3363 
3364 static struct virtio_device_id id_table[] = {
3365 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
3366 	{ 0 },
3367 };
3368 
3369 static int mlx5v_probe(struct auxiliary_device *adev,
3370 		       const struct auxiliary_device_id *id)
3372 {
3373 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
3374 	struct mlx5_core_dev *mdev = madev->mdev;
3375 	struct mlx5_vdpa_mgmtdev *mgtdev;
3376 	int err;
3377 
3378 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
3379 	if (!mgtdev)
3380 		return -ENOMEM;
3381 
3382 	mgtdev->mgtdev.ops = &mdev_ops;
3383 	mgtdev->mgtdev.device = mdev->device;
3384 	mgtdev->mgtdev.id_table = id_table;
3385 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
3386 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
3387 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) |
3388 					  BIT_ULL(VDPA_ATTR_DEV_FEATURES);
3389 	mgtdev->mgtdev.max_supported_vqs =
3390 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
3391 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
3392 	mgtdev->madev = madev;
3393 
3394 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
3395 	if (err)
3396 		goto reg_err;
3397 
3398 	auxiliary_set_drvdata(adev, mgtdev);
3399 
3400 	return 0;
3401 
3402 reg_err:
3403 	kfree(mgtdev);
3404 	return err;
3405 }
3406 
3407 static void mlx5v_remove(struct auxiliary_device *adev)
3408 {
3409 	struct mlx5_vdpa_mgmtdev *mgtdev;
3410 
3411 	mgtdev = auxiliary_get_drvdata(adev);
3412 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
3413 	kfree(mgtdev);
3414 }
3415 
3416 static const struct auxiliary_device_id mlx5v_id_table[] = {
3417 	{ .name = MLX5_ADEV_NAME ".vnet", },
3418 	{},
3419 };
3420 
3421 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
3422 
3423 static struct auxiliary_driver mlx5v_driver = {
3424 	.name = "vnet",
3425 	.probe = mlx5v_probe,
3426 	.remove = mlx5v_remove,
3427 	.id_table = mlx5v_id_table,
3428 };
3429 
3430 module_auxiliary_driver(mlx5v_driver);
3431